ingestify 0.4.1__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.4.1 → ingestify-0.4.2}/PKG-INFO +1 -1
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/__init__.py +1 -1
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/loader.py +8 -4
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/cmdline.py +10 -1
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job.py +14 -8
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py +13 -13
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py +62 -29
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/main.py +13 -6
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.4.1 → ingestify-0.4.2}/README.md +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/exceptions.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/server.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/source_base.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify/utils.py +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/setup.cfg +0 -0
- {ingestify-0.4.1 → ingestify-0.4.2}/setup.py +0 -0
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/application/loader.py
RENAMED

@@ -35,11 +35,8 @@ class Loader:
         provider: Optional[str] = None,
         source: Optional[str] = None,
     ):
-
-        selectors = {}
+        ingestion_plans = []
         for ingestion_plan in self.ingestion_plans:
-            logger.info(f"Determining selectors for {ingestion_plan}")
-
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
                     logger.info(
@@ -54,6 +51,13 @@ class Loader:
                 )
                 continue
 
+            ingestion_plans.append(ingestion_plan)
+
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
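Aside: the net effect of these two hunks is that plan filtering and selector resolution become separate passes, so every selector is known before any dataset discovery starts. A condensed sketch of the resulting flow (hypothetical helper; the real method also filters on source and logs each step):

def plan_selectors(all_plans, provider=None):
    # Pass 1: keep only the IngestionPlans that match the CLI filters
    plans = [
        plan
        for plan in all_plans
        if provider is None or plan.source.provider == provider
    ]
    # Pass 2: resolve selectors per surviving plan, before any discovery
    return [(plan, list(plan.selectors)) for plan in plans]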
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/cmdline.py
RENAMED

@@ -88,6 +88,14 @@ def cli():
     help="Source - only run tasks for a single source",
     type=str,
 )
+@click.option(
+    "--disable-events",
+    "disable_events",
+    required=False,
+    help="Disable events - disable all event handlers",
+    is_flag=True,
+    type=bool,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
     provider: Optional[str],
     source: Optional[str],
     debug: Optional[bool],
+    disable_events: Optional[bool],
 ):
     try:
-        engine = get_engine(config_file, bucket)
+        engine = get_engine(config_file, bucket, disable_events=disable_events)
     except ConfigurationError as e:
         if debug:
             raise
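Aside: is_flag=True makes --disable-events a boolean switch (absent → False, present → True), so no value needs to be passed on the command line. A minimal standalone click sketch of the same pattern (hypothetical demo command, not part of ingestify):

import click

@click.command()
@click.option(
    "--disable-events",
    "disable_events",
    is_flag=True,
    help="Disable events - disable all event handlers",
)
def demo(disable_events: bool):
    # Prints False when run bare, True when invoked with --disable-events
    click.echo(f"disable_events={disable_events}")

if __name__ == "__main__":
    demo()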
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job.py
RENAMED

@@ -218,7 +218,7 @@ class IngestionJob:
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
         logger.info("Finding metadata")
-        with ingestion_job_summary.record_timing("
+        with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
                 provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         try:
+            logger.info(f"Finding datasets for selector={self.selector}")
             with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@ class IngestionJob:
             yield ingestion_job_summary
             return
 
+        logger.info("Starting tasks")
+
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
         while True:
@@ -273,13 +276,16 @@ class IngestionJob:
                 for dataset_resource in batch
             ]
 
-
-
-
-            #
-
-
-
+            logger.info(f"Searching for existing Datasets for DatasetResources")
+
+            with ingestion_job_summary.record_timing("get_dataset_collection"):
+                # Load all available datasets based on the discovered dataset identifiers
+                dataset_collection = store.get_dataset_collection(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    # Assume all DatasetResources share the same provider
+                    provider=batch[0].provider,
+                    selector=dataset_identifiers,
+                )
 
             skipped_datasets = 0
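Aside: record_timing wraps each phase so the summary can report per-phase durations. Its implementation is not part of this diff; a minimal sketch of the pattern it follows, assuming a simple (name, duration) timing list:

from contextlib import contextmanager
from datetime import datetime, timezone

@contextmanager
def record_timing(name: str, timings: list):
    # Measure wall-clock time of the wrapped block and record it under `name`
    start = datetime.now(timezone.utc)
    try:
        yield
    finally:
        timings.append((name, datetime.now(timezone.utc) - start))

timings = []
with record_timing("find_datasets", timings):
    sum(range(10_000))  # stand-in for the real work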
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED

@@ -66,7 +66,7 @@ class IngestionJobSummary(BaseModel, HasTiming):
         self.skipped_datasets += skipped_datasets
 
     def task_count(self):
-        return len(self.task_summaries)
+        return len(self.task_summaries) + self.skipped_datasets
 
     def _set_ended(self):
         self.failed_tasks = len(
@@ -106,22 +106,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
             f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
         )
         print("********************************")
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f"  - IngestionPlan:")
+        print(f"      Source: {self.source_name}")
+        print(f"      Provider: {self.provider}")
+        print(f"      DatasetType: {self.dataset_type}")
+        print(f"  - Selector: {self.selector}")
+        print(f"  - Timings: ")
         for timing in self.timings:
-            print(f"
+            print(f"    - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f"
+            f"  - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f"  - Failed tasks: {self.failed_tasks}")
+        print(f"  - Successful tasks: {self.successful_tasks}")
+        print(f"  - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"  - Skipped datasets: {self.skipped_datasets}")
         print("********************************")
 
     def __enter__(self):
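Aside: the tasks/sec figure is a plain rate over the job duration, and it divides len(self.task_summaries) rather than the new task_count(), so skipped datasets do not inflate the throughput number. A worked example of the arithmetic:

from datetime import timedelta

task_summaries = ["task"] * 120  # stand-in for TaskSummary objects
duration = timedelta(minutes=1)
rate = len(task_summaries) / duration.total_seconds()
print(f"  - Tasks: {len(task_summaries)} - {rate:.1f} tasks/sec")
# prints:   - Tasks: 120 - 2.0 tasks/sec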
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py
RENAMED

@@ -1,4 +1,5 @@
 import itertools
+import logging
 import uuid
 from typing import Optional, Union, List
 
@@ -14,10 +15,11 @@ from sqlalchemy import (
     and_,
     Column,
     or_,
+    Dialect,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session
+from sqlalchemy.orm import Session, Query
 
 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
@@ -42,6 +44,8 @@ from .tables import (
     task_summary_table,
 )
 
+logger = logging.getLogger(__name__)
+
 
 def parse_value(v):
     try:
@@ -93,6 +97,7 @@ class SqlAlchemySessionProvider:
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
         )
+        self.dialect = self.engine.dialect
         self.session = Session(bind=self.engine)
 
     def __init__(self, url: str):
@@ -110,18 +115,18 @@ class SqlAlchemySessionProvider:
         self.url = state["url"]
         self._init_engine()
 
-    def _close_engine(self):
-        if hasattr(self, "session"):
-            self.session.close()
-        self.engine.dispose()
-
     def __del__(self):
-        self._close_engine()
+        self.close()
 
     def reset(self):
-        self._close_engine()
+        self.close()
         self._init_engine()
 
+    def close(self):
+        if hasattr(self, "session"):
+            self.session.close()
+        self.engine.dispose()
+
     def get(self):
         return self.session
@@ -138,8 +143,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def session(self):
         return self.session_provider.get()
 
+    @property
+    def dialect(self) -> Dialect:
+        return self.session_provider.dialect
+
     def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
-        dialect = self.
+        dialect = self.dialect.name
         if dialect == "mysql":
             from sqlalchemy.dialects.mysql import insert
         elif dialect == "postgresql":
@@ -183,7 +192,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         else:
             query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
-        dialect = self.
+        dialect = self.dialect.name
 
         if not isinstance(selector, list):
             where, selector = selector.split("where")
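Aside: _upsert now reads the dialect name from the shared session provider instead of deriving it locally; the dialect-specific insert constructs themselves are unchanged. For context, a generic sketch of what a dialect-dispatched upsert looks like in SQLAlchemy (an illustration under assumed table/row shapes, not ingestify's exact _upsert body):

from sqlalchemy import Table
from sqlalchemy.engine import Connection

def upsert(connection: Connection, table: Table, rows: list[dict], dialect_name: str):
    # Insert rows; on a key conflict, update every column with the incoming value
    if dialect_name == "mysql":
        from sqlalchemy.dialects.mysql import insert

        stmt = insert(table).values(rows)
        stmt = stmt.on_duplicate_key_update(
            {c.name: stmt.inserted[c.name] for c in table.columns}
        )
    elif dialect_name == "postgresql":
        from sqlalchemy.dialects.postgresql import insert

        stmt = insert(table).values(rows)
        stmt = stmt.on_conflict_do_update(
            index_elements=[c.name for c in table.primary_key.columns],
            set_={c.name: stmt.excluded[c.name] for c in table.columns},
        )
    else:
        raise NotImplementedError(f"No upsert strategy for dialect {dialect_name!r}")
    connection.execute(stmt)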
@@ -199,9 +208,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not selectors:
             raise ValueError("Selectors must contain at least one item")
 
-        attribute_keys = selectors[
-            0
-        ].filtered_attributes.keys()  # Assume all selectors have the same keys
         attribute_sets = {
             tuple(selector.filtered_attributes.items()) for selector in selectors
         }
@@ -249,7 +255,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         return query
 
-    def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
+    def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
         if not dataset_ids:
             return []
 
@@ -303,6 +309,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
         return datasets
 
+    def _debug_query(self, q: Query):
+        text_ = q.statement.compile(
+            compile_kwargs={"literal_binds": True}, dialect=self.dialect
+        )
+        logger.debug(f"Running query: {text_}")
+
     def get_dataset_collection(
         self,
         bucket: str,
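Aside: _debug_query compiles the query with literal_binds so the logged SQL shows actual values instead of placeholder parameters. A standalone illustration of that compile call (hypothetical users table; never execute SQL strings built this way from untrusted input):

from sqlalchemy import Column, Integer, MetaData, String, Table, select

metadata = MetaData()
users = Table("users", metadata, Column("id", Integer), Column("name", String))

stmt = select(users).where(users.c.name == "alice")
compiled = stmt.compile(compile_kwargs={"literal_binds": True})
print(compiled)  # SELECT ... FROM users WHERE users.name = 'alice'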
@@ -322,22 +334,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             selector=selector,
         )
 
-        if not metadata_only:
-            dataset_query = apply_query_filter(
-                self.session.query(dataset_table.c.dataset_id)
-            )
-            dataset_ids = [row.dataset_id for row in dataset_query]
-            datasets = self.load_datasets(dataset_ids)
-        else:
-            datasets = []
+        with self.session:
+            # Use a contextmanager to make sure it's closed afterwards
 
-
-
-
-
-
-
-
+            if not metadata_only:
+                dataset_query = apply_query_filter(
+                    self.session.query(dataset_table.c.dataset_id)
+                )
+                self._debug_query(dataset_query)
+                dataset_ids = [row.dataset_id for row in dataset_query]
+                datasets = self._load_datasets(dataset_ids)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    last_modified=max(dataset.last_modified_at for dataset in datasets)
+                    if datasets
+                    else None,
+                    row_count=len(datasets),
+                )
+            else:
+                datasets = []
+
+                metadata_result_query = apply_query_filter(
+                    self.session.query(
+                        func.max(dataset_table.c.last_modified_at).label(
+                            "last_modified_at"
+                        ),
+                        func.count().label("row_count"),
+                    )
+                )
+
+                self._debug_query(metadata_result_query)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    *metadata_result_query.first()
+                )
 
         return DatasetCollection(dataset_collection_metadata, datasets)
 
@@ -350,6 +380,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def connect(self):
        return self.session_provider.engine.connect()
 
+    def __del__(self):
+        self.session_provider.close()
+
     def _save(self, datasets: list[Dataset]):
         """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
         datasets_entities = []
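Aside: `with self.session:` relies on the Session context-manager protocol (SQLAlchemy 1.4+): leaving the block calls session.close(), which releases the underlying connection back to the pool while leaving the Session object reusable for later calls. A minimal illustration with an in-memory engine as a stand-in:

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

engine = create_engine("sqlite://")  # in-memory stand-in
session = Session(bind=engine)

with session:
    pass  # queries would run here
# On exit session.close() has run; the same Session can be used again later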
{ingestify-0.4.1 → ingestify-0.4.2}/ingestify/main.py
RENAMED

@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
     return import_cls(key)
 
 
-def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
+def get_engine(
+    config_file, bucket: Optional[str] = None, disable_events: bool = False
+) -> IngestionEngine:
     config = parse_config(config_file, default_value="")
 
     logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     # Setup an EventBus and wire some more components
     event_bus = EventBus()
-
-
-
-
-
+    if not disable_events:
+        # When we disable all events we don't register any publishers
+        publisher = Publisher()
+        for subscriber in config.get("event_subscribers", []):
+            cls = get_event_subscriber_cls(subscriber["type"])
+            publisher.add_subscriber(cls(store))
+        event_bus.register(publisher)
+    else:
+        logger.info("Disabling all event handlers")
+
     store.set_event_bus(event_bus)
 
     ingestion_engine = IngestionEngine(
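Aside: for code that embeds ingestify instead of going through the CLI, the new keyword can be passed to get_engine directly. A hedged sketch (import path per this diff's location of get_engine in ingestify/main.py; the config path is a placeholder):

from ingestify.main import get_engine

# Build the engine without registering any event publishers or subscribers
engine = get_engine("config.yaml", disable_events=True)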