ingestify 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.5.0 → ingestify-0.6.0}/PKG-INFO +1 -1
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/__init__.py +1 -1
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/loader.py +8 -2
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/data_spec_version_collection.py +0 -1
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_job.py +4 -2
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +75 -22
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.5.0 → ingestify-0.6.0}/README.md +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/cmdline.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/main.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/server.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/source_base.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/utils.py +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/setup.cfg +0 -0
- {ingestify-0.5.0 → ingestify-0.6.0}/setup.py +0 -0
|
@@ -155,13 +155,19 @@ class Loader:
|
|
|
155
155
|
|
|
156
156
|
IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
|
|
157
157
|
"""
|
|
158
|
-
|
|
158
|
+
|
|
159
|
+
ingestion_job_prefix = str(uuid.uuid1())
|
|
160
|
+
for ingestion_job_idx, (ingestion_plan, selector) in enumerate(
|
|
161
|
+
selectors.values()
|
|
162
|
+
):
|
|
159
163
|
logger.info(
|
|
160
164
|
f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
|
|
161
165
|
)
|
|
162
166
|
|
|
163
167
|
ingestion_job = IngestionJob(
|
|
164
|
-
|
|
168
|
+
# Create a combined IngestionJobId.
|
|
169
|
+
# This allows us to group all IngestionJobs within the same run
|
|
170
|
+
ingestion_job_id=f"{ingestion_job_prefix}.{ingestion_job_idx}",
|
|
165
171
|
ingestion_plan=ingestion_plan,
|
|
166
172
|
selector=selector,
|
|
167
173
|
)
|
|
@@ -24,7 +24,7 @@ from ingestify.utils import TaskExecutor, chunker
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
DEFAULT_CHUNK_SIZE =
|
|
27
|
+
DEFAULT_CHUNK_SIZE = 1000
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def run_task(task):
|
|
@@ -255,8 +255,10 @@ class IngestionJob:
|
|
|
255
255
|
finish_task_timer = ingestion_job_summary.start_timing("tasks")
|
|
256
256
|
|
|
257
257
|
while True:
|
|
258
|
+
logger.info(f"Finding next batch of datasets for selector={self.selector}")
|
|
258
259
|
try:
|
|
259
|
-
|
|
260
|
+
with ingestion_job_summary.record_timing("find_datasets"):
|
|
261
|
+
batch = next(batches)
|
|
260
262
|
except StopIteration:
|
|
261
263
|
break
|
|
262
264
|
except Exception as e:
|
|
@@ -13,9 +13,12 @@ from sqlalchemy import (
|
|
|
13
13
|
literal,
|
|
14
14
|
select,
|
|
15
15
|
and_,
|
|
16
|
-
Column,
|
|
17
|
-
or_,
|
|
18
16
|
Dialect,
|
|
17
|
+
values,
|
|
18
|
+
CTE,
|
|
19
|
+
column as sqlalchemy_column,
|
|
20
|
+
Integer,
|
|
21
|
+
String,
|
|
19
22
|
)
|
|
20
23
|
from sqlalchemy.engine import make_url
|
|
21
24
|
from sqlalchemy.exc import NoSuchModuleError
|
|
@@ -137,10 +140,6 @@ class SqlAlchemySessionProvider:
|
|
|
137
140
|
return self.session()
|
|
138
141
|
|
|
139
142
|
|
|
140
|
-
def in_(column: Column, values):
|
|
141
|
-
return or_(*[column == value for value in values])
|
|
142
|
-
|
|
143
|
-
|
|
144
143
|
class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
145
144
|
def __init__(self, session_provider: SqlAlchemySessionProvider):
|
|
146
145
|
self.session_provider = session_provider
|
|
@@ -178,6 +177,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
178
177
|
|
|
179
178
|
connection.execute(stmt)
|
|
180
179
|
|
|
180
|
+
def _build_cte_sqlite(self, records, name: str) -> CTE:
|
|
181
|
+
"""SQLite has a limit of 500 compound select statements. When we have more records,
|
|
182
|
+
create a nested CTE"""
|
|
183
|
+
if len(records) > 500:
|
|
184
|
+
return union_all(
|
|
185
|
+
select(self._build_cte_sqlite(records[:500], name + "1")),
|
|
186
|
+
select(self._build_cte_sqlite(records[500:], name + "2")),
|
|
187
|
+
).cte(name)
|
|
188
|
+
|
|
189
|
+
return union_all(
|
|
190
|
+
*[
|
|
191
|
+
select(*(literal(value).label(key) for key, value in record.items()))
|
|
192
|
+
for record in records
|
|
193
|
+
]
|
|
194
|
+
).cte(name)
|
|
195
|
+
|
|
196
|
+
def _build_cte(self, records: list[dict], name: str) -> CTE:
|
|
197
|
+
"""Build a CTE from a list of dictionaries."""
|
|
198
|
+
|
|
199
|
+
if self.dialect.name == "sqlite":
|
|
200
|
+
return self._build_cte_sqlite(records, name)
|
|
201
|
+
|
|
202
|
+
first_row = records[0]
|
|
203
|
+
columns = []
|
|
204
|
+
for key, value in first_row.items():
|
|
205
|
+
columns.append(
|
|
206
|
+
sqlalchemy_column(key, Integer if isinstance(value, int) else String)
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Prepare the data in tuples, in same order as columns
|
|
210
|
+
data = [tuple(record[column.name] for column in columns) for record in records]
|
|
211
|
+
|
|
212
|
+
return select(values(*columns, name=name).data(data)).cte(name)
|
|
213
|
+
|
|
181
214
|
def _filter_query(
|
|
182
215
|
self,
|
|
183
216
|
query,
|
|
@@ -194,7 +227,17 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
194
227
|
# return an empty DatasetCollection
|
|
195
228
|
return DatasetCollection()
|
|
196
229
|
|
|
197
|
-
|
|
230
|
+
dataset_ids_cte = self._build_cte(
|
|
231
|
+
[{"dataset_id": dataset_id} for dataset_id in set(dataset_id)],
|
|
232
|
+
"dataset_ids",
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
query = query.select_from(
|
|
236
|
+
dataset_table.join(
|
|
237
|
+
dataset_ids_cte,
|
|
238
|
+
dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
|
|
239
|
+
)
|
|
240
|
+
)
|
|
198
241
|
else:
|
|
199
242
|
query = query.filter(dataset_table.c.dataset_id == dataset_id)
|
|
200
243
|
|
|
@@ -214,17 +257,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
214
257
|
if not selectors:
|
|
215
258
|
raise ValueError("Selectors must contain at least one item")
|
|
216
259
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
# Define a virtual table using a CTE for all attributes
|
|
222
|
-
attribute_cte = union_all(
|
|
223
|
-
*[
|
|
224
|
-
select(*(literal(value).label(key) for key, value in attr_set))
|
|
225
|
-
for attr_set in attribute_sets
|
|
226
|
-
]
|
|
227
|
-
).cte("attributes")
|
|
260
|
+
attribute_cte = self._build_cte(
|
|
261
|
+
[selector.filtered_attributes for selector in selectors], "attributes"
|
|
262
|
+
)
|
|
228
263
|
|
|
229
264
|
keys = list(selectors[0].filtered_attributes.keys())
|
|
230
265
|
first_selector = selectors[0].filtered_attributes
|
|
@@ -265,15 +300,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
265
300
|
if not dataset_ids:
|
|
266
301
|
return []
|
|
267
302
|
|
|
303
|
+
dataset_ids_cte = self._build_cte(
|
|
304
|
+
[{"dataset_id": dataset_id} for dataset_id in set(dataset_ids)],
|
|
305
|
+
"dataset_ids",
|
|
306
|
+
)
|
|
307
|
+
|
|
268
308
|
dataset_rows = list(
|
|
269
|
-
self.session.query(dataset_table).
|
|
270
|
-
|
|
309
|
+
self.session.query(dataset_table).select_from(
|
|
310
|
+
dataset_table.join(
|
|
311
|
+
dataset_ids_cte,
|
|
312
|
+
dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
|
|
313
|
+
)
|
|
271
314
|
)
|
|
272
315
|
)
|
|
273
316
|
revisions_per_dataset = {}
|
|
274
317
|
rows = (
|
|
275
318
|
self.session.query(revision_table)
|
|
276
|
-
.
|
|
319
|
+
.select_from(
|
|
320
|
+
revision_table.join(
|
|
321
|
+
dataset_ids_cte,
|
|
322
|
+
dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
|
|
323
|
+
)
|
|
324
|
+
)
|
|
277
325
|
.order_by(revision_table.c.dataset_id)
|
|
278
326
|
)
|
|
279
327
|
|
|
@@ -285,7 +333,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
285
333
|
files_per_revision = {}
|
|
286
334
|
rows = (
|
|
287
335
|
self.session.query(file_table)
|
|
288
|
-
.
|
|
336
|
+
.select_from(
|
|
337
|
+
file_table.join(
|
|
338
|
+
dataset_ids_cte,
|
|
339
|
+
dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
|
|
340
|
+
)
|
|
341
|
+
)
|
|
289
342
|
.order_by(file_table.c.dataset_id, file_table.c.revision_id)
|
|
290
343
|
)
|
|
291
344
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2
RENAMED
|
File without changes
|
{ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/database/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|