ingestify 0.5.1__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.5.1 → ingestify-0.6.1}/PKG-INFO +1 -1
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/__init__.py +1 -1
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/application/loader.py +8 -2
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/data_spec_version_collection.py +0 -1
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/ingestion/ingestion_job.py +8 -6
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/dataset/sqlalchemy/repository.py +50 -25
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.5.1 → ingestify-0.6.1}/README.md +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/application/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/cmdline.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/exceptions.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/main.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/server.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/source_base.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify/utils.py +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/setup.cfg +0 -0
- {ingestify-0.5.1 → ingestify-0.6.1}/setup.py +0 -0
|
@@ -155,13 +155,19 @@ class Loader:
|
|
|
155
155
|
|
|
156
156
|
IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
|
|
157
157
|
"""
|
|
158
|
-
|
|
158
|
+
|
|
159
|
+
ingestion_job_prefix = str(uuid.uuid1())
|
|
160
|
+
for ingestion_job_idx, (ingestion_plan, selector) in enumerate(
|
|
161
|
+
selectors.values()
|
|
162
|
+
):
|
|
159
163
|
logger.info(
|
|
160
164
|
f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
|
|
161
165
|
)
|
|
162
166
|
|
|
163
167
|
ingestion_job = IngestionJob(
|
|
164
|
-
|
|
168
|
+
# Create a combined IngestionJobId.
|
|
169
|
+
# This allows us to group all IngestionJobs within the same run
|
|
170
|
+
ingestion_job_id=f"{ingestion_job_prefix}.{ingestion_job_idx}",
|
|
165
171
|
ingestion_plan=ingestion_plan,
|
|
166
172
|
selector=selector,
|
|
167
173
|
)
|
|
@@ -24,9 +24,7 @@ from ingestify.utils import TaskExecutor, chunker
|
|
|
24
24
|
logger = logging.getLogger(__name__)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
# a compound select, which breaks at more than 500 select statements
|
|
29
|
-
DEFAULT_CHUNK_SIZE = 500
|
|
27
|
+
DEFAULT_CHUNK_SIZE = 1000
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
def run_task(task):
|
|
@@ -257,10 +255,14 @@ class IngestionJob:
|
|
|
257
255
|
finish_task_timer = ingestion_job_summary.start_timing("tasks")
|
|
258
256
|
|
|
259
257
|
while True:
|
|
258
|
+
logger.info(f"Finding next batch of datasets for selector={self.selector}")
|
|
259
|
+
|
|
260
260
|
try:
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
261
|
+
with ingestion_job_summary.record_timing("find_datasets"):
|
|
262
|
+
try:
|
|
263
|
+
batch = next(batches)
|
|
264
|
+
except StopIteration:
|
|
265
|
+
break
|
|
264
266
|
except Exception as e:
|
|
265
267
|
logger.exception("Failed to fetch next batch")
|
|
266
268
|
|
|
@@ -13,9 +13,12 @@ from sqlalchemy import (
|
|
|
13
13
|
literal,
|
|
14
14
|
select,
|
|
15
15
|
and_,
|
|
16
|
-
Column,
|
|
17
|
-
or_,
|
|
18
16
|
Dialect,
|
|
17
|
+
values,
|
|
18
|
+
CTE,
|
|
19
|
+
column as sqlalchemy_column,
|
|
20
|
+
Integer,
|
|
21
|
+
String,
|
|
19
22
|
)
|
|
20
23
|
from sqlalchemy.engine import make_url
|
|
21
24
|
from sqlalchemy.exc import NoSuchModuleError
|
|
@@ -174,6 +177,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
174
177
|
|
|
175
178
|
connection.execute(stmt)
|
|
176
179
|
|
|
180
|
+
def _build_cte_sqlite(self, records, name: str) -> CTE:
|
|
181
|
+
"""SQLite has a limit of 500 compound select statements. When we have more records,
|
|
182
|
+
create a nested CTE"""
|
|
183
|
+
if len(records) > 500:
|
|
184
|
+
return union_all(
|
|
185
|
+
select(self._build_cte_sqlite(records[:500], name + "1")),
|
|
186
|
+
select(self._build_cte_sqlite(records[500:], name + "2")),
|
|
187
|
+
).cte(name)
|
|
188
|
+
|
|
189
|
+
return union_all(
|
|
190
|
+
*[
|
|
191
|
+
select(*(literal(value).label(key) for key, value in record.items()))
|
|
192
|
+
for record in records
|
|
193
|
+
]
|
|
194
|
+
).cte(name)
|
|
195
|
+
|
|
196
|
+
def _build_cte(self, records: list[dict], name: str) -> CTE:
|
|
197
|
+
"""Build a CTE from a list of dictionaries."""
|
|
198
|
+
|
|
199
|
+
if self.dialect.name == "sqlite":
|
|
200
|
+
return self._build_cte_sqlite(records, name)
|
|
201
|
+
|
|
202
|
+
first_row = records[0]
|
|
203
|
+
columns = []
|
|
204
|
+
for key, value in first_row.items():
|
|
205
|
+
columns.append(
|
|
206
|
+
sqlalchemy_column(key, Integer if isinstance(value, int) else String)
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Prepare the data in tuples, in same order as columns
|
|
210
|
+
data = [tuple(record[column.name] for column in columns) for record in records]
|
|
211
|
+
|
|
212
|
+
return select(values(*columns, name=name).data(data)).cte(name)
|
|
213
|
+
|
|
177
214
|
def _filter_query(
|
|
178
215
|
self,
|
|
179
216
|
query,
|
|
@@ -190,12 +227,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
190
227
|
# return an empty DatasetCollection
|
|
191
228
|
return DatasetCollection()
|
|
192
229
|
|
|
193
|
-
dataset_ids_cte =
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
]
|
|
198
|
-
).cte("dataset_ids")
|
|
230
|
+
dataset_ids_cte = self._build_cte(
|
|
231
|
+
[{"dataset_id": dataset_id} for dataset_id in set(dataset_id)],
|
|
232
|
+
"dataset_ids",
|
|
233
|
+
)
|
|
199
234
|
|
|
200
235
|
query = query.select_from(
|
|
201
236
|
dataset_table.join(
|
|
@@ -222,17 +257,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
222
257
|
if not selectors:
|
|
223
258
|
raise ValueError("Selectors must contain at least one item")
|
|
224
259
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
# Define a virtual table using a CTE for all attributes
|
|
230
|
-
attribute_cte = union_all(
|
|
231
|
-
*[
|
|
232
|
-
select(*(literal(value).label(key) for key, value in attr_set))
|
|
233
|
-
for attr_set in attribute_sets
|
|
234
|
-
]
|
|
235
|
-
).cte("attributes")
|
|
260
|
+
attribute_cte = self._build_cte(
|
|
261
|
+
[selector.filtered_attributes for selector in selectors], "attributes"
|
|
262
|
+
)
|
|
236
263
|
|
|
237
264
|
keys = list(selectors[0].filtered_attributes.keys())
|
|
238
265
|
first_selector = selectors[0].filtered_attributes
|
|
@@ -273,12 +300,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
273
300
|
if not dataset_ids:
|
|
274
301
|
return []
|
|
275
302
|
|
|
276
|
-
dataset_ids_cte =
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
]
|
|
281
|
-
).cte("dataset_ids")
|
|
303
|
+
dataset_ids_cte = self._build_cte(
|
|
304
|
+
[{"dataset_id": dataset_id} for dataset_id in set(dataset_ids)],
|
|
305
|
+
"dataset_ids",
|
|
306
|
+
)
|
|
282
307
|
|
|
283
308
|
dataset_rows = list(
|
|
284
309
|
self.session.query(dataset_table).select_from(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.1 → ingestify-0.6.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2
RENAMED
|
File without changes
|
{ingestify-0.5.1 → ingestify-0.6.1}/ingestify/static/templates/statsbomb_github/database/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|