ingestify 0.4.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.4.0 → ingestify-0.4.1}/PKG-INFO +1 -1
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/__init__.py +1 -1
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/repository.py +43 -30
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.4.0 → ingestify-0.4.1}/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/loader.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/cmdline.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/exceptions.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/main.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/server.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/source_base.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/utils.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/setup.cfg +0 -0
- {ingestify-0.4.0 → ingestify-0.4.1}/setup.py +0 -0
|
@@ -1,32 +1,31 @@
|
|
|
1
1
|
import itertools
|
|
2
|
-
import json
|
|
3
2
|
import uuid
|
|
4
|
-
from collections import defaultdict
|
|
5
3
|
from typing import Optional, Union, List
|
|
6
4
|
|
|
7
5
|
from sqlalchemy import (
|
|
8
6
|
create_engine,
|
|
9
7
|
func,
|
|
10
8
|
text,
|
|
11
|
-
tuple_,
|
|
12
9
|
Table,
|
|
13
|
-
insert,
|
|
14
|
-
Transaction,
|
|
15
10
|
Connection,
|
|
11
|
+
union_all,
|
|
12
|
+
literal,
|
|
13
|
+
select,
|
|
14
|
+
and_,
|
|
15
|
+
Column,
|
|
16
|
+
or_,
|
|
16
17
|
)
|
|
17
18
|
from sqlalchemy.engine import make_url
|
|
18
19
|
from sqlalchemy.exc import NoSuchModuleError
|
|
19
|
-
from sqlalchemy.orm import Session
|
|
20
|
+
from sqlalchemy.orm import Session
|
|
20
21
|
|
|
21
22
|
from ingestify.domain import File, Revision
|
|
22
23
|
from ingestify.domain.models import (
|
|
23
24
|
Dataset,
|
|
24
25
|
DatasetCollection,
|
|
25
26
|
DatasetRepository,
|
|
26
|
-
Identifier,
|
|
27
27
|
Selector,
|
|
28
28
|
)
|
|
29
|
-
from ingestify.domain.models.base import BaseModel
|
|
30
29
|
from ingestify.domain.models.dataset.collection_metadata import (
|
|
31
30
|
DatasetCollectionMetadata,
|
|
32
31
|
)
|
|
@@ -127,6 +126,10 @@ class SqlAlchemySessionProvider:
|
|
|
127
126
|
return self.session
|
|
128
127
|
|
|
129
128
|
|
|
129
|
+
def in_(column: Column, values):
|
|
130
|
+
return or_(*[column == value for value in values])
|
|
131
|
+
|
|
132
|
+
|
|
130
133
|
class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
131
134
|
def __init__(self, session_provider: SqlAlchemySessionProvider):
|
|
132
135
|
self.session_provider = session_provider
|
|
@@ -169,11 +172,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
169
172
|
dataset_id: Optional[Union[str, List[str]]] = None,
|
|
170
173
|
selector: Optional[Union[Selector, List[Selector]]] = None,
|
|
171
174
|
):
|
|
172
|
-
query = query.filter(dataset_table.c.bucket == bucket)
|
|
173
|
-
if dataset_type:
|
|
174
|
-
query = query.filter(dataset_table.c.dataset_type == dataset_type)
|
|
175
|
-
if provider:
|
|
176
|
-
query = query.filter(dataset_table.c.provider == provider)
|
|
177
175
|
if dataset_id is not None:
|
|
178
176
|
if isinstance(dataset_id, list):
|
|
179
177
|
if len(dataset_id) == 0:
|
|
@@ -181,7 +179,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
181
179
|
# return an empty DatasetCollection
|
|
182
180
|
return DatasetCollection()
|
|
183
181
|
|
|
184
|
-
query = query.filter(dataset_table.c.dataset_id.in_(dataset_id))
|
|
182
|
+
query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
|
|
185
183
|
else:
|
|
186
184
|
query = query.filter(dataset_table.c.dataset_id == dataset_id)
|
|
187
185
|
|
|
@@ -201,13 +199,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
201
199
|
if not selectors:
|
|
202
200
|
raise ValueError("Selectors must contain at least one item")
|
|
203
201
|
|
|
204
|
-
|
|
202
|
+
attribute_keys = selectors[
|
|
203
|
+
0
|
|
204
|
+
].filtered_attributes.keys() # Assume all selectors have the same keys
|
|
205
|
+
attribute_sets = {
|
|
206
|
+
tuple(selector.filtered_attributes.items()) for selector in selectors
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
# Define a virtual table using a CTE for all attributes
|
|
210
|
+
attribute_cte = union_all(
|
|
211
|
+
*[
|
|
212
|
+
select(*(literal(value).label(key) for key, value in attr_set))
|
|
213
|
+
for attr_set in attribute_sets
|
|
214
|
+
]
|
|
215
|
+
).cte("attributes")
|
|
205
216
|
|
|
206
|
-
|
|
217
|
+
keys = list(selectors[0].filtered_attributes.keys())
|
|
207
218
|
first_selector = selectors[0].filtered_attributes
|
|
208
219
|
|
|
209
|
-
|
|
210
|
-
# SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
|
|
220
|
+
join_conditions = []
|
|
211
221
|
for k in keys:
|
|
212
222
|
if dialect == "postgresql":
|
|
213
223
|
column = dataset_table.c.identifier[k]
|
|
@@ -215,25 +225,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
215
225
|
# Take the value from the first selector to determine the type.
|
|
216
226
|
# TODO: check all selectors to determine the type
|
|
217
227
|
v = first_selector[k]
|
|
218
|
-
if isint(v):
|
|
228
|
+
if isinstance(v, int):
|
|
219
229
|
column = column.as_integer()
|
|
220
|
-
elif isfloat(v):
|
|
221
|
-
column = column.as_float()
|
|
222
230
|
else:
|
|
223
231
|
column = column.as_string()
|
|
224
232
|
else:
|
|
225
233
|
column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
|
|
226
|
-
columns.append(column)
|
|
227
234
|
|
|
228
|
-
|
|
229
|
-
for selector in selectors:
|
|
230
|
-
filtered_attributes = selector.filtered_attributes
|
|
231
|
-
values.append(tuple([filtered_attributes[k] for k in keys]))
|
|
235
|
+
join_conditions.append(attribute_cte.c[k] == column)
|
|
232
236
|
|
|
233
|
-
query = query.filter(tuple_(*columns).in_(values))
|
|
237
|
+
query = query.select_from(
|
|
238
|
+
dataset_table.join(attribute_cte, and_(*join_conditions))
|
|
239
|
+
)
|
|
234
240
|
|
|
235
241
|
if where:
|
|
236
242
|
query = query.filter(text(where))
|
|
243
|
+
|
|
244
|
+
query = query.filter(dataset_table.c.bucket == bucket)
|
|
245
|
+
if dataset_type:
|
|
246
|
+
query = query.filter(dataset_table.c.dataset_type == dataset_type)
|
|
247
|
+
if provider:
|
|
248
|
+
query = query.filter(dataset_table.c.provider == provider)
|
|
249
|
+
|
|
237
250
|
return query
|
|
238
251
|
|
|
239
252
|
def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
|
|
@@ -242,13 +255,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
242
255
|
|
|
243
256
|
dataset_rows = list(
|
|
244
257
|
self.session.query(dataset_table).filter(
|
|
245
|
-
dataset_table.c.dataset_id.in_(dataset_ids)
|
|
258
|
+
in_(dataset_table.c.dataset_id, dataset_ids)
|
|
246
259
|
)
|
|
247
260
|
)
|
|
248
261
|
revisions_per_dataset = {}
|
|
249
262
|
rows = (
|
|
250
263
|
self.session.query(revision_table)
|
|
251
|
-
.filter(revision_table.c.dataset_id.in_(dataset_ids))
|
|
264
|
+
.filter(in_(revision_table.c.dataset_id, dataset_ids))
|
|
252
265
|
.order_by(revision_table.c.dataset_id)
|
|
253
266
|
)
|
|
254
267
|
|
|
@@ -260,7 +273,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
260
273
|
files_per_revision = {}
|
|
261
274
|
rows = (
|
|
262
275
|
self.session.query(file_table)
|
|
263
|
-
.filter(file_table.c.dataset_id.in_(dataset_ids))
|
|
276
|
+
.filter(in_(file_table.c.dataset_id, dataset_ids))
|
|
264
277
|
.order_by(file_table.c.dataset_id, file_table.c.revision_id)
|
|
265
278
|
)
|
|
266
279
|
|
|
@@ -14,6 +14,7 @@ from sqlalchemy import (
|
|
|
14
14
|
String,
|
|
15
15
|
Table,
|
|
16
16
|
TypeDecorator,
|
|
17
|
+
Index,
|
|
17
18
|
)
|
|
18
19
|
|
|
19
20
|
from sqlalchemy.dialects.postgresql import JSONB
|
|
@@ -167,6 +168,15 @@ dataset_table = Table(
|
|
|
167
168
|
Column("created_at", TZDateTime(6)),
|
|
168
169
|
Column("updated_at", TZDateTime(6)),
|
|
169
170
|
Column("last_modified_at", TZDateTime(6)),
|
|
171
|
+
# Required for performance querying when there are a lot of Datasets
|
|
172
|
+
# with the same provider and dataset_type
|
|
173
|
+
Index(
|
|
174
|
+
"idx_bucket_type_provider_last_modified",
|
|
175
|
+
"bucket",
|
|
176
|
+
"provider",
|
|
177
|
+
"dataset_type",
|
|
178
|
+
"last_modified_at",
|
|
179
|
+
),
|
|
170
180
|
)
|
|
171
181
|
|
|
172
182
|
revision_table = Table(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2
RENAMED
|
File without changes
|
{ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/database/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|