ingestify 0.4.0__tar.gz → 0.4.2__tar.gz
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in the public registry.
- {ingestify-0.4.0 → ingestify-0.4.2}/PKG-INFO +1 -1
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/__init__.py +1 -1
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/loader.py +8 -4
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/cmdline.py +10 -1
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job.py +14 -8
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py +13 -13
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py +101 -55
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/main.py +13 -6
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.4.0 → ingestify-0.4.2}/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/exceptions.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/server.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/source_base.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify/utils.py +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/setup.cfg +0 -0
- {ingestify-0.4.0 → ingestify-0.4.2}/setup.py +0 -0
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/application/loader.py
RENAMED

@@ -35,11 +35,8 @@ class Loader:
         provider: Optional[str] = None,
         source: Optional[str] = None,
     ):
-
-        selectors = {}
+        ingestion_plans = []
         for ingestion_plan in self.ingestion_plans:
-            logger.info(f"Determining selectors for {ingestion_plan}")
-
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
                     logger.info(
@@ -54,6 +51,13 @@ class Loader:
                     )
                     continue

+            ingestion_plans.append(ingestion_plan)
+
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
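
The reworked flow above filters the ingestion plans first and only then derives selectors in a second pass. A minimal standalone sketch of that two-pass pattern (the plan structure and names here are illustrative, not ingestify's actual models):

    # Sketch of the filter-then-collect pattern; dicts stand in for plans.
    plans = [
        {"provider": "statsbomb", "selectors": ["competition=1"]},
        {"provider": "wyscout", "selectors": ["competition=2"]},
    ]

    def collect_selectors(plans, provider=None):
        # Pass 1: keep only the plans that match the filter
        selected = [p for p in plans if provider is None or p["provider"] == provider]
        # Pass 2: determine selectors only for the plans that survived
        selectors = {}
        for plan in selected:
            selectors[plan["provider"]] = plan["selectors"]
        return selectors

    print(collect_selectors(plans, provider="wyscout"))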
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/cmdline.py
RENAMED

@@ -88,6 +88,14 @@ def cli():
     help="Source - only run tasks for a single source",
     type=str,
 )
+@click.option(
+    "--disable-events",
+    "disable_events",
+    required=False,
+    help="Disable events - disable all event handlers",
+    is_flag=True,
+    type=bool,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
     provider: Optional[str],
     source: Optional[str],
     debug: Optional[bool],
+    disable_events: Optional[bool],
 ):
     try:
-        engine = get_engine(config_file, bucket)
+        engine = get_engine(config_file, bucket, disable_events=disable_events)
     except ConfigurationError as e:
         if debug:
             raise
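
The new option uses click's standard boolean-flag pattern (`is_flag=True`), so passing `--disable-events` yields `True` and omitting it yields `False`. A self-contained sketch of the same wiring; the command body is illustrative, not ingestify's actual `run()`:

    import click

    @click.command()
    @click.option(
        "--disable-events",
        "disable_events",
        required=False,
        help="Disable events - disable all event handlers",
        is_flag=True,
        type=bool,
    )
    def run(disable_events: bool):
        # Prints True when invoked as: python sketch.py --disable-events
        click.echo(f"disable_events={disable_events}")

    if __name__ == "__main__":
        run()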
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job.py
RENAMED

@@ -218,7 +218,7 @@ class IngestionJob:
         # Process all items in batches. Yield a IngestionJobSummary per batch

         logger.info("Finding metadata")
-        with ingestion_job_summary.record_timing("
+        with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
                 provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         try:
+            logger.info(f"Finding datasets for selector={self.selector}")
             with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@ class IngestionJob:
             yield ingestion_job_summary
             return

+        logger.info("Starting tasks")
+
         finish_task_timer = ingestion_job_summary.start_timing("tasks")

         while True:
@@ -273,13 +276,16 @@ class IngestionJob:
                 for dataset_resource in batch
             ]

-
-
-
-            #
-
-
-
+            logger.info(f"Searching for existing Datasets for DatasetResources")
+
+            with ingestion_job_summary.record_timing("get_dataset_collection"):
+                # Load all available datasets based on the discovered dataset identifiers
+                dataset_collection = store.get_dataset_collection(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    # Assume all DatasetResources share the same provider
+                    provider=batch[0].provider,
+                    selector=dataset_identifiers,
+                )

             skipped_datasets = 0
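
The hunks above lean on `record_timing(...)` as a context manager around each phase. A minimal sketch of how such a timer could work; ingestify's actual Timing model may differ:

    import time
    from contextlib import contextmanager

    class TimingRecorder:
        def __init__(self):
            self.timings = []

        @contextmanager
        def record_timing(self, name: str):
            # Measure wall-clock time of the wrapped block and store it
            start = time.monotonic()
            try:
                yield
            finally:
                self.timings.append((name, time.monotonic() - start))

    recorder = TimingRecorder()
    with recorder.record_timing("find_datasets"):
        time.sleep(0.01)  # stand-in for source.find_datasets(...)
    print(recorder.timings)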
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED

@@ -66,7 +66,7 @@ class IngestionJobSummary(BaseModel, HasTiming):
         self.skipped_datasets += skipped_datasets

     def task_count(self):
-        return len(self.task_summaries)
+        return len(self.task_summaries) + self.skipped_datasets

     def _set_ended(self):
         self.failed_tasks = len(
@@ -106,22 +106,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
             f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
         )
         print("********************************")
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - IngestionPlan:")
+        print(f" Source: {self.source_name}")
+        print(f" Provider: {self.provider}")
+        print(f" DatasetType: {self.dataset_type}")
+        print(f" - Selector: {self.selector}")
+        print(f" - Timings: ")
         for timing in self.timings:
-            print(f"
+            print(f" - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f"
+            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )

-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - Failed tasks: {self.failed_tasks}")
+        print(f" - Successful tasks: {self.successful_tasks}")
+        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f" - Skipped datasets: {self.skipped_datasets}")
         print("********************************")

     def __enter__(self):
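
Note that `task_count()` now counts skipped datasets toward the total. A tiny illustration with made-up numbers:

    # Illustrative values only; in ingestify these come from the summary object.
    task_summaries = ["t1", "t2", "t3"]
    skipped_datasets = 5

    def task_count():
        return len(task_summaries) + skipped_datasets

    print(task_count())  # 8: 3 executed tasks + 5 skipped datasets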
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py
RENAMED

@@ -1,32 +1,33 @@
 import itertools
-import
+import logging
 import uuid
-from collections import defaultdict
 from typing import Optional, Union, List

 from sqlalchemy import (
     create_engine,
     func,
     text,
-    tuple_,
     Table,
-    insert,
-    Transaction,
     Connection,
+    union_all,
+    literal,
+    select,
+    and_,
+    Column,
+    or_,
+    Dialect,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session,
+from sqlalchemy.orm import Session, Query

 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
-    Identifier,
     Selector,
 )
-from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.collection_metadata import (
     DatasetCollectionMetadata,
 )
@@ -43,6 +44,8 @@ from .tables import (
     task_summary_table,
 )

+logger = logging.getLogger(__name__)
+

 def parse_value(v):
     try:
@@ -94,6 +97,7 @@ class SqlAlchemySessionProvider:
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
         )
+        self.dialect = self.engine.dialect
         self.session = Session(bind=self.engine)

     def __init__(self, url: str):
@@ -111,22 +115,26 @@ class SqlAlchemySessionProvider:
         self.url = state["url"]
         self._init_engine()

-    def _close_engine(self):
-        if hasattr(self, "session"):
-            self.session.close()
-        self.engine.dispose()
-
     def __del__(self):
-        self._close_engine()
+        self.close()

     def reset(self):
-        self._close_engine()
+        self.close()
         self._init_engine()

+    def close(self):
+        if hasattr(self, "session"):
+            self.session.close()
+        self.engine.dispose()
+
     def get(self):
         return self.session


+def in_(column: Column, values):
+    return or_(*[column == value for value in values])
+
+
 class SqlAlchemyDatasetRepository(DatasetRepository):
     def __init__(self, session_provider: SqlAlchemySessionProvider):
         self.session_provider = session_provider
@@ -135,8 +143,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def session(self):
         return self.session_provider.get()

+    @property
+    def dialect(self) -> Dialect:
+        return self.session_provider.dialect
+
     def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
-        dialect = self.
+        dialect = self.dialect.name
         if dialect == "mysql":
             from sqlalchemy.dialects.mysql import insert
         elif dialect == "postgresql":
@@ -169,11 +181,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
     ):
-        query = query.filter(dataset_table.c.bucket == bucket)
-        if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
-        if provider:
-            query = query.filter(dataset_table.c.provider == provider)
         if dataset_id is not None:
             if isinstance(dataset_id, list):
                 if len(dataset_id) == 0:
@@ -181,11 +188,11 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     # return an empty DatasetCollection
                     return DatasetCollection()

-                query = query.filter(dataset_table.c.dataset_id
+                query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
             else:
                 query = query.filter(dataset_table.c.dataset_id == dataset_id)

-        dialect = self.
+        dialect = self.dialect.name

         if not isinstance(selector, list):
             where, selector = selector.split("where")
@@ -201,13 +208,22 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not selectors:
             raise ValueError("Selectors must contain at least one item")

-
+        attribute_sets = {
+            tuple(selector.filtered_attributes.items()) for selector in selectors
+        }
+
+        # Define a virtual table using a CTE for all attributes
+        attribute_cte = union_all(
+            *[
+                select(*(literal(value).label(key) for key, value in attr_set))
+                for attr_set in attribute_sets
+            ]
+        ).cte("attributes")

-
+        keys = list(selectors[0].filtered_attributes.keys())
         first_selector = selectors[0].filtered_attributes

-
-        # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
+        join_conditions = []
         for k in keys:
             if dialect == "postgresql":
                 column = dataset_table.c.identifier[k]
@@ -215,40 +231,43 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                 # Take the value from the first selector to determine the type.
                 # TODO: check all selectors to determine the type
                 v = first_selector[k]
-                if
+                if isinstance(v, int):
                     column = column.as_integer()
-                elif isfloat(v):
-                    column = column.as_float()
                 else:
                     column = column.as_string()
             else:
                 column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
-            columns.append(column)

-
-        for selector in selectors:
-            filtered_attributes = selector.filtered_attributes
-            values.append(tuple([filtered_attributes[k] for k in keys]))
+            join_conditions.append(attribute_cte.c[k] == column)

-        query = query.
+        query = query.select_from(
+            dataset_table.join(attribute_cte, and_(*join_conditions))
+        )

         if where:
             query = query.filter(text(where))
+
+        query = query.filter(dataset_table.c.bucket == bucket)
+        if dataset_type:
+            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+        if provider:
+            query = query.filter(dataset_table.c.provider == provider)
+
         return query

-    def
+    def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
         if not dataset_ids:
             return []

         dataset_rows = list(
             self.session.query(dataset_table).filter(
-                dataset_table.c.dataset_id
+                in_(dataset_table.c.dataset_id, dataset_ids)
             )
         )
         revisions_per_dataset = {}
         rows = (
             self.session.query(revision_table)
-            .filter(revision_table.c.dataset_id
+            .filter(in_(revision_table.c.dataset_id, dataset_ids))
             .order_by(revision_table.c.dataset_id)
         )

@@ -260,7 +279,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         files_per_revision = {}
         rows = (
             self.session.query(file_table)
-            .filter(file_table.c.dataset_id
+            .filter(in_(file_table.c.dataset_id, dataset_ids))
             .order_by(file_table.c.dataset_id, file_table.c.revision_id)
         )

@@ -290,6 +309,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
         return datasets

+    def _debug_query(self, q: Query):
+        text_ = q.statement.compile(
+            compile_kwargs={"literal_binds": True}, dialect=self.dialect
+        )
+        logger.debug(f"Running query: {text_}")
+
     def get_dataset_collection(
         self,
         bucket: str,
@@ -309,22 +334,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             selector=selector,
         )

-
-
-            self.session.query(dataset_table.c.dataset_id)
-        )
-        dataset_ids = [row.dataset_id for row in dataset_query]
-        datasets = self.load_datasets(dataset_ids)
-        else:
-            datasets = []
+        with self.session:
+            # Use a contextmanager to make sure it's closed afterwards

-
-
-
-
-
-
-
+            if not metadata_only:
+                dataset_query = apply_query_filter(
+                    self.session.query(dataset_table.c.dataset_id)
+                )
+                self._debug_query(dataset_query)
+                dataset_ids = [row.dataset_id for row in dataset_query]
+                datasets = self._load_datasets(dataset_ids)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    last_modified=max(dataset.last_modified_at for dataset in datasets)
+                    if datasets
+                    else None,
+                    row_count=len(datasets),
+                )
+            else:
+                datasets = []
+
+                metadata_result_query = apply_query_filter(
+                    self.session.query(
+                        func.max(dataset_table.c.last_modified_at).label(
+                            "last_modified_at"
+                        ),
+                        func.count().label("row_count"),
+                    )
+                )
+
+                self._debug_query(metadata_result_query)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    *metadata_result_query.first()
+                )

         return DatasetCollection(dataset_collection_metadata, datasets)

@@ -337,6 +380,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def connect(self):
         return self.session_provider.engine.connect()

+    def __del__(self):
+        self.session_provider.close()
+
     def _save(self, datasets: list[Dataset]):
         """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
         datasets_entities = []
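
Two techniques stand out above: the module-level `in_()` helper expands a value list into OR'ed equality comparisons (presumably to sidestep dialect-specific limits on `IN` with many bind parameters), and selector matching now joins the dataset table against a `UNION ALL` CTE of literal rows, one row per selector. A self-contained sketch of the CTE approach against a simplified schema; table and column names are stand-ins, not ingestify's actual schema:

    from sqlalchemy import (
        Column, Integer, MetaData, String, Table, and_, create_engine,
        insert, literal, select, union_all,
    )

    metadata = MetaData()
    dataset = Table(
        "dataset", metadata,
        Column("dataset_id", String, primary_key=True),
        Column("competition_id", Integer),
        Column("season_id", Integer),
    )

    engine = create_engine("sqlite://")
    metadata.create_all(engine)

    selectors = [
        {"competition_id": 11, "season_id": 42},
        {"competition_id": 37, "season_id": 90},
    ]

    # One SELECT of labeled literals per selector, UNION ALL'ed into a
    # virtual "attributes" table
    attributes = union_all(
        *[
            select(*(literal(v).label(k) for k, v in s.items()))
            for s in selectors
        ]
    ).cte("attributes")

    # Join the real table against the virtual one on every selector key
    query = select(dataset.c.dataset_id).select_from(
        dataset.join(
            attributes,
            and_(
                dataset.c.competition_id == attributes.c.competition_id,
                dataset.c.season_id == attributes.c.season_id,
            ),
        )
    )

    with engine.begin() as conn:
        conn.execute(insert(dataset), [
            {"dataset_id": "a", "competition_id": 11, "season_id": 42},
            {"dataset_id": "b", "competition_id": 99, "season_id": 1},
        ])
        print(conn.execute(query).fetchall())  # only ("a",) matches

Compared to the removed tuple-`IN` approach (`WHERE (col1, col2) IN ((1, 2), ...)`), the join form works uniformly across dialects that lack tuple comparisons.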
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/infra/store/dataset/sqlalchemy/tables.py
RENAMED

@@ -14,6 +14,7 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Index,
 )

 from sqlalchemy.dialects.postgresql import JSONB
@@ -167,6 +168,15 @@ dataset_table = Table(
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
     Column("last_modified_at", TZDateTime(6)),
+    # Required for performance querying when there are a lot of Datasets
+    # with the same provider and dataset_type
+    Index(
+        "idx_bucket_type_provider_last_modified",
+        "bucket",
+        "provider",
+        "dataset_type",
+        "last_modified_at",
+    ),
 )

 revision_table = Table(
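
Declaring an `Index` inline in the `Table(...)` call creates the composite index alongside the table. A quick standalone way to see the DDL it produces, with simplified column types as stand-ins:

    from sqlalchemy import (
        Column, DateTime, Index, MetaData, String, Table, create_engine,
    )

    metadata = MetaData()
    dataset = Table(
        "dataset", metadata,
        Column("bucket", String),
        Column("provider", String),
        Column("dataset_type", String),
        Column("last_modified_at", DateTime),
        Index(
            "idx_bucket_type_provider_last_modified",
            "bucket", "provider", "dataset_type", "last_modified_at",
        ),
    )

    # echo=True logs the emitted DDL, including:
    # CREATE INDEX idx_bucket_type_provider_last_modified
    #     ON dataset (bucket, provider, dataset_type, last_modified_at)
    engine = create_engine("sqlite://", echo=True)
    metadata.create_all(engine)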
{ingestify-0.4.0 → ingestify-0.4.2}/ingestify/main.py
RENAMED

@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
     return import_cls(key)


-def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
+def get_engine(
+    config_file, bucket: Optional[str] = None, disable_events: bool = False
+) -> IngestionEngine:
     config = parse_config(config_file, default_value="")

     logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:

     # Setup an EventBus and wire some more components
     event_bus = EventBus()
-    publisher = Publisher()
-    for subscriber in config.get("event_subscribers", []):
-        cls = get_event_subscriber_cls(subscriber["type"])
-        publisher.add_subscriber(cls(store))
-    event_bus.register(publisher)
+    if not disable_events:
+        # When we disable all events we don't register any publishers
+        publisher = Publisher()
+        for subscriber in config.get("event_subscribers", []):
+            cls = get_event_subscriber_cls(subscriber["type"])
+            publisher.add_subscriber(cls(store))
+        event_bus.register(publisher)
+    else:
+        logger.info("Disabling all event handlers")

     store.set_event_bus(event_bus)

     ingestion_engine = IngestionEngine(
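
With the extended signature, callers can opt out of event handling entirely. Hypothetical usage of `get_engine` as defined in main.py above (a real ingestify config file is required; "config.yaml" and the bucket name are placeholders):

    from ingestify.main import get_engine

    # All event subscribers declared in the config are skipped:
    engine = get_engine("config.yaml", disable_events=True)

    # Default behaviour is unchanged; events stay enabled:
    engine = get_engine("config.yaml", bucket="main")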