ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +47 -36
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +71 -241
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +29 -28
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +64 -25
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED

@@ -29,22 +29,6 @@ def parse_value(v):
         return v
 
 
-def json_serializer(o):
-    return json.dumps(o)
-
-
-def json_deserializer(o):
-    o = json.loads(o)
-    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
-    o = Identifier(**o)
-    return o
-
-
-# @compiles(DateTime, "mysql")
-# def compile_datetime_mysql(type_, compiler, **kw):
-#     return "DATETIME(6)"
-
-
 def isfloat(x):
     try:
         a = float(x)
@@ -64,7 +48,7 @@ def isint(x):
         return a == b
 
 
-class SqlAlchemyDatasetRepository(DatasetRepository):
+class SqlAlchemySessionProvider:
     @staticmethod
     def fix_url(url: str) -> str:
         if url.startswith("postgres://"):
@@ -87,8 +71,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
-            json_serializer=json_serializer,
-            json_deserializer=json_deserializer,
         )
         self.session = Session(bind=self.engine)
 
@@ -107,9 +89,29 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         self.url = state["url"]
         self._init_engine()
 
+    def _close_engine(self):
+        if hasattr(self, "session"):
+            self.session.close()
+            self.engine.dispose()
+
     def __del__(self):
-        self.
-
+        self._close_engine()
+
+    def reset(self):
+        self._close_engine()
+        self._init_engine()
+
+    def get(self):
+        return self.session
+
+
+class SqlAlchemyDatasetRepository(DatasetRepository):
+    def __init__(self, session_provider: SqlAlchemySessionProvider):
+        self.session_provider = session_provider
+
+    @property
+    def session(self):
+        return self.session_provider.get()
 
     def _filter_query(
         self,
@@ -207,9 +209,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
 
         if not metadata_only:
-            dataset_query = apply_query_filter(
-                self.session.query(Dataset).options(joinedload(Dataset.revisions))
-            )
+            dataset_query = apply_query_filter(self.session.query(Dataset))
             datasets = list(dataset_query)
         else:
            datasets = []
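Taken together, these hunks move engine and session lifecycle out of SqlAlchemyDatasetRepository into the new SqlAlchemySessionProvider; the repository now resolves its session through the provider. A minimal sketch of the new wiring, assuming only what the hunks above and the main.py call sites below show (the sqlite URL is illustrative):

```python
# Sketch of the 0.3.0 session wiring; the sqlite URL is illustrative.
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

# One provider owns the engine + session; repositories borrow the session
# lazily through the provider's get().
session_provider = SqlAlchemySessionProvider("sqlite:///metadata.db")
dataset_repository = SqlAlchemyDatasetRepository(session_provider)

# reset() closes the session, disposes the engine and re-initializes it;
# the diff does not show its call sites.
session_provider.reset()
```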
ingestify/infra/store/file/local_file_repository.py CHANGED

@@ -19,14 +19,12 @@ class LocalFileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        path = self.get_path(bucket, dataset, revision_id, filename)
+        path = self.get_write_path(bucket, dataset, revision_id, filename)
         path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(path, "wb") as fp:
             shutil.copyfileobj(stream, fp)
         return path
 
-    def load_content(
-        self,
-    ) -> BinaryIO:
-        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
+    def load_content(self, storage_path: str) -> BinaryIO:
+        return open(self.get_read_path(storage_path), "rb")
ingestify/infra/store/file/s3_file_repository.py CHANGED

@@ -8,10 +8,7 @@ from ingestify.domain.models import FileRepository
 
 
 class S3FileRepository(FileRepository):
-    def __init__(self, url: str):
-        super().__init__(url)
-
-        self._s3 = None
+    _s3 = None
 
     @property
     def s3(self):
@@ -30,16 +27,14 @@ class S3FileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+        key = self.get_write_path(bucket, dataset, revision_id, filename)
         s3_bucket = Path(key.parts[0])
 
         self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
         return key
 
-    def load_content(
-        self,
-    ) -> BinaryIO:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+    def load_content(self, storage_path: str) -> BinaryIO:
+        key = self.get_read_path(storage_path)
         s3_bucket = Path(key.parts[0])
         return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
             "Body"
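Both file repositories change the same contract: writes still derive a location from (bucket, dataset, revision_id, filename) via get_write_path, while load_content now takes the storage path recorded at write time. A hedged sketch of the read side; the constructor kwargs mirror the main.py call sites below, and the concrete URL and path values are made up:

```python
# Sketch: load_content is now addressed by a stored path, not by dataset
# coordinates. The URL and storage path here are made up; in practice the
# storage path is whatever was recorded when the file was written.
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.infra import LocalFileRepository

repo = LocalFileRepository(
    url="file:///tmp/ingestify-files",
    identifier_transformer=IdentifierTransformer(),
)

stream = repo.load_content(storage_path="/tmp/ingestify-files/main/events.json")
content = stream.read()
```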
ingestify/main.py CHANGED

@@ -11,19 +11,21 @@ from ingestify import Source
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.application.ingestion_engine import IngestionEngine
 from ingestify.application.secrets_manager import SecretsManager
-from ingestify.domain import Selector
-from ingestify.domain.models import (
-    dataset_repository_factory,
-    file_repository_factory,
-)
+from ingestify.domain import Selector, FileRepository
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.event import EventBus, Publisher, Subscriber
 
-from ingestify.domain.models.extract_job import ExtractJob
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
+from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
 from ingestify.exceptions import ConfigurationError
+from ingestify.infra import S3FileRepository, LocalFileRepository
+from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
+from ingestify.infra.store.dataset.sqlalchemy.repository import (
+    SqlAlchemySessionProvider,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -59,8 +61,23 @@ def import_cls(name):
     return getattr(mod, components[-1])
 
 
+def build_file_repository(file_url: str, identifier_transformer) -> FileRepository:
+    if file_url.startswith("s3://"):
+        repository = S3FileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    elif file_url.startswith("file://"):
+        repository = LocalFileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    else:
+        raise Exception(f"Cannot find repository to handle file {file_url}")
+
+    return repository
+
+
 def get_dataset_store_by_urls(
-    dataset_url: str, file_url: str, bucket: str
+    metadata_url: str, file_url: str, bucket: str, dataset_types
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
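build_file_repository replaces the registry-driven file_repository_factory (whose machinery is removed from ingestify/utils.py further down) with an explicit URL-scheme check. A usage sketch, importing the function from ingestify.main where the hunk defines it; the URLs are illustrative:

```python
# Scheme dispatch instead of a registry lookup; URLs are illustrative.
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.main import build_file_repository

transformer = IdentifierTransformer()

s3_repo = build_file_repository("s3://my-bucket/files", identifier_transformer=transformer)
local_repo = build_file_repository("file:///var/lib/ingestify", identifier_transformer=transformer)

# Any other scheme raises:
#   Exception: Cannot find repository to handle file ftp://example.com/files
```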
@@ -68,15 +85,30 @@ def get_dataset_store_by_urls(
     if not bucket:
         raise Exception("Bucket is not specified")
 
-    file_repository = file_repository_factory.build_if_supports(url=file_url)
+    identifier_transformer = IdentifierTransformer()
+    for dataset_type in dataset_types:
+        for id_key, id_config in dataset_type["identifier_keys"].items():
+            identifier_transformer.register_transformation(
+                provider=dataset_type["provider"],
+                dataset_type=dataset_type["dataset_type"],
+                id_key=id_key,
+                transformation=id_config["transformation"],
+            )
+
+    file_repository = build_file_repository(
+        file_url, identifier_transformer=identifier_transformer
+    )
+
+    if secrets_manager.supports(metadata_url):
+        metadata_url = secrets_manager.load_as_db_url(metadata_url)
+
+    if metadata_url.startswith("postgres://"):
+        metadata_url = metadata_url.replace("postgress://", "postgress+")
 
-    if secrets_manager.supports(dataset_url):
-        dataset_url = secrets_manager.load_as_db_url(dataset_url)
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
 
-    if dataset_url.startswith("postgres://"):
-        dataset_url = dataset_url.replace("postgress://", "postgress+")
+    dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
-    dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
     return DatasetStore(
         dataset_repository=dataset_repository,
         file_repository=file_repository,
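The new dataset_types argument feeds the IdentifierTransformer registrations. The diff only shows how the entries are consumed, so the shape below is inferred from that loop; every concrete value is made up:

```python
# Shape inferred from the registration loop above; all values are invented.
dataset_types = [
    {
        "provider": "statsbomb",
        "dataset_type": "match",
        "identifier_keys": {
            # The transformation spec format is defined by IdentifierTransformer
            # and is not shown in this diff.
            "match_id": {"transformation": "..."},
        },
    }
]
```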
@@ -88,14 +120,15 @@ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")
 
     return get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
 
 def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
-    return get_dataset_store_by_urls(
+    return get_dataset_store_by_urls(metadata_url=url, file_url=url, bucket=bucket)
 
 
 def get_source_cls(key: str) -> Type[Source]:
@@ -155,9 +188,10 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     logger.info("Initializing IngestionEngine")
     store = get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
     # Setup an EventBus and wire some more components
@@ -173,19 +207,24 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
         store=store,
     )
 
-    logger.info("
+    logger.info("Adding IngestionPlans...")
 
     fetch_policy = FetchPolicy()
 
-
+    # Previous naming
+    ingestion_plans = config.get("extract_jobs", [])
+    # New naming
+    ingestion_plans.extend(config.get("ingestion_plans", []))
+
+    for ingestion_plan in ingestion_plans:
         data_spec_versions = DataSpecVersionCollection.from_dict(
-
+            ingestion_plan.get("data_spec_versions", {"default": {"v1"}})
         )
 
-        if "selectors" in
+        if "selectors" in ingestion_plan:
             selectors = [
                 Selector.build(selector, data_spec_versions=data_spec_versions)
-                for selector_args in
+                for selector_args in ingestion_plan["selectors"]
                 for selector in _product_selectors(selector_args)
             ]
         else:
@@ -193,13 +232,13 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
             # but makes it easier later one where we loop over selectors.
             selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
 
-
-            source=sources[
-            dataset_type=
+        ingestion_plan = IngestionPlan(
+            source=sources[ingestion_plan["source"]],
+            dataset_type=ingestion_plan["dataset_type"],
             selectors=selectors,
             fetch_policy=fetch_policy,
             data_spec_versions=data_spec_versions,
         )
-        ingestion_engine.
+        ingestion_engine.add_ingestion_plan(ingestion_plan)
 
     return ingestion_engine
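get_engine now accepts both the old extract_jobs key and the new ingestion_plans key and merges them, so existing configs keep working while new ones adopt the IngestionPlan naming. A sketch of the parsed config structure these hunks imply, expressed as the Python structure the YAML would parse into; all values are illustrative:

```python
# Config structure implied by the get_engine hunks; values are made up.
config = {
    "main": {
        "metadata_url": "sqlite:///metadata.db",
        "file_url": "file:///tmp/ingestify-files",
        "default_bucket": "main",
    },
    "dataset_types": [],  # optional; feeds the IdentifierTransformer registration
    # "extract_jobs": [...],  # previous naming, still honoured and read first
    "ingestion_plans": [  # new naming, appended after any extract_jobs
        {
            "source": "statsbomb_github",
            "dataset_type": "match",
            "data_spec_versions": {"default": {"v1"}},
            "selectors": [{"competition_id": 11, "season_id": 90}],
        }
    ],
}
```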
ingestify/utils.py CHANGED

@@ -1,4 +1,5 @@
 import abc
+import asyncio
 import inspect
 import logging
 import os
@@ -8,7 +9,19 @@ from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import
+from typing import (
+    Dict,
+    Generic,
+    Type,
+    TypeVar,
+    Tuple,
+    Optional,
+    Any,
+    Callable,
+    Awaitable,
+    List,
+    Iterable,
+)
 
 import cloudpickle
 from typing_extensions import Self
@@ -39,83 +52,6 @@ def sanitize_exception_message(exception_message):
     return sanitized_message
 
 
-class ComponentRegistry:
-    def __init__(self):
-        self.__registered_components = {}
-
-        class _Registered(abc.ABCMeta):
-            def __new__(mcs, cls_name, bases, class_dict):
-                class_dict["name"] = cls_name
-                component_cls = super(_Registered, mcs).__new__(
-                    mcs, cls_name, bases, class_dict
-                )
-                if not inspect.isabstract(component_cls):
-                    self.register_component(cls_name, component_cls)
-                else:
-                    if bases[0] != abc.ABC:
-                        raise Exception(
-                            f"Class '{cls_name}' seems to be an concrete class, but missing some abstract methods"
-                        )
-                return component_cls
-
-        self.__metaclass = _Registered
-
-    @property
-    def metaclass(self):
-        return self.__metaclass
-
-    def register_component(self, cls_name, component_cls):
-        self.__registered_components[cls_name] = component_cls
-
-    def get_component(self, cls_name: str):
-        return self.__registered_components[cls_name]
-
-    def get_supporting_component(self, **kwargs) -> str:
-        for cls_name, class_ in self.__registered_components.items():
-            if not hasattr(class_, "supports"):
-                raise Exception(
-                    f"Class '{cls_name}' does not implemented a 'supports' classmethod. "
-                    f"This is required when using 'get_supporting_component'."
-                )
-
-            if class_.supports(**kwargs):
-                return cls_name
-
-        kwargs_str = sanitize_exception_message(str(kwargs))
-        raise Exception(f"No supporting class found for {kwargs_str}")
-
-
-T = TypeVar("T")
-R = TypeVar("R")
-
-
-class ComponentFactory(Generic[T]):
-    def __init__(self, registry: ComponentRegistry):
-        self.registry = registry
-
-    @classmethod
-    def build_factory(
-        cls, component_cls: Type[R], registry: ComponentRegistry
-    ) -> "ComponentFactory[R]":
-        return cls[component_cls](registry)
-
-    def build(self, cls_name, **kwargs) -> T:
-        component_cls = self.registry.get_component(cls_name)
-        try:
-            return component_cls.from_dict(**kwargs)
-        except AttributeError:
-            pass
-        try:
-            return component_cls(**kwargs)
-        except TypeError as e:
-            raise e
-            # raise TypeError(f"Could not initialize {cls_name}")
-
-    def build_if_supports(self, **kwargs) -> T:
-        cls_name = self.registry.get_supporting_component(**kwargs)
-        return self.build(cls_name, **kwargs)
-
-
 def key_from_dict(d: dict) -> str:
     return "/".join([f"{k}={v}" for k, v in sorted(d.items()) if not k.startswith("_")])
 
@@ -270,6 +206,7 @@ class TaskExecutor:
         logger.info(
             f"Finished {len(res)} tasks in {took:.1f} seconds. {(len(res)/took):.1f} tasks/sec"
         )
+        return res
 
     def join(self):
         self.pool.close()
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.1.3
+Version: 0.3.0
 Summary: Standardizing soccer tracking- and event data
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -16,6 +16,7 @@ Requires-Dist: python-dotenv
 Requires-Dist: pyaml-env
 Requires-Dist: boto3
 Requires-Dist: pytz
+Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
 Requires-Dist: pytest<7,>=6.2.5; extra == "test"
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD CHANGED

@@ -1,67 +1,74 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=DnPPEtJT32gAPuUKXgIsqUE4fIvc6QA96vrcKr6nz6A,301
 ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
 ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=0sTNoLcS7euOavIAviQIMTolRnXsvOvNbmFdXgXgxhE,8516
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=6xMHa_ShyPOyegIKl2xwmRl3BlV5i21z95cpKW3oARw,11712
+ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
+ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
-ingestify/domain/models/__init__.py,sha256=
-ingestify/domain/models/
-ingestify/domain/models/
+ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
+ingestify/domain/models/base.py,sha256=6jzzIqSkH1mPsXZ2OTXMj09S_IlvMOrOBHBJyWAKEjE,555
+ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
 ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
-ingestify/domain/models/sink.py,sha256=
+ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
 ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
-ingestify/domain/models/
+ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
+ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
 ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
-ingestify/domain/models/dataset/dataset.py,sha256=
-ingestify/domain/models/dataset/dataset_repository.py,sha256=
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
+ingestify/domain/models/dataset/dataset.py,sha256=ReL50BXNaJVU29OB5_9CQEI7BekWsgi1t3AR7e5jENc,2743
+ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
+ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
+ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
+ingestify/domain/models/dataset/file.py,sha256=nuoZI9GI5OysYwWCCyNsHMlm1Z9A1GbEKd38jvBzJ4E,4119
+ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
+ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=
+ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
 ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
-ingestify/domain/models/event/domain_event.py,sha256=
+ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
 ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmceWLstOxiP3-2qU,576
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
+ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=1l9O3QJkYLs74HhrwAijwNEriPMwHN9OFG64Iz4z3uI,4262
+ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=
+ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
-ingestify/domain/models/task/task.py,sha256=
+ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
+ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/fetch/http.py,sha256=
+ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
 ingestify/infra/serialization/__init__.py,sha256=LwfmRoO4qykZkJZXxVPSKpwoVIkg9qzXa7Egut9JjL4,1772
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/source/statsbomb_github.py,sha256=
+ingestify/infra/source/statsbomb_github.py,sha256=IzzrlIRqkChgJp87yW3ugG1my4g_5uMx_xEnoQLWNss,3543
 ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nOUGxE,5626
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
-ingestify/infra/store/dataset/__init__.py,sha256=
-ingestify/infra/store/dataset/local_dataset_repository.py,sha256=UMgSe1M9u_629V4WyuTJ-QegZJiDczzMo7vkNbNleqA,2064
+ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256
+ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=UlEIfNusSOEWOxPi_ORrdLSylbi6-TO1qwEmcrBLwog,9447
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
-ingestify/infra/store/file/local_file_repository.py,sha256=
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
+ingestify/infra/store/file/s3_file_repository.py,sha256=_sekV1rfEbwIaSGhKRnFQlj92E9qNgONiwXt6ZLCyGg,1188
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -72,8 +79,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.1.3.dist-info/METADATA,sha256=
-ingestify-0.1.3.dist-info/WHEEL,sha256=
-ingestify-0.1.3.dist-info/entry_points.txt,sha256=
-ingestify-0.1.3.dist-info/top_level.txt,sha256=
-ingestify-0.1.3.dist-info/RECORD,,
+ingestify-0.3.0.dist-info/METADATA,sha256=-QlChdV6OYWkqSyXUmkQTG4deBliRsSmmZMTWKeURnI,18853
+ingestify-0.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.3.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.3.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.3.0.dist-info/RECORD,,
ingestify/infra/store/dataset/local_dataset_repository.py DELETED

@@ -1,73 +0,0 @@
-import glob
-import os
-import pickle
-import uuid
-from pathlib import Path
-from typing import Optional
-
-from ingestify.domain.models import (
-    Dataset,
-    DatasetCollection,
-    DatasetRepository,
-    Selector,
-)
-
-
-def parse_value(v):
-    try:
-        return int(v)
-    except ValueError:
-        return v
-
-
-class LocalDatasetRepository(DatasetRepository):
-    def destroy(self, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.unlink()
-
-    @classmethod
-    def supports(cls, url: str) -> bool:
-        return url.startswith("file://")
-
-    def __init__(self, url: str):
-        self.base_dir = Path(url[7:])
-        raise DeprecationWarning(
-            "This Repository should not be used. Better use SqlAlchemyDatasetRepository with a local sqlite database."
-        )
-
-    def get_dataset_collection(
-        self,
-        dataset_type: Optional[str] = None,
-        provider: Optional[str] = None,
-        dataset_id: Optional[str] = None,
-        selector: Optional[Selector] = None,
-        **kwargs
-    ) -> DatasetCollection:
-
-        datasets = []
-        for dir_name in glob.glob(str(self.base_dir / "*")):
-            attributes = {
-                item[0]: parse_value(item[1])
-                for item in [
-                    part.split("=") for part in os.path.basename(dir_name).split("__")
-                ]
-            }
-            if not selector or selector.matches(attributes):
-                with open(dir_name + "/dataset.pickle", "rb") as fp:
-                    dataset = pickle.load(fp)
-                    datasets.append(dataset)
-        return DatasetCollection(datasets)
-
-    def save(self, bucket: str, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(path, "wb") as fp:
-            pickle.dump(dataset, fp)
-
-    def next_identity(self):
-        return str(uuid.uuid4())
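The deleted repository's own DeprecationWarning already named the replacement; with the 0.3.0 wiring that substitution reads roughly as follows (the sqlite path is illustrative):

```python
# The substitution the deprecation message pointed at, using the new
# session provider from this release; the sqlite path is illustrative.
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

repository = SqlAlchemyDatasetRepository(
    SqlAlchemySessionProvider("sqlite:///datasets.db")
)
```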
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt
File without changes