ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +47 -36
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +71 -241
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +29 -28
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. ingestify/infra/fetch/http.py +5 -0
  29. ingestify/infra/source/statsbomb_github.py +67 -54
  30. ingestify/infra/store/dataset/__init__.py +0 -2
  31. ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
  32. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  33. ingestify/infra/store/file/local_file_repository.py +3 -5
  34. ingestify/infra/store/file/s3_file_repository.py +4 -9
  35. ingestify/main.py +64 -25
  36. ingestify/utils.py +15 -78
  37. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
  38. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
  39. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
  40. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  41. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -29,22 +29,6 @@ def parse_value(v):
         return v
 
 
-def json_serializer(o):
-    return json.dumps(o)
-
-
-def json_deserializer(o):
-    o = json.loads(o)
-    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
-    o = Identifier(**o)
-    return o
-
-
-# @compiles(DateTime, "mysql")
-# def compile_datetime_mysql(type_, compiler, **kw):
-#     return "DATETIME(6)"
-
-
 def isfloat(x):
     try:
         a = float(x)
@@ -64,7 +48,7 @@ def isint(x):
         return a == b
 
 
-class SqlAlchemyDatasetRepository(DatasetRepository):
+class SqlAlchemySessionProvider:
     @staticmethod
     def fix_url(url: str) -> str:
         if url.startswith("postgres://"):
@@ -87,8 +71,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
-            json_serializer=json_serializer,
-            json_deserializer=json_deserializer,
         )
         self.session = Session(bind=self.engine)
 
@@ -107,9 +89,29 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         self.url = state["url"]
         self._init_engine()
 
+    def _close_engine(self):
+        if hasattr(self, "session"):
+            self.session.close()
+            self.engine.dispose()
+
     def __del__(self):
-        self.session.close()
-        self.engine.dispose()
+        self._close_engine()
+
+    def reset(self):
+        self._close_engine()
+        self._init_engine()
+
+    def get(self):
+        return self.session
+
+
+class SqlAlchemyDatasetRepository(DatasetRepository):
+    def __init__(self, session_provider: SqlAlchemySessionProvider):
+        self.session_provider = session_provider
+
+    @property
+    def session(self):
+        return self.session_provider.get()
 
     def _filter_query(
         self,
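With this change the engine/session lifecycle lives in SqlAlchemySessionProvider and the repository only borrows the session through it. A minimal wiring sketch based on the usage in main.py further below; the SQLite URL and variable names are illustrative only:

    from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
    from ingestify.infra.store.dataset.sqlalchemy.repository import SqlAlchemySessionProvider

    # Example URL; any SQLAlchemy-style database URL should work here.
    session_provider = SqlAlchemySessionProvider("sqlite:///ingestify-metadata.db")
    dataset_repository = SqlAlchemyDatasetRepository(session_provider)

    # The repository resolves its session lazily via the provider, so the
    # provider can tear down and rebuild the engine (reset) without the
    # repository holding a stale session.
    session_provider.reset()
    session = session_provider.get()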
@@ -207,9 +209,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             )
 
         if not metadata_only:
-            dataset_query = apply_query_filter(
-                self.session.query(Dataset).options(joinedload(Dataset.revisions))
-            )
+            dataset_query = apply_query_filter(self.session.query(Dataset))
             datasets = list(dataset_query)
         else:
             datasets = []
ingestify/infra/store/file/local_file_repository.py CHANGED
@@ -19,14 +19,12 @@ class LocalFileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        path = self.get_path(bucket, dataset, revision_id, filename)
+        path = self.get_write_path(bucket, dataset, revision_id, filename)
         path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(path, "wb") as fp:
             shutil.copyfileobj(stream, fp)
         return path
 
-    def load_content(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> BinaryIO:
-        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
+    def load_content(self, storage_path: str) -> BinaryIO:
+        return open(self.get_read_path(storage_path), "rb")
ingestify/infra/store/file/s3_file_repository.py CHANGED
@@ -8,10 +8,7 @@ from ingestify.domain.models import FileRepository
 
 
 class S3FileRepository(FileRepository):
-    def __init__(self, url):
-        super().__init__(url)
-
-        self._s3 = None
+    _s3 = None
 
     @property
     def s3(self):
@@ -30,16 +27,14 @@ class S3FileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+        key = self.get_write_path(bucket, dataset, revision_id, filename)
         s3_bucket = Path(key.parts[0])
 
         self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
         return key
 
-    def load_content(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> BinaryIO:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+    def load_content(self, storage_path: str) -> BinaryIO:
+        key = self.get_read_path(storage_path)
         s3_bucket = Path(key.parts[0])
         return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
             "Body"
ingestify/main.py CHANGED
@@ -11,19 +11,21 @@ from ingestify import Source
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.application.ingestion_engine import IngestionEngine
 from ingestify.application.secrets_manager import SecretsManager
-from ingestify.domain import Selector
-from ingestify.domain.models import (
-    dataset_repository_factory,
-    file_repository_factory,
-)
+from ingestify.domain import Selector, FileRepository
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.event import EventBus, Publisher, Subscriber
 
-from ingestify.domain.models.extract_job import ExtractJob
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
+from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
 from ingestify.exceptions import ConfigurationError
+from ingestify.infra import S3FileRepository, LocalFileRepository
+from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
+from ingestify.infra.store.dataset.sqlalchemy.repository import (
+    SqlAlchemySessionProvider,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -59,8 +61,23 @@ def import_cls(name):
     return getattr(mod, components[-1])
 
 
+def build_file_repository(file_url: str, identifier_transformer) -> FileRepository:
+    if file_url.startswith("s3://"):
+        repository = S3FileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    elif file_url.startswith("file://"):
+        repository = LocalFileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    else:
+        raise Exception(f"Cannot find repository to handle file {file_url}")
+
+    return repository
+
+
 def get_dataset_store_by_urls(
-    dataset_url: str, file_url: str, bucket: str
+    metadata_url: str, file_url: str, bucket: str, dataset_types
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
@@ -68,15 +85,30 @@ def get_dataset_store_by_urls(
     if not bucket:
         raise Exception("Bucket is not specified")
 
-    file_repository = file_repository_factory.build_if_supports(url=file_url)
+    identifier_transformer = IdentifierTransformer()
+    for dataset_type in dataset_types:
+        for id_key, id_config in dataset_type["identifier_keys"].items():
+            identifier_transformer.register_transformation(
+                provider=dataset_type["provider"],
+                dataset_type=dataset_type["dataset_type"],
+                id_key=id_key,
+                transformation=id_config["transformation"],
+            )
+
+    file_repository = build_file_repository(
+        file_url, identifier_transformer=identifier_transformer
+    )
+
+    if secrets_manager.supports(metadata_url):
+        metadata_url = secrets_manager.load_as_db_url(metadata_url)
+
+    if metadata_url.startswith("postgres://"):
+        metadata_url = metadata_url.replace("postgress://", "postgress+")
 
-    if secrets_manager.supports(dataset_url):
-        dataset_url = secrets_manager.load_as_db_url(dataset_url)
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
 
-    if dataset_url.startswith("postgres://"):
-        dataset_url = dataset_url.replace("postgress://", "postgress+")
+    dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
-    dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
     return DatasetStore(
         dataset_repository=dataset_repository,
         file_repository=file_repository,
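The store builder now also receives the configured dataset_types, which are only used here to register identifier-key transformations. A sketch of the expected shape, inferred from the loop above; the provider, key, and transformation values are invented placeholders:

    from ingestify.main import get_dataset_store_by_urls

    dataset_types = [
        {
            "provider": "example_provider",    # placeholder
            "dataset_type": "example_type",    # placeholder
            "identifier_keys": {
                # valid transformation names are defined in
                # ingestify/domain/services/identifier_key_transformer.py (not shown here)
                "some_id": {"transformation": "example-transformation"},
            },
        }
    ]

    store = get_dataset_store_by_urls(
        metadata_url="sqlite:///ingestify-metadata.db",  # example metadata database
        file_url="file:///tmp/ingestify-blobs",          # example blob storage
        bucket="main",                                   # example bucket name
        dataset_types=dataset_types,
    )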
@@ -88,14 +120,15 @@ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")
 
     return get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
 
 def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
-    return get_dataset_store_by_urls(dataset_url=url, file_url=url, bucket=bucket)
+    return get_dataset_store_by_urls(metadata_url=url, file_url=url, bucket=bucket)
 
 
 def get_source_cls(key: str) -> Type[Source]:
@@ -155,9 +188,10 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     logger.info("Initializing IngestionEngine")
     store = get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
     # Setup an EventBus and wire some more components
@@ -173,19 +207,24 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
         store=store,
     )
 
-    logger.info("Determining tasks...")
+    logger.info("Adding IngestionPlans...")
 
     fetch_policy = FetchPolicy()
 
-    for job in config["extract_jobs"]:
+    # Previous naming
+    ingestion_plans = config.get("extract_jobs", [])
+    # New naming
+    ingestion_plans.extend(config.get("ingestion_plans", []))
+
+    for ingestion_plan in ingestion_plans:
         data_spec_versions = DataSpecVersionCollection.from_dict(
-            job.get("data_spec_versions", {"default": {"v1"}})
+            ingestion_plan.get("data_spec_versions", {"default": {"v1"}})
         )
 
-        if "selectors" in job:
+        if "selectors" in ingestion_plan:
             selectors = [
                 Selector.build(selector, data_spec_versions=data_spec_versions)
-                for selector_args in job["selectors"]
+                for selector_args in ingestion_plan["selectors"]
                 for selector in _product_selectors(selector_args)
             ]
         else:
@@ -193,13 +232,13 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
             # but makes it easier later one where we loop over selectors.
             selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
 
-        import_job = ExtractJob(
-            source=sources[job["source"]],
-            dataset_type=job["dataset_type"],
+        ingestion_plan = IngestionPlan(
+            source=sources[ingestion_plan["source"]],
+            dataset_type=ingestion_plan["dataset_type"],
             selectors=selectors,
             fetch_policy=fetch_policy,
             data_spec_versions=data_spec_versions,
         )
-        ingestion_engine.add_extract_job(import_job)
+        ingestion_engine.add_ingestion_plan(ingestion_plan)
 
     return ingestion_engine
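For existing configurations the relevant renames are main.dataset_url → main.metadata_url and the top-level extract_jobs → ingestion_plans (the old key is still read, as shown above). A minimal sketch of the parsed config as a Python dict; every value is an invented example:

    config = {
        "main": {
            "metadata_url": "sqlite:///ingestify-metadata.db",  # was: dataset_url
            "file_url": "file:///tmp/ingestify-blobs",
            "default_bucket": "main",
        },
        "ingestion_plans": [  # was: extract_jobs
            {
                "source": "my_source",            # must match a configured source name
                "dataset_type": "example_type",   # placeholder
                "data_spec_versions": {"default": ["v1"]},
                "selectors": [{"example_key": "example_value"}],  # placeholder selector
            }
        ],
    }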
ingestify/utils.py CHANGED
@@ -1,4 +1,5 @@
 import abc
+import asyncio
 import inspect
 import logging
 import os
@@ -8,7 +9,19 @@ from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import Dict, Generic, Type, TypeVar, Tuple, Optional, Any
+from typing import (
+    Dict,
+    Generic,
+    Type,
+    TypeVar,
+    Tuple,
+    Optional,
+    Any,
+    Callable,
+    Awaitable,
+    List,
+    Iterable,
+)
 
 import cloudpickle
 from typing_extensions import Self
@@ -39,83 +52,6 @@ def sanitize_exception_message(exception_message):
     return sanitized_message
 
 
-class ComponentRegistry:
-    def __init__(self):
-        self.__registered_components = {}
-
-        class _Registered(abc.ABCMeta):
-            def __new__(mcs, cls_name, bases, class_dict):
-                class_dict["name"] = cls_name
-                component_cls = super(_Registered, mcs).__new__(
-                    mcs, cls_name, bases, class_dict
-                )
-                if not inspect.isabstract(component_cls):
-                    self.register_component(cls_name, component_cls)
-                else:
-                    if bases[0] != abc.ABC:
-                        raise Exception(
-                            f"Class '{cls_name}' seems to be an concrete class, but missing some abstract methods"
-                        )
-                return component_cls
-
-        self.__metaclass = _Registered
-
-    @property
-    def metaclass(self):
-        return self.__metaclass
-
-    def register_component(self, cls_name, component_cls):
-        self.__registered_components[cls_name] = component_cls
-
-    def get_component(self, cls_name: str):
-        return self.__registered_components[cls_name]
-
-    def get_supporting_component(self, **kwargs) -> str:
-        for cls_name, class_ in self.__registered_components.items():
-            if not hasattr(class_, "supports"):
-                raise Exception(
-                    f"Class '{cls_name}' does not implemented a 'supports' classmethod. "
-                    f"This is required when using 'get_supporting_component'."
-                )
-
-            if class_.supports(**kwargs):
-                return cls_name
-
-        kwargs_str = sanitize_exception_message(str(kwargs))
-        raise Exception(f"No supporting class found for {kwargs_str}")
-
-
-T = TypeVar("T")
-R = TypeVar("R")
-
-
-class ComponentFactory(Generic[T]):
-    def __init__(self, registry: ComponentRegistry):
-        self.registry = registry
-
-    @classmethod
-    def build_factory(
-        cls, component_cls: Type[R], registry: ComponentRegistry
-    ) -> "ComponentFactory[R]":
-        return cls[component_cls](registry)
-
-    def build(self, cls_name, **kwargs) -> T:
-        component_cls = self.registry.get_component(cls_name)
-        try:
-            return component_cls.from_dict(**kwargs)
-        except AttributeError:
-            pass
-        try:
-            return component_cls(**kwargs)
-        except TypeError as e:
-            raise e
-        # raise TypeError(f"Could not initialize {cls_name}")
-
-    def build_if_supports(self, **kwargs) -> T:
-        cls_name = self.registry.get_supporting_component(**kwargs)
-        return self.build(cls_name, **kwargs)
-
-
 def key_from_dict(d: dict) -> str:
     return "/".join([f"{k}={v}" for k, v in sorted(d.items()) if not k.startswith("_")])
 
@@ -270,6 +206,7 @@ class TaskExecutor:
         logger.info(
            f"Finished {len(res)} tasks in {took:.1f} seconds. {(len(res)/took):.1f} tasks/sec"
        )
+        return res
 
     def join(self):
         self.pool.close()
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.1.3
+Version: 0.3.0
 Summary: Standardizing soccer tracking- and event data
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -16,6 +16,7 @@ Requires-Dist: python-dotenv
 Requires-Dist: pyaml-env
 Requires-Dist: boto3
 Requires-Dist: pytz
+Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
 Requires-Dist: pytest<7,>=6.2.5; extra == "test"
 
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD CHANGED
@@ -1,67 +1,74 @@
-ingestify/__init__.py,sha256=7SmMqBObZfacvSUG-7g0fFyt6ZlamLxBNgU5ReQ36lk,301
+ingestify/__init__.py,sha256=DnPPEtJT32gAPuUKXgIsqUE4fIvc6QA96vrcKr6nz6A,301
 ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
 ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
-ingestify/main.py,sha256=YjrAOiGzwurtoDyIf981DSJHHA6IT5q09k3QNzTKCC8,6814
+ingestify/main.py,sha256=0sTNoLcS7euOavIAviQIMTolRnXsvOvNbmFdXgXgxhE,8516
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=WcbG2mEb-oLF_sA-2JPRbx8nD55HqASrHQRz7Zd0Ejg,8198
+ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=NAW-XSvp118Lr2hXZd3qtuQr6VkPdWCLksIwd5MSs30,11489
-ingestify/application/ingestion_engine.py,sha256=GYIhb8a9ePkEcNOBPdfu-YawiD7eRZMRlxCA-6g9DRA,2249
-ingestify/application/loader.py,sha256=DSdSNFf7WynGsMCoK3iQGiMKkO76fZ_KIOBDEMZK3zU,13495
+ingestify/application/dataset_store.py,sha256=6xMHa_ShyPOyegIKl2xwmRl3BlV5i21z95cpKW3oARw,11712
+ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
+ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
-ingestify/domain/models/__init__.py,sha256=xHVQZP57ZQYUKwAtbccnDKX89_yTOvBKAtn4XDVbEbY,930
-ingestify/domain/models/data_spec_version_collection.py,sha256=qjEM6-gt-Uf5orQlv64P6NJCEdWiUPX2oTZv8cC-KVY,1203
-ingestify/domain/models/extract_job.py,sha256=yXrlF2Vt5hxB1Vo9CicpgyW5rjvJaEPfSiMzaAqhqB0,624
+ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
+ingestify/domain/models/base.py,sha256=6jzzIqSkH1mPsXZ2OTXMj09S_IlvMOrOBHBJyWAKEjE,555
+ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
 ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
-ingestify/domain/models/sink.py,sha256=AieqDQ76Vj7WGxCrl3-F93AKe-VBfoPHtMNH28GTQM4,384
+ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
 ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
-ingestify/domain/models/dataset/__init__.py,sha256=kSn3XZo0o-D0WzMb2VDxhOXw9Rr9jvS-8fkHdOnrccU,748
+ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
+ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
 ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
-ingestify/domain/models/dataset/dataset.py,sha256=m0iVJPXd1KOAHbDg7fmY_7MCdrKQaILUekIWUfo5pXI,2893
-ingestify/domain/models/dataset/dataset_repository.py,sha256=eiloP5msmDau4WRHee8gA7pLoH_ca2JXAhPx9UecPIA,1185
-ingestify/domain/models/dataset/events.py,sha256=x4l_pdzBHbemE_722EyCYXzWy9t8IcTx5j-wNFxWs6o,708
-ingestify/domain/models/dataset/file.py,sha256=O-yJom9dr13PaHfmc_4crtSa9B1Q9iruHsnf-m01McU,3943
-ingestify/domain/models/dataset/file_collection.py,sha256=V5wh2aSc61UA4HWcHi9PvyQUIUvssDRkaPVe2YR6XwU,1140
-ingestify/domain/models/dataset/file_repository.py,sha256=lxf3Dh8e-_67dRspMZHT1DZ79IWW_vlvb3z8lKjypj4,1514
+ingestify/domain/models/dataset/dataset.py,sha256=ReL50BXNaJVU29OB5_9CQEI7BekWsgi1t3AR7e5jENc,2743
+ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
+ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
+ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
+ingestify/domain/models/dataset/file.py,sha256=nuoZI9GI5OysYwWCCyNsHMlm1Z9A1GbEKd38jvBzJ4E,4119
+ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
+ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=fiHnd_mad0iYmNCGswKImUHpauhIf2gW_ukztDFVP48,781
+ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
 ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
-ingestify/domain/models/event/domain_event.py,sha256=a5nNNwDWSAqou8aSBGIEA6aQOHTOxYyMEUXB91fYUIM,187
+ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
 ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmceWLstOxiP3-2qU,576
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
+ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=1l9O3QJkYLs74HhrwAijwNEriPMwHN9OFG64Iz4z3uI,4262
+ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=HH5wMqzoWvcL84GzNa7QL3YsciI757FG4iZu9DbXn_k,3181
+ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
-ingestify/domain/models/task/task.py,sha256=R6tEZub-N_Wjl4VjwlPySdFb3L9D7nH4St2CcDzFoKA,107
+ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
+ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/fetch/http.py,sha256=4CcEkwtNzYkPspNIaQIfcthA5yLow0x_M9xpEsoucWw,3982
+ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
 ingestify/infra/serialization/__init__.py,sha256=LwfmRoO4qykZkJZXxVPSKpwoVIkg9qzXa7Egut9JjL4,1772
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/source/statsbomb_github.py,sha256=CuHZoJn6fU8ZKQl4f1-gyaVYsmxL6R33n0cbOx1jQmI,2895
+ingestify/infra/source/statsbomb_github.py,sha256=IzzrlIRqkChgJp87yW3ugG1my4g_5uMx_xEnoQLWNss,3543
 ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nOUGxE,5626
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
-ingestify/infra/store/dataset/__init__.py,sha256=8oVJFiA-IKccrEpiYxAmSc65dfpNut7PYx8PUhylmdU,113
-ingestify/infra/store/dataset/local_dataset_repository.py,sha256=UMgSe1M9u_629V4WyuTJ-QegZJiDczzMo7vkNbNleqA,2064
+ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=Q7Od3zBnoZgxE5aThdZE93waWeKVut9dstrCnEYb9nc,3981
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=ynoIVMVD0_w9aa2hFKkcLxRKzJDoET_SNfGHXPIoN40,7067
+ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=UlEIfNusSOEWOxPi_ORrdLSylbi6-TO1qwEmcrBLwog,9447
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
-ingestify/infra/store/file/local_file_repository.py,sha256=0oIzjjKO5U_7gPXhsBJFUqQBarQTFQS499ZK7HNxMxo,893
-ingestify/infra/store/file/s3_file_repository.py,sha256=txDviBrY9EHn3soqLFvTrjSPkyh548RxUgx4T83j0QY,1331
+ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
+ingestify/infra/store/file/s3_file_repository.py,sha256=_sekV1rfEbwIaSGhKRnFQlj92E9qNgONiwXt6ZLCyGg,1188
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -72,8 +79,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.1.3.dist-info/METADATA,sha256=N5OO5RAYulhFCXCqp4Hi2gEcEB9dnf6W7n59ZOGRUQ8,18822
-ingestify-0.1.3.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-ingestify-0.1.3.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
-ingestify-0.1.3.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
-ingestify-0.1.3.dist-info/RECORD,,
+ingestify-0.3.0.dist-info/METADATA,sha256=-QlChdV6OYWkqSyXUmkQTG4deBliRsSmmZMTWKeURnI,18853
+ingestify-0.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.3.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.3.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.3.0.dist-info/RECORD,,
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.44.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
ingestify/infra/store/dataset/local_dataset_repository.py DELETED
@@ -1,73 +0,0 @@
-import glob
-import os
-import pickle
-import uuid
-from pathlib import Path
-from typing import Optional
-
-from ingestify.domain.models import (
-    Dataset,
-    DatasetCollection,
-    DatasetRepository,
-    Selector,
-)
-
-
-def parse_value(v):
-    try:
-        return int(v)
-    except ValueError:
-        return v
-
-
-class LocalDatasetRepository(DatasetRepository):
-    def destroy(self, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.unlink()
-
-    @classmethod
-    def supports(cls, url: str) -> bool:
-        return url.startswith("file://")
-
-    def __init__(self, url: str):
-        self.base_dir = Path(url[7:])
-        raise DeprecationWarning(
-            "This Repository should not be used. Better use SqlAlchemyDatasetRepository with a local sqlite database."
-        )
-
-    def get_dataset_collection(
-        self,
-        dataset_type: Optional[str] = None,
-        provider: Optional[str] = None,
-        dataset_id: Optional[str] = None,
-        selector: Optional[Selector] = None,
-        **kwargs
-    ) -> DatasetCollection:
-
-        datasets = []
-        for dir_name in glob.glob(str(self.base_dir / "*")):
-            attributes = {
-                item[0]: parse_value(item[1])
-                for item in [
-                    part.split("=") for part in os.path.basename(dir_name).split("__")
-                ]
-            }
-            if not selector or selector.matches(attributes):
-                with open(dir_name + "/dataset.pickle", "rb") as fp:
-                    dataset = pickle.load(fp)
-                datasets.append(dataset)
-        return DatasetCollection(datasets)
-
-    def save(self, bucket: str, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(path, "wb") as fp:
-            pickle.dump(dataset, fp)
-
-    def next_identity(self):
-        return str(uuid.uuid4())