ingestify 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # detect if we are imported from the setup procedure (borrowed from numpy code)
+ try:
+     __INGESTIFY_SETUP__
+ except NameError:
+     __INGESTIFY_SETUP__ = False
+
+ if not __INGESTIFY_SETUP__:
+     from .infra import retrieve_http
+     from .source_base import Source, DatasetResource
+
+ __version__ = "0.1.0"
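Note: the try/except guard above follows the numpy convention for detecting an import that happens during the build. A build script sets a module-level flag before importing the package, so the subpackage imports are skipped while __version__ stays readable. A minimal sketch of how a setup script could use it (illustrative only; ingestify's own build configuration is not part of this diff):

import builtins

# Hypothetical setup.py fragment: signal to ingestify/__init__.py that it is
# being imported during the build, so it skips importing .infra and .source_base.
builtins.__INGESTIFY_SETUP__ = True

import ingestify

print(ingestify.__version__)  # "0.1.0"; only the version is needed at build time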
ingestify/application/__init__.py ADDED
File without changes
ingestify/application/dataset_store.py ADDED
@@ -0,0 +1,339 @@
+ import gzip
+ import hashlib
+ import logging
+ import mimetypes
+ import os
+ import shutil
+ from dataclasses import asdict
+ from io import BytesIO, StringIO
+
+ from typing import Dict, List, Optional, Union, Callable, BinaryIO
+
+ from ingestify.domain.models.dataset.dataset import DatasetState
+ from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
+ from ingestify.domain.models.dataset.file_collection import FileCollection
+ from ingestify.domain.models.event import EventBus
+ from ingestify.domain.models import (
+     Dataset,
+     DatasetCollection,
+     DatasetRepository,
+     DatasetResource,
+     DraftFile,
+     File,
+     LoadedFile,
+     FileRepository,
+     Identifier,
+     Selector,
+     Revision,
+     DatasetCreated,
+ )
+ from ingestify.utils import utcnow, map_in_pool
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class DatasetStore:
+     def __init__(
+         self,
+         dataset_repository: DatasetRepository,
+         file_repository: FileRepository,
+         bucket: str,
+     ):
+         self.dataset_repository = dataset_repository
+         self.file_repository = file_repository
+         self.storage_compression_method = "gzip"
+         self.bucket = bucket
+         self.event_bus: Optional[EventBus] = None
+
+     # def __getstate__(self):
+     #     return {"file_repository": self.file_repository, "bucket": self.bucket}
+
+     def set_event_bus(self, event_bus: EventBus):
+         self.event_bus = event_bus
+
+     def dispatch(self, event):
+         if self.event_bus:
+             self.event_bus.dispatch(event)
+
+     def get_dataset_collection(
+         self,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         dataset_id: Optional[str] = None,
+         **selector,
+     ) -> DatasetCollection:
+         if "selector" in selector:
+             selector = selector["selector"]
+         if isinstance(selector, dict):
+             # By-pass the build as we don't want to specify data_spec_versions here... (for now)
+             selector = Selector(selector)
+         elif isinstance(selector, list):
+             if not selector:
+                 return DatasetCollection()
+
+             if isinstance(selector[0], dict):
+                 # Convert all selector dicts to Selectors
+                 selector = [Selector(_) for _ in selector]
+
+         dataset_collection = self.dataset_repository.get_dataset_collection(
+             bucket=self.bucket,
+             dataset_type=dataset_type,
+             dataset_id=dataset_id,
+             provider=provider,
+             selector=selector,
+         )
+         return dataset_collection
+
+     #
+     # def destroy_dataset(self, dataset_id: str):
+     #     dataset = self.dataset_repository.
+     #     self.dataset_repository.destroy_dataset(dataset_id)
+
+     def _prepare_write_stream(self, file_: DraftFile) -> tuple[BytesIO, int, str]:
+         if self.storage_compression_method == "gzip":
+             stream = BytesIO()
+             with gzip.GzipFile(fileobj=stream, compresslevel=9, mode="wb") as fp:
+                 shutil.copyfileobj(file_.stream, fp)
+
+             stream.seek(0, os.SEEK_END)
+             storage_size = stream.tell()
+             stream.seek(0)
+             suffix = ".gz"
+         else:
+             stream = file_.stream
+             storage_size = file_.size
+             suffix = ""
+
+         return stream, storage_size, suffix
+
+     def _prepare_read_stream(self) -> tuple[Callable[[BinaryIO], BytesIO], str]:
+         if self.storage_compression_method == "gzip":
+
+             def reader(fh: BinaryIO) -> BytesIO:
+                 stream = BytesIO()
+                 with gzip.GzipFile(fileobj=fh, compresslevel=9, mode="rb") as fp:
+                     shutil.copyfileobj(fp, stream)
+                 stream.seek(0)
+                 return stream
+
+             return reader, ".gz"
+         else:
+             return lambda fh: fh, ""
+
+     def _persist_files(
+         self,
+         dataset: Dataset,
+         revision_id: int,
+         modified_files: Dict[str, Optional[DraftFile]],
+     ) -> List[File]:
+         modified_files_ = []
+
+         current_revision = dataset.current_revision
+
+         for file_id, file_ in modified_files.items():
+             if file_ is None:
+                 # It's always allowed to pass None as file. This means it didn't change and must be ignored.
+                 continue
+
+             current_file = (
+                 current_revision.modified_files_map.get(file_id)
+                 if current_revision
+                 else None
+             )
+             if current_file and current_file.tag == file_.tag:
+                 # File didn't change. Ignore it.
+                 continue
+
+             stream, storage_size, suffix = self._prepare_write_stream(file_)
+
+             # TODO: check if this is a very clean way to go from DraftFile to File
+             full_path = self.file_repository.save_content(
+                 bucket=self.bucket,
+                 dataset=dataset,
+                 revision_id=revision_id,
+                 filename=file_id + "." + file_.data_serialization_format + suffix,
+                 stream=stream,
+             )
+             file = File.from_draft(
+                 file_,
+                 file_id,
+                 storage_size=storage_size,
+                 storage_compression_method=self.storage_compression_method,
+                 path=self.file_repository.get_relative_path(full_path),
+             )
+
+             modified_files_.append(file)
+
+         return modified_files_
+
+     def add_revision(
+         self, dataset: Dataset, files: Dict[str, DraftFile], description: str = "Update"
+     ):
+         """
+         Create new revision first, so FileRepository can use
+         revision_id in the key.
+         """
+         revision_id = dataset.next_revision_id()
+         created_at = utcnow()
+
+         persisted_files_ = self._persist_files(dataset, revision_id, files)
+         if persisted_files_:
+             # It can happen an API tells us data is changed, but it was not changed. In this case
+             # we decide to ignore it.
+             # Make sure there are files changed before creating a new revision
+             dataset.add_revision(
+                 Revision(
+                     revision_id=revision_id,
+                     created_at=created_at,
+                     description=description,
+                     modified_files=persisted_files_,
+                 )
+             )
+
+             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
+             self.dispatch(RevisionAdded(dataset=dataset))
+             logger.info(
+                 f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
+             )
+             return True
+         else:
+             logger.info(
+                 f"Ignoring a new revision without changed files -> {dataset.identifier}"
+             )
+             return False
+
+     def update_dataset(
+         self,
+         dataset: Dataset,
+         dataset_resource: DatasetResource,
+         files: Dict[str, DraftFile],
+     ):
+         """The add_revision will also save the dataset."""
+         metadata_changed = False
+         if dataset.update_from_resource(dataset_resource):
+             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
+             metadata_changed = True
+
+         self.add_revision(dataset, files)
+
+         if metadata_changed:
+             # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
+             # the new revision
+             self.dispatch(MetadataUpdated(dataset=dataset))
+
+     def destroy_dataset(self, dataset: Dataset):
+         # TODO: remove files. Now we leave some orphaned files around
+         self.dataset_repository.destroy(dataset)
+
+     def create_dataset(
+         self,
+         dataset_type: str,
+         provider: str,
+         dataset_identifier: Identifier,
+         name: str,
+         state: DatasetState,
+         metadata: dict,
+         files: Dict[str, DraftFile],
+         description: str = "Create",
+     ):
+         now = utcnow()
+
+         dataset = Dataset(
+             bucket=self.bucket,
+             dataset_id=self.dataset_repository.next_identity(),
+             name=name,
+             state=state,
+             identifier=dataset_identifier,
+             dataset_type=dataset_type,
+             provider=provider,
+             metadata=metadata,
+             created_at=now,
+             updated_at=now,
+         )
+         self.add_revision(dataset, files, description)
+
+         self.dispatch(DatasetCreated(dataset=dataset))
+
+     def load_files(
+         self,
+         dataset: Dataset,
+         data_feed_keys: Optional[List[str]] = None,
+         lazy: bool = False,
+         auto_rewind: bool = True,
+     ) -> FileCollection:
+         current_revision = dataset.current_revision
+         files = {}
+
+         reader, suffix = self._prepare_read_stream()
+         for file in current_revision.modified_files:
+             if data_feed_keys and file.data_feed_key not in data_feed_keys:
+                 continue
+
+             def get_stream(file_):
+                 revision_id = file_.revision_id
+                 if revision_id is None:
+                     revision_id = current_revision.revision_id
+
+                 return reader(
+                     self.file_repository.load_content(
+                         bucket=self.bucket,
+                         dataset=dataset,
+                         # When file.revision_id is set we must use it.
+                         revision_id=revision_id,
+                         filename=file_.file_id
+                         + "."
+                         + file_.data_serialization_format
+                         + suffix,
+                     )
+                 )
+
+             loaded_file = LoadedFile(
+                 _stream=get_stream if lazy else get_stream(file),
+                 **asdict(file),
+             )
+             files[file.file_id] = loaded_file
+         return FileCollection(files, auto_rewind=auto_rewind)
+
+     def load_with_kloppy(self, dataset: Dataset, **kwargs):
+         files = self.load_files(dataset)
+         if dataset.provider == "statsbomb":
+             from kloppy import statsbomb
+
+             try:
+                 return statsbomb.load(
+                     event_data=files.get_file("events").stream,
+                     lineup_data=files.get_file("lineups").stream,
+                     **kwargs,
+                 )
+             except Exception as e:
+                 raise Exception(f"Error loading {dataset}") from e
+         elif dataset.provider == "wyscout":
+             from kloppy import wyscout
+
+             return wyscout.load(
+                 event_data=files["events.json"].stream, data_version="V3", **kwargs
+             )
+         else:
+             raise Exception(f"Don't know how to load a '{dataset.provider}' dataset")
+
+     # def load_content(self, dataset_id: str, version_id: int, filename: str):
+     #     datasets = self.dataset_repository.get_dataset_collection(
+     #         bucket=self.bucket, dataset_id=dataset_id
+     #     )
+     #     if not len(datasets):
+     #         raise Exception("Not found")
+     #     else:
+     #         dataset = datasets.get_dataset_by_id(dataset_id)
+     #
+     #     return self.file_repository.load_content(
+     #         bucket=self.bucket,
+     #         dataset=dataset,
+     #         version_id=version_id,
+     #         filename=filename,
+     #     )
+
+     def map(
+         self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+     ):
+         return map_in_pool(fn, dataset_collection, processes)
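Note: for orientation, a minimal usage sketch of the DatasetStore API shown above. The repository objects and the bucket name are assumptions (any DatasetRepository and FileRepository implementation would do); this code is not part of the package:

# Illustrative only; method names and signatures are taken from the class above,
# the dataset_repository / file_repository objects are assumed to exist.
store = DatasetStore(
    dataset_repository=dataset_repository,  # e.g. a SQLAlchemy-backed repository
    file_repository=file_repository,        # e.g. a local or S3 file repository
    bucket="main",
)

datasets = store.get_dataset_collection(provider="statsbomb", dataset_type="match")
for dataset in datasets:
    files = store.load_files(dataset, lazy=True)      # gzip-compressed content is decompressed on read
    kloppy_dataset = store.load_with_kloppy(dataset)  # statsbomb and wyscout providers are supported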
ingestify/application/ingestion_engine.py ADDED
@@ -0,0 +1,62 @@
+ import itertools
+ import logging
+ from typing import Optional, List
+
+
+ from .loader import Loader
+ from .dataset_store import DatasetStore
+ from ..domain.models.extract_job import ExtractJob
+
+ logger = logging.getLogger(__name__)
+
+
+ class IngestionEngine:
+     def __init__(self, store: DatasetStore):
+
+         # Note: disconnect event from loading. Event should only be used for
+         # metadata and 'loaded_files' for the actual data.
+         self.store = store
+         self.loader = Loader(self.store)
+
+     def add_extract_job(self, extract_job: ExtractJob):
+         self.loader.add_extract_job(extract_job)
+
+     def load(self, dry_run: bool = False, provider: Optional[str] = None):
+         self.loader.collect_and_run(dry_run=dry_run, provider=provider)
+
+     def list_datasets(self, as_count: bool = False):
+         """Consider moving this to DataStore"""
+         datasets = sorted(
+             self.store.get_dataset_collection(),
+             key=lambda dataset_: (
+                 dataset_.provider,
+                 dataset_.dataset_type,
+                 str(dataset_.identifier),
+             ),
+         )
+         if as_count:
+             print(f"Count: {len(datasets)}")
+         else:
+             for provider, datasets_per_provider in itertools.groupby(
+                 datasets, key=lambda dataset_: dataset_.provider
+             ):
+                 print(f"{provider}:")
+                 for dataset_type, datasets_per_type in itertools.groupby(
+                     datasets_per_provider, key=lambda dataset_: dataset_.dataset_type
+                 ):
+                     print(f"  {dataset_type}:")
+                     for dataset in datasets_per_type:
+                         print(
+                             f"    {dataset.identifier}: {dataset.name} / {dataset.state} {dataset.dataset_id}"
+                         )
+                         # print(dataset.dataset_id)
+
+     def destroy_dataset(
+         self, dataset_id: Optional[str] = None, **selector
+     ) -> List[str]:
+         datasets = self.store.get_dataset_collection(dataset_id=dataset_id, **selector)
+         dataset_ids = []
+         for dataset in datasets:
+             self.store.destroy_dataset(dataset)
+             dataset_ids.append(dataset.dataset_id)
+         return dataset_ids
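Note: as with the store, a brief illustrative sketch of the IngestionEngine surface. The store and extract_job objects are assumed to be configured elsewhere; their construction is not shown in this file:

# Illustrative only; method names are taken from the class above.
engine = IngestionEngine(store=store)
engine.add_extract_job(extract_job)              # an ExtractJob built from configuration
engine.load(dry_run=True, provider="statsbomb")  # collect and run extraction tasks
engine.list_datasets(as_count=True)              # prints "Count: <n>"
destroyed = engine.destroy_dataset(provider="wyscout")  # returns the destroyed dataset ids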