ingestify 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +11 -0
- ingestify/application/__init__.py +0 -0
- ingestify/application/dataset_store.py +339 -0
- ingestify/application/ingestion_engine.py +62 -0
- ingestify/application/loader.py +329 -0
- ingestify/application/secrets_manager.py +53 -0
- ingestify/cmdline.py +283 -0
- ingestify/domain/__init__.py +2 -0
- ingestify/domain/models/__init__.py +45 -0
- ingestify/domain/models/data_spec_version_collection.py +33 -0
- ingestify/domain/models/dataset/__init__.py +27 -0
- ingestify/domain/models/dataset/collection.py +44 -0
- ingestify/domain/models/dataset/collection_metadata.py +13 -0
- ingestify/domain/models/dataset/dataset.py +104 -0
- ingestify/domain/models/dataset/dataset_repository.py +46 -0
- ingestify/domain/models/dataset/events.py +31 -0
- ingestify/domain/models/dataset/file.py +146 -0
- ingestify/domain/models/dataset/file_collection.py +35 -0
- ingestify/domain/models/dataset/file_repository.py +59 -0
- ingestify/domain/models/dataset/identifier.py +24 -0
- ingestify/domain/models/dataset/revision.py +29 -0
- ingestify/domain/models/dataset/selector.py +37 -0
- ingestify/domain/models/event/__init__.py +4 -0
- ingestify/domain/models/event/_old_event.py +21 -0
- ingestify/domain/models/event/dispatcher.py +8 -0
- ingestify/domain/models/event/domain_event.py +10 -0
- ingestify/domain/models/event/event_bus.py +24 -0
- ingestify/domain/models/event/publisher.py +23 -0
- ingestify/domain/models/event/subscriber.py +39 -0
- ingestify/domain/models/extract_job.py +23 -0
- ingestify/domain/models/fetch_policy.py +40 -0
- ingestify/domain/models/resources/__init__.py +1 -0
- ingestify/domain/models/resources/dataset_resource.py +99 -0
- ingestify/domain/models/sink.py +16 -0
- ingestify/domain/models/source.py +34 -0
- ingestify/domain/models/task/__init__.py +4 -0
- ingestify/domain/models/task/set.py +21 -0
- ingestify/domain/models/task/task.py +7 -0
- ingestify/domain/services/__init__.py +0 -0
- ingestify/domain/services/transformers/__init__.py +0 -0
- ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
- ingestify/exceptions.py +10 -0
- ingestify/infra/__init__.py +4 -0
- ingestify/infra/fetch/__init__.py +0 -0
- ingestify/infra/fetch/http.py +100 -0
- ingestify/infra/serialization/__init__.py +50 -0
- ingestify/infra/sink/__init__.py +0 -0
- ingestify/infra/sink/postgresql.py +50 -0
- ingestify/infra/source/__init__.py +0 -0
- ingestify/infra/source/statsbomb_github.py +92 -0
- ingestify/infra/source/wyscout.py +175 -0
- ingestify/infra/store/__init__.py +2 -0
- ingestify/infra/store/dataset/__init__.py +2 -0
- ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
- ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
- ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
- ingestify/infra/store/file/__init__.py +2 -0
- ingestify/infra/store/file/local_file_repository.py +32 -0
- ingestify/infra/store/file/s3_file_repository.py +50 -0
- ingestify/main.py +205 -0
- ingestify/server.py +78 -0
- ingestify/source_base.py +23 -0
- ingestify/static/templates/statsbomb_github/README.md +0 -0
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
- ingestify/static/templates/statsbomb_github/database/README.md +1 -0
- ingestify/static/templates/statsbomb_github/query.py +14 -0
- ingestify/static/templates/wyscout/.env +5 -0
- ingestify/static/templates/wyscout/.gitignore +2 -0
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
- ingestify/static/templates/wyscout/database/README.md +1 -0
- ingestify/static/templates/wyscout/query.py +14 -0
- ingestify/utils.py +276 -0
- ingestify-0.1.0.dist-info/METADATA +265 -0
- ingestify-0.1.0.dist-info/RECORD +79 -0
- ingestify-0.1.0.dist-info/WHEEL +5 -0
- ingestify-0.1.0.dist-info/entry_points.txt +2 -0
- ingestify-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
ingestify_version: {{ ingestify_version }}
|
|
2
|
+
|
|
3
|
+
main:
|
|
4
|
+
dataset_url: sqlite:///database/catalog.db
|
|
5
|
+
file_url: file://database/files/
|
|
6
|
+
default_bucket: main
|
|
7
|
+
|
|
8
|
+
sources:
|
|
9
|
+
wyscout:
|
|
10
|
+
type: ingestify.wyscout
|
|
11
|
+
configuration:
|
|
12
|
+
username: !ENV ${WYSCOUT_USERNAME}
|
|
13
|
+
password: !ENV ${WYSCOUT_PASSWORD}
|
|
14
|
+
|
|
15
|
+
extract_jobs:
|
|
16
|
+
- source: wyscout
|
|
17
|
+
selectors:
|
|
18
|
+
- season_id: 188105
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This will contain the database
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from ingestify.main import get_datastore
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def main():
    """Load every dataset in the store with kloppy and print its event count."""
    store = get_datastore("config.yaml")
    for dataset in store.get_dataset_collection():
        kloppy_dataset = store.load_with_kloppy(dataset)
        print(f"Loaded dataset with {len(kloppy_dataset.records)} events")


if __name__ == "__main__":
    main()
|
ingestify/utils.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import inspect
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
import re
|
|
7
|
+
from multiprocessing import get_context, cpu_count, get_all_start_methods
|
|
8
|
+
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from string import Template
|
|
11
|
+
from typing import Dict, Generic, Type, TypeVar, Tuple, Optional, Any
|
|
12
|
+
|
|
13
|
+
import cloudpickle
|
|
14
|
+
from typing_extensions import Self
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
from itertools import islice
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def chunker(it, size):
    """Yield successive lists of at most *size* items drawn from *it*.

    The final chunk may be shorter; an empty iterable yields nothing.
    """
    stream = iter(it)
    while True:
        batch = list(islice(stream, size))
        if not batch:
            return
        yield batch
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def sanitize_exception_message(exception_message):
    """Mask credential-like fragments in an exception message.

    Replaces any ``:password@`` sequence (as found in connection URLs
    such as ``scheme://user:secret@host``) with ``:******@`` so secrets
    do not leak into logs or error output.
    """
    return re.sub(r":(\w+)@", ":******@", exception_message)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ComponentRegistry:
    """Registry of component classes, keyed by class name.

    Classes register themselves at definition time: set
    ``metaclass=registry.metaclass`` on a base class and every concrete
    subclass created afterwards is recorded in this registry instance.
    """

    def __init__(self):
        # Maps class name -> class object.
        self.__registered_components = {}

        # Metaclass bound to this registry instance via closure over `self`.
        class _Registered(abc.ABCMeta):
            def __new__(mcs, cls_name, bases, class_dict):
                # Expose the class name as a `name` attribute on the new class.
                class_dict["name"] = cls_name
                component_cls = super(_Registered, mcs).__new__(
                    mcs, cls_name, bases, class_dict
                )
                if not inspect.isabstract(component_cls):
                    # Concrete class: auto-register it under its own name.
                    self.register_component(cls_name, component_cls)
                else:
                    # Still abstract: only acceptable when it directly
                    # subclasses abc.ABC (i.e. it is an intended interface);
                    # otherwise the author likely forgot to implement some
                    # abstract methods on a would-be concrete class.
                    if bases[0] != abc.ABC:
                        raise Exception(
                            f"Class '{cls_name}' seems to be an concrete class, but missing some abstract methods"
                        )
                return component_cls

        self.__metaclass = _Registered

    @property
    def metaclass(self):
        """Metaclass that auto-registers concrete subclasses in this registry."""
        return self.__metaclass

    def register_component(self, cls_name, component_cls):
        """Record *component_cls* under *cls_name* (silently overwrites)."""
        self.__registered_components[cls_name] = component_cls

    def get_component(self, cls_name: str):
        """Return the class registered under *cls_name*; raises KeyError if unknown."""
        return self.__registered_components[cls_name]

    def get_supporting_component(self, **kwargs) -> str:
        """Return the name of the first registered class whose
        ``supports(**kwargs)`` returns truthy.

        Raises when any registered class lacks a ``supports`` attribute,
        or when no class supports the given kwargs (credentials in the
        kwargs are masked before being included in the error message).
        """
        for cls_name, class_ in self.__registered_components.items():
            if not hasattr(class_, "supports"):
                raise Exception(
                    f"Class '{cls_name}' does not implemented a 'supports' classmethod. "
                    f"This is required when using 'get_supporting_component'."
                )

            if class_.supports(**kwargs):
                return cls_name

        kwargs_str = sanitize_exception_message(str(kwargs))
        raise Exception(f"No supporting class found for {kwargs_str}")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
T = TypeVar("T")
R = TypeVar("R")


class ComponentFactory(Generic[T]):
    """Instantiates components registered in a ComponentRegistry by name."""

    def __init__(self, registry: "ComponentRegistry"):
        self.registry = registry

    @classmethod
    def build_factory(
        cls, component_cls: Type[R], registry: "ComponentRegistry"
    ) -> "ComponentFactory[R]":
        """Return a factory specialized for *component_cls*."""
        return cls[component_cls](registry)

    def build(self, cls_name, **kwargs) -> T:
        """Instantiate the component registered under *cls_name*.

        Uses the class's ``from_dict`` alternate constructor when it
        defines one, otherwise calls the class directly with ``**kwargs``.

        Raises:
            KeyError: when *cls_name* is not registered.
            TypeError: when the constructor rejects the given kwargs.
        """
        component_cls = self.registry.get_component(cls_name)
        # Bug fix: the previous EAFP form wrapped the *call* to from_dict in
        # `except AttributeError: pass`, so an AttributeError raised inside a
        # component's from_dict was silently swallowed and the factory fell
        # back to component_cls(**kwargs), masking real constructor bugs.
        # (It also contained a no-op `except TypeError as e: raise e`.)
        if hasattr(component_cls, "from_dict"):
            return component_cls.from_dict(**kwargs)
        return component_cls(**kwargs)

    def build_if_supports(self, **kwargs) -> T:
        """Build the first registered component whose ``supports()`` accepts kwargs."""
        cls_name = self.registry.get_supporting_component(**kwargs)
        return self.build(cls_name, **kwargs)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def key_from_dict(d: dict) -> str:
    """Build a canonical ``k=v/k=v`` key from *d*.

    Items are sorted by key so the result is insertion-order independent;
    keys starting with an underscore are considered internal and excluded.
    """
    return "/".join([f"{k}={v}" for k, v in sorted(d.items()) if not k.startswith("_")])


def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    # Idiomatic equivalent of datetime.fromtimestamp(time.time(), timezone.utc).
    return datetime.now(timezone.utc)


# Sentinel so AttributeBag can distinguish "attributes not passed" from an
# explicitly passed falsy value such as {} or None.
NOT_SET = object()


class AttributeBag:
    """Bag of keyword attributes with a canonical, order-independent key.

    Attribute names starting with ``_`` are internal: they are kept in
    ``attributes`` but excluded from ``key``, ``filtered_attributes``,
    ``__repr__`` and ``__str__``. Equality and hashing use ``key`` only,
    so internal attributes do not affect identity.
    """

    def __init__(self, attributes=NOT_SET, **kwargs):
        # Accept either a ready-made attributes mapping or plain kwargs.
        if attributes is not NOT_SET:
            self.attributes = attributes
        else:
            self.attributes = kwargs
        self.key = key_from_dict(self.attributes)

    def __getattr__(self, item):
        # Only invoked when normal attribute lookup fails; fall back to the bag.
        if item in self.__dict__:
            return self.__dict__[item]
        if "attributes" in self.__dict__ and item in self.attributes:
            return self.attributes[item]
        raise AttributeError(f"{item} not found")

    def items(self):
        return self.attributes.items()

    def format_string(self, string: str):
        """Substitute ``$name``-style placeholders in *string* from the attributes."""
        return Template(string).substitute(**self.attributes)

    def matches(self, attributes: Dict) -> bool:
        """True when *attributes* contains every one of our items (superset match)."""
        for k, v in self.attributes.items():
            if attributes.get(k) != v:
                return False
        return True

    @property
    def filtered_attributes(self):
        """Attributes without internal (``_``-prefixed) entries."""
        return {k: v for k, v in self.attributes.items() if not k.startswith("_")}

    def __eq__(self, other):
        if isinstance(other, AttributeBag):
            return self.key == other.key
        # Bug fix: previously fell through and implicitly returned None for
        # other types; NotImplemented lets Python try the reflected
        # comparison and fall back to identity, as the data model expects.
        return NotImplemented

    def __hash__(self):
        return hash(self.key)

    def __repr__(self):
        return f"{self.__class__.__name__}({', '.join([f'{k}={v}' for k, v in self.filtered_attributes.items()])})"

    def __str__(self):
        return "/".join([f"{k}={v}" for k, v in self.filtered_attributes.items()])

    @classmethod
    def create_from(cls, other: "AttributeBag", **kwargs):
        """New bag with *other*'s attributes, overridden/extended by *kwargs*."""
        _args = dict(**other.attributes)
        _args.update(kwargs)

        return cls(**_args)

    def split(self, attribute_name: str) -> Tuple[Optional[Any], "AttributeBag"]:
        """Return ``(value of attribute or None, new bag without that attribute)``.

        Annotation fix: the original declared ``Tuple[Self, Optional[Any]]``
        although the attribute value comes first and the new bag second.
        """
        return self.attributes.get(attribute_name), self.__class__(
            **{k: v for k, v in self.attributes.items() if k != attribute_name}
        )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def cloud_unpack_and_call(args):
    """Unpickle a cloudpickled callable and invoke it with its argument.

    *args* is a ``(pickled_function, argument)`` pair; used as the worker
    entry point so arbitrary closures survive the trip into a
    multiprocessing pool.
    """
    pickled_func, original_arg = args
    func = cloudpickle.loads(pickled_func)
    return func(original_arg)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def map_in_pool(func, iterable, processes=0):
    """Apply *func* to every item of *iterable* across a process pool.

    Runs serially in the current process when ``INGESTIFY_RUN_EAGER=true``.
    Pool size comes from *processes*, then ``INGESTIFY_CONCURRENCY``, then
    the CPU count. The callable is cloudpickled so closures/lambdas work.
    """
    # TODO: move to cmdline
    if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
        return [func(item) for item in iterable]

    if not processes:
        processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))

    # Prefer fork where the platform offers it; fall back to spawn.
    start_method = "fork" if "fork" in get_all_start_methods() else "spawn"
    ctx = get_context(start_method)

    pickled_func = cloudpickle.dumps(func)

    with ctx.Pool(processes or cpu_count()) as pool:
        return pool.map(
            cloud_unpack_and_call, ((pickled_func, item) for item in iterable)
        )
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class SyncPool:
    """Pool stand-in that executes every task inline in the current process."""

    def map(self, func, iterable):
        """Apply *func* to each item sequentially and return the results."""
        return list(map(func, iterable))

    def join(self):
        # Nothing to wait for; mirror the multiprocessing.Pool interface.
        return True

    def close(self):
        # Nothing to release; mirror the multiprocessing.Pool interface.
        return True
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class DummyPool:
    """Pool stand-in for dry runs: logs what would run, executes nothing."""

    def map(self, func, iterable):
        """Log the number of tasks that would have run; always returns None."""
        task_count = len(list(iterable))
        logger.info(f"DummyPool: not running {task_count} tasks")
        return None

    def join(self):
        # Nothing to wait for; mirror the multiprocessing.Pool interface.
        return True

    def close(self):
        # Nothing to release; mirror the multiprocessing.Pool interface.
        return True
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class TaskExecutor:
    """Runs batches of tasks on a pool chosen by execution mode.

    Pool selection:
      - ``dry_run=True``                -> DummyPool (log only, nothing runs)
      - ``INGESTIFY_RUN_EAGER=true``    -> SyncPool (inline, same process)
      - otherwise                       -> multiprocessing pool sized by
        *processes*, then ``INGESTIFY_CONCURRENCY``, then the CPU count.

    Usable as a context manager; exiting closes and joins the pool.
    """

    def __init__(self, processes=0, dry_run: bool = False):
        if dry_run:
            pool = DummyPool()
        elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
            pool = SyncPool()
        else:
            if not processes:
                processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))

            # Prefer fork where the platform offers it; fall back to spawn.
            if "fork" in get_all_start_methods():
                ctx = get_context("fork")
            else:
                ctx = get_context("spawn")

            pool = ctx.Pool(processes or cpu_count())
        self.pool = pool

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.join()

    def run(self, func, iterable):
        """Execute *func* over *iterable* on the pool and return the results.

        The callable is cloudpickled so closures/lambdas survive the trip
        into worker processes. Returns the list of results, or None when
        the pool is a DummyPool (dry run).
        """
        wrapped_fn = cloudpickle.dumps(func)
        start_time = time.time()
        res = self.pool.map(
            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
        )
        if res:
            took = time.time() - start_time
            logger.info(
                f"Finished {len(res)} tasks in {took:.1f} seconds. {(len(res)/took):.1f} tasks/sec"
            )
        # Bug fix: results were computed and logged but never returned,
        # which silently discarded every task's output for callers.
        return res

    def join(self):
        """Close the pool and wait for outstanding work to finish."""
        self.pool.close()
        self.pool.join()
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ingestify
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Standardizing soccer tracking- and event data
|
|
5
|
+
Author: Koen Vossen
|
|
6
|
+
Author-email: info@koenvossen.nl
|
|
7
|
+
License: AGPL
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: requests<3,>=2.0.0
|
|
10
|
+
Requires-Dist: SQLAlchemy
|
|
11
|
+
Requires-Dist: dataclass-factory
|
|
12
|
+
Requires-Dist: cloudpickle
|
|
13
|
+
Requires-Dist: click
|
|
14
|
+
Requires-Dist: jinja2
|
|
15
|
+
Requires-Dist: python-dotenv
|
|
16
|
+
Requires-Dist: pyaml-env
|
|
17
|
+
Requires-Dist: boto3
|
|
18
|
+
Requires-Dist: pytz
|
|
19
|
+
Provides-Extra: test
|
|
20
|
+
Requires-Dist: pytest<7,>=6.2.5; extra == "test"
|
|
21
|
+
|
|
22
|
+
# Ingestify
|
|
23
|
+
|
|
24
|
+
## Data Management Platform
|
|
25
|
+
|
|
26
|
+
In general a data management platform contains:
|
|
27
|
+
1. Ingestion of data (Extract from Source into Load into Data Lake)
|
|
28
|
+
2. Transformation of data (Extract from Data Lake, Transform and Load into Data Warehouse)
|
|
29
|
+
3. Utilization of data
|
|
30
|
+
|
|
31
|
+
<img src="https://www.getdbt.com/ui/img/blog/what-exactly-is-dbt/1-BogoeTTK1OXFU1hPfUyCFw.png" />
|
|
32
|
+
Source: https://www.getdbt.com/blog/what-exactly-is-dbt/
|
|
33
|
+
|
|
34
|
+
TODO: Improve drawings and explain more
|
|
35
|
+
|
|
36
|
+
## Ingestify
|
|
37
|
+
|
|
38
|
+
Ingestify focuses on the ingestion of data.
|
|
39
|
+
|
|
40
|
+
### How does Ingestify work?
|
|
41
|
+
|
|
42
|
+
1. A `Source` is asked for all available `Datasets` using the `discover_datasets` method
|
|
43
|
+
2. All available `Datasets` are compared with what's already fetched, and if it's changed (using a `FetchPolicy`)
|
|
44
|
+
3. A `TaskQueue` is filled with `Tasks` to fetch all missing or stale `Datasets`
|
|
45
|
+
|
|
46
|
+
<img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
|
|
47
|
+
|
|
48
|
+
- [Source](blob/main/ingestify/domain/models/source.py) is the main entrance from Ingestify to external sources. A Source must always define:
|
|
49
|
+
- `discover_datasets` - Creates a list of all available datasets on the Source
|
|
50
|
+
- `fetch_dataset_files` - Fetches a single dataset for a Source
|
|
51
|
+
- [Dataset Store](blob/main/ingestify/application/dataset_store.py) manages the access to the Metadata storage and the file storage. It keeps track of versions, and knows how to load data.
|
|
52
|
+
- [Loader](blob/main/ingestify/application/loader.py) organizes the fetching process. It does this by executing the following steps:
|
|
53
|
+
1. Ask `Source` for all available datasets for a selector
|
|
54
|
+
2. Ask `Dataset Store` for all available datasets for a selector
|
|
55
|
+
3. Determines missing `Datasets`
|
|
56
|
+
4. Create tasks for data retrieval and puts in `TaskQueue`
|
|
57
|
+
5. Use multiprocessing to execute all tasks
|
|
58
|
+
|
|
59
|
+
## Get started
|
|
60
|
+
|
|
61
|
+
### Install
|
|
62
|
+
|
|
63
|
+
Make sure you have installed the latest version:
|
|
64
|
+
```bash
|
|
65
|
+
pip install git+https://github.com/PySport/ingestify.git
|
|
66
|
+
|
|
67
|
+
# OR
|
|
68
|
+
|
|
69
|
+
pip install git+ssh://git@github.com/PySport/ingestify.git
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Using a template
|
|
73
|
+
|
|
74
|
+
Ingestify provides some templates to get started quickly. When using `ingestify init` a new project will be created and example files are copied.
|
|
75
|
+
Currently, Ingestify offers a `statsbomb_github` and `wyscout` template.
|
|
76
|
+
|
|
77
|
+
#### Statsbomb Github
|
|
78
|
+
|
|
79
|
+
This uses https://github.com/statsbomb/open-data as source and syncs some competitions.
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
bash# ingestify init --template statsbomb_github /tmp/ingestify-test
|
|
83
|
+
|
|
84
|
+
2023-05-23 08:57:51,250 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `statsbomb_github`
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
#### Wyscout
|
|
88
|
+
|
|
89
|
+
This requires valid Wyscout credentials. The template includes some security best practices, such as keeping credentials in a `.env` file that isn't part of version control.
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
bash# ingestify init --template wyscout /tmp/ingestify-test
|
|
93
|
+
|
|
94
|
+
2023-05-23 08:58:18,720 [INFO] ingestify.cmdline: Initialized project at `/tmp/ingestify-test` with template `wyscout`
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Running Ingestify
|
|
98
|
+
|
|
99
|
+
To actually run Ingestify you first change the current directory to the project directory.
|
|
100
|
+
|
|
101
|
+
Then run:
|
|
102
|
+
```bash
|
|
103
|
+
bash# ingestify run
|
|
104
|
+
|
|
105
|
+
2023-05-23 08:59:07,066 [INFO] ingestify.main: Initializing sources
|
|
106
|
+
2023-05-23 08:59:07,068 [INFO] ingestify.main: Initializing IngestionEngine
|
|
107
|
+
2023-05-23 08:59:07,086 [INFO] ingestify.main: Determining tasks...
|
|
108
|
+
2023-05-23 08:59:07,364 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 33 tasks. 0 skipped.
|
|
109
|
+
2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 35 tasks. 0 skipped.
|
|
110
|
+
2023-05-23 08:59:07,625 [INFO] ingestify.application.loader: Scheduled 68 tasks. With 10 processes
|
|
111
|
+
2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303516)
|
|
112
|
+
2023-05-23 08:59:07,654 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303731)
|
|
113
|
+
2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303430)
|
|
114
|
+
2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303504)
|
|
115
|
+
2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303421)
|
|
116
|
+
2023-05-23 08:59:07,655 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303400)
|
|
117
|
+
2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303664)
|
|
118
|
+
2023-05-23 08:59:07,656 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303680)
|
|
119
|
+
2023-05-23 08:59:07,657 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303487)
|
|
120
|
+
2023-05-23 08:59:07,658 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303615)
|
|
121
|
+
2023-05-23 08:59:08,419 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303532)
|
|
122
|
+
2023-05-23 08:59:08,421 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303682)
|
|
123
|
+
2023-05-23 08:59:08,444 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303451)
|
|
124
|
+
2023-05-23 08:59:08,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303596)
|
|
125
|
+
2023-05-23 08:59:08,518 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303634)
|
|
126
|
+
2023-05-23 08:59:08,528 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303479)
|
|
127
|
+
2023-05-23 08:59:08,541 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303696)
|
|
128
|
+
2023-05-23 08:59:08,638 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303725)
|
|
129
|
+
2023-05-23 08:59:08,684 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303600)
|
|
130
|
+
2023-05-23 08:59:08,962 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303493)
|
|
131
|
+
2023-05-23 08:59:09,270 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303548)
|
|
132
|
+
2023-05-23 08:59:09,276 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303674)
|
|
133
|
+
2023-05-23 08:59:09,292 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303700)
|
|
134
|
+
2023-05-23 08:59:09,332 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303666)
|
|
135
|
+
2023-05-23 08:59:09,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303377)
|
|
136
|
+
2023-05-23 08:59:09,462 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303517)
|
|
137
|
+
2023-05-23 08:59:09,491 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303473)
|
|
138
|
+
2023-05-23 08:59:09,511 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773631)
|
|
139
|
+
2023-05-23 08:59:09,726 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773497)
|
|
140
|
+
2023-05-23 08:59:09,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773593)
|
|
141
|
+
2023-05-23 08:59:09,957 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303652)
|
|
142
|
+
2023-05-23 08:59:09,999 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303715)
|
|
143
|
+
2023-05-23 08:59:10,075 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303470)
|
|
144
|
+
2023-05-23 08:59:10,103 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303707)
|
|
145
|
+
2023-05-23 08:59:10,188 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773457)
|
|
146
|
+
2023-05-23 08:59:10,248 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303524)
|
|
147
|
+
2023-05-23 08:59:10,282 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773665)
|
|
148
|
+
2023-05-23 08:59:10,411 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=42/match_id=303610)
|
|
149
|
+
2023-05-23 08:59:10,563 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773466)
|
|
150
|
+
2023-05-23 08:59:10,711 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773585)
|
|
151
|
+
2023-05-23 08:59:10,768 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773672)
|
|
152
|
+
2023-05-23 08:59:10,778 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773565)
|
|
153
|
+
2023-05-23 08:59:10,867 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773660)
|
|
154
|
+
2023-05-23 08:59:10,954 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773656)
|
|
155
|
+
2023-05-23 08:59:10,974 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773586)
|
|
156
|
+
2023-05-23 08:59:11,026 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773387)
|
|
157
|
+
2023-05-23 08:59:11,136 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773369)
|
|
158
|
+
2023-05-23 08:59:11,438 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773552)
|
|
159
|
+
2023-05-23 08:59:11,515 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773597)
|
|
160
|
+
2023-05-23 08:59:11,586 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773571)
|
|
161
|
+
2023-05-23 08:59:11,610 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773587)
|
|
162
|
+
2023-05-23 08:59:11,690 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773386)
|
|
163
|
+
2023-05-23 08:59:11,727 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773377)
|
|
164
|
+
2023-05-23 08:59:11,757 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773372)
|
|
165
|
+
2023-05-23 08:59:11,899 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764661)
|
|
166
|
+
2023-05-23 08:59:11,901 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773695)
|
|
167
|
+
2023-05-23 08:59:12,006 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773661)
|
|
168
|
+
2023-05-23 08:59:12,186 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773474)
|
|
169
|
+
2023-05-23 08:59:12,283 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773523)
|
|
170
|
+
2023-05-23 08:59:12,339 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773403)
|
|
171
|
+
2023-05-23 08:59:12,426 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773428)
|
|
172
|
+
2023-05-23 08:59:12,582 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773415)
|
|
173
|
+
2023-05-23 08:59:12,583 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773689)
|
|
174
|
+
2023-05-23 08:59:12,705 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773526)
|
|
175
|
+
2023-05-23 08:59:13,510 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773477)
|
|
176
|
+
2023-05-23 08:59:13,538 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3764440)
|
|
177
|
+
2023-05-23 08:59:13,592 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773625)
|
|
178
|
+
2023-05-23 08:59:15,017 [INFO] ingestify.application.loader: Running task CreateDatasetTask(StatsbombGithub -> competition_id=11/season_id=90/match_id=3773547)
|
|
179
|
+
2023-05-23 08:59:15,917 [INFO] ingestify.cmdline: Done
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
When we run it for the second time:
|
|
183
|
+
```bash
|
|
184
|
+
bash# ingestify run
|
|
185
|
+
|
|
186
|
+
2023-05-23 08:59:48,001 [INFO] ingestify.main: Initializing sources
|
|
187
|
+
2023-05-23 08:59:48,002 [INFO] ingestify.main: Initializing IngestionEngine
|
|
188
|
+
2023-05-23 08:59:48,006 [INFO] ingestify.main: Determining tasks...
|
|
189
|
+
2023-05-23 08:59:48,067 [INFO] ingestify.application.loader: Discovered 33 datasets from StatsbombGithub using selector competition_id=11/season_id=42 => 0 tasks. 33 skipped.
|
|
190
|
+
2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Discovered 35 datasets from StatsbombGithub using selector competition_id=11/season_id=90 => 0 tasks. 35 skipped.
|
|
191
|
+
2023-05-23 08:59:48,118 [INFO] ingestify.application.loader: Nothing to do.
|
|
192
|
+
2023-05-23 08:59:48,119 [INFO] ingestify.cmdline: Done
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Using the data
|
|
196
|
+
|
|
197
|
+
The project contains a `query.py` file with an example of how to use the data.
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
bash# python query.py
|
|
201
|
+
|
|
202
|
+
Loaded dataset with 3702 events
|
|
203
|
+
Loaded dataset with 3994 events
|
|
204
|
+
Loaded dataset with 3831 events
|
|
205
|
+
Loaded dataset with 3647 events
|
|
206
|
+
Loaded dataset with 4062 events
|
|
207
|
+
Loaded dataset with 4051 events
|
|
208
|
+
|
|
209
|
+
.....
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
How to go from raw data to parquet files:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from ingestify.main import get_datastore
|
|
218
|
+
|
|
219
|
+
store = get_datastore("config.yaml")
|
|
220
|
+
|
|
221
|
+
dataset_collection = store.get_dataset_collection(
|
|
222
|
+
provider="statsbomb", stage="raw"
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
# Store.map is using multiprocessing by default
|
|
226
|
+
store.map(
|
|
227
|
+
lambda dataset: (
|
|
228
|
+
store
|
|
229
|
+
|
|
230
|
+
# As it's related to https://github.com/PySport/kloppy, the store can load files using kloppy
|
|
231
|
+
.load_with_kloppy(dataset)
|
|
232
|
+
|
|
233
|
+
# Convert it into a polars dataframe using all columns in the original data and some more additional ones
|
|
234
|
+
.to_df(
|
|
235
|
+
"*",
|
|
236
|
+
match_id=dataset.identifier.match_id,
|
|
237
|
+
competition_id=dataset.identifier.competition_id,
|
|
238
|
+
season_id=dataset.identifier.season_id,
|
|
239
|
+
|
|
240
|
+
engine="polars"
|
|
241
|
+
)
|
|
242
|
+
|
|
243
|
+
# Write to parquet format
|
|
244
|
+
.write_parquet(
|
|
245
|
+
f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
|
|
246
|
+
)
|
|
247
|
+
),
|
|
248
|
+
dataset_collection,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# TODO:
|
|
252
|
+
# - when a file is written in parquet format (or any other format) it should be added as such to the store.
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
## Future work
|
|
257
|
+
|
|
258
|
+
Some future work includes:
|
|
259
|
+
- Workflow tools - Run custom workflows with tools like [Airflow](https://airflow.apache.org/), [Dagster](https://docs.dagster.io/getting-started), [Prefect](https://www.prefect.io/), [DBT](https://www.getdbt.com/)
|
|
260
|
+
- Execution engines - Run tasks on other execution engines like [AWS Lambda](https://aws.amazon.com/lambda/), [Dask](https://www.dask.org/)
|
|
261
|
+
- Lineage - Keep track of lineage with tools like [SQLLineage](https://sqllineage.readthedocs.io/en/latest/index.html)
|
|
262
|
+
- Data quality - Monitor data quality with tools like [Great Expectations](https://docs.greatexpectations.io/docs/tutorials/quickstart/)
|
|
263
|
+
- Event Bus - Automatically publish events to external systems like [AWS Event Bridge](https://aws.amazon.com/eventbridge/), [Azure Event Grid](https://learn.microsoft.com/en-us/azure/event-grid/overview), [Google Cloud Pub/Sub](https://cloud.google.com/pubsub/docs/overview), [Kafka](https://kafka.apache.org/), [RabbitMQ](https://www.rabbitmq.com/)
|
|
264
|
+
- Query Engines - Integrate with query engines to run SQL queries directly on the store using tools like [DuckDB](https://duckdb.org/), [DataBend](https://databend.rs/), [DataFusion](https://arrow.apache.org/datafusion/), [Polars](https://www.pola.rs/), [Spark](https://spark.apache.org/)
|
|
265
|
+
- Streaming Data - Ingest streaming data
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
ingestify/__init__.py,sha256=DdhKleT3RggJUMj5Auq_ImGLmKm-3HHs5Yerx_VsH_w,301
|
|
2
|
+
ingestify/cmdline.py,sha256=gLy79Cq3OnEyoEcI6koWIEbCwvgFZ1E8n3UU1sKS8FM,7143
|
|
3
|
+
ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
|
|
4
|
+
ingestify/main.py,sha256=YjrAOiGzwurtoDyIf981DSJHHA6IT5q09k3QNzTKCC8,6814
|
|
5
|
+
ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
|
|
6
|
+
ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
|
|
7
|
+
ingestify/utils.py,sha256=eEHwulqNEb2YTRDrCMVxr6mWZYI6KOcNCAIWFTi74u0,8029
|
|
8
|
+
ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
ingestify/application/dataset_store.py,sha256=NAW-XSvp118Lr2hXZd3qtuQr6VkPdWCLksIwd5MSs30,11489
|
|
10
|
+
ingestify/application/ingestion_engine.py,sha256=GYIhb8a9ePkEcNOBPdfu-YawiD7eRZMRlxCA-6g9DRA,2249
|
|
11
|
+
ingestify/application/loader.py,sha256=d7iXmdHN_yhDkEc2MMZ_6BLMEdRz9ChpBMy4yCWvxQo,13317
|
|
12
|
+
ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
|
|
13
|
+
ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
|
|
14
|
+
ingestify/domain/models/__init__.py,sha256=xHVQZP57ZQYUKwAtbccnDKX89_yTOvBKAtn4XDVbEbY,930
|
|
15
|
+
ingestify/domain/models/data_spec_version_collection.py,sha256=qjEM6-gt-Uf5orQlv64P6NJCEdWiUPX2oTZv8cC-KVY,1203
|
|
16
|
+
ingestify/domain/models/extract_job.py,sha256=yXrlF2Vt5hxB1Vo9CicpgyW5rjvJaEPfSiMzaAqhqB0,624
|
|
17
|
+
ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
|
|
18
|
+
ingestify/domain/models/sink.py,sha256=AieqDQ76Vj7WGxCrl3-F93AKe-VBfoPHtMNH28GTQM4,384
|
|
19
|
+
ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
|
|
20
|
+
ingestify/domain/models/dataset/__init__.py,sha256=kSn3XZo0o-D0WzMb2VDxhOXw9Rr9jvS-8fkHdOnrccU,748
|
|
21
|
+
ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
|
|
22
|
+
ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
|
|
23
|
+
ingestify/domain/models/dataset/dataset.py,sha256=m0iVJPXd1KOAHbDg7fmY_7MCdrKQaILUekIWUfo5pXI,2893
|
|
24
|
+
ingestify/domain/models/dataset/dataset_repository.py,sha256=eiloP5msmDau4WRHee8gA7pLoH_ca2JXAhPx9UecPIA,1185
|
|
25
|
+
ingestify/domain/models/dataset/events.py,sha256=x4l_pdzBHbemE_722EyCYXzWy9t8IcTx5j-wNFxWs6o,708
|
|
26
|
+
ingestify/domain/models/dataset/file.py,sha256=O-yJom9dr13PaHfmc_4crtSa9B1Q9iruHsnf-m01McU,3943
|
|
27
|
+
ingestify/domain/models/dataset/file_collection.py,sha256=V5wh2aSc61UA4HWcHi9PvyQUIUvssDRkaPVe2YR6XwU,1140
|
|
28
|
+
ingestify/domain/models/dataset/file_repository.py,sha256=lxf3Dh8e-_67dRspMZHT1DZ79IWW_vlvb3z8lKjypj4,1514
|
|
29
|
+
ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
|
|
30
|
+
ingestify/domain/models/dataset/revision.py,sha256=fiHnd_mad0iYmNCGswKImUHpauhIf2gW_ukztDFVP48,781
|
|
31
|
+
ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
|
|
32
|
+
ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
|
|
33
|
+
ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
|
|
34
|
+
ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
|
|
35
|
+
ingestify/domain/models/event/domain_event.py,sha256=a5nNNwDWSAqou8aSBGIEA6aQOHTOxYyMEUXB91fYUIM,187
|
|
36
|
+
ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmceWLstOxiP3-2qU,576
|
|
37
|
+
ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
|
|
38
|
+
ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
|
|
39
|
+
ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
|
|
40
|
+
ingestify/domain/models/resources/dataset_resource.py,sha256=g0tu9QZQEdAGR-dRXQPL3ddcbEEGI__pvkDJGoscUTE,3027
|
|
41
|
+
ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
|
|
42
|
+
ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
|
|
43
|
+
ingestify/domain/models/task/task.py,sha256=R6tEZub-N_Wjl4VjwlPySdFb3L9D7nH4St2CcDzFoKA,107
|
|
44
|
+
ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
|
+
ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
+
ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
|
|
47
|
+
ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
|
|
48
|
+
ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
|
+
ingestify/infra/fetch/http.py,sha256=gm7x0dACp3sTY1FMlbv8zRoQLZuZgtXmBg3HbhQ0syI,3086
|
|
50
|
+
ingestify/infra/serialization/__init__.py,sha256=LwfmRoO4qykZkJZXxVPSKpwoVIkg9qzXa7Egut9JjL4,1772
|
|
51
|
+
ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
52
|
+
ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
|
|
53
|
+
ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
+
ingestify/infra/source/statsbomb_github.py,sha256=CuHZoJn6fU8ZKQl4f1-gyaVYsmxL6R33n0cbOx1jQmI,2895
|
|
55
|
+
ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nOUGxE,5626
|
|
56
|
+
ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
|
|
57
|
+
ingestify/infra/store/dataset/__init__.py,sha256=8oVJFiA-IKccrEpiYxAmSc65dfpNut7PYx8PUhylmdU,113
|
|
58
|
+
ingestify/infra/store/dataset/local_dataset_repository.py,sha256=UMgSe1M9u_629V4WyuTJ-QegZJiDczzMo7vkNbNleqA,2064
|
|
59
|
+
ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
|
|
60
|
+
ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=Q7Od3zBnoZgxE5aThdZE93waWeKVut9dstrCnEYb9nc,3981
|
|
61
|
+
ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=ynoIVMVD0_w9aa2hFKkcLxRKzJDoET_SNfGHXPIoN40,7067
|
|
62
|
+
ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
|
|
63
|
+
ingestify/infra/store/file/local_file_repository.py,sha256=0oIzjjKO5U_7gPXhsBJFUqQBarQTFQS499ZK7HNxMxo,893
|
|
64
|
+
ingestify/infra/store/file/s3_file_repository.py,sha256=txDviBrY9EHn3soqLFvTrjSPkyh548RxUgx4T83j0QY,1331
|
|
65
|
+
ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
+
ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
|
|
67
|
+
ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
|
|
68
|
+
ingestify/static/templates/statsbomb_github/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
|
|
69
|
+
ingestify/static/templates/wyscout/.env,sha256=o2kfuDC_seZNIqDscPf2Ww5TGiJmLh_DMOUNykGvs8Q,141
|
|
70
|
+
ingestify/static/templates/wyscout/.gitignore,sha256=db0A2IjIeZf5fLLwXKD-bLmC4pETofxm848bljymnNs,13
|
|
71
|
+
ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
72
|
+
ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
|
|
73
|
+
ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
|
|
74
|
+
ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
|
|
75
|
+
ingestify-0.1.0.dist-info/METADATA,sha256=ryin_4RwMcyvqa4l6nESqysjlutcffHHWwoBvcwU784,18822
|
|
76
|
+
ingestify-0.1.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
|
77
|
+
ingestify-0.1.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
|
|
78
|
+
ingestify-0.1.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
|
|
79
|
+
ingestify-0.1.0.dist-info/RECORD,,
|