ingestify-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +11 -0
- ingestify/application/__init__.py +0 -0
- ingestify/application/dataset_store.py +339 -0
- ingestify/application/ingestion_engine.py +62 -0
- ingestify/application/loader.py +329 -0
- ingestify/application/secrets_manager.py +53 -0
- ingestify/cmdline.py +283 -0
- ingestify/domain/__init__.py +2 -0
- ingestify/domain/models/__init__.py +45 -0
- ingestify/domain/models/data_spec_version_collection.py +33 -0
- ingestify/domain/models/dataset/__init__.py +27 -0
- ingestify/domain/models/dataset/collection.py +44 -0
- ingestify/domain/models/dataset/collection_metadata.py +13 -0
- ingestify/domain/models/dataset/dataset.py +104 -0
- ingestify/domain/models/dataset/dataset_repository.py +46 -0
- ingestify/domain/models/dataset/events.py +31 -0
- ingestify/domain/models/dataset/file.py +146 -0
- ingestify/domain/models/dataset/file_collection.py +35 -0
- ingestify/domain/models/dataset/file_repository.py +59 -0
- ingestify/domain/models/dataset/identifier.py +24 -0
- ingestify/domain/models/dataset/revision.py +29 -0
- ingestify/domain/models/dataset/selector.py +37 -0
- ingestify/domain/models/event/__init__.py +4 -0
- ingestify/domain/models/event/_old_event.py +21 -0
- ingestify/domain/models/event/dispatcher.py +8 -0
- ingestify/domain/models/event/domain_event.py +10 -0
- ingestify/domain/models/event/event_bus.py +24 -0
- ingestify/domain/models/event/publisher.py +23 -0
- ingestify/domain/models/event/subscriber.py +39 -0
- ingestify/domain/models/extract_job.py +23 -0
- ingestify/domain/models/fetch_policy.py +40 -0
- ingestify/domain/models/resources/__init__.py +1 -0
- ingestify/domain/models/resources/dataset_resource.py +99 -0
- ingestify/domain/models/sink.py +16 -0
- ingestify/domain/models/source.py +34 -0
- ingestify/domain/models/task/__init__.py +4 -0
- ingestify/domain/models/task/set.py +21 -0
- ingestify/domain/models/task/task.py +7 -0
- ingestify/domain/services/__init__.py +0 -0
- ingestify/domain/services/transformers/__init__.py +0 -0
- ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
- ingestify/exceptions.py +10 -0
- ingestify/infra/__init__.py +4 -0
- ingestify/infra/fetch/__init__.py +0 -0
- ingestify/infra/fetch/http.py +100 -0
- ingestify/infra/serialization/__init__.py +50 -0
- ingestify/infra/sink/__init__.py +0 -0
- ingestify/infra/sink/postgresql.py +50 -0
- ingestify/infra/source/__init__.py +0 -0
- ingestify/infra/source/statsbomb_github.py +92 -0
- ingestify/infra/source/wyscout.py +175 -0
- ingestify/infra/store/__init__.py +2 -0
- ingestify/infra/store/dataset/__init__.py +2 -0
- ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
- ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
- ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
- ingestify/infra/store/file/__init__.py +2 -0
- ingestify/infra/store/file/local_file_repository.py +32 -0
- ingestify/infra/store/file/s3_file_repository.py +50 -0
- ingestify/main.py +205 -0
- ingestify/server.py +78 -0
- ingestify/source_base.py +23 -0
- ingestify/static/templates/statsbomb_github/README.md +0 -0
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
- ingestify/static/templates/statsbomb_github/database/README.md +1 -0
- ingestify/static/templates/statsbomb_github/query.py +14 -0
- ingestify/static/templates/wyscout/.env +5 -0
- ingestify/static/templates/wyscout/.gitignore +2 -0
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
- ingestify/static/templates/wyscout/database/README.md +1 -0
- ingestify/static/templates/wyscout/query.py +14 -0
- ingestify/utils.py +276 -0
- ingestify-0.1.0.dist-info/METADATA +265 -0
- ingestify-0.1.0.dist-info/RECORD +79 -0
- ingestify-0.1.0.dist-info/WHEEL +5 -0
- ingestify-0.1.0.dist-info/entry_points.txt +2 -0
- ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/infra/store/dataset/sqlalchemy/repository.py
ADDED
@@ -0,0 +1,239 @@
import json
import uuid
from typing import Optional, Union, List

from sqlalchemy import create_engine, func, text, tuple_
from sqlalchemy.engine import make_url
from sqlalchemy.exc import NoSuchModuleError
from sqlalchemy.orm import Session, joinedload

from ingestify.domain import File
from ingestify.domain.models import (
    Dataset,
    DatasetCollection,
    DatasetRepository,
    Identifier,
    Selector,
)
from ingestify.domain.models.dataset.collection_metadata import (
    DatasetCollectionMetadata,
)

from .mapping import dataset_table, metadata


def parse_value(v):
    try:
        return int(v)
    except ValueError:
        return v


def json_serializer(o):
    return json.dumps(o)


def json_deserializer(o):
    o = json.loads(o)
    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
    o = Identifier(**o)
    return o


# @compiles(DateTime, "mysql")
# def compile_datetime_mysql(type_, compiler, **kw):
#     return "DATETIME(6)"


def isfloat(x):
    try:
        a = float(x)
    except (TypeError, ValueError):
        return False
    else:
        return True


def isint(x):
    try:
        a = float(x)
        b = int(a)
    except (TypeError, ValueError):
        return False
    else:
        return a == b


class SqlAlchemyDatasetRepository(DatasetRepository):
    @staticmethod
    def fix_url(url: str) -> str:
        if url.startswith("postgres://"):
            url = url.replace("postgres://", "postgresql://")
        return url

    @classmethod
    def supports(cls, url: str) -> bool:
        url = cls.fix_url(url)

        _url = make_url(url)
        try:
            _url.get_dialect()
        except NoSuchModuleError:
            return False
        return True

    def _init_engine(self):
        self.engine = create_engine(
            self.url,
            # Use the default isolation level, don't need SERIALIZABLE
            # isolation_level="SERIALIZABLE",
            json_serializer=json_serializer,
            json_deserializer=json_deserializer,
        )
        self.session = Session(bind=self.engine)

    def __init__(self, url: str):
        url = self.fix_url(url)

        self.url = url
        self._init_engine()

        metadata.create_all(self.engine)

    def __getstate__(self):
        return {"url": self.url}

    def __setstate__(self, state):
        self.url = state["url"]
        self._init_engine()

    def __del__(self):
        self.session.close()
        self.engine.dispose()

    def _filter_query(
        self,
        query,
        bucket: str,
        dataset_type: Optional[str] = None,
        provider: Optional[str] = None,
        dataset_id: Optional[Union[str, List[str]]] = None,
        selector: Optional[Union[Selector, List[Selector]]] = None,
    ):
        query = query.filter(Dataset.bucket == bucket)
        if dataset_type:
            query = query.filter(Dataset.dataset_type == dataset_type)
        if provider:
            query = query.filter(Dataset.provider == provider)
        if dataset_id is not None:
            if isinstance(dataset_id, list):
                if len(dataset_id) == 0:
                    # When an empty list is explicitly passed, make sure we
                    # return an empty DatasetCollection
                    return DatasetCollection()

                query = query.filter(Dataset.dataset_id.in_(dataset_id))
            else:
                query = query.filter(Dataset.dataset_id == dataset_id)

        dialect = self.session.bind.dialect.name

        if not isinstance(selector, list):
            where, selector = selector.split("where")
        else:
            where = None

        if selector:
            if isinstance(selector, list):
                selectors = selector
            else:
                selectors = [selector]

            if not selectors:
                raise ValueError("Selectors must contain at least one item")

            keys = list(selectors[0].filtered_attributes.keys())

            columns = []
            first_selector = selectors[0].filtered_attributes

            # Create a query like this:
            # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
            for k in keys:
                if dialect == "postgresql":
                    column = dataset_table.c.identifier[k]

                    # Take the value from the first selector to determine the type.
                    # TODO: check all selectors to determine the type
                    v = first_selector[k]
                    if isint(v):
                        column = column.as_integer()
                    elif isfloat(v):
                        column = column.as_float()
                    else:
                        column = column.as_string()
                else:
                    column = func.json_extract(Dataset.identifier, f"$.{k}")
                columns.append(column)

            values = []
            for selector in selectors:
                filtered_attributes = selector.filtered_attributes
                values.append(tuple([filtered_attributes[k] for k in keys]))

            query = query.filter(tuple_(*columns).in_(values))

        if where:
            query = query.filter(text(where))
        return query

    def get_dataset_collection(
        self,
        bucket: str,
        dataset_type: Optional[str] = None,
        provider: Optional[str] = None,
        dataset_id: Optional[Union[str, List[str]]] = None,
        selector: Optional[Union[Selector, List[Selector]]] = None,
        metadata_only: bool = False,
    ) -> DatasetCollection:
        def apply_query_filter(query):
            return self._filter_query(
                query,
                bucket=bucket,
                dataset_type=dataset_type,
                provider=provider,
                dataset_id=dataset_id,
                selector=selector,
            )

        if not metadata_only:
            dataset_query = apply_query_filter(
                self.session.query(Dataset).options(joinedload(Dataset.revisions))
            )
            datasets = list(dataset_query)
        else:
            datasets = []

        metadata_result_row = apply_query_filter(
            self.session.query(
                func.min(File.modified_at).label("first_modified_at"),
                func.max(File.modified_at).label("last_modified_at"),
                func.count().label("row_count"),
            ).join(Dataset, Dataset.dataset_id == File.dataset_id)
        ).first()
        dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)

        return DatasetCollection(dataset_collection_metadata, datasets)

    def save(self, bucket: str, dataset: Dataset):
        # Just make sure
        dataset.bucket = bucket
        self.session.add(dataset)
        self.session.commit()

    def destroy(self, dataset: Dataset):
        self.session.delete(dataset)
        self.session.commit()

    def next_identity(self):
        return str(uuid.uuid4())
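For orientation, a minimal sketch of driving SqlAlchemyDatasetRepository on its own, based only on the methods shown above; the SQLite URL, bucket, provider and selector values are illustrative, and Selector.build / DataSpecVersionCollection.from_dict are used the way ingestify/main.py (further below) uses them.

# Sketch only: illustrative URL, bucket, provider and selector values.
from ingestify.domain import Selector
from ingestify.domain.models.data_spec_version_collection import DataSpecVersionCollection
from ingestify.infra.store.dataset.sqlalchemy.repository import SqlAlchemyDatasetRepository

url = "sqlite:///catalog.db"
assert SqlAlchemyDatasetRepository.supports(url)
repo = SqlAlchemyDatasetRepository(url)  # metadata.create_all() runs in __init__

data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
selector = Selector.build(
    {"competition_id": 11, "season_id": 90}, data_spec_versions=data_spec_versions
)

# Passing selectors as a list makes _filter_query build a tuple-IN filter over the
# JSON identifier column, e.g. (identifier.competition_id, identifier.season_id) IN ((11, 90)).
collection = repo.get_dataset_collection(
    bucket="main", provider="statsbomb", selector=[selector]
)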
ingestify/infra/store/file/local_file_repository.py
ADDED
@@ -0,0 +1,32 @@
import os
import shutil
from pathlib import Path
from typing import IO, AnyStr, BinaryIO

from ingestify.domain.models import Dataset, FileRepository


class LocalFileRepository(FileRepository):
    @classmethod
    def supports(cls, url: str) -> bool:
        return url.startswith("file://")

    def save_content(
        self,
        bucket: str,
        dataset: Dataset,
        revision_id: int,
        filename: str,
        stream: BinaryIO,
    ) -> Path:
        path = self.get_path(bucket, dataset, revision_id, filename)
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, "wb") as fp:
            shutil.copyfileobj(stream, fp)
        return path

    def load_content(
        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
    ) -> BinaryIO:
        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
ingestify/infra/store/file/s3_file_repository.py
ADDED
@@ -0,0 +1,50 @@
from pathlib import Path
from typing import BinaryIO

import boto3 as boto3

from ingestify.domain import Dataset
from ingestify.domain.models import FileRepository


class S3FileRepository(FileRepository):
    def __init__(self, url):
        super().__init__(url)

        self._s3 = None

    @property
    def s3(self):
        if not self._s3:
            self._s3 = boto3.resource("s3")
        return self._s3

    def __getstate__(self):
        return {"base_dir": self.base_dir, "_s3": None}

    def save_content(
        self,
        bucket: str,
        dataset: Dataset,
        revision_id: int,
        filename: str,
        stream: BinaryIO,
    ) -> Path:
        key = self.get_path(bucket, dataset, revision_id, filename)
        s3_bucket = Path(key.parts[0])

        self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
        return key

    def load_content(
        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
    ) -> BinaryIO:
        key = self.get_path(bucket, dataset, revision_id, filename)
        s3_bucket = Path(key.parts[0])
        return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
            "Body"
        ]

    @classmethod
    def supports(cls, url: str) -> bool:
        return url.startswith("s3://")
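Both file repositories expose the same supports()/save_content()/load_content() surface, which is what lets file_repository_factory.build_if_supports (used in ingestify/main.py below) pick an implementation from the file_url scheme. A minimal sketch of that selection follows; it is not the package's actual factory, the URLs are illustrative, and FileRepository.__init__ is assumed to accept the repository URL, as S3FileRepository's super().__init__(url) suggests.

# Sketch only, not the package's file_repository_factory; illustrative URLs.
from ingestify.infra.store.file.local_file_repository import LocalFileRepository
from ingestify.infra.store.file.s3_file_repository import S3FileRepository


def pick_file_repository(url: str):
    # Return an instance of the first repository class whose supports()
    # accepts the URL scheme.
    for cls in (LocalFileRepository, S3FileRepository):
        if cls.supports(url):
            return cls(url)
    raise ValueError(f"No FileRepository supports '{url}'")


repo = pick_file_repository("file:///tmp/ingestify-files")  # -> LocalFileRepository
# pick_file_repository("s3://my-bucket/ingestify")          # -> S3FileRepository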
ingestify/main.py
ADDED
@@ -0,0 +1,205 @@
import importlib
import logging
import os
import sys
from itertools import product
from typing import Optional, Type

from pyaml_env import parse_config

from ingestify import Source
from ingestify.application.dataset_store import DatasetStore
from ingestify.application.ingestion_engine import IngestionEngine
from ingestify.application.secrets_manager import SecretsManager
from ingestify.domain import Selector
from ingestify.domain.models import (
    dataset_repository_factory,
    file_repository_factory,
)
from ingestify.domain.models.data_spec_version_collection import (
    DataSpecVersionCollection,
)
from ingestify.domain.models.event import EventBus, Publisher, Subscriber

from ingestify.domain.models.extract_job import ExtractJob
from ingestify.domain.models.fetch_policy import FetchPolicy
from ingestify.exceptions import ConfigurationError

logger = logging.getLogger(__name__)

secrets_manager = SecretsManager()


def _product_selectors(selector_args):
    if not selector_args:
        # Empty selector passed. This is a special case when
        # a Source doesn't have discover_selectors but also doesn't require
        # selectors
        yield dict()
        return

    if isinstance(selector_args, str):
        if selector_args == "*":
            yield lambda dict_selector: True
        else:
            yield lambda dict_selector: eval(selector_args, {}, dict_selector)
        return

    selector_args_ = {
        k: v if isinstance(v, list) else [v] for k, v in selector_args.items()
    }
    keys, values = zip(*selector_args_.items())
    for bundle in product(*values):
        yield dict(zip(keys, bundle))


def import_cls(name):
    components = name.split(".")
    mod = importlib.import_module(".".join(components[:-1]))
    return getattr(mod, components[-1])


def get_dataset_store_by_urls(
    dataset_url: str, file_url: str, bucket: str
) -> DatasetStore:
    """
    Initialize a DatasetStore by a DatasetRepository and a FileRepository
    """
    if not bucket:
        raise Exception("Bucket is not specified")

    file_repository = file_repository_factory.build_if_supports(url=file_url)

    if secrets_manager.supports(dataset_url):
        dataset_url = secrets_manager.load_as_db_url(dataset_url)

    if dataset_url.startswith("postgres://"):
        dataset_url = dataset_url.replace("postgress://", "postgress+")

    dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
    return DatasetStore(
        dataset_repository=dataset_repository,
        file_repository=file_repository,
        bucket=bucket,
    )


def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
    config = parse_config(config_file, default_value="")

    return get_dataset_store_by_urls(
        dataset_url=config["main"]["dataset_url"],
        file_url=config["main"]["file_url"],
        bucket=bucket or config["main"].get("default_bucket"),
    )


def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
    return get_dataset_store_by_urls(dataset_url=url, file_url=url, bucket=bucket)


def get_source_cls(key: str) -> Type[Source]:
    if key.startswith("ingestify."):
        _, type_ = key.split(".")
        if type_ == "wyscout":
            from ingestify.infra.source.wyscout import Wyscout

            return Wyscout

        elif type_ == "statsbomb_github":
            from ingestify.infra.source.statsbomb_github import StatsbombGithub

            return StatsbombGithub
        else:
            raise Exception(f"Unknown source type 'ingestify.{type_}'")
    else:
        return import_cls(key)


def build_source(name, source_args):
    source_cls = get_source_cls(source_args["type"])
    raw_configuration = source_args.get("configuration", {})
    configuration = {}
    if isinstance(raw_configuration, list):
        # This normally means the data needs to be loaded from somewhere else
        for item in raw_configuration:
            if isinstance(item, dict):
                configuration.update(item)
            elif secrets_manager.supports(item):
                item = secrets_manager.load_as_dict(item)
                configuration.update(item)
            else:
                raise ConfigurationError(
                    f"Don't know how to use source configuration '{item}'"
                )
    elif isinstance(raw_configuration, str):
        configuration = secrets_manager.load_as_dict(raw_configuration)
    else:
        configuration = raw_configuration

    return source_cls(name=name, **configuration)


def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
    return import_cls(key)


def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
    config = parse_config(config_file, default_value="")

    logger.info("Initializing sources")
    sources = {}
    sys.path.append(os.path.dirname(config_file))
    for name, source_args in config["sources"].items():
        sources[name] = build_source(name=name, source_args=source_args)

    logger.info("Initializing IngestionEngine")
    store = get_dataset_store_by_urls(
        dataset_url=config["main"]["dataset_url"],
        file_url=config["main"]["file_url"],
        bucket=bucket or config["main"].get("default_bucket"),
    )

    # Setup an EventBus and wire some more components
    event_bus = EventBus()
    publisher = Publisher()
    for subscriber in config.get("event_subscribers", []):
        cls = get_event_subscriber_cls(subscriber["type"])
        publisher.add_subscriber(cls(store))
    event_bus.register(publisher)
    store.set_event_bus(event_bus)

    ingestion_engine = IngestionEngine(
        store=store,
    )

    logger.info("Determining tasks...")

    fetch_policy = FetchPolicy()

    for job in config["extract_jobs"]:
        data_spec_versions = DataSpecVersionCollection.from_dict(
            job.get("data_spec_versions", {"default": {"v1"}})
        )

        if "selectors" in job:
            selectors = [
                Selector.build(selector, data_spec_versions=data_spec_versions)
                for selector_args in job["selectors"]
                for selector in _product_selectors(selector_args)
            ]
        else:
            # Add a single empty selector. This won't match anything
            # but makes it easier later one where we loop over selectors.
            selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]

        import_job = ExtractJob(
            source=sources[job["source"]],
            dataset_type=job["dataset_type"],
            selectors=selectors,
            fetch_policy=fetch_policy,
            data_spec_versions=data_spec_versions,
        )
        ingestion_engine.add_extract_job(import_job)

    return ingestion_engine
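The selector fan-out in _product_selectors is easiest to see with a concrete input; the values below are illustrative.

from ingestify.main import _product_selectors

# Every list-valued key is expanded with itertools.product, so one config entry
# can fan out into several concrete selectors.
print(list(_product_selectors({"competition_id": 11, "season_id": [42, 90]})))
# -> [{'competition_id': 11, 'season_id': 42}, {'competition_id': 11, 'season_id': 90}]

# An empty/missing selector yields a single empty dict; a string ("*" or a Python
# expression) instead yields a predicate to evaluate against discovered selectors.
print(list(_product_selectors(None)))  # -> [{}]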
ingestify/server.py
ADDED
@@ -0,0 +1,78 @@
import flask
from flask import Response, request
from flask_restful import Resource, Api


# Dataset
from ingestify.infra.serialization import serialize
from ingestify.main import get_datastore


def create_server(config_file: str):
    app = flask.Flask(__name__)
    api = Api(app, prefix="/api")

    # @api.representation('application/json')
    # def output_json(data, code, headers=None):
    #     resp = make_response(json.dumps(data, cls=DecimalEncoder), code)
    #     resp.headers.extend(headers or {})
    #     return resp

    datastore_cache = {}

    def get_datastore_by_bucket(bucket: str):
        try:
            return datastore_cache[bucket]
        except KeyError:
            datastore_cache[bucket] = get_datastore(config_file, bucket=bucket)
            return datastore_cache[bucket]

    class DatasetResource(Resource):
        def patch(self, bucket: str, dataset_id: str):
            # TODO: Filter out dataset from body
            return "OK"

        def delete(self, bucket: str, dataset_id: str):
            pass

    class DatasetListResource(Resource):
        def get(self, bucket: str):
            return serialize(get_datastore_by_bucket(bucket).get_dataset_collection())

    class FileResource(Resource):
        def get(self, bucket, dataset_id: str, version: int, filename: str):
            return Response(
                get_datastore_by_bucket(bucket)
                .load_content(dataset_id, version, filename)
                .read()
            )

        def put(self, bucket, dataset_id: str, version: int, filename: str):
            return Response(
                get_datastore_by_bucket(bucket).save_content(
                    dataset_id, version, filename, request.stream
                )
            )

    api.add_resource(
        DatasetListResource, "/buckets/<string:bucket>/datasets", methods=["GET"]
    )
    api.add_resource(
        DatasetResource,
        "/buckets/<string:bucket>/datasets/<string:dataset_id>",
        methods=["PATCH", "DELETE"],
    )
    api.add_resource(
        FileResource,
        "/buckets/<string:bucket>/"
        "datasets/<string:dataset_id>/files/"
        "<string:version>/<string:filename>",
        methods=["GET", "PUT"],
    )

    return app


if __name__ == "__main__":
    app = create_server(config_file="../examples/statsbomb/config_local.yaml")
    app.run(host="0.0.0.0", port=8080)
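A minimal sketch of exercising the routes registered above through Flask's test client; the config path and bucket name are illustrative values.

# Sketch only: "config.yaml" and the "main" bucket are illustrative.
from ingestify.server import create_server

app = create_server(config_file="config.yaml")
client = app.test_client()

# DatasetListResource: serialized DatasetCollection for one bucket
resp = client.get("/api/buckets/main/datasets")
print(resp.status_code, resp.get_json())

# FileResource: GET /api/buckets/<bucket>/datasets/<dataset_id>/files/<version>/<filename>
# returns the raw stored file content.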
ingestify/source_base.py
ADDED
@@ -0,0 +1,23 @@
from ingestify.application.dataset_store import DatasetStore
from ingestify.domain.models import (
    Dataset,
    DatasetResource,
    DraftFile,
    File,
    Identifier,
    Selector,
    Source,
    Revision,
)

__all__ = [
    "Selector",
    "Identifier",
    "Source",
    "DatasetStore",
    "Dataset",
    "DatasetResource",
    "Revision",
    "File",
    "DraftFile",
]
ingestify/static/templates/statsbomb_github/README.md
File without changes
ingestify/static/templates/statsbomb_github/config.yaml.jinja2
ADDED
@@ -0,0 +1,19 @@
ingestify_version: {{ ingestify_version }}

main:
  dataset_url: sqlite:///database/catalog.db
  file_url: file://database/files/
  default_bucket: main

sources:
  statsbomb:
    type: ingestify.statsbomb_github

extract_jobs:
  - source: statsbomb
    selectors:
      - competition_id: 11
        season_id: [42, 90]

    # passing an empty selector means: fetch everything
    # -
ingestify/static/templates/statsbomb_github/database/README.md
ADDED
@@ -0,0 +1 @@
# This will contain the database
ingestify/static/templates/statsbomb_github/query.py
ADDED
@@ -0,0 +1,14 @@
from ingestify.main import get_datastore


def main():
    store = get_datastore("config.yaml")
    dataset_collection = store.get_dataset_collection()

    for dataset in dataset_collection:
        kloppy_dataset = store.load_with_kloppy(dataset)
        print(f"Loaded dataset with {len(kloppy_dataset.records)} events")


if __name__ == "__main__":
    main()
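The template above loads every dataset in the store. A hedged variant that narrows the collection, assuming DatasetStore.get_dataset_collection forwards the same provider/dataset_type/selector filters as the repository method shown earlier (the DatasetStore implementation itself is not part of this excerpt):

from ingestify.main import get_datastore

store = get_datastore("config.yaml")

# Assumption: the store passes these filters through to the dataset repository.
dataset_collection = store.get_dataset_collection(provider="statsbomb")

for dataset in dataset_collection:
    kloppy_dataset = store.load_with_kloppy(dataset)
    print(f"Loaded dataset with {len(kloppy_dataset.records)} events")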
File without changes