ingestify-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/infra/store/dataset/sqlalchemy/repository.py ADDED
@@ -0,0 +1,239 @@
+ import json
+ import uuid
+ from typing import Optional, Union, List
+
+ from sqlalchemy import create_engine, func, text, tuple_
+ from sqlalchemy.engine import make_url
+ from sqlalchemy.exc import NoSuchModuleError
+ from sqlalchemy.orm import Session, joinedload
+
+ from ingestify.domain import File
+ from ingestify.domain.models import (
+     Dataset,
+     DatasetCollection,
+     DatasetRepository,
+     Identifier,
+     Selector,
+ )
+ from ingestify.domain.models.dataset.collection_metadata import (
+     DatasetCollectionMetadata,
+ )
+
+ from .mapping import dataset_table, metadata
+
+
+ def parse_value(v):
+     try:
+         return int(v)
+     except ValueError:
+         return v
+
+
+ def json_serializer(o):
+     return json.dumps(o)
+
+
+ def json_deserializer(o):
+     o = json.loads(o)
+     # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
+     o = Identifier(**o)
+     return o
+
+
+ # @compiles(DateTime, "mysql")
+ # def compile_datetime_mysql(type_, compiler, **kw):
+ #     return "DATETIME(6)"
+
+
+ def isfloat(x):
+     try:
+         a = float(x)
+     except (TypeError, ValueError):
+         return False
+     else:
+         return True
+
+
+ def isint(x):
+     try:
+         a = float(x)
+         b = int(a)
+     except (TypeError, ValueError):
+         return False
+     else:
+         return a == b
+
+
+ class SqlAlchemyDatasetRepository(DatasetRepository):
+     @staticmethod
+     def fix_url(url: str) -> str:
+         if url.startswith("postgres://"):
+             url = url.replace("postgres://", "postgresql://")
+         return url
+
+     @classmethod
+     def supports(cls, url: str) -> bool:
+         url = cls.fix_url(url)
+
+         _url = make_url(url)
+         try:
+             _url.get_dialect()
+         except NoSuchModuleError:
+             return False
+         return True
+
+     def _init_engine(self):
+         self.engine = create_engine(
+             self.url,
+             # Use the default isolation level, don't need SERIALIZABLE
+             # isolation_level="SERIALIZABLE",
+             json_serializer=json_serializer,
+             json_deserializer=json_deserializer,
+         )
+         self.session = Session(bind=self.engine)
+
+     def __init__(self, url: str):
+         url = self.fix_url(url)
+
+         self.url = url
+         self._init_engine()
+
+         metadata.create_all(self.engine)
+
+     def __getstate__(self):
+         return {"url": self.url}
+
+     def __setstate__(self, state):
+         self.url = state["url"]
+         self._init_engine()
+
+     def __del__(self):
+         self.session.close()
+         self.engine.dispose()
+
+     def _filter_query(
+         self,
+         query,
+         bucket: str,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         dataset_id: Optional[Union[str, List[str]]] = None,
+         selector: Optional[Union[Selector, List[Selector]]] = None,
+     ):
+         query = query.filter(Dataset.bucket == bucket)
+         if dataset_type:
+             query = query.filter(Dataset.dataset_type == dataset_type)
+         if provider:
+             query = query.filter(Dataset.provider == provider)
+         if dataset_id is not None:
+             if isinstance(dataset_id, list):
+                 if len(dataset_id) == 0:
+                     # When an empty list is explicitly passed, make sure we
+                     # return an empty DatasetCollection
+                     return DatasetCollection()
+
+                 query = query.filter(Dataset.dataset_id.in_(dataset_id))
+             else:
+                 query = query.filter(Dataset.dataset_id == dataset_id)
+
+         dialect = self.session.bind.dialect.name
+
+         if selector is not None and not isinstance(selector, list):
+             where, selector = selector.split("where")
+         else:
+             where = None
+
+         if selector:
+             if isinstance(selector, list):
+                 selectors = selector
+             else:
+                 selectors = [selector]
+
+             if not selectors:
+                 raise ValueError("Selectors must contain at least one item")
+
+             keys = list(selectors[0].filtered_attributes.keys())
+
+             columns = []
+             first_selector = selectors[0].filtered_attributes
+
+             # Create a query like this:
+             # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
+             for k in keys:
+                 if dialect == "postgresql":
+                     column = dataset_table.c.identifier[k]
+
+                     # Take the value from the first selector to determine the type.
+                     # TODO: check all selectors to determine the type
+                     v = first_selector[k]
+                     if isint(v):
+                         column = column.as_integer()
+                     elif isfloat(v):
+                         column = column.as_float()
+                     else:
+                         column = column.as_string()
+                 else:
+                     column = func.json_extract(Dataset.identifier, f"$.{k}")
+                 columns.append(column)
+
+             values = []
+             for selector in selectors:
+                 filtered_attributes = selector.filtered_attributes
+                 values.append(tuple([filtered_attributes[k] for k in keys]))
+
+             query = query.filter(tuple_(*columns).in_(values))
+
+         if where:
+             query = query.filter(text(where))
+         return query
+
+     def get_dataset_collection(
+         self,
+         bucket: str,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         dataset_id: Optional[Union[str, List[str]]] = None,
+         selector: Optional[Union[Selector, List[Selector]]] = None,
+         metadata_only: bool = False,
+     ) -> DatasetCollection:
+         def apply_query_filter(query):
+             return self._filter_query(
+                 query,
+                 bucket=bucket,
+                 dataset_type=dataset_type,
+                 provider=provider,
+                 dataset_id=dataset_id,
+                 selector=selector,
+             )
+
+         if not metadata_only:
+             dataset_query = apply_query_filter(
+                 self.session.query(Dataset).options(joinedload(Dataset.revisions))
+             )
+             datasets = list(dataset_query)
+         else:
+             datasets = []
+
+         metadata_result_row = apply_query_filter(
+             self.session.query(
+                 func.min(File.modified_at).label("first_modified_at"),
+                 func.max(File.modified_at).label("last_modified_at"),
+                 func.count().label("row_count"),
+             ).join(Dataset, Dataset.dataset_id == File.dataset_id)
+         ).first()
+         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
+
+         return DatasetCollection(dataset_collection_metadata, datasets)
+
+     def save(self, bucket: str, dataset: Dataset):
+         # Just make sure
+         dataset.bucket = bucket
+         self.session.add(dataset)
+         self.session.commit()
+
+     def destroy(self, dataset: Dataset):
+         self.session.delete(dataset)
+         self.session.commit()
+
+     def next_identity(self):
+         return str(uuid.uuid4())
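The `_filter_query` method above matches JSON identifier attributes with a composite row-value filter, `tuple_(...).in_(...)`. A minimal, self-contained sketch of that pattern (the table here is a hypothetical stand-in for the real mapping, and it assumes a SQLite build with JSON functions and row-value support):

```python
from sqlalchemy import (
    JSON, Column, MetaData, String, Table, create_engine, select, tuple_,
)

metadata = MetaData()
# Hypothetical stand-in for the mapped dataset table: identifier is JSON.
dataset = Table(
    "dataset",
    metadata,
    Column("dataset_id", String, primary_key=True),
    Column("identifier", JSON),
)

engine = create_engine("sqlite://")
metadata.create_all(engine)
with engine.begin() as conn:
    conn.execute(
        dataset.insert(),
        [
            {"dataset_id": "a", "identifier": {"competition_id": 11, "season_id": 42}},
            {"dataset_id": "b", "identifier": {"competition_id": 11, "season_id": 43}},
        ],
    )
    # (identifier.competition_id, identifier.season_id) IN ((11, 42), (11, 90))
    stmt = select(dataset.c.dataset_id).where(
        tuple_(
            dataset.c.identifier["competition_id"].as_integer(),
            dataset.c.identifier["season_id"].as_integer(),
        ).in_([(11, 42), (11, 90)])
    )
    print(conn.execute(stmt).fetchall())  # [('a',)]
```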
ingestify/infra/store/file/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .local_file_repository import LocalFileRepository
+ from .s3_file_repository import S3FileRepository
ingestify/infra/store/file/local_file_repository.py ADDED
@@ -0,0 +1,32 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import IO, AnyStr, BinaryIO
+
+ from ingestify.domain.models import Dataset, FileRepository
+
+
+ class LocalFileRepository(FileRepository):
+     @classmethod
+     def supports(cls, url: str) -> bool:
+         return url.startswith("file://")
+
+     def save_content(
+         self,
+         bucket: str,
+         dataset: Dataset,
+         revision_id: int,
+         filename: str,
+         stream: BinaryIO,
+     ) -> Path:
+         path = self.get_path(bucket, dataset, revision_id, filename)
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(path, "wb") as fp:
+             shutil.copyfileobj(stream, fp)
+         return path
+
+     def load_content(
+         self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+     ) -> BinaryIO:
+         return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
ingestify/infra/store/file/s3_file_repository.py ADDED
@@ -0,0 +1,50 @@
+ from pathlib import Path
+ from typing import BinaryIO
+
+ import boto3 as boto3
+
+ from ingestify.domain import Dataset
+ from ingestify.domain.models import FileRepository
+
+
+ class S3FileRepository(FileRepository):
+     def __init__(self, url):
+         super().__init__(url)
+
+         self._s3 = None
+
+     @property
+     def s3(self):
+         if not self._s3:
+             self._s3 = boto3.resource("s3")
+         return self._s3
+
+     def __getstate__(self):
+         return {"base_dir": self.base_dir, "_s3": None}
+
+     def save_content(
+         self,
+         bucket: str,
+         dataset: Dataset,
+         revision_id: int,
+         filename: str,
+         stream: BinaryIO,
+     ) -> Path:
+         key = self.get_path(bucket, dataset, revision_id, filename)
+         s3_bucket = Path(key.parts[0])
+
+         self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
+         return key
+
+     def load_content(
+         self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+     ) -> BinaryIO:
+         key = self.get_path(bucket, dataset, revision_id, filename)
+         s3_bucket = Path(key.parts[0])
+         return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
+             "Body"
+         ]
+
+     @classmethod
+     def supports(cls, url: str) -> bool:
+         return url.startswith("s3://")
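`LocalFileRepository` and `S3FileRepository` each declare the URL scheme they handle through `supports()`; `ingestify/main.py` selects one via `file_repository_factory.build_if_supports(url=...)`. A rough sketch of that dispatch, assuming the `FileRepository` base constructor takes the URL (as `S3FileRepository.__init__` suggests):

```python
from ingestify.infra.store.file import LocalFileRepository, S3FileRepository


def build_file_repository(url: str):
    # Hypothetical stand-in for file_repository_factory.build_if_supports():
    # pick the first repository class whose supports() accepts the URL.
    for cls in (LocalFileRepository, S3FileRepository):
        if cls.supports(url):
            return cls(url)
    raise ValueError(f"No FileRepository supports url '{url}'")


repo = build_file_repository("file:///tmp/ingestify-files")  # LocalFileRepository
```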
ingestify/main.py ADDED
@@ -0,0 +1,205 @@
+ import importlib
+ import logging
+ import os
+ import sys
+ from itertools import product
+ from typing import Optional, Type
+
+ from pyaml_env import parse_config
+
+ from ingestify import Source
+ from ingestify.application.dataset_store import DatasetStore
+ from ingestify.application.ingestion_engine import IngestionEngine
+ from ingestify.application.secrets_manager import SecretsManager
+ from ingestify.domain import Selector
+ from ingestify.domain.models import (
+     dataset_repository_factory,
+     file_repository_factory,
+ )
+ from ingestify.domain.models.data_spec_version_collection import (
+     DataSpecVersionCollection,
+ )
+ from ingestify.domain.models.event import EventBus, Publisher, Subscriber
+
+ from ingestify.domain.models.extract_job import ExtractJob
+ from ingestify.domain.models.fetch_policy import FetchPolicy
+ from ingestify.exceptions import ConfigurationError
+
+ logger = logging.getLogger(__name__)
+
+ secrets_manager = SecretsManager()
+
+
+ def _product_selectors(selector_args):
+     if not selector_args:
+         # Empty selector passed. This is a special case when
+         # a Source doesn't have discover_selectors but also doesn't require
+         # selectors
+         yield dict()
+         return
+
+     if isinstance(selector_args, str):
+         if selector_args == "*":
+             yield lambda dict_selector: True
+         else:
+             yield lambda dict_selector: eval(selector_args, {}, dict_selector)
+         return
+
+     selector_args_ = {
+         k: v if isinstance(v, list) else [v] for k, v in selector_args.items()
+     }
+     keys, values = zip(*selector_args_.items())
+     for bundle in product(*values):
+         yield dict(zip(keys, bundle))
+
+
+ def import_cls(name):
+     components = name.split(".")
+     mod = importlib.import_module(".".join(components[:-1]))
+     return getattr(mod, components[-1])
+
+
+ def get_dataset_store_by_urls(
+     dataset_url: str, file_url: str, bucket: str
+ ) -> DatasetStore:
+     """
+     Initialize a DatasetStore from a DatasetRepository and a FileRepository
+     """
+     if not bucket:
+         raise Exception("Bucket is not specified")
+
+     file_repository = file_repository_factory.build_if_supports(url=file_url)
+
+     if secrets_manager.supports(dataset_url):
+         dataset_url = secrets_manager.load_as_db_url(dataset_url)
+
+     if dataset_url.startswith("postgres://"):
+         dataset_url = dataset_url.replace("postgres://", "postgresql://")
+
+     dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
+     return DatasetStore(
+         dataset_repository=dataset_repository,
+         file_repository=file_repository,
+         bucket=bucket,
+     )
+
+
+ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
+     config = parse_config(config_file, default_value="")
+
+     return get_dataset_store_by_urls(
+         dataset_url=config["main"]["dataset_url"],
+         file_url=config["main"]["file_url"],
+         bucket=bucket or config["main"].get("default_bucket"),
+     )
+
+
+ def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
+     return get_dataset_store_by_urls(dataset_url=url, file_url=url, bucket=bucket)
+
+
+ def get_source_cls(key: str) -> Type[Source]:
+     if key.startswith("ingestify."):
+         _, type_ = key.split(".")
+         if type_ == "wyscout":
+             from ingestify.infra.source.wyscout import Wyscout
+
+             return Wyscout
+
+         elif type_ == "statsbomb_github":
+             from ingestify.infra.source.statsbomb_github import StatsbombGithub
+
+             return StatsbombGithub
+         else:
+             raise Exception(f"Unknown source type 'ingestify.{type_}'")
+     else:
+         return import_cls(key)
+
+
+ def build_source(name, source_args):
+     source_cls = get_source_cls(source_args["type"])
+     raw_configuration = source_args.get("configuration", {})
+     configuration = {}
+     if isinstance(raw_configuration, list):
+         # This normally means the data needs to be loaded from somewhere else
+         for item in raw_configuration:
+             if isinstance(item, dict):
+                 configuration.update(item)
+             elif secrets_manager.supports(item):
+                 item = secrets_manager.load_as_dict(item)
+                 configuration.update(item)
+             else:
+                 raise ConfigurationError(
+                     f"Don't know how to use source configuration '{item}'"
+                 )
+     elif isinstance(raw_configuration, str):
+         configuration = secrets_manager.load_as_dict(raw_configuration)
+     else:
+         configuration = raw_configuration
+
+     return source_cls(name=name, **configuration)
+
+
+ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
+     return import_cls(key)
+
+
+ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
+     config = parse_config(config_file, default_value="")
+
+     logger.info("Initializing sources")
+     sources = {}
+     sys.path.append(os.path.dirname(config_file))
+     for name, source_args in config["sources"].items():
+         sources[name] = build_source(name=name, source_args=source_args)
+
+     logger.info("Initializing IngestionEngine")
+     store = get_dataset_store_by_urls(
+         dataset_url=config["main"]["dataset_url"],
+         file_url=config["main"]["file_url"],
+         bucket=bucket or config["main"].get("default_bucket"),
+     )
+
+     # Set up an EventBus and wire some more components
+     event_bus = EventBus()
+     publisher = Publisher()
+     for subscriber in config.get("event_subscribers", []):
+         cls = get_event_subscriber_cls(subscriber["type"])
+         publisher.add_subscriber(cls(store))
+     event_bus.register(publisher)
+     store.set_event_bus(event_bus)
+
+     ingestion_engine = IngestionEngine(
+         store=store,
+     )
+
+     logger.info("Determining tasks...")
+
+     fetch_policy = FetchPolicy()
+
+     for job in config["extract_jobs"]:
+         data_spec_versions = DataSpecVersionCollection.from_dict(
+             job.get("data_spec_versions", {"default": {"v1"}})
+         )
+
+         if "selectors" in job:
+             selectors = [
+                 Selector.build(selector, data_spec_versions=data_spec_versions)
+                 for selector_args in job["selectors"]
+                 for selector in _product_selectors(selector_args)
+             ]
+         else:
+             # Add a single empty selector. This won't match anything
+             # but makes it easier later on when we loop over selectors.
+             selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
+
+         import_job = ExtractJob(
+             source=sources[job["source"]],
+             dataset_type=job["dataset_type"],
+             selectors=selectors,
+             fetch_policy=fetch_policy,
+             data_spec_versions=data_spec_versions,
+         )
+         ingestion_engine.add_extract_job(import_job)
+
+     return ingestion_engine
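`_product_selectors` expands list-valued selector arguments into the cartesian product of concrete selectors, so a single config entry can fan out into many selectors. For example:

```python
from ingestify.main import _product_selectors

print(list(_product_selectors({"competition_id": 11, "season_id": [42, 90]})))
# [{'competition_id': 11, 'season_id': 42}, {'competition_id': 11, 'season_id': 90}]
```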
ingestify/server.py ADDED
@@ -0,0 +1,78 @@
+ import flask
+ from flask import Response, request
+ from flask_restful import Resource, Api
+
+
+ # Dataset
+ from ingestify.infra.serialization import serialize
+ from ingestify.main import get_datastore
+
+
+ def create_server(config_file: str):
+     app = flask.Flask(__name__)
+     api = Api(app, prefix="/api")
+
+     # @api.representation('application/json')
+     # def output_json(data, code, headers=None):
+     #     resp = make_response(json.dumps(data, cls=DecimalEncoder), code)
+     #     resp.headers.extend(headers or {})
+     #     return resp
+
+     datastore_cache = {}
+
+     def get_datastore_by_bucket(bucket: str):
+         try:
+             return datastore_cache[bucket]
+         except KeyError:
+             datastore_cache[bucket] = get_datastore(config_file, bucket=bucket)
+             return datastore_cache[bucket]
+
+     class DatasetResource(Resource):
+         def patch(self, bucket: str, dataset_id: str):
+             # TODO: Filter out dataset from body
+             return "OK"
+
+         def delete(self, bucket: str, dataset_id: str):
+             pass
+
+     class DatasetListResource(Resource):
+         def get(self, bucket: str):
+             return serialize(get_datastore_by_bucket(bucket).get_dataset_collection())
+
+     class FileResource(Resource):
+         def get(self, bucket, dataset_id: str, version: int, filename: str):
+             return Response(
+                 get_datastore_by_bucket(bucket)
+                 .load_content(dataset_id, version, filename)
+                 .read()
+             )
+
+         def put(self, bucket, dataset_id: str, version: int, filename: str):
+             return Response(
+                 get_datastore_by_bucket(bucket).save_content(
+                     dataset_id, version, filename, request.stream
+                 )
+             )
+
+     api.add_resource(
+         DatasetListResource, "/buckets/<string:bucket>/datasets", methods=["GET"]
+     )
+     api.add_resource(
+         DatasetResource,
+         "/buckets/<string:bucket>/datasets/<string:dataset_id>",
+         methods=["PATCH", "DELETE"],
+     )
+     api.add_resource(
+         FileResource,
+         "/buckets/<string:bucket>/"
+         "datasets/<string:dataset_id>/files/"
+         "<string:version>/<string:filename>",
+         methods=["GET", "PUT"],
+     )
+
+     return app
+
+
+ if __name__ == "__main__":
+     app = create_server(config_file="../examples/statsbomb/config_local.yaml")
+     app.run(host="0.0.0.0", port=8080)
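With the development server above running on port 8080, the dataset listing is one GET away. A sketch using the third-party `requests` client, assuming a bucket named `main` (the template's `default_bucket`):

```python
import requests

# GET /api/buckets/<bucket>/datasets, as registered for DatasetListResource
resp = requests.get("http://localhost:8080/api/buckets/main/datasets")
resp.raise_for_status()
print(resp.json())
```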
ingestify/source_base.py ADDED
@@ -0,0 +1,23 @@
+ from ingestify.application.dataset_store import DatasetStore
+ from ingestify.domain.models import (
+     Dataset,
+     DatasetResource,
+     DraftFile,
+     File,
+     Identifier,
+     Selector,
+     Source,
+     Revision,
+ )
+
+ __all__ = [
+     "Selector",
+     "Identifier",
+     "Source",
+     "DatasetStore",
+     "Dataset",
+     "DatasetResource",
+     "Revision",
+     "File",
+     "DraftFile",
+ ]
ingestify/static/templates/statsbomb_github/README.md ADDED
File without changes
ingestify/static/templates/statsbomb_github/config.yaml.jinja2 ADDED
@@ -0,0 +1,19 @@
+ ingestify_version: {{ ingestify_version }}
+
+ main:
+   dataset_url: sqlite:///database/catalog.db
+   file_url: file://database/files/
+   default_bucket: main
+
+ sources:
+   statsbomb:
+     type: ingestify.statsbomb_github
+
+ extract_jobs:
+   - source: statsbomb
+     selectors:
+       - competition_id: 11
+         season_id: [42, 90]
+
+       # passing an empty selector means: fetch everything
+       # -
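The template above is plain Jinja2 with a single `{{ ingestify_version }}` placeholder, presumably filled in by the CLI when it scaffolds a project. Rendering it by hand would look like:

```python
from jinja2 import Template

with open("config.yaml.jinja2") as fp:
    print(Template(fp.read()).render(ingestify_version="0.1.0"))
```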
ingestify/static/templates/statsbomb_github/database/README.md ADDED
@@ -0,0 +1 @@
+ # This will contain the database
ingestify/static/templates/statsbomb_github/query.py ADDED
@@ -0,0 +1,14 @@
+ from ingestify.main import get_datastore
+
+
+ def main():
+     store = get_datastore("config.yaml")
+     dataset_collection = store.get_dataset_collection()
+
+     for dataset in dataset_collection:
+         kloppy_dataset = store.load_with_kloppy(dataset)
+         print(f"Loaded dataset with {len(kloppy_dataset.records)} events")
+
+
+ if __name__ == "__main__":
+     main()
ingestify/static/templates/wyscout/.env ADDED
@@ -0,0 +1,5 @@
+ # Template .env file from Ingestify
+ # You should not add this file to a version control system like git
+
+ WYSCOUT_USERNAME=
+ WYSCOUT_PASSWORD=
ingestify/static/templates/wyscout/.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ database
ingestify/static/templates/wyscout/README.md ADDED
File without changes