ingestify 0.6.4 → 0.8.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/dataset_store.py +228 -11
- ingestify/application/ingestion_engine.py +232 -7
- ingestify/application/loader.py +163 -28
- ingestify/cmdline.py +0 -48
- ingestify/domain/models/__init__.py +2 -0
- ingestify/domain/models/dataset/collection.py +0 -9
- ingestify/domain/models/dataset/dataset_repository.py +4 -0
- ingestify/domain/models/dataset/dataset_state.py +5 -0
- ingestify/domain/models/dataset/events.py +13 -0
- ingestify/domain/models/dataset/file.py +7 -1
- ingestify/domain/models/dataset/selector.py +8 -1
- ingestify/domain/models/event/event_bus.py +16 -1
- ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify/infra/source/statsbomb_github.py +46 -44
- ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- ingestify/main.py +190 -10
- ingestify/utils.py +2 -32
- ingestify-0.8.0.dist-info/METADATA +257 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
- ingestify/infra/source/wyscout.py +0 -175
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify/static/templates/wyscout/.env +0 -5
- ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4.dist-info/METADATA +0 -266
- /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/application/loader.py
CHANGED
@@ -9,6 +9,9 @@ from ingestify.utils import TaskExecutor
 
 from .dataset_store import DatasetStore
 from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models.fetch_policy import FetchPolicy
+from ingestify.domain import DataSpecVersionCollection
+from ingestify.infra.source.statsbomb_github import StatsbombGithub
 from ..domain.models.ingestion.ingestion_job import IngestionJob
 from ..exceptions import ConfigurationError
 
@@ -21,6 +24,34 @@ else:
 logger = logging.getLogger(__name__)
 
 
+# Registry of open data sources that can be auto-instantiated
+OPEN_DATA_SOURCES = {
+    "statsbomb": StatsbombGithub,
+    # Add more open data sources here as they become available
+}
+
+
+def _create_open_data_plan(provider: str, dataset_type: str) -> Optional[IngestionPlan]:
+    """Create a temporary ingestion plan for open data sources."""
+    if provider not in OPEN_DATA_SOURCES:
+        return None
+
+    source_class = OPEN_DATA_SOURCES[provider]
+    source = source_class(name=f"open_data_{provider}")
+
+    # Create empty selector to trigger discover_selectors
+    data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
+    empty_selector = Selector.build({}, data_spec_versions=data_spec_versions)
+
+    return IngestionPlan(
+        source=source,
+        fetch_policy=FetchPolicy(),
+        selectors=[empty_selector],
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+    )
+
+
 class Loader:
     def __init__(self, store: DatasetStore):
         self.store = store
@@ -29,30 +60,67 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)
 
-    def collect_and_run(
+    def collect(
         self,
-        dry_run: bool = False,
         provider: Optional[str] = None,
         source: Optional[str] = None,
+        dataset_type: Optional[str] = None,
+        auto_ingest_config: Optional[dict] = None,
+        **selector_filters,
     ):
+        """Collect and prepare selectors for execution."""
         ingestion_plans = []
         for ingestion_plan in self.ingestion_plans:
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
-                    logger.info(
+                    logger.debug(
                         f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
                     )
                     continue
 
             if source is not None:
                 if ingestion_plan.source.name != source:
-                    logger.info(
+                    logger.debug(
                         f"Skipping {ingestion_plan} because source doesn't match '{source}'"
                     )
                     continue
 
+            if dataset_type is not None:
+                if ingestion_plan.dataset_type != dataset_type:
+                    logger.debug(
+                        f"Skipping {ingestion_plan} because dataset_type doesn't match '{dataset_type}'"
+                    )
+                    continue
+
+            # Note: Selector filtering is now done after all selectors are collected
+            # to allow discover_selectors to run for plans with empty selectors
+
             ingestion_plans.append(ingestion_plan)
 
+        # Check if we need to add open data plans
+        auto_ingest_config = auto_ingest_config or {}
+        if auto_ingest_config.get("use_open_data", False):
+            # Validate prerequisites for open data
+            if not provider:
+                raise ConfigurationError(
+                    "use_open_data requires 'provider' to be specified"
+                )
+            if not dataset_type:
+                raise ConfigurationError(
+                    "use_open_data requires 'dataset_type' to be specified"
+                )
+
+            # Only add open data plan if no matching configured plans found
+            if not ingestion_plans:
+                open_data_plan = _create_open_data_plan(provider, dataset_type)
+                if open_data_plan:
+                    logger.info(f"Auto-discovered open data source: {open_data_plan}")
+                    ingestion_plans.append(open_data_plan)
+                else:
+                    logger.warning(
+                        f"No open data source available for provider '{provider}'"
+                    )
+
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in ingestion_plans:
@@ -134,32 +202,45 @@ class Loader:
                 else:
                     selectors[key] = (ingestion_plan, selector)
 
- [21 removed lines (old lines 137-157) are not rendered in the source diff view]
+        # Convert to list
+        collected_selectors = list(selectors.values())
+
+        # Apply selector filters if provided
+        if selector_filters:
+            filtered_selectors = []
+            for ingestion_plan, selector in collected_selectors:
+                if selector.matches(selector_filters):
+                    # Merge selector with user filters to make it more strict
+                    merged_attributes = {
+                        **selector.filtered_attributes,
+                        **selector_filters,
+                    }
+                    strict_selector = Selector.build(
+                        merged_attributes,
+                        data_spec_versions=selector.data_spec_versions,
+                    )
+
+                    # Check if selector was actually made more strict
+                    if len(strict_selector.filtered_attributes) > len(
+                        selector.filtered_attributes
+                    ):
+                        logger.debug(
+                            f"Made selector more strict: {selector} -> {strict_selector}"
+                        )
 
+                    filtered_selectors.append((ingestion_plan, strict_selector))
+                else:
+                    logger.debug(
+                        f"Filtering out selector {selector} because it doesn't match filters"
+                    )
+            collected_selectors = filtered_selectors
+
+        return collected_selectors
+
+    def run(self, selectors, dry_run: bool = False):
+        """Execute the collected selectors."""
         ingestion_job_prefix = str(uuid.uuid1())
-        for ingestion_job_idx, (ingestion_plan, selector) in enumerate(
-            selectors.values()
-        ):
+        for ingestion_job_idx, (ingestion_plan, selector) in enumerate(selectors):
             logger.info(
                 f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
             )
@@ -186,3 +267,57 @@ class Loader:
             self.store.save_ingestion_job_summary(ingestion_job_summary)
 
         logger.info("Done")
+
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+        dataset_type: Optional[str] = None,
+        auto_ingest_config: Optional[dict] = None,
+        **selector_filters,
+    ):
+        """
+        Backward compatibility method - collect then run.
+
+        Data flow explanation:
+
+        IngestionPlans are structured hierarchically:
+        - IngestionPlan #1
+            - Selector 1.1
+            - Selector 1.2
+            - Selector 1.3
+        - IngestionPlan #2
+            - Selector 2.1
+            - Selector 2.2
+
+        But we process them as flat (plan, selector) pairs for execution:
+        - (IngestionPlan #1, Selector 1.1)
+        - (IngestionPlan #1, Selector 1.2)
+        - (IngestionPlan #1, Selector 1.3)
+        - (IngestionPlan #2, Selector 2.1)
+        - (IngestionPlan #2, Selector 2.2)
+
+        Each IngestionJobSummary tracks the execution of one (IngestionPlan, Selector) pair.
+        """
+        selectors = self.collect(
+            provider=provider,
+            source=source,
+            dataset_type=dataset_type,
+            auto_ingest_config=auto_ingest_config,
+            **selector_filters,
+        )
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
+        else:
+            self.run(selectors, dry_run=dry_run)
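Note: the collect/run split above lets callers inspect the flattened (plan, selector) pairs before anything is fetched. A minimal usage sketch, assuming a configured DatasetStore in `store`; the filter values are illustrative only:

    from ingestify.application.loader import Loader

    loader = Loader(store)  # store: a configured DatasetStore (assumed)

    # Stage 1: resolve plans/selectors without touching any source data.
    pairs = loader.collect(
        provider="statsbomb",
        dataset_type="match",
        auto_ingest_config={"use_open_data": True},  # falls back to StatsbombGithub when no plan matches
    )

    # Stage 2: execute only after inspection; dry_run is forwarded as in collect_and_run.
    loader.run(pairs, dry_run=True)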
ingestify/cmdline.py
CHANGED
@@ -5,7 +5,6 @@ from pathlib import Path
 from typing import Optional
 
 import click
-import jinja2
 from dotenv import find_dotenv, load_dotenv
 
 from ingestify.exceptions import ConfigurationError
@@ -219,53 +218,6 @@ def delete_dataset(
     logger.info("Done")
 
 
-@cli.command()
-@click.option(
-    "--template",
-    "template",
-    required=True,
-    help="Template",
-    type=click.Choice(["wyscout", "statsbomb_github"]),
-)
-@click.argument("project_name")
-def init(template: str, project_name: str):
-    logger.warning(
-        "`ingestify init` is currently not supported. See https://github.com/PySport/ingestify/issues/11"
-    )
-    return
-
-    directory = Path(project_name)
-    if directory.exists():
-        logger.warning(f"Directory '{directory}' already exists")
-        return sys.exit(1)
-
-    if template == "wyscout":
-        template_dir = Path(__file__).parent / "static/templates/wyscout"
-    elif template == "statsbomb_github":
-        template_dir = Path(__file__).parent / "static/templates/statsbomb_github"
-    else:
-        raise Exception(f"Template {template} not found")
-
-    directory.mkdir(parents=True)
-
-    for file in template_dir.glob("*"):
-        filename = file.name
-        if file.is_file():
-            data = file.open("r").read()
-
-            if filename.endswith(".jinja2"):
-                raw_input = jinja2.Template(data)
-                data = raw_input.render(ingestify_version=__version__)
-                filename = filename.rstrip(".jinja2")
-
-            with open(directory / filename, "w") as fp:
-                fp.write(data)
-        elif file.is_dir():
-            (directory / filename).mkdir()
-
-    logger.info(f"Initialized project at `{directory}` with template `{template}`")
-
-
 #
 # @cli.command("list")
 # @click.option(
ingestify/domain/models/__init__.py
CHANGED

@@ -12,6 +12,7 @@ from .dataset import (
     Selector,
     Revision,
 )
+from .dataset.dataset_state import DatasetState
 from .sink import Sink
 from .source import Source
 from .task import Task, TaskSet
@@ -37,4 +38,5 @@ __all__ = [
     "Task",
     "Sink",
     "DataSpecVersionCollection",
+    "DatasetState",
 ]
ingestify/domain/models/dataset/collection.py
CHANGED

@@ -19,9 +19,6 @@ class DatasetCollection:
         }
         self.metadata = metadata
 
-    def loaded(self):
-        return self.metadata.count == len(self.datasets)
-
     def get(self, dataset_identifier: Identifier) -> Dataset:
         return self.datasets.get(dataset_identifier.key)
 
@@ -31,12 +28,6 @@ class DatasetCollection:
     def __iter__(self):
         return iter(self.datasets.values())
 
-    def get_dataset_by_id(self, dataset_id):
-        for dataset in self:
-            if dataset.dataset_id == dataset_id:
-                return dataset
-        return None
-
     def first(self):
         try:
             return next(iter(self.datasets.values()))
ingestify/domain/models/dataset/dataset_repository.py
CHANGED

@@ -3,6 +3,7 @@ from typing import Optional, List, Union
 
 from .collection import DatasetCollection
 from .dataset import Dataset
+from .dataset_state import DatasetState
 from .selector import Selector
 
 
@@ -16,6 +17,9 @@ class DatasetRepository(ABC):
         provider: Optional[str] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
         metadata_only: bool = False,
+        dataset_state: Optional[List[DatasetState]] = None,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
     ) -> DatasetCollection:
         pass
 
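Note: the new page/page_size parameters imply LIMIT/OFFSET-style pagination in concrete repositories (the SQLAlchemy repository changes in this diff as well). A generic sketch of that translation, not the actual ingestify implementation:

    def apply_pagination(query, page=None, page_size=None):
        # SQLAlchemy-style pagination as suggested by the new parameters;
        # hypothetical helper, names are not taken from this diff.
        if page is not None and page_size is not None:
            return query.limit(page_size).offset((page - 1) * page_size)
        return query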
ingestify/domain/models/dataset/events.py
CHANGED

@@ -4,6 +4,7 @@ from pydantic import BaseModel
 
 from ingestify.domain.models.event.domain_event import DomainEvent
 from .dataset import Dataset
+from .selector import Selector
 
 
 class DatasetCreated(DomainEvent):
@@ -19,3 +20,15 @@ class RevisionAdded(DomainEvent):
 class MetadataUpdated(DomainEvent):
     dataset: Dataset
     event_type: ClassVar[str] = "metadata_updated"
+
+
+class SelectorSkipped(DomainEvent):
+    model_config = {"arbitrary_types_allowed": True}
+
+    selector: Selector
+    event_type: ClassVar[str] = "selector_skipped"
+
+
+class DatasetSkipped(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "dataset_skipped"
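Note: the two new events make skip decisions observable to subscribers. A small sketch of a duck-typed dispatcher that counts them, using the EventBus contract shown in the event_bus.py diff below:

    from ingestify.domain.models.event.event_bus import EventBus
    from ingestify.domain.models.dataset.events import SelectorSkipped, DatasetSkipped

    class SkipCounter:
        # EventBus calls dispatch(event) on every registered dispatcher.
        def __init__(self):
            self.selectors_skipped = 0
            self.datasets_skipped = 0

        def dispatch(self, event):
            if isinstance(event, SelectorSkipped):
                self.selectors_skipped += 1
            elif isinstance(event, DatasetSkipped):
                self.datasets_skipped += 1

    bus = EventBus()
    unregister = bus.register(SkipCounter())  # register now returns an unregister callback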
ingestify/domain/models/dataset/file.py
CHANGED

@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")
 
@@ -135,4 +141,4 @@ class LoadedFile(BaseModel):
         return self.stream_
 
 
-__all__ = ["File", "DraftFile", "LoadedFile"]
+__all__ = ["File", "DraftFile", "LoadedFile", "NotModifiedFile"]
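Note: the new hasattr(file_, "read") branch means DraftFile now accepts arbitrary file-like objects, not only StringIO/BytesIO. A standalone sketch of the same fallback logic (hypothetical helper, not ingestify API):

    from io import BytesIO

    def as_byte_stream(file_):
        # Mirrors the new DraftFile branch: accept anything with read(),
        # normalising str payloads to UTF-8 encoded bytes.
        if hasattr(file_, "read"):
            data = file_.read()
            return BytesIO(data) if isinstance(data, bytes) else BytesIO(data.encode("utf-8"))
        raise TypeError(f"Not possible to create a stream from {type(file_)}")

    with open("match.json", "rb") as fh:  # a plain file handle now works too
        stream = as_byte_stream(fh)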
ingestify/domain/models/dataset/selector.py
CHANGED

@@ -38,10 +38,17 @@ class Selector(AttributeBag):
         except AttributeError:
             return None
 
+    @property
+    def name(self) -> Optional[str]:
+        try:
+            return self._name
+        except AttributeError:
+            return None
+
     @property
     def custom_attributes(self):
         return {
             k: v
             for k, v in self.items()
-            if k not in ("_matcher", "_data_spec_versions", "_last_modified")
+            if k not in ("_matcher", "_data_spec_versions", "_last_modified", "_name")
         }
ingestify/domain/models/event/event_bus.py
CHANGED

@@ -7,6 +7,14 @@ from .dispatcher import Dispatcher
 logger = logging.getLogger(__name__)
 
 
+class QueueForwarder:
+    def __init__(self, queue):
+        self.queue = queue
+
+    def dispatch(self, event):
+        self.queue.put(event)
+
+
 class EventBus:
     def __init__(self):
         self.dispatchers: list[Dispatcher] = []
@@ -14,8 +22,15 @@ class EventBus:
     def register(self, dispatcher: Dispatcher):
         self.dispatchers.append(dispatcher)
 
-
+        def unregister():
+            self.dispatchers.remove(dispatcher)
+
+        return unregister
 
+    def register_queue(self, queue):
+        return self.register(QueueForwarder(queue))
+
+
     def dispatch(self, event):
         for dispatcher in self.dispatchers:
             try:
                 dispatcher.dispatch(event)
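Note: register now returns an unregister callback, and register_queue wraps a plain queue in a QueueForwarder so events can be consumed elsewhere (e.g. from another thread). A minimal sketch using only the code above:

    import queue

    from ingestify.domain.models.event.event_bus import EventBus

    bus = EventBus()
    q = queue.Queue()

    unregister = bus.register_queue(q)  # every dispatched event is put onto q

    bus.dispatch({"example": True})     # any event object
    print(q.get())                      # -> {'example': True}

    unregister()                        # stop forwarding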
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -5,6 +5,8 @@ import uuid
 from enum import Enum
 from typing import Optional, Iterator, Union
 
+from pydantic import ValidationError
+
 from ingestify import retrieve_http
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
@@ -14,12 +16,13 @@ from ingestify.domain.models.ingestion.ingestion_job_summary import (
     IngestionJobSummary,
 )
 from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models.dataset.events import SelectorSkipped, DatasetSkipped
 from ingestify.domain.models.resources.dataset_resource import (
     FileResource,
     DatasetResource,
 )
 from ingestify.domain.models.task.task_summary import TaskSummary
-from ingestify.exceptions import SaveError
+from ingestify.exceptions import SaveError, IngestifyError
 from ingestify.utils import TaskExecutor, chunker
 
 logger = logging.getLogger(__name__)
@@ -241,6 +244,9 @@ class IngestionJob:
                 f"'{self.selector.last_modified}' < metadata last_modified "
                 f"'{dataset_collection_metadata.last_modified}'"
             )
+            # Emit event for streaming datasets
+            store.dispatch(SelectorSkipped(selector=self.selector))
+
             ingestion_job_summary.set_skipped()
             yield ingestion_job_summary
             return
@@ -260,6 +266,16 @@ class IngestionJob:
 
             # We need to include the to_batches as that will start the generator
             batches = to_batches(dataset_resources)
+        except ValidationError as e:
+            # Make sure to pass this to the highest level as this means the Source is wrong
+            if "Field required" in str(e):
+                raise IngestifyError("failed to run find_datasets") from e
+            else:
+                logger.exception("Failed to find datasets")
+
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return
         except Exception as e:
             logger.exception("Failed to find datasets")
 
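Note: the "Field required" check above separates a Source emitting structurally invalid resources (fatal, re-raised as IngestifyError) from other validation failures (recorded on the job summary). A toy illustration of the pydantic v2 message it matches, with a stand-in model rather than the real DatasetResource:

    from pydantic import BaseModel, ValidationError

    class ResourceStandIn(BaseModel):  # stand-in; not the real DatasetResource
        dataset_resource_id: str

    try:
        ResourceStandIn()  # required field missing
    except ValidationError as e:
        assert "Field required" in str(e)  # the substring the hunk above matches on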
@@ -327,6 +343,8 @@ class IngestionJob:
                         )
                     )
                 else:
+                    # Emit event for streaming datasets
+                    store.dispatch(DatasetSkipped(dataset=dataset))
                     skipped_tasks += 1
             else:
                 if self.ingestion_plan.fetch_policy.should_fetch(
@@ -348,9 +366,10 @@ class IngestionJob:
                 f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
             )
             logger.info(f"Running {len(task_set)} tasks")
- [3 removed lines (old lines 351-353) are not rendered in the source diff view]
+
+            task_summaries = task_executor.run(run_task, task_set)
+
+            ingestion_job_summary.add_task_summaries(task_summaries)
         else:
             logger.info(
                 f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
ingestify/infra/source/statsbomb/base.py
ADDED

@@ -0,0 +1,36 @@
+from typing import Optional
+
+import requests
+
+from ingestify import Source
+from ingestify.exceptions import ConfigurationError
+
+
+class StatsBombBaseAPI(Source):
+    provider = "statsbomb"
+    BASE_URL = "https://data.statsbombservices.com/api"
+
+    def __init__(self, name: str, username: str, password: str):
+        super().__init__(name)
+
+        self.username = username.strip()
+        self.password = password.strip()
+
+        if not self.username:
+            raise ConfigurationError(
+                f"Username of StatsBomb source named '{self.name}' cannot be empty"
+            )
+
+        if not self.password:
+            raise ConfigurationError(
+                f"Password of StatsBomb source named '{self.name}' cannot be empty"
+            )
+
+    def get_url(self, data_feed_key: str, data_spec_version: str, path: str):
+        return f"{self.BASE_URL}/{data_spec_version}/{data_feed_key}/{path}"
+
+    def get(self, data_spec_version: str, path: str):
+        url = f"{self.BASE_URL}/{data_spec_version}/{path}"
+        res = requests.get(url, auth=(self.username, self.password))
+        res.raise_for_status()
+        return res.json()