ingestify 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/ingestion_engine.py +3 -0
- ingestify/application/loader.py +12 -2
- ingestify/domain/models/dataset/file.py +6 -0
- ingestify/main.py +155 -0
- {ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/METADATA +49 -3
- {ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/RECORD +10 -10
- {ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
- {ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED

ingestify/application/ingestion_engine.py
CHANGED

@@ -110,6 +110,9 @@ class IngestionEngine:
         else:
             do_load()
 
+    # Alias for load() - more intuitive name for running ingestion
+    run = load
+
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
         datasets = sorted(
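The new `run` attribute added above is a plain class-level alias: it binds a second name to the same function object, so `engine.run(...)` and `engine.load(...)` behave identically. A minimal standalone sketch of the pattern (the `Pipeline` class here is illustrative, not ingestify's actual `IngestionEngine`):

```python
class Pipeline:
    def load(self, dry_run: bool = False) -> str:
        # All the real work lives in load(); run is just a second name for it.
        return f"loaded (dry_run={dry_run})"

    # Class-level alias, as in the diff above: both names share one function object.
    run = load


pipeline = Pipeline()
assert pipeline.run() == pipeline.load()
assert Pipeline.run is Pipeline.load
```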
ingestify/application/loader.py
CHANGED

@@ -307,7 +307,17 @@ class Loader:
             auto_ingest_config=auto_ingest_config,
             **selector_filters,
         )
-        if selector_filters and not selectors:
-
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
         else:
             self.run(selectors, dry_run=dry_run)
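The widened condition above now also warns when `provider`, `source`, or `dataset_type` was given but nothing matched, and it reports only the filters that were actually set by dropping falsy values in a dict comprehension. A standalone sketch of that filtering step (the example values and logger name are illustrative):

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("loader_example")

# Example filter values; None entries were simply not provided by the caller.
provider, source, dataset_type = "statsbomb", None, "match"
selector_filters = {"competition_id": 43, "season_id": None}

# Keep only the filters that are truthy, mirroring the new warning path in Loader.
filters_applied = {
    k: v
    for k, v in {
        "provider": provider,
        "source": source,
        "dataset_type": dataset_type,
        **selector_filters,
    }.items()
    if v
}

logger.warning(f"No data found matching filters: {filters_applied}")
# -> No data found matching filters: {'provider': 'statsbomb', 'dataset_type': 'match', 'competition_id': 43}
```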
ingestify/domain/models/dataset/file.py
CHANGED

@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")
 
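The new `hasattr(file_, "read")` branch lets any file-like object, including text handles, be turned into a `BytesIO` stream, encoding `str` reads as UTF-8. A self-contained sketch of the same coercion logic outside the `DraftFile` class (the helper name is illustrative):

```python
from io import BytesIO, StringIO


def to_bytes_stream(file_) -> BytesIO:
    """Coerce a BytesIO or any object with read() into a BytesIO stream."""
    if isinstance(file_, BytesIO):
        return file_
    elif hasattr(file_, "read"):
        data = file_.read()
        # read() returns bytes for binary handles and str for text handles
        return BytesIO(data if isinstance(data, bytes) else data.encode("utf-8"))
    raise Exception(f"Not possible to create a stream from {type(file_)}")


# Text file-like objects (e.g. StringIO or a file opened in "r" mode) now work too.
assert to_bytes_stream(StringIO("hello")).read() == b"hello"
assert to_bytes_stream(BytesIO(b"hello")).read() == b"hello"
```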
ingestify/main.py
CHANGED

@@ -279,3 +279,158 @@ def get_engine(
         ingestion_engine.add_ingestion_plan(ingestion_plan_)
 
     return ingestion_engine
+
+
+def get_dev_engine(
+    source: Source,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+) -> IngestionEngine:
+    """
+    Quick development helper - creates an engine with minimal setup.
+
+    Args:
+        source: The source to test
+        dataset_type: Dataset type to ingest
+        data_spec_versions: Dict like {"hops": "v1"}
+        ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+    Returns:
+        IngestionEngine configured for development
+
+    Example:
+        >>> source = MySource(name="test", ...)
+        >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+        >>> engine.run()
+        >>>
+        >>> # Access the datasets
+        >>> datasets = engine.store.get_dataset_collection()
+        >>> print(f"Ingested {len(datasets)} datasets")
+    """
+    import tempfile
+    from pathlib import Path
+
+    if configure_logging:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        )
+
+    if dev_dir:
+        # Use provided directory
+        dev_dir = Path(dev_dir)
+    elif ephemeral:
+        # Use temp directory that will be cleaned up
+        import uuid
+
+        dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+    else:
+        # Use persistent directory
+        dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+    dev_dir.mkdir(parents=True, exist_ok=True)
+    metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+    file_url = f"file://{dev_dir}"
+
+    logger.info(f"Dev mode: storing data in {dev_dir}")
+
+    engine = get_engine(
+        metadata_url=metadata_url,
+        file_url=file_url,
+        bucket="main",
+        disable_events=True,
+    )
+
+    data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+    engine.add_ingestion_plan(
+        IngestionPlan(
+            source=source,
+            dataset_type=dataset_type,
+            selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+            fetch_policy=FetchPolicy(),
+            data_spec_versions=data_spec_versions_obj,
+        )
+    )
+
+    return engine
+
+
+def debug_source(
+    source: Source,
+    *,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+    **kwargs,
+) -> IngestionEngine:
+    """
+    Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+    This is a convenience wrapper around get_dev_engine() that does everything:
+    creates the engine, runs ingestion, and displays results.
+
+    Args:
+        source: The source to debug
+        dataset_type: Dataset type (e.g., "match")
+        data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+        ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+        **kwargs: Selector arguments. For sources with discover_selectors(), these
+            filter discovered selectors. Otherwise passed to find_datasets().
+
+    Returns:
+        IngestionEngine: The engine used for ingestion (for further inspection)
+
+    Example:
+        >>> # Simple source without discover_selectors
+        >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+        >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+        >>> # Source with discover_selectors - discovers all competitions
+        >>> source = StatsBombMatchAPI(name="test", ...)
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"}
+        ... )
+
+        >>> # Filter discovered selectors
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"},
+        ...     competition_id=46  # Filters to specific competition
+        ... )
+    """
+    logger.info(f"Debug mode for source: {source.name}")
+
+    engine = get_dev_engine(
+        source=source,
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+        ephemeral=ephemeral,
+        configure_logging=configure_logging,
+        dev_dir=dev_dir,
+    )
+
+    # Run ingestion
+    # Empty selector {} automatically triggers discover_selectors() if available
+    # kwargs filter discovered selectors or are passed to find_datasets()
+    engine.run(**kwargs)
+
+    # Show results
+    datasets = engine.store.get_dataset_collection()
+    logger.info("=" * 60)
+    logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+    logger.info("=" * 60)
+
+    return engine
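`get_dev_engine()` wires its storage from a throwaway directory: a SQLite catalog plus a `file://` repository under the same path. A standalone sketch of just that storage layout (paths and the printed output are illustrative):

```python
import tempfile
import uuid
from pathlib import Path

# Ephemeral dev directory, following the naming scheme used in get_dev_engine()
dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
dev_dir.mkdir(parents=True, exist_ok=True)

metadata_url = f"sqlite:///{dev_dir / 'database.db'}"  # dataset catalog (SQLite)
file_url = f"file://{dev_dir}"                         # raw provider files on disk

print(f"Dev mode: storing data in {dev_dir}")
print(metadata_url)
print(file_url)
```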
{ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.7.0
+Version: 0.8.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl

@@ -77,6 +77,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify  # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml

@@ -175,8 +212,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"

@@ -188,12 +233,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43,  # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
{ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=FeK7pau-iTc6ooJiPelblIhkrPLojVHKpTHXIrkdpq8,336
 ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=WjhcsT21F7dOibrg_S7wRiui6Ytj5ScsWqMCGuv9fs8,14938
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
 ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
+ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838

@@ -25,7 +25,7 @@ ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8Hwyp
 ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
 ingestify/domain/models/dataset/dataset_state.py,sha256=IaYG02WzgooGaM_AuwRhZgljs-9NhCF_LpBZXkl5ELY,324
 ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
-ingestify/domain/models/dataset/file.py,sha256=
+ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575

@@ -72,8 +72,8 @@ ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRX
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
 ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.8.0.dist-info/METADATA,sha256=rpC2ALX0e4Ii-XzhJWmRKfW7YoBgl6gEpP2cUGFlQp4,8089
+ingestify-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.8.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.8.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.8.0.dist-info/RECORD,,

{ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/WHEEL
File without changes

{ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.7.0.dist-info → ingestify-0.8.0.dist-info}/top_level.txt
File without changes