ingestify 0.7.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -7,5 +7,6 @@ except NameError:
7
7
  if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
+ from .main import debug_source
10
11
 
11
- __version__ = "0.7.0"
12
+ __version__ = "0.8.0"
@@ -110,6 +110,9 @@ class IngestionEngine:
110
110
  else:
111
111
  do_load()
112
112
 
113
+ # Alias for load() - more intuitive name for running ingestion
114
+ run = load
115
+
113
116
  def list_datasets(self, as_count: bool = False):
114
117
  """Consider moving this to DataStore"""
115
118
  datasets = sorted(
@@ -307,7 +307,17 @@ class Loader:
307
307
  auto_ingest_config=auto_ingest_config,
308
308
  **selector_filters,
309
309
  )
310
- if selector_filters and not selectors:
311
- logger.warning(f"No data found matching {selector_filters}")
310
+ if (provider or source or dataset_type or selector_filters) and not selectors:
311
+ filters_applied = {
312
+ k: v
313
+ for k, v in {
314
+ "provider": provider,
315
+ "source": source,
316
+ "dataset_type": dataset_type,
317
+ **selector_filters,
318
+ }.items()
319
+ if v
320
+ }
321
+ logger.warning(f"No data found matching filters: {filters_applied}")
312
322
  else:
313
323
  self.run(selectors, dry_run=dry_run)
@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
39
39
  stream = BytesIO(file_.read().encode("utf-8"))
40
40
  elif isinstance(file_, BytesIO):
41
41
  stream = file_
42
+ elif hasattr(file_, "read"):
43
+ data = file_.read()
44
+ if isinstance(data, bytes):
45
+ stream = BytesIO(data)
46
+ else:
47
+ stream = BytesIO(data.encode("utf-8"))
42
48
  else:
43
49
  raise Exception(f"Not possible to create DraftFile from {type(file_)}")
44
50
 
ingestify/main.py CHANGED
@@ -279,3 +279,158 @@ def get_engine(
279
279
  ingestion_engine.add_ingestion_plan(ingestion_plan_)
280
280
 
281
281
  return ingestion_engine
282
+
283
+
284
+ def get_dev_engine(
285
+ source: Source,
286
+ dataset_type: str,
287
+ data_spec_versions: dict,
288
+ ephemeral: bool = True,
289
+ configure_logging: bool = True,
290
+ dev_dir: Optional[str] = None,
291
+ ) -> IngestionEngine:
292
+ """
293
+ Quick development helper - creates an engine with minimal setup.
294
+
295
+ Args:
296
+ source: The source to test
297
+ dataset_type: Dataset type to ingest
298
+ data_spec_versions: Dict like {"hops": "v1"}
299
+ ephemeral: If True, uses a temp dir that gets cleaned up. If False, uses persistent /tmp storage.
300
+ configure_logging: If True, configures basic logging (default: True)
301
+ dev_dir: Optional custom directory for data storage (overrides ephemeral)
302
+
303
+ Returns:
304
+ IngestionEngine configured for development
305
+
306
+ Example:
307
+ >>> source = MySource(name="test", ...)
308
+ >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
309
+ >>> engine.run()
310
+ >>>
311
+ >>> # Access the datasets
312
+ >>> datasets = engine.store.get_dataset_collection()
313
+ >>> print(f"Ingested {len(datasets)} datasets")
314
+ """
315
+ import tempfile
316
+ from pathlib import Path
317
+
318
+ if configure_logging:
319
+ logging.basicConfig(
320
+ level=logging.INFO,
321
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
322
+ )
323
+
324
+ if dev_dir:
325
+ # Use provided directory
326
+ dev_dir = Path(dev_dir)
327
+ elif ephemeral:
328
+ # Use temp directory that will be cleaned up
329
+ import uuid
330
+
331
+ dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
332
+ else:
333
+ # Use persistent directory
334
+ dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
335
+
336
+ dev_dir.mkdir(parents=True, exist_ok=True)
337
+ metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
338
+ file_url = f"file://{dev_dir}"
339
+
340
+ logger.info(f"Dev mode: storing data in {dev_dir}")
341
+
342
+ engine = get_engine(
343
+ metadata_url=metadata_url,
344
+ file_url=file_url,
345
+ bucket="main",
346
+ disable_events=True,
347
+ )
348
+
349
+ data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
350
+
351
+ engine.add_ingestion_plan(
352
+ IngestionPlan(
353
+ source=source,
354
+ dataset_type=dataset_type,
355
+ selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
356
+ fetch_policy=FetchPolicy(),
357
+ data_spec_versions=data_spec_versions_obj,
358
+ )
359
+ )
360
+
361
+ return engine
362
+
363
+
364
+ def debug_source(
365
+ source: Source,
366
+ *,
367
+ dataset_type: str,
368
+ data_spec_versions: dict,
369
+ ephemeral: bool = True,
370
+ configure_logging: bool = True,
371
+ dev_dir: Optional[str] = None,
372
+ **kwargs,
373
+ ) -> IngestionEngine:
374
+ """
375
+ Debug helper - creates a dev engine, runs ingestion, and shows results.
376
+
377
+ This is a convenience wrapper around get_dev_engine() that does everything:
378
+ creates the engine, runs ingestion, and displays results.
379
+
380
+ Args:
381
+ source: The source to debug
382
+ dataset_type: Dataset type (e.g., "match")
383
+ data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
384
+ ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
385
+ configure_logging: If True, configures basic logging (default: True)
386
+ dev_dir: Optional custom directory for data storage (overrides ephemeral)
387
+ **kwargs: Selector arguments. For sources with discover_selectors(), these
388
+ filter discovered selectors. Otherwise passed to find_datasets().
389
+
390
+ Returns:
391
+ IngestionEngine: The engine used for ingestion (for further inspection)
392
+
393
+ Example:
394
+ >>> # Simple source without discover_selectors
395
+ >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
396
+ >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
397
+
398
+ >>> # Source with discover_selectors - discovers all competitions
399
+ >>> source = StatsBombMatchAPI(name="test", ...)
400
+ >>> engine = debug_source(
401
+ ... source,
402
+ ... dataset_type="match",
403
+ ... data_spec_versions={"match": "v6"}
404
+ ... )
405
+
406
+ >>> # Filter discovered selectors
407
+ >>> engine = debug_source(
408
+ ... source,
409
+ ... dataset_type="match",
410
+ ... data_spec_versions={"match": "v6"},
411
+ ... competition_id=46 # Filters to specific competition
412
+ ... )
413
+ """
414
+ logger.info(f"Debug mode for source: {source.name}")
415
+
416
+ engine = get_dev_engine(
417
+ source=source,
418
+ dataset_type=dataset_type,
419
+ data_spec_versions=data_spec_versions,
420
+ ephemeral=ephemeral,
421
+ configure_logging=configure_logging,
422
+ dev_dir=dev_dir,
423
+ )
424
+
425
+ # Run ingestion
426
+ # Empty selector {} automatically triggers discover_selectors() if available
427
+ # kwargs filter discovered selectors or are passed to find_datasets()
428
+ engine.run(**kwargs)
429
+
430
+ # Show results
431
+ datasets = engine.store.get_dataset_collection()
432
+ logger.info("=" * 60)
433
+ logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
434
+ logger.info("=" * 60)
435
+
436
+ return engine
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -77,6 +77,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
77
77
  pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
78
78
  ```
79
79
 
80
+ ### Developing a new Source
81
+
82
+ When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
83
+
84
+ ```python
85
+ from ingestify import Source, debug_source
86
+
87
+ class MyCustomSource(Source):
88
+ provider = "my_provider"
89
+
90
+ def __init__(self, name: str, api_key: str):
91
+ super().__init__(name)
92
+ self.api_key = api_key
93
+
94
+ def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
95
+ # Your source implementation
96
+ ...
97
+
98
+ # Quick debug - runs full ingestion with temp storage
99
+ if __name__ == "__main__":
100
+ source = MyCustomSource(name="test", api_key="...")
101
+
102
+ debug_source(
103
+ source,
104
+ dataset_type="match",
105
+ data_spec_versions={"events": "v1"},
106
+ )
107
+ ```
108
+
109
+ The `debug_source()` helper:
110
+ - ✅ Creates an ephemeral dev engine with temp storage
111
+ - ✅ Configures logging automatically
112
+ - ✅ Runs the full ingestion cycle
113
+ - ✅ Shows storage location and results
114
+
115
+ Perfect for testing your source before adding it to production config!
116
+
80
117
  ### Minimal `config.yaml`
81
118
 
82
119
  ```yaml
@@ -175,8 +212,16 @@ pip install kloppy
175
212
  ```
176
213
 
177
214
  ```python
215
+ import logging, sys
216
+
178
217
  from ingestify.main import get_engine
179
218
 
219
+ logging.basicConfig(
220
+ level=logging.INFO,
221
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
222
+ stream=sys.stderr,
223
+ )
224
+
180
225
  engine = get_engine(
181
226
  metadata_url="sqlite:///database_open_data/catalog.db",
182
227
  file_url="file://database_open_data/files/"
@@ -188,12 +233,13 @@ dataset_iter = engine.iter_datasets(
188
233
 
189
234
  provider="statsbomb",
190
235
  dataset_type="match",
191
- competition_id=43,
192
- season_id=281
236
+ competition_id=43, # "FIFA World Cup"
237
+ #season_id=281
193
238
  )
194
239
 
195
240
  for dataset in dataset_iter:
196
241
  kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
242
+ logging.info(f"Loaded {kloppy_dataset}")
197
243
  ```
198
244
 
199
245
 
@@ -1,14 +1,14 @@
1
- ingestify/__init__.py,sha256=IuO5KQRTWAjWxmJ6Knte5-Q2Ybq1BDDkM5UlZjYRl84,301
1
+ ingestify/__init__.py,sha256=FeK7pau-iTc6ooJiPelblIhkrPLojVHKpTHXIrkdpq8,336
2
2
  ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
3
3
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
4
- ingestify/main.py,sha256=YegWoI_xIgoz30BSS7N6Ew3SAXSr1-jPFMPqvFda3DI,9797
4
+ ingestify/main.py,sha256=WjhcsT21F7dOibrg_S7wRiui6Ytj5ScsWqMCGuv9fs8,14938
5
5
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
6
6
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
7
7
  ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
8
8
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
10
- ingestify/application/ingestion_engine.py,sha256=cG4JgU667PcsHBngOWUk58KffDrHkJOMv8LrVjaRQ1o,11163
11
- ingestify/application/loader.py,sha256=xImeaOdj97iCpprZ0WxRbQJ4w6nS1cEmPba7TN1lE6I,13038
10
+ ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
11
+ ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
12
12
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
13
13
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
14
14
  ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838
@@ -25,7 +25,7 @@ ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8Hwyp
25
25
  ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
26
26
  ingestify/domain/models/dataset/dataset_state.py,sha256=IaYG02WzgooGaM_AuwRhZgljs-9NhCF_LpBZXkl5ELY,324
27
27
  ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
28
- ingestify/domain/models/dataset/file.py,sha256=2wpBfluS8i_mCPQkdSu1x1Af3kc15bVBLAKXeogB4jA,4243
28
+ ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
29
29
  ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
30
30
  ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
31
31
  ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
@@ -72,8 +72,8 @@ ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRX
72
72
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
73
73
  ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
74
74
  ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
75
- ingestify-0.7.0.dist-info/METADATA,sha256=cNjbTVDpw0MTkPj-xoywVXzdrynoRAO5z7qyu1LftLg,6871
76
- ingestify-0.7.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
77
- ingestify-0.7.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
78
- ingestify-0.7.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
79
- ingestify-0.7.0.dist-info/RECORD,,
75
+ ingestify-0.8.0.dist-info/METADATA,sha256=rpC2ALX0e4Ii-XzhJWmRKfW7YoBgl6gEpP2cUGFlQp4,8089
76
+ ingestify-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
77
+ ingestify-0.8.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
78
+ ingestify-0.8.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
79
+ ingestify-0.8.0.dist-info/RECORD,,