ingestify 0.7.0.tar.gz → 0.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {ingestify-0.7.0 → ingestify-0.8.0}/PKG-INFO +49 -3
  2. {ingestify-0.7.0 → ingestify-0.8.0}/README.md +48 -2
  3. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/__init__.py +2 -1
  4. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/ingestion_engine.py +3 -0
  5. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/loader.py +12 -2
  6. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file.py +6 -0
  7. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/main.py +155 -0
  8. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/PKG-INFO +49 -3
  9. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/__init__.py +0 -0
  10. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/dataset_store.py +0 -0
  11. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/secrets_manager.py +0 -0
  12. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/cmdline.py +0 -0
  13. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/__init__.py +0 -0
  14. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/__init__.py +0 -0
  15. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/base.py +0 -0
  16. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  17. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  18. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection.py +0 -0
  19. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  20. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  21. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  22. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  23. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/events.py +0 -0
  24. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  25. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  26. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  27. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/revision.py +0 -0
  28. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/selector.py +0 -0
  29. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/__init__.py +0 -0
  30. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/_old_event.py +0 -0
  31. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  32. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/domain_event.py +0 -0
  33. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/event_bus.py +0 -0
  34. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/publisher.py +0 -0
  35. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/subscriber.py +0 -0
  36. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/fetch_policy.py +0 -0
  37. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  38. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
  39. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  40. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  41. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/resources/__init__.py +0 -0
  42. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  43. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/sink.py +0 -0
  44. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/source.py +0 -0
  45. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/__init__.py +0 -0
  46. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/set.py +0 -0
  47. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/task.py +0 -0
  48. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/task_summary.py +0 -0
  49. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/timing.py +0 -0
  50. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/__init__.py +0 -0
  51. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  52. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  53. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  54. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/exceptions.py +0 -0
  55. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/__init__.py +0 -0
  56. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/fetch/__init__.py +0 -0
  57. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/fetch/http.py +0 -0
  58. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/serialization/__init__.py +0 -0
  59. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/sink/__init__.py +0 -0
  60. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/sink/postgresql.py +0 -0
  61. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/__init__.py +0 -0
  62. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  63. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/base.py +0 -0
  64. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/match.py +0 -0
  65. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  66. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/__init__.py +0 -0
  67. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  68. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  69. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -0
  70. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  71. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/__init__.py +0 -0
  72. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  73. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  74. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  75. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/server.py +0 -0
  76. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/source_base.py +0 -0
  77. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/utils.py +0 -0
  78. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/SOURCES.txt +0 -0
  79. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/dependency_links.txt +0 -0
  80. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/entry_points.txt +0 -0
  81. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/requires.txt +0 -0
  82. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/top_level.txt +0 -0
  83. {ingestify-0.7.0 → ingestify-0.8.0}/setup.cfg +0 -0
  84. {ingestify-0.7.0 → ingestify-0.8.0}/setup.py +0 -0
{ingestify-0.7.0 → ingestify-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.7.0
+Version: 0.8.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -68,6 +68,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -166,8 +203,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +224,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```

{ingestify-0.7.0 → ingestify-0.8.0}/README.md

@@ -58,6 +58,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -156,8 +193,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -169,12 +214,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```

{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/__init__.py

@@ -7,5 +7,6 @@ except NameError:
 if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource
+    from .main import debug_source

-__version__ = "0.7.0"
+__version__ = "0.8.0"
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/ingestion_engine.py

@@ -110,6 +110,9 @@ class IngestionEngine:
         else:
             do_load()

+    # Alias for load() - more intuitive name for running ingestion
+    run = load
+
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
         datasets = sorted(
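The new `run = load` alias keeps existing `engine.load(...)` call sites working while letting newer code (including `debug_source()` in `main.py` below) say `engine.run()`. A minimal, hedged sketch of the equivalence; the sqlite/file URLs are placeholders in the style of the README example, not values taken from this diff:

```python
from ingestify.main import get_engine

# Placeholder local storage, mirroring the README example.
engine = get_engine(
    metadata_url="sqlite:///dev_catalog.db",
    file_url="file://./dev_files/",
)

# In 0.8.0 both names invoke the same ingestion entry point (run = load).
engine.run()
# engine.load()  # unchanged, still works
```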
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/loader.py

@@ -307,7 +307,17 @@ class Loader:
             auto_ingest_config=auto_ingest_config,
             **selector_filters,
         )
-        if selector_filters and not selectors:
-            logger.warning(f"No data found matching {selector_filters}")
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
         else:
             self.run(selectors, dry_run=dry_run)
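The warning now also fires when only `provider`, `source`, or `dataset_type` is set, and only the filters that were actually given are echoed back. A standalone sketch of the same dict-comprehension idiom; the sample values are made up for illustration:

```python
# Falsy filters (None, "", {}) are dropped before logging, so the warning
# only lists what was actually set.
provider = "statsbomb"
source = None
dataset_type = "match"
selector_filters = {"competition_id": 43}

filters_applied = {
    k: v
    for k, v in {
        "provider": provider,
        "source": source,
        "dataset_type": dataset_type,
        **selector_filters,
    }.items()
    if v
}
print(filters_applied)
# {'provider': 'statsbomb', 'dataset_type': 'match', 'competition_id': 43}
```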
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file.py

@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")

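With the new `elif hasattr(file_, "read")` branch, a `DraftFile` can apparently be built from any file-like object, not only `str` or `BytesIO`. A standalone sketch of the same coercion, kept outside the real `DraftFile` class (the helper name `to_stream` is ours, not ingestify's):

```python
from io import BytesIO, StringIO


def to_stream(file_) -> BytesIO:
    # Mirrors the new branch: anything exposing .read() is coerced to BytesIO,
    # with text payloads encoded as UTF-8.
    data = file_.read()
    if isinstance(data, bytes):
        return BytesIO(data)
    return BytesIO(data.encode("utf-8"))


assert to_stream(BytesIO(b"raw bytes")).read() == b"raw bytes"
assert to_stream(StringIO("text payload")).read() == b"text payload"
```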
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/main.py

@@ -279,3 +279,158 @@ def get_engine(
     ingestion_engine.add_ingestion_plan(ingestion_plan_)

     return ingestion_engine
+
+
+def get_dev_engine(
+    source: Source,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+) -> IngestionEngine:
+    """
+    Quick development helper - creates an engine with minimal setup.
+
+    Args:
+        source: The source to test
+        dataset_type: Dataset type to ingest
+        data_spec_versions: Dict like {"hops": "v1"}
+        ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+    Returns:
+        IngestionEngine configured for development
+
+    Example:
+        >>> source = MySource(name="test", ...)
+        >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+        >>> engine.run()
+        >>>
+        >>> # Access the datasets
+        >>> datasets = engine.store.get_dataset_collection()
+        >>> print(f"Ingested {len(datasets)} datasets")
+    """
+    import tempfile
+    from pathlib import Path
+
+    if configure_logging:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        )
+
+    if dev_dir:
+        # Use provided directory
+        dev_dir = Path(dev_dir)
+    elif ephemeral:
+        # Use temp directory that will be cleaned up
+        import uuid
+
+        dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+    else:
+        # Use persistent directory
+        dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+    dev_dir.mkdir(parents=True, exist_ok=True)
+    metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+    file_url = f"file://{dev_dir}"
+
+    logger.info(f"Dev mode: storing data in {dev_dir}")
+
+    engine = get_engine(
+        metadata_url=metadata_url,
+        file_url=file_url,
+        bucket="main",
+        disable_events=True,
+    )
+
+    data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+    engine.add_ingestion_plan(
+        IngestionPlan(
+            source=source,
+            dataset_type=dataset_type,
+            selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+            fetch_policy=FetchPolicy(),
+            data_spec_versions=data_spec_versions_obj,
+        )
+    )
+
+    return engine
+
+
+def debug_source(
+    source: Source,
+    *,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+    **kwargs,
+) -> IngestionEngine:
+    """
+    Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+    This is a convenience wrapper around get_dev_engine() that does everything:
+    creates the engine, runs ingestion, and displays results.
+
+    Args:
+        source: The source to debug
+        dataset_type: Dataset type (e.g., "match")
+        data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+        ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+        **kwargs: Selector arguments. For sources with discover_selectors(), these
+            filter discovered selectors. Otherwise passed to find_datasets().
+
+    Returns:
+        IngestionEngine: The engine used for ingestion (for further inspection)
+
+    Example:
+        >>> # Simple source without discover_selectors
+        >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+        >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+        >>> # Source with discover_selectors - discovers all competitions
+        >>> source = StatsBombMatchAPI(name="test", ...)
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"}
+        ... )
+
+        >>> # Filter discovered selectors
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"},
+        ...     competition_id=46  # Filters to specific competition
+        ... )
+    """
+    logger.info(f"Debug mode for source: {source.name}")
+
+    engine = get_dev_engine(
+        source=source,
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+        ephemeral=ephemeral,
+        configure_logging=configure_logging,
+        dev_dir=dev_dir,
+    )
+
+    # Run ingestion
+    # Empty selector {} automatically triggers discover_selectors() if available
+    # kwargs filter discovered selectors or are passed to find_datasets()
+    engine.run(**kwargs)
+
+    # Show results
+    datasets = engine.store.get_dataset_collection()
+    logger.info("=" * 60)
+    logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+    logger.info("=" * 60)
+
+    return engine
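For reference, `get_dev_engine()` derives both storage URLs from `dev_dir`, so a non-ephemeral debug run can later be reopened with the regular `get_engine()`. A small runnable sketch of that layout (it only builds the URLs; it does not create an engine):

```python
import tempfile
from pathlib import Path

# The non-ephemeral default location used by get_dev_engine(ephemeral=False).
dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"

# Same derivation as in get_dev_engine(): a SQLite catalog plus a file store
# rooted in the same directory.
metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
file_url = f"file://{dev_dir}"

print(metadata_url)  # e.g. sqlite:////tmp/ingestify-dev/database.db
print(file_url)      # e.g. file:///tmp/ingestify-dev
```

These are the same `metadata_url`/`file_url` keyword arguments `get_engine()` accepts in the README example above.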
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.7.0
+Version: 0.8.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -68,6 +68,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -166,8 +203,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +224,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
