ingestify 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/ingestion_engine.py +3 -0
- ingestify/application/loader.py +12 -2
- ingestify/domain/models/dataset/dataset_state.py +1 -0
- ingestify/domain/models/dataset/file.py +6 -0
- ingestify/domain/models/ingestion/ingestion_job.py +5 -1
- ingestify/domain/models/resources/dataset_resource.py +13 -1
- ingestify/infra/fetch/http.py +3 -3
- ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
- ingestify/infra/store/dataset/sqlalchemy/tables.py +191 -174
- ingestify/main.py +189 -5
- ingestify/tests/__init__.py +0 -0
- ingestify/tests/conftest.py +17 -0
- ingestify/tests/test_auto_ingest.py +418 -0
- ingestify/tests/test_engine.py +501 -0
- ingestify/tests/test_events.py +201 -0
- ingestify/tests/test_file_cache.py +98 -0
- ingestify/tests/test_pagination.py +162 -0
- ingestify/tests/test_store_version.py +73 -0
- ingestify/tests/test_table_prefix.py +78 -0
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/METADATA +59 -5
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/RECORD +25 -16
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/WHEEL +1 -1
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.7.0.dist-info → ingestify-0.9.0.dist-info}/top_level.txt +0 -0
@@ -149,185 +149,202 @@ class IngestionJobStateString(TypeDecorator):
         return IngestionJobState[value]
 
 
- … (old unprefixed, module-level table definitions; these removed lines are collapsed in the diff view)
-task_summary_table = Table(
-    "task_summary",
-    metadata,
-    Column(
-        "ingestion_job_summary_id",
-        String(255),
-        ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
-        primary_key=True,
-    ),
-    Column("task_id", String(255), primary_key=True),
-    Column("started_at", TZDateTime(6)),
-    Column("ended_at", TZDateTime(6)),
-    Column("operation", OperationString),
-    Column(
-        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
-    ),
-    Column("persisted_file_count", Integer),
-    Column("bytes_retrieved", Integer),
-    Column("last_modified", TZDateTime(6)),
-    Column("state", TaskStateString),
-    Column(
-        "timings",
-        JSONType(
-            serializer=lambda timings: [
-                Timing.model_validate(timing).model_dump(mode="json")
-                for timing in timings
-            ],
-            deserializer=lambda timings: [
-                Timing.model_validate(timing) for timing in timings
-            ],
- … (remaining removed closing lines are collapsed in the diff view)
+def get_tables(table_prefix: str = ""):
+    """
+    Create all SQLAlchemy table definitions with an optional prefix.
+
+    Args:
+        table_prefix: Optional prefix for all table names (e.g., "prod_" would create "prod_dataset")
+
+    Returns:
+        A dictionary containing all table objects and metadata
+    """
+    metadata = MetaData()
+
+    dataset_table = Table(
+        f"{table_prefix}dataset",
+        metadata,
+        Column("bucket", String(255), default=None),
+        Column("dataset_id", String(255), primary_key=True),
+        Column("provider", String(255), index=True),
+        Column("dataset_type", String(255), index=True),
+        Column("state", DatasetStateString),
+        Column("name", String(255)),
+        Column(
+            "identifier",
+            # Use JSONB when available
+            JSON().with_variant(JSONB(), "postgresql"),
+        ),
+        Column("metadata", JSON),
+        Column("created_at", TZDateTime(6)),
+        Column("updated_at", TZDateTime(6)),
+        Column("last_modified_at", TZDateTime(6)),
+        # Required for performance querying when there are a lot of Datasets
+        # with the same provider and dataset_type
+        Index(
+            "idx_bucket_type_provider_last_modified",
+            "bucket",
+            "provider",
+            "dataset_type",
+            "last_modified_at",
+        ),
+    )
+
+    revision_table = Table(
+        f"{table_prefix}revision",
+        metadata,
+        Column(
+            "dataset_id",
+            String(255),
+            ForeignKey(f"{table_prefix}dataset.dataset_id"),
+            primary_key=True,
+        ),
+        Column("revision_id", Integer, primary_key=True),
+        Column("description", String(255)),
+        Column("created_at", TZDateTime(6)),
+        Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+        Column("source", JSONType()),
+    )
+
+    file_table = Table(
+        f"{table_prefix}file",
+        metadata,
+        Column("dataset_id", String(255), primary_key=True),
+        Column("revision_id", Integer, primary_key=True),
+        Column("file_id", String(255), primary_key=True),
+        Column("created_at", TZDateTime(6)),
+        Column("modified_at", TZDateTime(6)),
+        Column("tag", String(255)),
+        Column("content_type", String(255)),
+        Column("size", BigInteger),
+        Column("data_feed_key", String(255)),
+        Column("data_spec_version", String(255)),
+        Column("data_serialization_format", String(255)),
+        Column("storage_compression_method", String(255)),
+        Column("storage_size", BigInteger),
+        Column("storage_path", PathString),
+        ForeignKeyConstraint(
+            ("dataset_id", "revision_id"),
+            [revision_table.c.dataset_id, revision_table.c.revision_id],
+            ondelete="CASCADE",
+        ),
+    )
+
+    ingestion_job_summary_table = Table(
+        f"{table_prefix}ingestion_job_summary",
+        metadata,
+        Column("ingestion_job_summary_id", String(255), primary_key=True),
+        Column("ingestion_job_id", String(255), index=True),
+        # From the IngestionPlan
+        Column("source_name", String(255)),
+        Column("provider", String(255)),
+        Column("dataset_type", String(255)),
+        Column(
+            "data_spec_versions",
+            JSONType(
+                serializer=lambda data_spec_versions: {
+                    key: list(value) for key, value in data_spec_versions.items()
+                },
+                deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                    data_spec_versions
+                ),
+            ),
+        ),
+        Column(
+            "selector",
+            JSONType(
+                serializer=lambda selector: selector.filtered_attributes,
+                deserializer=lambda selector: Selector(**selector),
+            ),
+        ),
+        Column("started_at", TZDateTime(6)),
+        Column("ended_at", TZDateTime(6)),
+        # Some task counters
+        Column("state", IngestionJobStateString),
+        Column("total_tasks", Integer),
+        Column("successful_tasks", Integer),
+        Column("ignored_successful_tasks", Integer),
+        Column("skipped_tasks", Integer),
+        Column("failed_tasks", Integer),
+        Column(
+            "timings",
+            JSONType(
+                serializer=lambda timings: [
+                    # Timing is probably already a dictionary. Load it into Timing first, so it can be dumped
+                    # in json mode
+                    Timing.model_validate(timing).model_dump(mode="json")
+                    for timing in timings
+                ],
+                deserializer=lambda timings: [
+                    Timing.model_validate(timing) for timing in timings
+                ],
+            ),
+        ),
+    )
+
+    task_summary_table = Table(
+        f"{table_prefix}task_summary",
+        metadata,
+        Column(
+            "ingestion_job_summary_id",
+            String(255),
+            ForeignKey(f"{table_prefix}ingestion_job_summary.ingestion_job_summary_id"),
+            primary_key=True,
+        ),
+        Column("task_id", String(255), primary_key=True),
+        Column("started_at", TZDateTime(6)),
+        Column("ended_at", TZDateTime(6)),
+        Column("operation", OperationString),
+        Column(
+            "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+        ),
+        Column("persisted_file_count", Integer),
+        Column("bytes_retrieved", Integer),
+        Column("last_modified", TZDateTime(6)),
+        Column("state", TaskStateString),
+        Column(
+            "timings",
+            JSONType(
+                serializer=lambda timings: [
+                    Timing.model_validate(timing).model_dump(mode="json")
+                    for timing in timings
+                ],
+                deserializer=lambda timings: [
+                    Timing.model_validate(timing) for timing in timings
+                ],
+            ),
+        ),
+    )
+
+    store_version_table = Table(
+        f"{table_prefix}store_version",
+        metadata,
+        Column("id", Integer, primary_key=True, default=1),
+        Column("ingestify_version", String(255), nullable=False),
+        Column("created_at", TZDateTime(6), nullable=False),
+        Column("updated_at", TZDateTime(6), nullable=False),
+    )
+
+    return {
+        "metadata": metadata,
+        "dataset_table": dataset_table,
+        "revision_table": revision_table,
+        "file_table": file_table,
+        "ingestion_job_summary_table": ingestion_job_summary_table,
+        "task_summary_table": task_summary_table,
+        "store_version_table": store_version_table,
+    }
+
+
+# Create default tables without prefix for backward compatibility
+_default_tables = get_tables("")
+metadata = _default_tables["metadata"]
+dataset_table = _default_tables["dataset_table"]
+revision_table = _default_tables["revision_table"]
+file_table = _default_tables["file_table"]
+ingestion_job_summary_table = _default_tables["ingestion_job_summary_table"]
+task_summary_table = _default_tables["task_summary_table"]
+store_version_table = _default_tables["store_version_table"]
 #
 #
 # mapper_registry = registry()
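The new get_tables() factory returns the shared MetaData plus each table object keyed by name. As a quick, hedged illustration (not part of the diff), a prefixed schema could be created like this; the import path follows the file listing above, while the SQLite URL and the "prod_" prefix are assumptions made for the example:

# Illustrative sketch only: the engine URL and the "prod_" prefix are assumptions.
from sqlalchemy import create_engine

from ingestify.infra.store.dataset.sqlalchemy.tables import get_tables

tables = get_tables(table_prefix="prod_")  # builds "prod_dataset", "prod_revision", ...
metadata = tables["metadata"]              # MetaData that owns all prefixed tables

engine = create_engine("sqlite:///prefixed-example.db")
metadata.create_all(engine)                # materialize the prefixed schema

print(sorted(tables))                      # the keys returned by get_tables()

The module-level assignments at the end of the hunk keep the unprefixed tables available for existing imports, as the "backward compatibility" comment notes.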
ingestify/main.py
CHANGED
@@ -82,14 +82,28 @@ def build_file_repository(file_url: str, identifier_transformer) -> FileReposito
 
 
 def get_dataset_store_by_urls(
-    metadata_url: str,
+    metadata_url: str,
+    file_url: str,
+    bucket: str,
+    dataset_types,
+    metadata_options: dict = None,
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
+
+    Args:
+        metadata_url: Database connection URL
+        file_url: File storage URL
+        bucket: Bucket name
+        dataset_types: Dataset type configurations
+        metadata_options: Optional dict with metadata store options (e.g., table_prefix)
     """
     if not bucket:
         raise Exception("Bucket is not specified")
 
+    if metadata_options is None:
+        metadata_options = {}
+
     identifier_transformer = IdentifierTransformer()
     for dataset_type in dataset_types:
         for id_key, id_config in dataset_type["identifier_keys"].items():
@@ -110,7 +124,12 @@ def get_dataset_store_by_urls(
     if metadata_url.startswith("postgres://"):
         metadata_url = metadata_url.replace("postgress://", "postgress+")
 
- … (removed line collapsed in the diff view)
+    # Extract table_prefix from metadata_options
+    table_prefix = metadata_options.get("table_prefix", "")
+
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(
+        metadata_url, table_prefix=table_prefix
+    )
 
     dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
@@ -124,11 +143,16 @@ def get_dataset_store_by_urls(
 def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")
 
+    # Extract metadata_options if provided
+    main_config = config["main"]
+    metadata_options = main_config.get("metadata_options", {})
+
     return get_dataset_store_by_urls(
-        metadata_url=
-        file_url=
-        bucket=bucket or
+        metadata_url=main_config["metadata_url"],
+        file_url=main_config["file_url"],
+        bucket=bucket or main_config.get("default_bucket"),
         dataset_types=config.get("dataset_types", []),
+        metadata_options=metadata_options,
     )
 
 
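To make the new metadata_options plumbing concrete, here is a hedged sketch of calling get_dataset_store_by_urls() directly with a table prefix; only the keyword names and the "table_prefix" key come from the diff, while the URLs, bucket name, and empty dataset_types list are placeholder assumptions:

# Illustrative sketch only: URLs, bucket, and dataset_types are placeholders.
from ingestify.main import get_dataset_store_by_urls

store = get_dataset_store_by_urls(
    metadata_url="sqlite:////tmp/ingestify-example/database.db",
    file_url="file:///tmp/ingestify-example/files",
    bucket="main",
    dataset_types=[],
    metadata_options={"table_prefix": "prod_"},  # forwarded as SqlAlchemySessionProvider(..., table_prefix=...)
)

In the config-file path, the same options are read from config["main"]["metadata_options"], as the get_datastore() hunk above and the get_engine() hunk below show.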
@@ -219,11 +243,16 @@ def get_engine(
         sources[name] = build_source(name=name, source_args=source_args)
 
     logger.info("Initializing IngestionEngine")
+
+    # Extract metadata_options if provided
+    metadata_options = config["main"].get("metadata_options", {})
+
     store = get_dataset_store_by_urls(
         metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
         dataset_types=config.get("dataset_types", []),
+        metadata_options=metadata_options,
     )
 
     # Setup an EventBus and wire some more components
@@ -279,3 +308,158 @@ def get_engine(
         ingestion_engine.add_ingestion_plan(ingestion_plan_)
 
     return ingestion_engine
+
+
+def get_dev_engine(
+    source: Source,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+) -> IngestionEngine:
+    """
+    Quick development helper - creates an engine with minimal setup.
+
+    Args:
+        source: The source to test
+        dataset_type: Dataset type to ingest
+        data_spec_versions: Dict like {"hops": "v1"}
+        ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+    Returns:
+        IngestionEngine configured for development
+
+    Example:
+        >>> source = MySource(name="test", ...)
+        >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+        >>> engine.run()
+        >>>
+        >>> # Access the datasets
+        >>> datasets = engine.store.get_dataset_collection()
+        >>> print(f"Ingested {len(datasets)} datasets")
+    """
+    import tempfile
+    from pathlib import Path
+
+    if configure_logging:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        )
+
+    if dev_dir:
+        # Use provided directory
+        dev_dir = Path(dev_dir)
+    elif ephemeral:
+        # Use temp directory that will be cleaned up
+        import uuid
+
+        dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+    else:
+        # Use persistent directory
+        dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+    dev_dir.mkdir(parents=True, exist_ok=True)
+    metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+    file_url = f"file://{dev_dir}"
+
+    logger.info(f"Dev mode: storing data in {dev_dir}")
+
+    engine = get_engine(
+        metadata_url=metadata_url,
+        file_url=file_url,
+        bucket="main",
+        disable_events=True,
+    )
+
+    data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+    engine.add_ingestion_plan(
+        IngestionPlan(
+            source=source,
+            dataset_type=dataset_type,
+            selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+            fetch_policy=FetchPolicy(),
+            data_spec_versions=data_spec_versions_obj,
+        )
+    )
+
+    return engine
+
+
+def debug_source(
+    source: Source,
+    *,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+    **kwargs,
+) -> IngestionEngine:
+    """
+    Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+    This is a convenience wrapper around get_dev_engine() that does everything:
+    creates the engine, runs ingestion, and displays results.
+
+    Args:
+        source: The source to debug
+        dataset_type: Dataset type (e.g., "match")
+        data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+        ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+        **kwargs: Selector arguments. For sources with discover_selectors(), these
+            filter discovered selectors. Otherwise passed to find_datasets().
+
+    Returns:
+        IngestionEngine: The engine used for ingestion (for further inspection)
+
+    Example:
+        >>> # Simple source without discover_selectors
+        >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+        >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+        >>> # Source with discover_selectors - discovers all competitions
+        >>> source = StatsBombMatchAPI(name="test", ...)
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"}
+        ... )
+
+        >>> # Filter discovered selectors
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"},
+        ...     competition_id=46  # Filters to specific competition
+        ... )
+    """
+    logger.info(f"Debug mode for source: {source.name}")
+
+    engine = get_dev_engine(
+        source=source,
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+        ephemeral=ephemeral,
+        configure_logging=configure_logging,
+        dev_dir=dev_dir,
+    )
+
+    # Run ingestion
+    # Empty selector {} automatically triggers discover_selectors() if available
+    # kwargs filter discovered selectors or are passed to find_datasets()
+    engine.run(**kwargs)
+
+    # Show results
+    datasets = engine.store.get_dataset_collection()
+    logger.info("=" * 60)
+    logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+    logger.info("=" * 60)
+
+    return engine

ingestify/tests/__init__.py
File without changes

ingestify/tests/conftest.py
@@ -0,0 +1,17 @@
+import tempfile
+
+import pytest
+import os
+
+
+@pytest.fixture(scope="function", autouse=True)
+def datastore_dir():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        os.environ["TEST_DIR"] = tmpdirname
+        os.environ["INGESTIFY_RUN_EAGER"] = "true"
+        yield tmpdirname
+
+
+@pytest.fixture(scope="session")
+def config_file():
+    return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")