ingestify 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

@@ -149,185 +149,202 @@ class IngestionJobStateString(TypeDecorator):
         return IngestionJobState[value]
 
 
-metadata = MetaData()
-
-dataset_table = Table(
-    "dataset",
-    metadata,
-    Column("bucket", String(255), default=None),
-    Column("dataset_id", String(255), primary_key=True),
-    Column("provider", String(255), index=True),
-    Column("dataset_type", String(255), index=True),
-    Column("state", DatasetStateString),
-    Column("name", String(255)),
-    Column(
-        "identifier",
-        # Use JSONB when available
-        JSON().with_variant(JSONB(), "postgresql"),
-    ),
-    Column("metadata", JSON),
-    Column("created_at", TZDateTime(6)),
-    Column("updated_at", TZDateTime(6)),
-    Column("last_modified_at", TZDateTime(6)),
-    # Required for performance querying when there are a lot of Datasets
-    # with the same provider and dataset_type
-    Index(
-        "idx_bucket_type_provider_last_modified",
-        "bucket",
-        "provider",
-        "dataset_type",
-        "last_modified_at",
-    ),
-)
-
-revision_table = Table(
-    "revision",
-    metadata,
-    Column(
-        "dataset_id", String(255), ForeignKey("dataset.dataset_id"), primary_key=True
-    ),
-    Column("revision_id", Integer, primary_key=True),
-    Column("description", String(255)),
-    Column("created_at", TZDateTime(6)),
-    Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
-    Column("source", JSONType()),
-)
-
-file_table = Table(
-    "file",
-    metadata,
-    Column("dataset_id", String(255), primary_key=True),
-    Column("revision_id", Integer, primary_key=True),
-    Column("file_id", String(255), primary_key=True),
-    Column("created_at", TZDateTime(6)),
-    Column("modified_at", TZDateTime(6)),
-    Column("tag", String(255)),
-    Column("content_type", String(255)),
-    Column("size", BigInteger),
-    Column("data_feed_key", String(255)),
-    Column("data_spec_version", String(255)),
-    Column("data_serialization_format", String(255)),
-    Column("storage_compression_method", String(255)),
-    Column("storage_size", BigInteger),
-    Column("storage_path", PathString),
-    ForeignKeyConstraint(
-        ("dataset_id", "revision_id"),
-        [revision_table.c.dataset_id, revision_table.c.revision_id],
-        ondelete="CASCADE",
-    ),
-)
-
-ingestion_job_summary_table = Table(
-    "ingestion_job_summary",
-    metadata,
-    Column("ingestion_job_summary_id", String(255), primary_key=True),
-    Column("ingestion_job_id", String(255), index=True),
-    # From the IngestionPlan
-    Column("source_name", String(255)),
-    Column("provider", String(255)),
-    Column("dataset_type", String(255)),
-    Column(
-        "data_spec_versions",
-        JSONType(
-            serializer=lambda data_spec_versions: {
-                key: list(value) for key, value in data_spec_versions.items()
-            },
-            deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
-                data_spec_versions
+def get_tables(table_prefix: str = ""):
+    """
+    Create all SQLAlchemy table definitions with an optional prefix.
+
+    Args:
+        table_prefix: Optional prefix for all table names (e.g., "prod_" would create "prod_dataset")
+
+    Returns:
+        A dictionary containing all table objects and metadata
+    """
+    metadata = MetaData()
+
+    dataset_table = Table(
+        f"{table_prefix}dataset",
+        metadata,
+        Column("bucket", String(255), default=None),
+        Column("dataset_id", String(255), primary_key=True),
+        Column("provider", String(255), index=True),
+        Column("dataset_type", String(255), index=True),
+        Column("state", DatasetStateString),
+        Column("name", String(255)),
+        Column(
+            "identifier",
+            # Use JSONB when available
+            JSON().with_variant(JSONB(), "postgresql"),
+        ),
+        Column("metadata", JSON),
+        Column("created_at", TZDateTime(6)),
+        Column("updated_at", TZDateTime(6)),
+        Column("last_modified_at", TZDateTime(6)),
+        # Required for performance querying when there are a lot of Datasets
+        # with the same provider and dataset_type
+        Index(
+            "idx_bucket_type_provider_last_modified",
+            "bucket",
+            "provider",
+            "dataset_type",
+            "last_modified_at",
+        ),
+    )
+
+    revision_table = Table(
+        f"{table_prefix}revision",
+        metadata,
+        Column(
+            "dataset_id",
+            String(255),
+            ForeignKey(f"{table_prefix}dataset.dataset_id"),
+            primary_key=True,
+        ),
+        Column("revision_id", Integer, primary_key=True),
+        Column("description", String(255)),
+        Column("created_at", TZDateTime(6)),
+        Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+        Column("source", JSONType()),
+    )
+
+    file_table = Table(
+        f"{table_prefix}file",
+        metadata,
+        Column("dataset_id", String(255), primary_key=True),
+        Column("revision_id", Integer, primary_key=True),
+        Column("file_id", String(255), primary_key=True),
+        Column("created_at", TZDateTime(6)),
+        Column("modified_at", TZDateTime(6)),
+        Column("tag", String(255)),
+        Column("content_type", String(255)),
+        Column("size", BigInteger),
+        Column("data_feed_key", String(255)),
+        Column("data_spec_version", String(255)),
+        Column("data_serialization_format", String(255)),
+        Column("storage_compression_method", String(255)),
+        Column("storage_size", BigInteger),
+        Column("storage_path", PathString),
+        ForeignKeyConstraint(
+            ("dataset_id", "revision_id"),
+            [revision_table.c.dataset_id, revision_table.c.revision_id],
+            ondelete="CASCADE",
+        ),
+    )
+
+    ingestion_job_summary_table = Table(
+        f"{table_prefix}ingestion_job_summary",
+        metadata,
+        Column("ingestion_job_summary_id", String(255), primary_key=True),
+        Column("ingestion_job_id", String(255), index=True),
+        # From the IngestionPlan
+        Column("source_name", String(255)),
+        Column("provider", String(255)),
+        Column("dataset_type", String(255)),
+        Column(
+            "data_spec_versions",
+            JSONType(
+                serializer=lambda data_spec_versions: {
+                    key: list(value) for key, value in data_spec_versions.items()
+                },
+                deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                    data_spec_versions
+                ),
             ),
         ),
-    ),
-    Column(
-        "selector",
-        JSONType(
-            serializer=lambda selector: selector.filtered_attributes,
-            deserializer=lambda selector: Selector(**selector),
+        Column(
+            "selector",
+            JSONType(
+                serializer=lambda selector: selector.filtered_attributes,
+                deserializer=lambda selector: Selector(**selector),
+            ),
         ),
-    ),
-    Column("started_at", TZDateTime(6)),
-    Column("ended_at", TZDateTime(6)),
-    # Some task counters
-    Column("state", IngestionJobStateString),
-    Column("total_tasks", Integer),
-    Column("successful_tasks", Integer),
-    Column("ignored_successful_tasks", Integer),
-    Column("skipped_tasks", Integer),
-    Column("failed_tasks", Integer),
-    Column(
-        "timings",
-        JSONType(
-            serializer=lambda timings: [
-                # Timing is probably already a dictionary. Load it into Timing first, so it can be dumped
-                # in json mode
-                Timing.model_validate(timing).model_dump(mode="json")
-                for timing in timings
-            ],
-            deserializer=lambda timings: [
-                Timing.model_validate(timing) for timing in timings
-            ],
+        Column("started_at", TZDateTime(6)),
+        Column("ended_at", TZDateTime(6)),
+        # Some task counters
+        Column("state", IngestionJobStateString),
+        Column("total_tasks", Integer),
+        Column("successful_tasks", Integer),
+        Column("ignored_successful_tasks", Integer),
+        Column("skipped_tasks", Integer),
+        Column("failed_tasks", Integer),
+        Column(
+            "timings",
+            JSONType(
+                serializer=lambda timings: [
+                    # Timing is probably already a dictionary. Load it into Timing first, so it can be dumped
+                    # in json mode
+                    Timing.model_validate(timing).model_dump(mode="json")
+                    for timing in timings
+                ],
+                deserializer=lambda timings: [
+                    Timing.model_validate(timing) for timing in timings
+                ],
+            ),
         ),
-    ),
-    # Column(
-    #     "task_summaries",
-    #     JSONType(
-    #         serializer=lambda task_summaries: [
-    #             task_summary.model_dump(mode="json") for task_summary in task_summaries
-    #         ],
-    #         deserializer=lambda task_summaries: [
-    #             TaskSummary.model_validate(task_summary)
-    #             for task_summary in task_summaries
-    #         ],
-    #     ),
-    # ),
-)
-
-
-task_summary_table = Table(
-    "task_summary",
-    metadata,
-    Column(
-        "ingestion_job_summary_id",
-        String(255),
-        ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
-        primary_key=True,
-    ),
-    Column("task_id", String(255), primary_key=True),
-    Column("started_at", TZDateTime(6)),
-    Column("ended_at", TZDateTime(6)),
-    Column("operation", OperationString),
-    Column(
-        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
-    ),
-    Column("persisted_file_count", Integer),
-    Column("bytes_retrieved", Integer),
-    Column("last_modified", TZDateTime(6)),
-    Column("state", TaskStateString),
-    Column(
-        "timings",
-        JSONType(
-            serializer=lambda timings: [
-                Timing.model_validate(timing).model_dump(mode="json")
-                for timing in timings
-            ],
-            deserializer=lambda timings: [
-                Timing.model_validate(timing) for timing in timings
-            ],
+    )
+
+    task_summary_table = Table(
+        f"{table_prefix}task_summary",
+        metadata,
+        Column(
+            "ingestion_job_summary_id",
+            String(255),
+            ForeignKey(f"{table_prefix}ingestion_job_summary.ingestion_job_summary_id"),
+            primary_key=True,
         ),
-    ),
-    # Column("description", String(255)),
-    # Column("created_at", TZDateTime(6)),
-    # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
-    # Column("source", JSONType()),
-)
-
-store_version_table = Table(
-    "store_version",
-    metadata,
-    Column("id", Integer, primary_key=True, default=1),
-    Column("ingestify_version", String(255), nullable=False),
-    Column("created_at", TZDateTime(6), nullable=False),
-    Column("updated_at", TZDateTime(6), nullable=False),
-)
+        Column("task_id", String(255), primary_key=True),
+        Column("started_at", TZDateTime(6)),
+        Column("ended_at", TZDateTime(6)),
+        Column("operation", OperationString),
+        Column(
+            "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+        ),
+        Column("persisted_file_count", Integer),
+        Column("bytes_retrieved", Integer),
+        Column("last_modified", TZDateTime(6)),
+        Column("state", TaskStateString),
+        Column(
+            "timings",
+            JSONType(
+                serializer=lambda timings: [
+                    Timing.model_validate(timing).model_dump(mode="json")
+                    for timing in timings
+                ],
+                deserializer=lambda timings: [
+                    Timing.model_validate(timing) for timing in timings
+                ],
+            ),
+        ),
+    )
+
+    store_version_table = Table(
+        f"{table_prefix}store_version",
+        metadata,
+        Column("id", Integer, primary_key=True, default=1),
+        Column("ingestify_version", String(255), nullable=False),
+        Column("created_at", TZDateTime(6), nullable=False),
+        Column("updated_at", TZDateTime(6), nullable=False),
+    )
+
+    return {
+        "metadata": metadata,
+        "dataset_table": dataset_table,
+        "revision_table": revision_table,
+        "file_table": file_table,
+        "ingestion_job_summary_table": ingestion_job_summary_table,
+        "task_summary_table": task_summary_table,
+        "store_version_table": store_version_table,
+    }
+
+
+# Create default tables without prefix for backward compatibility
+_default_tables = get_tables("")
+metadata = _default_tables["metadata"]
+dataset_table = _default_tables["dataset_table"]
+revision_table = _default_tables["revision_table"]
+file_table = _default_tables["file_table"]
+ingestion_job_summary_table = _default_tables["ingestion_job_summary_table"]
+task_summary_table = _default_tables["task_summary_table"]
+store_version_table = _default_tables["store_version_table"]
 #
 #
 # mapper_registry = registry()
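The net effect of the hunk above: the module-level table definitions move into a `get_tables()` factory that stamps an optional prefix onto every table name (and onto the `ForeignKey` targets, so each prefix yields a self-consistent schema), while the module-level names are rebuilt via `get_tables("")` for backward compatibility. A minimal sketch of how the factory might be used — the import path and engine URL are assumptions, since the hunk header does not show the file name:

```python
from sqlalchemy import create_engine

# Hypothetical import path; adjust to the mapping module patched above.
# from ingestify.infra.store.dataset.sqlalchemy import get_tables

tables = get_tables(table_prefix="tenant_a_")

# Every table is registered on the returned MetaData, so one call creates
# tenant_a_dataset, tenant_a_revision, tenant_a_file, ... in one go.
engine = create_engine("sqlite:///:memory:")
tables["metadata"].create_all(engine)

assert tables["dataset_table"].name == "tenant_a_dataset"
```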
ingestify/main.py CHANGED
@@ -82,14 +82,28 @@ def build_file_repository(file_url: str, identifier_transformer) -> FileReposito
 
 
 def get_dataset_store_by_urls(
-    metadata_url: str, file_url: str, bucket: str, dataset_types
+    metadata_url: str,
+    file_url: str,
+    bucket: str,
+    dataset_types,
+    metadata_options: dict = None,
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
+
+    Args:
+        metadata_url: Database connection URL
+        file_url: File storage URL
+        bucket: Bucket name
+        dataset_types: Dataset type configurations
+        metadata_options: Optional dict with metadata store options (e.g., table_prefix)
     """
     if not bucket:
         raise Exception("Bucket is not specified")
 
+    if metadata_options is None:
+        metadata_options = {}
+
     identifier_transformer = IdentifierTransformer()
     for dataset_type in dataset_types:
         for id_key, id_config in dataset_type["identifier_keys"].items():
@@ -110,7 +124,12 @@ def get_dataset_store_by_urls(
     if metadata_url.startswith("postgres://"):
         metadata_url = metadata_url.replace("postgress://", "postgress+")
 
-    sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
+    # Extract table_prefix from metadata_options
+    table_prefix = metadata_options.get("table_prefix", "")
+
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(
+        metadata_url, table_prefix=table_prefix
+    )
 
     dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
@@ -124,11 +143,16 @@ def get_dataset_store_by_urls(
 def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")
 
+    # Extract metadata_options if provided
+    main_config = config["main"]
+    metadata_options = main_config.get("metadata_options", {})
+
     return get_dataset_store_by_urls(
-        metadata_url=config["main"]["metadata_url"],
-        file_url=config["main"]["file_url"],
-        bucket=bucket or config["main"].get("default_bucket"),
+        metadata_url=main_config["metadata_url"],
+        file_url=main_config["file_url"],
+        bucket=bucket or main_config.get("default_bucket"),
         dataset_types=config.get("dataset_types", []),
+        metadata_options=metadata_options,
     )
 
 
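`get_datastore` now pulls the options out of the parsed YAML config, so the `main` section must parse into a mapping shaped roughly like the sketch below (values are placeholders; only the keys read by the code above are shown):

```python
# Shape of config["main"] that the new lookups expect; all values are
# illustrative, and metadata_options may be omitted entirely.
main_section = {
    "metadata_url": "sqlite:///metadata.db",
    "file_url": "file:///tmp/ingestify-files",
    "default_bucket": "main",
    "metadata_options": {"table_prefix": "tenant_a_"},
}
```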
@@ -219,11 +243,16 @@ def get_engine(
         sources[name] = build_source(name=name, source_args=source_args)
 
     logger.info("Initializing IngestionEngine")
+
+    # Extract metadata_options if provided
+    metadata_options = config["main"].get("metadata_options", {})
+
     store = get_dataset_store_by_urls(
         metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
         dataset_types=config.get("dataset_types", []),
+        metadata_options=metadata_options,
     )
 
     # Setup an EventBus and wire some more components
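Together these hunks thread `metadata_options` from the config down to `SqlAlchemySessionProvider`, whose new `table_prefix` argument selects which set of tables the store reads and writes. A hedged sketch of the direct API, with placeholder URLs and an empty `dataset_types` list — two stores sharing one metadata database without colliding:

```python
# Both stores point at the same database; the prefix keeps their tables
# (tenant_a_dataset vs tenant_b_dataset, ...) separate.
store_a = get_dataset_store_by_urls(
    metadata_url="sqlite:///metadata.db",
    file_url="file:///tmp/files-a",
    bucket="main",
    dataset_types=[],
    metadata_options={"table_prefix": "tenant_a_"},
)
store_b = get_dataset_store_by_urls(
    metadata_url="sqlite:///metadata.db",
    file_url="file:///tmp/files-b",
    bucket="main",
    dataset_types=[],
    metadata_options={"table_prefix": "tenant_b_"},
)
```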
@@ -0,0 +1,17 @@
+import tempfile
+
+import pytest
+import os
+
+
+@pytest.fixture(scope="function", autouse=True)
+def datastore_dir():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        os.environ["TEST_DIR"] = tmpdirname
+        os.environ["INGESTIFY_RUN_EAGER"] = "true"
+        yield tmpdirname
+
+
+@pytest.fixture(scope="session")
+def config_file():
+    return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")
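The added file is a pytest conftest: the autouse `datastore_dir` fixture gives every test a throwaway directory via two environment variables, and `config_file` points at a `config.yaml` next to the conftest. A sketch of a test that would consume these fixtures — the bucket name and the assumption that config.yaml interpolates `TEST_DIR` are illustrative, since neither appears in the diff:

```python
import os

from ingestify.main import get_datastore


def test_store_lives_in_temp_dir(config_file, datastore_dir):
    # datastore_dir is autouse, so the environment is prepared before the
    # test body runs, and a fresh directory is created per test function.
    assert os.environ["TEST_DIR"] == datastore_dir
    assert os.environ["INGESTIFY_RUN_EAGER"] == "true"

    # "main" is a placeholder bucket; a real test would match config.yaml.
    store = get_datastore(config_file, bucket="main")
```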