ingestify 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/domain/models/dataset/dataset_state.py +1 -0
- ingestify/domain/models/ingestion/ingestion_job.py +5 -1
- ingestify/domain/models/resources/dataset_resource.py +13 -1
- ingestify/infra/fetch/http.py +1 -3
- ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
- ingestify/infra/store/dataset/sqlalchemy/tables.py +191 -174
- ingestify/main.py +34 -5
- ingestify/tests/__init__.py +0 -0
- ingestify/tests/conftest.py +17 -0
- ingestify/tests/test_auto_ingest.py +418 -0
- ingestify/tests/test_engine.py +501 -0
- ingestify/tests/test_events.py +201 -0
- ingestify/tests/test_file_cache.py +98 -0
- ingestify/tests/test_pagination.py +162 -0
- ingestify/tests/test_store_version.py +73 -0
- ingestify/tests/test_table_prefix.py +78 -0
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/METADATA +11 -3
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/RECORD +22 -13
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/WHEEL +1 -1
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/entry_points.txt +0 -0
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -129,7 +129,6 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
            files = {
                file_id: task_summary.record_load_file(
                    lambda: load_file(file_resource, dataset=self.dataset),
@@ -138,6 +137,8 @@ class UpdateDatasetTask(Task):
                for file_id, file_resource in self.dataset_resource.files.items()
            }
 
+            self.dataset_resource.run_post_load_files(files)
+
            try:
                revision = self.store.update_dataset(
                    dataset=self.dataset,
@@ -181,6 +182,9 @@ class CreateDatasetTask(Task):
                )
                for file_id, file_resource in self.dataset_resource.files.items()
            }
+
+            self.dataset_resource.run_post_load_files(files)
+
            try:
                revision = self.store.create_dataset(
                    dataset_type=self.dataset_resource.dataset_type,
ingestify/domain/models/resources/dataset_resource.py
CHANGED

@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING  # noqa
+from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING, Dict  # noqa
 from pydantic import Field
 
 from ingestify.domain.models.base import BaseModel
@@ -50,6 +50,18 @@ class DatasetResource(BaseModel):
     metadata: dict = Field(default_factory=dict)
     state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
     files: dict[str, FileResource] = Field(default_factory=dict)
+    post_load_files: Optional[
+        Callable[["DatasetResource", Dict[str, DraftFile]], None]
+    ] = None
+
+    def run_post_load_files(self, files: Dict[str, DraftFile]):
+        """Hook to modify dataset attributes based on loaded file content.
+
+        Useful for setting state based on file content, e.g., keep state=SCHEDULED
+        when files contain '{}', change to COMPLETE when they contain actual data.
+        """
+        if self.post_load_files:
+            self.post_load_files(self, files)
 
     def add_file(
         self,
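The new hook gives a source a seam between file loading and dataset persistence. A minimal usage sketch follows; the callback body and the `looks_empty` helper are illustrative, not part of the diff:

# Hypothetical callback for the new hook: keep a dataset SCHEDULED while
# every loaded file is an empty "{}" stub, flip to COMPLETE otherwise.
# `looks_empty` is a stand-in for whatever emptiness check fits DraftFile.
def set_state_from_content(resource, files):
    if not all(looks_empty(f) for f in files.values()):
        resource.state = DatasetState.COMPLETE

dataset_resource.post_load_files = set_state_from_content
# UpdateDatasetTask and CreateDatasetTask then invoke
# dataset_resource.run_post_load_files(files) right after loading,
# as the hunks above show.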
ingestify/infra/fetch/http.py
CHANGED
@@ -58,9 +58,7 @@ def retrieve_http(
         )
         # else:
         #     print(f"{current_file.modified_at=} {last_modified=}")
-
-        #     format_datetime(current_file.modified_at, usegmt=True),
-        # )
+        headers["if-modified-since"] = format_datetime(current_file.modified_at, usegmt=True)
         headers["if-none-match"] = current_file.tag
 
         http_kwargs = {}
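The revived header implements standard HTTP revalidation: paired with `if-none-match`, the server can answer 304 Not Modified and skip the body. A standalone sketch of the same pattern using `requests` (ingestify's actual retrieve_http wiring differs; `url` and `current_file` are stand-ins from the diff context):

from email.utils import format_datetime
import requests

headers = {}
if current_file is not None:
    # Ask the server to skip the body if nothing changed since our copy.
    headers["if-modified-since"] = format_datetime(
        current_file.modified_at, usegmt=True
    )
    headers["if-none-match"] = current_file.tag

response = requests.get(url, headers=headers)
if response.status_code == 304:
    pass  # 304 Not Modified: keep the stored file as-is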
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -40,15 +40,7 @@ from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
 from ingestify.utils import get_concurrency
 
-from .tables import (
-    metadata,
-    dataset_table,
-    file_table,
-    revision_table,
-    ingestion_job_summary_table,
-    task_summary_table,
-    store_version_table,
-)
+from .tables import get_tables
 
 logger = logging.getLogger(__name__)
 
@@ -112,20 +104,33 @@ class SqlAlchemySessionProvider:
         session_factory = sessionmaker(bind=self.engine)
         self.session = scoped_session(session_factory)
 
+        # Create tables with the specified prefix
+        tables = get_tables(self.table_prefix)
+        self.metadata = tables["metadata"]
+        self.dataset_table = tables["dataset_table"]
+        self.revision_table = tables["revision_table"]
+        self.file_table = tables["file_table"]
+        self.ingestion_job_summary_table = tables["ingestion_job_summary_table"]
+        self.task_summary_table = tables["task_summary_table"]
+        self.store_version_table = tables["store_version_table"]
+
     def __getstate__(self):
-        return {"url": self.url}
+        return {"url": self.url, "table_prefix": self.table_prefix}
 
     def __setstate__(self, state):
         self.url = state["url"]
+        self.table_prefix = state.get("table_prefix", "")
         self._init_engine()
 
-    def __init__(self, url: str):
+    def __init__(self, url: str, table_prefix: str = ""):
         url = self.fix_url(url)
 
         self.url = url
+        self.table_prefix = table_prefix
         self._init_engine()
 
-
+        # Create all tables in the database
+        self.metadata.create_all(self.engine)
 
     def __del__(self):
         self.close()
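Taken together, `get_tables(self.table_prefix)` plus the pickle hooks mean each provider owns its own prefixed SQLAlchemy Table objects and can be shipped to worker processes. A usage sketch, assuming the class stays importable from this module (URL and prefix values are illustrative):

import pickle

from ingestify.infra.store.dataset.sqlalchemy.repository import SqlAlchemySessionProvider

# Two stores sharing one database, isolated by table-name prefix.
provider_a = SqlAlchemySessionProvider("sqlite:///store.db", table_prefix="tenant_a_")
provider_b = SqlAlchemySessionProvider("sqlite:///store.db", table_prefix="tenant_b_")

# __getstate__/__setstate__ carry only url and table_prefix, so a copy
# rebuilds its engine and prefixed tables on unpickle via _init_engine().
restored = pickle.loads(pickle.dumps(provider_a))
assert restored.table_prefix == "tenant_a_"

The `state.get("table_prefix", "")` fallback in `__setstate__` presumably keeps pickles produced by 0.8.0 loadable.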
@@ -154,6 +159,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def dialect(self) -> Dialect:
         return self.session_provider.dialect
 
+    @property
+    def dataset_table(self):
+        return self.session_provider.dataset_table
+
+    @property
+    def revision_table(self):
+        return self.session_provider.revision_table
+
+    @property
+    def file_table(self):
+        return self.session_provider.file_table
+
+    @property
+    def ingestion_job_summary_table(self):
+        return self.session_provider.ingestion_job_summary_table
+
+    @property
+    def task_summary_table(self):
+        return self.session_provider.task_summary_table
+
+    @property
+    def store_version_table(self):
+        return self.session_provider.store_version_table
+
     def _upsert(
         self,
         connection: Connection,
@@ -251,13 +280,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             )
 
             query = query.select_from(
-                dataset_table.join(
+                self.dataset_table.join(
                     dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                 )
             )
         else:
-            query = query.filter(dataset_table.c.dataset_id == dataset_id)
+            query = query.filter(self.dataset_table.c.dataset_id == dataset_id)
 
         dialect = self.dialect.name
 
@@ -287,7 +316,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             join_conditions = []
             for k in keys:
                 if dialect == "postgresql":
-                    column = dataset_table.c.identifier[k]
+                    column = self.dataset_table.c.identifier[k]
 
                     # Take the value from the first selector to determine the type.
                     # TODO: check all selectors to determine the type
@@ -297,24 +326,26 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     else:
                         column = column.as_string()
                 else:
-                    column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
+                    column = func.json_extract(
+                        self.dataset_table.c.identifier, f"$.{k}"
+                    )
 
                 join_conditions.append(attribute_cte.c[k] == column)
 
             query = query.select_from(
-                dataset_table.join(attribute_cte, and_(*join_conditions))
+                self.dataset_table.join(attribute_cte, and_(*join_conditions))
             )
 
         if where:
             query = query.filter(text(where))
 
-        query = query.filter(dataset_table.c.bucket == bucket)
+        query = query.filter(self.dataset_table.c.bucket == bucket)
         if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+            query = query.filter(self.dataset_table.c.dataset_type == dataset_type)
         if provider:
-            query = query.filter(dataset_table.c.provider == provider)
+            query = query.filter(self.dataset_table.c.provider == provider)
         if dataset_state:
-            query = query.filter(dataset_table.c.state.in_(dataset_state))
+            query = query.filter(self.dataset_table.c.state.in_(dataset_state))
 
         return query
 
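The dialect branch above resolves a selector key to a JSON lookup on the identifier column in two ways. A self-contained SQLAlchemy sketch of the two forms (table and key names are illustrative, not ingestify's real schema):

from sqlalchemy import JSON, Column, MetaData, String, Table, func

metadata = MetaData()
datasets = Table(
    "dataset",
    metadata,
    Column("dataset_id", String, primary_key=True),
    Column("identifier", JSON),
)

k = "season_id"
# PostgreSQL: native JSON subscripting, cast to text for comparison.
pg_column = datasets.c.identifier[k].as_string()
# Other dialects (e.g. SQLite): generic json_extract with a JSONPath.
generic_column = func.json_extract(datasets.c.identifier, f"$.{k}")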
@@ -328,23 +359,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
 
         dataset_rows = list(
-            self.session.query(dataset_table).select_from(
-                dataset_table.join(
+            self.session.query(self.dataset_table).select_from(
+                self.dataset_table.join(
                     dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                 )
             )
         )
         revisions_per_dataset = {}
         rows = (
-            self.session.query(revision_table)
+            self.session.query(self.revision_table)
             .select_from(
-                revision_table.join(
+                self.revision_table.join(
                     dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
                 )
             )
-            .order_by(revision_table.c.dataset_id)
+            .order_by(self.revision_table.c.dataset_id)
         )
 
         for dataset_id, revisions in itertools.groupby(
@@ -354,14 +385,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         files_per_revision = {}
         rows = (
-            self.session.query(file_table)
+            self.session.query(self.file_table)
             .select_from(
-                file_table.join(
+                self.file_table.join(
                     dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.file_table.c.dataset_id,
                 )
             )
-            .order_by(file_table.c.dataset_id, file_table.c.revision_id)
+            .order_by(self.file_table.c.dataset_id, self.file_table.c.revision_id)
         )
 
         for (dataset_id, revision_id), files in itertools.groupby(
@@ -425,8 +456,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not metadata_only:
             # Apply sorting by created_at in ascending order
             dataset_query = apply_query_filter(
-                self.session.query(dataset_table.c.dataset_id)
-            ).order_by(dataset_table.c.created_at.asc())
+                self.session.query(self.dataset_table.c.dataset_id)
+            ).order_by(self.dataset_table.c.created_at.asc())
 
             # Apply pagination if both page and page_size are provided
             if page is not None and page_size is not None:
@@ -448,9 +479,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         metadata_result_query = (
             apply_query_filter(
-                self.session.query(dataset_table.c.last_modified_at)
+                self.session.query(self.dataset_table.c.last_modified_at)
             )
-            .order_by(dataset_table.c.last_modified_at.desc())
+            .order_by(self.dataset_table.c.last_modified_at.desc())
             .limit(1)
         )
 
@@ -508,11 +539,16 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         with self.connect() as connection:
             try:
-                self._upsert(connection, dataset_table, datasets_entities)
+                self._upsert(connection, self.dataset_table, datasets_entities)
                 self._upsert(
-                    connection, revision_table, revision_entities, immutable_rows=True
+                    connection,
+                    self.revision_table,
+                    revision_entities,
+                    immutable_rows=True,
+                )
+                self._upsert(
+                    connection, self.file_table, file_entities, immutable_rows=True
                 )
-                self._upsert(connection, file_table, file_entities, immutable_rows=True)
             except Exception:
                 connection.rollback()
                 raise
@@ -569,11 +605,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             try:
                 self._upsert(
                     connection,
-                    ingestion_job_summary_table,
+                    self.ingestion_job_summary_table,
                     ingestion_job_summary_entities,
                 )
                 if task_summary_entities:
-                    self._upsert(connection, task_summary_table, task_summary_entities)
+                    self._upsert(
+                        connection, self.task_summary_table, task_summary_entities
+                    )
             except Exception:
                 connection.rollback()
                 raise
@@ -584,13 +622,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         ingestion_job_summary_ids = [
             row.ingestion_job_summary_id
             for row in self.session.query(
-                ingestion_job_summary_table.c.ingestion_job_summary_id
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id
             )
         ]
 
         ingestion_job_summary_rows = list(
-            self.session.query(ingestion_job_summary_table).filter(
-                ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
+            self.session.query(self.ingestion_job_summary_table).filter(
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
                     ingestion_job_summary_ids
                 )
             )
@@ -598,13 +636,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         task_summary_entities_per_job_summary = {}
         rows = (
-            self.session.query(task_summary_table)
+            self.session.query(self.task_summary_table)
             .filter(
-                task_summary_table.c.ingestion_job_summary_id.in_(
+                self.task_summary_table.c.ingestion_job_summary_id.in_(
                     ingestion_job_summary_ids
                 )
             )
-            .order_by(task_summary_table.c.ingestion_job_summary_id)
+            .order_by(self.task_summary_table.c.ingestion_job_summary_id)
         )
 
         for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
@@ -636,7 +674,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def get_store_version(self) -> Optional[str]:
         """Get the current Ingestify version stored for this store."""
         with self.session:
-            row = self.session.query(store_version_table.c.ingestify_version).first()
+            row = self.session.query(
+                self.store_version_table.c.ingestify_version
+            ).first()
             return row.ingestify_version if row else None
 
     def set_store_version(self, version: str):
@@ -653,7 +693,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         with self.connect() as connection:
             try:
-                self._upsert(connection, store_version_table, [entity])
+                self._upsert(connection, self.store_version_table, [entity])
                 connection.commit()
             except Exception:
                 connection.rollback()
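A sketch of how the version bookkeeping above can be used at startup; `repository` stands in for a constructed SqlAlchemyDatasetRepository, and the policy shown is illustrative rather than ingestify's own:

current = repository.get_store_version()
if current is None:
    # Fresh store: stamp it with the running release.
    repository.set_store_version("0.9.1")
elif current != "0.9.1":
    # Store written by another release: a migration or compatibility
    # check could run here before re-stamping.
    repository.set_store_version("0.9.1")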