ingestify 0.8.0__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {ingestify-0.8.0 → ingestify-0.9.1}/PKG-INFO +19 -2
  2. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/__init__.py +1 -1
  3. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/dataset_state.py +1 -0
  4. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/ingestion/ingestion_job.py +5 -1
  5. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/resources/dataset_resource.py +13 -1
  6. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/fetch/http.py +1 -3
  7. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
  8. ingestify-0.9.1/ingestify/infra/store/dataset/sqlalchemy/tables.py +398 -0
  9. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/main.py +34 -5
  10. ingestify-0.9.1/ingestify/tests/__init__.py +0 -0
  11. ingestify-0.9.1/ingestify/tests/conftest.py +17 -0
  12. ingestify-0.9.1/ingestify/tests/test_auto_ingest.py +418 -0
  13. ingestify-0.9.1/ingestify/tests/test_engine.py +501 -0
  14. ingestify-0.9.1/ingestify/tests/test_events.py +201 -0
  15. ingestify-0.9.1/ingestify/tests/test_file_cache.py +98 -0
  16. ingestify-0.9.1/ingestify/tests/test_pagination.py +162 -0
  17. ingestify-0.9.1/ingestify/tests/test_store_version.py +73 -0
  18. ingestify-0.9.1/ingestify/tests/test_table_prefix.py +78 -0
  19. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/PKG-INFO +19 -2
  20. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/SOURCES.txt +10 -1
  21. ingestify-0.8.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -381
  22. {ingestify-0.8.0 → ingestify-0.9.1}/README.md +0 -0
  23. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/application/__init__.py +0 -0
  24. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/application/dataset_store.py +0 -0
  25. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/application/ingestion_engine.py +0 -0
  26. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/application/loader.py +0 -0
  27. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/application/secrets_manager.py +0 -0
  28. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/cmdline.py +0 -0
  29. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/__init__.py +0 -0
  30. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/__init__.py +0 -0
  31. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/base.py +0 -0
  32. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  33. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/__init__.py +0 -0
  34. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/collection.py +0 -0
  35. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  36. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/dataset.py +0 -0
  37. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  38. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/events.py +0 -0
  39. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/file.py +0 -0
  40. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/file_collection.py +0 -0
  41. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/file_repository.py +0 -0
  42. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/identifier.py +0 -0
  43. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/revision.py +0 -0
  44. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/dataset/selector.py +0 -0
  45. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/__init__.py +0 -0
  46. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/_old_event.py +0 -0
  47. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/dispatcher.py +0 -0
  48. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/domain_event.py +0 -0
  49. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/event_bus.py +0 -0
  50. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/publisher.py +0 -0
  51. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/event/subscriber.py +0 -0
  52. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/fetch_policy.py +0 -0
  53. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/ingestion/__init__.py +0 -0
  54. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  55. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  56. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/resources/__init__.py +0 -0
  57. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/sink.py +0 -0
  58. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/source.py +0 -0
  59. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/task/__init__.py +0 -0
  60. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/task/set.py +0 -0
  61. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/task/task.py +0 -0
  62. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/task/task_summary.py +0 -0
  63. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/models/timing.py +0 -0
  64. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/services/__init__.py +0 -0
  65. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  66. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/services/transformers/__init__.py +0 -0
  67. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  68. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/exceptions.py +0 -0
  69. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/__init__.py +0 -0
  70. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/fetch/__init__.py +0 -0
  71. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/serialization/__init__.py +0 -0
  72. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/sink/__init__.py +0 -0
  73. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/sink/postgresql.py +0 -0
  74. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/source/__init__.py +0 -0
  75. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  76. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/source/statsbomb/base.py +0 -0
  77. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/source/statsbomb/match.py +0 -0
  78. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/source/statsbomb_github.py +0 -0
  79. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/__init__.py +0 -0
  80. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/dataset/__init__.py +0 -0
  81. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  82. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/file/__init__.py +0 -0
  83. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  84. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/file/local_file_repository.py +0 -0
  85. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  86. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/server.py +0 -0
  87. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/source_base.py +0 -0
  88. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify/utils.py +0 -0
  89. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/dependency_links.txt +0 -0
  90. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/entry_points.txt +0 -0
  91. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/requires.txt +0 -0
  92. {ingestify-0.8.0 → ingestify-0.9.1}/ingestify.egg-info/top_level.txt +0 -0
  93. {ingestify-0.8.0 → ingestify-0.9.1}/setup.cfg +0 -0
  94. {ingestify-0.8.0 → ingestify-0.9.1}/setup.py +0 -0
@@ -1,12 +1,29 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: ingestify
3
- Version: 0.8.0
3
+ Version: 0.9.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
7
7
  License: AGPL
8
8
  Description-Content-Type: text/markdown
9
+ Requires-Dist: requests<3,>=2.0.0
10
+ Requires-Dist: SQLAlchemy<3,>=2
11
+ Requires-Dist: click>=8
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: pyaml_env
14
+ Requires-Dist: boto3
15
+ Requires-Dist: pydantic>=2.0.0
9
16
  Provides-Extra: test
17
+ Requires-Dist: pytest<7,>=6.2.5; extra == "test"
18
+ Requires-Dist: pytz; extra == "test"
19
+ Dynamic: author
20
+ Dynamic: author-email
21
+ Dynamic: description
22
+ Dynamic: description-content-type
23
+ Dynamic: license
24
+ Dynamic: provides-extra
25
+ Dynamic: requires-dist
26
+ Dynamic: summary
10
27
 
11
28
  # Ingestify
12
29
 
@@ -9,4 +9,4 @@ if not __INGESTIFY_SETUP__:
9
9
  from .source_base import Source, DatasetResource
10
10
  from .main import debug_source
11
11
 
12
- __version__ = "0.8.0"
12
+ __version__ = "0.9.1"
@@ -10,6 +10,7 @@ class DatasetState(str, Enum):
10
10
  SCHEDULED = "SCHEDULED"
11
11
  PARTIAL = "PARTIAL"
12
12
  COMPLETE = "COMPLETE"
13
+ MISSING = "MISSING"
13
14
 
14
15
  @property
15
16
  def is_complete(self):
@@ -129,7 +129,6 @@ class UpdateDatasetTask(Task):
129
129
  with TaskSummary.update(
130
130
  self.task_id, dataset_identifier=dataset_identifier
131
131
  ) as task_summary:
132
-
133
132
  files = {
134
133
  file_id: task_summary.record_load_file(
135
134
  lambda: load_file(file_resource, dataset=self.dataset),
@@ -138,6 +137,8 @@ class UpdateDatasetTask(Task):
138
137
  for file_id, file_resource in self.dataset_resource.files.items()
139
138
  }
140
139
 
140
+ self.dataset_resource.run_post_load_files(files)
141
+
141
142
  try:
142
143
  revision = self.store.update_dataset(
143
144
  dataset=self.dataset,
@@ -181,6 +182,9 @@ class CreateDatasetTask(Task):
181
182
  )
182
183
  for file_id, file_resource in self.dataset_resource.files.items()
183
184
  }
185
+
186
+ self.dataset_resource.run_post_load_files(files)
187
+
184
188
  try:
185
189
  revision = self.store.create_dataset(
186
190
  dataset_type=self.dataset_resource.dataset_type,
@@ -1,5 +1,5 @@
1
1
  from datetime import datetime
2
- from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING # noqa
2
+ from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING, Dict # noqa
3
3
  from pydantic import Field
4
4
 
5
5
  from ingestify.domain.models.base import BaseModel
@@ -50,6 +50,18 @@ class DatasetResource(BaseModel):
50
50
  metadata: dict = Field(default_factory=dict)
51
51
  state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
52
52
  files: dict[str, FileResource] = Field(default_factory=dict)
53
+ post_load_files: Optional[
54
+ Callable[["DatasetResource", Dict[str, DraftFile]], None]
55
+ ] = None
56
+
57
+ def run_post_load_files(self, files: Dict[str, DraftFile]):
58
+ """Hook to modify dataset attributes based on loaded file content.
59
+
60
+ Useful for setting state based on file content, e.g., keep state=SCHEDULED
61
+ when files contain '{}', change to COMPLETE when they contain actual data.
62
+ """
63
+ if self.post_load_files:
64
+ self.post_load_files(self, files)
53
65
 
54
66
  def add_file(
55
67
  self,
@@ -58,9 +58,7 @@ def retrieve_http(
58
58
  )
59
59
  # else:
60
60
  # print(f"{current_file.modified_at=} {last_modified=}")
61
- # headers["if-modified-since"] = (
62
- # format_datetime(current_file.modified_at, usegmt=True),
63
- # )
61
+ headers["if-modified-since"] = format_datetime(current_file.modified_at, usegmt=True)
64
62
  headers["if-none-match"] = current_file.tag
65
63
 
66
64
  http_kwargs = {}
@@ -40,15 +40,7 @@ from ingestify.domain.models.task.task_summary import TaskSummary
40
40
  from ingestify.exceptions import IngestifyError
41
41
  from ingestify.utils import get_concurrency
42
42
 
43
- from .tables import (
44
- metadata,
45
- dataset_table,
46
- file_table,
47
- revision_table,
48
- ingestion_job_summary_table,
49
- task_summary_table,
50
- store_version_table,
51
- )
43
+ from .tables import get_tables
52
44
 
53
45
  logger = logging.getLogger(__name__)
54
46
 
@@ -112,20 +104,33 @@ class SqlAlchemySessionProvider:
112
104
  session_factory = sessionmaker(bind=self.engine)
113
105
  self.session = scoped_session(session_factory)
114
106
 
107
+ # Create tables with the specified prefix
108
+ tables = get_tables(self.table_prefix)
109
+ self.metadata = tables["metadata"]
110
+ self.dataset_table = tables["dataset_table"]
111
+ self.revision_table = tables["revision_table"]
112
+ self.file_table = tables["file_table"]
113
+ self.ingestion_job_summary_table = tables["ingestion_job_summary_table"]
114
+ self.task_summary_table = tables["task_summary_table"]
115
+ self.store_version_table = tables["store_version_table"]
116
+
115
117
  def __getstate__(self):
116
- return {"url": self.url}
118
+ return {"url": self.url, "table_prefix": self.table_prefix}
117
119
 
118
120
  def __setstate__(self, state):
119
121
  self.url = state["url"]
122
+ self.table_prefix = state.get("table_prefix", "")
120
123
  self._init_engine()
121
124
 
122
- def __init__(self, url: str):
125
+ def __init__(self, url: str, table_prefix: str = ""):
123
126
  url = self.fix_url(url)
124
127
 
125
128
  self.url = url
129
+ self.table_prefix = table_prefix
126
130
  self._init_engine()
127
131
 
128
- metadata.create_all(self.engine)
132
+ # Create all tables in the database
133
+ self.metadata.create_all(self.engine)
129
134
 
130
135
  def __del__(self):
131
136
  self.close()
@@ -154,6 +159,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
154
159
  def dialect(self) -> Dialect:
155
160
  return self.session_provider.dialect
156
161
 
162
+ @property
163
+ def dataset_table(self):
164
+ return self.session_provider.dataset_table
165
+
166
+ @property
167
+ def revision_table(self):
168
+ return self.session_provider.revision_table
169
+
170
+ @property
171
+ def file_table(self):
172
+ return self.session_provider.file_table
173
+
174
+ @property
175
+ def ingestion_job_summary_table(self):
176
+ return self.session_provider.ingestion_job_summary_table
177
+
178
+ @property
179
+ def task_summary_table(self):
180
+ return self.session_provider.task_summary_table
181
+
182
+ @property
183
+ def store_version_table(self):
184
+ return self.session_provider.store_version_table
185
+
157
186
  def _upsert(
158
187
  self,
159
188
  connection: Connection,
@@ -251,13 +280,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
251
280
  )
252
281
 
253
282
  query = query.select_from(
254
- dataset_table.join(
283
+ self.dataset_table.join(
255
284
  dataset_ids_cte,
256
- dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
285
+ dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
257
286
  )
258
287
  )
259
288
  else:
260
- query = query.filter(dataset_table.c.dataset_id == dataset_id)
289
+ query = query.filter(self.dataset_table.c.dataset_id == dataset_id)
261
290
 
262
291
  dialect = self.dialect.name
263
292
 
@@ -287,7 +316,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
287
316
  join_conditions = []
288
317
  for k in keys:
289
318
  if dialect == "postgresql":
290
- column = dataset_table.c.identifier[k]
319
+ column = self.dataset_table.c.identifier[k]
291
320
 
292
321
  # Take the value from the first selector to determine the type.
293
322
  # TODO: check all selectors to determine the type
@@ -297,24 +326,26 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
297
326
  else:
298
327
  column = column.as_string()
299
328
  else:
300
- column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
329
+ column = func.json_extract(
330
+ self.dataset_table.c.identifier, f"$.{k}"
331
+ )
301
332
 
302
333
  join_conditions.append(attribute_cte.c[k] == column)
303
334
 
304
335
  query = query.select_from(
305
- dataset_table.join(attribute_cte, and_(*join_conditions))
336
+ self.dataset_table.join(attribute_cte, and_(*join_conditions))
306
337
  )
307
338
 
308
339
  if where:
309
340
  query = query.filter(text(where))
310
341
 
311
- query = query.filter(dataset_table.c.bucket == bucket)
342
+ query = query.filter(self.dataset_table.c.bucket == bucket)
312
343
  if dataset_type:
313
- query = query.filter(dataset_table.c.dataset_type == dataset_type)
344
+ query = query.filter(self.dataset_table.c.dataset_type == dataset_type)
314
345
  if provider:
315
- query = query.filter(dataset_table.c.provider == provider)
346
+ query = query.filter(self.dataset_table.c.provider == provider)
316
347
  if dataset_state:
317
- query = query.filter(dataset_table.c.state.in_(dataset_state))
348
+ query = query.filter(self.dataset_table.c.state.in_(dataset_state))
318
349
 
319
350
  return query
320
351
 
@@ -328,23 +359,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
328
359
  )
329
360
 
330
361
  dataset_rows = list(
331
- self.session.query(dataset_table).select_from(
332
- dataset_table.join(
362
+ self.session.query(self.dataset_table).select_from(
363
+ self.dataset_table.join(
333
364
  dataset_ids_cte,
334
- dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
365
+ dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
335
366
  )
336
367
  )
337
368
  )
338
369
  revisions_per_dataset = {}
339
370
  rows = (
340
- self.session.query(revision_table)
371
+ self.session.query(self.revision_table)
341
372
  .select_from(
342
- revision_table.join(
373
+ self.revision_table.join(
343
374
  dataset_ids_cte,
344
- dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
375
+ dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
345
376
  )
346
377
  )
347
- .order_by(revision_table.c.dataset_id)
378
+ .order_by(self.revision_table.c.dataset_id)
348
379
  )
349
380
 
350
381
  for dataset_id, revisions in itertools.groupby(
@@ -354,14 +385,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
354
385
 
355
386
  files_per_revision = {}
356
387
  rows = (
357
- self.session.query(file_table)
388
+ self.session.query(self.file_table)
358
389
  .select_from(
359
- file_table.join(
390
+ self.file_table.join(
360
391
  dataset_ids_cte,
361
- dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
392
+ dataset_ids_cte.c.dataset_id == self.file_table.c.dataset_id,
362
393
  )
363
394
  )
364
- .order_by(file_table.c.dataset_id, file_table.c.revision_id)
395
+ .order_by(self.file_table.c.dataset_id, self.file_table.c.revision_id)
365
396
  )
366
397
 
367
398
  for (dataset_id, revision_id), files in itertools.groupby(
@@ -425,8 +456,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
425
456
  if not metadata_only:
426
457
  # Apply sorting by created_at in ascending order
427
458
  dataset_query = apply_query_filter(
428
- self.session.query(dataset_table.c.dataset_id)
429
- ).order_by(dataset_table.c.created_at.asc())
459
+ self.session.query(self.dataset_table.c.dataset_id)
460
+ ).order_by(self.dataset_table.c.created_at.asc())
430
461
 
431
462
  # Apply pagination if both page and page_size are provided
432
463
  if page is not None and page_size is not None:
@@ -448,9 +479,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
448
479
 
449
480
  metadata_result_query = (
450
481
  apply_query_filter(
451
- self.session.query(dataset_table.c.last_modified_at)
482
+ self.session.query(self.dataset_table.c.last_modified_at)
452
483
  )
453
- .order_by(dataset_table.c.last_modified_at.desc())
484
+ .order_by(self.dataset_table.c.last_modified_at.desc())
454
485
  .limit(1)
455
486
  )
456
487
 
@@ -508,11 +539,16 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
508
539
 
509
540
  with self.connect() as connection:
510
541
  try:
511
- self._upsert(connection, dataset_table, datasets_entities)
542
+ self._upsert(connection, self.dataset_table, datasets_entities)
512
543
  self._upsert(
513
- connection, revision_table, revision_entities, immutable_rows=True
544
+ connection,
545
+ self.revision_table,
546
+ revision_entities,
547
+ immutable_rows=True,
548
+ )
549
+ self._upsert(
550
+ connection, self.file_table, file_entities, immutable_rows=True
514
551
  )
515
- self._upsert(connection, file_table, file_entities, immutable_rows=True)
516
552
  except Exception:
517
553
  connection.rollback()
518
554
  raise
@@ -569,11 +605,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
569
605
  try:
570
606
  self._upsert(
571
607
  connection,
572
- ingestion_job_summary_table,
608
+ self.ingestion_job_summary_table,
573
609
  ingestion_job_summary_entities,
574
610
  )
575
611
  if task_summary_entities:
576
- self._upsert(connection, task_summary_table, task_summary_entities)
612
+ self._upsert(
613
+ connection, self.task_summary_table, task_summary_entities
614
+ )
577
615
  except Exception:
578
616
  connection.rollback()
579
617
  raise
@@ -584,13 +622,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
584
622
  ingestion_job_summary_ids = [
585
623
  row.ingestion_job_summary_id
586
624
  for row in self.session.query(
587
- ingestion_job_summary_table.c.ingestion_job_summary_id
625
+ self.ingestion_job_summary_table.c.ingestion_job_summary_id
588
626
  )
589
627
  ]
590
628
 
591
629
  ingestion_job_summary_rows = list(
592
- self.session.query(ingestion_job_summary_table).filter(
593
- ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
630
+ self.session.query(self.ingestion_job_summary_table).filter(
631
+ self.ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
594
632
  ingestion_job_summary_ids
595
633
  )
596
634
  )
@@ -598,13 +636,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
598
636
 
599
637
  task_summary_entities_per_job_summary = {}
600
638
  rows = (
601
- self.session.query(task_summary_table)
639
+ self.session.query(self.task_summary_table)
602
640
  .filter(
603
- task_summary_table.c.ingestion_job_summary_id.in_(
641
+ self.task_summary_table.c.ingestion_job_summary_id.in_(
604
642
  ingestion_job_summary_ids
605
643
  )
606
644
  )
607
- .order_by(task_summary_table.c.ingestion_job_summary_id)
645
+ .order_by(self.task_summary_table.c.ingestion_job_summary_id)
608
646
  )
609
647
 
610
648
  for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
@@ -636,7 +674,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
636
674
  def get_store_version(self) -> Optional[str]:
637
675
  """Get the current Ingestify version stored for this store."""
638
676
  with self.session:
639
- row = self.session.query(store_version_table.c.ingestify_version).first()
677
+ row = self.session.query(
678
+ self.store_version_table.c.ingestify_version
679
+ ).first()
640
680
  return row.ingestify_version if row else None
641
681
 
642
682
  def set_store_version(self, version: str):
@@ -653,7 +693,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
653
693
 
654
694
  with self.connect() as connection:
655
695
  try:
656
- self._upsert(connection, store_version_table, [entity])
696
+ self._upsert(connection, self.store_version_table, [entity])
657
697
  connection.commit()
658
698
  except Exception:
659
699
  connection.rollback()