ingestify 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {ingestify-0.7.0/ingestify.egg-info → ingestify-0.9.0}/PKG-INFO +67 -4
  2. ingestify-0.7.0/PKG-INFO → ingestify-0.9.0/README.md +48 -12
  3. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/__init__.py +2 -1
  4. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/ingestion_engine.py +3 -0
  5. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/loader.py +12 -2
  6. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_state.py +1 -0
  7. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file.py +6 -0
  8. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job.py +5 -1
  9. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/dataset_resource.py +13 -1
  10. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/http.py +3 -3
  11. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
  12. ingestify-0.9.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +398 -0
  13. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/main.py +189 -5
  14. ingestify-0.9.0/ingestify/tests/__init__.py +0 -0
  15. ingestify-0.9.0/ingestify/tests/conftest.py +17 -0
  16. ingestify-0.9.0/ingestify/tests/test_auto_ingest.py +418 -0
  17. ingestify-0.9.0/ingestify/tests/test_engine.py +501 -0
  18. ingestify-0.9.0/ingestify/tests/test_events.py +201 -0
  19. ingestify-0.9.0/ingestify/tests/test_file_cache.py +98 -0
  20. ingestify-0.9.0/ingestify/tests/test_pagination.py +162 -0
  21. ingestify-0.9.0/ingestify/tests/test_store_version.py +73 -0
  22. ingestify-0.9.0/ingestify/tests/test_table_prefix.py +78 -0
  23. ingestify-0.7.0/README.md → ingestify-0.9.0/ingestify.egg-info/PKG-INFO +75 -2
  24. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/SOURCES.txt +10 -1
  25. ingestify-0.7.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -381
  26. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/__init__.py +0 -0
  27. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/dataset_store.py +0 -0
  28. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/secrets_manager.py +0 -0
  29. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/cmdline.py +0 -0
  30. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/__init__.py +0 -0
  31. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/__init__.py +0 -0
  32. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/base.py +0 -0
  33. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  34. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  35. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/collection.py +0 -0
  36. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  37. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  38. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  39. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/events.py +0 -0
  40. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  41. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  42. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  43. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/revision.py +0 -0
  44. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/selector.py +0 -0
  45. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/__init__.py +0 -0
  46. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/_old_event.py +0 -0
  47. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  48. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/domain_event.py +0 -0
  49. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/event_bus.py +0 -0
  50. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/publisher.py +0 -0
  51. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/subscriber.py +0 -0
  52. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/fetch_policy.py +0 -0
  53. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  54. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  55. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  56. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/__init__.py +0 -0
  57. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/sink.py +0 -0
  58. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/source.py +0 -0
  59. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/__init__.py +0 -0
  60. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/set.py +0 -0
  61. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/task.py +0 -0
  62. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/task_summary.py +0 -0
  63. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/timing.py +0 -0
  64. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/__init__.py +0 -0
  65. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  66. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  67. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  68. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/exceptions.py +0 -0
  69. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/__init__.py +0 -0
  70. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/__init__.py +0 -0
  71. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/serialization/__init__.py +0 -0
  72. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/sink/__init__.py +0 -0
  73. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/sink/postgresql.py +0 -0
  74. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/__init__.py +0 -0
  75. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  76. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/base.py +0 -0
  77. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/match.py +0 -0
  78. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  79. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/__init__.py +0 -0
  80. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  81. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  82. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/__init__.py +0 -0
  83. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  84. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  85. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  86. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/server.py +0 -0
  87. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/source_base.py +0 -0
  88. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/utils.py +0 -0
  89. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/dependency_links.txt +0 -0
  90. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/entry_points.txt +0 -0
  91. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/requires.txt +0 -0
  92. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/top_level.txt +0 -0
  93. {ingestify-0.7.0 → ingestify-0.9.0}/setup.cfg +0 -0
  94. {ingestify-0.7.0 → ingestify-0.9.0}/setup.py +0 -0
{ingestify-0.7.0/ingestify.egg-info → ingestify-0.9.0}/PKG-INFO
@@ -1,12 +1,29 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: ingestify
-Version: 0.7.0
+Version: 0.9.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
 License: AGPL
 Description-Content-Type: text/markdown
+Requires-Dist: requests<3,>=2.0.0
+Requires-Dist: SQLAlchemy<3,>=2
+Requires-Dist: click>=8
+Requires-Dist: python-dotenv
+Requires-Dist: pyaml_env
+Requires-Dist: boto3
+Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
+Requires-Dist: pytest<7,>=6.2.5; extra == "test"
+Requires-Dist: pytz; extra == "test"
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 # Ingestify
 
@@ -68,6 +85,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml
@@ -166,8 +220,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +241,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43,  # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
ingestify-0.7.0/PKG-INFO → ingestify-0.9.0/README.md
@@ -1,13 +1,3 @@
-Metadata-Version: 2.1
-Name: ingestify
-Version: 0.7.0
-Summary: Data Ingestion Framework
-Author: Koen Vossen
-Author-email: info@koenvossen.nl
-License: AGPL
-Description-Content-Type: text/markdown
-Provides-Extra: test
-
 # Ingestify
 
 _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
@@ -68,6 +58,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml
@@ -166,8 +193,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +214,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43,  # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/__init__.py
@@ -7,5 +7,6 @@ except NameError:
 if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource
+    from .main import debug_source
 
-__version__ = "0.7.0"
+__version__ = "0.9.0"
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/ingestion_engine.py
@@ -110,6 +110,9 @@ class IngestionEngine:
         else:
             do_load()
 
+    # Alias for load() - more intuitive name for running ingestion
+    run = load
+
     def list_datasets(self, as_count: bool = False):
        """Consider moving this to DataStore"""
        datasets = sorted(
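
Because `run` is a plain class attribute bound to `load`, both names resolve to the same function object. A quick sketch, assuming `get_engine` returns the `IngestionEngine` above (the URLs are placeholders):

```python
from ingestify.main import get_engine

# Placeholder local store; any configured engine behaves the same.
engine = get_engine(
    metadata_url="sqlite:///catalog.db",
    file_url="file://files/",
)

# `run` and `load` are the same function object on the class,
# so engine.run(...) is interchangeable with engine.load(...).
assert type(engine).run is type(engine).load
```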
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/loader.py
@@ -307,7 +307,17 @@ class Loader:
            auto_ingest_config=auto_ingest_config,
            **selector_filters,
        )
-        if selector_filters and not selectors:
-            logger.warning(f"No data found matching {selector_filters}")
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
         else:
             self.run(selectors, dry_run=dry_run)
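
The warning now reports every filter that was actually supplied, with unset (falsy) values dropped by the comprehension. A standalone illustration of that filtering step, with made-up filter values:

```python
# Illustrative inputs only.
provider, source, dataset_type = "statsbomb", None, "match"
selector_filters = {"competition_id": 43}

filters_applied = {
    k: v
    for k, v in {
        "provider": provider,
        "source": source,
        "dataset_type": dataset_type,
        **selector_filters,
    }.items()
    if v  # drop filters that were not supplied
}
print(filters_applied)
# {'provider': 'statsbomb', 'dataset_type': 'match', 'competition_id': 43}
```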
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_state.py
@@ -10,6 +10,7 @@ class DatasetState(str, Enum):
     SCHEDULED = "SCHEDULED"
     PARTIAL = "PARTIAL"
     COMPLETE = "COMPLETE"
+    MISSING = "MISSING"
 
     @property
     def is_complete(self):
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file.py
@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")
 
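
The new branch accepts any file-like object exposing `read()`, normalizing both text and binary reads to a `BytesIO`. A self-contained sketch of that normalization rule (the helper name `to_bytes_stream` is hypothetical, independent of `DraftFile` itself):

```python
from io import BytesIO, StringIO

def to_bytes_stream(file_) -> BytesIO:
    # Mirrors the dispatch added above: anything with read() is accepted,
    # and str payloads are encoded to UTF-8 bytes.
    data = file_.read()
    if isinstance(data, bytes):
        return BytesIO(data)
    return BytesIO(data.encode("utf-8"))

assert to_bytes_stream(StringIO("{}")).read() == b"{}"
assert to_bytes_stream(BytesIO(b"{}")).read() == b"{}"
```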
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job.py
@@ -129,7 +129,6 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
             files = {
                 file_id: task_summary.record_load_file(
                     lambda: load_file(file_resource, dataset=self.dataset),
@@ -138,6 +137,8 @@
                 for file_id, file_resource in self.dataset_resource.files.items()
             }
 
+            self.dataset_resource.run_post_load_files(files)
+
             try:
                 revision = self.store.update_dataset(
                     dataset=self.dataset,
@@ -181,6 +182,9 @@ class CreateDatasetTask(Task):
                )
                for file_id, file_resource in self.dataset_resource.files.items()
             }
+
+            self.dataset_resource.run_post_load_files(files)
+
             try:
                 revision = self.store.create_dataset(
                     dataset_type=self.dataset_resource.dataset_type,
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/dataset_resource.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING # noqa
+from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING, Dict # noqa
 from pydantic import Field
 
 from ingestify.domain.models.base import BaseModel
@@ -50,6 +50,18 @@ class DatasetResource(BaseModel):
     metadata: dict = Field(default_factory=dict)
     state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
     files: dict[str, FileResource] = Field(default_factory=dict)
+    post_load_files: Optional[
+        Callable[["DatasetResource", Dict[str, DraftFile]], None]
+    ] = None
+
+    def run_post_load_files(self, files: Dict[str, DraftFile]):
+        """Hook to modify dataset attributes based on loaded file content.
+
+        Useful for setting state based on file content, e.g., keep state=SCHEDULED
+        when files contain '{}', change to COMPLETE when they contain actual data.
+        """
+        if self.post_load_files:
+            self.post_load_files(self, files)
 
     def add_file(
         self,
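
Following the docstring above, a hedged sketch of a `post_load_files` callback that flips the dataset state once the provider returns real data. The `.stream.getvalue()` access is an assumption about `DraftFile` internals; adapt it to the real attribute:

```python
from ingestify.domain.models.dataset.dataset_state import DatasetState

def mark_state_from_content(resource, files):
    # `files` maps file_id -> DraftFile (see run_post_load_files above).
    # Reading the payload via draft.stream.getvalue() is illustrative only.
    payloads = [draft.stream.getvalue() for draft in files.values()]
    if payloads and all(p == b"{}" for p in payloads):
        resource.state = DatasetState.SCHEDULED  # provider has no data yet
    else:
        resource.state = DatasetState.COMPLETE

# Attach the hook when a Source builds its DatasetResource:
#   dataset_resource.post_load_files = mark_state_from_content
```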
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/http.py
@@ -58,9 +58,9 @@ def retrieve_http(
        )
        # else:
        #     print(f"{current_file.modified_at=} {last_modified=}")
-        # headers["if-modified-since"] = (
-        #     format_datetime(current_file.modified_at, usegmt=True),
-        # )
+        headers["if-modified-since"] = (
+            format_datetime(current_file.modified_at, usegmt=True),
+        )
        headers["if-none-match"] = current_file.tag
 
    http_kwargs = {}
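
This re-enables HTTP conditional requests: with `if-modified-since` and `if-none-match` set, the server can answer `304 Not Modified` and Ingestify can skip re-downloading unchanged files. A minimal standalone sketch of the mechanism, with made-up cached values and URL:

```python
from datetime import datetime, timezone
from email.utils import format_datetime

import requests

# Illustrative cached metadata for a previously fetched file.
cached_modified_at = datetime(2024, 1, 1, tzinfo=timezone.utc)
cached_etag = '"abc123"'

headers = {
    "if-modified-since": format_datetime(cached_modified_at, usegmt=True),
    "if-none-match": cached_etag,
}

response = requests.get("https://example.com/data.json", headers=headers)
if response.status_code == 304:
    print("Not modified - keep the cached copy")
```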
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py
@@ -40,15 +40,7 @@ from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
 from ingestify.utils import get_concurrency
 
-from .tables import (
-    metadata,
-    dataset_table,
-    file_table,
-    revision_table,
-    ingestion_job_summary_table,
-    task_summary_table,
-    store_version_table,
-)
+from .tables import get_tables
 
 logger = logging.getLogger(__name__)
 
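
The module-level table objects are replaced by a `get_tables(prefix)` factory, defined in the new `tables.py` (not shown in full here). Its implementation isn't visible in this diff; a plausible sketch of such a factory, with illustrative table and column names only:

```python
from sqlalchemy import Column, DateTime, MetaData, String, Table

def get_tables(table_prefix: str = "") -> dict:
    # Fresh MetaData per call so differently-prefixed stores can coexist
    # in the same database. The table name below is an assumption.
    metadata = MetaData()
    dataset_table = Table(
        f"{table_prefix}dataset",
        metadata,
        Column("dataset_id", String(255), primary_key=True),
        Column("bucket", String(255)),
        Column("created_at", DateTime),
        # ... remaining columns and tables elided; see
        # ingestify/infra/store/dataset/sqlalchemy/tables.py
    )
    return {"metadata": metadata, "dataset_table": dataset_table}
```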
@@ -112,20 +104,33 @@ class SqlAlchemySessionProvider:
         session_factory = sessionmaker(bind=self.engine)
         self.session = scoped_session(session_factory)
 
+        # Create tables with the specified prefix
+        tables = get_tables(self.table_prefix)
+        self.metadata = tables["metadata"]
+        self.dataset_table = tables["dataset_table"]
+        self.revision_table = tables["revision_table"]
+        self.file_table = tables["file_table"]
+        self.ingestion_job_summary_table = tables["ingestion_job_summary_table"]
+        self.task_summary_table = tables["task_summary_table"]
+        self.store_version_table = tables["store_version_table"]
+
     def __getstate__(self):
-        return {"url": self.url}
+        return {"url": self.url, "table_prefix": self.table_prefix}
 
     def __setstate__(self, state):
         self.url = state["url"]
+        self.table_prefix = state.get("table_prefix", "")
         self._init_engine()
 
-    def __init__(self, url: str):
+    def __init__(self, url: str, table_prefix: str = ""):
         url = self.fix_url(url)
 
         self.url = url
+        self.table_prefix = table_prefix
         self._init_engine()
 
-        metadata.create_all(self.engine)
+        # Create all tables in the database
+        self.metadata.create_all(self.engine)
 
     def __del__(self):
         self.close()
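
With the prefix threaded through `__init__`, pickling, and table creation, two stores can share one database without colliding (this is what the new `test_table_prefix.py` exercises). A usage sketch; the prefix value and file name are illustrative:

```python
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

# Two providers sharing one SQLite file, isolated by table prefix.
default_provider = SqlAlchemySessionProvider("sqlite:///catalog.db")
staging_provider = SqlAlchemySessionProvider(
    "sqlite:///catalog.db",
    table_prefix="staging_",  # tables are created with this prefix
)
```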
@@ -154,6 +159,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def dialect(self) -> Dialect:
         return self.session_provider.dialect
 
+    @property
+    def dataset_table(self):
+        return self.session_provider.dataset_table
+
+    @property
+    def revision_table(self):
+        return self.session_provider.revision_table
+
+    @property
+    def file_table(self):
+        return self.session_provider.file_table
+
+    @property
+    def ingestion_job_summary_table(self):
+        return self.session_provider.ingestion_job_summary_table
+
+    @property
+    def task_summary_table(self):
+        return self.session_provider.task_summary_table
+
+    @property
+    def store_version_table(self):
+        return self.session_provider.store_version_table
+
     def _upsert(
         self,
         connection: Connection,
@@ -251,13 +280,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
            )
 
            query = query.select_from(
-                dataset_table.join(
+                self.dataset_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                )
            )
        else:
-            query = query.filter(dataset_table.c.dataset_id == dataset_id)
+            query = query.filter(self.dataset_table.c.dataset_id == dataset_id)
 
        dialect = self.dialect.name
 
@@ -287,7 +316,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        join_conditions = []
        for k in keys:
            if dialect == "postgresql":
-                column = dataset_table.c.identifier[k]
+                column = self.dataset_table.c.identifier[k]
 
                # Take the value from the first selector to determine the type.
                # TODO: check all selectors to determine the type
@@ -297,24 +326,26 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                else:
                    column = column.as_string()
            else:
-                column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
+                column = func.json_extract(
+                    self.dataset_table.c.identifier, f"$.{k}"
+                )
 
            join_conditions.append(attribute_cte.c[k] == column)
 
        query = query.select_from(
-            dataset_table.join(attribute_cte, and_(*join_conditions))
+            self.dataset_table.join(attribute_cte, and_(*join_conditions))
        )
 
        if where:
            query = query.filter(text(where))
 
-        query = query.filter(dataset_table.c.bucket == bucket)
+        query = query.filter(self.dataset_table.c.bucket == bucket)
        if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+            query = query.filter(self.dataset_table.c.dataset_type == dataset_type)
        if provider:
-            query = query.filter(dataset_table.c.provider == provider)
+            query = query.filter(self.dataset_table.c.provider == provider)
        if dataset_state:
-            query = query.filter(dataset_table.c.state.in_(dataset_state))
+            query = query.filter(self.dataset_table.c.state.in_(dataset_state))
 
        return query
 
@@ -328,23 +359,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        )
 
        dataset_rows = list(
-            self.session.query(dataset_table).select_from(
-                dataset_table.join(
+            self.session.query(self.dataset_table).select_from(
+                self.dataset_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                )
            )
        )
        revisions_per_dataset = {}
        rows = (
-            self.session.query(revision_table)
+            self.session.query(self.revision_table)
            .select_from(
-                revision_table.join(
+                self.revision_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
                )
            )
-            .order_by(revision_table.c.dataset_id)
+            .order_by(self.revision_table.c.dataset_id)
        )
 
        for dataset_id, revisions in itertools.groupby(
@@ -354,14 +385,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        files_per_revision = {}
        rows = (
-            self.session.query(file_table)
+            self.session.query(self.file_table)
            .select_from(
-                file_table.join(
+                self.file_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.file_table.c.dataset_id,
                )
            )
-            .order_by(file_table.c.dataset_id, file_table.c.revision_id)
+            .order_by(self.file_table.c.dataset_id, self.file_table.c.revision_id)
        )
 
        for (dataset_id, revision_id), files in itertools.groupby(
@@ -425,8 +456,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        if not metadata_only:
            # Apply sorting by created_at in ascending order
            dataset_query = apply_query_filter(
-                self.session.query(dataset_table.c.dataset_id)
-            ).order_by(dataset_table.c.created_at.asc())
+                self.session.query(self.dataset_table.c.dataset_id)
+            ).order_by(self.dataset_table.c.created_at.asc())
 
            # Apply pagination if both page and page_size are provided
            if page is not None and page_size is not None:
@@ -448,9 +479,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        metadata_result_query = (
            apply_query_filter(
-                self.session.query(dataset_table.c.last_modified_at)
+                self.session.query(self.dataset_table.c.last_modified_at)
            )
-            .order_by(dataset_table.c.last_modified_at.desc())
+            .order_by(self.dataset_table.c.last_modified_at.desc())
            .limit(1)
        )
 
@@ -508,11 +539,16 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        with self.connect() as connection:
            try:
-                self._upsert(connection, dataset_table, datasets_entities)
+                self._upsert(connection, self.dataset_table, datasets_entities)
                self._upsert(
-                    connection, revision_table, revision_entities, immutable_rows=True
+                    connection,
+                    self.revision_table,
+                    revision_entities,
+                    immutable_rows=True,
+                )
+                self._upsert(
+                    connection, self.file_table, file_entities, immutable_rows=True
                )
-                self._upsert(connection, file_table, file_entities, immutable_rows=True)
            except Exception:
                connection.rollback()
                raise
@@ -569,11 +605,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
            try:
                self._upsert(
                    connection,
-                    ingestion_job_summary_table,
+                    self.ingestion_job_summary_table,
                    ingestion_job_summary_entities,
                )
                if task_summary_entities:
-                    self._upsert(connection, task_summary_table, task_summary_entities)
+                    self._upsert(
+                        connection, self.task_summary_table, task_summary_entities
+                    )
            except Exception:
                connection.rollback()
                raise
@@ -584,13 +622,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        ingestion_job_summary_ids = [
            row.ingestion_job_summary_id
            for row in self.session.query(
-                ingestion_job_summary_table.c.ingestion_job_summary_id
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id
            )
        ]
 
        ingestion_job_summary_rows = list(
-            self.session.query(ingestion_job_summary_table).filter(
-                ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
+            self.session.query(self.ingestion_job_summary_table).filter(
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
                    ingestion_job_summary_ids
                )
            )
@@ -598,13 +636,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        task_summary_entities_per_job_summary = {}
        rows = (
-            self.session.query(task_summary_table)
+            self.session.query(self.task_summary_table)
            .filter(
-                task_summary_table.c.ingestion_job_summary_id.in_(
+                self.task_summary_table.c.ingestion_job_summary_id.in_(
                    ingestion_job_summary_ids
                )
            )
-            .order_by(task_summary_table.c.ingestion_job_summary_id)
+            .order_by(self.task_summary_table.c.ingestion_job_summary_id)
        )
 
        for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
@@ -636,7 +674,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def get_store_version(self) -> Optional[str]:
         """Get the current Ingestify version stored for this store."""
         with self.session:
-            row = self.session.query(store_version_table.c.ingestify_version).first()
+            row = self.session.query(
+                self.store_version_table.c.ingestify_version
+            ).first()
             return row.ingestify_version if row else None
 
     def set_store_version(self, version: str):
@@ -653,7 +693,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        with self.connect() as connection:
            try:
-                self._upsert(connection, store_version_table, [entity])
+                self._upsert(connection, self.store_version_table, [entity])
                connection.commit()
            except Exception:
                connection.rollback()
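
Together, `get_store_version` and `set_store_version` let a store record which Ingestify version last wrote to it, which the new `test_store_version.py` exercises. A hedged usage sketch; `repository` stands in for a constructed `SqlAlchemyDatasetRepository`:

```python
import ingestify

# Hypothetical version guard around an existing repository instance.
stored = repository.get_store_version()
if stored is None:
    repository.set_store_version(ingestify.__version__)  # first write: stamp the store
elif stored != ingestify.__version__:
    print(f"Store last written by ingestify {stored}, running {ingestify.__version__}")
```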