ingestify 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.5.1"
11
+ __version__ = "0.6.0"
@@ -155,13 +155,19 @@ class Loader:
155
155
 
156
156
  IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
157
157
  """
158
- for ingestion_plan, selector in selectors.values():
158
+
159
+ ingestion_job_prefix = str(uuid.uuid1())
160
+ for ingestion_job_idx, (ingestion_plan, selector) in enumerate(
161
+ selectors.values()
162
+ ):
159
163
  logger.info(
160
164
  f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
161
165
  )
162
166
 
163
167
  ingestion_job = IngestionJob(
164
- ingestion_job_id=str(uuid.uuid1()),
168
+ # Create a combined IngestionJobId.
169
+ # This allows us to group all IngestionJobs within the same run
170
+ ingestion_job_id=f"{ingestion_job_prefix}.{ingestion_job_idx}",
165
171
  ingestion_plan=ingestion_plan,
166
172
  selector=selector,
167
173
  )
@@ -1,5 +1,4 @@
1
1
  import copy
2
- from dataclasses import dataclass
3
2
  from typing import Dict, Union, List, Set, Optional
4
3
 
5
4
 
@@ -24,9 +24,7 @@ from ingestify.utils import TaskExecutor, chunker
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- # Decrease batch size from 1_000 to 500. The sqlalchemy repository uses
28
- # a compound select, which breaks at more than 500 select statements
29
- DEFAULT_CHUNK_SIZE = 500
27
+ DEFAULT_CHUNK_SIZE = 1000
30
28
 
31
29
 
32
30
  def run_task(task):
@@ -257,8 +255,10 @@ class IngestionJob:
257
255
  finish_task_timer = ingestion_job_summary.start_timing("tasks")
258
256
 
259
257
  while True:
258
+ logger.info(f"Finding next batch of datasets for selector={self.selector}")
260
259
  try:
261
- batch = next(batches)
260
+ with ingestion_job_summary.record_timing("find_datasets"):
261
+ batch = next(batches)
262
262
  except StopIteration:
263
263
  break
264
264
  except Exception as e:
@@ -13,9 +13,12 @@ from sqlalchemy import (
13
13
  literal,
14
14
  select,
15
15
  and_,
16
- Column,
17
- or_,
18
16
  Dialect,
17
+ values,
18
+ CTE,
19
+ column as sqlalchemy_column,
20
+ Integer,
21
+ String,
19
22
  )
20
23
  from sqlalchemy.engine import make_url
21
24
  from sqlalchemy.exc import NoSuchModuleError
@@ -174,6 +177,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
174
177
 
175
178
  connection.execute(stmt)
176
179
 
180
+ def _build_cte_sqlite(self, records, name: str) -> CTE:
181
+ """SQLite has a limit of 500 compound select statements. When we have more records,
182
+ create a nested CTE"""
183
+ if len(records) > 500:
184
+ return union_all(
185
+ select(self._build_cte_sqlite(records[:500], name + "1")),
186
+ select(self._build_cte_sqlite(records[500:], name + "2")),
187
+ ).cte(name)
188
+
189
+ return union_all(
190
+ *[
191
+ select(*(literal(value).label(key) for key, value in record.items()))
192
+ for record in records
193
+ ]
194
+ ).cte(name)
195
+
196
+ def _build_cte(self, records: list[dict], name: str) -> CTE:
197
+ """Build a CTE from a list of dictionaries."""
198
+
199
+ if self.dialect.name == "sqlite":
200
+ return self._build_cte_sqlite(records, name)
201
+
202
+ first_row = records[0]
203
+ columns = []
204
+ for key, value in first_row.items():
205
+ columns.append(
206
+ sqlalchemy_column(key, Integer if isinstance(value, int) else String)
207
+ )
208
+
209
+ # Prepare the data in tuples, in same order as columns
210
+ data = [tuple(record[column.name] for column in columns) for record in records]
211
+
212
+ return select(values(*columns, name=name).data(data)).cte(name)
213
+
177
214
  def _filter_query(
178
215
  self,
179
216
  query,
@@ -190,12 +227,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
190
227
  # return an empty DatasetCollection
191
228
  return DatasetCollection()
192
229
 
193
- dataset_ids_cte = union_all(
194
- *[
195
- select(literal(dataset_id).label("dataset_id"))
196
- for dataset_id in set(dataset_id)
197
- ]
198
- ).cte("dataset_ids")
230
+ dataset_ids_cte = self._build_cte(
231
+ [{"dataset_id": dataset_id} for dataset_id in set(dataset_id)],
232
+ "dataset_ids",
233
+ )
199
234
 
200
235
  query = query.select_from(
201
236
  dataset_table.join(
@@ -222,17 +257,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
222
257
  if not selectors:
223
258
  raise ValueError("Selectors must contain at least one item")
224
259
 
225
- attribute_sets = {
226
- tuple(selector.filtered_attributes.items()) for selector in selectors
227
- }
228
-
229
- # Define a virtual table using a CTE for all attributes
230
- attribute_cte = union_all(
231
- *[
232
- select(*(literal(value).label(key) for key, value in attr_set))
233
- for attr_set in attribute_sets
234
- ]
235
- ).cte("attributes")
260
+ attribute_cte = self._build_cte(
261
+ [selector.filtered_attributes for selector in selectors], "attributes"
262
+ )
236
263
 
237
264
  keys = list(selectors[0].filtered_attributes.keys())
238
265
  first_selector = selectors[0].filtered_attributes
@@ -273,12 +300,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
273
300
  if not dataset_ids:
274
301
  return []
275
302
 
276
- dataset_ids_cte = union_all(
277
- *[
278
- select(literal(dataset_id).label("dataset_id"))
279
- for dataset_id in set(dataset_ids)
280
- ]
281
- ).cte("dataset_ids")
303
+ dataset_ids_cte = self._build_cte(
304
+ [{"dataset_id": dataset_id} for dataset_id in set(dataset_ids)],
305
+ "dataset_ids",
306
+ )
282
307
 
283
308
  dataset_rows = list(
284
309
  self.session.query(dataset_table).select_from(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -1,4 +1,4 @@
1
- ingestify/__init__.py,sha256=Un08YQgoC1u_2dbvOdtOD59OKsUL78ekru-86GA3zpA,301
1
+ ingestify/__init__.py,sha256=lkRolxtOzdwCaj9gGdmYjUXstsVpm3SKvP_lEMwKJ7U,301
2
2
  ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
3
3
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
4
4
  ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
@@ -8,12 +8,12 @@ ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
8
8
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
10
10
  ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
11
- ingestify/application/loader.py,sha256=Lg3qPLaeKOFGheeqqfVeCBEF3cn61oZThgYYHoqfOvQ,7694
11
+ ingestify/application/loader.py,sha256=OvlBBmCiQS3KTs5G7kBbxcP80WTfFxJZ-CXGsJJGH8M,7958
12
12
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
13
13
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
14
14
  ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
15
15
  ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
16
- ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
16
+ ingestify/domain/models/data_spec_version_collection.py,sha256=x5BvBnVI9QAfqhjCrUK19HKAiujdU1m8PkbQZwDheFU,1338
17
17
  ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
18
18
  ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
19
19
  ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
@@ -39,7 +39,7 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
39
39
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
40
40
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
41
41
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=2Tibe1fKZU84LO_qHniO33ChTcJy3K0YLkVro8CjJPs,13573
42
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=uv0nM-5T_xBUUgb8e6A5Km0oG4GNU3CRp9x0GVW9TUU,13600
43
43
  ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
44
44
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
45
45
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
@@ -64,7 +64,7 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
64
64
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
65
65
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
66
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
67
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=hn4x5tglqxO4EdyiAt_4wnDXmmWU87twHtBmVBRHVSY,18309
67
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=28m_eC7BhcRQLWyc66NwmFmaviV52V4n9D_X9YXJX7w,19239
68
68
  ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
69
69
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
70
70
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
80
80
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
81
81
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
82
82
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
83
- ingestify-0.5.1.dist-info/METADATA,sha256=TOqbUz13KxM8v8kR-owtafHvYEXwj5ruaDP_bigoIbI,18854
84
- ingestify-0.5.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
- ingestify-0.5.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
86
- ingestify-0.5.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
87
- ingestify-0.5.1.dist-info/RECORD,,
83
+ ingestify-0.6.0.dist-info/METADATA,sha256=aT5qaahlo57g3hlk_MUudKrheOefMKmnFUiwF0bMYHU,18854
84
+ ingestify-0.6.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
+ ingestify-0.6.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
86
+ ingestify-0.6.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
87
+ ingestify-0.6.0.dist-info/RECORD,,