ingestify 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.5.0"
11
+ __version__ = "0.5.1"
@@ -24,7 +24,9 @@ from ingestify.utils import TaskExecutor, chunker
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- DEFAULT_CHUNK_SIZE = 1_000
27
+ # Decrease batch size from 1_000 to 500. The sqlalchemy repository uses
28
+ # a compound select, which breaks at more than 500 select statements
29
+ DEFAULT_CHUNK_SIZE = 500
28
30
 
29
31
 
30
32
  def run_task(task):
@@ -137,10 +137,6 @@ class SqlAlchemySessionProvider:
137
137
  return self.session()
138
138
 
139
139
 
140
- def in_(column: Column, values):
141
- return or_(*[column == value for value in values])
142
-
143
-
144
140
  class SqlAlchemyDatasetRepository(DatasetRepository):
145
141
  def __init__(self, session_provider: SqlAlchemySessionProvider):
146
142
  self.session_provider = session_provider
@@ -194,7 +190,19 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
194
190
  # return an empty DatasetCollection
195
191
  return DatasetCollection()
196
192
 
197
- query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
193
+ dataset_ids_cte = union_all(
194
+ *[
195
+ select(literal(dataset_id).label("dataset_id"))
196
+ for dataset_id in set(dataset_id)
197
+ ]
198
+ ).cte("dataset_ids")
199
+
200
+ query = query.select_from(
201
+ dataset_table.join(
202
+ dataset_ids_cte,
203
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
204
+ )
205
+ )
198
206
  else:
199
207
  query = query.filter(dataset_table.c.dataset_id == dataset_id)
200
208
 
@@ -265,15 +273,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
265
273
  if not dataset_ids:
266
274
  return []
267
275
 
276
+ dataset_ids_cte = union_all(
277
+ *[
278
+ select(literal(dataset_id).label("dataset_id"))
279
+ for dataset_id in set(dataset_ids)
280
+ ]
281
+ ).cte("dataset_ids")
282
+
268
283
  dataset_rows = list(
269
- self.session.query(dataset_table).filter(
270
- in_(dataset_table.c.dataset_id, dataset_ids)
284
+ self.session.query(dataset_table).select_from(
285
+ dataset_table.join(
286
+ dataset_ids_cte,
287
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
288
+ )
271
289
  )
272
290
  )
273
291
  revisions_per_dataset = {}
274
292
  rows = (
275
293
  self.session.query(revision_table)
276
- .filter(in_(revision_table.c.dataset_id, dataset_ids))
294
+ .select_from(
295
+ revision_table.join(
296
+ dataset_ids_cte,
297
+ dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
298
+ )
299
+ )
277
300
  .order_by(revision_table.c.dataset_id)
278
301
  )
279
302
 
@@ -285,7 +308,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
285
308
  files_per_revision = {}
286
309
  rows = (
287
310
  self.session.query(file_table)
288
- .filter(in_(file_table.c.dataset_id, dataset_ids))
311
+ .select_from(
312
+ file_table.join(
313
+ dataset_ids_cte,
314
+ dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
315
+ )
316
+ )
289
317
  .order_by(file_table.c.dataset_id, file_table.c.revision_id)
290
318
  )
291
319
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -1,4 +1,4 @@
1
- ingestify/__init__.py,sha256=6SmxhtKjGRDG31Ij8xc2i9L-7qC3qjA5DE89jQoD48Q,301
1
+ ingestify/__init__.py,sha256=Un08YQgoC1u_2dbvOdtOD59OKsUL78ekru-86GA3zpA,301
2
2
  ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
3
3
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
4
4
  ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
@@ -39,7 +39,7 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
39
39
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
40
40
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
41
41
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=-SxHunvtG8J2u8LwXacF26oItwMkLJN7Suelt-hjHgk,13434
42
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=2Tibe1fKZU84LO_qHniO33ChTcJy3K0YLkVro8CjJPs,13573
43
43
  ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
44
44
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
45
45
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
@@ -64,7 +64,7 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
64
64
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
65
65
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
66
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
67
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=mIF7ly-lyCSNJQeem2Dpxlllzn34MxEA97qV929ARDY,17361
67
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=hn4x5tglqxO4EdyiAt_4wnDXmmWU87twHtBmVBRHVSY,18309
68
68
  ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
69
69
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
70
70
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
80
80
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
81
81
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
82
82
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
83
- ingestify-0.5.0.dist-info/METADATA,sha256=EsJsolUWxelVsEOhLUyiut_tKPYfqHx9Pvvg_T-HFG4,18854
84
- ingestify-0.5.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
- ingestify-0.5.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
86
- ingestify-0.5.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
87
- ingestify-0.5.0.dist-info/RECORD,,
83
+ ingestify-0.5.1.dist-info/METADATA,sha256=TOqbUz13KxM8v8kR-owtafHvYEXwj5ruaDP_bigoIbI,18854
84
+ ingestify-0.5.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
+ ingestify-0.5.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
86
+ ingestify-0.5.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
87
+ ingestify-0.5.1.dist-info/RECORD,,