ingestify 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ingestify-0.5.0 → ingestify-0.5.1}/PKG-INFO +1 -1
  2. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/__init__.py +1 -1
  3. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/ingestion/ingestion_job.py +3 -1
  4. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/dataset/sqlalchemy/repository.py +37 -9
  5. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/PKG-INFO +1 -1
  6. {ingestify-0.5.0 → ingestify-0.5.1}/README.md +0 -0
  7. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/application/__init__.py +0 -0
  8. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/application/dataset_store.py +0 -0
  9. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/application/ingestion_engine.py +0 -0
  10. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/application/loader.py +0 -0
  11. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/application/secrets_manager.py +0 -0
  12. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/cmdline.py +0 -0
  13. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/__init__.py +0 -0
  14. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/__init__.py +0 -0
  15. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/base.py +0 -0
  16. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  17. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/__init__.py +0 -0
  18. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/collection.py +0 -0
  19. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  20. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/dataset.py +0 -0
  21. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  22. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  23. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/events.py +0 -0
  24. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/file.py +0 -0
  25. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/file_collection.py +0 -0
  26. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/file_repository.py +0 -0
  27. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/identifier.py +0 -0
  28. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/revision.py +0 -0
  29. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/dataset/selector.py +0 -0
  30. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/__init__.py +0 -0
  31. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/_old_event.py +0 -0
  32. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/dispatcher.py +0 -0
  33. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/domain_event.py +0 -0
  34. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/event_bus.py +0 -0
  35. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/publisher.py +0 -0
  36. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/event/subscriber.py +0 -0
  37. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/fetch_policy.py +0 -0
  38. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/ingestion/__init__.py +0 -0
  39. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  40. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  41. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/resources/__init__.py +0 -0
  42. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  43. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/sink.py +0 -0
  44. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/source.py +0 -0
  45. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/task/__init__.py +0 -0
  46. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/task/set.py +0 -0
  47. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/task/task.py +0 -0
  48. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/task/task_summary.py +0 -0
  49. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/models/timing.py +0 -0
  50. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/services/__init__.py +0 -0
  51. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  52. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/services/transformers/__init__.py +0 -0
  53. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  54. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/exceptions.py +0 -0
  55. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/__init__.py +0 -0
  56. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/fetch/__init__.py +0 -0
  57. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/fetch/http.py +0 -0
  58. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/serialization/__init__.py +0 -0
  59. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/sink/__init__.py +0 -0
  60. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/sink/postgresql.py +0 -0
  61. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/source/__init__.py +0 -0
  62. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/source/statsbomb_github.py +0 -0
  63. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/source/wyscout.py +0 -0
  64. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/__init__.py +0 -0
  65. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/dataset/__init__.py +0 -0
  66. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  67. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  68. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/file/__init__.py +0 -0
  69. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  70. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/file/local_file_repository.py +0 -0
  71. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  72. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/main.py +0 -0
  73. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/server.py +0 -0
  74. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/source_base.py +0 -0
  75. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  76. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  77. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  78. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  79. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/.env +0 -0
  80. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/.gitignore +0 -0
  81. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/README.md +0 -0
  82. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  83. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/database/README.md +0 -0
  84. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/static/templates/wyscout/query.py +0 -0
  85. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify/utils.py +0 -0
  86. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/SOURCES.txt +0 -0
  87. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/dependency_links.txt +0 -0
  88. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/entry_points.txt +0 -0
  89. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/requires.txt +0 -0
  90. {ingestify-0.5.0 → ingestify-0.5.1}/ingestify.egg-info/top_level.txt +0 -0
  91. {ingestify-0.5.0 → ingestify-0.5.1}/setup.cfg +0 -0
  92. {ingestify-0.5.0 → ingestify-0.5.1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.5.0"
11
+ __version__ = "0.5.1"
@@ -24,7 +24,9 @@ from ingestify.utils import TaskExecutor, chunker
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- DEFAULT_CHUNK_SIZE = 1_000
27
+ # Decrease batch size from 1_000 to 500. The sqlalchemy repository uses
28
+ # a compound select, which breaks at more than 500 select statements
29
+ DEFAULT_CHUNK_SIZE = 500
28
30
 
29
31
 
30
32
  def run_task(task):
@@ -137,10 +137,6 @@ class SqlAlchemySessionProvider:
137
137
  return self.session()
138
138
 
139
139
 
140
- def in_(column: Column, values):
141
- return or_(*[column == value for value in values])
142
-
143
-
144
140
  class SqlAlchemyDatasetRepository(DatasetRepository):
145
141
  def __init__(self, session_provider: SqlAlchemySessionProvider):
146
142
  self.session_provider = session_provider
@@ -194,7 +190,19 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
194
190
  # return an empty DatasetCollection
195
191
  return DatasetCollection()
196
192
 
197
- query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
193
+ dataset_ids_cte = union_all(
194
+ *[
195
+ select(literal(dataset_id).label("dataset_id"))
196
+ for dataset_id in set(dataset_id)
197
+ ]
198
+ ).cte("dataset_ids")
199
+
200
+ query = query.select_from(
201
+ dataset_table.join(
202
+ dataset_ids_cte,
203
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
204
+ )
205
+ )
198
206
  else:
199
207
  query = query.filter(dataset_table.c.dataset_id == dataset_id)
200
208
 
@@ -265,15 +273,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
265
273
  if not dataset_ids:
266
274
  return []
267
275
 
276
+ dataset_ids_cte = union_all(
277
+ *[
278
+ select(literal(dataset_id).label("dataset_id"))
279
+ for dataset_id in set(dataset_ids)
280
+ ]
281
+ ).cte("dataset_ids")
282
+
268
283
  dataset_rows = list(
269
- self.session.query(dataset_table).filter(
270
- in_(dataset_table.c.dataset_id, dataset_ids)
284
+ self.session.query(dataset_table).select_from(
285
+ dataset_table.join(
286
+ dataset_ids_cte,
287
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
288
+ )
271
289
  )
272
290
  )
273
291
  revisions_per_dataset = {}
274
292
  rows = (
275
293
  self.session.query(revision_table)
276
- .filter(in_(revision_table.c.dataset_id, dataset_ids))
294
+ .select_from(
295
+ revision_table.join(
296
+ dataset_ids_cte,
297
+ dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
298
+ )
299
+ )
277
300
  .order_by(revision_table.c.dataset_id)
278
301
  )
279
302
 
@@ -285,7 +308,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
285
308
  files_per_revision = {}
286
309
  rows = (
287
310
  self.session.query(file_table)
288
- .filter(in_(file_table.c.dataset_id, dataset_ids))
311
+ .select_from(
312
+ file_table.join(
313
+ dataset_ids_cte,
314
+ dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
315
+ )
316
+ )
289
317
  .order_by(file_table.c.dataset_id, file_table.c.revision_id)
290
318
  )
291
319
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes