ingestify 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ingestify-0.5.0 → ingestify-0.6.0}/PKG-INFO +1 -1
  2. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/__init__.py +1 -1
  3. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/loader.py +8 -2
  4. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/data_spec_version_collection.py +0 -1
  5. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_job.py +4 -2
  6. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +75 -22
  7. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/PKG-INFO +1 -1
  8. {ingestify-0.5.0 → ingestify-0.6.0}/README.md +0 -0
  9. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/__init__.py +0 -0
  10. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/dataset_store.py +0 -0
  11. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/ingestion_engine.py +0 -0
  12. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/application/secrets_manager.py +0 -0
  13. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/cmdline.py +0 -0
  14. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/__init__.py +0 -0
  15. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/__init__.py +0 -0
  16. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/base.py +0 -0
  17. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  18. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/collection.py +0 -0
  19. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  20. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  21. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  22. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  23. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/events.py +0 -0
  24. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file.py +0 -0
  25. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  26. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  27. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  28. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/revision.py +0 -0
  29. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/dataset/selector.py +0 -0
  30. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/__init__.py +0 -0
  31. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/_old_event.py +0 -0
  32. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  33. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/domain_event.py +0 -0
  34. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/event_bus.py +0 -0
  35. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/publisher.py +0 -0
  36. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/event/subscriber.py +0 -0
  37. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/fetch_policy.py +0 -0
  38. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  39. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  40. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  41. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/resources/__init__.py +0 -0
  42. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  43. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/sink.py +0 -0
  44. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/source.py +0 -0
  45. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/__init__.py +0 -0
  46. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/set.py +0 -0
  47. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/task.py +0 -0
  48. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/task/task_summary.py +0 -0
  49. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/models/timing.py +0 -0
  50. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/__init__.py +0 -0
  51. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  52. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  53. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  54. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/exceptions.py +0 -0
  55. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/__init__.py +0 -0
  56. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/fetch/__init__.py +0 -0
  57. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/fetch/http.py +0 -0
  58. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/serialization/__init__.py +0 -0
  59. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/sink/__init__.py +0 -0
  60. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/sink/postgresql.py +0 -0
  61. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/__init__.py +0 -0
  62. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  63. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/source/wyscout.py +0 -0
  64. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/__init__.py +0 -0
  65. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  66. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  67. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  68. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/__init__.py +0 -0
  69. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  70. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  71. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  72. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/main.py +0 -0
  73. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/server.py +0 -0
  74. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/source_base.py +0 -0
  75. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  76. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  77. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  78. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  79. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/.env +0 -0
  80. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
  81. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/README.md +0 -0
  82. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  83. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
  84. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/static/templates/wyscout/query.py +0 -0
  85. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify/utils.py +0 -0
  86. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/SOURCES.txt +0 -0
  87. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/dependency_links.txt +0 -0
  88. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/entry_points.txt +0 -0
  89. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/requires.txt +0 -0
  90. {ingestify-0.5.0 → ingestify-0.6.0}/ingestify.egg-info/top_level.txt +0 -0
  91. {ingestify-0.5.0 → ingestify-0.6.0}/setup.cfg +0 -0
  92. {ingestify-0.5.0 → ingestify-0.6.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.5.0"
11
+ __version__ = "0.6.0"
@@ -155,13 +155,19 @@ class Loader:
155
155
 
156
156
  IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
157
157
  """
158
- for ingestion_plan, selector in selectors.values():
158
+
159
+ ingestion_job_prefix = str(uuid.uuid1())
160
+ for ingestion_job_idx, (ingestion_plan, selector) in enumerate(
161
+ selectors.values()
162
+ ):
159
163
  logger.info(
160
164
  f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
161
165
  )
162
166
 
163
167
  ingestion_job = IngestionJob(
164
- ingestion_job_id=str(uuid.uuid1()),
168
+ # Create a combined IngestionJobId.
169
+ # This allows us to group all IngestionJobs within the same run
170
+ ingestion_job_id=f"{ingestion_job_prefix}.{ingestion_job_idx}",
165
171
  ingestion_plan=ingestion_plan,
166
172
  selector=selector,
167
173
  )
@@ -1,5 +1,4 @@
1
1
  import copy
2
- from dataclasses import dataclass
3
2
  from typing import Dict, Union, List, Set, Optional
4
3
 
5
4
 
@@ -24,7 +24,7 @@ from ingestify.utils import TaskExecutor, chunker
24
24
  logger = logging.getLogger(__name__)
25
25
 
26
26
 
27
- DEFAULT_CHUNK_SIZE = 1_000
27
+ DEFAULT_CHUNK_SIZE = 1000
28
28
 
29
29
 
30
30
  def run_task(task):
@@ -255,8 +255,10 @@ class IngestionJob:
255
255
  finish_task_timer = ingestion_job_summary.start_timing("tasks")
256
256
 
257
257
  while True:
258
+ logger.info(f"Finding next batch of datasets for selector={self.selector}")
258
259
  try:
259
- batch = next(batches)
260
+ with ingestion_job_summary.record_timing("find_datasets"):
261
+ batch = next(batches)
260
262
  except StopIteration:
261
263
  break
262
264
  except Exception as e:
@@ -13,9 +13,12 @@ from sqlalchemy import (
13
13
  literal,
14
14
  select,
15
15
  and_,
16
- Column,
17
- or_,
18
16
  Dialect,
17
+ values,
18
+ CTE,
19
+ column as sqlalchemy_column,
20
+ Integer,
21
+ String,
19
22
  )
20
23
  from sqlalchemy.engine import make_url
21
24
  from sqlalchemy.exc import NoSuchModuleError
@@ -137,10 +140,6 @@ class SqlAlchemySessionProvider:
137
140
  return self.session()
138
141
 
139
142
 
140
- def in_(column: Column, values):
141
- return or_(*[column == value for value in values])
142
-
143
-
144
143
  class SqlAlchemyDatasetRepository(DatasetRepository):
145
144
  def __init__(self, session_provider: SqlAlchemySessionProvider):
146
145
  self.session_provider = session_provider
@@ -178,6 +177,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
178
177
 
179
178
  connection.execute(stmt)
180
179
 
180
+ def _build_cte_sqlite(self, records, name: str) -> CTE:
181
+ """SQLite has a limit of 500 compound select statements. When we have more records,
182
+ create a nested CTE"""
183
+ if len(records) > 500:
184
+ return union_all(
185
+ select(self._build_cte_sqlite(records[:500], name + "1")),
186
+ select(self._build_cte_sqlite(records[500:], name + "2")),
187
+ ).cte(name)
188
+
189
+ return union_all(
190
+ *[
191
+ select(*(literal(value).label(key) for key, value in record.items()))
192
+ for record in records
193
+ ]
194
+ ).cte(name)
195
+
196
+ def _build_cte(self, records: list[dict], name: str) -> CTE:
197
+ """Build a CTE from a list of dictionaries."""
198
+
199
+ if self.dialect.name == "sqlite":
200
+ return self._build_cte_sqlite(records, name)
201
+
202
+ first_row = records[0]
203
+ columns = []
204
+ for key, value in first_row.items():
205
+ columns.append(
206
+ sqlalchemy_column(key, Integer if isinstance(value, int) else String)
207
+ )
208
+
209
+ # Prepare the data in tuples, in same order as columns
210
+ data = [tuple(record[column.name] for column in columns) for record in records]
211
+
212
+ return select(values(*columns, name=name).data(data)).cte(name)
213
+
181
214
  def _filter_query(
182
215
  self,
183
216
  query,
@@ -194,7 +227,17 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
194
227
  # return an empty DatasetCollection
195
228
  return DatasetCollection()
196
229
 
197
- query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
230
+ dataset_ids_cte = self._build_cte(
231
+ [{"dataset_id": dataset_id} for dataset_id in set(dataset_id)],
232
+ "dataset_ids",
233
+ )
234
+
235
+ query = query.select_from(
236
+ dataset_table.join(
237
+ dataset_ids_cte,
238
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
239
+ )
240
+ )
198
241
  else:
199
242
  query = query.filter(dataset_table.c.dataset_id == dataset_id)
200
243
 
@@ -214,17 +257,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
214
257
  if not selectors:
215
258
  raise ValueError("Selectors must contain at least one item")
216
259
 
217
- attribute_sets = {
218
- tuple(selector.filtered_attributes.items()) for selector in selectors
219
- }
220
-
221
- # Define a virtual table using a CTE for all attributes
222
- attribute_cte = union_all(
223
- *[
224
- select(*(literal(value).label(key) for key, value in attr_set))
225
- for attr_set in attribute_sets
226
- ]
227
- ).cte("attributes")
260
+ attribute_cte = self._build_cte(
261
+ [selector.filtered_attributes for selector in selectors], "attributes"
262
+ )
228
263
 
229
264
  keys = list(selectors[0].filtered_attributes.keys())
230
265
  first_selector = selectors[0].filtered_attributes
@@ -265,15 +300,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
265
300
  if not dataset_ids:
266
301
  return []
267
302
 
303
+ dataset_ids_cte = self._build_cte(
304
+ [{"dataset_id": dataset_id} for dataset_id in set(dataset_ids)],
305
+ "dataset_ids",
306
+ )
307
+
268
308
  dataset_rows = list(
269
- self.session.query(dataset_table).filter(
270
- in_(dataset_table.c.dataset_id, dataset_ids)
309
+ self.session.query(dataset_table).select_from(
310
+ dataset_table.join(
311
+ dataset_ids_cte,
312
+ dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
313
+ )
271
314
  )
272
315
  )
273
316
  revisions_per_dataset = {}
274
317
  rows = (
275
318
  self.session.query(revision_table)
276
- .filter(in_(revision_table.c.dataset_id, dataset_ids))
319
+ .select_from(
320
+ revision_table.join(
321
+ dataset_ids_cte,
322
+ dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
323
+ )
324
+ )
277
325
  .order_by(revision_table.c.dataset_id)
278
326
  )
279
327
 
@@ -285,7 +333,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
285
333
  files_per_revision = {}
286
334
  rows = (
287
335
  self.session.query(file_table)
288
- .filter(in_(file_table.c.dataset_id, dataset_ids))
336
+ .select_from(
337
+ file_table.join(
338
+ dataset_ids_cte,
339
+ dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
340
+ )
341
+ )
289
342
  .order_by(file_table.c.dataset_id, file_table.c.revision_id)
290
343
  )
291
344
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes