ingestify 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ingestify-0.4.0 → ingestify-0.4.1}/PKG-INFO +1 -1
  2. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/__init__.py +1 -1
  3. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/repository.py +43 -30
  4. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  5. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/PKG-INFO +1 -1
  6. {ingestify-0.4.0 → ingestify-0.4.1}/README.md +0 -0
  7. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/__init__.py +0 -0
  8. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/dataset_store.py +0 -0
  9. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/ingestion_engine.py +0 -0
  10. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/loader.py +0 -0
  11. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/application/secrets_manager.py +0 -0
  12. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/cmdline.py +0 -0
  13. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/__init__.py +0 -0
  14. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/__init__.py +0 -0
  15. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/base.py +0 -0
  16. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  17. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/__init__.py +0 -0
  18. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/collection.py +0 -0
  19. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  20. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset.py +0 -0
  21. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  22. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  23. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/events.py +0 -0
  24. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file.py +0 -0
  25. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file_collection.py +0 -0
  26. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/file_repository.py +0 -0
  27. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/identifier.py +0 -0
  28. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/revision.py +0 -0
  29. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/dataset/selector.py +0 -0
  30. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/__init__.py +0 -0
  31. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/_old_event.py +0 -0
  32. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/dispatcher.py +0 -0
  33. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/domain_event.py +0 -0
  34. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/event_bus.py +0 -0
  35. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/publisher.py +0 -0
  36. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/event/subscriber.py +0 -0
  37. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/fetch_policy.py +0 -0
  38. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/__init__.py +0 -0
  39. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
  40. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  41. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  42. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/resources/__init__.py +0 -0
  43. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  44. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/sink.py +0 -0
  45. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/source.py +0 -0
  46. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/__init__.py +0 -0
  47. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/set.py +0 -0
  48. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/task.py +0 -0
  49. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/task/task_summary.py +0 -0
  50. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/models/timing.py +0 -0
  51. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/__init__.py +0 -0
  52. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  53. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/transformers/__init__.py +0 -0
  54. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  55. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/exceptions.py +0 -0
  56. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/__init__.py +0 -0
  57. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/fetch/__init__.py +0 -0
  58. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/fetch/http.py +0 -0
  59. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/serialization/__init__.py +0 -0
  60. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/sink/__init__.py +0 -0
  61. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/sink/postgresql.py +0 -0
  62. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/__init__.py +0 -0
  63. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/statsbomb_github.py +0 -0
  64. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/source/wyscout.py +0 -0
  65. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/__init__.py +0 -0
  66. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/__init__.py +0 -0
  67. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  68. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/__init__.py +0 -0
  69. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  70. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/local_file_repository.py +0 -0
  71. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  72. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/main.py +0 -0
  73. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/server.py +0 -0
  74. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/source_base.py +0 -0
  75. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  76. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  77. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  78. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  79. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/.env +0 -0
  80. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/.gitignore +0 -0
  81. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/README.md +0 -0
  82. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  83. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/database/README.md +0 -0
  84. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/static/templates/wyscout/query.py +0 -0
  85. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify/utils.py +0 -0
  86. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/SOURCES.txt +0 -0
  87. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/dependency_links.txt +0 -0
  88. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/entry_points.txt +0 -0
  89. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/requires.txt +0 -0
  90. {ingestify-0.4.0 → ingestify-0.4.1}/ingestify.egg-info/top_level.txt +0 -0
  91. {ingestify-0.4.0 → ingestify-0.4.1}/setup.cfg +0 -0
  92. {ingestify-0.4.0 → ingestify-0.4.1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.4.0"
11
+ __version__ = "0.4.1"
@@ -1,32 +1,31 @@
1
1
  import itertools
2
- import json
3
2
  import uuid
4
- from collections import defaultdict
5
3
  from typing import Optional, Union, List
6
4
 
7
5
  from sqlalchemy import (
8
6
  create_engine,
9
7
  func,
10
8
  text,
11
- tuple_,
12
9
  Table,
13
- insert,
14
- Transaction,
15
10
  Connection,
11
+ union_all,
12
+ literal,
13
+ select,
14
+ and_,
15
+ Column,
16
+ or_,
16
17
  )
17
18
  from sqlalchemy.engine import make_url
18
19
  from sqlalchemy.exc import NoSuchModuleError
19
- from sqlalchemy.orm import Session, joinedload
20
+ from sqlalchemy.orm import Session
20
21
 
21
22
  from ingestify.domain import File, Revision
22
23
  from ingestify.domain.models import (
23
24
  Dataset,
24
25
  DatasetCollection,
25
26
  DatasetRepository,
26
- Identifier,
27
27
  Selector,
28
28
  )
29
- from ingestify.domain.models.base import BaseModel
30
29
  from ingestify.domain.models.dataset.collection_metadata import (
31
30
  DatasetCollectionMetadata,
32
31
  )
@@ -127,6 +126,10 @@ class SqlAlchemySessionProvider:
127
126
  return self.session
128
127
 
129
128
 
129
+ def in_(column: Column, values):
130
+ return or_(*[column == value for value in values])
131
+
132
+
130
133
  class SqlAlchemyDatasetRepository(DatasetRepository):
131
134
  def __init__(self, session_provider: SqlAlchemySessionProvider):
132
135
  self.session_provider = session_provider
@@ -169,11 +172,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
169
172
  dataset_id: Optional[Union[str, List[str]]] = None,
170
173
  selector: Optional[Union[Selector, List[Selector]]] = None,
171
174
  ):
172
- query = query.filter(dataset_table.c.bucket == bucket)
173
- if dataset_type:
174
- query = query.filter(dataset_table.c.dataset_type == dataset_type)
175
- if provider:
176
- query = query.filter(dataset_table.c.provider == provider)
177
175
  if dataset_id is not None:
178
176
  if isinstance(dataset_id, list):
179
177
  if len(dataset_id) == 0:
@@ -181,7 +179,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
181
179
  # return an empty DatasetCollection
182
180
  return DatasetCollection()
183
181
 
184
- query = query.filter(dataset_table.c.dataset_id.in_(dataset_id))
182
+ query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
185
183
  else:
186
184
  query = query.filter(dataset_table.c.dataset_id == dataset_id)
187
185
 
@@ -201,13 +199,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
201
199
  if not selectors:
202
200
  raise ValueError("Selectors must contain at least one item")
203
201
 
204
- keys = list(selectors[0].filtered_attributes.keys())
202
+ attribute_keys = selectors[
203
+ 0
204
+ ].filtered_attributes.keys() # Assume all selectors have the same keys
205
+ attribute_sets = {
206
+ tuple(selector.filtered_attributes.items()) for selector in selectors
207
+ }
208
+
209
+ # Define a virtual table using a CTE for all attributes
210
+ attribute_cte = union_all(
211
+ *[
212
+ select(*(literal(value).label(key) for key, value in attr_set))
213
+ for attr_set in attribute_sets
214
+ ]
215
+ ).cte("attributes")
205
216
 
206
- columns = []
217
+ keys = list(selectors[0].filtered_attributes.keys())
207
218
  first_selector = selectors[0].filtered_attributes
208
219
 
209
- # Create a query like this:
210
- # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
220
+ join_conditions = []
211
221
  for k in keys:
212
222
  if dialect == "postgresql":
213
223
  column = dataset_table.c.identifier[k]
@@ -215,25 +225,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
215
225
  # Take the value from the first selector to determine the type.
216
226
  # TODO: check all selectors to determine the type
217
227
  v = first_selector[k]
218
- if isint(v):
228
+ if isinstance(v, int):
219
229
  column = column.as_integer()
220
- elif isfloat(v):
221
- column = column.as_float()
222
230
  else:
223
231
  column = column.as_string()
224
232
  else:
225
233
  column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
226
- columns.append(column)
227
234
 
228
- values = []
229
- for selector in selectors:
230
- filtered_attributes = selector.filtered_attributes
231
- values.append(tuple([filtered_attributes[k] for k in keys]))
235
+ join_conditions.append(attribute_cte.c[k] == column)
232
236
 
233
- query = query.filter(tuple_(*columns).in_(values))
237
+ query = query.select_from(
238
+ dataset_table.join(attribute_cte, and_(*join_conditions))
239
+ )
234
240
 
235
241
  if where:
236
242
  query = query.filter(text(where))
243
+
244
+ query = query.filter(dataset_table.c.bucket == bucket)
245
+ if dataset_type:
246
+ query = query.filter(dataset_table.c.dataset_type == dataset_type)
247
+ if provider:
248
+ query = query.filter(dataset_table.c.provider == provider)
249
+
237
250
  return query
238
251
 
239
252
  def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
@@ -242,13 +255,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
242
255
 
243
256
  dataset_rows = list(
244
257
  self.session.query(dataset_table).filter(
245
- dataset_table.c.dataset_id.in_(dataset_ids)
258
+ in_(dataset_table.c.dataset_id, dataset_ids)
246
259
  )
247
260
  )
248
261
  revisions_per_dataset = {}
249
262
  rows = (
250
263
  self.session.query(revision_table)
251
- .filter(revision_table.c.dataset_id.in_(dataset_ids))
264
+ .filter(in_(revision_table.c.dataset_id, dataset_ids))
252
265
  .order_by(revision_table.c.dataset_id)
253
266
  )
254
267
 
@@ -260,7 +273,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
260
273
  files_per_revision = {}
261
274
  rows = (
262
275
  self.session.query(file_table)
263
- .filter(file_table.c.dataset_id.in_(dataset_ids))
276
+ .filter(in_(file_table.c.dataset_id, dataset_ids))
264
277
  .order_by(file_table.c.dataset_id, file_table.c.revision_id)
265
278
  )
266
279
 
@@ -14,6 +14,7 @@ from sqlalchemy import (
14
14
  String,
15
15
  Table,
16
16
  TypeDecorator,
17
+ Index,
17
18
  )
18
19
 
19
20
  from sqlalchemy.dialects.postgresql import JSONB
@@ -167,6 +168,15 @@ dataset_table = Table(
167
168
  Column("created_at", TZDateTime(6)),
168
169
  Column("updated_at", TZDateTime(6)),
169
170
  Column("last_modified_at", TZDateTime(6)),
171
+ # Required for performance querying when there are a lot of Datasets
172
+ # with the same provider and dataset_type
173
+ Index(
174
+ "idx_bucket_type_provider_last_modified",
175
+ "bucket",
176
+ "provider",
177
+ "dataset_type",
178
+ "last_modified_at",
179
+ ),
170
180
  )
171
181
 
172
182
  revision_table = Table(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestify
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes