ingestify 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource
 
- __version__ = "0.4.1"
+ __version__ = "0.4.2"
ingestify/application/loader.py CHANGED
@@ -35,11 +35,8 @@ class Loader:
  provider: Optional[str] = None,
  source: Optional[str] = None,
  ):
- # First collect all selectors, before discovering datasets
- selectors = {}
+ ingestion_plans = []
  for ingestion_plan in self.ingestion_plans:
- logger.info(f"Determining selectors for {ingestion_plan}")
-
  if provider is not None:
  if ingestion_plan.source.provider != provider:
  logger.info(
@@ -54,6 +51,13 @@ class Loader:
  )
  continue
 
+ ingestion_plans.append(ingestion_plan)
+
+ # First collect all selectors, before discovering datasets
+ selectors = {}
+ for ingestion_plan in ingestion_plans:
+ logger.info(f"Determining selectors for {ingestion_plan}")
+
  static_selectors = [
  selector
  for selector in ingestion_plan.selectors
ingestify/cmdline.py CHANGED
@@ -88,6 +88,14 @@ def cli():
  help="Source - only run tasks for a single source",
  type=str,
  )
+ @click.option(
+ "--disable-events",
+ "disable_events",
+ required=False,
+ help="Disable events - disable all event handlers",
+ is_flag=True,
+ type=bool,
+ )
  def run(
  config_file: str,
  bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
  provider: Optional[str],
  source: Optional[str],
  debug: Optional[bool],
+ disable_events: Optional[bool],
  ):
  try:
- engine = get_engine(config_file, bucket)
+ engine = get_engine(config_file, bucket, disable_events=disable_events)
  except ConfigurationError as e:
  if debug:
  raise
ingestify/domain/models/ingestion/ingestion_job.py CHANGED
@@ -218,7 +218,7 @@ class IngestionJob:
  # Process all items in batches. Yield a IngestionJobSummary per batch
 
  logger.info("Finding metadata")
- with ingestion_job_summary.record_timing("get_dataset_collection"):
+ with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
  dataset_collection_metadata = store.get_dataset_collection(
  dataset_type=self.ingestion_plan.dataset_type,
  provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
  # 1. The discover_datasets returns a list, and the entire list can be processed at once
  # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
  try:
+ logger.info(f"Finding datasets for selector={self.selector}")
  with ingestion_job_summary.record_timing("find_datasets"):
  dataset_resources = self.ingestion_plan.source.find_datasets(
  dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@ class IngestionJob:
  yield ingestion_job_summary
  return
 
+ logger.info("Starting tasks")
+
  finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
  while True:
@@ -273,13 +276,16 @@ class IngestionJob:
  for dataset_resource in batch
  ]
 
- # Load all available datasets based on the discovered dataset identifiers
- dataset_collection = store.get_dataset_collection(
- dataset_type=self.ingestion_plan.dataset_type,
- # Assume all DatasetResources share the same provider
- provider=batch[0].provider,
- selector=dataset_identifiers,
- )
+ logger.info(f"Searching for existing Datasets for DatasetResources")
+
+ with ingestion_job_summary.record_timing("get_dataset_collection"):
+ # Load all available datasets based on the discovered dataset identifiers
+ dataset_collection = store.get_dataset_collection(
+ dataset_type=self.ingestion_plan.dataset_type,
+ # Assume all DatasetResources share the same provider
+ provider=batch[0].provider,
+ selector=dataset_identifiers,
+ )
 
  skipped_datasets = 0
 
ingestify/domain/models/ingestion/ingestion_job_summary.py CHANGED
@@ -66,7 +66,7 @@ class IngestionJobSummary(BaseModel, HasTiming):
  self.skipped_datasets += skipped_datasets
 
  def task_count(self):
- return len(self.task_summaries)
+ return len(self.task_summaries) + self.skipped_datasets
 
  def _set_ended(self):
  self.failed_tasks = len(
@@ -106,22 +106,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
  f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
  )
  print("********************************")
- print(f"* - IngestionPlan:")
- print(f"* Source: {self.source_name}")
- print(f"* Provider: {self.provider}")
- print(f"* DatasetType: {self.dataset_type}")
- print(f"* - Selector: {self.selector}")
- print(f"* - Timings: ")
+ print(f" - IngestionPlan:")
+ print(f" Source: {self.source_name}")
+ print(f" Provider: {self.provider}")
+ print(f" DatasetType: {self.dataset_type}")
+ print(f" - Selector: {self.selector}")
+ print(f" - Timings: ")
  for timing in self.timings:
- print(f"* - {timing.name}: {format_duration(timing.duration)}")
+ print(f" - {timing.name}: {format_duration(timing.duration)}")
  print(
- f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+ f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
  )
 
- print(f"* - Failed tasks: {self.failed_tasks}")
- print(f"* - Successful tasks: {self.successful_tasks}")
- print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
- print(f"* - Skipped datasets: {self.skipped_datasets}")
+ print(f" - Failed tasks: {self.failed_tasks}")
+ print(f" - Successful tasks: {self.successful_tasks}")
+ print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
+ print(f" - Skipped datasets: {self.skipped_datasets}")
  print("********************************")
 
  def __enter__(self):
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -1,4 +1,5 @@
  import itertools
+ import logging
  import uuid
  from typing import Optional, Union, List
 
@@ -14,10 +15,11 @@ from sqlalchemy import (
  and_,
  Column,
  or_,
+ Dialect,
  )
  from sqlalchemy.engine import make_url
  from sqlalchemy.exc import NoSuchModuleError
- from sqlalchemy.orm import Session
+ from sqlalchemy.orm import Session, Query
 
  from ingestify.domain import File, Revision
  from ingestify.domain.models import (
@@ -42,6 +44,8 @@ from .tables import (
  task_summary_table,
  )
 
+ logger = logging.getLogger(__name__)
+
 
  def parse_value(v):
  try:
@@ -93,6 +97,7 @@ class SqlAlchemySessionProvider:
  # Use the default isolation level, don't need SERIALIZABLE
  # isolation_level="SERIALIZABLE",
  )
+ self.dialect = self.engine.dialect
  self.session = Session(bind=self.engine)
 
  def __init__(self, url: str):
@@ -110,18 +115,18 @@ class SqlAlchemySessionProvider:
  self.url = state["url"]
  self._init_engine()
 
- def _close_engine(self):
- if hasattr(self, "session"):
- self.session.close()
- self.engine.dispose()
-
  def __del__(self):
- self._close_engine()
+ self.close()
 
  def reset(self):
- self._close_engine()
+ self.close()
  self._init_engine()
 
+ def close(self):
+ if hasattr(self, "session"):
+ self.session.close()
+ self.engine.dispose()
+
  def get(self):
  return self.session
 
@@ -138,8 +143,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def session(self):
  return self.session_provider.get()
 
+ @property
+ def dialect(self) -> Dialect:
+ return self.session_provider.dialect
+
  def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
- dialect = self.session.bind.dialect.name
+ dialect = self.dialect.name
  if dialect == "mysql":
  from sqlalchemy.dialects.mysql import insert
  elif dialect == "postgresql":
@@ -183,7 +192,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  else:
  query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
- dialect = self.session.bind.dialect.name
+ dialect = self.dialect.name
 
  if not isinstance(selector, list):
  where, selector = selector.split("where")
@@ -199,9 +208,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  if not selectors:
  raise ValueError("Selectors must contain at least one item")
 
- attribute_keys = selectors[
- 0
- ].filtered_attributes.keys() # Assume all selectors have the same keys
  attribute_sets = {
  tuple(selector.filtered_attributes.items()) for selector in selectors
  }
@@ -249,7 +255,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
  return query
 
- def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
+ def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
  if not dataset_ids:
  return []
 
@@ -303,6 +309,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  )
  return datasets
 
+ def _debug_query(self, q: Query):
+ text_ = q.statement.compile(
+ compile_kwargs={"literal_binds": True}, dialect=self.dialect
+ )
+ logger.debug(f"Running query: {text_}")
+
  def get_dataset_collection(
  self,
  bucket: str,
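The new _debug_query helper logs the fully compiled SQL at DEBUG level through the module logger added above. A minimal sketch for surfacing that output with the standard logging module (the logger name follows from logging.getLogger(__name__) and the module path):

import logging

# Show the SQL compiled by _debug_query in the sqlalchemy repository module
logging.basicConfig(level=logging.INFO)
logging.getLogger("ingestify.infra.store.dataset.sqlalchemy.repository").setLevel(logging.DEBUG)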
@@ -322,22 +334,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  selector=selector,
  )
 
- if not metadata_only:
- dataset_query = apply_query_filter(
- self.session.query(dataset_table.c.dataset_id)
- )
- dataset_ids = [row.dataset_id for row in dataset_query]
- datasets = self.load_datasets(dataset_ids)
- else:
- datasets = []
+ with self.session:
+ # Use a contextmanager to make sure it's closed afterwards
 
- metadata_result_row = apply_query_filter(
- self.session.query(
- func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
- func.count().label("row_count"),
- )
- ).first()
- dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
+ if not metadata_only:
+ dataset_query = apply_query_filter(
+ self.session.query(dataset_table.c.dataset_id)
+ )
+ self._debug_query(dataset_query)
+ dataset_ids = [row.dataset_id for row in dataset_query]
+ datasets = self._load_datasets(dataset_ids)
+
+ dataset_collection_metadata = DatasetCollectionMetadata(
+ last_modified=max(dataset.last_modified_at for dataset in datasets)
+ if datasets
+ else None,
+ row_count=len(datasets),
+ )
+ else:
+ datasets = []
+
+ metadata_result_query = apply_query_filter(
+ self.session.query(
+ func.max(dataset_table.c.last_modified_at).label(
+ "last_modified_at"
+ ),
+ func.count().label("row_count"),
+ )
+ )
+
+ self._debug_query(metadata_result_query)
+
+ dataset_collection_metadata = DatasetCollectionMetadata(
+ *metadata_result_query.first()
+ )
 
  return DatasetCollection(dataset_collection_metadata, datasets)
 
@@ -350,6 +380,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def connect(self):
  return self.session_provider.engine.connect()
 
+ def __del__(self):
+ self.session_provider.close()
+
  def _save(self, datasets: list[Dataset]):
  """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
  datasets_entities = []
ingestify/main.py CHANGED
@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
  return import_cls(key)
 
 
- def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
+ def get_engine(
+ config_file, bucket: Optional[str] = None, disable_events: bool = False
+ ) -> IngestionEngine:
  config = parse_config(config_file, default_value="")
 
  logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
  # Setup an EventBus and wire some more components
  event_bus = EventBus()
- publisher = Publisher()
- for subscriber in config.get("event_subscribers", []):
- cls = get_event_subscriber_cls(subscriber["type"])
- publisher.add_subscriber(cls(store))
- event_bus.register(publisher)
+ if not disable_events:
+ # When we disable all events we don't register any publishers
+ publisher = Publisher()
+ for subscriber in config.get("event_subscribers", []):
+ cls = get_event_subscriber_cls(subscriber["type"])
+ publisher.add_subscriber(cls(store))
+ event_bus.register(publisher)
+ else:
+ logger.info("Disabling all event handlers")
+
  store.set_event_bus(event_bus)
 
  ingestion_engine = IngestionEngine(
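The new disable_events switch (wired to the --disable-events CLI flag above) is also available programmatically through get_engine. A minimal sketch, assuming a hypothetical config.yaml in the working directory:

from ingestify.main import get_engine

# disable_events=True skips registering event subscribers on the EventBus;
# "config.yaml" is a hypothetical path to an Ingestify config file
engine = get_engine("config.yaml", disable_events=True)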
ingestify-0.4.1.dist-info/METADATA → ingestify-0.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.4.1
+ Version: 0.4.2
  Summary: Data Ingestion Framework
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
ingestify-0.4.1.dist-info/RECORD → ingestify-0.4.2.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
- ingestify/__init__.py,sha256=xCS7JQ_JaB6zVzrq6WUeAZyNxVKJEOc7AKh-3vY_Ji8,301
- ingestify/cmdline.py,sha256=oagUe-Jup1SU3s6jVl25f0cSG0wlNYhxFY-gGBwWmr0,7482
+ ingestify/__init__.py,sha256=x4r1Cw7NXlEu1lunx4jwI0b3SZ7MhTbWSVlHStDtVaI,301
+ ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
- ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
+ ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
  ingestify/utils.py,sha256=6BqgEZjecLW_anqYP5WrFpi93bmdhF-EdrebEkm59Ds,6806
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
  ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
- ingestify/application/loader.py,sha256=2LpYSHvedsoe5wFsIkQv0xLcKcqtebwVOSPWapAypao,7566
+ ingestify/application/loader.py,sha256=Lg3qPLaeKOFGheeqqfVeCBEF3cn61oZThgYYHoqfOvQ,7694
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
  ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=H9jfbbWFZw73nxMOW0480LgSHV-o4sA5IcvpUZmFpS4,13140
- ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=Xvmtu0BwE9C7FxBl6D8tN49I6--E_RngcMfWeme4DPA,4499
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=Xprxv3SiMrJ5efleEbH2HS6MxZdMqDd7Pw2qp-yiM2U,13452
+ ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=MYd0-IYbEtAp4VWAXLA0xnyat1e52VNOevDZo3M4jg0,4499
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
  ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -64,7 +64,7 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=3xDTqEEy_MxZoIX9qezpXasOFW7NMmduJEaR0PwTZXk,16110
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=ope_F-PVkXVo_oiUmsYdbUplC9aUnrTe4anlou-Y-y8,17078
  ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=OLB1FMElb3gSAnOsKX-oiLl_YVXaVEa6Q29QoHp2okU,10602
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
- ingestify-0.4.1.dist-info/METADATA,sha256=Tz062FbilTuQmmW2FPyr2sj0GIK1vjtZs189R5bkxEM,18854
- ingestify-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- ingestify-0.4.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
- ingestify-0.4.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
- ingestify-0.4.1.dist-info/RECORD,,
+ ingestify-0.4.2.dist-info/METADATA,sha256=E_if9fF-7cbW-CD3a4aQyinXPCgna-ZEv4mg_sTyl-0,18854
+ ingestify-0.4.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ ingestify-0.4.2.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ ingestify-0.4.2.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ ingestify-0.4.2.dist-info/RECORD,,