ingestify 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.4.1"
+ __version__ = "0.5.0"
ingestify/application/loader.py CHANGED
@@ -35,11 +35,8 @@ class Loader:
  provider: Optional[str] = None,
  source: Optional[str] = None,
  ):
- # First collect all selectors, before discovering datasets
- selectors = {}
+ ingestion_plans = []
  for ingestion_plan in self.ingestion_plans:
- logger.info(f"Determining selectors for {ingestion_plan}")
-
  if provider is not None:
  if ingestion_plan.source.provider != provider:
  logger.info(
@@ -54,6 +51,13 @@ class Loader:
  )
  continue

+ ingestion_plans.append(ingestion_plan)
+
+ # First collect all selectors, before discovering datasets
+ selectors = {}
+ for ingestion_plan in ingestion_plans:
+ logger.info(f"Determining selectors for {ingestion_plan}")
+
  static_selectors = [
  selector
  for selector in ingestion_plan.selectors
ingestify/cmdline.py CHANGED
@@ -88,6 +88,14 @@ def cli():
  help="Source - only run tasks for a single source",
  type=str,
  )
+ @click.option(
+ "--disable-events",
+ "disable_events",
+ required=False,
+ help="Disable events - disable all event handlers",
+ is_flag=True,
+ type=bool,
+ )
  def run(
  config_file: str,
  bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
  provider: Optional[str],
  source: Optional[str],
  debug: Optional[bool],
+ disable_events: Optional[bool],
  ):
  try:
- engine = get_engine(config_file, bucket)
+ engine = get_engine(config_file, bucket, disable_events=disable_events)
  except ConfigurationError as e:
  if debug:
  raise
ingestify/domain/models/ingestion/ingestion_job.py CHANGED
@@ -218,7 +218,7 @@ class IngestionJob:
  # Process all items in batches. Yield a IngestionJobSummary per batch

  logger.info("Finding metadata")
- with ingestion_job_summary.record_timing("get_dataset_collection"):
+ with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
  dataset_collection_metadata = store.get_dataset_collection(
  dataset_type=self.ingestion_plan.dataset_type,
  provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
  # 1. The discover_datasets returns a list, and the entire list can be processed at once
  # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
  try:
+ logger.info(f"Finding datasets for selector={self.selector}")
  with ingestion_job_summary.record_timing("find_datasets"):
  dataset_resources = self.ingestion_plan.source.find_datasets(
  dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@ class IngestionJob:
  yield ingestion_job_summary
  return

+ logger.info("Starting tasks")
+
  finish_task_timer = ingestion_job_summary.start_timing("tasks")

  while True:
@@ -273,15 +276,18 @@ class IngestionJob:
  for dataset_resource in batch
  ]

- # Load all available datasets based on the discovered dataset identifiers
- dataset_collection = store.get_dataset_collection(
- dataset_type=self.ingestion_plan.dataset_type,
- # Assume all DatasetResources share the same provider
- provider=batch[0].provider,
- selector=dataset_identifiers,
- )
+ logger.info(f"Searching for existing Datasets for DatasetResources")
+
+ with ingestion_job_summary.record_timing("get_dataset_collection"):
+ # Load all available datasets based on the discovered dataset identifiers
+ dataset_collection = store.get_dataset_collection(
+ dataset_type=self.ingestion_plan.dataset_type,
+ # Assume all DatasetResources share the same provider
+ provider=batch[0].provider,
+ selector=dataset_identifiers,
+ )

- skipped_datasets = 0
+ skipped_tasks = 0

  task_set = TaskSet()
  for dataset_resource in batch:
@@ -301,7 +307,7 @@ class IngestionJob:
  )
  )
  else:
- skipped_datasets += 1
+ skipped_tasks += 1
  else:
  if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
  task_set.add(
@@ -311,12 +317,12 @@ class IngestionJob:
  )
  )
  else:
- skipped_datasets += 1
+ skipped_tasks += 1

  if task_set:
  logger.info(
  f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
- f"using selector {self.selector} => {len(task_set)} tasks. {skipped_datasets} skipped."
+ f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
  )
  logger.info(f"Running {len(task_set)} tasks")
  ingestion_job_summary.add_task_summaries(
@@ -328,7 +334,7 @@ class IngestionJob:
  f"using selector {self.selector} => nothing to do"
  )

- ingestion_job_summary.increase_skipped_datasets(skipped_datasets)
+ ingestion_job_summary.increase_skipped_tasks(skipped_tasks)

  if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
  finish_task_timer()
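
The hunks above lean on the summary's timing helpers: record_timing(name) is used as a context manager, while start_timing(name) hands back a finish callback that is invoked once a chunk of tasks is done. As background only, here is a minimal sketch of that pattern; it is not ingestify's actual HasTiming implementation:

import time
from contextlib import contextmanager


class TimingsSketch:
    """Illustrative stand-in for a timing mixin: collects named durations."""

    def __init__(self):
        self.timings = []  # list of (name, seconds) tuples

    def start_timing(self, name):
        started = time.monotonic()

        def finish():
            # Invoked later, e.g. when the batch loop for this chunk ends.
            self.timings.append((name, time.monotonic() - started))

        return finish

    @contextmanager
    def record_timing(self, name):
        # Context-manager form: `with summary.record_timing("find_datasets"): ...`
        finish = self.start_timing(name)
        try:
            yield
        finally:
            finish()
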
ingestify/domain/models/ingestion/ingestion_job_summary.py CHANGED
@@ -41,7 +41,8 @@ class IngestionJobSummary(BaseModel, HasTiming):
  state: IngestionJobState = IngestionJobState.RUNNING
  task_summaries: List[TaskSummary] = Field(default_factory=list)

- skipped_datasets: int = 0
+ total_tasks: int = 0
+ skipped_tasks: int = 0
  failed_tasks: int = 0
  successful_tasks: int = 0
  ignored_successful_tasks: int = 0
@@ -62,11 +63,11 @@ class IngestionJobSummary(BaseModel, HasTiming):
  def add_task_summaries(self, task_summaries: List[TaskSummary]):
  self.task_summaries.extend(task_summaries)

- def increase_skipped_datasets(self, skipped_datasets: int):
- self.skipped_datasets += skipped_datasets
+ def increase_skipped_tasks(self, skipped_tasks: int):
+ self.skipped_tasks += skipped_tasks

  def task_count(self):
- return len(self.task_summaries)
+ return len(self.task_summaries) + self.skipped_tasks

  def _set_ended(self):
  self.failed_tasks = len(
@@ -82,6 +83,12 @@ class IngestionJobSummary(BaseModel, HasTiming):
  if task.state == TaskState.FINISHED_IGNORED
  ]
  )
+ self.total_tasks = (
+ self.failed_tasks
+ + self.successful_tasks
+ + self.ignored_successful_tasks
+ + self.skipped_tasks
+ )
  self.ended_at = utcnow()

  # Only keep failed tasks. Rest isn't interesting
@@ -106,22 +113,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
  f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
  )
  print("********************************")
- print(f"* - IngestionPlan:")
- print(f"* Source: {self.source_name}")
- print(f"* Provider: {self.provider}")
- print(f"* DatasetType: {self.dataset_type}")
- print(f"* - Selector: {self.selector}")
- print(f"* - Timings: ")
+ print(f" - IngestionPlan:")
+ print(f" Source: {self.source_name}")
+ print(f" Provider: {self.provider}")
+ print(f" DatasetType: {self.dataset_type}")
+ print(f" - Selector: {self.selector}")
+ print(f" - Timings: ")
  for timing in self.timings:
- print(f"* - {timing.name}: {format_duration(timing.duration)}")
+ print(f" - {timing.name}: {format_duration(timing.duration)}")
  print(
- f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+ f" - Tasks: {self.total_tasks} - {(self.total_tasks / self.duration.total_seconds()):.1f} tasks/sec"
  )

- print(f"* - Failed tasks: {self.failed_tasks}")
- print(f"* - Successful tasks: {self.successful_tasks}")
- print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
- print(f"* - Skipped datasets: {self.skipped_datasets}")
+ print(f" - Failed tasks: {self.failed_tasks}")
+ print(f" - Successful tasks: {self.successful_tasks}")
+ print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
+ print(f" - Skipped datasets: {self.skipped_tasks}")
  print("********************************")

  def __enter__(self):
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -1,4 +1,5 @@
  import itertools
+ import logging
  import uuid
  from typing import Optional, Union, List

@@ -14,10 +15,11 @@ from sqlalchemy import (
  and_,
  Column,
  or_,
+ Dialect,
  )
  from sqlalchemy.engine import make_url
  from sqlalchemy.exc import NoSuchModuleError
- from sqlalchemy.orm import Session
+ from sqlalchemy.orm import Session, Query, sessionmaker, scoped_session

  from ingestify.domain import File, Revision
  from ingestify.domain.models import (
@@ -32,6 +34,7 @@ from ingestify.domain.models.dataset.collection_metadata import (
  from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
  from ingestify.domain.models.task.task_summary import TaskSummary
  from ingestify.exceptions import IngestifyError
+ from ingestify.utils import get_concurrency

  from .tables import (
  metadata,
@@ -42,6 +45,8 @@ from .tables import (
  task_summary_table,
  )

+ logger = logging.getLogger(__name__)
+

  def parse_value(v):
  try:
@@ -92,16 +97,15 @@ class SqlAlchemySessionProvider:
  self.url,
  # Use the default isolation level, don't need SERIALIZABLE
  # isolation_level="SERIALIZABLE",
+ pool_size=get_concurrency(), # Maximum number of connections in the pool
+ max_overflow=5,
+ pool_recycle=1800,
+ pool_pre_ping=True,
  )
- self.session = Session(bind=self.engine)
-
- def __init__(self, url: str):
- url = self.fix_url(url)
-
- self.url = url
- self._init_engine()
+ self.dialect = self.engine.dialect

- metadata.create_all(self.engine)
+ session_factory = sessionmaker(bind=self.engine)
+ self.session = scoped_session(session_factory)

  def __getstate__(self):
  return {"url": self.url}
@@ -110,20 +114,27 @@ class SqlAlchemySessionProvider:
  self.url = state["url"]
  self._init_engine()

- def _close_engine(self):
- if hasattr(self, "session"):
- self.session.close()
- self.engine.dispose()
+ def __init__(self, url: str):
+ url = self.fix_url(url)
+
+ self.url = url
+ self._init_engine()
+
+ metadata.create_all(self.engine)

  def __del__(self):
- self._close_engine()
+ self.close()

  def reset(self):
- self._close_engine()
+ self.close()
  self._init_engine()

+ def close(self):
+ if hasattr(self, "engine"):
+ self.engine.dispose()
+
  def get(self):
- return self.session
+ return self.session()


  def in_(column: Column, values):
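
The provider now keeps a single Engine with an explicitly sized connection pool and hands out sessions through a scoped_session registry (note get() returning self.session(), i.e. a thread-local Session) instead of sharing one long-lived Session across threads. A minimal standalone sketch of that SQLAlchemy pattern, with an illustrative URL and pool numbers rather than ingestify's actual configuration:

from sqlalchemy import create_engine, text
from sqlalchemy.orm import scoped_session, sessionmaker

# One engine per process; its connection pool is shared by all worker threads.
engine = create_engine(
    "postgresql+psycopg2://user:secret@localhost/example",  # hypothetical URL
    pool_size=8,          # steady-state connections kept open
    max_overflow=5,       # extra connections allowed under burst load
    pool_recycle=1800,    # drop connections older than 30 minutes
    pool_pre_ping=True,   # validate a connection before handing it out
)

session_factory = sessionmaker(bind=engine)
session = scoped_session(session_factory)  # registry of thread-local sessions


def count_rows():
    # Each thread calling session() gets its own Session bound to the shared engine.
    with session() as s:
        return s.execute(text("SELECT count(*) FROM example_table")).scalar_one()
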
@@ -138,8 +149,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def session(self):
  return self.session_provider.get()

+ @property
+ def dialect(self) -> Dialect:
+ return self.session_provider.dialect
+
  def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
- dialect = self.session.bind.dialect.name
+ dialect = self.dialect.name
  if dialect == "mysql":
  from sqlalchemy.dialects.mysql import insert
  elif dialect == "postgresql":
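
The change above only swaps where the dialect name comes from (the cached engine dialect instead of the session's bind); the upsert statement itself lies outside this hunk. For context, a minimal sketch of a dialect-specific upsert as SQLAlchemy exposes it for PostgreSQL, using a hypothetical table rather than ingestify's real schema (MySQL would use on_duplicate_key_update instead):

from sqlalchemy import Column, MetaData, String, Table
from sqlalchemy.dialects.postgresql import insert

metadata_sketch = MetaData()
example_table = Table(  # hypothetical table, for illustration only
    "example",
    metadata_sketch,
    Column("id", String, primary_key=True),
    Column("name", String),
)


def upsert_rows(connection, rows: list[dict]):
    # Emits INSERT ... ON CONFLICT (id) DO UPDATE SET name = excluded.name
    insert_stmt = insert(example_table).values(rows)
    upsert_stmt = insert_stmt.on_conflict_do_update(
        index_elements=[example_table.c.id],
        set_={"name": insert_stmt.excluded.name},
    )
    connection.execute(upsert_stmt)
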
@@ -183,7 +198,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  else:
  query = query.filter(dataset_table.c.dataset_id == dataset_id)

- dialect = self.session.bind.dialect.name
+ dialect = self.dialect.name

  if not isinstance(selector, list):
  where, selector = selector.split("where")
@@ -199,9 +214,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  if not selectors:
  raise ValueError("Selectors must contain at least one item")

- attribute_keys = selectors[
- 0
- ].filtered_attributes.keys() # Assume all selectors have the same keys
  attribute_sets = {
  tuple(selector.filtered_attributes.items()) for selector in selectors
  }
@@ -249,7 +261,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):

  return query

- def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
+ def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
  if not dataset_ids:
  return []

@@ -303,6 +315,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  )
  return datasets

+ def _debug_query(self, q: Query):
+ text_ = q.statement.compile(
+ compile_kwargs={"literal_binds": True}, dialect=self.dialect
+ )
+ logger.debug(f"Running query: {text_}")
+
  def get_dataset_collection(
  self,
  bucket: str,
@@ -322,22 +340,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  selector=selector,
  )

- if not metadata_only:
- dataset_query = apply_query_filter(
- self.session.query(dataset_table.c.dataset_id)
- )
- dataset_ids = [row.dataset_id for row in dataset_query]
- datasets = self.load_datasets(dataset_ids)
- else:
- datasets = []
+ with self.session:
+ # Use a contextmanager to make sure it's closed afterwards

- metadata_result_row = apply_query_filter(
- self.session.query(
- func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
- func.count().label("row_count"),
- )
- ).first()
- dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
+ if not metadata_only:
+ dataset_query = apply_query_filter(
+ self.session.query(dataset_table.c.dataset_id)
+ )
+ self._debug_query(dataset_query)
+ dataset_ids = [row.dataset_id for row in dataset_query]
+ datasets = self._load_datasets(dataset_ids)
+
+ dataset_collection_metadata = DatasetCollectionMetadata(
+ last_modified=max(dataset.last_modified_at for dataset in datasets)
+ if datasets
+ else None,
+ row_count=len(datasets),
+ )
+ else:
+ datasets = []
+
+ metadata_result_query = apply_query_filter(
+ self.session.query(
+ func.max(dataset_table.c.last_modified_at).label(
+ "last_modified_at"
+ ),
+ func.count().label("row_count"),
+ )
+ )
+
+ self._debug_query(metadata_result_query)
+
+ dataset_collection_metadata = DatasetCollectionMetadata(
+ *metadata_result_query.first()
+ )

  return DatasetCollection(dataset_collection_metadata, datasets)

@@ -350,6 +386,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def connect(self):
  return self.session_provider.engine.connect()

+ def __del__(self):
+ self.session_provider.close()
+
  def _save(self, datasets: list[Dataset]):
  """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
  datasets_entities = []
ingestify/infra/store/dataset/sqlalchemy/tables.py CHANGED
@@ -247,9 +247,10 @@ ingestion_job_summary_table = Table(
  Column("ended_at", TZDateTime(6)),
  # Some task counters
  Column("state", IngestionJobStateString),
+ Column("total_tasks", Integer),
  Column("successful_tasks", Integer),
  Column("ignored_successful_tasks", Integer),
- Column("skipped_datasets", Integer),
+ Column("skipped_tasks", Integer),
  Column("failed_tasks", Integer),
  Column(
  "timings",
ingestify/infra/store/file/s3_file_repository.py CHANGED
@@ -2,9 +2,11 @@ from pathlib import Path
  from typing import BinaryIO

  import boto3 as boto3
+ import botocore.config

  from ingestify.domain import Dataset
  from ingestify.domain.models import FileRepository
+ from ingestify.utils import get_concurrency


  class S3FileRepository(FileRepository):
@@ -13,7 +15,10 @@ class S3FileRepository(FileRepository):
  @property
  def s3(self):
  if not self._s3:
- self._s3 = boto3.resource("s3")
+ client_config = botocore.config.Config(
+ max_pool_connections=get_concurrency(),
+ )
+ self._s3 = boto3.resource("s3", config=client_config)
  return self._s3

  def __getstate__(self):
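
With task execution moving to a thread pool (see ingestify/utils.py below), botocore's default HTTP pool of 10 connections can become the bottleneck and log "Connection pool is full, discarding connection" warnings; sizing max_pool_connections to the worker count avoids that. A minimal sketch of the same idea outside ingestify, with a hypothetical bucket and worker count:

import boto3
import botocore.config

worker_count = 16  # hypothetical; ingestify derives this from get_concurrency()

# Allow one pooled HTTPS connection per worker thread.
s3 = boto3.resource(
    "s3",
    config=botocore.config.Config(max_pool_connections=worker_count),
)

# Each thread can now upload concurrently without fighting over connections.
s3.Bucket("example-bucket").upload_file("local.json", "remote/key.json")
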
ingestify/main.py CHANGED
@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
  return import_cls(key)


- def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
+ def get_engine(
+ config_file, bucket: Optional[str] = None, disable_events: bool = False
+ ) -> IngestionEngine:
  config = parse_config(config_file, default_value="")

  logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:

  # Setup an EventBus and wire some more components
  event_bus = EventBus()
- publisher = Publisher()
- for subscriber in config.get("event_subscribers", []):
- cls = get_event_subscriber_cls(subscriber["type"])
- publisher.add_subscriber(cls(store))
- event_bus.register(publisher)
+ if not disable_events:
+ # When we disable all events we don't register any publishers
+ publisher = Publisher()
+ for subscriber in config.get("event_subscribers", []):
+ cls = get_event_subscriber_cls(subscriber["type"])
+ publisher.add_subscriber(cls(store))
+ event_bus.register(publisher)
+ else:
+ logger.info("Disabling all event handlers")
+
  store.set_event_bus(event_bus)

  ingestion_engine = IngestionEngine(
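
Programmatic callers get the same switch as the new --disable-events CLI flag. A minimal sketch of how it might be called, with a hypothetical config path:

from ingestify.main import get_engine

# Build an engine without registering any event subscribers, e.g. for a
# one-off backfill where the configured event handlers should stay silent.
engine = get_engine("config.yaml", disable_events=True)  # hypothetical path
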
ingestify/utils.py CHANGED
@@ -3,6 +3,7 @@ import os
  import time
  import re
  import traceback
+ from concurrent.futures import ThreadPoolExecutor
  from contextlib import contextmanager
  from multiprocessing import get_context, cpu_count, get_all_start_methods

@@ -137,59 +138,65 @@ def map_in_pool(func, iterable, processes=0):
  )


- class SyncPool:
+ class SyncExecutor:
  def map(self, func, iterable):
  return [func(item) for item in iterable]

- def join(self):
- return True
+ def __enter__(self):
+ return self

- def close(self):
- return True
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass


- class DummyPool:
+ class DummyExecutor:
  def map(self, func, iterable):
  logger.info(f"DummyPool: not running {len(list(iterable))} tasks")
  return None

- def join(self):
- return True
+ def __enter__(self):
+ return self

- def close(self):
- return True
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass


  class TaskExecutor:
  def __init__(self, processes=0, dry_run: bool = False):
  if dry_run:
- pool = DummyPool()
+ executor = DummyExecutor()
  elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
- pool = SyncPool()
+ executor = SyncExecutor()
  else:
  if not processes:
- processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+ processes = get_concurrency()
+
+ # if "fork" in get_all_start_methods():
+ # ctx = get_context("fork")
+ # else:
+ # ctx = get_context("spawn")

- if "fork" in get_all_start_methods():
- ctx = get_context("fork")
- else:
- ctx = get_context("spawn")
+ # pool = ctx.Pool(processes or cpu_count())

- pool = ctx.Pool(processes or cpu_count())
- self.pool = pool
+ executor = ThreadPoolExecutor(max_workers=processes)
+
+ self.executor = executor

  def __enter__(self):
+ self.executor.__enter__()
  return self

  def __exit__(self, exc_type, exc_val, exc_tb):
- self.join()
+ self.executor.__exit__(exc_type, exc_val, exc_tb)

  def run(self, func, iterable):
- wrapped_fn = cloudpickle.dumps(func)
+ # If multiprocessing
+ # wrapped_fn = cloudpickle.dumps(func)
+ # res = self.executor.map(
+ # cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
+ # )
  start_time = time.time()
- res = self.pool.map(
- cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
- )
+ res = list(self.executor.map(func, iterable))
  if res:
  took = time.time() - start_time
  logger.info(
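
TaskExecutor is now backed by concurrent.futures.ThreadPoolExecutor rather than a multiprocessing pool, so task callables no longer need to go through cloudpickle and the executor participates directly in the with-statement. A minimal usage sketch with a hypothetical task function:

from ingestify.utils import TaskExecutor


def run_task(task):
    # Hypothetical task callable; ingestify submits its own Task objects here.
    return task * 2


# Worker count defaults to get_concurrency(); INGESTIFY_RUN_EAGER=true would
# swap in the inline SyncExecutor, and dry_run=True the DummyExecutor.
with TaskExecutor() as executor:
    results = executor.run(run_task, [1, 2, 3])

print(results)  # [2, 4, 6]
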
@@ -197,10 +204,6 @@ class TaskExecutor:
  )
  return res

- def join(self):
- self.pool.close()
- self.pool.join()
-


  def try_number(s: str):
@@ -253,3 +256,10 @@ class HasTiming:
  self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

  return finish
+
+
+ def get_concurrency():
+ concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+ if not concurrency:
+ concurrency = min(32, (os.cpu_count() or 1) + 4)
+ return concurrency
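
get_concurrency() is now the single knob shared by the thread pool, the SQLAlchemy pool_size, and the S3 connection pool; its fallback of min(32, cpu_count + 4) mirrors ThreadPoolExecutor's default max_workers heuristic. A small sketch of how the override behaves:

import os

from ingestify.utils import get_concurrency

print(get_concurrency())  # e.g. 12 on an 8-core machine: min(32, 8 + 4)

os.environ["INGESTIFY_CONCURRENCY"] = "4"
print(get_concurrency())  # 4 - the environment variable takes precedence
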
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.4.1
+ Version: 0.5.0
  Summary: Data Ingestion Framework
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
@@ -1,14 +1,14 @@
- ingestify/__init__.py,sha256=xCS7JQ_JaB6zVzrq6WUeAZyNxVKJEOc7AKh-3vY_Ji8,301
- ingestify/cmdline.py,sha256=oagUe-Jup1SU3s6jVl25f0cSG0wlNYhxFY-gGBwWmr0,7482
+ ingestify/__init__.py,sha256=6SmxhtKjGRDG31Ij8xc2i9L-7qC3qjA5DE89jQoD48Q,301
+ ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
- ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
+ ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
- ingestify/utils.py,sha256=6BqgEZjecLW_anqYP5WrFpi93bmdhF-EdrebEkm59Ds,6806
+ ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
  ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
- ingestify/application/loader.py,sha256=2LpYSHvedsoe5wFsIkQv0xLcKcqtebwVOSPWapAypao,7566
+ ingestify/application/loader.py,sha256=Lg3qPLaeKOFGheeqqfVeCBEF3cn61oZThgYYHoqfOvQ,7694
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
  ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=H9jfbbWFZw73nxMOW0480LgSHV-o4sA5IcvpUZmFpS4,13140
- ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=Xvmtu0BwE9C7FxBl6D8tN49I6--E_RngcMfWeme4DPA,4499
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=-SxHunvtG8J2u8LwXacF26oItwMkLJN7Suelt-hjHgk,13434
+ ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
  ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -64,12 +64,12 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=3xDTqEEy_MxZoIX9qezpXasOFW7NMmduJEaR0PwTZXk,16110
- ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=OLB1FMElb3gSAnOsKX-oiLl_YVXaVEa6Q29QoHp2okU,10602
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=mIF7ly-lyCSNJQeem2Dpxlllzn34MxEA97qV929ARDY,17361
+ ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
  ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
- ingestify/infra/store/file/s3_file_repository.py,sha256=Zu7j3qqeQhKi9Lx8UQRKZ2g1vT0h0OucOaHjq0uZpFs,1290
+ ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
  ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
  ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
- ingestify-0.4.1.dist-info/METADATA,sha256=Tz062FbilTuQmmW2FPyr2sj0GIK1vjtZs189R5bkxEM,18854
- ingestify-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- ingestify-0.4.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
- ingestify-0.4.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
- ingestify-0.4.1.dist-info/RECORD,,
+ ingestify-0.5.0.dist-info/METADATA,sha256=EsJsolUWxelVsEOhLUyiut_tKPYfqHx9Pvvg_T-HFG4,18854
+ ingestify-0.5.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ ingestify-0.5.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ ingestify-0.5.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ ingestify-0.5.0.dist-info/RECORD,,