ingestify 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/loader.py +8 -4
- ingestify/cmdline.py +10 -1
- ingestify/domain/models/ingestion/ingestion_job.py +19 -13
- ingestify/domain/models/ingestion/ingestion_job_summary.py +23 -16
- ingestify/infra/store/dataset/sqlalchemy/repository.py +76 -37
- ingestify/infra/store/dataset/sqlalchemy/tables.py +2 -1
- ingestify/infra/store/file/s3_file_repository.py +6 -1
- ingestify/main.py +13 -6
- ingestify/utils.py +38 -28
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/METADATA +1 -1
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/RECORD +15 -15
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/WHEEL +0 -0
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED
ingestify/application/loader.py
CHANGED
@@ -35,11 +35,8 @@ class Loader:
         provider: Optional[str] = None,
         source: Optional[str] = None,
     ):
-
-        selectors = {}
+        ingestion_plans = []
         for ingestion_plan in self.ingestion_plans:
-            logger.info(f"Determining selectors for {ingestion_plan}")
-
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
                     logger.info(
@@ -54,6 +51,13 @@
                     )
                     continue
 
+            ingestion_plans.append(ingestion_plan)
+
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
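The refactor splits one loop into two passes: plans are filtered first, and selector collection only runs over the surviving plans, so every selector is known before any dataset discovery starts. A runnable sketch of that shape, using illustrative dicts rather than ingestify's plan objects:

all_plans = [
    {"provider": "statsbomb", "selectors": ["competition_id=11"]},
    {"provider": "wyscout", "selectors": ["competition_id=524"]},
]
provider = "statsbomb"  # e.g. from the --provider CLI option

# Pass 1: keep only matching plans
ingestion_plans = [
    plan for plan in all_plans if provider is None or plan["provider"] == provider
]

# Pass 2: collect selectors from the filtered plans
selectors = {}
for plan in ingestion_plans:
    for selector in plan["selectors"]:
        selectors.setdefault(selector, []).append(plan["provider"])

print(selectors)  # {'competition_id=11': ['statsbomb']}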
ingestify/cmdline.py
CHANGED
@@ -88,6 +88,14 @@ def cli():
     help="Source - only run tasks for a single source",
     type=str,
 )
+@click.option(
+    "--disable-events",
+    "disable_events",
+    required=False,
+    help="Disable events - disable all event handlers",
+    is_flag=True,
+    type=bool,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
@@ -95,9 +103,10 @@ def run(
     provider: Optional[str],
     source: Optional[str],
     debug: Optional[bool],
+    disable_events: Optional[bool],
 ):
     try:
-        engine = get_engine(config_file, bucket)
+        engine = get_engine(config_file, bucket, disable_events=disable_events)
     except ConfigurationError as e:
         if debug:
             raise
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -218,7 +218,7 @@ class IngestionJob:
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
         logger.info("Finding metadata")
-        with ingestion_job_summary.record_timing("
+        with ingestion_job_summary.record_timing("get_dataset_collection_metadata"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
                 provider=self.ingestion_plan.source.provider,
@@ -232,6 +232,7 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         try:
+            logger.info(f"Finding datasets for selector={self.selector}")
             with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
@@ -249,6 +250,8 @@
             yield ingestion_job_summary
             return
 
+        logger.info("Starting tasks")
+
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
         while True:
@@ -273,15 +276,18 @@
                 for dataset_resource in batch
             ]
 
-
-
-
-            #
-
-
-
+            logger.info(f"Searching for existing Datasets for DatasetResources")
+
+            with ingestion_job_summary.record_timing("get_dataset_collection"):
+                # Load all available datasets based on the discovered dataset identifiers
+                dataset_collection = store.get_dataset_collection(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    # Assume all DatasetResources share the same provider
+                    provider=batch[0].provider,
+                    selector=dataset_identifiers,
+                )
 
-
+            skipped_tasks = 0
 
             task_set = TaskSet()
             for dataset_resource in batch:
@@ -301,7 +307,7 @@
                         )
                     )
                 else:
-
+                    skipped_tasks += 1
             else:
                 if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
                     task_set.add(
@@ -311,12 +317,12 @@
                         )
                     )
                 else:
-
+                    skipped_tasks += 1
 
         if task_set:
             logger.info(
                 f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                f"using selector {self.selector} => {len(task_set)} tasks. {
+                f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
             )
             logger.info(f"Running {len(task_set)} tasks")
             ingestion_job_summary.add_task_summaries(
@@ -328,7 +334,7 @@
                 f"using selector {self.selector} => nothing to do"
             )
 
-        ingestion_job_summary.
+        ingestion_job_summary.increase_skipped_tasks(skipped_tasks)
 
         if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
             finish_task_timer()
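Two behavioral notes fall out of this section: existing datasets are now looked up once per batch (a single `get_dataset_collection` call keyed by every identifier in the batch, assuming one provider per batch), and resources the fetch policy rejects are counted in `skipped_tasks` instead of being silently dropped. A generic sketch of the per-batch lookup, with illustrative shapes rather than ingestify's resource objects:

batch = [
    {"provider": "statsbomb", "match_id": "3788741"},
    {"provider": "statsbomb", "match_id": "3788742"},
]

dataset_identifiers = [{"match_id": r["match_id"]} for r in batch]
provider = batch[0]["provider"]  # same single-provider assumption the diff comments on

# One lookup per batch instead of one per resource:
print(f"get_dataset_collection(provider={provider!r}, selector={dataset_identifiers})")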
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -41,7 +41,8 @@ class IngestionJobSummary(BaseModel, HasTiming):
     state: IngestionJobState = IngestionJobState.RUNNING
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
-
+    total_tasks: int = 0
+    skipped_tasks: int = 0
     failed_tasks: int = 0
     successful_tasks: int = 0
     ignored_successful_tasks: int = 0
@@ -62,11 +63,11 @@ class IngestionJobSummary(BaseModel, HasTiming):
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
-    def
-        self.
+    def increase_skipped_tasks(self, skipped_tasks: int):
+        self.skipped_tasks += skipped_tasks
 
     def task_count(self):
-        return len(self.task_summaries)
+        return len(self.task_summaries) + self.skipped_tasks
 
     def _set_ended(self):
         self.failed_tasks = len(
@@ -82,6 +83,12 @@ class IngestionJobSummary(BaseModel, HasTiming):
                 if task.state == TaskState.FINISHED_IGNORED
             ]
         )
+        self.total_tasks = (
+            self.failed_tasks
+            + self.successful_tasks
+            + self.ignored_successful_tasks
+            + self.skipped_tasks
+        )
         self.ended_at = utcnow()
 
         # Only keep failed tasks. Rest isn't interesting
@@ -106,22 +113,22 @@ class IngestionJobSummary(BaseModel, HasTiming):
             f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
         )
         print("********************************")
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - IngestionPlan:")
+        print(f"     Source: {self.source_name}")
+        print(f"     Provider: {self.provider}")
+        print(f"     DatasetType: {self.dataset_type}")
+        print(f" - Selector: {self.selector}")
+        print(f" - Timings: ")
         for timing in self.timings:
-            print(f"
+            print(f"     - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f"
+            f" - Tasks: {self.total_tasks} - {(self.total_tasks / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(f" - Failed tasks: {self.failed_tasks}")
+        print(f" - Successful tasks: {self.successful_tasks}")
+        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f" - Skipped datasets: {self.skipped_tasks}")
         print("********************************")
 
     def __enter__(self):
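`task_count()` now includes skipped tasks, so batches full of already-up-to-date datasets still advance toward `MAX_TASKS_PER_CHUNK`, and `total_tasks` is derived from the four counters when the job ends. A small arithmetic illustration with made-up values:

failed_tasks, successful_tasks, ignored_successful_tasks, skipped_tasks = 2, 40, 5, 13

# Mirrors _set_ended() in the diff above
total_tasks = failed_tasks + successful_tasks + ignored_successful_tasks + skipped_tasks
assert total_tasks == 60

# task_count() during the run: finished task summaries plus skipped tasks
task_summaries = 47  # stands in for len(self.task_summaries), made up
print(task_summaries + skipped_tasks)  # 60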
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -1,4 +1,5 @@
 import itertools
+import logging
 import uuid
 from typing import Optional, Union, List
 
@@ -14,10 +15,11 @@ from sqlalchemy import (
     and_,
     Column,
     or_,
+    Dialect,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session
+from sqlalchemy.orm import Session, Query, sessionmaker, scoped_session
 
 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
@@ -32,6 +34,7 @@ from ingestify.domain.models.dataset.collection_metadata import (
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
 from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
+from ingestify.utils import get_concurrency
 
 from .tables import (
     metadata,
@@ -42,6 +45,8 @@ from .tables import (
     task_summary_table,
 )
 
+logger = logging.getLogger(__name__)
+
 
 def parse_value(v):
     try:
@@ -92,16 +97,15 @@ class SqlAlchemySessionProvider:
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
+            pool_size=get_concurrency(),  # Maximum number of connections in the pool
+            max_overflow=5,
+            pool_recycle=1800,
+            pool_pre_ping=True,
         )
-        self.
-
-    def __init__(self, url: str):
-        url = self.fix_url(url)
-
-        self.url = url
-        self._init_engine()
+        self.dialect = self.engine.dialect
 
-
+        session_factory = sessionmaker(bind=self.engine)
+        self.session = scoped_session(session_factory)
 
     def __getstate__(self):
         return {"url": self.url}
@@ -110,20 +114,27 @@ class SqlAlchemySessionProvider:
         self.url = state["url"]
         self._init_engine()
 
-    def
-
-
-
+    def __init__(self, url: str):
+        url = self.fix_url(url)
+
+        self.url = url
+        self._init_engine()
+
+        metadata.create_all(self.engine)
 
     def __del__(self):
-        self.
+        self.close()
 
     def reset(self):
-        self.
+        self.close()
         self._init_engine()
 
+    def close(self):
+        if hasattr(self, "engine"):
+            self.engine.dispose()
+
     def get(self):
-        return self.session
+        return self.session()
 
 
 def in_(column: Column, values):
@@ -138,8 +149,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def session(self):
         return self.session_provider.get()
 
+    @property
+    def dialect(self) -> Dialect:
+        return self.session_provider.dialect
+
     def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
-        dialect = self.
+        dialect = self.dialect.name
         if dialect == "mysql":
             from sqlalchemy.dialects.mysql import insert
         elif dialect == "postgresql":
@@ -183,7 +198,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         else:
             query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
-        dialect = self.
+        dialect = self.dialect.name
 
         if not isinstance(selector, list):
             where, selector = selector.split("where")
@@ -199,9 +214,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not selectors:
             raise ValueError("Selectors must contain at least one item")
 
-        attribute_keys = selectors[
-            0
-        ].filtered_attributes.keys()  # Assume all selectors have the same keys
         attribute_sets = {
             tuple(selector.filtered_attributes.items()) for selector in selectors
         }
@@ -249,7 +261,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         return query
 
-    def
+    def _load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
         if not dataset_ids:
             return []
 
@@ -303,6 +315,12 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
         return datasets
 
+    def _debug_query(self, q: Query):
+        text_ = q.statement.compile(
+            compile_kwargs={"literal_binds": True}, dialect=self.dialect
+        )
+        logger.debug(f"Running query: {text_}")
+
     def get_dataset_collection(
         self,
         bucket: str,
@@ -322,22 +340,40 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             selector=selector,
         )
 
-
-
-            self.session.query(dataset_table.c.dataset_id)
-        )
-        dataset_ids = [row.dataset_id for row in dataset_query]
-        datasets = self.load_datasets(dataset_ids)
-        else:
-            datasets = []
+        with self.session:
+            # Use a contextmanager to make sure it's closed afterwards
 
-
-
-
-
-
-
-
+            if not metadata_only:
+                dataset_query = apply_query_filter(
+                    self.session.query(dataset_table.c.dataset_id)
+                )
+                self._debug_query(dataset_query)
+                dataset_ids = [row.dataset_id for row in dataset_query]
+                datasets = self._load_datasets(dataset_ids)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    last_modified=max(dataset.last_modified_at for dataset in datasets)
+                    if datasets
+                    else None,
+                    row_count=len(datasets),
+                )
+            else:
+                datasets = []
+
+                metadata_result_query = apply_query_filter(
+                    self.session.query(
+                        func.max(dataset_table.c.last_modified_at).label(
+                            "last_modified_at"
+                        ),
+                        func.count().label("row_count"),
+                    )
+                )
+
+                self._debug_query(metadata_result_query)
+
+                dataset_collection_metadata = DatasetCollectionMetadata(
+                    *metadata_result_query.first()
+                )
 
         return DatasetCollection(dataset_collection_metadata, datasets)
 
@@ -350,6 +386,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def connect(self):
         return self.session_provider.engine.connect()
 
+    def __del__(self):
+        self.session_provider.close()
+
     def _save(self, datasets: list[Dataset]):
         """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
         datasets_entities = []
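The provider now builds its sessions with `sessionmaker` wrapped in `scoped_session`, which matters once tasks run on a thread pool (see the utils.py changes below): each thread gets its own Session from the registry. A minimal sketch of that pattern, outside ingestify's classes:

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine("sqlite:///:memory:", pool_pre_ping=True)
session_factory = sessionmaker(bind=engine)
session = scoped_session(session_factory)

# Calling the registry returns the Session bound to the current thread,
# which is why get() changed from returning self.session to self.session().
s1 = session()
s2 = session()
assert s1 is s2   # same thread -> same Session
session.remove()  # drop the thread-local Session, releasing its connection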
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED

@@ -247,9 +247,10 @@ ingestion_job_summary_table = Table(
     Column("ended_at", TZDateTime(6)),
     # Some task counters
     Column("state", IngestionJobStateString),
+    Column("total_tasks", Integer),
     Column("successful_tasks", Integer),
     Column("ignored_successful_tasks", Integer),
-    Column("
+    Column("skipped_tasks", Integer),
     Column("failed_tasks", Integer),
     Column(
         "timings",
ingestify/infra/store/file/s3_file_repository.py
CHANGED

@@ -2,9 +2,11 @@ from pathlib import Path
 from typing import BinaryIO
 
 import boto3 as boto3
+import botocore.config
 
 from ingestify.domain import Dataset
 from ingestify.domain.models import FileRepository
+from ingestify.utils import get_concurrency
 
 
 class S3FileRepository(FileRepository):
@@ -13,7 +15,10 @@ class S3FileRepository(FileRepository):
     @property
     def s3(self):
         if not self._s3:
-
+            client_config = botocore.config.Config(
+                max_pool_connections=get_concurrency(),
+            )
+            self._s3 = boto3.resource("s3", config=client_config)
         return self._s3
 
     def __getstate__(self):
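Sizing the botocore connection pool to the shared concurrency value avoids urllib3's "Connection pool is full, discarding connection" warnings when the thread pool runs more workers than botocore's default of 10 pooled connections. A standalone sketch (pool size hardcoded for illustration):

import boto3
import botocore.config

# max_pool_connections defaults to 10; match it to the worker count instead.
client_config = botocore.config.Config(max_pool_connections=32)
s3 = boto3.resource("s3", config=client_config)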
ingestify/main.py
CHANGED
@@ -182,7 +182,9 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:
     return import_cls(key)
 
 
-def get_engine(
+def get_engine(
+    config_file, bucket: Optional[str] = None, disable_events: bool = False
+) -> IngestionEngine:
     config = parse_config(config_file, default_value="")
 
     logger.info("Initializing sources")
@@ -201,11 +203,16 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     # Setup an EventBus and wire some more components
     event_bus = EventBus()
-
-
-
-
-
+    if not disable_events:
+        # When we disable all events we don't register any publishers
+        publisher = Publisher()
+        for subscriber in config.get("event_subscribers", []):
+            cls = get_event_subscriber_cls(subscriber["type"])
+            publisher.add_subscriber(cls(store))
+        event_bus.register(publisher)
+    else:
+        logger.info("Disabling all event handlers")
+
     store.set_event_bus(event_bus)
 
     ingestion_engine = IngestionEngine(
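Because `disable_events` defaults to False, existing callers of `get_engine` keep their behavior. A minimal usage sketch (the config path is illustrative):

from ingestify.main import get_engine

# No Publisher is registered; the EventBus stays empty.
engine = get_engine("config.yaml", disable_events=True)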
ingestify/utils.py
CHANGED
@@ -3,6 +3,7 @@ import os
 import time
 import re
 import traceback
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods
 
@@ -137,59 +138,65 @@ def map_in_pool(func, iterable, processes=0):
     )
 
 
-class
+class SyncExecutor:
     def map(self, func, iterable):
         return [func(item) for item in iterable]
 
-    def
-        return
+    def __enter__(self):
+        return self
 
-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
 
 
-class
+class DummyExecutor:
     def map(self, func, iterable):
         logger.info(f"DummyPool: not running {len(list(iterable))} tasks")
         return None
 
-    def
-        return
+    def __enter__(self):
+        return self
 
-    def
-
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
 
 
 class TaskExecutor:
     def __init__(self, processes=0, dry_run: bool = False):
         if dry_run:
-
+            executor = DummyExecutor()
         elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-
+            executor = SyncExecutor()
        else:
             if not processes:
-                processes =
+                processes = get_concurrency()
+
+            # if "fork" in get_all_start_methods():
+            #     ctx = get_context("fork")
+            # else:
+            #     ctx = get_context("spawn")
 
-
-            ctx = get_context("fork")
-        else:
-            ctx = get_context("spawn")
+            # pool = ctx.Pool(processes or cpu_count())
 
-
-
+            executor = ThreadPoolExecutor(max_workers=processes)
+
+        self.executor = executor
 
     def __enter__(self):
+        self.executor.__enter__()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.
+        self.executor.__exit__(exc_type, exc_val, exc_tb)
 
     def run(self, func, iterable):
-
+        # If multiprocessing
+        # wrapped_fn = cloudpickle.dumps(func)
+        # res = self.executor.map(
+        #     cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
+        # )
         start_time = time.time()
-        res = self.
-            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-        )
+        res = list(self.executor.map(func, iterable))
         if res:
             took = time.time() - start_time
             logger.info(
@@ -197,10 +204,6 @@ class TaskExecutor:
         )
         return res
 
-    def join(self):
-        self.pool.close()
-        self.pool.join()
-
 
 def try_number(s: str):
     try:
@@ -253,3 +256,10 @@ class HasTiming:
         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
 
         return finish
+
+
+def get_concurrency():
+    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+    if not concurrency:
+        concurrency = min(32, (os.cpu_count() or 1) + 4)
+    return concurrency
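`get_concurrency()` now centralizes the worker count used by the task executor, the SQLAlchemy pool size, and the S3 connection pool. A self-contained illustration of its resolution order (the function body is copied from the diff above; the env value is made up):

import os

os.environ["INGESTIFY_CONCURRENCY"] = "16"  # explicit override, illustrative value

def get_concurrency():
    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
    if not concurrency:
        # Same default formula ThreadPoolExecutor uses: min(32, cpu_count + 4)
        concurrency = min(32, (os.cpu_count() or 1) + 4)
    return concurrency

print(get_concurrency())  # -> 16 from the override; otherwise min(32, cpu_count + 4)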
{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=6SmxhtKjGRDG31Ij8xc2i9L-7qC3qjA5DE89jQoD48Q,301
+ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
 ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
-ingestify/application/loader.py,sha256=
+ingestify/application/loader.py,sha256=Lg3qPLaeKOFGheeqqfVeCBEF3cn61oZThgYYHoqfOvQ,7694
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=-SxHunvtG8J2u8LwXacF26oItwMkLJN7Suelt-hjHgk,13434
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=ZEoL8kZfDM_NUYXD4_7Xpmtz6WduN50UcJBgNOxOxrE,4669
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -64,12 +64,12 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=mIF7ly-lyCSNJQeem2Dpxlllzn34MxEA97qV929ARDY,17361
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.5.0.dist-info/METADATA,sha256=EsJsolUWxelVsEOhLUyiut_tKPYfqHx9Pvvg_T-HFG4,18854
+ingestify-0.5.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.5.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.5.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.5.0.dist-info/RECORD,,
{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/WHEEL
File without changes

{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.4.1.dist-info → ingestify-0.5.0.dist-info}/top_level.txt
File without changes