ingestify 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.4.2 → ingestify-0.5.0}/PKG-INFO +1 -1
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/__init__.py +1 -1
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/ingestion_job.py +5 -5
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +13 -6
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +18 -12
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +2 -1
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/file/s3_file_repository.py +6 -1
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/utils.py +38 -28
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.4.2 → ingestify-0.5.0}/README.md +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/application/loader.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/cmdline.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/main.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/server.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/source_base.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/setup.cfg +0 -0
- {ingestify-0.4.2 → ingestify-0.5.0}/setup.py +0 -0

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/ingestion_job.py
RENAMED

@@ -287,7 +287,7 @@ class IngestionJob:
  selector=dataset_identifiers,
  )

-
+ skipped_tasks = 0

  task_set = TaskSet()
  for dataset_resource in batch:

@@ -307,7 +307,7 @@ class IngestionJob:
  )
  )
  else:
-
+ skipped_tasks += 1
  else:
  if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
  task_set.add(

@@ -317,12 +317,12 @@ class IngestionJob:
  )
  )
  else:
-
+ skipped_tasks += 1

  if task_set:
  logger.info(
  f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
- f"using selector {self.selector} => {len(task_set)} tasks. {
+ f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
  )
  logger.info(f"Running {len(task_set)} tasks")
  ingestion_job_summary.add_task_summaries(

@@ -334,7 +334,7 @@ class IngestionJob:
  f"using selector {self.selector} => nothing to do"
  )

- ingestion_job_summary.
+ ingestion_job_summary.increase_skipped_tasks(skipped_tasks)

  if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
  finish_task_timer()
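
These hunks count, per batch, the dataset resources the fetch policy decides not to turn into tasks, log the number alongside the task count, and forward it to the job summary via `increase_skipped_tasks()`. A toy sketch of the counting pattern (`BatchReport` and `plan_batch` are illustrative names, not ingestify APIs):

```python
# A toy illustration (not ingestify's real classes) of the skip-counting
# pattern shown above: resources the fetch policy rejects become a counted
# "skipped" number instead of a task.
from dataclasses import dataclass, field
from typing import Callable, List


@dataclass
class BatchReport:
    tasks: List[str] = field(default_factory=list)
    skipped_tasks: int = 0


def plan_batch(resources: List[str], should_fetch: Callable[[str], bool]) -> BatchReport:
    report = BatchReport()
    for resource in resources:
        if should_fetch(resource):
            report.tasks.append(resource)  # would become a real ingestion task
        else:
            report.skipped_tasks += 1      # later handed to increase_skipped_tasks()
    return report


report = plan_batch(["a", "b", "c"], should_fetch=lambda r: r != "b")
print(f"{len(report.tasks)} tasks. {report.skipped_tasks} skipped.")
```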

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED

@@ -41,7 +41,8 @@ class IngestionJobSummary(BaseModel, HasTiming):
  state: IngestionJobState = IngestionJobState.RUNNING
  task_summaries: List[TaskSummary] = Field(default_factory=list)

-
+ total_tasks: int = 0
+ skipped_tasks: int = 0
  failed_tasks: int = 0
  successful_tasks: int = 0
  ignored_successful_tasks: int = 0

@@ -62,11 +63,11 @@ class IngestionJobSummary(BaseModel, HasTiming):
  def add_task_summaries(self, task_summaries: List[TaskSummary]):
  self.task_summaries.extend(task_summaries)

- def
- self.
+ def increase_skipped_tasks(self, skipped_tasks: int):
+ self.skipped_tasks += skipped_tasks

  def task_count(self):
- return len(self.task_summaries) + self.
+ return len(self.task_summaries) + self.skipped_tasks

  def _set_ended(self):
  self.failed_tasks = len(

@@ -82,6 +83,12 @@ class IngestionJobSummary(BaseModel, HasTiming):
  if task.state == TaskState.FINISHED_IGNORED
  ]
  )
+ self.total_tasks = (
+ self.failed_tasks
+ + self.successful_tasks
+ + self.ignored_successful_tasks
+ + self.skipped_tasks
+ )
  self.ended_at = utcnow()

  # Only keep failed tasks. Rest isn't interesting

@@ -115,13 +122,13 @@ class IngestionJobSummary(BaseModel, HasTiming):
  for timing in self.timings:
  print(f" - {timing.name}: {format_duration(timing.duration)}")
  print(
- f" - Tasks: {
+ f" - Tasks: {self.total_tasks} - {(self.total_tasks / self.duration.total_seconds()):.1f} tasks/sec"
  )

  print(f" - Failed tasks: {self.failed_tasks}")
  print(f" - Successful tasks: {self.successful_tasks}")
  print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
- print(f" - Skipped datasets: {self.
+ print(f" - Skipped datasets: {self.skipped_tasks}")
  print("********************************")

  def __enter__(self):
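
Together with the job-side counting, the summary gains a persisted `total_tasks`, an `increase_skipped_tasks()` helper, and a `task_count()` that includes skipped work, with `total_tasks` derived in `_set_ended()` as the sum of failed, successful, ignored-successful, and skipped tasks. A plain-dataclass stand-in (the real class is a pydantic model; names besides the counters are illustrative) showing how those counters are meant to add up:

```python
# A plain-Python stand-in (the real IngestionJobSummary is a pydantic model)
# showing how the new counters add up at the end of a job.
from dataclasses import dataclass, field
from typing import List


@dataclass
class SummarySketch:
    task_states: List[str] = field(default_factory=list)  # "failed" / "ok" / "ignored"
    skipped_tasks: int = 0
    total_tasks: int = 0

    def increase_skipped_tasks(self, skipped_tasks: int) -> None:
        self.skipped_tasks += skipped_tasks

    def task_count(self) -> int:
        # Chunking decisions see executed *and* skipped work.
        return len(self.task_states) + self.skipped_tasks

    def set_ended(self) -> None:
        failed = sum(state == "failed" for state in self.task_states)
        successful = sum(state == "ok" for state in self.task_states)
        ignored = sum(state == "ignored" for state in self.task_states)
        self.total_tasks = failed + successful + ignored + self.skipped_tasks


summary = SummarySketch(task_states=["ok", "failed", "ignored"])
summary.increase_skipped_tasks(2)
summary.set_ended()
assert summary.task_count() == 5 and summary.total_tasks == 5
```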

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py
RENAMED

@@ -19,7 +19,7 @@ from sqlalchemy import (
  )
  from sqlalchemy.engine import make_url
  from sqlalchemy.exc import NoSuchModuleError
- from sqlalchemy.orm import Session, Query
+ from sqlalchemy.orm import Session, Query, sessionmaker, scoped_session

  from ingestify.domain import File, Revision
  from ingestify.domain.models import (

@@ -34,6 +34,7 @@ from ingestify.domain.models.dataset.collection_metadata import (
  from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
  from ingestify.domain.models.task.task_summary import TaskSummary
  from ingestify.exceptions import IngestifyError
+ from ingestify.utils import get_concurrency

  from .tables import (
  metadata,

@@ -96,17 +97,15 @@ class SqlAlchemySessionProvider:
  self.url,
  # Use the default isolation level, don't need SERIALIZABLE
  # isolation_level="SERIALIZABLE",
+ pool_size=get_concurrency(), # Maximum number of connections in the pool
+ max_overflow=5,
+ pool_recycle=1800,
+ pool_pre_ping=True,
  )
  self.dialect = self.engine.dialect
- self.session = Session(bind=self.engine)

-
-
-
- self.url = url
- self._init_engine()
-
- metadata.create_all(self.engine)
+ session_factory = sessionmaker(bind=self.engine)
+ self.session = scoped_session(session_factory)

  def __getstate__(self):
  return {"url": self.url}

@@ -115,6 +114,14 @@ class SqlAlchemySessionProvider:
  self.url = state["url"]
  self._init_engine()

+ def __init__(self, url: str):
+ url = self.fix_url(url)
+
+ self.url = url
+ self._init_engine()
+
+ metadata.create_all(self.engine)
+
  def __del__(self):
  self.close()

@@ -123,12 +130,11 @@ class SqlAlchemySessionProvider:
  self._init_engine()

  def close(self):
- if hasattr(self, "
- self.session.close()
+ if hasattr(self, "engine"):
  self.engine.dispose()

  def get(self):
- return self.session
+ return self.session()


  def in_(column: Column, values):
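
Replacing the single long-lived `Session(bind=self.engine)` with a `sessionmaker` + `scoped_session` registry, together with the explicit pool arguments, is the standard SQLAlchemy recipe for sharing one engine across worker threads. A self-contained sketch under assumptions that differ from ingestify's setup (local SQLite URL and an explicit `QueuePool` so `pool_size` applies; ingestify passes its configured database URL and `get_concurrency()`):

```python
# Sketch of the new session/pool setup; URL and pool size are placeholders.
from sqlalchemy import create_engine, text
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.pool import QueuePool

engine = create_engine(
    "sqlite:///./demo.db",      # placeholder URL
    poolclass=QueuePool,        # made explicit so pool_size applies to SQLite too
    pool_size=5,                # ingestify: get_concurrency()
    max_overflow=5,
    pool_recycle=1800,
    pool_pre_ping=True,
)

session_factory = sessionmaker(bind=engine)
Session = scoped_session(session_factory)

session = Session()             # calling the registry returns a thread-local Session
print(session.execute(text("SELECT 1")).scalar())
Session.remove()                # dispose of the current thread's session
engine.dispose()                # matches the provider's close()
```

Calling the registry, as the new `get()` does with `self.session()`, returns the session bound to the current thread, which appears to be the point of the change given the thread pool introduced in `utils.py`.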

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py
RENAMED

@@ -247,9 +247,10 @@ ingestion_job_summary_table = Table(
  Column("ended_at", TZDateTime(6)),
  # Some task counters
  Column("state", IngestionJobStateString),
+ Column("total_tasks", Integer),
  Column("successful_tasks", Integer),
  Column("ignored_successful_tasks", Integer),
- Column("
+ Column("skipped_tasks", Integer),
  Column("failed_tasks", Integer),
  Column(
  "timings",
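
For reference, a standalone sketch of the extended counters schema (demo table name and in-memory SQLite are placeholders). As general SQLAlchemy behaviour, `metadata.create_all()` creates missing tables but does not add new columns to tables that already exist:

```python
# Demo-only schema (not ingestify's real table definition) showing the two new
# counter columns next to the existing ones.
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, insert, select

metadata = MetaData()

summary_table = Table(
    "ingestion_job_summary_demo",          # placeholder name
    metadata,
    Column("state", String),
    Column("total_tasks", Integer),        # new in 0.5.0
    Column("successful_tasks", Integer),
    Column("ignored_successful_tasks", Integer),
    Column("skipped_tasks", Integer),      # new in 0.5.0
    Column("failed_tasks", Integer),
)

engine = create_engine("sqlite://")        # in-memory database for the demo
metadata.create_all(engine)                # creates missing tables only

with engine.begin() as conn:
    conn.execute(
        insert(summary_table).values(
            state="FINISHED",
            total_tasks=5,
            successful_tasks=2,
            ignored_successful_tasks=1,
            skipped_tasks=2,
            failed_tasks=0,
        )
    )
    print(conn.execute(select(summary_table)).one())
```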

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/infra/store/file/s3_file_repository.py
RENAMED

@@ -2,9 +2,11 @@ from pathlib import Path
  from typing import BinaryIO

  import boto3 as boto3
+ import botocore.config

  from ingestify.domain import Dataset
  from ingestify.domain.models import FileRepository
+ from ingestify.utils import get_concurrency


  class S3FileRepository(FileRepository):

@@ -13,7 +15,10 @@ class S3FileRepository(FileRepository):
  @property
  def s3(self):
  if not self._s3:
-
+ client_config = botocore.config.Config(
+ max_pool_connections=get_concurrency(),
+ )
+ self._s3 = boto3.resource("s3", config=client_config)
  return self._s3

  def __getstate__(self):
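
The S3 repository now builds its `boto3` resource with a `botocore` `Config` whose `max_pool_connections` is tied to the worker count, so concurrent uploads and downloads are not throttled by botocore's default pool of 10 HTTP connections. A minimal sketch (region, bucket name, and the fixed pool size are placeholders):

```python
# Minimal sketch of the new boto3 client configuration; no AWS call is made here.
import boto3
import botocore.config

client_config = botocore.config.Config(
    max_pool_connections=32,            # ingestify: get_concurrency()
)

s3 = boto3.resource(
    "s3",
    config=client_config,
    region_name="us-east-1",            # placeholder region
)
# bucket = s3.Bucket("example-bucket")  # placeholder bucket name
```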

{ingestify-0.4.2 → ingestify-0.5.0}/ingestify/utils.py
RENAMED

@@ -3,6 +3,7 @@ import os
  import time
  import re
  import traceback
+ from concurrent.futures import ThreadPoolExecutor
  from contextlib import contextmanager
  from multiprocessing import get_context, cpu_count, get_all_start_methods

@@ -137,59 +138,65 @@ def map_in_pool(func, iterable, processes=0):
  )


- class
+ class SyncExecutor:
  def map(self, func, iterable):
  return [func(item) for item in iterable]

- def
- return
+ def __enter__(self):
+ return self

- def
-
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass


- class
+ class DummyExecutor:
  def map(self, func, iterable):
  logger.info(f"DummyPool: not running {len(list(iterable))} tasks")
  return None

- def
- return
+ def __enter__(self):
+ return self

- def
-
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ pass


  class TaskExecutor:
  def __init__(self, processes=0, dry_run: bool = False):
  if dry_run:
-
+ executor = DummyExecutor()
  elif os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-
+ executor = SyncExecutor()
  else:
  if not processes:
- processes =
+ processes = get_concurrency()
+
+ # if "fork" in get_all_start_methods():
+ # ctx = get_context("fork")
+ # else:
+ # ctx = get_context("spawn")

-
- ctx = get_context("fork")
- else:
- ctx = get_context("spawn")
+ # pool = ctx.Pool(processes or cpu_count())

-
-
+ executor = ThreadPoolExecutor(max_workers=processes)
+
+ self.executor = executor

  def __enter__(self):
+ self.executor.__enter__()
  return self

  def __exit__(self, exc_type, exc_val, exc_tb):
- self.
+ self.executor.__exit__(exc_type, exc_val, exc_tb)

  def run(self, func, iterable):
-
+ # If multiprocessing
+ # wrapped_fn = cloudpickle.dumps(func)
+ # res = self.executor.map(
+ # cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
+ # )
  start_time = time.time()
- res = self.
- cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
- )
+ res = list(self.executor.map(func, iterable))
  if res:
  took = time.time() - start_time
  logger.info(

@@ -197,10 +204,6 @@ class TaskExecutor:
  )
  return res

- def join(self):
- self.pool.close()
- self.pool.join()
-

  def try_number(s: str):
  try:

@@ -253,3 +256,10 @@ class HasTiming:
  self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

  return finish
+
+
+ def get_concurrency():
+ concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
+ if not concurrency:
+ concurrency = min(32, (os.cpu_count() or 1) + 4)
+ return concurrency
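
The executor rework drops the fork/spawn multiprocessing pool (the old cloudpickle-based dispatch is left commented out) in favour of `concurrent.futures.ThreadPoolExecutor`, and `get_concurrency()` centralizes the worker count: an `INGESTIFY_CONCURRENCY` environment override, otherwise `min(32, cpu_count + 4)`, the same default `ThreadPoolExecutor` uses for `max_workers` on Python 3.8+. The same value now also sizes the SQLAlchemy connection pool and the S3 HTTP pool above. A runnable sketch (`fetch` is a hypothetical stand-in for task work):

```python
# Sketch of the new concurrency helper driving a thread-based executor.
import os
from concurrent.futures import ThreadPoolExecutor


def get_concurrency() -> int:
    # Mirrors the helper added above: env override, else ThreadPoolExecutor's own default.
    concurrency = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
    if not concurrency:
        concurrency = min(32, (os.cpu_count() or 1) + 4)
    return concurrency


def fetch(item: int) -> int:
    return item * item  # placeholder task work


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=get_concurrency()) as executor:
        results = list(executor.map(fetch, range(8)))
    print(results)
```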

All remaining files are unchanged between the two versions; only their path prefix moved ({ingestify-0.4.2 → ingestify-0.5.0}).