ingestify 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +1 -0
- ingestify/application/ingestion_engine.py +7 -2
- ingestify/application/loader.py +14 -1
- ingestify/cmdline.py +20 -2
- ingestify/domain/models/dataset/collection_metadata.py +2 -1
- ingestify/domain/models/dataset/dataset.py +10 -0
- ingestify/domain/models/dataset/revision.py +4 -0
- ingestify/domain/models/ingestion/ingestion_job.py +8 -10
- ingestify/domain/models/ingestion/ingestion_job_summary.py +24 -34
- ingestify/domain/models/task/task_summary.py +3 -24
- ingestify/infra/serialization/__init__.py +2 -13
- ingestify/infra/store/dataset/sqlalchemy/repository.py +2 -3
- ingestify/infra/store/dataset/sqlalchemy/tables.py +14 -7
- ingestify/utils.py +48 -16
- {ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/METADATA +1 -1
- {ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/RECORD +20 -20
- {ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/WHEEL +0 -0
- {ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/top_level.txt +0 -0
ingestify/application/ingestion_engine.py
CHANGED

@@ -21,8 +21,13 @@ class IngestionEngine:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.loader.add_ingestion_plan(ingestion_plan)
 
-    def load(
-        self
+    def load(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
+        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
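The widened signature just forwards the filters to the loader. A minimal sketch of a call site, assuming an already configured engine; the provider and source values below are placeholders, not names shipped with ingestify:

# Hypothetical call; "statsbomb" / "statsbomb_github" are made-up filter values.
engine.load(dry_run=True, provider="statsbomb", source="statsbomb_github")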
ingestify/application/loader.py
CHANGED

@@ -29,7 +29,12 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)
 
-    def collect_and_run(
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@
                 )
                 continue
 
+            if source is not None:
+                if ingestion_plan.source.name != source:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+                    )
+                    continue
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@
 
             # TODO: consider making this lazy and fetch once per Source instead of
             # once per IngestionPlan
+            # TODO: Log exception when `discover_selectors` fails
             all_selectors = ingestion_plan.source.discover_selectors(
                 ingestion_plan.dataset_type
             )
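The source filter uses the same log-and-skip pattern as the existing provider filter: a non-matching plan is skipped, and passing None disables the filter. A self-contained sketch of that behaviour with stand-in types (the real IngestionPlan and Source live in ingestify.domain):

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Source:
    name: str

@dataclass
class IngestionPlan:
    source: Source

def plans_to_run(plans: List[IngestionPlan], source: Optional[str] = None) -> List[IngestionPlan]:
    # source=None means "no filter", mirroring the `if source is not None` guard.
    return [plan for plan in plans if source is None or plan.source.name == source]

plans = [IngestionPlan(Source("a")), IngestionPlan(Source("b"))]
assert [p.source.name for p in plans_to_run(plans, source="b")] == ["b"]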
ingestify/cmdline.py
CHANGED

@@ -58,7 +58,14 @@ def cli():
     help="bucket",
     type=str,
 )
-@click.option(
+@click.option(
+    "--debug",
+    "debug",
+    required=False,
+    help="Debugging enabled",
+    is_flag=True,
+    type=bool,
+)
 @click.option(
     "--dry-run",
     "dry_run",
@@ -74,11 +81,19 @@
     help="Provider - only run tasks for a single provider",
     type=str,
 )
+@click.option(
+    "--source",
+    "source",
+    required=False,
+    help="Source - only run tasks for a single source",
+    type=str,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
     dry_run: Optional[bool],
     provider: Optional[str],
+    source: Optional[str],
     debug: Optional[bool],
 ):
     try:
@@ -90,7 +105,10 @@ def run(
         logger.exception(f"Failed due a configuration error: {e}")
         sys.exit(1)
 
-
+    if debug:
+        logging.getLogger("root").setLevel(logging.DEBUG)
+
+    engine.load(dry_run=dry_run, provider=provider, source=source)
 
     logger.info("Done")
 
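A runnable sketch of the two new flags in isolation; the command body is simplified, and the real `run` command takes more options than shown here:

import logging
from typing import Optional

import click

@click.command()
@click.option("--debug", "debug", required=False, help="Debugging enabled", is_flag=True, type=bool)
@click.option("--source", "source", required=False, help="Source - only run tasks for a single source", type=str)
def run(debug: Optional[bool], source: Optional[str]):
    if debug:
        # Same mechanism as the diff: raise the "root" logger to DEBUG.
        logging.getLogger("root").setLevel(logging.DEBUG)
    click.echo(f"source={source!r} debug={debug}")

if __name__ == "__main__":
    run()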
ingestify/domain/models/dataset/collection_metadata.py
CHANGED

@@ -6,7 +6,8 @@ from typing import Optional
 @dataclass
 class DatasetCollectionMetadata:
     # This can be useful to figure out if a backfill is required
-
+    # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+    # first_modified: Optional[datetime]
 
     # Use the last modified to only retrieve datasets that are changed
     last_modified: Optional[datetime]
ingestify/domain/models/dataset/dataset.py
CHANGED

@@ -22,7 +22,10 @@ class Dataset(BaseModel):
     metadata: dict
     created_at: datetime
     updated_at: datetime
+
     revisions: List[Revision] = Field(default_factory=list)
+    # The last_modified_at is equal to the max modified_at of all files in all revisions
+    last_modified_at: Optional[datetime]
 
     @field_validator("identifier", mode="before")
     @classmethod
@@ -42,6 +45,13 @@
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
+        if self.last_modified_at:
+            self.last_modified_at = max(
+                self.last_modified_at, revision.last_modified_at
+            )
+        else:
+            self.last_modified_at = revision.last_modified_at
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:
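The add_revision change keeps a running maximum, treating None as "no revisions yet". A minimal sketch of that logic on its own:

from datetime import datetime, timezone
from typing import Optional

def bump_last_modified(current: Optional[datetime], revision_last_modified: datetime) -> datetime:
    # None means no revision has been recorded yet, so the new timestamp wins.
    if current:
        return max(current, revision_last_modified)
    return revision_last_modified

jan = datetime(2024, 1, 1, tzinfo=timezone.utc)
mar = datetime(2024, 3, 1, tzinfo=timezone.utc)
assert bump_last_modified(None, jan) == jan
assert bump_last_modified(mar, jan) == mar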
ingestify/domain/models/dataset/revision.py
CHANGED

@@ -36,6 +36,10 @@ class Revision(BaseModel):
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION
 
+    @property
+    def last_modified_at(self):
+        return max(file.modified_at for file in self.modified_files)
+
     @property
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
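The property reduces a revision's files to their newest timestamp. A sketch with plain dataclass stand-ins (the real models are Pydantic models); note that max() over an empty modified_files would raise ValueError:

from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import List

@dataclass
class FileStub:
    modified_at: datetime

@dataclass
class RevisionStub:
    modified_files: List[FileStub] = field(default_factory=list)

    @property
    def last_modified_at(self) -> datetime:
        # Raises ValueError when modified_files is empty, like any bare max().
        return max(file.modified_at for file in self.modified_files)

revision = RevisionStub([FileStub(datetime(2024, 1, 1, tzinfo=timezone.utc)),
                         FileStub(datetime(2024, 3, 1, tzinfo=timezone.utc))])
assert revision.last_modified_at.month == 3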
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -214,9 +214,6 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
-        ingestion_job_exception = (
-            None  # Indicate if there was an exception during the IngestionJob itself
-        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
@@ -224,6 +221,7 @@
         with ingestion_job_summary.record_timing("get_dataset_collection"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
+                provider=self.ingestion_plan.source.provider,
                 data_spec_versions=self.selector.data_spec_versions,
                 selector=self.selector,
                 metadata_only=True,
@@ -233,8 +231,8 @@
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-
-
+        try:
+            with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
                     data_spec_versions=self.selector.data_spec_versions,
@@ -244,12 +242,12 @@
 
                 # We need to include the to_batches as that will start the generator
                 batches = to_batches(dataset_resources)
-
-
+        except Exception as e:
+            logger.exception("Failed to find datasets")
 
-
-
-
+            ingestion_job_summary.set_exception(e)
+            yield ingestion_job_summary
+            return
 
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
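The new failure path converts an exception in find_datasets into a terminal summary: record it, yield the summary, and stop the generator instead of raising out of it. A self-contained sketch with stand-in names:

from typing import Callable, Iterator, List, Optional

class SummaryStub:
    def __init__(self) -> None:
        self.exception: Optional[Exception] = None

    def set_exception(self, e: Exception) -> None:
        self.exception = e

def run_job(find_datasets: Callable[[], List[str]]) -> Iterator[SummaryStub]:
    summary = SummaryStub()
    try:
        datasets = find_datasets()
    except Exception as e:
        summary.set_exception(e)
        yield summary
        return  # the generator ends; no batches are processed
    # Normal batch processing over `datasets` would continue here.
    yield summary

def boom() -> List[str]:
    raise RuntimeError("boom")

summaries = list(run_job(boom))
assert isinstance(summaries[0].exception, RuntimeError)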
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -9,7 +9,7 @@ from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow, HasTiming
 
 if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
@@ -25,7 +25,7 @@ def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"
 
 
-class IngestionJobSummary(BaseModel):
+class IngestionJobSummary(BaseModel, HasTiming):
     ingestion_job_summary_id: str
     ingestion_job_id: str
 
@@ -39,7 +39,6 @@
     started_at: datetime = Field(default_factory=utcnow)
     ended_at: Optional[datetime] = None
     state: IngestionJobState = IngestionJobState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
     skipped_datasets: int = 0
@@ -60,22 +59,6 @@
         )
         return cls(**args)
 
-    @contextmanager
-    def record_timing(self, name: str):
-        start = utcnow()
-        try:
-            yield
-        finally:
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-    def start_timing(self, name):
-        start = utcnow()
-
-        def finish():
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-        return finish
-
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
@@ -101,6 +84,11 @@
         )
         self.ended_at = utcnow()
 
+        # Only keep failed tasks. Rest isn't interesting
+        self.task_summaries = [
+            task for task in self.task_summaries if task.state == TaskState.FAILED
+        ]
+
     def set_finished(self):
         self.state = IngestionJobState.FINISHED
         self._set_ended()
@@ -114,25 +102,27 @@
         return self.ended_at - self.started_at
 
     def output_report(self):
-        print(
-
-
-        print(
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(
+            f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+        )
+        print("********************************")
+        print(f"* - IngestionPlan:")
+        print(f"* Source: {self.source_name}")
+        print(f"* Provider: {self.provider}")
+        print(f"* DatasetType: {self.dataset_type}")
+        print(f"* - Selector: {self.selector}")
+        print(f"* - Timings: ")
         for timing in self.timings:
-            print(f" - {timing.name}: {format_duration(timing.duration)}")
+            print(f"* - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+            f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f" - Failed tasks: {self.failed_tasks}")
-        print(f" - Successful tasks: {self.successful_tasks}")
-        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.skipped_datasets}")
-        print("
+        print(f"* - Failed tasks: {self.failed_tasks}")
+        print(f"* - Successful tasks: {self.successful_tasks}")
+        print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"* - Skipped datasets: {self.skipped_datasets}")
+        print("********************************")
 
     def __enter__(self):
         return self
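The pruning added to _set_ended is a plain list comprehension over task states. A sketch with stand-in records:

from dataclasses import dataclass
from enum import Enum

class TaskState(str, Enum):
    FINISHED = "FINISHED"
    FAILED = "FAILED"

@dataclass
class TaskSummaryStub:
    task_id: str
    state: TaskState

task_summaries = [TaskSummaryStub("a", TaskState.FINISHED),
                  TaskSummaryStub("b", TaskState.FAILED)]
# Only keep failed tasks, as in the new _set_ended.
task_summaries = [task for task in task_summaries if task.state == TaskState.FAILED]
assert [task.task_id for task in task_summaries] == ["b"]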
ingestify/domain/models/task/task_summary.py
CHANGED

@@ -10,8 +10,7 @@ from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.identifier import Identifier
 from ingestify.domain.models.timing import Timing
 from ingestify.exceptions import IngestifyError
-from ingestify.utils import utcnow
-
+from ingestify.utils import utcnow, HasTiming
 
 logger = logging.getLogger(__name__)
 
@@ -28,7 +27,7 @@ class Operation(str, Enum):
     UPDATE = "UPDATE"
 
 
-class TaskSummary(BaseModel):
+class TaskSummary(BaseModel, HasTiming):
     task_id: str
     started_at: datetime
     operation: Operation
@@ -38,7 +37,6 @@
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
    state: TaskState = TaskState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
 
     @field_validator("dataset_identifier", mode="before")
     @classmethod
@@ -48,27 +46,8 @@
         return value
 
     def record_load_file(self, fn, metadata: dict):
-
-        try:
-            result = None
+        with self.record_timing(f"Load of {metadata.get('file_id', 'file')}", metadata):
             return fn()
-        except Exception as e:
-            result = {
-                "type": type(e).__name__,
-                "message": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            raise e
-        finally:
-            metadata = dict(result=result, **metadata)
-            self.timings.append(
-                Timing(
-                    name=f"Load of {metadata.get('file_id', 'file')}",
-                    started_at=start,
-                    ended_at=utcnow(),
-                    metadata=metadata,
-                )
-            )
 
     @classmethod
     @contextmanager
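The one-line body relies on `return` inside a `with` block still running the context manager's cleanup, which is what appends the timing for successful loads. A minimal demonstration of that behaviour:

from contextlib import contextmanager

events = []

@contextmanager
def record(name):
    try:
        yield
    finally:
        # Runs even when the body returns (or raises).
        events.append(name)

def load():
    with record("load"):
        return 42

assert load() == 42
assert events == ["load"]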
ingestify/infra/serialization/__init__.py
CHANGED

@@ -1,12 +1,5 @@
-import
-from
-from typing import Type, Any, TypeVar
-
-from dataclass_factory import Schema, Factory, NameStyle
-from dataclass_factory.schema_helpers import type_checker
-
-from ingestify.domain import DatasetCreated, Identifier
-from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain import DatasetCreated
+from ingestify.domain.models.dataset.events import RevisionAdded
 from ingestify.domain.models.event import DomainEvent
 
 
@@ -18,10 +11,6 @@ event_types = {
 
 def deserialize(event_dict: dict) -> DomainEvent:
     event_cls = event_types[event_dict["event_type"]]
-    event_dict["dataset"]["identifier"] = Identifier(
-        **event_dict["dataset"]["identifier"]
-    )
-
     return event_cls.model_validate(event_dict)
 
 
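deserialize is now a straight dispatch-then-validate: look up the event class by its event_type and let Pydantic's model_validate build it, with no manual Identifier fix-up. A sketch with illustrative event classes (not ingestify's real ones):

from pydantic import BaseModel

class RevisionAdded(BaseModel):
    event_type: str
    dataset_id: str

event_types = {"revision_added": RevisionAdded}

def deserialize(event_dict: dict) -> BaseModel:
    event_cls = event_types[event_dict["event_type"]]
    return event_cls.model_validate(event_dict)

event = deserialize({"event_type": "revision_added", "dataset_id": "d1"})
assert isinstance(event, RevisionAdded)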
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -320,10 +320,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         metadata_result_row = apply_query_filter(
             self.session.query(
-                func.
-                func.max(file_table.c.modified_at).label("last_modified_at"),
+                func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
                 func.count().label("row_count"),
-            )
+            )
         ).first()
         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
 
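The aggregate now reads max(last_modified_at) straight off the dataset table instead of joining against the file table. A runnable sketch of the same shape of query against a simplified table:

from sqlalchemy import (Column, DateTime, MetaData, String, Table,
                        create_engine, func, select)

metadata = MetaData()
dataset_table = Table(
    "dataset",
    metadata,
    Column("dataset_id", String(255), primary_key=True),
    Column("last_modified_at", DateTime),
)

engine = create_engine("sqlite://")
metadata.create_all(engine)
with engine.connect() as conn:
    row = conn.execute(
        select(
            func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
            func.count().label("row_count"),
        )
    ).first()
    # Empty table: max() is NULL, count() is 0.
    assert row.last_modified_at is None and row.row_count == 0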
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED

@@ -16,6 +16,8 @@ from sqlalchemy import (
     TypeDecorator,
 )
 
+from sqlalchemy.dialects.postgresql import JSONB
+
 from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
@@ -25,18 +27,18 @@ from ingestify.domain.models.timing import Timing
 from ingestify.domain.models.dataset.revision import RevisionState
 
 
-def JSONType(serializer=None, deserializer=None):
+def JSONType(serializer=None, deserializer=None, base_type=JSON):
     class _JsonType(TypeDecorator):
         cache_ok = True
-        impl = JSON
+        impl = base_type
 
         def process_bind_param(self, value, dialect):
-            if serializer is not None:
+            if serializer and value is not None:
                 return serializer(value)
             return value
 
         def process_result_value(self, value, dialect):
-            if deserializer is not None:
+            if deserializer and value is not None:
                 return deserializer(value)
             return value
 
@@ -152,14 +154,19 @@ dataset_table = Table(
     metadata,
     Column("bucket", String(255), default=None),
     Column("dataset_id", String(255), primary_key=True),
-    Column("provider", String(255)),
-    Column("dataset_type", String(255)),
+    Column("provider", String(255), index=True),
+    Column("dataset_type", String(255), index=True),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column(
+    Column(
+        "identifier",
+        # Use JSONB when available
+        JSON().with_variant(JSONB(), "postgresql"),
+    ),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
+    Column("last_modified_at", TZDateTime(6)),
 )
 
 revision_table = Table(
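The identifier column uses SQLAlchemy's per-dialect variant mechanism: plain JSON everywhere, JSONB on PostgreSQL. A minimal sketch of such a column definition:

from sqlalchemy import JSON, Column, MetaData, String, Table
from sqlalchemy.dialects.postgresql import JSONB

metadata = MetaData()
example_table = Table(
    "example",
    metadata,
    Column("id", String(64), primary_key=True),
    # Plain JSON by default; compiled as JSONB on the postgresql dialect.
    Column("identifier", JSON().with_variant(JSONB(), "postgresql")),
)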
ingestify/utils.py
CHANGED

@@ -1,34 +1,23 @@
-import abc
-import asyncio
-import inspect
 import logging
 import os
 import time
 import re
+import traceback
+from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import (
-    Dict,
-    Generic,
-    Type,
-    TypeVar,
-    Tuple,
-    Optional,
-    Any,
-    Callable,
-    Awaitable,
-    List,
-    Iterable,
-)
+from typing import Dict, Tuple, Optional, Any, List
 
 import cloudpickle
+from pydantic import Field
 from typing_extensions import Self
 
 
 from itertools import islice
 
+from ingestify.domain.models.timing import Timing
 
 logger = logging.getLogger(__name__)
 
@@ -221,3 +210,46 @@ def try_number(s: str):
         return float(s)
     except ValueError:
         return s
+
+
+class HasTiming:
+    """Mixin to give Pydantic models ability to time actions."""
+
+    timings: List[Timing] = Field(default_factory=list)
+
+    @contextmanager
+    def record_timing(
+        self, description: str, metadata: Optional[dict] = None
+    ) -> Timing:
+        if not metadata:
+            metadata = {}
+
+        start = utcnow()
+        try:
+            result = None
+            yield
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=description,
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
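The mixin appends a Timing entry whether the timed block succeeds or fails; on failure the exception is summarized under metadata["result"] and still re-raised. A self-contained sketch of that behaviour, using a plain attribute and a dataclass stand-in for Timing (the real mixin declares timings as a Pydantic field):

import traceback
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List, Optional

@dataclass
class Timing:
    name: str
    started_at: datetime
    ended_at: datetime
    metadata: Optional[dict] = None

class HasTimingSketch:
    def __init__(self) -> None:
        self.timings: List[Timing] = []

    @contextmanager
    def record_timing(self, description: str, metadata: Optional[dict] = None):
        metadata = metadata or {}
        start = datetime.now(timezone.utc)
        result = None
        try:
            yield
        except Exception as e:
            result = {"type": type(e).__name__, "message": str(e),
                      "traceback": traceback.format_exc()}
            raise
        finally:
            # Appended on success and on failure alike.
            self.timings.append(Timing(description, start,
                                       datetime.now(timezone.utc),
                                       dict(result=result, **metadata)))

timer = HasTimingSketch()
try:
    with timer.record_timing("load file", {"file_id": "f1"}):
        raise RuntimeError("boom")
except RuntimeError:
    pass
assert timer.timings[0].metadata["result"]["type"] == "RuntimeError"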
{ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=pqbfmiQtpinnyWcferY68eDjMbrSslyzmNBk9XErzak,301
+ingestify/cmdline.py,sha256=oagUe-Jup1SU3s6jVl25f0cSG0wlNYhxFY-gGBwWmr0,7482
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
 ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=6BqgEZjecLW_anqYP5WrFpi93bmdhF-EdrebEkm59Ds,6806
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
+ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
+ingestify/application/loader.py,sha256=2LpYSHvedsoe5wFsIkQv0xLcKcqtebwVOSPWapAypao,7566
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -20,8 +20,8 @@ ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvf
 ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
-ingestify/domain/models/dataset/collection_metadata.py,sha256=
-ingestify/domain/models/dataset/dataset.py,sha256=
+ingestify/domain/models/dataset/collection_metadata.py,sha256=u2H3XZ-6NMfuAcVD_mb7jEc1IkS1MgQahDIe5CWHjtc,458
+ingestify/domain/models/dataset/dataset.py,sha256=STew8_zCBro_x_u03JrjMvq8yqUDaNndlOlolf9osdM,3332
 ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
 ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
 ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
@@ -29,7 +29,7 @@ ingestify/domain/models/dataset/file.py,sha256=1Thdv6A1YmC1UfutaRf2q3FGHQYO0SWEp
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=
+ingestify/domain/models/dataset/revision.py,sha256=jBjMqYXDbvt_VAIwL_db09jcH4W8JPRKsXJb4JCniuM,1447
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
@@ -39,15 +39,15 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=H9jfbbWFZw73nxMOW0480LgSHV-o4sA5IcvpUZmFpS4,13140
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=Xvmtu0BwE9C7FxBl6D8tN49I6--E_RngcMfWeme4DPA,4499
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
-ingestify/domain/models/task/task_summary.py,sha256=
+ingestify/domain/models/task/task_summary.py,sha256=T9BSGhOZjKCPfym34VUdBXuMy0o6E832GAI4WMtjGao,3181
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -55,7 +55,7 @@ ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
-ingestify/infra/serialization/__init__.py,sha256
+ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVUbm_sg9GWx9eI,702
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,8 +64,8 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=R7OJVC_dh5hGhaCvU_Ixyfb16Xyd-hxvm7n0zJ6KxDk,15857
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=FEMEkBiefozEoWCYMQnc4DD3ZsDeg3KaFs1c58A66ME,10314
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.4.0.dist-info/METADATA,sha256=t02z0kStKbk2lECODDJieRK4Ev1TDKR0Z6G2DpUeeGM,18854
+ingestify-0.4.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.4.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.4.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.4.0.dist-info/RECORD,,
{ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/WHEEL
File without changes

{ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.3.4.dist-info → ingestify-0.4.0.dist-info}/top_level.txt
File without changes