ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +47 -36
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +71 -241
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +29 -28
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +64 -25
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/domain/models/task/task_summary.py
ADDED
@@ -0,0 +1,118 @@
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from enum import Enum
+from typing import Optional, List
+from pydantic import Field, field_validator
+
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.dataset.identifier import Identifier
+from ingestify.domain.models.timing import Timing
+from ingestify.exceptions import IngestifyError
+from ingestify.utils import utcnow
+
+
+logger = logging.getLogger(__name__)
+
+
+class TaskStatus(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
+    FAILED = "FAILED"
+
+
+class Operation(str, Enum):
+    CREATE = "CREATE"
+    UPDATE = "UPDATE"
+
+
+class TaskSummary(BaseModel):
+    task_id: str
+    started_at: datetime
+    operation: Operation
+    dataset_identifier: Identifier
+    ended_at: Optional[datetime] = None
+    persisted_file_count: int = 0
+    bytes_retrieved: int = 0
+    last_modified: Optional[datetime] = None
+    status: TaskStatus = TaskStatus.RUNNING
+    timings: List[Timing] = Field(default_factory=list)
+
+    @field_validator("dataset_identifier", mode="before")
+    @classmethod
+    def ensure_list(cls, value) -> Identifier:
+        if not isinstance(value, Identifier):
+            return Identifier(**value)
+        return value
+
+    def record_load_file(self, fn, metadata: dict):
+        start = utcnow()
+        try:
+            result = None
+            return fn()
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=f"Load of {metadata.get('file_id', 'file')}",
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    @classmethod
+    @contextmanager
+    def new(cls, task_id: str, operation: Operation, dataset_identifier: Identifier):
+        start = utcnow()
+        task_summary = cls(
+            task_id=task_id,
+            started_at=start,
+            operation=operation,
+            dataset_identifier=dataset_identifier,
+        )
+        try:
+            yield task_summary
+
+            task_summary.set_status(TaskStatus.FINISHED)
+        except Exception as e:
+            logger.exception(f"Failed to execute task.")
+            task_summary.set_status(TaskStatus.FAILED)
+
+            # When the error comes from our own code, make sure it will be raised to the highest level
+            # raise
+            if isinstance(e, IngestifyError):
+                raise
+        finally:
+            task_summary.ended_at = utcnow()
+
+    @classmethod
+    def update(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.UPDATE, dataset_identifier)
+
+    @classmethod
+    def create(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.CREATE, dataset_identifier)
+
+    def set_stats_from_revision(self, revision: Optional["Revision"]):
+        if revision:
+            self.persisted_file_count = len(revision.modified_files)
+            self.bytes_retrieved = sum(file.size for file in revision.modified_files)
+            self.last_modified = max(
+                file.modified_at for file in revision.modified_files
+            )
+        else:
+            self.status = TaskStatus.FINISHED_IGNORED
+
+    def set_status(self, status: TaskStatus):
+        if self.status == TaskStatus.RUNNING:
+            self.status = status
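Because `TaskSummary.new` is a classmethod wrapped in `@contextmanager`, the `create`/`update` helpers hand back a context manager that finalizes `status` and `ended_at` on exit. A minimal usage sketch based only on the hunk above (the `Identifier` keyword arguments here are illustrative, not taken from this diff):

from ingestify.domain.models.dataset.identifier import Identifier
from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus

# On a clean exit the status moves RUNNING -> FINISHED; an exception flips it
# to FAILED (and an IngestifyError is re-raised). ended_at is always stamped.
with TaskSummary.create("task-1", Identifier(match_id=3788741)) as task_summary:
    content = task_summary.record_load_file(
        lambda: b"...",                      # the actual load callable
        metadata={"file_id": "match"},       # a Timing entry is appended per file
    )

assert task_summary.status == TaskStatus.FINISHED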
ingestify/domain/models/timing.py
ADDED
@@ -0,0 +1,16 @@
+from datetime import datetime
+from typing import Optional, Any
+from pydantic import BaseModel, ConfigDict
+
+
+class Timing(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    name: str
+    started_at: datetime
+    ended_at: datetime
+    metadata: Optional[dict[str, Any]] = None
+
+    @property
+    def duration(self):
+        return self.ended_at - self.started_at
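`duration` is a derived value, simply the difference of the two datetimes; a quick illustration:

from datetime import datetime, timedelta, timezone

start = datetime(2025, 1, 1, 12, 0, tzinfo=timezone.utc)
timing = Timing(name="fetch events.json", started_at=start, ended_at=start + timedelta(seconds=3))
# duration is a plain property (a timedelta), so it is not part of model_dump output
assert timing.duration == timedelta(seconds=3)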
ingestify/domain/services/identifier_key_transformer.py
ADDED
@@ -0,0 +1,111 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Callable, Optional, Union
+
+from ingestify.exceptions import IngestifyError
+
+
+class TransformationType(Enum):
+    IDENTITY = "IDENTITY"
+    BUCKET = "BUCKET"
+    RANGE = "RANGE"
+    CUSTOM = "CUSTOM"
+
+
+class Transformation(ABC):
+    @property
+    @abstractmethod
+    def transformation_type(self) -> TransformationType:
+        pass
+
+    def is_identity(self) -> bool:
+        return self.transformation_type == TransformationType.IDENTITY
+
+    @abstractmethod
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        pass
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "Transformation":
+        type_ = config.pop("type")
+        if type_ == "bucket":
+            return BucketTransformation(**config)
+        else:
+            raise IngestifyError(f"Cannot build Transformation from {config}")
+
+
+class IdentityTransformation(Transformation):
+    transformation_type = TransformationType.IDENTITY
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        # Return the original value as a string
+        return str(id_key_value)
+
+
+class BucketTransformation(Transformation):
+    transformation_type = TransformationType.BUCKET
+
+    def __init__(self, bucket_size: int = None, bucket_count: int = None):
+        self.bucket_size = bucket_size
+        self.bucket_count = bucket_count
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        if self.bucket_count:
+            return str(int(id_key_value) % self.bucket_count)
+        elif self.bucket_size:
+            bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
+            bucket_end = bucket_start + self.bucket_size - 1
+            return f"{bucket_start}-{bucket_end}"
+        else:
+            raise IngestifyError("Invalid BucketTransformation")
+
+
+class IdentifierTransformer:
+    def __init__(self):
+        # Mapping of (provider, dataset_type, id_key) to the transformation
+        self.key_transformations: dict[tuple[str, str, str], Transformation] = {}
+
+    def register_transformation(
+        self,
+        provider: str,
+        dataset_type: str,
+        id_key: str,
+        transformation: Union[Transformation, dict],
+    ):
+        """
+        Registers a transformation for a specific (provider, dataset_type, id_key).
+        """
+        if isinstance(transformation, dict):
+            transformation = Transformation.from_dict(transformation)
+
+        self.key_transformations[(provider, dataset_type, id_key)] = transformation
+
+    def get_transformation(
+        self, provider: str, dataset_type: str, id_key: str
+    ) -> Transformation:
+        """
+        Retrieves the transformation for the given column or defaults to identity.
+        """
+        transformation = self.key_transformations.get((provider, dataset_type, id_key))
+        return transformation if transformation else IdentityTransformation()
+
+    def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
+        """
+        Transforms the identifier into a path string using registered transformations.
+        For non-identity transformations, includes both transformed and original values,
+        with the transformed value appearing first and including the suffix.
+        """
+        path_parts = []
+        for key, value in identifier.items():
+            transformation = self.get_transformation(provider, dataset_type, key)
+            if not transformation.is_identity():
+                # Non-identity transformation: include both transformed and original
+                transformed_value = transformation(value)
+                suffix = transformation.transformation_type.value.lower()
+                path_parts.append(f"{key}_{suffix}={transformed_value}")
+
+            # Append the original value (either standalone for identity or alongside transformed)
+            path_parts.append(f"{key}={value}")
+
+        # Join the parts with `/` to form the full path
+        return "/".join(path_parts)
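The effect of `to_path` is easiest to see with a registered bucket transformation; a small sketch using only the classes above (the example keys and values are made up):

transformer = IdentifierTransformer()
transformer.register_transformation(
    provider="statsbomb",
    dataset_type="match",
    id_key="match_id",
    transformation={"type": "bucket", "bucket_size": 1000},
)

path = transformer.to_path(
    "statsbomb",
    "match",
    {"competition_id": 11, "season_id": 90, "match_id": 3788741},
)
# The bucketed key contributes two segments: the bucket first, then the raw value:
# "competition_id=11/season_id=90/match_id_bucket=3788000-3788999/match_id=3788741"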
ingestify/infra/fetch/http.py
CHANGED
@@ -69,7 +69,12 @@ def retrieve_http(
         else:
             raise Exception(f"Don't know how to use {key}")

+    ignore_not_found = http_kwargs.pop("ignore_not_found", False)
+
     response = get_session().get(url, headers=headers, **http_kwargs)
+    if response.status_code == 404 and ignore_not_found:
+        return None
+
     response.raise_for_status()
     if response.status_code == 304:
         # Not modified
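This gives callers a way to mark a resource as optional: the flag is popped off `http_kwargs` before the request is made, and a 404 then yields `None` instead of an exception. A hedged sketch of the resulting call shape (argument names beyond what the hunk shows are assumptions):

# Hypothetical call: the 360 feed may simply not exist for some matches.
file = retrieve_http(url, ignore_not_found=True)
if file is None:
    pass  # treat the missing feed as "no file", not as an error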
ingestify/infra/source/statsbomb_github.py
CHANGED
@@ -1,22 +1,22 @@
-import json
 from datetime import datetime

 import requests

-from ingestify import Source,
-from ingestify.domain import DraftFile
+from ingestify import Source, DatasetResource
 from ingestify.domain.models.dataset.dataset import DatasetState

 BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+DATA_SPEC_VERSION = "v1-open-data"


 class StatsbombGithub(Source):
     provider = "statsbomb"

-    def discover_selectors(self, dataset_type: str
+    def discover_selectors(self, dataset_type: str):
         assert dataset_type == "match"

         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+
         return [
             dict(
                 competition_id=competition["competition_id"],
@@ -25,68 +25,81 @@ class StatsbombGithub(Source):
             for competition in competitions
         ]

-    def
+    def find_datasets(
         self,
-        dataset_type,
-        competition_id: str
-        season_id: str
+        dataset_type: str,
+        competition_id: str,
+        season_id: str,
+        match_id: str = None,
         data_spec_versions=None,
+        dataset_collection_metadata=None,
     ):
         assert dataset_type == "match"

-        datasets = []
-
         matches = requests.get(
             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
         ).json()

         for match in matches:
-
-
-
-            last_updated += "Z"
-
-            last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
-
-            dataset = dict(
-                competition_id=competition_id,
-                season_id=season_id,
-                match_id=match["match_id"],
-                _last_modified=last_modified,
-                _match=match,
-                _metadata=match,
-                _state=DatasetState.COMPLETE,
-            )
-            datasets.append(dataset)
-        return datasets
+            if match_id:
+                if match["match_id"] != match_id:
+                    continue

-
-        self, dataset_type, identifier, current_revision, data_spec_versions
-    ):
-        assert dataset_type == "match"
+            last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")

-
-
-
-            (
-
-
-
-
-
-
-
-
+            # Open data is always complete.. I guess?
+            state = DatasetState.COMPLETE
+
+            name = (
+                f"{match['match_date']} / "
+                f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+            )
+
+            dataset_resource = DatasetResource(
+                dataset_resource_id=dict(
+                    competition_id=competition_id,
+                    season_id=season_id,
+                    match_id=match["match_id"],
+                ),
+                dataset_type=dataset_type,
+                provider=self.provider,
+                name=name,
+                metadata=match,
+                state=state,
             )

-
-
-
-
-
-
-            )
+            dataset_resource.add_file(
+                last_modified=last_modified,
+                data_feed_key="match",
+                data_spec_version=DATA_SPEC_VERSION,
+                json_content=match,
+            )

-
+            if state.is_complete:
+                name += f" / {match['home_score']}-{match['away_score']}"
+
+            for data_feed_key in ["lineups", "events"]:
+                dataset_resource.add_file(
+                    last_modified=last_modified,
+                    data_feed_key=data_feed_key,
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
+
+            if (
+                match["last_updated_360"]
+                and match["match_status_360"] == "available"
+            ):
+                dataset_resource.add_file(
+                    last_modified=datetime.fromisoformat(
+                        match["last_updated_360"] + "+00:00"
+                    ),
+                    data_feed_key="360-frames",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                    data_serialization_format="json",
+                    http_options={"ignore_not_found": True},
+                )
+
+            yield dataset_resource
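Note the structural change: `find_datasets` no longer builds and returns a list of plain dicts; it yields one `DatasetResource` per match, with the files declared up front. A usage sketch under that assumption (how the source instance is constructed is not shown in this diff):

# given a configured StatsbombGithub instance `source`:
for dataset_resource in source.find_datasets(
    dataset_type="match",
    competition_id="11",
    season_id="90",
):
    print(dataset_resource.name)  # e.g. "2019-05-01 / Barcelona - Liverpool"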
ingestify/infra/store/dataset/sqlalchemy/mapping.py
CHANGED
@@ -1,5 +1,7 @@
 import datetime
+from dataclasses import is_dataclass, asdict
 from pathlib import Path
+from typing import Optional

 from sqlalchemy import (
     JSON,
@@ -13,11 +15,37 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Boolean,
 )
 from sqlalchemy.orm import registry, relationship

+from ingestify.domain import Selector, Identifier, DataSpecVersionCollection
 from ingestify.domain.models import Dataset, File, Revision
 from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.domain.models.dataset.revision import RevisionState
+
+
+def JSONType(serializer=None, deserializer=None):
+    class _JsonType(TypeDecorator):
+        cache_ok = True
+        impl = JSON
+
+        def process_bind_param(self, value, dialect):
+            if serializer is not None:
+                return serializer(value)
+            return value
+
+        def process_result_value(self, value, dialect):
+            if deserializer is not None:
+                return deserializer(value)
+            return value
+
+    return _JsonType


 class TZDateTime(TypeDecorator):
@@ -25,7 +53,10 @@ class TZDateTime(TypeDecorator):
     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
     cache_ok = True

-    def process_bind_param(self, value: datetime, dialect):
+    def process_bind_param(self, value: Optional[datetime.datetime], dialect):
+        if not value:
+            return None
+
         if value.tzinfo is None:
             value = value.astimezone(self.LOCAL_TIMEZONE)

@@ -67,6 +98,45 @@ class DatasetStateString(TypeDecorator):
         return DatasetState[value]


+class RevisionStateString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: RevisionState, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return RevisionState[value]
+
+
+class OperationString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: Operation, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return Operation[value]
+
+
+class TaskStatusString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: TaskStatus, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return TaskStatus[value]
+
+
 mapper_registry = registry()

 metadata = MetaData()
@@ -80,7 +150,7 @@ dataset_table = Table(
     Column("dataset_type", String(255)),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier",
+    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
@@ -95,7 +165,10 @@ revision_table = Table(
     Column("revision_id", Integer, primary_key=True),
     Column("description", String(255)),
     Column("created_at", TZDateTime(6)),
+    Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    Column("source", JSONType()),
 )
+
 file_table = Table(
     "file",
     metadata,
@@ -129,7 +202,7 @@ mapper_registry.map_imperatively(
             Revision,
             backref="dataset",
             order_by=revision_table.c.revision_id,
-            lazy="
+            lazy="selectin",
             cascade="all, delete-orphan",
         ),
     },
@@ -143,7 +216,7 @@ mapper_registry.map_imperatively(
             File,
             order_by=file_table.c.file_id,
             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
-            lazy="
+            lazy="selectin",
             cascade="all, delete-orphan",
         )
     },
@@ -151,3 +224,113 @@


 mapper_registry.map_imperatively(File, file_table)
+
+
+ingestion_job_summary = Table(
+    "ingestion_job_summary",
+    metadata,
+    Column("ingestion_job_summary_id", String(255), primary_key=True),
+    Column("ingestion_job_id", String(255), index=True),
+    # From the IngestionPlan
+    Column("source_name", String(255)),
+    Column("provider", String(255)),
+    Column("dataset_type", String(255)),
+    Column(
+        "data_spec_versions",
+        JSONType(
+            serializer=lambda data_spec_versions: data_spec_versions.to_dict(),
+            deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                data_spec_versions
+            ),
+        ),
+    ),
+    Column(
+        "selector", JSONType(serializer=lambda selector: selector.filtered_attributes)
+    ),
+    Column("started_at", TZDateTime(6)),
+    Column("finished_at", TZDateTime(6)),
+    # Some task counters
+    Column("successful_tasks", Integer),
+    Column("ignored_successful_tasks", Integer),
+    Column("skipped_datasets", Integer),
+    Column("failed_tasks", Integer),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column(
+    #     "task_summaries",
+    #     JSONType(
+    #         serializer=lambda task_summaries: [
+    #             task_summary.model_dump(mode="json") for task_summary in task_summaries
+    #         ],
+    #         deserializer=lambda task_summaries: [
+    #             TaskSummary.model_validate(task_summary)
+    #             for task_summary in task_summaries
+    #         ],
+    #     ),
+    # ),
+)
+
+
+task_summary_table = Table(
+    "task_summary",
+    metadata,
+    Column(
+        "ingestion_job_summary_id",
+        String(255),
+        ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
+        primary_key=True,
+    ),
+    Column("task_id", Integer, primary_key=True),
+    Column("started_at", TZDateTime(6)),
+    Column("ended_at", TZDateTime(6)),
+    Column("operation", OperationString),
+    Column(
+        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+    ),
+    Column("persisted_file_count", Integer),
+    Column("bytes_retrieved", Integer),
+    Column("last_modified", TZDateTime(6)),
+    Column("status", TaskStatusString),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column("description", String(255)),
+    # Column("created_at", TZDateTime(6)),
+    # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    # Column("source", JSONType()),
+)
+
+
+mapper_registry.map_imperatively(
+    IngestionJobSummary,
+    ingestion_job_summary,
+    properties={
+        "task_summaries": relationship(
+            TaskSummary,
+            backref="ingestion_job_summary",
+            # order_by=task_summary_table.c.revision_id,
+            lazy="selectin",
+            cascade="all, delete-orphan",
+        ),
+    },
+)
+
+
+mapper_registry.map_imperatively(TaskSummary, task_summary_table)
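One detail worth calling out in this hunk: `JSONType` is a factory rather than a type, so each call mints a fresh `TypeDecorator` subclass that closes over its own serializer/deserializer pair. A sketch of the pattern, reusing lambdas already shown in the tables above:

# Each column gets its own JSON codec; the lambdas mirror those in the diff.
TimingsJSON = JSONType(
    serializer=lambda timings: [t.model_dump(mode="json") for t in timings],
    deserializer=lambda timings: [Timing.model_validate(t) for t in timings],
)

Column(
    "timings",
    TimingsJSON,  # the factory returns a class; SQLAlchemy accepts a type class as a column type
)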