ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +47 -36
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +71 -241
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +29 -28
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. ingestify/infra/fetch/http.py +5 -0
  29. ingestify/infra/source/statsbomb_github.py +67 -54
  30. ingestify/infra/store/dataset/__init__.py +0 -2
  31. ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
  32. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  33. ingestify/infra/store/file/local_file_repository.py +3 -5
  34. ingestify/infra/store/file/s3_file_repository.py +4 -9
  35. ingestify/main.py +64 -25
  36. ingestify/utils.py +15 -78
  37. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
  38. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
  39. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
  40. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  41. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/ingestify/domain/models/task/task_summary.py
@@ -0,0 +1,118 @@
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from enum import Enum
+from typing import Optional, List
+from pydantic import Field, field_validator
+
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.dataset.identifier import Identifier
+from ingestify.domain.models.timing import Timing
+from ingestify.exceptions import IngestifyError
+from ingestify.utils import utcnow
+
+
+logger = logging.getLogger(__name__)
+
+
+class TaskStatus(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
+    FAILED = "FAILED"
+
+
+class Operation(str, Enum):
+    CREATE = "CREATE"
+    UPDATE = "UPDATE"
+
+
+class TaskSummary(BaseModel):
+    task_id: str
+    started_at: datetime
+    operation: Operation
+    dataset_identifier: Identifier
+    ended_at: Optional[datetime] = None
+    persisted_file_count: int = 0
+    bytes_retrieved: int = 0
+    last_modified: Optional[datetime] = None
+    status: TaskStatus = TaskStatus.RUNNING
+    timings: List[Timing] = Field(default_factory=list)
+
+    @field_validator("dataset_identifier", mode="before")
+    @classmethod
+    def ensure_list(cls, value) -> Identifier:
+        if not isinstance(value, Identifier):
+            return Identifier(**value)
+        return value
+
+    def record_load_file(self, fn, metadata: dict):
+        start = utcnow()
+        try:
+            result = None
+            return fn()
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=f"Load of {metadata.get('file_id', 'file')}",
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    @classmethod
+    @contextmanager
+    def new(cls, task_id: str, operation: Operation, dataset_identifier: Identifier):
+        start = utcnow()
+        task_summary = cls(
+            task_id=task_id,
+            started_at=start,
+            operation=operation,
+            dataset_identifier=dataset_identifier,
+        )
+        try:
+            yield task_summary
+
+            task_summary.set_status(TaskStatus.FINISHED)
+        except Exception as e:
+            logger.exception(f"Failed to execute task.")
+            task_summary.set_status(TaskStatus.FAILED)
+
+            # When the error comes from our own code, make sure it will be raised to the highest level
+            # raise
+            if isinstance(e, IngestifyError):
+                raise
+        finally:
+            task_summary.ended_at = utcnow()
+
+    @classmethod
+    def update(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.UPDATE, dataset_identifier)
+
+    @classmethod
+    def create(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.CREATE, dataset_identifier)
+
+    def set_stats_from_revision(self, revision: Optional["Revision"]):
+        if revision:
+            self.persisted_file_count = len(revision.modified_files)
+            self.bytes_retrieved = sum(file.size for file in revision.modified_files)
+            self.last_modified = max(
+                file.modified_at for file in revision.modified_files
+            )
+        else:
+            self.status = TaskStatus.FINISHED_IGNORED
+
+    def set_status(self, status: TaskStatus):
+        if self.status == TaskStatus.RUNNING:
+            self.status = status
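A minimal usage sketch of the new TaskSummary context manager; the identifier keys, task id and loader callable below are illustrative, not taken from this diff:

from ingestify.domain.models.dataset.identifier import Identifier
from ingestify.domain.models.task.task_summary import TaskSummary

# Illustrative identifier; the real keys depend on provider and dataset_type.
identifier = Identifier(competition_id=11, season_id=90, match_id=3788741)

with TaskSummary.create("task-1", identifier) as task_summary:
    # record_load_file runs the callable, times it, and appends a Timing entry
    # (including exception details on failure) to task_summary.timings.
    payload = task_summary.record_load_file(
        lambda: b'{"events": []}',
        metadata={"file_id": "events__v1-open-data"},
    )

# On a clean exit the status becomes FINISHED; an exception inside the block
# marks it FAILED, and IngestifyError subclasses are re-raised.
print(task_summary.status, task_summary.ended_at)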
--- /dev/null
+++ b/ingestify/domain/models/timing.py
@@ -0,0 +1,16 @@
+from datetime import datetime
+from typing import Optional, Any
+from pydantic import BaseModel, ConfigDict
+
+
+class Timing(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    name: str
+    started_at: datetime
+    ended_at: datetime
+    metadata: Optional[dict[str, Any]] = None
+
+    @property
+    def duration(self):
+        return self.ended_at - self.started_at
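For reference, duration is simply the difference between the two timestamps; a tiny example assuming the model above:

from datetime import datetime, timezone

from ingestify.domain.models.timing import Timing

timing = Timing(
    name="Load of events__v1-open-data",
    started_at=datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc),
    ended_at=datetime(2024, 1, 1, 12, 0, 3, tzinfo=timezone.utc),
)
print(timing.duration)  # 0:00:03, a datetime.timedelta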
--- /dev/null
+++ b/ingestify/domain/services/identifier_key_transformer.py
@@ -0,0 +1,111 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Callable, Optional, Union
+
+from ingestify.exceptions import IngestifyError
+
+
+class TransformationType(Enum):
+    IDENTITY = "IDENTITY"
+    BUCKET = "BUCKET"
+    RANGE = "RANGE"
+    CUSTOM = "CUSTOM"
+
+
+class Transformation(ABC):
+    @property
+    @abstractmethod
+    def transformation_type(self) -> TransformationType:
+        pass
+
+    def is_identity(self) -> bool:
+        return self.transformation_type == TransformationType.IDENTITY
+
+    @abstractmethod
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        pass
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "Transformation":
+        type_ = config.pop("type")
+        if type_ == "bucket":
+            return BucketTransformation(**config)
+        else:
+            raise IngestifyError(f"Cannot build Transformation from {config}")
+
+
+class IdentityTransformation(Transformation):
+    transformation_type = TransformationType.IDENTITY
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        # Return the original value as a string
+        return str(id_key_value)
+
+
+class BucketTransformation(Transformation):
+    transformation_type = TransformationType.BUCKET
+
+    def __init__(self, bucket_size: int = None, bucket_count: int = None):
+        self.bucket_size = bucket_size
+        self.bucket_count = bucket_count
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        if self.bucket_count:
+            return str(int(id_key_value) % self.bucket_count)
+        elif self.bucket_size:
+            bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
+            bucket_end = bucket_start + self.bucket_size - 1
+            return f"{bucket_start}-{bucket_end}"
+        else:
+            raise IngestifyError("Invalid BucketTransformation")
+
+
+class IdentifierTransformer:
+    def __init__(self):
+        # Mapping of (provider, dataset_type, id_key) to the transformation
+        self.key_transformations: dict[tuple[str, str, str], Transformation] = {}
+
+    def register_transformation(
+        self,
+        provider: str,
+        dataset_type: str,
+        id_key: str,
+        transformation: Union[Transformation, dict],
+    ):
+        """
+        Registers a transformation for a specific (provider, dataset_type, id_key).
+        """
+        if isinstance(transformation, dict):
+            transformation = Transformation.from_dict(transformation)
+
+        self.key_transformations[(provider, dataset_type, id_key)] = transformation
+
+    def get_transformation(
+        self, provider: str, dataset_type: str, id_key: str
+    ) -> Transformation:
+        """
+        Retrieves the transformation for the given column or defaults to identity.
+        """
+        transformation = self.key_transformations.get((provider, dataset_type, id_key))
+        return transformation if transformation else IdentityTransformation()
+
+    def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
+        """
+        Transforms the identifier into a path string using registered transformations.
+        For non-identity transformations, includes both transformed and original values,
+        with the transformed value appearing first and including the suffix.
+        """
+        path_parts = []
+        for key, value in identifier.items():
+            transformation = self.get_transformation(provider, dataset_type, key)
+            if not transformation.is_identity():
+                # Non-identity transformation: include both transformed and original
+                transformed_value = transformation(value)
+                suffix = transformation.transformation_type.value.lower()
+                path_parts.append(f"{key}_{suffix}={transformed_value}")
+
+            # Append the original value (either standalone for identity or alongside transformed)
+            path_parts.append(f"{key}={value}")
+
+        # Join the parts with `/` to form the full path
+        return "/".join(path_parts)
--- a/ingestify/infra/fetch/http.py
+++ b/ingestify/infra/fetch/http.py
@@ -69,7 +69,12 @@ def retrieve_http(
     else:
         raise Exception(f"Don't know how to use {key}")
 
+    ignore_not_found = http_kwargs.pop("ignore_not_found", False)
+
     response = get_session().get(url, headers=headers, **http_kwargs)
+    if response.status_code == 404 and ignore_not_found:
+        return None
+
     response.raise_for_status()
     if response.status_code == 304:
        # Not modified
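The StatsBomb source below passes http_options={"ignore_not_found": True} for 360 data, which ends up in http_kwargs here. A standalone sketch of the same treat-404-as-missing pattern using plain requests (not the ingestify API itself):

import requests

def get_optional_json(url: str, **http_kwargs):
    # Mirrors the change above: a 404 means "no data yet" rather than an error.
    ignore_not_found = http_kwargs.pop("ignore_not_found", False)
    response = requests.get(url, **http_kwargs)
    if response.status_code == 404 and ignore_not_found:
        return None
    response.raise_for_status()
    return response.json()

# The match id here is only an example value.
frames = get_optional_json(
    "https://raw.githubusercontent.com/statsbomb/open-data/master/data/three-sixty/3788741.json",
    ignore_not_found=True,
)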
--- a/ingestify/infra/source/statsbomb_github.py
+++ b/ingestify/infra/source/statsbomb_github.py
@@ -1,22 +1,22 @@
-import json
 from datetime import datetime
 
 import requests
 
-from ingestify import Source, retrieve_http
-from ingestify.domain import DraftFile
+from ingestify import Source, DatasetResource
 from ingestify.domain.models.dataset.dataset import DatasetState
 
 BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+DATA_SPEC_VERSION = "v1-open-data"
 
 
 class StatsbombGithub(Source):
     provider = "statsbomb"
 
-    def discover_selectors(self, dataset_type: str, data_spec_versions: None = None):
+    def discover_selectors(self, dataset_type: str):
         assert dataset_type == "match"
 
         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+
         return [
             dict(
                 competition_id=competition["competition_id"],
@@ -25,68 +25,81 @@ class StatsbombGithub(Source):
             for competition in competitions
         ]
 
-    def discover_datasets(
+    def find_datasets(
         self,
-        dataset_type,
-        competition_id: str = None,
-        season_id: str = None,
+        dataset_type: str,
+        competition_id: str,
+        season_id: str,
+        match_id: str = None,
         data_spec_versions=None,
+        dataset_collection_metadata=None,
     ):
         assert dataset_type == "match"
 
-        datasets = []
-
         matches = requests.get(
             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
         ).json()
 
         for match in matches:
-            last_updated = match["last_updated"]
-            if "Z" not in last_updated:
-                # Assume UTC
-                last_updated += "Z"
-
-            last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
-
-            dataset = dict(
-                competition_id=competition_id,
-                season_id=season_id,
-                match_id=match["match_id"],
-                _last_modified=last_modified,
-                _match=match,
-                _metadata=match,
-                _state=DatasetState.COMPLETE,
-            )
-            datasets.append(dataset)
-        return datasets
+            if match_id:
+                if match["match_id"] != match_id:
+                    continue
 
-    def fetch_dataset_files(
-        self, dataset_type, identifier, current_revision, data_spec_versions
-    ):
-        assert dataset_type == "match"
+            last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
 
-        current_files = current_revision.modified_files_map if current_revision else {}
-        files = {}
-        for filename, url in [
-            ("lineups.json", f"{BASE_URL}/lineups/{identifier.match_id}.json"),
-            ("events.json", f"{BASE_URL}/events/{identifier.match_id}.json"),
-        ]:
-            data_feed_key = filename.split(".")[0]
-            file_id = data_feed_key + "__v1"
-            files[file_id] = retrieve_http(
-                url,
-                current_files.get(filename),
-                file_data_feed_key=data_feed_key,
-                file_data_spec_version="v1",
-                file_data_serialization_format="json",
+            # Open data is always complete.. I guess?
+            state = DatasetState.COMPLETE
+
+            name = (
+                f"{match['match_date']} / "
+                f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+            )
+
+            dataset_resource = DatasetResource(
+                dataset_resource_id=dict(
+                    competition_id=competition_id,
+                    season_id=season_id,
+                    match_id=match["match_id"],
+                ),
+                dataset_type=dataset_type,
+                provider=self.provider,
+                name=name,
+                metadata=match,
+                state=state,
             )
 
-        files["match__v1"] = DraftFile.from_input(
-            json.dumps(identifier._match, indent=4),
-            data_feed_key="match",
-            data_spec_version="v1",
-            data_serialization_format="json",
-            modified_at=None,
-        )
+            dataset_resource.add_file(
+                last_modified=last_modified,
+                data_feed_key="match",
+                data_spec_version=DATA_SPEC_VERSION,
+                json_content=match,
+            )
 
-        return files
+            if state.is_complete:
+                name += f" / {match['home_score']}-{match['away_score']}"
+
+            for data_feed_key in ["lineups", "events"]:
+                dataset_resource.add_file(
+                    last_modified=last_modified,
+                    data_feed_key=data_feed_key,
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
+
+            if (
+                match["last_updated_360"]
+                and match["match_status_360"] == "available"
+            ):
+                dataset_resource.add_file(
+                    last_modified=datetime.fromisoformat(
+                        match["last_updated_360"] + "+00:00"
+                    ),
+                    data_feed_key="360-frames",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
                    data_serialization_format="json",
+                    http_options={"ignore_not_found": True},
+                )
+
+            yield dataset_resource
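A hedged usage sketch of the rewritten source; how Source subclasses are constructed is not shown in this diff, so the constructor call below is an assumption:

# Assumed constructor; the real Source base class may expect different arguments.
source = StatsbombGithub(name="statsbomb_github")

for dataset_resource in source.find_datasets(
    dataset_type="match",
    competition_id="11",
    season_id="90",
):
    # Each DatasetResource carries the identifier, match metadata, state and the
    # files registered via add_file (match, lineups, events, optional 360-frames).
    print(dataset_resource.name)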
--- a/ingestify/infra/store/dataset/__init__.py
+++ b/ingestify/infra/store/dataset/__init__.py
@@ -1,2 +0,0 @@
-from .local_dataset_repository import LocalDatasetRepository
-from .sqlalchemy import SqlAlchemyDatasetRepository
--- a/ingestify/infra/store/dataset/sqlalchemy/mapping.py
+++ b/ingestify/infra/store/dataset/sqlalchemy/mapping.py
@@ -1,5 +1,7 @@
 import datetime
+from dataclasses import is_dataclass, asdict
 from pathlib import Path
+from typing import Optional
 
 from sqlalchemy import (
     JSON,
@@ -13,11 +15,37 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Boolean,
 )
 from sqlalchemy.orm import registry, relationship
 
+from ingestify.domain import Selector, Identifier, DataSpecVersionCollection
 from ingestify.domain.models import Dataset, File, Revision
 from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.domain.models.dataset.revision import RevisionState
+
+
+def JSONType(serializer=None, deserializer=None):
+    class _JsonType(TypeDecorator):
+        cache_ok = True
+        impl = JSON
+
+        def process_bind_param(self, value, dialect):
+            if serializer is not None:
+                return serializer(value)
+            return value
+
+        def process_result_value(self, value, dialect):
+            if deserializer is not None:
+                return deserializer(value)
+            return value
+
+    return _JsonType
 
 
 class TZDateTime(TypeDecorator):
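The JSONType factory above is what the identifier, data_spec_versions and timings columns later in this file build on. A minimal illustration of the pattern, reusing the JSONType factory and the Identifier import from the hunk above with a made-up table:

from sqlalchemy import Column, MetaData, String, Table

example_metadata = MetaData()
example_table = Table(
    "example",
    example_metadata,
    Column("example_id", String(36), primary_key=True),
    Column(
        "payload",
        # serializer runs when writing to the database, deserializer when reading.
        JSONType(
            serializer=lambda payload: dict(payload),
            deserializer=lambda raw: Identifier(**raw),
        ),
    ),
)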
@@ -25,7 +53,10 @@ class TZDateTime(TypeDecorator):
     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
     cache_ok = True
 
-    def process_bind_param(self, value: datetime, dialect):
+    def process_bind_param(self, value: Optional[datetime.datetime], dialect):
+        if not value:
+            return None
+
         if value.tzinfo is None:
             value = value.astimezone(self.LOCAL_TIMEZONE)
 
@@ -67,6 +98,45 @@ class DatasetStateString(TypeDecorator):
         return DatasetState[value]
 
 
+class RevisionStateString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: RevisionState, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return RevisionState[value]
+
+
+class OperationString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: Operation, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return Operation[value]
+
+
+class TaskStatusString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: TaskStatus, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return TaskStatus[value]
+
+
 mapper_registry = registry()
 
 metadata = MetaData()
@@ -80,7 +150,7 @@ dataset_table = Table(
     Column("dataset_type", String(255)),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier", JSON),
+    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
@@ -95,7 +165,10 @@ revision_table = Table(
     Column("revision_id", Integer, primary_key=True),
     Column("description", String(255)),
     Column("created_at", TZDateTime(6)),
+    Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    Column("source", JSONType()),
 )
+
 file_table = Table(
     "file",
     metadata,
@@ -129,7 +202,7 @@ mapper_registry.map_imperatively(
             Revision,
             backref="dataset",
             order_by=revision_table.c.revision_id,
-            lazy="joined",
+            lazy="selectin",
             cascade="all, delete-orphan",
         ),
     },
@@ -143,7 +216,7 @@ mapper_registry.map_imperatively(
             File,
             order_by=file_table.c.file_id,
             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
-            lazy="joined",
+            lazy="selectin",
             cascade="all, delete-orphan",
         )
     },
@@ -151,3 +224,113 @@
 
 
 mapper_registry.map_imperatively(File, file_table)
+
+
+ingestion_job_summary = Table(
+    "ingestion_job_summary",
+    metadata,
+    Column("ingestion_job_summary_id", String(255), primary_key=True),
+    Column("ingestion_job_id", String(255), index=True),
+    # From the IngestionPlan
+    Column("source_name", String(255)),
+    Column("provider", String(255)),
+    Column("dataset_type", String(255)),
+    Column(
+        "data_spec_versions",
+        JSONType(
+            serializer=lambda data_spec_versions: data_spec_versions.to_dict(),
+            deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                data_spec_versions
+            ),
+        ),
+    ),
+    Column(
+        "selector", JSONType(serializer=lambda selector: selector.filtered_attributes)
+    ),
+    Column("started_at", TZDateTime(6)),
+    Column("finished_at", TZDateTime(6)),
+    # Some task counters
+    Column("successful_tasks", Integer),
+    Column("ignored_successful_tasks", Integer),
+    Column("skipped_datasets", Integer),
+    Column("failed_tasks", Integer),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column(
+    #     "task_summaries",
+    #     JSONType(
+    #         serializer=lambda task_summaries: [
+    #             task_summary.model_dump(mode="json") for task_summary in task_summaries
+    #         ],
+    #         deserializer=lambda task_summaries: [
+    #             TaskSummary.model_validate(task_summary)
+    #             for task_summary in task_summaries
+    #         ],
+    #     ),
+    # ),
+)
+
+
+task_summary_table = Table(
+    "task_summary",
+    metadata,
+    Column(
+        "ingestion_job_summary_id",
+        String(255),
+        ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
+        primary_key=True,
+    ),
+    Column("task_id", Integer, primary_key=True),
+    Column("started_at", TZDateTime(6)),
+    Column("ended_at", TZDateTime(6)),
+    Column("operation", OperationString),
+    Column(
+        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+    ),
+    Column("persisted_file_count", Integer),
+    Column("bytes_retrieved", Integer),
+    Column("last_modified", TZDateTime(6)),
+    Column("status", TaskStatusString),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column("description", String(255)),
+    # Column("created_at", TZDateTime(6)),
+    # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    # Column("source", JSONType()),
+)
+
+
+mapper_registry.map_imperatively(
+    IngestionJobSummary,
+    ingestion_job_summary,
+    properties={
+        "task_summaries": relationship(
+            TaskSummary,
+            backref="ingestion_job_summary",
+            # order_by=task_summary_table.c.revision_id,
+            lazy="selectin",
+            cascade="all, delete-orphan",
+        ),
+    },
+)
+
+
+mapper_registry.map_imperatively(TaskSummary, task_summary_table)
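A minimal sketch of exercising the new mappings end to end; the in-memory engine and the session setup are assumptions for illustration, not part of this diff:

from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)  # metadata as defined in this module

with Session(engine) as session:
    for summary in session.scalars(select(IngestionJobSummary)):
        # task_summaries is loaded via the "selectin" relationship mapped above.
        print(summary.source_name, summary.dataset_type, len(summary.task_summaries))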