ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +44 -24
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +67 -237
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +1 -10
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +292 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
- ingestify/main.py +42 -22
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0
ingestify/infra/fetch/http.py
CHANGED

@@ -69,7 +69,12 @@ def retrieve_http(
         else:
             raise Exception(f"Don't know how to use {key}")

+    ignore_not_found = http_kwargs.pop("ignore_not_found", False)
+
     response = get_session().get(url, headers=headers, **http_kwargs)
+    if response.status_code == 404 and ignore_not_found:
+        return None
+
     response.raise_for_status()
     if response.status_code == 304:
         # Not modified
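The `ignore_not_found` flag added above turns a 404 into a soft miss (return `None`) instead of an exception. A minimal standalone sketch of the same pattern using `requests`; the helper name and the match id in the URL are illustrative, not part of ingestify's API:

import requests

def fetch_json_or_none(url: str, ignore_not_found: bool = True):
    # Mirrors the diff: a 404 returns None, any other error status still raises.
    response = requests.get(url)
    if response.status_code == 404 and ignore_not_found:
        return None
    response.raise_for_status()
    return response.json()

# 360 frames are missing for many StatsBomb open-data matches, so a 404 is expected here.
frames = fetch_json_or_none(
    "https://raw.githubusercontent.com/statsbomb/open-data/master/data/three-sixty/12345.json"
)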
ingestify/infra/source/statsbomb_github.py
CHANGED

@@ -1,22 +1,22 @@
-import json
 from datetime import datetime

 import requests

-from ingestify import Source,
-from ingestify.domain import DraftFile
+from ingestify import Source, DatasetResource
 from ingestify.domain.models.dataset.dataset import DatasetState

 BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+DATA_SPEC_VERSION = "v1-open-data"


 class StatsbombGithub(Source):
     provider = "statsbomb"

-    def discover_selectors(self, dataset_type: str
+    def discover_selectors(self, dataset_type: str):
         assert dataset_type == "match"

         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+
         return [
             dict(
                 competition_id=competition["competition_id"],
@@ -25,68 +25,81 @@ class StatsbombGithub(Source):
             for competition in competitions
         ]

-    def
+    def find_datasets(
         self,
-        dataset_type,
-        competition_id: str
-        season_id: str
+        dataset_type: str,
+        competition_id: str,
+        season_id: str,
+        match_id: str = None,
         data_spec_versions=None,
+        dataset_collection_metadata=None,
     ):
         assert dataset_type == "match"

-        datasets = []
-
         matches = requests.get(
             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
         ).json()

         for match in matches:
-
-
-
-            last_updated += "Z"
-
-            last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
-
-            dataset = dict(
-                competition_id=competition_id,
-                season_id=season_id,
-                match_id=match["match_id"],
-                _last_modified=last_modified,
-                _match=match,
-                _metadata=match,
-                _state=DatasetState.COMPLETE,
-            )
-            datasets.append(dataset)
-        return datasets
+            if match_id:
+                if match["match_id"] != match_id:
+                    continue

-
-        self, dataset_type, identifier, current_revision, data_spec_versions
-    ):
-        assert dataset_type == "match"
+            last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")

-
-
-
-            (
-
-
-
-
-
-
-
-
-
+            # Open data is always complete.. I guess?
+            state = DatasetState.COMPLETE
+
+            name = (
+                f"{match['match_date']} / "
+                f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+            )
+
+            dataset_resource = DatasetResource(
+                dataset_resource_id=dict(
+                    competition_id=competition_id,
+                    season_id=season_id,
+                    match_id=match["match_id"],
+                ),
+                dataset_type=dataset_type,
+                provider=self.provider,
+                name=name,
+                metadata=match,
+                state=state,
             )

-
-
-
-
-
-            )
+            dataset_resource.add_file(
+                last_modified=last_modified,
+                data_feed_key="match",
+                data_spec_version=DATA_SPEC_VERSION,
+                json_content=match,
+            )

-
+            if state.is_complete:
+                name += f" / {match['home_score']}-{match['away_score']}"
+
+            for data_feed_key in ["lineups", "events"]:
+                dataset_resource.add_file(
+                    last_modified=last_modified,
+                    data_feed_key=data_feed_key,
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
+
+            if (
+                match["last_updated_360"]
+                and match["match_status_360"] == "available"
+            ):
+                dataset_resource.add_file(
+                    last_modified=datetime.fromisoformat(
+                        match["last_updated_360"] + "+00:00"
+                    ),
+                    data_feed_key="360-frames",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                    data_serialization_format="json",
+                    http_options={"ignore_not_found": True},
+                )
+
+            yield dataset_resource
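With this rewrite, `find_datasets` is a generator of `DatasetResource` objects (one per match, carrying `match`, `lineups`, `events` and, when available, `360-frames` files) instead of a list of plain dicts. A rough usage sketch; the constructor arguments and the competition/season ids are assumptions for illustration, not taken from the diff:

from ingestify.infra.source.statsbomb_github import StatsbombGithub

source = StatsbombGithub(name="statsbomb_github")  # constructor signature assumed

for dataset_resource in source.find_datasets(
    dataset_type="match",
    competition_id="11",  # example ids, following the open-data repository layout
    season_id="90",
):
    print(dataset_resource.name)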
ingestify/infra/store/dataset/sqlalchemy/mapping.py
CHANGED

@@ -1,5 +1,7 @@
 import datetime
+from dataclasses import is_dataclass, asdict
 from pathlib import Path
+from typing import Optional

 from sqlalchemy import (
     JSON,
@@ -13,11 +15,37 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Boolean,
 )
 from sqlalchemy.orm import registry, relationship

+from ingestify.domain import Selector, Identifier, DataSpecVersionCollection
 from ingestify.domain.models import Dataset, File, Revision
 from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.domain.models.dataset.revision import RevisionState
+
+
+def JSONType(serializer=None, deserializer=None):
+    class _JsonType(TypeDecorator):
+        cache_ok = True
+        impl = JSON
+
+        def process_bind_param(self, value, dialect):
+            if serializer is not None:
+                return serializer(value)
+            return value
+
+        def process_result_value(self, value, dialect):
+            if deserializer is not None:
+                return deserializer(value)
+            return value
+
+    return _JsonType


 class TZDateTime(TypeDecorator):
@@ -25,7 +53,10 @@ class TZDateTime(TypeDecorator):
     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
     cache_ok = True

-    def process_bind_param(self, value: datetime, dialect):
+    def process_bind_param(self, value: Optional[datetime.datetime], dialect):
+        if not value:
+            return None
+
         if value.tzinfo is None:
             value = value.astimezone(self.LOCAL_TIMEZONE)

@@ -67,6 +98,45 @@ class DatasetStateString(TypeDecorator):
         return DatasetState[value]


+class RevisionStateString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: RevisionState, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return RevisionState[value]
+
+
+class OperationString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: Operation, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return Operation[value]
+
+
+class TaskStatusString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: TaskStatus, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return TaskStatus[value]
+
+
 mapper_registry = registry()

 metadata = MetaData()
@@ -80,7 +150,7 @@ dataset_table = Table(
     Column("dataset_type", String(255)),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier",
+    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
@@ -95,7 +165,10 @@ revision_table = Table(
     Column("revision_id", Integer, primary_key=True),
     Column("description", String(255)),
     Column("created_at", TZDateTime(6)),
+    Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    Column("source", JSONType()),
 )
+
 file_table = Table(
     "file",
     metadata,
@@ -129,7 +202,7 @@ mapper_registry.map_imperatively(
             Revision,
             backref="dataset",
             order_by=revision_table.c.revision_id,
-            lazy="
+            lazy="selectin",
             cascade="all, delete-orphan",
         ),
     },
@@ -143,7 +216,7 @@ mapper_registry.map_imperatively(
             File,
             order_by=file_table.c.file_id,
             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
-            lazy="
+            lazy="selectin",
             cascade="all, delete-orphan",
         )
     },
@@ -151,3 +224,110 @@ mapper_registry.map_imperatively(


 mapper_registry.map_imperatively(File, file_table)
+
+
+ingestion_job_summary = Table(
+    "ingestion_job_summary",
+    metadata,
+    Column("ingestion_job_id", String(255), primary_key=True),
+    # From the IngestionPlan
+    Column("source_name", String(255)),
+    Column("dataset_type", String(255)),
+    Column(
+        "data_spec_versions",
+        JSONType(
+            serializer=lambda data_spec_versions: data_spec_versions.to_dict(),
+            deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                data_spec_versions
+            ),
+        ),
+    ),
+    Column(
+        "selector", JSONType(serializer=lambda selector: selector.filtered_attributes)
+    ),
+    Column("started_at", TZDateTime(6)),
+    Column("finished_at", TZDateTime(6)),
+    # Some task counters
+    Column("successful_tasks", Integer),
+    Column("ignored_successful_tasks", Integer),
+    Column("failed_tasks", Integer),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column(
+    #     "task_summaries",
+    #     JSONType(
+    #         serializer=lambda task_summaries: [
+    #             task_summary.model_dump(mode="json") for task_summary in task_summaries
+    #         ],
+    #         deserializer=lambda task_summaries: [
+    #             TaskSummary.model_validate(task_summary)
+    #             for task_summary in task_summaries
+    #         ],
+    #     ),
+    # ),
+)
+
+
+task_summary_table = Table(
+    "task_summary",
+    metadata,
+    Column(
+        "ingestion_job_id",
+        String(255),
+        ForeignKey("ingestion_job_summary.ingestion_job_id"),
+        primary_key=True,
+    ),
+    Column("task_id", Integer, primary_key=True),
+    Column("started_at", TZDateTime(6)),
+    Column("ended_at", TZDateTime(6)),
+    Column("operation", OperationString),
+    Column(
+        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+    ),
+    Column("persisted_file_count", Integer),
+    Column("bytes_retrieved", Integer),
+    Column("last_modified", TZDateTime(6)),
+    Column("status", TaskStatusString),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column("description", String(255)),
+    # Column("created_at", TZDateTime(6)),
+    # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    # Column("source", JSONType()),
+)
+
+
+mapper_registry.map_imperatively(
+    IngestionJobSummary,
+    ingestion_job_summary,
+    properties={
+        "task_summaries": relationship(
+            TaskSummary,
+            backref="ingestion_job_summary",
+            # order_by=task_summary_table.c.revision_id,
+            lazy="selectin",
+            cascade="all, delete-orphan",
+        ),
+    },
+)
+
+
+mapper_registry.map_imperatively(TaskSummary, task_summary_table)
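The `JSONType(serializer=..., deserializer=...)` factory introduced above replaces the engine-wide `json_serializer`/`json_deserializer` hooks removed from the repository module, letting each column declare its own JSON codec. A self-contained sketch of the same pattern, using a stand-in `Identifier` dataclass rather than ingestify's own class:

from dataclasses import dataclass, asdict
from sqlalchemy import JSON, Column, Integer, MetaData, Table, TypeDecorator


def JSONType(serializer=None, deserializer=None):
    # Per-column JSON codec: the closure captures the (de)serializer, so
    # different columns can map their JSON payloads to different objects.
    class _JsonType(TypeDecorator):
        cache_ok = True
        impl = JSON

        def process_bind_param(self, value, dialect):
            return serializer(value) if serializer is not None else value

        def process_result_value(self, value, dialect):
            return deserializer(value) if deserializer is not None else value

    return _JsonType


@dataclass
class Identifier:  # stand-in for ingestify.domain.Identifier
    competition_id: int
    season_id: int


metadata = MetaData()
example_table = Table(
    "example",
    metadata,
    Column("id", Integer, primary_key=True),
    Column(
        "identifier",
        JSONType(serializer=asdict, deserializer=lambda item: Identifier(**item)),
    ),
)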
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -29,22 +29,6 @@ def parse_value(v):
     return v


-def json_serializer(o):
-    return json.dumps(o)
-
-
-def json_deserializer(o):
-    o = json.loads(o)
-    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
-    o = Identifier(**o)
-    return o
-
-
-# @compiles(DateTime, "mysql")
-# def compile_datetime_mysql(type_, compiler, **kw):
-#     return "DATETIME(6)"
-
-
 def isfloat(x):
     try:
         a = float(x)
@@ -64,7 +48,7 @@ def isint(x):
     return a == b


-class
+class SqlAlchemySessionProvider:
     @staticmethod
     def fix_url(url: str) -> str:
         if url.startswith("postgres://"):
@@ -87,8 +71,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
-            json_serializer=json_serializer,
-            json_deserializer=json_deserializer,
         )
         self.session = Session(bind=self.engine)

@@ -107,9 +89,29 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         self.url = state["url"]
         self._init_engine()

+    def _close_engine(self):
+        if hasattr(self, "session"):
+            self.session.close()
+            self.engine.dispose()
+
     def __del__(self):
-        self.
-
+        self._close_engine()
+
+    def reset(self):
+        self._close_engine()
+        self._init_engine()
+
+    def get(self):
+        return self.session
+
+
+class SqlAlchemyDatasetRepository(DatasetRepository):
+    def __init__(self, session_provider: SqlAlchemySessionProvider):
+        self.session_provider = session_provider
+
+    @property
+    def session(self):
+        return self.session_provider.get()

     def _filter_query(
         self,
@@ -208,7 +210,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):

         if not metadata_only:
             dataset_query = apply_query_filter(
-                self.session.query(Dataset).options(joinedload(Dataset.revisions))
+                self.session.query(Dataset)  # .options(joinedload(Dataset.revisions))
             )
             datasets = list(dataset_query)
         else:
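Engine and session management moves out of the repository into `SqlAlchemySessionProvider`, so a single session can be shared by everything that talks to the metadata database. A rough wiring sketch based on how `main.py` (below) uses it; the SQLite URL is only an example:

from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemyDatasetRepository,
    SqlAlchemySessionProvider,
)

# One provider owns the engine and session...
session_provider = SqlAlchemySessionProvider("sqlite:///ingestify.db")

# ...repositories only keep a reference to it.
dataset_repository = SqlAlchemyDatasetRepository(session_provider)

# reset() closes and rebuilds the engine without touching the repositories.
session_provider.reset()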
|
ingestify/main.py
CHANGED
|
@@ -11,19 +11,20 @@ from ingestify import Source
|
|
|
11
11
|
from ingestify.application.dataset_store import DatasetStore
|
|
12
12
|
from ingestify.application.ingestion_engine import IngestionEngine
|
|
13
13
|
from ingestify.application.secrets_manager import SecretsManager
|
|
14
|
-
from ingestify.domain import Selector
|
|
15
|
-
from ingestify.domain.models import (
|
|
16
|
-
dataset_repository_factory,
|
|
17
|
-
file_repository_factory,
|
|
18
|
-
)
|
|
14
|
+
from ingestify.domain import Selector, FileRepository
|
|
19
15
|
from ingestify.domain.models.data_spec_version_collection import (
|
|
20
16
|
DataSpecVersionCollection,
|
|
21
17
|
)
|
|
22
18
|
from ingestify.domain.models.event import EventBus, Publisher, Subscriber
|
|
23
19
|
|
|
24
|
-
from ingestify.domain.models.
|
|
20
|
+
from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
|
|
25
21
|
from ingestify.domain.models.fetch_policy import FetchPolicy
|
|
26
22
|
from ingestify.exceptions import ConfigurationError
|
|
23
|
+
from ingestify.infra import S3FileRepository, LocalFileRepository
|
|
24
|
+
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
|
|
25
|
+
from ingestify.infra.store.dataset.sqlalchemy.repository import (
|
|
26
|
+
SqlAlchemySessionProvider,
|
|
27
|
+
)
|
|
27
28
|
|
|
28
29
|
logger = logging.getLogger(__name__)
|
|
29
30
|
|
|
@@ -59,8 +60,19 @@ def import_cls(name):
|
|
|
59
60
|
return getattr(mod, components[-1])
|
|
60
61
|
|
|
61
62
|
|
|
63
|
+
def build_file_repository(file_url: str) -> FileRepository:
|
|
64
|
+
if file_url.startswith("s3://"):
|
|
65
|
+
repository = S3FileRepository(url=file_url)
|
|
66
|
+
elif file_url.startswith("file://"):
|
|
67
|
+
repository = LocalFileRepository(url=file_url)
|
|
68
|
+
else:
|
|
69
|
+
raise Exception(f"Cannot find repository to handle file {file_url}")
|
|
70
|
+
|
|
71
|
+
return repository
|
|
72
|
+
|
|
73
|
+
|
|
62
74
|
def get_dataset_store_by_urls(
|
|
63
|
-
|
|
75
|
+
metadata_url: str, file_url: str, bucket: str
|
|
64
76
|
) -> DatasetStore:
|
|
65
77
|
"""
|
|
66
78
|
Initialize a DatasetStore by a DatasetRepository and a FileRepository
|
|
@@ -68,15 +80,18 @@ def get_dataset_store_by_urls(
|
|
|
68
80
|
if not bucket:
|
|
69
81
|
raise Exception("Bucket is not specified")
|
|
70
82
|
|
|
71
|
-
file_repository =
|
|
83
|
+
file_repository = build_file_repository(file_url)
|
|
84
|
+
|
|
85
|
+
if secrets_manager.supports(metadata_url):
|
|
86
|
+
metadata_url = secrets_manager.load_as_db_url(metadata_url)
|
|
87
|
+
|
|
88
|
+
if metadata_url.startswith("postgres://"):
|
|
89
|
+
metadata_url = metadata_url.replace("postgress://", "postgress+")
|
|
72
90
|
|
|
73
|
-
|
|
74
|
-
dataset_url = secrets_manager.load_as_db_url(dataset_url)
|
|
91
|
+
sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
|
|
75
92
|
|
|
76
|
-
|
|
77
|
-
dataset_url = dataset_url.replace("postgress://", "postgress+")
|
|
93
|
+
dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
|
|
78
94
|
|
|
79
|
-
dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
|
|
80
95
|
return DatasetStore(
|
|
81
96
|
dataset_repository=dataset_repository,
|
|
82
97
|
file_repository=file_repository,
|
|
@@ -155,7 +170,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
|
|
|
155
170
|
|
|
156
171
|
logger.info("Initializing IngestionEngine")
|
|
157
172
|
store = get_dataset_store_by_urls(
|
|
158
|
-
|
|
173
|
+
metadata_url=config["main"]["metadata_url"],
|
|
159
174
|
file_url=config["main"]["file_url"],
|
|
160
175
|
bucket=bucket or config["main"].get("default_bucket"),
|
|
161
176
|
)
|
|
@@ -177,15 +192,20 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
|
|
|
177
192
|
|
|
178
193
|
fetch_policy = FetchPolicy()
|
|
179
194
|
|
|
180
|
-
|
|
195
|
+
# Previous naming
|
|
196
|
+
ingestion_plans = config.get("extract_jobs", [])
|
|
197
|
+
# New naming
|
|
198
|
+
ingestion_plans.extend(config.get("ingestion_plans", []))
|
|
199
|
+
|
|
200
|
+
for ingestion_plan in ingestion_plans:
|
|
181
201
|
data_spec_versions = DataSpecVersionCollection.from_dict(
|
|
182
|
-
|
|
202
|
+
ingestion_plan.get("data_spec_versions", {"default": {"v1"}})
|
|
183
203
|
)
|
|
184
204
|
|
|
185
|
-
if "selectors" in
|
|
205
|
+
if "selectors" in ingestion_plan:
|
|
186
206
|
selectors = [
|
|
187
207
|
Selector.build(selector, data_spec_versions=data_spec_versions)
|
|
188
|
-
for selector_args in
|
|
208
|
+
for selector_args in ingestion_plan["selectors"]
|
|
189
209
|
for selector in _product_selectors(selector_args)
|
|
190
210
|
]
|
|
191
211
|
else:
|
|
@@ -193,13 +213,13 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
|
|
|
193
213
|
# but makes it easier later one where we loop over selectors.
|
|
194
214
|
selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
|
|
195
215
|
|
|
196
|
-
|
|
197
|
-
source=sources[
|
|
198
|
-
dataset_type=
|
|
216
|
+
ingestion_plan = IngestionPlan(
|
|
217
|
+
source=sources[ingestion_plan["source"]],
|
|
218
|
+
dataset_type=ingestion_plan["dataset_type"],
|
|
199
219
|
selectors=selectors,
|
|
200
220
|
fetch_policy=fetch_policy,
|
|
201
221
|
data_spec_versions=data_spec_versions,
|
|
202
222
|
)
|
|
203
|
-
ingestion_engine.
|
|
223
|
+
ingestion_engine.add_ingestion_plan(ingestion_plan)
|
|
204
224
|
|
|
205
225
|
return ingestion_engine
|
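`get_engine` now reads ingestion plans from either the old `extract_jobs` key or the new `ingestion_plans` key and builds the store from `main.metadata_url` / `main.file_url`. Based only on the keys accessed above, a configuration shaped roughly like this should work under either name; all values are placeholders:

config = {
    "main": {
        "metadata_url": "sqlite:///ingestify.db",  # or a secrets-manager reference / postgres URL
        "file_url": "file:///tmp/ingestify-files",
        "default_bucket": "main",
    },
    # "extract_jobs" (the previous naming) is still read and merged in.
    "ingestion_plans": [
        {
            "source": "statsbomb_github",  # must match a configured source name
            "dataset_type": "match",
            "data_spec_versions": {"default": {"v1-open-data"}},
            "selectors": [
                {"competition_id": 11, "season_id": 90},  # placeholder ids
            ],
        }
    ],
}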