ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +44 -24
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +67 -237
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +1 -10
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +292 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/infra/fetch/http.py +5 -0
  28. ingestify/infra/source/statsbomb_github.py +67 -54
  29. ingestify/infra/store/dataset/__init__.py +0 -2
  30. ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
  31. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
  32. ingestify/main.py +42 -22
  33. ingestify/utils.py +15 -78
  34. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
  35. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
  36. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
  37. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  38. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0
ingestify/infra/fetch/http.py CHANGED
@@ -69,7 +69,12 @@ def retrieve_http(
         else:
             raise Exception(f"Don't know how to use {key}")
 
+    ignore_not_found = http_kwargs.pop("ignore_not_found", False)
+
     response = get_session().get(url, headers=headers, **http_kwargs)
+    if response.status_code == 404 and ignore_not_found:
+        return None
+
     response.raise_for_status()
     if response.status_code == 304:
         # Not modified
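The hunk above lets retrieve_http swallow a 404 and return None instead of raising. A minimal sketch of how a caller might opt in and handle the new None return; how the flag reaches http_kwargs is not shown in this hunk (the StatsbombGithub change below forwards it as http_options={"ignore_not_found": True} on a file), so passing it as a plain keyword argument here is an assumption, and the match id in the URL is a placeholder:

from ingestify.infra.fetch.http import retrieve_http

URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/three-sixty/12345.json"

draft_file = retrieve_http(
    URL,
    None,  # no previously stored file to compare against
    file_data_feed_key="360-frames",
    file_data_spec_version="v1-open-data",
    file_data_serialization_format="json",
    ignore_not_found=True,  # assumption: ends up in http_kwargs inside retrieve_http
)
if draft_file is None:
    print("feed returned 404 and was ignored instead of raising")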
ingestify/infra/source/statsbomb_github.py CHANGED
@@ -1,22 +1,22 @@
-import json
 from datetime import datetime
 
 import requests
 
-from ingestify import Source, retrieve_http
-from ingestify.domain import DraftFile
+from ingestify import Source, DatasetResource
 from ingestify.domain.models.dataset.dataset import DatasetState
 
 BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+DATA_SPEC_VERSION = "v1-open-data"
 
 
 class StatsbombGithub(Source):
     provider = "statsbomb"
 
-    def discover_selectors(self, dataset_type: str, data_spec_versions: None = None):
+    def discover_selectors(self, dataset_type: str):
         assert dataset_type == "match"
 
         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+
         return [
             dict(
                 competition_id=competition["competition_id"],
@@ -25,68 +25,81 @@ class StatsbombGithub(Source):
             for competition in competitions
         ]
 
-    def discover_datasets(
+    def find_datasets(
         self,
-        dataset_type,
-        competition_id: str = None,
-        season_id: str = None,
+        dataset_type: str,
+        competition_id: str,
+        season_id: str,
+        match_id: str = None,
         data_spec_versions=None,
+        dataset_collection_metadata=None,
    ):
         assert dataset_type == "match"
 
-        datasets = []
-
         matches = requests.get(
             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
         ).json()
 
         for match in matches:
-            last_updated = match["last_updated"]
-            if "Z" not in last_updated:
-                # Assume UTC
-                last_updated += "Z"
-
-            last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
-
-            dataset = dict(
-                competition_id=competition_id,
-                season_id=season_id,
-                match_id=match["match_id"],
-                _last_modified=last_modified,
-                _match=match,
-                _metadata=match,
-                _state=DatasetState.COMPLETE,
-            )
-            datasets.append(dataset)
-        return datasets
+            if match_id:
+                if match["match_id"] != match_id:
+                    continue
 
-    def fetch_dataset_files(
-        self, dataset_type, identifier, current_revision, data_spec_versions
-    ):
-        assert dataset_type == "match"
+            last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
 
-        current_files = current_revision.modified_files_map if current_revision else {}
-        files = {}
-        for filename, url in [
-            ("lineups.json", f"{BASE_URL}/lineups/{identifier.match_id}.json"),
-            ("events.json", f"{BASE_URL}/events/{identifier.match_id}.json"),
-        ]:
-            data_feed_key = filename.split(".")[0]
-            file_id = data_feed_key + "__v1"
-            files[file_id] = retrieve_http(
-                url,
-                current_files.get(filename),
-                file_data_feed_key=data_feed_key,
-                file_data_spec_version="v1",
-                file_data_serialization_format="json",
+            # Open data is always complete.. I guess?
+            state = DatasetState.COMPLETE
+
+            name = (
+                f"{match['match_date']} / "
+                f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+            )
+
+            dataset_resource = DatasetResource(
+                dataset_resource_id=dict(
+                    competition_id=competition_id,
+                    season_id=season_id,
+                    match_id=match["match_id"],
+                ),
+                dataset_type=dataset_type,
+                provider=self.provider,
+                name=name,
+                metadata=match,
+                state=state,
             )
 
-        files["match__v1"] = DraftFile.from_input(
-            json.dumps(identifier._match, indent=4),
-            data_feed_key="match",
-            data_spec_version="v1",
-            data_serialization_format="json",
-            modified_at=None,
-        )
+            dataset_resource.add_file(
+                last_modified=last_modified,
+                data_feed_key="match",
+                data_spec_version=DATA_SPEC_VERSION,
+                json_content=match,
+            )
 
-        return files
+            if state.is_complete:
+                name += f" / {match['home_score']}-{match['away_score']}"
+
+            for data_feed_key in ["lineups", "events"]:
+                dataset_resource.add_file(
+                    last_modified=last_modified,
+                    data_feed_key=data_feed_key,
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
+
+            if (
+                match["last_updated_360"]
+                and match["match_status_360"] == "available"
+            ):
+                dataset_resource.add_file(
+                    last_modified=datetime.fromisoformat(
+                        match["last_updated_360"] + "+00:00"
+                    ),
+                    data_feed_key="360-frames",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                    data_serialization_format="json",
+                    http_options={"ignore_not_found": True},
+                )
+
+            yield dataset_resource
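StatsbombGithub now describes datasets instead of fetching them: find_datasets yields DatasetResource objects whose files are declared with add_file and downloaded later by the ingestion machinery. A minimal sketch of a custom source written against the same interface; ExampleSource, the league/game_id identifier keys and EXAMPLE_URL are invented for illustration and are not part of ingestify:

from datetime import datetime, timezone

from ingestify import Source, DatasetResource
from ingestify.domain.models.dataset.dataset import DatasetState

EXAMPLE_URL = "https://example.com/feeds"


class ExampleSource(Source):
    provider = "example"

    def discover_selectors(self, dataset_type: str):
        return [dict(league="demo")]

    def find_datasets(
        self,
        dataset_type: str,
        league: str,
        data_spec_versions=None,
        dataset_collection_metadata=None,
    ):
        last_modified = datetime(2024, 1, 1, tzinfo=timezone.utc)

        dataset_resource = DatasetResource(
            dataset_resource_id=dict(league=league, game_id="1"),
            dataset_type=dataset_type,
            provider=self.provider,
            name=f"{league} / game 1",
            metadata={"league": league},
            state=DatasetState.COMPLETE,
        )
        # Declare the file by URL; ingestify retrieves it when the dataset is ingested.
        dataset_resource.add_file(
            last_modified=last_modified,
            data_feed_key="events",
            data_spec_version="v1",
            url=f"{EXAMPLE_URL}/games/1/events.json",
            data_serialization_format="json",
        )
        yield dataset_resource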
ingestify/infra/store/dataset/__init__.py CHANGED
@@ -1,2 +0,0 @@
-from .local_dataset_repository import LocalDatasetRepository
-from .sqlalchemy import SqlAlchemyDatasetRepository
ingestify/infra/store/dataset/sqlalchemy/mapping.py CHANGED
@@ -1,5 +1,7 @@
 import datetime
+from dataclasses import is_dataclass, asdict
 from pathlib import Path
+from typing import Optional
 
 from sqlalchemy import (
     JSON,
@@ -13,11 +15,37 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Boolean,
 )
 from sqlalchemy.orm import registry, relationship
 
+from ingestify.domain import Selector, Identifier, DataSpecVersionCollection
 from ingestify.domain.models import Dataset, File, Revision
 from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.domain.models.dataset.revision import RevisionState
+
+
+def JSONType(serializer=None, deserializer=None):
+    class _JsonType(TypeDecorator):
+        cache_ok = True
+        impl = JSON
+
+        def process_bind_param(self, value, dialect):
+            if serializer is not None:
+                return serializer(value)
+            return value
+
+        def process_result_value(self, value, dialect):
+            if deserializer is not None:
+                return deserializer(value)
+            return value
+
+    return _JsonType
 
 
 class TZDateTime(TypeDecorator):
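The new JSONType factory builds a TypeDecorator around JSON with an optional serializer/deserializer pair; the tables further down use it for Identifier, DataSpecVersionCollection and Timing columns. A small self-contained sketch of the round-trip on an in-memory SQLite database; the toy Point dataclass and table are made up for illustration, only the JSONType import is real:

from dataclasses import asdict, dataclass

from sqlalchemy import Column, Integer, MetaData, Table, create_engine, insert, select

from ingestify.infra.store.dataset.sqlalchemy.mapping import JSONType


@dataclass
class Point:
    x: int
    y: int


toy_metadata = MetaData()
toy_table = Table(
    "toy",
    toy_metadata,
    Column("id", Integer, primary_key=True),
    # Stored as JSON; converted to and from Point transparently by the decorator.
    Column("point", JSONType(serializer=asdict, deserializer=lambda d: Point(**d))),
)

engine = create_engine("sqlite://")
toy_metadata.create_all(engine)
with engine.begin() as conn:
    conn.execute(insert(toy_table).values(id=1, point=Point(x=1, y=2)))
    print(conn.execute(select(toy_table.c.point)).scalar_one())  # Point(x=1, y=2)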
@@ -25,7 +53,10 @@ class TZDateTime(TypeDecorator):
     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
     cache_ok = True
 
-    def process_bind_param(self, value: datetime, dialect):
+    def process_bind_param(self, value: Optional[datetime.datetime], dialect):
+        if not value:
+            return None
+
         if value.tzinfo is None:
             value = value.astimezone(self.LOCAL_TIMEZONE)
 
@@ -67,6 +98,45 @@ class DatasetStateString(TypeDecorator):
         return DatasetState[value]
 
 
+class RevisionStateString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: RevisionState, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return RevisionState[value]
+
+
+class OperationString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: Operation, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return Operation[value]
+
+
+class TaskStatusString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: TaskStatus, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return TaskStatus[value]
+
+
 mapper_registry = registry()
 
 metadata = MetaData()
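RevisionStateString, OperationString and TaskStatusString all repeat the same store-by-value, load-by-name pattern. Purely as an illustration of that pattern (not part of the package), the same behaviour can be written once as a small factory:

from sqlalchemy import String, TypeDecorator


def EnumString(enum_cls, length=255):
    """Mirror of the three decorators above: persist value.value, load with enum_cls[stored_string]."""

    class _EnumString(TypeDecorator):
        impl = String(length)
        cache_ok = True

        def process_bind_param(self, value, dialect):
            return value.value if value is not None else None

        def process_result_value(self, value, dialect):
            if not value:
                return value
            return enum_cls[value]

    return _EnumString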
@@ -80,7 +150,7 @@ dataset_table = Table(
     Column("dataset_type", String(255)),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier", JSON),
+    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
@@ -95,7 +165,10 @@ revision_table = Table(
     Column("revision_id", Integer, primary_key=True),
     Column("description", String(255)),
     Column("created_at", TZDateTime(6)),
+    Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    Column("source", JSONType()),
 )
+
 file_table = Table(
     "file",
     metadata,
@@ -129,7 +202,7 @@ mapper_registry.map_imperatively(
             Revision,
             backref="dataset",
             order_by=revision_table.c.revision_id,
-            lazy="joined",
+            lazy="selectin",
             cascade="all, delete-orphan",
         ),
     },
@@ -143,7 +216,7 @@ mapper_registry.map_imperatively(
             File,
             order_by=file_table.c.file_id,
             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
-            lazy="joined",
+            lazy="selectin",
            cascade="all, delete-orphan",
        )
    },
@@ -151,3 +224,110 @@
 
 
 mapper_registry.map_imperatively(File, file_table)
+
+
+ingestion_job_summary = Table(
+    "ingestion_job_summary",
+    metadata,
+    Column("ingestion_job_id", String(255), primary_key=True),
+    # From the IngestionPlan
+    Column("source_name", String(255)),
+    Column("dataset_type", String(255)),
+    Column(
+        "data_spec_versions",
+        JSONType(
+            serializer=lambda data_spec_versions: data_spec_versions.to_dict(),
+            deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
+                data_spec_versions
+            ),
+        ),
+    ),
+    Column(
+        "selector", JSONType(serializer=lambda selector: selector.filtered_attributes)
+    ),
+    Column("started_at", TZDateTime(6)),
+    Column("finished_at", TZDateTime(6)),
+    # Some task counters
+    Column("successful_tasks", Integer),
+    Column("ignored_successful_tasks", Integer),
+    Column("failed_tasks", Integer),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column(
+    #     "task_summaries",
+    #     JSONType(
+    #         serializer=lambda task_summaries: [
+    #             task_summary.model_dump(mode="json") for task_summary in task_summaries
+    #         ],
+    #         deserializer=lambda task_summaries: [
+    #             TaskSummary.model_validate(task_summary)
+    #             for task_summary in task_summaries
+    #         ],
+    #     ),
+    # ),
+)
+
+
+task_summary_table = Table(
+    "task_summary",
+    metadata,
+    Column(
+        "ingestion_job_id",
+        String(255),
+        ForeignKey("ingestion_job_summary.ingestion_job_id"),
+        primary_key=True,
+    ),
+    Column("task_id", Integer, primary_key=True),
+    Column("started_at", TZDateTime(6)),
+    Column("ended_at", TZDateTime(6)),
+    Column("operation", OperationString),
+    Column(
+        "dataset_identifier", JSONType(deserializer=lambda item: Identifier(**item))
+    ),
+    Column("persisted_file_count", Integer),
+    Column("bytes_retrieved", Integer),
+    Column("last_modified", TZDateTime(6)),
+    Column("status", TaskStatusString),
+    Column(
+        "timings",
+        JSONType(
+            serializer=lambda timings: [
+                timing.model_dump(mode="json") for timing in timings
+            ],
+            deserializer=lambda timings: [
+                Timing.model_validate(timing) for timing in timings
+            ],
+        ),
+    ),
+    # Column("description", String(255)),
+    # Column("created_at", TZDateTime(6)),
+    # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
+    # Column("source", JSONType()),
+)
+
+
+mapper_registry.map_imperatively(
+    IngestionJobSummary,
+    ingestion_job_summary,
+    properties={
+        "task_summaries": relationship(
+            TaskSummary,
+            backref="ingestion_job_summary",
+            # order_by=task_summary_table.c.revision_id,
+            lazy="selectin",
+            cascade="all, delete-orphan",
+        ),
+    },
+)
+
+
+mapper_registry.map_imperatively(TaskSummary, task_summary_table)
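With these imperative mappings registered, the new ingestion_job_summary and task_summary tables are created alongside the existing dataset, revision and file tables when the module's shared metadata is emitted. A quick sketch against a throwaway SQLite engine (the URL is a placeholder):

from sqlalchemy import create_engine

from ingestify.infra.store.dataset.sqlalchemy.mapping import metadata

engine = create_engine("sqlite://")
metadata.create_all(engine)
print(sorted(metadata.tables))  # now includes "ingestion_job_summary" and "task_summary"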
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -29,22 +29,6 @@ def parse_value(v):
     return v
 
 
-def json_serializer(o):
-    return json.dumps(o)
-
-
-def json_deserializer(o):
-    o = json.loads(o)
-    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
-    o = Identifier(**o)
-    return o
-
-
-# @compiles(DateTime, "mysql")
-# def compile_datetime_mysql(type_, compiler, **kw):
-#     return "DATETIME(6)"
-
-
 def isfloat(x):
     try:
         a = float(x)
@@ -64,7 +48,7 @@ def isint(x):
     return a == b
 
 
-class SqlAlchemyDatasetRepository(DatasetRepository):
+class SqlAlchemySessionProvider:
     @staticmethod
     def fix_url(url: str) -> str:
         if url.startswith("postgres://"):
@@ -87,8 +71,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
-            json_serializer=json_serializer,
-            json_deserializer=json_deserializer,
         )
         self.session = Session(bind=self.engine)
 
@@ -107,9 +89,29 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         self.url = state["url"]
         self._init_engine()
 
+    def _close_engine(self):
+        if hasattr(self, "session"):
+            self.session.close()
+            self.engine.dispose()
+
     def __del__(self):
-        self.session.close()
-        self.engine.dispose()
+        self._close_engine()
+
+    def reset(self):
+        self._close_engine()
+        self._init_engine()
+
+    def get(self):
+        return self.session
+
+
+class SqlAlchemyDatasetRepository(DatasetRepository):
+    def __init__(self, session_provider: SqlAlchemySessionProvider):
+        self.session_provider = session_provider
+
+    @property
+    def session(self):
+        return self.session_provider.get()
 
     def _filter_query(
         self,
@@ -208,7 +210,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         if not metadata_only:
             dataset_query = apply_query_filter(
-                self.session.query(Dataset).options(joinedload(Dataset.revisions))
+                self.session.query(Dataset)  # .options(joinedload(Dataset.revisions))
             )
             datasets = list(dataset_query)
         else:
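SqlAlchemyDatasetRepository no longer owns its engine; a SqlAlchemySessionProvider does, and the repository reads the current session through it. A minimal wiring sketch (the SQLite URL is a placeholder; main.py below builds the provider from the configured metadata_url the same way):

from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import SqlAlchemySessionProvider

session_provider = SqlAlchemySessionProvider("sqlite:///ingestify.db")
dataset_repository = SqlAlchemyDatasetRepository(session_provider)

# reset() disposes the current engine/session and builds fresh ones; the
# repository keeps working because its session property always asks the provider.
session_provider.reset()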
ingestify/main.py CHANGED
@@ -11,19 +11,20 @@ from ingestify import Source
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.application.ingestion_engine import IngestionEngine
 from ingestify.application.secrets_manager import SecretsManager
-from ingestify.domain import Selector
-from ingestify.domain.models import (
-    dataset_repository_factory,
-    file_repository_factory,
-)
+from ingestify.domain import Selector, FileRepository
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.event import EventBus, Publisher, Subscriber
 
-from ingestify.domain.models.extract_job import ExtractJob
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
 from ingestify.exceptions import ConfigurationError
+from ingestify.infra import S3FileRepository, LocalFileRepository
+from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
+from ingestify.infra.store.dataset.sqlalchemy.repository import (
+    SqlAlchemySessionProvider,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -59,8 +60,19 @@ def import_cls(name):
     return getattr(mod, components[-1])
 
 
+def build_file_repository(file_url: str) -> FileRepository:
+    if file_url.startswith("s3://"):
+        repository = S3FileRepository(url=file_url)
+    elif file_url.startswith("file://"):
+        repository = LocalFileRepository(url=file_url)
+    else:
+        raise Exception(f"Cannot find repository to handle file {file_url}")
+
+    return repository
+
+
 def get_dataset_store_by_urls(
-    dataset_url: str, file_url: str, bucket: str
+    metadata_url: str, file_url: str, bucket: str
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
@@ -68,15 +80,18 @@ def get_dataset_store_by_urls(
     if not bucket:
         raise Exception("Bucket is not specified")
 
-    file_repository = file_repository_factory.build_if_supports(url=file_url)
+    file_repository = build_file_repository(file_url)
+
+    if secrets_manager.supports(metadata_url):
+        metadata_url = secrets_manager.load_as_db_url(metadata_url)
+
+    if metadata_url.startswith("postgres://"):
+        metadata_url = metadata_url.replace("postgress://", "postgress+")
 
-    if secrets_manager.supports(dataset_url):
-        dataset_url = secrets_manager.load_as_db_url(dataset_url)
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
 
-    if dataset_url.startswith("postgres://"):
-        dataset_url = dataset_url.replace("postgress://", "postgress+")
+    dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
-    dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
     return DatasetStore(
         dataset_repository=dataset_repository,
         file_repository=file_repository,
@@ -155,7 +170,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     logger.info("Initializing IngestionEngine")
     store = get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
     )
@@ -177,15 +192,20 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     fetch_policy = FetchPolicy()
 
-    for job in config["extract_jobs"]:
+    # Previous naming
+    ingestion_plans = config.get("extract_jobs", [])
+    # New naming
+    ingestion_plans.extend(config.get("ingestion_plans", []))
+
+    for ingestion_plan in ingestion_plans:
         data_spec_versions = DataSpecVersionCollection.from_dict(
-            job.get("data_spec_versions", {"default": {"v1"}})
+            ingestion_plan.get("data_spec_versions", {"default": {"v1"}})
         )
 
-        if "selectors" in job:
+        if "selectors" in ingestion_plan:
             selectors = [
                 Selector.build(selector, data_spec_versions=data_spec_versions)
-                for selector_args in job["selectors"]
+                for selector_args in ingestion_plan["selectors"]
                 for selector in _product_selectors(selector_args)
             ]
         else:
@@ -193,13 +213,13 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
             # but makes it easier later one where we loop over selectors.
             selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
 
-        import_job = ExtractJob(
-            source=sources[job["source"]],
-            dataset_type=job["dataset_type"],
+        ingestion_plan = IngestionPlan(
+            source=sources[ingestion_plan["source"]],
+            dataset_type=ingestion_plan["dataset_type"],
             selectors=selectors,
             fetch_policy=fetch_policy,
             data_spec_versions=data_spec_versions,
         )
-        ingestion_engine.add_extract_job(import_job)
+        ingestion_engine.add_ingestion_plan(ingestion_plan)
 
     return ingestion_engine
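For consumers, the visible changes are the renamed config keys: main.metadata_url replaces main.dataset_url, and jobs may now live under ingestion_plans (extract_jobs is still read for backwards compatibility). A hedged bootstrap sketch with the new naming; "config.yaml" and the bucket name are placeholders:

from ingestify.main import get_engine

# config.yaml is expected to define main.metadata_url, main.file_url,
# main.default_bucket and an ingestion_plans (or legacy extract_jobs) section.
engine = get_engine("config.yaml", bucket="main")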