ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +47 -36
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +71 -241
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +29 -28
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. ingestify/infra/fetch/http.py +5 -0
  29. ingestify/infra/source/statsbomb_github.py +67 -54
  30. ingestify/infra/store/dataset/__init__.py +0 -2
  31. ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
  32. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  33. ingestify/infra/store/file/local_file_repository.py +3 -5
  34. ingestify/infra/store/file/s3_file_repository.py +4 -9
  35. ingestify/main.py +64 -25
  36. ingestify/utils.py +15 -78
  37. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
  38. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
  39. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
  40. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  41. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/domain/models/ingestion/ingestion_job.py (new file)
@@ -0,0 +1,325 @@
+ import itertools
+ import json
+ import logging
+ import uuid
+ from typing import Optional, Iterator
+
+ from ingestify import retrieve_http
+ from ingestify.application.dataset_store import DatasetStore
+ from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+ from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
+ from ingestify.domain.models.ingestion.ingestion_job_summary import (
+     IngestionJobSummary,
+ )
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ingestify.domain.models.resources.dataset_resource import (
+     FileResource,
+     DatasetResource,
+ )
+ from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.utils import TaskExecutor, chunker
+
+ logger = logging.getLogger(__name__)
+
+
+ DEFAULT_CHUNK_SIZE = 1_000
+
+
+ def run_task(task):
+     logger.info(f"Running task {task}")
+     return task.run()
+
+
+ def to_batches(input_):
+     if isinstance(input_, list):
+         batches = iter(input_)
+     else:
+         # Assume it's an iterator. Peek what's inside, and put it back
+         try:
+             peek = next(input_)
+         except StopIteration:
+             # Nothing to batch
+             return iter([])
+
+         input_ = itertools.chain([peek], input_)
+
+         if not isinstance(peek, list):
+             batches = chunker(input_, DEFAULT_CHUNK_SIZE)
+         else:
+             batches = input_
+     return batches
+
+
+ def load_file(
+     file_resource: FileResource, dataset: Optional[Dataset] = None
+ ) -> Optional[DraftFile]:
+     current_file = None
+     if dataset:
+         current_file = dataset.current_revision.modified_files_map.get(
+             file_resource.file_id
+         )
+
+     if file_resource.json_content is not None:
+         # Empty dictionary is allowed
+         file = DraftFile.from_input(
+             file_=json.dumps(file_resource.json_content, indent=4),
+             data_serialization_format="json",
+             data_feed_key=file_resource.data_feed_key,
+             data_spec_version=file_resource.data_spec_version,
+             modified_at=file_resource.last_modified,
+         )
+         if current_file and current_file.tag == file.tag:
+             # Nothing changed
+             return None
+         return file
+     elif file_resource.url:
+         http_options = {}
+         if file_resource.http_options:
+             for k, v in file_resource.http_options.items():
+                 http_options[f"http_{k}"] = v
+
+         return retrieve_http(
+             url=file_resource.url,
+             current_file=current_file,
+             file_data_feed_key=file_resource.data_feed_key,
+             file_data_spec_version=file_resource.data_spec_version,
+             file_data_serialization_format=file_resource.data_serialization_format
+             or "txt",
+             last_modified=file_resource.last_modified,
+             **http_options,
+             **file_resource.loader_kwargs,
+         )
+     else:
+         return file_resource.file_loader(
+             file_resource,
+             current_file,
+             # TODO: check how to fix this with typehints
+             **file_resource.loader_kwargs,
+         )
+
+
+ class UpdateDatasetTask(Task):
+     def __init__(
+         self,
+         dataset: Dataset,
+         dataset_resource: DatasetResource,
+         store: DatasetStore,
+     ):
+         self.dataset = dataset
+         self.dataset_resource = dataset_resource
+         self.store = store
+         self.task_id = str(uuid.uuid1())
+
+     def run(self):
+         dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+
+         revision_source = RevisionSource(
+             source_id=self.task_id, source_type=SourceType.TASK
+         )
+
+         with TaskSummary.update(
+             self.task_id, dataset_identifier=dataset_identifier
+         ) as task_summary:
+             revision = self.store.update_dataset(
+                 dataset=self.dataset,
+                 name=self.dataset_resource.name,
+                 state=self.dataset_resource.state,
+                 metadata=self.dataset_resource.metadata,
+                 files={
+                     file_id: task_summary.record_load_file(
+                         lambda: load_file(file_resource, dataset=self.dataset),
+                         metadata={"file_id": file_id},
+                     )
+                     for file_id, file_resource in self.dataset_resource.files.items()
+                 },
+                 revision_source=revision_source,
+             )
+             task_summary.set_stats_from_revision(revision)
+
+         return task_summary
+
+     def __repr__(self):
+         return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+ class CreateDatasetTask(Task):
+     def __init__(
+         self,
+         dataset_resource: DatasetResource,
+         store: DatasetStore,
+     ):
+         self.dataset_resource = dataset_resource
+         self.store = store
+         self.task_id = str(uuid.uuid1())
+
+     def run(self):
+         dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+         revision_source = RevisionSource(
+             source_id=self.task_id, source_type=SourceType.TASK
+         )
+
+         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
+             revision = self.store.create_dataset(
+                 dataset_type=self.dataset_resource.dataset_type,
+                 provider=self.dataset_resource.provider,
+                 dataset_identifier=dataset_identifier,
+                 name=self.dataset_resource.name,
+                 state=self.dataset_resource.state,
+                 metadata=self.dataset_resource.metadata,
+                 files={
+                     file_id: task_summary.record_load_file(
+                         lambda: load_file(file_resource, dataset=None),
+                         metadata={"file_id": file_id},
+                     )
+                     for file_id, file_resource in self.dataset_resource.files.items()
+                 },
+                 revision_source=revision_source,
+             )
+
+             task_summary.set_stats_from_revision(revision)
+
+         return task_summary
+
+     def __repr__(self):
+         return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+ MAX_TASKS_PER_CHUNK = 10_000
+
+
+ class IngestionJob:
+     def __init__(
+         self,
+         ingestion_job_id: str,
+         ingestion_plan: IngestionPlan,
+         selector: Selector,
+     ):
+         self.ingestion_job_id = ingestion_job_id
+         self.ingestion_plan = ingestion_plan
+         self.selector = selector
+
+     def execute(
+         self, store: DatasetStore, task_executor: TaskExecutor
+     ) -> Iterator[IngestionJobSummary]:
+         is_first_chunk = True
+         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+         # Process all items in batches. Yield a IngestionJobSummary per batch
+
+         logger.info("Finding metadata")
+         with ingestion_job_summary.record_timing("get_dataset_collection"):
+             dataset_collection_metadata = store.get_dataset_collection(
+                 dataset_type=self.ingestion_plan.dataset_type,
+                 data_spec_versions=self.selector.data_spec_versions,
+                 selector=self.selector,
+                 metadata_only=True,
+             ).metadata
+         logger.info(f"Done: {dataset_collection_metadata}")
+
+         # There are two different, but similar flows here:
+         # 1. The discover_datasets returns a list, and the entire list can be processed at once
+         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+         with ingestion_job_summary.record_timing("find_datasets"):
+             # Timing might be incorrect as it is an iterator
+             dataset_resources = self.ingestion_plan.source.find_datasets(
+                 dataset_type=self.ingestion_plan.dataset_type,
+                 data_spec_versions=self.selector.data_spec_versions,
+                 dataset_collection_metadata=dataset_collection_metadata,
+                 **self.selector.custom_attributes,
+             )
+
+         finish_task_timer = ingestion_job_summary.start_timing("tasks")
+
+         batches = to_batches(dataset_resources)
+
+         while True:
+             try:
+                 batch = next(batches)
+             except StopIteration:
+                 break
+             except Exception:
+                 # TODO: handle exception on IngestionJob level
+                 raise
+
+             dataset_identifiers = [
+                 Identifier.create_from_selector(
+                     self.selector, **dataset_resource.dataset_resource_id
+                 )
+                 # We have to pass the data_spec_versions here as a Source can add some
+                 # extra data to the identifier which is retrieved in a certain data format
+                 for dataset_resource in batch
+             ]
+
+             # Load all available datasets based on the discovered dataset identifiers
+             dataset_collection = store.get_dataset_collection(
+                 dataset_type=self.ingestion_plan.dataset_type,
+                 # Assume all DatasetResources share the same provider
+                 provider=batch[0].provider,
+                 selector=dataset_identifiers,
+             )
+
+             skipped_datasets = 0
+
+             task_set = TaskSet()
+             for dataset_resource in batch:
+                 dataset_identifier = Identifier.create_from_selector(
+                     self.selector, **dataset_resource.dataset_resource_id
+                 )
+
+                 if dataset := dataset_collection.get(dataset_identifier):
+                     if self.ingestion_plan.fetch_policy.should_refetch(
+                         dataset, dataset_resource
+                     ):
+                         task_set.add(
+                             UpdateDatasetTask(
+                                 dataset=dataset,  # Current dataset from the database
+                                 dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                 store=store,
+                             )
+                         )
+                     else:
+                         skipped_datasets += 1
+                 else:
+                     if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
+                         task_set.add(
+                             CreateDatasetTask(
+                                 dataset_resource=dataset_resource,
+                                 store=store,
+                             )
+                         )
+                     else:
+                         skipped_datasets += 1
+
+             if task_set:
+                 logger.info(
+                     f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                     f"using selector {self.selector} => {len(task_set)} tasks. {skipped_datasets} skipped."
+                 )
+                 logger.info(f"Running {len(task_set)} tasks")
+                 ingestion_job_summary.add_task_summaries(
+                     task_executor.run(run_task, task_set)
+                 )
+             else:
+                 logger.info(
+                     f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                     f"using selector {self.selector} => nothing to do"
+                 )
+
+             ingestion_job_summary.increase_skipped_datasets(skipped_datasets)
+
+             if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
+                 finish_task_timer()
+                 ingestion_job_summary.set_finished()
+                 yield ingestion_job_summary
+
+                 # Start a new one
+                 is_first_chunk = False
+                 ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+
+                 # We will resume tasks, start timer right away
+                 finish_task_timer = ingestion_job_summary.start_timing("tasks")
+
+         if ingestion_job_summary.task_count() > 0 or is_first_chunk:
+             # When there is interesting information to store, or there was no data at all, store it
+             finish_task_timer()
+             ingestion_job_summary.set_finished()
+             yield ingestion_job_summary
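The new ingestion flow batches whatever a source's discovery step returns: a plain list, an iterator of single dataset resources, or an iterator that already yields batches. Below is a small, self-contained sketch of that peek-and-chain batching idea. The chunker here is a simplified stand-in for ingestify.utils.chunker, and treating a plain list as one batch is an assumption of this sketch rather than a claim about the exact upstream behaviour.

import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def chunker(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    # Stand-in for ingestify.utils.chunker: yield fixed-size lists from any iterable.
    it = iter(iterable)
    while chunk := list(itertools.islice(it, size)):
        yield chunk


def to_batches_sketch(input_, chunk_size: int = 1_000) -> Iterator[list]:
    if isinstance(input_, list):
        # Assumption of this sketch: treat a ready-made list as a single batch.
        return iter([input_])
    try:
        peek = next(input_)  # peek at the first item to see what the iterator yields
    except StopIteration:
        return iter([])  # nothing to batch
    input_ = itertools.chain([peek], input_)  # put the peeked item back
    if isinstance(peek, list):
        return input_  # already an iterator of batches
    return chunker(input_, chunk_size)  # wrap single items into fixed-size batches


# An iterator of single items is re-chunked into batches of 3.
print(list(to_batches_sketch(iter(range(7)), chunk_size=3)))  # [[0, 1, 2], [3, 4, 5], [6]]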
ingestify/domain/models/ingestion/ingestion_job_summary.py (new file)
@@ -0,0 +1,123 @@
+ import uuid
+ from contextlib import contextmanager
+ from datetime import datetime, timedelta
+ from typing import Optional, List, TYPE_CHECKING
+ from pydantic import Field
+
+ from ingestify.domain import Selector, DataSpecVersionCollection
+ from ingestify.domain.models.base import BaseModel
+ from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+ from ingestify.domain.models.timing import Timing
+ from ingestify.utils import utcnow
+
+ if TYPE_CHECKING:
+     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
+
+
+ def format_duration(duration: timedelta):
+     return f"{duration.total_seconds():.2f}sec"
+
+
+ class IngestionJobSummary(BaseModel):
+     ingestion_job_summary_id: str
+     ingestion_job_id: str
+
+     # From the IngestionPlan
+     provider: str
+     source_name: str
+     dataset_type: str
+     data_spec_versions: DataSpecVersionCollection
+     selector: Selector
+
+     started_at: datetime = Field(default_factory=utcnow)
+     finished_at: Optional[datetime] = None
+     timings: List[Timing] = Field(default_factory=list)
+     task_summaries: List[TaskSummary] = Field(default_factory=list)
+
+     skipped_datasets: int = 0
+     failed_tasks: int = 0
+     successful_tasks: int = 0
+     ignored_successful_tasks: int = 0
+
+     @classmethod
+     def new(cls, ingestion_job: "IngestionJob"):
+         args = dict(
+             ingestion_job_summary_id=str(uuid.uuid1()),
+             ingestion_job_id=ingestion_job.ingestion_job_id,
+             provider=ingestion_job.ingestion_plan.source.provider,
+             source_name=ingestion_job.ingestion_plan.source.name,
+             dataset_type=ingestion_job.ingestion_plan.dataset_type,
+             data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
+             selector=ingestion_job.selector,
+         )
+         return cls(**args)
+
+     @contextmanager
+     def record_timing(self, name: str):
+         start = utcnow()
+         yield
+         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+     def start_timing(self, name):
+         start = utcnow()
+
+         def finish():
+             self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+         return finish
+
+     def add_task_summaries(self, task_summaries: List[TaskSummary]):
+         self.task_summaries.extend(task_summaries)
+
+     def increase_skipped_datasets(self, skipped_datasets: int):
+         self.skipped_datasets += skipped_datasets
+
+     def task_count(self):
+         return len(self.task_summaries)
+
+     def set_finished(self):
+         self.failed_tasks = len(
+             [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+         )
+         self.successful_tasks = len(
+             [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+         )
+         self.ignored_successful_tasks = len(
+             [
+                 task
+                 for task in self.task_summaries
+                 if task.status == TaskStatus.FINISHED_IGNORED
+             ]
+         )
+         self.finished_at = utcnow()
+
+     @property
+     def duration(self) -> timedelta:
+         return self.finished_at - self.started_at
+
+     def output_report(self):
+         print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+         print("--------------------")
+         print(f" - IngestionPlan:")
+         print(f" Source: {self.source_name}")
+         print(f" Provider: {self.provider}")
+         print(f" DatasetType: {self.dataset_type}")
+         print(f" - Selector: {self.selector}")
+         print(f" - Timings: ")
+         for timing in self.timings:
+             print(f" - {timing.name}: {format_duration(timing.duration)}")
+         print(
+             f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+         )
+
+         print(f" - Failed tasks: {self.failed_tasks}")
+         print(f" - Successful tasks: {self.successful_tasks}")
+         print(f" - Successful ignored tasks: {self.successful_tasks}")
+         print(f" - Skipped datasets: {self.skipped_datasets}")
+         print("--------------------")
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         pass
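IngestionJobSummary tracks elapsed time in two ways: record_timing() wraps a block as a context manager, while start_timing() hands back a finish() callback that can be invoked much later, for example after several batches of tasks. The following is a minimal, self-contained sketch of that pattern; Timing and TimingRecorder here are illustrative stand-ins, not ingestify's actual models.

import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Callable, List


def utcnow() -> datetime:
    return datetime.now(timezone.utc)


@dataclass
class Timing:
    name: str
    started_at: datetime
    ended_at: datetime

    @property
    def duration(self):
        return self.ended_at - self.started_at


@dataclass
class TimingRecorder:
    timings: List[Timing] = field(default_factory=list)

    @contextmanager
    def record_timing(self, name: str):
        # Context-manager style: time the wrapped block.
        start = utcnow()
        yield
        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

    def start_timing(self, name: str) -> Callable[[], None]:
        # Closure style: start now, finish whenever the caller decides.
        start = utcnow()

        def finish():
            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

        return finish


recorder = TimingRecorder()
with recorder.record_timing("find_datasets"):
    time.sleep(0.01)

finish = recorder.start_timing("tasks")  # open-ended, e.g. spanning several batches
time.sleep(0.01)
finish()

print([(t.name, t.duration) for t in recorder.timings])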
ingestify/domain/models/extract_job.py → ingestify/domain/models/ingestion/ingestion_plan.py (renamed)
@@ -1,15 +1,15 @@
- from dataclasses import dataclass
  from typing import List

  from ingestify.domain.models import Source, Selector
+ from ingestify.domain.models.base import BaseModel
  from ingestify.domain.models.data_spec_version_collection import (
      DataSpecVersionCollection,
  )
  from ingestify.domain.models.fetch_policy import FetchPolicy


- @dataclass
- class ExtractJob:
+ class IngestionPlan(BaseModel):
+
      source: Source
      selectors: List[Selector]
      fetch_policy: FetchPolicy
@@ -17,7 +17,7 @@ class ExtractJob:
      data_spec_versions: DataSpecVersionCollection

      def __repr__(self):
-         return f'<ExtractJob source="{self.source.name}" dataset_type="{self.dataset_type}">'
+         return f'<IngestionPlan source="{self.source.name}" dataset_type="{self.dataset_type}">'

      def __str__(self):
          return repr(self)
ingestify/domain/models/resources/dataset_resource.py
@@ -1,35 +1,40 @@
- from dataclasses import dataclass, field
+ from dataclasses import dataclass
  from datetime import datetime
- from typing import Optional, Callable, TYPE_CHECKING
+ from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING  # noqa
+ from pydantic import Field

+ from ingestify.domain.models.base import BaseModel
+ from ingestify.domain.models.dataset.dataset_state import DatasetState
  from ingestify.exceptions import DuplicateFile

- if TYPE_CHECKING:
-     from ingestify.domain import DraftFile, File
-     from ingestify.domain.models.dataset.dataset import DatasetState
+ from ingestify.domain.models import File, DraftFile


- @dataclass(frozen=True)
- class FileResource:
+ class FileLoaderProtocol(Protocol):
+     def __call__(
+         self,
+         file_resource: "FileResource",
+         file: Optional["File"] = None,
+         **kwargs: Any,
+     ) -> Optional["DraftFile"]:
+         ...
+
+
+ class FileResource(BaseModel):
      dataset_resource: "DatasetResource"
      file_id: str
      last_modified: datetime
      data_feed_key: str
      data_spec_version: str
-
-     # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
-     # data_serialization_format: str
-
      json_content: Optional[dict] = None
-
      url: Optional[str] = None
      http_options: Optional[dict] = None
+     # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
      data_serialization_format: Optional[str] = None
-
      file_loader: Optional[
          Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
      ] = None
-     loader_kwargs: dict = field(default_factory=dict)
+     loader_kwargs: dict = Field(default_factory=dict)

      def __post_init__(self):
          if self.json_content is None and not self.url and not self.file_loader:
@@ -38,27 +43,14 @@ class FileResource:
              )


- class DatasetResource:
-     def __init__(
-         self,
-         dataset_resource_id: dict,
-         /,
-         dataset_type: str,
-         provider: str,
-         name: str,
-         metadata: Optional[dict] = None,
-         state: Optional["DatasetState"] = None,
-     ):
-         from ingestify.domain.models.dataset.dataset import DatasetState
-
-         self.dataset_type = dataset_type
-         self.provider = provider
-         self.dataset_resource_id = dataset_resource_id
-         self.name = name
-         self.metadata = metadata or {}
-         self.state = state or DatasetState.COMPLETE
-
-         self.files = {}
+ class DatasetResource(BaseModel):
+     dataset_resource_id: dict
+     dataset_type: str
+     provider: str
+     name: str
+     metadata: dict = Field(default_factory=dict)
+     state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
+     files: dict[str, FileResource] = Field(default_factory=dict)

      def add_file(
          self,
@@ -72,8 +64,8 @@ class DatasetResource:
          data_serialization_format: Optional[str] = None,
          file_loader: Optional[
              Callable[
-                 ["FileResource", Optional["File"]],
-                 Optional["DraftFile"],
+                 [FileResource, Optional[File]],
+                 Optional[DraftFile],
              ]
          ] = None,
          loader_kwargs: Optional[dict] = None,
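The FileResource and DatasetResource changes follow the same migration as IngestionPlan: plain dataclasses become pydantic models, with dataclasses.field(default_factory=...) replaced by pydantic.Field(default_factory=...) for mutable defaults. A small illustrative model showing the pattern; ExampleResource is hypothetical (loosely shaped after DatasetResource) and assumes pydantic is installed, which this diff already imports elsewhere.

from typing import Optional

from pydantic import BaseModel, Field


class ExampleResource(BaseModel):
    # Hypothetical model for illustration only, not ingestify's actual DatasetResource.
    resource_id: dict
    provider: str
    name: str
    metadata: dict = Field(default_factory=dict)  # each instance gets its own dict
    files: dict = Field(default_factory=dict)
    url: Optional[str] = None


a = ExampleResource(resource_id={"match_id": 123}, provider="example", name="demo")
b = ExampleResource(resource_id={"match_id": 456}, provider="example", name="demo2")
a.metadata["season"] = "2023/2024"
print(a.metadata)  # {'season': '2023/2024'}
print(b.metadata)  # {} -- default_factory avoids a shared mutable default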
ingestify/domain/models/sink.py
@@ -1,16 +1,9 @@
  from abc import ABC, abstractmethod

- from ingestify.utils import ComponentFactory, ComponentRegistry
-
  from .dataset import Dataset

- sink_registry = ComponentRegistry()
-

- class Sink(ABC, metaclass=sink_registry.metaclass):
+ class Sink(ABC):
      @abstractmethod
      def upsert(self, dataset: Dataset, data, params: dict):
          pass
-
-
- sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
ingestify/domain/models/task/task.py
@@ -1,7 +1,9 @@
  from abc import ABC, abstractmethod

+ from .task_summary import TaskSummary
+

  class Task(ABC):
      @abstractmethod
-     def run(self):
+     def run(self) -> TaskSummary:
          pass