ingestify 0.3.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {ingestify-0.3.4 → ingestify-0.4.0}/PKG-INFO +1 -1
  2. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/__init__.py +1 -1
  3. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/dataset_store.py +1 -0
  4. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/ingestion_engine.py +7 -2
  5. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/loader.py +14 -1
  6. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/cmdline.py +20 -2
  7. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection_metadata.py +2 -1
  8. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset.py +10 -0
  9. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/revision.py +4 -0
  10. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job.py +8 -10
  11. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +24 -34
  12. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/task/task_summary.py +3 -24
  13. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/serialization/__init__.py +2 -13
  14. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +2 -3
  15. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +14 -7
  16. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/utils.py +48 -16
  17. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/PKG-INFO +1 -1
  18. {ingestify-0.3.4 → ingestify-0.4.0}/README.md +0 -0
  19. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/__init__.py +0 -0
  20. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/secrets_manager.py +0 -0
  21. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/__init__.py +0 -0
  22. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/__init__.py +0 -0
  23. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/base.py +0 -0
  24. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  25. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  26. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection.py +0 -0
  27. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  28. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  29. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/events.py +0 -0
  30. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/file.py +0 -0
  31. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  32. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  33. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  34. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/selector.py +0 -0
  35. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/__init__.py +0 -0
  36. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/_old_event.py +0 -0
  37. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  38. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/domain_event.py +0 -0
  39. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/event_bus.py +0 -0
  40. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/publisher.py +0 -0
  41. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/event/subscriber.py +0 -0
  42. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/fetch_policy.py +0 -0
  43. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  44. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  45. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/resources/__init__.py +0 -0
  46. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  47. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/sink.py +0 -0
  48. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/source.py +0 -0
  49. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/task/__init__.py +0 -0
  50. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/task/set.py +0 -0
  51. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/task/task.py +0 -0
  52. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/timing.py +0 -0
  53. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/services/__init__.py +0 -0
  54. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  55. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  56. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  57. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/exceptions.py +0 -0
  58. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/__init__.py +0 -0
  59. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/fetch/__init__.py +0 -0
  60. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/fetch/http.py +0 -0
  61. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/sink/__init__.py +0 -0
  62. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/sink/postgresql.py +0 -0
  63. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/source/__init__.py +0 -0
  64. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  65. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/source/wyscout.py +0 -0
  66. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/__init__.py +0 -0
  67. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  68. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  69. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/file/__init__.py +0 -0
  70. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  71. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  72. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  73. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/main.py +0 -0
  74. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/server.py +0 -0
  75. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/source_base.py +0 -0
  76. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  77. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  78. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  79. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  80. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.env +0 -0
  81. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
  82. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/README.md +0 -0
  83. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  84. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
  85. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify/static/templates/wyscout/query.py +0 -0
  86. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/SOURCES.txt +0 -0
  87. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/dependency_links.txt +0 -0
  88. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/entry_points.txt +0 -0
  89. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/requires.txt +0 -0
  90. {ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/top_level.txt +0 -0
  91. {ingestify-0.3.4 → ingestify-0.4.0}/setup.cfg +0 -0
  92. {ingestify-0.3.4 → ingestify-0.4.0}/setup.py +0 -0
{ingestify-0.3.4 → ingestify-0.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.4
+Version: 0.4.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/__init__.py
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource

-__version__ = "0.3.4"
+__version__ = "0.4.0"

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/dataset_store.py
@@ -270,6 +270,7 @@ class DatasetStore:
             metadata=metadata,
             created_at=now,
             updated_at=now,
+            last_modified_at=None,  # Not known at this moment
         )
         revision = self.add_revision(dataset, files, revision_source, description)


{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/ingestion_engine.py
@@ -21,8 +21,13 @@ class IngestionEngine:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.loader.add_ingestion_plan(ingestion_plan)

-    def load(self, dry_run: bool = False, provider: Optional[str] = None):
-        self.loader.collect_and_run(dry_run=dry_run, provider=provider)
+    def load(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
+        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)

     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/application/loader.py
@@ -29,7 +29,12 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)

-    def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@ class Loader:
                    )
                    continue

+            if source is not None:
+                if ingestion_plan.source.name != source:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+                    )
+                    continue
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@ class Loader:

             # TODO: consider making this lazy and fetch once per Source instead of
             #       once per IngestionPlan
+            # TODO: Log exception when `discover_selectors` fails
             all_selectors = ingestion_plan.source.discover_selectors(
                 ingestion_plan.dataset_type
             )

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/cmdline.py
@@ -58,7 +58,14 @@ def cli():
     help="bucket",
     type=str,
 )
-@click.option("--debug", "debug", required=False, help="Debugging enabled", type=bool)
+@click.option(
+    "--debug",
+    "debug",
+    required=False,
+    help="Debugging enabled",
+    is_flag=True,
+    type=bool,
+)
 @click.option(
     "--dry-run",
     "dry_run",
@@ -74,11 +81,19 @@ def cli():
     help="Provider - only run tasks for a single provider",
     type=str,
 )
+@click.option(
+    "--source",
+    "source",
+    required=False,
+    help="Source - only run tasks for a single source",
+    type=str,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
     dry_run: Optional[bool],
     provider: Optional[str],
+    source: Optional[str],
     debug: Optional[bool],
 ):
     try:
@@ -90,7 +105,10 @@ def run(
         logger.exception(f"Failed due a configuration error: {e}")
         sys.exit(1)

-    engine.load(dry_run=dry_run, provider=provider)
+    if debug:
+        logging.getLogger("root").setLevel(logging.DEBUG)
+
+    engine.load(dry_run=dry_run, provider=provider, source=source)

     logger.info("Done")

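
Together with the new is_flag=True on --debug, a run can now be narrowed to a single source from the command line. A minimal sketch using click's test runner; the "run" subcommand name follows the run() function above, while the --config option name is an assumption not shown in this diff:

    from click.testing import CliRunner

    from ingestify.cmdline import cli

    runner = CliRunner()
    # Option spelling for the config file is assumed; --source and --debug come from this diff
    result = runner.invoke(
        cli,
        ["run", "--config", "config.yaml", "--source", "wyscout", "--debug"],
    )
    print(result.output)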

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection_metadata.py
@@ -6,7 +6,8 @@ from typing import Optional
 @dataclass
 class DatasetCollectionMetadata:
     # This can be useful to figure out if a backfill is required
-    first_modified: Optional[datetime]
+    # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+    # first_modified: Optional[datetime]

     # Use the last modified to only retrieve datasets that are changed
     last_modified: Optional[datetime]

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset.py
@@ -22,7 +22,10 @@ class Dataset(BaseModel):
     metadata: dict
     created_at: datetime
     updated_at: datetime
+
     revisions: List[Revision] = Field(default_factory=list)
+    # The last_modified_at is equal to the max modified_at of all files in all revisions
+    last_modified_at: Optional[datetime]

     @field_validator("identifier", mode="before")
     @classmethod
@@ -42,6 +45,13 @@ class Dataset(BaseModel):
         self.revisions.append(revision)
         self.updated_at = utcnow()

+        if self.last_modified_at:
+            self.last_modified_at = max(
+                self.last_modified_at, revision.last_modified_at
+            )
+        else:
+            self.last_modified_at = revision.last_modified_at
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/dataset/revision.py
@@ -36,6 +36,10 @@ class Revision(BaseModel):
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION

+    @property
+    def last_modified_at(self):
+        return max(file.modified_at for file in self.modified_files)
+
     @property
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
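
Taken together, the two changes above denormalize the "last modified" information onto the dataset: Revision.last_modified_at is the newest modified_at among its files, and Dataset.add_revision keeps the running maximum across revisions. A self-contained sketch of that arithmetic (the timestamps are made up):

    from datetime import datetime, timezone

    # Hypothetical file modification times within one revision
    file_modified_times = [
        datetime(2024, 5, 1, tzinfo=timezone.utc),
        datetime(2024, 5, 3, tzinfo=timezone.utc),
    ]
    revision_last_modified = max(file_modified_times)  # Revision.last_modified_at

    # Dataset.add_revision keeps the maximum across all revisions seen so far
    dataset_last_modified = None
    for revision_lm in [revision_last_modified]:
        if dataset_last_modified:
            dataset_last_modified = max(dataset_last_modified, revision_lm)
        else:
            dataset_last_modified = revision_lm

    print(dataset_last_modified)  # 2024-05-03 00:00:00+00:00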

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job.py
@@ -214,9 +214,6 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
-        ingestion_job_exception = (
-            None  # Indicate if there was an exception during the IngestionJob itself
-        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch

@@ -224,6 +221,7 @@ class IngestionJob:
         with ingestion_job_summary.record_timing("get_dataset_collection"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
+                provider=self.ingestion_plan.source.provider,
                 data_spec_versions=self.selector.data_spec_versions,
                 selector=self.selector,
                 metadata_only=True,
@@ -233,8 +231,8 @@ class IngestionJob:
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-        with ingestion_job_summary.record_timing("find_datasets"):
-            try:
+        try:
+            with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
                     data_spec_versions=self.selector.data_spec_versions,
@@ -244,12 +242,12 @@ class IngestionJob:

                 # We need to include the to_batches as that will start the generator
                 batches = to_batches(dataset_resources)
-            except Exception as e:
-                logger.exception("Failed to find datasets")
+        except Exception as e:
+            logger.exception("Failed to find datasets")

-                ingestion_job_summary.set_exception(e)
-                yield ingestion_job_summary
-                return
+            ingestion_job_summary.set_exception(e)
+            yield ingestion_job_summary
+            return

         finish_task_timer = ingestion_job_summary.start_timing("tasks")

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py
@@ -9,7 +9,7 @@ from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow, HasTiming

 if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
@@ -25,7 +25,7 @@ def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"


-class IngestionJobSummary(BaseModel):
+class IngestionJobSummary(BaseModel, HasTiming):
     ingestion_job_summary_id: str
     ingestion_job_id: str
@@ -39,7 +39,6 @@ class IngestionJobSummary(BaseModel):
     started_at: datetime = Field(default_factory=utcnow)
     ended_at: Optional[datetime] = None
     state: IngestionJobState = IngestionJobState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)

     skipped_datasets: int = 0
@@ -60,22 +59,6 @@ class IngestionJobSummary(BaseModel):
         )
         return cls(**args)

-    @contextmanager
-    def record_timing(self, name: str):
-        start = utcnow()
-        try:
-            yield
-        finally:
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-    def start_timing(self, name):
-        start = utcnow()
-
-        def finish():
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-        return finish
-
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)

@@ -101,6 +84,11 @@ class IngestionJobSummary(BaseModel):
         )
         self.ended_at = utcnow()

+        # Only keep failed tasks. Rest isn't interesting
+        self.task_summaries = [
+            task for task in self.task_summaries if task.state == TaskState.FAILED
+        ]
+
     def set_finished(self):
         self.state = IngestionJobState.FINISHED
         self._set_ended()
@@ -114,25 +102,27 @@ class IngestionJobSummary(BaseModel):
         return self.ended_at - self.started_at

     def output_report(self):
-        print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
-        print("--------------------")
-        print(f" - IngestionPlan:")
-        print(f" Source: {self.source_name}")
-        print(f" Provider: {self.provider}")
-        print(f" DatasetType: {self.dataset_type}")
-        print(f" - Selector: {self.selector}")
-        print(f" - Timings: ")
+        print(
+            f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+        )
+        print("********************************")
+        print(f"* - IngestionPlan:")
+        print(f"* Source: {self.source_name}")
+        print(f"* Provider: {self.provider}")
+        print(f"* DatasetType: {self.dataset_type}")
+        print(f"* - Selector: {self.selector}")
+        print(f"* - Timings: ")
         for timing in self.timings:
-            print(f" - {timing.name}: {format_duration(timing.duration)}")
+            print(f"* - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+            f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )

-        print(f" - Failed tasks: {self.failed_tasks}")
-        print(f" - Successful tasks: {self.successful_tasks}")
-        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.skipped_datasets}")
-        print("--------------------")
+        print(f"* - Failed tasks: {self.failed_tasks}")
+        print(f"* - Successful tasks: {self.successful_tasks}")
+        print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"* - Skipped datasets: {self.skipped_datasets}")
+        print("********************************")

     def __enter__(self):
         return self

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/domain/models/task/task_summary.py
@@ -10,8 +10,7 @@ from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.identifier import Identifier
 from ingestify.domain.models.timing import Timing
 from ingestify.exceptions import IngestifyError
-from ingestify.utils import utcnow
-
+from ingestify.utils import utcnow, HasTiming

 logger = logging.getLogger(__name__)

@@ -28,7 +27,7 @@ class Operation(str, Enum):
     UPDATE = "UPDATE"


-class TaskSummary(BaseModel):
+class TaskSummary(BaseModel, HasTiming):
     task_id: str
     started_at: datetime
     operation: Operation
@@ -38,7 +37,6 @@ class TaskSummary(BaseModel):
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
     state: TaskState = TaskState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)

     @field_validator("dataset_identifier", mode="before")
     @classmethod
@@ -48,27 +46,8 @@ class TaskSummary(BaseModel):
             return value

     def record_load_file(self, fn, metadata: dict):
-        start = utcnow()
-        try:
-            result = None
+        with self.record_timing(f"Load of {metadata.get('file_id', 'file')}", metadata):
             return fn()
-        except Exception as e:
-            result = {
-                "type": type(e).__name__,
-                "message": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            raise e
-        finally:
-            metadata = dict(result=result, **metadata)
-            self.timings.append(
-                Timing(
-                    name=f"Load of {metadata.get('file_id', 'file')}",
-                    started_at=start,
-                    ended_at=utcnow(),
-                    metadata=metadata,
-                )
-            )

     @classmethod
     @contextmanager

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/serialization/__init__.py
@@ -1,12 +1,5 @@
-import json
-from datetime import datetime
-from typing import Type, Any, TypeVar
-
-from dataclass_factory import Schema, Factory, NameStyle
-from dataclass_factory.schema_helpers import type_checker
-
-from ingestify.domain import DatasetCreated, Identifier
-from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain import DatasetCreated
+from ingestify.domain.models.dataset.events import RevisionAdded
 from ingestify.domain.models.event import DomainEvent


@@ -18,10 +11,6 @@ event_types = {

 def deserialize(event_dict: dict) -> DomainEvent:
     event_cls = event_types[event_dict["event_type"]]
-    event_dict["dataset"]["identifier"] = Identifier(
-        **event_dict["dataset"]["identifier"]
-    )
-
     return event_cls.model_validate(event_dict)


{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py
@@ -320,10 +320,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):

         metadata_result_row = apply_query_filter(
             self.session.query(
-                func.min(file_table.c.modified_at).label("first_modified_at"),
-                func.max(file_table.c.modified_at).label("last_modified_at"),
+                func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
                 func.count().label("row_count"),
-            ).join(dataset_table, dataset_table.c.dataset_id == file_table.c.dataset_id)
+            )
         ).first()
         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)


{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py
@@ -16,6 +16,8 @@ from sqlalchemy import (
     TypeDecorator,
 )

+from sqlalchemy.dialects.postgresql import JSONB
+
 from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
@@ -25,18 +27,18 @@ from ingestify.domain.models.timing import Timing
 from ingestify.domain.models.dataset.revision import RevisionState


-def JSONType(serializer=None, deserializer=None):
+def JSONType(serializer=None, deserializer=None, base_type=JSON):
     class _JsonType(TypeDecorator):
         cache_ok = True
-        impl = JSON
+        impl = base_type

         def process_bind_param(self, value, dialect):
-            if serializer is not None:
+            if serializer and value is not None:
                 return serializer(value)
             return value

         def process_result_value(self, value, dialect):
-            if deserializer is not None:
+            if deserializer and value is not None:
                 return deserializer(value)
             return value

@@ -152,14 +154,19 @@ dataset_table = Table(
     metadata,
     Column("bucket", String(255), default=None),
     Column("dataset_id", String(255), primary_key=True),
-    Column("provider", String(255)),
-    Column("dataset_type", String(255)),
+    Column("provider", String(255), index=True),
+    Column("dataset_type", String(255), index=True),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
+    Column(
+        "identifier",
+        # Use JSONB when available
+        JSON().with_variant(JSONB(), "postgresql"),
+    ),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
+    Column("last_modified_at", TZDateTime(6)),
 )

 revision_table = Table(
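
The identifier column now relies on SQLAlchemy's dialect variants: it is declared as plain JSON but stored as JSONB when the engine is PostgreSQL. A self-contained sketch of that pattern (table and column names here are illustrative, not the ones above):

    from sqlalchemy import JSON, Column, MetaData, String, Table
    from sqlalchemy.dialects.postgresql import JSONB

    metadata = MetaData()

    # Plain JSON on SQLite/MySQL, JSONB when the dialect is PostgreSQL
    example_table = Table(
        "example",
        metadata,
        Column("id", String(36), primary_key=True),
        Column("payload", JSON().with_variant(JSONB(), "postgresql")),
    )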

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify/utils.py
@@ -1,34 +1,23 @@
-import abc
-import asyncio
-import inspect
 import logging
 import os
 import time
 import re
+import traceback
+from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods

 from datetime import datetime, timezone
 from string import Template
-from typing import (
-    Dict,
-    Generic,
-    Type,
-    TypeVar,
-    Tuple,
-    Optional,
-    Any,
-    Callable,
-    Awaitable,
-    List,
-    Iterable,
-)
+from typing import Dict, Tuple, Optional, Any, List

 import cloudpickle
+from pydantic import Field
 from typing_extensions import Self


 from itertools import islice

+from ingestify.domain.models.timing import Timing

 logger = logging.getLogger(__name__)

@@ -221,3 +210,46 @@ def try_number(s: str):
         return float(s)
     except ValueError:
         return s
+
+
+class HasTiming:
+    """Mixin to give Pydantic models ability to time actions."""
+
+    timings: List[Timing] = Field(default_factory=list)
+
+    @contextmanager
+    def record_timing(
+        self, description: str, metadata: Optional[dict] = None
+    ) -> Timing:
+        if not metadata:
+            metadata = {}
+
+        start = utcnow()
+        try:
+            result = None
+            yield
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=description,
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
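
HasTiming is the new home for the timing helpers removed from IngestionJobSummary and TaskSummary above; any Pydantic model that mixes it in gains a timings field plus the record_timing/start_timing helpers. A minimal usage sketch (the model and its field are made up for illustration):

    import time

    from pydantic import BaseModel

    from ingestify.utils import HasTiming


    class ExampleSummary(BaseModel, HasTiming):
        name: str


    summary = ExampleSummary(name="demo")
    with summary.record_timing("slow step", metadata={"file_id": "match_1234"}):
        time.sleep(0.1)

    print(summary.timings[0].name)  # "slow step"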

{ingestify-0.3.4 → ingestify-0.4.0}/ingestify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.4
+Version: 0.4.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl