ingestify 0.3.4__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
 from .infra import retrieve_http
 from .source_base import Source, DatasetResource
 
-__version__ = "0.3.4"
+__version__ = "0.4.1"
ingestify/application/dataset_store.py CHANGED
@@ -270,6 +270,7 @@ class DatasetStore:
             metadata=metadata,
             created_at=now,
             updated_at=now,
+            last_modified_at=None,  # Not known at this moment
         )
         revision = self.add_revision(dataset, files, revision_source, description)
 
ingestify/application/ingestion_engine.py CHANGED
@@ -21,8 +21,13 @@ class IngestionEngine:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.loader.add_ingestion_plan(ingestion_plan)
 
-    def load(self, dry_run: bool = False, provider: Optional[str] = None):
-        self.loader.collect_and_run(dry_run=dry_run, provider=provider)
+    def load(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
+        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
ingestify/application/loader.py CHANGED
@@ -29,7 +29,12 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)
 
-    def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@ class Loader:
                 )
                 continue
 
+            if source is not None:
+                if ingestion_plan.source.name != source:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+                    )
+                    continue
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@ class Loader:
 
             # TODO: consider making this lazy and fetch once per Source instead of
            # once per IngestionPlan
+            # TODO: Log exception when `discover_selectors` fails
             all_selectors = ingestion_plan.source.discover_selectors(
                 ingestion_plan.dataset_type
             )
ingestify/cmdline.py CHANGED
@@ -58,7 +58,14 @@ def cli():
     help="bucket",
     type=str,
 )
-@click.option("--debug", "debug", required=False, help="Debugging enabled", type=bool)
+@click.option(
+    "--debug",
+    "debug",
+    required=False,
+    help="Debugging enabled",
+    is_flag=True,
+    type=bool,
+)
 @click.option(
     "--dry-run",
     "dry_run",
@@ -74,11 +81,19 @@ def cli():
     help="Provider - only run tasks for a single provider",
     type=str,
 )
+@click.option(
+    "--source",
+    "source",
+    required=False,
+    help="Source - only run tasks for a single source",
+    type=str,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
     dry_run: Optional[bool],
     provider: Optional[str],
+    source: Optional[str],
     debug: Optional[bool],
 ):
     try:
@@ -90,7 +105,10 @@ def run(
         logger.exception(f"Failed due a configuration error: {e}")
         sys.exit(1)
 
-    engine.load(dry_run=dry_run, provider=provider)
+    if debug:
+        logging.getLogger("root").setLevel(logging.DEBUG)
+
+    engine.load(dry_run=dry_run, provider=provider, source=source)
 
     logger.info("Done")
 
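Note: the new `--source` option mirrors the existing `--provider` option and is threaded from the `run` command through `IngestionEngine.load` into `Loader.collect_and_run`, where ingestion plans whose `source.name` differs are skipped with an INFO log. A minimal sketch of the 0.4.1 call, assuming an already-constructed `IngestionEngine` (here called `engine`) and a purely illustrative source name:

```python
# Sketch only: `engine` stands for an IngestionEngine built from the config file,
# and "statsbomb" is an illustrative source name, not one shipped with ingestify.

# Equivalent of invoking the `run` command with `--dry-run --source statsbomb`:
engine.load(dry_run=True, source="statsbomb")

# Plans whose ingestion_plan.source.name != "statsbomb" are skipped by
# Loader.collect_and_run and logged as
# "Skipping ... because source doesn't match 'statsbomb'".
```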
ingestify/domain/models/dataset/collection_metadata.py CHANGED
@@ -6,7 +6,8 @@ from typing import Optional
 @dataclass
 class DatasetCollectionMetadata:
     # This can be useful to figure out if a backfill is required
-    first_modified: Optional[datetime]
+    # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+    # first_modified: Optional[datetime]
 
     # Use the last modified to only retrieve datasets that are changed
     last_modified: Optional[datetime]
ingestify/domain/models/dataset/dataset.py CHANGED
@@ -22,7 +22,10 @@ class Dataset(BaseModel):
     metadata: dict
     created_at: datetime
     updated_at: datetime
+
     revisions: List[Revision] = Field(default_factory=list)
+    # The last_modified_at is equal to the max modified_at of all files in all revisions
+    last_modified_at: Optional[datetime]
 
     @field_validator("identifier", mode="before")
     @classmethod
@@ -42,6 +45,13 @@ class Dataset(BaseModel):
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
+        if self.last_modified_at:
+            self.last_modified_at = max(
+                self.last_modified_at, revision.last_modified_at
+            )
+        else:
+            self.last_modified_at = revision.last_modified_at
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:
ingestify/domain/models/dataset/revision.py CHANGED
@@ -36,6 +36,10 @@ class Revision(BaseModel):
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION
 
+    @property
+    def last_modified_at(self):
+        return max(file.modified_at for file in self.modified_files)
+
     @property
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
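Note: together with the `Dataset.add_revision` change above, the new `Revision.last_modified_at` property makes a dataset's `last_modified_at` the running maximum of `file.modified_at` over all files in all revisions. A small self-contained sketch of that invariant, using hypothetical timestamps rather than real ingestify models:

```python
# Hypothetical illustration of the last_modified_at bookkeeping introduced in 0.4.1.
from datetime import datetime, timezone

# modified_at values of the files in one revision (stand-ins for File.modified_at)
file_modified_ats = [
    datetime(2024, 1, 1, tzinfo=timezone.utc),
    datetime(2024, 3, 1, tzinfo=timezone.utc),
]

# Revision.last_modified_at == max of the files' modified_at
revision_last_modified_at = max(file_modified_ats)

# Dataset.add_revision keeps the running maximum across revisions
dataset_last_modified_at = None  # as set by DatasetStore at creation time
for revision_last in (revision_last_modified_at,):
    if dataset_last_modified_at:
        dataset_last_modified_at = max(dataset_last_modified_at, revision_last)
    else:
        dataset_last_modified_at = revision_last

assert dataset_last_modified_at == datetime(2024, 3, 1, tzinfo=timezone.utc)
```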
ingestify/domain/models/ingestion/ingestion_job.py CHANGED
@@ -214,9 +214,6 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
-        ingestion_job_exception = (
-            None  # Indicate if there was an exception during the IngestionJob itself
-        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
@@ -224,6 +221,7 @@ class IngestionJob:
         with ingestion_job_summary.record_timing("get_dataset_collection"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
+                provider=self.ingestion_plan.source.provider,
                 data_spec_versions=self.selector.data_spec_versions,
                 selector=self.selector,
                 metadata_only=True,
@@ -233,8 +231,8 @@ class IngestionJob:
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-        with ingestion_job_summary.record_timing("find_datasets"):
-            try:
+        try:
+            with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
                     data_spec_versions=self.selector.data_spec_versions,
@@ -244,12 +242,12 @@ class IngestionJob:
 
                 # We need to include the to_batches as that will start the generator
                 batches = to_batches(dataset_resources)
-            except Exception as e:
-                logger.exception("Failed to find datasets")
+        except Exception as e:
+            logger.exception("Failed to find datasets")
 
-                ingestion_job_summary.set_exception(e)
-                yield ingestion_job_summary
-                return
+            ingestion_job_summary.set_exception(e)
+            yield ingestion_job_summary
+            return
 
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
ingestify/domain/models/ingestion/ingestion_job_summary.py CHANGED
@@ -9,7 +9,7 @@ from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow, HasTiming
 
 if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
@@ -25,7 +25,7 @@ def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"
 
 
-class IngestionJobSummary(BaseModel):
+class IngestionJobSummary(BaseModel, HasTiming):
     ingestion_job_summary_id: str
     ingestion_job_id: str
 
@@ -39,7 +39,6 @@ class IngestionJobSummary(BaseModel):
     started_at: datetime = Field(default_factory=utcnow)
     ended_at: Optional[datetime] = None
     state: IngestionJobState = IngestionJobState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
     skipped_datasets: int = 0
@@ -60,22 +59,6 @@ class IngestionJobSummary(BaseModel):
         )
         return cls(**args)
 
-    @contextmanager
-    def record_timing(self, name: str):
-        start = utcnow()
-        try:
-            yield
-        finally:
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-    def start_timing(self, name):
-        start = utcnow()
-
-        def finish():
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-        return finish
-
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
@@ -101,6 +84,11 @@ class IngestionJobSummary(BaseModel):
         )
         self.ended_at = utcnow()
 
+        # Only keep failed tasks. Rest isn't interesting
+        self.task_summaries = [
+            task for task in self.task_summaries if task.state == TaskState.FAILED
+        ]
+
     def set_finished(self):
         self.state = IngestionJobState.FINISHED
         self._set_ended()
@@ -114,25 +102,27 @@ class IngestionJobSummary(BaseModel):
         return self.ended_at - self.started_at
 
     def output_report(self):
-        print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
-        print("--------------------")
-        print(f" - IngestionPlan:")
-        print(f"    Source: {self.source_name}")
-        print(f"    Provider: {self.provider}")
-        print(f"    DatasetType: {self.dataset_type}")
-        print(f" - Selector: {self.selector}")
-        print(f" - Timings: ")
+        print(
+            f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+        )
+        print("********************************")
+        print(f"* - IngestionPlan:")
+        print(f"*    Source: {self.source_name}")
+        print(f"*    Provider: {self.provider}")
+        print(f"*    DatasetType: {self.dataset_type}")
+        print(f"* - Selector: {self.selector}")
+        print(f"* - Timings: ")
         for timing in self.timings:
-            print(f"    - {timing.name}: {format_duration(timing.duration)}")
+            print(f"*    - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+            f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f" - Failed tasks: {self.failed_tasks}")
-        print(f" - Successful tasks: {self.successful_tasks}")
-        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.skipped_datasets}")
-        print("--------------------")
+        print(f"* - Failed tasks: {self.failed_tasks}")
+        print(f"* - Successful tasks: {self.successful_tasks}")
+        print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"* - Skipped datasets: {self.skipped_datasets}")
+        print("********************************")
 
     def __enter__(self):
         return self
ingestify/domain/models/task/task_summary.py CHANGED
@@ -10,8 +10,7 @@ from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.identifier import Identifier
 from ingestify.domain.models.timing import Timing
 from ingestify.exceptions import IngestifyError
-from ingestify.utils import utcnow
-
+from ingestify.utils import utcnow, HasTiming
 
 logger = logging.getLogger(__name__)
 
@@ -28,7 +27,7 @@ class Operation(str, Enum):
     UPDATE = "UPDATE"
 
 
-class TaskSummary(BaseModel):
+class TaskSummary(BaseModel, HasTiming):
     task_id: str
     started_at: datetime
     operation: Operation
@@ -38,7 +37,6 @@ class TaskSummary(BaseModel):
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
     state: TaskState = TaskState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
 
     @field_validator("dataset_identifier", mode="before")
     @classmethod
@@ -48,27 +46,8 @@ class TaskSummary(BaseModel):
         return value
 
     def record_load_file(self, fn, metadata: dict):
-        start = utcnow()
-        try:
-            result = None
+        with self.record_timing(f"Load of {metadata.get('file_id', 'file')}", metadata):
             return fn()
-        except Exception as e:
-            result = {
-                "type": type(e).__name__,
-                "message": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            raise e
-        finally:
-            metadata = dict(result=result, **metadata)
-            self.timings.append(
-                Timing(
-                    name=f"Load of {metadata.get('file_id', 'file')}",
-                    started_at=start,
-                    ended_at=utcnow(),
-                    metadata=metadata,
-                )
-            )
 
     @classmethod
     @contextmanager
ingestify/infra/serialization/__init__.py CHANGED
@@ -1,12 +1,5 @@
-import json
-from datetime import datetime
-from typing import Type, Any, TypeVar
-
-from dataclass_factory import Schema, Factory, NameStyle
-from dataclass_factory.schema_helpers import type_checker
-
-from ingestify.domain import DatasetCreated, Identifier
-from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain import DatasetCreated
+from ingestify.domain.models.dataset.events import RevisionAdded
 from ingestify.domain.models.event import DomainEvent
 
 
@@ -18,10 +11,6 @@ event_types = {
 
 def deserialize(event_dict: dict) -> DomainEvent:
     event_cls = event_types[event_dict["event_type"]]
-    event_dict["dataset"]["identifier"] = Identifier(
-        **event_dict["dataset"]["identifier"]
-    )
-
     return event_cls.model_validate(event_dict)
 
 
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -1,32 +1,31 @@
 import itertools
-import json
 import uuid
-from collections import defaultdict
 from typing import Optional, Union, List
 
 from sqlalchemy import (
     create_engine,
     func,
     text,
-    tuple_,
     Table,
-    insert,
-    Transaction,
     Connection,
+    union_all,
+    literal,
+    select,
+    and_,
+    Column,
+    or_,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session, joinedload
+from sqlalchemy.orm import Session
 
 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
-    Identifier,
     Selector,
 )
-from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.collection_metadata import (
     DatasetCollectionMetadata,
 )
@@ -127,6 +126,10 @@ class SqlAlchemySessionProvider:
         return self.session
 
 
+def in_(column: Column, values):
+    return or_(*[column == value for value in values])
+
+
 class SqlAlchemyDatasetRepository(DatasetRepository):
     def __init__(self, session_provider: SqlAlchemySessionProvider):
         self.session_provider = session_provider
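Note: the module-level `in_()` helper added above replaces SQLAlchemy's `Column.in_()` calls throughout the repository with an explicit OR of equality comparisons. A minimal sketch of what the helper expands to, using a throwaway table that is not part of ingestify's schema:

```python
# Illustrative only: the `users` table below is not part of ingestify.
from sqlalchemy import Column, Integer, MetaData, Table, or_, select


def in_(column, values):
    # Same shape as the helper in repository.py: col = v1 OR col = v2 OR ...
    return or_(*[column == value for value in values])


users = Table("users", MetaData(), Column("id", Integer, primary_key=True))

stmt = select(users).where(in_(users.c.id, [1, 2, 3]))
# Renders roughly as:
#   SELECT users.id FROM users
#   WHERE users.id = :id_1 OR users.id = :id_2 OR users.id = :id_3
print(stmt)
```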
@@ -169,11 +172,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
     ):
-        query = query.filter(dataset_table.c.bucket == bucket)
-        if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
-        if provider:
-            query = query.filter(dataset_table.c.provider == provider)
         if dataset_id is not None:
             if isinstance(dataset_id, list):
                 if len(dataset_id) == 0:
@@ -181,7 +179,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     # return an empty DatasetCollection
                     return DatasetCollection()
 
-                query = query.filter(dataset_table.c.dataset_id.in_(dataset_id))
+                query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
             else:
                 query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
@@ -201,13 +199,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         if not selectors:
             raise ValueError("Selectors must contain at least one item")
 
-        keys = list(selectors[0].filtered_attributes.keys())
+        attribute_keys = selectors[
+            0
+        ].filtered_attributes.keys()  # Assume all selectors have the same keys
+        attribute_sets = {
+            tuple(selector.filtered_attributes.items()) for selector in selectors
+        }
+
+        # Define a virtual table using a CTE for all attributes
+        attribute_cte = union_all(
+            *[
+                select(*(literal(value).label(key) for key, value in attr_set))
+                for attr_set in attribute_sets
+            ]
+        ).cte("attributes")
 
-        columns = []
+        keys = list(selectors[0].filtered_attributes.keys())
         first_selector = selectors[0].filtered_attributes
 
-        # Create a query like this:
-        # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
+        join_conditions = []
         for k in keys:
             if dialect == "postgresql":
                 column = dataset_table.c.identifier[k]
@@ -215,25 +225,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                 # Take the value from the first selector to determine the type.
                 # TODO: check all selectors to determine the type
                 v = first_selector[k]
-                if isint(v):
+                if isinstance(v, int):
                     column = column.as_integer()
-                elif isfloat(v):
-                    column = column.as_float()
                 else:
                     column = column.as_string()
             else:
                 column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
-            columns.append(column)
 
-        values = []
-        for selector in selectors:
-            filtered_attributes = selector.filtered_attributes
-            values.append(tuple([filtered_attributes[k] for k in keys]))
+            join_conditions.append(attribute_cte.c[k] == column)
 
-        query = query.filter(tuple_(*columns).in_(values))
+        query = query.select_from(
+            dataset_table.join(attribute_cte, and_(*join_conditions))
+        )
 
         if where:
             query = query.filter(text(where))
+
+        query = query.filter(dataset_table.c.bucket == bucket)
+        if dataset_type:
+            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+        if provider:
+            query = query.filter(dataset_table.c.provider == provider)
+
         return query
 
     def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
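Note: the selector filter above now builds a UNION ALL CTE of literal attribute rows and joins `dataset` against it, instead of the old `(col1, col2) IN ((...), (...))` tuple filter; the bucket/dataset_type/provider filters are applied after that join. A hedged sketch of the same construction in SQLAlchemy, using plain illustrative attribute columns rather than ingestify's JSON `identifier` column:

```python
# Sketch of the CTE-based selector join; "competition_id"/"season_id" and the plain
# integer columns are illustrative stand-ins for ingestify's JSON identifier attributes.
from sqlalchemy import Column, Integer, MetaData, Table, and_, literal, select, union_all

dataset = Table(
    "dataset",
    MetaData(),
    Column("dataset_id", Integer, primary_key=True),
    Column("competition_id", Integer),
    Column("season_id", Integer),
)

attribute_sets = {
    (("competition_id", 11), ("season_id", 2023)),
    (("competition_id", 11), ("season_id", 2024)),
}

# One SELECT of literals per selector, combined with UNION ALL into a CTE
attributes = union_all(
    *[select(*(literal(v).label(k) for k, v in attrs)) for attrs in attribute_sets]
).cte("attributes")

# Join the dataset table against the CTE on every selector attribute
stmt = select(dataset).select_from(
    dataset.join(
        attributes,
        and_(
            attributes.c.competition_id == dataset.c.competition_id,
            attributes.c.season_id == dataset.c.season_id,
        ),
    )
)
print(stmt)  # SELECT ... FROM dataset JOIN attributes ON ...
```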
@@ -242,13 +255,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         dataset_rows = list(
             self.session.query(dataset_table).filter(
-                dataset_table.c.dataset_id.in_(dataset_ids)
+                in_(dataset_table.c.dataset_id, dataset_ids)
             )
         )
         revisions_per_dataset = {}
         rows = (
             self.session.query(revision_table)
-            .filter(revision_table.c.dataset_id.in_(dataset_ids))
+            .filter(in_(revision_table.c.dataset_id, dataset_ids))
             .order_by(revision_table.c.dataset_id)
         )
 
@@ -260,7 +273,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         files_per_revision = {}
         rows = (
             self.session.query(file_table)
-            .filter(file_table.c.dataset_id.in_(dataset_ids))
+            .filter(in_(file_table.c.dataset_id, dataset_ids))
             .order_by(file_table.c.dataset_id, file_table.c.revision_id)
         )
 
@@ -320,10 +333,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         metadata_result_row = apply_query_filter(
             self.session.query(
-                func.min(file_table.c.modified_at).label("first_modified_at"),
-                func.max(file_table.c.modified_at).label("last_modified_at"),
+                func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
                 func.count().label("row_count"),
-            ).join(dataset_table, dataset_table.c.dataset_id == file_table.c.dataset_id)
+            )
         ).first()
         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
 
ingestify/infra/store/dataset/sqlalchemy/tables.py CHANGED
@@ -14,8 +14,11 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Index,
 )
 
+from sqlalchemy.dialects.postgresql import JSONB
+
 from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
@@ -25,18 +28,18 @@ from ingestify.domain.models.timing import Timing
 from ingestify.domain.models.dataset.revision import RevisionState
 
 
-def JSONType(serializer=None, deserializer=None):
+def JSONType(serializer=None, deserializer=None, base_type=JSON):
     class _JsonType(TypeDecorator):
         cache_ok = True
-        impl = JSON
+        impl = base_type
 
         def process_bind_param(self, value, dialect):
-            if serializer is not None:
+            if serializer and value is not None:
                 return serializer(value)
             return value
 
         def process_result_value(self, value, dialect):
-            if deserializer is not None:
+            if deserializer and value is not None:
                 return deserializer(value)
             return value
 
@@ -152,14 +155,28 @@ dataset_table = Table(
     metadata,
     Column("bucket", String(255), default=None),
     Column("dataset_id", String(255), primary_key=True),
-    Column("provider", String(255)),
-    Column("dataset_type", String(255)),
+    Column("provider", String(255), index=True),
+    Column("dataset_type", String(255), index=True),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column("identifier", JSONType(deserializer=lambda item: Identifier(**item))),
+    Column(
+        "identifier",
+        # Use JSONB when available
+        JSON().with_variant(JSONB(), "postgresql"),
+    ),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
+    Column("last_modified_at", TZDateTime(6)),
+    # Required for performance querying when there are a lot of Datasets
+    # with the same provider and dataset_type
+    Index(
+        "idx_bucket_type_provider_last_modified",
+        "bucket",
+        "provider",
+        "dataset_type",
+        "last_modified_at",
+    ),
 )
 
 revision_table = Table(
ingestify/utils.py CHANGED
@@ -1,34 +1,23 @@
-import abc
-import asyncio
-import inspect
 import logging
 import os
 import time
 import re
+import traceback
+from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import (
-    Dict,
-    Generic,
-    Type,
-    TypeVar,
-    Tuple,
-    Optional,
-    Any,
-    Callable,
-    Awaitable,
-    List,
-    Iterable,
-)
+from typing import Dict, Tuple, Optional, Any, List
 
 import cloudpickle
+from pydantic import Field
 from typing_extensions import Self
 
 
 from itertools import islice
 
+from ingestify.domain.models.timing import Timing
 
 logger = logging.getLogger(__name__)
 
@@ -221,3 +210,46 @@ def try_number(s: str):
         return float(s)
     except ValueError:
         return s
+
+
+class HasTiming:
+    """Mixin to give Pydantic models ability to time actions."""
+
+    timings: List[Timing] = Field(default_factory=list)
+
+    @contextmanager
+    def record_timing(
+        self, description: str, metadata: Optional[dict] = None
+    ) -> Timing:
+        if not metadata:
+            metadata = {}
+
+        start = utcnow()
+        try:
+            result = None
+            yield
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=description,
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
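Note: the `HasTiming` mixin consolidates the timing helpers that 0.3.4 duplicated on `IngestionJobSummary` and `TaskSummary`; both models now inherit from it, and callers keep using `record_timing(...)` as a context manager (see the `IngestionJob` hunks above). A toy sketch of the usage pattern, assuming the mixin's `timings` field is picked up by a pydantic model the same way it is for ingestify's own models; `DemoSummary` is not an ingestify class:

```python
# Hypothetical usage sketch of the HasTiming mixin from ingestify.utils.
import time

from pydantic import BaseModel

from ingestify.utils import HasTiming


class DemoSummary(BaseModel, HasTiming):  # assumption: a plain pydantic BaseModel works here
    name: str


summary = DemoSummary(name="demo")

# Context-manager form: appends a Timing(name, started_at, ended_at, metadata) entry,
# recording exception details under metadata["result"] if the block raises.
with summary.record_timing("find_datasets", {"file_id": "example"}):
    time.sleep(0.01)

# Callback form: start_timing() returns a `finish` callable that closes the timing.
finish = summary.start_timing("tasks")
time.sleep(0.01)
finish()

print([t.name for t in summary.timings])  # ["find_datasets", "tasks"]
```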
ingestify-0.3.4.dist-info/METADATA → ingestify-0.4.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.4
+Version: 0.4.1
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
ingestify-0.3.4.dist-info/RECORD → ingestify-0.4.1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=lyBZ_P8y4qlkE1e11F4T41fSTp8WbReifRxX9UGizxA,301
-ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
+ingestify/__init__.py,sha256=xCS7JQ_JaB6zVzrq6WUeAZyNxVKJEOc7AKh-3vY_Ji8,301
+ingestify/cmdline.py,sha256=oagUe-Jup1SU3s6jVl25f0cSG0wlNYhxFY-gGBwWmr0,7482
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
 ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
+ingestify/utils.py,sha256=6BqgEZjecLW_anqYP5WrFpi93bmdhF-EdrebEkm59Ds,6806
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=c10EIxzOfO4ksKwPOI9jcOn33j54QWu_qXOMLwe-Y-A,11617
-ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
-ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
+ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
+ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
+ingestify/application/loader.py,sha256=2LpYSHvedsoe5wFsIkQv0xLcKcqtebwVOSPWapAypao,7566
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -20,8 +20,8 @@ ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvf
 ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
-ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
-ingestify/domain/models/dataset/dataset.py,sha256=6iQgBApRK08GhxArnJjjE9SuJMMOsKx_gI6JDHy5nZc,2970
+ingestify/domain/models/dataset/collection_metadata.py,sha256=u2H3XZ-6NMfuAcVD_mb7jEc1IkS1MgQahDIe5CWHjtc,458
+ingestify/domain/models/dataset/dataset.py,sha256=STew8_zCBro_x_u03JrjMvq8yqUDaNndlOlolf9osdM,3332
 ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
 ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
 ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
@@ -29,7 +29,7 @@ ingestify/domain/models/dataset/file.py,sha256=1Thdv6A1YmC1UfutaRf2q3FGHQYO0SWEp
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=HPOZpVmQSwdcsr90RNVlOQ7c1_W7grzi5E1NOiEK92g,1331
+ingestify/domain/models/dataset/revision.py,sha256=jBjMqYXDbvt_VAIwL_db09jcH4W8JPRKsXJb4JCniuM,1447
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
@@ -39,15 +39,15 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=H3vnEUS3izuNJfmD7ZGbznemX9r2JZ1po7D7D9ArzwM,13230
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=cgm8kLoX3eK9SkBYe5HhwA7kg5FAyN4kfTCJrVHaRlc,4702
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=H9jfbbWFZw73nxMOW0480LgSHV-o4sA5IcvpUZmFpS4,13140
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=Xvmtu0BwE9C7FxBl6D8tN49I6--E_RngcMfWeme4DPA,4499
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
-ingestify/domain/models/task/task_summary.py,sha256=Ncf6ij_aLkElZOsBgep-kd82FyzHjr5xjhAbAXNRJUs,3757
+ingestify/domain/models/task/task_summary.py,sha256=T9BSGhOZjKCPfym34VUdBXuMy0o6E832GAI4WMtjGao,3181
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -55,7 +55,7 @@ ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
-ingestify/infra/serialization/__init__.py,sha256=-i8XLJDI2hwlX65JITcIzuOaGLJaNekgG9OfA6L7Enc,1035
+ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVUbm_sg9GWx9eI,702
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,8 +64,8 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=86BqLhj5pB45iNSfYWbuMNwo-9KnGbbSYtdD8WJw_qo,16003
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=b73jqpW-_QubtZpFJv7BTKdTsKbufESP0O1uJCmFfBE,10106
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=3xDTqEEy_MxZoIX9qezpXasOFW7NMmduJEaR0PwTZXk,16110
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=OLB1FMElb3gSAnOsKX-oiLl_YVXaVEa6Q29QoHp2okU,10602
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.3.4.dist-info/METADATA,sha256=v5rEF3343auBHwK8K5Zu0C8tTYfm0WjGtyZs0SmY3xg,18854
-ingestify-0.3.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ingestify-0.3.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
-ingestify-0.3.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
-ingestify-0.3.4.dist-info/RECORD,,
+ingestify-0.4.1.dist-info/METADATA,sha256=Tz062FbilTuQmmW2FPyr2sj0GIK1vjtZs189R5bkxEM,18854
+ingestify-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.4.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.4.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.4.1.dist-info/RECORD,,
+ ingestify-0.4.1.dist-info/RECORD,,