ingestify 0.3.4__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +1 -0
- ingestify/application/ingestion_engine.py +7 -2
- ingestify/application/loader.py +14 -1
- ingestify/cmdline.py +20 -2
- ingestify/domain/models/dataset/collection_metadata.py +2 -1
- ingestify/domain/models/dataset/dataset.py +10 -0
- ingestify/domain/models/dataset/revision.py +4 -0
- ingestify/domain/models/ingestion/ingestion_job.py +8 -10
- ingestify/domain/models/ingestion/ingestion_job_summary.py +24 -34
- ingestify/domain/models/task/task_summary.py +3 -24
- ingestify/infra/serialization/__init__.py +2 -13
- ingestify/infra/store/dataset/sqlalchemy/repository.py +45 -33
- ingestify/infra/store/dataset/sqlalchemy/tables.py +24 -7
- ingestify/utils.py +48 -16
- {ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/METADATA +1 -1
- {ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/RECORD +20 -20
- {ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/WHEEL +0 -0
- {ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/entry_points.txt +0 -0
- {ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED

ingestify/application/ingestion_engine.py
CHANGED

@@ -21,8 +21,13 @@ class IngestionEngine:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.loader.add_ingestion_plan(ingestion_plan)
 
-    def load(
-        self
+    def load(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
+        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
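
IngestionEngine.load() now accepts dry_run, provider and source and simply forwards them to Loader.collect_and_run(). A minimal usage sketch; how the engine is constructed is outside this diff, and the provider/source values below are illustrative:

    # Sketch only: `engine` stands for an IngestionEngine built from your configuration.
    engine = ...
    # Restrict the run to a single provider and a single named source, without writing anything.
    engine.load(dry_run=True, provider="statsbomb", source="statsbomb_github")
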
ingestify/application/loader.py
CHANGED
@@ -29,7 +29,12 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)
 
-    def collect_and_run(
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@ class Loader:
                 )
                 continue
 
+            if source is not None:
+                if ingestion_plan.source.name != source:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+                    )
+                    continue
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@ class Loader:
 
             # TODO: consider making this lazy and fetch once per Source instead of
             # once per IngestionPlan
+            # TODO: Log exception when `discover_selectors` fails
             all_selectors = ingestion_plan.source.discover_selectors(
                 ingestion_plan.dataset_type
             )
ingestify/cmdline.py
CHANGED
@@ -58,7 +58,14 @@ def cli():
     help="bucket",
     type=str,
 )
-@click.option(
+@click.option(
+    "--debug",
+    "debug",
+    required=False,
+    help="Debugging enabled",
+    is_flag=True,
+    type=bool,
+)
 @click.option(
     "--dry-run",
     "dry_run",
@@ -74,11 +81,19 @@ def cli():
     help="Provider - only run tasks for a single provider",
     type=str,
 )
+@click.option(
+    "--source",
+    "source",
+    required=False,
+    help="Source - only run tasks for a single source",
+    type=str,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
     dry_run: Optional[bool],
     provider: Optional[str],
+    source: Optional[str],
     debug: Optional[bool],
 ):
     try:
@@ -90,7 +105,10 @@ def run(
         logger.exception(f"Failed due a configuration error: {e}")
         sys.exit(1)
 
-
+    if debug:
+        logging.getLogger("root").setLevel(logging.DEBUG)
+
+    engine.load(dry_run=dry_run, provider=provider, source=source)
 
     logger.info("Done")
 
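
The run command gains --debug and --source next to the existing --dry-run and --provider options; --source is matched against IngestionPlan.source.name in the loader. A hedged sketch that exercises the new flags through Click's test runner; the cli group and run command names come from this diff, while the way the config file is passed is not shown here and the --config option below is only an assumption:

    from click.testing import CliRunner

    from ingestify.cmdline import cli

    runner = CliRunner()
    result = runner.invoke(
        cli,
        # "--config config.yaml" is an assumed way to point at the configuration;
        # the remaining flags are the ones added or reused in this diff.
        ["run", "--config", "config.yaml", "--dry-run", "--source", "statsbomb", "--debug"],
    )
    print(result.exit_code, result.output)
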
ingestify/domain/models/dataset/collection_metadata.py
CHANGED

@@ -6,7 +6,8 @@ from typing import Optional
 @dataclass
 class DatasetCollectionMetadata:
     # This can be useful to figure out if a backfill is required
-
+    # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+    # first_modified: Optional[datetime]
 
     # Use the last modified to only retrieve datasets that are changed
     last_modified: Optional[datetime]
ingestify/domain/models/dataset/dataset.py
CHANGED

@@ -22,7 +22,10 @@ class Dataset(BaseModel):
     metadata: dict
     created_at: datetime
     updated_at: datetime
+
     revisions: List[Revision] = Field(default_factory=list)
+    # The last_modified_at is equal to the max modified_at of all files in all revisions
+    last_modified_at: Optional[datetime]
 
     @field_validator("identifier", mode="before")
     @classmethod
@@ -42,6 +45,13 @@ class Dataset(BaseModel):
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
+        if self.last_modified_at:
+            self.last_modified_at = max(
+                self.last_modified_at, revision.last_modified_at
+            )
+        else:
+            self.last_modified_at = revision.last_modified_at
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:
ingestify/domain/models/dataset/revision.py
CHANGED

@@ -36,6 +36,10 @@ class Revision(BaseModel):
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION
 
+    @property
+    def last_modified_at(self):
+        return max(file.modified_at for file in self.modified_files)
+
     @property
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
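
Taken together, Revision.last_modified_at is the maximum modified_at over the revision's modified_files, and Dataset.add_revision() folds that into Dataset.last_modified_at as a running maximum. A small self-contained illustration of the same reduction using plain datetimes instead of the actual models:

    from datetime import datetime, timezone

    # Stand-ins for File.modified_at values in two revisions (illustrative data).
    revisions = [
        [datetime(2024, 1, 1, tzinfo=timezone.utc), datetime(2024, 3, 1, tzinfo=timezone.utc)],
        [datetime(2024, 2, 15, tzinfo=timezone.utc)],
    ]
    # Revision.last_modified_at: max over the revision's files.
    per_revision = [max(files) for files in revisions]
    # Dataset.last_modified_at: running max over revisions, as add_revision() now maintains it.
    last_modified_at = None
    for value in per_revision:
        last_modified_at = value if last_modified_at is None else max(last_modified_at, value)
    print(last_modified_at)  # 2024-03-01 00:00:00+00:00
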
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -214,9 +214,6 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
-        ingestion_job_exception = (
-            None  # Indicate if there was an exception during the IngestionJob itself
-        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
@@ -224,6 +221,7 @@
         with ingestion_job_summary.record_timing("get_dataset_collection"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
+                provider=self.ingestion_plan.source.provider,
                 data_spec_versions=self.selector.data_spec_versions,
                 selector=self.selector,
                 metadata_only=True,
@@ -233,8 +231,8 @@
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
        # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-
-
+        try:
+            with ingestion_job_summary.record_timing("find_datasets"):
                 dataset_resources = self.ingestion_plan.source.find_datasets(
                     dataset_type=self.ingestion_plan.dataset_type,
                     data_spec_versions=self.selector.data_spec_versions,
@@ -244,12 +242,12 @@
 
                 # We need to include the to_batches as that will start the generator
                 batches = to_batches(dataset_resources)
-
-
+        except Exception as e:
+            logger.exception("Failed to find datasets")
 
-
-
-
+            ingestion_job_summary.set_exception(e)
+            yield ingestion_job_summary
+            return
 
         finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
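
The net effect is that a failure inside find_datasets no longer escapes the generator: the exception is logged, recorded on the summary via set_exception(), the summary is yielded, and iteration stops. A hedged sketch of how a caller might observe that; only set_exception() and the yield appear in this hunk, so the execute() method name and the FAILED state below are assumptions:

    # `ingestion_job`, `store` and `task_executor` are provided by the caller.
    for summary in ingestion_job.execute(store, task_executor):
        summary.output_report()
        if summary.state == IngestionJobState.FAILED:  # FAILED is assumed, not shown in this diff
            print(f"Ingestion job {summary.ingestion_job_id} stopped while finding datasets")
            break
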
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -9,7 +9,7 @@ from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow, HasTiming
 
 if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
@@ -25,7 +25,7 @@ def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"
 
 
-class IngestionJobSummary(BaseModel):
+class IngestionJobSummary(BaseModel, HasTiming):
     ingestion_job_summary_id: str
     ingestion_job_id: str
 
@@ -39,7 +39,6 @@ class IngestionJobSummary(BaseModel):
     started_at: datetime = Field(default_factory=utcnow)
     ended_at: Optional[datetime] = None
     state: IngestionJobState = IngestionJobState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
     skipped_datasets: int = 0
@@ -60,22 +59,6 @@
         )
         return cls(**args)
 
-    @contextmanager
-    def record_timing(self, name: str):
-        start = utcnow()
-        try:
-            yield
-        finally:
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-    def start_timing(self, name):
-        start = utcnow()
-
-        def finish():
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-        return finish
-
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
@@ -101,6 +84,11 @@
         )
         self.ended_at = utcnow()
 
+        # Only keep failed tasks. Rest isn't interesting
+        self.task_summaries = [
+            task for task in self.task_summaries if task.state == TaskState.FAILED
+        ]
+
     def set_finished(self):
         self.state = IngestionJobState.FINISHED
         self._set_ended()
@@ -114,25 +102,27 @@
         return self.ended_at - self.started_at
 
     def output_report(self):
-        print(
-
-
-        print(
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(
+            f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+        )
+        print("********************************")
+        print(f"* - IngestionPlan:")
+        print(f"* Source: {self.source_name}")
+        print(f"* Provider: {self.provider}")
+        print(f"* DatasetType: {self.dataset_type}")
+        print(f"* - Selector: {self.selector}")
+        print(f"* - Timings: ")
         for timing in self.timings:
-            print(f" - {timing.name}: {format_duration(timing.duration)}")
+            print(f"* - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+            f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f" - Failed tasks: {self.failed_tasks}")
-        print(f" - Successful tasks: {self.successful_tasks}")
-        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.skipped_datasets}")
-        print("
+        print(f"* - Failed tasks: {self.failed_tasks}")
+        print(f"* - Successful tasks: {self.successful_tasks}")
+        print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"* - Skipped datasets: {self.skipped_datasets}")
+        print("********************************")
 
     def __enter__(self):
         return self
ingestify/domain/models/task/task_summary.py
CHANGED

@@ -10,8 +10,7 @@ from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.identifier import Identifier
 from ingestify.domain.models.timing import Timing
 from ingestify.exceptions import IngestifyError
-from ingestify.utils import utcnow
-
+from ingestify.utils import utcnow, HasTiming
 
 logger = logging.getLogger(__name__)
 
@@ -28,7 +27,7 @@ class Operation(str, Enum):
     UPDATE = "UPDATE"
 
 
-class TaskSummary(BaseModel):
+class TaskSummary(BaseModel, HasTiming):
     task_id: str
     started_at: datetime
     operation: Operation
@@ -38,7 +37,6 @@ class TaskSummary(BaseModel):
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
     state: TaskState = TaskState.RUNNING
-    timings: List[Timing] = Field(default_factory=list)
 
     @field_validator("dataset_identifier", mode="before")
     @classmethod
@@ -48,27 +46,8 @@ class TaskSummary(BaseModel):
         return value
 
     def record_load_file(self, fn, metadata: dict):
-
-        try:
-            result = None
+        with self.record_timing(f"Load of {metadata.get('file_id', 'file')}", metadata):
             return fn()
-        except Exception as e:
-            result = {
-                "type": type(e).__name__,
-                "message": str(e),
-                "traceback": traceback.format_exc(),
-            }
-            raise e
-        finally:
-            metadata = dict(result=result, **metadata)
-            self.timings.append(
-                Timing(
-                    name=f"Load of {metadata.get('file_id', 'file')}",
-                    started_at=start,
-                    ended_at=utcnow(),
-                    metadata=metadata,
-                )
-            )
 
     @classmethod
     @contextmanager
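
record_load_file() now delegates its bookkeeping to the shared HasTiming.record_timing() context manager: the timing entry gets result=None on success, or the exception type, message and traceback (with the exception re-raised) on failure. A short usage sketch; task_summary stands for an existing TaskSummary and the callable and metadata are illustrative:

    def fetch():
        return b"..."  # stand-in for the real file-loading callable

    data = task_summary.record_load_file(fetch, metadata={"file_id": "events.json"})
    # task_summary.timings now ends with a Timing named "Load of events.json"
    # whose metadata records the outcome of the call.
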
ingestify/infra/serialization/__init__.py
CHANGED

@@ -1,12 +1,5 @@
-import
-from
-from typing import Type, Any, TypeVar
-
-from dataclass_factory import Schema, Factory, NameStyle
-from dataclass_factory.schema_helpers import type_checker
-
-from ingestify.domain import DatasetCreated, Identifier
-from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain import DatasetCreated
+from ingestify.domain.models.dataset.events import RevisionAdded
 from ingestify.domain.models.event import DomainEvent
 
 
@@ -18,10 +11,6 @@ event_types = {
 
 def deserialize(event_dict: dict) -> DomainEvent:
     event_cls = event_types[event_dict["event_type"]]
-    event_dict["dataset"]["identifier"] = Identifier(
-        **event_dict["dataset"]["identifier"]
-    )
-
     return event_cls.model_validate(event_dict)
 
 
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -1,32 +1,31 @@
 import itertools
-import json
 import uuid
-from collections import defaultdict
 from typing import Optional, Union, List
 
 from sqlalchemy import (
     create_engine,
     func,
     text,
-    tuple_,
     Table,
-    insert,
-    Transaction,
     Connection,
+    union_all,
+    literal,
+    select,
+    and_,
+    Column,
+    or_,
 )
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
-from sqlalchemy.orm import Session
+from sqlalchemy.orm import Session
 
 from ingestify.domain import File, Revision
 from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
-    Identifier,
     Selector,
 )
-from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.collection_metadata import (
     DatasetCollectionMetadata,
 )
@@ -127,6 +126,10 @@ class SqlAlchemySessionProvider:
         return self.session
 
 
+def in_(column: Column, values):
+    return or_(*[column == value for value in values])
+
+
 class SqlAlchemyDatasetRepository(DatasetRepository):
     def __init__(self, session_provider: SqlAlchemySessionProvider):
         self.session_provider = session_provider
@@ -169,11 +172,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
     ):
-        query = query.filter(dataset_table.c.bucket == bucket)
-        if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
-        if provider:
-            query = query.filter(dataset_table.c.provider == provider)
         if dataset_id is not None:
             if isinstance(dataset_id, list):
                 if len(dataset_id) == 0:
@@ -181,7 +179,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     # return an empty DatasetCollection
                     return DatasetCollection()
 
-                query = query.filter(dataset_table.c.dataset_id
+                query = query.filter(in_(dataset_table.c.dataset_id, dataset_id))
             else:
                 query = query.filter(dataset_table.c.dataset_id == dataset_id)
 
@@ -201,13 +199,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             if not selectors:
                 raise ValueError("Selectors must contain at least one item")
 
-
+            attribute_keys = selectors[
+                0
+            ].filtered_attributes.keys()  # Assume all selectors have the same keys
+            attribute_sets = {
+                tuple(selector.filtered_attributes.items()) for selector in selectors
+            }
+
+            # Define a virtual table using a CTE for all attributes
+            attribute_cte = union_all(
+                *[
+                    select(*(literal(value).label(key) for key, value in attr_set))
+                    for attr_set in attribute_sets
+                ]
+            ).cte("attributes")
 
-
+            keys = list(selectors[0].filtered_attributes.keys())
             first_selector = selectors[0].filtered_attributes
 
-
-            # SELECT * FROM dataset WHERE (column1, column2, column3) IN ((1, 2, 3), (4, 5, 6), (7, 8, 9))
+            join_conditions = []
             for k in keys:
                 if dialect == "postgresql":
                     column = dataset_table.c.identifier[k]
@@ -215,25 +225,28 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     # Take the value from the first selector to determine the type.
                     # TODO: check all selectors to determine the type
                     v = first_selector[k]
-                    if
+                    if isinstance(v, int):
                         column = column.as_integer()
-                    elif isfloat(v):
-                        column = column.as_float()
                     else:
                         column = column.as_string()
                 else:
                     column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
-                columns.append(column)
 
-
-            for selector in selectors:
-                filtered_attributes = selector.filtered_attributes
-                values.append(tuple([filtered_attributes[k] for k in keys]))
+                join_conditions.append(attribute_cte.c[k] == column)
 
-            query = query.
+            query = query.select_from(
+                dataset_table.join(attribute_cte, and_(*join_conditions))
+            )
 
         if where:
             query = query.filter(text(where))
+
+        query = query.filter(dataset_table.c.bucket == bucket)
+        if dataset_type:
+            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+        if provider:
+            query = query.filter(dataset_table.c.provider == provider)
+
         return query
 
     def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
@@ -242,13 +255,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         dataset_rows = list(
             self.session.query(dataset_table).filter(
-                dataset_table.c.dataset_id
+                in_(dataset_table.c.dataset_id, dataset_ids)
             )
         )
         revisions_per_dataset = {}
         rows = (
             self.session.query(revision_table)
-            .filter(revision_table.c.dataset_id
+            .filter(in_(revision_table.c.dataset_id, dataset_ids))
             .order_by(revision_table.c.dataset_id)
         )
 
@@ -260,7 +273,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         files_per_revision = {}
         rows = (
             self.session.query(file_table)
-            .filter(file_table.c.dataset_id
+            .filter(in_(file_table.c.dataset_id, dataset_ids))
             .order_by(file_table.c.dataset_id, file_table.c.revision_id)
         )
 
@@ -320,10 +333,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         metadata_result_row = apply_query_filter(
             self.session.query(
-                func.
-                func.max(file_table.c.modified_at).label("last_modified_at"),
+                func.max(dataset_table.c.last_modified_at).label("last_modified_at"),
                 func.count().label("row_count"),
-            )
+            )
         ).first()
         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)
 
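
Two ideas from this file in isolation: in_() expands a membership test into OR-ed equalities (instead of Column.in_() or a composite tuple IN), and the distinct selector attribute sets become a UNION ALL of literal SELECTs exposed as an "attributes" CTE that is inner-joined to the dataset columns. A self-contained sketch of both against a throwaway SQLite table; it uses plain columns rather than the JSON identifier, and all names and values are illustrative:

    from sqlalchemy import (
        Column, Integer, MetaData, String, Table,
        and_, create_engine, literal, or_, select, union_all,
    )

    engine = create_engine("sqlite://")
    metadata = MetaData()
    items = Table("items", metadata, Column("season", Integer), Column("team", String))
    metadata.create_all(engine)

    def in_(column, values):
        # Same shape as the helper added in this diff: OR-ed equality per value.
        return or_(*[column == value for value in values])

    # One SELECT of labelled literals per attribute set, UNION ALL-ed into a virtual table.
    attribute_sets = [(("season", 2023), ("team", "home")), (("season", 2024), ("team", "away"))]
    attribute_cte = union_all(
        *[select(*(literal(v).label(k) for k, v in attrs)) for attrs in attribute_sets]
    ).cte("attributes")

    join_conditions = [attribute_cte.c[k] == items.c[k] for k in ("season", "team")]
    query = select(items).select_from(items.join(attribute_cte, and_(*join_conditions)))

    with engine.connect() as conn:
        conn.execute(items.insert(), [{"season": 2023, "team": "home"}, {"season": 2024, "team": "home"}])
        print(conn.execute(query).fetchall())  # only (2023, 'home') matches an attribute set
        print(conn.execute(select(items).where(in_(items.c.season, [2023, 2024]))).fetchall())
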
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED

@@ -14,8 +14,11 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
+    Index,
 )
 
+from sqlalchemy.dialects.postgresql import JSONB
+
 from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
@@ -25,18 +28,18 @@ from ingestify.domain.models.timing import Timing
 from ingestify.domain.models.dataset.revision import RevisionState
 
 
-def JSONType(serializer=None, deserializer=None):
+def JSONType(serializer=None, deserializer=None, base_type=JSON):
     class _JsonType(TypeDecorator):
         cache_ok = True
-        impl =
+        impl = base_type
 
         def process_bind_param(self, value, dialect):
-            if serializer is not None:
+            if serializer and value is not None:
                 return serializer(value)
             return value
 
         def process_result_value(self, value, dialect):
-            if deserializer is not None:
+            if deserializer and value is not None:
                 return deserializer(value)
             return value
 
@@ -152,14 +155,28 @@ dataset_table = Table(
     metadata,
     Column("bucket", String(255), default=None),
     Column("dataset_id", String(255), primary_key=True),
-    Column("provider", String(255)),
-    Column("dataset_type", String(255)),
+    Column("provider", String(255), index=True),
+    Column("dataset_type", String(255), index=True),
     Column("state", DatasetStateString),
     Column("name", String(255)),
-    Column(
+    Column(
+        "identifier",
+        # Use JSONB when available
+        JSON().with_variant(JSONB(), "postgresql"),
+    ),
     Column("metadata", JSON),
     Column("created_at", TZDateTime(6)),
     Column("updated_at", TZDateTime(6)),
+    Column("last_modified_at", TZDateTime(6)),
+    # Required for performance querying when there are a lot of Datasets
+    # with the same provider and dataset_type
+    Index(
+        "idx_bucket_type_provider_last_modified",
+        "bucket",
+        "provider",
+        "dataset_type",
+        "last_modified_at",
+    ),
 )
 
 revision_table = Table(
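
dataset_table now indexes provider and dataset_type, stores the identifier as JSONB on PostgreSQL while keeping plain JSON elsewhere, and adds a composite index ending in last_modified_at for the common bucket/provider/dataset_type lookup. A minimal standalone sketch of the same constructs on a made-up table:

    from sqlalchemy import JSON, Column, DateTime, Index, MetaData, String, Table, create_engine
    from sqlalchemy.dialects.postgresql import JSONB

    metadata = MetaData()
    example_table = Table(
        "example_dataset",
        metadata,
        Column("dataset_id", String(255), primary_key=True),
        Column("provider", String(255), index=True),  # single-column index, as in the diff
        # Plain JSON everywhere, upgraded to JSONB when the bind dialect is PostgreSQL.
        Column("identifier", JSON().with_variant(JSONB(), "postgresql")),
        Column("last_modified_at", DateTime(timezone=True)),
        # Composite index for "datasets of this provider changed since X" style queries.
        Index("ix_example_provider_last_modified", "provider", "last_modified_at"),
    )

    # Emits CREATE TABLE plus both CREATE INDEX statements; a PostgreSQL URL would yield JSONB.
    metadata.create_all(create_engine("sqlite://"))
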
ingestify/utils.py
CHANGED
@@ -1,34 +1,23 @@
-import abc
-import asyncio
-import inspect
 import logging
 import os
 import time
 import re
+import traceback
+from contextlib import contextmanager
 from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import
-    Dict,
-    Generic,
-    Type,
-    TypeVar,
-    Tuple,
-    Optional,
-    Any,
-    Callable,
-    Awaitable,
-    List,
-    Iterable,
-)
+from typing import Dict, Tuple, Optional, Any, List
 
 import cloudpickle
+from pydantic import Field
 from typing_extensions import Self
 
 
 from itertools import islice
 
+from ingestify.domain.models.timing import Timing
 
 logger = logging.getLogger(__name__)
 
@@ -221,3 +210,46 @@ def try_number(s: str):
         return float(s)
     except ValueError:
         return s
+
+
+class HasTiming:
+    """Mixin to give Pydantic models ability to time actions."""
+
+    timings: List[Timing] = Field(default_factory=list)
+
+    @contextmanager
+    def record_timing(
+        self, description: str, metadata: Optional[dict] = None
+    ) -> Timing:
+        if not metadata:
+            metadata = {}
+
+        start = utcnow()
+        try:
+            result = None
+            yield
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=description,
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
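
HasTiming pulls the timing bookkeeping out of IngestionJobSummary and TaskSummary: a shared timings list, a record_timing() context manager that stores the outcome (or the exception type, message and traceback) in the Timing metadata, and start_timing() returning a finish() callback. A hedged sketch of mixing it into a pydantic-style model the way the two summary classes do; the DownloadSummary model below is made up and not part of ingestify:

    from ingestify.domain.models.base import BaseModel
    from ingestify.utils import HasTiming

    class DownloadSummary(BaseModel, HasTiming):
        url: str

    summary = DownloadSummary(url="https://example.com/data.json")

    # Timed block: a Timing entry is appended whether the body succeeds or raises.
    with summary.record_timing("fetch", metadata={"url": summary.url}):
        payload = b"..."  # stand-in for the actual download

    # Manual start/finish pair, as ingestion_job.py uses around the "tasks" phase.
    finish = summary.start_timing("tasks")
    # ... work would happen here ...
    finish()

    for timing in summary.timings:
        print(timing.name, timing.ended_at - timing.started_at)
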
{ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=xCS7JQ_JaB6zVzrq6WUeAZyNxVKJEOc7AKh-3vY_Ji8,301
+ingestify/cmdline.py,sha256=oagUe-Jup1SU3s6jVl25f0cSG0wlNYhxFY-gGBwWmr0,7482
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
 ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=6BqgEZjecLW_anqYP5WrFpi93bmdhF-EdrebEkm59Ds,6806
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=JkAb1W0HaUgOwbohKntM4ttyrFXQ7df1uZSu2rbZllY,11680
+ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
+ingestify/application/loader.py,sha256=2LpYSHvedsoe5wFsIkQv0xLcKcqtebwVOSPWapAypao,7566
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -20,8 +20,8 @@ ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvf
 ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
-ingestify/domain/models/dataset/collection_metadata.py,sha256=
-ingestify/domain/models/dataset/dataset.py,sha256=
+ingestify/domain/models/dataset/collection_metadata.py,sha256=u2H3XZ-6NMfuAcVD_mb7jEc1IkS1MgQahDIe5CWHjtc,458
+ingestify/domain/models/dataset/dataset.py,sha256=STew8_zCBro_x_u03JrjMvq8yqUDaNndlOlolf9osdM,3332
 ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
 ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
 ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
@@ -29,7 +29,7 @@ ingestify/domain/models/dataset/file.py,sha256=1Thdv6A1YmC1UfutaRf2q3FGHQYO0SWEp
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=
+ingestify/domain/models/dataset/revision.py,sha256=jBjMqYXDbvt_VAIwL_db09jcH4W8JPRKsXJb4JCniuM,1447
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
@@ -39,15 +39,15 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=H9jfbbWFZw73nxMOW0480LgSHV-o4sA5IcvpUZmFpS4,13140
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=Xvmtu0BwE9C7FxBl6D8tN49I6--E_RngcMfWeme4DPA,4499
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
-ingestify/domain/models/task/task_summary.py,sha256=
+ingestify/domain/models/task/task_summary.py,sha256=T9BSGhOZjKCPfym34VUdBXuMy0o6E832GAI4WMtjGao,3181
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -55,7 +55,7 @@ ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
-ingestify/infra/serialization/__init__.py,sha256
+ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVUbm_sg9GWx9eI,702
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,8 +64,8 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=3xDTqEEy_MxZoIX9qezpXasOFW7NMmduJEaR0PwTZXk,16110
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=OLB1FMElb3gSAnOsKX-oiLl_YVXaVEa6Q29QoHp2okU,10602
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.4.1.dist-info/METADATA,sha256=Tz062FbilTuQmmW2FPyr2sj0GIK1vjtZs189R5bkxEM,18854
+ingestify-0.4.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.4.1.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.4.1.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.4.1.dist-info/RECORD,,
{ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/WHEEL
File without changes

{ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/entry_points.txt
File without changes

{ingestify-0.3.4.dist-info → ingestify-0.4.1.dist-info}/top_level.txt
File without changes