ingestify 0.6.1__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.6.1 → ingestify-0.6.2}/PKG-INFO +1 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/__init__.py +1 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/dataset_store.py +19 -5
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/collection_metadata.py +3 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset.py +17 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/file.py +6 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/revision.py +11 -2
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/selector.py +11 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/fetch_policy.py +3 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job.py +65 -52
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py +5 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/fetch/http.py +14 -6
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py +14 -9
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.6.1 → ingestify-0.6.2}/README.md +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/loader.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/cmdline.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/exceptions.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/main.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/server.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/source_base.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify/utils.py +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/setup.cfg +0 -0
- {ingestify-0.6.1 → ingestify-0.6.2}/setup.py +0 -0
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/application/dataset_store.py

@@ -11,6 +11,7 @@ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable
 
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.domain.models.dataset.file_collection import FileCollection
 from ingestify.domain.models.dataset.revision import RevisionSource
 from ingestify.domain.models.event import EventBus
@@ -140,8 +141,8 @@ class DatasetStore:
         current_revision = dataset.current_revision
 
         for file_id, file_ in modified_files.items():
-            if file_ is None:
-                # It's always allowed to pass
+            if isinstance(file_, NotModifiedFile):
+                # It's always allowed to pass NotModifiedFile as file. This means it didn't change and must be ignored.
                 continue
 
             current_file = (
@@ -210,9 +211,22 @@ class DatasetStore:
                 f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
             )
         else:
-            logger.info(
-                f"Ignoring a new revision without changed files -> {dataset.identifier}"
-            )
+            if dataset.update_last_modified(files):
+                # For some Datasets the last modified doesn't make sense (for sources that don't provide it)
+                # Do we want to update last modified of a Dataset when the value is utcnow()?
+                # self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
+                # TODO: dispatch some event?
+                # self.dispatch(DatasetLastModifiedChanged(dataset=dataset))
+                logger.info(
+                    f"Ignoring a new revision without changed files -> {dataset.identifier}, but "
+                    f"might need to update last modified to {dataset.last_modified_at} ?"
+                )
+
+            else:
+                logger.info(
+                    f"Ignoring a new revision without changed files -> {dataset.identifier}"
+                )
+
             revision = None
 
         return revision
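
The net effect of these dataset_store.py changes: fetchers signal "unchanged" with an explicit NotModifiedFile instead of a bare None, and a revision with no real changes can still advance the dataset's last-modified watermark. A minimal standalone sketch of the filtering idea (plain dataclasses here; the real models are pydantic BaseModels):

    from dataclasses import dataclass
    from datetime import datetime, timezone
    from typing import Dict, Union

    @dataclass
    class DraftFile:
        modified_at: datetime
        content: bytes

    @dataclass
    class NotModifiedFile:
        modified_at: datetime
        reason: str

    def changed_files(
        modified_files: Dict[str, Union[DraftFile, NotModifiedFile]]
    ) -> Dict[str, DraftFile]:
        # Mirrors the new loop: NotModifiedFile entries are always allowed
        # and simply skipped, so only real changes end up in the revision.
        return {
            file_id: f
            for file_id, f in modified_files.items()
            if not isinstance(f, NotModifiedFile)
        }

    utc = timezone.utc
    files = {
        "events": DraftFile(datetime(2024, 5, 1, tzinfo=utc), b"..."),
        "lineups": NotModifiedFile(datetime(2024, 5, 1, tzinfo=utc), "tag matched current_file"),
    }
    assert list(changed_files(files)) == ["events"]
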
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/dataset.py

@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import List, Optional
+from typing import List, Optional, Dict
 from pydantic import Field, field_validator
 
 from ingestify.utils import utcnow
@@ -52,6 +52,22 @@ class Dataset(BaseModel):
         else:
             self.last_modified_at = revision.last_modified_at
 
+    def update_last_modified(self, files: Dict[str, DraftFile]):
+        """Update the last modified, even tho there was no new revision. Some Sources
+        may report a Dataset is changed, even when there are no changed files.
+        Update the last_modified to prevent hitting the same Source for updates
+        """
+        changed = False
+        for file in files.values():
+            if file.modified_at and (
+                self.last_modified_at is None
+                or file.modified_at > self.last_modified_at
+            ):
+                # Update, and continue looking for others
+                self.last_modified_at = file.modified_at
+                changed = True
+        return changed
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:
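
update_last_modified is a watermark: it only ever moves forward, and it reports whether it moved. A standalone sketch of the same logic, stripped of pydantic:

    from datetime import datetime, timezone
    from typing import Dict, Optional

    class MiniDataset:
        # Stand-in for Dataset.update_last_modified; same logic as the diff.
        def __init__(self, last_modified_at: Optional[datetime] = None):
            self.last_modified_at = last_modified_at

        def update_last_modified(self, modified_ats: Dict[str, Optional[datetime]]) -> bool:
            changed = False
            for modified_at in modified_ats.values():
                if modified_at and (
                    self.last_modified_at is None or modified_at > self.last_modified_at
                ):
                    # Advance the watermark, and keep scanning for newer files
                    self.last_modified_at = modified_at
                    changed = True
            return changed

    utc = timezone.utc
    dataset = MiniDataset(datetime(2024, 1, 1, tzinfo=utc))
    assert dataset.update_last_modified({"events": datetime(2024, 2, 1, tzinfo=utc)})
    # An older timestamp never moves the watermark back:
    assert not dataset.update_last_modified({"events": datetime(2024, 1, 15, tzinfo=utc)})
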
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/file.py

@@ -29,7 +29,7 @@ class DraftFile(BaseModel):
         modified_at: Optional[datetime] = None,
     ):
         # Pass-through for these types
-        if isinstance(file_, DraftFile):
+        if isinstance(file_, (DraftFile, NotModifiedFile)):
             return file_
         elif isinstance(file_, str):
             stream = BytesIO(file_.encode("utf-8"))
@@ -102,6 +102,11 @@ class File(BaseModel):
         )
 
 
+class NotModifiedFile(BaseModel):
+    modified_at: datetime
+    reason: str
+
+
 class LoadedFile(BaseModel):
     # Unique key to identify this File within a Dataset
     file_id: str
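
NotModifiedFile is an ordinary pydantic model (v2, judging by the field_validator import above), so it constructs, serializes, and branches like the other file types. For example:

    from datetime import datetime, timezone
    from pydantic import BaseModel

    class NotModifiedFile(BaseModel):
        modified_at: datetime
        reason: str

    sentinel = NotModifiedFile(
        modified_at=datetime(2024, 5, 1, tzinfo=timezone.utc),
        reason="304 http code",
    )
    # Serializes like any other model; handy when a log line or summary
    # needs to record why a file was skipped.
    print(sentinel.model_dump_json())
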
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/revision.py

@@ -44,13 +44,22 @@ class Revision(BaseModel):
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
 
-    def is_changed(self, files: Dict[str, datetime]) -> bool:
+    def is_changed(
+        self, files: Dict[str, datetime], dataset_last_modified_at: datetime
+    ) -> bool:
         modified_files_map = self.modified_files_map
         for file_id, last_modified in files.items():
             if file_id not in modified_files_map:
                 return True
 
             if modified_files_map[file_id].modified_at < last_modified:
-                return True
+                if dataset_last_modified_at < last_modified:
+                    # For StatsBomb we use last_modified of match for lineups, and events files.
+                    # When only match is updated, the lineups and events files won't be updated
+                    # as the content is not changed. Therefore, those modified_at is not updated,
+                    # and we try to update it over and over again.
+                    # This check prevents that; always take the LastModifiedAt of the Dataset
+                    # into account
+                    return True
 
         return False
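
The guard is easiest to see with concrete timestamps. A condensed single-file version of the check (not the actual method), walking through the StatsBomb case from the comment:

    from datetime import datetime, timezone

    def file_is_changed(stored_modified_at, incoming_last_modified, dataset_last_modified_at):
        # A file only counts as changed when it is newer than BOTH the stored
        # file and the dataset-level watermark.
        return (
            stored_modified_at < incoming_last_modified
            and dataset_last_modified_at < incoming_last_modified
        )

    utc = timezone.utc
    match_updated = datetime(2024, 3, 2, tzinfo=utc)   # match-level timestamp moved
    events_stored = datetime(2024, 3, 1, tzinfo=utc)   # events content never changed

    # Pre-0.6.2 logic (first condition alone) would report a change forever;
    # the dataset watermark, already advanced to match_updated, now suppresses it.
    assert file_is_changed(events_stored, match_updated, match_updated) is False
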
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/dataset/selector.py

@@ -1,3 +1,6 @@
+from datetime import datetime
+from typing import Optional
+
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
@@ -28,10 +31,17 @@ class Selector(AttributeBag):
     def data_spec_versions(self):
         return self._data_spec_versions
 
+    @property
+    def last_modified(self) -> Optional[datetime]:
+        try:
+            return self._last_modified
+        except AttributeError:
+            return None
+
     @property
     def custom_attributes(self):
         return {
             k: v
             for k, v in self.items()
-            if k not in ("_matcher", "_data_spec_versions")
+            if k not in ("_matcher", "_data_spec_versions", "_last_modified")
         }
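
Because Selector is an AttributeBag, _last_modified exists only when the configuration supplied it; the try/except is effectively getattr(self, "_last_modified", None). A standalone sketch of that access pattern (hypothetical BagSketch stand-in, not the real AttributeBag):

    from datetime import datetime, timezone
    from typing import Optional

    class BagSketch(dict):
        # AttributeBag-like: selector keys become attributes only when provided.
        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name)

        @property
        def last_modified(self) -> Optional[datetime]:
            try:
                return self._last_modified
            except AttributeError:
                return None

    assert BagSketch().last_modified is None
    assert BagSketch(_last_modified=datetime(2024, 5, 1, tzinfo=timezone.utc)).last_modified is not None
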
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/fetch_policy.py

@@ -26,7 +26,9 @@ class FetchPolicy:
             file.file_id: file.last_modified
             for file in dataset_resource.files.values()
         }
-        if current_revision.is_changed(files_last_modified):
+        if current_revision.is_changed(
+            files_last_modified, dataset.last_modified_at
+        ):
             return True
 
         # We don't set last_modified on Dataset level anymore, only on file level
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job.py

@@ -3,11 +3,12 @@ import json
 import logging
 import uuid
 from enum import Enum
-from typing import Optional, Iterator
+from typing import Optional, Iterator, Union
 
 from ingestify import retrieve_http
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
 from ingestify.domain.models.ingestion.ingestion_job_summary import (
     IngestionJobSummary,
@@ -54,7 +55,7 @@ def to_batches(input_):
 
 def load_file(
     file_resource: FileResource, dataset: Optional[Dataset] = None
-) -> Optional[DraftFile]:
+) -> Union[DraftFile, NotModifiedFile]:
     current_file = None
     if dataset:
         current_file = dataset.current_revision.modified_files_map.get(
@@ -72,7 +73,10 @@ def load_file(
         )
         if current_file and current_file.tag == file.tag:
             # Nothing changed
-            return
+            return NotModifiedFile(
+                modified_at=file_resource.last_modified,
+                reason="tag matched current_file",
+            )
         return file
     elif file_resource.url:
         http_options = {}
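
Returning Union[DraftFile, NotModifiedFile] instead of Optional[DraftFile] means the "nothing to do" path carries data. A small caller-side sketch (hypothetical describe helper) of what that buys:

    from dataclasses import dataclass
    from datetime import datetime, timezone
    from typing import Union

    @dataclass
    class DraftFile:
        content: bytes

    @dataclass
    class NotModifiedFile:
        modified_at: datetime
        reason: str

    def describe(result: Union[DraftFile, NotModifiedFile]) -> str:
        # A sentinel instead of None lets callers log *why* a file was
        # skipped, which a bare `return` never could.
        if isinstance(result, NotModifiedFile):
            return f"skipped: {result.reason} (as of {result.modified_at:%Y-%m-%d})"
        return f"new draft of {len(result.content)} bytes"

    print(describe(NotModifiedFile(datetime(2024, 5, 1, tzinfo=timezone.utc), "tag matched current_file")))
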
@@ -228,6 +232,19 @@ class IngestionJob:
         ).metadata
         logger.info(f"Done: {dataset_collection_metadata}")
 
+        if self.selector.last_modified and dataset_collection_metadata.last_modified:
+            # This check might fail when the data_spec_versions is changed;
+            # missing files are not detected
+            if self.selector.last_modified < dataset_collection_metadata.last_modified:
+                logger.info(
+                    f"Skipping find_datasets because selector last_modified "
+                    f"'{self.selector.last_modified}' < metadata last_modified "
+                    f"'{dataset_collection_metadata.last_modified}'"
+                )
+                ingestion_job_summary.set_skipped()
+                yield ingestion_job_summary
+                return
+
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
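
One reading of this guard: when the store's collection metadata already holds data newer than the selector's last_modified cutoff, discovery has nothing left to find, so the job is marked SKIPPED before any source calls. A condensed sketch of the condition:

    from datetime import datetime, timezone
    from typing import Optional

    def should_skip(selector_last_modified: Optional[datetime],
                    collection_last_modified: Optional[datetime]) -> bool:
        # Skip only when both sides are known and the store is already ahead
        # of the selector's cutoff.
        if selector_last_modified and collection_last_modified:
            return selector_last_modified < collection_last_modified
        return False

    utc = timezone.utc
    cutoff = datetime(2024, 3, 1, tzinfo=utc)
    newest_in_store = datetime(2024, 4, 1, tzinfo=utc)
    assert should_skip(cutoff, newest_in_store)   # store is ahead: skip
    assert not should_skip(cutoff, None)          # no metadata: run the job
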
@@ -252,8 +269,6 @@ class IngestionJob:
 
         logger.info("Starting tasks")
 
-        finish_task_timer = ingestion_job_summary.start_timing("tasks")
-
         while True:
             logger.info(f"Finding next batch of datasets for selector={self.selector}")
 
@@ -266,7 +281,6 @@ class IngestionJob:
             except Exception as e:
                 logger.exception("Failed to fetch next batch")
 
-                finish_task_timer()
                 ingestion_job_summary.set_exception(e)
                 yield ingestion_job_summary
                 return
@@ -294,54 +308,57 @@ class IngestionJob:
             skipped_tasks = 0
 
             task_set = TaskSet()
-            for dataset_resource in batch:
-                dataset_identifier = Identifier.create_from_selector(
-                    self.selector, **dataset_resource.dataset_resource_id
-                )
-
-                if dataset := dataset_collection.get(dataset_identifier):
-                    if self.ingestion_plan.fetch_policy.should_refetch(
-                        dataset, dataset_resource
-                    ):
-                        task_set.add(
-                            UpdateDatasetTask(
-                                dataset=dataset,  # Current dataset from the database
-                                dataset_resource=dataset_resource,  # Most recent dataset_resource
-                                store=store,
-                            )
-                        )
-                    else:
-                        skipped_tasks += 1
-                else:
-                    if self.ingestion_plan.fetch_policy.should_fetch(
-                        dataset_resource
-                    ):
-                        task_set.add(
-                            CreateDatasetTask(
-                                dataset_resource=dataset_resource,
-                                store=store,
-                            )
-                        )
-                    else:
-                        skipped_tasks += 1
-
-            if task_set:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                    f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
-                )
-                logger.info(f"Running {len(task_set)} tasks")
-                ingestion_job_summary.add_task_summaries(
-                    task_executor.run(run_task, task_set)
-                )
-            else:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                    f"using selector {self.selector} => nothing to do"
-                )
-
-            ingestion_job_summary.increase_skipped_tasks(skipped_tasks)
+            with ingestion_job_summary.record_timing("build_task_set"):
+                for dataset_resource in batch:
+                    dataset_identifier = Identifier.create_from_selector(
+                        self.selector, **dataset_resource.dataset_resource_id
+                    )
+
+                    if dataset := dataset_collection.get(dataset_identifier):
+                        if self.ingestion_plan.fetch_policy.should_refetch(
+                            dataset, dataset_resource
+                        ):
+                            task_set.add(
+                                UpdateDatasetTask(
+                                    dataset=dataset,  # Current dataset from the database
+                                    dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                    store=store,
+                                )
+                            )
+                        else:
+                            skipped_tasks += 1
+                    else:
+                        if self.ingestion_plan.fetch_policy.should_fetch(
+                            dataset_resource
+                        ):
+                            task_set.add(
+                                CreateDatasetTask(
+                                    dataset_resource=dataset_resource,
+                                    store=store,
+                                )
+                            )
+                        else:
+                            skipped_tasks += 1
+
+            with ingestion_job_summary.record_timing("tasks"):
+                if task_set:
+                    logger.info(
+                        f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                        f"using selector {self.selector} => {len(task_set)} tasks. {skipped_tasks} skipped."
+                    )
+                    logger.info(f"Running {len(task_set)} tasks")
+                    ingestion_job_summary.add_task_summaries(
+                        task_executor.run(run_task, task_set)
+                    )
+                else:
+                    logger.info(
+                        f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                        f"using selector {self.selector} => nothing to do"
+                    )
+                ingestion_job_summary.increase_skipped_tasks(skipped_tasks)
 
             if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
-                finish_task_timer()
                 ingestion_job_summary.set_finished()
                 yield ingestion_job_summary
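
The refactor swaps paired start_timing(...)/finish_task_timer() calls for with record_timing(...) blocks, which is also why the manual timer cleanup in the error paths above disappears. A hypothetical minimal record_timing with that shape (the real one lives in ingestify's timing model):

    import time
    from contextlib import contextmanager

    class TimingSketch:
        def __init__(self):
            self.timings = {}

        @contextmanager
        def record_timing(self, name: str):
            started = time.monotonic()
            try:
                yield
            finally:
                # `finally` records the timing even when the body raises,
                # which the old explicit finish calls had to handle by hand.
                self.timings[name] = time.monotonic() - started

    summary = TimingSketch()
    with summary.record_timing("tasks"):
        time.sleep(0.01)
    print(summary.timings)
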
@@ -349,11 +366,7 @@ class IngestionJob:
             is_first_chunk = False
             ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
 
-            # We will resume tasks, start timer right away
-            finish_task_timer = ingestion_job_summary.start_timing("tasks")
-
         if ingestion_job_summary.task_count() > 0 or is_first_chunk:
             # When there is interesting information to store, or there was no data at all, store it
-            finish_task_timer()
             ingestion_job_summary.set_finished()
             yield ingestion_job_summary
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/domain/models/ingestion/ingestion_job_summary.py

@@ -18,6 +18,7 @@ if TYPE_CHECKING:
 class IngestionJobState(str, Enum):
     RUNNING = "RUNNING"
     FINISHED = "FINISHED"
+    SKIPPED = "SKIPPED"
     FAILED = "FAILED"
 
 
@@ -104,6 +105,10 @@ class IngestionJobSummary(BaseModel, HasTiming):
         self.state = IngestionJobState.FAILED
         self._set_ended()
 
+    def set_skipped(self):
+        self.state = IngestionJobState.SKIPPED
+        self._set_ended()
+
     @property
     def duration(self) -> timedelta:
         return self.ended_at - self.started_at
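
set_skipped makes SKIPPED a terminal state alongside FINISHED and FAILED: the state is stamped and ended_at is set, so duration still works for skipped jobs. A standalone sketch of that lifecycle:

    from datetime import datetime, timezone
    from enum import Enum
    from typing import Optional

    class IngestionJobState(str, Enum):
        RUNNING = "RUNNING"
        FINISHED = "FINISHED"
        SKIPPED = "SKIPPED"
        FAILED = "FAILED"

    class SummarySketch:
        # Minimal stand-in for IngestionJobSummary's state handling.
        def __init__(self):
            self.state = IngestionJobState.RUNNING
            self.ended_at: Optional[datetime] = None

        def _set_ended(self):
            self.ended_at = datetime.now(timezone.utc)

        def set_skipped(self):
            self.state = IngestionJobState.SKIPPED
            self._set_ended()

    summary = SummarySketch()
    summary.set_skipped()
    assert summary.state is IngestionJobState.SKIPPED and summary.ended_at is not None
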
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/fetch/http.py

@@ -3,13 +3,14 @@ from datetime import datetime
 from email.utils import format_datetime, parsedate
 from hashlib import sha1
 from io import BytesIO
-from typing import Optional, Callable, Tuple
+from typing import Optional, Callable, Tuple, Union
 
 import requests
 from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 
 from ingestify.domain.models import DraftFile, File
+from ingestify.domain.models.dataset.file import NotModifiedFile
 from ingestify.utils import utcnow
 
 _session = None

@@ -46,12 +47,15 @@ def retrieve_http(
     pager: Optional[Tuple[str, Callable[[str, dict], Optional[str]]]] = None,
     last_modified: Optional[datetime] = None,
     **kwargs,
-) -> Optional[DraftFile]:
+) -> Union[DraftFile, NotModifiedFile]:
     headers = headers or {}
     if current_file:
         if last_modified and current_file.modified_at >= last_modified:
             # Not changed
-            return
+            return NotModifiedFile(
+                modified_at=last_modified,
+                reason=f"last-modified same as current file: {current_file.modified_at} >= {last_modified}",
+            )
         # else:
         #     print(f"{current_file.modified_at=} {last_modified=}")
         #     headers["if-modified-since"] = (

@@ -73,12 +77,14 @@ def retrieve_http(
 
     response = get_session().get(url, headers=headers, **http_kwargs)
     if response.status_code == 404 and ignore_not_found:
-        return
+        return NotModifiedFile(
+            modified_at=last_modified, reason="404 http code and ignore-not-found"
+        )
 
     response.raise_for_status()
     if response.status_code == 304:
         # Not modified
-        return
+        return NotModifiedFile(modified_at=last_modified, reason="304 http code")
 
     if last_modified:
         # From metadata received from api in discover_datasets

@@ -120,7 +126,9 @@ def retrieve_http(
 
     if current_file and current_file.tag == tag:
         # Not changed. Don't keep it
-        return
+        return NotModifiedFile(
+            modified_at=last_modified, reason="tag matched current_file"
+        )
 
     return DraftFile(
         created_at=utcnow(),
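
Every not-modified exit in retrieve_http now says why nothing was fetched: a last-modified precheck, a tolerated 404, an HTTP 304, or a content tag that matches the current file. A condensed standalone sketch of those exits (sha1-based tag assumed here because http.py imports sha1; the real plumbing around requests is simplified away):

    from dataclasses import dataclass
    from datetime import datetime
    from hashlib import sha1
    from typing import Optional, Union

    @dataclass
    class NotModifiedFile:
        modified_at: Optional[datetime]
        reason: str

    @dataclass
    class DraftFile:
        tag: str
        content: bytes

    def retrieve(content: bytes, status_code: int, current_tag: Optional[str],
                 last_modified: Optional[datetime]) -> Union[DraftFile, NotModifiedFile]:
        if status_code == 304:
            return NotModifiedFile(modified_at=last_modified, reason="304 http code")
        tag = sha1(content).hexdigest()
        if current_tag == tag:
            # Content identical to what the store already has: don't keep it.
            return NotModifiedFile(modified_at=last_modified, reason="tag matched current_file")
        return DraftFile(tag=tag, content=content)

    result = retrieve(b"{}", 200, current_tag=sha1(b"{}").hexdigest(), last_modified=None)
    assert isinstance(result, NotModifiedFile) and result.reason == "tag matched current_file"
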
{ingestify-0.6.1 → ingestify-0.6.2}/ingestify/infra/store/dataset/sqlalchemy/repository.py

@@ -413,20 +413,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         else:
             datasets = []
 
-        metadata_result_query = apply_query_filter(
-            self.session.query(
-                func.max(dataset_table.c.last_modified_at).label(
-                    "last_modified_at"
-                ),
-                func.count().label("row_count"),
+        metadata_result_query = (
+            apply_query_filter(
+                self.session.query(dataset_table.c.last_modified_at)
             )
+            .order_by(dataset_table.c.last_modified_at.desc())
+            .limit(1)
         )
 
         self._debug_query(metadata_result_query)
 
-        dataset_collection_metadata = DatasetCollectionMetadata(
-            *metadata_result_query.first()
-        )
+        metadata_row = metadata_result_query.first()
+        if metadata_row:
+            dataset_collection_metadata = DatasetCollectionMetadata(
+                last_modified=metadata_row.last_modified_at
+            )
+        else:
+            dataset_collection_metadata = DatasetCollectionMetadata(
+                last_modified=None
+            )
 
         return DatasetCollection(dataset_collection_metadata, datasets)
 
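
The query moves from an aggregate (MAX over last_modified_at) to ORDER BY ... DESC LIMIT 1, which can walk an index and, unlike MAX, returns no row at all for an empty selection, hence the new metadata_row branch. A rough standalone shape of it in SQLAlchemy Core (table and column names assumed; the real code goes through the ORM session and apply_query_filter):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    dataset_table = sa.Table(
        "dataset",
        metadata,
        sa.Column("dataset_id", sa.String, primary_key=True),
        sa.Column("last_modified_at", sa.DateTime),
    )

    # Latest last_modified_at, or no row at all when the selection is empty,
    # whereas SELECT MAX(...) always yields a single row that may contain NULL.
    query = (
        sa.select(dataset_table.c.last_modified_at)
        .order_by(dataset_table.c.last_modified_at.desc())
        .limit(1)
    )
    print(query)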