ingestify 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +11 -0
- ingestify/application/__init__.py +0 -0
- ingestify/application/dataset_store.py +339 -0
- ingestify/application/ingestion_engine.py +62 -0
- ingestify/application/loader.py +329 -0
- ingestify/application/secrets_manager.py +53 -0
- ingestify/cmdline.py +283 -0
- ingestify/domain/__init__.py +2 -0
- ingestify/domain/models/__init__.py +45 -0
- ingestify/domain/models/data_spec_version_collection.py +33 -0
- ingestify/domain/models/dataset/__init__.py +27 -0
- ingestify/domain/models/dataset/collection.py +44 -0
- ingestify/domain/models/dataset/collection_metadata.py +13 -0
- ingestify/domain/models/dataset/dataset.py +104 -0
- ingestify/domain/models/dataset/dataset_repository.py +46 -0
- ingestify/domain/models/dataset/events.py +31 -0
- ingestify/domain/models/dataset/file.py +146 -0
- ingestify/domain/models/dataset/file_collection.py +35 -0
- ingestify/domain/models/dataset/file_repository.py +59 -0
- ingestify/domain/models/dataset/identifier.py +24 -0
- ingestify/domain/models/dataset/revision.py +29 -0
- ingestify/domain/models/dataset/selector.py +37 -0
- ingestify/domain/models/event/__init__.py +4 -0
- ingestify/domain/models/event/_old_event.py +21 -0
- ingestify/domain/models/event/dispatcher.py +8 -0
- ingestify/domain/models/event/domain_event.py +10 -0
- ingestify/domain/models/event/event_bus.py +24 -0
- ingestify/domain/models/event/publisher.py +23 -0
- ingestify/domain/models/event/subscriber.py +39 -0
- ingestify/domain/models/extract_job.py +23 -0
- ingestify/domain/models/fetch_policy.py +40 -0
- ingestify/domain/models/resources/__init__.py +1 -0
- ingestify/domain/models/resources/dataset_resource.py +99 -0
- ingestify/domain/models/sink.py +16 -0
- ingestify/domain/models/source.py +34 -0
- ingestify/domain/models/task/__init__.py +4 -0
- ingestify/domain/models/task/set.py +21 -0
- ingestify/domain/models/task/task.py +7 -0
- ingestify/domain/services/__init__.py +0 -0
- ingestify/domain/services/transformers/__init__.py +0 -0
- ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
- ingestify/exceptions.py +10 -0
- ingestify/infra/__init__.py +4 -0
- ingestify/infra/fetch/__init__.py +0 -0
- ingestify/infra/fetch/http.py +100 -0
- ingestify/infra/serialization/__init__.py +50 -0
- ingestify/infra/sink/__init__.py +0 -0
- ingestify/infra/sink/postgresql.py +50 -0
- ingestify/infra/source/__init__.py +0 -0
- ingestify/infra/source/statsbomb_github.py +92 -0
- ingestify/infra/source/wyscout.py +175 -0
- ingestify/infra/store/__init__.py +2 -0
- ingestify/infra/store/dataset/__init__.py +2 -0
- ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
- ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
- ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
- ingestify/infra/store/file/__init__.py +2 -0
- ingestify/infra/store/file/local_file_repository.py +32 -0
- ingestify/infra/store/file/s3_file_repository.py +50 -0
- ingestify/main.py +205 -0
- ingestify/server.py +78 -0
- ingestify/source_base.py +23 -0
- ingestify/static/templates/statsbomb_github/README.md +0 -0
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
- ingestify/static/templates/statsbomb_github/database/README.md +1 -0
- ingestify/static/templates/statsbomb_github/query.py +14 -0
- ingestify/static/templates/wyscout/.env +5 -0
- ingestify/static/templates/wyscout/.gitignore +2 -0
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
- ingestify/static/templates/wyscout/database/README.md +1 -0
- ingestify/static/templates/wyscout/query.py +14 -0
- ingestify/utils.py +276 -0
- ingestify-0.1.0.dist-info/METADATA +265 -0
- ingestify-0.1.0.dist-info/RECORD +79 -0
- ingestify-0.1.0.dist-info/WHEEL +5 -0
- ingestify-0.1.0.dist-info/entry_points.txt +2 -0
- ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/domain/models/event/_old_event.py ADDED
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import Protocol
+
+from ingestify.domain import DatasetCreated
+
+
+#
+# class EventRepository:
+#     def __init__(self):
+#         self.events = []
+#
+#     def save(self, event):
+#         self.events.append(event)
+#
+#
+# class EventWriter:
+#     def __init__(self, event_repository: EventRepository):
+#         self.event_repository = event_repository
+#
+#     def dispatch(self, event):
+#         self.event_repository.save(event)
ingestify/domain/models/event/event_bus.py ADDED
@@ -0,0 +1,24 @@
+import logging
+
+
+from .dispatcher import Dispatcher
+
+
+logger = logging.getLogger(__name__)
+
+
+class EventBus:
+    def __init__(self):
+        self.dispatchers: list[Dispatcher] = []
+
+    def register(self, dispatcher: Dispatcher):
+        self.dispatchers.append(dispatcher)
+
+    def dispatch(self, event):
+
+        for dispatcher in self.dispatchers:
+            try:
+                dispatcher.dispatch(event)
+            except Exception as e:
+                logger.exception(f"Failed to handle {event}")
+                raise Exception(f"Failed to handle {event}") from e
ingestify/domain/models/event/publisher.py ADDED
@@ -0,0 +1,23 @@
+import logging
+
+from .dispatcher import Dispatcher
+from .domain_event import DomainEvent
+from .subscriber import Subscriber
+
+
+logger = logging.getLogger(__name__)
+
+
+class Publisher(Dispatcher):
+    def __init__(self):
+        self.subscribers: list[Subscriber] = []
+
+    def dispatch(self, event: DomainEvent):
+        for subscriber in self.subscribers:
+            try:
+                subscriber.handle(event)
+            except Exception:
+                logger.exception(f"Failed to handle {event} by {subscriber}")
+
+    def add_subscriber(self, subscriber: Subscriber):
+        self.subscribers.append(subscriber)
ingestify/domain/models/event/subscriber.py ADDED
@@ -0,0 +1,39 @@
+from typing import TYPE_CHECKING
+
+from .domain_event import DomainEvent
+
+if TYPE_CHECKING:
+    from ingestify.domain.models.dataset.events import (
+        DatasetCreated,
+        MetadataUpdated,
+        RevisionAdded,
+    )
+
+
+class Subscriber:
+    def __init__(self, store):
+        self.store = store
+
+    def on_dataset_created(self, event: "DatasetCreated"):
+        pass
+
+    def on_metadata_updated(self, event: "MetadataUpdated"):
+        pass
+
+    def on_revision_added(self, event: "RevisionAdded"):
+        pass
+
+    def handle(self, event: DomainEvent):
+        # TODO: fix the circular dependencies
+        from ingestify.domain.models.dataset.events import (
+            DatasetCreated,
+            MetadataUpdated,
+            RevisionAdded,
+        )
+
+        if isinstance(event, DatasetCreated):
+            self.on_dataset_created(event)
+        elif isinstance(event, MetadataUpdated):
+            self.on_metadata_updated(event)
+        elif isinstance(event, RevisionAdded):
+            self.on_revision_added(event)
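Read together, the three event files above form a small fan-out mechanism: EventBus forwards every event to its registered Dispatchers, Publisher is a Dispatcher that hands each event to its Subscribers, and Subscriber.handle routes the event to a per-type hook. A minimal wiring sketch, assuming these classes are importable from ingestify.domain.models.event (the event package's __init__.py is not reproduced here) and that a DatasetCreated instance is available:

from ingestify.domain.models.event import EventBus, Publisher, Subscriber


class LoggingSubscriber(Subscriber):
    # Hypothetical subscriber that only reacts to DatasetCreated events.
    def on_dataset_created(self, event):
        print(f"dataset created: {event}")


publisher = Publisher()
publisher.add_subscriber(LoggingSubscriber(store=None))  # store is unused in this sketch

event_bus = EventBus()
event_bus.register(publisher)  # Publisher satisfies the Dispatcher interface

# event_bus.dispatch(dataset_created_event) would now reach every registered
# dispatcher; the Publisher calls Subscriber.handle(), which in turn invokes
# on_dataset_created() because of the isinstance() routing shown above.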
ingestify/domain/models/extract_job.py ADDED
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import List
+
+from ingestify.domain.models import Source, Selector
+from ingestify.domain.models.data_spec_version_collection import (
+    DataSpecVersionCollection,
+)
+from ingestify.domain.models.fetch_policy import FetchPolicy
+
+
+@dataclass
+class ExtractJob:
+    source: Source
+    selectors: List[Selector]
+    fetch_policy: FetchPolicy
+    dataset_type: str
+    data_spec_versions: DataSpecVersionCollection
+
+    def __repr__(self):
+        return f'<ExtractJob source="{self.source.name}" dataset_type="{self.dataset_type}">'
+
+    def __str__(self):
+        return repr(self)
ingestify/domain/models/fetch_policy.py ADDED
@@ -0,0 +1,40 @@
+from datetime import timedelta
+
+from ingestify.domain import Dataset, Identifier, DatasetResource
+from ingestify.utils import utcnow
+
+
+class FetchPolicy:
+    def __init__(self):
+        # refresh all data that changed less than two day ago
+        self.min_age = utcnow() - timedelta(days=2)
+        self.last_change = utcnow() - timedelta(days=1)
+
+    def should_fetch(self, dataset_resource: DatasetResource) -> bool:
+        # this is called when dataset does not exist yet
+        return True
+
+    def should_refetch(
+        self, dataset: Dataset, dataset_resource: DatasetResource
+    ) -> bool:
+        current_revision = dataset.current_revision
+        if not dataset.revisions:
+            # TODO: this is weird? Dataset without any data. Fetch error?
+            return True
+        elif current_revision:
+            files_last_modified = {
+                file.file_id: file.last_modified
+                for file in dataset_resource.files.values()
+            }
+            if current_revision.is_changed(files_last_modified):
+                return True
+
+        # We don't set last_modified on Dataset level anymore, only on file level
+        # else:
+        #     if (
+        #         identifier.last_modified
+        #         and current_revision.created_at < identifier.last_modified
+        #     ):
+        #         return True
+
+        return False
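FetchPolicy answers only two questions: should a dataset that is not yet in the store be fetched at all, and should an existing dataset be refetched because its files changed. The loader that actually asks those questions (ingestify/application/loader.py) is not reproduced in this excerpt, so the following is only a rough sketch of how the policy appears intended to be used; plan_fetches and find_existing_dataset are placeholders for illustration, not ingestify API:

from ingestify.domain.models.fetch_policy import FetchPolicy


def plan_fetches(dataset_resources, find_existing_dataset, fetch_policy=None):
    """Yield (dataset_resource, existing_dataset) pairs that need fetching."""
    fetch_policy = fetch_policy or FetchPolicy()
    for dataset_resource in dataset_resources:
        dataset = find_existing_dataset(dataset_resource)  # hypothetical lookup
        if dataset is None:
            # Nothing stored yet: should_fetch() currently always says yes
            if fetch_policy.should_fetch(dataset_resource):
                yield dataset_resource, None
        elif fetch_policy.should_refetch(dataset, dataset_resource):
            # The current revision is out of date compared to the resource's files
            yield dataset_resource, dataset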
ingestify/domain/models/resources/__init__.py ADDED
@@ -0,0 +1 @@
+from .dataset_resource import DatasetResource
ingestify/domain/models/resources/dataset_resource.py ADDED
@@ -0,0 +1,99 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional, Callable, TYPE_CHECKING
+
+from ingestify.exceptions import DuplicateFile
+
+if TYPE_CHECKING:
+    from ingestify.domain import DraftFile, File
+    from ingestify.domain.models.dataset.dataset import DatasetState
+
+
+@dataclass(frozen=True)
+class FileResource:
+    dataset_resource: "DatasetResource"
+    file_id: str
+    last_modified: datetime
+    data_feed_key: str
+    data_spec_version: str
+
+    # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
+    # data_serialization_format: str
+
+    json_content: Optional[dict] = None
+
+    url: Optional[str] = None
+    http_options: Optional[dict] = None
+    data_serialization_format: Optional[str] = None
+
+    file_loader: Optional[
+        Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
+    ] = None
+
+    def __post_init__(self):
+        if self.json_content is None and not self.url and not self.file_loader:
+            raise TypeError(
+                "You need to specify `json_content`, `url` or a custom `file_loader`"
+            )
+
+
+class DatasetResource:
+    def __init__(
+        self,
+        dataset_resource_id: dict,
+        /,
+        dataset_type: str,
+        provider: str,
+        name: str,
+        metadata: Optional[dict] = None,
+        state: Optional["DatasetState"] = None,
+    ):
+        from ingestify.domain.models.dataset.dataset import DatasetState
+
+        self.dataset_type = dataset_type
+        self.provider = provider
+        self.dataset_resource_id = dataset_resource_id
+        self.name = name
+        self.metadata = metadata or {}
+        self.state = state or DatasetState.COMPLETE
+
+        self.files = {}
+
+    def add_file(
+        self,
+        last_modified: datetime,
+        data_feed_key: str,
+        # Some sources might not have a DataSpecVersion. Set a default
+        data_spec_version: str = "v1",
+        json_content: Optional[dict] = None,
+        url: Optional[str] = None,
+        http_options: Optional[dict] = None,
+        data_serialization_format: Optional[str] = None,
+        file_loader: Optional[
+            Callable[
+                ["FileResource", Optional["File"]],
+                Optional["DraftFile"],
+            ]
+        ] = None,
+    ):
+        file_id = f"{data_feed_key}__{data_spec_version}"
+        if file_id in self.files:
+            raise DuplicateFile(f"File with id {file_id} already exists.")
+
+        file_resource = FileResource(
+            dataset_resource=self,
+            file_id=file_id,
+            data_feed_key=data_feed_key,
+            data_spec_version=data_spec_version,
+            last_modified=last_modified,
+            json_content=json_content,
+            url=url,
+            http_options=http_options,
+            data_serialization_format=data_serialization_format,
+            file_loader=file_loader,
+        )
+
+        self.files[file_id] = file_resource
+
+        # Allow chaining
+        return self
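DatasetResource describes a dataset as a source sees it, before anything is stored: an identifier dict, a few descriptive fields, and one FileResource per data feed. Because add_file returns self, file declarations chain naturally. A usage sketch based only on the class above; the identifier, provider and URL are made up:

from datetime import datetime, timezone

from ingestify.domain.models.resources import DatasetResource

dataset_resource = DatasetResource(
    {"match_id": 12345},  # dataset_resource_id is positional-only (note the "/")
    dataset_type="match",
    provider="example_provider",
    name="Example match",
).add_file(
    last_modified=datetime(2023, 1, 1, tzinfo=timezone.utc),
    data_feed_key="events",
    data_spec_version="v1",
    url="https://example.com/events/12345.json",
    data_serialization_format="json",
).add_file(
    last_modified=datetime(2023, 1, 1, tzinfo=timezone.utc),
    data_feed_key="lineups",
    json_content={"players": []},
)

# Each file is keyed as "<data_feed_key>__<data_spec_version>", so this resource
# now holds "events__v1" and "lineups__v1"; adding either key again raises
# DuplicateFile.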
ingestify/domain/models/sink.py ADDED
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+from ingestify.utils import ComponentFactory, ComponentRegistry
+
+from .dataset import Dataset
+
+sink_registry = ComponentRegistry()
+
+
+class Sink(ABC, metaclass=sink_registry.metaclass):
+    @abstractmethod
+    def upsert(self, dataset: Dataset, data, params: dict):
+        pass
+
+
+sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
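Sink is the write-side abstraction: anything that can take a Dataset plus transformed data and push it somewhere. Subclasses are picked up through the sink_registry metaclass, but ComponentRegistry itself lives in ingestify/utils.py and its registration rules are not visible in this excerpt. A toy subclass, sketched under that caveat:

from ingestify.domain.models.sink import Sink


class PrintSink(Sink):
    # Hypothetical sink that only reports what it would write.
    def upsert(self, dataset, data, params: dict):
        print(f"would upsert {len(data)} rows for dataset {dataset!r} using {params}")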
ingestify/domain/models/source.py ADDED
@@ -0,0 +1,34 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Iterable, Iterator, Union
+
+from .data_spec_version_collection import DataSpecVersionCollection
+from .dataset.collection_metadata import DatasetCollectionMetadata
+from .resources.dataset_resource import DatasetResource
+
+
+class Source(ABC):
+    def __init__(self, name: str, **kwargs):
+        self.name = name
+
+    @property
+    @abstractmethod
+    def provider(self) -> str:
+        raise NotImplemented
+
+    # TODO: consider making this required...
+    # @abstractmethod
+    # def discover_selectors(self, dataset_type: str) -> List[Dict]:
+    #     pass
+
+    @abstractmethod
+    def find_datasets(
+        self,
+        dataset_type: str,
+        data_spec_versions: DataSpecVersionCollection,
+        dataset_collection_metadata: DatasetCollectionMetadata,
+        **kwargs
+    ) -> Iterator[List[DatasetResource]]:
+        pass
+
+    def __repr__(self):
+        return self.__class__.__name__
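Source is the abstract entry point a provider integration implements: `provider` names the data provider and `find_datasets` yields batches of DatasetResource objects. A minimal, hypothetical implementation sketched against the abstract class above; the provider name, match id and URL are invented, and the untyped parameters are simply accepted as given. The statsbomb_github and wyscout sources listed at the top of this diff presumably follow the same shape.

from datetime import datetime, timezone
from typing import Iterator, List

from ingestify.domain.models.resources import DatasetResource
from ingestify.domain.models.source import Source


class ExampleSource(Source):
    @property
    def provider(self) -> str:
        return "example"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions,
        dataset_collection_metadata,
        **kwargs,
    ) -> Iterator[List[DatasetResource]]:
        # Yield a single batch containing one dataset with one file.
        dataset_resource = DatasetResource(
            {"match_id": 1},
            dataset_type=dataset_type,
            provider=self.provider,
            name="Example match 1",
        ).add_file(
            last_modified=datetime(2023, 1, 1, tzinfo=timezone.utc),
            data_feed_key="events",
            url="https://example.com/events/1.json",
        )
        yield [dataset_resource]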
ingestify/domain/models/task/set.py ADDED
@@ -0,0 +1,21 @@
+from .task import Task
+
+
+class TaskSet:
+    def __init__(self, tasks=None):
+        self.tasks = tasks or []
+
+    def add(self, task: Task):
+        self.tasks.append(task)
+
+    def __len__(self):
+        return len(self.tasks)
+
+    def __iter__(self):
+        return iter(self.tasks)
+
+    def __add__(self, other: "TaskSet"):
+        return TaskSet(self.tasks + other.tasks)
+
+    def __bool__(self):
+        return len(self) > 0
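TaskSet is a thin list wrapper whose main job is combining batches of work with `+` and testing for emptiness. Task itself (task.py) is not reproduced in this excerpt, so plain objects stand in for tasks in this small sketch:

from ingestify.domain.models.task.set import TaskSet

first_batch = TaskSet()
first_batch.add(object())              # a real caller would add Task instances

second_batch = TaskSet([object(), object()])

combined = first_batch + second_batch  # __add__ concatenates the underlying lists
assert len(combined) == 3
assert combined                        # a TaskSet is truthy only when it holds tasks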
File without changes

File without changes
ingestify/domain/services/transformers/kloppy_to_pandas.py ADDED
@@ -0,0 +1,25 @@
+from typing import Dict, List
+
+import pandas as pd
+from kloppy import StatsBombSerializer
+
+from ingestify.domain.models import Dataset, LoadedFile
+
+
+class KloppyToPandasTransformer(Transformer):
+    def transform(
+        self, dataset: Dataset, loaded_files: Dict[str, LoadedFile]
+    ) -> pd.DataFrame:
+        if dataset.provider == "statsbomb":
+            serializer = StatsBombSerializer()
+            kloppy_dataset = serializer.deserialize(
+                inputs=dict(
+                    event_data=loaded_files["events.json"].stream,
+                    lineup_data=loaded_files["lineup.json"].stream,
+                ),
+                options={},
+            )
+        else:
+            raise Exception(f"Dataset provider {dataset.provider} not known")
+
+        return kloppy_dataset.to_pandas()
ingestify/exceptions.py ADDED

File without changes
ingestify/infra/fetch/http.py ADDED
@@ -0,0 +1,100 @@
+import json
+from datetime import datetime
+from email.utils import format_datetime, parsedate
+from hashlib import sha1
+from io import BytesIO
+from typing import Optional, Callable, Tuple
+
+import requests
+
+from ingestify.domain.models import DraftFile, File
+from ingestify.utils import utcnow
+
+
+def retrieve_http(
+    url,
+    current_file: Optional[File] = None,
+    headers: Optional[dict] = None,
+    pager: Optional[Tuple[str, Callable[[str, dict], Optional[str]]]] = None,
+    last_modified: Optional[datetime] = None,
+    **kwargs,
+) -> Optional[DraftFile]:
+    headers = headers or {}
+    if current_file:
+        if last_modified and current_file.modified_at >= last_modified:
+            # Not changed
+            return None
+        # else:
+        #     print(f"{current_file.modified_at=} {last_modified=}")
+        # headers["if-modified-since"] = (
+        #     format_datetime(current_file.modified_at, usegmt=True),
+        # )
+        headers["if-none-match"] = current_file.tag
+
+    http_kwargs = {}
+    file_attributes = {}
+    for key, item in kwargs.items():
+        if key.startswith("http_"):
+            http_kwargs[key[5:]] = item
+        elif key.startswith("file_"):
+            file_attributes[key[5:]] = item
+        else:
+            raise Exception(f"Don't know how to use {key}")
+
+    response = requests.get(url, headers=headers, **http_kwargs)
+    response.raise_for_status()
+    if response.status_code == 304:
+        # Not modified
+        return None
+
+    if last_modified:
+        # From metadata received from api in discover_datasets
+        modified_at = last_modified
+    elif "last-modified" in response.headers:
+        # Received from the webserver
+        modified_at = parsedate(response.headers["last-modified"])
+    else:
+        modified_at = utcnow()
+
+    tag = response.headers.get("etag")
+    # content_length = int(response.headers.get("content-length", 0))
+
+    if pager:
+        """
+        A pager helps with responses that return the data in pages.
+        """
+        data_path, pager_fn = pager
+        data = []
+        while True:
+            current_page_data = response.json()
+            data.extend(current_page_data[data_path])
+            next_url = pager_fn(url, current_page_data)
+            if not next_url:
+                break
+            else:
+                response = requests.get(next_url, headers=headers, **http_kwargs)
+
+        content = json.dumps({data_path: data}).encode("utf-8")
+    else:
+        content = response.content
+
+    if not tag:
+        tag = sha1(content).hexdigest()
+
+    # if not content_length: - Don't use http header as it might be wrong
+    # for example in case of compressed data
+    content_length = len(content)
+
+    if current_file and current_file.tag == tag:
+        # Not changed. Don't keep it
+        return None
+
+    return DraftFile(
+        created_at=utcnow(),
+        modified_at=modified_at,
+        tag=tag,
+        size=content_length,
+        content_type=response.headers.get("content-type"),
+        stream=BytesIO(content),
+        **file_attributes,
+    )
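retrieve_http wraps requests.get with change detection (ETag and last_modified), prefix-based keyword routing (http_* is forwarded to requests, file_* ends up as extra DraftFile attributes) and optional paging. A hedged usage sketch; the API URL, header, and the "matches"/"next_page_url" keys are invented and not part of ingestify:

from ingestify.infra.fetch.http import retrieve_http


def next_page(url: str, page: dict):
    # Return the URL of the next page, or None once the last page is reached.
    return page.get("next_page_url")


draft_file = retrieve_http(
    "https://api.example.com/matches",
    headers={"Authorization": "Bearer <token>"},
    # pager = (key holding each page's records, callable producing the next URL)
    pager=("matches", next_page),
    http_timeout=30,  # the http_ prefix is stripped and passed to requests.get()
)

# None means "unchanged" (matching ETag or an up-to-date last_modified);
# otherwise a DraftFile wrapping the combined page content is returned.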
ingestify/infra/serialization/__init__.py ADDED
@@ -0,0 +1,50 @@
+import json
+from datetime import datetime
+from typing import Type, Any, TypeVar
+
+from dataclass_factory import Schema, Factory, NameStyle
+from dataclass_factory.schema_helpers import type_checker
+
+from ingestify.domain import DatasetCreated, Identifier
+from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+
+isotime_schema = Schema(
+    parser=lambda x: datetime.fromisoformat(x.replace("Z", "+00:00")),  # type: ignore
+    serializer=lambda x: datetime.isoformat(x).replace("+00:00", "Z"),
+)
+
+identifier_schema = Schema(
+    # json.loads(x) for backwards compatibility
+    parser=lambda x: Identifier(x if isinstance(x, dict) else json.loads(x)),
+    serializer=lambda x: dict(x),
+)
+
+factory = Factory(
+    schemas={
+        datetime: isotime_schema,
+        Identifier: identifier_schema,
+        DatasetCreated: Schema(
+            pre_parse=type_checker(DatasetCreated.event_type, "event_type")
+        ),
+        MetadataUpdated: Schema(
+            pre_parse=type_checker(MetadataUpdated.event_type, "event_type")
+        ),
+        RevisionAdded: Schema(
+            pre_parse=type_checker(RevisionAdded.event_type, "event_type")
+        ),
+        # ClipSelectionContent: Schema(pre_parse=type_checker(ClipSelectionContent.content_type, field="contentType")),
+        # TeamInfoImageContent: Schema(pre_parse=type_checker(TeamInfoImageContent.content_type, field="contentType")),
+        # StaticVideoContent: Schema(pre_parse=type_checker(StaticVideoContent.content_type, field="contentType"))
+    },
+    default_schema=Schema(),
+)
+
+T = TypeVar("T")
+
+
+def serialize(data: T, class_: Type[T] = None) -> Any:
+    return factory.dump(data, class_)
+
+
+def unserialize(data: Any, class_: Type[T]) -> T:
+    return factory.load(data, class_)
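The serialization module centralizes one dataclass_factory Factory: datetimes round-trip through an ISO-8601 schema with a trailing "Z", identifiers and the three event types get dedicated schemas, and serialize/unserialize are the thin entry points. A small round-trip sketch with a local dataclass, since constructing the real event classes needs more context than this diff shows:

from dataclasses import dataclass
from datetime import datetime, timezone

from ingestify.infra.serialization import serialize, unserialize


@dataclass
class Snapshot:
    name: str
    created_at: datetime


snapshot = Snapshot(name="demo", created_at=datetime(2023, 1, 1, tzinfo=timezone.utc))

data = serialize(snapshot, Snapshot)
# datetimes go through isotime_schema, so the result is JSON-friendly:
# {"name": "demo", "created_at": "2023-01-01T00:00:00Z"}

assert unserialize(data, Snapshot) == snapshot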
File without changes
ingestify/infra/sink/postgresql.py ADDED
@@ -0,0 +1,50 @@
+from io import StringIO
+
+import pandas as pd
+from sqlalchemy import create_engine, text
+
+from ingestify.domain.models import Dataset, Sink
+
+
+# https://stackoverflow.com/questions/13947327/to-ignore-duplicate-keys-during-copy-from-in-postgresql
+def _copy_to(conn, tablename, data, extra_columns):
+    if isinstance(data, pd.DataFrame):
+        for k, v in extra_columns:
+            data[k] = v
+        tsv_file = data.to_csv(sep="\t", header=True, index=False)
+        stream = StringIO(tsv_file)
+    else:
+        raise Exception("Dont know how to handle data")
+
+    raw_connection = conn.connection
+    driver = conn.engine.dialect.driver
+
+    sql = f"COPY {tablename} FROM STDIN WITH CSV DELIMITER '\t' HEADER"
+    if driver == "pg8000":
+        # https://github.com/tlocke/pg8000/blob/13bc039e805e8a2cd8d816b939362b40018ea8ef/test/native/test_copy.py
+        raw_connection.run(sql=sql, stream=stream)
+    elif driver == "pgcopy2":
+        # https://github.com/psycopg/psycopg2/blob/1d3a89a0bba621dc1cc9b32db6d241bd2da85ad1/tests/test_copy.py
+        with raw_connection.cursor() as cursor:
+            cursor.copy_expert(sql=sql, file=stream)
+
+
+class PostgresSQLSink(Sink):
+    def __init__(self, url: str):
+        self.engine = create_engine(url)
+
+    def upsert(self, dataset: Dataset, data, params: dict):
+        if not isinstance(data, pd.DataFrame):
+            raise TypeError(
+                f"Data {type(data)} is not supported by the PostgresSQLSink"
+            )
+
+        table_name = params["table_name"]
+
+        with self.engine.begin() as conn:
+            conn.query(
+                text(
+                    f"DELETE FROM {table_name} WHERE dataset_id = {dataset.dataset_id}"
+                )
+            )
+            _copy_to(conn, table_name, data, dict(dataset_id=dataset.dataset_id))
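PostgresSQLSink implements the Sink interface for pandas DataFrames: it removes the rows previously written for the dataset and streams the frame back in with COPY, using driver-specific code paths. A hedged usage sketch; the connection URL and table name are invented, and the Dataset instance would normally come out of the dataset store rather than being built by hand:

import pandas as pd

from ingestify.infra.sink.postgresql import PostgresSQLSink

sink = PostgresSQLSink(url="postgresql+pg8000://user:password@localhost/ingestify")

frame = pd.DataFrame({"event_id": [1, 2], "event_type": ["pass", "shot"]})

# sink.upsert(dataset, frame, params={"table_name": "events"}) would delete the
# rows previously stored for dataset.dataset_id and re-insert `frame` via COPY.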
File without changes