ingestify-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/domain/models/event/_old_event.py
@@ -0,0 +1,21 @@
+ from dataclasses import dataclass
+ from typing import Protocol
+
+ from ingestify.domain import DatasetCreated
+
+
+ #
+ # class EventRepository:
+ #     def __init__(self):
+ #         self.events = []
+ #
+ #     def save(self, event):
+ #         self.events.append(event)
+ #
+ #
+ # class EventWriter:
+ #     def __init__(self, event_repository: EventRepository):
+ #         self.event_repository = event_repository
+ #
+ #     def dispatch(self, event):
+ #         self.event_repository.save(event)
ingestify/domain/models/event/dispatcher.py
@@ -0,0 +1,8 @@
+ from typing import Protocol
+
+ from .domain_event import DomainEvent
+
+
+ class Dispatcher(Protocol):
+     def dispatch(self, event: DomainEvent):
+         pass
ingestify/domain/models/event/domain_event.py
@@ -0,0 +1,10 @@
+ from abc import abstractmethod, ABC
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class DomainEvent(ABC):
+     @property
+     @abstractmethod
+     def event_type(self) -> str:
+         pass
ingestify/domain/models/event/event_bus.py
@@ -0,0 +1,24 @@
+ import logging
+
+
+ from .dispatcher import Dispatcher
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class EventBus:
+     def __init__(self):
+         self.dispatchers: list[Dispatcher] = []
+
+     def register(self, dispatcher: Dispatcher):
+         self.dispatchers.append(dispatcher)
+
+     def dispatch(self, event):
+
+         for dispatcher in self.dispatchers:
+             try:
+                 dispatcher.dispatch(event)
+             except Exception as e:
+                 logger.exception(f"Failed to handle {event}")
+                 raise Exception(f"Failed to handle {event}") from e
ingestify/domain/models/event/publisher.py
@@ -0,0 +1,23 @@
+ import logging
+
+ from .dispatcher import Dispatcher
+ from .domain_event import DomainEvent
+ from .subscriber import Subscriber
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Publisher(Dispatcher):
+     def __init__(self):
+         self.subscribers: list[Subscriber] = []
+
+     def dispatch(self, event: DomainEvent):
+         for subscriber in self.subscribers:
+             try:
+                 subscriber.handle(event)
+             except Exception:
+                 logger.exception(f"Failed to handle {event} by {subscriber}")
+
+     def add_subscriber(self, subscriber: Subscriber):
+         self.subscribers.append(subscriber)
ingestify/domain/models/event/subscriber.py
@@ -0,0 +1,39 @@
+ from typing import TYPE_CHECKING
+
+ from .domain_event import DomainEvent
+
+ if TYPE_CHECKING:
+     from ingestify.domain.models.dataset.events import (
+         DatasetCreated,
+         MetadataUpdated,
+         RevisionAdded,
+     )
+
+
+ class Subscriber:
+     def __init__(self, store):
+         self.store = store
+
+     def on_dataset_created(self, event: "DatasetCreated"):
+         pass
+
+     def on_metadata_updated(self, event: "MetadataUpdated"):
+         pass
+
+     def on_revision_added(self, event: "RevisionAdded"):
+         pass
+
+     def handle(self, event: DomainEvent):
+         # TODO: fix the circular dependencies
+         from ingestify.domain.models.dataset.events import (
+             DatasetCreated,
+             MetadataUpdated,
+             RevisionAdded,
+         )
+
+         if isinstance(event, DatasetCreated):
+             self.on_dataset_created(event)
+         elif isinstance(event, MetadataUpdated):
+             self.on_metadata_updated(event)
+         elif isinstance(event, RevisionAdded):
+             self.on_revision_added(event)
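Note: a minimal wiring sketch (not part of the package) of how the event pieces above fit together: a Subscriber subclass is attached to a Publisher, and the Publisher is registered on the EventBus as a Dispatcher. The LoggingSubscriber class and the commented-out dispatched event are illustrative assumptions.

# Sketch only: LoggingSubscriber and the dispatched event are illustrative, not part of ingestify.
from ingestify.domain.models.event.event_bus import EventBus
from ingestify.domain.models.event.publisher import Publisher
from ingestify.domain.models.event.subscriber import Subscriber


class LoggingSubscriber(Subscriber):
    def on_dataset_created(self, event):
        print(f"dataset created: {event}")


bus = EventBus()
publisher = Publisher()
publisher.add_subscriber(LoggingSubscriber(store=None))  # store is unused in this sketch
bus.register(publisher)

# bus.dispatch(some_dataset_created_event)  # any DatasetCreated instance would be routed to on_dataset_created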
ingestify/domain/models/extract_job.py
@@ -0,0 +1,23 @@
+ from dataclasses import dataclass
+ from typing import List
+
+ from ingestify.domain.models import Source, Selector
+ from ingestify.domain.models.data_spec_version_collection import (
+     DataSpecVersionCollection,
+ )
+ from ingestify.domain.models.fetch_policy import FetchPolicy
+
+
+ @dataclass
+ class ExtractJob:
+     source: Source
+     selectors: List[Selector]
+     fetch_policy: FetchPolicy
+     dataset_type: str
+     data_spec_versions: DataSpecVersionCollection
+
+     def __repr__(self):
+         return f'<ExtractJob source="{self.source.name}" dataset_type="{self.dataset_type}">'
+
+     def __str__(self):
+         return repr(self)
ingestify/domain/models/fetch_policy.py
@@ -0,0 +1,40 @@
+ from datetime import timedelta
+
+ from ingestify.domain import Dataset, Identifier, DatasetResource
+ from ingestify.utils import utcnow
+
+
+ class FetchPolicy:
+     def __init__(self):
+         # refresh all data that changed less than two days ago
+         self.min_age = utcnow() - timedelta(days=2)
+         self.last_change = utcnow() - timedelta(days=1)
+
+     def should_fetch(self, dataset_resource: DatasetResource) -> bool:
+         # this is called when the dataset does not exist yet
+         return True
+
+     def should_refetch(
+         self, dataset: Dataset, dataset_resource: DatasetResource
+     ) -> bool:
+         current_revision = dataset.current_revision
+         if not dataset.revisions:
+             # TODO: this is weird? Dataset without any data. Fetch error?
+             return True
+         elif current_revision:
+             files_last_modified = {
+                 file.file_id: file.last_modified
+                 for file in dataset_resource.files.values()
+             }
+             if current_revision.is_changed(files_last_modified):
+                 return True
+
+         # We don't set last_modified on Dataset level anymore, only on file level
+         # else:
+         #     if (
+         #         identifier.last_modified
+         #         and current_revision.created_at < identifier.last_modified
+         #     ):
+         #         return True
+
+         return False
ingestify/domain/models/resources/__init__.py
@@ -0,0 +1 @@
+ from .dataset_resource import DatasetResource
ingestify/domain/models/resources/dataset_resource.py
@@ -0,0 +1,99 @@
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Optional, Callable, TYPE_CHECKING
+
+ from ingestify.exceptions import DuplicateFile
+
+ if TYPE_CHECKING:
+     from ingestify.domain import DraftFile, File
+     from ingestify.domain.models.dataset.dataset import DatasetState
+
+
+ @dataclass(frozen=True)
+ class FileResource:
+     dataset_resource: "DatasetResource"
+     file_id: str
+     last_modified: datetime
+     data_feed_key: str
+     data_spec_version: str
+
+     # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
+     # data_serialization_format: str
+
+     json_content: Optional[dict] = None
+
+     url: Optional[str] = None
+     http_options: Optional[dict] = None
+     data_serialization_format: Optional[str] = None
+
+     file_loader: Optional[
+         Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
+     ] = None
+
+     def __post_init__(self):
+         if self.json_content is None and not self.url and not self.file_loader:
+             raise TypeError(
+                 "You need to specify `json_content`, `url` or a custom `file_loader`"
+             )
+
+
+ class DatasetResource:
+     def __init__(
+         self,
+         dataset_resource_id: dict,
+         /,
+         dataset_type: str,
+         provider: str,
+         name: str,
+         metadata: Optional[dict] = None,
+         state: Optional["DatasetState"] = None,
+     ):
+         from ingestify.domain.models.dataset.dataset import DatasetState
+
+         self.dataset_type = dataset_type
+         self.provider = provider
+         self.dataset_resource_id = dataset_resource_id
+         self.name = name
+         self.metadata = metadata or {}
+         self.state = state or DatasetState.COMPLETE
+
+         self.files = {}
+
+     def add_file(
+         self,
+         last_modified: datetime,
+         data_feed_key: str,
+         # Some sources might not have a DataSpecVersion. Set a default
+         data_spec_version: str = "v1",
+         json_content: Optional[dict] = None,
+         url: Optional[str] = None,
+         http_options: Optional[dict] = None,
+         data_serialization_format: Optional[str] = None,
+         file_loader: Optional[
+             Callable[
+                 ["FileResource", Optional["File"]],
+                 Optional["DraftFile"],
+             ]
+         ] = None,
+     ):
+         file_id = f"{data_feed_key}__{data_spec_version}"
+         if file_id in self.files:
+             raise DuplicateFile(f"File with id {file_id} already exists.")
+
+         file_resource = FileResource(
+             dataset_resource=self,
+             file_id=file_id,
+             data_feed_key=data_feed_key,
+             data_spec_version=data_spec_version,
+             last_modified=last_modified,
+             json_content=json_content,
+             url=url,
+             http_options=http_options,
+             data_serialization_format=data_serialization_format,
+             file_loader=file_loader,
+         )
+
+         self.files[file_id] = file_resource
+
+         # Allow chaining
+         return self
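Note: a minimal usage sketch (not from the diff) of the DatasetResource builder API above; the identifier, URL and provider values are made up. Because add_file returns self, calls can be chained.

# Illustrative only: the values below are made up, not taken from the package.
from datetime import datetime, timezone

from ingestify.domain.models.resources.dataset_resource import DatasetResource

resource = DatasetResource(
    {"match_id": 12345},  # dataset_resource_id (positional-only argument)
    dataset_type="match",
    provider="statsbomb",
    name="Example match",
).add_file(
    last_modified=datetime(2023, 1, 1, tzinfo=timezone.utc),
    data_feed_key="events",
    data_spec_version="v1",
    url="https://example.com/events/12345.json",
    data_serialization_format="json",
)

print(list(resource.files))  # ['events__v1']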
ingestify/domain/models/sink.py
@@ -0,0 +1,16 @@
+ from abc import ABC, abstractmethod
+
+ from ingestify.utils import ComponentFactory, ComponentRegistry
+
+ from .dataset import Dataset
+
+ sink_registry = ComponentRegistry()
+
+
+ class Sink(ABC, metaclass=sink_registry.metaclass):
+     @abstractmethod
+     def upsert(self, dataset: Dataset, data, params: dict):
+         pass
+
+
+ sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
ingestify/domain/models/source.py
@@ -0,0 +1,34 @@
+ from abc import ABC, abstractmethod
+ from typing import Dict, List, Optional, Iterable, Iterator, Union
+
+ from .data_spec_version_collection import DataSpecVersionCollection
+ from .dataset.collection_metadata import DatasetCollectionMetadata
+ from .resources.dataset_resource import DatasetResource
+
+
+ class Source(ABC):
+     def __init__(self, name: str, **kwargs):
+         self.name = name
+
+     @property
+     @abstractmethod
+     def provider(self) -> str:
+         raise NotImplementedError
+
+     # TODO: consider making this required...
+     # @abstractmethod
+     # def discover_selectors(self, dataset_type: str) -> List[Dict]:
+     #     pass
+
+     @abstractmethod
+     def find_datasets(
+         self,
+         dataset_type: str,
+         data_spec_versions: DataSpecVersionCollection,
+         dataset_collection_metadata: DatasetCollectionMetadata,
+         **kwargs
+     ) -> Iterator[List[DatasetResource]]:
+         pass
+
+     def __repr__(self):
+         return self.__class__.__name__
ingestify/domain/models/task/__init__.py
@@ -0,0 +1,4 @@
+ from .set import TaskSet
+ from .task import Task
+
+ __all__ = ["Task", "TaskSet"]
ingestify/domain/models/task/set.py
@@ -0,0 +1,21 @@
+ from .task import Task
+
+
+ class TaskSet:
+     def __init__(self, tasks=None):
+         self.tasks = tasks or []
+
+     def add(self, task: Task):
+         self.tasks.append(task)
+
+     def __len__(self):
+         return len(self.tasks)
+
+     def __iter__(self):
+         return iter(self.tasks)
+
+     def __add__(self, other: "TaskSet"):
+         return TaskSet(self.tasks + other.tasks)
+
+     def __bool__(self):
+         return len(self) > 0
ingestify/domain/models/task/task.py
@@ -0,0 +1,7 @@
+ from abc import ABC, abstractmethod
+
+
+ class Task(ABC):
+     @abstractmethod
+     def run(self):
+         pass
ingestify/domain/services/__init__.py: File without changes
ingestify/domain/services/transformers/__init__.py: File without changes
ingestify/domain/services/transformers/kloppy_to_pandas.py
@@ -0,0 +1,25 @@
+ from typing import Dict, List
+
+ import pandas as pd
+ from kloppy import StatsBombSerializer
+
+ from ingestify.domain.models import Dataset, LoadedFile
+
+
+ class KloppyToPandasTransformer(Transformer):
+     def transform(
+         self, dataset: Dataset, loaded_files: Dict[str, LoadedFile]
+     ) -> pd.DataFrame:
+         if dataset.provider == "statsbomb":
+             serializer = StatsBombSerializer()
+             kloppy_dataset = serializer.deserialize(
+                 inputs=dict(
+                     event_data=loaded_files["events.json"].stream,
+                     lineup_data=loaded_files["lineup.json"].stream,
+                 ),
+                 options={},
+             )
+         else:
+             raise Exception(f"Dataset provider {dataset.provider} not known")
+
+         return kloppy_dataset.to_pandas()
ingestify/exceptions.py
@@ -0,0 +1,10 @@
+ class IngestifyError(Exception):
+     pass
+
+
+ class ConfigurationError(IngestifyError):
+     pass
+
+
+ class DuplicateFile(IngestifyError):
+     pass
ingestify/infra/__init__.py
@@ -0,0 +1,4 @@
+ from .fetch.http import retrieve_http
+ from .store import *
+
+ __all__ = ["retrieve_http"]
ingestify/infra/fetch/__init__.py: File without changes
ingestify/infra/fetch/http.py
@@ -0,0 +1,100 @@
+ import json
+ from datetime import datetime
+ from email.utils import format_datetime, parsedate_to_datetime
+ from hashlib import sha1
+ from io import BytesIO
+ from typing import Optional, Callable, Tuple
+
+ import requests
+
+ from ingestify.domain.models import DraftFile, File
+ from ingestify.utils import utcnow
+
+
+ def retrieve_http(
+     url,
+     current_file: Optional[File] = None,
+     headers: Optional[dict] = None,
+     pager: Optional[Tuple[str, Callable[[str, dict], Optional[str]]]] = None,
+     last_modified: Optional[datetime] = None,
+     **kwargs,
+ ) -> Optional[DraftFile]:
+     headers = headers or {}
+     if current_file:
+         if last_modified and current_file.modified_at >= last_modified:
+             # Not changed
+             return None
+         # else:
+         #     print(f"{current_file.modified_at=} {last_modified=}")
+         # headers["if-modified-since"] = (
+         #     format_datetime(current_file.modified_at, usegmt=True),
+         # )
+         headers["if-none-match"] = current_file.tag
+
+     http_kwargs = {}
+     file_attributes = {}
+     for key, item in kwargs.items():
+         if key.startswith("http_"):
+             http_kwargs[key[5:]] = item
+         elif key.startswith("file_"):
+             file_attributes[key[5:]] = item
+         else:
+             raise Exception(f"Don't know how to use {key}")
+
+     response = requests.get(url, headers=headers, **http_kwargs)
+     response.raise_for_status()
+     if response.status_code == 304:
+         # Not modified
+         return None
+
+     if last_modified:
+         # From metadata received from the API in discover_datasets
+         modified_at = last_modified
+     elif "last-modified" in response.headers:
+         # Received from the webserver
+         modified_at = parsedate_to_datetime(response.headers["last-modified"])
+     else:
+         modified_at = utcnow()
+
+     tag = response.headers.get("etag")
+     # content_length = int(response.headers.get("content-length", 0))
+
+     if pager:
+         """
+         A pager helps with responses that return the data in pages.
+         """
+         data_path, pager_fn = pager
+         data = []
+         while True:
+             current_page_data = response.json()
+             data.extend(current_page_data[data_path])
+             next_url = pager_fn(url, current_page_data)
+             if not next_url:
+                 break
+             else:
+                 response = requests.get(next_url, headers=headers, **http_kwargs)
+
+         content = json.dumps({data_path: data}).encode("utf-8")
+     else:
+         content = response.content
+
+     if not tag:
+         tag = sha1(content).hexdigest()
+
+     # if not content_length: - Don't use the http header as it might be wrong,
+     # for example in case of compressed data
+     content_length = len(content)
+
+     if current_file and current_file.tag == tag:
+         # Not changed. Don't keep it
+         return None
+
+     return DraftFile(
+         created_at=utcnow(),
+         modified_at=modified_at,
+         tag=tag,
+         size=content_length,
+         content_type=response.headers.get("content-type"),
+         stream=BytesIO(content),
+         **file_attributes,
+     )
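Note: a small sketch (not from the diff) of calling retrieve_http with the pager argument; the endpoint, the "matches" data path and the next_href field are illustrative assumptions. Keyword arguments prefixed with http_ are forwarded to requests.get.

# Illustrative only: the URL, the "matches" key and "next_href" are made-up examples.
from typing import Optional

from ingestify.infra.fetch.http import retrieve_http


def next_page(url: str, page: dict) -> Optional[str]:
    # Return the URL of the next page, or None when there are no more pages.
    return page.get("next_href")


draft_file = retrieve_http(
    "https://api.example.com/matches?page=1",
    pager=("matches", next_page),  # "matches" is the JSON key holding each page's items
    http_timeout=30,               # becomes requests.get(..., timeout=30)
)
if draft_file is not None:
    print(draft_file.size, draft_file.content_type)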
ingestify/infra/serialization/__init__.py
@@ -0,0 +1,50 @@
+ import json
+ from datetime import datetime
+ from typing import Type, Any, TypeVar
+
+ from dataclass_factory import Schema, Factory, NameStyle
+ from dataclass_factory.schema_helpers import type_checker
+
+ from ingestify.domain import DatasetCreated, Identifier
+ from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+
+ isotime_schema = Schema(
+     parser=lambda x: datetime.fromisoformat(x.replace("Z", "+00:00")),  # type: ignore
+     serializer=lambda x: datetime.isoformat(x).replace("+00:00", "Z"),
+ )
+
+ identifier_schema = Schema(
+     # json.loads(x) for backwards compatibility
+     parser=lambda x: Identifier(x if isinstance(x, dict) else json.loads(x)),
+     serializer=lambda x: dict(x),
+ )
+
+ factory = Factory(
+     schemas={
+         datetime: isotime_schema,
+         Identifier: identifier_schema,
+         DatasetCreated: Schema(
+             pre_parse=type_checker(DatasetCreated.event_type, "event_type")
+         ),
+         MetadataUpdated: Schema(
+             pre_parse=type_checker(MetadataUpdated.event_type, "event_type")
+         ),
+         RevisionAdded: Schema(
+             pre_parse=type_checker(RevisionAdded.event_type, "event_type")
+         ),
+         # ClipSelectionContent: Schema(pre_parse=type_checker(ClipSelectionContent.content_type, field="contentType")),
+         # TeamInfoImageContent: Schema(pre_parse=type_checker(TeamInfoImageContent.content_type, field="contentType")),
+         # StaticVideoContent: Schema(pre_parse=type_checker(StaticVideoContent.content_type, field="contentType"))
+     },
+     default_schema=Schema(),
+ )
+
+ T = TypeVar("T")
+
+
+ def serialize(data: T, class_: Type[T] = None) -> Any:
+     return factory.dump(data, class_)
+
+
+ def unserialize(data: Any, class_: Type[T]) -> T:
+     return factory.load(data, class_)
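Note: a small round-trip sketch (not from the diff) of the serialize/unserialize helpers above; the Match dataclass is an illustrative assumption, not an ingestify model. Datetime fields go through the isotime_schema, so they serialize with a trailing "Z".

# Illustrative only: `Match` is a made-up dataclass used to show the round trip.
from dataclasses import dataclass
from datetime import datetime, timezone

from ingestify.infra.serialization import serialize, unserialize


@dataclass
class Match:
    match_id: str
    kickoff: datetime


data = serialize(
    Match(match_id="1234", kickoff=datetime(2023, 1, 1, tzinfo=timezone.utc)), Match
)
# e.g. {"match_id": "1234", "kickoff": "2023-01-01T00:00:00Z"}

match = unserialize(data, Match)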
ingestify/infra/sink/__init__.py: File without changes
ingestify/infra/sink/postgresql.py
@@ -0,0 +1,50 @@
+ from io import StringIO
+
+ import pandas as pd
+ from sqlalchemy import create_engine, text
+
+ from ingestify.domain.models import Dataset, Sink
+
+
+ # https://stackoverflow.com/questions/13947327/to-ignore-duplicate-keys-during-copy-from-in-postgresql
+ def _copy_to(conn, tablename, data, extra_columns):
+     if isinstance(data, pd.DataFrame):
+         for k, v in extra_columns.items():
+             data[k] = v
+         tsv_file = data.to_csv(sep="\t", header=True, index=False)
+         stream = StringIO(tsv_file)
+     else:
+         raise Exception("Don't know how to handle data")
+
+     raw_connection = conn.connection
+     driver = conn.engine.dialect.driver
+
+     sql = f"COPY {tablename} FROM STDIN WITH CSV DELIMITER '\t' HEADER"
+     if driver == "pg8000":
+         # https://github.com/tlocke/pg8000/blob/13bc039e805e8a2cd8d816b939362b40018ea8ef/test/native/test_copy.py
+         raw_connection.run(sql=sql, stream=stream)
+     elif driver == "psycopg2":
+         # https://github.com/psycopg/psycopg2/blob/1d3a89a0bba621dc1cc9b32db6d241bd2da85ad1/tests/test_copy.py
+         with raw_connection.cursor() as cursor:
+             cursor.copy_expert(sql=sql, file=stream)
+
+
+ class PostgresSQLSink(Sink):
+     def __init__(self, url: str):
+         self.engine = create_engine(url)
+
+     def upsert(self, dataset: Dataset, data, params: dict):
+         if not isinstance(data, pd.DataFrame):
+             raise TypeError(
+                 f"Data {type(data)} is not supported by the PostgresSQLSink"
+             )
+
+         table_name = params["table_name"]
+
+         with self.engine.begin() as conn:
+             conn.execute(
+                 text(
+                     f"DELETE FROM {table_name} WHERE dataset_id = {dataset.dataset_id}"
+                 )
+             )
+             _copy_to(conn, table_name, data, dict(dataset_id=dataset.dataset_id))
File without changes