ingestify 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0
ingestify/domain/models/__init__.py
@@ -0,0 +1,45 @@
+from .dataset import (
+    Dataset,
+    DatasetCollection,
+    DatasetRepository,
+    DatasetCreated,
+    DraftFile,
+    File,
+    FileRepository,
+    FileCollection,
+    Identifier,
+    LoadedFile,
+    Selector,
+    Revision,
+    dataset_repository_factory,
+    file_repository_factory,
+)
+from .sink import Sink, sink_factory
+from .source import Source
+from .task import Task, TaskSet
+from .data_spec_version_collection import DataSpecVersionCollection
+from .resources import DatasetResource
+
+__all__ = [
+    "Selector",
+    "Identifier",
+    "Source",
+    "Revision",
+    "Dataset",
+    "DatasetCollection",
+    "DatasetResource",
+    "File",
+    "DraftFile",
+    "DatasetCreated",
+    "LoadedFile",
+    "FileRepository",
+    "FileCollection",
+    "DatasetRepository",
+    "dataset_repository_factory",
+    "file_repository_factory",
+    "TaskSet",
+    "Task",
+    "Sink",
+    "sink_factory",
+    "DataSpecVersionCollection",
+]
ingestify/domain/models/data_spec_version_collection.py
@@ -0,0 +1,33 @@
+import copy
+from dataclasses import dataclass
+from typing import Dict, Union, List, Set, Optional
+
+
+class DataSpecVersionCollection(dict):
+    @classmethod
+    def from_dict(cls, items: Dict[str, Union[str, List[str], Set[str]]]):
+        items_ = {}
+        for data_feed_key, data_spec_versions in items.items():
+            if isinstance(data_spec_versions, str):
+                data_spec_versions = {data_spec_versions}
+            elif isinstance(data_spec_versions, list):
+                data_spec_versions = set(data_spec_versions)
+            items_[data_feed_key] = data_spec_versions
+
+        return cls(items_)
+
+    def copy(self):
+        return DataSpecVersionCollection(copy.deepcopy(self))
+
+    def merge(self, other: "DataSpecVersionCollection"):
+        for data_feed_key, data_spec_versions in other.items():
+            if data_feed_key in self:
+                self[data_feed_key].update(data_spec_versions)
+            else:
+                self[data_feed_key] = data_spec_versions
+
+    def get_version(self, data_feed_key: str, default: Optional[str] = None):
+        items = self.get(data_feed_key)
+        if not items:
+            return default
+        return list(items)[0]
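
A minimal usage sketch of the collection above; the feed keys and version strings ("events", "v3", and so on) are made-up examples rather than values shipped with the package:

from ingestify.domain.models.data_spec_version_collection import DataSpecVersionCollection

# Single strings and lists are both normalised to sets of versions
versions = DataSpecVersionCollection.from_dict({"events": "v3", "lineups": ["v1", "v2"]})

# merge() unions versions per feed key and adds keys it does not have yet
versions.merge(DataSpecVersionCollection.from_dict({"events": "v4", "matches": "v1"}))

print(versions.get_version("matches"))                 # "v1"
print(versions.get_version("missing", default="v2"))   # falls back to the default
print(versions.get_version("events"))                  # an arbitrary member of {"v3", "v4"}; sets are unordered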
ingestify/domain/models/dataset/__init__.py
@@ -0,0 +1,27 @@
+from .collection import DatasetCollection
+from .dataset import Dataset
+from .dataset_repository import DatasetRepository, dataset_repository_factory
+from .file import DraftFile, File, LoadedFile
+from .file_repository import FileRepository, file_repository_factory
+from .file_collection import FileCollection
+from .identifier import Identifier
+from .selector import Selector
+from .revision import Revision
+from .events import DatasetCreated
+
+__all__ = [
+    "Selector",
+    "Revision",
+    "Dataset",
+    "Identifier",
+    "DatasetCollection",
+    "DatasetCreated",
+    "dataset_repository_factory",
+    "File",
+    "DraftFile",
+    "LoadedFile",
+    "DatasetRepository",
+    "FileRepository",
+    "file_repository_factory",
+    "FileCollection",
+]
ingestify/domain/models/dataset/collection.py
@@ -0,0 +1,44 @@
+from typing import List, Optional
+
+from .collection_metadata import DatasetCollectionMetadata
+from .dataset import Dataset
+from .identifier import Identifier
+
+
+class DatasetCollection:
+    def __init__(
+        self,
+        metadata: Optional[DatasetCollectionMetadata] = None,
+        datasets: Optional[List[Dataset]] = None,
+    ):
+        datasets = datasets or []
+
+        # TODO: this fails when datasets contains different dataset_types with overlapping identifiers
+        self.datasets: dict[str, Dataset] = {
+            dataset.identifier.key: dataset for dataset in datasets
+        }
+        self.metadata = metadata
+
+    def loaded(self):
+        return self.metadata.row_count == len(self.datasets)
+
+    def get(self, dataset_identifier: Identifier) -> Dataset:
+        return self.datasets.get(dataset_identifier.key)
+
+    def __len__(self):
+        return len(self.datasets)
+
+    def __iter__(self):
+        return iter(self.datasets.values())
+
+    def get_dataset_by_id(self, dataset_id):
+        for dataset in self:
+            if dataset.dataset_id == dataset_id:
+                return dataset
+        return None
+
+    def first(self):
+        try:
+            return next(iter(self.datasets.values()))
+        except StopIteration:
+            raise Exception("No items in the collection")
ingestify/domain/models/dataset/collection_metadata.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class DatasetCollectionMetadata:
+    # This can be useful to figure out if a backfill is required
+    first_modified: Optional[datetime]
+
+    # Use the last modified to only retrieve datasets that have changed
+    last_modified: Optional[datetime]
+    row_count: int
ingestify/domain/models/dataset/dataset.py
@@ -0,0 +1,104 @@
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import List, Optional
+
+from ingestify.utils import utcnow
+
+from .file import DraftFile
+from .identifier import Identifier
+from .revision import Revision
+
+
+class DatasetState(Enum):
+    SCHEDULED = "SCHEDULED"
+    PARTIAL = "PARTIAL"
+    COMPLETE = "COMPLETE"
+
+    @property
+    def is_complete(self):
+        return self == DatasetState.COMPLETE
+
+    def __str__(self):
+        return self.value
+
+
+@dataclass
+class Dataset:
+    bucket: str  # This must be set by the DatasetRepository
+
+    dataset_id: str
+    name: str
+    state: DatasetState
+
+    dataset_type: str
+    provider: str
+
+    identifier: Identifier
+    metadata: dict
+
+    created_at: datetime
+    updated_at: datetime
+
+    revisions: List[Revision] = field(default_factory=list)
+
+    @property
+    def is_complete(self):
+        return self.state.is_complete
+
+    def next_revision_id(self):
+        return len(self.revisions)
+
+    def add_revision(self, revision: Revision):
+        self.revisions.append(revision)
+        self.updated_at = utcnow()
+
+    def update_from_resource(self, dataset_resource) -> bool:
+        changed = False
+        if self.name != dataset_resource.name:
+            self.name = dataset_resource.name
+            changed = True
+
+        if self.metadata != dataset_resource.metadata:
+            self.metadata = dataset_resource.metadata
+            changed = True
+
+        if self.state != dataset_resource.state:
+            self.state = dataset_resource.state
+            changed = True
+
+        if changed:
+            self.updated_at = utcnow()
+
+        return changed
+
+    @property
+    def current_revision(self) -> Optional[Revision]:
+        """
+        When multiple revisions are available, squash them into a single revision
+        that contains the most recent version of each file.
+        """
+        if not self.revisions:
+            return None
+        elif len(self.revisions) == 1:
+            return self.revisions[0]
+        else:
+            files = {}
+
+            for revision in self.revisions:
+                for file_id, file in revision.modified_files_map.items():
+                    if isinstance(file, DraftFile):
+                        raise Exception(
+                            f"Cannot squash draft file. Revision: {revision}. FileId: {file_id}"
+                        )
+                    files[file_id] = file
+                    files[file_id].revision_id = revision.revision_id
+
+            return Revision(
+                revision_id=self.revisions[-1].revision_id,
+                created_at=self.revisions[-1].created_at,
+                # created_at=max([file.modified_at for file in files.values()]),
+                description="Squashed revision",
+                is_squashed=True,
+                modified_files=list(files.values()),
+            )
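
A sketch of how current_revision squashes multiple revisions, using the Dataset, Revision, File and Identifier classes from this diff; every identifier, tag and file name below is hypothetical:

from datetime import datetime, timezone
from pathlib import Path

from ingestify.domain.models.dataset.dataset import Dataset, DatasetState
from ingestify.domain.models.dataset.file import File
from ingestify.domain.models.dataset.identifier import Identifier
from ingestify.domain.models.dataset.revision import Revision

now = datetime.now(timezone.utc)

def stored_file(file_id: str, tag: str) -> File:
    # A fully stored File (not a DraftFile), so it can be squashed
    return File(
        file_id=file_id, created_at=now, modified_at=now, tag=tag, size=10,
        content_type="application/json", data_feed_key=file_id,
        data_spec_version="v3", data_serialization_format="json",
        storage_size=10, storage_compression_method="gzip",
        storage_path=Path(f"{file_id}.json.gz"),
    )

dataset = Dataset(
    bucket="main", dataset_id="ds-1", name="Example match", state=DatasetState.COMPLETE,
    dataset_type="match", provider="example", identifier=Identifier(match_id=1234),
    metadata={}, created_at=now, updated_at=now,
)
dataset.add_revision(Revision(0, now, "initial", [stored_file("events", "aaa"), stored_file("lineups", "bbb")]))
dataset.add_revision(Revision(1, now, "update", [stored_file("events", "ccc")]))

squashed = dataset.current_revision
assert squashed.is_squashed
# The newest file wins per file_id: {"events": "ccc", "lineups": "bbb"}
print({f.file_id: f.tag for f in squashed.modified_files})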
ingestify/domain/models/dataset/dataset_repository.py
@@ -0,0 +1,46 @@
+from abc import ABC, abstractmethod
+from typing import Optional, List, Union
+
+from ingestify.utils import ComponentFactory, ComponentRegistry
+
+from .collection import DatasetCollection
+from .dataset import Dataset
+from .selector import Selector
+
+dataset_repository_registry = ComponentRegistry()
+
+
+class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
+    @abstractmethod
+    def get_dataset_collection(
+        self,
+        bucket: str,
+        dataset_type: Optional[str] = None,
+        dataset_id: Optional[Union[str, List[str]]] = None,
+        provider: Optional[str] = None,
+        selector: Optional[Union[Selector, List[Selector]]] = None,
+        metadata_only: bool = False,
+    ) -> DatasetCollection:
+        pass
+
+    @abstractmethod
+    def destroy(self, dataset: Dataset):
+        pass
+
+    @abstractmethod
+    def save(self, bucket: str, dataset: Dataset):
+        pass
+
+    @abstractmethod
+    def next_identity(self):
+        pass
+
+    @classmethod
+    @abstractmethod
+    def supports(cls, url: str) -> bool:
+        pass
+
+
+dataset_repository_factory = ComponentFactory.build_factory(
+    DatasetRepository, dataset_repository_registry
+)
ingestify/domain/models/dataset/events.py
@@ -0,0 +1,31 @@
+from dataclasses import dataclass, field
+from datetime import datetime
+
+from ingestify.domain.models.event.domain_event import DomainEvent
+from ingestify.utils import utcnow
+
+from .dataset import Dataset
+
+
+@dataclass
+class DatasetCreated(DomainEvent):
+    dataset: Dataset
+
+    event_type: str = "dataset_created"
+    occurred_at: datetime = field(default_factory=utcnow)
+
+
+@dataclass
+class RevisionAdded(DomainEvent):
+    dataset: Dataset
+
+    event_type: str = "revision_added"
+    occurred_at: datetime = field(default_factory=utcnow)
+
+
+@dataclass
+class MetadataUpdated(DomainEvent):
+    dataset: Dataset
+
+    event_type: str = "metadata_updated"
+    occurred_at: datetime = field(default_factory=utcnow)
ingestify/domain/models/dataset/file.py
@@ -0,0 +1,146 @@
+import hashlib
+import mimetypes
+
+from dataclasses import dataclass
+from datetime import datetime
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import BinaryIO, Optional, Union, Callable
+
+from ingestify.utils import utcnow
+
+
+@dataclass
+class DraftFile:
+    created_at: datetime
+    modified_at: datetime
+    tag: str
+    size: int
+    content_type: Optional[str]
+
+    data_feed_key: str  # Example: 'events'
+    data_spec_version: str  # Example: 'v3'
+    data_serialization_format: str  # Example: 'json'
+
+    stream: BinaryIO
+
+    @classmethod
+    def from_input(
+        cls,
+        file_,
+        data_feed_key,
+        data_spec_version="v1",
+        data_serialization_format="txt",
+        modified_at=None,
+    ):
+        # Pass-through for these types
+        if isinstance(file_, DraftFile) or file_ is None:
+            return file_
+        elif isinstance(file_, str):
+            stream = BytesIO(file_.encode("utf-8"))
+        elif isinstance(file_, bytes):
+            stream = BytesIO(file_)
+        elif isinstance(file_, StringIO):
+            stream = BytesIO(file_.read().encode("utf-8"))
+        elif isinstance(file_, BytesIO):
+            stream = file_
+        else:
+            raise Exception(f"Not possible to create DraftFile from {type(file_)}")
+
+        data = stream.read()
+        size = len(data)
+        tag = hashlib.sha1(data).hexdigest()
+        stream.seek(0)
+
+        now = utcnow()
+
+        return DraftFile(
+            created_at=now,
+            modified_at=modified_at or now,
+            tag=tag,
+            size=size,
+            stream=stream,
+            content_type=None,
+            data_feed_key=data_feed_key,
+            data_spec_version=data_spec_version,
+            data_serialization_format=data_serialization_format,
+        )
+
+
+@dataclass
+class File:
+    file_id: str
+    created_at: datetime
+    modified_at: datetime
+    tag: str
+    size: int
+    content_type: Optional[str]
+
+    data_feed_key: str  # Example: 'events'
+    data_spec_version: str  # Example: 'v3'
+    data_serialization_format: str  # Example: 'json'
+
+    storage_size: int
+    storage_compression_method: Optional[str]  # Example: 'gzip'
+    storage_path: Path
+
+    # This can be used when a Revision is squashed
+    revision_id: Optional[int] = None
+
+    @classmethod
+    def from_draft(
+        cls,
+        draft_file: DraftFile,
+        file_id: str,
+        storage_size: int,
+        storage_compression_method,
+        path: Path,
+    ) -> "File":
+        return cls(
+            file_id=file_id,
+            created_at=draft_file.created_at,
+            modified_at=draft_file.modified_at,
+            tag=draft_file.tag,
+            size=draft_file.size,
+            data_feed_key=draft_file.data_feed_key,
+            data_spec_version=draft_file.data_spec_version,
+            data_serialization_format=draft_file.data_serialization_format,
+            content_type=draft_file.content_type,
+            storage_size=storage_size,
+            storage_compression_method=storage_compression_method,
+            storage_path=path,
+        )
+
+
+@dataclass
+class LoadedFile:
+    # Unique key to identify this File within a Dataset
+    file_id: str
+    created_at: datetime
+    modified_at: datetime
+    tag: str
+    size: int
+    content_type: Optional[str]
+
+    data_feed_key: str  # Example: 'events'
+    data_spec_version: str  # Example: 'v3'
+    data_serialization_format: Optional[str]  # Example: 'json'
+
+    storage_size: int
+    storage_compression_method: Optional[str]  # Example: 'gzip'
+    storage_path: Path
+
+    _stream: Union[BinaryIO, Callable[["LoadedFile"], BinaryIO]]
+
+    # This can be used when a Revision is squashed
+    revision_id: Optional[int] = None
+
+    @property
+    def stream(self):
+        if callable(self._stream):
+            self._stream = self._stream(self)
+        return self._stream
+
+
+__all__ = ["File", "DraftFile", "LoadedFile"]
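
A short sketch of DraftFile.from_input as defined above; the JSON payload and the feed/version values are made up:

from ingestify.domain.models.dataset.file import DraftFile

# Strings, bytes, StringIO and BytesIO inputs are all wrapped into a BytesIO stream
draft = DraftFile.from_input(
    '{"events": []}',
    data_feed_key="events",
    data_spec_version="v3",
    data_serialization_format="json",
)

print(draft.size)           # byte length of the UTF-8 encoded payload
print(draft.tag)            # SHA-1 hex digest of the content
print(draft.stream.read())  # the stream is rewound to the start after hashing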
ingestify/domain/models/dataset/file_collection.py
@@ -0,0 +1,35 @@
+from typing import Optional
+
+from .file import LoadedFile
+
+
+class FileCollection(dict):
+    def __init__(self, seq, auto_rewind: bool = True, **kwargs):
+        super().__init__(seq, **kwargs)
+
+        self._auto_rewind = auto_rewind
+
+    def get_file(
+        self,
+        data_feed_key: Optional[str] = None,
+        data_spec_version: Optional[str] = None,
+        auto_rewind: Optional[bool] = None,
+    ) -> Optional[LoadedFile]:
+        if not data_feed_key and not data_spec_version:
+            raise ValueError(
+                "You have to specify `data_feed_key` or `data_spec_version`"
+            )
+
+        for file in self.values():
+            if (not data_feed_key or file.data_feed_key == data_feed_key) and (
+                not data_spec_version or file.data_spec_version == data_spec_version
+            ):
+                should_auto_rewind = auto_rewind
+                if should_auto_rewind is None:
+                    should_auto_rewind = self._auto_rewind
+
+                if should_auto_rewind and file.stream.tell() > 0:
+                    file.stream.seek(0)
+                return file
+
+        return None
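
A sketch of FileCollection.get_file using a hand-built LoadedFile; the file_id naming and field values are hypothetical:

from datetime import datetime, timezone
from io import BytesIO
from pathlib import Path

from ingestify.domain.models.dataset.file import LoadedFile
from ingestify.domain.models.dataset.file_collection import FileCollection

now = datetime.now(timezone.utc)
events_file = LoadedFile(
    file_id="events__v3", created_at=now, modified_at=now, tag="abc", size=2,
    content_type="application/json", data_feed_key="events", data_spec_version="v3",
    data_serialization_format="json", storage_size=2, storage_compression_method=None,
    storage_path=Path("events.json"), _stream=BytesIO(b"[]"),
)

files = FileCollection({"events__v3": events_file})

print(files.get_file(data_feed_key="events").stream.read())  # b"[]"

# A second lookup rewinds the already-consumed stream because auto_rewind defaults to True
print(files.get_file(data_feed_key="events").stream.read())  # b"[]" again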
ingestify/domain/models/dataset/file_repository.py
@@ -0,0 +1,59 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import BinaryIO
+
+from ingestify.utils import ComponentFactory, ComponentRegistry
+
+from .dataset import Dataset
+
+file_repository_registry = ComponentRegistry()
+
+
+class FileRepository(ABC, metaclass=file_repository_registry.metaclass):
+    def __init__(self, url: str):
+        self.base_dir = Path(url.split("://")[1])
+
+    @abstractmethod
+    def save_content(
+        self,
+        bucket: str,
+        dataset: Dataset,
+        revision_id: int,
+        filename: str,
+        stream: BinaryIO,
+    ) -> Path:
+        pass
+
+    @abstractmethod
+    def load_content(
+        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+    ) -> BinaryIO:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def supports(cls, url: str) -> bool:
+        pass
+
+    def get_path(
+        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+    ) -> Path:
+        path = (
+            self.base_dir
+            / bucket
+            / f"provider={dataset.provider}"
+            / f"dataset_type={dataset.dataset_type}"
+            / str(dataset.identifier)
+            / str(revision_id)
+            / filename
+        )
+        return path
+
+    def get_relative_path(self, path: Path) -> Path:
+        """Return the relative path to the base of the repository"""
+        return path.relative_to(self.base_dir)
+
+
+file_repository_factory = ComponentFactory.build_factory(
+    FileRepository, file_repository_registry
+)
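
The registry and factory wiring comes from ingestify.utils and is not shown in this diff, so as an illustration a minimal local repository can simply subclass FileRepository and implement the abstract methods. This is a hypothetical sketch, not the packaged LocalFileRepository:

import shutil
from pathlib import Path
from typing import BinaryIO

from ingestify.domain.models.dataset import Dataset
from ingestify.domain.models.dataset.file_repository import FileRepository


class SimpleLocalFileRepository(FileRepository):
    # Hypothetical minimal implementation for illustration only
    @classmethod
    def supports(cls, url: str) -> bool:
        return url.startswith("file://")

    def save_content(self, bucket: str, dataset: Dataset, revision_id: int,
                     filename: str, stream: BinaryIO) -> Path:
        path = self.get_path(bucket, dataset, revision_id, filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as fp:
            shutil.copyfileobj(stream, fp)
        return path

    def load_content(self, bucket: str, dataset: Dataset, revision_id: int,
                     filename: str) -> BinaryIO:
        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")

Given get_path above, SimpleLocalFileRepository("file:///tmp/ingestify") would place revision 0 of a file under /tmp/ingestify/<bucket>/provider=<provider>/dataset_type=<dataset_type>/<identifier>/0/<filename>.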
ingestify/domain/models/dataset/identifier.py
@@ -0,0 +1,24 @@
+from typing import TYPE_CHECKING
+
+from ingestify.utils import key_from_dict
+
+if TYPE_CHECKING:
+    from ingestify.domain import Selector
+
+
+class Identifier(dict):
+    @classmethod
+    def create_from_selector(cls, selector: "Selector", **kwargs):
+        identifier = cls(**selector.filtered_attributes)
+        identifier.update(kwargs)
+        return identifier
+
+    @property
+    def key(self):
+        return key_from_dict(self)
+
+    def __hash__(self):
+        return hash(self.key)
+
+    def __str__(self):
+        return "/".join([f"{k}={v}" for k, v in self.items()])
ingestify/domain/models/dataset/revision.py
@@ -0,0 +1,29 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Dict, List
+
+from .file import File
+
+
+@dataclass
+class Revision:
+    revision_id: int
+    created_at: datetime
+    description: str
+    modified_files: List[File]
+    is_squashed: bool = False
+
+    @property
+    def modified_files_map(self) -> Dict[str, File]:
+        return {file.file_id: file for file in self.modified_files}
+
+    def is_changed(self, files: Dict[str, datetime]) -> bool:
+        modified_files_map = self.modified_files_map
+        for file_id, last_modified in files.items():
+            if file_id not in modified_files_map:
+                return True
+
+            if modified_files_map[file_id].modified_at < last_modified:
+                return True
+
+        return False
ingestify/domain/models/dataset/selector.py
@@ -0,0 +1,37 @@
+from ingestify.domain.models.data_spec_version_collection import (
+    DataSpecVersionCollection,
+)
+from ingestify.utils import AttributeBag
+
+
+class Selector(AttributeBag):
+    def __bool__(self):
+        return len(self.filtered_attributes) > 0
+
+    @classmethod
+    def build(cls, attributes, data_spec_versions: DataSpecVersionCollection):
+        if callable(attributes):
+            return cls(
+                _data_spec_versions=data_spec_versions.copy(), _matcher=attributes
+            )
+        else:
+            return cls(_data_spec_versions=data_spec_versions.copy(), **attributes)
+
+    @property
+    def is_dynamic(self):
+        return "_matcher" in self.attributes
+
+    def is_match(self, selector: dict):
+        return self._matcher(selector)
+
+    @property
+    def data_spec_versions(self):
+        return self._data_spec_versions
+
+    @property
+    def custom_attributes(self):
+        return {
+            k: v
+            for k, v in self.items()
+            if k not in ("_matcher", "_data_spec_versions")
+        }
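
A sketch of static versus dynamic selectors, assuming AttributeBag (defined in ingestify.utils, not shown in this diff) stores the keyword arguments as dict items and exposes them through an attributes mapping; the attribute names are made up:

from ingestify.domain.models import DataSpecVersionCollection, Selector

data_spec_versions = DataSpecVersionCollection.from_dict({"events": "v3"})

# Plain attribute selector
static = Selector.build({"competition_id": 11, "season_id": 90}, data_spec_versions)

# Dynamic selector: a callable that decides per candidate attribute dict
dynamic = Selector.build(lambda attrs: attrs["competition_id"] == 11, data_spec_versions)

print(static.is_dynamic)                         # False
print(dynamic.is_dynamic)                        # True
print(dynamic.is_match({"competition_id": 11}))  # True
print(static.custom_attributes)                  # {"competition_id": 11, "season_id": 90}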
ingestify/domain/models/event/__init__.py
@@ -0,0 +1,4 @@
+from .publisher import Publisher
+from .domain_event import DomainEvent
+from .subscriber import Subscriber
+from .event_bus import EventBus