ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +47 -36
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +71 -241
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +29 -28
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. ingestify/infra/fetch/http.py +5 -0
  29. ingestify/infra/source/statsbomb_github.py +67 -54
  30. ingestify/infra/store/dataset/__init__.py +0 -2
  31. ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
  32. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  33. ingestify/infra/store/file/local_file_repository.py +3 -5
  34. ingestify/infra/store/file/s3_file_repository.py +4 -9
  35. ingestify/main.py +64 -25
  36. ingestify/utils.py +15 -78
  37. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
  38. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
  39. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
  40. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  41. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0

ingestify/domain/models/data_spec_version_collection.py
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):
 
         return cls(items_)
 
+    def to_dict(self):
+        return {
+            data_feed_key: list(data_spec_versions)
+            for data_feed_key, data_spec_versions in self.items()
+        }
+
     def copy(self):
         return DataSpecVersionCollection(copy.deepcopy(self))
 
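The new to_dict() helper is the inverse of the dict-based construction: it turns each feed's version set back into a plain list so the collection can be serialized. A minimal usage sketch, assuming the plain dict constructor of this dict subclass; the feed keys and versions are made up:

from ingestify.domain.models.data_spec_version_collection import DataSpecVersionCollection

# Illustrative mapping of data feed keys to spec versions (values are invented).
versions = DataSpecVersionCollection({"events": {"v3"}, "lineups": {"v2", "v4"}})

# to_dict() converts each value into a plain list, e.g. for JSON/YAML output.
print(versions.to_dict())  # {'events': ['v3'], 'lineups': ['v2', 'v4']} (list order may vary)
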
ingestify/domain/models/dataset/__init__.py
@@ -1,8 +1,8 @@
+from .file import DraftFile, File, LoadedFile
 from .collection import DatasetCollection
 from .dataset import Dataset
-from .dataset_repository import DatasetRepository, dataset_repository_factory
-from .file import DraftFile, File, LoadedFile
-from .file_repository import FileRepository, file_repository_factory
+from .dataset_repository import DatasetRepository
+from .file_repository import FileRepository
 from .file_collection import FileCollection
 from .identifier import Identifier
 from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
     "Identifier",
     "DatasetCollection",
     "DatasetCreated",
-    "dataset_repository_factory",
     "File",
     "DraftFile",
     "LoadedFile",
     "DatasetRepository",
     "FileRepository",
-    "file_repository_factory",
     "FileCollection",
 ]
ingestify/domain/models/dataset/dataset.py
@@ -1,70 +1,52 @@
-from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
+from pydantic import Field
 
 from ingestify.utils import utcnow
-
+from .dataset_state import DatasetState
 from .file import DraftFile
 from .identifier import Identifier
-from .revision import Revision
-
-
-class DatasetState(Enum):
-    SCHEDULED = "SCHEDULED"
-    PARTIAL = "PARTIAL"
-    COMPLETE = "COMPLETE"
-
-    @property
-    def is_complete(self):
-        return self == DatasetState.COMPLETE
+from .revision import Revision, RevisionSource, SourceType
+from ..base import BaseModel
 
-    def __str__(self):
-        return self.value
 
-
-@dataclass
-class Dataset:
+class Dataset(BaseModel):
     bucket: str  # This must be set by the DatasetRepository
-
     dataset_id: str
     name: str
    state: DatasetState
-
     dataset_type: str
     provider: str
-
     identifier: Identifier
     metadata: dict
-
     created_at: datetime
     updated_at: datetime
-
-    revisions: List[Revision] = field(default_factory=list)
+    revisions: List[Revision] = Field(default_factory=list)
 
     @property
     def is_complete(self):
         return self.state.is_complete
 
-    def next_revision_id(self):
+    def next_revision_id(self) -> int:
         return len(self.revisions)
 
     def add_revision(self, revision: Revision):
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
-    def update_from_resource(self, dataset_resource) -> bool:
+    def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
-        if self.name != dataset_resource.name:
-            self.name = dataset_resource.name
+        if self.name != name:
+            self.name = name
             changed = True
 
-        if self.metadata != dataset_resource.metadata:
-            self.metadata = dataset_resource.metadata
+        if self.metadata != metadata:
+            self.metadata = metadata
            changed = True
 
-        if self.state != dataset_resource.state:
-            self.state = dataset_resource.state
+        if self.state != state:
+            self.state = state
            changed = True
 
         if changed:
@@ -101,4 +83,5 @@ class Dataset:
             description="Squashed revision",
             is_squashed=True,
             modified_files=list(files.values()),
+            source=RevisionSource(source_type=SourceType.SQUASHED, source_id=""),
         )
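update_from_resource has been replaced by update_metadata, which takes the name, metadata and state explicitly and reports whether anything actually changed. A rough sketch of the new call with invented field values, assuming Identifier accepts dict-style keyword arguments:

from datetime import datetime, timezone

from ingestify.domain.models.dataset import Dataset, Identifier
from ingestify.domain.models.dataset.dataset_state import DatasetState

now = datetime.now(timezone.utc)
dataset = Dataset(
    bucket="main",
    dataset_id="ds-001",
    name="Match 12345",
    state=DatasetState.PARTIAL,
    dataset_type="match",
    provider="statsbomb",
    identifier=Identifier(match_id=12345),  # assumed keyword-style construction
    metadata={},
    created_at=now,
    updated_at=now,
)

# Callers now pass the fields directly instead of a whole DatasetResource object.
changed = dataset.update_metadata(
    name="Match 12345 (final)",
    metadata={"status": "played"},
    state=DatasetState.COMPLETE,
)
print(changed, dataset.state)  # True DatasetState.COMPLETE
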
ingestify/domain/models/dataset/dataset_repository.py
@@ -1,16 +1,12 @@
 from abc import ABC, abstractmethod
 from typing import Optional, List, Union
 
-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .collection import DatasetCollection
 from .dataset import Dataset
 from .selector import Selector
 
-dataset_repository_registry = ComponentRegistry()
-
 
-class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
+class DatasetRepository(ABC):
     @abstractmethod
     def get_dataset_collection(
         self,
@@ -34,13 +30,3 @@ class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
     @abstractmethod
     def next_identity(self):
         pass
-
-    @classmethod
-    @abstractmethod
-    def supports(cls, url: str) -> bool:
-        pass
-
-
-dataset_repository_factory = ComponentFactory.build_factory(
-    DatasetRepository, dataset_repository_registry
-)
ingestify/domain/models/dataset/dataset_state.py (new file)
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class DatasetState(str, Enum):
+    SCHEDULED = "SCHEDULED"
+    PARTIAL = "PARTIAL"
+    COMPLETE = "COMPLETE"
+
+    @property
+    def is_complete(self):
+        return self == DatasetState.COMPLETE
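DatasetState now lives in its own module and inherits from str, so members compare equal to their string values and serialize cleanly:

from ingestify.domain.models.dataset.dataset_state import DatasetState

state = DatasetState.COMPLETE
print(state == "COMPLETE")                  # True -- str subclass, compares to plain strings
print(state.is_complete)                    # True
print(DatasetState("PARTIAL").is_complete)  # False
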
ingestify/domain/models/dataset/events.py
@@ -1,31 +1,21 @@
-from dataclasses import dataclass, field
-from datetime import datetime
+from typing import ClassVar
 
-from ingestify.domain.models.event.domain_event import DomainEvent
-from ingestify.utils import utcnow
+from pydantic import BaseModel
 
+from ingestify.domain.models.event.domain_event import DomainEvent
 from .dataset import Dataset
 
 
-@dataclass
 class DatasetCreated(DomainEvent):
     dataset: Dataset
-
-    event_type: str = "dataset_created"
-    occurred_at: datetime = field(default_factory=utcnow)
+    event_type: ClassVar[str] = "dataset_created"
 
 
-@dataclass
 class RevisionAdded(DomainEvent):
     dataset: Dataset
+    event_type: ClassVar[str] = "revision_added"
 
-    event_type: str = "revision_added"
-    occurred_at: datetime = field(default_factory=utcnow)
 
-
-@dataclass
 class MetadataUpdated(DomainEvent):
     dataset: Dataset
-
-    event_type: str = "metadata_updated"
-    occurred_at: datetime = field(default_factory=utcnow)
+    event_type: ClassVar[str] = "metadata_updated"
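With the move to pydantic, each event declares event_type as a ClassVar, so it is class-level metadata rather than a serialized per-instance field, and occurred_at now comes from the DomainEvent base (see the domain_event.py hunk below). A small stand-in illustrating the ClassVar behaviour, not the actual ingestify classes:

from typing import ClassVar
from pydantic import BaseModel

class ExampleEvent(BaseModel):
    event_type: ClassVar[str] = "example_event"  # class metadata, not a model field
    payload: dict

event = ExampleEvent(payload={"dataset_id": "ds-001"})
print(ExampleEvent.event_type)  # "example_event" -- shared by all instances, never validated or serialized
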
ingestify/domain/models/dataset/file.py
@@ -1,37 +1,32 @@
-import hashlib
-import mimetypes
-
-from dataclasses import dataclass
 from datetime import datetime
-from io import BytesIO, StringIO
 from pathlib import Path
-from typing import BinaryIO, Optional, Union, Callable
+from typing import BinaryIO, Optional, Union, Callable, Awaitable
+from io import BytesIO, StringIO
+import hashlib
 
+from ingestify.domain.models.base import BaseModel
 from ingestify.utils import utcnow
 
 
-@dataclass
-class DraftFile:
+class DraftFile(BaseModel):
     created_at: datetime
     modified_at: datetime
     tag: str
     size: int
     content_type: Optional[str]
-
     data_feed_key: str  # Example: 'events'
     data_spec_version: str  # Example: 'v3'
     data_serialization_format: str  # Example: 'json'
-
-    stream: BinaryIO
+    stream: BytesIO
 
     @classmethod
     def from_input(
         cls,
         file_,
-        data_feed_key,
-        data_spec_version="v1",
-        data_serialization_format="txt",
-        modified_at=None,
+        data_feed_key: str,
+        data_spec_version: str = "v1",
+        data_serialization_format: str = "txt",
+        modified_at: Optional[datetime] = None,
     ):
         # Pass-through for these types
         if isinstance(file_, DraftFile) or file_ is None:
@@ -67,25 +62,20 @@ class DraftFile:
         )
 
 
-@dataclass
-class File:
+class File(BaseModel):
     file_id: str
     created_at: datetime
     modified_at: datetime
     tag: str
     size: int
     content_type: Optional[str]
-
     data_feed_key: str  # Example: 'events'
     data_spec_version: str  # Example: 'v3'
     data_serialization_format: str  # Example: 'json'
-
     storage_size: int
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
-
-    # This can be used when a Version is squashed
-    revision_id: Optional[int] = None
+    revision_id: Optional[int] = None  # This can be used when a Version is squashed
 
     @classmethod
     def from_draft(
@@ -93,7 +83,7 @@ class File:
         draft_file: DraftFile,
         file_id: str,
         storage_size: int,
-        storage_compression_method,
+        storage_compression_method: str,
         path: Path,
     ) -> "File":
         return cls(
@@ -112,8 +102,7 @@
         )
 
 
-@dataclass
-class LoadedFile:
+class LoadedFile(BaseModel):
     # Unique key to identify this File within a Dataset
     file_id: str
     created_at: datetime
@@ -122,24 +111,22 @@ class LoadedFile:
     size: int
     storage_size: int
     content_type: Optional[str]
-
     data_feed_key: str  # Example: 'events'
     data_spec_version: str  # Example: 'v3'
-    data_serialization_format: Optional[str]  # Example: 'gzip'
-
-    storage_size: int
+    data_serialization_format: Optional[str]  # Example: 'json'
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
+    _stream: Union[BinaryIO, Callable[[], Awaitable[BinaryIO]]]
+    revision_id: Optional[int] = None  # This can be used when a Revision is squashed
 
-    _stream: Union[BinaryIO, Callable[[], BinaryIO]]
-
-    # This can be used when a Revision is squashed
-    revision_id: Optional[int] = None
+    def load_stream(self):
+        if callable(self._stream):
+            self._stream = self._stream(self)
 
     @property
     def stream(self):
         if callable(self._stream):
-            self._stream = self._stream(self)
+            raise Exception("You should load the stream first using `load_stream`")
         return self._stream
 
 
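The stream contract of LoadedFile changed: a lazily supplied stream must now be resolved explicitly with load_stream() before .stream can be read, otherwise an exception is raised. A self-contained stand-in that mirrors the new behaviour (not the actual LoadedFile, whose other fields are omitted here):

from io import BytesIO
from typing import BinaryIO, Callable, Union

class LazyStreamExample:
    """Stand-in mirroring the new LoadedFile stream handling."""

    def __init__(self, stream: Union[BinaryIO, Callable[["LazyStreamExample"], BinaryIO]]):
        self._stream = stream

    def load_stream(self) -> None:
        # Resolve the lazy loader into a real stream, as LoadedFile.load_stream does.
        if callable(self._stream):
            self._stream = self._stream(self)

    @property
    def stream(self) -> BinaryIO:
        if callable(self._stream):
            raise Exception("You should load the stream first using `load_stream`")
        return self._stream

f = LazyStreamExample(lambda _loaded_file: BytesIO(b"{}"))
f.load_stream()         # must be called first; the FileCollection change below does this for you
print(f.stream.read())  # b'{}'
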
ingestify/domain/models/dataset/file_collection.py
@@ -3,7 +3,7 @@ from typing import Optional
 from .file import LoadedFile
 
 
-class FileCollection(dict):
+class FileCollection(dict[str, LoadedFile]):
     def __init__(self, seq, auto_rewind: bool = True, **kwargs):
         super().__init__(seq, **kwargs)
 
@@ -28,6 +28,8 @@ class FileCollection(dict):
         if should_auto_rewind is None:
             should_auto_rewind = self._auto_rewind
 
+        file.load_stream()
+
         if should_auto_rewind and file.stream.tell() > 0:
             file.stream.seek(0)
         return file
ingestify/domain/models/dataset/file_repository.py
@@ -2,16 +2,35 @@ from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import BinaryIO
 
-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .dataset import Dataset
-
-file_repository_registry = ComponentRegistry()
+from ...services.identifier_key_transformer import IdentifierTransformer
 
 
-class FileRepository(ABC, metaclass=file_repository_registry.metaclass):
-    def __init__(self, url: str):
+class FileRepository(ABC):
+    def __init__(self, url: str, identifier_transformer: IdentifierTransformer):
         self.base_dir = Path(url.split("://")[1])
+        self.identifier_transformer = identifier_transformer
+
+    def get_write_path(
+        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+    ) -> Path:
+        # TODO: use the IdentifierKeyTransformer
+        identifier_path = self.identifier_transformer.to_path(
+            provider=dataset.provider,
+            dataset_type=dataset.dataset_type,
+            identifier=dataset.identifier,
+        )
+
+        path = (
+            self.base_dir
+            / bucket
+            / f"provider={dataset.provider}"
+            / f"dataset_type={dataset.dataset_type}"
+            / identifier_path
+            / str(revision_id)
+            / filename
+        )
+        return path
 
     @abstractmethod
     def save_content(
@@ -24,10 +43,11 @@ class FileRepository(ABC, metaclass=file_repository_registry.metaclass):
     ) -> Path:
         pass
 
+    def get_read_path(self, storage_path: str) -> Path:
+        return self.base_dir / storage_path
+
     @abstractmethod
-    def load_content(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> BinaryIO:
+    def load_content(self, storage_path: str) -> BinaryIO:
         pass
 
     @classmethod
@@ -35,25 +55,6 @@ class FileRepository(ABC, metaclass=file_repository_registry.metaclass):
     def supports(cls, url: str) -> bool:
         pass
 
-    def get_path(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> Path:
-        path = (
-            self.base_dir
-            / bucket
-            / f"provider={dataset.provider}"
-            / f"dataset_type={dataset.dataset_type}"
-            / str(dataset.identifier)
-            / str(revision_id)
-            / filename
-        )
-        return path
-
     def get_relative_path(self, path: Path) -> Path:
         """Return the relative path to the base of the repository"""
         return path.relative_to(self.base_dir)
-
-
-file_repository_factory = ComponentFactory.build_factory(
-    FileRepository, file_repository_registry
-)
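get_write_path replaces the old get_path and routes the identifier through the injected IdentifierTransformer instead of str(dataset.identifier), while reads now go through get_read_path/load_content with the stored storage_path alone. A sketch of the resulting write layout, assuming the transformer renders an identifier such as {"match_id": 12345} as "match_id=12345" (the exact rendering is the transformer's choice; all values below are invented):

from pathlib import Path

base_dir = Path("/data/ingestify")  # derived from the repository URL, e.g. file:///data/ingestify
path = (
    base_dir
    / "main"                 # bucket
    / "provider=statsbomb"   # provider
    / "dataset_type=match"   # dataset_type
    / "match_id=12345"       # identifier_transformer.to_path(...) -- assumed rendering
    / "0"                    # revision_id
    / "events__v3.json"      # filename
)
print(path)  # /data/ingestify/main/provider=statsbomb/dataset_type=match/match_id=12345/0/events__v3.json
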
ingestify/domain/models/dataset/revision.py
@@ -1,17 +1,40 @@
-from dataclasses import dataclass
 from datetime import datetime
+from enum import Enum
 from typing import Dict, List
 
+from typing_extensions import TypedDict
+
 from .file import File
+from ..base import BaseModel
+
+
+class SourceType(str, Enum):
+    TASK = "TASK"
+    MANUAL = "MANUAL"
+    SQUASHED = "SQUASHED"
+
+
+class RevisionSource(TypedDict):
+    source_type: SourceType
+    source_id: str
+
+
+class RevisionState(str, Enum):
+    PENDING_VALIDATION = "PENDING_VALIDATION"
+    VALIDATING = "VALIDATING"
+    VALIDATION_FAILED = "VALIDATION_FAILED"
+    APPROVED = "APPROVED"
+    REJECTED = "REJECTED"
 
 
-@dataclass
-class Revision:
+class Revision(BaseModel):
     revision_id: int
     created_at: datetime
     description: str
     modified_files: List[File]
+    source: RevisionSource
     is_squashed: bool = False
+    state: RevisionState = RevisionState.PENDING_VALIDATION
 
     @property
     def modified_files_map(self) -> Dict[str, File]:
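Revisions now carry a source (who produced them: a task, a manual action, or a squash) and a validation state. A rough construction sketch with invented values; modified_files is left empty for brevity:

from datetime import datetime, timezone

from ingestify.domain.models.dataset.revision import (
    Revision,
    RevisionSource,
    RevisionState,
    SourceType,
)

revision = Revision(
    revision_id=0,
    created_at=datetime.now(timezone.utc),
    description="Initial load",
    modified_files=[],
    source=RevisionSource(source_type=SourceType.TASK, source_id="task-123"),
    # state is omitted and defaults to RevisionState.PENDING_VALIDATION
)
print(revision.state)  # RevisionState.PENDING_VALIDATION
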
ingestify/domain/models/event/domain_event.py
@@ -1,9 +1,13 @@
-from abc import abstractmethod, ABC
-from dataclasses import dataclass
+from abc import ABC, abstractmethod
+from datetime import datetime
+from pydantic import BaseModel, Field
 
+from ingestify.utils import utcnow
+
+
+class DomainEvent(BaseModel, ABC):
+    occurred_at: datetime = Field(default_factory=utcnow)
 
-@dataclass
-class DomainEvent(ABC):
     @property
     @abstractmethod
     def event_type(self) -> str:
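DomainEvent itself is now a pydantic model that supplies occurred_at via Field(default_factory=utcnow), while keeping the abstract event_type contract. A minimal illustrative subclass (not part of ingestify):

from typing import ClassVar

from ingestify.domain.models.event.domain_event import DomainEvent

class SomethingHappened(DomainEvent):
    event_type: ClassVar[str] = "something_happened"  # satisfies the abstract property
    subject_id: str

event = SomethingHappened(subject_id="ds-001")
print(event.event_type, event.occurred_at)  # occurred_at defaults to utcnow()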