tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -76
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/s3_file_extended_source.py +0 -39
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
- tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
- tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
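The headline change in 1.0.1 is the move from the nested `tgedr/dataops` namespace package to the flat `tgedr_dataops` package, with the abstract base classes (Sink, Source, validation) split out into a separate `tgedr_dataops_abs` distribution and the Spark/Delta-specific modules (spark_delta.py, delta_table_source.py, utils_spark.py, etc.) dropped entirely. A minimal sketch of the import migration, assuming the old module paths mirrored the 0.0.37 file layout above (the old symbols are hypothetical; the new ones appear in the diffs below):

# before (0.0.37) -- nested layout, removed in 1.0.1 (hypothetical symbols,
# inferred from the file list above)
# from tgedr.dataops.sink.sink import Sink
# from tgedr.dataops.commons.utils_fs import remove_s3_protocol

# after (1.0.1) -- flat package, abstractions now in tgedr_dataops_abs
from tgedr_dataops.commons.s3_connector import S3Connector
from tgedr_dataops.commons.utils_fs import remove_s3_protocol
from tgedr_dataops_abs.sink import Sink, SinkException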
{tgedr/dataops → tgedr_dataops}/commons/s3_connector.py

@@ -1,17 +1,30 @@
+"""AWS S3 connector module.
+
+This module provides the S3Connector class for establishing and managing
+connections to AWS S3 resources.
+"""
 import os
 import boto3


 class S3Connector:
-    """utility base class to be extended, providing a connection session with aws s3 resources"""
+    """utility base class to be extended, providing a connection session with aws s3 resources."""

-    def __init__(self):
+    def __init__(self) -> None:
+        """Initialize the S3Connector with empty session, resource, and client attributes."""
         self.__resource = None
         self.__session = None
         self.__client = None

     @property
-    def _session(self):
+    def _session(self) -> boto3.Session:
+        """Get or create a boto3 session.
+
+        Returns
+        -------
+        boto3.Session
+            The boto3 session instance, using credentials if configured.
+        """
         if self.__session is None:
             if "1" == os.getenv("S3_CONNECTOR_USE_CREDENTIALS", default="0"):
                 self.__session = boto3.Session(

@@ -26,13 +39,27 @@ class S3Connector:
         return self.__session

     @property
-    def _resource(self):
+    def _resource(self) -> boto3.resources.base.ServiceResource:
+        """Get or create an S3 resource.
+
+        Returns
+        -------
+        boto3.resources.base.ServiceResource
+            The boto3 S3 resource instance.
+        """
         if self.__resource is None:
             self.__resource = self._session.resource("s3")
         return self.__resource

     @property
-    def _client(self):
+    def _client(self) -> boto3.client:
+        """Get or create an S3 client.
+
+        Returns
+        -------
+        boto3.client
+            The boto3 S3 client instance.
+        """
         if self.__client is None:
             self.__client = self._session.client("s3")
         return self.__client
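For orientation, a hedged sketch of how the refactored connector is meant to be used: `_session`, `_resource`, and `_client` are built lazily and cached, and the session honors the `S3_CONNECTOR_USE_CREDENTIALS` environment variable (the credential plumbing itself falls outside the shown hunks). `BucketLister` and its method are hypothetical:

from tgedr_dataops.commons.s3_connector import S3Connector


class BucketLister(S3Connector):
    """Hypothetical subclass; the connector is documented as a base class to extend."""

    def keys(self, bucket: str, prefix: str) -> list[str]:
        # _client is created on first access and cached on the instance
        response = self._client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        return [entry["Key"] for entry in response.get("Contents", [])]


# With S3_CONNECTOR_USE_CREDENTIALS=1 the session is built from configured
# credentials (per the hunk above); the default "0" uses boto3's ambient chain.
print(BucketLister().keys("my-bucket", "some/prefix/"))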
tgedr_dataops/commons/utils_fs.py (new file)

@@ -0,0 +1,187 @@
+"""Filesystem utility functions for file and path operations.
+
+This module provides utilities for:
+- Creating temporary files and directories
+- Parsing and processing S3 URLs and paths
+- Extracting URL protocols
+- Generating file hashes
+"""
+import tempfile
+import re
+import hashlib
+from pathlib import Path
+
+
+def temp_dir(root: str | None = None, suffix: str | None = None, prefix: str | None = None) -> str:
+    """Create a temporary directory and return its path.
+
+    Parameters
+    ----------
+    root : str | None
+        Directory where the temporary directory will be created.
+    suffix : str | None
+        Suffix for the temporary directory name.
+    prefix : str | None
+        Prefix for the temporary directory name.
+
+    Returns
+    -------
+    str
+        Path to the created temporary directory.
+    """
+    return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=root)
+
+
+def temp_file(
+    root: str | None = None, suffix: str | None = None, prefix: str | None = None,
+    discard_handle: bool = True
+) -> str | tuple[int, str]:
+    """Create a temporary file and return its path or handle and path.
+
+    Parameters
+    ----------
+    root : str | None
+        Directory where the temporary file will be created.
+    suffix : str | None
+        Suffix for the temporary file name.
+    prefix : str | None
+        Prefix for the temporary file name.
+    discard_handle : bool
+        If True, return only the file path. If False, return tuple of (handle, path).
+
+    Returns
+    -------
+    str | tuple[int, str]
+        File path if discard_handle is True, otherwise tuple of (file handle, file path).
+    """
+    h, f = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=root)
+    if discard_handle:
+        return f
+    return (h, f)
+
+
+def resolve_url_protocol(url: str) -> str:
+    """Extract the protocol from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The URL to extract the protocol from.
+
+    Returns
+    -------
+    str
+        The protocol (e.g., 'http://', 'https://') or None if no protocol is found.
+    """
+    result = None
+    group_match = re.search("(.*://).*", url)
+    if group_match is not None:
+        result = group_match.group(1)
+    return result
+
+
+def resolve_s3_protocol(url: str) -> str:
+    """Extract the S3 protocol from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The URL to extract the S3 protocol from.
+
+    Returns
+    -------
+    str
+        The S3 protocol (e.g., 's3://', 's3a://') or None if no S3 protocol is found.
+    """
+    result = None
+    group_match = re.search("(s3[a]?://).*", url)
+    if group_match is not None:
+        result = group_match.group(1)
+    return result
+
+
+def remove_s3_protocol(url: str) -> str:
+    """Remove the S3 protocol prefix from a URL.
+
+    Parameters
+    ----------
+    url : str
+        The S3 URL to remove the protocol from.
+
+    Returns
+    -------
+    str
+        The URL without the S3 protocol prefix (s3:// or s3a://).
+    """
+    if url.startswith("s3://"):
+        result = url[5:]
+    elif url.startswith("s3a://"):
+        result = url[6:]
+    else:
+        result = url
+    return result
+
+
+def process_s3_path(path: str) -> tuple[str, str]:
+    """Extract bucket and key from an S3 path.
+
+    Parameters
+    ----------
+    path : str
+        The S3 path to process (with or without protocol).
+
+    Returns
+    -------
+    tuple[str, str]
+        A tuple containing (bucket, key).
+    """
+    no_protocol_path = remove_s3_protocol(path)
+    path_elements = no_protocol_path.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return (bucket, key)
+
+
+def process_s3_url(url: str) -> tuple[str, str, str]:
+    """Extract protocol, bucket, and key from an S3 URL.
+
+    Parameters
+    ----------
+    url : str
+        The S3 URL to process.
+
+    Returns
+    -------
+    tuple[str, str, str]
+        A tuple containing (protocol, bucket, key).
+    """
+    protocol = resolve_s3_protocol(url)
+    no_protocol_url = remove_s3_protocol(url)
+    path_elements = no_protocol_url.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return ("" if protocol is None else protocol, bucket, key)
+
+
+def hash_file(filepath: str, hash_func=hashlib.sha256) -> str:  # noqa: ANN001
+    """Generate a hash for a file.
+
+    Args:
+        filepath (str): The path to the file.
+        hash_func: A hashlib hash function, e.g., hashlib.md5().
+
+    Returns:
+        str: The hexadecimal hash string of the file.
+    """
+    # Initialize the hash object
+    hasher = hash_func()
+
+    # Open the file in binary read mode
+    with Path(filepath).open("rb") as file:
+        # Read the file in chunks to avoid using too much memory
+        chunk_size = 8192
+        while chunk := file.read(chunk_size):
+            hasher.update(chunk)
+
+    # Return the hexadecimal digest of the hash
+    return hasher.hexdigest()
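A short usage sketch of the new helpers, grounded in the definitions above (bucket and key names are illustrative):

import hashlib

from tgedr_dataops.commons.utils_fs import (
    hash_file,
    process_s3_url,
    remove_s3_protocol,
    temp_file,
)

path = temp_file(suffix=".txt")  # discard_handle=True returns only the path;
                                 # note the mkstemp fd is discarded, not closed
print(hash_file(path))  # sha256 hex digest (the default hash_func)
print(hash_file(path, hash_func=hashlib.md5))  # alternate algorithm

print(remove_s3_protocol("s3a://my-bucket/raw/part.parquet"))  # -> "my-bucket/raw/part.parquet"
print(process_s3_url("s3://my-bucket/raw/part.parquet"))  # -> ("s3://", "my-bucket", "raw/part.parquet")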
tgedr_dataops/quality/pandas_validation.py (new file)

@@ -0,0 +1,21 @@
+"""Pandas DataFrame validation implementation module.
+
+This module provides the Pandas-specific implementation of Great Expectations validation.
+"""
+from great_expectations.execution_engine import ExecutionEngine
+from great_expectations.execution_engine import PandasExecutionEngine
+from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
+
+
+class PandasValidation(GreatExpectationsValidation):
+    """Pandas DataFrame validation implementation."""
+
+    def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
+        """Get the execution engine used by the validation implementation.
+
+        Returns
+        -------
+        ExecutionEngine
+            The execution engine instance.
+        """
+        return PandasExecutionEngine(batch_data_dict=batch_data_dict)
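The public validation API lives in `tgedr_dataops_abs.great_expectations_validation`, which this diff does not show; all the subclass contributes is the engine factory. A minimal sketch exercising just that factory, assuming the base class can be constructed without arguments:

import pandas as pd

from tgedr_dataops.quality.pandas_validation import PandasValidation

# assumption: GreatExpectationsValidation needs no constructor arguments
validation = PandasValidation()
engine = validation._get_execution_engine(
    batch_data_dict={"my_batch": pd.DataFrame({"x": [1, 2, 3]})}
)
print(type(engine).__name__)  # PandasExecutionEngine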
tgedr_dataops/sink/local_fs_file_sink.py (new file)

@@ -0,0 +1,77 @@
+"""Local filesystem sink implementation for persisting files.
+
+This module provides:
+- LocalFsFileSink: sink class for saving/persisting files to local filesystem.
+"""
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Any
+
+from tgedr_dataops_abs.sink import Sink, SinkException
+
+
+logger = logging.getLogger(__name__)
+
+
+class LocalFsFileSink(Sink):
+    """sink class used to save/persist an object/file to a local fs location."""
+
+    CONTEXT_SOURCE_PATH = "source"
+    CONTEXT_TARGET_PATH = "target"
+
+    def put(self, context: dict[str, Any] | None = None) -> Any:
+        """Copy a file from source to target on local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' and 'target' paths.
+
+        Raises
+        ------
+        SinkException
+            If source or target context is missing.
+        """
+        logger.info(f"[put|in] ({context})")
+
+        if self.CONTEXT_SOURCE_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_SOURCE_PATH}")
+        if self.CONTEXT_TARGET_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+        source = context[self.CONTEXT_SOURCE_PATH]
+        target = context[self.CONTEXT_TARGET_PATH]
+
+        shutil.copy(source, target)
+        logger.info("[put|out]")
+
+    def delete(self, context: dict[str, Any] | None = None) -> None:
+        """Delete a file or directory from local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'target' path.
+
+        Raises
+        ------
+        SinkException
+            If target context is missing or target is neither file nor directory.
+        """
+        logger.info(f"[delete|in] ({context})")
+
+        if self.CONTEXT_TARGET_PATH not in context:
+            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
+
+        target = Path(context[self.CONTEXT_TARGET_PATH])
+
+        if target.is_file():
+            target.unlink()
+        elif target.is_dir():
+            shutil.rmtree(target)
+        else:
+            raise SinkException(f"[delete] is it a dir or a folder? {target}")
+
+        logger.info("[delete|out]")
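A hedged usage sketch (paths are illustrative; the `config` keyword is inferred from the `Sink.__init__(self, config=config)` call visible in the S3 sink below):

from tgedr_dataops.sink.local_fs_file_sink import LocalFsFileSink

sink = LocalFsFileSink(config=None)  # assumption: the Sink base accepts an optional config
sink.put(context={"source": "/tmp/report.csv", "target": "/data/inbox/report.csv"})  # shutil.copy
sink.delete(context={"target": "/data/inbox/report.csv"})  # unlinks a file, rmtrees a dir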
{tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py

@@ -1,26 +1,50 @@
+"""S3 sink implementation for persisting files to S3.
+
+This module provides:
+- S3FileSink: sink class for saving/persisting files to S3 buckets.
+"""
 import logging
-import …
-from typing import Any
-from …
+from pathlib import Path
+from typing import Any
+from tgedr_dataops.commons.s3_connector import S3Connector

-from …
-from …
+from tgedr_dataops_abs.sink import Sink, SinkException
+from tgedr_dataops.commons.utils_fs import remove_s3_protocol


 logger = logging.getLogger(__name__)


 class S3FileSink(Sink, S3Connector):
-    """sink class used to save/persist a local object/file to an s3 bucket"""
+    """sink class used to save/persist a local object/file to an s3 bucket."""

     CONTEXT_SOURCE_PATH = "source"
     CONTEXT_TARGET_PATH = "target"

-    def __init__(self, config: …
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3FileSink.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary.
+        """
         Sink.__init__(self, config=config)
         S3Connector.__init__(self)

-    def put(self, context: …
+    def put(self, context: dict[str, Any] | None = None) -> Any:
+        """Upload a local file to S3 bucket.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' (local file) and 'target' (S3 URL) paths.
+
+        Raises
+        ------
+        SinkException
+            If source or target context is missing, or if source is a directory.
+        """
         logger.info(f"[put|in] ({context})")

         if self.CONTEXT_SOURCE_PATH not in context:

@@ -29,7 +53,7 @@ class S3FileSink(Sink, S3Connector):
             raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")

         source = context[self.CONTEXT_SOURCE_PATH]
-        if …
+        if Path(source).is_dir():
             raise SinkException("source can't be a folder, must be a file")

         target = remove_s3_protocol(context[self.CONTEXT_TARGET_PATH])

@@ -38,7 +62,7 @@ class S3FileSink(Sink, S3Connector):
         target_key = "/".join(target_elements[1:])

         if target_key.endswith("/"):
-            target_file = …
+            target_file = Path(source).name
             target_key = target_key + target_file

         logger.info(f"[put] uploading {source} to key: {target_key} in bucket: {target_bucket}")

@@ -46,7 +70,19 @@ class S3FileSink(Sink, S3Connector):

         logger.info("[put|out]")

-    def delete(self, context: …
+    def delete(self, context: dict[str, Any] | None = None) -> None:
+        """Delete an object from S3 bucket.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'target' (S3 URL) path.
+
+        Raises
+        ------
+        SinkException
+            If target context is missing.
+        """
         logger.info(f"[delete|in] ({context})")

         if self.CONTEXT_TARGET_PATH not in context:
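A hedged usage sketch of the reworked sink; the notable behavior in the hunks above is that a target key ending in "/" is completed with the source file's basename:

from tgedr_dataops.sink.s3_file_sink import S3FileSink

sink = S3FileSink()
# target ends with "/", so the key becomes "raw/report.csv" in bucket "my-bucket"
sink.put(context={"source": "/tmp/report.csv", "target": "s3://my-bucket/raw/"})
sink.delete(context={"target": "s3://my-bucket/raw/report.csv"})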
tgedr_dataops/source/abstract_s3_file_source.py (new file)

@@ -0,0 +1,72 @@
+"""Abstract base class for S3 file sources.
+
+This module provides:
+- AbstractS3FileSource: abstract class for reading file sources from S3.
+"""
+from abc import ABC
+import logging
+from typing import Any
+
+from tgedr_dataops.commons.s3_connector import S3Connector
+from tgedr_dataops.commons.utils_fs import process_s3_url
+from tgedr_dataops_abs.source import Source, SourceException
+
+
+logger = logging.getLogger()
+
+
+class AbstractS3FileSource(Source, S3Connector, ABC):
+    """abstract class used to read file sources from s3."""
+
+    CONTEXT_KEY_URL = "url"
+    CONTEXT_KEY_SUFFIX = "suffix"
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3 file source.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary for the source.
+        """
+        Source.__init__(self, config=config)
+        S3Connector.__init__(self)
+
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """List objects in the S3 bucket.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' S3 URL.
+
+        Returns
+        -------
+        list[str]
+            List of S3 object keys in the source bucket/prefix.
+
+        Raises
+        ------
+        SourceException
+            If source context is missing.
+        """
+        logger.info(f"[list|in] ({context})")
+
+        result: list[str] = []
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        objs = self._client.list_objects_v2(Bucket=bucket, Prefix=key)
+        result = [
+            (protocol + bucket + "/" + entry["Key"]) for entry in objs["Contents"] if not (entry["Key"]).endswith("/")
+        ]
+
+        if self.CONTEXT_KEY_SUFFIX in context:
+            suffix: str = context[self.CONTEXT_KEY_SUFFIX]
+            result = [f for f in result if f.endswith(suffix)]
+
+        logger.debug(f"[list|out] => {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
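Since the class is abstract, it is consumed by subclassing; a minimal sketch assuming the `Source` base (from `tgedr_dataops_abs`, not shown in this diff) leaves `get` to be implemented:

from typing import Any

from tgedr_dataops.source.abstract_s3_file_source import AbstractS3FileSource


class MyS3Source(AbstractS3FileSource):
    """Hypothetical subclass: inherits list(), supplies a stub get()."""

    def get(self, context: dict[str, Any] | None = None) -> Any:
        raise NotImplementedError


# entries come back as "<protocol><bucket>/<key>", filtered by the optional suffix
src = MyS3Source()
print(src.list(context={"url": "s3://my-bucket/landing/", "suffix": ".csv"}))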
tgedr_dataops/source/local_fs_file_source.py (new file)

@@ -0,0 +1,108 @@
+"""Local filesystem source implementation for file operations.
+
+This module provides:
+- LocalFsFileSource: A source class for retrieving and listing local files.
+"""
+import logging
+import shutil
+from pathlib import Path
+from typing import Any
+
+from tgedr_dataops_abs.source import Source, SourceException
+
+
+logger = logging.getLogger(__name__)
+
+
+class LocalFsFileSource(Source):
+    """source class used to retrieve local objects/files to a another local fs location."""
+
+    CONTEXT_KEY_SOURCE = "source"
+    CONTEXT_KEY_TARGET = "target"
+    CONTEXT_KEY_SUFFIX = "suffix"
+    CONTEXT_KEY_FILES = "files"
+
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """List files in the local filesystem directory.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' path.
+
+        Returns
+        -------
+        list[str]
+            List of file paths in the source directory.
+
+        Raises
+        ------
+        SourceException
+            If source context is missing or if source path is not a directory.
+        """
+        logger.info(f"[list|in] ({context})")
+        result: list[str] = []
+        if self.CONTEXT_KEY_SOURCE not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
+
+        source = context[self.CONTEXT_KEY_SOURCE]
+        source_path = Path(source)
+        if source_path.is_dir():
+            suffix = None
+            if self.CONTEXT_KEY_SUFFIX in context:
+                suffix = context[self.CONTEXT_KEY_SUFFIX]
+                result: list[str] = [str(file) for file in source_path.iterdir() if file.name.endswith(suffix)]
+            else:
+                result: list[str] = [str(file) for file in source_path.iterdir()]
+        elif source_path.is_file():
+            result: list[str] = [source]
+
+        logger.debug(f"[list|out] => {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
+
+    def get(self, context: dict[str, Any] | None = None) -> Any:
+        """Retrieve file(s) from local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' path.
+
+        Returns
+        -------
+        Any
+            List of file paths if source is a directory, or single file path if source is a file.
+
+        Raises
+        ------
+        SourceException
+            If source context is missing.
+        """
+        logger.info(f"[get|in] ({context})")
+
+        if self.CONTEXT_KEY_FILES not in context or self.CONTEXT_KEY_TARGET not in context:
+            raise SourceException(f"{self.CONTEXT_KEY_FILES} and {self.CONTEXT_KEY_TARGET} must be provided in config")
+        files = context[self.CONTEXT_KEY_FILES]
+        target = context[self.CONTEXT_KEY_TARGET]
+
+        if "list" != type(files).__name__:
+            if "str" == type(files).__name__:
+                files = [files]
+            else:
+                raise SourceException("files argument must be a list of strings or a string")
+
+        target_is_dir: bool = False
+        if Path(target).is_dir():
+            target_is_dir = True
+
+        result: list[str] = []
+
+        for file in files:
+            basename = Path(file).name
+            new_file = str(Path(target) / basename) if target_is_dir else target
+            shutil.copy(file, new_file)
+            result.append(new_file)
+
+        logger.info("[get|out] => {result}")
+        return result