tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.2.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.2.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.2.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.2.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -76
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/s3_file_extended_source.py +0 -39
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
- tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
- tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.2.dist-info/licenses}/LICENSE +0 -0
--- /dev/null
+++ tgedr_dataops/source/pd_df_s3_source.py
@@ -0,0 +1,130 @@
+"""Module for reading pandas DataFrames from S3 sources.
+
+This module provides:
+- PdDfS3Source: class for reading CSV and Excel files from S3 into pandas DataFrames.
+"""
+from io import StringIO
+import logging
+from typing import Any, ClassVar
+import pandas as pd
+
+from tgedr_dataops.commons.utils_fs import process_s3_url
+from tgedr_dataops.source.abstract_s3_file_source import AbstractS3FileSource
+from tgedr_dataops_abs.source import SourceException
+
+logger = logging.getLogger()
+
+
+class PdDfS3Source(AbstractS3FileSource):
+    """class used to read a pandas dataframe from a csv file in s3."""
+
+    CONTEXT_KEY_FILE_FORMAT = "file_format"
+    CONTEXT_KEY_SEPARATOR = "sep"
+    CONTEXT_KEY_NO_HEADER = "no_header"
+    CONTEXT_KEY_COLUMN_NAMES = "column_names"
+    CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
+    DEFAULT_FORMAT = "csv"
+    FORMATS: ClassVar[list[str]] = ["csv", "xlsx"]
+    DEFAULT_SEPARATOR = ","
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the PdDfS3Source with optional configuration.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary for S3 connection, by default None.
+        """
+        super().__init__(config=config)
+
+    def get(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+        """Retrieve and load a pandas DataFrame from S3.
+
+        Supports CSV and Excel formats.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing:
+            - 'source' or CONTEXT_KEY_URL: S3 URL
+            - 'file_format': File format (csv or xlsx), defaults to csv
+            - 'sep': CSV separator, defaults to comma
+            - 'no_header': If present, CSV has no header row
+            - 'column_names': List of column names for CSV
+            - 'schema_types': Dictionary of column types for CSV
+
+        Returns
+        -------
+        pd.DataFrame
+            The loaded pandas DataFrame.
+
+        Raises
+        ------
+        SourceException
+            If source URL is missing or if file_format is unsupported.
+        """
+        logger.info(f"[get|in] ({context})")
+        result: pd.DataFrame = None
+
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        _format: str = self.DEFAULT_FORMAT
+        if self.CONTEXT_KEY_FILE_FORMAT in context:
+            _format = context[self.CONTEXT_KEY_FILE_FORMAT]
+            if _format not in self.FORMATS:
+                raise SourceException(f"[get] invalid format: {_format}")
+
+        result = self.__read_csv(context=context) if "csv" == _format else self.__read_excel(context=context)
+
+        logger.info(f"[get|out] => {result}")
+        return result
+
+    def __read_csv(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+        """Read a CSV file from S3 into a pandas DataFrame.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary with S3 URL and CSV reading options.
+
+        Returns
+        -------
+        pd.DataFrame
+            The loaded DataFrame from CSV.
+        """
+        logger.info(f"[__read_csv|in] ({context})")
+
+        _, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        obj = self._client.get_object(Bucket=bucket, Key=key)
+        data = obj["Body"].read().decode("utf-8")
+
+        header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
+        names = context.get(self.CONTEXT_KEY_COLUMN_NAMES, None)
+        dtype = context.get(self.CONTEXT_KEY_SCHEMA_TYPES, None)
+        sep = context.get(self.CONTEXT_KEY_SEPARATOR, self.DEFAULT_SEPARATOR)
+
+        result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
+
+        logger.info(f"[__read_csv|out] => {result}")
+        return result
+
+    def __read_excel(self, context: dict[str, Any] | None = None) -> pd.DataFrame:
+        """Read an Excel file from S3 into a pandas DataFrame.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing S3 URL.
+
+        Returns
+        -------
+        pd.DataFrame
+            The loaded DataFrame from Excel.
+        """
+        logger.info(f"[__read_excel|in] ({context})")
+        src = context[self.CONTEXT_KEY_URL]
+        result: pd.DataFrame = pd.read_excel(src, engine="openpyxl")
+        logger.info(f"[__read_excel|out] => {result}")
+        return result
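For orientation, a minimal usage sketch of the new PdDfS3Source (all bucket and object names below are hypothetical, and the 'source' context key is assumed to be what the inherited CONTEXT_KEY_URL resolves to, as the docstring above suggests):

    from tgedr_dataops.source.pd_df_s3_source import PdDfS3Source

    source = PdDfS3Source()
    # headerless, pipe-separated CSV with explicit column names and dtypes
    df = source.get(
        context={
            "source": "s3://some-bucket/data/items.csv",  # hypothetical URL
            "file_format": "csv",
            "sep": "|",
            "no_header": True,  # key presence is what matters, not its value
            "column_names": ["id", "name"],
            "schema_types": {"id": str, "name": str},
        }
    )

Two behaviors worth noting in the code above: 'no_header' is tested by key presence alone, so even a value of False suppresses the header row, and a 'file_format' outside ["csv", "xlsx"] raises SourceException.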
--- tgedr/dataops/source/s3_file_copy.py
+++ tgedr_dataops/source/s3_file_copy.py
@@ -1,17 +1,22 @@
+"""Module for copying files between S3 buckets.
+
+This module provides:
+- S3FileCopy: class for copying objects/files from one S3 bucket to another
+"""
 import logging
-import
-from typing import Any
-from
+from pathlib import Path
+from typing import Any
+from tgedr_dataops.commons.s3_connector import S3Connector

-from
-from
+from tgedr_dataops_abs.source import Source, SourceException
+from tgedr_dataops.commons.utils_fs import process_s3_path, resolve_s3_protocol


 logger = logging.getLogger(__name__)


 class S3FileCopy(Source, S3Connector):
-    """class used to copy objects/files from an s3 bucket to another s3 bucket"""
+    """class used to copy objects/files from an s3 bucket to another s3 bucket."""

     CONTEXT_KEY_SOURCE = "source"
     CONTEXT_KEY_TARGET = "target"
@@ -19,14 +24,38 @@ class S3FileCopy(Source, S3Connector):
     CONTEXT_KEY_SUFFIX = "suffix"
     CONTEXT_KEY_PRESERVE_SOURCE_KEY = "preserve_source_key"

-    def __init__(self, config:
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3FileCopy source.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary containing optional AWS credentials.
+        """
         Source.__init__(self, config=config)
         S3Connector.__init__(self)

-    def list(self, context:
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """List objects in an S3 bucket matching the given prefix.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' (source S3 URL) and optionally 'suffix' to filter results.
+
+        Returns
+        -------
+        list[str]
+            List of S3 URLs matching the criteria.
+
+        Raises
+        ------
+        SourceException
+            If the 'source' key is missing from the context.
+        """
         logger.info(f"[list|in] ({context})")

-        result:
+        result: list[str] = []
         if self.CONTEXT_KEY_SOURCE not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")

@@ -48,10 +77,27 @@ class S3FileCopy(Source, S3Connector):
         logger.info(f"[list|out] => result len: {len(result)}")
         return result

-    def get(self, context:
+    def get(self, context: dict[str, Any] | None = None) -> Any:
+        """Copy a file from one S3 location to another.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' (source S3 URL) and 'target' (target S3 URL).
+
+        Returns
+        -------
+        Any
+            The target S3 URL.
+
+        Raises
+        ------
+        SourceException
+            If source or target context is missing.
+        """
         logger.info(f"[get|in] ({context})")

-        result:
+        result: list[str] = []
         if self.CONTEXT_KEY_FILES not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_FILES}")
         if self.CONTEXT_KEY_TARGET not in context:
@@ -59,9 +105,7 @@ class S3FileCopy(Source, S3Connector):

         preserve_source_key = False
         if self.CONTEXT_KEY_PRESERVE_SOURCE_KEY in context:
-            preserve_source_key = (
-                True if (str(context[self.CONTEXT_KEY_PRESERVE_SOURCE_KEY]).lower() in ["1", "true"]) else False
-            )
+            preserve_source_key = str(context[self.CONTEXT_KEY_PRESERVE_SOURCE_KEY]).lower() in ["1", "true"]
         logger.info(f"[get] preserve_source_key: {preserve_source_key}")

         target_bucket, target_key = process_s3_path(context[self.CONTEXT_KEY_TARGET])
@@ -69,31 +113,23 @@ class S3FileCopy(Source, S3Connector):
         protocol = "" if s3_protocol is None else s3_protocol

         files = context[self.CONTEXT_KEY_FILES]
-        multiple_files: bool = True if (0 < len(files)) else False
-
         for file in files:
             src_bucket, src_key = process_s3_path(file)
-
             src = {"Bucket": src_bucket, "Key": src_key}

-            if
-
+            target_key = target_key.rstrip("/") if target_key.endswith("/") else target_key
+            if preserve_source_key:
+                key = str(Path(target_key) / src_key)
             else:
-
-
-                if target_key.endswith("/"):
-                    src_key_leaf = os.path.basename(src_key)
-                    key += src_key_leaf
-                else:
-                    src_key_leaf = os.path.basename(src_key)
-                    key = os.path.join(key, src_key_leaf)
+                src_key_leaf = Path(src_key).name
+                key = str(Path(target_key) / src_key_leaf)

             logger.info(
                 f"[get] copying... src bucket: {src_bucket} src key: {src_key} target bucket: {target_bucket} target key: {key}"
             )
             self._client.copy(src, target_bucket, key)

-            result.append(
+            result.append(str(Path(protocol) / target_bucket / key))

         logger.info(f"[get|out] => {result}")
         return result
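A hedged sketch of driving the reworked copy flow end to end, with hypothetical bucket names: list the source objects first, then pass them to get() as 'files' alongside a 'target' URL:

    from tgedr_dataops.source.s3_file_copy import S3FileCopy

    copier = S3FileCopy()
    files = copier.list(context={"source": "s3://src-bucket/raw/", "suffix": ".parquet"})
    copied = copier.get(
        context={
            "files": files,
            "target": "s3://dst-bucket/landing/",
            "preserve_source_key": "true",  # keep each full source key under the target prefix
        }
    )

One caveat in the new result handling: the returned URLs are built with str(Path(protocol) / target_bucket / key), and pathlib collapses consecutive slashes, so if resolve_s3_protocol yields "s3://" the entries come back as "s3:/bucket/key" rather than "s3://bucket/key".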
--- /dev/null
+++ tgedr_dataops/source/s3_file_extended_source.py
@@ -0,0 +1,68 @@
+"""S3 file extended source module for retrieving objects from S3 with metadata support.
+
+This module provides:
+- S3FileExtendedSource: class for retrieving S3 objects/files with metadata extraction
+"""
+import logging
+from typing import Any, ClassVar
+
+from tgedr_dataops.source.s3_file_source import S3FileSource
+from tgedr_dataops_abs.source import SourceException
+from tgedr_dataops.commons.utils_fs import process_s3_path
+
+
+logger = logging.getLogger(__name__)
+
+
+class S3FileExtendedSource(S3FileSource):
+    """Class used to retrieve objects/files from S3 bucket to local filesystem."""
+
+    METADATA_KEYS: ClassVar[list[str]] = ["LastModified", "ContentLength", "ETag", "VersionId", "ContentType"]
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3FileExtendedSource.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary containing optional AWS credentials.
+        """
+        super().__init__(config=config)
+
+    def get_metadata(self, context: dict[str, Any] | None = None) -> dict[str, Any]:
+        """Retrieve metadata for an S3 object.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' S3 URL.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary containing object metadata including size, last modified time, etc.
+
+        Raises
+        ------
+        SourceException
+            If source context is missing.
+        """
+        logger.info(f"[get_metadata|in] ({context})")
+
+        result: dict[str, Any] = {}
+        if self.CONTEXT_KEY_SOURCE not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
+
+        bucket, key = process_s3_path(context[self.CONTEXT_KEY_SOURCE])
+
+        o = self._client.head_object(Bucket=bucket, Key=key)
+
+        for key in list(o.keys()):
+            if key in self.METADATA_KEYS:
+                if key == "LastModified":
+                    result[key] = int(o[key].timestamp())
+                else:
+                    result[key] = o[key]
+
+        logger.info(f"[get_metadata|out] => result len: {len(result)}")
+        return result
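A short sketch of the new metadata call (the URL is hypothetical). Only the whitelisted METADATA_KEYS are returned, and LastModified is flattened to a unix timestamp:

    from tgedr_dataops.source.s3_file_extended_source import S3FileExtendedSource

    src = S3FileExtendedSource()
    meta = src.get_metadata(context={"source": "s3://some-bucket/data/items.csv"})
    # illustrative result shape:
    # {"LastModified": 1718000000, "ContentLength": 1024,
    #  "ETag": '"d41d8cd9..."', "ContentType": "text/csv"}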
--- tgedr/dataops/source/s3_file_source.py
+++ tgedr_dataops/source/s3_file_source.py
@@ -1,31 +1,60 @@
+"""S3 file source module for retrieving objects from S3 buckets.
+
+This module provides:
+- S3FileSource: A source implementation for listing and downloading files from S3.
+"""
 import logging
-import
-from typing import Any
-from
+from pathlib import Path
+from typing import Any
+from tgedr_dataops.commons.s3_connector import S3Connector

-from
-from
+from tgedr_dataops_abs.source import Source, SourceException
+from tgedr_dataops.commons.utils_fs import remove_s3_protocol, resolve_s3_protocol


 logger = logging.getLogger(__name__)


 class S3FileSource(Source, S3Connector):
-    """class used to retrieve objects/files from s3 bucket to local fs location"""
+    """class used to retrieve objects/files from s3 bucket to local fs location."""

     CONTEXT_KEY_SOURCE = "source"
     CONTEXT_KEY_TARGET = "target"
     CONTEXT_KEY_FILES = "files"
     CONTEXT_KEY_SUFFIX = "suffix"

-    def __init__(self, config:
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3FileSource.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary containing optional AWS credentials.
+        """
         Source.__init__(self, config=config)
         S3Connector.__init__(self)

-    def list(self, context:
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """List files in the S3 bucket.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' (S3 URL) and optionally 'suffix' for filtering.
+
+        Returns
+        -------
+        list[str]
+            List of S3 object URLs in the source bucket/prefix.
+
+        Raises
+        ------
+        SourceException
+            If source context is missing.
+        """
         logger.info(f"[list|in] ({context})")

-        result:
+        result: list[str] = []
         if self.CONTEXT_KEY_SOURCE not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")

@@ -50,42 +79,49 @@ class S3FileSource(Source, S3Connector):
         logger.info(f"[list|out] => result len: {len(result)}")
         return result

-    def get(self, context:
+    def get(self, context: dict[str, Any] | None = None) -> Any:
+        """Download a file from S3 to local filesystem.
+
+        Parameters
+        ----------
+        context : dict[str, Any], optional
+            Context dictionary containing 'source' (S3 URL) and 'target' (local path).
+
+        Returns
+        -------
+        Any
+            The local target file path.
+
+        Raises
+        ------
+        SourceException
+            If source or target context is missing.
+        """
         logger.info(f"[get|in] ({context})")

-        result:
+        result: list[str] = []
         if self.CONTEXT_KEY_FILES not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_FILES}")
         if self.CONTEXT_KEY_TARGET not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_TARGET}")

-        preserve_structure = True
-
         files = context[self.CONTEXT_KEY_FILES]
         target = context[self.CONTEXT_KEY_TARGET]

-        target_is_dir: bool =
-        if
-            target_is_dir = True
+        target_is_dir: bool = Path(target).is_dir()
+        target = target.rstrip("/") if target.endswith("/") else target

         for file in files:
             path_elements = remove_s3_protocol(file).split("/")
             bucket = path_elements[0]
             key = "/".join(path_elements[1:])
             filename = path_elements[-1]
-
-            if target_is_dir:
-                if preserve_structure:
-                    local_file = os.path.join(target, key)
-                else:
-                    local_file = os.path.join(target, filename)
-            else:
-                local_file = target
+            local_file = str(Path(target) / filename) if target_is_dir else target

             # assure we have that path there
-            local_folder =
-            if not
-
+            local_folder = Path(local_file).parent
+            if not local_folder.is_dir():
+                local_folder.mkdir(parents=True)

             logger.info(f"[get] bucket: {bucket} key: {key} file: {file} local_file: {local_file}")
             self._client.download_file(Bucket=bucket, Key=key, Filename=local_file)