tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -76
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/s3_file_extended_source.py +0 -39
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
- tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
- tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
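The renames above move every module out of the `tgedr/dataops` namespace package and into a flat `tgedr_dataops` package, consistent with the jump from 0.0.37 to 1.0.1, so import paths change accordingly. A minimal migration sketch, assuming the class names advertised in the 1.0.1 README below (e.g. `S3FileSource`, `LocalFsSinglePartitionParquetStore`) are exported unchanged:

```python
# Hypothetical import-path migration implied by the file renames above.

# tgedr-dataops 0.0.37 -- modules lived under the "tgedr.dataops" namespace package:
# from tgedr.dataops.source.s3_file_source import S3FileSource

# tgedr-dataops 1.0.1 -- modules live under the flat "tgedr_dataops" package:
from tgedr_dataops.source.s3_file_source import S3FileSource
from tgedr_dataops.store.local_fs_single_partition_parquet import LocalFsSinglePartitionParquetStore
```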
tgedr_dataops/store/fs_single_partition_parquet.py
@@ -0,0 +1,331 @@
+"""Filesystem-based single partition Parquet store implementation.
+
+This module provides an abstract base class for storing and retrieving data
+in Parquet format with optional single partition support across different
+filesystem implementations (local, S3, etc.).
+"""
+from abc import ABC, abstractmethod
+import logging
+from pathlib import Path
+from typing import Any
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pyarrow.compute as pc
+
+from tgedr_dataops_abs.store import Store, StoreException
+
+
+logger = logging.getLogger(__name__)
+
+
+def pandas_mapper(arrow_type: pa.DataType) -> pd.api.extensions.ExtensionDtype | None:
+    """Map PyArrow types to pandas nullable types.
+
+    Parameters
+    ----------
+    arrow_type : pa.DataType
+        PyArrow data type to map.
+
+    Returns
+    -------
+    pd.api.extensions.ExtensionDtype or None
+        Corresponding pandas nullable dtype, or None for default behavior.
+    """
+    if pa.types.is_int64(arrow_type):
+        return pd.Int64Dtype()
+    if pa.types.is_float64(arrow_type):
+        return pd.Float64Dtype()
+    if pa.types.is_string(arrow_type):
+        return pd.StringDtype()  # pragma: no cover
+    # suggest default behavior
+    return None
+
+
+class FsSinglePartitionParquetStore(Store, ABC):
+    """Abstract store implementation for Parquet files with optional single partition.
+
+    This class provides persistence on Parquet files with an optional single partition,
+    regardless of the underlying filesystem location (local, S3, etc.).
+    """
+
+    @property
+    @abstractmethod
+    def fs(self) -> Any:
+        """Abstract property providing a filesystem implementation.
+
+        Returns
+        -------
+        Any
+            Filesystem implementation (e.g., LocalFileSystem, S3FileSystem).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _rmdir(self, key: str) -> None:
+        """Remove a directory.
+
+        Parameters
+        ----------
+        key : str
+            Directory path to remove.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _exists(self, key: str) -> bool:
+        """Check if a path exists.
+
+        Parameters
+        ----------
+        key : str
+            Path to check for existence.
+
+        Returns
+        -------
+        bool
+            True if path exists, False otherwise.
+        """
+        raise NotImplementedError
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the FsSinglePartitionParquetStore.
+
+        Parameters
+        ----------
+        config : dict[str, Any], optional
+            Configuration dictionary.
+        """
+        Store.__init__(self, config)
+        self._fs = None
+
+    def get(
+        self,
+        key: str,
+        filter_func: Any | None = None,
+        filters: list[tuple[str, str, list[str]]] | None = None,
+        schema: pa.Schema | None = None,
+    ) -> pd.DataFrame:
+        """Read a pandas DataFrame from Parquet storage.
+
+        Reads data from the specified location, optionally enforcing a schema
+        and allowing filtering of data.
+
+        Parameters
+        ----------
+        key : str
+            Location/URL/path where data is persisted.
+        filter_func : callable, optional
+            Filter expression (see PyArrow Table.filter documentation).
+        filters : list[tuple[str, str, list[str]]], optional
+            Filter expression for read_table (see PyArrow parquet.read_table documentation).
+        schema : pa.Schema, optional
+            Data schema to enforce while reading.
+
+        Returns
+        -------
+        pd.DataFrame
+            The loaded DataFrame.
+        """
+        schema_msg_segment = "0" if schema is None else str(len(schema))
+        logger.info(f"[get|in] ({key}, {filter_func}, {filters}, schema len:{schema_msg_segment})")
+        logger.debug(f"[get|in] ({key}, {filter_func}, {filters}, {schema})")
+        table = pq.read_table(key, filesystem=self.fs, filters=filters, schema=schema)
+        if filter_func is not None:
+            table = table.filter(filter_func)
+        result = table.to_pandas(types_mapper=pandas_mapper)
+        logger.info(f"[get|out] => {result.shape}")
+        return result
+
+    def delete(
+        self,
+        key: str,
+        partition_field: str | None = None,
+        partition_values: list[str] | None = None,
+        kv_dict: dict[str, list[Any]] | None = None,
+        schema: pa.Schema = None,
+    ) -> None:
+        """Delete partitions or data from Parquet storage.
+
+        Removes partitions (full or partial), deletes specific values, or removes
+        an entire dataset from Parquet storage.
+
+        Parameters
+        ----------
+        key : str
+            Location/URL/path where data is persisted.
+        partition_field : str, optional
+            Name of the partition field in the dataset.
+        partition_values : list[str], optional
+            Partition values to delete.
+        kv_dict : dict[str, list[Any]], optional
+            Key-value map defining fields and array of values for deletion filter.
+        schema : pa.Schema, optional
+            Data schema to enforce if reading is required.
+        """
+        schema_msg_segment = "0" if schema is None else str(len(schema))
+        logger.info(
+            f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, schema len:{schema_msg_segment})"
+        )
+        logger.debug(f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, {schema})")
+
+        if partition_values is not None and partition_field is not None:
+            self._remove_partitions(key, partition_field=partition_field, partition_values=partition_values)
+        elif kv_dict is not None and partition_field is not None:
+            table = pq.read_table(key, filesystem=self.fs, schema=schema)
+            for k, v in kv_dict.items():
+                filter_condition = ~pc.is_in(pc.field(k), pa.array(v))
+                table = table.filter(filter_condition)
+            self.delete(key, schema=schema)
+            pq.write_to_dataset(
+                table,
+                root_path=key,
+                partition_cols=[partition_field],
+                existing_data_behavior="delete_matching",
+                filesystem=self.fs,
+                schema=schema,
+            )
+        else:
+            self._rmdir(key)
+
+        logger.info("[delete|out]")
+
+    def save(
+        self,
+        df: pd.DataFrame,
+        key: str,
+        partition_field: str | None = None,
+        append: bool = False,
+        replace_partitions: bool = False,
+        schema: Any = None,
+    ) -> None:
+        """Save a pandas DataFrame in Parquet format.
+
+        Saves data to the specified location with optional partitioning,
+        append, or replace behavior.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The DataFrame to be saved.
+        key : str
+            Location/URL/path where data should be persisted.
+        partition_field : str, optional
+            Name of the partition field in the dataset.
+        append : bool, default False
+            If True, data will be appended; otherwise will overwrite.
+        replace_partitions : bool, default False
+            If True, partitions will be replaced, deleting existing data in those partitions.
+        schema : pa.Schema, optional
+            Data schema to enforce while writing.
+
+        Raises
+        ------
+        StoreException
+            If both append and replace_partitions are True.
+        """
+        schema_msg_segment = "0" if schema is None else str(len(schema))
+        logger.info(
+            f"[save|in] ({df.shape}, {key}, {partition_field}, {append}, {replace_partitions}, schema len:{schema_msg_segment})"
+        )
+        logger.debug(f"[save|in] ({df}, {key}, {partition_field}, {append}, {replace_partitions}, {schema})")
+
+        if schema is not None and isinstance(schema, pa.lib.Schema):
+            # we will order the columns based on the schema
+            columns = list(schema.names)
+            df = df[columns]
+
+        if replace_partitions and append:
+            raise StoreException("cannot request for replace_partitions and append at the same time")
+
+        if append:
+            pq.write_to_dataset(
+                pa.Table.from_pandas(df, preserve_index=False),
+                root_path=key,
+                partition_cols=[partition_field],
+                filesystem=self.fs,
+                schema=schema,
+            )
+        elif replace_partitions:
+            partitions = df[partition_field].unique().tolist()
+            self._remove_partitions(key, partition_field, partitions)
+            pq.write_to_dataset(
+                pa.Table.from_pandas(df, preserve_index=False),
+                root_path=key,
+                partition_cols=[partition_field],
+                existing_data_behavior="delete_matching",
+                filesystem=self.fs,
+                schema=schema,
+            )
+        else:
+            self.delete(key)
+            pq.write_to_dataset(
+                pa.Table.from_pandas(df, preserve_index=False),
+                root_path=key,
+                partition_cols=[partition_field],
+                existing_data_behavior="delete_matching",
+                filesystem=self.fs,
+                schema=schema,
+            )
+        logger.info("[save|out]")
+
+    def _remove_partitions(self, key: str, partition_field: str, partition_values: list[str]) -> None:
+        """Remove specific partitions from the dataset.
+
+        Parameters
+        ----------
+        key : str
+            Root path of the dataset.
+        partition_field : str
+            Name of the partition field.
+        partition_values : list[str]
+            List of partition values to remove.
+        """
+        logger.debug(f"[_remove_partitions|in] ({key}, {partition_field}, {partition_values})")
+
+        for partition_value in partition_values:
+            partition_key = f"{partition_field}={partition_value}"
+            partition_path = str(Path(key) / partition_key)
+            self._rmdir(partition_path)
+
+        logger.debug("[_remove_partitions|out]")
+
+    def update(
+        self,
+        df: pd.DataFrame,
+        key: str,
+        key_fields: list[str],
+        partition_field: str | None = None,
+        schema: Any = None,
+    ) -> None:
+        """Update rows in a pandas DataFrame stored in Parquet format.
+
+        Updates matching rows based on key fields by merging with existing data.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The DataFrame containing updates.
+        key : str
+            Location/URL/path where data is persisted.
+        key_fields : list[str]
+            Primary fields used to match rows for updating.
+        partition_field : str, optional
+            Name of the partition field to enforce while saving.
+        schema : pa.Schema, optional
+            Data schema to enforce while reading and saving.
+        """
+        schema_msg_segment = "0" if schema is None else str(len(schema))
+        logger.info(
+            f"[update|in] ({df.shape}, {key}, {key_fields}, {partition_field}, schema len:{schema_msg_segment})"
+        )
+        logger.debug(f"[update|in] ({df}, {key}, {key_fields}, {partition_field}, {schema})")
+
+        df0 = self.get(key, schema=schema)
+        match = pd.merge(df0.reset_index(), df.reset_index(), on=key_fields)
+        index_left = match["index_x"]
+        index_right = match["index_y"]
+        df0.iloc[index_left] = df.iloc[index_right]
+        self.save(df0, key, partition_field=partition_field, schema=schema)
+
+        logger.info("[update|out]")
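As the `get` implementation above shows, tables are converted with `to_pandas(types_mapper=pandas_mapper)`, so Arrow int64/float64/string columns come back as pandas nullable extension dtypes and missing values survive as `<NA>` instead of being coerced to float `NaN`. A minimal sketch of that effect in isolation (the column name is hypothetical):

```python
import pyarrow as pa

from tgedr_dataops.store.fs_single_partition_parquet import pandas_mapper

# an int64 column with a missing value, as it might come back from pq.read_table
tbl = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})

df = tbl.to_pandas(types_mapper=pandas_mapper)  # same conversion get() performs
print(df["n"].dtype)   # Int64 -- pandas nullable integer, not float64
print(df["n"][1])      # <NA> -- the missing value is preserved
```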
tgedr_dataops/store/local_fs_single_partition_parquet.py
@@ -0,0 +1,56 @@
+"""Local filesystem implementation for single partition Parquet storage.
+
+This module provides LocalFsSinglePartitionParquetStore, which implements
+single partition Parquet storage using the local file system.
+"""
+import logging
+from typing import Any
+from pyarrow import fs
+
+from tgedr_dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
+
+
+logger = logging.getLogger(__name__)
+
+
+class LocalFsSinglePartitionParquetStore(FsSinglePartitionParquetStore):
+    """FsSinglePartitionParquetStore implementation using local file system."""
+
+    @property
+    def fs(self) -> Any:
+        """Get the PyArrow local filesystem instance.
+
+        Returns
+        -------
+        Any
+            The filesystem instance for local file operations.
+        """
+        if self._fs is None:
+            self._fs = fs.LocalFileSystem()
+        return self._fs
+
+    def _rmdir(self, key: str) -> None:
+        """Remove a directory recursively.
+
+        Parameters
+        ----------
+        key : str
+            Path to the directory to delete.
+        """
+        if self.fs.get_file_info(key).type.name == "Directory":
+            self.fs.delete_dir(key)
+
+    def _exists(self, key: str) -> bool:
+        """Check if a path exists on the local filesystem.
+
+        Parameters
+        ----------
+        key : str
+            Path to check for existence.
+
+        Returns
+        -------
+        bool
+            True if path exists, False otherwise.
+        """
+        return self.fs.get_file_info(key).type.name != "NotFound"
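A minimal usage sketch of the local store (paths and column names here are hypothetical): the default `save` overwrites the dataset, `replace_partitions=True` rewrites only the partitions present in the incoming frame, and `get` can prune partitions via `filters` (forwarded to `pyarrow.parquet.read_table`) or filter rows in memory via a compute expression passed as `filter_func`:

```python
import pandas as pd
import pyarrow.compute as pc

from tgedr_dataops.store.local_fs_single_partition_parquet import LocalFsSinglePartitionParquetStore

store = LocalFsSinglePartitionParquetStore()
key = "/tmp/sales_dataset"  # hypothetical dataset root

df = pd.DataFrame({"year": ["2023", "2023", "2024"], "amount": [10, 20, 30]})
store.save(df, key, partition_field="year")  # overwrite, hive-partitioned by year

fix = pd.DataFrame({"year": ["2024"], "amount": [35]})
store.save(fix, key, partition_field="year", replace_partitions=True)  # only year=2024 is rewritten

df_2024 = store.get(key, filters=[("year", "in", ["2024"])])   # pruned at read time
df_big = store.get(key, filter_func=pc.field("amount") > 15)   # filtered after load

store.delete(key, partition_field="year", partition_values=["2023"])  # drop one partition
store.delete(key)                                                      # drop the whole dataset
```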
tgedr_dataops/store/s3_single_partition_parquet.py
@@ -0,0 +1,193 @@
+"""S3 implementation of single partition parquet store."""
+import logging
+import s3fs
+from typing import Any
+import pandas as pd
+import pyarrow as pa
+
+from tgedr_dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
+from tgedr_dataops.commons.utils_fs import remove_s3_protocol
+
+
+logger = logging.getLogger(__name__)
+
+
+class S3FsSinglePartitionParquetStore(FsSinglePartitionParquetStore):  # pragma: no cover
+    """FsSinglePartitionParquetStore implementation using aws s3 file system."""
+
+    CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "aws_access_key_id"
+    CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "aws_secret_access_key"  # noqa: S105
+    CONFIG_KEY_AWS_SESSION_TOKEN: str = "aws_session_token"  # noqa: S105
+
+    @property
+    def fs(self) -> Any:
+        """Get the S3 filesystem instance.
+
+        Returns
+        -------
+        Any
+            The s3fs.S3FileSystem instance for S3 operations.
+        """
+        if self._fs is None:
+            if (self._config is not None) and all(
+                element in list(self._config.keys())
+                for element in [
+                    self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
+                    self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
+                    self.CONFIG_KEY_AWS_SESSION_TOKEN,
+                ]
+            ):
+                self._fs = s3fs.S3FileSystem(
+                    key=self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
+                    secret=self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
+                    token=self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
+                )
+            else:
+                self._fs = s3fs.S3FileSystem()
+        return self._fs
+
+    def _rmdir(self, key: str) -> None:
+        """Remove a directory (prefix) from S3.
+
+        Parameters
+        ----------
+        key : str
+            S3 path/prefix to delete.
+        """
+        if self.fs.isdir(key):
+            self.fs.delete(key, recursive=True)
+
+    def _exists(self, key: str) -> bool:
+        """Check if a path exists in S3.
+
+        Parameters
+        ----------
+        key : str
+            S3 path to check for existence.
+
+        Returns
+        -------
+        bool
+            True if path exists in S3, False otherwise.
+        """
+        return self.fs.get_file_info(key).type.name != "NotFound"
+
+    def get(
+        self,
+        key: str,
+        filter_func: callable | None = None,
+        filters: list[tuple[str, str, list[str]]] | None = None,
+        schema: pa.Schema = None,
+    ) -> pd.DataFrame:
+        """Retrieve data from S3 parquet store.
+
+        Parameters
+        ----------
+        key : str
+            S3 path to the parquet data (s3:// protocol will be removed).
+        filter_func : callable, optional
+            Row filter function.
+        filters : list[tuple[str, str, list[str]]], optional
+            Partition filters.
+        schema : pa.Schema, optional
+            PyArrow schema for reading.
+
+        Returns
+        -------
+        pd.DataFrame
+            The loaded DataFrame.
+        """
+        return super().get(key=remove_s3_protocol(key), filter_func=filter_func, filters=filters, schema=schema)
+
+    def delete(
+        self,
+        key: str,
+        partition_field: str | None = None,
+        partition_values: list[str] | None = None,
+        kv_dict: dict[str, list[Any]] | None = None,
+        schema: pa.Schema = None,
+    ) -> None:
+        """Delete data from S3 parquet store.
+
+        Parameters
+        ----------
+        key : str
+            S3 path to the parquet data (s3:// protocol will be removed).
+        partition_field : str, optional
+            Field used for partitioning.
+        partition_values : list[str], optional
+            List of partition values to delete.
+        kv_dict : dict[str, list[Any]], optional
+            Dictionary of key-value filters for deletion.
+        schema : pa.Schema, optional
+            PyArrow schema.
+        """
+        super().delete(
+            key=remove_s3_protocol(key),
+            partition_field=partition_field,
+            partition_values=partition_values,
+            kv_dict=kv_dict,
+            schema=schema,
+        )
+
+    def save(
+        self,
+        df: pd.DataFrame,
+        key: str,
+        partition_field: str | None = None,
+        append: bool = False,
+        replace_partitions: bool = False,
+        schema: Any = None,
+    ) -> None:
+        """Save DataFrame to S3 parquet store.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The DataFrame to save.
+        key : str
+            S3 path to save the parquet data (s3:// protocol will be removed).
+        partition_field : str, optional
+            Field to partition by.
+        append : bool, default False
+            If True, append to existing data.
+        replace_partitions : bool, default False
+            If True, replace existing partitions.
+        schema : Any, optional
+            Schema for the data.
+        """
+        super().save(
+            df=df,
+            key=remove_s3_protocol(key),
+            partition_field=partition_field,
+            append=append,
+            replace_partitions=replace_partitions,
+            schema=schema,
+        )
+
+    def update(
+        self,
+        df: pd.DataFrame,
+        key: str,
+        key_fields: list[str],
+        partition_field: str | None = None,
+        schema: Any = None,
+    ) -> None:
+        """Update data in S3 parquet store.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The DataFrame with updated data.
+        key : str
+            S3 path to the parquet data (s3:// protocol will be removed).
+        key_fields : list[str]
+            List of fields to use as keys for matching records.
+        partition_field : str, optional
+            Field used for partitioning.
+        schema : Any, optional
+            Schema for the data.
+        """
+        super().update(
+            df=df, key=remove_s3_protocol(key), key_fields=key_fields, partition_field=partition_field, schema=schema
+        )
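The S3 variant differs only in how the filesystem is obtained: when the config carries all three credential keys defined above it builds `s3fs.S3FileSystem` with them explicitly, otherwise it falls back to the default AWS credential chain; every public method also strips the `s3://` prefix via `remove_s3_protocol` before delegating. A minimal sketch (bucket, prefix, and credential values are placeholders):

```python
from tgedr_dataops.store.s3_single_partition_parquet import S3FsSinglePartitionParquetStore

# explicit credentials -- only used if all three keys are present in the config
store = S3FsSinglePartitionParquetStore(
    config={
        "aws_access_key_id": "<access-key-id>",
        "aws_secret_access_key": "<secret-access-key>",
        "aws_session_token": "<session-token>",
    }
)

# or rely on the default credential chain (environment, profile, instance role)
store = S3FsSinglePartitionParquetStore()

# "s3://" is stripped before the key reaches the underlying parquet layer
df = store.get("s3://my-bucket/some/prefix/dataset")
```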
tgedr_dataops-1.0.1.dist-info/METADATA
@@ -0,0 +1,72 @@
+Metadata-Version: 2.4
+Name: tgedr-dataops
+Version: 1.0.1
+Summary: data operations related code
+Author-email: joao tiago viegas <3536754+jtviegas@users.noreply.github.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: tgedr-dataops-abs==0.0.3
+Requires-Dist: s3fs==2024.5.0
+Requires-Dist: boto3==1.34.106
+Requires-Dist: openpyxl==3.1.2
+Requires-Dist: pyarrow>=23.0.0
+Requires-Dist: moto>=5.1.20
+Dynamic: license-file
+
+# data-ops
+
+
+[](https://pypi.org/project/tgedr-dataops/)
+
+
+
+data operations related code
+
+## motivation
+*data-ops* is a library of tested and used code, aligning on standards for code structure and quality and avoiding reinventing the wheel
+
+## installation
+`pip install tgedr-dataops`
+
+## package namespaces and their contents
+
+#### commons
+- __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+- __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+#### quality
+- __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with the Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+#### sink
+- __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+- __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+#### source
+- __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from an s3 bucket to a local fs location, circumventing some formats' download limitations
+- __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files to another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+- __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xlsx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+- __S3FileCopy__: __Source__ implementation class used to copy objects/files from one s3 bucket to another s3 bucket ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+- __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location, with the extra method `get_metadata` providing file metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType") ([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+- __S3FileSource__: __Source__ implementation class used to retrieve objects/files from an s3 bucket to a local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+#### store
+- __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it persists to
+- __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+- __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using the aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+## development
+- main requirements:
+  - _uv_
+  - _bash_
+- Clone the repository like this:
+
+``` bash
+git clone git@github.com:tgedr/pycommons
+```
+- cd into the folder: `cd pycommons`
+- install requirements: `./helper.sh reqs`
+
+
tgedr_dataops-1.0.1.dist-info/RECORD
@@ -0,0 +1,22 @@
+tgedr_dataops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tgedr_dataops/commons/s3_connector.py,sha256=Tf1Cc2CPE5uRl3oW_yTu59Ge2ukQmsncqM6HdG90sck,1993
+tgedr_dataops/commons/utils_fs.py,sha256=ZFZG-aG-liq-s3CnEDggvLJgJ1zsoKsqYMQU7bMbxiY,4888
+tgedr_dataops/quality/pandas_validation.py,sha256=WLHiL0b0Nm4ldO_1t55nFQ-F6UwxouYvbnP7bONrBp8,817
+tgedr_dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tgedr_dataops/sink/local_fs_file_sink.py,sha256=5gM4dOXiGRtPLvYFHAvwijQbXRNltpUdS06kIGhOwj4,2353
+tgedr_dataops/sink/s3_file_sink.py,sha256=5p591g89UjCpoRuKlaf7UAOzF7Ac0UyWEhSPZX7_O-E,3344
+tgedr_dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tgedr_dataops/source/abstract_s3_file_source.py,sha256=XqUGDLz6qXJnydoyE3rQJcDOKwD_nuSzO2tElOfEM1w,2230
+tgedr_dataops/source/local_fs_file_source.py,sha256=GheAp2QaugxDc4REMqJ_435seUmEv-Marnv4LwGXUjw,3553
+tgedr_dataops/source/pd_df_s3_source.py,sha256=rL7eEEkwXate5aVNDC_LvU80apKphff2ViI65vB6TpI,4566
+tgedr_dataops/source/s3_file_copy.py,sha256=rV-N2pwH12LrsXYl2aINBcHwBcHTP3xTF0aS8-oqZEQ,4899
+tgedr_dataops/source/s3_file_extended_source.py,sha256=8LLNJmBUtrAEUSFE2Pl5rKa7TKEFinXQ0q6T9jJHS5E,2264
+tgedr_dataops/source/s3_file_source.py,sha256=Ntfgn7r_Hy74CuO-UM5XqAnN_PZMiLipDxMsF2Es88s,4563
+tgedr_dataops/store/fs_single_partition_parquet.py,sha256=tGcknJ0igaRrpURP-3nJoAyR1Hd_KbONjcyVADmRzwY,11705
+tgedr_dataops/store/local_fs_single_partition_parquet.py,sha256=_0n2qUDABm0WRat4-V7gKoxz3JZVyomQbC_OCLmHxbg,1542
+tgedr_dataops/store/s3_single_partition_parquet.py,sha256=dxmYY4h7Y9FfOKVjPgNEU812FH_5MeAQihRrdeHZd7U,6070
+tgedr_dataops-1.0.1.dist-info/licenses/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
+tgedr_dataops-1.0.1.dist-info/METADATA,sha256=Rfg1g1oAZQ9ajQq03tzbWhjCybZpJY4rDz9txCkrlrY,3865
+tgedr_dataops-1.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+tgedr_dataops-1.0.1.dist-info/top_level.txt,sha256=O-CCFyLA9TzzhSdYShs9EcoZzpu_TzEr_rHGZiN-YBg,14
+tgedr_dataops-1.0.1.dist-info/RECORD,,
tgedr_dataops-1.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+tgedr_dataops