tgedr-dataops 0.0.36__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +60 -39
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -51
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/source.py +0 -51
  35. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  36. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  37. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  38. tgedr/dataops/store/spark_delta.py +0 -369
  39. tgedr/dataops/store/store.py +0 -49
  40. tgedr/dataops/utils_reflection.py +0 -134
  41. tgedr/dataops/validation/abs.py +0 -46
  42. tgedr/dataops/validation/pandas.py +0 -10
  43. tgedr/dataops/validation/pyspark.py +0 -10
  44. tgedr_dataops-0.0.36.dist-info/METADATA +0 -20
  45. tgedr_dataops-0.0.36.dist-info/RECORD +0 -37
  46. tgedr_dataops-0.0.36.dist-info/top_level.txt +0 -1
  47. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  48. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  50. {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
tgedr_dataops/store/fs_single_partition_parquet.py
@@ -0,0 +1,331 @@
+ """Filesystem-based single partition Parquet store implementation.
+
+ This module provides an abstract base class for storing and retrieving data
+ in Parquet format with optional single partition support across different
+ filesystem implementations (local, S3, etc.).
+ """
+ from abc import ABC, abstractmethod
+ import logging
+ from pathlib import Path
+ from typing import Any
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import pyarrow.compute as pc
+
+ from tgedr_dataops_abs.store import Store, StoreException
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def pandas_mapper(arrow_type: pa.DataType) -> pd.api.extensions.ExtensionDtype | None:
+     """Map PyArrow types to pandas nullable types.
+
+     Parameters
+     ----------
+     arrow_type : pa.DataType
+         PyArrow data type to map.
+
+     Returns
+     -------
+     pd.api.extensions.ExtensionDtype or None
+         Corresponding pandas nullable dtype, or None for default behavior.
+     """
+     if pa.types.is_int64(arrow_type):
+         return pd.Int64Dtype()
+     if pa.types.is_float64(arrow_type):
+         return pd.Float64Dtype()
+     if pa.types.is_string(arrow_type):
+         return pd.StringDtype()  # pragma: no cover
+     # suggest default behavior
+     return None
+
+
+ class FsSinglePartitionParquetStore(Store, ABC):
+     """Abstract store implementation for Parquet files with optional single partition.
+
+     This class provides persistence on Parquet files with an optional single partition,
+     regardless of the underlying filesystem location (local, S3, etc.).
+     """
+
+     @property
+     @abstractmethod
+     def fs(self) -> Any:
+         """Abstract property providing a filesystem implementation.
+
+         Returns
+         -------
+         Any
+             Filesystem implementation (e.g., LocalFileSystem, S3FileSystem).
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def _rmdir(self, key: str) -> None:
+         """Remove a directory.
+
+         Parameters
+         ----------
+         key : str
+             Directory path to remove.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def _exists(self, key: str) -> bool:
+         """Check if a path exists.
+
+         Parameters
+         ----------
+         key : str
+             Path to check for existence.
+
+         Returns
+         -------
+         bool
+             True if path exists, False otherwise.
+         """
+         raise NotImplementedError
+
+     def __init__(self, config: dict[str, Any] | None = None) -> None:
+         """Initialize the FsSinglePartitionParquetStore.
+
+         Parameters
+         ----------
+         config : dict[str, Any], optional
+             Configuration dictionary.
+         """
+         Store.__init__(self, config)
+         self._fs = None
+
+     def get(
+         self,
+         key: str,
+         filter_func: Any | None = None,
+         filters: list[tuple[str, str, list[str]]] | None = None,
+         schema: pa.Schema | None = None,
+     ) -> pd.DataFrame:
+         """Read a pandas DataFrame from Parquet storage.
+
+         Reads data from the specified location, optionally enforcing a schema
+         and allowing filtering of data.
+
+         Parameters
+         ----------
+         key : str
+             Location/URL/path where data is persisted.
+         filter_func : callable, optional
+             Filter expression (see PyArrow Table.filter documentation).
+         filters : list[tuple[str, str, list[str]]], optional
+             Filter expression for read_table (see PyArrow parquet.read_table documentation).
+         schema : pa.Schema, optional
+             Data schema to enforce while reading.
+
+         Returns
+         -------
+         pd.DataFrame
+             The loaded DataFrame.
+         """
+         schema_msg_segment = "0" if schema is None else str(len(schema))
+         logger.info(f"[get|in] ({key}, {filter_func}, {filters}, schema len:{schema_msg_segment})")
+         logger.debug(f"[get|in] ({key}, {filter_func}, {filters}, {schema})")
+         table = pq.read_table(key, filesystem=self.fs, filters=filters, schema=schema)
+         if filter_func is not None:
+             table = table.filter(filter_func)
+         result = table.to_pandas(types_mapper=pandas_mapper)
+         logger.info(f"[get|out] => {result.shape}")
+         return result
+
+     def delete(
+         self,
+         key: str,
+         partition_field: str | None = None,
+         partition_values: list[str] | None = None,
+         kv_dict: dict[str, list[Any]] | None = None,
+         schema: pa.Schema = None,
+     ) -> None:
+         """Delete partitions or data from Parquet storage.
+
+         Removes partitions (full or partial), deletes specific values, or removes
+         an entire dataset from Parquet storage.
+
+         Parameters
+         ----------
+         key : str
+             Location/URL/path where data is persisted.
+         partition_field : str, optional
+             Name of the partition field in the dataset.
+         partition_values : list[str], optional
+             Partition values to delete.
+         kv_dict : dict[str, list[Any]], optional
+             Key-value map defining fields and array of values for deletion filter.
+         schema : pa.Schema, optional
+             Data schema to enforce if reading is required.
+         """
+         schema_msg_segment = "0" if schema is None else str(len(schema))
+         logger.info(
+             f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, schema len:{schema_msg_segment})"
+         )
+         logger.debug(f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, {schema})")
+
+         if partition_values is not None and partition_field is not None:
+             self._remove_partitions(key, partition_field=partition_field, partition_values=partition_values)
+         elif kv_dict is not None and partition_field is not None:
+             table = pq.read_table(key, filesystem=self.fs, schema=schema)
+             for k, v in kv_dict.items():
+                 filter_condition = ~pc.is_in(pc.field(k), pa.array(v))
+                 table = table.filter(filter_condition)
+             self.delete(key, schema=schema)
+             pq.write_to_dataset(
+                 table,
+                 root_path=key,
+                 partition_cols=[partition_field],
+                 existing_data_behavior="delete_matching",
+                 filesystem=self.fs,
+                 schema=schema,
+             )
+         else:
+             self._rmdir(key)
+
+         logger.info("[delete|out]")
+
+     def save(
+         self,
+         df: pd.DataFrame,
+         key: str,
+         partition_field: str | None = None,
+         append: bool = False,
+         replace_partitions: bool = False,
+         schema: Any = None,
+     ) -> None:
+         """Save a pandas DataFrame in Parquet format.
+
+         Saves data to the specified location with optional partitioning,
+         append, or replace behavior.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The DataFrame to be saved.
+         key : str
+             Location/URL/path where data should be persisted.
+         partition_field : str, optional
+             Name of the partition field in the dataset.
+         append : bool, default False
+             If True, data will be appended; otherwise will overwrite.
+         replace_partitions : bool, default False
+             If True, partitions will be replaced, deleting existing data in those partitions.
+         schema : pa.Schema, optional
+             Data schema to enforce while writing.
+
+         Raises
+         ------
+         StoreException
+             If both append and replace_partitions are True.
+         """
+         schema_msg_segment = "0" if schema is None else str(len(schema))
+         logger.info(
+             f"[save|in] ({df.shape}, {key}, {partition_field}, {append}, {replace_partitions}, schema len:{schema_msg_segment})"
+         )
+         logger.debug(f"[save|in] ({df}, {key}, {partition_field}, {append}, {replace_partitions}, {schema})")
+
+         if schema is not None and isinstance(schema, pa.lib.Schema):
+             # we will order the columns based on the schema
+             columns = list(schema.names)
+             df = df[columns]
+
+         if replace_partitions and append:
+             raise StoreException("cannot request for replace_partitions and append at the same time")
+
+         if append:
+             pq.write_to_dataset(
+                 pa.Table.from_pandas(df, preserve_index=False),
+                 root_path=key,
+                 partition_cols=[partition_field],
+                 filesystem=self.fs,
+                 schema=schema,
+             )
+         elif replace_partitions:
+             partitions = df[partition_field].unique().tolist()
+             self._remove_partitions(key, partition_field, partitions)
+             pq.write_to_dataset(
+                 pa.Table.from_pandas(df, preserve_index=False),
+                 root_path=key,
+                 partition_cols=[partition_field],
+                 existing_data_behavior="delete_matching",
+                 filesystem=self.fs,
+                 schema=schema,
+             )
+         else:
+             self.delete(key)
+             pq.write_to_dataset(
+                 pa.Table.from_pandas(df, preserve_index=False),
+                 root_path=key,
+                 partition_cols=[partition_field],
+                 existing_data_behavior="delete_matching",
+                 filesystem=self.fs,
+                 schema=schema,
+             )
+         logger.info("[save|out]")
+
+     def _remove_partitions(self, key: str, partition_field: str, partition_values: list[str]) -> None:
+         """Remove specific partitions from the dataset.
+
+         Parameters
+         ----------
+         key : str
+             Root path of the dataset.
+         partition_field : str
+             Name of the partition field.
+         partition_values : list[str]
+             List of partition values to remove.
+         """
+         logger.debug(f"[_remove_partitions|in] ({key}, {partition_field}, {partition_values})")
+
+         for partition_value in partition_values:
+             partition_key = f"{partition_field}={partition_value}"
+             partition_path = str(Path(key) / partition_key)
+             self._rmdir(partition_path)
+
+         logger.debug("[_remove_partitions|out]")
+
+     def update(
+         self,
+         df: pd.DataFrame,
+         key: str,
+         key_fields: list[str],
+         partition_field: str | None = None,
+         schema: Any = None,
+     ) -> None:
+         """Update rows in a pandas DataFrame stored in Parquet format.
+
+         Updates matching rows based on key fields by merging with existing data.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The DataFrame containing updates.
+         key : str
+             Location/URL/path where data is persisted.
+         key_fields : list[str]
+             Primary fields used to match rows for updating.
+         partition_field : str, optional
+             Name of the partition field to enforce while saving.
+         schema : pa.Schema, optional
+             Data schema to enforce while reading and saving.
+         """
+         schema_msg_segment = "0" if schema is None else str(len(schema))
+         logger.info(
+             f"[update|in] ({df.shape}, {key}, {key_fields}, {partition_field}, schema len:{schema_msg_segment})"
+         )
+         logger.debug(f"[update|in] ({df}, {key}, {key_fields}, {partition_field}, {schema})")
+
+         df0 = self.get(key, schema=schema)
+         match = pd.merge(df0.reset_index(), df.reset_index(), on=key_fields)
+         index_left = match["index_x"]
+         index_right = match["index_y"]
+         df0.iloc[index_left] = df.iloc[index_right]
+         self.save(df0, key, partition_field=partition_field, schema=schema)
+
+         logger.info("[update|out]")
tgedr_dataops/store/local_fs_single_partition_parquet.py
@@ -0,0 +1,56 @@
+ """Local filesystem implementation for single partition Parquet storage.
+
+ This module provides LocalFsSinglePartitionParquetStore, which implements
+ single partition Parquet storage using the local file system.
+ """
+ import logging
+ from typing import Any
+ from pyarrow import fs
+
+ from tgedr_dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LocalFsSinglePartitionParquetStore(FsSinglePartitionParquetStore):
+     """FsSinglePartitionParquetStore implementation using local file system."""
+
+     @property
+     def fs(self) -> Any:
+         """Get the PyArrow local filesystem instance.
+
+         Returns
+         -------
+         Any
+             The filesystem instance for local file operations.
+         """
+         if self._fs is None:
+             self._fs = fs.LocalFileSystem()
+         return self._fs
+
+     def _rmdir(self, key: str) -> None:
+         """Remove a directory recursively.
+
+         Parameters
+         ----------
+         key : str
+             Path to the directory to delete.
+         """
+         if self.fs.get_file_info(key).type.name == "Directory":
+             self.fs.delete_dir(key)
+
+     def _exists(self, key: str) -> bool:
+         """Check if a path exists on the local filesystem.
+
+         Parameters
+         ----------
+         key : str
+             Path to check for existence.
+
+         Returns
+         -------
+         bool
+             True if path exists, False otherwise.
+         """
+         return self.fs.get_file_info(key).type.name != "NotFound"
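Again for orientation only, a hedged usage sketch of the store API added above, exercised against a temporary local directory; the column and partition names ("region", "value") are invented for the example.

```python
import tempfile

import pandas as pd

from tgedr_dataops.store.local_fs_single_partition_parquet import LocalFsSinglePartitionParquetStore

store = LocalFsSinglePartitionParquetStore()
key = tempfile.mkdtemp()  # dataset root on the local filesystem

# write a small frame partitioned by "region" (default save overwrites the dataset)
df = pd.DataFrame({"region": ["eu", "eu", "us"], "value": [1, 2, 3]})
store.save(df, key, partition_field="region")

# read back a single partition using pyarrow-style read_table filters
out = store.get(key, filters=[("region", "in", ["us"])])
print(out.shape)  # (1, 2)
```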
tgedr_dataops/store/s3_single_partition_parquet.py
@@ -0,0 +1,193 @@
+ """S3 implementation of single partition parquet store."""
+ import logging
+ import s3fs
+ from typing import Any
+ import pandas as pd
+ import pyarrow as pa
+
+ from tgedr_dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
+ from tgedr_dataops.commons.utils_fs import remove_s3_protocol
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class S3FsSinglePartitionParquetStore(FsSinglePartitionParquetStore):  # pragma: no cover
+     """FsSinglePartitionParquetStore implementation using aws s3 file system."""
+
+     CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "aws_access_key_id"
+     CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "aws_secret_access_key"  # noqa: S105
+     CONFIG_KEY_AWS_SESSION_TOKEN: str = "aws_session_token"  # noqa: S105
+
+     @property
+     def fs(self) -> Any:
+         """Get the S3 filesystem instance.
+
+         Returns
+         -------
+         Any
+             The s3fs.S3FileSystem instance for S3 operations.
+         """
+         if self._fs is None:
+             if (self._config is not None) and all(
+                 element in list(self._config.keys())
+                 for element in [
+                     self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
+                     self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
+                     self.CONFIG_KEY_AWS_SESSION_TOKEN,
+                 ]
+             ):
+                 self._fs = s3fs.S3FileSystem(
+                     key=self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
+                     secret=self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
+                     token=self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
+                 )
+             else:
+                 self._fs = s3fs.S3FileSystem()
+         return self._fs
+
+     def _rmdir(self, key: str) -> None:
+         """Remove a directory (prefix) from S3.
+
+         Parameters
+         ----------
+         key : str
+             S3 path/prefix to delete.
+         """
+         if self.fs.isdir(key):
+             self.fs.delete(key, recursive=True)
+
+     def _exists(self, key: str) -> bool:
+         """Check if a path exists in S3.
+
+         Parameters
+         ----------
+         key : str
+             S3 path to check for existence.
+
+         Returns
+         -------
+         bool
+             True if path exists in S3, False otherwise.
+         """
+         return self.fs.get_file_info(key).type.name != "NotFound"
+
+     def get(
+         self,
+         key: str,
+         filter_func: callable | None = None,
+         filters: list[tuple[str, str, list[str]]] | None = None,
+         schema: pa.Schema = None,
+     ) -> pd.DataFrame:
+         """Retrieve data from S3 parquet store.
+
+         Parameters
+         ----------
+         key : str
+             S3 path to the parquet data (s3:// protocol will be removed).
+         filter_func : callable, optional
+             Row filter function.
+         filters : list[tuple[str, str, list[str]]], optional
+             Partition filters.
+         schema : pa.Schema, optional
+             PyArrow schema for reading.
+
+         Returns
+         -------
+         pd.DataFrame
+             The loaded DataFrame.
+         """
+         return super().get(key=remove_s3_protocol(key), filter_func=filter_func, filters=filters, schema=schema)
+
+     def delete(
+         self,
+         key: str,
+         partition_field: str | None = None,
+         partition_values: list[str] | None = None,
+         kv_dict: dict[str, list[Any]] | None = None,
+         schema: pa.Schema = None,
+     ) -> None:
+         """Delete data from S3 parquet store.
+
+         Parameters
+         ----------
+         key : str
+             S3 path to the parquet data (s3:// protocol will be removed).
+         partition_field : str, optional
+             Field used for partitioning.
+         partition_values : list[str], optional
+             List of partition values to delete.
+         kv_dict : dict[str, list[Any]], optional
+             Dictionary of key-value filters for deletion.
+         schema : pa.Schema, optional
+             PyArrow schema.
+         """
+         super().delete(
+             key=remove_s3_protocol(key),
+             partition_field=partition_field,
+             partition_values=partition_values,
+             kv_dict=kv_dict,
+             schema=schema,
+         )
+
+     def save(
+         self,
+         df: pd.DataFrame,
+         key: str,
+         partition_field: str | None = None,
+         append: bool = False,
+         replace_partitions: bool = False,
+         schema: Any = None,
+     ) -> None:
+         """Save DataFrame to S3 parquet store.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The DataFrame to save.
+         key : str
+             S3 path to save the parquet data (s3:// protocol will be removed).
+         partition_field : str, optional
+             Field to partition by.
+         append : bool, default False
+             If True, append to existing data.
+         replace_partitions : bool, default False
+             If True, replace existing partitions.
+         schema : Any, optional
+             Schema for the data.
+         """
+         super().save(
+             df=df,
+             key=remove_s3_protocol(key),
+             partition_field=partition_field,
+             append=append,
+             replace_partitions=replace_partitions,
+             schema=schema,
+         )
+
+     def update(
+         self,
+         df: pd.DataFrame,
+         key: str,
+         key_fields: list[str],
+         partition_field: str | None = None,
+         schema: Any = None,
+     ) -> None:
+         """Update data in S3 parquet store.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The DataFrame with updated data.
+         key : str
+             S3 path to the parquet data (s3:// protocol will be removed).
+         key_fields : list[str]
+             List of fields to use as keys for matching records.
+         partition_field : str, optional
+             Field used for partitioning.
+         schema : Any, optional
+             Schema for the data.
+         """
+         super().update(
+             df=df, key=remove_s3_protocol(key), key_fields=key_fields, partition_field=partition_field, schema=schema
+         )
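One more illustrative sketch, this time for the S3 variant: it wires the credential config keys declared above into the store and round-trips a frame through a bucket. The bucket name and the environment-variable lookups are placeholders, and running it requires valid AWS credentials plus an existing bucket.

```python
import os

import pandas as pd

from tgedr_dataops.store.s3_single_partition_parquet import S3FsSinglePartitionParquetStore

# the three keys match the CONFIG_KEY_* constants defined above;
# without them the store falls back to s3fs' default credential resolution
config = {
    "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
    "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
    "aws_session_token": os.environ["AWS_SESSION_TOKEN"],
}
store = S3FsSinglePartitionParquetStore(config=config)

key = "s3://my-bucket/datasets/example"  # placeholder bucket/prefix
df = pd.DataFrame({"region": ["eu"], "value": [42]})

# the s3:// prefix is stripped internally via remove_s3_protocol before the
# inherited save/get logic runs against the s3fs filesystem
store.save(df, key, partition_field="region")
print(store.get(key).shape)
```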
tgedr_dataops-1.0.1.dist-info/METADATA
@@ -0,0 +1,72 @@
+ Metadata-Version: 2.4
+ Name: tgedr-dataops
+ Version: 1.0.1
+ Summary: data operations related code
+ Author-email: joao tiago viegas <3536754+jtviegas@users.noreply.github.com>
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: tgedr-dataops-abs==0.0.3
+ Requires-Dist: s3fs==2024.5.0
+ Requires-Dist: boto3==1.34.106
+ Requires-Dist: openpyxl==3.1.2
+ Requires-Dist: pyarrow>=23.0.0
+ Requires-Dist: moto>=5.1.20
+ Dynamic: license-file
+
+ # data-ops
+
+ ![Coverage](./coverage.svg)
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops)](https://pypi.org/project/tgedr-dataops/)
+
+
+
+ data operations related code
+
+ ## motivation
+ *data-ops* is a library with tested and used code aligning on some standards regarding code structure and quality and to avoid reinventing the wheel
+
+ ## installation
+ `pip install tgedr-dataops`
+
+ ## package namespaces and its contents
+
+ #### commons
+ - __S3Connector__: base class to be extended, providing a connection session with aws s3 resources
+ - __utils_fs__: utility module with file system related functions ([example](tests/tgedr_dataops/commons/test_utils_fs.py))
+
+ #### quality
+ - __PandasValidation__ : __GreatExpectationsValidation__ implementation to validate pandas dataframes with Great Expectations library ([example](tests/tgedr_dataops/quality/test_pandas_validation.py))
+
+
+ #### sink
+ - __LocalFsFileSink__: __Sink__ implementation class used to save/persist an object/file to a local fs location ([example](tests/tgedr_dataops/sink/test_localfs_file_sink.py))
+ - __S3FileSink__: __Sink__ implementation class used to save/persist a local object/file to an s3 bucket ([example](tests/tgedr_dataops/sink/test_s3_file_sink.py))
+
+ #### source
+ - __AbstractS3FileSource__: abstract __Source__ class used to retrieve objects/files from s3 bucket to local fs location circumventing some formats download limitation
+ - __LocalFsFileSource__: __Source__ implementation class used to retrieve local objects/files to another local fs location ([example](tests/tgedr_dataops/source/test_localfs_file_source.py))
+ - __PdDfS3Source__: __Source__ implementation class used to read a pandas dataframe from s3, whether a csv or an excel (xlsx) file ([example csv](tests/tgedr_dataops/source/test_pd_df_s3_source_csv.py), [example excel](tests/tgedr_dataops/source/test_pd_df_s3_source_excel.py))
+ - __S3FileCopy__: __Source__ implementation class used to copy objects/files from an s3 bucket to another s3 bucket ([example](tests/tgedr_dataops/source/test_s3_copy.py))
+ - __S3FileExtendedSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location with the extra method `get_metadata` providing file metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType") ([example](tests/tgedr_dataops/source/test_s3_file_extended_source.py))
+ - __S3FileSource__: __Source__ implementation class used to retrieve objects/files from s3 bucket to local fs location ([example](tests/tgedr_dataops/source/test_s3_file_source.py))
+
+ #### store
+ - __FsSinglePartitionParquetStore__ : abstract __Store__ implementation defining persistence on parquet files with an optional single partition, regardless of the location it should persist
+ - __LocalFsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using local file system ([example](tests/tgedr_dataops/store/test_local_fs_single_partition_parquet.py))
+ - __S3FsSinglePartitionParquetStore__ : __FsSinglePartitionParquetStore__ implementation using aws s3 file system ([example](tests/tgedr_dataops/store/MANUAL_test_s3_single_partition_parquet.py))
+
+
+ ## development
+ - main requirements:
+   - _uv_
+   - _bash_
+ - Clone the repository like this:
+
+ ``` bash
+ git clone git@github.com:tgedr/pycommons
+ ```
+ - cd into the folder: `cd pycommons`
+ - install requirements: `./helper.sh reqs`
+
+
tgedr_dataops-1.0.1.dist-info/RECORD
@@ -0,0 +1,22 @@
+ tgedr_dataops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tgedr_dataops/commons/s3_connector.py,sha256=Tf1Cc2CPE5uRl3oW_yTu59Ge2ukQmsncqM6HdG90sck,1993
+ tgedr_dataops/commons/utils_fs.py,sha256=ZFZG-aG-liq-s3CnEDggvLJgJ1zsoKsqYMQU7bMbxiY,4888
+ tgedr_dataops/quality/pandas_validation.py,sha256=WLHiL0b0Nm4ldO_1t55nFQ-F6UwxouYvbnP7bONrBp8,817
+ tgedr_dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tgedr_dataops/sink/local_fs_file_sink.py,sha256=5gM4dOXiGRtPLvYFHAvwijQbXRNltpUdS06kIGhOwj4,2353
+ tgedr_dataops/sink/s3_file_sink.py,sha256=5p591g89UjCpoRuKlaf7UAOzF7Ac0UyWEhSPZX7_O-E,3344
+ tgedr_dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tgedr_dataops/source/abstract_s3_file_source.py,sha256=XqUGDLz6qXJnydoyE3rQJcDOKwD_nuSzO2tElOfEM1w,2230
+ tgedr_dataops/source/local_fs_file_source.py,sha256=GheAp2QaugxDc4REMqJ_435seUmEv-Marnv4LwGXUjw,3553
+ tgedr_dataops/source/pd_df_s3_source.py,sha256=rL7eEEkwXate5aVNDC_LvU80apKphff2ViI65vB6TpI,4566
+ tgedr_dataops/source/s3_file_copy.py,sha256=rV-N2pwH12LrsXYl2aINBcHwBcHTP3xTF0aS8-oqZEQ,4899
+ tgedr_dataops/source/s3_file_extended_source.py,sha256=8LLNJmBUtrAEUSFE2Pl5rKa7TKEFinXQ0q6T9jJHS5E,2264
+ tgedr_dataops/source/s3_file_source.py,sha256=Ntfgn7r_Hy74CuO-UM5XqAnN_PZMiLipDxMsF2Es88s,4563
+ tgedr_dataops/store/fs_single_partition_parquet.py,sha256=tGcknJ0igaRrpURP-3nJoAyR1Hd_KbONjcyVADmRzwY,11705
+ tgedr_dataops/store/local_fs_single_partition_parquet.py,sha256=_0n2qUDABm0WRat4-V7gKoxz3JZVyomQbC_OCLmHxbg,1542
+ tgedr_dataops/store/s3_single_partition_parquet.py,sha256=dxmYY4h7Y9FfOKVjPgNEU812FH_5MeAQihRrdeHZd7U,6070
+ tgedr_dataops-1.0.1.dist-info/licenses/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
+ tgedr_dataops-1.0.1.dist-info/METADATA,sha256=Rfg1g1oAZQ9ajQq03tzbWhjCybZpJY4rDz9txCkrlrY,3865
+ tgedr_dataops-1.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ tgedr_dataops-1.0.1.dist-info/top_level.txt,sha256=O-CCFyLA9TzzhSdYShs9EcoZzpu_TzEr_rHGZiN-YBg,14
+ tgedr_dataops-1.0.1.dist-info/RECORD,,
{tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (80.10.2)
  Root-Is-Purelib: true
  Tag: py3-none-any
 
tgedr_dataops-1.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ tgedr_dataops