tgedr-dataops 0.0.34__tar.gz → 0.0.37__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {tgedr_dataops-0.0.34/src/tgedr_dataops.egg-info → tgedr_dataops-0.0.37}/PKG-INFO +2 -1
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/README.md +5 -4
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/setup.py +3 -2
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/pd_df_s3_source.py +27 -2
- tgedr_dataops-0.0.37/src/tgedr/dataops/source/s3_file_extended_source.py +39 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/s3_file_source.py +0 -15
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/s3_single_partition_parquet.py +1 -1
- tgedr_dataops-0.0.37/src/tgedr/dataops/validation/pyspark.py +10 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info}/PKG-INFO +2 -1
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr_dataops.egg-info/SOURCES.txt +2 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr_dataops.egg-info/requires.txt +1 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/LICENSE +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/pyproject.toml +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/setup.cfg +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/__init__.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/chain.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/commons/dataset.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/commons/metadata.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/commons/s3_connector.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/commons/utils_fs.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/commons/utils_spark.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/etl.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/processor.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/sink/__init__.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/sink/local_fs_file_sink.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/sink/s3_file_sink.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/sink/sink.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/__init__.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/abstract_s3_file_source.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/delta_table_source.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/local_delta_table.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/local_fs_file_source.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/s3_delta_table.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/s3_file_copy.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/source.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/fs_single_partition_parquet.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/spark_delta.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/store.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/utils_reflection.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/validation/abs.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/validation/pandas.py +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr_dataops.egg-info/dependency_links.txt +0 -0
- {tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr_dataops.egg-info/top_level.txt +0 -0
{tgedr_dataops-0.0.34/src/tgedr_dataops.egg-info → tgedr_dataops-0.0.37}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tgedr-dataops
-Version: 0.0.34
+Version: 0.0.37
 Summary: data operations related code
 Home-page: https://github.com/jtviegas-sandbox/dataops
 Author: joao tiago viegas
@@ -17,3 +17,4 @@ Requires-Dist: s3fs==2024.5.0
 Requires-Dist: boto3==1.34.106
 Requires-Dist: great_expectations==0.18.10
 Requires-Dist: deltalake~=0.16.4
+Requires-Dist: openpyxl==3.1.2
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/README.md
RENAMED
@@ -28,10 +28,11 @@ data operations related code
 - __Source__: abstract **source** class defining methods (`list` and `get`) to manage retrieval of data from somewhere as defined by implementing classes
 - __LocalFsFileSource__: __source__ class used to retrieve local objects/files to another local fs location ([example](test/tgedr/dataops/source/test_localfs_file_source.py))
 - __S3FileSource__: __source__ class used to retrieve objects/files from s3 bucket to local fs location ([example](test/tgedr/dataops/source/test_s3_file_source.py))
+- __S3FileExtendedSource__: __source__ class used to retrieve objects/files from s3 bucket to local fs location with the extra method `get_metadata` providing file metadata ("LastModified", "ContentLength", "ETag", "VersionId", "ContentType") ([example](test/tgedr/dataops/source/test_s3_file_extended_source.py))
 - __S3FileCopy__: __source__ class used to copy objects/files from an s3 bucket to another s3 bucket ([example](test/tgedr/dataops/source/test_s3_copy.py))
 - __S3DeltaTable__: __source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_s3_delta_table.py))
 - __LocalDeltaTable__: __source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_local_delta_table.py))
-- __PdDfS3Source__: __source__ class used to read a pandas dataframe from,
+- __PdDfS3Source__: __source__ class used to read a pandas dataframe from s3, whether a csv or an excel (xlsx) file ([example csv](test/tgedr/dataops/source/test_pd_df_s3_source_csv.py), [example excel](test/tgedr/dataops/source/test_pd_df_s3_source_excel.py))
 
 #### store
 - __Store__ : abstract class used to manage persistence, defining CRUD-like (CreateReadUpdateDelete) methods
@@ -41,9 +42,9 @@ data operations related code
 - __SparkDeltaStore__ : __store__ implementation for pyspark distributed processing with delta table format ([example](test/tgedr/dataops/store/test_spark_delta.py))
 
 #### validation
-- __DataValidation__ : abstract class defining a `validate` method to perform data validation, currently using Great Expectations library
-- __pandas.Impl__ : __DataValidation__ implementation to validate pandas dataframes with Great Expectations library ([example](test/tgedr/dataops/validation/
-
+- __DataValidation__ : abstract class defining a `validate` method to perform data validation, currently using [Great Expectations library](https://greatexpectations.io/expectations/)
+- __pandas.Impl__ : __DataValidation__ implementation to validate pandas dataframes with Great Expectations library ([example](test/tgedr/dataops/validation/test_pandas.py))
+- __pyspark.Impl__ : __DataValidation__ implementation to validate pyspark dataframes with Great Expectations library ([example](test/tgedr/dataops/validation/test_pyspark.py))
 
 
 ## installation
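For orientation, a minimal usage sketch of the new excel path described in the README entry above. The class and context-key constants appear in this diff, but the s3 url is a placeholder and configured AWS credentials are assumed; this is not part of the released package.

```python
# Hypothetical sketch of the new xlsx path in PdDfS3Source (0.0.37).
# The context-key constants are taken from this diff; the s3 url is a placeholder.
from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source

source = PdDfS3Source()
df = source.get(
    context={
        PdDfS3Source.CONTEXT_KEY_URL: "s3://a-bucket/a-sheet.xlsx",  # placeholder
        PdDfS3Source.CONTEXT_KEY_FILE_FORMAT: "xlsx",  # csv remains the default
    }
)
print(df.head())
```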
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/setup.py
RENAMED
@@ -4,7 +4,7 @@ import os
 from setuptools import setup, find_namespace_packages
 
 logger = logging.getLogger(__name__)
-VERSION = "0.0.34"
+VERSION = "0.0.37"
 logging.info(f"building version: {VERSION}")
 
 setup(
@@ -29,7 +29,8 @@ setup(
         "s3fs==2024.5.0",
         "boto3==1.34.106",
         "great_expectations==0.18.10",
-        "deltalake~=0.16.4"
+        "deltalake~=0.16.4",
+        "openpyxl==3.1.2"
     ],
     python_requires='>=3.9',
 )
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/pd_df_s3_source.py
RENAMED
@@ -19,6 +19,7 @@ class PdDfS3Source(AbstractS3FileSource):
     CONTEXT_KEY_COLUMN_NAMES = "column_names"
     CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
     DEFAULT_FORMAT = "csv"
+    FORMATS = ["csv", "xlsx"]
     DEFAULT_SEPARATOR = ","
 
     def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -33,6 +34,23 @@ class PdDfS3Source(AbstractS3FileSource):
         if self.CONTEXT_KEY_URL not in context:
             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
 
+        format: str = self.DEFAULT_FORMAT
+        if self.CONTEXT_KEY_FILE_FORMAT in context:
+            format = context[self.CONTEXT_KEY_FILE_FORMAT]
+            if format not in self.FORMATS:
+                raise SourceException(f"[get] invalid format: {format}")
+
+        if "csv" == format:
+            result = self.__read_csv(context=context)
+        else:
+            result = self.__read_excel(context=context)
+
+        logger.info(f"[get|out] => {result}")
+        return result
+
+    def __read_csv(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        logger.info(f"[__read_csv|in] ({context})")
+
         protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
 
         obj = self._client.get_object(Bucket=bucket, Key=key)
@@ -45,7 +63,14 @@ class PdDfS3Source(AbstractS3FileSource):
             self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
         )
 
-        result = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
+        result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
 
-        logger.info(f"[
+        logger.info(f"[__read_csv|out] => {result}")
+        return result
+
+    def __read_excel(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        logger.info(f"[__read_excel|in] ({context})")
+        src = context[self.CONTEXT_KEY_URL]
+        result: pd.DataFrame = pd.read_excel(src, engine="openpyxl")
+        logger.info(f"[__read_excel|out] => {result}")
         return result
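The new dispatch above falls back to `DEFAULT_FORMAT` ("csv") and rejects anything outside `FORMATS`. A hedged sketch of the failure path follows; the url and format value are placeholders, not part of the package.

```python
# Hypothetical sketch: requesting a format outside FORMATS raises SourceException.
from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source
from tgedr.dataops.source.source import SourceException

source = PdDfS3Source()
try:
    source.get(
        context={
            PdDfS3Source.CONTEXT_KEY_URL: "s3://a-bucket/data.json",  # placeholder
            PdDfS3Source.CONTEXT_KEY_FILE_FORMAT: "json",  # not in ["csv", "xlsx"]
        }
    )
except SourceException as e:
    print(e)  # "[get] invalid format: json"
```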
tgedr_dataops-0.0.37/src/tgedr/dataops/source/s3_file_extended_source.py
ADDED
@@ -0,0 +1,39 @@
+import logging
+from typing import Any, Dict, Optional
+
+from tgedr.dataops.source.s3_file_source import S3FileSource
+from tgedr.dataops.source.source import SourceException
+from tgedr.dataops.commons.utils_fs import process_s3_path
+
+
+logger = logging.getLogger(__name__)
+
+
+class S3FileExtendedSource(S3FileSource):
+    """class used to retrieve objects/files from s3 bucket to local fs location"""
+
+    METADATA_KEYS = ["LastModified", "ContentLength", "ETag", "VersionId", "ContentType"]
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config=config)
+
+    def get_metadata(self, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        logger.info(f"[get_metadata|in] ({context})")
+
+        result: Dict[str, Any] = {}
+        if self.CONTEXT_KEY_SOURCE not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
+
+        bucket, key = process_s3_path(context[self.CONTEXT_KEY_SOURCE])
+
+        o = self._client.head_object(Bucket=bucket, Key=key)
+
+        for key in list(o.keys()):
+            if key in self.METADATA_KEYS:
+                if key == "LastModified":
+                    result[key] = int(o[key].timestamp())
+                else:
+                    result[key] = o[key]
+
+        logger.info(f"[get_metadata|out] => result len: {result}")
+        return result
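A hypothetical call against the new class. `CONTEXT_KEY_SOURCE` is inherited from `S3FileSource` and its string value is not shown in this diff, so the constant is referenced directly; the path is a placeholder and credentials are assumed.

```python
# Hypothetical sketch of S3FileExtendedSource.get_metadata (new in 0.0.37).
from tgedr.dataops.source.s3_file_extended_source import S3FileExtendedSource

source = S3FileExtendedSource()
meta = source.get_metadata(
    context={S3FileExtendedSource.CONTEXT_KEY_SOURCE: "a-bucket/a-prefix/a-file.txt"}  # placeholder
)
# Only the keys in METADATA_KEYS are kept, and LastModified is converted to an
# epoch-seconds int, e.g.:
# {"LastModified": 1716038400, "ContentLength": 1234, "ETag": '"abc..."',
#  "ContentType": "text/plain"}
print(meta)
```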
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/source/s3_file_source.py
RENAMED
@@ -22,21 +22,6 @@ class S3FileSource(Source, S3Connector):
         Source.__init__(self, config=config)
         S3Connector.__init__(self)
 
-    def __derive_local_file(self, target: str, isdir: bool, file: str):
-        logger.debug(f"[__derive_local_file|in] ({target}, {isdir}, {file})")
-
-        if isdir:
-            result = os.path.join(target, file)
-            basedir = os.path.dirname(result)
-            if not os.path.exists(basedir):
-                logger.debug(f"[__derive_local_file] creating folder: {basedir}")
-                os.mkdir(basedir)
-        else:
-            result = target
-
-        logger.debug(f"[__derive_local_file|out] => {result}")
-        return result
-
     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
         logger.info(f"[list|in] ({context})")
 
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr/dataops/store/s3_single_partition_parquet.py
RENAMED
@@ -6,7 +6,7 @@ import pandas as pd
 import pyarrow as pa
 
 from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
-from
+from tgedr.dataops.commons.utils_fs import remove_s3_protocol
 
 
 logger = logging.getLogger(__name__)
tgedr_dataops-0.0.37/src/tgedr/dataops/validation/pyspark.py
ADDED
@@ -0,0 +1,10 @@
+from typing import Any
+from great_expectations.dataset.dataset import Dataset
+
+from tgedr.dataops.validation.abs import DataValidation
+from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
+
+
+class Impl(DataValidation):
+    def _get_dataset(self, df: Any) -> Dataset:
+        return SparkDFDataset(df)
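The abstract `validate` signature is not part of this diff, so the sketch below only exercises what `Impl._get_dataset` wraps: the legacy `SparkDFDataset` API of the pinned great_expectations==0.18.10. The dataframe contents and expectation are illustrative.

```python
# Hypothetical sketch around the new pyspark validation Impl: it wraps a
# pyspark dataframe in great_expectations' SparkDFDataset, as _get_dataset does.
from pyspark.sql import SparkSession
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset

spark = SparkSession.builder.appName("validation-sketch").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, None)], ["id", "label"])

dataset = SparkDFDataset(df)  # what Impl._get_dataset(df) returns
check = dataset.expect_column_values_to_not_be_null("label")
print(check.success)  # False: one null in "label"
```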
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37/src/tgedr_dataops.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tgedr-dataops
-Version: 0.0.34
+Version: 0.0.37
 Summary: data operations related code
 Home-page: https://github.com/jtviegas-sandbox/dataops
 Author: joao tiago viegas
@@ -17,3 +17,4 @@ Requires-Dist: s3fs==2024.5.0
 Requires-Dist: boto3==1.34.106
 Requires-Dist: great_expectations==0.18.10
 Requires-Dist: deltalake~=0.16.4
+Requires-Dist: openpyxl==3.1.2
{tgedr_dataops-0.0.34 → tgedr_dataops-0.0.37}/src/tgedr_dataops.egg-info/SOURCES.txt
RENAMED
@@ -24,6 +24,7 @@ src/tgedr/dataops/source/local_fs_file_source.py
 src/tgedr/dataops/source/pd_df_s3_source.py
 src/tgedr/dataops/source/s3_delta_table.py
 src/tgedr/dataops/source/s3_file_copy.py
+src/tgedr/dataops/source/s3_file_extended_source.py
 src/tgedr/dataops/source/s3_file_source.py
 src/tgedr/dataops/source/source.py
 src/tgedr/dataops/store/fs_single_partition_parquet.py
@@ -33,6 +34,7 @@ src/tgedr/dataops/store/spark_delta.py
 src/tgedr/dataops/store/store.py
 src/tgedr/dataops/validation/abs.py
 src/tgedr/dataops/validation/pandas.py
+src/tgedr/dataops/validation/pyspark.py
 src/tgedr_dataops.egg-info/PKG-INFO
 src/tgedr_dataops.egg-info/SOURCES.txt
 src/tgedr_dataops.egg-info/dependency_links.txt