tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -76
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/s3_file_extended_source.py +0 -39
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
- tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
- tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
tgedr/dataops/validation/abs.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
import logging
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from great_expectations.core import ExpectationSuite
|
|
6
|
-
from great_expectations.dataset.dataset import Dataset
|
|
7
|
-
|
|
8
|
-
from tgedr.dataops.utils_reflection import UtilsReflection
|
|
9
|
-
|
|
10
|
-
logger = logging.getLogger(__name__)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class DataValidationException(Exception):
|
|
14
|
-
pass
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class DataValidation(ABC):
|
|
18
|
-
@staticmethod
|
|
19
|
-
def get_impl(name: str):
|
|
20
|
-
logger.info(f"[get_impl|in] ({name})")
|
|
21
|
-
result = None
|
|
22
|
-
module = ".".join(__name__.split(".")[:-1]) + "." + name.lower()
|
|
23
|
-
try:
|
|
24
|
-
result = UtilsReflection.load_subclass_from_module(module, "Impl", DataValidation)()
|
|
25
|
-
except Exception as x:
|
|
26
|
-
raise DataValidationException(f"[get_impl] couldn't load implementation for {name}: {x}")
|
|
27
|
-
logger.info(f"[get_impl|out] ({result})")
|
|
28
|
-
return result
|
|
29
|
-
|
|
30
|
-
@abstractmethod
|
|
31
|
-
def _get_dataset(self, df: Any) -> Dataset:
|
|
32
|
-
raise NotImplementedError("DataValidation")
|
|
33
|
-
|
|
34
|
-
def validate(self, df: Any, expectations: dict) -> None:
|
|
35
|
-
logger.info(f"[validate|in] ({df}, {expectations})")
|
|
36
|
-
|
|
37
|
-
try:
|
|
38
|
-
dataset = self._get_dataset(df)
|
|
39
|
-
|
|
40
|
-
validation = dataset.validate(ExpectationSuite(**expectations), only_return_failures=True)
|
|
41
|
-
result = validation.to_json_dict()
|
|
42
|
-
except Exception as x:
|
|
43
|
-
raise DataValidationException(f"[validate] failed data expectations", x)
|
|
44
|
-
|
|
45
|
-
logger.info(f"[validate|out] => {result['success']}")
|
|
46
|
-
return result
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
from typing import Any
|
|
2
|
-
from great_expectations.dataset.dataset import Dataset
|
|
3
|
-
|
|
4
|
-
from tgedr.dataops.validation.abs import DataValidation
|
|
5
|
-
from great_expectations.dataset.sparkdf_dataset import PandasDataset
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Impl(DataValidation):
|
|
9
|
-
def _get_dataset(self, df: Any) -> Dataset:
|
|
10
|
-
return PandasDataset(df)
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
from typing import Any
|
|
2
|
-
from great_expectations.dataset.dataset import Dataset
|
|
3
|
-
|
|
4
|
-
from tgedr.dataops.validation.abs import DataValidation
|
|
5
|
-
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Impl(DataValidation):
|
|
9
|
-
def _get_dataset(self, df: Any) -> Dataset:
|
|
10
|
-
return SparkDFDataset(df)
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: tgedr-dataops
|
|
3
|
-
Version: 0.0.37
|
|
4
|
-
Summary: data operations related code
|
|
5
|
-
Home-page: https://github.com/jtviegas-sandbox/dataops
|
|
6
|
-
Author: joao tiago viegas
|
|
7
|
-
Author-email: jtviegas@gmail.com
|
|
8
|
-
License: Unlicense
|
|
9
|
-
Keywords: data engineering mlops ml
|
|
10
|
-
Classifier: Development Status :: 3 - Alpha
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
-
Requires-Python: >=3.9
|
|
13
|
-
License-File: LICENSE
|
|
14
|
-
Requires-Dist: pandas ==1.5.3
|
|
15
|
-
Requires-Dist: pyarrow ==15.*
|
|
16
|
-
Requires-Dist: s3fs ==2024.5.0
|
|
17
|
-
Requires-Dist: boto3 ==1.34.106
|
|
18
|
-
Requires-Dist: great-expectations ==0.18.10
|
|
19
|
-
Requires-Dist: deltalake ~=0.16.4
|
|
20
|
-
Requires-Dist: openpyxl ==3.1.2
|
|
21
|
-
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
tgedr/dataops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
tgedr/dataops/chain.py,sha256=bo9HhfV3eJmss52HaR8LkV7RYSTKmya0lbcqiovOnQU,1405
|
|
3
|
-
tgedr/dataops/etl.py,sha256=BEnpcBPJxAIXVhwnprTMl-onQHuUn3amuAgPVPJnnhs,3250
|
|
4
|
-
tgedr/dataops/processor.py,sha256=GZmq7yefq6ySgRkxJyHxXUsUhfa6NiHmn1W3SECXVBA,710
|
|
5
|
-
tgedr/dataops/utils_reflection.py,sha256=W3eqrypMwqRsvevEZ9d6Zxw_GJL4d938lXiU1gIOwlo,4841
|
|
6
|
-
tgedr/dataops/commons/dataset.py,sha256=VIt385Zps78y6cbe54G4PLNuUA647S94UqokubM1gKI,637
|
|
7
|
-
tgedr/dataops/commons/metadata.py,sha256=5uIurGkkREJuAlONav8_TyLVNY097Zix6LIdRxVE_uQ,5796
|
|
8
|
-
tgedr/dataops/commons/s3_connector.py,sha256=AmD8l86ORbABamkquD8BMtMpgSfThkVbl9Kf54wR_pA,1194
|
|
9
|
-
tgedr/dataops/commons/utils_fs.py,sha256=8gXPcdQx520pr1h7tn7m_Z1AojgWIdebNLCyiQgEZEU,2362
|
|
10
|
-
tgedr/dataops/commons/utils_spark.py,sha256=VSFUcQt-yE54VHJkonveFwhM3ADTK3kfC3QfA34QOVI,3191
|
|
11
|
-
tgedr/dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
-
tgedr/dataops/sink/local_fs_file_sink.py,sha256=JjFWwW_ufWF8I-PVPP1TleKmRj8Des-6A0xLqtbPYW8,1508
|
|
13
|
-
tgedr/dataops/sink/s3_file_sink.py,sha256=m1D6SwuqYEVqvdA9XrL6nVlX_oWLRDT6v74mh_diuQM,2362
|
|
14
|
-
tgedr/dataops/sink/sink.py,sha256=8rG3ZNpzeZ82Ac1IoPzkdQTs006IbG-k39APFCeXogk,1271
|
|
15
|
-
tgedr/dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
tgedr/dataops/source/abstract_s3_file_source.py,sha256=AMnTWxs57Jni46jz5PvrZTRjd-Ah_TUZEP4Rcx7YziM,1513
|
|
17
|
-
tgedr/dataops/source/delta_table_source.py,sha256=tXeJCM-QhssztEpyKGzb6zcM1AlaUwV1Y5d2RHCiOTU,1653
|
|
18
|
-
tgedr/dataops/source/local_delta_table.py,sha256=Z9413skOSsNGfSNeBs-ETVXQJcW4bhS3MYmCwySo9Tc,1545
|
|
19
|
-
tgedr/dataops/source/local_fs_file_source.py,sha256=wihK_wVV6xb8AlkCmZHSRlRmqqm6w9Aq-PI0oOnhn_k,2570
|
|
20
|
-
tgedr/dataops/source/pd_df_s3_source.py,sha256=Bfnj363MZtmMJB4W32DKEaYYG0v7NeG-m66C5i8bxsc,3103
|
|
21
|
-
tgedr/dataops/source/s3_delta_table.py,sha256=d_FoFDNogjZifRQv8V2OmTXbPYymx9QaQcW15uX34pI,2974
|
|
22
|
-
tgedr/dataops/source/s3_file_copy.py,sha256=DbHvstAqi23cywoG6nHpQxvzerrnepApOsv6zsELYNQ,3930
|
|
23
|
-
tgedr/dataops/source/s3_file_extended_source.py,sha256=B6m84DWV45E1V3XsLkHARp9YVNvmsDi54ikd2__9q5Q,1384
|
|
24
|
-
tgedr/dataops/source/s3_file_source.py,sha256=ayQ-gPxARFc1xINFOM0_hQFUn6odpts_uFaDZYs7RC4,3520
|
|
25
|
-
tgedr/dataops/source/source.py,sha256=REeqluMGLMjoDWtdZthzUYkmVeHemSV7t9wjc6eTpJE,1350
|
|
26
|
-
tgedr/dataops/store/fs_single_partition_parquet.py,sha256=CR3406emhxn33jjObnMotXEmZGfh4Iu5Ygv30FvkY6Y,9695
|
|
27
|
-
tgedr/dataops/store/local_fs_single_partition_parquet.py,sha256=N_I96fqxQAp2fWBngoDci3aR1-kcmkWjIVRD0nUi07U,683
|
|
28
|
-
tgedr/dataops/store/s3_single_partition_parquet.py,sha256=2vSyLb-mZ2mAbRhaDoCcsOEnKzQ30AsUYTlqhKQCroc,3249
|
|
29
|
-
tgedr/dataops/store/spark_delta.py,sha256=AHqIKDi9axOKpMJHt4AiBGX8V2mFV_7vNPRpaw37rDY,15101
|
|
30
|
-
tgedr/dataops/store/store.py,sha256=uAuR7MWVdKRaisQ69rFqliLnJrsgpTzbsOh7uPmLSlI,1315
|
|
31
|
-
tgedr/dataops/validation/abs.py,sha256=84HGUuh6k_uG-ON0bauR4lDBTfUeI3GmxOiWsMkTu3E,1521
|
|
32
|
-
tgedr/dataops/validation/pandas.py,sha256=Vfr38f3txbTy098ufPUcRsCgrYu47Rg35upZ9IXcLSk,315
|
|
33
|
-
tgedr/dataops/validation/pyspark.py,sha256=4OEnA21_vSwB5HjaD5KZdfIjBnTn3KUAqvVGHnY-zNI,317
|
|
34
|
-
tgedr_dataops-0.0.37.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
|
|
35
|
-
tgedr_dataops-0.0.37.dist-info/METADATA,sha256=OGYBz2HJOKVIl5NlRqzjqjG4zHXmgFA2sErmeFCIAYk,639
|
|
36
|
-
tgedr_dataops-0.0.37.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
37
|
-
tgedr_dataops-0.0.37.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
|
|
38
|
-
tgedr_dataops-0.0.37.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
tgedr
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|