tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -76
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/s3_file_extended_source.py +0 -39
  35. tgedr/dataops/source/source.py +0 -51
  36. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  37. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  38. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  39. tgedr/dataops/store/spark_delta.py +0 -369
  40. tgedr/dataops/store/store.py +0 -49
  41. tgedr/dataops/utils_reflection.py +0 -134
  42. tgedr/dataops/validation/abs.py +0 -46
  43. tgedr/dataops/validation/pandas.py +0 -10
  44. tgedr/dataops/validation/pyspark.py +0 -10
  45. tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
  46. tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
  47. tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
  48. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  50. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  51. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
@@ -1,46 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- import logging
3
- from typing import Any
4
-
5
- from great_expectations.core import ExpectationSuite
6
- from great_expectations.dataset.dataset import Dataset
7
-
8
- from tgedr.dataops.utils_reflection import UtilsReflection
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class DataValidationException(Exception):
14
- pass
15
-
16
-
17
- class DataValidation(ABC):
18
- @staticmethod
19
- def get_impl(name: str):
20
- logger.info(f"[get_impl|in] ({name})")
21
- result = None
22
- module = ".".join(__name__.split(".")[:-1]) + "." + name.lower()
23
- try:
24
- result = UtilsReflection.load_subclass_from_module(module, "Impl", DataValidation)()
25
- except Exception as x:
26
- raise DataValidationException(f"[get_impl] couldn't load implementation for {name}: {x}")
27
- logger.info(f"[get_impl|out] ({result})")
28
- return result
29
-
30
- @abstractmethod
31
- def _get_dataset(self, df: Any) -> Dataset:
32
- raise NotImplementedError("DataValidation")
33
-
34
- def validate(self, df: Any, expectations: dict) -> None:
35
- logger.info(f"[validate|in] ({df}, {expectations})")
36
-
37
- try:
38
- dataset = self._get_dataset(df)
39
-
40
- validation = dataset.validate(ExpectationSuite(**expectations), only_return_failures=True)
41
- result = validation.to_json_dict()
42
- except Exception as x:
43
- raise DataValidationException(f"[validate] failed data expectations", x)
44
-
45
- logger.info(f"[validate|out] => {result['success']}")
46
- return result
@@ -1,10 +0,0 @@
1
- from typing import Any
2
- from great_expectations.dataset.dataset import Dataset
3
-
4
- from tgedr.dataops.validation.abs import DataValidation
5
- from great_expectations.dataset.sparkdf_dataset import PandasDataset
6
-
7
-
8
- class Impl(DataValidation):
9
- def _get_dataset(self, df: Any) -> Dataset:
10
- return PandasDataset(df)
@@ -1,10 +0,0 @@
1
- from typing import Any
2
- from great_expectations.dataset.dataset import Dataset
3
-
4
- from tgedr.dataops.validation.abs import DataValidation
5
- from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
6
-
7
-
8
- class Impl(DataValidation):
9
- def _get_dataset(self, df: Any) -> Dataset:
10
- return SparkDFDataset(df)
@@ -1,21 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: tgedr-dataops
3
- Version: 0.0.37
4
- Summary: data operations related code
5
- Home-page: https://github.com/jtviegas-sandbox/dataops
6
- Author: joao tiago viegas
7
- Author-email: jtviegas@gmail.com
8
- License: Unlicense
9
- Keywords: data engineering mlops ml
10
- Classifier: Development Status :: 3 - Alpha
11
- Classifier: Programming Language :: Python :: 3.10
12
- Requires-Python: >=3.9
13
- License-File: LICENSE
14
- Requires-Dist: pandas ==1.5.3
15
- Requires-Dist: pyarrow ==15.*
16
- Requires-Dist: s3fs ==2024.5.0
17
- Requires-Dist: boto3 ==1.34.106
18
- Requires-Dist: great-expectations ==0.18.10
19
- Requires-Dist: deltalake ~=0.16.4
20
- Requires-Dist: openpyxl ==3.1.2
21
-
@@ -1,38 +0,0 @@
1
- tgedr/dataops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- tgedr/dataops/chain.py,sha256=bo9HhfV3eJmss52HaR8LkV7RYSTKmya0lbcqiovOnQU,1405
3
- tgedr/dataops/etl.py,sha256=BEnpcBPJxAIXVhwnprTMl-onQHuUn3amuAgPVPJnnhs,3250
4
- tgedr/dataops/processor.py,sha256=GZmq7yefq6ySgRkxJyHxXUsUhfa6NiHmn1W3SECXVBA,710
5
- tgedr/dataops/utils_reflection.py,sha256=W3eqrypMwqRsvevEZ9d6Zxw_GJL4d938lXiU1gIOwlo,4841
6
- tgedr/dataops/commons/dataset.py,sha256=VIt385Zps78y6cbe54G4PLNuUA647S94UqokubM1gKI,637
7
- tgedr/dataops/commons/metadata.py,sha256=5uIurGkkREJuAlONav8_TyLVNY097Zix6LIdRxVE_uQ,5796
8
- tgedr/dataops/commons/s3_connector.py,sha256=AmD8l86ORbABamkquD8BMtMpgSfThkVbl9Kf54wR_pA,1194
9
- tgedr/dataops/commons/utils_fs.py,sha256=8gXPcdQx520pr1h7tn7m_Z1AojgWIdebNLCyiQgEZEU,2362
10
- tgedr/dataops/commons/utils_spark.py,sha256=VSFUcQt-yE54VHJkonveFwhM3ADTK3kfC3QfA34QOVI,3191
11
- tgedr/dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- tgedr/dataops/sink/local_fs_file_sink.py,sha256=JjFWwW_ufWF8I-PVPP1TleKmRj8Des-6A0xLqtbPYW8,1508
13
- tgedr/dataops/sink/s3_file_sink.py,sha256=m1D6SwuqYEVqvdA9XrL6nVlX_oWLRDT6v74mh_diuQM,2362
14
- tgedr/dataops/sink/sink.py,sha256=8rG3ZNpzeZ82Ac1IoPzkdQTs006IbG-k39APFCeXogk,1271
15
- tgedr/dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- tgedr/dataops/source/abstract_s3_file_source.py,sha256=AMnTWxs57Jni46jz5PvrZTRjd-Ah_TUZEP4Rcx7YziM,1513
17
- tgedr/dataops/source/delta_table_source.py,sha256=tXeJCM-QhssztEpyKGzb6zcM1AlaUwV1Y5d2RHCiOTU,1653
18
- tgedr/dataops/source/local_delta_table.py,sha256=Z9413skOSsNGfSNeBs-ETVXQJcW4bhS3MYmCwySo9Tc,1545
19
- tgedr/dataops/source/local_fs_file_source.py,sha256=wihK_wVV6xb8AlkCmZHSRlRmqqm6w9Aq-PI0oOnhn_k,2570
20
- tgedr/dataops/source/pd_df_s3_source.py,sha256=Bfnj363MZtmMJB4W32DKEaYYG0v7NeG-m66C5i8bxsc,3103
21
- tgedr/dataops/source/s3_delta_table.py,sha256=d_FoFDNogjZifRQv8V2OmTXbPYymx9QaQcW15uX34pI,2974
22
- tgedr/dataops/source/s3_file_copy.py,sha256=DbHvstAqi23cywoG6nHpQxvzerrnepApOsv6zsELYNQ,3930
23
- tgedr/dataops/source/s3_file_extended_source.py,sha256=B6m84DWV45E1V3XsLkHARp9YVNvmsDi54ikd2__9q5Q,1384
24
- tgedr/dataops/source/s3_file_source.py,sha256=ayQ-gPxARFc1xINFOM0_hQFUn6odpts_uFaDZYs7RC4,3520
25
- tgedr/dataops/source/source.py,sha256=REeqluMGLMjoDWtdZthzUYkmVeHemSV7t9wjc6eTpJE,1350
26
- tgedr/dataops/store/fs_single_partition_parquet.py,sha256=CR3406emhxn33jjObnMotXEmZGfh4Iu5Ygv30FvkY6Y,9695
27
- tgedr/dataops/store/local_fs_single_partition_parquet.py,sha256=N_I96fqxQAp2fWBngoDci3aR1-kcmkWjIVRD0nUi07U,683
28
- tgedr/dataops/store/s3_single_partition_parquet.py,sha256=2vSyLb-mZ2mAbRhaDoCcsOEnKzQ30AsUYTlqhKQCroc,3249
29
- tgedr/dataops/store/spark_delta.py,sha256=AHqIKDi9axOKpMJHt4AiBGX8V2mFV_7vNPRpaw37rDY,15101
30
- tgedr/dataops/store/store.py,sha256=uAuR7MWVdKRaisQ69rFqliLnJrsgpTzbsOh7uPmLSlI,1315
31
- tgedr/dataops/validation/abs.py,sha256=84HGUuh6k_uG-ON0bauR4lDBTfUeI3GmxOiWsMkTu3E,1521
32
- tgedr/dataops/validation/pandas.py,sha256=Vfr38f3txbTy098ufPUcRsCgrYu47Rg35upZ9IXcLSk,315
33
- tgedr/dataops/validation/pyspark.py,sha256=4OEnA21_vSwB5HjaD5KZdfIjBnTn3KUAqvVGHnY-zNI,317
34
- tgedr_dataops-0.0.37.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
35
- tgedr_dataops-0.0.37.dist-info/METADATA,sha256=OGYBz2HJOKVIl5NlRqzjqjG4zHXmgFA2sErmeFCIAYk,639
36
- tgedr_dataops-0.0.37.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
37
- tgedr_dataops-0.0.37.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
38
- tgedr_dataops-0.0.37.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- tgedr
File without changes
File without changes
File without changes