tgedr-dataops 0.0.36__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +60 -39
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -51
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/source.py +0 -51
  35. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  36. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  37. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  38. tgedr/dataops/store/spark_delta.py +0 -369
  39. tgedr/dataops/store/store.py +0 -49
  40. tgedr/dataops/utils_reflection.py +0 -134
  41. tgedr/dataops/validation/abs.py +0 -46
  42. tgedr/dataops/validation/pandas.py +0 -10
  43. tgedr/dataops/validation/pyspark.py +0 -10
  44. tgedr_dataops-0.0.36.dist-info/METADATA +0 -20
  45. tgedr_dataops-0.0.36.dist-info/RECORD +0 -37
  46. tgedr_dataops-0.0.36.dist-info/top_level.txt +0 -1
  47. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  48. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  50. {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
@@ -1,46 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- import logging
3
- from typing import Any
4
-
5
- from great_expectations.core import ExpectationSuite
6
- from great_expectations.dataset.dataset import Dataset
7
-
8
- from tgedr.dataops.utils_reflection import UtilsReflection
9
-
10
- logger = logging.getLogger(__name__)
11
-
12
-
13
- class DataValidationException(Exception):
14
- pass
15
-
16
-
17
- class DataValidation(ABC):
18
- @staticmethod
19
- def get_impl(name: str):
20
- logger.info(f"[get_impl|in] ({name})")
21
- result = None
22
- module = ".".join(__name__.split(".")[:-1]) + "." + name.lower()
23
- try:
24
- result = UtilsReflection.load_subclass_from_module(module, "Impl", DataValidation)()
25
- except Exception as x:
26
- raise DataValidationException(f"[get_impl] couldn't load implementation for {name}: {x}")
27
- logger.info(f"[get_impl|out] ({result})")
28
- return result
29
-
30
- @abstractmethod
31
- def _get_dataset(self, df: Any) -> Dataset:
32
- raise NotImplementedError("DataValidation")
33
-
34
- def validate(self, df: Any, expectations: dict) -> None:
35
- logger.info(f"[validate|in] ({df}, {expectations})")
36
-
37
- try:
38
- dataset = self._get_dataset(df)
39
-
40
- validation = dataset.validate(ExpectationSuite(**expectations), only_return_failures=True)
41
- result = validation.to_json_dict()
42
- except Exception as x:
43
- raise DataValidationException(f"[validate] failed data expectations", x)
44
-
45
- logger.info(f"[validate|out] => {result['success']}")
46
- return result
@@ -1,10 +0,0 @@
1
- from typing import Any
2
- from great_expectations.dataset.dataset import Dataset
3
-
4
- from tgedr.dataops.validation.abs import DataValidation
5
- from great_expectations.dataset.sparkdf_dataset import PandasDataset
6
-
7
-
8
- class Impl(DataValidation):
9
- def _get_dataset(self, df: Any) -> Dataset:
10
- return PandasDataset(df)
@@ -1,10 +0,0 @@
1
- from typing import Any
2
- from great_expectations.dataset.dataset import Dataset
3
-
4
- from tgedr.dataops.validation.abs import DataValidation
5
- from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
6
-
7
-
8
- class Impl(DataValidation):
9
- def _get_dataset(self, df: Any) -> Dataset:
10
- return SparkDFDataset(df)
@@ -1,20 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: tgedr-dataops
3
- Version: 0.0.36
4
- Summary: data operations related code
5
- Home-page: https://github.com/jtviegas-sandbox/dataops
6
- Author: joao tiago viegas
7
- Author-email: jtviegas@gmail.com
8
- License: Unlicense
9
- Keywords: data engineering mlops ml
10
- Classifier: Development Status :: 3 - Alpha
11
- Classifier: Programming Language :: Python :: 3.10
12
- Requires-Python: >=3.9
13
- License-File: LICENSE
14
- Requires-Dist: pandas ==1.5.3
15
- Requires-Dist: pyarrow ==15.*
16
- Requires-Dist: s3fs ==2024.5.0
17
- Requires-Dist: boto3 ==1.34.106
18
- Requires-Dist: great-expectations ==0.18.10
19
- Requires-Dist: deltalake ~=0.16.4
20
-
@@ -1,37 +0,0 @@
1
- tgedr/dataops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- tgedr/dataops/chain.py,sha256=bo9HhfV3eJmss52HaR8LkV7RYSTKmya0lbcqiovOnQU,1405
3
- tgedr/dataops/etl.py,sha256=BEnpcBPJxAIXVhwnprTMl-onQHuUn3amuAgPVPJnnhs,3250
4
- tgedr/dataops/processor.py,sha256=GZmq7yefq6ySgRkxJyHxXUsUhfa6NiHmn1W3SECXVBA,710
5
- tgedr/dataops/utils_reflection.py,sha256=W3eqrypMwqRsvevEZ9d6Zxw_GJL4d938lXiU1gIOwlo,4841
6
- tgedr/dataops/commons/dataset.py,sha256=VIt385Zps78y6cbe54G4PLNuUA647S94UqokubM1gKI,637
7
- tgedr/dataops/commons/metadata.py,sha256=5uIurGkkREJuAlONav8_TyLVNY097Zix6LIdRxVE_uQ,5796
8
- tgedr/dataops/commons/s3_connector.py,sha256=AmD8l86ORbABamkquD8BMtMpgSfThkVbl9Kf54wR_pA,1194
9
- tgedr/dataops/commons/utils_fs.py,sha256=8gXPcdQx520pr1h7tn7m_Z1AojgWIdebNLCyiQgEZEU,2362
10
- tgedr/dataops/commons/utils_spark.py,sha256=VSFUcQt-yE54VHJkonveFwhM3ADTK3kfC3QfA34QOVI,3191
11
- tgedr/dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- tgedr/dataops/sink/local_fs_file_sink.py,sha256=JjFWwW_ufWF8I-PVPP1TleKmRj8Des-6A0xLqtbPYW8,1508
13
- tgedr/dataops/sink/s3_file_sink.py,sha256=m1D6SwuqYEVqvdA9XrL6nVlX_oWLRDT6v74mh_diuQM,2362
14
- tgedr/dataops/sink/sink.py,sha256=8rG3ZNpzeZ82Ac1IoPzkdQTs006IbG-k39APFCeXogk,1271
15
- tgedr/dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- tgedr/dataops/source/abstract_s3_file_source.py,sha256=AMnTWxs57Jni46jz5PvrZTRjd-Ah_TUZEP4Rcx7YziM,1513
17
- tgedr/dataops/source/delta_table_source.py,sha256=tXeJCM-QhssztEpyKGzb6zcM1AlaUwV1Y5d2RHCiOTU,1653
18
- tgedr/dataops/source/local_delta_table.py,sha256=Z9413skOSsNGfSNeBs-ETVXQJcW4bhS3MYmCwySo9Tc,1545
19
- tgedr/dataops/source/local_fs_file_source.py,sha256=wihK_wVV6xb8AlkCmZHSRlRmqqm6w9Aq-PI0oOnhn_k,2570
20
- tgedr/dataops/source/pd_df_s3_source.py,sha256=3Mc0VDGTOmk1HT951l6Qd_bqL0DpPjlfZk0JQjWh3JE,2092
21
- tgedr/dataops/source/s3_delta_table.py,sha256=d_FoFDNogjZifRQv8V2OmTXbPYymx9QaQcW15uX34pI,2974
22
- tgedr/dataops/source/s3_file_copy.py,sha256=DbHvstAqi23cywoG6nHpQxvzerrnepApOsv6zsELYNQ,3930
23
- tgedr/dataops/source/s3_file_source.py,sha256=C8Y0h89p1eBWNGxV4oTNzVLLwZoAoyshHuMlDKKqDA0,4072
24
- tgedr/dataops/source/source.py,sha256=REeqluMGLMjoDWtdZthzUYkmVeHemSV7t9wjc6eTpJE,1350
25
- tgedr/dataops/store/fs_single_partition_parquet.py,sha256=CR3406emhxn33jjObnMotXEmZGfh4Iu5Ygv30FvkY6Y,9695
26
- tgedr/dataops/store/local_fs_single_partition_parquet.py,sha256=N_I96fqxQAp2fWBngoDci3aR1-kcmkWjIVRD0nUi07U,683
27
- tgedr/dataops/store/s3_single_partition_parquet.py,sha256=2vSyLb-mZ2mAbRhaDoCcsOEnKzQ30AsUYTlqhKQCroc,3249
28
- tgedr/dataops/store/spark_delta.py,sha256=AHqIKDi9axOKpMJHt4AiBGX8V2mFV_7vNPRpaw37rDY,15101
29
- tgedr/dataops/store/store.py,sha256=uAuR7MWVdKRaisQ69rFqliLnJrsgpTzbsOh7uPmLSlI,1315
30
- tgedr/dataops/validation/abs.py,sha256=84HGUuh6k_uG-ON0bauR4lDBTfUeI3GmxOiWsMkTu3E,1521
31
- tgedr/dataops/validation/pandas.py,sha256=Vfr38f3txbTy098ufPUcRsCgrYu47Rg35upZ9IXcLSk,315
32
- tgedr/dataops/validation/pyspark.py,sha256=4OEnA21_vSwB5HjaD5KZdfIjBnTn3KUAqvVGHnY-zNI,317
33
- tgedr_dataops-0.0.36.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
34
- tgedr_dataops-0.0.36.dist-info/METADATA,sha256=O_K6YMP__Vgb4Hf4t-34hZYXFRIrvgV6ZkoW3xDseXw,607
35
- tgedr_dataops-0.0.36.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
36
- tgedr_dataops-0.0.36.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
37
- tgedr_dataops-0.0.36.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- tgedr
File without changes
File without changes
File without changes