tgedr-dataops 0.0.34__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ class PdDfS3Source(AbstractS3FileSource):
19
19
  CONTEXT_KEY_COLUMN_NAMES = "column_names"
20
20
  CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
21
21
  DEFAULT_FORMAT = "csv"
22
+ FORMATS = ["csv", "xlsx"]
22
23
  DEFAULT_SEPARATOR = ","
23
24
 
24
25
  def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -33,6 +34,23 @@ class PdDfS3Source(AbstractS3FileSource):
33
34
  if self.CONTEXT_KEY_URL not in context:
34
35
  raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
35
36
 
37
+ format: str = self.DEFAULT_FORMAT
38
+ if self.CONTEXT_KEY_FILE_FORMAT in context:
39
+ format = context[self.CONTEXT_KEY_FILE_FORMAT]
40
+ if format not in self.FORMATS:
41
+ raise SourceException(f"[get] invalid format: {format}")
42
+
43
+ if "csv" == format:
44
+ result = self.__read_csv(context=context)
45
+ else:
46
+ result = self.__read_excel(context=context)
47
+
48
+ logger.info(f"[get|out] => {result}")
49
+ return result
50
+
51
+ def __read_csv(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
52
+ logger.info(f"[__read_csv|in] ({context})")
53
+
36
54
  protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
37
55
 
38
56
  obj = self._client.get_object(Bucket=bucket, Key=key)
@@ -45,7 +63,14 @@ class PdDfS3Source(AbstractS3FileSource):
45
63
  self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
46
64
  )
47
65
 
48
- result = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
66
+ result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
49
67
 
50
- logger.info(f"[get|out] => {result}")
68
+ logger.info(f"[__read_csv|out] => {result}")
69
+ return result
70
+
71
def __read_excel(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Load the Excel workbook referenced by the context url into a DataFrame.

    The value stored in ``context`` under ``CONTEXT_KEY_URL`` is handed to
    ``pd.read_excel`` as-is and parsed with the ``openpyxl`` engine.
    """
    logger.info(f"[__read_excel|in] ({context})")

    workbook_url = context[self.CONTEXT_KEY_URL]
    frame: pd.DataFrame = pd.read_excel(workbook_url, engine="openpyxl")

    logger.info(f"[__read_excel|out] => {frame}")
    return frame
@@ -0,0 +1,39 @@
1
+ import logging
2
+ from typing import Any, Dict, Optional
3
+
4
+ from tgedr.dataops.source.s3_file_source import S3FileSource
5
+ from tgedr.dataops.source.source import SourceException
6
+ from tgedr.dataops.commons.utils_fs import process_s3_path
7
+
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class S3FileExtendedSource(S3FileSource):
    """S3 file source that can additionally report metadata of an s3 object.

    Everything besides ``get_metadata`` is inherited from ``S3FileSource``.
    """

    # entries of the ``head_object`` response that are exposed to callers
    METADATA_KEYS = ["LastModified", "ContentLength", "ETag", "VersionId", "ContentType"]

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config=config)

    def get_metadata(self, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Return selected metadata of the s3 object referenced in ``context``.

        Args:
            context: must hold the s3 path of the object under
                ``CONTEXT_KEY_SOURCE``.

        Returns:
            The entries of the ``head_object`` response whose names appear in
            ``METADATA_KEYS``. ``LastModified`` is reduced to an integer via
            ``int(value.timestamp())``; every other value is passed through
            untouched.

        Raises:
            SourceException: if ``context`` is ``None`` or has no
                ``CONTEXT_KEY_SOURCE`` entry.
        """
        logger.info(f"[get_metadata|in] ({context})")

        # a missing context must surface as SourceException, not as a TypeError from `in`
        if context is None or self.CONTEXT_KEY_SOURCE not in context:
            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")

        bucket, key = process_s3_path(context[self.CONTEXT_KEY_SOURCE])
        response = self._client.head_object(Bucket=bucket, Key=key)

        # iterate the response (not METADATA_KEYS) so the result keeps the response's key order
        result: Dict[str, Any] = {}
        for name, value in response.items():
            if name not in self.METADATA_KEYS:
                continue
            result[name] = int(value.timestamp()) if name == "LastModified" else value

        logger.info(f"[get_metadata|out] => {result}")
        return result
@@ -22,21 +22,6 @@ class S3FileSource(Source, S3Connector):
22
22
  Source.__init__(self, config=config)
23
23
  S3Connector.__init__(self)
24
24
 
25
def __derive_local_file(self, target: str, isdir: bool, file: str):
    """Work out the local path a file should be written to.

    Args:
        target: local destination, either a folder or a file path.
        isdir: whether ``target`` is to be treated as a folder.
        file: path of the file relative to ``target``; only used when
            ``isdir`` is true.

    Returns:
        ``target`` joined with ``file`` when ``isdir`` is true (the parent
        folder of that path is created if it does not exist yet), otherwise
        ``target`` itself.
    """
    logger.debug(f"[__derive_local_file|in] ({target}, {isdir}, {file})")

    if isdir:
        result = os.path.join(target, file)
        basedir = os.path.dirname(result)
        # an empty basedir means the current directory, which needs no creation
        if basedir and not os.path.exists(basedir):
            logger.debug(f"[__derive_local_file] creating folder: {basedir}")
            # makedirs: `file` may carry nested sub-folders whose parents are missing too;
            # exist_ok tolerates the folder appearing between the check and the call
            os.makedirs(basedir, exist_ok=True)
    else:
        result = target

    logger.debug(f"[__derive_local_file|out] => {result}")
    return result
39
-
40
25
  def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
41
26
  logger.info(f"[list|in] ({context})")
42
27
 
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  import pyarrow as pa
7
7
 
8
8
  from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
9
- from src.nn.gs.ss.dataops.commons.utils_fs import remove_s3_protocol
9
+ from tgedr.dataops.commons.utils_fs import remove_s3_protocol
10
10
 
11
11
 
12
12
  logger = logging.getLogger(__name__)
@@ -0,0 +1,10 @@
1
+ from typing import Any
2
+ from great_expectations.dataset.dataset import Dataset
3
+
4
+ from tgedr.dataops.validation.abs import DataValidation
5
+ from great_expectations.dataset.sparkdf_dataset import SparkDFDataset
6
+
7
+
8
class Impl(DataValidation):
    """``DataValidation`` variant whose dataset is a ``SparkDFDataset``."""

    def _get_dataset(self, df: Any) -> Dataset:
        """Wrap ``df`` in a great_expectations ``SparkDFDataset`` and return it."""
        spark_dataset = SparkDFDataset(df)
        return spark_dataset
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: tgedr-dataops
3
- Version: 0.0.34
3
+ Version: 0.0.37
4
4
  Summary: data operations related code
5
5
  Home-page: https://github.com/jtviegas-sandbox/dataops
6
6
  Author: joao tiago viegas
@@ -17,4 +17,5 @@ Requires-Dist: s3fs ==2024.5.0
17
17
  Requires-Dist: boto3 ==1.34.106
18
18
  Requires-Dist: great-expectations ==0.18.10
19
19
  Requires-Dist: deltalake ~=0.16.4
20
+ Requires-Dist: openpyxl ==3.1.2
20
21
 
@@ -17,20 +17,22 @@ tgedr/dataops/source/abstract_s3_file_source.py,sha256=AMnTWxs57Jni46jz5PvrZTRjd
17
17
  tgedr/dataops/source/delta_table_source.py,sha256=tXeJCM-QhssztEpyKGzb6zcM1AlaUwV1Y5d2RHCiOTU,1653
18
18
  tgedr/dataops/source/local_delta_table.py,sha256=Z9413skOSsNGfSNeBs-ETVXQJcW4bhS3MYmCwySo9Tc,1545
19
19
  tgedr/dataops/source/local_fs_file_source.py,sha256=wihK_wVV6xb8AlkCmZHSRlRmqqm6w9Aq-PI0oOnhn_k,2570
20
- tgedr/dataops/source/pd_df_s3_source.py,sha256=3Mc0VDGTOmk1HT951l6Qd_bqL0DpPjlfZk0JQjWh3JE,2092
20
+ tgedr/dataops/source/pd_df_s3_source.py,sha256=Bfnj363MZtmMJB4W32DKEaYYG0v7NeG-m66C5i8bxsc,3103
21
21
  tgedr/dataops/source/s3_delta_table.py,sha256=d_FoFDNogjZifRQv8V2OmTXbPYymx9QaQcW15uX34pI,2974
22
22
  tgedr/dataops/source/s3_file_copy.py,sha256=DbHvstAqi23cywoG6nHpQxvzerrnepApOsv6zsELYNQ,3930
23
- tgedr/dataops/source/s3_file_source.py,sha256=C8Y0h89p1eBWNGxV4oTNzVLLwZoAoyshHuMlDKKqDA0,4072
23
+ tgedr/dataops/source/s3_file_extended_source.py,sha256=B6m84DWV45E1V3XsLkHARp9YVNvmsDi54ikd2__9q5Q,1384
24
+ tgedr/dataops/source/s3_file_source.py,sha256=ayQ-gPxARFc1xINFOM0_hQFUn6odpts_uFaDZYs7RC4,3520
24
25
  tgedr/dataops/source/source.py,sha256=REeqluMGLMjoDWtdZthzUYkmVeHemSV7t9wjc6eTpJE,1350
25
26
  tgedr/dataops/store/fs_single_partition_parquet.py,sha256=CR3406emhxn33jjObnMotXEmZGfh4Iu5Ygv30FvkY6Y,9695
26
27
  tgedr/dataops/store/local_fs_single_partition_parquet.py,sha256=N_I96fqxQAp2fWBngoDci3aR1-kcmkWjIVRD0nUi07U,683
27
- tgedr/dataops/store/s3_single_partition_parquet.py,sha256=JwMhRO9403OLhepeAdbSDNhmM7I4dLoriQfr2IxymHE,3256
28
+ tgedr/dataops/store/s3_single_partition_parquet.py,sha256=2vSyLb-mZ2mAbRhaDoCcsOEnKzQ30AsUYTlqhKQCroc,3249
28
29
  tgedr/dataops/store/spark_delta.py,sha256=AHqIKDi9axOKpMJHt4AiBGX8V2mFV_7vNPRpaw37rDY,15101
29
30
  tgedr/dataops/store/store.py,sha256=uAuR7MWVdKRaisQ69rFqliLnJrsgpTzbsOh7uPmLSlI,1315
30
31
  tgedr/dataops/validation/abs.py,sha256=84HGUuh6k_uG-ON0bauR4lDBTfUeI3GmxOiWsMkTu3E,1521
31
32
  tgedr/dataops/validation/pandas.py,sha256=Vfr38f3txbTy098ufPUcRsCgrYu47Rg35upZ9IXcLSk,315
32
- tgedr_dataops-0.0.34.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
33
- tgedr_dataops-0.0.34.dist-info/METADATA,sha256=ryzlSTJx7GtIQNpX3PL2QiZ0h-bsR_smUL-l_rdfXdw,607
34
- tgedr_dataops-0.0.34.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
35
- tgedr_dataops-0.0.34.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
36
- tgedr_dataops-0.0.34.dist-info/RECORD,,
33
+ tgedr/dataops/validation/pyspark.py,sha256=4OEnA21_vSwB5HjaD5KZdfIjBnTn3KUAqvVGHnY-zNI,317
34
+ tgedr_dataops-0.0.37.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
35
+ tgedr_dataops-0.0.37.dist-info/METADATA,sha256=OGYBz2HJOKVIl5NlRqzjqjG4zHXmgFA2sErmeFCIAYk,639
36
+ tgedr_dataops-0.0.37.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
37
+ tgedr_dataops-0.0.37.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
38
+ tgedr_dataops-0.0.37.dist-info/RECORD,,