tgedr-dataops 0.0.32__py3-none-any.whl → 0.0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgedr/dataops/commons/utils_fs.py +9 -0
- tgedr/dataops/source/abstract_s3_file_source.py +43 -0
- tgedr/dataops/source/delta_table_source.py +9 -5
- tgedr/dataops/source/pd_df_s3_source.py +51 -0
- tgedr/dataops/source/source.py +4 -0
- {tgedr_dataops-0.0.32.dist-info → tgedr_dataops-0.0.34.dist-info}/METADATA +1 -1
- {tgedr_dataops-0.0.32.dist-info → tgedr_dataops-0.0.34.dist-info}/RECORD +10 -8
- {tgedr_dataops-0.0.32.dist-info → tgedr_dataops-0.0.34.dist-info}/LICENSE +0 -0
- {tgedr_dataops-0.0.32.dist-info → tgedr_dataops-0.0.34.dist-info}/WHEEL +0 -0
- {tgedr_dataops-0.0.32.dist-info → tgedr_dataops-0.0.34.dist-info}/top_level.txt +0 -0
|
@@ -52,6 +52,15 @@ def process_s3_path(path: str) -> Tuple[str, str]:
|
|
|
52
52
|
return (bucket, key)
|
|
53
53
|
|
|
54
54
|
|
|
55
|
+
def process_s3_url(url: str) -> Tuple[str, str, str]:
    """Split an s3 url into its ``(protocol, bucket, key)`` components.

    The protocol is whatever ``resolve_s3_protocol`` reports for ``url``
    (replaced by an empty string when it reports ``None``). The bucket is the
    text before the first ``/`` of the url once ``remove_s3_protocol`` has
    processed it, and the key is everything after that first ``/``.
    """
    scheme = resolve_s3_protocol(url)
    remainder = remove_s3_protocol(url)
    # Partitioning on the first "/" yields the same bucket/key pair as
    # splitting on every "/" and re-joining the tail.
    bucket, _, key = remainder.partition("/")
    if scheme is None:
        scheme = ""
    return (scheme, bucket, key)
|
|
62
|
+
|
|
63
|
+
|
|
55
64
|
def hash_file(filepath, hash_func=hashlib.sha256) -> AnyStr:
|
|
56
65
|
"""Generate a hash for a file.
|
|
57
66
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from abc import ABC
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from tgedr.dataops.commons.s3_connector import S3Connector
|
|
6
|
+
from tgedr.dataops.commons.utils_fs import process_s3_url
|
|
7
|
+
from tgedr.dataops.source.source import Source, SourceException
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AbstractS3FileSource(Source, S3Connector, ABC):
    """abstract class used to read file sources from s3

    Provides a ``list`` implementation that enumerates the objects found
    under an s3 url; concrete subclasses are expected to add their own way
    of reading the files.
    """

    # context key holding the s3 url to work on (mandatory for ``list``)
    CONTEXT_KEY_URL = "url"
    # optional context key: keep only the entries ending with this suffix
    CONTEXT_KEY_SUFFIX = "suffix"

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise both bases explicitly, as they take different arguments."""
        Source.__init__(self, config=config)
        S3Connector.__init__(self)

    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
        """List the objects stored under the s3 url given in ``context``.

        Args:
            context: must contain ``CONTEXT_KEY_URL`` (the url used as listing
                prefix) and may contain ``CONTEXT_KEY_SUFFIX`` to keep only
                the entries ending with that suffix.

        Returns:
            One ``<protocol><bucket>/<key>`` string per object; keys ending
            with "/" are skipped. Empty when nothing matches the prefix.

        Raises:
            SourceException: when ``context`` is missing or has no url entry.
        """
        logger.info(f"[list|in] ({context})")

        # context defaults to None, so guard before using the ``in`` operator
        if context is None or self.CONTEXT_KEY_URL not in context:
            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")

        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])

        result: List[str] = []
        request: Dict[str, Any] = {"Bucket": bucket, "Prefix": key}
        # list_objects_v2 returns at most one page of keys per call, so keep
        # following the continuation token until the listing is exhausted
        while True:
            page = self._client.list_objects_v2(**request)
            # "Contents" is absent from the response when nothing matches
            result.extend(
                protocol + bucket + "/" + entry["Key"]
                for entry in page.get("Contents", [])
                if not entry["Key"].endswith("/")
            )
            token = page.get("NextContinuationToken")
            if not page.get("IsTruncated") or not token:
                break
            request["ContinuationToken"] = token

        if self.CONTEXT_KEY_SUFFIX in context:
            suffix: str = context[self.CONTEXT_KEY_SUFFIX]
            result = [f for f in result if f.endswith(suffix)]

        logger.debug(f"[list|out] => {result}")
        logger.info(f"[list|out] => result len: {len(result)}")
        return result
|
|
@@ -3,8 +3,9 @@ import logging
|
|
|
3
3
|
from typing import Any, Dict, List, Optional
|
|
4
4
|
from pandas import DataFrame
|
|
5
5
|
from deltalake import DeltaTable
|
|
6
|
+
from deltalake.exceptions import TableNotFoundError
|
|
6
7
|
|
|
7
|
-
from tgedr.dataops.source.source import Source, SourceException
|
|
8
|
+
from tgedr.dataops.source.source import Source, SourceException, NoSourceException
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
logger = logging.getLogger()
|
|
@@ -36,10 +37,13 @@ class DeltaTableSource(Source, ABC):
|
|
|
36
37
|
if self.CONTEXT_KEY_COLUMNS in context:
|
|
37
38
|
columns = context[self.CONTEXT_KEY_COLUMNS]
|
|
38
39
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
40
|
+
try:
|
|
41
|
+
delta_table = DeltaTable(
|
|
42
|
+
table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
|
|
43
|
+
)
|
|
44
|
+
result = delta_table.to_pandas(columns=columns)
|
|
45
|
+
except TableNotFoundError as tnfe:
|
|
46
|
+
raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}")
|
|
43
47
|
|
|
44
48
|
logger.info(f"[get|out] => {result}")
|
|
45
49
|
return result
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from io import StringIO
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from tgedr.dataops.commons.utils_fs import process_s3_url
|
|
7
|
+
from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource
|
|
8
|
+
from tgedr.dataops.source.source import SourceException
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PdDfS3Source(AbstractS3FileSource):
    """class used to read a pandas dataframe from a csv file in s3"""

    # context keys understood by ``get``
    CONTEXT_KEY_FILE_FORMAT = "file_format"  # declared, but not consulted by ``get``
    CONTEXT_KEY_SEPARATOR = "sep"
    CONTEXT_KEY_NO_HEADER = "no_header"
    CONTEXT_KEY_COLUMN_NAMES = "column_names"
    CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
    DEFAULT_FORMAT = "csv"
    DEFAULT_SEPARATOR = ","

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config=config)

    def get(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
        """Retrieve a pandas dataframe by reading a csv object from s3.

        The object is always parsed as csv; the ``file_format`` context key is
        declared on the class but is not read by this method.

        Args:
            context: must contain ``CONTEXT_KEY_URL`` (url of the object).
                Optional entries:
                - ``CONTEXT_KEY_SEPARATOR``: field separator, defaults to
                  ``DEFAULT_SEPARATOR``.
                - ``CONTEXT_KEY_NO_HEADER``: the mere presence of this key
                  (whatever its value) means the file has no header row.
                - ``CONTEXT_KEY_COLUMN_NAMES``: passed as ``names`` to
                  ``pd.read_csv``.
                - ``CONTEXT_KEY_SCHEMA_TYPES``: passed as ``dtype`` to
                  ``pd.read_csv``.

        Returns:
            The dataframe parsed from the object's utf-8 decoded body.

        Raises:
            SourceException: when ``context`` is missing or has no url entry.
        """
        logger.info(f"[get|in] ({context})")

        # context defaults to None, so guard before using the ``in`` operator
        if context is None or self.CONTEXT_KEY_URL not in context:
            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")

        # the protocol part of the url is not needed to fetch the object
        _, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])

        obj = self._client.get_object(Bucket=bucket, Key=key)
        data = obj["Body"].read().decode("utf-8")

        # presence of the key, not its value, switches the header row off
        header = None if self.CONTEXT_KEY_NO_HEADER in context else 0
        names = context.get(self.CONTEXT_KEY_COLUMN_NAMES)
        dtype = context.get(self.CONTEXT_KEY_SCHEMA_TYPES)
        sep = context.get(self.CONTEXT_KEY_SEPARATOR, self.DEFAULT_SEPARATOR)

        result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)

        logger.info(f"[get|out] => {result}")
        return result
|
tgedr/dataops/source/source.py
CHANGED
|
@@ -6,20 +6,22 @@ tgedr/dataops/utils_reflection.py,sha256=W3eqrypMwqRsvevEZ9d6Zxw_GJL4d938lXiU1gI
|
|
|
6
6
|
tgedr/dataops/commons/dataset.py,sha256=VIt385Zps78y6cbe54G4PLNuUA647S94UqokubM1gKI,637
|
|
7
7
|
tgedr/dataops/commons/metadata.py,sha256=5uIurGkkREJuAlONav8_TyLVNY097Zix6LIdRxVE_uQ,5796
|
|
8
8
|
tgedr/dataops/commons/s3_connector.py,sha256=AmD8l86ORbABamkquD8BMtMpgSfThkVbl9Kf54wR_pA,1194
|
|
9
|
-
tgedr/dataops/commons/utils_fs.py,sha256=
|
|
9
|
+
tgedr/dataops/commons/utils_fs.py,sha256=8gXPcdQx520pr1h7tn7m_Z1AojgWIdebNLCyiQgEZEU,2362
|
|
10
10
|
tgedr/dataops/commons/utils_spark.py,sha256=VSFUcQt-yE54VHJkonveFwhM3ADTK3kfC3QfA34QOVI,3191
|
|
11
11
|
tgedr/dataops/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
tgedr/dataops/sink/local_fs_file_sink.py,sha256=JjFWwW_ufWF8I-PVPP1TleKmRj8Des-6A0xLqtbPYW8,1508
|
|
13
13
|
tgedr/dataops/sink/s3_file_sink.py,sha256=m1D6SwuqYEVqvdA9XrL6nVlX_oWLRDT6v74mh_diuQM,2362
|
|
14
14
|
tgedr/dataops/sink/sink.py,sha256=8rG3ZNpzeZ82Ac1IoPzkdQTs006IbG-k39APFCeXogk,1271
|
|
15
15
|
tgedr/dataops/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
tgedr/dataops/source/
|
|
16
|
+
tgedr/dataops/source/abstract_s3_file_source.py,sha256=AMnTWxs57Jni46jz5PvrZTRjd-Ah_TUZEP4Rcx7YziM,1513
|
|
17
|
+
tgedr/dataops/source/delta_table_source.py,sha256=tXeJCM-QhssztEpyKGzb6zcM1AlaUwV1Y5d2RHCiOTU,1653
|
|
17
18
|
tgedr/dataops/source/local_delta_table.py,sha256=Z9413skOSsNGfSNeBs-ETVXQJcW4bhS3MYmCwySo9Tc,1545
|
|
18
19
|
tgedr/dataops/source/local_fs_file_source.py,sha256=wihK_wVV6xb8AlkCmZHSRlRmqqm6w9Aq-PI0oOnhn_k,2570
|
|
20
|
+
tgedr/dataops/source/pd_df_s3_source.py,sha256=3Mc0VDGTOmk1HT951l6Qd_bqL0DpPjlfZk0JQjWh3JE,2092
|
|
19
21
|
tgedr/dataops/source/s3_delta_table.py,sha256=d_FoFDNogjZifRQv8V2OmTXbPYymx9QaQcW15uX34pI,2974
|
|
20
22
|
tgedr/dataops/source/s3_file_copy.py,sha256=DbHvstAqi23cywoG6nHpQxvzerrnepApOsv6zsELYNQ,3930
|
|
21
23
|
tgedr/dataops/source/s3_file_source.py,sha256=C8Y0h89p1eBWNGxV4oTNzVLLwZoAoyshHuMlDKKqDA0,4072
|
|
22
|
-
tgedr/dataops/source/source.py,sha256=
|
|
24
|
+
tgedr/dataops/source/source.py,sha256=REeqluMGLMjoDWtdZthzUYkmVeHemSV7t9wjc6eTpJE,1350
|
|
23
25
|
tgedr/dataops/store/fs_single_partition_parquet.py,sha256=CR3406emhxn33jjObnMotXEmZGfh4Iu5Ygv30FvkY6Y,9695
|
|
24
26
|
tgedr/dataops/store/local_fs_single_partition_parquet.py,sha256=N_I96fqxQAp2fWBngoDci3aR1-kcmkWjIVRD0nUi07U,683
|
|
25
27
|
tgedr/dataops/store/s3_single_partition_parquet.py,sha256=JwMhRO9403OLhepeAdbSDNhmM7I4dLoriQfr2IxymHE,3256
|
|
@@ -27,8 +29,8 @@ tgedr/dataops/store/spark_delta.py,sha256=AHqIKDi9axOKpMJHt4AiBGX8V2mFV_7vNPRpaw
|
|
|
27
29
|
tgedr/dataops/store/store.py,sha256=uAuR7MWVdKRaisQ69rFqliLnJrsgpTzbsOh7uPmLSlI,1315
|
|
28
30
|
tgedr/dataops/validation/abs.py,sha256=84HGUuh6k_uG-ON0bauR4lDBTfUeI3GmxOiWsMkTu3E,1521
|
|
29
31
|
tgedr/dataops/validation/pandas.py,sha256=Vfr38f3txbTy098ufPUcRsCgrYu47Rg35upZ9IXcLSk,315
|
|
30
|
-
tgedr_dataops-0.0.
|
|
31
|
-
tgedr_dataops-0.0.
|
|
32
|
-
tgedr_dataops-0.0.
|
|
33
|
-
tgedr_dataops-0.0.
|
|
34
|
-
tgedr_dataops-0.0.
|
|
32
|
+
tgedr_dataops-0.0.34.dist-info/LICENSE,sha256=awOCsWJ58m_2kBQwBUGWejVqZm6wuRtCL2hi9rfa0X4,1211
|
|
33
|
+
tgedr_dataops-0.0.34.dist-info/METADATA,sha256=ryzlSTJx7GtIQNpX3PL2QiZ0h-bsR_smUL-l_rdfXdw,607
|
|
34
|
+
tgedr_dataops-0.0.34.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
35
|
+
tgedr_dataops-0.0.34.dist-info/top_level.txt,sha256=acugNvvENatFXbPxKQD9YI5PpzzehbAwyY1keiIkR7I,6
|
|
36
|
+
tgedr_dataops-0.0.34.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|