tgedr-dataops 0.0.32__tar.gz → 0.0.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tgedr_dataops-0.0.32/src/tgedr_dataops.egg-info → tgedr_dataops-0.0.34}/PKG-INFO +1 -1
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/README.md +3 -2
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/setup.py +1 -1
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/utils_fs.py +9 -0
- tgedr_dataops-0.0.34/src/tgedr/dataops/source/abstract_s3_file_source.py +43 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/delta_table_source.py +9 -5
- tgedr_dataops-0.0.34/src/tgedr/dataops/source/pd_df_s3_source.py +51 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/source.py +4 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34/src/tgedr_dataops.egg-info}/PKG-INFO +1 -1
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr_dataops.egg-info/SOURCES.txt +2 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/LICENSE +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/pyproject.toml +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/setup.cfg +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/__init__.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/chain.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/dataset.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/metadata.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/s3_connector.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/utils_spark.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/etl.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/processor.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/sink/__init__.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/sink/local_fs_file_sink.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/sink/s3_file_sink.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/sink/sink.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/__init__.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/local_delta_table.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/local_fs_file_source.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/s3_delta_table.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/s3_file_copy.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/s3_file_source.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/store/fs_single_partition_parquet.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/store/s3_single_partition_parquet.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/store/spark_delta.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/store/store.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/utils_reflection.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/validation/abs.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/validation/pandas.py +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr_dataops.egg-info/dependency_links.txt +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr_dataops.egg-info/requires.txt +0 -0
- {tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr_dataops.egg-info/top_level.txt +0 -0
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/README.md
RENAMED
```diff
@@ -29,8 +29,9 @@ data operations related code
 - __LocalFsFileSource__: __source__ class used to retrieve local objects/files to another local fs location ([example](test/tgedr/dataops/source/test_localfs_file_source.py))
 - __S3FileSource__: __source__ class used to retrieve objects/files from s3 bucket to local fs location ([example](test/tgedr/dataops/source/test_s3_file_source.py))
 - __S3FileCopy__: __source__ class used to copy objects/files from an s3 bucket to another s3 bucket ([example](test/tgedr/dataops/source/test_s3_copy.py))
-- __S3DeltaTable__: __source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe
-- __LocalDeltaTable__: __source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe
+- __S3DeltaTable__: __source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_s3_delta_table.py))
+- __LocalDeltaTable__: __source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe ([example](test/tgedr/dataops/source/test_local_delta_table.py))
+- __PdDfS3Source__: __source__ class used to read a pandas dataframe from, by default, a csv file in s3 ([example](test/tgedr/dataops/source/test_pd_df_s3_source.pypd))
 
 #### store
 - __Store__ : abstract class used to manage persistence, defining CRUD-like (CreateReadUpdateDelete) methods
```
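For orientation, a minimal usage sketch matching these README bullets; it assumes `LocalDeltaTable` accepts the same `url` context key as the sources shown later in this diff, and the local path is invented:

```python
# hypothetical sketch based on the README bullets above; the "url"
# context key mirrors the sources shown later in this diff, and the
# table path is made up for illustration
from tgedr.dataops.source.local_delta_table import LocalDeltaTable

source = LocalDeltaTable()
df = source.get(context={"url": "/tmp/datasets/my_delta_table"})
print(df.shape)  # a pandas DataFrame, no pyspark involved
```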
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/commons/utils_fs.py
RENAMED
```diff
@@ -52,6 +52,15 @@ def process_s3_path(path: str) -> Tuple[str, str]:
     return (bucket, key)
 
 
+def process_s3_url(url: str) -> Tuple[str, str, str]:
+    protocol = resolve_s3_protocol(url)
+    no_protocol_url = remove_s3_protocol(url)
+    path_elements = no_protocol_url.split("/")
+    bucket = path_elements[0]
+    key = "/".join(path_elements[1:])
+    return ("" if protocol is None else protocol, bucket, key)
+
+
 def hash_file(filepath, hash_func=hashlib.sha256) -> AnyStr:
     """Generate a hash for a file.
 
```
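A quick sketch of the new helper's contract, inferred from the body above; the exact string returned by `resolve_s3_protocol` is an assumption, since that helper is not shown in this diff:

```python
# illustrative sketch only; it assumes resolve_s3_protocol returns the
# "s3://" prefix when present (and None otherwise), which matches how
# the tuple is concatenated back together in AbstractS3FileSource below
from tgedr.dataops.commons.utils_fs import process_s3_url

protocol, bucket, key = process_s3_url("s3://my-bucket/some/prefix/data.csv")
# protocol -> "s3://", bucket -> "my-bucket", key -> "some/prefix/data.csv"

protocol, bucket, key = process_s3_url("my-bucket/some/prefix/data.csv")
# protocol -> "" when the url carries no scheme; bucket and key as before
```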
tgedr_dataops-0.0.34/src/tgedr/dataops/source/abstract_s3_file_source.py
ADDED
```diff
@@ -0,0 +1,43 @@
+from abc import ABC
+import logging
+from typing import Any, Dict, List, Optional
+
+from tgedr.dataops.commons.s3_connector import S3Connector
+from tgedr.dataops.commons.utils_fs import process_s3_url
+from tgedr.dataops.source.source import Source, SourceException
+
+
+logger = logging.getLogger()
+
+
+class AbstractS3FileSource(Source, S3Connector, ABC):
+    """abstract class used to read file sources from s3"""
+
+    CONTEXT_KEY_URL = "url"
+    CONTEXT_KEY_SUFFIX = "suffix"
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        Source.__init__(self, config=config)
+        S3Connector.__init__(self)
+
+    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
+        logger.info(f"[list|in] ({context})")
+
+        result: List[str] = []
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        objs = self._client.list_objects_v2(Bucket=bucket, Prefix=key)
+        result = [
+            (protocol + bucket + "/" + entry["Key"]) for entry in objs["Contents"] if not (entry["Key"]).endswith("/")
+        ]
+
+        if self.CONTEXT_KEY_SUFFIX in context:
+            suffix: str = context[self.CONTEXT_KEY_SUFFIX]
+            result = [f for f in result if f.endswith(suffix)]
+
+        logger.debug(f"[list|out] => {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
```
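A hedged usage sketch of the inherited `list` method, via the concrete `PdDfS3Source` subclass added later in this diff; bucket, prefix, and file names are invented:

```python
# hypothetical sketch; bucket, prefix and object names are made up
from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source

source = PdDfS3Source()
files = source.list(context={"url": "s3://my-bucket/landing/", "suffix": ".csv"})
# keys ending in "/" (folder markers) are dropped, then the optional
# "suffix" filter keeps only matching objects, e.g.
# ["s3://my-bucket/landing/2024/part-000.csv", ...]
```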
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr/dataops/source/delta_table_source.py
RENAMED
```diff
@@ -3,8 +3,9 @@ import logging
 from typing import Any, Dict, List, Optional
 from pandas import DataFrame
 from deltalake import DeltaTable
+from deltalake.exceptions import TableNotFoundError
 
-from tgedr.dataops.source.source import Source, SourceException
+from tgedr.dataops.source.source import Source, SourceException, NoSourceException
 
 
 logger = logging.getLogger()
@@ -36,10 +37,13 @@ class DeltaTableSource(Source, ABC):
         if self.CONTEXT_KEY_COLUMNS in context:
             columns = context[self.CONTEXT_KEY_COLUMNS]
 
-        delta_table = DeltaTable(
-            table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
-        )
-        result = delta_table.to_pandas(columns=columns)
+        try:
+            delta_table = DeltaTable(
+                table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
+            )
+            result = delta_table.to_pandas(columns=columns)
+        except TableNotFoundError as tnfe:
+            raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}")
 
         logger.info(f"[get|out] => {result}")
         return result
```
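The practical effect of this change is that callers can catch the library's own exception instead of deltalake's; a minimal sketch, assuming `S3DeltaTable` (listed in the README above) inherits this `get` and accepts the same `url` context key, with a made-up table uri:

```python
# hypothetical sketch of the new failure mode; the table uri is invented
from tgedr.dataops.source.s3_delta_table import S3DeltaTable
from tgedr.dataops.source.source import NoSourceException

try:
    df = S3DeltaTable().get(context={"url": "s3://my-bucket/not-a-table"})
except NoSourceException:
    df = None  # a missing table no longer leaks deltalake's TableNotFoundError
```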
tgedr_dataops-0.0.34/src/tgedr/dataops/source/pd_df_s3_source.py
ADDED
```diff
@@ -0,0 +1,51 @@
+from io import StringIO
+import logging
+from typing import Any, Dict, Optional
+import pandas as pd
+
+from tgedr.dataops.commons.utils_fs import process_s3_url
+from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource
+from tgedr.dataops.source.source import SourceException
+
+logger = logging.getLogger()
+
+
+class PdDfS3Source(AbstractS3FileSource):
+    """class used to read a pandas dataframe from a csv file in s3"""
+
+    CONTEXT_KEY_FILE_FORMAT = "file_format"
+    CONTEXT_KEY_SEPARATOR = "sep"
+    CONTEXT_KEY_NO_HEADER = "no_header"
+    CONTEXT_KEY_COLUMN_NAMES = "column_names"
+    CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
+    DEFAULT_FORMAT = "csv"
+    DEFAULT_SEPARATOR = ","
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        super().__init__(config=config)
+
+    def get(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
+        """retrieves a pandas dataframe, by default reading it from a csv,
+        you can ask for a different format using the context key 'file_format' (available formats: csv)"""
+        logger.info(f"[get|in] ({context})")
+        result: pd.DataFrame = None
+
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
+
+        obj = self._client.get_object(Bucket=bucket, Key=key)
+        data = obj["Body"].read().decode("utf-8")
+
+        header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
+        names = None if self.CONTEXT_KEY_COLUMN_NAMES not in context else context[self.CONTEXT_KEY_COLUMN_NAMES]
+        dtype = None if self.CONTEXT_KEY_SCHEMA_TYPES not in context else context[self.CONTEXT_KEY_SCHEMA_TYPES]
+        sep = (
+            self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
+        )
+
+        result = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
+
+        logger.info(f"[get|out] => {result}")
+        return result
```
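A hedged usage sketch of the new source; the context keys come straight from the class above, while the bucket and object names are invented:

```python
# hypothetical usage sketch; only "url" is required, the rest are the
# optional context keys defined on PdDfS3Source above
from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source

source = PdDfS3Source()
df = source.get(
    context={
        "url": "s3://my-bucket/data/input.csv",
        "sep": ";",                      # defaults to "," when omitted
        "no_header": True,               # key presence alone disables the header row
        "column_names": ["id", "name"],  # passed to pandas as names=
    }
)
print(df.dtypes)
```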
{tgedr_dataops-0.0.32 → tgedr_dataops-0.0.34}/src/tgedr_dataops.egg-info/SOURCES.txt
RENAMED
```diff
@@ -17,9 +17,11 @@ src/tgedr/dataops/sink/local_fs_file_sink.py
 src/tgedr/dataops/sink/s3_file_sink.py
 src/tgedr/dataops/sink/sink.py
 src/tgedr/dataops/source/__init__.py
+src/tgedr/dataops/source/abstract_s3_file_source.py
 src/tgedr/dataops/source/delta_table_source.py
 src/tgedr/dataops/source/local_delta_table.py
 src/tgedr/dataops/source/local_fs_file_source.py
+src/tgedr/dataops/source/pd_df_s3_source.py
 src/tgedr/dataops/source/s3_delta_table.py
 src/tgedr/dataops/source/s3_file_copy.py
 src/tgedr/dataops/source/s3_file_source.py
```