tgedr-dataops 0.0.36__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +60 -39
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -51
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.36.dist-info/METADATA +0 -20
- tgedr_dataops-0.0.36.dist-info/RECORD +0 -37
- tgedr_dataops-0.0.36.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
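The listing above shows every module moving from the tgedr.dataops namespace package to a flat tgedr_dataops top-level package, so downstream imports need a one-line path change. A minimal sketch, assuming only the module path changes (the class names inside these modules are not visible in this diff):

# tgedr-dataops 0.0.36 (namespace layout) -- module path before the rename:
# from tgedr.dataops.source import s3_file_source

# tgedr-dataops 1.0.1 (flat top-level package) -- module path after the rename:
from tgedr_dataops.source import s3_file_source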
tgedr/dataops/source/delta_table_source.py DELETED
@@ -1,49 +0,0 @@
-from abc import ABC, abstractmethod
-import logging
-from typing import Any, Dict, List, Optional
-from pandas import DataFrame
-from deltalake import DeltaTable
-from deltalake.exceptions import TableNotFoundError
-
-from tgedr.dataops.source.source import Source, SourceException, NoSourceException
-
-
-logger = logging.getLogger()
-
-
-class DeltaTableSource(Source, ABC):
-    """abstract class used to read delta lake format datasets returning a pandas dataframe"""
-
-    CONTEXT_KEY_URL: str = "url"
-    CONTEXT_KEY_COLUMNS: str = "columns"
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        super().__init__(config=config)
-
-    @property
-    @abstractmethod
-    def _storage_options(self):
-        return None
-
-    def get(self, context: Optional[Dict[str, Any]] = None) -> DataFrame:
-        """retrieves a delta lake table"""
-        logger.info(f"[get|in] ({context})")
-        result: DataFrame = None
-
-        if self.CONTEXT_KEY_URL not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-        columns: List[str] = None
-        if self.CONTEXT_KEY_COLUMNS in context:
-            columns = context[self.CONTEXT_KEY_COLUMNS]
-
-        try:
-            delta_table = DeltaTable(
-                table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
-            )
-            result = delta_table.to_pandas(columns=columns)
-        except TableNotFoundError as tnfe:
-            raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}")
-
-        logger.info(f"[get|out] => {result}")
-        return result
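The removed DeltaTableSource.get() above is a thin wrapper around the deltalake bindings. For orientation, a minimal sketch of the equivalent direct call; the table path and column list are hypothetical placeholders:

from deltalake import DeltaTable

# open the table metadata and load it into pandas, optionally projecting columns
table = DeltaTable(table_uri="/tmp/my_delta_table", storage_options=None, without_files=True)
df = table.to_pandas(columns=["id", "value"])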
tgedr/dataops/source/local_delta_table.py DELETED
@@ -1,47 +0,0 @@
-import logging
-import os
-import re
-from typing import Any, Dict, List, Optional
-import glob
-
-from tgedr.dataops.source.delta_table_source import DeltaTableSource
-from tgedr.dataops.source.source import SourceException
-
-
-logger = logging.getLogger()
-
-
-class LocalDeltaTable(DeltaTableSource):
-    """class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        super().__init__(config=config)
-
-    @property
-    def _storage_options(self):
-        return None
-
-    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-        """lists the available delta lake datasets in the url provided"""
-        logger.info(f"[list|in] ({context})")
-
-        result: List[str] = []
-        if self.CONTEXT_KEY_URL not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-        url = context[self.CONTEXT_KEY_URL]
-        if not os.path.isdir(url):
-            raise SourceException(f"not a delta lake url: {url}")
-
-        matches: set[str] = set()
-        pattern: str = f".*{url}/(.*)/_delta_log/.*"
-        for entry in glob.iglob(url + "**/**", recursive=True):
-            match = re.search(pattern, entry)
-            if match:
-                matches.add(match.group(1))
-
-        result = list(matches)
-
-        logger.info(f"[list] result: {result}")
-        logger.info(f"[list|out] => result len: {len(result)}")
-        return result
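For reference, a hedged usage sketch of the removed LocalDeltaTable against the 0.0.36 layout; the directory paths are hypothetical placeholders:

from tgedr.dataops.source.local_delta_table import LocalDeltaTable

source = LocalDeltaTable()
# "url" points at a folder containing delta tables when listing,
# and at a specific delta table when getting
datasets = source.list(context={"url": "/data/delta"})
df = source.get(context={"url": "/data/delta/my_table"})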
tgedr/dataops/source/local_fs_file_source.py DELETED
@@ -1,71 +0,0 @@
-import logging
-import os
-import shutil
-from typing import Any, Dict, List, Optional
-
-from tgedr.dataops.source.source import Source, SourceException
-
-
-logger = logging.getLogger(__name__)
-
-
-class LocalFsFileSource(Source):
-    """source class used to retrieve local objects/files to a another local fs location"""
-
-    CONTEXT_KEY_SOURCE = "source"
-    CONTEXT_KEY_TARGET = "target"
-    CONTEXT_KEY_SUFFIX = "suffix"
-    CONTEXT_KEY_FILES = "files"
-
-    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-        logger.info(f"[list|in] ({context})")
-        result: List[str] = []
-        if self.CONTEXT_KEY_SOURCE not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
-
-        source = context[self.CONTEXT_KEY_SOURCE]
-        if os.path.isdir(source):
-            suffix = None
-            if self.CONTEXT_KEY_SUFFIX in context:
-                suffix = context[self.CONTEXT_KEY_SUFFIX]
-                result: List[str] = [os.path.join(source, file) for file in os.listdir(source) if file.endswith(suffix)]
-            else:
-                result: List[str] = [os.path.join(source, file) for file in os.listdir(source)]
-        elif os.path.isfile(source):
-            result: List[str] = [source]
-
-        logger.debug(f"[list|out] => {result}")
-        logger.info(f"[list|out] => result len: {len(result)}")
-        return result
-
-    def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        logger.info(f"[get|in] ({context})")
-
-        if self.CONTEXT_KEY_FILES not in context or self.CONTEXT_KEY_TARGET not in context:
-            raise SourceException(f"{self.CONTEXT_KEY_FILES} and {self.CONTEXT_KEY_TARGET} must be provided in config")
-        files = context[self.CONTEXT_KEY_FILES]
-        target = context[self.CONTEXT_KEY_TARGET]
-
-        if "list" != type(files).__name__:
-            if "string" == type(files).__name__:
-                files = [files]
-            else:
-                raise SourceException("files argument must be a list of strings or a string")
-
-        target_is_dir: bool = False
-        if os.path.isdir(target):
-            target_is_dir = True
-
-        result: List[str] = []
-
-        for file in files:
-            basename = os.path.basename(file)
-            if target_is_dir:
-                new_file = os.path.join(target, basename)
-            else:
-                new_file = target
-            shutil.copy(file, new_file)
-            result.append(new_file)
-
-        logger.info("[get|out] => {result}")
-        return result
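A hedged usage sketch of the removed LocalFsFileSource, exercising the context keys declared above; both directories are hypothetical placeholders:

from tgedr.dataops.source.local_fs_file_source import LocalFsFileSource

src = LocalFsFileSource()
# list csv files in a folder, then copy them to a target folder
files = src.list(context={"source": "/data/incoming", "suffix": ".csv"})
copied = src.get(context={"files": files, "target": "/data/staging"})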
tgedr/dataops/source/pd_df_s3_source.py DELETED
@@ -1,51 +0,0 @@
-from io import StringIO
-import logging
-from typing import Any, Dict, Optional
-import pandas as pd
-
-from tgedr.dataops.commons.utils_fs import process_s3_url
-from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource
-from tgedr.dataops.source.source import SourceException
-
-logger = logging.getLogger()
-
-
-class PdDfS3Source(AbstractS3FileSource):
-    """class used to read a pandas dataframe from a csv file in s3"""
-
-    CONTEXT_KEY_FILE_FORMAT = "file_format"
-    CONTEXT_KEY_SEPARATOR = "sep"
-    CONTEXT_KEY_NO_HEADER = "no_header"
-    CONTEXT_KEY_COLUMN_NAMES = "column_names"
-    CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
-    DEFAULT_FORMAT = "csv"
-    DEFAULT_SEPARATOR = ","
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        super().__init__(config=config)
-
-    def get(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
-        """retrieves a pandas dataframe, by default reading it from a csv,
-        you can ask for a different format using the context key 'file_format' (available formats: csv)"""
-        logger.info(f"[get|in] ({context})")
-        result: pd.DataFrame = None
-
-        if self.CONTEXT_KEY_URL not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
-
-        obj = self._client.get_object(Bucket=bucket, Key=key)
-        data = obj["Body"].read().decode("utf-8")
-
-        header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
-        names = None if self.CONTEXT_KEY_COLUMN_NAMES not in context else context[self.CONTEXT_KEY_COLUMN_NAMES]
-        dtype = None if self.CONTEXT_KEY_SCHEMA_TYPES not in context else context[self.CONTEXT_KEY_SCHEMA_TYPES]
-        sep = (
-            self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
-        )
-
-        result = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
-
-        logger.info(f"[get|out] => {result}")
-        return result
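A hedged usage sketch of the removed PdDfS3Source; the bucket, key and column names are hypothetical, and the context keys are referenced through the class constants shown above (CONTEXT_KEY_URL is inherited from AbstractS3FileSource, which is not shown in this hunk):

from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source

source = PdDfS3Source()
df = source.get(
    context={
        source.CONTEXT_KEY_URL: "s3://my-bucket/path/data.csv",
        source.CONTEXT_KEY_SEPARATOR: ";",
        source.CONTEXT_KEY_NO_HEADER: True,
        source.CONTEXT_KEY_COLUMN_NAMES: ["id", "name"],
    }
)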
tgedr/dataops/source/s3_delta_table.py DELETED
@@ -1,75 +0,0 @@
-import logging
-import re
-from typing import Any, Dict, List, Optional
-
-from tgedr.dataops.commons.s3_connector import S3Connector
-from tgedr.dataops.commons.utils_fs import remove_s3_protocol, resolve_s3_protocol
-from tgedr.dataops.source.delta_table_source import DeltaTableSource
-from tgedr.dataops.source.source import SourceException
-
-
-logger = logging.getLogger()
-
-
-class S3DeltaTable(DeltaTableSource, S3Connector):
-    """class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe"""
-
-    CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "AWS_ACCESS_KEY_ID"
-    CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "AWS_SECRET_ACCESS_KEY"
-    CONFIG_KEY_AWS_SESSION_TOKEN: str = "AWS_SESSION_TOKEN"
-    CONFIG_KEY_AWS_REGION: str = "AWS_REGION"
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        DeltaTableSource.__init__(self, config=config)
-        S3Connector.__init__(self)
-
-    @property
-    def _storage_options(self):
-        result = None
-        if (self._config is not None) and all(
-            element in list(self._config.keys())
-            for element in [
-                self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
-                self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
-                self.CONFIG_KEY_AWS_SESSION_TOKEN,
-                self.CONFIG_KEY_AWS_REGION,
-            ]
-        ):
-            result = {
-                "AWS_ACCESS_KEY_ID": self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
-                "AWS_SECRET_ACCESS_KEY": self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
-                "AWS_SESSION_TOKEN": self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
-                "AWS_REGION": self._config[self.CONFIG_KEY_AWS_REGION],
-            }
-
-        return result
-
-    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-        """lists the available delta lake datasets in the url provided"""
-        logger.info(f"[list|in] ({context})")
-
-        result: List[str] = []
-        if self.CONTEXT_KEY_URL not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-        s3_protocol: str = resolve_s3_protocol(context[self.CONTEXT_KEY_URL])
-        protocol = "" if s3_protocol is None else s3_protocol
-
-        path = remove_s3_protocol(context[self.CONTEXT_KEY_URL])
-        path_elements = path.split("/")
-        bucket = path_elements[0]
-        key = "/".join(path_elements[1:])
-
-        matches: set[str] = set()
-        pattern: str = f".*{key}/(.*)/_delta_log/.*"
-        for entry in self._client.list_objects_v2(Bucket=bucket, Prefix=key)["Contents"]:
-            output_key: str = entry["Key"]
-            match = re.search(pattern, output_key)
-            if match:
-                matches.add(f"{key}/{match.group(1)}")
-
-        result = list(matches)
-
-        logger.info(f"[list] result: {result}")
-        logger.info(f"[list|out] => result len: {len(result)}")
-        return result
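A hedged usage sketch of the removed S3DeltaTable, wiring the CONFIG_KEY_* constants above to environment variables; the bucket, prefix and region are hypothetical placeholders:

import os
from tgedr.dataops.source.s3_delta_table import S3DeltaTable

config = {
    "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"],
    "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"],
    "AWS_SESSION_TOKEN": os.environ["AWS_SESSION_TOKEN"],
    "AWS_REGION": "eu-west-1",
}
source = S3DeltaTable(config=config)
tables = source.list(context={source.CONTEXT_KEY_URL: "s3://my-bucket/delta"})
df = source.get(context={source.CONTEXT_KEY_URL: "s3://my-bucket/delta/my_table"})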
tgedr/dataops/source/source.py DELETED
@@ -1,51 +0,0 @@
-import abc
-from typing import Any, Dict, Optional
-
-from tgedr.dataops.chain import Chain
-
-
-class SourceException(Exception):
-    pass
-
-
-class NoSourceException(SourceException):
-    pass
-
-
-class SourceInterface(metaclass=abc.ABCMeta):
-    """
-    def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-    """
-
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return (
-            hasattr(subclass, "get")
-            and callable(subclass.get)
-            and hasattr(subclass, "list")
-            and callable(subclass.list)
-            or NotImplemented
-        )
-
-
-@SourceInterface.register
-class Source(abc.ABC):
-    """abstract class defining methods ('list' and 'get') to manage retrieval of data from somewhere as defined by implementing classes"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self._config = config
-
-    @abc.abstractmethod
-    def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def list(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-
-
-@SourceInterface.register
-class SourceChain(Chain, abc.ABC):
-    def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        return self.get(context=context)
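The removed Source ABC requires concrete get() and list() implementations. A minimal, hypothetical subclass (not part of the package) illustrating that contract:

from typing import Any, Dict, List, Optional

from tgedr.dataops.source.source import Source, SourceException


class InMemorySource(Source):
    """hypothetical Source implementation backed by a plain dict"""

    def __init__(self, data: Dict[str, Any], config: Optional[Dict[str, Any]] = None):
        super().__init__(config=config)
        self._data = data

    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
        return list(self._data.keys())

    def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
        key = (context or {}).get("key")
        if key not in self._data:
            raise SourceException(f"unknown key: {key}")
        return self._data[key]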
tgedr/dataops/store/fs_single_partition_parquet.py DELETED
@@ -1,231 +0,0 @@
-from abc import ABC, abstractmethod
-import logging
-import os
-from typing import Any, Dict, List, Optional
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
-import pyarrow.compute as pc
-
-from tgedr.dataops.store.store import Store, StoreException
-
-
-logger = logging.getLogger(__name__)
-
-
-def pandas_mapper(arrow_type):
-    if pa.types.is_int64(arrow_type):
-        return pd.Int64Dtype()
-    if pa.types.is_float64(arrow_type):
-        return pd.Float64Dtype()
-    if pa.types.is_string(arrow_type):
-        return pd.StringDtype()
-    # suggest default behavior
-    return None
-
-
-class FsSinglePartitionParquetStore(Store, ABC):
-    """abstract store implementation defining persistence on parquet files with an optional single partition,
-    regardless of the location it should persist"""
-
-    @property
-    @abstractmethod
-    def fs(self):
-        """abstract method providing a filesystem implementation (local, s3, etc...)"""
-        raise NotImplementedError()
-
-    @abstractmethod
-    def _rmdir(self, key):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def _exists(self, key) -> bool:
-        raise NotImplementedError()
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        Store.__init__(self, config)
-        self._fs = None
-
-    def get(
-        self,
-        key: str,
-        filter: callable = None,
-        filters: List[tuple[str, str, List[str]]] = None,
-        schema: pa.Schema = None,
-    ) -> pd.DataFrame:
-        """
-        reads a pandas dataframe from somewhere (key), depending on implementation, eventually enforcing a schema and
-        allowing filtering of data
-
-        Parameters:
-        key (str): location/url/path where data should be persisted
-        filter (Array or array-like or Expression): filter expression (see: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.filter)
-        filters (pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]]): filter expression (see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html)
-        schema: data schema to enforce while reading (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-
-        Returns:
-        pandas.DataFrame: the dataframe
-        """
-        schema_msg_segment = "0" if schema is None else str(len(schema))
-        logger.info(f"[get|in] ({key}, {filter}, {filters}, schema len:{schema_msg_segment})")
-        logger.debug(f"[get|in] ({key}, {filter}, {filters}, {schema})")
-        table = pq.read_table(key, filesystem=self.fs, filters=filters, schema=schema)
-        if filter is not None:
-            table = table.filter(filter)
-        result = table.to_pandas(types_mapper=pandas_mapper)
-        logger.info(f"[get|out] => {result.shape}")
-        return result
-
-    def delete(
-        self,
-        key: str,
-        partition_field: Optional[str] = None,
-        partition_values: Optional[List[str]] = None,
-        kv_dict: Optional[Dict[str, List[Any]]] = None,
-        schema: pa.Schema = None,
-    ):
-        """
-        removes partitions, full or partial, or deletes partial values or a full dataset
-        from a parquet storage somewhere (key), depending on implementation
-
-        Parameters:
-        key (str): location/url/path where data is persisted
-        partition_field (str): name of the partition field in the dataset
-        partition_values (str): partition values to delete
-        kv_dict (Dict[str, List[Any]]): key-value map defining the fields and array of values to become the deletion filter
-        schema: data schema to enforce if reading is required (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-        """
-        schema_msg_segment = "0" if schema is None else str(len(schema))
-        logger.info(
-            f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, schema len:{schema_msg_segment})"
-        )
-        logger.debug(f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, {schema})")
-
-        if partition_values is not None and partition_field is not None:
-            self._remove_partitions(key, partition_field=partition_field, partition_values=partition_values)
-        elif kv_dict is not None and partition_field is not None:
-            table = pq.read_table(key, filesystem=self.fs, schema=schema)
-            for k, v in kv_dict.items():
-                filter_condition = ~pc.is_in(pc.field(k), pa.array(v))
-                table = table.filter(filter_condition)
-            self.delete(key, schema=schema)
-            pq.write_to_dataset(
-                table,
-                root_path=key,
-                partition_cols=[partition_field],
-                existing_data_behavior="delete_matching",
-                filesystem=self.fs,
-                schema=schema,
-            )
-        else:
-            self._rmdir(key)
-
-        logger.info("[delete|out]")
-
-    def save(
-        self,
-        df: pd.DataFrame,
-        key: str,
-        partition_field: Optional[str] = None,
-        append: bool = False,
-        replace_partitions: bool = False,
-        schema: Any = None,
-    ):
-        """
-        saves a pandas dataframe in parquet format somewhere (key), depending on implementation
-
-        Parameters:
-        df (pandas.DataFrame): the dataframe to be saved
-        key (str): location/url/path where data is persisted
-        partition_field (str): name of the partition field in the dataset
-        append (bool): if data should be appended, otherwise will overwrite
-        replace_partitions (bool): if partitions should be replaced, this will delete the data existent on those partitions completely
-        schema: data schema to enforce if reading is required (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-        """
-        schema_msg_segment = "0" if schema is None else str(len(schema))
-        logger.info(
-            f"[save|in] ({df.shape}, {key}, {partition_field}, {append}, {replace_partitions}, schema len:{schema_msg_segment})"
-        )
-        logger.debug(f"[save|in] ({df}, {key}, {partition_field}, {append}, {replace_partitions}, {schema})")
-
-        if schema is not None and isinstance(schema, pa.lib.Schema):
-            # we will order the columns based on the schema
-            columns = [col for col in schema.names]
-            df = df[columns]
-
-        if replace_partitions and append:
-            raise StoreException(f"cannot request for replace_partitions and append at the same time")
-
-        if append:
-            pq.write_to_dataset(
-                pa.Table.from_pandas(df, preserve_index=False),
-                root_path=key,
-                partition_cols=[partition_field],
-                filesystem=self.fs,
-                schema=schema,
-            )
-        elif replace_partitions:
-            partitions = df[partition_field].unique().tolist()
-            self._remove_partitions(key, partition_field, partitions)
-            pq.write_to_dataset(
-                pa.Table.from_pandas(df, preserve_index=False),
-                root_path=key,
-                partition_cols=[partition_field],
-                existing_data_behavior="delete_matching",
-                filesystem=self.fs,
-                schema=schema,
-            )
-        else:
-            self.delete(key)
-            pq.write_to_dataset(
-                pa.Table.from_pandas(df, preserve_index=False),
-                root_path=key,
-                partition_cols=[partition_field],
-                existing_data_behavior="delete_matching",
-                filesystem=self.fs,
-                schema=schema,
-            )
-        logger.info("[save|out]")
-
-    def _remove_partitions(self, key: str, partition_field: str, partition_values: List[str]):
-        logger.debug(f"[_remove_partitions|in] ({key}, {partition_field}, {partition_values})")
-
-        for partition_value in partition_values:
-            partition_key = f"{partition_field}={partition_value}"
-            partition_path = os.path.join(key, partition_key)
-            self._rmdir(partition_path)
-
-        logger.debug("[_remove_partitions|out]")
-
-    def update(
-        self,
-        df: pd.DataFrame,
-        key: str,
-        key_fields: List[str],
-        partition_field: Optional[str] = None,
-        schema: Any = None,
-    ):
-        """
-        updates a pandas dataframe in parquet format somewhere (key), depending on implementation
-
-        Parameters:
-        df (pandas.DataFrame): the dataframe to be saved
-        key (str): location/url/path where data is persisted
-        key_fields (List[str]): primary fields of the dataset used to match the rows to update with the new dataset
-        partition_field (str): name of the partition field to enforce while saving
-        schema: data schema to enforce while reading and saving (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-        """
-        schema_msg_segment = "0" if schema is None else str(len(schema))
-        logger.info(
-            f"[update|in] ({df.shape}, {key}, {key_fields}, {partition_field}, schema len:{schema_msg_segment})"
-        )
-        logger.debug(f"[update|in] ({df}, {key}, {key_fields}, {partition_field}, {schema})")
-
-        df0 = self.get(key, schema=schema)
-        match = pd.merge(df0.reset_index(), df.reset_index(), on=key_fields)
-        index_left = match["index_x"]
-        index_right = match["index_y"]
-        df0.iloc[index_left] = df.iloc[index_right]
-        self.save(df0, key, partition_field=partition_field, schema=schema)
-
-        logger.info(f"[update|out]")
tgedr/dataops/store/local_fs_single_partition_parquet.py DELETED
@@ -1,24 +0,0 @@
-import logging
-from pyarrow import fs
-
-from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
-
-
-logger = logging.getLogger(__name__)
-
-
-class LocalFsSinglePartitionParquetStore(FsSinglePartitionParquetStore):
-    """FsSinglePartitionParquetStore implementation using local file system"""
-
-    @property
-    def fs(self):
-        if self._fs is None:
-            self._fs = fs.LocalFileSystem()
-        return self._fs
-
-    def _rmdir(self, key):
-        if self.fs.get_file_info(key).type.name == "Directory":
-            self.fs.delete_dir(key)
-
-    def _exists(self, key) -> bool:
-        return self.fs.get_file_info(key).type.name != "NotFound"
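A hedged usage sketch of the removed local parquet store, covering the save / replace_partitions / get flow defined in FsSinglePartitionParquetStore above; the dataset path and the dataframe are hypothetical:

import pandas as pd
from tgedr.dataops.store.local_fs_single_partition_parquet import LocalFsSinglePartitionParquetStore

store = LocalFsSinglePartitionParquetStore()
df = pd.DataFrame({"id": [1, 2, 3], "region": ["eu", "eu", "us"], "value": [0.1, 0.2, 0.3]})

# write the dataset partitioned by "region", then replace only the partitions present in a new batch
store.save(df, key="/tmp/dataset", partition_field="region")
store.save(df[df["region"] == "eu"], key="/tmp/dataset", partition_field="region", replace_partitions=True)

# read it back, optionally with pyarrow-style filters
out = store.get(key="/tmp/dataset", filters=[("region", "in", ["eu"])])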