tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -76
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/s3_file_extended_source.py +0 -39
  35. tgedr/dataops/source/source.py +0 -51
  36. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  37. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  38. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  39. tgedr/dataops/store/spark_delta.py +0 -369
  40. tgedr/dataops/store/store.py +0 -49
  41. tgedr/dataops/utils_reflection.py +0 -134
  42. tgedr/dataops/validation/abs.py +0 -46
  43. tgedr/dataops/validation/pandas.py +0 -10
  44. tgedr/dataops/validation/pyspark.py +0 -10
  45. tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
  46. tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
  47. tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
  48. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  50. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  51. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
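The file list shows the package being flattened from the tgedr.dataops namespace to a top-level tgedr_dataops package, so imports need their module prefix updated when upgrading. A minimal sketch of the change, assuming only the path moves shown above:

```python
# 0.0.37: modules lived under the tgedr.dataops namespace package
# from tgedr.dataops.source.s3_file_source import S3FileSource

# 1.0.1: same modules, now under the flat tgedr_dataops package
from tgedr_dataops.source.s3_file_source import S3FileSource

# module-level import of a moved sink, path taken from the file list above
import tgedr_dataops.sink.s3_file_sink
```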
tgedr/dataops/source/delta_table_source.py (deleted)
@@ -1,49 +0,0 @@
- from abc import ABC, abstractmethod
- import logging
- from typing import Any, Dict, List, Optional
- from pandas import DataFrame
- from deltalake import DeltaTable
- from deltalake.exceptions import TableNotFoundError
-
- from tgedr.dataops.source.source import Source, SourceException, NoSourceException
-
-
- logger = logging.getLogger()
-
-
- class DeltaTableSource(Source, ABC):
-     """abstract class used to read delta lake format datasets returning a pandas dataframe"""
-
-     CONTEXT_KEY_URL: str = "url"
-     CONTEXT_KEY_COLUMNS: str = "columns"
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         super().__init__(config=config)
-
-     @property
-     @abstractmethod
-     def _storage_options(self):
-         return None
-
-     def get(self, context: Optional[Dict[str, Any]] = None) -> DataFrame:
-         """retrieves a delta lake table"""
-         logger.info(f"[get|in] ({context})")
-         result: DataFrame = None
-
-         if self.CONTEXT_KEY_URL not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-         columns: List[str] = None
-         if self.CONTEXT_KEY_COLUMNS in context:
-             columns = context[self.CONTEXT_KEY_COLUMNS]
-
-         try:
-             delta_table = DeltaTable(
-                 table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
-             )
-             result = delta_table.to_pandas(columns=columns)
-         except TableNotFoundError as tnfe:
-             raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}")
-
-         logger.info(f"[get|out] => {result}")
-         return result
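The removed DeltaTableSource shows the intended extension point: a concrete subclass only supplies _storage_options, while get() resolves the table from the "url" (and optional "columns") context keys. A minimal sketch against this removed 0.0.37 API; the subclass name and path are illustrative:

```python
from typing import Any, Dict, Optional

from tgedr.dataops.source.delta_table_source import DeltaTableSource


class MyLocalDeltaSource(DeltaTableSource):
    """illustrative subclass: no storage options needed for local access"""

    @property
    def _storage_options(self) -> Optional[Dict[str, Any]]:
        return None


# returns a pandas DataFrame with only the requested columns
df = MyLocalDeltaSource().get(context={"url": "/data/delta/my_table", "columns": ["id", "value"]})
```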
tgedr/dataops/source/local_delta_table.py (deleted)
@@ -1,47 +0,0 @@
- import logging
- import os
- import re
- from typing import Any, Dict, List, Optional
- import glob
-
- from tgedr.dataops.source.delta_table_source import DeltaTableSource
- from tgedr.dataops.source.source import SourceException
-
-
- logger = logging.getLogger()
-
-
- class LocalDeltaTable(DeltaTableSource):
-     """class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe"""
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         super().__init__(config=config)
-
-     @property
-     def _storage_options(self):
-         return None
-
-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-         """lists the available delta lake datasets in the url provided"""
-         logger.info(f"[list|in] ({context})")
-
-         result: List[str] = []
-         if self.CONTEXT_KEY_URL not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-         url = context[self.CONTEXT_KEY_URL]
-         if not os.path.isdir(url):
-             raise SourceException(f"not a delta lake url: {url}")
-
-         matches: set[str] = set()
-         pattern: str = f".*{url}/(.*)/_delta_log/.*"
-         for entry in glob.iglob(url + "**/**", recursive=True):
-             match = re.search(pattern, entry)
-             if match:
-                 matches.add(match.group(1))
-
-         result = list(matches)
-
-         logger.info(f"[list] result: {result}")
-         logger.info(f"[list|out] => result len: {len(result)}")
-         return result
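A short usage sketch for the removed LocalDeltaTable: list() walks the directory tree looking for _delta_log folders, and get() (inherited from DeltaTableSource) loads one table into pandas. The paths are illustrative:

```python
from tgedr.dataops.source.local_delta_table import LocalDeltaTable

source = LocalDeltaTable()
tables = source.list(context={"url": "/data/delta"})       # e.g. ["customers", "orders"]
df = source.get(context={"url": "/data/delta/customers"})  # pandas DataFrame
```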
tgedr/dataops/source/local_fs_file_source.py (deleted)
@@ -1,71 +0,0 @@
- import logging
- import os
- import shutil
- from typing import Any, Dict, List, Optional
-
- from tgedr.dataops.source.source import Source, SourceException
-
-
- logger = logging.getLogger(__name__)
-
-
- class LocalFsFileSource(Source):
-     """source class used to retrieve local objects/files to a another local fs location"""
-
-     CONTEXT_KEY_SOURCE = "source"
-     CONTEXT_KEY_TARGET = "target"
-     CONTEXT_KEY_SUFFIX = "suffix"
-     CONTEXT_KEY_FILES = "files"
-
-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-         logger.info(f"[list|in] ({context})")
-         result: List[str] = []
-         if self.CONTEXT_KEY_SOURCE not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
-
-         source = context[self.CONTEXT_KEY_SOURCE]
-         if os.path.isdir(source):
-             suffix = None
-             if self.CONTEXT_KEY_SUFFIX in context:
-                 suffix = context[self.CONTEXT_KEY_SUFFIX]
-                 result: List[str] = [os.path.join(source, file) for file in os.listdir(source) if file.endswith(suffix)]
-             else:
-                 result: List[str] = [os.path.join(source, file) for file in os.listdir(source)]
-         elif os.path.isfile(source):
-             result: List[str] = [source]
-
-         logger.debug(f"[list|out] => {result}")
-         logger.info(f"[list|out] => result len: {len(result)}")
-         return result
-
-     def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         logger.info(f"[get|in] ({context})")
-
-         if self.CONTEXT_KEY_FILES not in context or self.CONTEXT_KEY_TARGET not in context:
-             raise SourceException(f"{self.CONTEXT_KEY_FILES} and {self.CONTEXT_KEY_TARGET} must be provided in config")
-         files = context[self.CONTEXT_KEY_FILES]
-         target = context[self.CONTEXT_KEY_TARGET]
-
-         if "list" != type(files).__name__:
-             if "string" == type(files).__name__:
-                 files = [files]
-             else:
-                 raise SourceException("files argument must be a list of strings or a string")
-
-         target_is_dir: bool = False
-         if os.path.isdir(target):
-             target_is_dir = True
-
-         result: List[str] = []
-
-         for file in files:
-             basename = os.path.basename(file)
-             if target_is_dir:
-                 new_file = os.path.join(target, basename)
-             else:
-                 new_file = target
-             shutil.copy(file, new_file)
-             result.append(new_file)
-
-         logger.info("[get|out] => {result}")
-         return result
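A usage sketch for the removed LocalFsFileSource, with illustrative paths: list() enumerates files under the "source" directory (optionally filtered by "suffix"), and get() copies the "files" entries into "target", returning the new paths. Note that the type check in get() compares against the type name "string" rather than "str", so in practice only a list of paths was accepted.

```python
from tgedr.dataops.source.local_fs_file_source import LocalFsFileSource

source = LocalFsFileSource()
csv_files = source.list(context={"source": "/tmp/inbox", "suffix": ".csv"})
copied = source.get(context={"files": csv_files, "target": "/tmp/outbox"})  # list of copied paths
```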
tgedr/dataops/source/pd_df_s3_source.py (deleted)
@@ -1,76 +0,0 @@
- from io import StringIO
- import logging
- from typing import Any, Dict, Optional
- import pandas as pd
-
- from tgedr.dataops.commons.utils_fs import process_s3_url
- from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource
- from tgedr.dataops.source.source import SourceException
-
- logger = logging.getLogger()
-
-
- class PdDfS3Source(AbstractS3FileSource):
-     """class used to read a pandas dataframe from a csv file in s3"""
-
-     CONTEXT_KEY_FILE_FORMAT = "file_format"
-     CONTEXT_KEY_SEPARATOR = "sep"
-     CONTEXT_KEY_NO_HEADER = "no_header"
-     CONTEXT_KEY_COLUMN_NAMES = "column_names"
-     CONTEXT_KEY_SCHEMA_TYPES = "schema_types"
-     DEFAULT_FORMAT = "csv"
-     FORMATS = ["csv", "xlsx"]
-     DEFAULT_SEPARATOR = ","
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         super().__init__(config=config)
-
-     def get(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
-         """retrieves a pandas dataframe, by default reading it from a csv,
-         you can ask for a different format using the context key 'file_format' (available formats: csv)"""
-         logger.info(f"[get|in] ({context})")
-         result: pd.DataFrame = None
-
-         if self.CONTEXT_KEY_URL not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-         format: str = self.DEFAULT_FORMAT
-         if self.CONTEXT_KEY_FILE_FORMAT in context:
-             format = context[self.CONTEXT_KEY_FILE_FORMAT]
-             if format not in self.FORMATS:
-                 raise SourceException(f"[get] invalid format: {format}")
-
-         if "csv" == format:
-             result = self.__read_csv(context=context)
-         else:
-             result = self.__read_excel(context=context)
-
-         logger.info(f"[get|out] => {result}")
-         return result
-
-     def __read_csv(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
-         logger.info(f"[__read_csv|in] ({context})")
-
-         protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
-
-         obj = self._client.get_object(Bucket=bucket, Key=key)
-         data = obj["Body"].read().decode("utf-8")
-
-         header = 0 if self.CONTEXT_KEY_NO_HEADER not in context else None
-         names = None if self.CONTEXT_KEY_COLUMN_NAMES not in context else context[self.CONTEXT_KEY_COLUMN_NAMES]
-         dtype = None if self.CONTEXT_KEY_SCHEMA_TYPES not in context else context[self.CONTEXT_KEY_SCHEMA_TYPES]
-         sep = (
-             self.DEFAULT_SEPARATOR if self.CONTEXT_KEY_SEPARATOR not in context else context[self.CONTEXT_KEY_SEPARATOR]
-         )
-
-         result: pd.DataFrame = pd.read_csv(StringIO(data), sep=sep, header=header, names=names, dtype=dtype)
-
-         logger.info(f"[__read_csv|out] => {result}")
-         return result
-
-     def __read_excel(self, context: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
-         logger.info(f"[__read_excel|in] ({context})")
-         src = context[self.CONTEXT_KEY_URL]
-         result: pd.DataFrame = pd.read_excel(src, engine="openpyxl")
-         logger.info(f"[__read_excel|out] => {result}")
-         return result
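A usage sketch for the removed PdDfS3Source, assuming the CONTEXT_KEY_URL inherited from AbstractS3FileSource resolves to "url" as in the delta sources above; bucket and key are illustrative. By default it streams a csv through get_object into pandas; file_format "xlsx" switches to pd.read_excel.

```python
from tgedr.dataops.source.pd_df_s3_source import PdDfS3Source

source = PdDfS3Source()
df = source.get(context={
    "url": "s3://my-bucket/data/input.csv",
    "sep": ";",                       # optional, defaults to ","
    "column_names": ["id", "value"],  # optional, passed to pd.read_csv as names
    "schema_types": {"id": str},      # optional, passed to pd.read_csv as dtype
})
```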
tgedr/dataops/source/s3_delta_table.py (deleted)
@@ -1,75 +0,0 @@
- import logging
- import re
- from typing import Any, Dict, List, Optional
-
- from tgedr.dataops.commons.s3_connector import S3Connector
- from tgedr.dataops.commons.utils_fs import remove_s3_protocol, resolve_s3_protocol
- from tgedr.dataops.source.delta_table_source import DeltaTableSource
- from tgedr.dataops.source.source import SourceException
-
-
- logger = logging.getLogger()
-
-
- class S3DeltaTable(DeltaTableSource, S3Connector):
-     """class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe"""
-
-     CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "AWS_ACCESS_KEY_ID"
-     CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "AWS_SECRET_ACCESS_KEY"
-     CONFIG_KEY_AWS_SESSION_TOKEN: str = "AWS_SESSION_TOKEN"
-     CONFIG_KEY_AWS_REGION: str = "AWS_REGION"
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         DeltaTableSource.__init__(self, config=config)
-         S3Connector.__init__(self)
-
-     @property
-     def _storage_options(self):
-         result = None
-         if (self._config is not None) and all(
-             element in list(self._config.keys())
-             for element in [
-                 self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
-                 self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
-                 self.CONFIG_KEY_AWS_SESSION_TOKEN,
-                 self.CONFIG_KEY_AWS_REGION,
-             ]
-         ):
-             result = {
-                 "AWS_ACCESS_KEY_ID": self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
-                 "AWS_SECRET_ACCESS_KEY": self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
-                 "AWS_SESSION_TOKEN": self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
-                 "AWS_REGION": self._config[self.CONFIG_KEY_AWS_REGION],
-             }
-
-         return result
-
-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-         """lists the available delta lake datasets in the url provided"""
-         logger.info(f"[list|in] ({context})")
-
-         result: List[str] = []
-         if self.CONTEXT_KEY_URL not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-         s3_protocol: str = resolve_s3_protocol(context[self.CONTEXT_KEY_URL])
-         protocol = "" if s3_protocol is None else s3_protocol
-
-         path = remove_s3_protocol(context[self.CONTEXT_KEY_URL])
-         path_elements = path.split("/")
-         bucket = path_elements[0]
-         key = "/".join(path_elements[1:])
-
-         matches: set[str] = set()
-         pattern: str = f".*{key}/(.*)/_delta_log/.*"
-         for entry in self._client.list_objects_v2(Bucket=bucket, Prefix=key)["Contents"]:
-             output_key: str = entry["Key"]
-             match = re.search(pattern, output_key)
-             if match:
-                 matches.add(f"{key}/{match.group(1)}")
-
-         result = list(matches)
-
-         logger.info(f"[list] result: {result}")
-         logger.info(f"[list|out] => result len: {len(result)}")
-         return result
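A usage sketch for the removed S3DeltaTable, with illustrative bucket and credentials: when all four AWS keys are present in the config they are handed to deltalake as storage options, otherwise _storage_options stays None and deltalake resolves credentials on its own.

```python
from tgedr.dataops.source.s3_delta_table import S3DeltaTable

source = S3DeltaTable(config={
    "AWS_ACCESS_KEY_ID": "AKIA...",   # illustrative placeholders
    "AWS_SECRET_ACCESS_KEY": "secret",
    "AWS_SESSION_TOKEN": "token",
    "AWS_REGION": "eu-west-1",
})
tables = source.list(context={"url": "s3://my-bucket/delta"})
df = source.get(context={"url": "s3://my-bucket/delta/customers"})
```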
tgedr/dataops/source/s3_file_extended_source.py (deleted)
@@ -1,39 +0,0 @@
- import logging
- from typing import Any, Dict, Optional
-
- from tgedr.dataops.source.s3_file_source import S3FileSource
- from tgedr.dataops.source.source import SourceException
- from tgedr.dataops.commons.utils_fs import process_s3_path
-
-
- logger = logging.getLogger(__name__)
-
-
- class S3FileExtendedSource(S3FileSource):
-     """class used to retrieve objects/files from s3 bucket to local fs location"""
-
-     METADATA_KEYS = ["LastModified", "ContentLength", "ETag", "VersionId", "ContentType"]
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         super().__init__(config=config)
-
-     def get_metadata(self, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-         logger.info(f"[get_metadata|in] ({context})")
-
-         result: Dict[str, Any] = {}
-         if self.CONTEXT_KEY_SOURCE not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_SOURCE}")
-
-         bucket, key = process_s3_path(context[self.CONTEXT_KEY_SOURCE])
-
-         o = self._client.head_object(Bucket=bucket, Key=key)
-
-         for key in list(o.keys()):
-             if key in self.METADATA_KEYS:
-                 if key == "LastModified":
-                     result[key] = int(o[key].timestamp())
-                 else:
-                     result[key] = o[key]
-
-         logger.info(f"[get_metadata|out] => result len: {result}")
-         return result
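A usage sketch for the removed S3FileExtendedSource, assuming the CONTEXT_KEY_SOURCE inherited from S3FileSource resolves to "source"; the path is illustrative. get_metadata() issues head_object and keeps only the whitelisted keys, converting LastModified to a unix timestamp.

```python
from tgedr.dataops.source.s3_file_extended_source import S3FileExtendedSource

source = S3FileExtendedSource()
meta = source.get_metadata(context={"source": "s3://my-bucket/data/input.csv"})
# e.g. {"LastModified": 1714000000, "ContentLength": 1234, "ETag": "...", "ContentType": "text/csv"}
```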
tgedr/dataops/source/source.py (deleted)
@@ -1,51 +0,0 @@
- import abc
- from typing import Any, Dict, Optional
-
- from tgedr.dataops.chain import Chain
-
-
- class SourceException(Exception):
-     pass
-
-
- class NoSourceException(SourceException):
-     pass
-
-
- class SourceInterface(metaclass=abc.ABCMeta):
-     """
-     def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-     """
-
-     @classmethod
-     def __subclasshook__(cls, subclass):
-         return (
-             hasattr(subclass, "get")
-             and callable(subclass.get)
-             and hasattr(subclass, "list")
-             and callable(subclass.list)
-             or NotImplemented
-         )
-
-
- @SourceInterface.register
- class Source(abc.ABC):
-     """abstract class defining methods ('list' and 'get') to manage retrieval of data from somewhere as defined by implementing classes"""
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         self._config = config
-
-     @abc.abstractmethod
-     def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-
-     @abc.abstractmethod
-     def list(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-
-
- @SourceInterface.register
- class SourceChain(Chain, abc.ABC):
-     def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         return self.get(context=context)
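The __subclasshook__ above makes SourceInterface a structural check: any class exposing callable get() and list() counts as a subclass without inheriting from Source. A minimal sketch; the class below is illustrative only.

```python
from tgedr.dataops.source.source import SourceInterface


class DuckSource:
    """not derived from Source, but matches the get/list shape"""

    def get(self, context=None):
        return "data"

    def list(self, context=None):
        return ["data"]


assert issubclass(DuckSource, SourceInterface)
assert isinstance(DuckSource(), SourceInterface)
```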
tgedr/dataops/store/fs_single_partition_parquet.py (deleted)
@@ -1,231 +0,0 @@
- from abc import ABC, abstractmethod
- import logging
- import os
- from typing import Any, Dict, List, Optional
- import pandas as pd
- import pyarrow as pa
- import pyarrow.parquet as pq
- import pyarrow.compute as pc
-
- from tgedr.dataops.store.store import Store, StoreException
-
-
- logger = logging.getLogger(__name__)
-
-
- def pandas_mapper(arrow_type):
-     if pa.types.is_int64(arrow_type):
-         return pd.Int64Dtype()
-     if pa.types.is_float64(arrow_type):
-         return pd.Float64Dtype()
-     if pa.types.is_string(arrow_type):
-         return pd.StringDtype()
-     # suggest default behavior
-     return None
-
-
- class FsSinglePartitionParquetStore(Store, ABC):
-     """abstract store implementation defining persistence on parquet files with an optional single partition,
-     regardless of the location it should persist"""
-
-     @property
-     @abstractmethod
-     def fs(self):
-         """abstract method providing a filesystem implementation (local, s3, etc...)"""
-         raise NotImplementedError()
-
-     @abstractmethod
-     def _rmdir(self, key):
-         raise NotImplementedError()
-
-     @abstractmethod
-     def _exists(self, key) -> bool:
-         raise NotImplementedError()
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         Store.__init__(self, config)
-         self._fs = None
-
-     def get(
-         self,
-         key: str,
-         filter: callable = None,
-         filters: List[tuple[str, str, List[str]]] = None,
-         schema: pa.Schema = None,
-     ) -> pd.DataFrame:
-         """
-         reads a pandas dataframe from somewhere (key), depending on implementation, eventually enforcing a schema and
-         allowing filtering of data
-
-         Parameters:
-         key (str): location/url/path where data should be persisted
-         filter (Array or array-like or Expression): filter expression (see: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.filter)
-         filters (pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]]): filter expression (see: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html)
-         schema: data schema to enforce while reading (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-
-         Returns:
-         pandas.DataFrame: the dataframe
-         """
-         schema_msg_segment = "0" if schema is None else str(len(schema))
-         logger.info(f"[get|in] ({key}, {filter}, {filters}, schema len:{schema_msg_segment})")
-         logger.debug(f"[get|in] ({key}, {filter}, {filters}, {schema})")
-         table = pq.read_table(key, filesystem=self.fs, filters=filters, schema=schema)
-         if filter is not None:
-             table = table.filter(filter)
-         result = table.to_pandas(types_mapper=pandas_mapper)
-         logger.info(f"[get|out] => {result.shape}")
-         return result
-
-     def delete(
-         self,
-         key: str,
-         partition_field: Optional[str] = None,
-         partition_values: Optional[List[str]] = None,
-         kv_dict: Optional[Dict[str, List[Any]]] = None,
-         schema: pa.Schema = None,
-     ):
-         """
-         removes partitions, full or partial, or deletes partial values or a full dataset
-         from a parquet storage somewhere (key), depending on implementation
-
-         Parameters:
-         key (str): location/url/path where data is persisted
-         partition_field (str): name of the partition field in the dataset
-         partition_values (str): partition values to delete
-         kv_dict (Dict[str, List[Any]]): key-value map defining the fields and array of values to become the deletion filter
-         schema: data schema to enforce if reading is required (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-         """
-         schema_msg_segment = "0" if schema is None else str(len(schema))
-         logger.info(
-             f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, schema len:{schema_msg_segment})"
-         )
-         logger.debug(f"[delete|in] ({key}, {partition_field}, {partition_values}, {kv_dict}, {schema})")
-
-         if partition_values is not None and partition_field is not None:
-             self._remove_partitions(key, partition_field=partition_field, partition_values=partition_values)
-         elif kv_dict is not None and partition_field is not None:
-             table = pq.read_table(key, filesystem=self.fs, schema=schema)
-             for k, v in kv_dict.items():
-                 filter_condition = ~pc.is_in(pc.field(k), pa.array(v))
-                 table = table.filter(filter_condition)
-             self.delete(key, schema=schema)
-             pq.write_to_dataset(
-                 table,
-                 root_path=key,
-                 partition_cols=[partition_field],
-                 existing_data_behavior="delete_matching",
-                 filesystem=self.fs,
-                 schema=schema,
-             )
-         else:
-             self._rmdir(key)
-
-         logger.info("[delete|out]")
-
-     def save(
-         self,
-         df: pd.DataFrame,
-         key: str,
-         partition_field: Optional[str] = None,
-         append: bool = False,
-         replace_partitions: bool = False,
-         schema: Any = None,
-     ):
-         """
-         saves a pandas dataframe in parquet format somewhere (key), depending on implementation
-
-         Parameters:
-         df (pandas.DataFrame): the dataframe to be saved
-         key (str): location/url/path where data is persisted
-         partition_field (str): name of the partition field in the dataset
-         append (bool): if data should be appended, otherwise will overwrite
-         replace_partitions (bool): if partitions should be replaced, this will delete the data existent on those partitions completely
-         schema: data schema to enforce if reading is required (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-         """
-         schema_msg_segment = "0" if schema is None else str(len(schema))
-         logger.info(
-             f"[save|in] ({df.shape}, {key}, {partition_field}, {append}, {replace_partitions}, schema len:{schema_msg_segment})"
-         )
-         logger.debug(f"[save|in] ({df}, {key}, {partition_field}, {append}, {replace_partitions}, {schema})")
-
-         if schema is not None and isinstance(schema, pa.lib.Schema):
-             # we will order the columns based on the schema
-             columns = [col for col in schema.names]
-             df = df[columns]
-
-         if replace_partitions and append:
-             raise StoreException(f"cannot request for replace_partitions and append at the same time")
-
-         if append:
-             pq.write_to_dataset(
-                 pa.Table.from_pandas(df, preserve_index=False),
-                 root_path=key,
-                 partition_cols=[partition_field],
-                 filesystem=self.fs,
-                 schema=schema,
-             )
-         elif replace_partitions:
-             partitions = df[partition_field].unique().tolist()
-             self._remove_partitions(key, partition_field, partitions)
-             pq.write_to_dataset(
-                 pa.Table.from_pandas(df, preserve_index=False),
-                 root_path=key,
-                 partition_cols=[partition_field],
-                 existing_data_behavior="delete_matching",
-                 filesystem=self.fs,
-                 schema=schema,
-             )
-         else:
-             self.delete(key)
-             pq.write_to_dataset(
-                 pa.Table.from_pandas(df, preserve_index=False),
-                 root_path=key,
-                 partition_cols=[partition_field],
-                 existing_data_behavior="delete_matching",
-                 filesystem=self.fs,
-                 schema=schema,
-             )
-         logger.info("[save|out]")
-
-     def _remove_partitions(self, key: str, partition_field: str, partition_values: List[str]):
-         logger.debug(f"[_remove_partitions|in] ({key}, {partition_field}, {partition_values})")
-
-         for partition_value in partition_values:
-             partition_key = f"{partition_field}={partition_value}"
-             partition_path = os.path.join(key, partition_key)
-             self._rmdir(partition_path)
-
-         logger.debug("[_remove_partitions|out]")
-
-     def update(
-         self,
-         df: pd.DataFrame,
-         key: str,
-         key_fields: List[str],
-         partition_field: Optional[str] = None,
-         schema: Any = None,
-     ):
-         """
-         updates a pandas dataframe in parquet format somewhere (key), depending on implementation
-
-         Parameters:
-         df (pandas.DataFrame): the dataframe to be saved
-         key (str): location/url/path where data is persisted
-         key_fields (List[str]): primary fields of the dataset used to match the rows to update with the new dataset
-         partition_field (str): name of the partition field to enforce while saving
-         schema: data schema to enforce while reading and saving (see: https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema)
-         """
-         schema_msg_segment = "0" if schema is None else str(len(schema))
-         logger.info(
-             f"[update|in] ({df.shape}, {key}, {key_fields}, {partition_field}, schema len:{schema_msg_segment})"
-         )
-         logger.debug(f"[update|in] ({df}, {key}, {key_fields}, {partition_field}, {schema})")
-
-         df0 = self.get(key, schema=schema)
-         match = pd.merge(df0.reset_index(), df.reset_index(), on=key_fields)
-         index_left = match["index_x"]
-         index_right = match["index_y"]
-         df0.iloc[index_left] = df.iloc[index_right]
-         self.save(df0, key, partition_field=partition_field, schema=schema)
-
-         logger.info(f"[update|out]")
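The get() method above offers two filtering paths: "filters" is pushed down to pyarrow.parquet.read_table, while "filter" is a pyarrow compute expression applied after the table is loaded. A sketch with illustrative column names; store stands for any concrete subclass.

```python
import pyarrow.compute as pc

# pushdown filter, evaluated by read_table while scanning the dataset
filters = [("year", "in", ["2023", "2024"])]

# expression filter, applied to the loaded table before converting to pandas
expr = pc.field("value") > 10

# df = store.get("/data/parquet/my_dataset", filter=expr, filters=filters)
```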
tgedr/dataops/store/local_fs_single_partition_parquet.py (deleted)
@@ -1,24 +0,0 @@
- import logging
- from pyarrow import fs
-
- from tgedr.dataops.store.fs_single_partition_parquet import FsSinglePartitionParquetStore
-
-
- logger = logging.getLogger(__name__)
-
-
- class LocalFsSinglePartitionParquetStore(FsSinglePartitionParquetStore):
-     """FsSinglePartitionParquetStore implementation using local file system"""
-
-     @property
-     def fs(self):
-         if self._fs is None:
-             self._fs = fs.LocalFileSystem()
-         return self._fs
-
-     def _rmdir(self, key):
-         if self.fs.get_file_info(key).type.name == "Directory":
-             self.fs.delete_dir(key)
-
-     def _exists(self, key) -> bool:
-         return self.fs.get_file_info(key).type.name != "NotFound"
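An end-to-end sketch of the removed local store, with illustrative paths: save a partitioned dataset, read it back, replace one partition, then drop the whole dataset.

```python
import pandas as pd

from tgedr.dataops.store.local_fs_single_partition_parquet import LocalFsSinglePartitionParquetStore

store = LocalFsSinglePartitionParquetStore()
df = pd.DataFrame({"id": [1, 2, 3], "year": ["2023", "2023", "2024"], "value": [10, 20, 30]})

store.save(df, key="/tmp/parquet/my_dataset", partition_field="year")
loaded = store.get("/tmp/parquet/my_dataset")

patch = pd.DataFrame({"id": [4], "year": ["2024"], "value": [40]})
store.save(patch, key="/tmp/parquet/my_dataset", partition_field="year", replace_partitions=True)

store.delete("/tmp/parquet/my_dataset")
```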