tgedr-dataops 0.0.37__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
Files changed (51)
  1. {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
  2. tgedr_dataops/commons/utils_fs.py +187 -0
  3. tgedr_dataops/quality/pandas_validation.py +21 -0
  4. tgedr_dataops/sink/local_fs_file_sink.py +77 -0
  5. {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
  6. tgedr_dataops/source/abstract_s3_file_source.py +72 -0
  7. tgedr_dataops/source/local_fs_file_source.py +108 -0
  8. tgedr_dataops/source/pd_df_s3_source.py +130 -0
  9. {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
  10. tgedr_dataops/source/s3_file_extended_source.py +68 -0
  11. {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +63 -27
  12. tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
  13. tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
  14. tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
  15. tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
  16. tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
  17. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
  18. tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
  19. tgedr/dataops/chain.py +0 -51
  20. tgedr/dataops/commons/dataset.py +0 -23
  21. tgedr/dataops/commons/metadata.py +0 -172
  22. tgedr/dataops/commons/utils_fs.py +0 -85
  23. tgedr/dataops/commons/utils_spark.py +0 -87
  24. tgedr/dataops/etl.py +0 -112
  25. tgedr/dataops/processor.py +0 -27
  26. tgedr/dataops/sink/local_fs_file_sink.py +0 -47
  27. tgedr/dataops/sink/sink.py +0 -46
  28. tgedr/dataops/source/abstract_s3_file_source.py +0 -43
  29. tgedr/dataops/source/delta_table_source.py +0 -49
  30. tgedr/dataops/source/local_delta_table.py +0 -47
  31. tgedr/dataops/source/local_fs_file_source.py +0 -71
  32. tgedr/dataops/source/pd_df_s3_source.py +0 -76
  33. tgedr/dataops/source/s3_delta_table.py +0 -75
  34. tgedr/dataops/source/s3_file_extended_source.py +0 -39
  35. tgedr/dataops/source/source.py +0 -51
  36. tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
  37. tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
  38. tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
  39. tgedr/dataops/store/spark_delta.py +0 -369
  40. tgedr/dataops/store/store.py +0 -49
  41. tgedr/dataops/utils_reflection.py +0 -134
  42. tgedr/dataops/validation/abs.py +0 -46
  43. tgedr/dataops/validation/pandas.py +0 -10
  44. tgedr/dataops/validation/pyspark.py +0 -10
  45. tgedr_dataops-0.0.37.dist-info/METADATA +0 -21
  46. tgedr_dataops-0.0.37.dist-info/RECORD +0 -38
  47. tgedr_dataops-0.0.37.dist-info/top_level.txt +0 -1
  48. {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
  49. {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
  50. {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
  51. {tgedr_dataops-0.0.37.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
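
The rename entries above ({tgedr/dataops → tgedr_dataops}) show the package moving from the dotted tgedr.dataops namespace layout to a flat tgedr_dataops top-level package, so every import path changes across the major version bump. A minimal compatibility sketch, assuming only the module paths changed as listed (their contents and class names are not inspected here):

```python
# Hypothetical shim for code that may run against either wheel; module paths
# are taken from the rename list above, nothing else is assumed.
try:
    from tgedr_dataops.source import s3_file_source  # 1.0.1 flat layout
except ImportError:
    from tgedr.dataops.source import s3_file_source  # 0.0.37 namespace layout
```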
tgedr/dataops/chain.py DELETED
@@ -1,51 +0,0 @@
- import abc
- from typing import Any, Dict, Optional
-
- from tgedr.dataops.processor import Processor
-
-
- class ChainException(Exception):
-     pass
-
-
- class ChainInterface(metaclass=abc.ABCMeta):
-     @classmethod
-     def __subclasshook__(cls, subclass):
-         return (
-             hasattr(subclass, "next")
-             and callable(subclass.next)
-             and hasattr(subclass, "execute")
-             and callable(subclass.execute)
-         ) or NotImplemented
-
-
- class ChainMixin(abc.ABC):
-     def next(self, handler: "ChainMixin") -> "ChainMixin":
-         if "_next" not in self.__dict__ or self._next is None:
-             self._next: "ChainMixin" = handler
-         else:
-             self._next.next(handler)
-         return self
-
-     @abc.abstractmethod
-     def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-
-
- class ProcessorChainMixin(ChainMixin):
-     def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         self.process(context=context)
-         if "_next" in self.__dict__ and self._next is not None:
-             self._next.execute(context=context)
-
-
- @ChainInterface.register
- class ProcessorChain(ProcessorChainMixin, Processor):
-     pass
-
-
- @ChainInterface.register
- class Chain(ChainMixin, abc.ABC):
-     @abc.abstractmethod
-     def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
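
For reference, this is roughly how the removed chain-of-responsibility pieces composed: next() links handlers into a list and execute() runs process() before delegating down the chain. A minimal sketch assuming tgedr-dataops==0.0.37 is installed; UpperCase and Printer are made-up processors for illustration only.

```python
from typing import Any, Dict, Optional

from tgedr.dataops.chain import ProcessorChain  # removed in 1.0.1


class UpperCase(ProcessorChain):
    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
        context["text"] = context["text"].upper()


class Printer(ProcessorChain):
    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
        print(context["text"])


# next() appends the handler; execute() runs process() then the next handler
UpperCase().next(Printer()).execute(context={"text": "hello"})  # prints "HELLO"
```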
tgedr/dataops/commons/dataset.py DELETED
@@ -1,23 +0,0 @@
- from dataclasses import dataclass
- import json
- from pyspark.sql import DataFrame
- from tgedr.dataops.commons.metadata import Metadata
-
-
- @dataclass(frozen=True)
- class Dataset:
-     """
-     utility immutable class to wrap up a dataframe along with metadata
-     """
-
-     __slots__ = ["metadata", "data"]
-     metadata: Metadata
-     data: DataFrame
-
-     def as_dict(self) -> dict:
-         """serialize the dataset as a dictionary"""
-         return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
-
-     def __str__(self) -> str:
-         """serialize the dataset as a json string"""
-         return json.dumps(self.as_dict())
tgedr/dataops/commons/metadata.py DELETED
@@ -1,172 +0,0 @@
- from dataclasses import dataclass
- import json
- from typing import Dict, List, Optional, Union
-
-
- @dataclass(frozen=True)
- class FieldFrame:
-     """
-     class depicting a field values range, to be used in metadata
-
-     Parameters:
-         field (str): the name of the field
-         lower (Union[int, str, float]): field lower bound
-         upper (Union[int, str, float]): field upper bound
-     """
-
-     __slots__ = ["field", "lower", "upper"]
-     field: str
-     lower: Union[int, str, float]
-     upper: Union[int, str, float]
-
-     def as_dict(self) -> Dict[str, Union[int, str, float]]:
-         return {"field": self.field, "lower": self.lower, "upper": self.upper}
-
-     @staticmethod
-     def from_str(src: str) -> "FieldFrame":
-         r = json.loads(src)
-         field = r["field"]
-         lower = r["lower"]
-         upper = r["upper"]
-         return FieldFrame(field=field, lower=lower, upper=upper)
-
-     def __str__(self) -> str:
-         return json.dumps(self.as_dict())
-
-     def __eq__(self, other):
-         return self.field == other.field and self.lower == other.lower and self.upper == other.upper
-
-     def __gt__(self, other):
-         return self.field > other.field or (
-             self.field == other.field
-             and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
-         )
-
-     def __ne__(self, other):
-         return not other == self
-
-     def __ge__(self, other):
-         return other == self or self > other
-
-     def __le__(self, other):
-         return other == self or self < other
-
-     def __lt__(self, other):
-         return other > self
-
-
- @dataclass(frozen=True)
- class Metadata:
-     """immutable class depicting dataset metadata
-
-     Parameters:
-         name (str): the name of the dataset
-         version (Optional[str]): version of this dataset, if available
-         framing (Optional[List[FieldFrame]]): multiple field frames
-         sources (Optional[List["Metadata"]]): metadatas related to the datasets sourcing this one
-     """
-
-     __slots__ = ["name", "version", "framing", "sources"]
-     name: str
-     version: Optional[str]
-     framing: Optional[List[FieldFrame]]
-     sources: Optional[List["Metadata"]]
-
-     def as_dict(self) -> dict:
-         result = {"name": self.name}
-         if self.version is not None:
-             result["version"] = self.version
-         if self.framing is not None:
-             result["framing"] = []
-             for f in self.framing:
-                 (result["framing"]).append(f.as_dict())
-         if self.sources is not None:
-             result["sources"] = []
-             for source in self.sources:
-                 (result["sources"]).append(source.as_dict())
-
-         return result
-
-     def __str__(self):
-         return json.dumps(self.as_dict())
-
-     def __eq__(self, other: object) -> bool:
-         return (
-             self.name == other.name
-             and (
-                 (self.version is None and other.version is None)
-                 or ((self.version is not None and other.version is not None) and self.version == other.version)
-             )
-             and (
-                 (self.framing is None and other.framing is None)
-                 or (
-                     (self.framing is not None and other.framing is not None)
-                     and sorted(self.framing) == sorted(other.framing)
-                 )
-             )
-             and (
-                 (self.sources is None and other.sources is None)
-                 or (
-                     (self.sources is not None and other.sources is not None)
-                     and sorted(self.sources) == sorted(other.sources)
-                 )
-             )
-         )
-
-     def __gt__(self, other):
-         return self.name > other.name or (
-             self.name == other.name
-             and (
-                 ((self.version is not None and other.version is not None) and (self.version > other.version))
-                 or (self.version is not None and other.version is None)
-                 or (
-                     (
-                         (self.framing is not None and other.framing is not None)
-                         and (sorted(self.framing) > sorted(other.framing))
-                     )
-                     or (self.framing is not None and other.framing is None)
-                     or (
-                         (
-                             (self.sources is not None and other.sources is not None)
-                             and (sorted(self.sources) > sorted(other.sources))
-                         )
-                         or (self.sources is not None and other.sources is None)
-                     )
-                 )
-             )
-         )
-
-     def __ne__(self, other):
-         return not other == self
-
-     def __ge__(self, other):
-         return other == self or self > other
-
-     def __le__(self, other):
-         return other == self or self < other
-
-     def __lt__(self, other):
-         return other > self
-
-     @staticmethod
-     def from_str(src: str) -> "Metadata":
-         r = json.loads(src)
-         name = r["name"]
-         version = None if "version" not in r else r["version"]
-
-         framing = None
-         framing_entries = None if "framing" not in r else r["framing"]
-         if framing_entries is not None:
-             framing = []
-             for framing_entry in framing_entries:
-                 framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
-
-         sources = None
-         sources_entries = None if "sources" not in r else r["sources"]
-         if sources_entries is not None:
-             sources = []
-             for source_entry in sources_entries:
-                 source_entry_as_str = json.dumps(source_entry)
-                 sources.append(Metadata.from_str(source_entry_as_str))
-
-         return Metadata(name=name, version=version, framing=framing, sources=sources)
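
Metadata and FieldFrame have no counterpart in the 1.0.1 file list, so the classes are dropped rather than moved. A small round-trip sketch of what the removed API supported, assuming tgedr-dataops==0.0.37 is installed:

```python
from tgedr.dataops.commons.metadata import FieldFrame, Metadata  # removed in 1.0.1

frame = FieldFrame(field="date", lower="2021-01-01", upper="2021-12-31")
meta = Metadata(name="prices", version="2", framing=[frame], sources=None)

serialized = str(meta)                   # JSON via as_dict() / __str__
restored = Metadata.from_str(serialized)
assert restored == meta                  # field-by-field comparison, see __eq__
```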
tgedr/dataops/commons/utils_fs.py DELETED
@@ -1,85 +0,0 @@
- import tempfile
- import re
- from typing import Tuple, Union, AnyStr
- import hashlib
-
-
- def temp_dir(root: str = None, suffix: str = None, prefix: str = None) -> str:
-     return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=root)
-
-
- def temp_file(
-     root: str = None, suffix: str = None, prefix: str = None, discard_handle: bool = True
- ) -> Union[str, Tuple[int, str]]:
-     h, f = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=root)
-     if discard_handle:
-         return f
-     else:
-         return (h, f)
-
-
- def resolve_url_protocol(url: str) -> str:
-     result = None
-     group_match = re.search("(.*://).*", url)
-     if group_match is not None:
-         result = group_match.group(1)
-     return result
-
-
- def resolve_s3_protocol(url: str) -> str:
-     result = None
-     group_match = re.search("(s3[a]?://).*", url)
-     if group_match is not None:
-         result = group_match.group(1)
-     return result
-
-
- def remove_s3_protocol(url: str) -> str:
-     if url.startswith("s3://"):
-         result = url[5:]
-     elif url.startswith("s3a://"):
-         result = url[6:]
-     else:
-         result = url
-     return result
-
-
- def process_s3_path(path: str) -> Tuple[str, str]:
-     no_protocol_path = remove_s3_protocol(path)
-     path_elements = no_protocol_path.split("/")
-     bucket = path_elements[0]
-     key = "/".join(path_elements[1:])
-     return (bucket, key)
-
-
- def process_s3_url(url: str) -> Tuple[str, str, str]:
-     protocol = resolve_s3_protocol(url)
-     no_protocol_url = remove_s3_protocol(url)
-     path_elements = no_protocol_url.split("/")
-     bucket = path_elements[0]
-     key = "/".join(path_elements[1:])
-     return ("" if protocol is None else protocol, bucket, key)
-
-
- def hash_file(filepath, hash_func=hashlib.sha256) -> AnyStr:
-     """Generate a hash for a file.
-
-     Args:
-         filepath (str): The path to the file.
-         hash_func: A hashlib hash function, e.g., hashlib.md5().
-
-     Returns:
-         str: The hexadecimal hash string of the file.
-     """
-     # Initialize the hash object
-     hasher = hash_func()
-
-     # Open the file in binary read mode
-     with open(filepath, "rb") as file:
-         # Read the file in chunks to avoid using too much memory
-         chunk_size = 8192
-         while chunk := file.read(chunk_size):
-             hasher.update(chunk)
-
-     # Return the hexadecimal digest of the hash
-     return hasher.hexdigest()
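
This module is superseded by the larger tgedr_dataops/commons/utils_fs.py listed above (+187 lines). A usage sketch of the removed URL helpers exactly as they stood in 0.0.37:

```python
from tgedr.dataops.commons.utils_fs import process_s3_path, process_s3_url  # 0.0.37 path

bucket, key = process_s3_path("s3://my-bucket/raw/2024/file.parquet")
# bucket == "my-bucket", key == "raw/2024/file.parquet"

protocol, bucket, key = process_s3_url("s3a://my-bucket/raw/2024/file.parquet")
# protocol == "s3a://", bucket == "my-bucket", key == "raw/2024/file.parquet"
```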
tgedr/dataops/commons/utils_spark.py DELETED
@@ -1,87 +0,0 @@
- import logging
- import os
- from typing import Dict
- from pyspark.sql import SparkSession
- from pyspark import SparkConf
- from pyspark.sql import types as T
- from pyspark.context import SparkContext
-
-
- logger = logging.getLogger(__name__)
-
-
- class UtilsSpark:
-     """class with handy functions to work with spark"""
-
-     __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
-     __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
-     __DTYPES_MAP = {
-         "bigint": T.LongType,
-         "string": T.StringType,
-         "double": T.DoubleType,
-         "int": T.IntegerType,
-         "boolean": T.BooleanType,
-         "timestamp": T.TimestampType,
-         "date": T.DateType,
-     }
-
-     @staticmethod
-     def get_local_spark_session(config: dict = None) -> SparkSession:
-         logger.debug(f"[get_local_spark_session|in] ({config})")
-         os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages io.delta:delta-core_2.12:2.3.0 pyspark-shell"
-         builder = (
-             SparkSession.builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
-             .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
-             .config("spark.driver.host", "localhost")
-         )
-
-         if config is not None:
-             for k, v in config.items():
-                 builder.config(k, v)
-
-         spark = builder.getOrCreate()
-
-         logger.debug(f"[get_local_spark_session|out] => {spark}")
-         return spark
-
-     @staticmethod
-     def get_spark_session(config: dict = None) -> SparkSession:
-         logger.debug(f"[get_spark_session|in] ({config})")
-
-         if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
-             spark: SparkSession = UtilsSpark.get_local_spark_session(config)
-         else:
-             if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
-                 active_session = SparkSession.getActiveSession()
-             else:
-                 from awsglue.context import GlueContext
-
-                 glueContext = GlueContext(SparkContext.getOrCreate())
-                 active_session = glueContext.spark_session
-
-             spark_config = SparkConf()
-
-             if active_session is not None:
-                 former_config = active_session.sparkContext.getConf().getAll()
-                 for entry in former_config:
-                     spark_config.set(entry[0], entry[1])
-                 if config is not None:
-                     for k, v in config.items():
-                         spark_config.set(k, v)
-                 spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
-             else:
-                 spark: SparkSession = SparkSession.builder.getOrCreate()
-
-         logger.debug(f"[get_spark_session|out] => {spark}")
-         return spark
-
-     @staticmethod
-     def build_schema_from_dtypes(dtypes_schema: Dict[str, str]) -> T.StructType:
-         logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
-         result = T.StructType()
-         for field, dtype in dtypes_schema.items():
-             new_type = UtilsSpark.__DTYPES_MAP[dtype]
-             result.add(field, new_type(), True)
-
-         logger.info(f"[build_schema_from_dtypes|out] => {result}")
-         return result
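
utils_spark.py has no counterpart in the 1.0.1 file list, so the Spark helpers are dropped rather than relocated. A sketch of the schema helper as it behaved in 0.0.37 (requires pyspark; the dtype names come from the __DTYPES_MAP above):

```python
from tgedr.dataops.commons.utils_spark import UtilsSpark  # removed in 1.0.1

schema = UtilsSpark.build_schema_from_dtypes(
    {"id": "bigint", "name": "string", "price": "double", "created": "timestamp"}
)
# StructType with nullable LongType, StringType, DoubleType and TimestampType
# fields, in the insertion order of the input dict
print(schema.simpleString())
```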
tgedr/dataops/etl.py DELETED
@@ -1,112 +0,0 @@
- from abc import ABC, abstractmethod
- import inspect
- import logging
- from typing import Any, Dict, Optional
-
-
- logger = logging.getLogger(__name__)
-
-
- class EtlException(Exception):
-     pass
-
-
- """
- ETL is an abstract base class that should be extended when you want to run an ETL-like task/job
- A subclass of ETL has extract, transform and load methods.
-
- The ETL class has static utility methods that serve as an outline for the class. Use example below:
-
- ```python
- class MyEtl(Etl):
-     @Etl.inject_configuration
-     def extract(self, MY_PARAM) -> None:
-         # "MY_PARAM" should be supplied in 'configuration' dict otherwise an exception will be raised
-
-     @Etl.inject_configuration
-     def load(self, NOT_IN_CONFIG=123) -> None:
-         # If you try to inject a configuration key that is NOT on the configuration dictionary
-         # supplied to the constructor, it will not throw an error as long as you set a default
-         # value in the method you wish to decorate
-         assert NOT_IN_CONFIG == 123, "This will be ok"
-
- ```
- """
-
-
- class Etl(ABC):
-     def __init__(self, configuration: Optional[Dict[str, Any]] = None) -> None:
-         """Initialize a new instance of ETL.
-
-         Parameters
-         ----------
-         configuration : Dict[str, Any]
-             source for configuration injection
-         """
-         self._configuration = configuration
-
-     @abstractmethod
-     def extract(self) -> Any:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def transform(self) -> Any:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def load(self) -> Any:
-         raise NotImplementedError()
-
-     def validate_extract(self):
-         """
-         Optional extra checks for extract step.
-         """
-
-     def validate_transform(self):
-         """
-         Optional extra checks for transform step.
-         """
-
-     def run(self) -> Any:
-         logger.info("[run|in]")
-
-         self.extract()
-         self.validate_extract()
-
-         self.transform()
-         self.validate_transform()
-
-         result: Any = self.load()
-
-         logger.info(f"[run|out] => {result}")
-         return result
-
-     @staticmethod
-     def inject_configuration(f):
-         def decorator(self):
-             signature = inspect.signature(f)
-
-             missing_params = []
-             params = {}
-             for param in [parameter for parameter in signature.parameters if parameter != "self"]:
-                 if signature.parameters[param].default != inspect._empty:
-                     params[param] = signature.parameters[param].default
-                 else:
-                     params[param] = None
-                     if self._configuration is None or param not in self._configuration:
-                         missing_params.append(param)
-
-                 if self._configuration is not None and param in self._configuration:
-                     params[param] = self._configuration[param]
-
-             if 0 < len(missing_params):
-                 raise EtlException(
-                     f"{type(self).__name__}.{f.__name__}: missing required configuration parameters: {missing_params}"
-                 )
-
-             return f(
-                 self,
-                 *[params[argument] for argument in params],
-             )
-
-         return decorator
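
A runnable version of the docstring example above, assuming tgedr-dataops==0.0.37 is installed; the configuration key and values are illustrative only:

```python
from typing import Any

from tgedr.dataops.etl import Etl  # removed in 1.0.1


class MyEtl(Etl):
    @Etl.inject_configuration
    def extract(self, MY_PARAM) -> Any:
        self._data = MY_PARAM                 # must be present in 'configuration'

    def transform(self) -> Any:
        self._data = self._data.upper()

    @Etl.inject_configuration
    def load(self, NOT_IN_CONFIG=123) -> Any:
        return f"{self._data}/{NOT_IN_CONFIG}"  # default kicks in, no exception


print(MyEtl(configuration={"MY_PARAM": "hello"}).run())  # -> "HELLO/123"
```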
tgedr/dataops/processor.py DELETED
@@ -1,27 +0,0 @@
- import abc
- from typing import Any, Dict, Optional
-
-
- class ProcessorException(Exception):
-     pass
-
-
- class ProcessorInterface(metaclass=abc.ABCMeta):
-     """
-     def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-     """
-
-     @classmethod
-     def __subclasshook__(cls, subclass):
-         return hasattr(subclass, "process") and callable(subclass.process) or NotImplemented
-
-
- @ProcessorInterface.register
- class Processor(abc.ABC):
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         self._config = config
-
-     @abc.abstractmethod
-     def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
tgedr/dataops/sink/local_fs_file_sink.py DELETED
@@ -1,47 +0,0 @@
- import logging
- import os
- import shutil
- from typing import Any, Dict, Optional
-
- from tgedr.dataops.sink.sink import Sink, SinkException
-
-
- logger = logging.getLogger(__name__)
-
-
- class LocalFsFileSink(Sink):
-     """sink class used to save/persist an object/file to a local fs location"""
-
-     CONTEXT_SOURCE_PATH = "source"
-     CONTEXT_TARGET_PATH = "target"
-
-     def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         logger.info(f"[put|in] ({context})")
-
-         if self.CONTEXT_SOURCE_PATH not in context:
-             raise SinkException(f"you must provide context for {self.CONTEXT_SOURCE_PATH}")
-         if self.CONTEXT_TARGET_PATH not in context:
-             raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
-
-         source = context[self.CONTEXT_SOURCE_PATH]
-         target = context[self.CONTEXT_TARGET_PATH]
-
-         shutil.copy(source, target)
-         logger.info("[put|out]")
-
-     def delete(self, context: Optional[Dict[str, Any]] = None):
-         logger.info(f"[delete|in] ({context})")
-
-         if self.CONTEXT_TARGET_PATH not in context:
-             raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
-
-         target = context[self.CONTEXT_TARGET_PATH]
-
-         if os.path.isfile(target):
-             os.remove(target)
-         elif os.path.isdir(target):
-             shutil.rmtree(target)
-         else:
-             raise SinkException(f"[delete] is it a dir or a folder? {target}")
-
-         logger.info("[delete|out]")
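
A usage sketch of the removed sink as it behaved in 0.0.37 (a rewritten local_fs_file_sink.py appears under tgedr_dataops/sink/ in 1.0.1); the paths are illustrative:

```python
from tgedr.dataops.sink.local_fs_file_sink import LocalFsFileSink  # 0.0.37 path

sink = LocalFsFileSink()
sink.put(context={"source": "/tmp/report.csv", "target": "/data/report.csv"})
sink.delete(context={"target": "/data/report.csv"})  # removes a file or a directory tree
```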
tgedr/dataops/sink/sink.py DELETED
@@ -1,46 +0,0 @@
- import abc
- from typing import Any, Dict, Optional
-
- from tgedr.dataops.chain import Chain
-
-
- class SinkException(Exception):
-     pass
-
-
- class SinkInterface(metaclass=abc.ABCMeta):
-     """
-     def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-     """
-
-     @classmethod
-     def __subclasshook__(cls, subclass):
-         return (
-             hasattr(subclass, "put")
-             and callable(subclass.put)
-             and hasattr(subclass, "delete")
-             and callable(subclass.delete)
-         ) or NotImplemented
-
-
- @SinkInterface.register
- class Sink(abc.ABC):
-     """abstract class defining methods ('put' and 'delete') to manage persistence of data somewhere as defined by implementing classes"""
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         self._config = config
-
-     @abc.abstractmethod
-     def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         raise NotImplementedError()
-
-     @abc.abstractmethod
-     def delete(self, context: Optional[Dict[str, Any]] = None):
-         raise NotImplementedError()
-
-
- @SinkInterface.register
- class SinkChain(Chain, abc.ABC):
-     def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-         return self.put(context=context)
tgedr/dataops/source/abstract_s3_file_source.py DELETED
@@ -1,43 +0,0 @@
- from abc import ABC
- import logging
- from typing import Any, Dict, List, Optional
-
- from tgedr.dataops.commons.s3_connector import S3Connector
- from tgedr.dataops.commons.utils_fs import process_s3_url
- from tgedr.dataops.source.source import Source, SourceException
-
-
- logger = logging.getLogger()
-
-
- class AbstractS3FileSource(Source, S3Connector, ABC):
-     """abstract class used to read file sources from s3"""
-
-     CONTEXT_KEY_URL = "url"
-     CONTEXT_KEY_SUFFIX = "suffix"
-
-     def __init__(self, config: Optional[Dict[str, Any]] = None):
-         Source.__init__(self, config=config)
-         S3Connector.__init__(self)
-
-     def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-         logger.info(f"[list|in] ({context})")
-
-         result: List[str] = []
-         if self.CONTEXT_KEY_URL not in context:
-             raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-         protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
-
-         objs = self._client.list_objects_v2(Bucket=bucket, Prefix=key)
-         result = [
-             (protocol + bucket + "/" + entry["Key"]) for entry in objs["Contents"] if not (entry["Key"]).endswith("/")
-         ]
-
-         if self.CONTEXT_KEY_SUFFIX in context:
-             suffix: str = context[self.CONTEXT_KEY_SUFFIX]
-             result = [f for f in result if f.endswith(suffix)]
-
-         logger.debug(f"[list|out] => {result}")
-         logger.info(f"[list|out] => result len: {len(result)}")
-         return result
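
A sketch of the list() contract shown above, assuming tgedr-dataops==0.0.37 with valid AWS credentials and assuming get() is the only abstract method left on Source; MyS3Listing and the bucket URL are hypothetical, written only to exercise the inherited list():

```python
from typing import Any, Dict, Optional

from tgedr.dataops.source.abstract_s3_file_source import AbstractS3FileSource  # 0.0.37 path


class MyS3Listing(AbstractS3FileSource):
    # hypothetical concrete subclass; only the inherited list() is used here
    def get(self, context: Optional[Dict[str, Any]] = None) -> Any:
        raise NotImplementedError()


files = MyS3Listing().list(
    context={"url": "s3://my-bucket/landing/", "suffix": ".parquet"}
)
# -> ["s3://my-bucket/landing/part-0000.parquet", ...] (non-folder keys under the prefix)
```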