tgedr-dataops 0.0.36__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tgedr/dataops → tgedr_dataops}/commons/s3_connector.py +32 -5
- tgedr_dataops/commons/utils_fs.py +187 -0
- tgedr_dataops/quality/pandas_validation.py +21 -0
- tgedr_dataops/sink/local_fs_file_sink.py +77 -0
- {tgedr/dataops → tgedr_dataops}/sink/s3_file_sink.py +47 -11
- tgedr_dataops/source/abstract_s3_file_source.py +72 -0
- tgedr_dataops/source/local_fs_file_source.py +108 -0
- tgedr_dataops/source/pd_df_s3_source.py +130 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_copy.py +64 -28
- tgedr_dataops/source/s3_file_extended_source.py +68 -0
- {tgedr/dataops → tgedr_dataops}/source/s3_file_source.py +60 -39
- tgedr_dataops/store/fs_single_partition_parquet.py +331 -0
- tgedr_dataops/store/local_fs_single_partition_parquet.py +56 -0
- tgedr_dataops/store/s3_single_partition_parquet.py +193 -0
- tgedr_dataops-1.0.1.dist-info/METADATA +72 -0
- tgedr_dataops-1.0.1.dist-info/RECORD +22 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info}/WHEEL +1 -1
- tgedr_dataops-1.0.1.dist-info/top_level.txt +1 -0
- tgedr/dataops/chain.py +0 -51
- tgedr/dataops/commons/dataset.py +0 -23
- tgedr/dataops/commons/metadata.py +0 -172
- tgedr/dataops/commons/utils_fs.py +0 -85
- tgedr/dataops/commons/utils_spark.py +0 -87
- tgedr/dataops/etl.py +0 -112
- tgedr/dataops/processor.py +0 -27
- tgedr/dataops/sink/local_fs_file_sink.py +0 -47
- tgedr/dataops/sink/sink.py +0 -46
- tgedr/dataops/source/abstract_s3_file_source.py +0 -43
- tgedr/dataops/source/delta_table_source.py +0 -49
- tgedr/dataops/source/local_delta_table.py +0 -47
- tgedr/dataops/source/local_fs_file_source.py +0 -71
- tgedr/dataops/source/pd_df_s3_source.py +0 -51
- tgedr/dataops/source/s3_delta_table.py +0 -75
- tgedr/dataops/source/source.py +0 -51
- tgedr/dataops/store/fs_single_partition_parquet.py +0 -231
- tgedr/dataops/store/local_fs_single_partition_parquet.py +0 -24
- tgedr/dataops/store/s3_single_partition_parquet.py +0 -102
- tgedr/dataops/store/spark_delta.py +0 -369
- tgedr/dataops/store/store.py +0 -49
- tgedr/dataops/utils_reflection.py +0 -134
- tgedr/dataops/validation/abs.py +0 -46
- tgedr/dataops/validation/pandas.py +0 -10
- tgedr/dataops/validation/pyspark.py +0 -10
- tgedr_dataops-0.0.36.dist-info/METADATA +0 -20
- tgedr_dataops-0.0.36.dist-info/RECORD +0 -37
- tgedr_dataops-0.0.36.dist-info/top_level.txt +0 -1
- {tgedr/dataops → tgedr_dataops}/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/sink/__init__.py +0 -0
- {tgedr/dataops → tgedr_dataops}/source/__init__.py +0 -0
- {tgedr_dataops-0.0.36.dist-info → tgedr_dataops-1.0.1.dist-info/licenses}/LICENSE +0 -0
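The file renames above show the package's import root moving from the `tgedr.dataops` namespace to a flat `tgedr_dataops` top-level package (see the new top_level.txt). A minimal, hypothetical migration sketch, assuming a helper that survived the move kept its name (`process_s3_url` is used only as an illustration):

```python
# Hypothetical import migration, assuming the helper kept its name across versions.
# Old layout (tgedr-dataops 0.0.36):
#   from tgedr.dataops.commons.utils_fs import process_s3_url
# New layout (tgedr-dataops 1.0.1):
from tgedr_dataops.commons.utils_fs import process_s3_url

protocol, bucket, key = process_s3_url("s3://my-bucket/some/key.parquet")
```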
tgedr/dataops/chain.py
DELETED
@@ -1,51 +0,0 @@
-import abc
-from typing import Any, Dict, Optional
-
-from tgedr.dataops.processor import Processor
-
-
-class ChainException(Exception):
-    pass
-
-
-class ChainInterface(metaclass=abc.ABCMeta):
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return (
-            hasattr(subclass, "next")
-            and callable(subclass.next)
-            and hasattr(subclass, "execute")
-            and callable(subclass.execute)
-        ) or NotImplemented
-
-
-class ChainMixin(abc.ABC):
-    def next(self, handler: "ChainMixin") -> "ChainMixin":
-        if "_next" not in self.__dict__ or self._next is None:
-            self._next: "ChainMixin" = handler
-        else:
-            self._next.next(handler)
-        return self
-
-    @abc.abstractmethod
-    def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-
-
-class ProcessorChainMixin(ChainMixin):
-    def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        self.process(context=context)
-        if "_next" in self.__dict__ and self._next is not None:
-            self._next.execute(context=context)
-
-
-@ChainInterface.register
-class ProcessorChain(ProcessorChainMixin, Processor):
-    pass
-
-
-@ChainInterface.register
-class Chain(ChainMixin, abc.ABC):
-    @abc.abstractmethod
-    def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
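For context, a minimal sketch of how the removed chain-of-responsibility API (tgedr.dataops.chain, 0.0.36 only) was meant to be wired together; the handler classes and context keys below are hypothetical, not part of the package:

```python
# Minimal sketch against the removed 0.0.36 chain API; Normalize/Tag are hypothetical handlers.
from typing import Any, Dict, Optional

from tgedr.dataops.chain import ProcessorChain


class Normalize(ProcessorChain):
    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
        context["value"] = context["value"].strip().lower()


class Tag(ProcessorChain):
    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
        context["tag"] = self._config.get("tag") if self._config else None


# next() appends to the tail of the chain and returns the head, so calls can be fluently chained
chain = Normalize().next(Tag(config={"tag": "clean"}))
ctx = {"value": "  Hello "}
chain.execute(context=ctx)  # runs Normalize.process, then Tag.process
print(ctx)  # {'value': 'hello', 'tag': 'clean'}
```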
tgedr/dataops/commons/dataset.py
DELETED
@@ -1,23 +0,0 @@
-from dataclasses import dataclass
-import json
-from pyspark.sql import DataFrame
-from tgedr.dataops.commons.metadata import Metadata
-
-
-@dataclass(frozen=True)
-class Dataset:
-    """
-    utility immutable class to wrap up a dataframe along with metadata
-    """
-
-    __slots__ = ["metadata", "data"]
-    metadata: Metadata
-    data: DataFrame
-
-    def as_dict(self) -> dict:
-        """serialize the dataset as a dictionary"""
-        return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
-
-    def __str__(self) -> str:
-        """serialize the dataset as a json string"""
-        return json.dumps(self.as_dict())
tgedr/dataops/commons/metadata.py
DELETED
@@ -1,172 +0,0 @@
-from dataclasses import dataclass
-import json
-from typing import Dict, List, Optional, Union
-
-
-@dataclass(frozen=True)
-class FieldFrame:
-    """
-    class depicting a field values range, to be used in metadata
-
-    Parameters:
-        field (str): the name of the field
-        lower (Union[int, str, float]): field lower bound
-        upper (Union[int, str, float]): field upper bound
-    """
-
-    __slots__ = ["field", "lower", "upper"]
-    field: str
-    lower: Union[int, str, float]
-    upper: Union[int, str, float]
-
-    def as_dict(self) -> Dict[str, Union[int, str, float]]:
-        return {"field": self.field, "lower": self.lower, "upper": self.upper}
-
-    @staticmethod
-    def from_str(src: str) -> "FieldFrame":
-        r = json.loads(src)
-        field = r["field"]
-        lower = r["lower"]
-        upper = r["upper"]
-        return FieldFrame(field=field, lower=lower, upper=upper)
-
-    def __str__(self) -> str:
-        return json.dumps(self.as_dict())
-
-    def __eq__(self, other):
-        return self.field == other.field and self.lower == other.lower and self.upper == other.upper
-
-    def __gt__(self, other):
-        return self.field > other.field or (
-            self.field == other.field
-            and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
-        )
-
-    def __ne__(self, other):
-        return not other == self
-
-    def __ge__(self, other):
-        return other == self or self > other
-
-    def __le__(self, other):
-        return other == self or self < other
-
-    def __lt__(self, other):
-        return other > self
-
-
-@dataclass(frozen=True)
-class Metadata:
-    """immutable class depicting dataset metadata
-
-    Parameters:
-        name (str): the name of the dataset
-        version (Optional[str]): version of this dataset, if available
-        framing (Optional[List[FieldFrame]]): multiple field frames
-        sources (Optional[List["Metadata"]]): metadatas related to the datasets sourcing this one
-    """
-
-    __slots__ = ["name", "version", "framing", "sources"]
-    name: str
-    version: Optional[str]
-    framing: Optional[List[FieldFrame]]
-    sources: Optional[List["Metadata"]]
-
-    def as_dict(self) -> dict:
-        result = {"name": self.name}
-        if self.version is not None:
-            result["version"] = self.version
-        if self.framing is not None:
-            result["framing"] = []
-            for f in self.framing:
-                (result["framing"]).append(f.as_dict())
-        if self.sources is not None:
-            result["sources"] = []
-            for source in self.sources:
-                (result["sources"]).append(source.as_dict())
-
-        return result
-
-    def __str__(self):
-        return json.dumps(self.as_dict())
-
-    def __eq__(self, other: object) -> bool:
-        return (
-            self.name == other.name
-            and (
-                (self.version is None and other.version is None)
-                or ((self.version is not None and other.version is not None) and self.version == other.version)
-            )
-            and (
-                (self.framing is None and other.framing is None)
-                or (
-                    (self.framing is not None and other.framing is not None)
-                    and sorted(self.framing) == sorted(other.framing)
-                )
-            )
-            and (
-                (self.sources is None and other.sources is None)
-                or (
-                    (self.sources is not None and other.sources is not None)
-                    and sorted(self.sources) == sorted(other.sources)
-                )
-            )
-        )
-
-    def __gt__(self, other):
-        return self.name > other.name or (
-            self.name == other.name
-            and (
-                ((self.version is not None and other.version is not None) and (self.version > other.version))
-                or (self.version is not None and other.version is None)
-                or (
-                    (
-                        (self.framing is not None and other.framing is not None)
-                        and (sorted(self.framing) > sorted(other.framing))
-                    )
-                    or (self.framing is not None and other.framing is None)
-                    or (
-                        (
-                            (self.sources is not None and other.sources is not None)
-                            and (sorted(self.sources) > sorted(other.sources))
-                        )
-                        or (self.sources is not None and other.sources is None)
-                    )
-                )
-            )
-        )
-
-    def __ne__(self, other):
-        return not other == self
-
-    def __ge__(self, other):
-        return other == self or self > other
-
-    def __le__(self, other):
-        return other == self or self < other
-
-    def __lt__(self, other):
-        return other > self
-
-    @staticmethod
-    def from_str(src: str) -> "Metadata":
-        r = json.loads(src)
-        name = r["name"]
-        version = None if "version" not in r else r["version"]
-
-        framing = None
-        framing_entries = None if "framing" not in r else r["framing"]
-        if framing_entries is not None:
-            framing = []
-            for framing_entry in framing_entries:
-                framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
-
-        sources = None
-        sources_entries = None if "sources" not in r else r["sources"]
-        if sources_entries is not None:
-            sources = []
-            for source_entry in sources_entries:
-                source_entry_as_str = json.dumps(source_entry)
-                sources.append(Metadata.from_str(source_entry_as_str))
-
-        return Metadata(name=name, version=version, framing=framing, sources=sources)
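A short, hypothetical round-trip with the removed FieldFrame/Metadata classes, sketched from the code above (field names and values are illustrative):

```python
# Illustrative round-trip through str()/from_str() of the removed metadata classes.
from tgedr.dataops.commons.metadata import FieldFrame, Metadata

frame = FieldFrame(field="date", lower="2023-01-01", upper="2023-12-31")
meta = Metadata(name="prices", version="1", framing=[frame], sources=None)

as_json = str(meta)                         # json with "name", "version" and "framing" keys
assert Metadata.from_str(as_json) == meta   # from_str rebuilds an equal instance
```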
tgedr/dataops/commons/utils_fs.py
DELETED
@@ -1,85 +0,0 @@
-import tempfile
-import re
-from typing import Tuple, Union, AnyStr
-import hashlib
-
-
-def temp_dir(root: str = None, suffix: str = None, prefix: str = None) -> str:
-    return tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=root)
-
-
-def temp_file(
-    root: str = None, suffix: str = None, prefix: str = None, discard_handle: bool = True
-) -> Union[str, Tuple[int, str]]:
-    h, f = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=root)
-    if discard_handle:
-        return f
-    else:
-        return (h, f)
-
-
-def resolve_url_protocol(url: str) -> str:
-    result = None
-    group_match = re.search("(.*://).*", url)
-    if group_match is not None:
-        result = group_match.group(1)
-    return result
-
-
-def resolve_s3_protocol(url: str) -> str:
-    result = None
-    group_match = re.search("(s3[a]?://).*", url)
-    if group_match is not None:
-        result = group_match.group(1)
-    return result
-
-
-def remove_s3_protocol(url: str) -> str:
-    if url.startswith("s3://"):
-        result = url[5:]
-    elif url.startswith("s3a://"):
-        result = url[6:]
-    else:
-        result = url
-    return result
-
-
-def process_s3_path(path: str) -> Tuple[str, str]:
-    no_protocol_path = remove_s3_protocol(path)
-    path_elements = no_protocol_path.split("/")
-    bucket = path_elements[0]
-    key = "/".join(path_elements[1:])
-    return (bucket, key)
-
-
-def process_s3_url(url: str) -> Tuple[str, str, str]:
-    protocol = resolve_s3_protocol(url)
-    no_protocol_url = remove_s3_protocol(url)
-    path_elements = no_protocol_url.split("/")
-    bucket = path_elements[0]
-    key = "/".join(path_elements[1:])
-    return ("" if protocol is None else protocol, bucket, key)
-
-
-def hash_file(filepath, hash_func=hashlib.sha256) -> AnyStr:
-    """Generate a hash for a file.
-
-    Args:
-        filepath (str): The path to the file.
-        hash_func: A hashlib hash function, e.g., hashlib.md5().
-
-    Returns:
-        str: The hexadecimal hash string of the file.
-    """
-    # Initialize the hash object
-    hasher = hash_func()
-
-    # Open the file in binary read mode
-    with open(filepath, "rb") as file:
-        # Read the file in chunks to avoid using too much memory
-        chunk_size = 8192
-        while chunk := file.read(chunk_size):
-            hasher.update(chunk)
-
-    # Return the hexadecimal digest of the hash
-    return hasher.hexdigest()
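A quick illustration of the removed S3 URL helpers, based on the functions above (the bucket and key are made up):

```python
# Splitting S3 URLs/paths with the removed 0.0.36 helpers; values are illustrative only.
from tgedr.dataops.commons.utils_fs import process_s3_path, process_s3_url

assert process_s3_url("s3a://my-bucket/raw/2024/file.parquet") == (
    "s3a://", "my-bucket", "raw/2024/file.parquet"
)
assert process_s3_path("my-bucket/raw/2024/file.parquet") == ("my-bucket", "raw/2024/file.parquet")
```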
tgedr/dataops/commons/utils_spark.py
DELETED
@@ -1,87 +0,0 @@
-import logging
-import os
-from typing import Dict
-from pyspark.sql import SparkSession
-from pyspark import SparkConf
-from pyspark.sql import types as T
-from pyspark.context import SparkContext
-
-
-logger = logging.getLogger(__name__)
-
-
-class UtilsSpark:
-    """class with handy functions to work with spark"""
-
-    __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
-    __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
-    __DTYPES_MAP = {
-        "bigint": T.LongType,
-        "string": T.StringType,
-        "double": T.DoubleType,
-        "int": T.IntegerType,
-        "boolean": T.BooleanType,
-        "timestamp": T.TimestampType,
-        "date": T.DateType,
-    }
-
-    @staticmethod
-    def get_local_spark_session(config: dict = None) -> SparkSession:
-        logger.debug(f"[get_local_spark_session|in] ({config})")
-        os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages io.delta:delta-core_2.12:2.3.0 pyspark-shell"
-        builder = (
-            SparkSession.builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
-            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
-            .config("spark.driver.host", "localhost")
-        )
-
-        if config is not None:
-            for k, v in config.items():
-                builder.config(k, v)
-
-        spark = builder.getOrCreate()
-
-        logger.debug(f"[get_local_spark_session|out] => {spark}")
-        return spark
-
-    @staticmethod
-    def get_spark_session(config: dict = None) -> SparkSession:
-        logger.debug(f"[get_spark_session|in] ({config})")
-
-        if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
-            spark: SparkSession = UtilsSpark.get_local_spark_session(config)
-        else:
-            if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
-                active_session = SparkSession.getActiveSession()
-            else:
-                from awsglue.context import GlueContext
-
-                glueContext = GlueContext(SparkContext.getOrCreate())
-                active_session = glueContext.spark_session
-
-            spark_config = SparkConf()
-
-            if active_session is not None:
-                former_config = active_session.sparkContext.getConf().getAll()
-                for entry in former_config:
-                    spark_config.set(entry[0], entry[1])
-                if config is not None:
-                    for k, v in config.items():
-                        spark_config.set(k, v)
-                spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
-            else:
-                spark: SparkSession = SparkSession.builder.getOrCreate()
-
-        logger.debug(f"[get_spark_session|out] => {spark}")
-        return spark
-
-    @staticmethod
-    def build_schema_from_dtypes(dtypes_schema: Dict[str, str]) -> T.StructType:
-        logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
-        result = T.StructType()
-        for field, dtype in dtypes_schema.items():
-            new_type = UtilsSpark.__DTYPES_MAP[dtype]
-            result.add(field, new_type(), True)
-
-        logger.info(f"[build_schema_from_dtypes|out] => {result}")
-        return result
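A sketch of the removed UtilsSpark.build_schema_from_dtypes in use, assuming pyspark is installed; the column names are illustrative:

```python
# Mapping dtype strings to a pyspark StructType with the removed helper (illustrative columns).
from tgedr.dataops.commons.utils_spark import UtilsSpark

schema = UtilsSpark.build_schema_from_dtypes({"id": "bigint", "price": "double", "day": "date"})
# -> StructType with nullable fields: id (LongType), price (DoubleType), day (DateType)
```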
tgedr/dataops/etl.py
DELETED
@@ -1,112 +0,0 @@
-from abc import ABC, abstractmethod
-import inspect
-import logging
-from typing import Any, Dict, Optional
-
-
-logger = logging.getLogger(__name__)
-
-
-class EtlException(Exception):
-    pass
-
-
-"""
-ETL is an abstract base class that should be extended when you want to run an ETL-like task/job
-A subclass of ETL has extract, transform and load methods.
-
-The ETL class has static utility methods that serve as an outline for the class. Use example below:
-
-```python
-class MyEtl(Etl):
-    @Etl.inject_configuration
-    def extract(self, MY_PARAM) -> None:
-        # "MY_PARAM" should be supplied in 'configuration' dict otherwise an exception will be raised
-
-    @Etl.inject_configuration
-    def load(self, NOT_IN_CONFIG=123) -> None:
-        # If you try to inject a configuration key that is NOT on the configuration dictionary
-        # supplied to the constructor, it will not throw an error as long as you set a default
-        # value in the method you wish to decorate
-        assert NOT_IN_CONFIG == 123, "This will be ok"

-```
-"""
-
-
-class Etl(ABC):
-    def __init__(self, configuration: Optional[Dict[str, Any]] = None) -> None:
-        """Initialize a new instance of ETL.
-
-        Parameters
-        ----------
-        configuration : Dict[str, Any]
-            source for configuration injection
-        """
-        self._configuration = configuration
-
-    @abstractmethod
-    def extract(self) -> Any:
-        raise NotImplementedError()
-
-    @abstractmethod
-    def transform(self) -> Any:
-        raise NotImplementedError()
-
-    @abstractmethod
-    def load(self) -> Any:
-        raise NotImplementedError()
-
-    def validate_extract(self):
-        """
-        Optional extra checks for extract step.
-        """
-
-    def validate_transform(self):
-        """
-        Optional extra checks for transform step.
-        """
-
-    def run(self) -> Any:
-        logger.info("[run|in]")
-
-        self.extract()
-        self.validate_extract()
-
-        self.transform()
-        self.validate_transform()
-
-        result: Any = self.load()
-
-        logger.info(f"[run|out] => {result}")
-        return result
-
-    @staticmethod
-    def inject_configuration(f):
-        def decorator(self):
-            signature = inspect.signature(f)
-
-            missing_params = []
-            params = {}
-            for param in [parameter for parameter in signature.parameters if parameter != "self"]:
-                if signature.parameters[param].default != inspect._empty:
-                    params[param] = signature.parameters[param].default
-                else:
-                    params[param] = None
-                    if self._configuration is None or param not in self._configuration:
-                        missing_params.append(param)
-
-                if self._configuration is not None and param in self._configuration:
-                    params[param] = self._configuration[param]
-
-            if 0 < len(missing_params):
-                raise EtlException(
-                    f"{type(self).__name__}.{f.__name__}: missing required configuration parameters: {missing_params}"
-                )
-
-            return f(
-                self,
-                *[params[argument] for argument in params],
-            )
-
-        return decorator
tgedr/dataops/processor.py
DELETED
@@ -1,27 +0,0 @@
-import abc
-from typing import Any, Dict, Optional
-
-
-class ProcessorException(Exception):
-    pass
-
-
-class ProcessorInterface(metaclass=abc.ABCMeta):
-    """
-    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-    """
-
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return hasattr(subclass, "process") and callable(subclass.process) or NotImplemented
-
-
-@ProcessorInterface.register
-class Processor(abc.ABC):
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self._config = config
-
-    @abc.abstractmethod
-    def process(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
tgedr/dataops/sink/local_fs_file_sink.py
DELETED
@@ -1,47 +0,0 @@
-import logging
-import os
-import shutil
-from typing import Any, Dict, Optional
-
-from tgedr.dataops.sink.sink import Sink, SinkException
-
-
-logger = logging.getLogger(__name__)
-
-
-class LocalFsFileSink(Sink):
-    """sink class used to save/persist an object/file to a local fs location"""
-
-    CONTEXT_SOURCE_PATH = "source"
-    CONTEXT_TARGET_PATH = "target"
-
-    def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        logger.info(f"[put|in] ({context})")
-
-        if self.CONTEXT_SOURCE_PATH not in context:
-            raise SinkException(f"you must provide context for {self.CONTEXT_SOURCE_PATH}")
-        if self.CONTEXT_TARGET_PATH not in context:
-            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
-
-        source = context[self.CONTEXT_SOURCE_PATH]
-        target = context[self.CONTEXT_TARGET_PATH]
-
-        shutil.copy(source, target)
-        logger.info("[put|out]")
-
-    def delete(self, context: Optional[Dict[str, Any]] = None):
-        logger.info(f"[delete|in] ({context})")
-
-        if self.CONTEXT_TARGET_PATH not in context:
-            raise SinkException(f"you must provide context for {self.CONTEXT_TARGET_PATH}")
-
-        target = context[self.CONTEXT_TARGET_PATH]
-
-        if os.path.isfile(target):
-            os.remove(target)
-        elif os.path.isdir(target):
-            shutil.rmtree(target)
-        else:
-            raise SinkException(f"[delete] is it a dir or a folder? {target}")
-
-        logger.info("[delete|out]")
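A hypothetical call sequence for the removed LocalFsFileSink, following its put/delete contract above (paths are illustrative):

```python
# Copy a file to a local target, then remove the copy; paths are made up for illustration.
from tgedr.dataops.sink.local_fs_file_sink import LocalFsFileSink

sink = LocalFsFileSink()
sink.put(context={"source": "/tmp/report.csv", "target": "/tmp/backup/report.csv"})
sink.delete(context={"target": "/tmp/backup/report.csv"})
```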
tgedr/dataops/sink/sink.py
DELETED
@@ -1,46 +0,0 @@
-import abc
-from typing import Any, Dict, Optional
-
-from tgedr.dataops.chain import Chain
-
-
-class SinkException(Exception):
-    pass
-
-
-class SinkInterface(metaclass=abc.ABCMeta):
-    """
-    def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-    """
-
-    @classmethod
-    def __subclasshook__(cls, subclass):
-        return (
-            hasattr(subclass, "put")
-            and callable(subclass.put)
-            and hasattr(subclass, "delete")
-            and callable(subclass.delete)
-        ) or NotImplemented
-
-
-@SinkInterface.register
-class Sink(abc.ABC):
-    """abstract class defining methods ('put' and 'delete') to manage persistence of data somewhere as defined by implementing classes"""
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self._config = config
-
-    @abc.abstractmethod
-    def put(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        raise NotImplementedError()
-
-    @abc.abstractmethod
-    def delete(self, context: Optional[Dict[str, Any]] = None):
-        raise NotImplementedError()
-
-
-@SinkInterface.register
-class SinkChain(Chain, abc.ABC):
-    def execute(self, context: Optional[Dict[str, Any]] = None) -> Any:
-        return self.put(context=context)
tgedr/dataops/source/abstract_s3_file_source.py
DELETED
@@ -1,43 +0,0 @@
-from abc import ABC
-import logging
-from typing import Any, Dict, List, Optional
-
-from tgedr.dataops.commons.s3_connector import S3Connector
-from tgedr.dataops.commons.utils_fs import process_s3_url
-from tgedr.dataops.source.source import Source, SourceException
-
-
-logger = logging.getLogger()
-
-
-class AbstractS3FileSource(Source, S3Connector, ABC):
-    """abstract class used to read file sources from s3"""
-
-    CONTEXT_KEY_URL = "url"
-    CONTEXT_KEY_SUFFIX = "suffix"
-
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        Source.__init__(self, config=config)
-        S3Connector.__init__(self)
-
-    def list(self, context: Optional[Dict[str, Any]] = None) -> List[str]:
-        logger.info(f"[list|in] ({context})")
-
-        result: List[str] = []
-        if self.CONTEXT_KEY_URL not in context:
-            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
-
-        protocol, bucket, key = process_s3_url(context[self.CONTEXT_KEY_URL])
-
-        objs = self._client.list_objects_v2(Bucket=bucket, Prefix=key)
-        result = [
-            (protocol + bucket + "/" + entry["Key"]) for entry in objs["Contents"] if not (entry["Key"]).endswith("/")
-        ]
-
-        if self.CONTEXT_KEY_SUFFIX in context:
-            suffix: str = context[self.CONTEXT_KEY_SUFFIX]
-            result = [f for f in result if f.endswith(suffix)]
-
-        logger.debug(f"[list|out] => {result}")
-        logger.info(f"[list|out] => result len: {len(result)}")
-        return result