tgedr-dataops-ext 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgedr_dataops_ext/__init__.py +0 -0
- tgedr_dataops_ext/commons/dataset.py +27 -0
- tgedr_dataops_ext/commons/metadata.py +236 -0
- tgedr_dataops_ext/commons/utils_spark.py +133 -0
- tgedr_dataops_ext/quality/pyspark_validation.py +22 -0
- tgedr_dataops_ext/source/delta_table_source.py +57 -0
- tgedr_dataops_ext/source/local_delta_table.py +58 -0
- tgedr_dataops_ext/source/s3_delta_table.py +83 -0
- tgedr_dataops_ext/store/spark_delta.py +515 -0
- tgedr_dataops_ext-0.0.1.dist-info/METADATA +60 -0
- tgedr_dataops_ext-0.0.1.dist-info/RECORD +13 -0
- tgedr_dataops_ext-0.0.1.dist-info/WHEEL +5 -0
- tgedr_dataops_ext-0.0.1.dist-info/top_level.txt +1 -0
tgedr_dataops_ext/__init__.py
File without changes (empty file).
tgedr_dataops_ext/commons/dataset.py
@@ -0,0 +1,27 @@
+"""Dataset module for wrapping dataframes with metadata.
+
+This module provides:
+- Dataset: an immutable dataclass that combines a DataFrame with Metadata.
+"""
+
+from dataclasses import dataclass
+import json
+from pyspark.sql import DataFrame
+from tgedr_dataops_ext.commons.metadata import Metadata
+
+
+@dataclass(frozen=True)
+class Dataset:
+    """Utility immutable class to wrap up a dataframe along with metadata."""
+
+    __slots__ = ["data", "metadata"]
+    metadata: Metadata
+    data: DataFrame
+
+    def as_dict(self) -> dict:
+        """Serialize the dataset as a dictionary."""
+        return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
+
+    def __str__(self) -> str:
+        """Serialize the dataset as a json string."""
+        return json.dumps(self.as_dict())
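Not part of the package: a minimal usage sketch of the `Dataset` wrapper above. The local Spark session setup and the column values are illustrative assumptions, not taken from the package.

```python
# illustrative sketch -- assumes a plain local Spark session is enough for the demo
from pyspark.sql import SparkSession

from tgedr_dataops_ext.commons.dataset import Dataset
from tgedr_dataops_ext.commons.metadata import Metadata

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# framing and sources are optional and left as None here
meta = Metadata(name="toy_dataset", version="1", framing=None, sources=None)
ds = Dataset(metadata=meta, data=df)

print(ds.as_dict())  # dict with the metadata plus a string form of the dataframe
print(ds)            # the same content serialized as a json string
```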
tgedr_dataops_ext/commons/metadata.py
@@ -0,0 +1,236 @@
+"""Module for dataset metadata classes."""
+
+from dataclasses import dataclass
+import json
+
+
+@dataclass(frozen=True)
+class FieldFrame:
+    """Class depicting a field values range, to be used in metadata.
+
+    Parameters
+    ----------
+    field : str
+        The name of the field.
+    lower : Union[int, str, float]
+        Field lower bound.
+    upper : Union[int, str, float]
+        Field upper bound.
+
+    """
+
+    __slots__ = ["field", "lower", "upper"]
+    field: str
+    lower: int | str | float
+    upper: int | str | float
+
+    def as_dict(self) -> dict[str, int | str | float]:
+        """Convert the FieldFrame to a dictionary representation.
+
+        Returns
+        -------
+        dict[str, int | str | float]
+            A dictionary containing the field name, lower bound, and upper bound.
+        """
+        return {"field": self.field, "lower": self.lower, "upper": self.upper}
+
+    @staticmethod
+    def from_str(src: str) -> "FieldFrame":
+        """Create a FieldFrame instance from a JSON string.
+
+        Parameters
+        ----------
+        src : str
+            A JSON string representation of a FieldFrame.
+
+        Returns
+        -------
+        FieldFrame
+            A new FieldFrame instance created from the JSON string.
+        """
+        r = json.loads(src)
+        field = r["field"]
+        lower = r["lower"]
+        upper = r["upper"]
+        return FieldFrame(field=field, lower=lower, upper=upper)
+
+    def __str__(self) -> str:
+        """Return a JSON string representation of the FieldFrame."""
+        return json.dumps(self.as_dict())
+
+    def __eq__(self, other: "FieldFrame") -> bool:
+        """Check equality between two FieldFrame instances."""
+        return self.field == other.field and self.lower == other.lower and self.upper == other.upper
+
+    def __gt__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is greater than another FieldFrame."""
+        return self.field > other.field or (
+            self.field == other.field
+            and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
+        )
+
+    def __ne__(self, other: "FieldFrame") -> bool:
+        """Check inequality between two FieldFrame instances."""
+        return not other == self
+
+    def __ge__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is greater than or equal to another FieldFrame."""
+        return other == self or self > other
+
+    def __le__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is less than or equal to another FieldFrame."""
+        return other == self or self < other
+
+    def __lt__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is less than another FieldFrame."""
+        return other > self
+
+
+@dataclass(frozen=True)
+class Metadata:
+    """Immutable class depicting dataset metadata.
+
+    Parameters
+    ----------
+    name : str
+        The name of the dataset.
+    version : Optional[str]
+        Version of this dataset, if available.
+    framing : Optional[List[FieldFrame]]
+        Multiple field frames.
+    sources : Optional[List["Metadata"]]
+        Metadatas related to the datasets sourcing this one.
+
+    """
+
+    __slots__ = ["framing", "name", "sources", "version"]
+    name: str
+    version: str | None
+    framing: list[FieldFrame] | None
+    sources: list["Metadata"] | None
+
+    def as_dict(self) -> dict:
+        """Convert the Metadata to a dictionary representation.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the metadata fields including name, version,
+            framing, and sources if they are not None.
+        """
+        result = {"name": self.name}
+        if self.version is not None:
+            result["version"] = self.version
+        if self.framing is not None:
+            result["framing"] = []
+            for f in self.framing:
+                (result["framing"]).append(f.as_dict())
+        if self.sources is not None:
+            result["sources"] = []
+            for source in self.sources:
+                (result["sources"]).append(source.as_dict())
+
+        return result
+
+    def __str__(self) -> str:
+        """Return a JSON string representation of the Metadata."""
+        return json.dumps(self.as_dict())
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality between two Metadata instances."""
+        return (
+            self.name == other.name
+            and (
+                (self.version is None and other.version is None)
+                or ((self.version is not None and other.version is not None) and self.version == other.version)
+            )
+            and (
+                (self.framing is None and other.framing is None)
+                or (
+                    (self.framing is not None and other.framing is not None)
+                    and sorted(self.framing) == sorted(other.framing)
+                )
+            )
+            and (
+                (self.sources is None and other.sources is None)
+                or (
+                    (self.sources is not None and other.sources is not None)
+                    and sorted(self.sources) == sorted(other.sources)
+                )
+            )
+        )
+
+    def __gt__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is greater than another Metadata."""
+        return self.name > other.name or (
+            self.name == other.name
+            and (
+                ((self.version is not None and other.version is not None) and (self.version > other.version))
+                or (self.version is not None and other.version is None)
+                or (
+                    (
+                        (self.framing is not None and other.framing is not None)
+                        and (sorted(self.framing) > sorted(other.framing))
+                    )
+                    or (self.framing is not None and other.framing is None)
+                    or (
+                        (
+                            (self.sources is not None and other.sources is not None)
+                            and (sorted(self.sources) > sorted(other.sources))
+                        )
+                        or (self.sources is not None and other.sources is None)
+                    )
+                )
+            )
+        )
+
+    def __ne__(self, other: "Metadata") -> bool:
+        """Check inequality between two Metadata instances."""
+        return not other == self
+
+    def __ge__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is greater than or equal to another Metadata."""
+        return other == self or self > other
+
+    def __le__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is less than or equal to another Metadata."""
+        return other == self or self < other
+
+    def __lt__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is less than another Metadata."""
+        return other > self
+
+    @staticmethod
+    def from_str(src: str) -> "Metadata":
+        """Create a Metadata instance from a JSON string.
+
+        Parameters
+        ----------
+        src : str
+            A JSON string representation of a Metadata object.
+
+        Returns
+        -------
+        Metadata
+            A new Metadata instance created from the JSON string.
+        """
+        r = json.loads(src)
+        name = r["name"]
+        version = r.get("version", None)
+
+        framing = None
+        framing_entries = r.get("framing", None)
+        if framing_entries is not None:
+            framing = []
+            for framing_entry in framing_entries:
+                framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
+
+        sources = None
+        sources_entries = r.get("sources", None)
+        if sources_entries is not None:
+            sources = []
+            for source_entry in sources_entries:
+                source_entry_as_str = json.dumps(source_entry)
+                sources.append(Metadata.from_str(source_entry_as_str))
+
+        return Metadata(name=name, version=version, framing=framing, sources=sources)
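Not part of the package: a short round-trip sketch for the `FieldFrame` and `Metadata` classes above; names and values are illustrative.

```python
# illustrative sketch of the json round-trip provided by as_dict/__str__/from_str
from tgedr_dataops_ext.commons.metadata import FieldFrame, Metadata

frame = FieldFrame(field="year", lower=2020, upper=2024)
meta = Metadata(name="prices", version="3", framing=[frame], sources=None)

as_json = str(meta)                    # json.dumps of as_dict(); sources is omitted because it is None
restored = Metadata.from_str(as_json)

assert restored == meta                # __eq__ compares name, version, framing and sources
assert restored.framing[0] == frame
```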
tgedr_dataops_ext/commons/utils_spark.py
@@ -0,0 +1,133 @@
+"""Spark utility functions and classes.
+
+This module provides:
+- UtilsSpark: A utility class with handy functions to work with Spark sessions,
+  including creating local and remote sessions with Delta Lake support,
+  and building PySpark schemas from data type dictionaries.
+"""
+
+import logging
+import os
+from typing import ClassVar
+from pyspark.sql import SparkSession
+from pyspark import SparkConf
+from pyspark.sql import types as T  # noqa: N812
+from pyspark.context import SparkContext
+
+
+logger = logging.getLogger(__name__)
+
+
+class UtilsSpark:
+    """class with handy functions to work with spark."""
+
+    __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
+    __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
+    __DTYPES_MAP: ClassVar[dict[str, type]] = {
+        "bigint": T.LongType,
+        "string": T.StringType,
+        "double": T.DoubleType,
+        "int": T.IntegerType,
+        "boolean": T.BooleanType,
+        "timestamp": T.TimestampType,
+        "date": T.DateType,
+    }
+
+    @staticmethod
+    def get_local_spark_session(config: dict | None = None) -> SparkSession:
+        """Get a local Spark session configured for Delta Lake.
+
+        Parameters
+        ----------
+        config : dict | None, optional
+            Additional Spark configuration parameters, by default None.
+
+        Returns
+        -------
+        SparkSession
+            A configured local Spark session instance with Delta Lake support.
+        """
+        logger.debug(f"[get_local_spark_session|in] ({config})")
+        # PySpark 3.4 uses Scala 2.12, so we need delta-core_2.12
+        builder = (
+            SparkSession.builder.config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
+            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+            .config("spark.driver.host", "localhost")
+        )
+
+        if config is not None:
+            for k, v in config.items():
+                builder.config(k, v)
+
+        spark = builder.getOrCreate()
+
+        logger.debug(f"[get_local_spark_session|out] => {spark}")
+        return spark
+
+    @staticmethod
+    def get_spark_session(config: dict | None = None) -> SparkSession:
+        """Get a Spark session based on the environment configuration.
+
+        Parameters
+        ----------
+        config : dict | None, optional
+            Additional Spark configuration parameters, by default None.
+
+        Returns
+        -------
+        SparkSession
+            A configured Spark session instance.
+        """
+        logger.debug(f"[get_spark_session|in] ({config})")
+
+        if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
+            spark: SparkSession = UtilsSpark.get_local_spark_session(config)
+        else:
+            if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
+                active_session = SparkSession.getActiveSession()
+            else:
+                from awsglue.context import GlueContext  # type: ignore # pragma: no cover # noqa: PGH003
+
+                glueContext = GlueContext(SparkContext.getOrCreate())  # pragma: no cover # noqa: N806
+                active_session = glueContext.spark_session  # pragma: no cover
+
+            spark_config = SparkConf()
+
+            if active_session is not None:
+                former_config = active_session.sparkContext.getConf().getAll()
+                for entry in former_config:
+                    spark_config.set(entry[0], entry[1])
+                if config is not None:
+                    for k, v in config.items():
+                        spark_config.set(k, v)
+                spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
+            else:
+                spark: SparkSession = SparkSession.builder.getOrCreate()
+
+        logger.debug(f"[get_spark_session|out] => {spark}")
+        return spark
+
+    @staticmethod
+    def build_schema_from_dtypes(dtypes_schema: dict[str, str]) -> T.StructType:
+        """Build a PySpark StructType schema from a dictionary of data types.
+
+        Parameters
+        ----------
+        dtypes_schema : dict[str, str]
+            A dictionary mapping field names to their corresponding data type strings.
+            Supported types: 'bigint', 'string', 'double', 'int', 'boolean', 'timestamp', 'date'.
+
+        Returns
+        -------
+        T.StructType
+            A PySpark StructType schema with fields defined by the input dictionary.
+        """
+        logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
+        result = T.StructType()
+        for field, dtype in dtypes_schema.items():
+            new_type = UtilsSpark.__DTYPES_MAP[dtype]
+            result.add(field, new_type(), True)  # noqa: FBT003
+
+        logger.info(f"[build_schema_from_dtypes|out] => {result}")
+        return result
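Not part of the package: a sketch of the `UtilsSpark` helpers above, assuming `PYSPARK_IS_LOCAL=1` is set so that `get_spark_session()` delegates to the local, Delta-enabled builder; field names and values are illustrative.

```python
# illustrative sketch -- forces the local code path of get_spark_session()
import os

from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

os.environ["PYSPARK_IS_LOCAL"] = "1"
spark = UtilsSpark.get_spark_session()

# supported dtype keys: bigint, string, double, int, boolean, timestamp, date
schema = UtilsSpark.build_schema_from_dtypes({"id": "bigint", "name": "string", "score": "double"})
df = spark.createDataFrame([(1, "a", 0.5)], schema=schema)
df.printSchema()
```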
tgedr_dataops_ext/quality/pyspark_validation.py
@@ -0,0 +1,22 @@
+"""Pyspark DataFrame validation implementation module.
+
+This module provides the Pyspark-specific implementation of Great Expectations validation.
+"""
+
+from great_expectations.execution_engine import ExecutionEngine
+from great_expectations.execution_engine import SparkDFExecutionEngine
+from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
+
+
+class PysparkValidation(GreatExpectationsValidation):
+    """Pyspark DataFrame validation implementation."""
+
+    def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
+        """Get the execution engine used by the validation implementation.
+
+        Returns
+        -------
+        ExecutionEngine
+            The execution engine instance.
+        """
+        return SparkDFExecutionEngine(batch_data_dict=batch_data_dict)
tgedr_dataops_ext/source/delta_table_source.py
@@ -0,0 +1,57 @@
+"""Delta Lake table source implementation.
+
+This module provides:
+- DeltaTableSource: abstract base class for reading delta lake format datasets
+  and returning pandas DataFrames with configurable storage options.
+"""
+
+from abc import ABC, abstractmethod
+import logging
+from typing import Any
+from pandas import DataFrame
+from deltalake import DeltaTable
+from deltalake.exceptions import TableNotFoundError
+
+from tgedr_dataops_abs.source import Source, SourceException, NoSourceException
+
+
+logger = logging.getLogger()
+
+
+class DeltaTableSource(Source, ABC):
+    """abstract class used to read delta lake format datasets returning a pandas dataframe."""
+
+    CONTEXT_KEY_URL: str = "url"
+    CONTEXT_KEY_COLUMNS: str = "columns"
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the DeltaTableSource with optional configuration."""
+        super().__init__(config=config)
+
+    @property
+    @abstractmethod
+    def _storage_options(self) -> Any:
+        return None  # pragma: no cover
+
+    def get(self, context: dict[str, Any] | None = None) -> DataFrame:
+        """Retrieves a delta lake table."""
+        logger.info(f"[get|in] ({context})")
+        result: DataFrame = None
+
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        columns: list[str] = None
+        if self.CONTEXT_KEY_COLUMNS in context:
+            columns = context[self.CONTEXT_KEY_COLUMNS]
+
+        try:
+            delta_table = DeltaTable(
+                table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
+            )
+            result = delta_table.to_pandas(columns=columns)
+        except TableNotFoundError as exc:
+            raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}") from exc
+
+        logger.info(f"[get|out] => {result}")
+        return result
tgedr_dataops_ext/source/local_delta_table.py
@@ -0,0 +1,58 @@
+"""Local Delta Table source implementation.
+
+This module provides the LocalDeltaTable class for reading Delta Lake format datasets
+from the local filesystem using Python only (PySpark not required), returning pandas DataFrames.
+"""
+
+import logging
+import re
+from typing import Any
+import glob
+from pathlib import Path
+
+from tgedr_dataops_ext.source.delta_table_source import DeltaTableSource
+from tgedr_dataops_abs.source import SourceException
+
+
+logger = logging.getLogger()
+
+
+class LocalDeltaTable(DeltaTableSource):
+    """class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe."""
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize LocalDeltaTable with optional configuration.
+
+        Args:
+            config: Optional configuration dictionary for the delta table source.
+        """
+        super().__init__(config=config)
+
+    @property
+    def _storage_options(self) -> Any:
+        return None
+
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """Lists the available delta lake datasets in the url provided."""
+        logger.info(f"[list|in] ({context})")
+
+        result: list[str] = []
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        url = context[self.CONTEXT_KEY_URL]
+        if not Path(url).is_dir():
+            raise SourceException(f"not a delta lake url: {url}")
+
+        matches: set[str] = set()
+        pattern: str = f".*{url}/(.*)/_delta_log/.*"
+        for entry in glob.iglob(url + "**/**", recursive=True):  # noqa: PTH207
+            match = re.search(pattern, entry)
+            if match:
+                matches.add(match.group(1))
+
+        result = list(matches)
+
+        logger.info(f"[list] result: {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
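Not part of the package: a sketch of reading local delta tables with `LocalDeltaTable`; the `/tmp/lake` folder, table name and column names are hypothetical and assume delta datasets were already written there.

```python
# illustrative sketch -- /tmp/lake and the table/column names are placeholders
from tgedr_dataops_ext.source.local_delta_table import LocalDeltaTable

source = LocalDeltaTable()

# context keys come from DeltaTableSource: "url" is required, "columns" is optional
tables = source.list(context={"url": "/tmp/lake"})
pdf = source.get(context={"url": "/tmp/lake/prices", "columns": ["id", "price"]})  # pandas DataFrame
print(tables, pdf.shape)
```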
tgedr_dataops_ext/source/s3_delta_table.py
@@ -0,0 +1,83 @@
+"""S3 Delta Table source module for reading Delta Lake format datasets from S3.
+
+This module provides the S3DeltaTable class for reading Delta Lake format datasets
+from S3 buckets using Python only (no PySpark required), returning pandas DataFrames.
+"""
+
+import logging
+import re
+from typing import Any
+
+from tgedr_dataops.commons.s3_connector import S3Connector
+from tgedr_dataops.commons.utils_fs import remove_s3_protocol
+from tgedr_dataops_ext.source.delta_table_source import DeltaTableSource
+from tgedr_dataops_abs.source import SourceException
+
+
+logger = logging.getLogger()
+
+
+class S3DeltaTable(DeltaTableSource, S3Connector):
+    """class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe."""
+
+    CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "AWS_ACCESS_KEY_ID"
+    CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "AWS_SECRET_ACCESS_KEY"  # noqa: S105
+    CONFIG_KEY_AWS_SESSION_TOKEN: str = "AWS_SESSION_TOKEN"  # noqa: S105
+    CONFIG_KEY_AWS_REGION: str = "AWS_REGION"
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize the S3DeltaTable with optional configuration.
+
+        Args:
+            config: Optional dictionary containing AWS credentials and configuration.
+        """
+        DeltaTableSource.__init__(self, config=config)
+        S3Connector.__init__(self)
+
+    @property
+    def _storage_options(self) -> Any:
+        result = None
+        if (self._config is not None) and all(
+            element in list(self._config.keys())
+            for element in [
+                self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
+                self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
+                self.CONFIG_KEY_AWS_SESSION_TOKEN,
+                self.CONFIG_KEY_AWS_REGION,
+            ]
+        ):
+            result = {
+                "AWS_ACCESS_KEY_ID": self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
+                "AWS_SECRET_ACCESS_KEY": self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
+                "AWS_SESSION_TOKEN": self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
+                "AWS_REGION": self._config[self.CONFIG_KEY_AWS_REGION],
+            }
+
+        return result
+
+    def list(self, context: dict[str, Any] | None = None) -> list[str]:
+        """Lists the available delta lake datasets in the url provided."""
+        logger.info(f"[list|in] ({context})")
+
+        result: list[str] = []
+        if self.CONTEXT_KEY_URL not in context:
+            raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
+
+        path = remove_s3_protocol(context[self.CONTEXT_KEY_URL])
+        path_elements = path.split("/")
+        bucket = path_elements[0]
+        key = "/".join(path_elements[1:])
+
+        matches: set[str] = set()
+        pattern: str = f".*{key}/(.*)/_delta_log/.*"
+        for entry in self._client.list_objects_v2(Bucket=bucket, Prefix=key)["Contents"]:
+            output_key: str = entry["Key"]
+            match = re.search(pattern, output_key)
+            if match:
+                matches.add(f"{key}/{match.group(1)}")
+
+        result = list(matches)
+
+        logger.info(f"[list] result: {result}")
+        logger.info(f"[list|out] => result len: {len(result)}")
+        return result
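Not part of the package: the same read pattern against S3 with `S3DeltaTable`; bucket, prefix and credential values are placeholders. When all four AWS keys are present in the config they are forwarded to `deltalake` as storage options.

```python
# illustrative sketch -- placeholders only, do not hardcode real credentials
from tgedr_dataops_ext.source.s3_delta_table import S3DeltaTable

config = {
    "AWS_ACCESS_KEY_ID": "<access-key-id>",
    "AWS_SECRET_ACCESS_KEY": "<secret-access-key>",
    "AWS_SESSION_TOKEN": "<session-token>",
    "AWS_REGION": "eu-west-1",
}
source = S3DeltaTable(config=config)

tables = source.list(context={"url": "s3://my-bucket/lake"})
pdf = source.get(context={"url": "s3://my-bucket/lake/prices"})  # pandas DataFrame
```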
tgedr_dataops_ext/store/spark_delta.py
@@ -0,0 +1,515 @@
+"""Spark Delta Lake store implementation module.
+
+This module provides a Store implementation for working with Delta Lake format
+using Apache Spark. It includes the SparkDeltaStore class which supports:
+- Reading and writing data in Delta Lake format
+- Versioning and time travel queries
+- Update/merge operations (upserts)
+- Partitioning and schema evolution
+- Retention policies for log and deleted files
+- Metadata management
+"""
+
+from abc import ABC
+import dataclasses
+import logging
+from typing import Any
+from datetime import datetime
+from pyspark.sql import DataFrame
+from delta.tables import DeltaTable
+from pyspark.sql import functions as f
+from pyspark.sql import types as T  # noqa: N812
+from pyspark.sql.utils import AnalysisException
+from pyspark.sql.functions import monotonically_increasing_id
+from tgedr_dataops_abs.store import NoStoreException, Store, StoreException
+from tgedr_dataops_ext.commons.metadata import Metadata
+from tgedr_dataops_ext.commons.utils_spark import UtilsSpark
+
+logger = logging.getLogger(__name__)
+
+
+class SparkDeltaStore(Store, ABC):
+    """A store implementation using Spark Delta Lake format.
+
+    This class provides methods for reading, writing, updating, and deleting
+    data in Delta Lake format using Apache Spark. It supports features like
+    versioning, partitioning, schema evolution, and retention policies.
+
+    Attributes
+    ----------
+    config : dict[str, Any] | None
+        Optional configuration dictionary for the store.
+
+    Methods
+    -------
+    get(key: str, version: Optional[str] = None, **kwargs) -> DataFrame
+        Retrieve data from the specified key/path.
+    save(df: DataFrame, key: str, append: bool = False, ...) -> None
+        Save a DataFrame to the specified key/path.
+    update(df: Any, key: str, match_fields: list[str], ...) -> None
+        Update or insert data using merge operation.
+    delete(key: str, condition: Union[f.Column, str, None] = None, **kwargs) -> None
+        Delete data from the specified key/path.
+
+    """
+
+    def __init__(self, config: dict[str, Any] | None = None) -> None:
+        """Initialize SparkDeltaStore with optional configuration.
+
+        Args:
+            config: Optional configuration dictionary for the store.
+        """
+        Store.__init__(self, config)
+
+    def get(self, key: str, version: str | None = None, **kwargs) -> DataFrame:  # noqa: ANN003, ARG002
+        """Retrieve data from the specified key/path.
+
+        Parameters
+        ----------
+        key : str
+            The path to the Delta table to read from.
+        version : str | None, optional
+            The version of the Delta table to read, is None by default.
+        **kwargs
+            Additional keyword arguments (currently unused).
+
+        Returns
+        -------
+        DataFrame
+            The Spark DataFrame containing the data from the Delta table.
+
+        Raises
+        ------
+        NoStoreException
+            If no data is found at the specified key/path.
+        """
+        logger.info(f"[get|in] ({key}, {version})")
+
+        table = self._get_table(path=key)
+        if table is None:
+            raise NoStoreException(f"[get] couldn't find data in key: {key}")
+
+        reader = UtilsSpark.get_spark_session().read.format("delta")
+        if version is not None:
+            reader = reader.option("versionAsOf", version)
+
+        result = reader.load(key)
+
+        logger.info("[get_df|out]")
+        return result
+
+    def __get_deletion_criteria(self, df: DataFrame) -> Any:
+        logger.debug("[__get_deletion_criteria|in])")
+        fields = df.dtypes
+        numerics = [
+            x
+            for x in fields
+            if x[1] in ["bigint", "int", "double", "float", "long", "decimal.Decimal"] or (x[1][:7]) == "decimal"
+        ]
+        dates = [x for x in fields if (x[1]) in ["datetime", "datetime.datetime"]]
+        textuals = [x for x in fields if x[1] in ["string"]]
+        if 0 < len(numerics):
+            column = numerics[0][0]
+            result = (f.col(column) > 0) | (f.col(column) <= 0)
+        elif 0 < len(dates):
+            column = dates[0][0]
+            now = datetime.now(tz=datetime.UTC)
+            result = (f.col(column) > now) | (f.col(column) <= now)
+        elif 0 < len(textuals):
+            column = textuals[0][0]
+            result = (f.col(column) > "a") | (f.col(column) <= "a")
+        else:
+            raise StoreException(
+                "[__get_deletion_criteria] failed to figure out column types handy to create a full deletion criteria"
+            )
+
+        logger.debug(f"[__get_deletion_criteria|out] = {result}")
+        return result
+
+    def delete(self, key: str, condition: f.Column | str | None = None, **kwargs) -> None:  # noqa: ANN003, ARG002
+        """Delete data from the specified key/path.
+
+        Parameters
+        ----------
+        key : str
+            The path to the Delta table to delete from.
+        condition : f.Column | str | None, optional
+            The condition to filter rows for deletion. If None, deletes all rows.
+        **kwargs
+            Additional keyword arguments (currently unused).
+
+        Raises
+        ------
+        StoreException
+            If deletion fails or column types cannot be determined for full deletion.
+        """
+        logger.info(f"[delete|in] ({key}, {condition})")
+
+        spark = UtilsSpark.get_spark_session()
+        """
+        is_s3_operation = True if key.startswith("s3") else False
+        if is_s3_operation:
+        """
+        delta_table = DeltaTable.forPath(spark, key)
+        if condition is None:
+            condition = self.__get_deletion_criteria(delta_table.toDF())
+        delta_table.delete(condition=condition)
+        """
+        else:  # local development mostly for temporary or test purposes
+            spark_fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
+            # get spark context path
+            spark_path = spark._jvm.org.apache.hadoop.fs.Path(key)
+            logger.info(f"[delete] spark path is {spark_path}")
+            try:
+                if spark_fs.exists(spark_path):
+                    spark_fs.delete(spark_path, True)
+            except AnalysisException as x:
+                raise StoreException(f"[delete] couldn't do it on key {key}: {x}")
+        """
+        logger.info("[delete|out]")
+
+    def save(
+        self,
+        df: DataFrame,
+        key: str,
+        append: bool = False,
+        partition_fields: list[str] | None = None,
+        metadata: Metadata | None = None,
+        retention_days: int = 7,
+        deleted_retention_days: int = 7,
+        column_descriptions: dict[str, str] | None = None,
+        table_name: str | None = None,
+        **kwargs,  # noqa: ANN003
+    ) -> None:
+        """Save a DataFrame to the specified key/path in Delta format.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The Spark DataFrame to save.
+        key : str
+            The path where the Delta table will be saved.
+        append : bool, optional
+            Whether to append to existing data, is False by default.
+        partition_fields : list[str] | None, optional
+            List of column names to partition the table by, is None by default.
+        metadata : Metadata | None, optional
+            Optional metadata to attach to the table, is None by default.
+        retention_days : int, optional
+            Number of days to retain log files, is 7 by default.
+        deleted_retention_days : int, optional
+            Number of days to retain deleted files, is 7 by default.
+        column_descriptions : dict[str, str] | None, optional
+            Dictionary mapping column names to their descriptions, is None by default.
+        table_name : str | None, optional
+            Optional table name in format 'db.table', is None by default.
+        **kwargs
+            Additional keyword arguments.
+        """
+        logger.info(
+            f"[save|in] ({df}, {key}, {append}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {column_descriptions}, {table_name}, {kwargs})"
+        )
+
+        if column_descriptions is not None:
+            df = self._set_column_descriptions(df, column_descriptions)
+
+        writer = df.write.format("delta").mode("append") if append else df.write.format("delta").mode("overwrite")
+
+        if partition_fields is not None:
+            table = self._get_table(path=key)
+            if table is not None:
+                self._set_table_partitions(path=key, partition_fields=partition_fields)
+            writer = writer.partitionBy(*partition_fields)
+
+        if self._has_schema_changed(path=key, df=df):
+            writer = writer.option("overwriteSchema", "true")
+
+        if metadata:
+            writer = writer.option("userMetadata", metadata)
+
+        if table_name is not None:
+            # assume we have db.table
+            db = table_name.split(".")[0]
+            UtilsSpark.get_spark_session().sql(f"CREATE DATABASE IF NOT EXISTS {db}")
+            writer = writer.option("path", key).saveAsTable(table_name)
+        else:
+            writer.save(key)
+
+        logger.info("[save] optimizing...")
+        table = self._get_table(path=key)
+
+        if retention_days is not None and deleted_retention_days is not None:
+            self.enforce_retention_policy(
+                path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
+            )
+        elif retention_days is not None:
+            self.enforce_retention_policy(path=key, retention_days=retention_days)
+
+        table.optimize().executeCompaction()
+
+        logger.info("[save|out]")
+
+    def update(
+        self,
+        df: Any,
+        key: str,
+        match_fields: list[str],
+        partition_fields: list[str] | None = None,
+        metadata: Metadata | None = None,
+        retention_days: int = 7,
+        deleted_retention_days: int = 7,
+        **kwargs,  # noqa: ANN003
+    ) -> None:
+        """Update or insert data using merge operation.
+
+        Parameters
+        ----------
+        df : Any
+            The DataFrame containing updates to merge.
+        key : str
+            The path to the Delta table to update.
+        match_fields : list[str]
+            List of column names to use for matching rows during merge.
+        partition_fields : list[str] | None, optional
+            List of column names to partition the table by, is None by default.
+        metadata : Metadata | None, optional
+            Optional metadata to attach to the table, is None by default.
+        retention_days : int, optional
+            Number of days to retain log files, is 7 by default.
+        deleted_retention_days : int, optional
+            Number of days to retain deleted files, is 7 by default.
+        **kwargs
+            Additional keyword arguments.
+        """
+        logger.info(
+            f"[update|in] ({df}, {key}, {match_fields}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {kwargs})"
+        )
+
+        table = self._get_table(path=key)
+        if table is None:
+            self.save(
+                df=df,
+                key=key,
+                partition_fields=partition_fields,
+                metadata=metadata,
+                retention_days=retention_days,
+                deleted_retention_days=deleted_retention_days,
+                **kwargs,
+            )
+        else:
+            if partition_fields is not None:
+                self._set_table_partitions(path=key, partition_fields=partition_fields)
+
+            match_clause = None
+            for field in match_fields:
+                match_clause = (
+                    f"current.{field} = updates.{field}"
+                    if match_clause is None
+                    else f"{match_clause} and current.{field} = updates.{field}"
+                )
+            logger.info(f"[update] match clause: {match_clause}")
+
+            # check if the df has all the required columns
+            # as we are upserting the updated columns coming in must at least match or exceed the current columns
+            for column in table.toDF().columns:
+                # we'll assume missing columns are nullable, typically metrics
+                if column not in df.columns:
+                    df = df.withColumn(column, f.lit(None).cast(T.StringType()))
+
+            table.alias("current").merge(
+                df.alias("updates"), match_clause
+            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
+
+            if retention_days is not None and deleted_retention_days is not None:
+                self.enforce_retention_policy(
+                    path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
+                )
+            elif retention_days is not None:
+                self.enforce_retention_policy(path=key, retention_days=retention_days)
+
+            table.optimize().executeCompaction()
+
+        logger.info("[UtilsDeltaTable.upsert|out]")
+
+    def enforce_retention_policy(self, path: str, retention_days: int = 7, deleted_retention_days: int = 7) -> None:
+        """Enforce retention policy on Delta table log and deleted files.
+
+        Parameters
+        ----------
+        path : str
+            The path to the Delta table.
+        retention_days : int, optional
+            Number of days to retain log files, is 7 by default.
+        deleted_retention_days : int, optional
+            Number of days to retain deleted files, is 7 by default.
+        """
+        logger.info(f"[enforce_retention_policy|in] ({path}, {retention_days}, {deleted_retention_days})")
+
+        retention = f"interval {retention_days} days"
+        deleted_retention = f"interval {deleted_retention_days} days"
+
+        UtilsSpark.get_spark_session().sql(
+            f"ALTER TABLE delta.`{path}` SET TBLPROPERTIES('delta.logRetentionDuration' = '{retention}', 'delta.deletedFileRetentionDuration' = '{deleted_retention}')"
+        )
+        logger.info("[enforce_retention_policy|out]")
+
+    def get_latest_table_versions(self, path: str, how_many: int = 1) -> list[str]:
+        """Checks the delta table history and retrieves the latest n versions.
+
+        Sorted from the newest to the oldest.
+        """
+        logger.info(f"[get_latest_table_versions|in] ({path}, {how_many})")
+        result: list[str] = []
+
+        table = self._get_table(path=path)
+        if table is not None:
+            history_rows = table.history().orderBy(f.desc("timestamp")).limit(how_many)
+            result = [str(x.version) for x in history_rows.collect()]
+
+        logger.info(f"[get_latest_table_versions|out] => {result}")
+        return result
+
+    def get_metadata(self, path: str, version: str | None = None) -> Metadata | None:
+        """Retrieve metadata from the Delta table at the specified path.
+
+        Raises
+        ------
+        NoStoreException
+        """
+        logger.info(f"[get_metadata|in] ({path}, {version})")
+        table = self._get_table(path)
+        if table is None:
+            raise NoStoreException(f"[get_metadata] no data in path: {path}")
+
+        result = None
+
+        df_history = table.history().filter(f.col("userMetadata").isNotNull())
+        if version is not None:
+            df_history = df_history.filter(f.col("version") <= int(version))
+
+        df_history = df_history.orderBy(f.col("version").desc())
+        if not df_history.isEmpty():
+            user_metadata = df_history.take(1)[0].userMetadata
+            result = Metadata.from_str(user_metadata)
+            if version is not None:
+                result = dataclasses.replace(result, version=version)
+
+        logger.info(f"[get_metadata|out] => ({result})")
+        return result
+
+    def _get_delta_log(self, path: str) -> DataFrame:
+        logger.info(f"[_get_delta_log|in] ({path})")
+
+        spark = UtilsSpark.get_spark_session()
+        jdf = (
+            spark._jvm.org.apache.spark.sql.delta.DeltaLog.forTable(spark._jsparkSession, path)  # noqa: SLF001
+            .snapshot()
+            .allFiles()
+            .toDF()
+        )
+        result = DataFrame(jdf, spark)
+
+        logger.info(f"[_get_delta_log|out] => {result}")
+        return result
+
+    def _get_table_partitions(self, path: str) -> list[str]:
+        logger.info(f"[_get_table_partitions|in] ({path})")
+        result: list[str] = []
+
+        delta_log: DataFrame = self._get_delta_log(path=path)
+        partition_keys = [
+            x.keys
+            for x in delta_log.select(f.map_keys(f.col("partitionValues")).alias("keys")).distinct().collect()
+            if 0 < len(x)
+        ]
+        if 0 < len(partition_keys):
+            result: list[str] = list({y for y in partition_keys for y in y})
+
+        logger.info(f"[_get_table_partitions|out] => {result}")
+        return result
+
+    def _vacuum_now(self, path: str) -> None:
+        logger.info("[_vacuum_now|in]")
+
+        spark = UtilsSpark.get_spark_session()
+        old_conf_value = spark.conf.get("spark.databricks.delta.retentionDurationCheck.enabled")
+        spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
+        DeltaTable.forPath(spark, path).vacuum(0)
+        spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", old_conf_value)
+
+        logger.info("[_vacuum_now|out]")
+
+    def _has_schema_changed(self, path: str, df: DataFrame) -> bool:
+        logger.info(f"[_has_schema_changed|in] ({path},{df})")
+        result: bool = False
+        table = self._get_table(path=path)
+        if table is not None:
+            result = table.toDF().schema != df.schema
+        logger.info(f"[_has_schema_changed|out] => {result}")
+        return result
+
+    def _set_table_partitions(self, path: str, partition_fields: list[str]) -> None:
+        logger.info(f"[_set_table_partitions|in] ({path},{partition_fields})")
+
+        spark = UtilsSpark.get_spark_session()
+        # let's check partition_cols
+        current_partition_fields = self._get_table_partitions(path=path)
+        shall_we_repartition = sorted(partition_fields) != sorted(current_partition_fields)
+
+        if shall_we_repartition:
+            logger.info("[_set_table_partitions] going to repartition")
+            new_df = spark.read.format("delta").load(path)
+            new_df.write.format("delta").mode("overwrite").partitionBy(*partition_fields).option(
+                "overwriteSchema", "true"
+            ).save(path)
+            self._vacuum_now(path)
+            logger.info(
+                f"[_set_table_partitions] changed partition cols from {current_partition_fields} to {partition_fields}"
+            )
+        logger.info("[_set_table_partitions|out]")
+
+    def _get_table(self, path: str) -> DeltaTable | None:
+        logger.debug(f"[_get_table|in] ({path})")
+        result: DeltaTable = None
+        try:
+            result: DeltaTable = DeltaTable.forPath(UtilsSpark.get_spark_session(), path)
+        except AnalysisException as ax:
+            logger.warning(f"[_get_table] couldn't load from {path}: {ax}")
+
+        logger.debug(f"[_get_table|out] => {result}")
+        return result
+
+    def set_column_comments(self, db: str, table: str, col_comments: dict[str, str]) -> None:
+        """Set comments for columns in a Delta table.
+
+        Parameters
+        ----------
+        db : str
+            The database name where the table is located.
+        table : str
+            The table name to set column comments for.
+        col_comments : dict[str, str]
+            Dictionary mapping column names to their comments.
+        """
+        logger.info(f"[set_column_comments|in] ({db}, {table}, {col_comments})")
+        spark = UtilsSpark.get_spark_session()
+
+        table_description: DataFrame = spark.sql(f"describe {db}.{table}").withColumn(
+            "set_column_comments_id", monotonically_increasing_id()
+        )
+        partition_info_id = (
+            table_description.filter(f.col("col_name") == "# Partition Information").collect()[0].set_column_comments_id
+        )
+
+        table_description = table_description.filter(
+            (f.col("set_column_comments_id") < f.lit(partition_info_id)) & (f.col("col_name") != "")
+        ).drop("set_column_comments_id")
+        rows = [r.asDict() for r in table_description.collect()]
+        for row in rows:
+            col = row["col_name"]
+            data_type = row["data_type"]
+            if col in col_comments:
+                new_comment = col_comments[col]
+                logger.info(f"[set_column_comments] setting new comment ({new_comment}) to column {col}")
+                spark.sql(f"ALTER TABLE {db}.{table} CHANGE COLUMN {col} {col} {data_type} COMMENT '{new_comment}'")
+
+        logger.info("[set_column_comments|out]")
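Not part of the package: a usage sketch for `SparkDeltaStore`. The class is declared abstract, so this assumes the `Store` base class in `tgedr_dataops_abs` leaves nothing else to implement; `MyDeltaStore`, the path and the sample data are hypothetical.

```python
# illustrative sketch -- MyDeltaStore and /tmp/delta/demo are assumptions, not package code
from tgedr_dataops_ext.commons.metadata import Metadata
from tgedr_dataops_ext.commons.utils_spark import UtilsSpark
from tgedr_dataops_ext.store.spark_delta import SparkDeltaStore


class MyDeltaStore(SparkDeltaStore):
    """Trivial concrete store for local experiments."""


store = MyDeltaStore()
spark = UtilsSpark.get_spark_session()
path = "/tmp/delta/demo"

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
store.save(df=df, key=path, metadata=Metadata(name="demo", version="1", framing=None, sources=None))

# upsert rows matched on "id", then read the table back
updates = spark.createDataFrame([(2, "bb"), (3, "c")], ["id", "label"])
store.update(df=updates, key=path, match_fields=["id"])
store.get(key=path).show()
```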
tgedr_dataops_ext-0.0.1.dist-info/METADATA
@@ -0,0 +1,60 @@
+Metadata-Version: 2.4
+Name: tgedr-dataops-ext
+Version: 0.0.1
+Summary: this is a template for a python package
+Author-email: developer <developer@email.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: pandas>=2.3.0
+Requires-Dist: deltalake~=0.16.4
+Requires-Dist: delta-spark~=2.4.0
+Requires-Dist: tgedr-dataops>=1.0.1
+Requires-Dist: pyspark~=3.4.0
+
+# tgedr-dataops-ext
+
+
+[](https://pypi.org/project/tgedr-dataops-ext/)
+
+
+data operations related code - extended
+
+## motivation
+*dataops-ext* is a library with tested and used code aligning on some standards regarding code structure and quality and to avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops* providing distributed processing features based on pyspark.
+
+## installation
+`pip install tgedr-dataops-ext`
+
+## package namespaces and its contents
+
+#### commons
+- __Dataset__: immutable class to wrap up a dataframe along with metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
+- __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
+- __UtilsSpark__: utility class to work with spark, mostly helping on creating a session ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py))
+
+#### quality
+- __PysparkValidation__ : __GreatExpectationsValidation__ implementation to validate pyspark dataframes with Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
+
+#### source
+
+- __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
+- __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
+- __S3DeltaTable__: __Source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py))
+
+
+#### store
+- __SparkDeltaStore__ : __Store__ implementation for pyspark distributed processing with delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py))
+
+
+
+## development
+- main requirements:
+  - _uv_
+  - _bash_
+- Clone the repository like this:
+
+``` bash
+git clone git@github.com:jtviegas/dataops-ext
+```
+- cd into the folder: `cd dataops-ext`
+- install requirements: `./helper.sh reqs`
tgedr_dataops_ext-0.0.1.dist-info/RECORD
@@ -0,0 +1,13 @@
+tgedr_dataops_ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tgedr_dataops_ext/commons/dataset.py,sha256=cYJkqm-w4VxwprUAgB8QyCLiJ-bnK7PLGZWdmkahbhM,794
+tgedr_dataops_ext/commons/metadata.py,sha256=UClsNoo9BUbz6Defp9Jd01k00h7GV9FhET6wXbScSLw,8106
+tgedr_dataops_ext/commons/utils_spark.py,sha256=NcJRlcGky0abc28hq6Lrn0-OY3DjTo8pCOGcWb4qaco,4886
+tgedr_dataops_ext/quality/pyspark_validation.py,sha256=ppnLWBDz2n2rchhyPnfUuNbZVw63dTvkdM56bYWeQYY,824
+tgedr_dataops_ext/source/delta_table_source.py,sha256=lcGgAKpNj8FNF8DTnMK_KAQ3GgEUxHU8sKcwjcCTR84,1961
+tgedr_dataops_ext/source/local_delta_table.py,sha256=3ffk3kWwLjaNLbSFU50R_fTd9WE9ItAaHR7Dv5pATLg,1975
+tgedr_dataops_ext/source/s3_delta_table.py,sha256=z4e5LTAAeqt7GvtNsaoc8z_sZhOBOjQikp35thodZwo,3266
+tgedr_dataops_ext/store/spark_delta.py,sha256=33zKhJYmlGq5mFvy5Yzr9z_zX5gzjZSFW8RuYWHcdjA,20805
+tgedr_dataops_ext-0.0.1.dist-info/METADATA,sha256=kBiDkP5rdWMJRJ_Xb8G6hjJYkAq89uf4BMFE5KCvM5Q,2686
+tgedr_dataops_ext-0.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+tgedr_dataops_ext-0.0.1.dist-info/top_level.txt,sha256=st7VbEQz5kyNJ8ww2zjv-uWKyKww1npcI-qr-XQClaY,18
+tgedr_dataops_ext-0.0.1.dist-info/RECORD,,
tgedr_dataops_ext-0.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+tgedr_dataops_ext
|