tgedr-dataops-ext 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,27 @@
1
+ """Dataset module for wrapping dataframes with metadata.
2
+
3
+ This module provides:
4
+ - Dataset: an immutable dataclass that combines a DataFrame with Metadata.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ import json
9
+ from pyspark.sql import DataFrame
10
+ from tgedr_dataops_ext.commons.metadata import Metadata
11
+
12
+
13
+ @dataclass(frozen=True)
14
+ class Dataset:
15
+ """Utility immutable class to wrap up a dataframe along with metadata."""
16
+
17
+ __slots__ = ["data", "metadata"]
18
+ metadata: Metadata
19
+ data: DataFrame
20
+
21
+ def as_dict(self) -> dict:
22
+ """Serialize the dataset as a dictionary."""
23
+ return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
24
+
25
+ def __str__(self) -> str:
26
+ """Serialize the dataset as a json string."""
27
+ return json.dumps(self.as_dict())
@@ -0,0 +1,236 @@
1
+ """Module for dataset metadata classes."""
2
+
3
+ from dataclasses import dataclass
4
+ import json
5
+
6
+
7
+ @dataclass(frozen=True)
8
+ class FieldFrame:
9
+ """Class depicting a field values range, to be used in metadata.
10
+
11
+ Parameters
12
+ ----------
13
+ field : str
14
+ The name of the field.
15
+ lower : Union[int, str, float]
16
+ Field lower bound.
17
+ upper : Union[int, str, float]
18
+ Field upper bound.
19
+
20
+ """
21
+
22
+ __slots__ = ["field", "lower", "upper"]
23
+ field: str
24
+ lower: int | str | float
25
+ upper: int | str | float
26
+
27
+ def as_dict(self) -> dict[str, int | str | float]:
28
+ """Convert the FieldFrame to a dictionary representation.
29
+
30
+ Returns
31
+ -------
32
+ dict[str, int | str | float]
33
+ A dictionary containing the field name, lower bound, and upper bound.
34
+ """
35
+ return {"field": self.field, "lower": self.lower, "upper": self.upper}
36
+
37
+ @staticmethod
38
+ def from_str(src: str) -> "FieldFrame":
39
+ """Create a FieldFrame instance from a JSON string.
40
+
41
+ Parameters
42
+ ----------
43
+ src : str
44
+ A JSON string representation of a FieldFrame.
45
+
46
+ Returns
47
+ -------
48
+ FieldFrame
49
+ A new FieldFrame instance created from the JSON string.
50
+ """
51
+ r = json.loads(src)
52
+ field = r["field"]
53
+ lower = r["lower"]
54
+ upper = r["upper"]
55
+ return FieldFrame(field=field, lower=lower, upper=upper)
56
+
57
+ def __str__(self) -> str:
58
+ """Return a JSON string representation of the FieldFrame."""
59
+ return json.dumps(self.as_dict())
60
+
61
+ def __eq__(self, other: "FieldFrame") -> bool:
62
+ """Check equality between two FieldFrame instances."""
63
+ return self.field == other.field and self.lower == other.lower and self.upper == other.upper
64
+
65
+ def __gt__(self, other: "FieldFrame") -> bool:
66
+ """Check if this FieldFrame is greater than another FieldFrame."""
67
+ return self.field > other.field or (
68
+ self.field == other.field
69
+ and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
70
+ )
71
+
72
+ def __ne__(self, other: "FieldFrame") -> bool:
73
+ """Check inequality between two FieldFrame instances."""
74
+ return not other == self
75
+
76
+ def __ge__(self, other: "FieldFrame") -> bool:
77
+ """Check if this FieldFrame is greater than or equal to another FieldFrame."""
78
+ return other == self or self > other
79
+
80
+ def __le__(self, other: "FieldFrame") -> bool:
81
+ """Check if this FieldFrame is less than or equal to another FieldFrame."""
82
+ return other == self or self < other
83
+
84
+ def __lt__(self, other: "FieldFrame") -> bool:
85
+ """Check if this FieldFrame is less than another FieldFrame."""
86
+ return other > self
87
+
88
+
89
+ @dataclass(frozen=True)
90
+ class Metadata:
91
+ """Immutable class depicting dataset metadata.
92
+
93
+ Parameters
94
+ ----------
95
+ name : str
96
+ The name of the dataset.
97
+ version : Optional[str]
98
+ Version of this dataset, if available.
99
+ framing : Optional[List[FieldFrame]]
100
+ Multiple field frames.
101
+ sources : Optional[List["Metadata"]]
102
+ Metadata entries of the datasets sourcing this one.
103
+
104
+ """
105
+
106
+ __slots__ = ["framing", "name", "sources", "version"]
107
+ name: str
108
+ version: str | None
109
+ framing: list[FieldFrame] | None
110
+ sources: list["Metadata"] | None
111
+
112
+ def as_dict(self) -> dict:
113
+ """Convert the Metadata to a dictionary representation.
114
+
115
+ Returns
116
+ -------
117
+ dict
118
+ A dictionary containing the metadata fields including name, version,
119
+ framing, and sources if they are not None.
120
+ """
121
+ result = {"name": self.name}
122
+ if self.version is not None:
123
+ result["version"] = self.version
124
+ if self.framing is not None:
125
+ result["framing"] = []
126
+ for f in self.framing:
127
+ (result["framing"]).append(f.as_dict())
128
+ if self.sources is not None:
129
+ result["sources"] = []
130
+ for source in self.sources:
131
+ (result["sources"]).append(source.as_dict())
132
+
133
+ return result
134
+
135
+ def __str__(self) -> str:
136
+ """Return a JSON string representation of the Metadata."""
137
+ return json.dumps(self.as_dict())
138
+
139
+ def __eq__(self, other: object) -> bool:
140
+ """Check equality between two Metadata instances."""
141
+ return (
142
+ self.name == other.name
143
+ and (
144
+ (self.version is None and other.version is None)
145
+ or ((self.version is not None and other.version is not None) and self.version == other.version)
146
+ )
147
+ and (
148
+ (self.framing is None and other.framing is None)
149
+ or (
150
+ (self.framing is not None and other.framing is not None)
151
+ and sorted(self.framing) == sorted(other.framing)
152
+ )
153
+ )
154
+ and (
155
+ (self.sources is None and other.sources is None)
156
+ or (
157
+ (self.sources is not None and other.sources is not None)
158
+ and sorted(self.sources) == sorted(other.sources)
159
+ )
160
+ )
161
+ )
162
+
163
+ def __gt__(self, other: "Metadata") -> bool:
164
+ """Check if this Metadata is greater than another Metadata."""
165
+ return self.name > other.name or (
166
+ self.name == other.name
167
+ and (
168
+ ((self.version is not None and other.version is not None) and (self.version > other.version))
169
+ or (self.version is not None and other.version is None)
170
+ or (
171
+ (
172
+ (self.framing is not None and other.framing is not None)
173
+ and (sorted(self.framing) > sorted(other.framing))
174
+ )
175
+ or (self.framing is not None and other.framing is None)
176
+ or (
177
+ (
178
+ (self.sources is not None and other.sources is not None)
179
+ and (sorted(self.sources) > sorted(other.sources))
180
+ )
181
+ or (self.sources is not None and other.sources is None)
182
+ )
183
+ )
184
+ )
185
+ )
186
+
187
+ def __ne__(self, other: "Metadata") -> bool:
188
+ """Check inequality between two Metadata instances."""
189
+ return not other == self
190
+
191
+ def __ge__(self, other: "Metadata") -> bool:
192
+ """Check if this Metadata is greater than or equal to another Metadata."""
193
+ return other == self or self > other
194
+
195
+ def __le__(self, other: "Metadata") -> bool:
196
+ """Check if this Metadata is less than or equal to another Metadata."""
197
+ return other == self or self < other
198
+
199
+ def __lt__(self, other: "Metadata") -> bool:
200
+ """Check if this Metadata is less than another Metadata."""
201
+ return other > self
202
+
203
+ @staticmethod
204
+ def from_str(src: str) -> "Metadata":
205
+ """Create a Metadata instance from a JSON string.
206
+
207
+ Parameters
208
+ ----------
209
+ src : str
210
+ A JSON string representation of a Metadata object.
211
+
212
+ Returns
213
+ -------
214
+ Metadata
215
+ A new Metadata instance created from the JSON string.
216
+ """
217
+ r = json.loads(src)
218
+ name = r["name"]
219
+ version = r.get("version", None)
220
+
221
+ framing = None
222
+ framing_entries = r.get("framing", None)
223
+ if framing_entries is not None:
224
+ framing = []
225
+ for framing_entry in framing_entries:
226
+ framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
227
+
228
+ sources = None
229
+ sources_entries = r.get("sources", None)
230
+ if sources_entries is not None:
231
+ sources = []
232
+ for source_entry in sources_entries:
233
+ source_entry_as_str = json.dumps(source_entry)
234
+ sources.append(Metadata.from_str(source_entry_as_str))
235
+
236
+ return Metadata(name=name, version=version, framing=framing, sources=sources)
@@ -0,0 +1,133 @@
1
+ """Spark utility functions and classes.
2
+
3
+ This module provides:
4
+ - UtilsSpark: A utility class with handy functions to work with Spark sessions,
5
+ including creating local and remote sessions with Delta Lake support,
6
+ and building PySpark schemas from data type dictionaries.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from typing import ClassVar
12
+ from pyspark.sql import SparkSession
13
+ from pyspark import SparkConf
14
+ from pyspark.sql import types as T # noqa: N812
15
+ from pyspark.context import SparkContext
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class UtilsSpark:
22
+ """class with handy functions to work with spark."""
23
+
24
+ __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
25
+ __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
26
+ __DTYPES_MAP: ClassVar[dict[str, type]] = {
27
+ "bigint": T.LongType,
28
+ "string": T.StringType,
29
+ "double": T.DoubleType,
30
+ "int": T.IntegerType,
31
+ "boolean": T.BooleanType,
32
+ "timestamp": T.TimestampType,
33
+ "date": T.DateType,
34
+ }
35
+
36
+ @staticmethod
37
+ def get_local_spark_session(config: dict | None = None) -> SparkSession:
38
+ """Get a local Spark session configured for Delta Lake.
39
+
40
+ Parameters
41
+ ----------
42
+ config : dict | None, optional
43
+ Additional Spark configuration parameters, by default None.
44
+
45
+ Returns
46
+ -------
47
+ SparkSession
48
+ A configured local Spark session instance with Delta Lake support.
49
+ """
50
+ logger.debug(f"[get_local_spark_session|in] ({config})")
51
+ # PySpark 3.4 uses Scala 2.12, so we need delta-core_2.12
52
+ builder = (
53
+ SparkSession.builder.config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
54
+ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
55
+ .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
56
+ .config("spark.driver.host", "localhost")
57
+ )
58
+
59
+ if config is not None:
60
+ for k, v in config.items():
61
+ builder.config(k, v)
62
+
63
+ spark = builder.getOrCreate()
64
+
65
+ logger.debug(f"[get_local_spark_session|out] => {spark}")
66
+ return spark
67
+
68
+ @staticmethod
69
+ def get_spark_session(config: dict | None = None) -> SparkSession:
70
+ """Get a Spark session based on the environment configuration.
71
+
72
+ Parameters
73
+ ----------
74
+ config : dict | None, optional
75
+ Additional Spark configuration parameters, by default None.
76
+
77
+ Returns
78
+ -------
79
+ SparkSession
80
+ A configured Spark session instance.
81
+ """
82
+ logger.debug(f"[get_spark_session|in] ({config})")
83
+
84
+ if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
85
+ spark: SparkSession = UtilsSpark.get_local_spark_session(config)
86
+ else:
87
+ if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
88
+ active_session = SparkSession.getActiveSession()
89
+ else:
90
+ from awsglue.context import GlueContext # type: ignore # pragma: no cover # noqa: PGH003
91
+
92
+ glueContext = GlueContext(SparkContext.getOrCreate()) # pragma: no cover # noqa: N806
93
+ active_session = glueContext.spark_session # pragma: no cover
94
+
95
+ spark_config = SparkConf()
96
+
97
+ if active_session is not None:
98
+ former_config = active_session.sparkContext.getConf().getAll()
99
+ for entry in former_config:
100
+ spark_config.set(entry[0], entry[1])
101
+ if config is not None:
102
+ for k, v in config.items():
103
+ spark_config.set(k, v)
104
+ spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
105
+ else:
106
+ spark: SparkSession = SparkSession.builder.getOrCreate()
107
+
108
+ logger.debug(f"[get_spark_session|out] => {spark}")
109
+ return spark
110
+
111
+ @staticmethod
112
+ def build_schema_from_dtypes(dtypes_schema: dict[str, str]) -> T.StructType:
113
+ """Build a PySpark StructType schema from a dictionary of data types.
114
+
115
+ Parameters
116
+ ----------
117
+ dtypes_schema : dict[str, str]
118
+ A dictionary mapping field names to their corresponding data type strings.
119
+ Supported types: 'bigint', 'string', 'double', 'int', 'boolean', 'timestamp', 'date'.
120
+
121
+ Returns
122
+ -------
123
+ T.StructType
124
+ A PySpark StructType schema with fields defined by the input dictionary.
125
+ """
126
+ logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
127
+ result = T.StructType()
128
+ for field, dtype in dtypes_schema.items():
129
+ new_type = UtilsSpark.__DTYPES_MAP[dtype]
130
+ result.add(field, new_type(), True) # noqa: FBT003
131
+
132
+ logger.info(f"[build_schema_from_dtypes|out] => {result}")
133
+ return result
@@ -0,0 +1,22 @@
1
+ """Pyspark DataFrame validation implementation module.
2
+
3
+ This module provides the Pyspark-specific implementation of Great Expectations validation.
4
+ """
5
+
6
+ from great_expectations.execution_engine import ExecutionEngine
7
+ from great_expectations.execution_engine import SparkDFExecutionEngine
8
+ from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
9
+
10
+
11
+ class PysparkValidation(GreatExpectationsValidation):
12
+ """Pyspark DataFrame validation implementation."""
13
+
14
+ def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
15
+ """Get the execution engine used by the validation implementation.
16
+
17
+ Returns
18
+ -------
19
+ ExecutionEngine
20
+ The execution engine instance.
21
+ """
22
+ return SparkDFExecutionEngine(batch_data_dict=batch_data_dict)
@@ -0,0 +1,57 @@
1
+ """Delta Lake table source implementation.
2
+
3
+ This module provides:
4
+ - DeltaTableSource: abstract base class for reading delta lake format datasets
5
+ and returning pandas DataFrames with configurable storage options.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ import logging
10
+ from typing import Any
11
+ from pandas import DataFrame
12
+ from deltalake import DeltaTable
13
+ from deltalake.exceptions import TableNotFoundError
14
+
15
+ from tgedr_dataops_abs.source import Source, SourceException, NoSourceException
16
+
17
+
18
+ logger = logging.getLogger()
19
+
20
+
21
+ class DeltaTableSource(Source, ABC):
22
+ """abstract class used to read delta lake format datasets returning a pandas dataframe."""
23
+
24
+ CONTEXT_KEY_URL: str = "url"
25
+ CONTEXT_KEY_COLUMNS: str = "columns"
26
+
27
+ def __init__(self, config: dict[str, Any] | None = None) -> None:
28
+ """Initialize the DeltaTableSource with optional configuration."""
29
+ super().__init__(config=config)
30
+
31
+ @property
32
+ @abstractmethod
33
+ def _storage_options(self) -> Any:
34
+ return None # pragma: no cover
35
+
36
+ def get(self, context: dict[str, Any] | None = None) -> DataFrame:
37
+ """Retrieves a delta lake table."""
38
+ logger.info(f"[get|in] ({context})")
39
+ result: DataFrame = None
40
+
41
+ if self.CONTEXT_KEY_URL not in context:
42
+ raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
43
+
44
+ columns: list[str] = None
45
+ if self.CONTEXT_KEY_COLUMNS in context:
46
+ columns = context[self.CONTEXT_KEY_COLUMNS]
47
+
48
+ try:
49
+ delta_table = DeltaTable(
50
+ table_uri=context[self.CONTEXT_KEY_URL], storage_options=self._storage_options, without_files=True
51
+ )
52
+ result = delta_table.to_pandas(columns=columns)
53
+ except TableNotFoundError as exc:
54
+ raise NoSourceException(f"could not find delta table: {context[self.CONTEXT_KEY_URL]}") from exc
55
+
56
+ logger.info(f"[get|out] => {result}")
57
+ return result
@@ -0,0 +1,58 @@
1
+ """Local Delta Table source implementation.
2
+
3
+ This module provides the LocalDeltaTable class for reading Delta Lake format datasets
4
+ from the local filesystem using Python only (PySpark not required), returning pandas DataFrames.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from typing import Any
10
+ import glob
11
+ from pathlib import Path
12
+
13
+ from tgedr_dataops_ext.source.delta_table_source import DeltaTableSource
14
+ from tgedr_dataops_abs.source import SourceException
15
+
16
+
17
+ logger = logging.getLogger()
18
+
19
+
20
+ class LocalDeltaTable(DeltaTableSource):
21
+ """class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe."""
22
+
23
+ def __init__(self, config: dict[str, Any] | None = None) -> None:
24
+ """Initialize LocalDeltaTable with optional configuration.
25
+
26
+ Args:
27
+ config: Optional configuration dictionary for the delta table source.
28
+ """
29
+ super().__init__(config=config)
30
+
31
+ @property
32
+ def _storage_options(self) -> Any:
33
+ return None
34
+
35
+ def list(self, context: dict[str, Any] | None = None) -> list[str]:
36
+ """Lists the available delta lake datasets in the url provided."""
37
+ logger.info(f"[list|in] ({context})")
38
+
39
+ result: list[str] = []
40
+ if self.CONTEXT_KEY_URL not in context:
41
+ raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
42
+
43
+ url = context[self.CONTEXT_KEY_URL]
44
+ if not Path(url).is_dir():
45
+ raise SourceException(f"not a delta lake url: {url}")
46
+
47
+ matches: set[str] = set()
48
+ pattern: str = f".*{url}/(.*)/_delta_log/.*"
49
+ for entry in glob.iglob(url + "**/**", recursive=True): # noqa: PTH207
50
+ match = re.search(pattern, entry)
51
+ if match:
52
+ matches.add(match.group(1))
53
+
54
+ result = list(matches)
55
+
56
+ logger.info(f"[list] result: {result}")
57
+ logger.info(f"[list|out] => result len: {len(result)}")
58
+ return result
@@ -0,0 +1,83 @@
1
+ """S3 Delta Table source module for reading Delta Lake format datasets from S3.
2
+
3
+ This module provides the S3DeltaTable class for reading Delta Lake format datasets
4
+ from S3 buckets using Python only (no PySpark required), returning pandas DataFrames.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from typing import Any
10
+
11
+ from tgedr_dataops.commons.s3_connector import S3Connector
12
+ from tgedr_dataops.commons.utils_fs import remove_s3_protocol
13
+ from tgedr_dataops_ext.source.delta_table_source import DeltaTableSource
14
+ from tgedr_dataops_abs.source import SourceException
15
+
16
+
17
+ logger = logging.getLogger()
18
+
19
+
20
+ class S3DeltaTable(DeltaTableSource, S3Connector):
21
+ """class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe."""
22
+
23
+ CONFIG_KEY_AWS_ACCESS_KEY_ID: str = "AWS_ACCESS_KEY_ID"
24
+ CONFIG_KEY_AWS_SECRET_ACCESS_KEY: str = "AWS_SECRET_ACCESS_KEY" # noqa: S105
25
+ CONFIG_KEY_AWS_SESSION_TOKEN: str = "AWS_SESSION_TOKEN" # noqa: S105
26
+ CONFIG_KEY_AWS_REGION: str = "AWS_REGION"
27
+
28
+ def __init__(self, config: dict[str, Any] | None = None) -> None:
29
+ """Initialize the S3DeltaTable with optional configuration.
30
+
31
+ Args:
32
+ config: Optional dictionary containing AWS credentials and configuration.
33
+ """
34
+ DeltaTableSource.__init__(self, config=config)
35
+ S3Connector.__init__(self)
36
+
37
+ @property
38
+ def _storage_options(self) -> Any:
39
+ result = None
40
+ if (self._config is not None) and all(
41
+ element in list(self._config.keys())
42
+ for element in [
43
+ self.CONFIG_KEY_AWS_ACCESS_KEY_ID,
44
+ self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY,
45
+ self.CONFIG_KEY_AWS_SESSION_TOKEN,
46
+ self.CONFIG_KEY_AWS_REGION,
47
+ ]
48
+ ):
49
+ result = {
50
+ "AWS_ACCESS_KEY_ID": self._config[self.CONFIG_KEY_AWS_ACCESS_KEY_ID],
51
+ "AWS_SECRET_ACCESS_KEY": self._config[self.CONFIG_KEY_AWS_SECRET_ACCESS_KEY],
52
+ "AWS_SESSION_TOKEN": self._config[self.CONFIG_KEY_AWS_SESSION_TOKEN],
53
+ "AWS_REGION": self._config[self.CONFIG_KEY_AWS_REGION],
54
+ }
55
+
56
+ return result
57
+
58
+ def list(self, context: dict[str, Any] | None = None) -> list[str]:
59
+ """Lists the available delta lake datasets in the url provided."""
60
+ logger.info(f"[list|in] ({context})")
61
+
62
+ result: list[str] = []
63
+ if self.CONTEXT_KEY_URL not in context:
64
+ raise SourceException(f"you must provide context for {self.CONTEXT_KEY_URL}")
65
+
66
+ path = remove_s3_protocol(context[self.CONTEXT_KEY_URL])
67
+ path_elements = path.split("/")
68
+ bucket = path_elements[0]
69
+ key = "/".join(path_elements[1:])
70
+
71
+ matches: set[str] = set()
72
+ pattern: str = f".*{key}/(.*)/_delta_log/.*"
73
+ for entry in self._client.list_objects_v2(Bucket=bucket, Prefix=key)["Contents"]:
74
+ output_key: str = entry["Key"]
75
+ match = re.search(pattern, output_key)
76
+ if match:
77
+ matches.add(f"{key}/{match.group(1)}")
78
+
79
+ result = list(matches)
80
+
81
+ logger.info(f"[list] result: {result}")
82
+ logger.info(f"[list|out] => result len: {len(result)}")
83
+ return result
@@ -0,0 +1,515 @@
1
+ """Spark Delta Lake store implementation module.
2
+
3
+ This module provides a Store implementation for working with Delta Lake format
4
+ using Apache Spark. It includes the SparkDeltaStore class which supports:
5
+ - Reading and writing data in Delta Lake format
6
+ - Versioning and time travel queries
7
+ - Update/merge operations (upserts)
8
+ - Partitioning and schema evolution
9
+ - Retention policies for log and deleted files
10
+ - Metadata management
11
+ """
12
+
13
+ from abc import ABC
14
+ import dataclasses
15
+ import logging
16
+ from typing import Any
17
+ from datetime import UTC, datetime
18
+ from pyspark.sql import DataFrame
19
+ from delta.tables import DeltaTable
20
+ from pyspark.sql import functions as f
21
+ from pyspark.sql import types as T # noqa: N812
22
+ from pyspark.sql.utils import AnalysisException
23
+ from pyspark.sql.functions import monotonically_increasing_id
24
+ from tgedr_dataops_abs.store import NoStoreException, Store, StoreException
25
+ from tgedr_dataops_ext.commons.metadata import Metadata
26
+ from tgedr_dataops_ext.commons.utils_spark import UtilsSpark
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ class SparkDeltaStore(Store, ABC):
32
+ """A store implementation using Spark Delta Lake format.
33
+
34
+ This class provides methods for reading, writing, updating, and deleting
35
+ data in Delta Lake format using Apache Spark. It supports features like
36
+ versioning, partitioning, schema evolution, and retention policies.
37
+
38
+ Attributes
39
+ ----------
40
+ config : dict[str, Any] | None
41
+ Optional configuration dictionary for the store.
42
+
43
+ Methods
44
+ -------
45
+ get(key: str, version: Optional[str] = None, **kwargs) -> DataFrame
46
+ Retrieve data from the specified key/path.
47
+ save(df: DataFrame, key: str, append: bool = False, ...) -> None
48
+ Save a DataFrame to the specified key/path.
49
+ update(df: Any, key: str, match_fields: list[str], ...) -> None
50
+ Update or insert data using merge operation.
51
+ delete(key: str, condition: Union[f.Column, str, None] = None, **kwargs) -> None
52
+ Delete data from the specified key/path.
53
+ """
54
+
55
+ def __init__(self, config: dict[str, Any] | None = None) -> None:
56
+ """Initialize SparkDeltaStore with optional configuration.
57
+
58
+ Args:
59
+ config: Optional configuration dictionary for the store.
60
+ """
61
+ Store.__init__(self, config)
62
+
63
+ def get(self, key: str, version: str | None = None, **kwargs) -> DataFrame: # noqa: ANN003, ARG002
64
+ """Retrieve data from the specified key/path.
65
+
66
+ Parameters
67
+ ----------
68
+ key : str
69
+ The path to the Delta table to read from.
70
+ version : str | None, optional
71
+ The version of the Delta table to read, is None by default.
72
+ **kwargs
73
+ Additional keyword arguments (currently unused).
74
+
75
+ Returns
76
+ -------
77
+ DataFrame
78
+ The Spark DataFrame containing the data from the Delta table.
79
+
80
+ Raises
81
+ ------
82
+ NoStoreException
83
+ If no data is found at the specified key/path.
84
+ """
85
+ logger.info(f"[get|in] ({key}, {version})")
86
+
87
+ table = self._get_table(path=key)
88
+ if table is None:
89
+ raise NoStoreException(f"[get] couldn't find data in key: {key}")
90
+
91
+ reader = UtilsSpark.get_spark_session().read.format("delta")
92
+ if version is not None:
93
+ reader = reader.option("versionAsOf", version)
94
+
95
+ result = reader.load(key)
96
+
97
+ logger.info("[get_df|out]")
98
+ return result
99
+
100
+ def __get_deletion_criteria(self, df: DataFrame) -> Any:
101
+ logger.debug("[__get_deletion_criteria|in])")
102
+ fields = df.dtypes
103
+ numerics = [
104
+ x
105
+ for x in fields
106
+ if x[1] in ["bigint", "int", "double", "float", "long", "decimal.Decimal"] or (x[1][:7]) == "decimal"
107
+ ]
108
+ dates = [x for x in fields if (x[1]) in ["datetime", "datetime.datetime"]]
109
+ textuals = [x for x in fields if x[1] in ["string"]]
110
+ if 0 < len(numerics):
111
+ column = numerics[0][0]
112
+ result = (f.col(column) > 0) | (f.col(column) <= 0)
113
+ elif 0 < len(dates):
114
+ column = dates[0][0]
115
+ now = datetime.now(tz=UTC)
116
+ result = (f.col(column) > now) | (f.col(column) <= now)
117
+ elif 0 < len(textuals):
118
+ column = textuals[0][0]
119
+ result = (f.col(column) > "a") | (f.col(column) <= "a")
120
+ else:
121
+ raise StoreException(
122
+ "[__get_deletion_criteria] failed to figure out column types handy to create a full deletion criteria"
123
+ )
124
+
125
+ logger.debug(f"[__get_deletion_criteria|out] = {result}")
126
+ return result
127
+
128
+ def delete(self, key: str, condition: f.Column | str | None = None, **kwargs) -> None: # noqa: ANN003, ARG002
129
+ """Delete data from the specified key/path.
130
+
131
+ Parameters
132
+ ----------
133
+ key : str
134
+ The path to the Delta table to delete from.
135
+ condition : f.Column | str | None, optional
136
+ The condition to filter rows for deletion. If None, deletes all rows.
137
+ **kwargs
138
+ Additional keyword arguments (currently unused).
139
+
140
+ Raises
141
+ ------
142
+ StoreException
143
+ If deletion fails or column types cannot be determined for full deletion.
144
+ """
145
+ logger.info(f"[delete|in] ({key}, {condition})")
146
+
147
+ spark = UtilsSpark.get_spark_session()
148
+ """
149
+ is_s3_operation = True if key.startswith("s3") else False
150
+ if is_s3_operation:
151
+ """
152
+ delta_table = DeltaTable.forPath(spark, key)
153
+ if condition is None:
154
+ condition = self.__get_deletion_criteria(delta_table.toDF())
155
+ delta_table.delete(condition=condition)
156
+ """
157
+ else: # local development mostly for temporary or test purposes
158
+ spark_fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
159
+ # get spark context path
160
+ spark_path = spark._jvm.org.apache.hadoop.fs.Path(key)
161
+ logger.info(f"[delete] spark path is {spark_path}")
162
+ try:
163
+ if spark_fs.exists(spark_path):
164
+ spark_fs.delete(spark_path, True)
165
+ except AnalysisException as x:
166
+ raise StoreException(f"[delete] couldn't do it on key {key}: {x}")
167
+ """
168
+ logger.info("[delete|out]")
169
+
170
+ def save(
171
+ self,
172
+ df: DataFrame,
173
+ key: str,
174
+ append: bool = False,
175
+ partition_fields: list[str] | None = None,
176
+ metadata: Metadata | None = None,
177
+ retention_days: int = 7,
178
+ deleted_retention_days: int = 7,
179
+ column_descriptions: dict[str, str] | None = None,
180
+ table_name: str | None = None,
181
+ **kwargs, # noqa: ANN003
182
+ ) -> None:
183
+ """Save a DataFrame to the specified key/path in Delta format.
184
+
185
+ Parameters
186
+ ----------
187
+ df : DataFrame
188
+ The Spark DataFrame to save.
189
+ key : str
190
+ The path where the Delta table will be saved.
191
+ append : bool, optional
192
+ Whether to append to existing data, is False by default.
193
+ partition_fields : list[str] | None, optional
194
+ List of column names to partition the table by, is None by default.
195
+ metadata : Metadata | None, optional
196
+ Optional metadata to attach to the table, is None by default.
197
+ retention_days : int, optional
198
+ Number of days to retain log files, is 7 by default.
199
+ deleted_retention_days : int, optional
200
+ Number of days to retain deleted files, is 7 by default.
201
+ column_descriptions : dict[str, str] | None, optional
202
+ Dictionary mapping column names to their descriptions, is None by default.
203
+ table_name : str | None, optional
204
+ Optional table name in format 'db.table', is None by default.
205
+ **kwargs
206
+ Additional keyword arguments.
207
+ """
208
+ logger.info(
209
+ f"[save|in] ({df}, {key}, {append}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {column_descriptions}, {table_name}, {kwargs})"
210
+ )
211
+
212
+ if column_descriptions is not None:
213
+ df = self._set_column_descriptions(df, column_descriptions)
214
+
215
+ writer = df.write.format("delta").mode("append") if append else df.write.format("delta").mode("overwrite")
216
+
217
+ if partition_fields is not None:
218
+ table = self._get_table(path=key)
219
+ if table is not None:
220
+ self._set_table_partitions(path=key, partition_fields=partition_fields)
221
+ writer = writer.partitionBy(*partition_fields)
222
+
223
+ if self._has_schema_changed(path=key, df=df):
224
+ writer = writer.option("overwriteSchema", "true")
225
+
226
+ if metadata:
227
+ writer = writer.option("userMetadata", metadata)
228
+
229
+ if table_name is not None:
230
+ # assume we have db.table
231
+ db = table_name.split(".")[0]
232
+ UtilsSpark.get_spark_session().sql(f"CREATE DATABASE IF NOT EXISTS {db}")
233
+ writer = writer.option("path", key).saveAsTable(table_name)
234
+ else:
235
+ writer.save(key)
236
+
237
+ logger.info("[save] optimizing...")
238
+ table = self._get_table(path=key)
239
+
240
+ if retention_days is not None and deleted_retention_days is not None:
241
+ self.enforce_retention_policy(
242
+ path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
243
+ )
244
+ elif retention_days is not None:
245
+ self.enforce_retention_policy(path=key, retention_days=retention_days)
246
+
247
+ table.optimize().executeCompaction()
248
+
249
+ logger.info("[save|out]")
250
+
251
+ def update(
252
+ self,
253
+ df: Any,
254
+ key: str,
255
+ match_fields: list[str],
256
+ partition_fields: list[str] | None = None,
257
+ metadata: Metadata | None = None,
258
+ retention_days: int = 7,
259
+ deleted_retention_days: int = 7,
260
+ **kwargs, # noqa: ANN003
261
+ ) -> None:
262
+ """Update or insert data using merge operation.
263
+
264
+ Parameters
265
+ ----------
266
+ df : Any
267
+ The DataFrame containing updates to merge.
268
+ key : str
269
+ The path to the Delta table to update.
270
+ match_fields : list[str]
271
+ List of column names to use for matching rows during merge.
272
+ partition_fields : list[str] | None, optional
273
+ List of column names to partition the table by, is None by default.
274
+ metadata : Metadata | None, optional
275
+ Optional metadata to attach to the table, is None by default.
276
+ retention_days : int, optional
277
+ Number of days to retain log files, is 7 by default.
278
+ deleted_retention_days : int, optional
279
+ Number of days to retain deleted files, is 7 by default.
280
+ **kwargs
281
+ Additional keyword arguments.
282
+ """
283
+ logger.info(
284
+ f"[update|in] ({df}, {key}, {match_fields}, {partition_fields}, {metadata}, {retention_days}, {deleted_retention_days}, {kwargs})"
285
+ )
286
+
287
+ table = self._get_table(path=key)
288
+ if table is None:
289
+ self.save(
290
+ df=df,
291
+ key=key,
292
+ partition_fields=partition_fields,
293
+ metadata=metadata,
294
+ retention_days=retention_days,
295
+ deleted_retention_days=deleted_retention_days,
296
+ **kwargs,
297
+ )
298
+ else:
299
+ if partition_fields is not None:
300
+ self._set_table_partitions(path=key, partition_fields=partition_fields)
301
+
302
+ match_clause = None
303
+ for field in match_fields:
304
+ match_clause = (
305
+ f"current.{field} = updates.{field}"
306
+ if match_clause is None
307
+ else f"{match_clause} and current.{field} = updates.{field}"
308
+ )
309
+ logger.info(f"[update] match clause: {match_clause}")
310
+
311
+ # check if the df has all the required columns
312
+ # as we are upserting the updated columns coming in must at least match or exceed the current columns
313
+ for column in table.toDF().columns:
314
+ # we'll assume missing columns are nullable, typically metrics
315
+ if column not in df.columns:
316
+ df = df.withColumn(column, f.lit(None).cast(T.StringType()))
317
+
318
+ table.alias("current").merge(
319
+ df.alias("updates"), match_clause
320
+ ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
321
+
322
+ if retention_days is not None and deleted_retention_days is not None:
323
+ self.enforce_retention_policy(
324
+ path=key, retention_days=retention_days, deleted_retention_days=deleted_retention_days
325
+ )
326
+ elif retention_days is not None:
327
+ self.enforce_retention_policy(path=key, retention_days=retention_days)
328
+
329
+ table.optimize().executeCompaction()
330
+
331
+ logger.info("[UtilsDeltaTable.upsert|out]")
332
+
333
+ def enforce_retention_policy(self, path: str, retention_days: int = 7, deleted_retention_days: int = 7) -> None:
334
+ """Enforce retention policy on Delta table log and deleted files.
335
+
336
+ Parameters
337
+ ----------
338
+ path : str
339
+ The path to the Delta table.
340
+ retention_days : int, optional
341
+ Number of days to retain log files, is 7 by default.
342
+ deleted_retention_days : int, optional
343
+ Number of days to retain deleted files, is 7 by default.
344
+ """
345
+ logger.info(f"[enforce_retention_policy|in] ({path}, {retention_days}, {deleted_retention_days})")
346
+
347
+ retention = f"interval {retention_days} days"
348
+ deleted_retention = f"interval {deleted_retention_days} days"
349
+
350
+ UtilsSpark.get_spark_session().sql(
351
+ f"ALTER TABLE delta.`{path}` SET TBLPROPERTIES('delta.logRetentionDuration' = '{retention}', 'delta.deletedFileRetentionDuration' = '{deleted_retention}')"
352
+ )
353
+ logger.info("[enforce_retention_policy|out]")
354
+
355
+ def get_latest_table_versions(self, path: str, how_many: int = 1) -> list[str]:
356
+ """Checks the delta table history and retrieves the latest n versions.
357
+
358
+ Sorted from the newest to the oldest.
359
+ """
360
+ logger.info(f"[get_latest_table_versions|in] ({path}, {how_many})")
361
+ result: list[str] = []
362
+
363
+ table = self._get_table(path=path)
364
+ if table is not None:
365
+ history_rows = table.history().orderBy(f.desc("timestamp")).limit(how_many)
366
+ result = [str(x.version) for x in history_rows.collect()]
367
+
368
+ logger.info(f"[get_latest_table_versions|out] => {result}")
369
+ return result
370
+
371
+ def get_metadata(self, path: str, version: str | None = None) -> Metadata | None:
372
+ """Retrieve metadata from the Delta table at the specified path.
373
+
374
+ Raises
375
+ ------
376
+ NoStoreException
377
+ """
378
+ logger.info(f"[get_metadata|in] ({path}, {version})")
379
+ table = self._get_table(path)
380
+ if table is None:
381
+ raise NoStoreException(f"[get_metadata] no data in path: {path}")
382
+
383
+ result = None
384
+
385
+ df_history = table.history().filter(f.col("userMetadata").isNotNull())
386
+ if version is not None:
387
+ df_history = df_history.filter(f.col("version") <= int(version))
388
+
389
+ df_history = df_history.orderBy(f.col("version").desc())
390
+ if not df_history.isEmpty():
391
+ user_metadata = df_history.take(1)[0].userMetadata
392
+ result = Metadata.from_str(user_metadata)
393
+ if version is not None:
394
+ result = dataclasses.replace(result, version=version)
395
+
396
+ logger.info(f"[get_metadata|out] => ({result})")
397
+ return result
398
+
399
+ def _get_delta_log(self, path: str) -> DataFrame:
400
+ logger.info(f"[_get_delta_log|in] ({path})")
401
+
402
+ spark = UtilsSpark.get_spark_session()
403
+ jdf = (
404
+ spark._jvm.org.apache.spark.sql.delta.DeltaLog.forTable(spark._jsparkSession, path) # noqa: SLF001
405
+ .snapshot()
406
+ .allFiles()
407
+ .toDF()
408
+ )
409
+ result = DataFrame(jdf, spark)
410
+
411
+ logger.info(f"[_get_delta_log|out] => {result}")
412
+ return result
413
+
414
+ def _get_table_partitions(self, path: str) -> list[str]:
415
+ logger.info(f"[_get_table_partitions|in] ({path})")
416
+ result: list[str] = []
417
+
418
+ delta_log: DataFrame = self._get_delta_log(path=path)
419
+ partition_keys = [
420
+ x.keys
421
+ for x in delta_log.select(f.map_keys(f.col("partitionValues")).alias("keys")).distinct().collect()
422
+ if 0 < len(x)
423
+ ]
424
+ if 0 < len(partition_keys):
425
+ result: list[str] = list({y for y in partition_keys for y in y})
426
+
427
+ logger.info(f"[_get_table_partitions|out] => {result}")
428
+ return result
429
+
430
+ def _vacuum_now(self, path: str) -> None:
431
+ logger.info("[_vacuum_now|in]")
432
+
433
+ spark = UtilsSpark.get_spark_session()
434
+ old_conf_value = spark.conf.get("spark.databricks.delta.retentionDurationCheck.enabled")
435
+ spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
436
+ DeltaTable.forPath(spark, path).vacuum(0)
437
+ spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", old_conf_value)
438
+
439
+ logger.info("[_vacuum_now|out]")
440
+
441
+ def _has_schema_changed(self, path: str, df: DataFrame) -> bool:
442
+ logger.info(f"[_has_schema_changed|in] ({path},{df})")
443
+ result: bool = False
444
+ table = self._get_table(path=path)
445
+ if table is not None:
446
+ result = table.toDF().schema != df.schema
447
+ logger.info(f"[_has_schema_changed|out] => {result}")
448
+ return result
449
+
450
+ def _set_table_partitions(self, path: str, partition_fields: list[str]) -> None:
451
+ logger.info(f"[_set_table_partitions|in] ({path},{partition_fields})")
452
+
453
+ spark = UtilsSpark.get_spark_session()
454
+ # let's check partition_cols
455
+ current_partition_fields = self._get_table_partitions(path=path)
456
+ shall_we_repartition = sorted(partition_fields) != sorted(current_partition_fields)
457
+
458
+ if shall_we_repartition:
459
+ logger.info("[_set_table_partitions] going to repartition")
460
+ new_df = spark.read.format("delta").load(path)
461
+ new_df.write.format("delta").mode("overwrite").partitionBy(*partition_fields).option(
462
+ "overwriteSchema", "true"
463
+ ).save(path)
464
+ self._vacuum_now(path)
465
+ logger.info(
466
+ f"[_set_table_partitions] changed partition cols from {current_partition_fields} to {partition_fields}"
467
+ )
468
+ logger.info("[_set_table_partitions|out]")
469
+
470
+ def _get_table(self, path: str) -> DeltaTable | None:
471
+ logger.debug(f"[_get_table|in] ({path})")
472
+ result: DeltaTable = None
473
+ try:
474
+ result: DeltaTable = DeltaTable.forPath(UtilsSpark.get_spark_session(), path)
475
+ except AnalysisException as ax:
476
+ logger.warning(f"[_get_table] couldn't load from {path}: {ax}")
477
+
478
+ logger.debug(f"[_get_table|out] => {result}")
479
+ return result
480
+
481
+ def set_column_comments(self, db: str, table: str, col_comments: dict[str, str]) -> None:
482
+ """Set comments for columns in a Delta table.
483
+
484
+ Parameters
485
+ ----------
486
+ db : str
487
+ The database name where the table is located.
488
+ table : str
489
+ The table name to set column comments for.
490
+ col_comments : dict[str, str]
491
+ Dictionary mapping column names to their comments.
492
+ """
493
+ logger.info(f"[set_column_comments|in] ({db}, {table}, {col_comments})")
494
+ spark = UtilsSpark.get_spark_session()
495
+
496
+ table_description: DataFrame = spark.sql(f"describe {db}.{table}").withColumn(
497
+ "set_column_comments_id", monotonically_increasing_id()
498
+ )
499
+ partition_info_id = (
500
+ table_description.filter(f.col("col_name") == "# Partition Information").collect()[0].set_column_comments_id
501
+ )
502
+
503
+ table_description = table_description.filter(
504
+ (f.col("set_column_comments_id") < f.lit(partition_info_id)) & (f.col("col_name") != "")
505
+ ).drop("set_column_comments_id")
506
+ rows = [r.asDict() for r in table_description.collect()]
507
+ for row in rows:
508
+ col = row["col_name"]
509
+ data_type = row["data_type"]
510
+ if col in col_comments:
511
+ new_comment = col_comments[col]
512
+ logger.info(f"[set_column_comments] setting new comment ({new_comment}) to column {col}")
513
+ spark.sql(f"ALTER TABLE {db}.{table} CHANGE COLUMN {col} {col} {data_type} COMMENT '{new_comment}'")
514
+
515
+ logger.info("[set_column_comments|out]")
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.4
2
+ Name: tgedr-dataops-ext
3
+ Version: 0.0.1
4
+ Summary: this is a template for a python package
5
+ Author-email: developer <developer@email.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas>=2.3.0
9
+ Requires-Dist: deltalake~=0.16.4
10
+ Requires-Dist: delta-spark~=2.4.0
11
+ Requires-Dist: tgedr-dataops>=1.0.1
12
+ Requires-Dist: pyspark~=3.4.0
13
+
14
+ # tgedr-dataops-ext
15
+
16
+ ![Coverage](./coverage.svg)
17
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops-ext)](https://pypi.org/project/tgedr-dataops-ext/)
18
+
19
+
20
+ data operations related code - extended
21
+
22
+ ## motivation
23
+ *dataops-ext* is a library of tested, production-used code that aligns on common standards for code structure and quality and helps avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops*, providing distributed processing features based on pyspark.
24
+
25
+ ## installation
26
+ `pip install tgedr-dataops-ext`
27
+
28
+ ## package namespaces and its contents
29
+
30
+ #### commons
31
+ - __Dataset__: immutable class to wrap up a dataframe along with metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
32
+ - __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
33
+ - __UtilsSpark__: utility class to work with spark, mostly helping on creating a session ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py))
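+
+ A minimal sketch of how these three classes fit together (the dataframe contents and field values below are illustrative, and the local session assumes the delta-core package can be resolved):
+
+ ``` python
+ from tgedr_dataops_ext.commons.dataset import Dataset
+ from tgedr_dataops_ext.commons.metadata import FieldFrame, Metadata
+ from tgedr_dataops_ext.commons.utils_spark import UtilsSpark
+
+ # local spark session pre-configured for delta lake
+ spark = UtilsSpark.get_local_spark_session()
+
+ # build a pyspark schema from dtype strings and create a small dataframe
+ schema = UtilsSpark.build_schema_from_dtypes({"id": "bigint", "name": "string"})
+ df = spark.createDataFrame([(1, "a"), (2, "b")], schema=schema)
+
+ # metadata with a value range ("framing") for the id field
+ meta = Metadata(
+     name="people",
+     version="1",
+     framing=[FieldFrame(field="id", lower=1, upper=2)],
+     sources=None,
+ )
+
+ # wrap dataframe and metadata together; both serialize to dict/json
+ dataset = Dataset(metadata=meta, data=df)
+ print(str(dataset))
+ ```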
34
+
35
+ #### quality
36
+ - __PysparkValidation__ : __GreatExpectationsValidation__ implementation to validate pyspark dataframes with Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
37
+
38
+ #### source
39
+
40
+ - __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
41
+ - __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from local fs with python only, pyspark not needed, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
42
+ - __S3DeltaTable__: __Source__ class used to read delta lake format datasets from s3 bucket with python only, pyspark not needed, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py))
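+
+ A minimal reading/listing sketch with __LocalDeltaTable__ (the paths are illustrative; __S3DeltaTable__ behaves the same way against an s3 url, provided its AWS_* configuration keys):
+
+ ``` python
+ from tgedr_dataops_ext.source.local_delta_table import LocalDeltaTable
+
+ source = LocalDeltaTable()
+
+ # list the delta datasets found under a base folder
+ print(source.list(context={"url": "/tmp/delta"}))
+
+ # read one dataset into a pandas dataframe, optionally projecting columns
+ df = source.get(context={"url": "/tmp/delta/people", "columns": ["id", "name"]})
+ print(df.head())
+ ```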
43
+
44
+
45
+ #### store
46
+ - __SparkDeltaStore__ : __Store__ implementation for pyspark distributed processing with delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py))
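+
+ A minimal save/upsert/time-travel sketch (paths and data are illustrative; it assumes a spark session with delta support, e.g. via __UtilsSpark__, and that the underlying __Store__ contract leaves nothing else abstract):
+
+ ``` python
+ from tgedr_dataops_ext.commons.metadata import Metadata
+ from tgedr_dataops_ext.commons.utils_spark import UtilsSpark
+ from tgedr_dataops_ext.store.spark_delta import SparkDeltaStore
+
+ spark = UtilsSpark.get_local_spark_session()
+ store = SparkDeltaStore()
+ path = "/tmp/delta/people"
+
+ df = spark.createDataFrame([(1, "a")], schema="id long, name string")
+ meta = Metadata(name="people", version="1", framing=None, sources=None)
+
+ # overwrite the table, attaching user metadata
+ store.save(df=df, key=path, metadata=meta)
+
+ # upsert more rows, matching on the id column
+ updates = spark.createDataFrame([(1, "aa"), (2, "b")], schema="id long, name string")
+ store.update(df=updates, key=path, match_fields=["id"])
+
+ # time travel: read a previous version and inspect the stored metadata
+ versions = store.get_latest_table_versions(path=path, how_many=2)
+ store.get(key=path, version=versions[-1]).show()
+ print(store.get_metadata(path=path))
+ ```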
47
+
48
+
49
+
50
+ ## development
51
+ - main requirements:
52
+ - _uv_
53
+ - _bash_
54
+ - Clone the repository like this:
55
+
56
+ ``` bash
57
+ git clone git@github.com:jtviegas/dataops-ext
58
+ ```
59
+ - cd into the folder: `cd dataops-ext`
60
+ - install requirements: `./helper.sh reqs`
@@ -0,0 +1,13 @@
1
+ tgedr_dataops_ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ tgedr_dataops_ext/commons/dataset.py,sha256=cYJkqm-w4VxwprUAgB8QyCLiJ-bnK7PLGZWdmkahbhM,794
3
+ tgedr_dataops_ext/commons/metadata.py,sha256=UClsNoo9BUbz6Defp9Jd01k00h7GV9FhET6wXbScSLw,8106
4
+ tgedr_dataops_ext/commons/utils_spark.py,sha256=NcJRlcGky0abc28hq6Lrn0-OY3DjTo8pCOGcWb4qaco,4886
5
+ tgedr_dataops_ext/quality/pyspark_validation.py,sha256=ppnLWBDz2n2rchhyPnfUuNbZVw63dTvkdM56bYWeQYY,824
6
+ tgedr_dataops_ext/source/delta_table_source.py,sha256=lcGgAKpNj8FNF8DTnMK_KAQ3GgEUxHU8sKcwjcCTR84,1961
7
+ tgedr_dataops_ext/source/local_delta_table.py,sha256=3ffk3kWwLjaNLbSFU50R_fTd9WE9ItAaHR7Dv5pATLg,1975
8
+ tgedr_dataops_ext/source/s3_delta_table.py,sha256=z4e5LTAAeqt7GvtNsaoc8z_sZhOBOjQikp35thodZwo,3266
9
+ tgedr_dataops_ext/store/spark_delta.py,sha256=33zKhJYmlGq5mFvy5Yzr9z_zX5gzjZSFW8RuYWHcdjA,20805
10
+ tgedr_dataops_ext-0.0.1.dist-info/METADATA,sha256=kBiDkP5rdWMJRJ_Xb8G6hjJYkAq89uf4BMFE5KCvM5Q,2686
11
+ tgedr_dataops_ext-0.0.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
12
+ tgedr_dataops_ext-0.0.1.dist-info/top_level.txt,sha256=st7VbEQz5kyNJ8ww2zjv-uWKyKww1npcI-qr-XQClaY,18
13
+ tgedr_dataops_ext-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ tgedr_dataops_ext