tgedr-dataops-ext 0.0.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.4
2
+ Name: tgedr-dataops-ext
3
+ Version: 0.0.1
4
+ Summary: data operations related code - extended
5
+ Author-email: developer <developer@email.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: pandas>=2.3.0
9
+ Requires-Dist: deltalake~=0.16.4
10
+ Requires-Dist: delta-spark~=2.4.0
11
+ Requires-Dist: tgedr-dataops>=1.0.1
12
+ Requires-Dist: pyspark~=3.4.0
13
+
14
+ # tgedr-dataops-ext
15
+
16
+ ![Coverage](./coverage.svg)
17
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops-ext)](https://pypi.org/project/tgedr-dataops-ext/)
18
+
19
+
20
+ data operations related code - extended
21
+
22
+ ## motivation
23
+ *dataops-ext* is a library of tested, in-use code that aligns on common standards for code structure and quality, helping to avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops*, providing distributed processing features based on pyspark.
24
+
25
+ ## installation
26
+ `pip install tgedr-dataops-ext`
27
+
28
+ ## package namespaces and their contents
29
+
30
+ #### commons
31
+ - __Dataset__: immutable class to wrap up a dataframe along with metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
32
+ - __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
33
+ - __UtilsSpark__: utility class for working with spark, mostly helping with creating a session ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py)); a usage sketch for these classes follows below
34
+
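A minimal usage sketch for these commons classes, added here for illustration (module paths are inferred from the test layout above; this block is not part of the package):

```python
# Illustrative sketch: build Metadata and wrap a pyspark dataframe in a Dataset.
from tgedr_dataops_ext.commons.dataset import Dataset
from tgedr_dataops_ext.commons.metadata import FieldFrame, Metadata
from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

spark = UtilsSpark.get_local_spark_session()
df = spark.createDataFrame([(1, "a"), (2, "b")], schema="id long, label string")

metadata = Metadata(
    name="demo_dataset",
    version="0.0.1",
    framing=[FieldFrame(field="id", lower=1, upper=2)],
    sources=None,
)
dataset = Dataset(metadata=metadata, data=df)
print(dataset.as_dict())  # {"metadata": {...}, "data": "<dataframe repr>"}
```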
35
+ #### quality
36
+ - __PysparkValidation__: __GreatExpectationsValidation__ implementation to validate pyspark dataframes with the Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
37
+
38
+ #### source
39
+
40
+ - __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
41
+ - __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from the local filesystem using Python only (no pyspark needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
42
+ - __S3DeltaTable__: __Source__ class used to read delta lake format datasets from an s3 bucket using Python only (no pyspark needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py)); a read sketch follows below
43
+
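The call signatures of these __Source__ classes are not shown in this diff; the sketch below only illustrates the python-only mechanism they rely on, using the `deltalake` dependency directly (the path is hypothetical):

```python
# Sketch of the python-only delta read these sources rely on (no pyspark involved).
from deltalake import DeltaTable

pdf = DeltaTable("/tmp/example/delta_table").to_pandas()  # pandas.DataFrame
```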
44
+
45
+ #### store
46
+ - __SparkDeltaStore__: __Store__ implementation for pyspark distributed processing with the delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py)); a write sketch follows below
47
+
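The __Store__ interface itself is not part of this diff; the sketch below only illustrates the underlying pyspark + delta mechanism such a store builds on, using the delta-enabled session from __UtilsSpark__ (module path inferred, target path hypothetical):

```python
# Sketch: write a dataframe in delta format with a delta-enabled spark session.
from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

spark = UtilsSpark.get_local_spark_session()
df = spark.createDataFrame([(1, "a"), (2, "b")], schema="id long, label string")
df.write.format("delta").mode("overwrite").save("/tmp/example/delta_table")
```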
48
+
49
+
50
+ ## development
51
+ - main requirements:
52
+ - _uv_
53
+ - _bash_
54
+ - Clone the repository like this:
55
+
56
+ ``` bash
57
+ git clone git@github.com:jtviegas/dataops-ext
58
+ ```
59
+ - cd into the folder: `cd dataops-ext`
60
+ - install requirements: `./helper.sh reqs`
@@ -0,0 +1,47 @@
1
+ # tgedr-dataops-ext
2
+
3
+ ![Coverage](./coverage.svg)
4
+ [![PyPI](https://img.shields.io/pypi/v/tgedr-dataops-ext)](https://pypi.org/project/tgedr-dataops-ext/)
5
+
6
+
7
+ data operations related code - extended
8
+
9
+ ## motivation
10
+ *dataops-ext* is a library of tested, in-use code that aligns on common standards for code structure and quality, helping to avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops*, providing distributed processing features based on pyspark.
11
+
12
+ ## installation
13
+ `pip install tgedr-dataops-ext`
14
+
15
+ ## package namespaces and their contents
16
+
17
+ #### commons
18
+ - __Dataset__: immutable class to wrap up a dataframe along with metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
19
+ - __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
20
+ - __UtilsSpark__: utility class for working with spark, mostly helping with creating a session ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py)); see the sketch below
21
+
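An illustrative sketch of the __UtilsSpark__ helpers, grounded in the `utils_spark` module shown further down in this diff (module path inferred from the test layout; the extra config entry is just an example):

```python
# Sketch: create a local delta-enabled session and build a schema from dtype names.
from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

spark = UtilsSpark.get_local_spark_session({"spark.sql.shuffle.partitions": "4"})

schema = UtilsSpark.build_schema_from_dtypes({"id": "bigint", "name": "string", "score": "double"})
df = spark.createDataFrame([(1, "alice", 0.9), (2, "bob", 0.7)], schema=schema)
df.show()
```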
22
+ #### quality
23
+ - __PysparkValidation__: __GreatExpectationsValidation__ implementation to validate pyspark dataframes with the Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
24
+
25
+ #### source
26
+
27
+ - __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
28
+ - __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from the local filesystem using Python only (no pyspark needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
29
+ - __S3DeltaTable__: __Source__ class used to read delta lake format datasets from an s3 bucket using Python only (no pyspark needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py)); see the S3 read sketch below
30
+
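The sketch below only shows the python-only S3 read that the `deltalake` dependency provides; the bucket, path and region are hypothetical and credentials are resolved from the environment:

```python
# Sketch of a python-only delta read from s3 via the deltalake dependency.
from deltalake import DeltaTable

table = DeltaTable(
    "s3://example-bucket/path/to/delta_table",
    storage_options={"AWS_REGION": "eu-west-1"},
)
pdf = table.to_pandas()  # pandas.DataFrame
```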
31
+
32
+ #### store
33
+ - __SparkDeltaStore__: __Store__ implementation for pyspark distributed processing with the delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py)); see the read-back sketch below
34
+
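A complementary sketch, reading a delta table back with the delta-enabled local session (module path inferred, table path hypothetical):

```python
# Sketch: read a delta table with pyspark using the delta-enabled local session.
from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

spark = UtilsSpark.get_local_spark_session()
df = spark.read.format("delta").load("/tmp/example/delta_table")
df.show()
```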
35
+
36
+
37
+ ## development
38
+ - main requirements:
39
+ - _uv_
40
+ - _bash_
41
+ - Clone the repository like this:
42
+
43
+ ``` bash
44
+ git clone git@github.com:jtviegas/dataops-ext
45
+ ```
46
+ - cd into the folder: `cd dataops-ext`
47
+ - install requirements: `./helper.sh reqs`
@@ -0,0 +1,134 @@
1
+ [project]
2
+ name = "tgedr-dataops-ext"
3
+ version = "0.0.1"
4
+ description = "this is a template for a python package"
5
+ authors = [
6
+ {name = "developer",email = "developer@email.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.11"
10
+
11
+ dependencies = [
12
+ "pandas>=2.3.0",
13
+ "deltalake~=0.16.4",
14
+ "delta-spark~=2.4.0",
15
+ "tgedr-dataops>=1.0.1",
16
+ "pyspark~=3.4.0"
17
+ ]
18
+ [dependency-groups]
19
+ dev = [
20
+ "pre-commit~=4.2.0",
21
+ "pytest~=8.3.5",
22
+ "pytest-bdd~=8.1.0",
23
+ "pytest-cov~=4.1.0",
24
+ "pytest-mock~=3.15.0",
25
+ "ruff==0.9.10",
26
+ "bandit==1.8.3",
27
+ "safety==3.5.1",
28
+ "typer<0.17.0",
29
+ "genbadge[coverage]>=1.1.3",
30
+ ]
31
+
32
+ # [project.scripts]
33
+ # run = "tgedr.pycommons.entrypoint:entrypoint"
34
+
35
+ [build-system]
36
+ requires = ["setuptools>=78.1.0", "wheel>=0.45.1"]
37
+ build-backend = "setuptools.build_meta"
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+
45
+ [tool.setuptools.package-data]
46
+ "*" = ["CHANGELOG"]
47
+
48
+ [tool.coverage.paths]
49
+ source = ["src/"]
50
+
51
+ [tool.coverage.run]
52
+ source = ["src/"]
53
+ include = ["src/*"]
54
+ omit = [
55
+ "*/tests/*",
56
+ "*/test_*",
57
+ "*/__pycache__/*",
58
+ "*/migrations/*",
59
+ "*/venv/*",
60
+ "*/.venv/*"
61
+ ]
62
+
63
+ [tool.coverage.report]
64
+ exclude_lines = [
65
+ "pragma: no cover",
66
+ "def __repr__",
67
+ "raise AssertionError",
68
+ "raise NotImplementedError",
69
+ "if __name__ == .__main__.:",
70
+ "if TYPE_CHECKING:",
71
+ ]
72
+ show_missing = true
73
+ skip_covered = false
74
+ skip_empty = false
75
+
76
+ [tool.pytest.ini_options]
77
+ # bdd_features_base_dir = "documentation/features"
78
+ pythonpath = "."
79
+
80
+ [tool.ruff]
81
+ exclude = [
82
+ ".bzr",
83
+ ".direnv",
84
+ ".eggs",
85
+ ".git",
86
+ ".git-rewrite",
87
+ ".hg",
88
+ ".ipynb_checkpoints",
89
+ ".mypy_cache",
90
+ ".nox",
91
+ ".pants.d",
92
+ ".pyenv",
93
+ ".pytest_cache",
94
+ ".pytype",
95
+ ".ruff_cache",
96
+ ".svn",
97
+ ".tox",
98
+ ".venv",
99
+ ".vscode",
100
+ "__pypackages__",
101
+ "_build",
102
+ "buck-out",
103
+ "build",
104
+ "dist",
105
+ "node_modules",
106
+ "site-packages",
107
+ "venv",
108
+ "tests/",
109
+ "typings/"
110
+ ]
111
+
112
+ line-length = 120
113
+ indent-width = 4
114
+
115
+ [tool.ruff.lint]
116
+ select = ["ALL"]
117
+ ignore = ["D203", "S101", "D104", "INP001", "D213", "COM812", "I001",
118
+ "D401", "D407", "RET504", "PLR2004", "FA102", "E501", "EXE002", "PLR0913",
119
+ "PLR0912", "C901", "PLR0911", "D413", "N818", "B024", "ANN401", "SIM300",
120
+ "FBT001", "FBT002", "G004", "TRY003", "EM102", "EM101", "PD015", "PD901"]
121
+ fixable = ["ALL"]
122
+ unfixable = []
123
+ # Allow unused variables when underscore-prefixed.
124
+ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
125
+
126
+ [tool.ruff.format]
127
+ # Like Black, use double quotes for strings.
128
+ quote-style = "double"
129
+ # Like Black, indent with spaces, rather than tabs.
130
+ indent-style = "space"
131
+ # Like Black, respect magic trailing commas.
132
+ skip-magic-trailing-comma = false
133
+ # Like Black, automatically detect the appropriate line ending.
134
+ line-ending = "auto"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ """Dataset module for wrapping dataframes with metadata.
2
+
3
+ This module provides:
4
+ - Dataset: an immutable dataclass that combines a DataFrame with Metadata.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ import json
9
+ from pyspark.sql import DataFrame
10
+ from tgedr_dataops_ext.commons.metadata import Metadata
11
+
12
+
13
+ @dataclass(frozen=True)
14
+ class Dataset:
15
+ """Utility immutable class to wrap up a dataframe along with metadata."""
16
+
17
+ __slots__ = ["data", "metadata"]
18
+ metadata: Metadata
19
+ data: DataFrame
20
+
21
+ def as_dict(self) -> dict:
22
+ """Serialize the dataset as a dictionary."""
23
+ return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
24
+
25
+ def __str__(self) -> str:
26
+ """Serialize the dataset as a json string."""
27
+ return json.dumps(self.as_dict())
@@ -0,0 +1,236 @@
1
+ """Module for dataset metadata classes."""
2
+
3
+ from dataclasses import dataclass
4
+ import json
5
+
6
+
7
+ @dataclass(frozen=True)
8
+ class FieldFrame:
9
+ """Class depicting a field values range, to be used in metadata.
10
+
11
+ Parameters
12
+ ----------
13
+ field : str
14
+ The name of the field.
15
+ lower : int | str | float
16
+ Field lower bound.
17
+ upper : int | str | float
18
+ Field upper bound.
19
+
20
+ """
21
+
22
+ __slots__ = ["field", "lower", "upper"]
23
+ field: str
24
+ lower: int | str | float
25
+ upper: int | str | float
26
+
27
+ def as_dict(self) -> dict[str, int | str | float]:
28
+ """Convert the FieldFrame to a dictionary representation.
29
+
30
+ Returns
31
+ -------
32
+ dict[str, int | str | float]
33
+ A dictionary containing the field name, lower bound, and upper bound.
34
+ """
35
+ return {"field": self.field, "lower": self.lower, "upper": self.upper}
36
+
37
+ @staticmethod
38
+ def from_str(src: str) -> "FieldFrame":
39
+ """Create a FieldFrame instance from a JSON string.
40
+
41
+ Parameters
42
+ ----------
43
+ src : str
44
+ A JSON string representation of a FieldFrame.
45
+
46
+ Returns
47
+ -------
48
+ FieldFrame
49
+ A new FieldFrame instance created from the JSON string.
50
+ """
51
+ r = json.loads(src)
52
+ field = r["field"]
53
+ lower = r["lower"]
54
+ upper = r["upper"]
55
+ return FieldFrame(field=field, lower=lower, upper=upper)
56
+
57
+ def __str__(self) -> str:
58
+ """Return a JSON string representation of the FieldFrame."""
59
+ return json.dumps(self.as_dict())
60
+
61
+ def __eq__(self, other: "FieldFrame") -> bool:
62
+ """Check equality between two FieldFrame instances."""
63
+ return self.field == other.field and self.lower == other.lower and self.upper == other.upper
64
+
65
+ def __gt__(self, other: "FieldFrame") -> bool:
66
+ """Check if this FieldFrame is greater than another FieldFrame."""
67
+ return self.field > other.field or (
68
+ self.field == other.field
69
+ and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
70
+ )
71
+
72
+ def __ne__(self, other: "FieldFrame") -> bool:
73
+ """Check inequality between two FieldFrame instances."""
74
+ return not other == self
75
+
76
+ def __ge__(self, other: "FieldFrame") -> bool:
77
+ """Check if this FieldFrame is greater than or equal to another FieldFrame."""
78
+ return other == self or self > other
79
+
80
+ def __le__(self, other: "FieldFrame") -> bool:
81
+ """Check if this FieldFrame is less than or equal to another FieldFrame."""
82
+ return other == self or self < other
83
+
84
+ def __lt__(self, other: "FieldFrame") -> bool:
85
+ """Check if this FieldFrame is less than another FieldFrame."""
86
+ return other > self
87
+
88
+
89
+ @dataclass(frozen=True)
90
+ class Metadata:
91
+ """Immutable class depicting dataset metadata.
92
+
93
+ Parameters
94
+ ----------
95
+ name : str
96
+ The name of the dataset.
97
+ version : str | None
98
+ Version of this dataset, if available.
99
+ framing : list[FieldFrame] | None
100
+ Field frames describing value ranges for this dataset's fields.
101
+ sources : list["Metadata"] | None
102
+ Metadata entries for the datasets that source this one.
103
+
104
+ """
105
+
106
+ __slots__ = ["framing", "name", "sources", "version"]
107
+ name: str
108
+ version: str | None
109
+ framing: list[FieldFrame] | None
110
+ sources: list["Metadata"] | None
111
+
112
+ def as_dict(self) -> dict:
113
+ """Convert the Metadata to a dictionary representation.
114
+
115
+ Returns
116
+ -------
117
+ dict
118
+ A dictionary containing the metadata fields including name, version,
119
+ framing, and sources if they are not None.
120
+ """
121
+ result = {"name": self.name}
122
+ if self.version is not None:
123
+ result["version"] = self.version
124
+ if self.framing is not None:
125
+ result["framing"] = []
126
+ for f in self.framing:
127
+ (result["framing"]).append(f.as_dict())
128
+ if self.sources is not None:
129
+ result["sources"] = []
130
+ for source in self.sources:
131
+ (result["sources"]).append(source.as_dict())
132
+
133
+ return result
134
+
135
+ def __str__(self) -> str:
136
+ """Return a JSON string representation of the Metadata."""
137
+ return json.dumps(self.as_dict())
138
+
139
+ def __eq__(self, other: object) -> bool:
140
+ """Check equality between two Metadata instances."""
141
+ return (
142
+ self.name == other.name
143
+ and (
144
+ (self.version is None and other.version is None)
145
+ or ((self.version is not None and other.version is not None) and self.version == other.version)
146
+ )
147
+ and (
148
+ (self.framing is None and other.framing is None)
149
+ or (
150
+ (self.framing is not None and other.framing is not None)
151
+ and sorted(self.framing) == sorted(other.framing)
152
+ )
153
+ )
154
+ and (
155
+ (self.sources is None and other.sources is None)
156
+ or (
157
+ (self.sources is not None and other.sources is not None)
158
+ and sorted(self.sources) == sorted(other.sources)
159
+ )
160
+ )
161
+ )
162
+
163
+ def __gt__(self, other: "Metadata") -> bool:
164
+ """Check if this Metadata is greater than another Metadata."""
165
+ return self.name > other.name or (
166
+ self.name == other.name
167
+ and (
168
+ ((self.version is not None and other.version is not None) and (self.version > other.version))
169
+ or (self.version is not None and other.version is None)
170
+ or (
171
+ (
172
+ (self.framing is not None and other.framing is not None)
173
+ and (sorted(self.framing) > sorted(other.framing))
174
+ )
175
+ or (self.framing is not None and other.framing is None)
176
+ or (
177
+ (
178
+ (self.sources is not None and other.sources is not None)
179
+ and (sorted(self.sources) > sorted(other.sources))
180
+ )
181
+ or (self.sources is not None and other.sources is None)
182
+ )
183
+ )
184
+ )
185
+ )
186
+
187
+ def __ne__(self, other: "Metadata") -> bool:
188
+ """Check inequality between two Metadata instances."""
189
+ return not other == self
190
+
191
+ def __ge__(self, other: "Metadata") -> bool:
192
+ """Check if this Metadata is greater than or equal to another Metadata."""
193
+ return other == self or self > other
194
+
195
+ def __le__(self, other: "Metadata") -> bool:
196
+ """Check if this Metadata is less than or equal to another Metadata."""
197
+ return other == self or self < other
198
+
199
+ def __lt__(self, other: "Metadata") -> bool:
200
+ """Check if this Metadata is less than another Metadata."""
201
+ return other > self
202
+
203
+ @staticmethod
204
+ def from_str(src: str) -> "Metadata":
205
+ """Create a Metadata instance from a JSON string.
206
+
207
+ Parameters
208
+ ----------
209
+ src : str
210
+ A JSON string representation of a Metadata object.
211
+
212
+ Returns
213
+ -------
214
+ Metadata
215
+ A new Metadata instance created from the JSON string.
216
+ """
217
+ r = json.loads(src)
218
+ name = r["name"]
219
+ version = r.get("version", None)
220
+
221
+ framing = None
222
+ framing_entries = r.get("framing", None)
223
+ if framing_entries is not None:
224
+ framing = []
225
+ for framing_entry in framing_entries:
226
+ framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
227
+
228
+ sources = None
229
+ sources_entries = r.get("sources", None)
230
+ if sources_entries is not None:
231
+ sources = []
232
+ for source_entry in sources_entries:
233
+ source_entry_as_str = json.dumps(source_entry)
234
+ sources.append(Metadata.from_str(source_entry_as_str))
235
+
236
+ return Metadata(name=name, version=version, framing=framing, sources=sources)
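A short round-trip sketch for the serialization helpers above, added here for illustration (it is not part of the module; the import path is inferred from the package layout):

```python
# Round-trip: Metadata -> JSON string -> Metadata, using as_dict/__str__ and from_str.
from tgedr_dataops_ext.commons.metadata import FieldFrame, Metadata

m = Metadata(
    name="sales",
    version="1.2.0",
    framing=[FieldFrame(field="day", lower="2024-01-01", upper="2024-01-31")],
    sources=None,
)
as_json = str(m)                    # json.dumps of as_dict()
again = Metadata.from_str(as_json)  # rebuilds the FieldFrame entries as well
assert again == m                   # __eq__ compares name, version, framing and sources
```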
@@ -0,0 +1,133 @@
1
+ """Spark utility functions and classes.
2
+
3
+ This module provides:
4
+ - UtilsSpark: A utility class with handy functions to work with Spark sessions,
5
+ including creating local and remote sessions with Delta Lake support,
6
+ and building PySpark schemas from data type dictionaries.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from typing import ClassVar
12
+ from pyspark.sql import SparkSession
13
+ from pyspark import SparkConf
14
+ from pyspark.sql import types as T # noqa: N812
15
+ from pyspark.context import SparkContext
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class UtilsSpark:
22
+ """class with handy functions to work with spark."""
23
+
24
+ __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
25
+ __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
26
+ __DTYPES_MAP: ClassVar[dict[str, type]] = {
27
+ "bigint": T.LongType,
28
+ "string": T.StringType,
29
+ "double": T.DoubleType,
30
+ "int": T.IntegerType,
31
+ "boolean": T.BooleanType,
32
+ "timestamp": T.TimestampType,
33
+ "date": T.DateType,
34
+ }
35
+
36
+ @staticmethod
37
+ def get_local_spark_session(config: dict | None = None) -> SparkSession:
38
+ """Get a local Spark session configured for Delta Lake.
39
+
40
+ Parameters
41
+ ----------
42
+ config : dict | None, optional
43
+ Additional Spark configuration parameters, by default None.
44
+
45
+ Returns
46
+ -------
47
+ SparkSession
48
+ A configured local Spark session instance with Delta Lake support.
49
+ """
50
+ logger.debug(f"[get_local_spark_session|in] ({config})")
51
+ # PySpark 3.4 uses Scala 2.12, so we need delta-core_2.12
52
+ builder = (
53
+ SparkSession.builder.config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
54
+ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
55
+ .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
56
+ .config("spark.driver.host", "localhost")
57
+ )
58
+
59
+ if config is not None:
60
+ for k, v in config.items():
61
+ builder.config(k, v)
62
+
63
+ spark = builder.getOrCreate()
64
+
65
+ logger.debug(f"[get_local_spark_session|out] => {spark}")
66
+ return spark
67
+
68
+ @staticmethod
69
+ def get_spark_session(config: dict | None = None) -> SparkSession:
70
+ """Get a Spark session based on the environment configuration.
71
+
72
+ Parameters
73
+ ----------
74
+ config : dict | None, optional
75
+ Additional Spark configuration parameters, by default None.
76
+
77
+ Returns
78
+ -------
79
+ SparkSession
80
+ A configured Spark session instance.
81
+ """
82
+ logger.debug(f"[get_spark_session|in] ({config})")
83
+
84
+ if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
85
+ spark: SparkSession = UtilsSpark.get_local_spark_session(config)
86
+ else:
87
+ if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
88
+ active_session = SparkSession.getActiveSession()
89
+ else:
90
+ from awsglue.context import GlueContext # type: ignore # pragma: no cover # noqa: PGH003
91
+
92
+ glueContext = GlueContext(SparkContext.getOrCreate()) # pragma: no cover # noqa: N806
93
+ active_session = glueContext.spark_session # pragma: no cover
94
+
95
+ spark_config = SparkConf()
96
+
97
+ if active_session is not None:
98
+ former_config = active_session.sparkContext.getConf().getAll()
99
+ for entry in former_config:
100
+ spark_config.set(entry[0], entry[1])
101
+ if config is not None:
102
+ for k, v in config.items():
103
+ spark_config.set(k, v)
104
+ spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
105
+ else:
106
+ spark: SparkSession = SparkSession.builder.getOrCreate()
107
+
108
+ logger.debug(f"[get_spark_session|out] => {spark}")
109
+ return spark
110
+
111
+ @staticmethod
112
+ def build_schema_from_dtypes(dtypes_schema: dict[str, str]) -> T.StructType:
113
+ """Build a PySpark StructType schema from a dictionary of data types.
114
+
115
+ Parameters
116
+ ----------
117
+ dtypes_schema : dict[str, str]
118
+ A dictionary mapping field names to their corresponding data type strings.
119
+ Supported types: 'bigint', 'string', 'double', 'int', 'boolean', 'timestamp', 'date'.
120
+
121
+ Returns
122
+ -------
123
+ T.StructType
124
+ A PySpark StructType schema with fields defined by the input dictionary.
125
+ """
126
+ logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
127
+ result = T.StructType()
128
+ for field, dtype in dtypes_schema.items():
129
+ new_type = UtilsSpark.__DTYPES_MAP[dtype]
130
+ result.add(field, new_type(), True) # noqa: FBT003
131
+
132
+ logger.info(f"[build_schema_from_dtypes|out] => {result}")
133
+ return result
@@ -0,0 +1,22 @@
1
+ """Pyspark DataFrame validation implementation module.
2
+
3
+ This module provides the Pyspark-specific implementation of Great Expectations validation.
4
+ """
5
+
6
+ from great_expectations.execution_engine import ExecutionEngine
7
+ from great_expectations.execution_engine import SparkDFExecutionEngine
8
+ from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
9
+
10
+
11
+ class PysparkValidation(GreatExpectationsValidation):
12
+ """Pyspark DataFrame validation implementation."""
13
+
14
+ def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
15
+ """Get the execution engine used by the validation implementation.
16
+
17
+ Returns
18
+ -------
19
+ ExecutionEngine
20
+ The execution engine instance.
21
+ """
22
+ return SparkDFExecutionEngine(batch_data_dict=batch_data_dict)