tgedr-dataops-ext 0.0.1 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tgedr_dataops_ext-0.0.1/PKG-INFO +60 -0
- tgedr_dataops_ext-0.0.1/README.md +47 -0
- tgedr_dataops_ext-0.0.1/pyproject.toml +134 -0
- tgedr_dataops_ext-0.0.1/setup.cfg +4 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/__init__.py +0 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/dataset.py +27 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/metadata.py +236 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/utils_spark.py +133 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/quality/pyspark_validation.py +22 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/source/delta_table_source.py +57 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/source/local_delta_table.py +58 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/source/s3_delta_table.py +83 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/store/spark_delta.py +515 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext.egg-info/PKG-INFO +60 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext.egg-info/SOURCES.txt +16 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext.egg-info/dependency_links.txt +1 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext.egg-info/requires.txt +5 -0
- tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext.egg-info/top_level.txt +1 -0
tgedr_dataops_ext-0.0.1/PKG-INFO

@@ -0,0 +1,60 @@
+Metadata-Version: 2.4
+Name: tgedr-dataops-ext
+Version: 0.0.1
+Summary: this is a template for a python package
+Author-email: developer <developer@email.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: pandas>=2.3.0
+Requires-Dist: deltalake~=0.16.4
+Requires-Dist: delta-spark~=2.4.0
+Requires-Dist: tgedr-dataops>=1.0.1
+Requires-Dist: pyspark~=3.4.0
+
+# tgedr-dataops-ext
+
+
+[](https://pypi.org/project/tgedr-dataops-ext/)
+
+
+data operations related code - extended
+
+## motivation
+*dataops-ext* is a library of tested, in-use code that aligns on common standards for code structure and quality and helps avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops*, providing distributed processing features based on pyspark.
+
+## installation
+`pip install tgedr-dataops-ext`
+
+## package namespaces and their contents
+
+#### commons
+- __Dataset__: immutable class wrapping a dataframe along with its metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
+- __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
+- __UtilsSpark__: utility class for working with spark, mostly helping with session creation ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py))
+
+#### quality
+- __PysparkValidation__: __GreatExpectationsValidation__ implementation to validate pyspark dataframes with the Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
+
+#### source
+
+- __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
+- __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from the local filesystem with python only (pyspark not needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
+- __S3DeltaTable__: __Source__ class used to read delta lake format datasets from an s3 bucket with python only (pyspark not needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py))
+
+
+#### store
+- __SparkDeltaStore__: __Store__ implementation for pyspark distributed processing with the delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py))
+
+
+
+## development
+- main requirements:
+  - _uv_
+  - _bash_
+- clone the repository:
+
+``` bash
+git clone git@github.com:jtviegas/dataops-ext
+```
+- cd into the folder: `cd dataops-ext`
+- install requirements: `./helper.sh reqs`
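Once installed, the metadata shown in this PKG-INFO can be checked from Python with the standard library alone; a minimal sketch (only the distribution name is taken from the diff above, everything else is stock `importlib.metadata`):

```python
# Sketch: inspect the installed distribution's metadata, which mirrors the PKG-INFO above.
from importlib.metadata import metadata, requires

md = metadata("tgedr-dataops-ext")
print(md["Name"], md["Version"])      # tgedr-dataops-ext 0.0.1
print(md["Requires-Python"])          # >=3.11
print(requires("tgedr-dataops-ext"))  # the Requires-Dist entries listed above
```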
tgedr_dataops_ext-0.0.1/README.md

@@ -0,0 +1,47 @@
+# tgedr-dataops-ext
+
+
+[](https://pypi.org/project/tgedr-dataops-ext/)
+
+
+data operations related code - extended
+
+## motivation
+*dataops-ext* is a library of tested, in-use code that aligns on common standards for code structure and quality and helps avoid reinventing the wheel. It builds on top of *dataops-abs* and *dataops*, providing distributed processing features based on pyspark.
+
+## installation
+`pip install tgedr-dataops-ext`
+
+## package namespaces and their contents
+
+#### commons
+- __Dataset__: immutable class wrapping a dataframe along with its metadata ([example](tests/tgedr_dataops_ext/commons/test_dataset.py))
+- __Metadata__: immutable class depicting dataset metadata ([example](tests/tgedr_dataops_ext/commons/test_metadata.py))
+- __UtilsSpark__: utility class for working with spark, mostly helping with session creation ([example](tests/tgedr_dataops_ext/commons/test_utils_spark.py))
+
+#### quality
+- __PysparkValidation__: __GreatExpectationsValidation__ implementation to validate pyspark dataframes with the Great Expectations library ([example](tests/tgedr_dataops_ext/quality/test_pyspark_validation.py))
+
+#### source
+
+- __DeltaTableSource__: abstract __Source__ class used to read delta lake format datasets, returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_delta_table_source.py))
+- __LocalDeltaTable__: __Source__ class used to read delta lake format datasets from the local filesystem with python only (pyspark not needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_local_delta_table.py))
+- __S3DeltaTable__: __Source__ class used to read delta lake format datasets from an s3 bucket with python only (pyspark not needed), returning a pandas dataframe ([example](tests/tgedr_dataops_ext/source/test_s3_delta_table.py))
+
+
+#### store
+- __SparkDeltaStore__: __Store__ implementation for pyspark distributed processing with the delta table format ([example](tests/tgedr_dataops_ext/store/test_spark_delta.py))
+
+
+
+## development
+- main requirements:
+  - _uv_
+  - _bash_
+- clone the repository:
+
+``` bash
+git clone git@github.com:jtviegas/dataops-ext
+```
+- cd into the folder: `cd dataops-ext`
+- install requirements: `./helper.sh reqs`
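The *source* classes described in the README read Delta Lake tables into pandas without Spark. Their exact interface is not part of this diff, so the sketch below uses the declared `deltalake` dependency directly to show the kind of call they presumably wrap; the paths are illustrative.

```python
# Sketch: reading a Delta table into pandas without pyspark, using the `deltalake`
# dependency declared above; LocalDeltaTable / S3DeltaTable presumably wrap calls of
# this kind, but their exact interface is not shown in this diff.
from deltalake import DeltaTable

local_df = DeltaTable("/tmp/my_delta_table").to_pandas()         # local filesystem
s3_df = DeltaTable("s3://my-bucket/my_delta_table").to_pandas()  # s3 bucket (needs AWS credentials)
print(local_df.shape, s3_df.shape)
```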
tgedr_dataops_ext-0.0.1/pyproject.toml

@@ -0,0 +1,134 @@
+[project]
+name = "tgedr-dataops-ext"
+version = "0.0.1"
+description = "this is a template for a python package"
+authors = [
+    {name = "developer",email = "developer@email.com"}
+]
+readme = "README.md"
+requires-python = ">=3.11"
+
+dependencies = [
+    "pandas>=2.3.0",
+    "deltalake~=0.16.4",
+    "delta-spark~=2.4.0",
+    "tgedr-dataops>=1.0.1",
+    "pyspark~=3.4.0"
+]
+[dependency-groups]
+dev = [
+    "pre-commit~=4.2.0",
+    "pytest~=8.3.5",
+    "pytest-bdd~=8.1.0",
+    "pytest-cov~=4.1.0",
+    "pytest-mock~=3.15.0",
+    "ruff==0.9.10",
+    "bandit==1.8.3",
+    "safety==3.5.1",
+    "typer<0.17.0",
+    "genbadge[coverage]>=1.1.3",
+]
+
+# [project.scripts]
+# run = "tgedr.pycommons.entrypoint:entrypoint"
+
+[build-system]
+requires = ["setuptools>=78.1.0", "wheel>=0.45.1"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools]
+include-package-data = true
+
+[tool.setuptools.package-data]
+"*" = ["CHANGELOG"]
+
+[tool.coverage.paths]
+source = ["src/"]
+
+[tool.coverage.run]
+source = ["src/"]
+include = ["src/*"]
+omit = [
+    "*/tests/*",
+    "*/test_*",
+    "*/__pycache__/*",
+    "*/migrations/*",
+    "*/venv/*",
+    "*/.venv/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+]
+show_missing = true
+skip_covered = false
+skip_empty = false
+
+[tool.pytest.ini_options]
+# bdd_features_base_dir = "documentation/features"
+pythonpath = "."
+
+[tool.ruff]
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+    "tests/",
+    "typings/"
+]
+
+line-length = 120
+indent-width = 4
+
+[tool.ruff.lint]
+select = ["ALL"]
+ignore = ["D203", "S101", "D104", "INP001", "D213", "COM812", "I001",
+    "D401", "D407", "RET504", "PLR2004", "FA102", "E501", "EXE002", "PLR0913",
+    "PLR0912", "C901", "PLR0911", "D413", "N818", "B024", "ANN401", "SIM300",
+    "FBT001", "FBT002", "G004", "TRY003", "EM102", "EM101", "PD015", "PD901"]
+fixable = ["ALL"]
+unfixable = []
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/__init__.py

File without changes
tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/dataset.py

@@ -0,0 +1,27 @@
+"""Dataset module for wrapping dataframes with metadata.
+
+This module provides:
+- Dataset: an immutable dataclass that combines a DataFrame with Metadata.
+"""
+
+from dataclasses import dataclass
+import json
+from pyspark.sql import DataFrame
+from tgedr_dataops_ext.commons.metadata import Metadata
+
+
+@dataclass(frozen=True)
+class Dataset:
+    """Utility immutable class to wrap up a dataframe along with metadata."""
+
+    __slots__ = ["data", "metadata"]
+    metadata: Metadata
+    data: DataFrame
+
+    def as_dict(self) -> dict:
+        """Serialize the dataset as a dictionary."""
+        return {"metadata": self.metadata.as_dict(), "data": str(self.data.__repr__)}
+
+    def __str__(self) -> str:
+        """Serialize the dataset as a json string."""
+        return json.dumps(self.as_dict())
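A minimal usage sketch for the `Dataset` wrapper above; it assumes a working local Spark installation, and the `Metadata` constructor arguments follow the `metadata.py` module shown next.

```python
# Sketch: wrapping a small Spark dataframe with metadata using the classes above.
from pyspark.sql import SparkSession

from tgedr_dataops_ext.commons.dataset import Dataset
from tgedr_dataops_ext.commons.metadata import Metadata

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

# Metadata fields: name, version, framing, sources (see metadata.py below).
meta = Metadata(name="toy", version="0.0.1", framing=None, sources=None)
ds = Dataset(metadata=meta, data=df)

print(ds.as_dict())  # {'metadata': {'name': 'toy', 'version': '0.0.1'}, 'data': '<bound method DataFrame.__repr__ ...>'}
print(str(ds))       # same structure serialized as a json string
```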
tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/metadata.py

@@ -0,0 +1,236 @@
+"""Module for dataset metadata classes."""
+
+from dataclasses import dataclass
+import json
+
+
+@dataclass(frozen=True)
+class FieldFrame:
+    """Class depicting a field values range, to be used in metadata.
+
+    Parameters
+    ----------
+    field : str
+        The name of the field.
+    lower : Union[int, str, float]
+        Field lower bound.
+    upper : Union[int, str, float]
+        Field upper bound.
+
+    """
+
+    __slots__ = ["field", "lower", "upper"]
+    field: str
+    lower: int | str | float
+    upper: int | str | float
+
+    def as_dict(self) -> dict[str, int | str | float]:
+        """Convert the FieldFrame to a dictionary representation.
+
+        Returns
+        -------
+        dict[str, int | str | float]
+            A dictionary containing the field name, lower bound, and upper bound.
+        """
+        return {"field": self.field, "lower": self.lower, "upper": self.upper}
+
+    @staticmethod
+    def from_str(src: str) -> "FieldFrame":
+        """Create a FieldFrame instance from a JSON string.
+
+        Parameters
+        ----------
+        src : str
+            A JSON string representation of a FieldFrame.
+
+        Returns
+        -------
+        FieldFrame
+            A new FieldFrame instance created from the JSON string.
+        """
+        r = json.loads(src)
+        field = r["field"]
+        lower = r["lower"]
+        upper = r["upper"]
+        return FieldFrame(field=field, lower=lower, upper=upper)
+
+    def __str__(self) -> str:
+        """Return a JSON string representation of the FieldFrame."""
+        return json.dumps(self.as_dict())
+
+    def __eq__(self, other: "FieldFrame") -> bool:
+        """Check equality between two FieldFrame instances."""
+        return self.field == other.field and self.lower == other.lower and self.upper == other.upper
+
+    def __gt__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is greater than another FieldFrame."""
+        return self.field > other.field or (
+            self.field == other.field
+            and (self.lower > other.lower or (self.lower == other.lower and self.upper > other.upper))
+        )
+
+    def __ne__(self, other: "FieldFrame") -> bool:
+        """Check inequality between two FieldFrame instances."""
+        return not other == self
+
+    def __ge__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is greater than or equal to another FieldFrame."""
+        return other == self or self > other
+
+    def __le__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is less than or equal to another FieldFrame."""
+        return other == self or self < other
+
+    def __lt__(self, other: "FieldFrame") -> bool:
+        """Check if this FieldFrame is less than another FieldFrame."""
+        return other > self
+
+
+@dataclass(frozen=True)
+class Metadata:
+    """Immutable class depicting dataset metadata.
+
+    Parameters
+    ----------
+    name : str
+        The name of the dataset.
+    version : Optional[str]
+        Version of this dataset, if available.
+    framing : Optional[List[FieldFrame]]
+        Multiple field frames.
+    sources : Optional[List["Metadata"]]
+        Metadatas related to the datasets sourcing this one.
+
+    """
+
+    __slots__ = ["framing", "name", "sources", "version"]
+    name: str
+    version: str | None
+    framing: list[FieldFrame] | None
+    sources: list["Metadata"] | None
+
+    def as_dict(self) -> dict:
+        """Convert the Metadata to a dictionary representation.
+
+        Returns
+        -------
+        dict
+            A dictionary containing the metadata fields including name, version,
+            framing, and sources if they are not None.
+        """
+        result = {"name": self.name}
+        if self.version is not None:
+            result["version"] = self.version
+        if self.framing is not None:
+            result["framing"] = []
+            for f in self.framing:
+                (result["framing"]).append(f.as_dict())
+        if self.sources is not None:
+            result["sources"] = []
+            for source in self.sources:
+                (result["sources"]).append(source.as_dict())
+
+        return result
+
+    def __str__(self) -> str:
+        """Return a JSON string representation of the Metadata."""
+        return json.dumps(self.as_dict())
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality between two Metadata instances."""
+        return (
+            self.name == other.name
+            and (
+                (self.version is None and other.version is None)
+                or ((self.version is not None and other.version is not None) and self.version == other.version)
+            )
+            and (
+                (self.framing is None and other.framing is None)
+                or (
+                    (self.framing is not None and other.framing is not None)
+                    and sorted(self.framing) == sorted(other.framing)
+                )
+            )
+            and (
+                (self.sources is None and other.sources is None)
+                or (
+                    (self.sources is not None and other.sources is not None)
+                    and sorted(self.sources) == sorted(other.sources)
+                )
+            )
+        )
+
+    def __gt__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is greater than another Metadata."""
+        return self.name > other.name or (
+            self.name == other.name
+            and (
+                ((self.version is not None and other.version is not None) and (self.version > other.version))
+                or (self.version is not None and other.version is None)
+                or (
+                    (
+                        (self.framing is not None and other.framing is not None)
+                        and (sorted(self.framing) > sorted(other.framing))
+                    )
+                    or (self.framing is not None and other.framing is None)
+                    or (
+                        (
+                            (self.sources is not None and other.sources is not None)
+                            and (sorted(self.sources) > sorted(other.sources))
+                        )
+                        or (self.sources is not None and other.sources is None)
+                    )
+                )
+            )
+        )
+
+    def __ne__(self, other: "Metadata") -> bool:
+        """Check inequality between two Metadata instances."""
+        return not other == self
+
+    def __ge__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is greater than or equal to another Metadata."""
+        return other == self or self > other
+
+    def __le__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is less than or equal to another Metadata."""
+        return other == self or self < other
+
+    def __lt__(self, other: "Metadata") -> bool:
+        """Check if this Metadata is less than another Metadata."""
+        return other > self
+
+    @staticmethod
+    def from_str(src: str) -> "Metadata":
+        """Create a Metadata instance from a JSON string.
+
+        Parameters
+        ----------
+        src : str
+            A JSON string representation of a Metadata object.
+
+        Returns
+        -------
+        Metadata
+            A new Metadata instance created from the JSON string.
+        """
+        r = json.loads(src)
+        name = r["name"]
+        version = r.get("version", None)
+
+        framing = None
+        framing_entries = r.get("framing", None)
+        if framing_entries is not None:
+            framing = []
+            for framing_entry in framing_entries:
+                framing.append(FieldFrame.from_str(json.dumps(framing_entry)))
+
+        sources = None
+        sources_entries = r.get("sources", None)
+        if sources_entries is not None:
+            sources = []
+            for source_entry in sources_entries:
+                source_entry_as_str = json.dumps(source_entry)
+                sources.append(Metadata.from_str(source_entry_as_str))
+
+        return Metadata(name=name, version=version, framing=framing, sources=sources)
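A small round-trip sketch for the serialization helpers defined above (`as_dict`, `__str__`, `from_str`); the field names and values are illustrative only.

```python
# Sketch: json round trip of Metadata with a FieldFrame framing entry, using the helpers above.
import json

from tgedr_dataops_ext.commons.metadata import FieldFrame, Metadata

frame = FieldFrame(field="event_date", lower="2024-01-01", upper="2024-12-31")
meta = Metadata(name="events", version="1", framing=[frame], sources=None)

as_json = str(meta)                    # __str__ serializes via as_dict / json.dumps
restored = Metadata.from_str(as_json)  # parsed back into an equivalent instance
assert restored == meta                # custom __eq__ compares name, version, framing, sources

print(json.dumps(restored.as_dict(), indent=2))
```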
tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/commons/utils_spark.py

@@ -0,0 +1,133 @@
+"""Spark utility functions and classes.
+
+This module provides:
+- UtilsSpark: A utility class with handy functions to work with Spark sessions,
+  including creating local and remote sessions with Delta Lake support,
+  and building PySpark schemas from data type dictionaries.
+"""
+
+import logging
+import os
+from typing import ClassVar
+from pyspark.sql import SparkSession
+from pyspark import SparkConf
+from pyspark.sql import types as T  # noqa: N812
+from pyspark.context import SparkContext
+
+
+logger = logging.getLogger(__name__)
+
+
+class UtilsSpark:
+    """class with handy functions to work with spark."""
+
+    __ENV_KEY_PYSPARK_IS_LOCAL = "PYSPARK_IS_LOCAL"
+    __ENV_KEY_NOT_AWS_CLOUD = "NOT_AWS_CLOUD"
+    __DTYPES_MAP: ClassVar[dict[str, type]] = {
+        "bigint": T.LongType,
+        "string": T.StringType,
+        "double": T.DoubleType,
+        "int": T.IntegerType,
+        "boolean": T.BooleanType,
+        "timestamp": T.TimestampType,
+        "date": T.DateType,
+    }
+
+    @staticmethod
+    def get_local_spark_session(config: dict | None = None) -> SparkSession:
+        """Get a local Spark session configured for Delta Lake.
+
+        Parameters
+        ----------
+        config : dict | None, optional
+            Additional Spark configuration parameters, by default None.
+
+        Returns
+        -------
+        SparkSession
+            A configured local Spark session instance with Delta Lake support.
+        """
+        logger.debug(f"[get_local_spark_session|in] ({config})")
+        # PySpark 3.4 uses Scala 2.12, so we need delta-core_2.12
+        builder = (
+            SparkSession.builder.config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
+            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
+            .config("spark.driver.host", "localhost")
+        )
+
+        if config is not None:
+            for k, v in config.items():
+                builder.config(k, v)
+
+        spark = builder.getOrCreate()
+
+        logger.debug(f"[get_local_spark_session|out] => {spark}")
+        return spark
+
+    @staticmethod
+    def get_spark_session(config: dict | None = None) -> SparkSession:
+        """Get a Spark session based on the environment configuration.
+
+        Parameters
+        ----------
+        config : dict | None, optional
+            Additional Spark configuration parameters, by default None.
+
+        Returns
+        -------
+        SparkSession
+            A configured Spark session instance.
+        """
+        logger.debug(f"[get_spark_session|in] ({config})")
+
+        if "1" == os.getenv(UtilsSpark.__ENV_KEY_PYSPARK_IS_LOCAL):
+            spark: SparkSession = UtilsSpark.get_local_spark_session(config)
+        else:
+            if "1" == os.getenv(UtilsSpark.__ENV_KEY_NOT_AWS_CLOUD):
+                active_session = SparkSession.getActiveSession()
+            else:
+                from awsglue.context import GlueContext  # type: ignore # pragma: no cover # noqa: PGH003
+
+                glueContext = GlueContext(SparkContext.getOrCreate())  # pragma: no cover # noqa: N806
+                active_session = glueContext.spark_session  # pragma: no cover
+
+            spark_config = SparkConf()
+
+            if active_session is not None:
+                former_config = active_session.sparkContext.getConf().getAll()
+                for entry in former_config:
+                    spark_config.set(entry[0], entry[1])
+                if config is not None:
+                    for k, v in config.items():
+                        spark_config.set(k, v)
+                spark: SparkSession = SparkSession.builder.config(conf=spark_config).getOrCreate()
+            else:
+                spark: SparkSession = SparkSession.builder.getOrCreate()
+
+        logger.debug(f"[get_spark_session|out] => {spark}")
+        return spark
+
+    @staticmethod
+    def build_schema_from_dtypes(dtypes_schema: dict[str, str]) -> T.StructType:
+        """Build a PySpark StructType schema from a dictionary of data types.
+
+        Parameters
+        ----------
+        dtypes_schema : dict[str, str]
+            A dictionary mapping field names to their corresponding data type strings.
+            Supported types: 'bigint', 'string', 'double', 'int', 'boolean', 'timestamp', 'date'.
+
+        Returns
+        -------
+        T.StructType
+            A PySpark StructType schema with fields defined by the input dictionary.
+        """
+        logger.info(f"[build_schema_from_dtypes|in] ({dtypes_schema})")
+        result = T.StructType()
+        for field, dtype in dtypes_schema.items():
+            new_type = UtilsSpark.__DTYPES_MAP[dtype]
+            result.add(field, new_type(), True)  # noqa: FBT003
+
+        logger.info(f"[build_schema_from_dtypes|out] => {result}")
+        return result
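A short sketch combining the two public helpers above; it assumes a local environment with Java available and network access so that the `delta-core` package referenced in `get_local_spark_session` can be resolved.

```python
# Sketch: build a schema from dtype strings and create a local Delta-enabled session.
import os

from tgedr_dataops_ext.commons.utils_spark import UtilsSpark

# With PYSPARK_IS_LOCAL=1, get_spark_session delegates to get_local_spark_session (see above).
os.environ["PYSPARK_IS_LOCAL"] = "1"

schema = UtilsSpark.build_schema_from_dtypes({"id": "bigint", "name": "string", "score": "double"})
spark = UtilsSpark.get_spark_session({"spark.sql.shuffle.partitions": "1"})

df = spark.createDataFrame([(1, "a", 0.5), (2, "b", 1.5)], schema=schema)
df.show()
```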
tgedr_dataops_ext-0.0.1/src/tgedr_dataops_ext/quality/pyspark_validation.py

@@ -0,0 +1,22 @@
+"""Pyspark DataFrame validation implementation module.
+
+This module provides the Pyspark-specific implementation of Great Expectations validation.
+"""
+
+from great_expectations.execution_engine import ExecutionEngine
+from great_expectations.execution_engine import SparkDFExecutionEngine
+from tgedr_dataops_abs.great_expectations_validation import GreatExpectationsValidation
+
+
+class PysparkValidation(GreatExpectationsValidation):
+    """Pyspark DataFrame validation implementation."""
+
+    def _get_execution_engine(self, batch_data_dict: dict) -> ExecutionEngine:
+        """Get the execution engine used by the validation implementation.
+
+        Returns
+        -------
+        ExecutionEngine
+            The execution engine instance.
+        """
+        return SparkDFExecutionEngine(batch_data_dict=batch_data_dict)