cloe-nessy 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/transform_clean_column_names.py +77 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +18 -1
- cloe_nessy/session/session_manager.py +13 -7
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/METADATA +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/RECORD +8 -7
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/WHEEL +1 -1
- {cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/top_level.txt +0 -0
cloe_nessy/pipeline/actions/__init__.py

@@ -7,6 +7,7 @@ from .read_excel import ReadExcelAction
 from .read_files import ReadFilesAction
 from .read_metadata_yaml import ReadMetadataYAMLAction
 from .transform_change_datatype import TransformChangeDatatypeAction
+from .transform_clean_column_names import TransformCleanColumnNamesAction
 from .transform_concat_columns import TransformConcatColumnsAction
 from .transform_decode import TransformDecodeAction
 from .transform_distinct import TransformDistinctAction
@@ -39,6 +40,7 @@ __all__ = [
     "TransformFilterAction",
     "TransformUnionAction",
     "TransformChangeDatatypeAction",
+    "TransformCleanColumnNamesAction",
     "TransformConcatColumnsAction",
     "TransformDecodeAction",
     "TransformDistinctAction",
cloe_nessy/pipeline/actions/transform_clean_column_names.py

@@ -0,0 +1,77 @@
+import json
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+import pyspark.sql.types as T
+
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class TransformCleanColumnNamesAction(PipelineAction):
+    """Fixes column names in the DataFrame to be valid.
+
+    Removes invalid characters from the column names, including the fields of a struct and
+    replaces a single leading underscore by a double underscore.
+
+    Example:
+        ```yaml
+        Clean Column Names:
+            action: TRANSFORM_CLEAN_COLUMN_NAMES
+        ```
+    """
+
+    name: str = "TRANSFORM_CLEAN_COLUMN_NAMES"
+
+    def run(
+        self,
+        context: PipelineContext,
+        **_: Any,
+    ) -> PipelineContext:
+        """Fixes column names in the DataFrame to be valid.
+
+        Removes invalid characters from the column names, including the fields of a struct and
+        replaces a single leading underscore by a double underscore.
+
+        Args:
+            context: The context in which this Action is executed.
+
+        Raises:
+            ValueError: If the data from the context is None.
+
+        Returns:
+            The context after the execution of this Action, containing the DataFrame with cleaned column names.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+
+        with_columns_renamed = {}
+        with_columns_casted: dict[str, T.StructType | T.ArrayType | T.MapType] = {}
+
+        single_underscrore_at_beginning = r"^_(?=[^_])"
+
+        for c in context.data.schema:
+            old_name = c.name
+            new_name = re.sub(single_underscrore_at_beginning, "__", re.sub("\W", "_", old_name))
+            with_columns_renamed[old_name] = new_name
+
+            if isinstance(c.dataType, (T.StructType | T.ArrayType | T.MapType)):
+                old_column_schema = c.dataType.json()
+                new_column_schema = re.sub(
+                    r'(?<="name":")[^"]+',
+                    lambda m: re.sub("\W", "_", str(m.group())),
+                    old_column_schema,
+                )
+                if isinstance(c.dataType, T.StructType):
+                    with_columns_casted[new_name] = T.StructType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.ArrayType):
+                    with_columns_casted[new_name] = T.ArrayType.fromJson(json.loads(new_column_schema))
+                elif isinstance(c.dataType, T.MapType):
+                    with_columns_casted[new_name] = T.MapType.fromJson(json.loads(new_column_schema))
+
+        df = context.data.withColumnsRenamed(with_columns_renamed)
+        for c_name, c_type in with_columns_casted.items():
+            df = df.withColumn(c_name, F.col(c_name).cast(c_type))
+
+        return context.from_existing(data=df)  # type: ignore
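To illustrate the renaming rule this new action applies, here is a minimal sketch of the same two regex substitutions on plain strings (the sample column names are made up for illustration; the Spark-specific struct/array/map handling shown above is not exercised):

```python
import re

# Rule 1: replace every non-word character with "_".
# Rule 2: turn a single leading underscore into a double underscore.
single_underscore_at_beginning = r"^_(?=[^_])"


def clean(name: str) -> str:
    return re.sub(single_underscore_at_beginning, "__", re.sub(r"\W", "_", name))


# Hypothetical column names for illustration only.
for old in ["order id", "amount (EUR)", "_loaded_at", "__meta", "cust.name"]:
    print(f"{old!r} -> {clean(old)!r}")
# 'order id'     -> 'order_id'
# 'amount (EUR)' -> 'amount__EUR_'
# '_loaded_at'   -> '__loaded_at'
# '__meta'       -> '__meta'      (already starts with two underscores, left as is)
# 'cust.name'    -> 'cust_name'
```

For nested columns the action additionally rewrites every `"name"` entry in the dataType's JSON representation with the same non-word-character substitution and casts the renamed column to the cleaned StructType/ArrayType/MapType, so inner field names are cleaned as well.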
cloe_nessy/pipeline/actions/transform_distinct.py

@@ -7,10 +7,16 @@ from ..pipeline_context import PipelineContext
 class TransformDistinctAction(PipelineAction):
     """Selects distinct rows from the DataFrame in the given context.
 
+    If a subset is given these columns are used for duplicate comparison. If no subset is given all columns are used.
+
     Example:
         ```yaml
         Decode Columns:
             action: TRANSFORM_DISTINCT
+            options:
+                subset:
+                    - first_name
+                    - last_name
         ```
     """
 
@@ -19,12 +25,15 @@ class TransformDistinctAction(PipelineAction):
     def run(
         self,
         context: PipelineContext,
+        *,
+        subset: list[str] | None = None,
         **_: Any,
     ) -> PipelineContext:
         """Selects distinct rows from the DataFrame in the given context.
 
         Args:
             context: The context in which this Action is executed.
+            subset: List of column names to use for duplicate comparison (default All columns).
 
         Raises:
             ValueError: If the data from the context is None.
@@ -35,6 +44,14 @@ class TransformDistinctAction(PipelineAction):
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")
 
-
+        # check if all columns that are part of the subset are actually part of the dataframe.
+        if subset is not None:
+            subset_columns_not_in_dataframe = set(subset) - set(context.data.columns)
+            if len(subset_columns_not_in_dataframe) != 0:
+                raise ValueError(
+                    f"The following subset columns are not part of the dataframe: {subset_columns_not_in_dataframe}"
+                )
+
+        df = context.data.dropDuplicates(subset=subset)
 
         return context.from_existing(data=df)  # type: ignore
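To see what the new `subset` option changes at the DataFrame level, here is a minimal PySpark sketch; the local session and the example rows are assumptions for illustration only:

```python
from pyspark.sql import SparkSession

# Local session and example data, for illustration only.
spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [("Ada", "Lovelace", 1), ("Ada", "Lovelace", 2), ("Alan", "Turing", 3)],
    ["first_name", "last_name", "id"],
)

# With a subset, only the listed columns are compared: one of the two
# "Ada Lovelace" rows is dropped even though their "id" values differ.
df.dropDuplicates(subset=["first_name", "last_name"]).show()

# Without a subset the action calls dropDuplicates(subset=None),
# which compares all columns, so all three rows survive here.
df.dropDuplicates().show()
```

The added validation also makes a typo in a subset column fail fast with a ValueError that lists the unknown columns.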
cloe_nessy/session/session_manager.py

@@ -63,13 +63,13 @@ class SessionManager:
     @classmethod
     def get_utils(
         cls,
-    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils
-        """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
+    ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils, RemoteDbUtils].
+        """Get or create a DBUtils, RemoteDbUtils or MsSparkUtils instance, depending on the context.
 
-        In Databricks this will return DBUtils,
+        In Databricks this will return DBUtils, when using Databricks-Connect it returns RemoteDbUtils, and in Fabric it will return MsSparkUtils.
 
         Returns:
-            utils: The DBUtils or MsSparkUtils instance.
+            utils: The DBUtils, RemoteDbUtils or MsSparkUtils instance.
 
         Raises:
             RuntimeError: If the instance cannot be created.
@@ -88,19 +88,25 @@ class SessionManager:
         }
 
         try:
-            cls._utils = utils_function[cls._env](
+            cls._utils = utils_function[cls._env]()  # type: ignore
         except Exception as e:
            raise RuntimeError(f"Cannot create utils instance. Error: {e}") from e
 
         return cls._utils
 
+    @classmethod
     def _get_dbutils(cls):
+        if cls._env == cls.Environment.DATABRICKS_CONNECT:
+            from databricks.sdk import WorkspaceClient
+
+            return WorkspaceClient().dbutils
+
         from pyspark.dbutils import DBUtils
 
         cls.get_spark_session()
-
-        return utils
+        return DBUtils(cls._spark)
 
+    @classmethod
     def _get_mssparkutils(cls):
         from notebookutils import mssparkutils  # type: ignore
 
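The notable change here is the Databricks-Connect branch in `_get_dbutils`, alongside the added `@classmethod` decorators and the argument-less `utils_function[cls._env]()` dispatch. Below is a simplified, standalone sketch of that selection logic; the function name, the boolean flag and the `spark` parameter are made up for illustration, while the real code keeps this state on `SessionManager`:

```python
from typing import Any


def get_dbutils(spark: Any, using_databricks_connect: bool) -> Any:
    """Standalone sketch of the branch added to SessionManager._get_dbutils."""
    if using_databricks_connect:
        # Databricks-Connect: return the Databricks SDK's dbutils (RemoteDbUtils),
        # obtained through a WorkspaceClient, since pyspark.dbutils is typically
        # not importable in a Databricks-Connect client.
        from databricks.sdk import WorkspaceClient

        return WorkspaceClient().dbutils

    # On a Databricks cluster: bind the classic DBUtils to the active session.
    from pyspark.dbutils import DBUtils

    return DBUtils(spark)
```

`WorkspaceClient().dbutils` resolves to the SDK's RemoteDbUtils, which is what the updated docstring refers to.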
{cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/RECORD

@@ -47,17 +47,18 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
 cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
 cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
-cloe_nessy/pipeline/actions/__init__.py,sha256=
+cloe_nessy/pipeline/actions/__init__.py,sha256=LwKctXy4Jun52BnCVGvWa8nnKVjTSov4GT58j6Zy8zg,2273
 cloe_nessy/pipeline/actions/read_api.py,sha256=wGyPZdeh3Cam_BQBilltWBWCIdD9I_kv4lunEhE39Tg,6625
 cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=aZy4sJLLE8ZQ_SPXGSDoHYaBJTz8s7xQDVn5eYrYHvE,2689
 cloe_nessy/pipeline/actions/read_excel.py,sha256=EgHbK1wO6dkDo0KErYDhK_2sNIkIoa-6As9oo9dNFsE,7708
 cloe_nessy/pipeline/actions/read_files.py,sha256=8twjprqKYEmVu5QITEGe4no45TfhgzZosTFVQ89vV6g,3861
 cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=aZtkstf9jBYYN2MGnazz63BG_hJ7mIgAfKiNqUpc26E,2235
 cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=Nz3Ncr-Zd-wy8g9-aN5XcvpWAHLyWs70RpZ7KqKqIaU,1788
+cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=XuVAVEbp-UiF8PO6wAEJyl1TYgBD7MSnuOGhuEvXKv4,2881
 cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=V0TzeQFpBYur_T1Nv0nRpOU02nKQ2iypo2CCcV2rBtk,3083
 cloe_nessy/pipeline/actions/transform_decode.py,sha256=DmT-29dIqbz_xTj4GSCfnbgYRCiUrWzKvGrRYy1frNw,4004
 cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=2VN5_wza7sD7fERyG6ElGh_Yo-W-Mxw-QBmtDXs1MGQ,5063
-cloe_nessy/pipeline/actions/transform_distinct.py,sha256=
+cloe_nessy/pipeline/actions/transform_distinct.py,sha256=R0Wv_YnWOw198r0rPR_72fgH5sp7upgjZzfOPTZ1oPA,1942
 cloe_nessy/pipeline/actions/transform_filter.py,sha256=vOAxKtNWCABLb6G6Xz98NK7fEfgn6QJia31S7IvoUTg,1428
 cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=cli59HCERFge7f0RB8yXw2oDtHSbMCWQMdeCeqhbdg8,2355
 cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=HcY4sqb2yNBCz90jQtxGA8fZPuQXfJuaDmv8lWuoTqg,4050
@@ -69,12 +70,12 @@ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBE
 cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
 cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
-cloe_nessy/session/session_manager.py,sha256=
+cloe_nessy/session/session_manager.py,sha256=PK7awMc6fmot7f9FMmvIUbIzKFgjcy2o2bZS9kjVs10,6733
 cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
-cloe_nessy-0.3.
+cloe_nessy-0.3.3.dist-info/METADATA,sha256=StCfl2I5dItaMzO10u3gQw6WxfjZUZHRIodEvKuQu_s,1837
+cloe_nessy-0.3.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+cloe_nessy-0.3.3.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+cloe_nessy-0.3.3.dist-info/RECORD,,
{cloe_nessy-0.3.2.dist-info → cloe_nessy-0.3.3.dist-info}/top_level.txt: File without changes