cloe-nessy 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry to which they were published. It is provided for informational purposes only.
- cloe_nessy/integration/reader/excel_reader.py +1 -1
- cloe_nessy/integration/reader/file_reader.py +2 -1
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/transform_regex_extract.py +169 -0
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/METADATA +2 -2
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/RECORD +7 -6
- {cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/WHEEL +0 -0
cloe_nessy/integration/reader/file_reader.py

```diff
@@ -192,7 +192,8 @@ class FileReader(BaseReader):
         """Add all metadata columns to the DataFrame."""
         metadata_columns = df.select("_metadata.*").columns
 
-        entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
+        # Cast all metadata values to strings to ensure type consistency in the map
+        entries = [(F.lit(field), F.col(f"_metadata.{field}").cast("string")) for field in metadata_columns]
         flat_list = [item for tup in entries for item in tup]
 
         df = df.withColumn("__metadata", F.create_map(flat_list))
```
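The cast matters because `F.create_map` produces a `MapType` with a single value type, while the fields of the `_metadata` struct are of mixed types (file sizes are longs, modification times are timestamps); whether Spark coerces mixed map values implicitly is mode-dependent, and under ANSI mode `create_map` rejects them. A minimal sketch of the pattern, using a hypothetical `meta` struct as a stand-in for Spark's `_metadata` column:

```python
# Sketch: flatten a struct into a string-valued map, as in the patched FileReader.
# The `meta` struct and its fields are illustrative stand-ins for `_metadata`.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a.csv", 123)], ["file_path", "file_size"]).select(
    F.struct("file_path", "file_size").alias("meta")
)

fields = df.select("meta.*").columns
# Cast every value to string so create_map gets one uniform value type.
entries = [(F.lit(field), F.col(f"meta.{field}").cast("string")) for field in fields]
flat_list = [item for tup in entries for item in tup]
df.withColumn("__meta_map", F.create_map(flat_list)).show(truncate=False)
```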
cloe_nessy/pipeline/actions/__init__.py

```diff
@@ -19,6 +19,7 @@ from .transform_group_aggregate import TransformGroupAggregate
 from .transform_hash_columns import TransformHashColumnsAction
 from .transform_join import TransformJoinAction
 from .transform_json_normalize import TransformJsonNormalize
+from .transform_regex_extract import TransformRegexExtract
 from .transform_rename_columns import TransformRenameColumnsAction
 from .transform_replace_values import TransformReplaceValuesAction
 from .transform_select_columns import TransformSelectColumnsAction
@@ -56,6 +57,7 @@ __all__ = [
     "TransformGroupAggregate",
     "TransformJoinAction",
     "TransformJsonNormalize",
+    "TransformRegexExtract",
     "TransformRenameColumnsAction",
     "TransformReplaceValuesAction",
     "TransformSelectColumnsAction",
```
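With the new import and `__all__` entry, the action becomes part of the package's public actions API (a minimal sketch; pipeline YAML refers to the action by its name rather than by import):

```python
# The registered action can now be imported directly from the actions package;
# in pipeline YAML it is referenced by its name, TRANSFORM_REGEX_EXTRACT.
from cloe_nessy.pipeline.actions import TransformRegexExtract
```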
cloe_nessy/pipeline/actions/transform_regex_extract.py

````diff
@@ -0,0 +1,169 @@
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+
+from cloe_nessy.pipeline.pipeline_action import PipelineAction
+from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+class TransformRegexExtract(PipelineAction):
+    r"""Extract values from a specified column in a DataFrame using regex patterns.
+
+    This action extracts values from a column based on a regex pattern and stores
+    the result in a new column. Optionally, you can replace the matched pattern in
+    the original column with a different string, remove the original column, or add
+    a boolean column indicating which rows matched the pattern.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                source_column_name: Email
+                extract_column_name: org_domain
+                pattern: (?<=@)([A-Za-z0-9-]+)
+                replace_by: exampledomain.org
+        ```
+
+    This action also supports processing multiple columns simultaneously. To use this
+    functionality, structure the configuration as a dictionary mapping each source
+    column name to its extraction parameters.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                extract_columns:
+                    Name:
+                        pattern: (?<=\w+) (\w+)
+                        replace_by: ''
+                        extract_column_name: last_name
+                        match_info_column_name: has_last_name
+                    Email:
+                        pattern: @\w+\.\w+
+                        extract_column_name: domain
+                        keep_original_column: False
+        ```
+
+    """
+
+    name: str = "TRANSFORM_REGEX_EXTRACT"
+
+    def run(
+        self,
+        context: PipelineContext,
+        source_column_name: str = "",
+        extract_column_name: str = "",
+        pattern: str = "",
+        keep_original_column: bool = True,
+        replace_by: str = "",
+        match_info_column_name: str = "",
+        extract_columns: dict | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Performs a regex extract (and replace) on a specified column in a DataFrame.
+
+        This function performs a regex extract (and optionally a replace) on one or more columns.
+
+        Args:
+            context: The context in which this action is executed.
+            source_column_name: Column name to perform the regex extract (and replace) on.
+            pattern: Regex pattern to match.
+            replace_by: String that should replace the extracted pattern in the source column.
+            extract_column_name: Column name to store the extract, default: <source_column_name>_extracted
+            keep_original_column: Whether to keep the original column, default: True
+            match_info_column_name: Column name for a boolean column indicating whether a match was found, default: ""
+            extract_columns: Dictionary mapping source column names to the parameters of the single-column case.
+
+        Raises:
+            ValueError: If any of the required arguments are not provided.
+            ValueError: If the regex pattern is invalid.
+
+        Returns:
+            PipelineContext: Transformed context with the modified DataFrame.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+        if not extract_columns and not source_column_name:
+            raise ValueError("Either extract_columns or source_column_name must be provided.")
+
+        df = context.data
+
+        if source_column_name:
+            self._console_logger.info(f"Extracting from column '{source_column_name}' using pattern: {pattern}")
+            df = self._process_one_column(
+                df,
+                source_column_name,
+                pattern,
+                extract_column_name,
+                replace_by,
+                keep_original_column,
+                match_info_column_name,
+            )
+
+        elif isinstance(extract_columns, dict):
+            self._console_logger.info(f"Extracting from {len(extract_columns)} columns")
+            for one_source_column_name in extract_columns:
+                parameter_dict = self._get_default_dict() | extract_columns[one_source_column_name]
+                df = self._process_one_column(df, one_source_column_name, **parameter_dict)
+
+        else:
+            raise ValueError("extract_columns must be a dictionary. See documentation for proper format.")
+
+        return context.from_existing(data=df)
+
+    def _process_one_column(
+        self,
+        df,
+        source_column_name,
+        pattern,
+        extract_column_name,
+        replace_by,
+        keep_original_column,
+        match_info_column_name,
+    ):
+        # Group index 0 extracts the entire match rather than an individual capture group
+        matched_group_id = 0
+
+        if not extract_column_name:
+            extract_column_name = f"{source_column_name}_extracted"
+
+        if not pattern:
+            raise ValueError(f"The regex pattern (pattern) for column {source_column_name} must be provided.")
+
+        # Validate regex pattern
+        try:
+            re.compile(pattern)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern '{pattern}' for column {source_column_name}: {e}") from e
+
+        df = df.withColumn(extract_column_name, F.regexp_extract(source_column_name, pattern, matched_group_id))
+
+        if replace_by:
+            df = df.withColumn(source_column_name, F.regexp_replace(source_column_name, pattern, replace_by))
+
+        if match_info_column_name:
+            # Check if extraction is null or empty string
+            df = df.withColumn(
+                match_info_column_name,
+                F.when((F.col(extract_column_name).isNull()) | (F.col(extract_column_name) == ""), False).otherwise(
+                    True
+                ),
+            )
+
+        if not keep_original_column:
+            df = df.drop(source_column_name)
+
+        return df
+
+    def _get_default_dict(self) -> dict[str, Any]:
+        """Return default parameters for single column extraction."""
+        return {
+            "pattern": "",
+            "extract_column_name": "",
+            "replace_by": "",
+            "keep_original_column": True,
+            "match_info_column_name": "",
+        }
````
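For a concrete sense of what the single-column configuration in the docstring produces, here is a minimal stand-alone sketch of the same transformation using plain `regexp_extract`/`regexp_replace` calls, without the pipeline plumbing (the sample row is hypothetical; the action additionally handles validation, logging, and the optional match/drop flags):

```python
# Sketch of the Email example from the docstring: extract the org domain into
# a new column, then replace it in the source column.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("jane@initions.com",)], ["Email"])

pattern = r"(?<=@)([A-Za-z0-9-]+)"
df = df.withColumn("org_domain", F.regexp_extract("Email", pattern, 0))  # group 0 = whole match
df = df.withColumn("Email", F.regexp_replace("Email", pattern, "exampledomain.org"))
df.show(truncate=False)
# Email: jane@exampledomain.org.com    org_domain: initions
```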
{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 1.0.0
+Version: 1.0.1
 Summary: Your friendly datalake monster.
 Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
@@ -12,7 +12,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
-Requires-Python: <3.
+Requires-Python: <3.14,>=3.11
 Requires-Dist: azure-identity<2.0.0,>=1.19.0
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
 Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
```
{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/RECORD

```diff
@@ -30,9 +30,9 @@ cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py,sha256=
 cloe_nessy/integration/reader/__init__.py,sha256=NWQx-v6aKE8YOHhsxfeaZnMVq4KLKyRWXzUduf5aVsk,265
 cloe_nessy/integration/reader/api_reader.py,sha256=FbOyfLVG1ryL2GC-MgE1uClHICsQKBj9yZbY4TG5qrk,19637
 cloe_nessy/integration/reader/catalog_reader.py,sha256=DlnykmFjV_v8SCBh3qaCvf24QM-6TdMFVHx5Mqv7Nvs,4850
-cloe_nessy/integration/reader/excel_reader.py,sha256=
+cloe_nessy/integration/reader/excel_reader.py,sha256=QXm0MaE_-tW5ix-f_3Pgn-Vx7VG5jA_uSp858rVV7lA,8042
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=
+cloe_nessy/integration/reader/file_reader.py,sha256=FFqqu1h003FY2Df3ru-G1JO4Bg2Ai8Rzh58fjOCN7NM,8262
 cloe_nessy/integration/reader/reader.py,sha256=YHriYkzsBduBjfI2FnP03VEo15a8UCRZ_sXtre8eaEs,1041
 cloe_nessy/integration/writer/__init__.py,sha256=3yzCAGiWZdQWtsbzlTih01sxVTJV2DDYwvl34lEAUlE,243
 cloe_nessy/integration/writer/catalog_writer.py,sha256=dQeXmtfs7J6rP6Ye3OCvxBraFScFX_3SHs7Md58hEeM,5296
@@ -74,7 +74,7 @@ cloe_nessy/pipeline/pipeline_context.py,sha256=eCOcjyE16rGRom3L85Gy_BbncfQD6i1x3
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=eeC4RbGBILGN6zkbUyjH-qGgEMtOWV4Kv_VxrHbHMY0,9021
 cloe_nessy/pipeline/pipeline_plotting_service.py,sha256=goMQj73FzUVchKn5c2SsPcWR6fr7DtVkVrcQfJsKCq4,13111
 cloe_nessy/pipeline/pipeline_step.py,sha256=oTnlvRpB0fbOBQXbPe1URstA5fv-97igCHt_41fKCAk,2082
-cloe_nessy/pipeline/actions/__init__.py,sha256=
+cloe_nessy/pipeline/actions/__init__.py,sha256=FfAnSIl-0T6pnaWhClkDqV8nfTdvLvZZJdwycsZMLPw,2990
 cloe_nessy/pipeline/actions/read_api.py,sha256=MAc7QfmhnaRUMdE09Ywt41RSAsuW4co8zF0zXHwbM8U,16193
 cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=sx3dezd33c1FawMrxORwhK5GNo1IpjCyuLATWz7esZ0,6735
 cloe_nessy/pipeline/actions/read_excel.py,sha256=IG_VmDEt1TvGVEO0SY9Fm3awHNjfisR1_7DUmhC3NEE,7968
@@ -93,6 +93,7 @@ cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpb
 cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=M5_wolJwzJpPTSrZq4yWV3TH7H6BGqbjJkJCwtqPlQo,8507
 cloe_nessy/pipeline/actions/transform_join.py,sha256=ez1M1wVc9khOZj1swMArJbBKXxEpjenUHrW1wL8H330,7200
 cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+cloe_nessy/pipeline/actions/transform_regex_extract.py,sha256=vMtUW0s_oXy8DC1-4Xh-WQN3CCp8jXYsJiFYvGdYrqE,6390
 cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
 cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
@@ -112,6 +113,6 @@ cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_Up
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/column_names.py,sha256=dCNtm61mc5aLkY2oE4rlfN3VLCrpot6fOESjAZmCmhA,361
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-1.0.
-cloe_nessy-1.0.
-cloe_nessy-1.0.
+cloe_nessy-1.0.1.dist-info/METADATA,sha256=qLn3XYfGsw2pW-pPtUUidtcHZiUtIwOESWY8LCenGEY,3291
+cloe_nessy-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+cloe_nessy-1.0.1.dist-info/RECORD,,
```
{cloe_nessy-1.0.0.dist-info → cloe_nessy-1.0.1.dist-info}/WHEEL

File without changes.