cloe-nessy 0.2.11__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/pipeline/actions/__init__.py +2 -0
- cloe_nessy/pipeline/actions/transform_deduplication.py +125 -0
- cloe_nessy/pipeline/actions/transform_group_aggregate.py +104 -0
- {cloe_nessy-0.2.11.dist-info → cloe_nessy-0.3.2.dist-info}/METADATA +1 -1
- {cloe_nessy-0.2.11.dist-info → cloe_nessy-0.3.2.dist-info}/RECORD +7 -5
- {cloe_nessy-0.2.11.dist-info → cloe_nessy-0.3.2.dist-info}/WHEEL +0 -0
- {cloe_nessy-0.2.11.dist-info → cloe_nessy-0.3.2.dist-info}/top_level.txt +0 -0
|
@@ -12,6 +12,7 @@ from .transform_decode import TransformDecodeAction
|
|
|
12
12
|
from .transform_distinct import TransformDistinctAction
|
|
13
13
|
from .transform_filter import TransformFilterAction
|
|
14
14
|
from .transform_generic_sql import TransformSqlAction
|
|
15
|
+
from .transform_group_aggregate import TransformGroupAggregate
|
|
15
16
|
from .transform_join import TransformJoinAction
|
|
16
17
|
from .transform_json_normalize import TransformJsonNormalize
|
|
17
18
|
from .transform_rename_columns import TransformRenameColumnsAction
|
|
@@ -42,6 +43,7 @@ __all__ = [
|
|
|
42
43
|
"TransformDecodeAction",
|
|
43
44
|
"TransformDistinctAction",
|
|
44
45
|
"TransformSqlAction",
|
|
46
|
+
"TransformGroupAggregate",
|
|
45
47
|
"TransformJoinAction",
|
|
46
48
|
"TransformJsonNormalize",
|
|
47
49
|
"TransformRenameColumnsAction",
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import string
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pyspark.sql.functions as F
|
|
6
|
+
import pyspark.sql.types as T
|
|
7
|
+
from pyspark.sql import Window
|
|
8
|
+
|
|
9
|
+
from ..pipeline_action import PipelineAction
|
|
10
|
+
from ..pipeline_context import PipelineContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TransformDeduplication(PipelineAction):
    """Deduplicates the data from the given DataFrame.

    Rows that share the same values in the key columns are treated as
    duplicates. Of each group of duplicates only the row that sorts first
    by the order_by_columns is kept: the highest values by default, or the
    lowest when ``descending`` is set to false.

    Example:
        ```yaml
        Deduplicate Columns:
            action: TRANSFORM_DEDUPLICATION
            options:
                key_columns:
                    - id
                order_by_columns:
                    - source_file_modification_time
        ```
    """

    name: str = "TRANSFORM_DEDUPLICATION"

    def run(
        self,
        context: PipelineContext,
        *,
        key_columns: list[str] | None = None,
        order_by_columns: list[str] | None = None,
        descending: bool = True,
        **_: Any,
    ) -> PipelineContext:
        """Deduplicates the data based on key columns and order by columns.

        Args:
            context: The context in which this Action is executed.
            key_columns: A list of the key column names. The returned data only keeps one
                line of data with the same key columns.
            order_by_columns: A list of order by column names. The returned data keeps the
                first line of data with the same key columns ordered by these columns.
            descending: Whether to sort descending or ascending.

        Raises:
            ValueError: If the data from context is None.
            ValueError: If no key_columns are specified (None or empty).
            ValueError: If no order_by_columns are specified (None or empty).
            ValueError: If key_columns and order_by_columns overlap.
            ValueError: If key_columns or order_by_columns contain Nulls.

        Returns:
            The context after the execution of this Action, containing the DataFrame with the deduplicated data.
        """
        if context.data is None:
            raise ValueError("Data from the context is required for the operation.")
        # An empty list specifies no columns either, so reject it like None.
        if not key_columns:
            raise ValueError("Please provide at least one key column.")
        if not order_by_columns:
            raise ValueError("Please provide at least one order by column.")

        # A column may not be used as key and as ordering criterion at once.
        if set(key_columns) & set(order_by_columns):
            raise ValueError("The key_columns and order_by_columns cannot contain the same column")

        # Reject the input if any key or order-by column holds a null.
        # The predicate is a plain OR over the isNull() checks; evaluating
        # head(1) runs a Spark job but stops at the first offending row.
        null_condition = F.lit(False)
        for column_name in [*key_columns, *order_by_columns]:
            null_condition = null_condition | F.col(column_name).isNull()
        if context.data.filter(null_condition).head(1):
            raise ValueError(
                "The key_columns and order_by_columns cannot be null. Please check the quality of the provided columns (null handling)"
            )

        # Warn (but do not fail) when an order-by column has a type that is
        # not recommended for ordering. The check uses isinstance on the type
        # classes so that parameterised types such as DecimalType(18, 2) match.
        recommended_order_by_data_types = (
            T.TimestampType,
            T.TimestampNTZType,
            T.DateType,
            T.IntegerType,
            T.LongType,
            T.DoubleType,
            T.FloatType,
            T.DecimalType,
        )
        recommended_type_names = [data_type.__name__ for data_type in recommended_order_by_data_types]

        for field in context.data.schema:
            if field.name in order_by_columns and not isinstance(field.dataType, recommended_order_by_data_types):
                log_message = (
                    f"action_name : {self.name} | message : order_by_column `{field.name}` is of type {field.dataType}; "
                    f"recommended data types are {recommended_type_names}"
                )
                self._console_logger.warning(log_message)
                self._tabular_logger.warning(log_message)

        # Apply the requested sort direction to every order-by column.
        if descending:
            order_by_list = [F.col(col_name).desc() for col_name in order_by_columns]
        else:
            order_by_list = [F.col(col_name).asc() for col_name in order_by_columns]

        # One window partition per key; the row to keep sorts first in it.
        window_specification = Window.partitionBy(key_columns).orderBy(order_by_list)

        # The helper column must not clash with an existing column, so draw
        # random names until one is unused.
        existing_columns = set(context.data.columns)
        row_number_col_name = "".join(random.choices(string.ascii_uppercase, k=20))
        while row_number_col_name in existing_columns:
            row_number_col_name = "".join(random.choices(string.ascii_uppercase, k=20))

        # Number the rows per key, keep the first one and drop the helper column.
        df = (
            context.data.withColumn(row_number_col_name, F.row_number().over(window_specification))
            .filter(F.col(row_number_col_name) == 1)
            .drop(row_number_col_name)
        )
        return context.from_existing(data=df)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import pyspark.sql.functions as F
|
|
4
|
+
|
|
5
|
+
from ..pipeline_action import PipelineAction
|
|
6
|
+
from ..pipeline_context import PipelineContext
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TransformGroupAggregate(PipelineAction):
    """Performs aggregation operations on grouped data within a DataFrame.

    This class allows you to group data by specified columns and apply various aggregation functions
    to other columns. The aggregation functions can be specified as a dictionary where keys are column names
    and values are either a single aggregation function or a list of functions.
    Each result column is named ``<function>_<column>``, e.g. ``sum_column3``.

    Example:
        ```yaml
        Transform Group Aggregate:
            action: TRANSFORM_GROUP_AGGREGATE
            options:
                grouping_columns:
                    - column1
                    - column2
                aggregations:
                    column3:
                        - sum
                        - avg
                    column4: max
        ```

    Attributes:
        name (str): The name of the action, default is "TRANSFORM_GROUP_AGGREGATE".

    Methods:
        run(context, grouping_columns=None, aggregations=None, **_):
            Executes the aggregation on the grouped data.

    Raises:
        ValueError: If the context data is None.
        ValueError: If no grouping columns are provided.
        ValueError: If no aggregations are provided.
        ValueError: If invalid aggregation operations are provided.
        ValueError: If an aggregation value is neither a string nor a list.
    """

    name: str = "TRANSFORM_GROUP_AGGREGATE"

    def run(
        self,
        context: PipelineContext,
        *,
        grouping_columns: list[str] | None = None,
        aggregations: dict[str, str | list] | None = None,
        **_: Any,
    ) -> PipelineContext:
        """Executes the aggregation on the grouped data.

        Args:
            context: The context in which this action is executed.
            grouping_columns: A list of columns to group by.
            aggregations: A dictionary where keys are column names and values are either a single
                aggregation function or a list of functions.

        Raises:
            ValueError: If the context data is None.
            ValueError: If no grouping columns are provided (None or empty).
            ValueError: If no aggregations are provided (None, empty, or only empty lists).
            ValueError: If invalid aggregation operations are provided.
            ValueError: If an aggregation value is neither a string nor a list.

        Returns:
            PipelineContext: The context after the execution of this action.
        """
        if context.data is None:
            raise ValueError("Data from the context is required for the operation.")

        # Empty containers specify nothing either, so reject them like None.
        if not grouping_columns:
            raise ValueError("Please provide at least one grouping column")
        if not aggregations:
            raise ValueError("Please provide aggregations.")

        valid_operations = ["avg", "max", "min", "mean", "sum", "count"]

        # Validate everything up front and normalise each entry to a list of
        # operation names, so that building the expressions needs one code path.
        operations_by_column: dict[str, list] = {}
        for column_name, operation in aggregations.items():
            if isinstance(operation, str):
                operations = [operation]
            elif isinstance(operation, list):
                operations = operation
            else:
                raise ValueError("OPERATION DATATYPE INVALID")
            # Membership in a list compares by equality, so unhashable items
            # are reported as invalid operations instead of raising TypeError.
            if any(item not in valid_operations for item in operations):
                raise ValueError(f"Please provide valid operations. Valid operations are {valid_operations}")
            operations_by_column[column_name] = operations

        # The validated names are looked up as functions of pyspark.sql.functions.
        aggregation_list = [
            getattr(F, operation_name)(column_name).alias(f"{operation_name}_{column_name}")
            for column_name, operations in operations_by_column.items()
            for operation_name in operations
        ]
        # Only empty operation lists were given: there is nothing to aggregate.
        if not aggregation_list:
            raise ValueError("Please provide aggregations.")

        df = context.data.groupBy(grouping_columns).agg(*aggregation_list)

        return context.from_existing(data=df)
|
|
@@ -47,7 +47,7 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
|
|
|
47
47
|
cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
|
|
48
48
|
cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
|
|
49
49
|
cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
|
|
50
|
-
cloe_nessy/pipeline/actions/__init__.py,sha256=
|
|
50
|
+
cloe_nessy/pipeline/actions/__init__.py,sha256=Psksv49DVhWHR2D1OuMxvYClF1Vjh5shiyy9yBdWnb0,2160
|
|
51
51
|
cloe_nessy/pipeline/actions/read_api.py,sha256=wGyPZdeh3Cam_BQBilltWBWCIdD9I_kv4lunEhE39Tg,6625
|
|
52
52
|
cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=aZy4sJLLE8ZQ_SPXGSDoHYaBJTz8s7xQDVn5eYrYHvE,2689
|
|
53
53
|
cloe_nessy/pipeline/actions/read_excel.py,sha256=EgHbK1wO6dkDo0KErYDhK_2sNIkIoa-6As9oo9dNFsE,7708
|
|
@@ -56,9 +56,11 @@ cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=aZtkstf9jBYYN2MGnazz63B
|
|
|
56
56
|
cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=Nz3Ncr-Zd-wy8g9-aN5XcvpWAHLyWs70RpZ7KqKqIaU,1788
|
|
57
57
|
cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=V0TzeQFpBYur_T1Nv0nRpOU02nKQ2iypo2CCcV2rBtk,3083
|
|
58
58
|
cloe_nessy/pipeline/actions/transform_decode.py,sha256=DmT-29dIqbz_xTj4GSCfnbgYRCiUrWzKvGrRYy1frNw,4004
|
|
59
|
+
cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=2VN5_wza7sD7fERyG6ElGh_Yo-W-Mxw-QBmtDXs1MGQ,5063
|
|
59
60
|
cloe_nessy/pipeline/actions/transform_distinct.py,sha256=sdCElXCM77AQ0m6Zzg_h7cyavBOxo7W9K1NrsvNLufA,1105
|
|
60
61
|
cloe_nessy/pipeline/actions/transform_filter.py,sha256=vOAxKtNWCABLb6G6Xz98NK7fEfgn6QJia31S7IvoUTg,1428
|
|
61
62
|
cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=cli59HCERFge7f0RB8yXw2oDtHSbMCWQMdeCeqhbdg8,2355
|
|
63
|
+
cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=HcY4sqb2yNBCz90jQtxGA8fZPuQXfJuaDmv8lWuoTqg,4050
|
|
62
64
|
cloe_nessy/pipeline/actions/transform_join.py,sha256=qktyaN2kcCkmoH3RILTc-UGYsGACx1nXH6xLtuvYi7k,3080
|
|
63
65
|
cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=xN_cQgHSMSyPsyYXBdoe2i5pHnyH-kkH5do8qr3vybw,4157
|
|
64
66
|
cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=fFdg3353QCE3zBei6iYQW9huPBcQ906sJLioaOUWj3s,1924
|
|
@@ -72,7 +74,7 @@ cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEv
|
|
|
72
74
|
cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
|
|
73
75
|
cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
76
|
cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
|
|
75
|
-
cloe_nessy-0.2.
|
|
76
|
-
cloe_nessy-0.2.
|
|
77
|
-
cloe_nessy-0.2.
|
|
78
|
-
cloe_nessy-0.2.
|
|
77
|
+
cloe_nessy-0.3.2.dist-info/METADATA,sha256=7w0f9JC9rm0tmEBYvkPSuTPTKIoGaHFytY7eYf1GRkU,1837
|
|
78
|
+
cloe_nessy-0.3.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
79
|
+
cloe_nessy-0.3.2.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
|
|
80
|
+
cloe_nessy-0.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|