cloe-nessy 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ from .read_excel import ReadExcelAction
  from .read_files import ReadFilesAction
  from .read_metadata_yaml import ReadMetadataYAMLAction
  from .transform_change_datatype import TransformChangeDatatypeAction
+ from .transform_clean_column_names import TransformCleanColumnNamesAction
  from .transform_concat_columns import TransformConcatColumnsAction
  from .transform_decode import TransformDecodeAction
  from .transform_distinct import TransformDistinctAction
@@ -39,6 +40,7 @@ __all__ = [
      "TransformFilterAction",
      "TransformUnionAction",
      "TransformChangeDatatypeAction",
+     "TransformCleanColumnNamesAction",
      "TransformConcatColumnsAction",
      "TransformDecodeAction",
      "TransformDistinctAction",
@@ -0,0 +1,77 @@
+ import json
+ import re
+ from typing import Any
+
+ import pyspark.sql.functions as F
+ import pyspark.sql.types as T
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformCleanColumnNamesAction(PipelineAction):
+     """Fixes column names in the DataFrame to be valid.
+
+     Removes invalid characters from the column names, including the fields of a struct,
+     and replaces a single leading underscore with a double underscore.
+
+     Example:
+         ```yaml
+         Clean Column Names:
+           action: TRANSFORM_CLEAN_COLUMN_NAMES
+         ```
+     """
+
+     name: str = "TRANSFORM_CLEAN_COLUMN_NAMES"
+
+     def run(
+         self,
+         context: PipelineContext,
+         **_: Any,
+     ) -> PipelineContext:
+         """Fixes column names in the DataFrame to be valid.
+
+         Removes invalid characters from the column names, including the fields of a struct,
+         and replaces a single leading underscore with a double underscore.
+
+         Args:
+             context: The context in which this Action is executed.
+
+         Raises:
+             ValueError: If the data from the context is None.
+
+         Returns:
+             The context after the execution of this Action, containing the DataFrame with cleaned column names.
+         """
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+
+         with_columns_renamed = {}
+         with_columns_casted: dict[str, T.StructType | T.ArrayType | T.MapType] = {}
+
+         single_underscore_at_beginning = r"^_(?=[^_])"
+
+         for c in context.data.schema:
+             old_name = c.name
+             new_name = re.sub(single_underscore_at_beginning, "__", re.sub(r"\W", "_", old_name))
+             with_columns_renamed[old_name] = new_name
+
+             if isinstance(c.dataType, (T.StructType | T.ArrayType | T.MapType)):
+                 old_column_schema = c.dataType.json()
+                 new_column_schema = re.sub(
+                     r'(?<="name":")[^"]+',
+                     lambda m: re.sub(r"\W", "_", str(m.group())),
+                     old_column_schema,
+                 )
+                 if isinstance(c.dataType, T.StructType):
+                     with_columns_casted[new_name] = T.StructType.fromJson(json.loads(new_column_schema))
+                 elif isinstance(c.dataType, T.ArrayType):
+                     with_columns_casted[new_name] = T.ArrayType.fromJson(json.loads(new_column_schema))
+                 elif isinstance(c.dataType, T.MapType):
+                     with_columns_casted[new_name] = T.MapType.fromJson(json.loads(new_column_schema))
+
+         df = context.data.withColumnsRenamed(with_columns_renamed)
+         for c_name, c_type in with_columns_casted.items():
+             df = df.withColumn(c_name, F.col(c_name).cast(c_type))
+
+         return context.from_existing(data=df)  # type: ignore
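For readers who want to see the renaming rules in isolation, here is a minimal, self-contained sketch of the two regex substitutions the new action applies to each column name (the sample names below are made up; only the standard library `re` module is needed):

```python
import re

# Same two substitutions as TransformCleanColumnNamesAction, applied to plain strings:
# 1. replace every non-word character with an underscore,
# 2. double a single leading underscore (a name that already starts with "__" is left alone).
SINGLE_UNDERSCORE_AT_BEGINNING = r"^_(?=[^_])"

def clean_name(name: str) -> str:
    return re.sub(SINGLE_UNDERSCORE_AT_BEGINNING, "__", re.sub(r"\W", "_", name))

# Hypothetical column names for illustration:
for raw in ["order id", "price-EUR", "_loaded_at", "__meta"]:
    print(f"{raw!r} -> {clean_name(raw)!r}")
# 'order id'   -> 'order_id'
# 'price-EUR'  -> 'price_EUR'
# '_loaded_at' -> '__loaded_at'
# '__meta'     -> '__meta'
```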
@@ -0,0 +1,125 @@
+ import random
+ import string
+ from typing import Any
+
+ import pyspark.sql.functions as F
+ import pyspark.sql.types as T
+ from pyspark.sql import Window
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformDeduplication(PipelineAction):
+     """Deduplicates the data from the given DataFrame.
+
+     This method deduplicates the data where the key columns are the same
+     and keeps the entry with the highest values in the order_by_columns
+     (this can be changed to the lowest by setting the parameter `descending` to False).
+
+     Example:
+         ```yaml
+         Deduplicate Columns:
+           action: TRANSFORM_DEDUPLICATION
+           options:
+             key_columns:
+               - id
+             order_by_columns:
+               - source_file_modification_time
+         ```
+     """
+
+     name: str = "TRANSFORM_DEDUPLICATION"
+
+     def run(
+         self,
+         context: PipelineContext,
+         *,
+         key_columns: list[str] | None = None,
+         order_by_columns: list[str] | None = None,
+         descending: bool = True,
+         **_: Any,
+     ) -> PipelineContext:
+         """Deduplicates the data based on key columns and order by columns.
+
+         Args:
+             context: The context in which this Action is executed.
+             key_columns: A list of the key column names. The returned data only keeps one
+                 line of data with the same key columns.
+             order_by_columns: A list of order by column names. The returned data keeps the
+                 first line of data with the same key columns ordered by these columns.
+             descending: Whether to sort descending or ascending.
+
+         Raises:
+             ValueError: If no key_columns are specified.
+             ValueError: If no order_by_columns are specified.
+             ValueError: If the data from context is None.
+             ValueError: If key_columns and order_by_columns overlap.
+             ValueError: If key_columns or order_by_columns contain Nulls.
+
+         Returns:
+             The context after the execution of this Action, containing the DataFrame with the deduplicated data.
+         """
+         if context.data is None:
+             raise ValueError("Data from the context is required for the operation.")
+         if key_columns is None:
+             raise ValueError("Please provide at least one key column.")
+         if order_by_columns is None:
+             raise ValueError("Please provide at least one order by column.")
+
+         # check that the key_columns and order_by_columns do not overlap
+         if len(set(key_columns) & set(order_by_columns)) != 0:
+             raise ValueError("The key_columns and order_by_columns cannot contain the same column")
+
+         # check that the key_columns and order_by_columns contain no nulls
+         df_nulls = context.data.filter(F.greatest(*[F.col(c).isNull() for c in key_columns + order_by_columns]) == 1)
+         if df_nulls.head(1):  # if the filtered DataFrame is not empty
+             raise ValueError(
+                 "The key_columns and order_by_columns cannot be null. Please check the quality of the provided columns (null handling)"
+             )
+
+         # check if the order_by columns have the preferred data types
+         recommended_order_by_data_types = [
+             T.TimestampType(),
+             T.TimestampNTZType(),
+             T.DateType(),
+             T.IntegerType(),
+             T.LongType(),
+             T.DoubleType(),
+             T.FloatType(),
+             T.DecimalType(),
+         ]
+
+         for c in context.data.schema:
+             if c.name in order_by_columns and c.dataType not in recommended_order_by_data_types:
+                 log_message = (
+                     f"action_name : {self.name} | message : order_by_column `{c.name}` is of type {c.dataType}; "
+                     f"recommended data types are {recommended_order_by_data_types}"
+                 )
+                 self._console_logger.warning(log_message)
+                 self._tabular_logger.warning(log_message)
+
+         # sort the order_by columns in the preferred order
+         if descending:
+             order_by_list = [F.col(col_name).desc() for col_name in order_by_columns]
+         else:
+             order_by_list = [F.col(col_name).asc() for col_name in order_by_columns]
+
+         # create the window specification
+         window_specification = Window.partitionBy(key_columns).orderBy(order_by_list)
+
+         # generate a column name that is not in the input dataframe
+         def generate_random_string(length):
+             return "".join(random.choice(string.ascii_uppercase) for _ in range(length))
+
+         row_number_col_name = generate_random_string(20)
+         while row_number_col_name in context.data.columns:
+             row_number_col_name = generate_random_string(20)
+
+         # drop the duplicates
+         df = (
+             context.data.withColumn(row_number_col_name, F.row_number().over(window_specification))
+             .filter(F.col(row_number_col_name) == 1)
+             .drop(row_number_col_name)
+         )
+         return context.from_existing(data=df)
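As a rough illustration of the pattern the new action is built on (a row_number over a window partitioned by the key columns and ordered by the order-by columns), here is a small standalone PySpark sketch; the session setup, data, and column names are invented for the example:

```python
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

# Standalone sketch of the row_number-over-window deduplication pattern.
# The data and column names below are made up for illustration.
spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [("a", 1, "stale"), ("a", 2, "fresh"), ("b", 1, "only")],
    ["id", "source_file_modification_time", "payload"],
)

window = Window.partitionBy("id").orderBy(F.col("source_file_modification_time").desc())
deduplicated = (
    df.withColumn("_row_number", F.row_number().over(window))
    .filter(F.col("_row_number") == 1)
    .drop("_row_number")
)
deduplicated.show()  # keeps ("a", 2, "fresh") and ("b", 1, "only")
```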
@@ -7,10 +7,16 @@ from ..pipeline_context import PipelineContext
  class TransformDistinctAction(PipelineAction):
      """Selects distinct rows from the DataFrame in the given context.

+     If a subset is given, these columns are used for the duplicate comparison. If no subset is given, all columns are used.
+
      Example:
          ```yaml
          Decode Columns:
            action: TRANSFORM_DISTINCT
+           options:
+             subset:
+               - first_name
+               - last_name
          ```
      """

@@ -19,12 +25,15 @@ class TransformDistinctAction(PipelineAction):
      def run(
          self,
          context: PipelineContext,
+         *,
+         subset: list[str] | None = None,
          **_: Any,
      ) -> PipelineContext:
          """Selects distinct rows from the DataFrame in the given context.

          Args:
              context: The context in which this Action is executed.
+             subset: List of column names to use for duplicate comparison (default: all columns).

          Raises:
              ValueError: If the data from the context is None.
@@ -35,6 +44,14 @@ class TransformDistinctAction(PipelineAction):
          if context.data is None:
              raise ValueError("Data from the context is required for the operation.")

-         df = context.data.distinct()
+         # check if all columns that are part of the subset are actually part of the dataframe.
+         if subset is not None:
+             subset_columns_not_in_dataframe = set(subset) - set(context.data.columns)
+             if len(subset_columns_not_in_dataframe) != 0:
+                 raise ValueError(
+                     f"The following subset columns are not part of the dataframe: {subset_columns_not_in_dataframe}"
+                 )
+
+         df = context.data.dropDuplicates(subset=subset)

          return context.from_existing(data=df)  # type: ignore
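The practical difference introduced by the `subset` option is the switch from `distinct()` to `dropDuplicates(subset=...)`: with a subset, rows are compared only on those columns and one arbitrary row per group is kept. A quick sketch with made-up data:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Two rows that agree on first_name/last_name but differ in version (made-up data).
df = spark.createDataFrame(
    [("Ada", "Lovelace", 1), ("Ada", "Lovelace", 2)],
    ["first_name", "last_name", "version"],
)

print(df.distinct().count())                                          # 2 - all columns compared
print(df.dropDuplicates(subset=["first_name", "last_name"]).count())  # 1 - only the subset compared
```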
@@ -63,13 +63,13 @@ class SessionManager:
      @classmethod
      def get_utils(
          cls,
-     ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils], but can't import locally.
-         """Get or create a DBUtils or MsSparkUtils instance, depending on the context.
+     ) -> Any:  # return type should be Union[DBUtils, MsSparkUtils, RemoteDbUtils].
+         """Get or create a DBUtils, RemoteDbUtils or MsSparkUtils instance, depending on the context.

-         In Databricks this will return DBUtils, while in Fabric it will return MsSparkUtils.
+         In Databricks this will return DBUtils, when using Databricks-Connect it returns RemoteDbUtils, and in Fabric it will return MsSparkUtils.

          Returns:
-             utils: The DBUtils or MsSparkUtils instance.
+             utils: The DBUtils, RemoteDbUtils or MsSparkUtils instance.

          Raises:
              RuntimeError: If the instance cannot be created.
@@ -88,19 +88,25 @@ class SessionManager:
          }

          try:
-             cls._utils = utils_function[cls._env](cls)  # type: ignore
+             cls._utils = utils_function[cls._env]()  # type: ignore
          except Exception as e:
              raise RuntimeError(f"Cannot create utils instance. Error: {e}") from e

          return cls._utils

+     @classmethod
      def _get_dbutils(cls):
+         if cls._env == cls.Environment.DATABRICKS_CONNECT:
+             from databricks.sdk import WorkspaceClient
+
+             return WorkspaceClient().dbutils
+
          from pyspark.dbutils import DBUtils
          cls.get_spark_session()

-         utils = DBUtils(cls._spark)
-         return utils
+         return DBUtils(cls._spark)

+     @classmethod
      def _get_mssparkutils(cls):
          from notebookutils import mssparkutils  # type: ignore

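For orientation, a hypothetical usage sketch of the updated helper; it assumes `SessionManager` is importable from `cloe_nessy.session` and that the returned object exposes the usual dbutils-style `fs` interface (both are assumptions, not confirmed by this diff):

```python
# Hypothetical usage; the import path and the fs call below are assumptions.
from cloe_nessy.session import SessionManager

# Depending on the runtime, this now resolves to DBUtils (Databricks),
# RemoteDbUtils via WorkspaceClient().dbutils (Databricks Connect),
# or mssparkutils (Fabric).
utils = SessionManager.get_utils()
print(utils.fs.ls("/"))
```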
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: cloe-nessy
- Version: 0.3.1
+ Version: 0.3.3
  Summary: Your friendly datalake monster.
  Home-page: https://initions.com/
  Author: initions
@@ -47,16 +47,18 @@ cloe_nessy/pipeline/pipeline_config.py,sha256=BN3ZSbr6bC-X9edoh-n5vRfPHFMbgtAU7m
  cloe_nessy/pipeline/pipeline_context.py,sha256=csElDc6BsynDUtRXgQOSCH7ONc_b-ag0YEg0zlQTz58,1874
  cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=c_nAsgw81QYBM9AFiTxGgqRhNXABkDKplbeoCJPtbpE,6434
  cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
- cloe_nessy/pipeline/actions/__init__.py,sha256=Psksv49DVhWHR2D1OuMxvYClF1Vjh5shiyy9yBdWnb0,2160
+ cloe_nessy/pipeline/actions/__init__.py,sha256=LwKctXy4Jun52BnCVGvWa8nnKVjTSov4GT58j6Zy8zg,2273
  cloe_nessy/pipeline/actions/read_api.py,sha256=wGyPZdeh3Cam_BQBilltWBWCIdD9I_kv4lunEhE39Tg,6625
  cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=aZy4sJLLE8ZQ_SPXGSDoHYaBJTz8s7xQDVn5eYrYHvE,2689
  cloe_nessy/pipeline/actions/read_excel.py,sha256=EgHbK1wO6dkDo0KErYDhK_2sNIkIoa-6As9oo9dNFsE,7708
  cloe_nessy/pipeline/actions/read_files.py,sha256=8twjprqKYEmVu5QITEGe4no45TfhgzZosTFVQ89vV6g,3861
  cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=aZtkstf9jBYYN2MGnazz63BG_hJ7mIgAfKiNqUpc26E,2235
  cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=Nz3Ncr-Zd-wy8g9-aN5XcvpWAHLyWs70RpZ7KqKqIaU,1788
+ cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=XuVAVEbp-UiF8PO6wAEJyl1TYgBD7MSnuOGhuEvXKv4,2881
  cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=V0TzeQFpBYur_T1Nv0nRpOU02nKQ2iypo2CCcV2rBtk,3083
  cloe_nessy/pipeline/actions/transform_decode.py,sha256=DmT-29dIqbz_xTj4GSCfnbgYRCiUrWzKvGrRYy1frNw,4004
- cloe_nessy/pipeline/actions/transform_distinct.py,sha256=sdCElXCM77AQ0m6Zzg_h7cyavBOxo7W9K1NrsvNLufA,1105
+ cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=2VN5_wza7sD7fERyG6ElGh_Yo-W-Mxw-QBmtDXs1MGQ,5063
+ cloe_nessy/pipeline/actions/transform_distinct.py,sha256=R0Wv_YnWOw198r0rPR_72fgH5sp7upgjZzfOPTZ1oPA,1942
  cloe_nessy/pipeline/actions/transform_filter.py,sha256=vOAxKtNWCABLb6G6Xz98NK7fEfgn6QJia31S7IvoUTg,1428
  cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=cli59HCERFge7f0RB8yXw2oDtHSbMCWQMdeCeqhbdg8,2355
  cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=HcY4sqb2yNBCz90jQtxGA8fZPuQXfJuaDmv8lWuoTqg,4050
@@ -68,12 +70,12 @@ cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=Kez8puDK7cRfhleBE
  cloe_nessy/pipeline/actions/transform_union.py,sha256=TDER06IABzxvIez4bGLKCLaDA4eScpTzYRbfUzwv_RQ,2342
  cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=6yAHTX5kZviumgBW_NYVGAUin6U2nDzmic9of6wA8FY,2590
  cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
- cloe_nessy/session/session_manager.py,sha256=rd33lSafzomuyGf1BzhyjIWuy9sXgFjr-ca7A7Sw8eo,6490
+ cloe_nessy/session/session_manager.py,sha256=PK7awMc6fmot7f9FMmvIUbIzKFgjcy2o2bZS9kjVs10,6733
  cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
  cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
  cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
- cloe_nessy-0.3.1.dist-info/METADATA,sha256=ziNbpjwuDfxE2Un5Y4YfYuEc1brCHy0Ic-rVc_ChZhY,1837
- cloe_nessy-0.3.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- cloe_nessy-0.3.1.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
- cloe_nessy-0.3.1.dist-info/RECORD,,
+ cloe_nessy-0.3.3.dist-info/METADATA,sha256=StCfl2I5dItaMzO10u3gQw6WxfjZUZHRIodEvKuQu_s,1837
+ cloe_nessy-0.3.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+ cloe_nessy-0.3.3.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+ cloe_nessy-0.3.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.0)
+ Generator: setuptools (75.8.2)
  Root-Is-Purelib: true
  Tag: py3-none-any