cloe-nessy 0.3.19__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/cloe_nessy/file_utilities/get_file_paths.py
+++ b/cloe_nessy/file_utilities/get_file_paths.py
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 from ..logging.logger_mixin import LoggerMixin
 from .factory import FileRetrievalFactory
@@ -9,7 +10,7 @@ def get_file_paths(
     location: str,
     file_name_pattern: str | None = None,
     search_subdirs: bool = True,
-    **kwargs,
+    **kwargs: Any,
 ) -> list[str]:
     """Retrieves file paths from a specified location based on the provided criteria.
 
--- a/cloe_nessy/file_utilities/strategies/base_strategy.py
+++ b/cloe_nessy/file_utilities/strategies/base_strategy.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Any
 
 
 class FileRetrievalStrategy(ABC):
@@ -15,7 +16,7 @@ class FileRetrievalStrategy(ABC):
         location: str,
         extension: str | None = None,
         search_subdirs: bool = True,
-        **kwargs,
+        **kwargs: Any,
     ) -> list[str]:
         """Retrieves a list of file paths based on the specified criteria.
 
--- a/cloe_nessy/file_utilities/strategies/local_strategy.py
+++ b/cloe_nessy/file_utilities/strategies/local_strategy.py
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 from ..exceptions import FileUtilitiesError
 from .base_strategy import FileRetrievalStrategy
@@ -16,7 +17,7 @@ class LocalDirectoryStrategy(FileRetrievalStrategy):
         location: str,
         extension: str | None = None,
         search_subdirs: bool = True,
-        **kwargs,  # noqa: ARG004
+        **kwargs: Any,  # noqa: ARG004
     ) -> list[str]:
         """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
 
--- a/cloe_nessy/file_utilities/strategies/onelake_strategy.py
+++ b/cloe_nessy/file_utilities/strategies/onelake_strategy.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 from .base_strategy import FileRetrievalStrategy
 from .local_strategy import LocalDirectoryStrategy
 
@@ -10,7 +12,7 @@ class OneLakeStrategy(FileRetrievalStrategy):
         location: str,
         extension: str | None = None,
         search_subdirs: bool = True,
-        **kwargs,
+        **kwargs: Any,
     ) -> list:
         """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
 
--- a/cloe_nessy/file_utilities/strategies/utils_strategy.py
+++ b/cloe_nessy/file_utilities/strategies/utils_strategy.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 from ...session import SessionManager
 from ..exceptions import FileUtilitiesError
 from .base_strategy import FileRetrievalStrategy
@@ -15,7 +17,7 @@ class UtilsStrategy(FileRetrievalStrategy):
         location: str,
         extension: str | None = None,
         search_subdirs: bool = True,
-        **kwargs,  # noqa: ARG004
+        **kwargs: Any,  # noqa: ARG004
     ) -> list:
         """Recursively retrieves all files with a specified extension from a given directory and its subdirectories.
 
--- a/cloe_nessy/integration/reader/excel_reader.py
+++ b/cloe_nessy/integration/reader/excel_reader.py
@@ -160,7 +160,7 @@ class ExcelDataFrameReader(BaseReader):
             "__metadata",
             F.create_map(
                 F.lit("timestamp"),
-                F.current_timestamp(),
+                F.current_timestamp().cast("string"),
                 F.lit("file_location"),
                 F.lit(location),
                 F.lit("sheet_name"),
--- a/cloe_nessy/integration/reader/file_reader.py
+++ b/cloe_nessy/integration/reader/file_reader.py
@@ -192,7 +192,8 @@ class FileReader(BaseReader):
         """Add all metadata columns to the DataFrame."""
         metadata_columns = df.select("_metadata.*").columns
 
-        entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
+        # Cast all metadata values to strings to ensure type consistency in the map
+        entries = [(F.lit(field), F.col(f"_metadata.{field}").cast("string")) for field in metadata_columns]
        flat_list = [item for tup in entries for item in tup]
 
         df = df.withColumn("__metadata", F.create_map(flat_list))
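Note on the two `.cast("string")` changes above: `F.create_map` produces a typed map column, so keeping every value a string gives a plain `map<string,string>` instead of relying on Spark's implicit type coercion across the mixed string, long, and timestamp fields of `_metadata`. A minimal standalone sketch of the same pattern, not taken from the package and with illustrative column names:

```python
# Sketch: building a map<string,string> metadata column from mixed-type fields.
# Assumption: "file_path" and "file_size" are example fields, not the real _metadata schema.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a.csv", 42)], ["file_path", "file_size"])

# Cast every value to string so create_map sees a single, consistent value type.
entries = [
    (F.lit("file_path"), F.col("file_path").cast("string")),
    (F.lit("file_size"), F.col("file_size").cast("string")),  # bigint -> string
]
flat_list = [item for tup in entries for item in tup]

df.withColumn("__metadata", F.create_map(flat_list)).show(truncate=False)
```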
--- a/cloe_nessy/integration/writer/delta_writer/delta_append_writer.py
+++ b/cloe_nessy/integration/writer/delta_writer/delta_append_writer.py
@@ -62,7 +62,7 @@ class DeltaAppendWriter(BaseDeltaWriter):
         trigger_dict: dict | None = None,
         options: dict[str, str] | None = None,
         await_termination: bool = False,
-    ):
+    ) -> None:
         """Appends the provided DataFrame to a Delta table.
 
         Args:
--- a/cloe_nessy/integration/writer/delta_writer/delta_writer_base.py
+++ b/cloe_nessy/integration/writer/delta_writer/delta_writer_base.py
@@ -151,7 +151,7 @@ class BaseDeltaWriter(BaseWriter, ABC):
         return " AND ".join([f"target.`{c}` <=> source.`{c}`" for c in columns])
 
     @staticmethod
-    def _partition_pruning_conditions(df, partition_cols: list[str] | None) -> str:
+    def _partition_pruning_conditions(df: "DataFrame", partition_cols: list[str] | None) -> str:
         """Generates partition pruning conditions for an SQL query.
 
         This function is used to optimize the performance of an SQL query by only scanning the
--- a/cloe_nessy/pipeline/__init__.py
+++ b/cloe_nessy/pipeline/__init__.py
@@ -1,7 +1,15 @@
 from .pipeline import Pipeline
 from .pipeline_action import PipelineAction
+from .pipeline_builder import PipelineBuilder
 from .pipeline_context import PipelineContext
 from .pipeline_parsing_service import PipelineParsingService
 from .pipeline_step import PipelineStep
 
-__all__ = ["Pipeline", "PipelineParsingService", "PipelineContext", "PipelineAction", "PipelineStep"]
+__all__ = [
+    "Pipeline",
+    "PipelineBuilder",
+    "PipelineParsingService",
+    "PipelineContext",
+    "PipelineAction",
+    "PipelineStep",
+]
--- a/cloe_nessy/pipeline/actions/__init__.py
+++ b/cloe_nessy/pipeline/actions/__init__.py
@@ -19,10 +19,12 @@ from .transform_group_aggregate import TransformGroupAggregate
 from .transform_hash_columns import TransformHashColumnsAction
 from .transform_join import TransformJoinAction
 from .transform_json_normalize import TransformJsonNormalize
+from .transform_regex_extract import TransformRegexExtract
 from .transform_rename_columns import TransformRenameColumnsAction
 from .transform_replace_values import TransformReplaceValuesAction
 from .transform_select_columns import TransformSelectColumnsAction
 from .transform_union import TransformUnionAction
+from .transform_with_column import TransformWithColumnAction
 from .write_catalog_table import WriteCatalogTableAction
 from .write_delta_append import WriteDeltaAppendAction
 from .write_delta_merge import WriteDeltaMergeAction
@@ -55,9 +57,11 @@ __all__ = [
     "TransformGroupAggregate",
     "TransformJoinAction",
     "TransformJsonNormalize",
+    "TransformRegexExtract",
     "TransformRenameColumnsAction",
     "TransformReplaceValuesAction",
     "TransformSelectColumnsAction",
+    "TransformWithColumnAction",
     "WriteCatalogTableAction",
     "WriteDeltaAppendAction",
     "WriteDeltaMergeAction",
--- a/cloe_nessy/pipeline/actions/read_catalog_table.py
+++ b/cloe_nessy/pipeline/actions/read_catalog_table.py
@@ -96,8 +96,6 @@ class ReadCatalogTableAction(PipelineAction):
                 configuration for the streaming query, such as processing time or
                 continuous processing.
                 behavior, such as filters or reading modes. Defaults to None.
-            delta_load_options: Options for delta loading, if applicable.
-                Configures the [`DeltaLoader`][cloe_nessy.integration.delta_loader].
 
         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
--- a/cloe_nessy/pipeline/actions/transform_hash_columns.py
+++ b/cloe_nessy/pipeline/actions/transform_hash_columns.py
@@ -25,7 +25,8 @@ class HashSettings(BaseModel):
     bits: int | None = Field(default=None, description="Only required for sha2")
 
     @model_validator(mode="before")
-    def validate_all(cls, values):
+    @classmethod
+    def validate_all(cls: type["HashSettings"], values: Any) -> Any:
         """Validates the input values for a hashing operation before model instantiation.
 
         This method performs the following checks:
@@ -91,7 +92,8 @@ class HashConfig(BaseModel):
     hash_config: dict[str, HashSettings]
 
     @model_validator(mode="before")
-    def validate_config(cls, values):
+    @classmethod
+    def validate_config(cls: type["HashConfig"], values: Any) -> Any:
         """Validates the hash configuration provided in the model.
 
         This method is executed in "before" mode to ensure that the `hash_config`
--- /dev/null
+++ b/cloe_nessy/pipeline/actions/transform_regex_extract.py
@@ -0,0 +1,169 @@
+import re
+from typing import Any
+
+import pyspark.sql.functions as F
+
+from cloe_nessy.pipeline.pipeline_action import PipelineAction
+from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+class TransformRegexExtract(PipelineAction):
+    r"""Extract values from a specified column in a DataFrame using regex patterns.
+
+    This action extracts values from a column based on a regex pattern and stores
+    the result in a new column. Optionally, you can replace the matched pattern in
+    the original column with a different string, remove the original column, or add
+    a boolean column indicating which rows matched the pattern.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                source_column_name: Email
+                extract_column_name: org_domain
+                pattern: (?<=@)([A-Za-z0-9-]+)
+                replace_by: exampledomain.org
+        ```
+
+    This action also supports processing multiple columns simultaneously. To use this
+    functionality, structure the configuration as a dictionary mapping each source
+    column name to its extraction parameters.
+
+    Example:
+        ```yaml
+        Extract Action:
+            action: TRANSFORM_REGEX_EXTRACT
+            options:
+                extract_columns:
+                    Name:
+                        pattern: (?<=\w+) (\w+)
+                        replace_by: ''
+                        extract_column_name: last_name
+                        match_info_column_name: has_last_name
+                    Email:
+                        pattern: @\w+\.\w+
+                        extract_column_name: domain
+                        keep_original_column: False
+        ```
+
+    """
+
+    name: str = "TRANSFORM_REGEX_EXTRACT"
+
+    def run(
+        self,
+        context: PipelineContext,
+        source_column_name: str = "",
+        extract_column_name: str = "",
+        pattern: str = "",
+        keep_original_column: bool = True,
+        replace_by: str = "",
+        match_info_column_name: str = "",
+        extract_columns: dict | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Performs a regex extract (and replace) on a specified column in a DataFrame.
+
+        This function performs a regex extract (and optionally a replace) on one or more columns.
+
+        Args:
+            context: The context in which this action is executed.
+            source_column_name: Column name to perform the regex replace on.
+            pattern: Regex pattern to match.
+            replace_by: String that should replace the extracted pattern in the source column.
+            extract_column_name: Column name to store the extract, default: <source_column_name>_extract
+            keep_original_column: Whether to keep the original column, default: True
+            match_info_column_name: Column name to store a boolean column whether a match was found, default: None
+            extract_columns: Dictionary of column names and their corresponding 1-column-case.
+
+        Raises:
+            ValueError: If any of the required arguments are not provided.
+            ValueError: If the regex pattern is invalid.
+
+        Returns:
+            PipelineContext: Transformed context with the modified DataFrame.
+        """
+        if context.data is None:
+            raise ValueError("Data from the context is required for the operation.")
+        if not extract_columns and not source_column_name:
+            raise ValueError("Either extract_columns or source_column_name must be provided.")
+
+        df = context.data
+
+        if source_column_name:
+            self._console_logger.info(f"Extracting from column '{source_column_name}' using pattern: {pattern}")
+            df = self._process_one_column(
+                df,
+                source_column_name,
+                pattern,
+                extract_column_name,
+                replace_by,
+                keep_original_column,
+                match_info_column_name,
+            )
+
+        elif isinstance(extract_columns, dict):
+            self._console_logger.info(f"Extracting from {len(extract_columns)} columns")
+            for one_source_column_name in extract_columns:
+                parameter_dict = self._get_default_dict() | extract_columns[one_source_column_name]
+                df = self._process_one_column(df, one_source_column_name, **parameter_dict)
+
+        else:
+            raise ValueError("extract_columns must be a dictionary. See documentation for proper format.")
+
+        return context.from_existing(data=df)
+
+    def _process_one_column(
+        self,
+        df,
+        source_column_name,
+        pattern,
+        extract_column_name,
+        replace_by,
+        keep_original_column,
+        match_info_column_name,
+    ):
+        # Extract the first captured group (group 0 is the entire match)
+        matched_group_id = 0
+
+        if not extract_column_name:
+            extract_column_name = f"{source_column_name}_extracted"
+
+        if not pattern:
+            raise ValueError(f"The regex pattern (pattern) for column {source_column_name} must be provided.")
+
+        # Validate regex pattern
+        try:
+            re.compile(pattern)
+        except re.error as e:
+            raise ValueError(f"Invalid regex pattern '{pattern}' for column {source_column_name}: {e}") from e
+
+        df = df.withColumn(extract_column_name, F.regexp_extract(source_column_name, pattern, matched_group_id))
+
+        if replace_by:
+            df = df.withColumn(source_column_name, F.regexp_replace(source_column_name, pattern, replace_by))
+
+        if match_info_column_name:
+            # Check if extraction is null or empty string
+            df = df.withColumn(
+                match_info_column_name,
+                F.when((F.col(extract_column_name).isNull()) | (F.col(extract_column_name) == ""), False).otherwise(
+                    True
+                ),
+            )
+
+        if not keep_original_column:
+            df = df.drop(source_column_name)
+
+        return df
+
+    def _get_default_dict(self) -> dict[str, Any]:
+        """Return default parameters for single column extraction."""
+        return {
+            "pattern": "",
+            "extract_column_name": "",
+            "replace_by": "",
+            "keep_original_column": True,
+            "match_info_column_name": "",
+        }
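For readers unfamiliar with the Spark functions this new action wraps, a minimal standalone sketch of the underlying calls (not part of the package; data and column names are illustrative). `F.regexp_extract(..., 0)` returns the whole match, or an empty string when nothing matches, which is what the optional match-info column checks for:

```python
# Sketch of the building blocks used by TransformRegexExtract.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("jane@initions.com",), ("no-email",)], ["Email"])

pattern = r"(?<=@)([A-Za-z0-9-]+)"  # same pattern as the docstring example

df = (
    df.withColumn("org_domain", F.regexp_extract("Email", pattern, 0))    # extract
    .withColumn("Email", F.regexp_replace("Email", pattern, "redacted"))  # optional replace
    .withColumn("has_match", F.col("org_domain") != "")                   # match-info flag
)
df.show(truncate=False)
# row 1: org_domain = "initions", has_match = true
# row 2: org_domain = "",         has_match = false
```

The action additionally validates the pattern with `re.compile` up front and can drop the source column when `keep_original_column` is false.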
--- /dev/null
+++ b/cloe_nessy/pipeline/actions/transform_with_column.py
@@ -0,0 +1,104 @@
+"""Transform action to add or update a column using a SQL expression."""
+
+from typing import Any
+
+from pyspark.sql import functions as F
+
+from cloe_nessy.pipeline.pipeline_action import PipelineAction
+from cloe_nessy.pipeline.pipeline_context import PipelineContext
+
+
+class TransformWithColumnAction(PipelineAction):
+    """Add or update a column in the DataFrame using a SQL expression.
+
+    This action uses PySpark's expr() function to evaluate SQL expressions and
+    create or update columns in the DataFrame.
+
+    Examples:
+        === "Create new column"
+            ```yaml
+            Create Full Name:
+                action: TRANSFORM_WITH_COLUMN
+                options:
+                    column_name: full_name
+                    expression: concat(first_name, ' ', last_name)
+            ```
+
+        === "Update existing column"
+            ```yaml
+            Lowercase Email:
+                action: TRANSFORM_WITH_COLUMN
+                options:
+                    column_name: email
+                    expression: lower(email)
+            ```
+
+        === "Calculated column"
+            ```yaml
+            Calculate Total:
+                action: TRANSFORM_WITH_COLUMN
+                options:
+                    column_name: total_price
+                    expression: price * quantity * (1 + tax_rate)
+            ```
+
+        === "Extract date parts"
+            ```yaml
+            Extract Year:
+                action: TRANSFORM_WITH_COLUMN
+                options:
+                    column_name: year
+                    expression: year(order_date)
+            ```
+    """
+
+    name: str = "TRANSFORM_WITH_COLUMN"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        column_name: str = "",
+        expression: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Add or update a column using a SQL expression.
+
+        Args:
+            context: The pipeline context containing the DataFrame
+            column_name: Name of the column to create or update
+            expression: SQL expression to evaluate for the column value
+            **_: Additional unused keyword arguments
+
+        Returns:
+            PipelineContext: Updated context with the modified DataFrame
+
+        Raises:
+            ValueError: If column_name is not provided
+            ValueError: If expression is not provided
+            ValueError: If context.data is None
+            Exception: If the SQL expression is invalid
+        """
+        if not column_name:
+            raise ValueError("No column_name provided.")
+
+        if not expression:
+            raise ValueError("No expression provided.")
+
+        if context.data is None:
+            raise ValueError("Data from context is required for transform_with_column")
+
+        self._console_logger.info(f"Adding/updating column '{column_name}' with expression: {expression}")
+
+        df = context.data
+
+        try:
+            # Use F.expr() to evaluate the SQL expression
+            df = df.withColumn(column_name, F.expr(expression))
+        except Exception as e:
+            self._console_logger.error(f"Failed to evaluate expression '{expression}' for column '{column_name}': {e}")
+            raise
+
+        self._console_logger.info(f"Successfully added/updated column '{column_name}'")
+
+        return context.from_existing(data=df)
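The action is a thin wrapper around `df.withColumn(column_name, F.expr(expression))`, so anything valid in a Spark SQL SELECT expression can be used for the new or updated column. A minimal standalone sketch of that same call outside the pipeline (not part of the package; column names are illustrative):

```python
# Sketch: the withColumn/expr call that TransformWithColumnAction performs.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Ada", "Lovelace", 2, 10.0)], ["first_name", "last_name", "quantity", "price"]
)

df = (
    df.withColumn("full_name", F.expr("concat(first_name, ' ', last_name)"))
    .withColumn("total_price", F.expr("price * quantity"))
)
df.show()
```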
--- a/cloe_nessy/pipeline/actions/write_delta_append.py
+++ b/cloe_nessy/pipeline/actions/write_delta_append.py
@@ -19,9 +19,6 @@ class WriteDeltaAppendAction(PipelineAction):
             table_identifier: my_catalog.my_schema.my_table
             ignore_empty_df: false
         ```
-
-    Returns:
-        None.
     """
 
     name: str = "WRITE_DELTA_APPEND"
--- a/cloe_nessy/pipeline/actions/write_delta_merge.py
+++ b/cloe_nessy/pipeline/actions/write_delta_merge.py
@@ -28,9 +28,6 @@ class WriteDeltaMergeAction(PipelineAction):
             when_not_matched_insert: true
             use_partition_pruning: true
         ```
-
-    Returns:
-        None.
     """
 
     name: str = "WRITE_DELTA_MERGE"
--- /dev/null
+++ b/cloe_nessy/pipeline/pipeline_builder.py
@@ -0,0 +1,210 @@
+from collections import OrderedDict
+from collections.abc import Callable
+from typing import Any, Self
+
+from .pipeline import Pipeline
+from .pipeline_step import PipelineStep
+
+
+class PipelineBuilder:
+    """Fluent API builder for creating Nessy pipelines programmatically.
+
+    This class provides a chainable interface for building pipelines using method calls
+    instead of YAML configuration. It dynamically creates methods for all available
+    PipelineActions.
+
+    Example:
+        ```python
+        pipeline = (PipelineBuilder("My Pipeline")
+            .read_files(location="data/*.csv", extension="csv")
+            .transform_clean_column_names()
+            .transform_filter(condition="amount > 1000")
+            .write_catalog_table(catalog="prod", schema="sales", table="results")
+            .build())
+
+        pipeline.run()
+        ```
+    """
+
+    def __init__(self, name: str) -> None:
+        """Initialize the pipeline builder.
+
+        Args:
+            name: The name of the pipeline.
+        """
+        self.name = name
+        self.steps: OrderedDict[str, PipelineStep] = OrderedDict()
+        self._step_counter = 0
+
+    def __getattr__(self, name: str) -> Callable[..., "PipelineBuilder"]:
+        """Dynamically create methods for pipeline actions.
+
+        This method is called when an attribute that doesn't exist is accessed.
+        It converts method calls like `read_files()` into the corresponding PipelineAction.
+
+        Args:
+            name: The method name being called.
+
+        Returns:
+            A callable that adds the corresponding pipeline step.
+
+        Raises:
+            AttributeError: If the method name doesn't correspond to a known action.
+        """
+        # Lazy import to avoid circular import issues
+        from .actions import pipeline_actions
+
+        # Convert method name to action name (e.g., read_files -> READ_FILES)
+        action_name = name.upper()
+
+        if action_name in pipeline_actions:
+            action_class = pipeline_actions[action_name]
+
+            def method(**kwargs: Any) -> "PipelineBuilder":
+                return self._add_step(action_class, **kwargs)
+
+            return method
+
+        raise AttributeError(
+            f"PipelineBuilder has no method '{name}'. Available actions: {list(pipeline_actions.keys())}"
+        )
+
+    def _add_step(self, action_class: type, step_name: str | None = None, **options: Any) -> Self:
+        """Add a step to the pipeline.
+
+        Args:
+            action_class: The PipelineAction class to instantiate.
+            step_name: Optional custom name for the step.
+            **options: Options to pass to the action.
+
+        Returns:
+            Self for method chaining.
+
+        Raises:
+            ValueError: If a step with the given name already exists.
+        """
+        if step_name is None:
+            step_name = f"step_{self._step_counter:03d}_{action_class.__name__}"
+
+        # Validate that step name is unique
+        if step_name in self.steps:
+            raise ValueError(
+                f"A step with name '{step_name}' already exists in the pipeline. "
+                f"Please provide a unique step_name. "
+                f"Existing steps: {list(self.steps.keys())}"
+            )
+
+        # Convert any PipelineBuilder instances in options to PipelineStep references
+        options = self._convert_builder_references(options)
+
+        # Set up context reference to previous step
+        context_ref = None
+        if self.steps:
+            context_ref = list(self.steps.keys())[-1]
+
+        step = PipelineStep(name=step_name, action=action_class(), options=options, _context_ref=context_ref)
+
+        # Remove any predecessors that are from already-executed external pipelines
+        # (these steps have results but aren't in our pipeline)
+        external_predecessors = set()
+        for pred_name in step._predecessors:
+            if pred_name not in self.steps and pred_name != context_ref:
+                # Check if this is a reference to an executed step from options
+                for opt_val in options.values():
+                    if isinstance(opt_val, PipelineStep) and opt_val.name == pred_name:
+                        # This is an external executed step, remove from predecessors
+                        external_predecessors.add(pred_name)
+                        break
+
+        step._predecessors -= external_predecessors
+
+        self.steps[step_name] = step
+        self._step_counter += 1
+        return self
+
+    def _convert_builder_references(self, options: dict[str, Any]) -> dict[str, Any]:
+        """Convert any PipelineBuilder instances in options to PipelineStep references.
+
+        This method recursively processes options to find PipelineBuilder instances and
+        converts them to their last step's PipelineStep reference. This allows users to
+        pass PipelineBuilder instances directly to actions that expect PipelineStep references.
+
+        Handles PipelineBuilder instances in:
+        - Direct values
+        - Lists
+        - Nested dictionaries
+
+        Args:
+            options: Dictionary of options that may contain PipelineBuilder instances.
+
+        Returns:
+            Dictionary with PipelineBuilder instances converted to PipelineStep references.
+
+        Raises:
+            ValueError: If a PipelineBuilder has no steps.
+        """
+        converted = {}
+        for key, value in options.items():
+            converted[key] = self._convert_value(value, key)
+        return converted
+
+    def _convert_value(self, value: Any, context: str = "") -> Any:
+        """Recursively convert a value, handling PipelineBuilder instances.
+
+        When a PipelineBuilder is passed as a value, it is executed immediately
+        and its last step is returned as the reference. This allows the pipeline
+        to be run before the main pipeline that references it.
+
+        Args:
+            value: The value to convert.
+            context: Context string for error messages (e.g., key name).
+
+        Returns:
+            The converted value.
+        """
+        if isinstance(value, PipelineBuilder):
+            # Build and run the referenced pipeline immediately
+            pipeline = value.build()
+            if not pipeline.steps:
+                context_msg = f" in '{context}'" if context else ""
+                raise ValueError(f"PipelineBuilder{context_msg} must have at least one step")
+
+            # Run the pipeline to populate the results
+            pipeline.run()
+
+            # Get the last step which now has results
+            last_step_name = list(pipeline.steps.keys())[-1]
+            last_step = pipeline.steps[last_step_name]
+
+            # Clear predecessors since this step is already executed and has its result
+            # This prevents the main pipeline from trying to resolve dependencies
+            # that don't exist in its own step dictionary
+            last_step._predecessors = set()
+            last_step._context_ref = None
+
+            return last_step
+        if isinstance(value, dict):
+            # Recursively convert nested dictionaries
+            return {k: self._convert_value(v, f"{context}.{k}" if context else k) for k, v in value.items()}
+        if isinstance(value, list):
+            # Recursively convert lists
+            return [
+                self._convert_value(item, f"{context}[{i}]" if context else f"[{i}]") for i, item in enumerate(value)
+            ]
+        return value
+
+    def build(self) -> Pipeline:
+        """Build the pipeline from the configured steps.
+
+        Returns:
+            A Pipeline object ready for execution.
+        """
+        return Pipeline(name=self.name, steps=self.steps)
+
+    def run(self) -> None:
+        """Build and run the pipeline immediately.
+
+        This is a convenience method equivalent to calling build().run().
+        """
+        pipeline = self.build()
+        pipeline.run()
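The core mechanism of the new builder is the `__getattr__` hook: an unknown method name is upper-cased, looked up in the action registry, and turned into a closure that appends a step and returns the builder so calls can be chained. A minimal standalone sketch of that dispatch pattern (not the package's implementation; the registry and step representation here are simplified stand-ins):

```python
# Sketch: dynamic method dispatch via __getattr__, as used by PipelineBuilder.
from collections.abc import Callable
from typing import Any


class TinyBuilder:
    """Illustrative only; the real builder creates PipelineStep objects and validates options."""

    # Stand-in for the package's pipeline_actions registry.
    registry: dict[str, Callable[..., str]] = {
        "READ_FILES": lambda **kw: f"READ_FILES({kw})",
        "TRANSFORM_FILTER": lambda **kw: f"TRANSFORM_FILTER({kw})",
    }

    def __init__(self) -> None:
        self.steps: list[str] = []

    def __getattr__(self, name: str) -> Callable[..., "TinyBuilder"]:
        action_name = name.upper()  # read_files -> READ_FILES
        if action_name not in self.registry:
            raise AttributeError(f"No action named {action_name!r}")

        def method(**kwargs: Any) -> "TinyBuilder":
            self.steps.append(self.registry[action_name](**kwargs))
            return self  # returning self keeps the chain going

        return method


builder = TinyBuilder().read_files(location="data/*.csv").transform_filter(condition="amount > 1000")
print(builder.steps)
```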
--- a/cloe_nessy-0.3.19.dist-info/METADATA
+++ b/cloe_nessy-1.0.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 0.3.19
+Version: 1.0.1
 Summary: Your friendly datalake monster.
 Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
@@ -12,11 +12,11 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
-Requires-Python: <3.13,>=3.11
+Requires-Python: <3.14,>=3.11
 Requires-Dist: azure-identity<2.0.0,>=1.19.0
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
 Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
-Requires-Dist: fsspec<2025.7.1,>=2025.7.0
+Requires-Dist: fsspec<2025.12.1,>=2025.12.0
 Requires-Dist: httpx<1.0.0,>=0.27.2
 Requires-Dist: jinja2<4.0.0,>=3.1.4
 Requires-Dist: matplotlib<4.0.0,>=3.9.2
--- a/cloe_nessy-0.3.19.dist-info/RECORD
+++ b/cloe_nessy-1.0.1.dist-info/RECORD
@@ -11,13 +11,13 @@ cloe_nessy/clients/api_client/pagination_strategy.py,sha256=YcvAee8CrJiOxEvuFQ4K
 cloe_nessy/file_utilities/__init__.py,sha256=nY8H48jYHvTy0VYSRHVhZaFMlzfch4-T7y3N73tgMpI,73
 cloe_nessy/file_utilities/exceptions.py,sha256=RDeV2S6AQnFhFINRo84HDV_hk2RMrf5oNQ7GhHmAZy0,97
 cloe_nessy/file_utilities/factory.py,sha256=JONYGI8MCkNwG2_ujvjN3iB7BIdl7SqXKgV05YY_i4E,1735
-cloe_nessy/file_utilities/get_file_paths.py,sha256=wQCNBi7kgM32BSFlCuKFnORd9myjZUygpNm2-tF1F54,2980
+cloe_nessy/file_utilities/get_file_paths.py,sha256=Hgfwtat7SWIjmyQG0WCrp5kOW5O0RWtfv3tHmT3igBE,3008
 cloe_nessy/file_utilities/location_types.py,sha256=G0FjpEu4_inmWbu5tvs2FyZv2TIhmPgjWU_Rtvmd6i8,801
 cloe_nessy/file_utilities/strategies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=2BdGdP8ThjIP4e_fv7apx7Hg_L6q3nsPdek4oPgN7CI,2833
-cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=6OcEjzLvRTBT8FKXhkLI0befT48SHutGHFIXMq5Sq8E,2217
-cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=RnQjWtWIFzFj-zPqzyZaPYIjtjXkgP-K7-VA8GhkNmg,1980
-cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1KVK0TS7FWVrJ3k4OLKh35sCxAU,3194
+cloe_nessy/file_utilities/strategies/base_strategy.py,sha256=HwARDqb59i5HJyF-URbXKNGkOVcXEQn41_xD4W0DrXw,2861
+cloe_nessy/file_utilities/strategies/local_strategy.py,sha256=LxCCggFgH7s1heySy_JtROJCNsSyXkV5kd-VRLIf3ng,2245
+cloe_nessy/file_utilities/strategies/onelake_strategy.py,sha256=Pni_JkKqfbKoEMOCWbBJJdUIhpIFUPTUyIxSCSlPZRM,2009
+cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=w4nrS6IcPPN7UBFBwszCfxgTI6xSE5BdY2WiqGYsFyI,3223
 cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/integration/delta_loader/__init__.py,sha256=ZdBDde1uPtTCL_KAhilVmtVmmGvH5dHb05QsOozkteE,438
 cloe_nessy/integration/delta_loader/delta_load_options.py,sha256=bbPGhC0n8L6CmcmV91Xqq6fWRimxlUHUkr22uVqG0g4,1363
@@ -30,19 +30,19 @@ cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py,sha256=
 cloe_nessy/integration/reader/__init__.py,sha256=NWQx-v6aKE8YOHhsxfeaZnMVq4KLKyRWXzUduf5aVsk,265
 cloe_nessy/integration/reader/api_reader.py,sha256=FbOyfLVG1ryL2GC-MgE1uClHICsQKBj9yZbY4TG5qrk,19637
 cloe_nessy/integration/reader/catalog_reader.py,sha256=DlnykmFjV_v8SCBh3qaCvf24QM-6TdMFVHx5Mqv7Nvs,4850
-cloe_nessy/integration/reader/excel_reader.py,sha256=JGmxQ16ux0HT-MLvAUp-9XMdKUToMb7cdObciZNsYSs,8027
+cloe_nessy/integration/reader/excel_reader.py,sha256=QXm0MaE_-tW5ix-f_3Pgn-Vx7VG5jA_uSp858rVV7lA,8042
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
-cloe_nessy/integration/reader/file_reader.py,sha256=t5zF-cmZo1X0a1rki6ry1rSiFEu5uXRP2rNGd90fwoY,8163
+cloe_nessy/integration/reader/file_reader.py,sha256=FFqqu1h003FY2Df3ru-G1JO4Bg2Ai8Rzh58fjOCN7NM,8262
 cloe_nessy/integration/reader/reader.py,sha256=YHriYkzsBduBjfI2FnP03VEo15a8UCRZ_sXtre8eaEs,1041
 cloe_nessy/integration/writer/__init__.py,sha256=3yzCAGiWZdQWtsbzlTih01sxVTJV2DDYwvl34lEAUlE,243
 cloe_nessy/integration/writer/catalog_writer.py,sha256=dQeXmtfs7J6rP6Ye3OCvxBraFScFX_3SHs7Md58hEeM,5296
 cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70To4L6Q182pXx2HRM,5454
 cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
 cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
-cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=TbpW-j87_H9dcUza34uR6VWslJez406y3_5N1ip0SnM,4740
+cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=nribgHmapp59v3Rw_AfJg0_BRYhP7x2IJIeE74Ia_6A,4748
 cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=Yp_q_ycasW2_wwmzty_6fZeBVcW_0o8gLrr6F1gaUjQ,10195
 cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=m4YFY9_WgaOcnpBviVt3Km-w3wf3NF25wPS-n0NBGcE,970
-cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=upUtDZMzwYFU0kzmkelVgkpFToXkrypcR3h_jvGjz14,8596
+cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=O7hw7YOa2FEzBlzjwPfxQTxm0ZrlszIjjfsHTwE_OhU,8609
 cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
 cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZumY,65
 cloe_nessy/logging/logger_mixin.py,sha256=H8MyMEyb_kEDP0Ow5QStAFLuOkTIeUnneGaj916fKlU,7443
@@ -65,17 +65,18 @@ cloe_nessy/models/templates/create_volume.sql.j2,sha256=XIUf1cHcvAxcGTyhzUiv4xpQ
 cloe_nessy/object_manager/__init__.py,sha256=3sle0vNpPwBOkycxA3XVS9m4XZf5LD3Qd4NGxdqcHno,186
 cloe_nessy/object_manager/table_manager.py,sha256=4eQG-zMiuBpeJmvWdL3KdhHRiPFf8TS0RFNRp8Yz6rY,13887
 cloe_nessy/object_manager/volume_manager.py,sha256=6epd3KXzcNH04EvaKubAfLsaUm9qBMeT3KNvMK04gGs,2727
-cloe_nessy/pipeline/__init__.py,sha256=sespmJ5JsgyiFyZiedTiL2kg--zGIX7cjTYsD5vemEg,325
+cloe_nessy/pipeline/__init__.py,sha256=BUzL4HJaCXWmK7OgKaxdwK72JrrdzfzIvyxOGtM28U0,417
 cloe_nessy/pipeline/pipeline.py,sha256=L4wk3b06LNWRj01nnAkuQpeRrwFTyaV1xTpgYAg4sak,10819
 cloe_nessy/pipeline/pipeline_action.py,sha256=S7IVFdmG12fRBzHuE_DiWn7qlMtApz6IloVd2Fj31Sg,1944
+cloe_nessy/pipeline/pipeline_builder.py,sha256=_BBl43two0pherkTXZ-Yrpt6XcLW8Q-Z98qxbFIsMao,7929
 cloe_nessy/pipeline/pipeline_config.py,sha256=oVQ-IH4etTGZVVEnE-5iDPLYOtWpvDlltWFv1nevnqQ,3229
 cloe_nessy/pipeline/pipeline_context.py,sha256=eCOcjyE16rGRom3L85Gy_BbncfQD6i1x31yrWqZws-4,1881
 cloe_nessy/pipeline/pipeline_parsing_service.py,sha256=eeC4RbGBILGN6zkbUyjH-qGgEMtOWV4Kv_VxrHbHMY0,9021
 cloe_nessy/pipeline/pipeline_plotting_service.py,sha256=goMQj73FzUVchKn5c2SsPcWR6fr7DtVkVrcQfJsKCq4,13111
 cloe_nessy/pipeline/pipeline_step.py,sha256=oTnlvRpB0fbOBQXbPe1URstA5fv-97igCHt_41fKCAk,2082
-cloe_nessy/pipeline/actions/__init__.py,sha256=Qad9kxOQHoMQ1sj-4AxABNNIdaN5QkZAB14DUFKAtUA,2808
+cloe_nessy/pipeline/actions/__init__.py,sha256=FfAnSIl-0T6pnaWhClkDqV8nfTdvLvZZJdwycsZMLPw,2990
 cloe_nessy/pipeline/actions/read_api.py,sha256=MAc7QfmhnaRUMdE09Ywt41RSAsuW4co8zF0zXHwbM8U,16193
-cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=EkP3JSI7VQMkvUsb97ieUeGnnfvyyUI7egvqNWMqK0I,6894
+cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=sx3dezd33c1FawMrxORwhK5GNo1IpjCyuLATWz7esZ0,6735
 cloe_nessy/pipeline/actions/read_excel.py,sha256=IG_VmDEt1TvGVEO0SY9Fm3awHNjfisR1_7DUmhC3NEE,7968
 cloe_nessy/pipeline/actions/read_files.py,sha256=hRcM7wG35vxxLVajW3SK5euHW02qxiXCYSkIl11xiQ0,7308
 cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=i8fQceV63eAqx_x0ANisCkXWfMHyhqsfFHVFH5yP2po,3544
@@ -89,16 +90,18 @@ cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD
 cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1QeatjdEis0up4I7cOWBdyo,1446
 cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
 cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
-cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=H8j_Xadnm3npVNA_nu7Be7v0bJV20ELKMxSsVHHl6CY,8407
+cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=M5_wolJwzJpPTSrZq4yWV3TH7H6BGqbjJkJCwtqPlQo,8507
 cloe_nessy/pipeline/actions/transform_join.py,sha256=ez1M1wVc9khOZj1swMArJbBKXxEpjenUHrW1wL8H330,7200
 cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
+cloe_nessy/pipeline/actions/transform_regex_extract.py,sha256=vMtUW0s_oXy8DC1-4Xh-WQN3CCp8jXYsJiFYvGdYrqE,6390
 cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
 cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
 cloe_nessy/pipeline/actions/transform_union.py,sha256=SZtEzh567CIExUj9yMEgshE28h4dXKT7Wr2TDj4zB4k,2718
+cloe_nessy/pipeline/actions/transform_with_column.py,sha256=c-E1yYkeYmovbN1maT7ImpdQlW0nYvYsHCtDvfe4wt8,3357
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=FyC0scQU8Ul3Uigpk6IN2IJpf_4jRjAqF5yHtDVwG00,4852
-cloe_nessy/pipeline/actions/write_delta_append.py,sha256=2F5qnKPsY_F-2672Ce4Gub7qdna157jEqHHc429fO2A,2962
-cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=zcOk4ytZFUxyGY8U2fdFPLFnw2g_yhaS_vOx_e3wCuE,5847
+cloe_nessy/pipeline/actions/write_delta_append.py,sha256=e1g4mDhwAZdKyt4Gb7ZzHcQrJ1duSl8qOn6ONizRsoM,2934
+cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=fwinlTeZoDuTyrbln5vMu1UJ1LG8ZQrus3LoCVF__I4,5819
 cloe_nessy/pipeline/actions/write_file.py,sha256=JZ8UZslxUn_ttYt5wDyvtHFq2FqYk3vOR8kvExJI8pk,3212
 cloe_nessy/pipeline/utils/__init__.py,sha256=xi02UjBMiXWD7b9gDvww4gyRyowb0eRd_6Wbu0F_cro,118
 cloe_nessy/pipeline/utils/delta_load_utils.py,sha256=KitMNruxePEkecI0h4Jint1JwJpaEog5mCOchMkgan8,1495
@@ -110,6 +113,6 @@ cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_Up
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/column_names.py,sha256=dCNtm61mc5aLkY2oE4rlfN3VLCrpot6fOESjAZmCmhA,361
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.19.dist-info/METADATA,sha256=O3LES1mWSPONQE6q47c_j9s_sQcvU2a5RfL1WQW3JPk,3290
-cloe_nessy-0.3.19.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-cloe_nessy-0.3.19.dist-info/RECORD,,
+cloe_nessy-1.0.1.dist-info/METADATA,sha256=qLn3XYfGsw2pW-pPtUUidtcHZiUtIwOESWY8LCenGEY,3291
+cloe_nessy-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+cloe_nessy-1.0.1.dist-info/RECORD,,