cloe-nessy 0.3.18__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/file_utilities/get_file_paths.py +2 -1
- cloe_nessy/file_utilities/strategies/base_strategy.py +2 -1
- cloe_nessy/file_utilities/strategies/local_strategy.py +2 -1
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +3 -1
- cloe_nessy/file_utilities/strategies/utils_strategy.py +3 -1
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +6 -4
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +1 -1
- cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +1 -1
- cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +1 -1
- cloe_nessy/pipeline/__init__.py +9 -1
- cloe_nessy/pipeline/actions/__init__.py +3 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_catalog_table.py +0 -2
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/actions/transform_hash_columns.py +4 -2
- cloe_nessy/pipeline/actions/transform_with_column.py +104 -0
- cloe_nessy/pipeline/actions/write_delta_append.py +0 -3
- cloe_nessy/pipeline/actions/write_delta_merge.py +0 -3
- cloe_nessy/pipeline/pipeline_builder.py +210 -0
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-1.0.0.dist-info}/METADATA +4 -4
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-1.0.0.dist-info}/RECORD +40 -35
- {cloe_nessy-0.3.18.dist-info → cloe_nessy-1.0.0.dist-info}/WHEEL +1 -1
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Transform action to add or update a column using a SQL expression."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pyspark.sql import functions as F
|
|
6
|
+
|
|
7
|
+
from cloe_nessy.pipeline.pipeline_action import PipelineAction
|
|
8
|
+
from cloe_nessy.pipeline.pipeline_context import PipelineContext
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TransformWithColumnAction(PipelineAction):
    """Pipeline action that derives a column from a SQL expression.

    The expression is handed to PySpark's ``expr()`` and the resulting column is
    attached to the DataFrame under ``column_name`` via ``withColumn``; the
    column is either created or updated under that name.

    Examples:
        === "Create new column"
            ```yaml
            Create Full Name:
                action: TRANSFORM_WITH_COLUMN
                options:
                    column_name: full_name
                    expression: concat(first_name, ' ', last_name)
            ```

        === "Update existing column"
            ```yaml
            Lowercase Email:
                action: TRANSFORM_WITH_COLUMN
                options:
                    column_name: email
                    expression: lower(email)
            ```

        === "Calculated column"
            ```yaml
            Calculate Total:
                action: TRANSFORM_WITH_COLUMN
                options:
                    column_name: total_price
                    expression: price * quantity * (1 + tax_rate)
            ```

        === "Extract date parts"
            ```yaml
            Extract Year:
                action: TRANSFORM_WITH_COLUMN
                options:
                    column_name: year
                    expression: year(order_date)
            ```
    """

    name: str = "TRANSFORM_WITH_COLUMN"

    def run(
        self,
        context: PipelineContext,
        *,
        column_name: str = "",
        expression: str = "",
        **_: Any,
    ) -> PipelineContext:
        """Evaluate ``expression`` and store the result in ``column_name``.

        Args:
            context: Pipeline context whose ``data`` DataFrame is transformed.
            column_name: Target column; created or updated under this name.
            expression: SQL expression producing the column value.
            **_: Extra keyword arguments, accepted and ignored.

        Returns:
            PipelineContext: A context derived from ``context`` that carries the
                transformed DataFrame.

        Raises:
            ValueError: If ``column_name`` is empty.
            ValueError: If ``expression`` is empty.
            ValueError: If ``context.data`` is None.
            Exception: Whatever Spark raises for an invalid expression; it is
                logged and re-raised unchanged.
        """
        # Validate in a fixed order so the first missing option is the one reported.
        required_options = (
            (column_name, "No column_name provided."),
            (expression, "No expression provided."),
        )
        for supplied, complaint in required_options:
            if not supplied:
                raise ValueError(complaint)

        if context.data is None:
            raise ValueError("Data from context is required for transform_with_column")

        self._console_logger.info(f"Adding/updating column '{column_name}' with expression: {expression}")

        source_frame = context.data

        try:
            # F.expr() turns the SQL text into a Column expression
            result_frame = source_frame.withColumn(column_name, F.expr(expression))
        except Exception as e:
            self._console_logger.error(f"Failed to evaluate expression '{expression}' for column '{column_name}': {e}")
            raise
        else:
            self._console_logger.info(f"Successfully added/updated column '{column_name}'")
            return context.from_existing(data=result_frame)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
from collections import OrderedDict
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any, Self
|
|
4
|
+
|
|
5
|
+
from .pipeline import Pipeline
|
|
6
|
+
from .pipeline_step import PipelineStep
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PipelineBuilder:
    """Fluent API builder for creating Nessy pipelines programmatically.

    This class provides a chainable interface for building pipelines using method calls
    instead of YAML configuration. Methods are resolved dynamically: any attribute whose
    upper-cased name is a key of the ``pipeline_actions`` registry becomes a method that
    appends a step for that action.

    Every dynamic method accepts keyword arguments only. The optional ``step_name``
    keyword sets a custom step name; all remaining keywords are passed to the action
    as options.

    Example:
        ```python
        pipeline = (PipelineBuilder("My Pipeline")
            .read_files(location="data/*.csv", extension="csv")
            .transform_clean_column_names()
            .transform_filter(condition="amount > 1000")
            .write_catalog_table(catalog="prod", schema="sales", table="results")
            .build())

        pipeline.run()
        ```
    """

    def __init__(self, name: str) -> None:
        """Initialize the pipeline builder.

        Args:
            name: The name of the pipeline.
        """
        self.name = name
        self.steps: OrderedDict[str, PipelineStep] = OrderedDict()
        # Only used to generate default step names; incremented once per added step.
        self._step_counter = 0

    def __getattr__(self, name: str) -> Callable[..., "PipelineBuilder"]:
        """Dynamically create methods for pipeline actions.

        This method is called when an attribute that doesn't exist is accessed.
        It converts method calls like `read_files()` into the corresponding PipelineAction.

        Args:
            name: The method name being called.

        Returns:
            A callable that adds the corresponding pipeline step.

        Raises:
            AttributeError: If the method name doesn't correspond to a known action.
        """
        # Lazy import to avoid circular import issues
        from .actions import pipeline_actions

        # Convert method name to action name (e.g., read_files -> READ_FILES)
        action_name = name.upper()

        if action_name in pipeline_actions:
            action_class = pipeline_actions[action_name]

            def method(**kwargs: Any) -> "PipelineBuilder":
                return self._add_step(action_class, **kwargs)

            return method

        raise AttributeError(
            f"PipelineBuilder has no method '{name}'. Available actions: {list(pipeline_actions.keys())}"
        )

    @staticmethod
    def _collect_step_names(value: Any) -> set[str]:
        """Return the names of all PipelineStep objects found in ``value``.

        Walks the same containers that ``_convert_value`` walks (dicts and lists),
        so a step nested at any depth is found.
        """
        if isinstance(value, PipelineStep):
            return {value.name}
        if isinstance(value, dict):
            children = value.values()
        elif isinstance(value, list):
            children = value
        else:
            return set()

        names: set[str] = set()
        for child in children:
            names |= PipelineBuilder._collect_step_names(child)
        return names

    def _add_step(self, action_class: type, step_name: str | None = None, **options: Any) -> Self:
        """Add a step to the pipeline.

        Args:
            action_class: The PipelineAction class to instantiate.
            step_name: Optional custom name for the step. Defaults to
                ``step_<counter>_<ActionClassName>``.
            **options: Options to pass to the action. PipelineBuilder instances
                (also nested in lists or dicts) are built, run and replaced by
                their last step.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If a step with the given name already exists, or if a
                referenced PipelineBuilder has no steps.
        """
        if step_name is None:
            step_name = f"step_{self._step_counter:03d}_{action_class.__name__}"

        # Validate that step name is unique
        if step_name in self.steps:
            raise ValueError(
                f"A step with name '{step_name}' already exists in the pipeline. "
                f"Please provide a unique step_name. "
                f"Existing steps: {list(self.steps.keys())}"
            )

        # Convert any PipelineBuilder instances in options to PipelineStep references
        options = self._convert_builder_references(options)

        # The new step takes its context from the most recently added step, if any.
        context_ref = next(reversed(self.steps), None)

        step = PipelineStep(name=step_name, action=action_class(), options=options, _context_ref=context_ref)

        # Remove any predecessors that are from already-executed external pipelines
        # (these steps have results but aren't in our pipeline). The options are
        # searched recursively because builders nested in lists or dicts are
        # converted to PipelineStep objects as well.
        referenced_steps = self._collect_step_names(options)
        external_predecessors = {
            pred_name
            for pred_name in step._predecessors
            if pred_name not in self.steps and pred_name != context_ref and pred_name in referenced_steps
        }
        step._predecessors -= external_predecessors

        self.steps[step_name] = step
        self._step_counter += 1
        return self

    def _convert_builder_references(self, options: dict[str, Any]) -> dict[str, Any]:
        """Convert any PipelineBuilder instances in options to PipelineStep references.

        Each option value is passed through ``_convert_value``, which handles
        PipelineBuilder instances in:
        - Direct values
        - Lists
        - Nested dictionaries

        Args:
            options: Dictionary of options that may contain PipelineBuilder instances.

        Returns:
            A new dictionary with PipelineBuilder instances converted to PipelineStep
            references; the input dictionary is not modified.

        Raises:
            ValueError: If a PipelineBuilder has no steps.
        """
        return {key: self._convert_value(value, key) for key, value in options.items()}

    def _convert_value(self, value: Any, context: str = "") -> Any:
        """Recursively convert a value, handling PipelineBuilder instances.

        When a PipelineBuilder is passed as a value, it is built and executed
        immediately and its last step is returned as the reference. This allows the
        pipeline to be run before the main pipeline that references it.

        Args:
            value: The value to convert.
            context: Context string for error messages (e.g., key name).

        Returns:
            The converted value. Dicts and lists are rebuilt; any other value is
            returned unchanged.

        Raises:
            ValueError: If a PipelineBuilder has no steps.
        """
        if isinstance(value, PipelineBuilder):
            # Build and run the referenced pipeline immediately
            pipeline = value.build()
            if not pipeline.steps:
                context_msg = f" in '{context}'" if context else ""
                raise ValueError(f"PipelineBuilder{context_msg} must have at least one step")

            # Run the pipeline to populate the results
            pipeline.run()

            # Get the last step which now has results
            last_step_name = list(pipeline.steps.keys())[-1]
            last_step = pipeline.steps[last_step_name]

            # Clear predecessors since this step is already executed and has its result
            # This prevents the main pipeline from trying to resolve dependencies
            # that don't exist in its own step dictionary
            last_step._predecessors = set()
            last_step._context_ref = None

            return last_step
        if isinstance(value, dict):
            # Recursively convert nested dictionaries
            return {k: self._convert_value(v, f"{context}.{k}" if context else k) for k, v in value.items()}
        if isinstance(value, list):
            # Recursively convert lists
            return [
                self._convert_value(item, f"{context}[{i}]" if context else f"[{i}]") for i, item in enumerate(value)
            ]
        return value

    def build(self) -> Pipeline:
        """Build the pipeline from the configured steps.

        Returns:
            A Pipeline object ready for execution.
        """
        return Pipeline(name=self.name, steps=self.steps)

    def run(self) -> None:
        """Build and run the pipeline immediately.

        This is a convenience method equivalent to calling build().run().
        """
        pipeline = self.build()
        pipeline.run()
|
|
@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
|
|
|
83
83
|
context: str | None = None
|
|
84
84
|
table_metadata: str | None = None
|
|
85
85
|
options: dict = Field(default_factory=dict)
|
|
86
|
+
env: dict = Field(default_factory=dict)
|
|
86
87
|
|
|
87
88
|
|
|
88
89
|
class PipelineConfig(PipelineConfigBaseModel):
|
|
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):
|
|
|
90
91
|
|
|
91
92
|
name: str
|
|
92
93
|
steps: OrderedDict[str, PipelineStepConfig]
|
|
94
|
+
env: dict[str, str] = Field(default_factory=dict)
|
|
@@ -3,6 +3,7 @@ import re
|
|
|
3
3
|
from collections import OrderedDict
|
|
4
4
|
from enum import Enum
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
import yaml
|
|
8
9
|
|
|
@@ -10,7 +11,7 @@ from ..logging import LoggerMixin
|
|
|
10
11
|
from ..session import SessionManager
|
|
11
12
|
from .actions import PipelineActionType, pipeline_actions
|
|
12
13
|
from .pipeline import Pipeline
|
|
13
|
-
from .pipeline_config import PipelineConfig
|
|
14
|
+
from .pipeline_config import PipelineConfig, PipelineStepConfig
|
|
14
15
|
from .pipeline_step import PipelineStep
|
|
15
16
|
|
|
16
17
|
|
|
@@ -63,49 +64,22 @@ class PipelineParsingService:
|
|
|
63
64
|
if not yaml_str:
|
|
64
65
|
raise ValueError("YAML content is empty.")
|
|
65
66
|
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
secrets_repl_yaml_str = PipelineParsingService._replace_secret_refs(yaml_str)
|
|
68
|
+
fixed_yaml_str = PipelineParsingService._fix_yaml_str_with_templates(secrets_repl_yaml_str)
|
|
69
|
+
config = yaml.safe_load(fixed_yaml_str)
|
|
68
70
|
pipeline_config = PipelineConfig.metadata_to_instance(config)
|
|
69
|
-
steps = PipelineParsingService._get_steps(pipeline_config.steps)
|
|
71
|
+
steps = PipelineParsingService._get_steps(pipeline_config.steps, pipeline_config.env)
|
|
70
72
|
pipeline = Pipeline(name=pipeline_config.name, steps=steps) # type: ignore
|
|
71
73
|
console_logger.info("Pipeline [ '%s' ] parsed successfully with %d steps.", pipeline.name, len(pipeline.steps))
|
|
72
74
|
return pipeline
|
|
73
75
|
|
|
74
76
|
@staticmethod
|
|
75
|
-
def
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
scope-name is the name of the secret scope and secret-key is the key of
|
|
82
|
-
the secret.
|
|
83
|
-
|
|
84
|
-
Args:
|
|
85
|
-
yaml_str: A string that can be parsed in YAML format.
|
|
86
|
-
|
|
87
|
-
Returns:
|
|
88
|
-
The same YAML string with environment variable placeholders replaced.
|
|
89
|
-
"""
|
|
90
|
-
env_var_pattern = r"\{\{env:([^}]+)\}\}"
|
|
91
|
-
secret_ref_pattern = r"\{\{(?!step|env)([^}]+):([^}]+)\}\}"
|
|
92
|
-
|
|
93
|
-
def replace_with_env_var(match):
|
|
94
|
-
env_var_name = match.group(1)
|
|
95
|
-
env_var_value = os.getenv(env_var_name)
|
|
96
|
-
return env_var_value
|
|
97
|
-
|
|
98
|
-
def replace_with_secret(match):
|
|
99
|
-
secret_scope_name = match.group(1)
|
|
100
|
-
secret_key = match.group(2)
|
|
101
|
-
return SessionManager.get_utils().secrets.get(scope=secret_scope_name, key=secret_key)
|
|
102
|
-
|
|
103
|
-
env_replaced_yaml_string = re.sub(env_var_pattern, replace_with_env_var, yaml_str)
|
|
104
|
-
final_yaml_string = re.sub(secret_ref_pattern, replace_with_secret, env_replaced_yaml_string)
|
|
105
|
-
return final_yaml_string
|
|
106
|
-
|
|
107
|
-
@staticmethod
|
|
108
|
-
def _get_steps(step_configs, last_step_name: str | None = None):
|
|
77
|
+
def _get_steps(
|
|
78
|
+
step_configs: OrderedDict[str, PipelineStepConfig],
|
|
79
|
+
pipeline_env: dict[str, str],
|
|
80
|
+
last_step_name: str | None = None,
|
|
81
|
+
) -> OrderedDict[str, PipelineStep]:
|
|
82
|
+
os_env = dict(os.environ)
|
|
109
83
|
steps = OrderedDict()
|
|
110
84
|
for step_name, step_config in step_configs.items():
|
|
111
85
|
is_successor = step_config.is_successor
|
|
@@ -115,19 +89,99 @@ class PipelineParsingService:
|
|
|
115
89
|
action = PipelineActionType[step_config.action.name].value()
|
|
116
90
|
step = PipelineStep(
|
|
117
91
|
name=step_name,
|
|
92
|
+
env=step_config.env,
|
|
118
93
|
action=action,
|
|
119
94
|
options=step_config.options,
|
|
120
95
|
_context_ref=context_ref,
|
|
121
96
|
_table_metadata_ref=step_config.table_metadata,
|
|
122
97
|
)
|
|
123
|
-
steps[step.name] = step
|
|
98
|
+
steps[step.name] = PipelineParsingService._resolve_env_vars(step, os_env, pipeline_env)
|
|
124
99
|
last_step_name = step_name
|
|
125
100
|
for step in steps.values():
|
|
126
101
|
steps[step.name] = PipelineParsingService._replace_step_refs(steps, step)
|
|
127
102
|
return steps
|
|
128
103
|
|
|
104
|
+
@staticmethod
|
|
105
|
+
def _replace_secret_refs(yaml_str: str) -> str:
    """Substitute secret placeholders in a YAML string with the secret values.

    A placeholder has the form `{{secret-scope-name:secret-key}}`: the part
    before the colon is the secret scope, the part after it is the key inside
    that scope. Placeholders starting with `env:` or `step:` are left alone.

    Args:
        yaml_str: A string that can be parsed in YAML format.

    Returns:
        The YAML string with every secret placeholder replaced by the value
        returned from the session utilities' secrets lookup.
    """
    placeholder = re.compile(r"\{\{(?!(?:env|step):)([^}]+):([^}]+)\}\}")

    def _lookup(found):
        # Both groups always participate in a match: (scope, key).
        scope, key = found.groups()
        return SessionManager.get_utils().secrets.get(scope=scope, key=key)

    return placeholder.sub(_lookup, yaml_str)
|
|
125
|
+
|
|
126
|
+
@staticmethod
|
|
127
|
+
def _resolve_env_vars(step: PipelineStep, os_env: dict[str, str], pipeline_env: dict[str, str]) -> PipelineStep:
    """Expand `{{env:var-name}}` placeholders inside a step's options.

    Strings are searched for placeholders at any depth of nested lists and
    dictionary values (dictionary keys are not touched). Variable names must
    match `[A-Z_][A-Z0-9_]*`. Lookups use a merged environment in which step
    variables override pipeline variables, which override OS variables.

    Args:
        step: Step definition whose options are resolved in place.
        os_env: OS scope environment variables.
        pipeline_env: Pipeline scope environment variables.

    Returns:
        The same step object, with placeholders in its options replaced.

    Raises:
        KeyError: If a referenced variable is defined in none of the scopes.
    """
    # Nothing to resolve; the step's env is not even consulted in this case.
    if not step.options:
        return step

    placeholder = re.compile(r"\{\{env:([A-Z_][A-Z0-9_]*)\}\}")

    # Later updates win: OS < pipeline < step.
    merged_env = dict(os_env)
    merged_env.update(pipeline_env)
    merged_env.update(step.env)

    def _substitute(found):
        variable = found.group(1)
        if variable not in merged_env:
            raise KeyError(f"Environment variable '{variable}' is not defined")
        return str(merged_env[variable])

    def _walk(node: Any) -> Any:
        if isinstance(node, str):
            return placeholder.sub(_substitute, node)
        if isinstance(node, list):
            return [_walk(element) for element in node]
        if isinstance(node, dict):
            return {name: _walk(child) for name, child in node.items()}
        # Any other type (numbers, booleans, None, objects) passes through.
        return node

    # Only existing keys are reassigned, so iterating the dict itself is safe.
    for option_name in step.options:
        step.options[option_name] = _walk(step.options[option_name])

    return step
|
|
170
|
+
|
|
129
171
|
@staticmethod
|
|
130
172
|
def _replace_step_refs(steps: OrderedDict[str, PipelineStep], step: PipelineStep) -> PipelineStep:
|
|
173
|
+
"""Replaces placeholders that reference other steps in a step definition.
|
|
174
|
+
|
|
175
|
+
Replaces other steps references with the pattern `((step:step-name))`.
|
|
176
|
+
Where the `step-name` is the name of the referenced step.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
steps: All pipeline steps definitions.
|
|
180
|
+
step: Step definition in which the replacement occurs.
|
|
181
|
+
|
|
182
|
+
Returns:
|
|
183
|
+
The same step definition with referenced step names replaced.
|
|
184
|
+
"""
|
|
131
185
|
step_ref_pattern = r"\(\(step:([^)]+)\)\)"
|
|
132
186
|
|
|
133
187
|
def _handle_string_value(value: str, option: str):
|
|
@@ -154,3 +208,14 @@ class PipelineParsingService:
|
|
|
154
208
|
_handle_list_value(value, option)
|
|
155
209
|
|
|
156
210
|
return step
|
|
211
|
+
|
|
212
|
+
@staticmethod
|
|
213
|
+
def _fix_yaml_str_with_templates(yaml_str: str) -> str:
    """Quote unquoted `{{env:...}}` templates before yaml.safe_load.

    A template that follows a colon and is the last token on its line
    (optionally followed by a `#` comment) is wrapped in double quotes, e.g.
    `key: {{env:X}}` becomes `key: "{{env:X}}"`.

    A template glued directly to a colon is left untouched when the same line
    already contains a `: ` mapping indicator before it. In that case the colon
    is part of the value (`url: jdbc:{{env:DB}}`), and quoting would split the
    value into an invalid nested mapping.

    Args:
        yaml_str: Raw YAML text that may contain unquoted templates.

    Returns:
        The YAML text with the qualifying templates quoted.
    """
    unquoted_template = re.compile(r"(:)\s*(\{\{env:[^}]+\}\})(?=\s*$|\s+#)", re.MULTILINE)
    # A colon followed by a space or tab is YAML's key/value separator.
    mapping_indicator = re.compile(r":[ \t]")

    def replacer(match):
        colon, template = match.groups()
        glued_to_colon = match.end(1) == match.start(2)
        if glued_to_colon:
            # Look only at the text between the start of this line and the colon.
            line_start = yaml_str.rfind("\n", 0, match.start()) + 1
            if mapping_indicator.search(yaml_str, line_start, match.start()):
                # The line already has its key; this colon belongs to the value.
                return match.group(0)
        return f'{colon} "{template}"'

    return unquoted_template.sub(replacer, yaml_str)
|
|
@@ -15,6 +15,7 @@ class PipelineStep:
|
|
|
15
15
|
Attributes:
|
|
16
16
|
name: The name of the step.
|
|
17
17
|
action: The action to be executed.
|
|
18
|
+
env: The step environment variables.
|
|
18
19
|
is_successor: A boolean indicating if the step is a successor and takes
|
|
19
20
|
the previous steps context.
|
|
20
21
|
context: The context of the step.
|
|
@@ -26,6 +27,7 @@ class PipelineStep:
|
|
|
26
27
|
|
|
27
28
|
name: str
|
|
28
29
|
action: PipelineAction
|
|
30
|
+
env: dict[str, str] = field(default_factory=lambda: {})
|
|
29
31
|
context: PipelineContext = field(default_factory=lambda: PipelineContext())
|
|
30
32
|
options: dict[str, Any] = field(default_factory=lambda: {})
|
|
31
33
|
result: PipelineContext = field(default_factory=lambda: PipelineContext())
|
cloe_nessy/session/__init__.py
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Re-exports the PySpark types that match the active session flavour, so the
# rest of the package can import them from a single place.
from typing import TYPE_CHECKING

from pyspark.sql.utils import is_remote

if TYPE_CHECKING:
    # Static type checkers always see the classic pyspark.sql classes.
    from pyspark.sql import Column, DataFrame, SparkSession
else:
    # Real runtime imports
    if is_remote():
        from pyspark.sql.connect.column import Column
        from pyspark.sql.connect.dataframe import DataFrame
        from pyspark.sql.connect.session import SparkSession
    else:
        from pyspark.sql import Column, DataFrame, SparkSession

# Column is imported in both runtime branches so that every name listed here
# actually exists at runtime (needed for `import *` and direct imports).
__all__ = ["SparkSession", "DataFrame", "Column"]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cloe-nessy
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Your friendly datalake monster.
|
|
5
5
|
Project-URL: homepage, https://initions.com/
|
|
6
6
|
Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
|
|
@@ -16,7 +16,7 @@ Requires-Python: <3.13,>=3.11
|
|
|
16
16
|
Requires-Dist: azure-identity<2.0.0,>=1.19.0
|
|
17
17
|
Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
|
|
18
18
|
Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
|
|
19
|
-
Requires-Dist: fsspec<2025.
|
|
19
|
+
Requires-Dist: fsspec<2025.12.1,>=2025.12.0
|
|
20
20
|
Requires-Dist: httpx<1.0.0,>=0.27.2
|
|
21
21
|
Requires-Dist: jinja2<4.0.0,>=3.1.4
|
|
22
22
|
Requires-Dist: matplotlib<4.0.0,>=3.9.2
|
|
@@ -58,12 +58,12 @@ Extract-Transform-Load (ETL) Workflow.
|
|
|
58
58
|
|
|
59
59
|
When you are contributing, please refer to our Contribution Guide in the *nessy*
|
|
60
60
|
Docs
|
|
61
|
-
[here](https://
|
|
61
|
+
[here](https://mango-tree-0b8dd3b03.1.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!
|
|
62
62
|
|
|
63
63
|
## Usage
|
|
64
64
|
|
|
65
65
|
Please find the User Guide
|
|
66
|
-
[here](https://
|
|
66
|
+
[here](https://mango-tree-0b8dd3b03.1.azurestaticapps.net/tool_docs/nessy/User-Guide/)!
|
|
67
67
|
|
|
68
68
|
## Contact
|
|
69
69
|
|