cloe-nessy 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
+++ b/cloe_nessy/pipeline/actions/write_catalog_table.py
@@ -0,0 +1,73 @@
+from typing import Any
+
+from ...integration.writer import CatalogWriter
+from ..pipeline_action import PipelineAction
+from ..pipeline_context import PipelineContext
+
+
+class WriteCatalogTableAction(PipelineAction):
+    """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].
+
+    Example:
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
+                options: <options for the writer>
+        ```
+    """
+
+    name: str = "WRITE_CATALOG_TABLE"
+
+    @staticmethod
+    def run(
+        context: PipelineContext,
+        *,
+        table_identifier: str | None = None,
+        mode: str = "append",
+        partition_by: str | list[str] | None = None,
+        options: dict[str, str] | None = None,
+        **_: Any,
+    ) -> PipelineContext:
+        """Writes a DataFrame to a specified catalog table.
+
+        Args:
+            context: Context in which this Action is executed.
+            table_identifier: The table identifier in the unity catalog in the
+                format 'catalog.schema.table'. If not provided, attempts to use
+                the context's table metadata.
+            mode: The write mode. One of 'append', 'overwrite', 'error',
+                'errorifexists', or 'ignore'.
+            partition_by: Names of the partitioning columns.
+            options: Additional options for the write operation.
+
+        Raises:
+            ValueError: If the table name is not specified or cannot be inferred
+                from the context.
+
+        Returns:
+            Context after the execution of this Action.
+        """
+        if not options:
+            options = dict()
+        if partition_by is None:
+            if hasattr(context.table_metadata, "partition_by"):
+                partition_by = context.table_metadata.partition_by  # type: ignore
+
+        if (table_metadata := context.table_metadata) and table_identifier is None:
+            table_identifier = table_metadata.identifier
+        if table_identifier is None:
+            raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
+
+        writer = CatalogWriter()
+        writer.write_table(
+            df=context.data,  # type: ignore
+            table_identifier=table_identifier,
+            mode=mode,
+            partition_by=partition_by,
+            options=options,
+        )
+        return context.from_existing()
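The resolution order in `run` above is: an explicit `table_identifier` option wins, otherwise both the identifier and `partition_by` fall back to the context's table metadata. A minimal sketch of invoking the action directly, assuming an existing `PipelineContext` named `ctx` that already carries a DataFrame (the table name and writer option are illustrative; in normal use the pipeline runner calls this from a YAML step):

```python
# Hypothetical direct invocation, not the usual YAML-driven path.
from cloe_nessy.pipeline.actions.write_catalog_table import WriteCatalogTableAction

new_ctx = WriteCatalogTableAction.run(
    ctx,  # assumed PipelineContext with a DataFrame in ctx.data
    table_identifier="my_catalog.business_schema.sales_table",
    mode="append",
    partition_by="day",
    options={"mergeSchema": "true"},  # illustrative option passed through to the writer
)
```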
+++ b/cloe_nessy/pipeline/pipeline.py
@@ -0,0 +1,201 @@
+import os
+from collections import OrderedDict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
+
+import matplotlib.pyplot as plt
+import networkx as nx
+
+from ..logging.logger_mixin import LoggerMixin
+from .pipeline_step import PipelineStep
+
+
+class Pipeline(LoggerMixin):
+    """A Pipeline represents the logical unit of one ETL process.
+
+    This class manages a directed acyclic graph (DAG) of steps, ensuring that
+    each step is executed in the correct order based on dependencies.
+
+    Attributes:
+        name: The name of the pipeline.
+        steps: An ordered dictionary of PipelineSteps that are part of the pipeline.
+    """
+
+    def __init__(self, name: str, steps: OrderedDict[str, "PipelineStep"] | None = None) -> None:
+        self.name: str = name
+        self.steps: OrderedDict[str, PipelineStep] = steps if steps is not None else OrderedDict()
+        self._console_logger = self.get_console_logger()
+        self._graph: nx.DiGraph = self._create_graph()
+        self._lock: Lock = Lock()
+
+    @property
+    def graph(self) -> nx.DiGraph:
+        """Get the pipeline graph."""
+        return self._graph
+
+    def _create_graph(self) -> nx.DiGraph:
+        """Creates a directed acyclic graph (DAG) representing the pipeline steps and their dependencies.
+
+        Each node in the graph represents a single step in the pipeline, and each edge represents a dependency.
+        """
+        g: nx.DiGraph = nx.DiGraph()
+        g.add_nodes_from(set([s.name for s in self.steps.values()]))
+        g.add_edges_from(set([(p, s.name) for s in self.steps.values() for p in s._predecessors if p]))
+
+        self._console_logger.debug(f"Graph created with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges.")
+        return g
+
+    def _run_step(self, step_name: str) -> None:
+        """Executes the run method of the corresponding step in the pipeline."""
+        step = self.steps[step_name]
+
+        # Handle context and metadata references
+        if step._context_ref:
+            step.context = self.steps[step._context_ref].result
+        if step._table_metadata_ref:
+            step.context.table_metadata = self.steps[step._table_metadata_ref].result.table_metadata
+
+        try:
+            self._console_logger.info(f"Starting execution of step: {step.name}")
+            step.run()
+        except Exception as err:
+            self._console_logger.error(f"Execution of step {step.name} failed with error: {str(err)}")
+            raise err
+        else:
+            self._console_logger.info(f"Execution of step {step.name} succeeded.")
+
+    def _get_ready_to_run_steps(self, remaining_steps: list[str], g: nx.DiGraph) -> set[str]:
+        """Identifies and returns the steps that are ready to run.
+
+        This method checks the directed acyclic graph (DAG) to find steps that have no predecessors,
+        indicating that they are ready to be executed. It logs the remaining steps and the steps that
+        are ready to run.
+
+        Args:
+            remaining_steps: A list of step IDs that are yet to be executed.
+            g: The directed acyclic graph representing the pipeline.
+
+        Returns:
+            A set of step IDs that are ready to be executed.
+        """
+        with self._lock:
+            ready_to_run = set([step for step in remaining_steps if g.in_degree(step) == 0])
+            self._console_logger.debug(f"Remaining steps: {remaining_steps}")
+            self._console_logger.debug(f"Ready to run: {ready_to_run}")
+            return ready_to_run
+
+    def _submit_ready_steps(
+        self, ready_to_run: set[str], remaining_steps: list[str], executor: ThreadPoolExecutor, futures: dict
+    ):
+        """Submits the ready-to-run steps to the executor for execution.
+
+        This method takes the steps that are ready to run, removes them from the list of remaining steps,
+        and submits them to the executor for concurrent execution. It also updates the futures dictionary
+        to keep track of the submitted tasks.
+
+        Args:
+            ready_to_run: A set of steps that are ready to be executed.
+            remaining_steps: A list of steps that are yet to be executed.
+            executor: The executor that manages the concurrent execution of steps.
+            futures: A dictionary mapping futures to their corresponding step ID.
+        """
+        with self._lock:
+            for step in ready_to_run:
+                self._console_logger.debug(f"Submitting: {step}")
+                remaining_steps.remove(step)
+                future = executor.submit(self._run_step, step)
+                futures[future] = step
+
+    def _handle_completed_tasks(self, futures, g, remaining_steps):
+        """Handles the completion of tasks in the pipeline.
+
+        This method processes the futures that have completed execution. It removes the corresponding
+        steps from the directed acyclic graph (DAG) and checks if new steps are ready to run. If new
+        steps are ready, it returns True to indicate that the pipeline can continue execution.
+
+        Args:
+            futures: A dictionary mapping futures to their corresponding steps.
+            g: The directed acyclic graph representing the pipeline.
+            remaining_steps: A list of steps that are yet to be executed.
+
+        Returns:
+            True if new steps are ready to run, False otherwise.
+        """
+        # Wait for tasks to complete and free up dependencies
+        for future in as_completed(futures):
+            future.result()  # checks if the run was successful, otherwise throws an error and cancels remaining futures
+            step = futures[future]
+            del futures[future]
+            with self._lock:
+                g.remove_node(step)  # Mark the step as completed by removing it from the graph.
+            if len(set([step for step in remaining_steps if g.in_degree(step) == 0])) > 0:
+                self._console_logger.debug("New steps ready to run")
+                return True
+        self._console_logger.debug("No more steps to run")
+        return False
+
+    def run(self) -> None:
+        """Runs the pipeline by executing each step in the correct order."""
+        g = self._create_graph()
+        remaining_steps = list(g.nodes())
+        self._console_logger.info(f"Pipeline [' {self.name} '] started with {len(remaining_steps)} steps.")
+
+        with ThreadPoolExecutor(max_workers=int(os.environ.get("NESSY_MAX_WORKERS", 1))) as executor:
+            futures: dict = {}
+            try:
+                self._console_logger.debug(f"Remaining steps: {remaining_steps}")
+                while remaining_steps:
+                    ready_to_run = self._get_ready_to_run_steps(remaining_steps, g)
+                    if not ready_to_run:
+                        # If there are still steps to be executed, but all of them have
+                        # predecessors, there must be a cyclic dependency in the graph.
+                        self._console_logger.error(
+                            f"Cyclic dependency detected in the pipeline. Remaining steps: {remaining_steps}"
+                        )
+                        raise RuntimeError("Cyclic dependency detected in the pipeline!")
+
+                    self._submit_ready_steps(ready_to_run, remaining_steps, executor, futures)
+
+                    if self._handle_completed_tasks(futures, g, remaining_steps):
+                        continue
+            except RuntimeError as e:
+                self._console_logger.error(f"Pipeline [' {self.name} '] failed due to cyclic dependency: {str(e)}")
+                raise e
+            except Exception as e:
+                self._console_logger.error(f"Pipeline [' {self.name} '] failed: {str(e)}")
+                raise e
+            finally:
+                # Ensure that any futures are canceled (if successful, everything finished
+                # anyway; if an error occurred, cancel the still-running futures).
+                for future in futures:
+                    future.cancel()  # Cancel remaining futures
+                self._graph = self._create_graph()  # recreate the graph after the run
+        self._console_logger.info(f"Pipeline [' {self.name} '] completed successfully.")
+
+    def plot_graph(self, save_path: str | None = None) -> None:
+        """Visualizes the graph of the pipeline using matplotlib.
+
+        Args:
+            save_path: If provided, the graph will be saved to this path. Otherwise, it will be shown.
+        """
+        pos = nx.spring_layout(self._graph)  # Position steps (nodes) using the spring layout
+        plt.figure(figsize=(12, 8))
+        nx.draw(
+            self._graph,
+            pos,
+            with_labels=True,
+            node_color="lightblue",
+            font_weight="bold",
+            node_size=3000,
+            font_size=10,
+            edge_color="gray",
+        )
+
+        # Draw edge labels if needed
+        edge_labels = nx.get_edge_attributes(self._graph, "label")
+        nx.draw_networkx_edge_labels(self._graph, pos, edge_labels=edge_labels)
+
+        if save_path:
+            plt.savefig(save_path)
+            self._console_logger.info(f"Graph visual saved to {save_path}")
+        else:
+            plt.show()
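The scheduler in `run` is a Kahn-style topological traversal: a step with in-degree zero has no unfinished predecessors and may be submitted, and completing a step removes its node from the graph, which unblocks its successors. A self-contained sketch of that core loop with plain networkx and toy step names (no threading):

```python
import networkx as nx

# Toy dependency graph: read -> transform -> write, plus an independent audit step.
g = nx.DiGraph([("read", "transform"), ("transform", "write")])
g.add_node("audit")

remaining = list(g.nodes())
while remaining:
    # In-degree 0 means every predecessor has finished; these could run concurrently.
    ready = [s for s in remaining if g.in_degree(s) == 0]
    if not ready:
        raise RuntimeError("Cyclic dependency detected in the pipeline!")
    print("running:", ready)  # first pass: ['read', 'audit']
    for s in ready:
        remaining.remove(s)
        g.remove_node(s)  # completion unblocks the step's successors
```

In the real class the ready steps go to a `ThreadPoolExecutor` whose size is read from the `NESSY_MAX_WORKERS` environment variable, defaulting to a single worker.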
+++ b/cloe_nessy/pipeline/pipeline_action.py
@@ -0,0 +1,62 @@
+import logging
+from abc import ABC, ABCMeta, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from ..logging import LoggerMixin
+from .pipeline_context import PipelineContext
+
+
+@dataclass
+class PipelineActionLogs:
+    """Dataclass defining the pipeline action logs table."""
+
+    table_name: str = "nessy_action_logs"
+    log_type: str = "nessy_action_logs"
+    columns: dict[str, str] = field(
+        default_factory=lambda: {
+            "action_name": "STRING",
+            "message": "STRING",
+        }
+    )
+
+
+class PipelineActionMeta(ABCMeta):
+    """Metaclass for PipelineAction to ensure that all subclasses have a 'name' attribute."""
+
+    def __init__(cls, name, bases, dct):
+        if cls.__name__ != "PipelineAction" and "name" not in dct:
+            raise TypeError(f"Class {name} is missing required 'name' attribute")
+        super().__init__(name, bases, dct)
+
+
+class PipelineAction(ABC, LoggerMixin, metaclass=PipelineActionMeta):
+    """Models the operation being executed against an Input.
+
+    Attributes:
+        name: The name of the action.
+    """
+
+    name: str
+
+    def __init__(self, tabular_logger: logging.Logger | None = None) -> None:
+        """Initializes the PipelineAction object.
+
+        Args:
+            tabular_logger: The tabular logger to use for dependency injection.
+        """
+        self._console_logger = self.get_console_logger()
+        self._tabular_logger = tabular_logger or self.get_tabular_logger(
+            logger_name="Tabular:PipelineAction",
+            uc_table_name=PipelineActionLogs().table_name,
+            uc_table_columns=PipelineActionLogs().columns,
+            log_type=PipelineActionLogs().log_type,
+        )
+
+    def __str__(self) -> str:
+        return self.__class__.__name__
+
+    @abstractmethod
+    def run(self, context: PipelineContext, **kwargs: Any) -> PipelineContext:
+        """Execute the pipeline action."""
+        pass
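Because the check runs in the metaclass's `__init__`, a subclass that forgets `name` fails when the class statement executes (typically at import time) rather than mid-pipeline. A sketch of what action authors see (class names are illustrative):

```python
# A conforming action: declares its own `name` and implements run().
class MyAction(PipelineAction):
    name: str = "MY_ACTION"

    @staticmethod
    def run(context: PipelineContext, **kwargs):
        return context.from_existing()

# A non-conforming action fails immediately:
#
#     class BrokenAction(PipelineAction):  # no `name` attribute
#         pass
#
# TypeError: Class BrokenAction is missing required 'name' attribute
```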
+++ b/cloe_nessy/pipeline/pipeline_config.py
@@ -0,0 +1,92 @@
+import logging
+from collections import OrderedDict
+from typing import Any
+
+from pydantic import BaseModel, Field, ValidationError, model_validator
+
+
+class PipelineConfigBaseModel(BaseModel):
+    """The base model for Pipeline Config objects."""
+
+    @classmethod
+    def metadata_to_instance(cls, data: dict) -> Any:
+        """Parses a dictionary into an instance.
+
+        Args:
+            data: The data to parse.
+
+        Returns:
+            The parsed instance. Validation errors are logged and re-raised as a ValueError.
+        """
+        errors = []
+        try:
+            instance = cls(**data)
+        except ValidationError as e:
+            instance = None
+            errors.append(e)
+        if errors:
+            PipelineConfig.handle_validation_errors(errors)
+        return instance
+
+    @staticmethod
+    def handle_validation_errors(errors: list[ValidationError]) -> None:
+        """Cleanly logs Pydantic validation errors and raises a ValueError.
+
+        Args:
+            errors: A list of Pydantic validation errors.
+
+        Raises:
+            ValueError: If any validation errors occurred.
+        """
+        logger = logging.getLogger(__name__)
+        for error in errors:
+            if isinstance(error, ValidationError):
+                logger.error(f"Validation errors for {error.title}:")
+                for err in error.errors():
+                    loc = ".".join(map(str, err["loc"]))
+                    msg = err["msg"]
+                    err_type = err["type"]
+                    input_value = err.get("input", "N/A")
+                    logger.error(f"  Location: {loc}")
+                    logger.error(f"  Error Message: {msg}")
+                    logger.error(f"  Error Type: {err_type}")
+                    logger.error(f"  Input Value: {input_value}")
+                    logger.error(f"  Further information: {err.get('ctx', {}).get('url', 'N/A')}")
+                    logger.error("")
+            else:
+                logger.error(error)
+        if errors:
+            raise ValueError(f"Validation errors occurred: {errors}")
+
+
+class PipelineActionConfig(PipelineConfigBaseModel):
+    """This class stores the configuration for a pipeline action."""
+
+    name: str
+
+    @model_validator(mode="before")
+    def validate_action(cls, v):
+        """The Pipeline Action must be a valid action type."""
+        # This validation was removed in favor of custom validations in YAML
+        # pipeline definitions.
+        # if v not in PipelineActionType.__members__:  # noqa: ERA001
+        #     raise ValueError(f"Action '{v}' is not a valid action.")  # noqa: ERA001
+        action_config = {"name": v}
+        return action_config
+
+
+class PipelineStepConfig(PipelineConfigBaseModel):
+    """This class stores the configuration for a pipeline step."""
+
+    action: PipelineActionConfig
+    is_successor: bool = True
+    context: str | None = None
+    table_metadata: str | None = None
+    options: dict = Field(default_factory=dict)
+
+
+class PipelineConfig(PipelineConfigBaseModel):
+    """This class stores the configuration for a pipeline."""
+
+    name: str
+    steps: OrderedDict[str, PipelineStepConfig]
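Because `validate_action` runs with `mode="before"`, a step's `action` may be written as a bare string in YAML; the validator wraps it into `{"name": ...}` before field validation. A minimal parsing sketch (pipeline and table names are illustrative):

```python
raw = {
    "name": "demo_pipeline",
    "steps": {
        "Write Table to Catalog": {
            "action": "WRITE_CATALOG_TABLE",  # bare string, wrapped by the validator
            "options": {"table_identifier": "my_catalog.business_schema.sales_table"},
        },
    },
}

config = PipelineConfig.metadata_to_instance(raw)
print(config.steps["Write Table to Catalog"].action.name)  # WRITE_CATALOG_TABLE
```

An invalid document (for example, one missing the top-level `name`) is logged field by field via `handle_validation_errors` and surfaces as a single ValueError.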
+++ b/cloe_nessy/pipeline/pipeline_context.py
@@ -0,0 +1,56 @@
+from typing import Any
+
+from pyspark.sql import DataFrame
+
+from ..models import Table
+
+
+class PipelineContext:
+    """A class that models the context of a pipeline.
+
+    The context consists of Table Metadata (the Table definition) and the actual data
+    as a DataFrame.
+
+    Attributes:
+        table_metadata: The Nessy-Table definition.
+        data: The data of the context.
+        runtime_info: Additional runtime information, e.g. streaming status.
+        status: The status of the context. Can be "initialized", "successful" or
+            "failed".
+
+    Note:
+        This is not a pydantic class, because Fabric does not support the type ConnectDataFrame.
+    """
+
+    def __init__(
+        self,
+        table_metadata: Table | None = None,
+        data: DataFrame | None = None,
+        runtime_info: dict[str, Any] | None = None,
+        status: str = "initialized",
+    ) -> None:
+        self.table_metadata = table_metadata
+        self.data = data
+        self.runtime_info = runtime_info if runtime_info is not None else {}
+        self.status = status
+
+    def from_existing(
+        self,
+        table_metadata: Table | None = None,
+        data: DataFrame | None = None,
+        runtime_info: dict[str, Any] | None = None,
+    ) -> "PipelineContext":
+        """Creates a new PipelineContext from an existing one.
+
+        Args:
+            table_metadata: The metadata of the new context.
+            data: The data of the new context.
+            runtime_info: The runtime_info of the new context.
+
+        Returns:
+            The new PipelineContext.
+        """
+        final_metadata = table_metadata or self.table_metadata
+        final_data = data or self.data
+        final_runtime_info = runtime_info or self.runtime_info or {}
+        return PipelineContext(table_metadata=final_metadata, data=final_data, runtime_info=final_runtime_info)
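`from_existing` gives actions a cheap way to chain contexts: whatever an action does not override is carried over from its input. A small sketch using only the plain attributes (no Spark session needed):

```python
ctx = PipelineContext(runtime_info={"is_streaming": False})

# An action that only changes runtime information inherits metadata and data.
next_ctx = ctx.from_existing(runtime_info={"is_streaming": True})

print(ctx.runtime_info)       # {'is_streaming': False}
print(next_ctx.runtime_info)  # {'is_streaming': True}
print(next_ctx.data is ctx.data)  # True (both None here)
```

Note that the merge uses `or`, so a falsy override such as an empty dict falls back to the existing value.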
+++ b/cloe_nessy/pipeline/pipeline_parsing_service.py
@@ -0,0 +1,156 @@
+import os
+import re
+from collections import OrderedDict
+from enum import Enum
+from pathlib import Path
+
+import yaml
+
+from ..logging import LoggerMixin
+from ..session import SessionManager
+from .actions import PipelineActionType, pipeline_actions
+from .pipeline import Pipeline
+from .pipeline_config import PipelineConfig
+from .pipeline_step import PipelineStep
+
+
+class PipelineParsingService:
+    """A service class that parses a YAML document or string into a Pipeline object."""
+
+    def __init__(self, custom_actions=None):
+        if custom_actions is not None:
+            for action in custom_actions:
+                self.register_pipeline_action(action)
+
+    @staticmethod
+    def register_pipeline_action(pipeline_action_class):
+        """Registers a custom pipeline action class.
+
+        !!! note
+            Registering an action enables the custom action to be used in the
+            pipeline YAML definition. This is called automatically when the
+            PipelineParsingService is instantiated with (a list of) custom
+            actions.
+        """
+        console_logger = LoggerMixin().get_console_logger()
+        console_logger.info("Registering custom pipeline action [' %s ']", pipeline_action_class.name)
+        pipeline_actions[pipeline_action_class.name] = pipeline_action_class
+
+        global PipelineActionType
+        PipelineActionType = Enum("PipelineActionType", pipeline_actions)
+
+    @staticmethod
+    def parse(path: Path | None = None, yaml_str: str | None = None) -> Pipeline:
+        """Reads the YAML from a given Path and returns a Pipeline object.
+
+        Args:
+            path: Path to the YAML document.
+            yaml_str: A string that can be parsed in YAML format.
+
+        Raises:
+            ValueError: If neither 'path' nor 'yaml_str' has been provided.
+
+        Returns:
+            Pipeline: The resulting Pipeline instance.
+        """
+        console_logger = LoggerMixin().get_console_logger()
+        if not path and not yaml_str:
+            raise ValueError("Neither 'path' nor 'yaml_str' was provided. Please supply one of them.")
+        if path:
+            path_obj = Path(path)
+            with open(path_obj) as f:
+                yaml_str = f.read()
+        if not yaml_str:
+            raise ValueError("YAML content is empty.")
+
+        final_yaml_str = PipelineParsingService._replace_variables(yaml_str)
+        config = yaml.safe_load(final_yaml_str)
+        pipeline_config = PipelineConfig.metadata_to_instance(config)
+        steps = PipelineParsingService._get_steps(pipeline_config.steps)
+        pipeline = Pipeline(name=pipeline_config.name, steps=steps)  # type: ignore
+        console_logger.info("Pipeline [ '%s' ] parsed successfully with %d steps.", pipeline.name, len(pipeline.steps))
+        return pipeline
+
+    @staticmethod
+    def _replace_variables(yaml_str: str) -> str:
+        """Replace variable placeholders in a YAML string.
+
+        Replaces environment variables with the pattern `{{env:var-name}}`,
+        where var-name is the name of the environment variable. Replaces secret
+        references with the pattern `{{scope-name:secret-key}}`, where
+        scope-name is the name of the secret scope and secret-key is the key of
+        the secret.
+
+        Args:
+            yaml_str: A string that can be parsed in YAML format.
+
+        Returns:
+            The same YAML string with environment variable and secret placeholders replaced.
+        """
+        env_var_pattern = r"\{\{env:([^}]+)\}\}"
+        secret_ref_pattern = r"\{\{(?!step|env)([^}]+):([^}]+)\}\}"
+
+        def replace_with_env_var(match):
+            env_var_name = match.group(1)
+            # Indexing raises a clear KeyError for unset variables instead of returning None.
+            return os.environ[env_var_name]
+
+        def replace_with_secret(match):
+            secret_scope_name = match.group(1)
+            secret_key = match.group(2)
+            return SessionManager.get_utils().secrets.get(scope=secret_scope_name, key=secret_key)
+
+        env_replaced_yaml_string = re.sub(env_var_pattern, replace_with_env_var, yaml_str)
+        final_yaml_string = re.sub(secret_ref_pattern, replace_with_secret, env_replaced_yaml_string)
+        return final_yaml_string
+
+    @staticmethod
+    def _get_steps(step_configs, last_step_name: str | None = None):
+        steps = OrderedDict()
+        for step_name, step_config in step_configs.items():
+            is_successor = step_config.is_successor
+            context_ref = step_config.context
+            if is_successor and not context_ref:
+                context_ref = last_step_name
+            action = PipelineActionType[step_config.action.name].value()
+            step = PipelineStep(
+                name=step_name,
+                action=action,
+                options=step_config.options,
+                _context_ref=context_ref,
+                _table_metadata_ref=step_config.table_metadata,
+            )
+            steps[step.name] = step
+            last_step_name = step_name
+        for step in steps.values():
+            steps[step.name] = PipelineParsingService._replace_step_refs(steps, step)
+        return steps
+
+    @staticmethod
+    def _replace_step_refs(steps: OrderedDict[str, PipelineStep], step: PipelineStep) -> PipelineStep:
+        step_ref_pattern = r"\(\(step:([^)]+)\)\)"
+
+        def _handle_string_value(value: str, option: str):
+            if match := re.match(step_ref_pattern, value):
+                dependency_step_name = match.group(1)
+                dependency_step = steps.get(dependency_step_name)
+                step.options[option] = dependency_step
+                step._predecessors.add(dependency_step_name)
+
+        def _handle_list_value(value: list, option: str):
+            for i, v in enumerate(value):
+                if isinstance(v, str):
+                    if match := re.match(step_ref_pattern, v):
+                        dependency_step_name = match.group(1)
+                        dependency_step = steps.get(dependency_step_name)
+                        step.options[option][i] = dependency_step
+                        step._predecessors.add(dependency_step_name)
+
+        if step.options:
+            for option, value in step.options.items():
+                if isinstance(value, str):
+                    _handle_string_value(value, option)
+                elif isinstance(value, list):
+                    _handle_list_value(value, option)
+
+        return step