cloe-nessy 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +5 -0
  3. cloe_nessy/clients/api_client/__init__.py +3 -0
  4. cloe_nessy/clients/api_client/api_client.py +188 -0
  5. cloe_nessy/clients/api_client/api_response.py +72 -0
  6. cloe_nessy/clients/api_client/auth.py +178 -0
  7. cloe_nessy/clients/api_client/exceptions.py +22 -0
  8. cloe_nessy/file_utilities/__init__.py +3 -0
  9. cloe_nessy/file_utilities/exceptions.py +4 -0
  10. cloe_nessy/file_utilities/factory.py +42 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +72 -0
  12. cloe_nessy/file_utilities/location_types.py +29 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +6 -0
  20. cloe_nessy/integration/reader/api_reader.py +141 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +49 -0
  22. cloe_nessy/integration/reader/excel_reader.py +170 -0
  23. cloe_nessy/integration/reader/exceptions.py +10 -0
  24. cloe_nessy/integration/reader/file_reader.py +96 -0
  25. cloe_nessy/integration/reader/reader.py +34 -0
  26. cloe_nessy/integration/writer/__init__.py +3 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +48 -0
  28. cloe_nessy/logging/__init__.py +3 -0
  29. cloe_nessy/logging/logger_mixin.py +162 -0
  30. cloe_nessy/models/__init__.py +13 -0
  31. cloe_nessy/models/column.py +65 -0
  32. cloe_nessy/models/constraint.py +9 -0
  33. cloe_nessy/models/foreign_key.py +34 -0
  34. cloe_nessy/models/mixins/__init__.py +0 -0
  35. cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
  36. cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
  37. cloe_nessy/models/schema.py +76 -0
  38. cloe_nessy/models/table.py +236 -0
  39. cloe_nessy/models/types.py +7 -0
  40. cloe_nessy/object_manager/__init__.py +3 -0
  41. cloe_nessy/object_manager/table_manager.py +58 -0
  42. cloe_nessy/pipeline/__init__.py +7 -0
  43. cloe_nessy/pipeline/actions/__init__.py +50 -0
  44. cloe_nessy/pipeline/actions/read_api.py +178 -0
  45. cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
  46. cloe_nessy/pipeline/actions/read_excel.py +177 -0
  47. cloe_nessy/pipeline/actions/read_files.py +105 -0
  48. cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
  49. cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
  50. cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
  51. cloe_nessy/pipeline/actions/transform_decode.py +102 -0
  52. cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
  53. cloe_nessy/pipeline/actions/transform_filter.py +51 -0
  54. cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
  55. cloe_nessy/pipeline/actions/transform_join.py +81 -0
  56. cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
  57. cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
  58. cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
  59. cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
  60. cloe_nessy/pipeline/actions/transform_union.py +71 -0
  61. cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
  62. cloe_nessy/pipeline/pipeline.py +201 -0
  63. cloe_nessy/pipeline/pipeline_action.py +62 -0
  64. cloe_nessy/pipeline/pipeline_config.py +92 -0
  65. cloe_nessy/pipeline/pipeline_context.py +56 -0
  66. cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
  67. cloe_nessy/pipeline/pipeline_step.py +50 -0
  68. cloe_nessy/py.typed +0 -0
  69. cloe_nessy/session/__init__.py +3 -0
  70. cloe_nessy/session/session_manager.py +188 -0
  71. cloe_nessy/settings/__init__.py +3 -0
  72. cloe_nessy/settings/settings.py +91 -0
  73. cloe_nessy/utils/__init__.py +0 -0
  74. cloe_nessy/utils/file_and_directory_handler.py +19 -0
  75. cloe_nessy-0.2.9.dist-info/METADATA +26 -0
  76. cloe_nessy-0.2.9.dist-info/RECORD +78 -0
  77. cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
  78. cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,73 @@
+ from typing import Any
+
+ from ...integration.writer import CatalogWriter
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class WriteCatalogTableAction(PipelineAction):
+     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].
+
+     Example:
+         ```yaml
+         Write Table to Catalog:
+             action: WRITE_CATALOG_TABLE
+             options:
+                 table_identifier: my_catalog.business_schema.sales_table
+                 mode: append
+                 partition_by: day
+                 options: <options for the writer>
+         ```
+     """
+
+     name: str = "WRITE_CATALOG_TABLE"
+
+     @staticmethod
+     def run(
+         context: PipelineContext,
+         *,
+         table_identifier: str | None = None,
+         mode: str = "append",
+         partition_by: str | list[str] | None = None,
+         options: dict[str, str] | None = None,
+         **_: Any,
+     ) -> PipelineContext:
+         """Writes a DataFrame to a specified catalog table.
+
+         Args:
+             context: Context in which this Action is executed.
+             table_identifier: The table identifier in the Unity Catalog, in the
+                 format 'catalog.schema.table'. If not provided, attempts to use
+                 the context's table metadata.
+             mode: The write mode. One of 'append', 'overwrite', 'error',
+                 'errorifexists', or 'ignore'.
+             partition_by: Names of the partitioning columns.
+             options: Additional options for the write operation.
+
+         Raises:
+             ValueError: If the table name is neither specified nor inferable
+                 from the context.
+
+         Returns:
+             Context after the execution of this Action.
+         """
+         if not options:
+             options = dict()
+         if partition_by is None:
+             if hasattr(context.table_metadata, "partition_by"):
+                 partition_by = context.table_metadata.partition_by  # type: ignore
+
+         if (table_metadata := context.table_metadata) and table_identifier is None:
+             table_identifier = table_metadata.identifier
+         if table_identifier is None:
+             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
+
+         writer = CatalogWriter()
+         writer.write_table(
+             df=context.data,  # type: ignore
+             table_identifier=table_identifier,
+             mode=mode,
+             partition_by=partition_by,
+             options=options,
+         )
+         return context.from_existing()
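
Besides the YAML usage shown in the docstring, `run` is a static method and can be invoked directly. A minimal sketch, assuming you already hold a pyspark DataFrame (`sales_df` is a placeholder, not part of the package):

```python
# Hypothetical direct invocation of WriteCatalogTableAction.
from cloe_nessy.pipeline.actions.write_catalog_table import WriteCatalogTableAction
from cloe_nessy.pipeline.pipeline_context import PipelineContext

context = PipelineContext(data=sales_df)  # sales_df: an existing pyspark DataFrame
result_context = WriteCatalogTableAction.run(
    context,
    table_identifier="my_catalog.business_schema.sales_table",
    mode="append",
    partition_by="day",
)
```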
@@ -0,0 +1,201 @@
+ import os
+ from collections import OrderedDict
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from threading import Lock
+
+ import matplotlib.pyplot as plt
+ import networkx as nx
+
+ from ..logging.logger_mixin import LoggerMixin
+ from .pipeline_step import PipelineStep
+
+
+ class Pipeline(LoggerMixin):
+     """A Pipeline represents the logical unit of one ETL process.
+
+     This class manages a directed acyclic graph (DAG) of steps, ensuring that
+     each step is executed in the correct order based on dependencies.
+
+     Attributes:
+         name: The name of the pipeline.
+         steps: An ordered dictionary of PipelineSteps that are part of the pipeline.
+     """
+
+     def __init__(self, name: str, steps: OrderedDict[str, "PipelineStep"] | None = None) -> None:
+         self.name: str = name
+         self.steps: OrderedDict[str, PipelineStep] = steps if steps is not None else OrderedDict()
+         self._console_logger = self.get_console_logger()
+         self._graph: nx.DiGraph = self._create_graph()
+         self._lock: Lock = Lock()
+
+     @property
+     def graph(self) -> nx.DiGraph:
+         """Get the pipeline graph."""
+         return self._graph
+
+     def _create_graph(self) -> nx.DiGraph:
+         """Creates a directed acyclic graph (DAG) representing the pipeline steps and their dependencies.
+
+         Each node in the graph represents a single step in the pipeline, and each edge represents a dependency.
+         """
+         g: nx.DiGraph = nx.DiGraph()
+         g.add_nodes_from({s.name for s in self.steps.values()})
+         g.add_edges_from({(p, s.name) for s in self.steps.values() for p in s._predecessors if p})
+
+         self._console_logger.debug(f"Graph created with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges.")
+         return g
+
+     def _run_step(self, step_name: str) -> None:
+         """Executes the run method of the corresponding step in the pipeline."""
+         step = self.steps[step_name]
+
+         # Handle context and metadata references
+         if step._context_ref:
+             step.context = self.steps[step._context_ref].result
+         if step._table_metadata_ref:
+             step.context.table_metadata = self.steps[step._table_metadata_ref].result.table_metadata
+
+         try:
+             self._console_logger.info(f"Starting execution of step: {step.name}")
+             step.run()
+         except Exception as err:
+             self._console_logger.error(f"Execution of step {step.name} failed with error: {str(err)}")
+             raise err
+         else:
+             self._console_logger.info(f"Execution of step {step.name} succeeded.")
+
+     def _get_ready_to_run_steps(self, remaining_steps: list[str], g: nx.DiGraph) -> set[str]:
+         """Identifies and returns the steps that are ready to run.
+
+         This method checks the directed acyclic graph (DAG) for steps that have no
+         remaining predecessors, indicating that they are ready to be executed. It
+         logs the remaining steps and the steps that are ready to run.
+
+         Args:
+             remaining_steps: A list of step IDs that are yet to be executed.
+             g: The directed acyclic graph representing the pipeline.
+
+         Returns:
+             A set of step IDs that are ready to be executed.
+         """
+         with self._lock:
+             ready_to_run = {step for step in remaining_steps if g.in_degree(step) == 0}
+             self._console_logger.debug(f"Remaining steps: {remaining_steps}")
+             self._console_logger.debug(f"Ready to run: {ready_to_run}")
+             return ready_to_run
+
+     def _submit_ready_steps(
+         self, ready_to_run: set[str], remaining_steps: list[str], executor: ThreadPoolExecutor, futures: dict
+     ):
+         """Submits the ready-to-run steps to the executor for execution.
+
+         This method takes the steps that are ready to run, removes them from the
+         list of remaining steps, and submits them to the executor for concurrent
+         execution. It also updates the futures dictionary to keep track of the
+         submitted tasks.
+
+         Args:
+             ready_to_run: A set of steps that are ready to be executed.
+             remaining_steps: A list of steps that are yet to be executed.
+             executor: The executor that manages the concurrent execution of steps.
+             futures: A dictionary mapping futures to their corresponding step ID.
+         """
+         with self._lock:
+             for step in ready_to_run:
+                 self._console_logger.debug(f"Submitting: {step}")
+                 remaining_steps.remove(step)
+                 future = executor.submit(self._run_step, step)
+                 futures[future] = step
+
+     def _handle_completed_tasks(self, futures, g, remaining_steps):
+         """Handles the completion of tasks in the pipeline.
+
+         This method processes the futures that have completed execution. It removes
+         the corresponding steps from the directed acyclic graph (DAG) and checks
+         whether new steps are ready to run. If new steps are ready, it returns True
+         to indicate that the pipeline can continue execution.
+
+         Args:
+             futures: A dictionary mapping futures to their corresponding steps.
+             g: The directed acyclic graph representing the pipeline.
+             remaining_steps: A list of steps that are yet to be executed.
+
+         Returns:
+             True if new steps are ready to run, False otherwise.
+         """
+         # Wait for tasks to complete and free up dependencies
+         for future in as_completed(futures):
+             future.result()  # re-raises if the step failed, which aborts the run and cancels remaining futures
+             step = futures[future]
+             del futures[future]
+             with self._lock:
+                 g.remove_node(step)  # Mark the step as completed by removing it from the graph.
+                 if len({step for step in remaining_steps if g.in_degree(step) == 0}) > 0:
+                     self._console_logger.debug("New steps ready to run")
+                     return True
+         self._console_logger.debug("No more steps to run")
+         return False
+
+     def run(self) -> None:
+         """Runs the pipeline by executing each step in the correct order."""
+         g = self._create_graph()
+         remaining_steps = list(g.nodes())
+         self._console_logger.info(f"Pipeline [' {self.name} '] started with {len(remaining_steps)} steps.")
+
+         with ThreadPoolExecutor(max_workers=int(os.environ.get("NESSY_MAX_WORKERS", 1))) as executor:
+             futures: dict = {}
+             try:
+                 self._console_logger.debug(f"Remaining steps: {remaining_steps}")
+                 while remaining_steps:
+                     ready_to_run = self._get_ready_to_run_steps(remaining_steps, g)
+                     if not ready_to_run:
+                         # If there are still steps to be executed, but all of them have
+                         # predecessors, there must be a cyclic dependency in the graph.
+                         self._console_logger.error(
+                             f"Cyclic dependency detected in the pipeline. Remaining steps: {remaining_steps}"
+                         )
+                         raise RuntimeError("Cyclic dependency detected in the pipeline!")
+
+                     self._submit_ready_steps(ready_to_run, remaining_steps, executor, futures)
+
+                     if self._handle_completed_tasks(futures, g, remaining_steps):
+                         continue
+             except RuntimeError as e:
+                 self._console_logger.error(f"Pipeline [' {self.name} '] failed due to cyclic dependency: {str(e)}")
+                 raise e
+             except Exception as e:
+                 self._console_logger.error(f"Pipeline [' {self.name} '] failed: {str(e)}")
+                 raise e
+             finally:
+                 # Ensure that any remaining futures are canceled (on success they have
+                 # finished anyway; on error, this cancels the still-running ones).
+                 for future in futures:
+                     future.cancel()
+                 self._graph = self._create_graph()  # recreate the graph after the run
+         self._console_logger.info(f"Pipeline [' {self.name} '] completed successfully.")
+
+     def plot_graph(self, save_path: str | None = None) -> None:
+         """Visualizes the graph of the pipeline using matplotlib.
+
+         Args:
+             save_path: If provided, the graph will be saved to this path. Otherwise, it will be shown.
+         """
+         pos = nx.spring_layout(self._graph)  # Position steps (nodes) using the spring layout
+         plt.figure(figsize=(12, 8))
+         nx.draw(
+             self._graph,
+             pos,
+             with_labels=True,
+             node_color="lightblue",
+             font_weight="bold",
+             node_size=3000,
+             font_size=10,
+             edge_color="gray",
+         )
+
+         # Draw edge labels if needed
+         edge_labels = nx.get_edge_attributes(self._graph, "label")
+         nx.draw_networkx_edge_labels(self._graph, pos, edge_labels=edge_labels)
+
+         if save_path:
+             plt.savefig(save_path)
+             self._console_logger.info(f"Graph visual saved to {save_path}")
+         else:
+             plt.show()
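
Concurrency is capped by the `NESSY_MAX_WORKERS` environment variable (read at `run()` time, defaulting to a single worker), and the `graph` property exposes the step DAG for inspection. A minimal sketch, assuming a `pipeline` instance built elsewhere (for example by the parsing service shown later in this diff):

```python
import os

import networkx as nx

os.environ["NESSY_MAX_WORKERS"] = "4"  # allow up to four independent steps to run in parallel

# Sanity-check the step graph before running: it must be a DAG, otherwise
# run() raises RuntimeError("Cyclic dependency detected in the pipeline!").
assert nx.is_directed_acyclic_graph(pipeline.graph)
print(list(nx.topological_sort(pipeline.graph)))  # one valid execution order

pipeline.plot_graph(save_path="pipeline_dag.png")  # save the DAG instead of showing it
pipeline.run()
```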
@@ -0,0 +1,62 @@
+ import logging
+ from abc import ABC, ABCMeta, abstractmethod
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ from ..logging import LoggerMixin
+ from .pipeline_context import PipelineContext
+
+
+ @dataclass
+ class PipelineActionLogs:
+     """Dataclass defining the pipeline action logs table."""
+
+     table_name: str = "nessy_action_logs"
+     log_type: str = "nessy_action_logs"
+     columns: dict[str, str] = field(
+         default_factory=lambda: {
+             "action_name": "STRING",
+             "message": "STRING",
+         }
+     )
+
+
+ class PipelineActionMeta(ABCMeta):
+     """Metaclass for PipelineAction to ensure that all subclasses have a 'name' attribute."""
+
+     def __init__(cls, name, bases, dct):
+         if cls.__name__ != "PipelineAction" and "name" not in dct:
+             raise TypeError(f"Class {name} is missing required 'name' attribute")
+         super().__init__(name, bases, dct)
+
+
+ class PipelineAction(ABC, LoggerMixin, metaclass=PipelineActionMeta):
+     """Models the operation being executed against an Input.
+
+     Attributes:
+         name: The name of the action.
+     """
+
+     name: str
+
+     def __init__(self, tabular_logger: logging.Logger | None = None) -> None:
+         """Initializes the PipelineAction object.
+
+         Args:
+             tabular_logger: The tabular logger to use for dependency injection.
+         """
+         self._console_logger = self.get_console_logger()
+         self._tabular_logger = tabular_logger or self.get_tabular_logger(
+             logger_name="Tabular:PipelineAction",
+             uc_table_name=PipelineActionLogs().table_name,
+             uc_table_columns=PipelineActionLogs().columns,
+             log_type=PipelineActionLogs().log_type,
+         )
+
+     def __str__(self) -> str:
+         return self.__class__.__name__
+
+     @abstractmethod
+     def run(self, context: PipelineContext, **kwargs: Any) -> PipelineContext:
+         """Execute the pipeline action."""
+         pass
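
The metaclass turns a missing `name` into an error at class-creation time, so every concrete action must declare it as a class attribute. A minimal sketch of a hypothetical custom action (the dedup logic is illustrative, not part of the package):

```python
from typing import Any

from cloe_nessy.pipeline.pipeline_action import PipelineAction
from cloe_nessy.pipeline.pipeline_context import PipelineContext


class DropDuplicatesAction(PipelineAction):
    """Hypothetical action that removes duplicate rows from the context's data."""

    name: str = "DROP_DUPLICATES"  # required by PipelineActionMeta

    def run(self, context: PipelineContext, **kwargs: Any) -> PipelineContext:
        deduplicated = context.data.dropDuplicates()  # pyspark DataFrame API
        return context.from_existing(data=deduplicated)


# Omitting `name` fails when the class is defined, not when it is instantiated:
# class BrokenAction(PipelineAction): ...  -> TypeError: missing required 'name' attribute
```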
@@ -0,0 +1,92 @@
+ import logging
+ from collections import OrderedDict
+ from typing import Any
+
+ from pydantic import BaseModel, Field, ValidationError, model_validator
+
+
+ class PipelineConfigBaseModel(BaseModel):
+     """The base model for Pipeline Config objects."""
+
+     @classmethod
+     def metadata_to_instance(cls, data: dict) -> Any:
+         """Parses a dictionary into an instance of this model.
+
+         Args:
+             data: The data to parse.
+
+         Returns:
+             The parsed instance. If validation fails, the errors are logged and
+             a ValueError is raised instead.
+         """
+         errors = []
+         try:
+             instance = cls(**data)
+         except ValidationError as e:
+             instance = None
+             errors.append(e)
+         if errors:
+             PipelineConfig.handle_validation_errors(errors)
+         return instance
+
+     @staticmethod
+     def handle_validation_errors(errors: list[ValidationError]) -> None:
+         """Cleanly logs Pydantic validation errors and raises a ValueError.
+
+         Args:
+             errors: A list of Pydantic validation errors.
+
+         Raises:
+             ValueError: If any validation errors occurred.
+         """
+         logger = logging.getLogger(__name__)
+         for error in errors:
+             if isinstance(error, ValidationError):
+                 logger.error(f"Validation errors for {error.title}:")
+                 for err in error.errors():
+                     loc = ".".join(map(str, err["loc"]))
+                     msg = err["msg"]
+                     err_type = err["type"]
+                     input_value = err.get("input", "N/A")
+                     logger.error(f"  Location: {loc}")
+                     logger.error(f"  Error Message: {msg}")
+                     logger.error(f"  Error Type: {err_type}")
+                     logger.error(f"  Input Value: {input_value}")
+                     logger.error(f"  Further information: {err.get('ctx', {}).get('url', 'N/A')}")
+                     logger.error("")
+             else:
+                 logger.error(error)
+         if errors:
+             raise ValueError(f"Validation errors occurred: {errors}")
+
+
+ class PipelineActionConfig(PipelineConfigBaseModel):
+     """This class stores the configuration for a pipeline action."""
+
+     name: str
+
+     @model_validator(mode="before")
+     def validate_action(cls, v):
+         """Wraps the raw action name into the mapping expected by the model.
+
+         Validation against a fixed action type was removed in favor of custom
+         validations in YAML pipeline definitions.
+         """
+         # if v not in PipelineActionType.__members__:  # noqa: ERA001
+         #     raise ValueError(f"Action '{v}' is not a valid action.")  # noqa: ERA001
+         action_config = {"name": v}
+         return action_config
+
+
+ class PipelineStepConfig(PipelineConfigBaseModel):
+     """This class stores the configuration for a pipeline step."""
+
+     action: PipelineActionConfig
+     is_successor: bool = True
+     context: str | None = None
+     table_metadata: str | None = None
+     options: dict = Field(default_factory=dict)
+
+
+ class PipelineConfig(PipelineConfigBaseModel):
+     """This class stores the configuration for a pipeline."""
+
+     name: str
+     steps: OrderedDict[str, PipelineStepConfig]
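
A minimal sketch of how these models fit together via the `metadata_to_instance` entry point; the step and option names are taken from the WRITE_CATALOG_TABLE example earlier in this diff:

```python
from cloe_nessy.pipeline.pipeline_config import PipelineConfig

config = PipelineConfig.metadata_to_instance(
    {
        "name": "sales_ingest",
        "steps": {
            "Write Table to Catalog": {
                "action": "WRITE_CATALOG_TABLE",  # the before-validator wraps this into {"name": ...}
                "options": {"table_identifier": "my_catalog.business_schema.sales_table"},
            },
        },
    }
)
step = config.steps["Write Table to Catalog"]
print(step.action.name, step.is_successor)  # WRITE_CATALOG_TABLE True
```

Invalid input is logged field by field by `handle_validation_errors` and re-raised as a single `ValueError`.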
@@ -0,0 +1,56 @@
+ from typing import Any
+
+ from pyspark.sql import DataFrame
+
+ from ..models import Table
+
+
+ class PipelineContext:
+     """A class that models the context of a pipeline.
+
+     The context consists of Table Metadata (the Table definition) and the actual data
+     as a DataFrame.
+
+     Attributes:
+         table_metadata: The Nessy-Table definition.
+         data: The data of the context.
+         runtime_info: Additional runtime information, e.g. streaming status.
+         status: The status of the context. Can be "initialized", "successful",
+             or "failed".
+
+     Note:
+         This is not a pydantic class, because Fabric does not support the type ConnectDataFrame.
+     """
+
+     def __init__(
+         self,
+         table_metadata: Table | None = None,
+         data: DataFrame | None = None,
+         runtime_info: dict[str, Any] | None = None,
+         status: str = "initialized",
+     ) -> None:
+         self.table_metadata = table_metadata
+         self.data = data
+         self.runtime_info = runtime_info if runtime_info is not None else {}
+         self.status = status
+
+     def from_existing(
+         self,
+         table_metadata: Table | None = None,
+         data: DataFrame | None = None,
+         runtime_info: dict[str, Any] | None = None,
+     ) -> "PipelineContext":
+         """Creates a new PipelineContext from an existing one.
+
+         Args:
+             table_metadata: The metadata of the new context.
+             data: The data of the new context.
+             runtime_info: The runtime_info of the new context.
+
+         Returns:
+             The new PipelineContext.
+         """
+         final_metadata = table_metadata or self.table_metadata
+         final_data = data or self.data
+         final_runtime_info = runtime_info or self.runtime_info or {}
+         return PipelineContext(table_metadata=final_metadata, data=final_data, runtime_info=final_runtime_info)
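
A minimal sketch of the intended usage pattern: actions return `context.from_existing(...)`, overriding only what changed and inheriting the rest (`my_table` and `raw_df` are placeholders for a Table definition and a pyspark DataFrame):

```python
from cloe_nessy.pipeline.pipeline_context import PipelineContext

ctx = PipelineContext(table_metadata=my_table, data=raw_df)

# Only the data changes; the table metadata carries over untouched.
next_ctx = ctx.from_existing(data=raw_df.filter("amount > 0"))
assert next_ctx.table_metadata is ctx.table_metadata
```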
@@ -0,0 +1,156 @@
+ import os
+ import re
+ from collections import OrderedDict
+ from enum import Enum
+ from pathlib import Path
+
+ import yaml
+
+ from ..logging import LoggerMixin
+ from ..session import SessionManager
+ from .actions import PipelineActionType, pipeline_actions
+ from .pipeline import Pipeline
+ from .pipeline_config import PipelineConfig
+ from .pipeline_step import PipelineStep
+
+
+ class PipelineParsingService:
+     """A service class that parses a YAML document or string into a Pipeline object."""
+
+     def __init__(self, custom_actions=None):
+         if custom_actions is not None:
+             for action in custom_actions:
+                 self.register_pipeline_action(action)
+
+     @staticmethod
+     def register_pipeline_action(pipeline_action_class):
+         """Registers a custom pipeline action class.
+
+         !!! note
+             Registering an action enables the custom action to be used in the
+             pipeline YAML definition. This is called automatically when the
+             PipelineParsingService is instantiated with a list of custom
+             actions.
+         """
+         console_logger = LoggerMixin().get_console_logger()
+         console_logger.info("Registering custom pipeline action [' %s ']", pipeline_action_class.name)
+         pipeline_actions[pipeline_action_class.name] = pipeline_action_class
+
+         global PipelineActionType
+         PipelineActionType = Enum("PipelineActionType", pipeline_actions)
+
+     @staticmethod
+     def parse(path: Path | None = None, yaml_str: str | None = None) -> Pipeline:
+         """Reads the YAML from a given Path and returns a Pipeline object.
+
+         Args:
+             path: Path to the YAML document.
+             yaml_str: A string that can be parsed in YAML format.
+
+         Raises:
+             ValueError: If neither 'path' nor 'yaml_str' has been provided.
+
+         Returns:
+             Pipeline: The resulting Pipeline instance.
+         """
+         console_logger = LoggerMixin().get_console_logger()
+         if not path and not yaml_str:
+             raise ValueError("Neither 'path' nor 'yaml_str' was provided. Please supply one of them.")
+         if path:
+             path_obj = Path(path)
+             with open(path_obj) as f:
+                 yaml_str = f.read()
+         if not yaml_str:
+             raise ValueError("YAML content is empty.")
+
+         final_yaml_str = PipelineParsingService._replace_variables(yaml_str)
+         config = yaml.safe_load(final_yaml_str)
+         pipeline_config = PipelineConfig.metadata_to_instance(config)
+         steps = PipelineParsingService._get_steps(pipeline_config.steps)
+         pipeline = Pipeline(name=pipeline_config.name, steps=steps)  # type: ignore
+         console_logger.info("Pipeline [ '%s' ] parsed successfully with %d steps.", pipeline.name, len(pipeline.steps))
+         return pipeline
+
+     @staticmethod
+     def _replace_variables(yaml_str: str) -> str:
+         """Replace variable placeholders in a YAML string.
+
+         Replaces environment variable references with the pattern
+         `{{env:var-name}}`, where `var-name` is the name of the environment
+         variable, and secret references with the pattern
+         `{{scope-name:secret-key}}`, where `scope-name` is the name of the
+         secret scope and `secret-key` is the key of the secret.
+
+         Args:
+             yaml_str: A string that can be parsed in YAML format.
+
+         Returns:
+             The same YAML string with environment variable and secret placeholders replaced.
+         """
+         env_var_pattern = r"\{\{env:([^}]+)\}\}"
+         secret_ref_pattern = r"\{\{(?!step|env)([^}]+):([^}]+)\}\}"
+
+         def replace_with_env_var(match):
+             env_var_name = match.group(1)
+             env_var_value = os.getenv(env_var_name)
+             return env_var_value
+
+         def replace_with_secret(match):
+             secret_scope_name = match.group(1)
+             secret_key = match.group(2)
+             return SessionManager.get_utils().secrets.get(scope=secret_scope_name, key=secret_key)
+
+         env_replaced_yaml_string = re.sub(env_var_pattern, replace_with_env_var, yaml_str)
+         final_yaml_string = re.sub(secret_ref_pattern, replace_with_secret, env_replaced_yaml_string)
+         return final_yaml_string
+
+     @staticmethod
+     def _get_steps(step_configs, last_step_name: str | None = None):
+         steps = OrderedDict()
+         for step_name, step_config in step_configs.items():
+             is_successor = step_config.is_successor
+             context_ref = step_config.context
+             if is_successor and not context_ref:
+                 context_ref = last_step_name
+             action = PipelineActionType[step_config.action.name].value()
+             step = PipelineStep(
+                 name=step_name,
+                 action=action,
+                 options=step_config.options,
+                 _context_ref=context_ref,
+                 _table_metadata_ref=step_config.table_metadata,
+             )
+             steps[step.name] = step
+             last_step_name = step_name
+         for step in steps.values():
+             steps[step.name] = PipelineParsingService._replace_step_refs(steps, step)
+         return steps
+
+     @staticmethod
+     def _replace_step_refs(steps: OrderedDict[str, PipelineStep], step: PipelineStep) -> PipelineStep:
+         step_ref_pattern = r"\(\(step:([^)]+)\)\)"
+
+         def _handle_string_value(value: str, option: str):
+             if match := re.match(step_ref_pattern, value):
+                 dependency_step_name = match.group(1)
+                 dependency_step = steps.get(dependency_step_name)
+                 step.options[option] = dependency_step
+                 step._predecessors.add(dependency_step_name)
+
+         def _handle_list_value(value: list, option: str):
+             for i, v in enumerate(value):
+                 if isinstance(v, str):
+                     if match := re.match(step_ref_pattern, v):
+                         dependency_step_name = match.group(1)
+                         dependency_step = steps.get(dependency_step_name)
+                         step.options[option][i] = dependency_step
+                         step._predecessors.add(dependency_step_name)
+
+         if step.options:
+             for option, value in step.options.items():
+                 if isinstance(value, str):
+                     _handle_string_value(value, option)
+                 elif isinstance(value, list):
+                     _handle_list_value(value, option)
+
+         return step
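
Putting the pieces together, a minimal end-to-end sketch. The action names follow the naming convention visible in this diff, but the READ_FILES option keys are assumptions for illustration; the `{{env:...}}` placeholder is resolved by `_replace_variables` before the YAML is parsed:

```python
import os

from cloe_nessy.pipeline.pipeline_parsing_service import PipelineParsingService

os.environ["TARGET_TABLE"] = "my_catalog.business_schema.sales_table"

pipeline_yaml = """
name: sales_ingest
steps:
  Read Sales Files:
    action: READ_FILES
    options:
      location: /mnt/landing/sales   # hypothetical option key
  Write Table to Catalog:
    action: WRITE_CATALOG_TABLE
    options:
      table_identifier: "{{env:TARGET_TABLE}}"
      mode: append
"""

pipeline = PipelineParsingService.parse(yaml_str=pipeline_yaml)
pipeline.run()
```

Because `is_successor` defaults to true and no explicit `context` is given, the write step automatically receives the read step's resulting context.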