cloe-nessy 0.2.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +5 -0
  3. cloe_nessy/clients/api_client/__init__.py +3 -0
  4. cloe_nessy/clients/api_client/api_client.py +188 -0
  5. cloe_nessy/clients/api_client/api_response.py +72 -0
  6. cloe_nessy/clients/api_client/auth.py +178 -0
  7. cloe_nessy/clients/api_client/exceptions.py +22 -0
  8. cloe_nessy/file_utilities/__init__.py +3 -0
  9. cloe_nessy/file_utilities/exceptions.py +4 -0
  10. cloe_nessy/file_utilities/factory.py +42 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +72 -0
  12. cloe_nessy/file_utilities/location_types.py +29 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +6 -0
  20. cloe_nessy/integration/reader/api_reader.py +141 -0
  21. cloe_nessy/integration/reader/catalog_reader.py +49 -0
  22. cloe_nessy/integration/reader/excel_reader.py +170 -0
  23. cloe_nessy/integration/reader/exceptions.py +10 -0
  24. cloe_nessy/integration/reader/file_reader.py +96 -0
  25. cloe_nessy/integration/reader/reader.py +34 -0
  26. cloe_nessy/integration/writer/__init__.py +3 -0
  27. cloe_nessy/integration/writer/catalog_writer.py +48 -0
  28. cloe_nessy/logging/__init__.py +3 -0
  29. cloe_nessy/logging/logger_mixin.py +162 -0
  30. cloe_nessy/models/__init__.py +13 -0
  31. cloe_nessy/models/column.py +65 -0
  32. cloe_nessy/models/constraint.py +9 -0
  33. cloe_nessy/models/foreign_key.py +34 -0
  34. cloe_nessy/models/mixins/__init__.py +0 -0
  35. cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
  36. cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
  37. cloe_nessy/models/schema.py +76 -0
  38. cloe_nessy/models/table.py +236 -0
  39. cloe_nessy/models/types.py +7 -0
  40. cloe_nessy/object_manager/__init__.py +3 -0
  41. cloe_nessy/object_manager/table_manager.py +58 -0
  42. cloe_nessy/pipeline/__init__.py +7 -0
  43. cloe_nessy/pipeline/actions/__init__.py +50 -0
  44. cloe_nessy/pipeline/actions/read_api.py +178 -0
  45. cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
  46. cloe_nessy/pipeline/actions/read_excel.py +177 -0
  47. cloe_nessy/pipeline/actions/read_files.py +105 -0
  48. cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
  49. cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
  50. cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
  51. cloe_nessy/pipeline/actions/transform_decode.py +102 -0
  52. cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
  53. cloe_nessy/pipeline/actions/transform_filter.py +51 -0
  54. cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
  55. cloe_nessy/pipeline/actions/transform_join.py +81 -0
  56. cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
  57. cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
  58. cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
  59. cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
  60. cloe_nessy/pipeline/actions/transform_union.py +71 -0
  61. cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
  62. cloe_nessy/pipeline/pipeline.py +201 -0
  63. cloe_nessy/pipeline/pipeline_action.py +62 -0
  64. cloe_nessy/pipeline/pipeline_config.py +92 -0
  65. cloe_nessy/pipeline/pipeline_context.py +56 -0
  66. cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
  67. cloe_nessy/pipeline/pipeline_step.py +50 -0
  68. cloe_nessy/py.typed +0 -0
  69. cloe_nessy/session/__init__.py +3 -0
  70. cloe_nessy/session/session_manager.py +188 -0
  71. cloe_nessy/settings/__init__.py +3 -0
  72. cloe_nessy/settings/settings.py +91 -0
  73. cloe_nessy/utils/__init__.py +0 -0
  74. cloe_nessy/utils/file_and_directory_handler.py +19 -0
  75. cloe_nessy-0.2.9.dist-info/METADATA +26 -0
  76. cloe_nessy-0.2.9.dist-info/RECORD +78 -0
  77. cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
  78. cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
cloe_nessy/integration/reader/file_reader.py
@@ -0,0 +1,96 @@
+ from typing import Any
+
+ import pyspark.sql.functions as F
+ from pyspark.sql import DataFrame
+
+ from ...file_utilities import get_file_paths
+ from .reader import BaseReader
+
+
+ class FileReader(BaseReader):
+     """Utility class for reading a file into a DataFrame.
+
+     This class reads data from files and loads it into a Spark DataFrame.
+     """
+
+     def __init__(self):
+         """Initializes the FileReader object."""
+         super().__init__()
+
+     def read(
+         self,
+         location: str,
+         spark_format: str | None = None,
+         extension: str | None = None,
+         schema: str | None = None,
+         search_subdirs: bool = True,
+         options: dict | None = None,
+         add_metadata_column: bool = False,
+         **kwargs: Any,
+     ) -> DataFrame:
+         """Reads files from a specified location and returns a DataFrame.
+
+         Arguments:
+             location: Location of files to read.
+             spark_format: Format of files to read. If not provided, it will be inferred from the extension.
+             extension: File extension (csv, json, parquet, txt). Used if spark_format is not provided.
+             schema: Schema of the file. If None, schema will be inferred.
+             search_subdirs: Whether to include files in subdirectories.
+             options: Spark DataFrame reader options.
+             add_metadata_column: Whether to include __metadata column in the DataFrame.
+             kwargs: This method does not accept any additional keyword arguments.
+         """
+         if options is None:
+             options = {}
+
+         if not spark_format and not extension:
+             raise ValueError("Either spark_format or extension must be provided.")
+         self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
+         extension_to_datatype_dict = {"csv": "csv", "json": "json", "parquet": "parquet", "txt": "text", "xml": "xml"}
+
+         if extension and not spark_format:
+             if extension not in extension_to_datatype_dict:
+                 raise ValueError(f"Unsupported file extension: {extension}")
+             spark_format = extension_to_datatype_dict[extension]
+         self._console_logger.debug(f"Reading files with format: {spark_format}")
+         if extension:
+             file_paths = get_file_paths(location, extension, search_subdirs)
+         else:
+             file_paths = [location]
+         self._console_logger.debug(f"Found {len(file_paths)} files to read")
+         self._console_logger.debug(f"File paths: {file_paths}")
+         assert spark_format is not None
+
+         reader = self._spark.read.format(spark_format)
+         if schema:
+             reader.schema(schema)
+         else:
+             options["inferSchema"] = True
+
+         self._console_logger.debug(f"Setting options: {options}")
+         reader.options(**options)
+
+         try:
+             self._console_logger.debug("Loading files into DataFrame")
+             df = reader.load(file_paths)
+             self._console_logger.debug("Successfully loaded files into DataFrame")
+             if add_metadata_column:
+                 df = self._add_metadata_column(df)
+         except Exception as e:
+             self._console_logger.error(f"Failed to read files from [ '{location}' ]: {e}")
+             raise
+         else:
+             self._console_logger.info(f"Successfully read files from [ '{location}' ]")
+             return df
+
+     def _add_metadata_column(self, df: DataFrame) -> DataFrame:
+         """Add all metadata columns to the DataFrame."""
+         # Extract metadata fields into separate columns
+         metadata_columns = df.select("_metadata.*").columns
+
+         entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
+         flat_list = [item for tup in entries for item in tup]
+
+         df = df.withColumn("__metadata", F.create_map(flat_list))
+
+         return df
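A minimal usage sketch for the FileReader above, assuming a Spark-backed runtime (for example Databricks) in which SessionManager can provide a session; the location and reader options are illustrative placeholders:

from cloe_nessy.integration.reader.file_reader import FileReader

reader = FileReader()
# Read every CSV file below the (hypothetical) landing folder, including
# subdirectories, and attach the __metadata map column.
df = reader.read(
    location="/Volumes/landing/sales/",
    extension="csv",
    options={"header": "true", "delimiter": ";"},
    add_metadata_column=True,
)
df.show()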
cloe_nessy/integration/reader/reader.py
@@ -0,0 +1,34 @@
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ from pyspark.sql import DataFrame, SparkSession
+
+ from ...logging.logger_mixin import LoggerMixin
+ from ...session import SessionManager
+
+
+ class BaseReader(ABC, LoggerMixin):
+     """Abstract base class for reading data into a Spark DataFrame.
+
+     This class provides a common interface for different types of data readers.
+
+     Attributes:
+         _spark: The Spark session used for creating DataFrames.
+     """
+
+     def __init__(self) -> None:
+         self._spark: SparkSession = SessionManager.get_spark_session()
+         self._console_logger = self.get_console_logger()
+
+     @abstractmethod
+     def read(self, *args: Any, **kwargs: Any) -> DataFrame:
+         """Abstract method to return a batch data frame.
+
+         Args:
+             *args: Arbitrary non-keyword arguments for reading data.
+             **kwargs: Arbitrary keyword arguments for reading data.
+
+         Returns:
+             DataFrame: The Spark DataFrame containing the read data.
+         """
+         pass
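To illustrate the contract defined by BaseReader, here is a hypothetical subclass sketch; SingleTableReader is not part of the package and only shows that a concrete reader implements read() while inheriting _spark and _console_logger from the base class:

from typing import Any

from pyspark.sql import DataFrame

from cloe_nessy.integration.reader.reader import BaseReader


class SingleTableReader(BaseReader):
    """Hypothetical reader that loads one registered table by name."""

    def read(self, table_name: str, *args: Any, **kwargs: Any) -> DataFrame:
        self._console_logger.debug(f"Reading table [ '{table_name}' ]")
        return self._spark.read.table(table_name)


# df = SingleTableReader().read("dev_catalog.sales.orders")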
cloe_nessy/integration/writer/__init__.py
@@ -0,0 +1,3 @@
+ from .catalog_writer import CatalogWriter
+
+ __all__ = ["CatalogWriter"]
cloe_nessy/integration/writer/catalog_writer.py
@@ -0,0 +1,48 @@
+ from pyspark.sql import DataFrame
+
+
+ class CatalogWriter:
+     """A writer for Catalog tables."""
+
+     @staticmethod
+     def write_table(
+         df: DataFrame | None,
+         table_identifier: str | None,
+         partition_by: str | list[str] | None = None,
+         options: dict[str, str] | None = None,
+         mode: str = "append",
+     ) -> None:
+         """Write a table to the unity catalog.
+
+         Args:
+             df: The DataFrame to write.
+             table_identifier: The table identifier in the unity catalog in the
+                 format 'catalog.schema.table'.
+             mode: The write mode. One of append, overwrite, error, errorifexists, ignore.
+             partition_by: Names of the partitioning columns.
+             options: All other string options.
+
+         Notes:
+             append: Append contents of this DataFrame to existing data.
+             overwrite: Overwrite existing data.
+             error or errorifexists: Throw an exception if data already exists.
+             ignore: Silently ignore this operation if data already exists.
+
+         Raises:
+             ValueError: If the mode is not one of append, overwrite, error, errorifexists, ignore.
+             ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
+             ValueError: If the DataFrame is None.
+         """
+         if mode not in ("append", "overwrite", "error", "errorifexists", "ignore"):
+             raise ValueError("mode must be one of append, overwrite, error, errorifexists, ignore")
+         if not table_identifier:
+             raise ValueError("table_identifier is required")
+         elif not isinstance(table_identifier, str):
+             raise ValueError("table_identifier must be a string")
+         elif len(table_identifier.split(".")) != 3:
+             raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
+         if not df:
+             raise ValueError("df is required, but was None.")
+         if options is None:
+             options = {}
+         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
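A short sketch of how CatalogWriter.write_table might be called; the catalog, schema, table, and column names are placeholders, and the call assumes a Unity-Catalog-enabled Spark session:

from pyspark.sql import SparkSession

from cloe_nessy.integration.writer import CatalogWriter

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "2024-01-01")], ["order_id", "load_date"])

# Append to a three-part table identifier, partitioned by load_date.
CatalogWriter.write_table(
    df=df,
    table_identifier="dev_catalog.sales.orders",
    partition_by="load_date",
    options={"mergeSchema": "true"},
    mode="append",
)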
cloe_nessy/logging/__init__.py
@@ -0,0 +1,3 @@
+ from .logger_mixin import LoggerMixin
+
+ __all__ = ["LoggerMixin"]
cloe_nessy/logging/logger_mixin.py
@@ -0,0 +1,162 @@
+ import logging
+ import logging.handlers
+ from typing import cast
+
+ from cloe_logging import LoggerFactory
+
+ from ..settings import LoggingSettings, NessySettings
+
+ factory = LoggerFactory()
+
+ DEFAULT_COLUMN_SPLIT_CHAR = "|"
+ DEFAULT_KEY_VALUE_SPLIT_CHAR = ":"
+
+
+ class LoggerMixin:
+     """LoggingMixin class to add logging functionality to classes."""
+
+     def get_console_logger(
+         self,
+         level: int | None = None,
+         log_format: str | None = None,
+     ) -> logging.Logger:
+         """Adds a console logger to the class.
+
+         Args:
+             level: The logging level for the console logger.
+             log_format: The format for the console logger.
+
+         Returns:
+             The logger with the console handler.
+         """
+         logging_settings: LoggingSettings = NessySettings().logging
+         logger = LoggerFactory.get_logger(
+             handler_types=["console"],
+             logger_name=f"Console:{self.__class__.__name__}",
+             logging_level=level if level is not None else logging_settings.log_level_console,
+             log_format=log_format if log_format is not None else logging_settings.log_format_console,
+         )
+         return cast(logging.Logger, logger)
+
+     def get_tabular_logger(
+         self,
+         logger_name: str | None = None,
+         handlers: list[str] | None = None,
+         level: int | None = None,
+         add_log_analytics_logger: bool | None = None,
+         add_unity_catalog_logger: bool | None = None,
+         # LAW
+         log_type: str | None = None,
+         workspace_id: str | None = None,
+         shared_key: str | None = None,
+         # UC
+         uc_workspace_url: str | None = None,
+         uc_warehouse_id: str | None = None,
+         uc_catalog_name: str | None = None,
+         uc_schema_name: str | None = None,
+         uc_table_name: str | None = None,
+         uc_table_columns: dict[str, str] | None = None,
+         column_split_char: str = DEFAULT_COLUMN_SPLIT_CHAR,
+         key_value_split_char: str = DEFAULT_KEY_VALUE_SPLIT_CHAR,
+     ) -> logging.Logger:
+         """Adds a tabular logger to the class.
+
+         Args:
+             logger_name: The name of the logger.
+             handlers: The list of handlers to add.
+             level: The logging level for the tabular logger. If not provided, the value from the settings will be used.
+             add_log_analytics_logger: Whether to add a LogAnalyticsHandler to the logger. If not provided, the value from the settings will be used.
+             add_unity_catalog_logger: Whether to add a UnityCatalogHandler to the logger. If not provided, the value from the settings will be used.
+             log_type: The log type for the Log Analytics workspace.
+             workspace_id: The workspace id for the Log Analytics workspace. If not provided, the value from the settings will be used.
+             shared_key: The shared key for the Log Analytics workspace.
+             uc_workspace_url: The workspace url for the Unity Catalog. If not provided, the value from the settings will be used.
+             uc_warehouse_id: The warehouse id for the Unity Catalog. If not provided, the value from the settings will be used.
+             uc_catalog_name: The catalog name for the Unity Catalog. If not provided, the value from the settings will be used.
+             uc_schema_name: The schema name for the Unity Catalog. If not provided, the value from the settings will be used.
+             uc_table_name: The table name for the Unity Catalog.
+             uc_table_columns: The columns for the Unity Catalog Table.
+             column_split_char: The column split character for the Log Analytics workspace and Unity Catalog. Defaults to "|".
+             key_value_split_char: The key value split character for the Log Analytics workspace and Unity Catalog. Defaults to ":".
+
+         Returns:
+             The logger with the added tabular handlers.
+         """
+         if handlers is None:
+             handlers = []
+         logging_settings = NessySettings().logging
+
+         if self.should_add_log_analytics_handler(logging_settings, add_log_analytics_logger):
+             handlers.append("log_analytics")
+
+         if self.should_add_unity_catalog_handler(logging_settings, add_unity_catalog_logger):
+             handlers.append("unity_catalog")
+
+         logger = LoggerFactory.get_logger(
+             handler_types=handlers,
+             logger_name=logger_name or f"Tabular:{self.__class__.__name__}",
+             level=level,
+             column_split_char=column_split_char,
+             key_value_split_char=key_value_split_char,
+             # UC Settings
+             uc_table_name=uc_table_name,
+             uc_catalog_name=uc_catalog_name or logging_settings.uc_catalog_name,
+             uc_schema_name=uc_schema_name or logging_settings.uc_schema_name,
+             uc_table_columns=uc_table_columns,
+             workspace_url=uc_workspace_url or logging_settings.uc_workspace_url,
+             warehouse_id=uc_warehouse_id or logging_settings.uc_warehouse_id,
+             # LAW Settings
+             workspace_id=workspace_id or logging_settings.log_analytics_workspace_id,
+             shared_key=shared_key or logging_settings.log_analytics_shared_key,
+             log_type=log_type,
+             test_connectivity=False,
+         )
+         return cast(logging.Logger, logger)
+
+     @staticmethod
+     def should_add_log_analytics_handler(
+         logging_settings: LoggingSettings,
+         add_log_analytics_logger: bool | None,
+         **kwargs,  # noqa: ARG004
+     ) -> bool:
+         """Determines if a LogAnalyticsHandler should be added to the logger.
+
+         The Logger will be added if the `target_log_analytics` setting is set to True or if the `add_log_analytics_logger`
+         argument is set to True.
+
+         Setting `target_log_analytics` to False will prevent the handler from being added.
+
+         Args:
+             logging_settings: The logging settings to use for the logger.
+             add_log_analytics_logger: Whether to add a LogAnalyticsHandler to the logger.
+             **kwargs: Additional keyword arguments. Not used.
+
+         Returns:
+             bool: True if the LogAnalyticsHandler should be added, False otherwise.
+         """
+         disable_overwrite = logging_settings.target_log_analytics is False
+         enable_logger = logging_settings.target_log_analytics or add_log_analytics_logger
+         return cast(bool, enable_logger and not disable_overwrite)
+
+     @staticmethod
+     def should_add_unity_catalog_handler(
+         logging_settings: LoggingSettings,
+         add_unity_catalog_logger: bool | None,
+     ) -> bool:
+         """Determines if a UnityCatalogHandler should be added to the logger.
+
+         The Logger will be added if the `target_unity_catalog_table` setting is set to True or if the `add_unity_catalog_logger`
+         argument is set to True.
+
+         Setting `target_unity_catalog_table` to False will prevent the handler from being added.
+
+         Args:
+             logging_settings: The logging settings to use for the logger.
+             add_unity_catalog_logger: Whether to add a UnityCatalogHandler to the logger.
+
+         Returns:
+             bool: True if the UnityCatalogHandler should be added, False otherwise.
+         """
+         disable_overwrite = logging_settings.target_unity_catalog_table is False
+         enable_logger = logging_settings.target_unity_catalog_table or add_unity_catalog_logger
+         return cast(bool, enable_logger and not disable_overwrite)
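A sketch of how a class might pick up the console logger from LoggerMixin; it assumes NessySettings can be constructed in the current environment (its logging defaults or the corresponding configuration values are available), and IngestJob is a hypothetical example class:

from cloe_nessy.logging import LoggerMixin


class IngestJob(LoggerMixin):
    """Hypothetical class that reuses the mixin's console logger."""

    def __init__(self) -> None:
        self._console_logger = self.get_console_logger()

    def run(self) -> None:
        self._console_logger.info("Starting ingest ...")


IngestJob().run()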
cloe_nessy/models/__init__.py
@@ -0,0 +1,13 @@
+ from .column import Column
+ from .constraint import Constraint
+ from .foreign_key import ForeignKey
+ from .schema import Schema
+ from .table import Table
+
+ __all__ = [
+     "Column",
+     "Constraint",
+     "Table",
+     "Schema",
+     "ForeignKey",
+ ]
cloe_nessy/models/column.py
@@ -0,0 +1,65 @@
+ import re
+ from typing import Any
+
+ from pydantic import BaseModel, Field, field_validator, model_validator
+
+ COLUMN_DATA_TYPE_LIST = {
+     "string",
+     "integer",
+     "int",
+     "smallint",
+     "float",
+     "boolean",
+     "bool",
+     "bigint",
+     "long",
+     "double",
+     "date",
+     "timestamp",
+     "array",
+     "map",
+     "variant",
+     "struct",
+ }
+
+
+ class Column(BaseModel):
+     """Represents a Column of a Table."""
+
+     name: str
+     data_type: str
+     nullable: bool
+     default_value: Any = None
+     generated: str | None = None
+     properties: dict[str, Any] = Field(default_factory=dict)
+     comment: str | None = None
+
+     @field_validator("data_type", mode="before")
+     def data_type_transform(cls, raw: str) -> str:
+         """Map potential aliases to the correct SQL data type.
+
+         Args:
+             raw: The value for the data type.
+         """
+         val = raw.lower()
+         base_data_types = re.findall(r"\b[a-z]+\b", val)
+         forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)
+
+         if forbidden_characters:
+             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
+         for base_data_type in base_data_types:
+             if base_data_type not in COLUMN_DATA_TYPE_LIST:
+                 raise ValueError(f"Unknown data type used in data type definition [ '{val}' ]")
+         return val
+
+     @model_validator(mode="before")
+     def _validate_generated_and_default_value(cls, v: Any) -> Any:
+         """Check if a column has a default value and is generated.
+
+         That doesn't make sense, so an error should be raised.
+         """
+         if v.get("default_value") and v.get("generated"):
+             raise ValueError("A column can't have a default value and be generated.")
+         if (v.get("default_value") or v.get("generated")) and v.get("nullable") is True:
+             raise ValueError("A column can't have a default value or be generated and be nullable.")
+         return v
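A small sketch of the Column validators at work; the column names are placeholders, and the second definition is expected to be rejected because its base type is not in COLUMN_DATA_TYPE_LIST and it contains characters outside the allowed set:

from pydantic import ValidationError

from cloe_nessy.models import Column

# Aliases are lower-cased and checked against the allowed base types.
order_id = Column(name="order_id", data_type="BIGINT", nullable=False)
print(order_id.data_type)  # bigint

# "decimal(10,2)" is not an allowed base type, so validation fails.
try:
    Column(name="price", data_type="decimal(10,2)", nullable=True)
except ValidationError as err:
    print(err)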
cloe_nessy/models/constraint.py
@@ -0,0 +1,9 @@
+ from pydantic import BaseModel
+
+
+ class Constraint(BaseModel):
+     """Represents a Constraint on a Table."""
+
+     name: str
+     expression: str
+     description: str | None = None
cloe_nessy/models/foreign_key.py
@@ -0,0 +1,34 @@
+ import os
+
+ from pydantic import BaseModel, field_validator
+
+
+ def _process_column_input(v):
+     if isinstance(v, str):
+         v = [col.strip() for col in v.split(",")]
+     return v
+
+
+ class ForeignKey(BaseModel):
+     """Represents a ForeignKey."""
+
+     foreign_key_columns: list[str]
+     parent_table: str
+     parent_columns: list[str]
+     foreign_key_option: list[str] | None = None
+
+     @field_validator("foreign_key_columns", mode="before")
+     def _validate_foreign_key_columns(cls, v):
+         return _process_column_input(v)
+
+     @field_validator("parent_columns", mode="before")
+     def _validate_parent_columns(cls, v):
+         return _process_column_input(v)
+
+     @field_validator("parent_table", mode="before")
+     def _validate_identifier(cls, v):
+         if len(v.split(".")) != 3:
+             raise ValueError("The 'parent_table' must be in the format 'catalog.schema.table'")
+         if "<env>" in v:
+             v = v.replace("<env>", os.environ["PROJECT_ENVIRONMENT"])
+         return v
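A sketch showing how the ForeignKey validators coerce comma-separated strings into lists and resolve the <env> placeholder from PROJECT_ENVIRONMENT; all identifiers here are placeholders:

import os

from cloe_nessy.models import ForeignKey

os.environ["PROJECT_ENVIRONMENT"] = "dev"

fk = ForeignKey(
    foreign_key_columns="customer_id",
    parent_table="<env>_catalog.sales.customers",
    parent_columns="customer_id",
)
print(fk.foreign_key_columns)  # ['customer_id']
print(fk.parent_table)         # dev_catalog.sales.customers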
cloe_nessy/models/mixins/__init__.py: File without changes
cloe_nessy/models/mixins/read_instance_mixin.py
@@ -0,0 +1,124 @@
+ import os
+ import pathlib
+ import re
+ from typing import Any, Self
+
+ import yaml
+ import yaml.parser
+ import yaml.scanner
+ from pydantic import BaseModel, ValidationError
+
+ from ...session import SessionManager
+ from ..types import ValidationErrorType
+
+
+ class ReadInstancesMixin(BaseModel):
+     """This class defines the methods to read, validate and parse metadata definitions."""
+
+     @classmethod
+     def metadata_to_instance(cls, data: dict) -> tuple[Self | None, list[ValidationError]]:
+         """Parses a Dictionary to an instance.
+
+         Returns:
+             An instance and potentially a list of errors.
+         """
+         errors = []
+         try:
+             instance = cls(**data)
+         except ValidationError as e:
+             instance = None
+             errors.append(e)
+         return instance, errors
+
+     @classmethod
+     def read_instance_from_file(
+         cls,
+         instance_path: pathlib.Path,
+         **_: Any,  # allow subclasses to pass additional arguments
+     ) -> tuple[Self | None, list[ValidationErrorType]]:
+         """Read and instantiate a single YAML file for the given path.
+
+         Arguments:
+             instance_path: The path to the file to instantiate.
+
+         Return:
+             Returns a tuple of the instantiated model and errors.
+         """
+         errors: list[ValidationErrorType] = []
+         try:
+             with instance_path.open("r") as file:
+                 raw_string = file.read()
+                 yaml_str = cls._replace_variables(raw_string)
+                 data = yaml.safe_load(yaml_str)
+                 instance, sub_errors = cls.metadata_to_instance(data)
+                 errors += sub_errors
+         except (ValidationError, yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
+             instance = None
+             errors.append(e)
+         return instance, errors
+
+     @classmethod
+     def read_instances_from_directory(
+         cls,
+         instance_path: pathlib.Path,
+         fail_on_missing_subfolder: bool = True,
+         **_: Any,  # allow subclasses to pass additional arguments
+     ) -> tuple[list[Self], list[ValidationErrorType]]:
+         """Read and instantiate all *.yaml files for the given path.
+
+         Arguments:
+             instance_path: Path to the directory containing the instance definitions as YAML files.
+             fail_on_missing_subfolder: If False return a tuple with 2 empty
+                 lists. Otherwise raise a FileNotFoundError.
+
+         Return:
+             Returns a tuple of the instantiated models and errors.
+         """
+         instances: list[Self] = []
+         errors: list[ValidationErrorType] = []
+
+         if not instance_path.exists() or not instance_path.is_dir():
+             if fail_on_missing_subfolder:
+                 raise FileNotFoundError(f"Directory not found: {instance_path}")
+             else:
+                 return instances, errors
+
+         for instance_file in instance_path.iterdir():
+             sub_errors: list[ValidationErrorType] = []
+             if instance_file.is_file() and instance_file.suffix in (".yaml", ".yml"):
+                 instance, sub_errors = cls.read_instance_from_file(instance_file)
+                 instances += [] if instance is None else [instance]
+             errors += sub_errors
+
+         return instances, errors
+
+     @staticmethod
+     def _replace_variables(yaml_str: str) -> str:
+         """Replace variable placeholders in a YAML string.
+
+         Replaces environment variables with the pattern `{{env:var-name}}`. Where
+         the var-name is the name of the environment variable.
+
+         Args:
+             yaml_str (str): A string that can be parsed in YAML format.
+
+         Returns:
+             The same YAML string with environment variable placeholders
+             replaced.
+         """
+         env_var_pattern = r"\{\{env:([^}]+)\}\}"
+         secret_ref_pattern = r"\{\{(?!step|env)([^}]+):([^}]+)\}\}"
+
+         def replace_with_env_var(match):
+             env_var_name = match.group(1)
+             env_var_value = os.getenv(env_var_name)
+             return env_var_value
+
+         def replace_with_secret(match):
+             secret_scope_name = match.group(1)
+             secret_key = match.group(2)
+             return SessionManager.get_utils().secrets.get(scope=secret_scope_name, key=secret_key)
+
+         env_replaced_yaml_string = re.sub(env_var_pattern, replace_with_env_var, yaml_str)
+         final_yaml_string = re.sub(secret_ref_pattern, replace_with_secret, env_replaced_yaml_string)
+         return final_yaml_string
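A sketch of reading a metadata definition with ReadInstancesMixin; DatasetConfig and the YAML file are hypothetical, and the {{env:...}} placeholder is resolved before parsing (no secret-scope lookup is triggered because the secret pattern does not occur in the file):

import os
import pathlib

from cloe_nessy.models.mixins.read_instance_mixin import ReadInstancesMixin


class DatasetConfig(ReadInstancesMixin):
    """Hypothetical metadata model read from a YAML file."""

    name: str
    location: str


os.environ["LANDING_PATH"] = "/mnt/landing"
config_file = pathlib.Path("dataset.yaml")
config_file.write_text("name: sales\nlocation: '{{env:LANDING_PATH}}/sales'\n")

instance, errors = DatasetConfig.read_instance_from_file(config_file)
print(instance)  # name='sales' location='/mnt/landing/sales'
print(errors)    # []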
cloe_nessy/models/mixins/template_loader_mixin.py
@@ -0,0 +1,18 @@
+ from pathlib import Path
+
+ from jinja2 import FileSystemLoader
+ from jinja2 import Template as JinjaTemplate
+ from jinja2.environment import Environment
+
+
+ class TemplateLoaderMixin:
+     """A Mixin to load Jinja Templates."""
+
+     @staticmethod
+     def get_template(template_path: Path, template_name: str) -> JinjaTemplate:
+         """Load the specified template."""
+         loader: FileSystemLoader = FileSystemLoader(template_path)
+
+         env = Environment(loader=loader, keep_trailing_newline=True)
+
+         return env.get_template(template_name)
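Finally, a sketch of loading and rendering a Jinja template via TemplateLoaderMixin; the templates directory and the DDL template are illustrative placeholders:

from pathlib import Path

from cloe_nessy.models.mixins.template_loader_mixin import TemplateLoaderMixin

templates_dir = Path("templates")
templates_dir.mkdir(exist_ok=True)
(templates_dir / "create_table.sql.j2").write_text(
    "CREATE TABLE {{ identifier }} (id BIGINT)\n"
)

template = TemplateLoaderMixin.get_template(templates_dir, "create_table.sql.j2")
print(template.render(identifier="dev_catalog.sales.orders"))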