cloe-nessy 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/__init__.py +0 -0
- cloe_nessy/clients/__init__.py +5 -0
- cloe_nessy/clients/api_client/__init__.py +3 -0
- cloe_nessy/clients/api_client/api_client.py +188 -0
- cloe_nessy/clients/api_client/api_response.py +72 -0
- cloe_nessy/clients/api_client/auth.py +178 -0
- cloe_nessy/clients/api_client/exceptions.py +22 -0
- cloe_nessy/file_utilities/__init__.py +3 -0
- cloe_nessy/file_utilities/exceptions.py +4 -0
- cloe_nessy/file_utilities/factory.py +42 -0
- cloe_nessy/file_utilities/get_file_paths.py +72 -0
- cloe_nessy/file_utilities/location_types.py +29 -0
- cloe_nessy/file_utilities/strategies/__init__.py +0 -0
- cloe_nessy/file_utilities/strategies/base_strategy.py +59 -0
- cloe_nessy/file_utilities/strategies/local_strategy.py +51 -0
- cloe_nessy/file_utilities/strategies/onelake_strategy.py +31 -0
- cloe_nessy/file_utilities/strategies/utils_strategy.py +72 -0
- cloe_nessy/integration/__init__.py +0 -0
- cloe_nessy/integration/reader/__init__.py +6 -0
- cloe_nessy/integration/reader/api_reader.py +141 -0
- cloe_nessy/integration/reader/catalog_reader.py +49 -0
- cloe_nessy/integration/reader/excel_reader.py +170 -0
- cloe_nessy/integration/reader/exceptions.py +10 -0
- cloe_nessy/integration/reader/file_reader.py +96 -0
- cloe_nessy/integration/reader/reader.py +34 -0
- cloe_nessy/integration/writer/__init__.py +3 -0
- cloe_nessy/integration/writer/catalog_writer.py +48 -0
- cloe_nessy/logging/__init__.py +3 -0
- cloe_nessy/logging/logger_mixin.py +162 -0
- cloe_nessy/models/__init__.py +13 -0
- cloe_nessy/models/column.py +65 -0
- cloe_nessy/models/constraint.py +9 -0
- cloe_nessy/models/foreign_key.py +34 -0
- cloe_nessy/models/mixins/__init__.py +0 -0
- cloe_nessy/models/mixins/read_instance_mixin.py +124 -0
- cloe_nessy/models/mixins/template_loader_mixin.py +18 -0
- cloe_nessy/models/schema.py +76 -0
- cloe_nessy/models/table.py +236 -0
- cloe_nessy/models/types.py +7 -0
- cloe_nessy/object_manager/__init__.py +3 -0
- cloe_nessy/object_manager/table_manager.py +58 -0
- cloe_nessy/pipeline/__init__.py +7 -0
- cloe_nessy/pipeline/actions/__init__.py +50 -0
- cloe_nessy/pipeline/actions/read_api.py +178 -0
- cloe_nessy/pipeline/actions/read_catalog_table.py +68 -0
- cloe_nessy/pipeline/actions/read_excel.py +177 -0
- cloe_nessy/pipeline/actions/read_files.py +105 -0
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +66 -0
- cloe_nessy/pipeline/actions/transform_change_datatype.py +56 -0
- cloe_nessy/pipeline/actions/transform_concat_columns.py +88 -0
- cloe_nessy/pipeline/actions/transform_decode.py +102 -0
- cloe_nessy/pipeline/actions/transform_distinct.py +40 -0
- cloe_nessy/pipeline/actions/transform_filter.py +51 -0
- cloe_nessy/pipeline/actions/transform_generic_sql.py +66 -0
- cloe_nessy/pipeline/actions/transform_join.py +81 -0
- cloe_nessy/pipeline/actions/transform_json_normalize.py +106 -0
- cloe_nessy/pipeline/actions/transform_rename_columns.py +60 -0
- cloe_nessy/pipeline/actions/transform_replace_values.py +59 -0
- cloe_nessy/pipeline/actions/transform_select_columns.py +83 -0
- cloe_nessy/pipeline/actions/transform_union.py +71 -0
- cloe_nessy/pipeline/actions/write_catalog_table.py +73 -0
- cloe_nessy/pipeline/pipeline.py +201 -0
- cloe_nessy/pipeline/pipeline_action.py +62 -0
- cloe_nessy/pipeline/pipeline_config.py +92 -0
- cloe_nessy/pipeline/pipeline_context.py +56 -0
- cloe_nessy/pipeline/pipeline_parsing_service.py +156 -0
- cloe_nessy/pipeline/pipeline_step.py +50 -0
- cloe_nessy/py.typed +0 -0
- cloe_nessy/session/__init__.py +3 -0
- cloe_nessy/session/session_manager.py +188 -0
- cloe_nessy/settings/__init__.py +3 -0
- cloe_nessy/settings/settings.py +91 -0
- cloe_nessy/utils/__init__.py +0 -0
- cloe_nessy/utils/file_and_directory_handler.py +19 -0
- cloe_nessy-0.2.9.dist-info/METADATA +26 -0
- cloe_nessy-0.2.9.dist-info/RECORD +78 -0
- cloe_nessy-0.2.9.dist-info/WHEEL +5 -0
- cloe_nessy-0.2.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,96 @@
+from typing import Any
+
+import pyspark.sql.functions as F
+from pyspark.sql import DataFrame
+
+from ...file_utilities import get_file_paths
+from .reader import BaseReader
+
+
+class FileReader(BaseReader):
+    """Utility class for reading a file into a DataFrame.
+
+    This class reads data from files and loads it into a Spark DataFrame.
+    """
+
+    def __init__(self):
+        """Initializes the FileReader object."""
+        super().__init__()
+
+    def read(
+        self,
+        location: str,
+        spark_format: str | None = None,
+        extension: str | None = None,
+        schema: str | None = None,
+        search_subdirs: bool = True,
+        options: dict | None = None,
+        add_metadata_column: bool = False,
+        **kwargs: Any,
+    ) -> DataFrame:
+        """Reads files from a specified location and returns a DataFrame.
+
+        Arguments:
+            location: Location of files to read.
+            spark_format: Format of files to read. If not provided, it will be inferred from the extension.
+            extension: File extension (csv, json, parquet, txt). Used if spark_format is not provided.
+            schema: Schema of the file. If None, schema will be inferred.
+            search_subdirs: Whether to include files in subdirectories.
+            options: Spark DataFrame reader options.
+            add_metadata_column: Whether to include __metadata column in the DataFrame.
+            kwargs: This method does not accept any additional keyword arguments.
+        """
+        if options is None:
+            options = {}
+
+        if not spark_format and not extension:
+            raise ValueError("Either spark_format or extension must be provided.")
+        self._console_logger.debug(f"Reading files from [ '{location}' ] ...")
+        extension_to_datatype_dict = {"csv": "csv", "json": "json", "parquet": "parquet", "txt": "text", "xml": "xml"}
+
+        if extension and not spark_format:
+            if extension not in extension_to_datatype_dict:
+                raise ValueError(f"Unsupported file extension: {extension}")
+            spark_format = extension_to_datatype_dict[extension]
+        self._console_logger.debug(f"Reading files with format: {spark_format}")
+        if extension:
+            file_paths = get_file_paths(location, extension, search_subdirs)
+        else:
+            file_paths = [location]
+        self._console_logger.debug(f"Found {len(file_paths)} files to read")
+        self._console_logger.debug(f"File paths: {file_paths}")
+        assert spark_format is not None
+
+        reader = self._spark.read.format(spark_format)
+        if schema:
+            reader.schema(schema)
+        else:
+            options["inferSchema"] = True
+
+        self._console_logger.debug(f"Setting options: {options}")
+        reader.options(**options)
+
+        try:
+            self._console_logger.debug("Loading files into DataFrame")
+            df = reader.load(file_paths)
+            self._console_logger.debug("Successfully loaded files into DataFrame")
+            if add_metadata_column:
+                df = self._add_metadata_column(df)
+        except Exception as e:
+            self._console_logger.error(f"Failed to read files from [ '{location}' ]: {e}")
+            raise
+        else:
+            self._console_logger.info(f"Successfully read files from [ '{location}' ]")
+            return df
+
+    def _add_metadata_column(self, df: DataFrame) -> DataFrame:
+        """Add all metadata columns to the DataFrame."""
+        # Extract metadata fields into separate columns
+        metadata_columns = df.select("_metadata.*").columns
+
+        entries = [(F.lit(field), F.col(f"_metadata.{field}")) for field in metadata_columns]
+        flat_list = [item for tup in entries for item in tup]
+
+        df = df.withColumn("__metadata", F.create_map(flat_list))
+
+        return df
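The read() method above resolves the Spark format, collects the matching file paths, and loads them in one call. A minimal usage sketch (not part of the package) is shown next; it assumes a Spark-enabled environment in which SessionManager.get_spark_session() and NessySettings() resolve, and the location path is hypothetical.

from cloe_nessy.integration.reader.file_reader import FileReader

reader = FileReader()
df = reader.read(
    location="/Volumes/dev/raw/sales",  # hypothetical directory of CSV files
    extension="csv",
    options={"header": "true"},
    add_metadata_column=True,  # adds a __metadata map column built from _metadata
)
df.show()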
@@ -0,0 +1,34 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from pyspark.sql import DataFrame, SparkSession
+
+from ...logging.logger_mixin import LoggerMixin
+from ...session import SessionManager
+
+
+class BaseReader(ABC, LoggerMixin):
+    """Abstract base class for reading data into a Spark DataFrame.
+
+    This class provides a common interface for different types of data readers.
+
+    Attributes:
+        _spark: The Spark session used for creating DataFrames.
+    """
+
+    def __init__(self) -> None:
+        self._spark: SparkSession = SessionManager.get_spark_session()
+        self._console_logger = self.get_console_logger()
+
+    @abstractmethod
+    def read(self, *args: Any, **kwargs: Any) -> DataFrame:
+        """Abstract method to return a batch data frame.
+
+        Args:
+            *args: Arbitrary non-keyword arguments for reading data.
+            **kwargs: Arbitrary keyword arguments for reading data.
+
+        Returns:
+            DataFrame: The Spark DataFrame containing the read data.
+        """
+        pass
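Concrete readers such as FileReader subclass BaseReader and implement read(), reusing the inherited Spark session and console logger. A minimal sketch of a custom reader (hypothetical, not part of the package) that follows the same pattern:

from typing import Any

from pyspark.sql import DataFrame

from cloe_nessy.integration.reader.reader import BaseReader


class TableReader(BaseReader):  # hypothetical subclass for illustration
    def read(self, table_name: str, **kwargs: Any) -> DataFrame:
        self._console_logger.debug(f"Reading table [ '{table_name}' ]")
        return self._spark.read.table(table_name)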
@@ -0,0 +1,48 @@
+from pyspark.sql import DataFrame
+
+
+class CatalogWriter:
+    """A writer for Catalog tables."""
+
+    @staticmethod
+    def write_table(
+        df: DataFrame | None,
+        table_identifier: str | None,
+        partition_by: str | list[str] | None = None,
+        options: dict[str, str] | None = None,
+        mode: str = "append",
+    ) -> None:
+        """Write a table to the unity catalog.
+
+        Args:
+            df: The DataFrame to write.
+            table_identifier: The table identifier in the unity catalog in the
+                format 'catalog.schema.table'.
+            mode: The write mode. One of append, overwrite, error, errorifexists, ignore.
+            partition_by: Names of the partitioning columns.
+            options: All other string options.
+
+        Notes:
+            append: Append contents of this DataFrame to existing data.
+            overwrite: Overwrite existing data.
+            error or errorifexists: Throw an exception if data already exists.
+            ignore: Silently ignore this operation if data already exists.
+
+        Raises:
+            ValueError: If the mode is not one of append, overwrite, error, errorifexists, ignore.
+            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
+            ValueError: If the DataFrame is None.
+        """
+        if mode not in ("append", "overwrite", "error", "errorifexists", "ignore"):
+            raise ValueError("mode must be one of append, overwrite, error, errorifexists, ignore")
+        if not table_identifier:
+            raise ValueError("table_identifier is required")
+        elif not isinstance(table_identifier, str):
+            raise ValueError("table_identifier must be a string")
+        elif len(table_identifier.split(".")) != 3:
+            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
+        if not df:
+            raise ValueError("df is required, but was None.")
+        if options is None:
+            options = {}
+        df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
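Because write_table is a static method, it can be called without instantiating the class. A hedged usage sketch (the identifier and partition column are hypothetical), assuming df is an existing Spark DataFrame in a Unity Catalog-enabled workspace:

from cloe_nessy.integration.writer.catalog_writer import CatalogWriter

CatalogWriter.write_table(
    df=df,
    table_identifier="dev_catalog.sales.orders",  # must be 'catalog.schema.table'
    mode="overwrite",
    partition_by="order_date",
)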
@@ -0,0 +1,162 @@
+import logging
+import logging.handlers
+from typing import cast
+
+from cloe_logging import LoggerFactory
+
+from ..settings import LoggingSettings, NessySettings
+
+factory = LoggerFactory()
+
+DEFAULT_COLUMN_SPLIT_CHAR = "|"
+DEFAULT_KEY_VALUE_SPLIT_CHAR = ":"
+
+
+class LoggerMixin:
+    """LoggingMixin class to add logging functionality to classes."""
+
+    def get_console_logger(
+        self,
+        level: int | None = None,
+        log_format: str | None = None,
+    ) -> logging.Logger:
+        """Adds a console logger to the class.
+
+        Args:
+            level: The logging level for the console logger.
+            log_format: The format for the console logger.
+
+        Returns:
+            The logger with the console handler.
+        """
+        logging_settings: LoggingSettings = NessySettings().logging
+        logger = LoggerFactory.get_logger(
+            handler_types=["console"],
+            logger_name=f"Console:{self.__class__.__name__}",
+            logging_level=level if level is not None else logging_settings.log_level_console,
+            log_format=log_format if log_format is not None else logging_settings.log_format_console,
+        )
+        return cast(logging.Logger, logger)
+
+    def get_tabular_logger(
+        self,
+        logger_name: str | None = None,
+        handlers: list[str] | None = None,
+        level: int | None = None,
+        add_log_analytics_logger: bool | None = None,
+        add_unity_catalog_logger: bool | None = None,
+        # LAW
+        log_type: str | None = None,
+        workspace_id: str | None = None,
+        shared_key: str | None = None,
+        # UC
+        uc_workspace_url: str | None = None,
+        uc_warehouse_id: str | None = None,
+        uc_catalog_name: str | None = None,
+        uc_schema_name: str | None = None,
+        uc_table_name: str | None = None,
+        uc_table_columns: dict[str, str] | None = None,
+        column_split_char: str = DEFAULT_COLUMN_SPLIT_CHAR,
+        key_value_split_char: str = DEFAULT_KEY_VALUE_SPLIT_CHAR,
+    ) -> logging.Logger:
+        """Adds a tabular logger to the class.
+
+        Args:
+            logger_name: The name of the logger.
+            handlers: The list of handlers to add.
+            level: The logging level for the tabular logger. If not provided, the value from the settings will be used.
+            add_log_analytics_logger: Whether to add a LogAnalyticsHandler to the logger. If not provided, the value from the settings will be used.
+            add_unity_catalog_logger: Whether to add a UnityCatalogHandler to the logger. If not provided, the value from the settings will be used.
+            log_type: The log type for the Log Analytics workspace.
+            workspace_id: The workspace id for the Log Analytics workspace. If not provided, the value from the settings will be used.
+            shared_key: The shared key for the Log Analytics workspace.
+            uc_workspace_url: The workspace url for the Unity Catalog. If not provided, the value from the settings will be used.
+            uc_warehouse_id: The warehouse id for the Unity Catalog. If not provided, the value from the settings will be used.
+            uc_catalog_name: The catalog name for the Unity Catalog. If not provided, the value from the settings will be used.
+            uc_schema_name: The schema name for the Unity Catalog. If not provided, the value from the settings will be used.
+            uc_table_name: The table name for the Unity Catalog.
+            uc_table_columns: The columns for the Unity Catalog Table.
+            column_split_char: The column split character for the Log Analytics workspace and Unity Catalog. Defaults to "|".
+            key_value_split_char: The key value split character for the Log Analytics workspace and Unity Catalog. Defaults to ":".
+
+        Returns:
+            The logger with the added tabular handlers.
+        """
+        if handlers is None:
+            handlers = []
+        logging_settings = NessySettings().logging
+
+        if self.should_add_log_analytics_handler(logging_settings, add_log_analytics_logger):
+            handlers.append("log_analytics")
+
+        if self.should_add_unity_catalog_handler(logging_settings, add_unity_catalog_logger):
+            handlers.append("unity_catalog")
+
+        logger = LoggerFactory.get_logger(
+            handler_types=handlers,
+            logger_name=logger_name or f"Tabular:{self.__class__.__name__}",
+            level=level,
+            column_split_char=column_split_char,
+            key_value_split_char=key_value_split_char,
+            # UC Settings
+            uc_table_name=uc_table_name,
+            uc_catalog_name=uc_catalog_name or logging_settings.uc_catalog_name,
+            uc_schema_name=uc_schema_name or logging_settings.uc_schema_name,
+            uc_table_columns=uc_table_columns,
+            workspace_url=uc_workspace_url or logging_settings.uc_workspace_url,
+            warehouse_id=uc_warehouse_id or logging_settings.uc_warehouse_id,
+            # LAW Settings
+            workspace_id=workspace_id or logging_settings.log_analytics_workspace_id,
+            shared_key=shared_key or logging_settings.log_analytics_shared_key,
+            log_type=log_type,
+            test_connectivity=False,
+        )
+        return cast(logging.Logger, logger)
+
+    @staticmethod
+    def should_add_log_analytics_handler(
+        logging_settings: LoggingSettings,
+        add_log_analytics_logger: bool | None,
+        **kwargs,  # noqa: ARG004
+    ) -> bool:
+        """Determines if a LogAnalyticsHandler should be added to the logger.
+
+        The Logger will be added if the `target_log_analytics` setting is set to True or if the `add_log_analytics_logger`
+        argument is set to True.
+
+        Setting `target_log_analytics` to False will prevent the handler from being added.
+
+        Args:
+            logging_settings: The logging settings to use for the logger.
+            add_log_analytics_logger: Whether to add a LogAnalyticsHandler to the logger.
+            **kwargs: Additional keyword arguments. Not used.
+
+        Returns:
+            bool: True if the LogAnalyticsHandler should be added, False otherwise.
+        """
+        disable_overwrite = logging_settings.target_log_analytics is False
+        enable_logger = logging_settings.target_log_analytics or add_log_analytics_logger
+        return cast(bool, enable_logger and not disable_overwrite)
+
+    @staticmethod
+    def should_add_unity_catalog_handler(
+        logging_settings: LoggingSettings,
+        add_unity_catalog_logger: bool | None,
+    ) -> bool:
+        """Determines if a UnityCatalogHandler should be added to the logger.
+
+        The Logger will be added if the `target_unity_catalog_table` setting is set to True or if the `add_unity_catalog_logger`
+        argument is set to True.
+
+        Setting `target_unity_catalog_table` to False will prevent the handler from being added.
+
+        Args:
+            logging_settings: The logging settings to use for the logger.
+            add_unity_catalog_logger: Whether to add a UnityCatalogHandler to the logger.
+
+        Returns:
+            bool: True if the UnityCatalogHandler should be added, False otherwise.
+        """
+        disable_overwrite = logging_settings.target_unity_catalog_table is False
+        enable_logger = logging_settings.target_unity_catalog_table or add_unity_catalog_logger
+        return cast(bool, enable_logger and not disable_overwrite)
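Any class can pull in this mixin to obtain preconfigured loggers. A minimal sketch (the consumer class is hypothetical) that assumes NessySettings can resolve its logging configuration from the environment:

from cloe_nessy.logging.logger_mixin import LoggerMixin


class IngestionJob(LoggerMixin):  # hypothetical consumer class
    def __init__(self) -> None:
        self._logger = self.get_console_logger()

    def run(self) -> None:
        self._logger.info("Starting ingestion job")


IngestionJob().run()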
@@ -0,0 +1,65 @@
+import re
+from typing import Any
+
+from pydantic import BaseModel, Field, field_validator, model_validator
+
+COLUMN_DATA_TYPE_LIST = {
+    "string",
+    "integer",
+    "int",
+    "smallint",
+    "float",
+    "boolean",
+    "bool",
+    "bigint",
+    "long",
+    "double",
+    "date",
+    "timestamp",
+    "array",
+    "map",
+    "variant",
+    "struct",
+}
+
+
+class Column(BaseModel):
+    """Represents a Column of a Table."""
+
+    name: str
+    data_type: str
+    nullable: bool
+    default_value: Any = None
+    generated: str | None = None
+    properties: dict[str, Any] = Field(default_factory=dict)
+    comment: str | None = None
+
+    @field_validator("data_type", mode="before")
+    def data_type_transform(cls, raw: str) -> str:
+        """Map potential aliases to the correct SQL data type.
+
+        Args:
+            raw: The value for the data type.
+        """
+        val = raw.lower()
+        base_data_types = re.findall(r"\b[a-z]+\b", val)
+        forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)
+
+        if forbidden_characters:
+            raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
+        for base_data_type in base_data_types:
+            if base_data_type not in COLUMN_DATA_TYPE_LIST:
+                raise ValueError(f"Unknown data type used in data type definition [ '{val}' ]")
+        return val
+
+    @model_validator(mode="before")
+    def _validate_generated_and_default_value(cls, v: Any) -> Any:
+        """Check if a column has a default value and is generated.
+
+        That doesn't make sense, so an error should be raised.
+        """
+        if v.get("default_value") and v.get("generated"):
+            raise ValueError("A column can't have a default value and be generated.")
+        if (v.get("default_value") or v.get("generated")) and v.get("nullable") is True:
+            raise ValueError("A column can't have a default value or be generated and be nullable.")
+        return v
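The two validators normalize the data type string and reject inconsistent combinations of default_value, generated, and nullable. A short sketch of the resulting behavior (assuming pydantic is installed and the model is imported from its module path):

from pydantic import ValidationError

from cloe_nessy.models.column import Column

# "BIGINT" is lower-cased to "bigint" by data_type_transform.
key_column = Column(name="id", data_type="BIGINT", nullable=False)

# A generated column that is also nullable is rejected by the model validator.
try:
    Column(name="loaded_at", data_type="timestamp", nullable=True, generated="now()")
except ValidationError as err:
    print(err)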
@@ -0,0 +1,34 @@
+import os
+
+from pydantic import BaseModel, field_validator
+
+
+def _process_column_input(v):
+    if isinstance(v, str):
+        v = [col.strip() for col in v.split(",")]
+    return v
+
+
+class ForeignKey(BaseModel):
+    """Represents a ForeignKey."""
+
+    foreign_key_columns: list[str]
+    parent_table: str
+    parent_columns: list[str]
+    foreign_key_option: list[str] | None = None
+
+    @field_validator("foreign_key_columns", mode="before")
+    def _validate_foreign_key_columns(cls, v):
+        return _process_column_input(v)
+
+    @field_validator("parent_columns", mode="before")
+    def _validate_parent_columns(cls, v):
+        return _process_column_input(v)
+
+    @field_validator("parent_table", mode="before")
+    def _validate_identifier(cls, v):
+        if len(v.split(".")) != 3:
+            raise ValueError("The 'parent_table' must be in the format 'catalog.schema.table'")
+        if "<env>" in v:
+            v = v.replace("<env>", os.environ["PROJECT_ENVIRONMENT"])
+        return v
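The validators accept comma-separated strings for the column lists and substitute the <env> placeholder in the parent table identifier. A short sketch (the environment name and identifiers are hypothetical):

import os

from cloe_nessy.models.foreign_key import ForeignKey

os.environ["PROJECT_ENVIRONMENT"] = "dev"  # consumed by the <env> replacement

fk = ForeignKey(
    foreign_key_columns="order_id, order_line",  # coerced to ["order_id", "order_line"]
    parent_table="sales_<env>.core.orders",  # becomes "sales_dev.core.orders"
    parent_columns=["order_id", "order_line"],
)
print(fk.parent_table)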
File without changes
@@ -0,0 +1,124 @@
+import os
+import pathlib
+import re
+from typing import Any, Self
+
+import yaml
+import yaml.parser
+import yaml.scanner
+from pydantic import BaseModel, ValidationError
+
+from ...session import SessionManager
+from ..types import ValidationErrorType
+
+
+class ReadInstancesMixin(BaseModel):
+    """This class defines the methods to read, validate and parse metadata definitions."""
+
+    @classmethod
+    def metadata_to_instance(cls, data: dict) -> tuple[Self | None, list[ValidationError]]:
+        """Parses a Dictionary to an instance.
+
+        Returns:
+            An instance and potentially a list of errors.
+        """
+        errors = []
+        try:
+            instance = cls(**data)
+        except ValidationError as e:
+            instance = None
+            errors.append(e)
+        return instance, errors
+
+    @classmethod
+    def read_instance_from_file(
+        cls,
+        instance_path: pathlib.Path,
+        **_: Any,  # allow subclasses to pass additional arguments
+    ) -> tuple[Self | None, list[ValidationErrorType]]:
+        """Read and instantiate a single YAML file for the given path.
+
+        Arguments:
+            instance_path: The path to the file to instantiate.
+
+        Return:
+            Returns a tuple of the instantiated model and errors.
+        """
+        errors: list[ValidationErrorType] = []
+        try:
+            with instance_path.open("r") as file:
+                raw_string = file.read()
+                yaml_str = cls._replace_variables(raw_string)
+                data = yaml.safe_load(yaml_str)
+                instance, sub_errors = cls.metadata_to_instance(data)
+                errors += sub_errors
+        except (ValidationError, yaml.parser.ParserError, yaml.scanner.ScannerError) as e:
+            instance = None
+            errors.append(e)
+        return instance, errors
+
+    @classmethod
+    def read_instances_from_directory(
+        cls,
+        instance_path: pathlib.Path,
+        fail_on_missing_subfolder: bool = True,
+        **_: Any,  # allow subclasses to pass additional arguments
+    ) -> tuple[list[Self], list[ValidationErrorType]]:
+        """Read and instantiate all *.yaml files for the given path.
+
+        Arguments:
+            instance_path: Path to the directory containing the instance definitions as YAML files.
+            fail_on_missing_subfolder: If False return a tuple with 2 empty
+                lists. Otherwise raise a FileNotFoundError.
+
+        Return:
+            Returns a tuple of the instantiated models and errors.
+        """
+        instances: list[Self] = []
+        errors: list[ValidationErrorType] = []
+
+        if not instance_path.exists() or not instance_path.is_dir():
+            if fail_on_missing_subfolder:
+                raise FileNotFoundError(f"Directory not found: {instance_path}")
+            else:
+                return instances, errors
+
+        for instance_file in instance_path.iterdir():
+            sub_errors: list[ValidationErrorType] = []
+            if instance_file.is_file() and instance_file.suffix in (".yaml", ".yml"):
+                instance, sub_errors = cls.read_instance_from_file(instance_file)
+                instances += [] if instance is None else [instance]
+            errors += sub_errors
+
+        return instances, errors
+
+    @staticmethod
+    def _replace_variables(yaml_str: str) -> str:
+        """Replace variable placeholders in a YAML string.
+
+        Replaces environment variables with the pattern `{{env:var-name}}`. Where
+        the var-name is the name of the environment variable.
+
+        Args:
+            yaml_str (str): A string that can be parsed in YAML format.
+
+        Returns:
+            The same YAML string with environment variable placeholders
+            replaced.
+        """
+        env_var_pattern = r"\{\{env:([^}]+)\}\}"
+        secret_ref_pattern = r"\{\{(?!step|env)([^}]+):([^}]+)\}\}"
+
+        def replace_with_env_var(match):
+            env_var_name = match.group(1)
+            env_var_value = os.getenv(env_var_name)
+            return env_var_value
+
+        def replace_with_secret(match):
+            secret_scope_name = match.group(1)
+            secret_key = match.group(2)
+            return SessionManager.get_utils().secrets.get(scope=secret_scope_name, key=secret_key)
+
+        env_replaced_yaml_string = re.sub(env_var_pattern, replace_with_env_var, yaml_str)
+        final_yaml_string = re.sub(secret_ref_pattern, replace_with_secret, env_replaced_yaml_string)
+        return final_yaml_string
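Models that inherit this mixin can be populated directly from YAML files, with {{env:VAR}} placeholders resolved from the environment and secret references resolved via SessionManager. A minimal sketch (the subclass, its fields, and the directory are hypothetical; no secret references are assumed):

import pathlib

from cloe_nessy.models.mixins.read_instance_mixin import ReadInstancesMixin


class DatasetDefinition(ReadInstancesMixin):  # hypothetical metadata model
    name: str
    location: str


instances, errors = DatasetDefinition.read_instances_from_directory(
    pathlib.Path("metadata/datasets"),  # hypothetical folder of *.yaml files
    fail_on_missing_subfolder=False,  # return empty lists if the folder is absent
)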
@@ -0,0 +1,18 @@
+from pathlib import Path
+
+from jinja2 import FileSystemLoader
+from jinja2 import Template as JinjaTemplate
+from jinja2.environment import Environment
+
+
+class TemplateLoaderMixin:
+    """A Mixin to load Jinja Templates."""
+
+    @staticmethod
+    def get_template(template_path: Path, template_name: str) -> JinjaTemplate:
+        """Load the specified template."""
+        loader: FileSystemLoader = FileSystemLoader(template_path)
+
+        env = Environment(loader=loader, keep_trailing_newline=True)
+
+        return env.get_template(template_name)
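get_template is a static method, so it can be used directly to render SQL or other text templates from a directory. A short sketch (the template directory, file name, and variable are hypothetical):

from pathlib import Path

from cloe_nessy.models.mixins.template_loader_mixin import TemplateLoaderMixin

template = TemplateLoaderMixin.get_template(Path("templates"), "create_table.sql.j2")
print(template.render(table_name="sales"))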