cloe-nessy 0.3.5__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. cloe_nessy/__init__.py +0 -0
  2. cloe_nessy/clients/__init__.py +0 -0
  3. cloe_nessy/clients/api_client/__init__.py +0 -0
  4. cloe_nessy/clients/api_client/api_client.py +0 -0
  5. cloe_nessy/clients/api_client/api_response.py +0 -0
  6. cloe_nessy/clients/api_client/auth.py +0 -0
  7. cloe_nessy/clients/api_client/exceptions.py +0 -0
  8. cloe_nessy/file_utilities/__init__.py +0 -0
  9. cloe_nessy/file_utilities/exceptions.py +0 -0
  10. cloe_nessy/file_utilities/factory.py +0 -0
  11. cloe_nessy/file_utilities/get_file_paths.py +0 -0
  12. cloe_nessy/file_utilities/location_types.py +0 -0
  13. cloe_nessy/file_utilities/strategies/__init__.py +0 -0
  14. cloe_nessy/file_utilities/strategies/base_strategy.py +0 -0
  15. cloe_nessy/file_utilities/strategies/local_strategy.py +0 -0
  16. cloe_nessy/file_utilities/strategies/onelake_strategy.py +0 -0
  17. cloe_nessy/file_utilities/strategies/utils_strategy.py +0 -0
  18. cloe_nessy/integration/__init__.py +0 -0
  19. cloe_nessy/integration/reader/__init__.py +0 -0
  20. cloe_nessy/integration/reader/api_reader.py +4 -2
  21. cloe_nessy/integration/reader/catalog_reader.py +6 -3
  22. cloe_nessy/integration/reader/excel_reader.py +1 -1
  23. cloe_nessy/integration/reader/exceptions.py +0 -0
  24. cloe_nessy/integration/reader/file_reader.py +78 -5
  25. cloe_nessy/integration/reader/reader.py +0 -0
  26. cloe_nessy/integration/writer/__init__.py +8 -1
  27. cloe_nessy/integration/writer/catalog_writer.py +0 -0
  28. cloe_nessy/integration/writer/delta_writer/__init__.py +7 -0
  29. cloe_nessy/integration/writer/delta_writer/delta_append_writer.py +108 -0
  30. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +215 -0
  31. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +21 -0
  32. cloe_nessy/integration/writer/delta_writer/delta_writer_base.py +210 -0
  33. cloe_nessy/integration/writer/delta_writer/exceptions.py +4 -0
  34. cloe_nessy/integration/writer/file_writer.py +132 -0
  35. cloe_nessy/integration/writer/writer.py +54 -0
  36. cloe_nessy/logging/__init__.py +0 -0
  37. cloe_nessy/logging/logger_mixin.py +0 -0
  38. cloe_nessy/models/__init__.py +4 -0
  39. cloe_nessy/models/adapter/__init__.py +3 -0
  40. cloe_nessy/models/adapter/unity_catalog_adapter.py +296 -0
  41. cloe_nessy/models/catalog.py +10 -0
  42. cloe_nessy/models/column.py +0 -0
  43. cloe_nessy/models/constraint.py +0 -0
  44. cloe_nessy/models/foreign_key.py +0 -0
  45. cloe_nessy/models/mixins/__init__.py +0 -0
  46. cloe_nessy/models/mixins/read_instance_mixin.py +0 -0
  47. cloe_nessy/models/mixins/template_loader_mixin.py +0 -0
  48. cloe_nessy/models/schema.py +20 -1
  49. cloe_nessy/models/table.py +67 -11
  50. cloe_nessy/models/types.py +0 -0
  51. cloe_nessy/models/volume.py +67 -0
  52. cloe_nessy/object_manager/__init__.py +7 -2
  53. cloe_nessy/object_manager/table_manager.py +251 -21
  54. cloe_nessy/object_manager/volume_manager.py +70 -0
  55. cloe_nessy/pipeline/__init__.py +0 -0
  56. cloe_nessy/pipeline/actions/__init__.py +9 -1
  57. cloe_nessy/pipeline/actions/read_api.py +0 -0
  58. cloe_nessy/pipeline/actions/read_catalog_table.py +1 -4
  59. cloe_nessy/pipeline/actions/read_excel.py +0 -0
  60. cloe_nessy/pipeline/actions/read_files.py +0 -0
  61. cloe_nessy/pipeline/actions/read_metadata_yaml.py +0 -0
  62. cloe_nessy/pipeline/actions/transform_change_datatype.py +0 -0
  63. cloe_nessy/pipeline/actions/transform_clean_column_names.py +0 -0
  64. cloe_nessy/pipeline/actions/transform_concat_columns.py +0 -0
  65. cloe_nessy/pipeline/actions/transform_decode.py +0 -0
  66. cloe_nessy/pipeline/actions/transform_deduplication.py +0 -0
  67. cloe_nessy/pipeline/actions/transform_distinct.py +0 -0
  68. cloe_nessy/pipeline/actions/transform_filter.py +0 -0
  69. cloe_nessy/pipeline/actions/transform_generic_sql.py +0 -0
  70. cloe_nessy/pipeline/actions/transform_group_aggregate.py +0 -0
  71. cloe_nessy/pipeline/actions/transform_hash_columns.py +209 -0
  72. cloe_nessy/pipeline/actions/transform_join.py +0 -0
  73. cloe_nessy/pipeline/actions/transform_json_normalize.py +0 -0
  74. cloe_nessy/pipeline/actions/transform_rename_columns.py +0 -0
  75. cloe_nessy/pipeline/actions/transform_replace_values.py +0 -0
  76. cloe_nessy/pipeline/actions/transform_select_columns.py +0 -0
  77. cloe_nessy/pipeline/actions/transform_union.py +0 -0
  78. cloe_nessy/pipeline/actions/write_catalog_table.py +0 -0
  79. cloe_nessy/pipeline/actions/write_delta_append.py +69 -0
  80. cloe_nessy/pipeline/actions/write_delta_merge.py +118 -0
  81. cloe_nessy/pipeline/actions/write_file.py +94 -0
  82. cloe_nessy/pipeline/pipeline.py +44 -2
  83. cloe_nessy/pipeline/pipeline_action.py +0 -0
  84. cloe_nessy/pipeline/pipeline_config.py +0 -0
  85. cloe_nessy/pipeline/pipeline_context.py +0 -0
  86. cloe_nessy/pipeline/pipeline_parsing_service.py +0 -0
  87. cloe_nessy/pipeline/pipeline_step.py +0 -0
  88. cloe_nessy/py.typed +0 -0
  89. cloe_nessy/session/__init__.py +0 -0
  90. cloe_nessy/session/session_manager.py +27 -0
  91. cloe_nessy/settings/__init__.py +0 -0
  92. cloe_nessy/settings/settings.py +0 -0
  93. cloe_nessy/utils/__init__.py +0 -0
  94. cloe_nessy/utils/file_and_directory_handler.py +0 -0
  95. cloe_nessy-0.3.9.dist-info/METADATA +70 -0
  96. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/RECORD +35 -18
  97. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/WHEEL +1 -1
  98. {cloe_nessy-0.3.5.dist-info → cloe_nessy-0.3.9.dist-info}/top_level.txt +0 -0
  99. cloe_nessy-0.3.5.dist-info/METADATA +0 -26
@@ -0,0 +1,210 @@
1
+ import logging
2
+ from abc import ABC
3
+ from dataclasses import dataclass, field
4
+
5
+ from pyspark.sql import DataFrame, Row
6
+ from pyspark.sql.functions import col, concat, concat_ws, format_string, lit
7
+
8
+ from ....object_manager import TableManager
9
+ from ....session import SessionManager
10
+ from ..writer import BaseWriter
11
+ from .delta_table_operation_type import DeltaTableOperationType
12
+ from .exceptions import EmptyDataframeError
13
+
14
+
15
@dataclass
class DeltaWriterLogs:
    """Settings describing the log table used for simple delta writer messages.

    Instances carry the log type, the Unity Catalog table name and the column
    definitions of that table.
    """

    # Not annotated, so this is a plain class attribute rather than a dataclass
    # field: it is readable on the class/instance but absent from an instance's
    # ``__dict__``.
    logger_name = "Tabular:DeltaWriter"
    log_type: str = "nessy_simple_logs"
    uc_table_name: str = "nessy_simple_logs"
    # A factory is used so that every instance gets its own column mapping.
    uc_table_columns: dict[str, str] = field(default_factory=lambda: {"message": "STRING"})
27
+
28
+
29
@dataclass
class TableOperationMetricsLogs:
    """Settings describing the log table used for table operation metrics.

    Instances carry the log type, the Unity Catalog table name and the column
    definitions of that table.
    """

    # Not annotated, so this is a plain class attribute rather than a dataclass
    # field: it is absent from an instance's ``__dict__``.
    logger_name = "Tabular:TableOperationMetrics"
    log_type: str = "nessy_table_operation_metrics"
    uc_table_name: str = "nessy_table_operation_metrics"
    # Only the timestamp column has its own type; every other column is a STRING.
    uc_table_columns: dict[str, str] = field(
        default_factory=lambda: {
            "timestamp": "TIMESTAMP",
            **dict.fromkeys(
                (
                    "table_identifier",
                    "operation_type",
                    "metric",
                    "value",
                    "user_name",
                    "job_id",
                    "job_run_id",
                    "run_id",
                    "notebook_id",
                    "cluster_id",
                ),
                "STRING",
            ),
        }
    )
51
+
52
+
53
class BaseDeltaWriter(BaseWriter, ABC):
    """A base class for writing DataFrames to Delta tables.

    Bundles helpers shared by concrete Delta writers: reading the latest entry
    of a table's history, logging operation metrics, building merge and
    partition pruning SQL fragments, and guarding against empty DataFrames.
    """

    def __init__(
        self,
        tabular_logger: logging.Logger | None = None,
        table_operation_metrics_logger: logging.Logger | None = None,
    ):
        """Initializes session handles, the table manager and the loggers.

        Args:
            tabular_logger: Logger for general writer messages. If not given, one
                is created with `get_tabular_logger` from `DeltaWriterLogs`.
            table_operation_metrics_logger: Logger for table operation metrics. If
                not given, one is created with `get_tabular_logger`.
        """
        super().__init__()
        self._spark = SessionManager.get_spark_session()
        self._dbutils = SessionManager.get_utils()
        # NOTE(review): the metrics logger is configured from `DeltaWriterLogs`,
        # not from `TableOperationMetricsLogs` — confirm whether this is intended.
        self._table_operation_metrics_logger = table_operation_metrics_logger or self.get_tabular_logger(
            **DeltaWriterLogs().__dict__
        )
        self.table_manager = TableManager()
        self._tabular_logger = tabular_logger or self.get_tabular_logger(**DeltaWriterLogs().__dict__)

    def _delta_operation_log(self, table_identifier: str, operation_type: DeltaTableOperationType) -> dict:
        """Returns the most recent history entry of a Delta table for the given operation type.

        Args:
            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
            operation_type: A DeltaTableOperationType object specifying the type of
                operation for which the history entry should be retrieved.

        Returns:
            The history row with the highest version for that operation as a
            dictionary, or an empty dictionary if no such row exists.
        """
        delta_history = self._spark.sql(f"DESCRIBE HISTORY {table_identifier}")
        # The enum member name is compared against the `operation` column with
        # underscores replaced by spaces.
        operation_name = operation_type.name.replace("_", " ")
        # `first()` fetches a single row instead of collecting the whole filtered
        # history to the driver; it yields None when nothing matches.
        latest_entry = (
            delta_history.filter(col("operation") == operation_name)
            .orderBy("version", ascending=False)
            .first()
        )
        return latest_entry.asDict() if latest_entry is not None else {}

    def _report_delta_table_operation_metrics(
        self, table_identifier: str, operation_type: DeltaTableOperationType
    ) -> None:
        """Logs the most recent metrics of a Delta table for the given operation type.

        One log record is emitted per metric whose name is contained in
        `operation_type.value`. Nothing is logged if there is no matching
        history entry or it carries no such metrics.

        Args:
            table_identifier: The identifier of the Delta table in the format 'catalog.schema.table'.
            operation_type: A DeltaTableOperationType object specifying the type
                of operation for which metrics should be retrieved.
        """
        operation_log = self._delta_operation_log(table_identifier, operation_type)
        timestamp = operation_log.get("timestamp")
        user_name = operation_log.get("userName")

        # `job` and `notebook` are nested rows that may be missing or None.
        job_row = operation_log.get("job")
        job_info = job_row.asDict() if job_row else {}
        notebook_row = operation_log.get("notebook")
        notebook_info = notebook_row.asDict() if notebook_row else {}

        job_id = job_info.get("jobId")
        job_run_id = job_info.get("jobRunId")
        run_id = job_info.get("runId")
        notebook_id = notebook_info.get("notebookId")
        cluster_id = operation_log.get("clusterId")

        # `or {}` also covers the case where the key exists but holds None.
        operation_metrics = operation_log.get("operationMetrics") or {}
        affected_rows = {k: v for k, v in operation_metrics.items() if k in operation_type.value}
        for metric, value in affected_rows.items():
            log_message = f"""timestamp: {timestamp} |
            table_identifier: {table_identifier} |
            operation_type: {operation_type.name} |
            metric_name: {metric} |
            metric_value: {value} |
            user_name: {user_name} |
            job_id: {job_id} |
            job_run_id: {job_run_id} |
            run_id: {run_id} |
            notebook_id: {notebook_id} |
            cluster_id: {cluster_id}
            """
            self._table_operation_metrics_logger.info(log_message)

    @staticmethod
    def _merge_match_conditions(columns: list[str]) -> str:
        """Merges match conditions of the given columns into a single string.

        The result is a null-safe equality (`<=>`) between the `target` and
        `source` aliases for every column, with column names quoted in backticks.

        Args:
            columns: A list of strings representing the names of the columns to match.

        Returns:
            A string containing the match conditions, separated by " AND ".

        Example:
            ```python
            _merge_match_conditions(["column1", "column2"])
            # "target.`column1` <=> source.`column1` AND target.`column2` <=> source.`column2`"
            ```
        """
        return " AND ".join(f"target.`{c}` <=> source.`{c}`" for c in columns)

    @staticmethod
    def _partition_pruning_conditions(df, partition_cols: list[str] | None) -> str:
        """Generates partition pruning conditions for an SQL query.

        For every distinct combination of partition column values in `df`, an
        AND-group of equality checks against the `target` alias is built; the
        groups are combined with OR.

        Args:
            df: A Spark dataframe containing the data to generate the partition pruning
                conditions from.
            partition_cols: A list of strings representing the names of the
                partition columns.

        Returns:
            A string representing the partition pruning conditions, or an empty
            string if no partition columns are given or `df` has no rows.

        Example:
            ```python
            _partition_pruning_conditions(df, ["column1", "column2"])
            # "((target.`column1` = 'value1' AND target.`column2` = 'value2') OR
            #   (target.`column1` = 'value3' AND target.`column2` = 'value4'))"
            ```
        """
        if not partition_cols:
            return ""
        # NOTE(review): values are embedded as quoted strings without escaping, and
        # NULL partition values are presumably rendered as the text 'null' — verify
        # this is acceptable for the tables written.
        condition_rows = (
            df.select(*partition_cols)
            .distinct()
            .select([format_string("target.`%s` = '%s'", lit(c), col(c)).alias(c) for c in partition_cols])
            .withColumn("result", concat(lit("("), concat_ws(" AND ", *partition_cols), lit(")")))
            .select("result")
            .collect()
        )
        if not condition_rows:
            # Avoid returning "()", which is not a valid SQL condition.
            return ""
        return "(" + " OR ".join(row["result"] for row in condition_rows) + ")"

    def _empty_dataframe_check(self, df: DataFrame, ignore_empty_df: bool) -> bool | None:
        """Checks if a DataFrame is empty and raises an exception if it is not expected to be empty.

        Args:
            df: The DataFrame to check for emptiness.
            ignore_empty_df: If True, an empty DataFrame is tolerated. If False, an
                EmptyDataframeError is raised for an empty DataFrame.

        Returns:
            True if the DataFrame is empty and `ignore_empty_df` is True, None if
            the DataFrame is not empty.

        Raises:
            EmptyDataframeError: If the DataFrame is empty and ignore_empty_df is False.
        """
        if not df.isEmpty():
            return None
        if ignore_empty_df:
            return True
        raise EmptyDataframeError(
            "EMPTY DATAFRAME, nothing to write. If this is expected, consider setting `ignore_empty_df` to True.",
        )
@@ -0,0 +1,4 @@
1
class EmptyDataframeError(Exception):
    """Raised when a dataframe turns out to be empty although data was expected."""
@@ -0,0 +1,132 @@
1
+ from typing import Any
2
+
3
+ from pyspark.sql import DataFrame, DataFrameWriter
4
+ from pyspark.sql.streaming import DataStreamWriter
5
+
6
+ from .writer import BaseWriter
7
+
8
+
9
class FileWriter(BaseWriter):
    """Utility class for writing a DataFrame to a file location, in batch or as a stream."""

    def _get_writer(self, df: DataFrame) -> DataFrameWriter:
        """Returns the batch writer (`df.write`) of the given DataFrame."""
        return df.write

    def _get_stream_writer(self, df: DataFrame) -> DataStreamWriter:
        """Returns the stream writer (`df.writeStream`) of the given DataFrame."""
        return df.writeStream

    def _log_operation(self, location: str, status: str, error: str | None = None) -> None:
        """Logs the status of a write operation to the console logger.

        Args:
            location: The location that is written to.
            status: One of "start", "succeeded" or "failed"; any other value logs nothing.
            error: The error message, used only for the "failed" status.
        """
        if status == "start":
            self._console_logger.info(f"Starting to write to {location}")
        elif status == "succeeded":
            self._console_logger.info(f"Successfully wrote to {location}")
        elif status == "failed":
            self._console_logger.error(f"Failed to write to {location}: {error}")

    def _validate_trigger(self, trigger_dict: dict[str, Any]) -> None:
        """Validates that the trigger configuration contains a supported trigger type.

        Only the presence of at least one supported key is checked; additional
        keys are not rejected here.

        Args:
            trigger_dict: The trigger configuration to validate.

        Raises:
            ValueError: If none of the supported trigger keys is present.
        """
        triggers = ["processingTime", "once", "continuous", "availableNow"]
        if not any(trigger in trigger_dict for trigger in triggers):
            raise ValueError(f"Invalid trigger type. Supported types are: {', '.join(triggers)}")

    def write_stream(
        self,
        data_frame: DataFrame | None = None,
        location: str | None = None,
        format: str = "delta",
        checkpoint_location: str | None = None,
        partition_cols: list[str] | None = None,
        mode: str = "append",
        trigger_dict: dict | None = None,
        options: dict[str, Any] | None = None,
        await_termination: bool = False,
        **_: Any,
    ):
        """Writes a dataframe to specified location in specified format as a stream.

        Args:
            data_frame: The DataFrame to write.
            location: The location to write the DataFrame to.
            format: The format to write the DataFrame in.
            checkpoint_location: Location of checkpoint. If None, defaults
                to the location of the table being written, with '_checkpoint_'
                added before the name.
            partition_cols: Columns to partition by.
            mode: The output mode of the stream.
            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
                Supported keys include:

                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
                - "once": Processes all available data once and then stops.
                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
                - "availableNow": Processes all available data immediately and then stops.

                If nothing is provided, the default is {"availableNow": True}.
            options: Additional options for writing.
            await_termination: If True, the function will wait for the streaming
                query to finish before returning. This is useful for ensuring that
                the data has been fully written before proceeding with other
                operations.

        Raises:
            ValueError: If `location` or `data_frame` is missing, or the trigger
                configuration contains no supported trigger type.
        """
        # Explicit None check: do not rely on the truthiness of a DataFrame object.
        if not location or data_frame is None:
            raise ValueError("Location and data_frame are required for streaming.")

        self._log_operation(location, "start")
        try:
            options = options or {}
            trigger_dict = trigger_dict or {"availableNow": True}
            checkpoint_location = self._get_checkpoint_location(location, checkpoint_location)
            self._validate_trigger(trigger_dict)

            # Chain the builder calls so the configured writer is the one each call
            # returns, instead of relying on in-place mutation.
            stream_writer = (
                self._get_stream_writer(data_frame)
                .trigger(**trigger_dict)
                .format(format)
                .outputMode(mode)
                .options(**options)
                .option("checkpointLocation", checkpoint_location)
            )
            if partition_cols:
                stream_writer = stream_writer.partitionBy(partition_cols)

            query = stream_writer.start(location)
            if await_termination is True:
                query.awaitTermination()
        except Exception as e:
            self._log_operation(location, "failed", str(e))
            raise
        else:
            self._log_operation(location, "succeeded")

    def write(
        self,
        data_frame: DataFrame,
        location: str | None = None,
        format: str = "delta",
        partition_cols: list[str] | None = None,
        mode: str = "append",
        options: dict[str, Any] | None = None,
        **_: Any,
    ):
        """Writes a dataframe to specified location in specified format.

        Args:
            data_frame: The DataFrame to write.
            location: The location to write the DataFrame to.
            format: The format to write the DataFrame in.
            partition_cols: Columns to partition by.
            mode: The write mode.
            options: Additional options for writing.

        Raises:
            ValueError: If `location` is missing.
        """
        if not location:
            raise ValueError("Location is required for writing to file.")

        self._log_operation(location, "start")
        try:
            options = options or {}
            # Chain the builder calls so the configured writer is the one each call returns.
            df_writer = self._get_writer(data_frame).format(format).mode(mode)
            if partition_cols:
                df_writer = df_writer.partitionBy(partition_cols)
            df_writer.options(**options).save(str(location))
        except Exception as e:
            self._log_operation(location, "failed", str(e))
            raise
        else:
            self._log_operation(location, "succeeded")
@@ -0,0 +1,54 @@
1
+ from abc import ABC, abstractmethod
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from pyspark.sql import DataFrame
6
+
7
+ from ...logging import LoggerMixin
8
+
9
+
10
class BaseWriter(ABC, LoggerMixin):
    """Abstract base class for writers that persist DataFrames.

    Subclasses implement `write` and `write_stream`; this class provides the
    console logger and shared helpers.
    """

    def __init__(self):
        """Creates the console logger used by the writer."""
        self._console_logger = self.get_console_logger()

    @abstractmethod
    def write_stream(self, **kwargs: Any):
        """Writes a DataFrame stream."""

    @abstractmethod
    def write(
        self,
        data_frame: DataFrame,
        **kwargs: Any,
    ):
        """Writes a DataFrame."""

    def log_operation(self, operation: str, identifier: str | Path, status: str, error: str = ""):
        """Logs the metrics for one operation on the given identifier.

        Args:
            operation: Describes the type of operation, e.g. 'read_api'.
            identifier: An identifier for the object that's being interacted with.
            status: The status of the operation. Must be one of "start", "failed", "succeeded".
            error: The error message, if any. Defaults to ''.
        """
        self._console_logger.info(
            "operation:%s | identifier:%s | status:%s | error:%s",
            operation,
            identifier,
            status,
            error,
        )

    def _get_checkpoint_location(self, location: str, checkpoint_location: str | None) -> str:
        """Generates the checkpoint location if not provided.

        The default is a sibling of `location` whose name is prefixed with
        '_checkpoint_'.

        Args:
            location: The location that is being written to.
            checkpoint_location: An explicit checkpoint location; returned unchanged
                if it is not None.

        Returns:
            The checkpoint location to use.
        """
        if checkpoint_location is not None:
            return checkpoint_location

        scheme, separator, remainder = location.partition("://")
        if not separator:
            # No "scheme://" prefix: plain path handling, including the existing
            # normalization of a single-slash "abfss:/" prefix.
            location_path = Path(location)
            return str(location_path.parent / f"_checkpoint_{location_path.name}").replace("abfss:/", "abfss://")

        # `Path` would collapse "://" into ":/", so only the part after the scheme
        # is treated as a path and the scheme is re-attached afterwards. This keeps
        # URIs of any scheme (not just abfss) intact.
        remainder_path = Path(remainder)
        checkpoint_path = remainder_path.parent / f"_checkpoint_{remainder_path.name}"
        return f"{scheme}{separator}{checkpoint_path}"
File without changes
File without changes
@@ -1,13 +1,17 @@
1
+ from .catalog import Catalog
1
2
  from .column import Column
2
3
  from .constraint import Constraint
3
4
  from .foreign_key import ForeignKey
4
5
  from .schema import Schema
5
6
  from .table import Table
7
+ from .volume import Volume
6
8
 
7
9
  __all__ = [
10
+ "Catalog",
8
11
  "Column",
9
12
  "Constraint",
10
13
  "Table",
11
14
  "Schema",
12
15
  "ForeignKey",
16
+ "Volume",
13
17
  ]
@@ -0,0 +1,3 @@
1
+ from .unity_catalog_adapter import UnityCatalogAdapter
2
+
3
+ __all__ = ["UnityCatalogAdapter"]