cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl

Files changed (39)
  1. cloe_nessy/clients/api_client/__init__.py +10 -1
  2. cloe_nessy/clients/api_client/api_client.py +19 -8
  3. cloe_nessy/clients/api_client/api_response.py +7 -4
  4. cloe_nessy/clients/api_client/pagination_config.py +84 -0
  5. cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
  6. cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
  7. cloe_nessy/integration/reader/__init__.py +2 -2
  8. cloe_nessy/integration/reader/api_reader.py +463 -72
  9. cloe_nessy/integration/reader/catalog_reader.py +49 -10
  10. cloe_nessy/integration/reader/excel_reader.py +3 -3
  11. cloe_nessy/integration/reader/file_reader.py +3 -1
  12. cloe_nessy/integration/reader/reader.py +1 -1
  13. cloe_nessy/integration/writer/catalog_writer.py +64 -2
  14. cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
  15. cloe_nessy/models/column.py +3 -2
  16. cloe_nessy/models/schema.py +1 -0
  17. cloe_nessy/models/templates/create_table.sql.j2 +22 -0
  18. cloe_nessy/object_manager/table_manager.py +29 -7
  19. cloe_nessy/pipeline/actions/__init__.py +1 -1
  20. cloe_nessy/pipeline/actions/read_api.py +272 -75
  21. cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
  22. cloe_nessy/pipeline/actions/read_excel.py +1 -1
  23. cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
  24. cloe_nessy/pipeline/actions/transform_decode.py +2 -1
  25. cloe_nessy/pipeline/actions/transform_join.py +98 -24
  26. cloe_nessy/pipeline/actions/transform_union.py +2 -2
  27. cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
  28. cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
  29. cloe_nessy/pipeline/pipeline_config.py +2 -0
  30. cloe_nessy/pipeline/pipeline_context.py +1 -1
  31. cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
  32. cloe_nessy/pipeline/pipeline_step.py +2 -0
  33. cloe_nessy/session/__init__.py +2 -1
  34. cloe_nessy/session/pyspark_compat.py +15 -0
  35. cloe_nessy/session/session_manager.py +1 -1
  36. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
  37. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
  38. {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
  39. cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1
cloe_nessy/integration/reader/excel_reader.py

@@ -3,7 +3,8 @@ from typing import Any
 
 import pandas as pd
 import pyspark.sql.functions as F
-from pyspark.sql import DataFrame
+
+from cloe_nessy.session import DataFrame
 
 from .reader import BaseReader
 
@@ -27,7 +28,6 @@ class ExcelDataFrameReader(BaseReader):
     def read(
         self,
         location: str,
-        *,
         sheet_name: str | int | list = 0,
         header: int | list[int] = 0,
         index_col: int | list[int] | None = None,
@@ -43,7 +43,7 @@ class ExcelDataFrameReader(BaseReader):
         options: dict | None = None,
         load_as_strings: bool = False,
         add_metadata_column: bool = False,
-        **kwargs: Any,
+        **_: Any,
     ) -> DataFrame:
         """Reads Excel file on specified location and returns DataFrame.
 
cloe_nessy/integration/reader/file_reader.py

@@ -1,10 +1,12 @@
 from typing import Any
 
 import pyspark.sql.functions as F
-from pyspark.sql import DataFrame, DataFrameReader
+from pyspark.sql import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
 from pyspark.sql.types import StructType
 
+from cloe_nessy.session import DataFrame
+
 from ...file_utilities import get_file_paths
 from ..delta_loader.delta_load_options import DeltaLoadOptions
 from ..delta_loader.delta_loader_factory import DeltaLoaderFactory
cloe_nessy/integration/reader/reader.py

@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
-from pyspark.sql import DataFrame, SparkSession
+from cloe_nessy.session import DataFrame, SparkSession
 
 from ...logging.logger_mixin import LoggerMixin
 from ...session import SessionManager
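Note: the recurring import change above (and in several later files) swaps "from pyspark.sql import DataFrame" for "from cloe_nessy.session import DataFrame", in line with the new cloe_nessy/session/pyspark_compat.py module added in this release (item 34 in the file list). The diff does not show that module's body; the snippet below is only a hedged sketch of what such a compatibility re-export typically looks like, and the Spark Connect fallback is an assumption.

    # Hypothetical sketch of a pyspark_compat-style shim; the actual contents of
    # cloe_nessy/session/pyspark_compat.py are not shown in this diff.
    try:
        # Spark Connect / Databricks Connect sessions ship their own classes.
        from pyspark.sql.connect.dataframe import DataFrame
        from pyspark.sql.connect.session import SparkSession
    except ImportError:
        # Fall back to the classic PySpark implementations.
        from pyspark.sql import DataFrame, SparkSession

    __all__ = ["DataFrame", "SparkSession"]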
cloe_nessy/integration/writer/catalog_writer.py

@@ -1,11 +1,11 @@
-from pyspark.sql import DataFrame
+from cloe_nessy.session import DataFrame
 
 
 class CatalogWriter:
     """A writer for Catalog tables."""
 
     @staticmethod
-    def write_table(
+    def write(
         df: DataFrame | None,
         table_identifier: str | None,
         partition_by: str | list[str] | None = None,
@@ -46,3 +46,65 @@ class CatalogWriter:
         if options is None:
             options = {}
         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
+
+    @staticmethod
+    def write_stream(
+        df: DataFrame | None,
+        table_identifier: str | None,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        options: dict[str, str] | None = None,
+        mode: str = "append",
+        await_termination: bool = False,
+    ) -> None:
+        """Write a streaming DataFrame to a Unity Catalog table.
+
+        Args:
+            df: The streaming DataFrame to write.
+            table_identifier: The table identifier in the Unity Catalog in the
+                format 'catalog.schema.table'.
+            checkpoint_location: Location for checkpointing. Required for stream recovery.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+                Supported keys include:
+                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
+                - "once": Processes all available data once and then stops.
+                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
+                - "availableNow": Processes all available data immediately and then stops.
+                If nothing is provided, the default is {"availableNow": True}.
+            options: PySpark options for the DataFrame streaming write operation.
+            mode: The write mode. For streaming, typically "append".
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning.
+
+        Raises:
+            ValueError: If the mode is not supported for streaming operations.
+            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
+            ValueError: If the DataFrame is None.
+            ValueError: If checkpoint_location is not provided.
+        """
+        if mode not in ("append", "complete", "update"):
+            raise ValueError("mode must be one of append, complete, update for streaming operations")
+        if not table_identifier:
+            raise ValueError("table_identifier is required")
+        elif not isinstance(table_identifier, str):
+            raise ValueError("table_identifier must be a string")
+        elif len(table_identifier.split(".")) != 3:
+            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
+        if not df:
+            raise ValueError("df is required, but was None.")
+        if not checkpoint_location:
+            raise ValueError("checkpoint_location is required for streaming operations")
+
+        if options is None:
+            options = {}
+        if trigger_dict is None:
+            trigger_dict = {"availableNow": True}
+
+        stream_writer = df.writeStream.format("delta").outputMode(mode)
+        stream_writer.options(**options).option("checkpointLocation", checkpoint_location)
+        stream_writer.trigger(**trigger_dict)
+
+        query = stream_writer.toTable(table_identifier)
+
+        if await_termination:
+            query.awaitTermination()
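A hedged usage sketch of the new CatalogWriter.write_stream based on the signature above; the source stream, table identifier, and checkpoint path are placeholders, and spark is assumed to be an existing SparkSession.

    from cloe_nessy.integration.writer.catalog_writer import CatalogWriter

    # Placeholder streaming source; any streaming DataFrame works here.
    events = spark.readStream.format("delta").table("bronze.sales.events")

    CatalogWriter.write_stream(
        df=events,
        table_identifier="silver.sales.events",  # 'catalog.schema.table'
        checkpoint_location="abfss://checkpoints@account.dfs.core.windows.net/silver_events",
        trigger_dict={"availableNow": True},     # default when omitted
        options={"mergeSchema": "true"},
        mode="append",
        await_termination=True,
    )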
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py

@@ -196,7 +196,11 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
 
-        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)
+        delta_table = self.table_manager.get_delta_table(
+            table=table,
+            location=storage_path if not table else None,
+            spark=data_frame.sparkSession,
+        )
 
         match_conditions = self._build_match_conditions(data_frame, config)
 
cloe_nessy/models/column.py

@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
+    "decimal",
     "integer",
     "int",
     "smallint",
@@ -31,7 +32,7 @@ class Column(BaseModel):
     nullable: bool = True
     default_value: Any = None
     generated: str | None = None
-    properties: dict[str, Any] = Field(default_factory=dict)
+    business_properties: dict[str, Any] = Field(default_factory=dict)
     comment: str | None = None
 
     @field_validator("data_type", mode="before")
@@ -43,7 +44,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)
+        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>, ]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
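The practical effect of the widened pattern: the old expression treated digits, commas, and opening parentheses as forbidden, so parameterized types such as decimal(10,2) could never validate. A small standalone check of the new pattern (the sample type strings are illustrative):

    import re

    # The updated validator pattern: lowercase letters, digits, parentheses,
    # angle brackets, commas, and spaces are allowed; everything else is flagged.
    FORBIDDEN = r"[^a-z0-9\(\)\<\>, ]+"

    for raw in ("decimal(10,2)", "array<string>", "varchar(255);drop"):
        bad = re.findall(FORBIDDEN, raw.lower())
        print(raw, "->", "ok" if not bad else f"forbidden: {bad}")
    # decimal(10,2) -> ok
    # array<string> -> ok
    # varchar(255);drop -> forbidden: [';']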
cloe_nessy/models/schema.py

@@ -43,6 +43,7 @@ class Schema(ReadInstancesMixin):
             raise FileNotFoundError("Schema file not found.")
 
         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
+        table_errors: list[ValidationErrorType] = []
         if schema:
             schema.storage_path = "" if not schema.storage_path else schema.storage_path
             tables, table_errors = Table.read_instances_from_directory(
cloe_nessy/models/templates/create_table.sql.j2

@@ -13,6 +13,14 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
+{% if table.properties %}
+TBLPROPERTIES (
+{%- for key, value in table.properties.items() %}
+{%- if not loop.first %}, {% endif -%}
+'{{key}}' = '{{value}}'
+{%- endfor -%}
+)
+{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
@@ -34,3 +42,17 @@ ALTER TABLE {{ table.escaped_identifier }} ADD CONSTRAINT {{constraint.name}} CH
 {%- if table.comment %}
 COMMENT ON TABLE {{ table.escaped_identifier }} IS '{{ table.comment }}';
 {%- endif %}
+{# Tags do not yet work in Databricks
+{%- if table.business_properties %}
+{%- for tag_key, tag_value in table.business_properties.items() %}
+SET TAG ON TABLE {{ table.escaped_identifier }} `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+
+{%- for column in table.columns %}
+{%- if column.business_properties %}
+{%- for tag_key, tag_value in column.business_properties.items() %}
+SET TAG ON COLUMN {{ table.escaped_identifier }}.`{{ column.name }}` `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+{%- endfor %} #}
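For reference, a hedged sketch of how the new TBLPROPERTIES block renders. This re-creates the fragment inline rather than loading the packaged create_table.sql.j2, and the property key used is illustrative only.

    from jinja2 import Template

    # Inline re-creation of the TBLPROPERTIES fragment added above; not the
    # packaged template itself.
    fragment = Template(
        "TBLPROPERTIES ("
        "{% for key, value in properties.items() %}"
        "{% if not loop.first %}, {% endif %}"
        "'{{ key }}' = '{{ value }}'"
        "{% endfor %})"
    )

    print(fragment.render(properties={"delta.enableChangeDataFeed": "true"}))
    # TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')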
cloe_nessy/object_manager/table_manager.py

@@ -110,7 +110,7 @@ class TableManager(LoggerMixin):
         self._spark.sql(f"USE CATALOG {table.catalog};")
         self._spark.sql(f"USE SCHEMA {table.schema};")
         for statement in table.get_create_statement(replace=replace).split(";"):
-            if statement and statement != "\n":
+            if statement and statement.strip():
                 self._spark.sql(statement)
 
     def drop_table(
@@ -186,6 +186,9 @@ class TableManager(LoggerMixin):
     def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.
 
+        For managed tables, uses the table identifier to access the DeltaTable.
+        For external tables or when a location is provided, uses the storage path.
+
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
@@ -195,17 +198,35 @@ class TableManager(LoggerMixin):
             The DeltaTable object corresponding to the given Table object or location string.
 
         Raises:
-            ValueError: If neither table nor location is provided, or if both are provided.
+            ValueError: If neither table nor location is provided.
         """
-        if (table is None and location is None) or (table is not None and location is not None):
+        if table is None and location is None:
             raise ValueError(
-                f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
+                f"Either table or location must be provided. Table: {table}, location: {location}",
+            )
+
+        spark_session = spark or self._spark
+
+        if table is not None and location is not None:
+            self._console_logger.info(
+                f"Both table ({table.identifier}) and location ({location}) provided. Using table object as priority."
             )
 
         if table is not None:
-            location = str(table.storage_path)
+            if table.is_external is False:
+                self._console_logger.info(f"Getting DeltaTable object for managed table: {table.identifier}")
+                return DeltaTable.forName(spark_session, table.identifier)
+
+            table_location = str(table.storage_path)
+            self._console_logger.info(f"Getting DeltaTable object for external table location: {table_location}")
+            return DeltaTable.forPath(spark_session, table_location)
+
+        self._console_logger.info(f"No table object provided, using location: {location}")
+        if location is None:
+            self._console_logger.error("Location is None - this should not happen!")
+            raise ValueError("Location cannot be None when no table object is provided")
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(spark or self._spark, str(location))
+        return DeltaTable.forPath(spark_session, str(location))
 
     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
@@ -235,9 +256,10 @@ class TableManager(LoggerMixin):
             raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
 
         query_result = self._spark.sql(
+            # Using both upper and lower case to ensure compatibility with case changes in Databricks
             f"""
             SELECT 1 FROM {catalog}.information_schema.tables
-            WHERE table_name = '{table_name}'
+            WHERE table_name in ('{table_name}', '{table_name.lower()}')
             AND table_schema = '{schema}'
             LIMIT 1""",
         )
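Illustrative calls against the revised get_delta_table resolution shown above, assuming an existing TableManager instance named manager and a placeholder Table object my_table:

    # Managed table (is_external is False): resolved by identifier via DeltaTable.forName.
    dt = manager.get_delta_table(table=my_table)

    # Providing both no longer raises; the table object takes priority and the
    # location argument is effectively ignored.
    dt = manager.get_delta_table(table=my_table, location="abfss://data@account.dfs.core.windows.net/orders")

    # Location only: resolved via DeltaTable.forPath, as before.
    dt = manager.get_delta_table(location="abfss://data@account.dfs.core.windows.net/orders")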
cloe_nessy/pipeline/actions/__init__.py

@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
 # Register all subclasses dynamically as enum using their "name" attribute as
 # key. We need to do this here, because otherwise we don't get all subclasses
 # from a relative import of PipelineAction
-PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore
+PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore[misc]
 
 __all__ = [
     "ReadAPIAction",