cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +49 -10
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +64 -2
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
- cloe_nessy/models/column.py +3 -2
- cloe_nessy/models/schema.py +1 -0
- cloe_nessy/models/templates/create_table.sql.j2 +22 -0
- cloe_nessy/object_manager/table_manager.py +29 -7
- cloe_nessy/pipeline/actions/__init__.py +1 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/actions/transform_join.py +98 -24
- cloe_nessy/pipeline/actions/transform_union.py +2 -2
- cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
- cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
- cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1

cloe_nessy/integration/reader/excel_reader.py
CHANGED

@@ -3,7 +3,8 @@ from typing import Any
 
 import pandas as pd
 import pyspark.sql.functions as F
-
+
+from cloe_nessy.session import DataFrame
 
 from .reader import BaseReader
 
@@ -27,7 +28,6 @@ class ExcelDataFrameReader(BaseReader):
     def read(
         self,
         location: str,
-        *,
        sheet_name: str | int | list = 0,
        header: int | list[int] = 0,
        index_col: int | list[int] | None = None,
@@ -43,7 +43,7 @@
         options: dict | None = None,
         load_as_strings: bool = False,
         add_metadata_column: bool = False,
-        **
+        **_: Any,
     ) -> DataFrame:
         """Reads Excel file on specified location and returns DataFrame.
 
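The signature change above replaces the reader's previous keyword catch-all (shown truncated in this diff) with `**_: Any`, so extra keyword arguments passed by generic pipeline code are accepted and silently ignored. A minimal sketch of the pattern, not the actual ExcelDataFrameReader; the function name and the storage path are placeholders:

from typing import Any


def read(location: str, sheet_name: str | int = 0, **_: Any) -> None:
    """Illustrative stand-in: unexpected keyword arguments are captured by **_ and discarded."""
    print(f"reading {location!r}, sheet {sheet_name!r}")


# Options meant for other readers no longer raise TypeError here:
read("abfss://container@account.dfs.core.windows.net/data.xlsx", sheet_name="Sheet1", schema=None, options={})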

cloe_nessy/integration/reader/file_reader.py
CHANGED

@@ -1,10 +1,12 @@
 from typing import Any
 
 import pyspark.sql.functions as F
-from pyspark.sql import
+from pyspark.sql import DataFrameReader
 from pyspark.sql.streaming import DataStreamReader
 from pyspark.sql.types import StructType
 
+from cloe_nessy.session import DataFrame
+
 from ...file_utilities import get_file_paths
 from ..delta_loader.delta_load_options import DeltaLoadOptions
 from ..delta_loader.delta_loader_factory import DeltaLoaderFactory

cloe_nessy/integration/writer/catalog_writer.py
CHANGED

@@ -1,11 +1,11 @@
-from
+from cloe_nessy.session import DataFrame
 
 
 class CatalogWriter:
     """A writer for Catalog tables."""
 
     @staticmethod
-    def
+    def write(
         df: DataFrame | None,
         table_identifier: str | None,
         partition_by: str | list[str] | None = None,
@@ -46,3 +46,65 @@ class CatalogWriter:
         if options is None:
             options = {}
         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
+
+    @staticmethod
+    def write_stream(
+        df: DataFrame | None,
+        table_identifier: str | None,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        options: dict[str, str] | None = None,
+        mode: str = "append",
+        await_termination: bool = False,
+    ) -> None:
+        """Write a streaming DataFrame to a Unity Catalog table.
+
+        Args:
+            df: The streaming DataFrame to write.
+            table_identifier: The table identifier in the Unity Catalog in the
+                format 'catalog.schema.table'.
+            checkpoint_location: Location for checkpointing. Required for stream recovery.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+                Supported keys include:
+                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
+                - "once": Processes all available data once and then stops.
+                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
+                - "availableNow": Processes all available data immediately and then stops.
+                If nothing is provided, the default is {"availableNow": True}.
+            options: PySpark options for the DataFrame streaming write operation.
+            mode: The write mode. For streaming, typically "append".
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning.
+
+        Raises:
+            ValueError: If the mode is not supported for streaming operations.
+            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
+            ValueError: If the DataFrame is None.
+            ValueError: If checkpoint_location is not provided.
+        """
+        if mode not in ("append", "complete", "update"):
+            raise ValueError("mode must be one of append, complete, update for streaming operations")
+        if not table_identifier:
+            raise ValueError("table_identifier is required")
+        elif not isinstance(table_identifier, str):
+            raise ValueError("table_identifier must be a string")
+        elif len(table_identifier.split(".")) != 3:
+            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
+        if not df:
+            raise ValueError("df is required, but was None.")
+        if not checkpoint_location:
+            raise ValueError("checkpoint_location is required for streaming operations")
+
+        if options is None:
+            options = {}
+        if trigger_dict is None:
+            trigger_dict = {"availableNow": True}
+
+        stream_writer = df.writeStream.format("delta").outputMode(mode)
+        stream_writer.options(**options).option("checkpointLocation", checkpoint_location)
+        stream_writer.trigger(**trigger_dict)
+
+        query = stream_writer.toTable(table_identifier)
+
+        if await_termination:
+            query.awaitTermination()
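For orientation, a minimal usage sketch of the new write_stream under stated assumptions: the import path mirrors the module location shown above (the package may re-export it elsewhere), the streaming source uses Spark's built-in "rate" format purely for illustration, and the catalog, schema, table, and checkpoint names are placeholders.

from pyspark.sql import SparkSession

from cloe_nessy.integration.writer.catalog_writer import CatalogWriter

spark = SparkSession.builder.getOrCreate()

# Any streaming DataFrame works; the rate source is only used here for illustration.
events_df = spark.readStream.format("rate").load()

CatalogWriter.write_stream(
    df=events_df,
    table_identifier="my_catalog.my_schema.events",  # placeholder 'catalog.schema.table'
    checkpoint_location="/tmp/checkpoints/events",  # placeholder; required for stream recovery
    trigger_dict={"availableNow": True},  # same as the default when omitted
    mode="append",
    await_termination=True,
)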

cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py
CHANGED

@@ -196,7 +196,11 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
 
-        delta_table = self.table_manager.get_delta_table(
+        delta_table = self.table_manager.get_delta_table(
+            table=table,
+            location=storage_path if not table else None,
+            spark=data_frame.sparkSession,
+        )
 
         match_conditions = self._build_match_conditions(data_frame, config)
 
cloe_nessy/models/column.py
CHANGED

@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
+    "decimal",
     "integer",
     "int",
     "smallint",
@@ -31,7 +32,7 @@ class Column(BaseModel):
     nullable: bool = True
     default_value: Any = None
     generated: str | None = None
-
+    business_properties: dict[str, Any] = Field(default_factory=dict)
     comment: str | None = None
 
     @field_validator("data_type", mode="before")
@@ -43,7 +44,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-
+        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>, ]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
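The tightened validator now allows digits alongside parentheses, angle brackets, commas, and spaces, which is what lets parameterised types such as decimal(10,2) (with "decimal" newly added to COLUMN_DATA_TYPE_LIST) pass. A standalone sketch of the same check, not the Column validator itself; the function name is illustrative:

import re


def check_data_type(raw: str) -> str:
    """Reject data type strings containing characters outside the allowed set."""
    val = raw.lower()
    forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>, ]+", val)
    if forbidden_characters:
        raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
    return val


print(check_data_type("decimal(10,2)"))     # digits are now allowed
print(check_data_type("map<string, int>"))  # angle brackets and commas remain allowed
# check_data_type("decimal(10;2)")          # would raise ValueError because of the ';'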
cloe_nessy/models/schema.py
CHANGED

@@ -43,6 +43,7 @@ class Schema(ReadInstancesMixin):
             raise FileNotFoundError("Schema file not found.")
 
         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
+        table_errors: list[ValidationErrorType] = []
         if schema:
             schema.storage_path = "" if not schema.storage_path else schema.storage_path
             tables, table_errors = Table.read_instances_from_directory(

cloe_nessy/models/templates/create_table.sql.j2
CHANGED

@@ -13,6 +13,14 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
+{% if table.properties %}
+TBLPROPERTIES (
+{%- for key, value in table.properties.items() %}
+{%- if not loop.first %}, {% endif -%}
+'{{key}}' = '{{value}}'
+{%- endfor -%}
+)
+{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
@@ -34,3 +42,17 @@ ALTER TABLE {{ table.escaped_identifier }} ADD CONSTRAINT {{constraint.name}} CH
 {%- if table.comment %}
 COMMENT ON TABLE {{ table.escaped_identifier }} IS '{{ table.comment }}';
 {%- endif %}
+{# Tags do not yet work in Databricks
+{%- if table.business_properties %}
+{%- for tag_key, tag_value in table.business_properties.items() %}
+SET TAG ON TABLE {{ table.escaped_identifier }} `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+
+{%- for column in table.columns %}
+{%- if column.business_properties %}
+{%- for tag_key, tag_value in column.business_properties.items() %}
+SET TAG ON COLUMN {{ table.escaped_identifier }}.`{{ column.name }}` `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
+{%- endfor %}
+{%- endif %}
+{%- endfor %} #}
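To see what the new TBLPROPERTIES block emits, here is a rendering sketch of just that fragment using plain jinja2; the stand-in table dict and the property values are illustrative, not the package's models. The SET TAG statements for business_properties above are added only inside a Jinja comment, so they are not rendered yet.

from jinja2 import Template

# Stand-in for the TBLPROPERTIES fragment added to create_table.sql.j2.
snippet = Template(
    "{% if table.properties %}"
    "TBLPROPERTIES ("
    "{%- for key, value in table.properties.items() %}"
    "{%- if not loop.first %}, {% endif -%}"
    "'{{key}}' = '{{value}}'"
    "{%- endfor -%}"
    ")"
    "{% endif %}"
)

table = {"properties": {"delta.enableChangeDataFeed": "true", "quality": "silver"}}
print(snippet.render(table=table))
# TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true', 'quality' = 'silver')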

cloe_nessy/object_manager/table_manager.py
CHANGED

@@ -110,7 +110,7 @@ class TableManager(LoggerMixin):
         self._spark.sql(f"USE CATALOG {table.catalog};")
         self._spark.sql(f"USE SCHEMA {table.schema};")
         for statement in table.get_create_statement(replace=replace).split(";"):
-            if statement and statement
+            if statement and statement.strip():
                 self._spark.sql(statement)
 
     def drop_table(
@@ -186,6 +186,9 @@ class TableManager(LoggerMixin):
     def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.
 
+        For managed tables, uses the table identifier to access the DeltaTable.
+        For external tables or when a location is provided, uses the storage path.
+
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
@@ -195,17 +198,35 @@
             The DeltaTable object corresponding to the given Table object or location string.
 
         Raises:
-            ValueError: If neither table nor location is provided
+            ValueError: If neither table nor location is provided.
         """
-        if
+        if table is None and location is None:
             raise ValueError(
-                f"Either table or location must be provided
+                f"Either table or location must be provided. Table: {table}, location: {location}",
+            )
+
+        spark_session = spark or self._spark
+
+        if table is not None and location is not None:
+            self._console_logger.info(
+                f"Both table ({table.identifier}) and location ({location}) provided. Using table object as priority."
             )
 
         if table is not None:
-
+            if table.is_external is False:
+                self._console_logger.info(f"Getting DeltaTable object for managed table: {table.identifier}")
+                return DeltaTable.forName(spark_session, table.identifier)
+
+            table_location = str(table.storage_path)
+            self._console_logger.info(f"Getting DeltaTable object for external table location: {table_location}")
+            return DeltaTable.forPath(spark_session, table_location)
+
+        self._console_logger.info(f"No table object provided, using location: {location}")
+        if location is None:
+            self._console_logger.error("Location is None - this should not happen!")
+            raise ValueError("Location cannot be None when no table object is provided")
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(
+        return DeltaTable.forPath(spark_session, str(location))
 
     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
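The resolution order introduced above (managed tables by catalog identifier, external tables by storage path, otherwise the explicit location string) can be summarised in a standalone sketch. TableStub and resolve_delta_table are simplified stand-ins for the package's Table model and TableManager.get_delta_table, with logging and the table/location priority message omitted.

from dataclasses import dataclass

from delta.tables import DeltaTable
from pyspark.sql import SparkSession


@dataclass
class TableStub:
    """Minimal stand-in for the package's Table model (illustrative only)."""

    identifier: str
    storage_path: str | None = None
    is_external: bool = False


def resolve_delta_table(
    spark: SparkSession, table: TableStub | None = None, location: str | None = None
) -> DeltaTable:
    if table is None and location is None:
        raise ValueError("Either table or location must be provided.")
    if table is not None:
        if table.is_external is False:
            # Managed table: resolve through the catalog identifier.
            return DeltaTable.forName(spark, table.identifier)
        # External table: resolve through its storage path.
        return DeltaTable.forPath(spark, str(table.storage_path))
    # No table object: fall back to the explicit location string.
    return DeltaTable.forPath(spark, str(location))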
@@ -235,9 +256,10 @@ class TableManager(LoggerMixin):
             raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
 
         query_result = self._spark.sql(
+            # Using both upper and lower case to ensure compatibility with case changes in Databricks
             f"""
             SELECT 1 FROM {catalog}.information_schema.tables
-            WHERE table_name
+            WHERE table_name in ('{table_name}', '{table_name.lower()}')
             AND table_schema = '{schema}'
             LIMIT 1""",
         )
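The existence check now probes information_schema with both the original and the lower-cased table name to tolerate case changes in Databricks. A small sketch of the query it builds, with placeholder identifiers (not the package's method):

def exists_query(catalog: str, schema: str, table_name: str) -> str:
    """Build the information_schema probe used by the existence check (illustrative)."""
    return f"""
    SELECT 1 FROM {catalog}.information_schema.tables
    WHERE table_name in ('{table_name}', '{table_name.lower()}')
    AND table_schema = '{schema}'
    LIMIT 1"""


print(exists_query("my_catalog", "my_schema", "MyTable"))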

cloe_nessy/pipeline/actions/__init__.py
CHANGED

@@ -33,7 +33,7 @@ pipeline_actions = {cls.name: cls for cls in PipelineAction.__subclasses__()}
 # Register all subclasses dynamically as enum using their "name" attribute as
 # key. We need to do this here, because otherwise we don't get all subclasses
 # from a relative import of PipelineAction
-PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore
+PipelineActionType = Enum("PipelineActionType", pipeline_actions)  # type: ignore[misc]
 
 __all__ = [
     "ReadAPIAction",