cloe-nessy 0.3.16.5b0__py3-none-any.whl → 0.3.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/reader/catalog_reader.py +0 -36
- cloe_nessy/integration/writer/catalog_writer.py +1 -63
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +1 -5
- cloe_nessy/models/column.py +2 -3
- cloe_nessy/models/schema.py +0 -1
- cloe_nessy/models/templates/create_table.sql.j2 +0 -22
- cloe_nessy/object_manager/table_manager.py +7 -28
- cloe_nessy/pipeline/actions/read_catalog_table.py +10 -32
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +33 -61
- cloe_nessy/pipeline/actions/transform_join.py +24 -98
- cloe_nessy/pipeline/actions/transform_union.py +2 -2
- cloe_nessy/pipeline/actions/write_catalog_table.py +19 -64
- cloe_nessy/pipeline/actions/write_delta_merge.py +0 -1
- cloe_nessy/session/session_manager.py +10 -10
- {cloe_nessy-0.3.16.5b0.dist-info → cloe_nessy-0.3.16.6.dist-info}/METADATA +17 -18
- {cloe_nessy-0.3.16.5b0.dist-info → cloe_nessy-0.3.16.6.dist-info}/RECORD +18 -18
- {cloe_nessy-0.3.16.5b0.dist-info → cloe_nessy-0.3.16.6.dist-info}/WHEEL +2 -1
- cloe_nessy-0.3.16.6.dist-info/top_level.txt +1 -0
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +0 -87
cloe_nessy/integration/reader/catalog_reader.py CHANGED

@@ -50,39 +50,3 @@ class CatalogReader(BaseReader):
             raise ReadOperationFailedError(
                 f"An error occurred while reading the table '{table_identifier}': {err}"
             ) from err
-
-    def read_stream(
-        self, table_identifier: str = "", *, options: dict[str, str] | None = None, **kwargs: Any
-    ) -> DataFrame:
-        """Reads a streaming table from the Unity Catalog.
-
-        Args:
-            table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-            options: PySpark options for the read stream operation.
-            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
-
-        Returns:
-            The Spark Streaming DataFrame containing the read data.
-
-        Raises:
-            ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
-            Exception: For any other unexpected errors during streaming read operation.
-        """
-        if options is None:
-            options = {}
-        if not table_identifier:
-            raise ValueError("table_identifier is required")
-        if not isinstance(table_identifier, str):
-            raise ValueError("table_identifier must be a string")
-        if len(table_identifier.split(".")) != 3:
-            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
-
-        try:
-            df = self._spark.readStream.table(table_identifier, **options)
-            return df
-        except AnalysisException as err:
-            raise ValueError(f"Table not found or not streamable: {table_identifier}") from err
-        except Exception as err:
-            raise ReadOperationFailedError(
-                f"An error occurred while reading the stream from table '{table_identifier}': {err}"
-            ) from err
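With `read_stream` removed, `CatalogReader` is batch-only. A minimal usage sketch of the surviving `read` call, mirroring how the READ_CATALOG_TABLE action invokes it further down; the import path and table identifier are assumptions for illustration:

    from cloe_nessy.integration.reader import CatalogReader  # assumed import path

    reader = CatalogReader()
    # Batch read of a Unity Catalog table; `options` is an optional dict forwarded to the reader.
    df = reader.read(
        table_identifier="my_catalog.my_schema.my_table",  # placeholder identifier
        options={},
    )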
cloe_nessy/integration/writer/catalog_writer.py CHANGED

@@ -5,7 +5,7 @@ class CatalogWriter:
     """A writer for Catalog tables."""

     @staticmethod
-    def write(
+    def write_table(
         df: DataFrame | None,
         table_identifier: str | None,
         partition_by: str | list[str] | None = None,
@@ -46,65 +46,3 @@ class CatalogWriter:
         if options is None:
             options = {}
         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
-
-    @staticmethod
-    def write_stream(
-        df: DataFrame | None,
-        table_identifier: str | None,
-        checkpoint_location: str | None = None,
-        trigger_dict: dict | None = None,
-        options: dict[str, str] | None = None,
-        mode: str = "append",
-        await_termination: bool = False,
-    ) -> None:
-        """Write a streaming DataFrame to a Unity Catalog table.
-
-        Args:
-            df: The streaming DataFrame to write.
-            table_identifier: The table identifier in the Unity Catalog in the
-                format 'catalog.schema.table'.
-            checkpoint_location: Location for checkpointing. Required for stream recovery.
-            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
-                Supported keys include:
-                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
-                - "once": Processes all available data once and then stops.
-                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
-                - "availableNow": Processes all available data immediately and then stops.
-                If nothing is provided, the default is {"availableNow": True}.
-            options: PySpark options for the DataFrame streaming write operation.
-            mode: The write mode. For streaming, typically "append".
-            await_termination: If True, the function will wait for the streaming
-                query to finish before returning.
-
-        Raises:
-            ValueError: If the mode is not supported for streaming operations.
-            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
-            ValueError: If the DataFrame is None.
-            ValueError: If checkpoint_location is not provided.
-        """
-        if mode not in ("append", "complete", "update"):
-            raise ValueError("mode must be one of append, complete, update for streaming operations")
-        if not table_identifier:
-            raise ValueError("table_identifier is required")
-        elif not isinstance(table_identifier, str):
-            raise ValueError("table_identifier must be a string")
-        elif len(table_identifier.split(".")) != 3:
-            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
-        if not df:
-            raise ValueError("df is required, but was None.")
-        if not checkpoint_location:
-            raise ValueError("checkpoint_location is required for streaming operations")
-
-        if options is None:
-            options = {}
-        if trigger_dict is None:
-            trigger_dict = {"availableNow": True}
-
-        stream_writer = df.writeStream.format("delta").outputMode(mode)
-        stream_writer.options(**options).option("checkpointLocation", checkpoint_location)
-        stream_writer.trigger(**trigger_dict)
-
-        query = stream_writer.toTable(table_identifier)
-
-        if await_termination:
-            query.awaitTermination()
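Batch writes now go through the renamed static `write_table` (formerly `write`); `write_stream` is removed entirely. A hedged call sketch matching how the WRITE_CATALOG_TABLE action uses it further down; the identifier and option values are placeholders:

    from cloe_nessy.integration.writer import CatalogWriter

    # write_table is a @staticmethod that ends in df.write.saveAsTable(...), as shown above.
    CatalogWriter.write_table(
        df=df,  # an existing batch DataFrame
        table_identifier="my_catalog.my_schema.my_table",  # placeholder identifier
        partition_by="day",
        mode="append",
        options={"mergeSchema": "true"},  # forwarded as **options to saveAsTable
    )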
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py CHANGED

@@ -196,11 +196,7 @@ class DeltaMergeWriter(BaseDeltaWriter):

         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)

-        delta_table = self.table_manager.get_delta_table(
-            table=table,
-            location=storage_path,
-            spark=data_frame.sparkSession,
-        )
+        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)

         match_conditions = self._build_match_conditions(data_frame, config)

cloe_nessy/models/column.py CHANGED

@@ -5,7 +5,6 @@ from pydantic import BaseModel, Field, field_validator, model_validator

 COLUMN_DATA_TYPE_LIST = {
     "string",
-    "decimal",
     "integer",
     "int",
     "smallint",
@@ -32,7 +31,7 @@ class Column(BaseModel):
     nullable: bool
     default_value: Any = None
     generated: str | None = None
-
+    properties: dict[str, Any] = Field(default_factory=dict)
     comment: str | None = None

     @field_validator("data_type", mode="before")
@@ -44,7 +43,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-
+        forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)

         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
cloe_nessy/models/schema.py CHANGED

@@ -43,7 +43,6 @@ class Schema(ReadInstancesMixin):
             raise FileNotFoundError("Schema file not found.")

         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
-        table_errors: list[ValidationErrorType] = []
         if schema:
             schema.storage_path = "" if not schema.storage_path else schema.storage_path
             tables, table_errors = Table.read_instances_from_directory(
cloe_nessy/models/templates/create_table.sql.j2 CHANGED

@@ -13,14 +13,6 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
-{% if table.properties %}
-TBLPROPERTIES (
-    {%- for key, value in table.properties.items() %}
-    {%- if not loop.first %}, {% endif -%}
-    '{{key}}' = '{{value}}'
-    {%- endfor -%}
-)
-{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
@@ -42,17 +34,3 @@ ALTER TABLE {{ table.escaped_identifier }} ADD CONSTRAINT {{constraint.name}} CH
 {%- if table.comment %}
 COMMENT ON TABLE {{ table.escaped_identifier }} IS '{{ table.comment }}';
 {%- endif %}
-{# Tags do not yet work in Databricks
-{%- if table.business_properties %}
-{%- for tag_key, tag_value in table.business_properties.items() %}
-SET TAG ON TABLE {{ table.escaped_identifier }} `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
-{%- endfor %}
-{%- endif %}
-
-{%- for column in table.columns %}
-{%- if column.business_properties %}
-{%- for tag_key, tag_value in column.business_properties.items() %}
-SET TAG ON COLUMN {{ table.escaped_identifier }}.`{{ column.name }}` `{{ tag_key }}`{% if tag_value %} = `{{ tag_value }}`{% endif %};
-{%- endfor %}
-{%- endif %}
-{%- endfor %} #}
cloe_nessy/object_manager/table_manager.py CHANGED

@@ -186,9 +186,6 @@ class TableManager(LoggerMixin):
     def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.

-        For managed tables, uses the table identifier to access the DeltaTable.
-        For external tables or when a location is provided, uses the storage path.
-
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
@@ -198,34 +195,17 @@ class TableManager(LoggerMixin):
             The DeltaTable object corresponding to the given Table object or location string.

         Raises:
-            ValueError: If neither table nor location is provided.
+            ValueError: If neither table nor location is provided, or if both are provided.
         """
-        if table is None and location is None:
-
-
-
-        spark_session = spark or self._spark
-
-        if table is not None and location is not None:
-            self._console_logger.info(
-                f"Both table ({table.identifier}) and location ({location}) provided. Using table object as priority."
+        if (table is None and location is None) or (table is not None and location is not None):
+            raise ValueError(
+                f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
             )

         if table is not None:
-
-            self._console_logger.info(f"Getting DeltaTable object for managed table: {table.identifier}")
-            return DeltaTable.forName(spark_session, table.identifier)
-
-            table_location = str(table.storage_path)
-            self._console_logger.info(f"Getting DeltaTable object for external table location: {table_location}")
-            return DeltaTable.forPath(spark_session, table_location)
-
-        self._console_logger.info(f"No table object provided, using location: {location}")
-        if location is None:
-            self._console_logger.error("Location is None - this should not happen!")
-            raise ValueError("Location cannot be None when no table object is provided")
+            location = str(table.storage_path)
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(
+        return DeltaTable.forPath(spark or self._spark, str(location))

     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
@@ -255,10 +235,9 @@ class TableManager(LoggerMixin):
             raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")

         query_result = self._spark.sql(
-            # Using both upper and lower case to ensure compatibility with case changes in Databricks
             f"""
             SELECT 1 FROM {catalog}.information_schema.tables
-            WHERE table_name
+            WHERE table_name = '{table_name}'
             AND table_schema = '{schema}'
             LIMIT 1""",
         )
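`get_delta_table` now demands exactly one of `table` or `location` and always resolves through `DeltaTable.forPath`. A hedged sketch of the new contract; the location string is a placeholder:

    from cloe_nessy.object_manager import TableManager  # import path as used in the old write_catalog_table.py below

    manager = TableManager()

    # Pass a location string directly ...
    delta_table = manager.get_delta_table(location="/mnt/lake/sales_table")  # placeholder path
    # ... or a Table object, whose storage_path is used:
    # delta_table = manager.get_delta_table(table=my_table)
    # Passing both, or neither, raises ValueError.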
cloe_nessy/pipeline/actions/read_catalog_table.py CHANGED

@@ -15,23 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
     into a DataFrame and returned as part of an updated `PipelineContext`.

     Example:
-
-
-
-
-
-
-
-        ```
-    === "Streaming Read"
-        ```yaml
-        Read Sales Table Stream:
-            action: READ_CATALOG_TABLE
-            options:
-                table_identifier: my_catalog.business_schema.sales_table
-                stream: true
-                options: <options for the CatalogReader read_stream method>
-        ```
+        ```yaml
+        Read Sales Table:
+            action: READ_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                options: <options for the CatalogReader read method>
+        ```
     """

     name: str = "READ_CATALOG_TABLE"
@@ -42,7 +32,6 @@ class ReadCatalogTableAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         options: dict[str, str] | None = None,
-        stream: bool = False,
         **_: Any,  # define kwargs to match the base class signature
     ) -> PipelineContext:
         """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
@@ -55,13 +44,7 @@ class ReadCatalogTableAction(PipelineAction):
                 identifier from the `table_metadata` in the `context`.
             options: A dictionary of options for customizing
                 the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
-                behavior, such as filters or reading modes.
-            stream: If True, the action will read the table as a stream.
-            checkpoint_location: The location for storing
-                checkpoints if streaming is enabled.
-            trigger_dict: A dictionary specifying the trigger
-                configuration for the streaming query, such as processing time or
-                continuous processing.
+                behavior, such as filters or reading modes. Defaults to None.

         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -78,10 +61,5 @@ class ReadCatalogTableAction(PipelineAction):
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")

         table_reader = CatalogReader()
-
-
-            df = table_reader.read_stream(table_identifier=table_identifier, options=options)
-        else:
-            df = table_reader.read(table_identifier=table_identifier, options=options)
-
-        return context.from_existing(data=df, runtime_info=context.runtime_info)
+        df = table_reader.read(table_identifier=table_identifier, options=options)
+        return context.from_existing(data=df)
cloe_nessy/pipeline/actions/read_metadata_yaml.py CHANGED

@@ -1,94 +1,66 @@
-
+import pathlib
 from typing import Any

-from ...models import
+from ...models import Schema
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext


 class ReadMetadataYAMLAction(PipelineAction):
-    """Reads
+    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

     Example:
-
-
-
-
-
-
-
-
-        ```
-    === "External Table"
-        ```yaml
-        Read Table Metadata:
-            action: READ_METADATA_YAML_ACTION
-            options:
-                file_path: metadata/schemas/bronze/sales_table.yml
-                catalog_name: production
-                schema_name: sales_data
-                storage_path: abfs://external_storage/sales_data/sales_table
-        ```
+        ```yaml
+        Read Schema Metadata:
+            action: READ_METADATA_YAML_ACTION
+            options:
+                path: excel_file_folder/excel_files_june/
+                file_name: sales_schema.yml
+                table_name: sales
+        ```
     """

     name: str = "READ_METADATA_YAML_ACTION"

+    @staticmethod
     def run(
-        self,
         context: PipelineContext,
         *,
-
-
-
-        storage_path: str | None = None,
+        path: str | None = None,
+        file_name: str | None = None,
+        table_name: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads
+        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.

         Args:
             context: The context in which this Action is executed.
-
-
-
-            storage_path: The storage path for the table, if applicable. If not
-                provided, the table will be considered a managed table.
+            path: The path to the data contract directory.
+            file_name: The name of the file that defines the schema.
+            table_name: The name of the table for which to retrieve metadata.

         Raises:
-            ValueError: If any issues occur while reading the
-                missing file,
+            ValueError: If any issues occur while reading the schema, such as an invalid schema,
+                missing file, or missing path.

         Returns:
             The context after the execution of this Action, containing the table metadata.
         """
-
-
-
-
-
-
-            missing_params.append("schema_name")
+        if not path:
+            raise ValueError("No path provided. Please specify path to schema metadata.")
+        if not file_name:
+            raise ValueError("No file_name provided. Please specify file name.")
+        if not table_name:
+            raise ValueError("No table_name provided. Please specify table name.")

-
-            raise ValueError(
-                f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
-            )
+        path_obj = pathlib.Path(path)

-
-
-        table, errors = Table.read_instance_from_file(
-            final_file_path,
-            catalog_name=catalog_name,
-            schema_name=schema_name,
-        )
+        schema, errors = Schema.read_instance_from_file(path_obj / file_name)
         if errors:
-            raise ValueError(f"Errors while reading
-        if not
-            raise ValueError("No
+            raise ValueError(f"Errors while reading schema metadata: {errors}")
+        if not schema:
+            raise ValueError("No schema found in metadata.")

-
-        self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
-        table.storage_path = storage_path
-        table.is_external = True
+        table = schema.get_table_by_name(table_name=table_name)

-        self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
         return context.from_existing(table_metadata=table)
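Since `run` is now a `@staticmethod` driven by `path`, `file_name` and `table_name`, a direct call looks roughly like the sketch below; the import path, directory, file and table names are assumptions for illustration:

    from cloe_nessy.pipeline.actions import ReadMetadataYAMLAction  # assumed import path

    # No instance is needed anymore; the schema file is resolved as path / file_name.
    new_context = ReadMetadataYAMLAction.run(
        context,                          # an existing PipelineContext
        path="metadata/schemas/bronze/",  # placeholder directory
        file_name="sales_schema.yml",     # placeholder file
        table_name="sales",               # placeholder table
    )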
cloe_nessy/pipeline/actions/transform_join.py CHANGED

@@ -1,7 +1,5 @@
 from typing import Any

-from pyspark.sql import functions as F
-
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 from ..pipeline_step import PipelineStep
@@ -15,74 +13,20 @@ class TransformJoinAction(PipelineAction):
     from [PySpark
     documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)

-
-
-
-
-
-
-
-
-
-        ```
-
-    === "Multiple Columns Join"
-        ```yaml
-        Join Tables:
-            action: TRANSFORM_JOIN
-            options:
-                joined_data: ((step:Transform First Table))
-                join_on: [customer_id, order_date]
-                how: left
-        ```
-
-    === "Dictionary Join (Different Column Names)"
-        ```yaml
-        Join Tables:
-            action: TRANSFORM_JOIN
-            options:
-                joined_data: ((step:Transform First Table))
-                join_on:
-                    customer_id: cust_id
-                    order_date: date
-                how: inner
-        ```
-
-    === "Complex Join with Literals and Expressions"
-        ```yaml
-        Join Tables:
-            action: TRANSFORM_JOIN
-            options:
-                joined_data: ((step:Load Conditions Table))
-                join_condition: |
-                    left.material = right.material
-                    AND right.sales_org = '10'
-                    AND right.distr_chan = '10'
-                    AND right.knart = 'ZUVP'
-                    AND right.lovmkond <> 'X'
-                    AND right.sales_unit = 'ST'
-                    AND left.calday BETWEEN
-                        to_date(right.date_from, 'yyyyMMdd') AND
-                        to_date(right.date_to, 'yyyyMMdd')
-                how: left
-        ```
+    Example:
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```

     !!! note "Referencing a DataFrame from another step"
         The `joined_data` parameter is a reference to the DataFrame from another step.
         The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
         for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
-
-    !!! tip "Dictionary Join Syntax"
-        When using a dictionary for `join_on`, the keys represent columns
-        from the DataFrame in context and the values represent columns from
-        the DataFrame in `joined_data`. This is useful when joining tables
-        with different column names for the same logical entity.
-
-    !!! tip "Complex Join Conditions"
-        Use `join_condition` instead of `join_on` for complex joins with literals,
-        expressions, and multiple conditions. Reference columns using `left.column_name`
-        for the main DataFrame and `right.column_name` for the joined DataFrame.
-        Supports all PySpark functions and operators.
     """

     name: str = "TRANSFORM_JOIN"
@@ -93,7 +37,6 @@ class TransformJoinAction(PipelineAction):
         *,
         joined_data: PipelineStep | None = None,
         join_on: list[str] | str | dict[str, str] | None = None,
-        join_condition: str | None = None,
         how: str = "inner",
         **_: Any,
     ) -> PipelineContext:
@@ -106,17 +49,13 @@ class TransformJoinAction(PipelineAction):
             join_on: A string for the join column
                 name, a list of column names, or a dictionary mapping columns from the
                 left DataFrame to the right DataFrame. This defines the condition for the
-                join operation.
-            join_condition: A string containing a complex join expression with literals,
-                functions, and multiple conditions. Use 'left.' and 'right.' prefixes
-                to reference columns from respective DataFrames. Mutually exclusive with join_on.
+                join operation.
             how: The type of join to perform. Must be one of: inner, cross, outer,
                 full, fullouter, left, leftouter, right, rightouter, semi, anti, etc.

         Raises:
             ValueError: If no joined_data is provided.
-            ValueError: If
-            ValueError: If both join_on and join_condition are provided.
+            ValueError: If no join_on is provided.
             ValueError: If the data from context is None.
             ValueError: If the data from the joined_data is None.

@@ -125,12 +64,8 @@ class TransformJoinAction(PipelineAction):
         """
         if joined_data is None or joined_data.result is None or joined_data.result.data is None:
             raise ValueError("No joined_data provided.")
-
-
-            raise ValueError("Either join_on or join_condition must be provided.")
-
-        if join_on and join_condition:
-            raise ValueError("Cannot specify both join_on and join_condition. Use one or the other.")
+        if not join_on:
+            raise ValueError("No join_on provided.")

         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")
@@ -138,25 +73,16 @@ class TransformJoinAction(PipelineAction):
         df_right = joined_data.result.data.alias("right")  # type: ignore
         df_left = context.data.alias("left")  # type: ignore

-        if
-
-
-
-
-
-
-
-
-
-        elif isinstance(join_on, list):
-            join_condition_list = join_on
-        else:
-            join_condition_list = [
-                df_left[left_column] == df_right[right_column]  # type: ignore
-                for left_column, right_column in join_on.items()
-            ]
-
-        df = df_left.join(df_right, on=join_condition_list, how=how)  # type: ignore
+        if isinstance(join_on, str):
+            join_condition = [join_on]
+        elif isinstance(join_on, list):
+            join_condition = join_on
+        else:
+            join_condition = [
+                df_left[left_column] == df_right[right_column]  # type: ignore
+                for left_column, right_column in join_on.items()
+            ]
+
+        df = df_left.join(df_right, on=join_condition, how=how)  # type: ignore

         return context.from_existing(data=df)  # type: ignore
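After this change `join_on` alone drives the join; the dictionary form still maps left-hand columns to right-hand columns. A sketch of what that expands to in plain PySpark, with placeholder DataFrames and column names:

    # join_on as a dict: keys are columns of the left (context) DataFrame,
    # values are columns of the right (joined_data) DataFrame.
    join_on = {"customer_id": "cust_id", "order_date": "date"}

    df_left = context_df.alias("left")   # placeholder: DataFrame from the pipeline context
    df_right = other_df.alias("right")   # placeholder: DataFrame from the referenced step

    join_condition = [
        df_left[left_column] == df_right[right_column]
        for left_column, right_column in join_on.items()
    ]
    df = df_left.join(df_right, on=join_condition, how="inner")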
cloe_nessy/pipeline/actions/transform_union.py CHANGED

@@ -22,8 +22,8 @@ class TransformUnionAction(PipelineAction):
             action: TRANSFORM_UNION
             options:
                 union_data:
-                    - ((step:Filter First Table))
-                    - ((step:SQL Transform Second Table))
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
         ```
     !!! note "Referencing a DataFrame from another step"
         The `union_data` parameter is a reference to the DataFrame from another step.
cloe_nessy/pipeline/actions/write_catalog_table.py CHANGED

@@ -1,7 +1,6 @@
 from typing import Any

 from ...integration.writer import CatalogWriter
-from ...object_manager import TableManager
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext

@@ -9,31 +8,17 @@ from ..pipeline_context import PipelineContext
 class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].

-
-
-
-
-
+    Example:
+        ```yaml
+        Write Table to Catalog:
+            action: WRITE_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                mode: append
+                partition_by: day
                 options:
-
-
-                partition_by: day
-                options:
-                    mergeSchema: true
-        ```
-    === "Streaming Write"
-        ```yaml
-        Write Table to Catalog Stream:
-            action: WRITE_CATALOG_TABLE
-            options:
-                table_identifier: my_catalog.business_schema.sales_table
-                mode: append
-                checkpoint_location: /path/to/checkpoint
-                trigger_dict:
-                    processingTime: 10 seconds
-                options:
-                    mergeSchema: true
-        ```
+                    mergeSchema: true
+        ```
     """

     name: str = "WRITE_CATALOG_TABLE"
@@ -46,9 +31,6 @@ class WriteCatalogTableAction(PipelineAction):
         mode: str = "append",
         partition_by: str | list[str] | None = None,
         options: dict[str, str] | None = None,
-        checkpoint_location: str | None = None,
-        trigger_dict: dict | None = None,
-        await_termination: bool = False,
         **_: Any,
     ) -> PipelineContext:
         """Writes a DataFrame to a specified catalog table.
@@ -61,11 +43,7 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-
-            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
-            await_termination: If True, the function will wait for the streaming
-                query to finish before returning.
-            options: Additional options for the DataFrame write operation.
+            options: PySpark options for the DataFrame.saveAsTable operation (e.g. mergeSchema:true).

         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
@@ -76,15 +54,8 @@ class WriteCatalogTableAction(PipelineAction):
         """
         if not options:
             options = dict()
-        streaming = context.runtime_info and context.runtime_info.get("streaming")
-        if streaming and not checkpoint_location:
-            raise ValueError("Checkpoint location must be specified for streaming writes.")
         if partition_by is None:
-            if (
-                context.table_metadata is not None
-                and hasattr(context.table_metadata, "partition_by")
-                and not context.table_metadata.liquid_clustering
-            ):
+            if hasattr(context.table_metadata, "partition_by"):
                 partition_by = context.table_metadata.partition_by  # type: ignore

         if (table_metadata := context.table_metadata) and table_identifier is None:
@@ -92,28 +63,12 @@ class WriteCatalogTableAction(PipelineAction):
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")

-        if table_metadata:
-            manager = TableManager()
-            manager.create_table(table=table_metadata, ignore_if_exists=True, replace=False)
-
         writer = CatalogWriter()
-
-
-
-
-
-
-
-                options=options,
-                mode=mode,
-                await_termination=await_termination,
-            )
-        else:
-            writer.write(
-                df=context.data,  # type: ignore
-                table_identifier=table_identifier,
-                mode=mode,
-                partition_by=partition_by,
-                options=options,
-            )
+        writer.write_table(
+            df=context.data,  # type: ignore
+            table_identifier=table_identifier,
+            mode=mode,
+            partition_by=partition_by,
+            options=options,
+        )
         return context.from_existing()
cloe_nessy/pipeline/actions/write_delta_merge.py CHANGED

@@ -98,7 +98,6 @@ class WriteDeltaMergeAction(PipelineAction):

         delta_merge_writer.write(
             table_identifier=context.table_metadata.identifier,
-            table=context.table_metadata,
             storage_path=str(context.table_metadata.storage_path),
             data_frame=context.data,
             key_columns=key_columns,
cloe_nessy/session/session_manager.py CHANGED

@@ -201,6 +201,16 @@ class SessionManager(LoggerMixin):
         except ImportError:
             logger.debug("dbruntime.dbutils not available")

+        logger.debug("Checking for Fabric UI...")
+        try:
+            from notebookutils import mssparkutils  # type: ignore # noqa: F401
+
+            logger.debug("✓ Detected FABRIC_UI via notebookutils")
+            cls._env = cls.Environment.FABRIC_UI
+            return cls._env
+        except ImportError:
+            logger.debug("notebookutils not available")
+
         logger.debug("Checking for Databricks Connect...")
         try:
             from databricks.sdk.dbutils import RemoteDbUtils  # type: ignore # noqa: F401
@@ -212,16 +222,6 @@ class SessionManager(LoggerMixin):
         except ImportError:
             logger.debug("RemoteDbUtils not available")

-        logger.debug("Checking for Fabric UI...")
-        try:
-            from notebookutils import mssparkutils  # type: ignore # noqa: F401
-
-            logger.debug("✓ Detected FABRIC_UI via notebookutils")
-            cls._env = cls.Environment.FABRIC_UI
-            return cls._env
-        except ImportError:
-            logger.debug("notebookutils not available")
-
         logger.error("No environment could be detected")
         raise RuntimeError(
             "Cannot detect environment. This usually means you're not in a recognized Spark environment. "
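The two hunks above only move the Fabric UI probe ahead of the Databricks Connect probe; detection still works by attempting imports. A stripped-down, illustrative sketch of that ordering (not the class's actual method):

    import logging

    logger = logging.getLogger(__name__)


    def detect_environment() -> str:
        """Illustrative only: probe notebook runtimes by import, in the new order."""
        try:
            from notebookutils import mssparkutils  # noqa: F401  # Fabric UI is now checked first
            return "FABRIC_UI"
        except ImportError:
            logger.debug("notebookutils not available")
        try:
            from databricks.sdk.dbutils import RemoteDbUtils  # noqa: F401
            return "DATABRICKS_CONNECT"
        except ImportError:
            logger.debug("RemoteDbUtils not available")
        raise RuntimeError("Cannot detect environment.")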
{cloe_nessy-0.3.16.5b0.dist-info → cloe_nessy-0.3.16.6.dist-info}/METADATA CHANGED

@@ -1,37 +1,36 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 0.3.16.5b0
+Version: 0.3.16.6
 Summary: Your friendly datalake monster.
-Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
 License: MIT
+Project-URL: homepage, https://initions.com/
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
-Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
+Classifier: Intended Audience :: Developers
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Topic :: Database
 Requires-Python: <3.13,>=3.11
-
-Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
-Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
-Requires-Dist: delta-spark>=3.3.2
-Requires-Dist: fsspec<2025.7.1,>=2025.7.0
-Requires-Dist: httpx<1.0.0,>=0.27.2
-Requires-Dist: jinja2<4.0.0,>=3.1.4
-Requires-Dist: matplotlib<4.0.0,>=3.9.2
-Requires-Dist: networkx<4.0,>=3.3
-Requires-Dist: openpyxl<4.0.0,>=3.1.5
-Requires-Dist: pandas-stubs<3.0.0.0,>=2.2.2.240807
-Requires-Dist: pydantic-settings<3.0.0,>=2.4.0
+Description-Content-Type: text/markdown
 Requires-Dist: pydantic<3.0.0,>=2.7.2
 Requires-Dist: pyyaml<7.0.0,>=6.0.1
-Requires-Dist: requests<3.0.0,>=2.32.3
-Requires-Dist: types-networkx<4.0.0.0,>=3.2.1.20240820
 Requires-Dist: types-pyyaml<7.0.0.0,>=6.0.12.20240311
+Requires-Dist: jinja2<4.0.0,>=3.1.4
+Requires-Dist: pydantic-settings<3.0.0,>=2.4.0
+Requires-Dist: openpyxl<4.0.0,>=3.1.5
+Requires-Dist: requests<3.0.0,>=2.32.3
 Requires-Dist: types-requests<3.0.0.0,>=2.32.0.20240712
-
+Requires-Dist: pandas-stubs<3.0.0.0,>=2.2.2.240807
+Requires-Dist: azure-identity<2.0.0,>=1.19.0
+Requires-Dist: httpx<1.0.0,>=0.27.2
+Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
+Requires-Dist: networkx<4.0,>=3.3
+Requires-Dist: matplotlib<4.0.0,>=3.9.2
+Requires-Dist: types-networkx<4.0.0.0,>=3.2.1.20240820
+Requires-Dist: fsspec<2025.7.1,>=2025.7.0
+Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8

 # cloe-nessy

{cloe_nessy-0.3.16.5b0.dist-info → cloe_nessy-0.3.16.6.dist-info}/RECORD CHANGED

@@ -19,18 +19,18 @@ cloe_nessy/file_utilities/strategies/utils_strategy.py,sha256=urayKfOUpSaXKgTs1K
 cloe_nessy/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/integration/reader/__init__.py,sha256=J5vlORqHLBpHEvzIwfIjzN5xEdOat-8jlmdLcGj8nsA,239
 cloe_nessy/integration/reader/api_reader.py,sha256=3Mf-txOTJ1dXCzdNtRTLC8UKftKms4NxOoLVgzcc2eo,5691
-cloe_nessy/integration/reader/catalog_reader.py,sha256=
+cloe_nessy/integration/reader/catalog_reader.py,sha256=lwDeWBVXfFh75XknPawetL9ZBtqS-Oss5rNzbrEeIQg,2070
 cloe_nessy/integration/reader/excel_reader.py,sha256=8KCqKBYFE6RGCiahJimQOAtbYZzaUzlnoslW9yca5P8,8035
 cloe_nessy/integration/reader/exceptions.py,sha256=_A9jFpe_RIDZCGY76qzjic9bsshxns6yXPSl141dq1c,203
 cloe_nessy/integration/reader/file_reader.py,sha256=3DcZhyyL-Cf_R7Px1UDHJwpO8Un31dWey2Q-f4DtWfY,6879
 cloe_nessy/integration/reader/reader.py,sha256=e2KVPePQme8SBQJEbL-3zpGasOgTiEvKFTslow2wGPw,1034
 cloe_nessy/integration/writer/__init__.py,sha256=3yzCAGiWZdQWtsbzlTih01sxVTJV2DDYwvl34lEAUlE,243
-cloe_nessy/integration/writer/catalog_writer.py,sha256=
+cloe_nessy/integration/writer/catalog_writer.py,sha256=Gb-hMdADgO_uUJ7mZPHBYyNme2qXsdFFnzwo7GcShHM,2192
 cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70To4L6Q182pXx2HRM,5454
 cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
 cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
 cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=TbpW-j87_H9dcUza34uR6VWslJez406y3_5N1ip0SnM,4740
-cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=
+cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=no2GOLqMAJd0fEy2mqMevMj_CvutcJPRmXJC2tD4icA,10112
 cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=kiacqQ2FYQSzakJqZ9-ZHH3os4X7--QuER_2xx9y21k,971
 cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=upUtDZMzwYFU0kzmkelVgkpFToXkrypcR3h_jvGjz14,8596
 cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
@@ -38,10 +38,10 @@ cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZum
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=-FmWEJ1Oq1njSopjc0R7GmT64mLSmALkm8PkHNzy9Y8,327
 cloe_nessy/models/catalog.py,sha256=ayC1sMp4cNLAZtu0ICVV3Us6-o4hn8U9tpzzvxC9RAs,177
-cloe_nessy/models/column.py,sha256=
+cloe_nessy/models/column.py,sha256=53fBwRnino72XKACsHZpN9QfCBqqSXyKLHZlM0huumg,1988
 cloe_nessy/models/constraint.py,sha256=hsFlhn4n928z81O3dl3v5bMetewPWzMjkJK3_4kASSM,178
 cloe_nessy/models/foreign_key.py,sha256=DwRVHs9sShqqPV-NL7ow_3AmPPWX0Od26yZn_I565pU,1001
-cloe_nessy/models/schema.py,sha256=
+cloe_nessy/models/schema.py,sha256=yUrjjEhAH5zbCymE67Az_jPnVB8hGO-_UNfqzeZCD_Y,3376
 cloe_nessy/models/table.py,sha256=O9vcJ1XBIb6kA-NAI3SNpB5b7MGDo3p4wMJdonPaBfA,12076
 cloe_nessy/models/types.py,sha256=XRbuJGdTNa6aXyE3IAzs_J9gVjbfkzMDLfGl-k6jI_4,223
 cloe_nessy/models/volume.py,sha256=51BE06FrL1Wv6zblFwJ_HTiR6WQqH7pSmrdH90rqwLg,2444
@@ -50,10 +50,10 @@ cloe_nessy/models/adapter/unity_catalog_adapter.py,sha256=a-14Ys-AevVYQd0xeJU1sy
 cloe_nessy/models/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/models/mixins/read_instance_mixin.py,sha256=j5Y4aNWOh1jlskEaxNooZFJgPyxRmik00gAVLJnAaRs,4507
 cloe_nessy/models/mixins/template_loader_mixin.py,sha256=5MXhEGBFlq3dwZvINEyBowSlipNnVun2H_TmhI_fsS4,549
-cloe_nessy/models/templates/create_table.sql.j2,sha256=
+cloe_nessy/models/templates/create_table.sql.j2,sha256=QWbiTXwmGaIlZUAIGL4pAlHkDbP9mq1vGAkdKCPOqm4,1669
 cloe_nessy/models/templates/create_volume.sql.j2,sha256=XIUf1cHcvAxcGTyhzUiv4xpQ1cfDw_ra3_FKmOuLoBs,289
 cloe_nessy/object_manager/__init__.py,sha256=3sle0vNpPwBOkycxA3XVS9m4XZf5LD3Qd4NGxdqcHno,186
-cloe_nessy/object_manager/table_manager.py,sha256=
+cloe_nessy/object_manager/table_manager.py,sha256=m6u_KFYCPoqq1hagwt3s7eQopjV2oOJNlmXDVAfku-k,12703
 cloe_nessy/object_manager/volume_manager.py,sha256=6epd3KXzcNH04EvaKubAfLsaUm9qBMeT3KNvMK04gGs,2727
 cloe_nessy/pipeline/__init__.py,sha256=sespmJ5JsgyiFyZiedTiL2kg--zGIX7cjTYsD5vemEg,325
 cloe_nessy/pipeline/pipeline.py,sha256=L4wk3b06LNWRj01nnAkuQpeRrwFTyaV1xTpgYAg4sak,10819
@@ -65,14 +65,13 @@ cloe_nessy/pipeline/pipeline_plotting_service.py,sha256=goMQj73FzUVchKn5c2SsPcWR
 cloe_nessy/pipeline/pipeline_step.py,sha256=UlnmpS6gm_dZ7m9dD1mZvye7mvUF_DA7HjOZo0oGYDU,1977
 cloe_nessy/pipeline/actions/__init__.py,sha256=9gjSQKLGrPcaYaJrTYZde8d4yNrN1SoXN_DDHq5KrvY,2600
 cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nuAHCuSaGs2s,7778
-cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=
+cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=oXbqbc6BfR82dSIGclwzWiTN8EVmpFjNIYLKm4qOU50,2754
 cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
 cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
-cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=
+cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
 cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
 cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=VxvWqENW63c50L96JA1V_ioe4By6gGzx_iY86njOXEM,3044
 cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
-cloe_nessy/pipeline/actions/transform_convert_timestamp.py,sha256=je6H-mtNeokU9W_-RCWaRCFvMhk4oQL9s60FVBrl8Po,3090
 cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
 cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
 cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
@@ -80,22 +79,23 @@ cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1Qeat
 cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
 cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
 cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=heRjBA-Gfu-nmNHOjTYlipEpKY8oNPAHAY40vjJk3aI,8383
-cloe_nessy/pipeline/actions/transform_join.py,sha256=
+cloe_nessy/pipeline/actions/transform_join.py,sha256=e_tvMk8YJTAWcUK_EmOgNt0s31ICZoMX_MKOTWx4lBY,3645
 cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
 cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
 cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
-cloe_nessy/pipeline/actions/transform_union.py,sha256=
-cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=
+cloe_nessy/pipeline/actions/transform_union.py,sha256=s81Vge0AbYPc7VkskCYfOQ_LEjqcmfNFyDkytfjcZyo,2720
+cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
 cloe_nessy/pipeline/actions/write_delta_append.py,sha256=fuL29SK9G5K14ycckU3iPexeK0XNXUfQscCwhXHxbKA,2498
-cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=
+cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=Hir7QZZZJ9hmQZXiJ9iz6u06OCmcHFpyKFVB_I1saSM,5043
 cloe_nessy/pipeline/actions/write_file.py,sha256=H8LRst045yij-8XJ5pRB9m5d1lZpZjFa0WSVdSFesPo,2984
 cloe_nessy/session/__init__.py,sha256=t7_YjUhJYW3km_FrucaUdbIl1boQtwkyhw_8yE10qzc,74
-cloe_nessy/session/session_manager.py,sha256=
+cloe_nessy/session/session_manager.py,sha256=VCUPhACeN5armd4D0TqDeH4Ih9nu6XvXSREFqHUwt4s,9710
 cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEvzM,101
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.16.
-cloe_nessy-0.3.16.
-cloe_nessy-0.3.16.
+cloe_nessy-0.3.16.6.dist-info/METADATA,sha256=YfBuBVqeRWjBTWlj4SQKyUVrc-PX78fK_MnHhO2MQv4,3292
+cloe_nessy-0.3.16.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+cloe_nessy-0.3.16.6.dist-info/top_level.txt,sha256=Z7izn8HmQpg2wBUb-0jzaKlYKMU7Ypzuc9__9vPtW_I,11
+cloe_nessy-0.3.16.6.dist-info/RECORD,,
cloe_nessy-0.3.16.6.dist-info/top_level.txt ADDED

@@ -0,0 +1 @@
+cloe_nessy
cloe_nessy/pipeline/actions/transform_convert_timestamp.py DELETED

@@ -1,87 +0,0 @@
-from typing import Any
-
-from pyspark.errors.exceptions.base import IllegalArgumentException
-from pyspark.sql import functions as F
-
-from ...pipeline import PipelineAction, PipelineContext
-
-
-class TransformConvertTimestampAction(PipelineAction):
-    """This class implements a Transform action for an ETL pipeline.
-
-    This action performs timestamp based conversions.
-
-    Example:
-        ```yaml
-        Convert Timestamp:
-            action: TRANSFORM_CONVERT_TIMESTAMP
-            options:
-                column: my_timestamp_column
-                source_format: unixtime
-                target_format: yyyy-MM-dd HH:mm:ss
-        ```
-    """
-
-    name: str = "TRANSFORM_CONVERT_TIMESTAMP"
-
-    def run(
-        self,
-        context: PipelineContext,
-        *,
-        column: str = "",
-        source_format: str = "",
-        target_format: str = "",
-        **_: Any,
-    ) -> PipelineContext:
-        """Converts a column from a given source format to a new format.
-
-        Args:
-            context: Context in which this Action is executed.
-            column: The column that should be converted.
-            source_format: Initial format type of the column.
-            target_format: Desired format type of the column. This also supports
-                passing a format string like 'yyyy-MM-dd HH:mm:ss'.
-
-        Raises:
-            ValueError: If no column, source_format and target_format are provided.
-            ValueError: If source_format or target_format are not supported.
-
-        Returns:
-            PipelineContext: Context after the execution of this Action.
-        """
-        if not column:
-            raise ValueError("No column provided.")
-        if not source_format:
-            raise ValueError("No source_format provided.")
-        if not target_format:
-            raise ValueError("No target_format provided.")
-        if context.data is None:
-            raise ValueError("Context DataFrame is required.")
-        df = context.data
-
-        match source_format:
-            # convert always to timestamp first
-            case "unixtime":
-                df = df.withColumn(column, F.from_unixtime(F.col(column)))
-            case "unixtime_ms":
-                df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
-            case "string":
-                df = df.withColumn(column, F.to_timestamp(F.col(column)))
-            case "timestamp":
-                pass
-            case _:
-                raise ValueError(f"Unknown source_format {source_format}")
-
-        match target_format:
-            # convert from timestamp to desired output format
-            case "timestamp":
-                pass
-            case "unixtime":
-                df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
-            case _:
-                try:
-                    df = df.withColumn(column, F.date_format(F.col(column), target_format))
-                except IllegalArgumentException as e:
-                    raise ValueError(f"Invalid target_format {target_format}") from e
-
-        return context.from_existing(data=df)