cloe-nessy 0.3.13.3b0__py3-none-any.whl → 0.3.13.5b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +3 -1
- cloe_nessy/models/column.py +2 -1
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +87 -0
- {cloe_nessy-0.3.13.3b0.dist-info → cloe_nessy-0.3.13.5b0.dist-info}/METADATA +2 -1
- {cloe_nessy-0.3.13.3b0.dist-info → cloe_nessy-0.3.13.5b0.dist-info}/RECORD +7 -6
- {cloe_nessy-0.3.13.3b0.dist-info → cloe_nessy-0.3.13.5b0.dist-info}/WHEEL +0 -0
|
@@ -197,7 +197,9 @@ class DeltaMergeWriter(BaseDeltaWriter):
|
|
|
197
197
|
config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
|
|
198
198
|
|
|
199
199
|
delta_table = self.table_manager.get_delta_table(
|
|
200
|
-
table=table,
|
|
200
|
+
table=table,
|
|
201
|
+
location=storage_path,
|
|
202
|
+
spark=data_frame.sparkSession,
|
|
201
203
|
)
|
|
202
204
|
|
|
203
205
|
match_conditions = self._build_match_conditions(data_frame, config)
|
cloe_nessy/models/column.py
CHANGED
|
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
|
|
|
5
5
|
|
|
6
6
|
COLUMN_DATA_TYPE_LIST = {
|
|
7
7
|
"string",
|
|
8
|
+
"decimal",
|
|
8
9
|
"integer",
|
|
9
10
|
"int",
|
|
10
11
|
"smallint",
|
|
@@ -43,7 +44,7 @@ class Column(BaseModel):
|
|
|
43
44
|
"""
|
|
44
45
|
val = raw.lower()
|
|
45
46
|
base_data_types = re.findall(r"\b[a-z]+\b", val)
|
|
46
|
-
forbidden_characters = re.findall(r"[^a-
|
|
47
|
+
forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>,\s]+", val)
|
|
47
48
|
|
|
48
49
|
if forbidden_characters:
|
|
49
50
|
raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
|
|
@@ -1,66 +1,94 @@
|
|
|
1
|
-
import
|
|
1
|
+
from pathlib import Path
|
|
2
2
|
from typing import Any
|
|
3
3
|
|
|
4
|
-
from ...models import
|
|
4
|
+
from ...models import Table
|
|
5
5
|
from ..pipeline_action import PipelineAction
|
|
6
6
|
from ..pipeline_context import PipelineContext
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class ReadMetadataYAMLAction(PipelineAction):
|
|
10
|
-
"""Reads
|
|
10
|
+
"""Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
|
|
11
11
|
|
|
12
12
|
Example:
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
13
|
+
=== "Managed Table"
|
|
14
|
+
```yaml
|
|
15
|
+
Read Table Metadata:
|
|
16
|
+
action: READ_METADATA_YAML_ACTION
|
|
17
|
+
options:
|
|
18
|
+
file_path: metadata/schemas/bronze/sales_table.yml
|
|
19
|
+
catalog_name: production
|
|
20
|
+
schema_name: sales_data
|
|
21
|
+
```
|
|
22
|
+
=== "External Table"
|
|
23
|
+
```yaml
|
|
24
|
+
Read Table Metadata:
|
|
25
|
+
action: READ_METADATA_YAML_ACTION
|
|
26
|
+
options:
|
|
27
|
+
file_path: metadata/schemas/bronze/sales_table.yml
|
|
28
|
+
catalog_name: production
|
|
29
|
+
schema_name: sales_data
|
|
30
|
+
storage_path: abfs://external_storage/sales_data/sales_table
|
|
31
|
+
```
|
|
21
32
|
"""
|
|
22
33
|
|
|
23
34
|
name: str = "READ_METADATA_YAML_ACTION"
|
|
24
35
|
|
|
25
|
-
@staticmethod
|
|
26
36
|
def run(
|
|
37
|
+
self,
|
|
27
38
|
context: PipelineContext,
|
|
28
39
|
*,
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
40
|
+
file_path: str | None = None,
|
|
41
|
+
catalog_name: str | None = None,
|
|
42
|
+
schema_name: str | None = None,
|
|
43
|
+
storage_path: str | None = None,
|
|
32
44
|
**_: Any,
|
|
33
45
|
) -> PipelineContext:
|
|
34
|
-
"""Reads
|
|
46
|
+
"""Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
|
|
35
47
|
|
|
36
48
|
Args:
|
|
37
49
|
context: The context in which this Action is executed.
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
50
|
+
file_path: The path to the file that defines the table.
|
|
51
|
+
catalog_name: The name of the catalog for the table.
|
|
52
|
+
schema_name: The name of the schema for the table.
|
|
53
|
+
storage_path: The storage path for the table, if applicable. If not
|
|
54
|
+
provided, the table will be considered a managed table.
|
|
41
55
|
|
|
42
56
|
Raises:
|
|
43
|
-
ValueError: If any issues occur while reading the
|
|
44
|
-
missing file, or missing
|
|
57
|
+
ValueError: If any issues occur while reading the table metadata, such as an invalid table,
|
|
58
|
+
missing file, missing path, or missing catalog/schema names.
|
|
45
59
|
|
|
46
60
|
Returns:
|
|
47
61
|
The context after the execution of this Action, containing the table metadata.
|
|
48
62
|
"""
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
63
|
+
missing_params = []
|
|
64
|
+
if not file_path:
|
|
65
|
+
missing_params.append("file_path")
|
|
66
|
+
if not catalog_name:
|
|
67
|
+
missing_params.append("catalog_name")
|
|
68
|
+
if not schema_name:
|
|
69
|
+
missing_params.append("schema_name")
|
|
55
70
|
|
|
56
|
-
|
|
71
|
+
if missing_params:
|
|
72
|
+
raise ValueError(
|
|
73
|
+
f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
|
|
74
|
+
)
|
|
57
75
|
|
|
58
|
-
|
|
76
|
+
final_file_path = Path(file_path) if file_path else Path()
|
|
77
|
+
|
|
78
|
+
table, errors = Table.read_instance_from_file(
|
|
79
|
+
final_file_path,
|
|
80
|
+
catalog_name=catalog_name,
|
|
81
|
+
schema_name=schema_name,
|
|
82
|
+
)
|
|
59
83
|
if errors:
|
|
60
|
-
raise ValueError(f"Errors while reading
|
|
61
|
-
if not
|
|
62
|
-
raise ValueError("No
|
|
84
|
+
raise ValueError(f"Errors while reading table metadata: {errors}")
|
|
85
|
+
if not table:
|
|
86
|
+
raise ValueError("No table found in metadata.")
|
|
63
87
|
|
|
64
|
-
table
|
|
88
|
+
if not table.storage_path and storage_path:
|
|
89
|
+
self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
|
|
90
|
+
table.storage_path = storage_path
|
|
91
|
+
table.is_external = True
|
|
65
92
|
|
|
93
|
+
self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
|
|
66
94
|
return context.from_existing(table_metadata=table)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from pyspark.errors.exceptions.base import IllegalArgumentException
|
|
4
|
+
from pyspark.sql import functions as F
|
|
5
|
+
|
|
6
|
+
from ...pipeline import PipelineAction, PipelineContext
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TransformConvertTimestampAction(PipelineAction):
|
|
10
|
+
"""This class implements a Transform action for an ETL pipeline.
|
|
11
|
+
|
|
12
|
+
This action performs timestamp based conversions.
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
```yaml
|
|
16
|
+
Convert Timestamp:
|
|
17
|
+
action: TRANSFORM_CONVERT_TIMESTAMP
|
|
18
|
+
options:
|
|
19
|
+
column: my_timestamp_column
|
|
20
|
+
source_format: unixtime
|
|
21
|
+
target_format: yyyy-MM-dd HH:mm:ss
|
|
22
|
+
```
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
name: str = "TRANSFORM_CONVERT_TIMESTAMP"
|
|
26
|
+
|
|
27
|
+
def run(
|
|
28
|
+
self,
|
|
29
|
+
context: PipelineContext,
|
|
30
|
+
*,
|
|
31
|
+
column: str = "",
|
|
32
|
+
source_format: str = "",
|
|
33
|
+
target_format: str = "",
|
|
34
|
+
**_: Any,
|
|
35
|
+
) -> PipelineContext:
|
|
36
|
+
"""Converts a column from a given source format to a new format.
|
|
37
|
+
|
|
38
|
+
Args:
|
|
39
|
+
context: Context in which this Action is executed.
|
|
40
|
+
column: The column that should be converted.
|
|
41
|
+
source_format: Initial format type of the column.
|
|
42
|
+
target_format: Desired format type of the column. This also supports
|
|
43
|
+
passing a format string like 'yyyy-MM-dd HH:mm:ss'.
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
ValueError: If no column, source_format and target_format are provided.
|
|
47
|
+
ValueError: If source_format or target_format are not supported.
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
PipelineContext: Context after the execution of this Action.
|
|
51
|
+
"""
|
|
52
|
+
if not column:
|
|
53
|
+
raise ValueError("No column provided.")
|
|
54
|
+
if not source_format:
|
|
55
|
+
raise ValueError("No source_format provided.")
|
|
56
|
+
if not target_format:
|
|
57
|
+
raise ValueError("No target_format provided.")
|
|
58
|
+
if context.data is None:
|
|
59
|
+
raise ValueError("Context DataFrame is required.")
|
|
60
|
+
df = context.data
|
|
61
|
+
|
|
62
|
+
match source_format:
|
|
63
|
+
# convert always to timestamp first
|
|
64
|
+
case "unixtime":
|
|
65
|
+
df = df.withColumn(column, F.from_unixtime(F.col(column)))
|
|
66
|
+
case "unixtime_ms":
|
|
67
|
+
df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
|
|
68
|
+
case "string":
|
|
69
|
+
df = df.withColumn(column, F.to_timestamp(F.col(column)))
|
|
70
|
+
case "timestamp":
|
|
71
|
+
pass
|
|
72
|
+
case _:
|
|
73
|
+
raise ValueError(f"Unknown source_format {source_format}")
|
|
74
|
+
|
|
75
|
+
match target_format:
|
|
76
|
+
# convert from timestamp to desired output format
|
|
77
|
+
case "timestamp":
|
|
78
|
+
pass
|
|
79
|
+
case "unixtime":
|
|
80
|
+
df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
|
|
81
|
+
case _:
|
|
82
|
+
try:
|
|
83
|
+
df = df.withColumn(column, F.date_format(F.col(column), target_format))
|
|
84
|
+
except IllegalArgumentException as e:
|
|
85
|
+
raise ValueError(f"Invalid target_format {target_format}") from e
|
|
86
|
+
|
|
87
|
+
return context.from_existing(data=df)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cloe-nessy
|
|
3
|
-
Version: 0.3.13.
|
|
3
|
+
Version: 0.3.13.5b0
|
|
4
4
|
Summary: Your friendly datalake monster.
|
|
5
5
|
Project-URL: homepage, https://initions.com/
|
|
6
6
|
Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
|
|
@@ -16,6 +16,7 @@ Requires-Python: <3.13,>=3.11
|
|
|
16
16
|
Requires-Dist: azure-identity<2.0.0,>=1.19.0
|
|
17
17
|
Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
|
|
18
18
|
Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
|
|
19
|
+
Requires-Dist: delta-spark>=3.3.2
|
|
19
20
|
Requires-Dist: fsspec<2025.6.0,>=2025.5.1
|
|
20
21
|
Requires-Dist: httpx<1.0.0,>=0.27.2
|
|
21
22
|
Requires-Dist: jinja2<4.0.0,>=3.1.4
|
|
@@ -30,7 +30,7 @@ cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70T
|
|
|
30
30
|
cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
|
|
31
31
|
cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
|
|
32
32
|
cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=TbpW-j87_H9dcUza34uR6VWslJez406y3_5N1ip0SnM,4740
|
|
33
|
-
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=
|
|
33
|
+
cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=zhqPIPfAJTzSLFgBUCwFesUW7CcF1zCPRU-N_8yYjok,10172
|
|
34
34
|
cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=kiacqQ2FYQSzakJqZ9-ZHH3os4X7--QuER_2xx9y21k,971
|
|
35
35
|
cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=upUtDZMzwYFU0kzmkelVgkpFToXkrypcR3h_jvGjz14,8596
|
|
36
36
|
cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
|
|
@@ -38,7 +38,7 @@ cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZum
|
|
|
38
38
|
cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
|
|
39
39
|
cloe_nessy/models/__init__.py,sha256=-FmWEJ1Oq1njSopjc0R7GmT64mLSmALkm8PkHNzy9Y8,327
|
|
40
40
|
cloe_nessy/models/catalog.py,sha256=ayC1sMp4cNLAZtu0ICVV3Us6-o4hn8U9tpzzvxC9RAs,177
|
|
41
|
-
cloe_nessy/models/column.py,sha256=
|
|
41
|
+
cloe_nessy/models/column.py,sha256=8wR7E8PRhUc0dwM83IIlpz7kBncZim7J5FvQzd8R_Us,2012
|
|
42
42
|
cloe_nessy/models/constraint.py,sha256=hsFlhn4n928z81O3dl3v5bMetewPWzMjkJK3_4kASSM,178
|
|
43
43
|
cloe_nessy/models/foreign_key.py,sha256=DwRVHs9sShqqPV-NL7ow_3AmPPWX0Od26yZn_I565pU,1001
|
|
44
44
|
cloe_nessy/models/schema.py,sha256=cNSrH7K4hLRrkg1E6fW6DUIBMZdR2A5B21POj5iQ4GA,3429
|
|
@@ -67,10 +67,11 @@ cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nu
|
|
|
67
67
|
cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=oXbqbc6BfR82dSIGclwzWiTN8EVmpFjNIYLKm4qOU50,2754
|
|
68
68
|
cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
|
|
69
69
|
cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
|
|
70
|
-
cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=
|
|
70
|
+
cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=i8fQceV63eAqx_x0ANisCkXWfMHyhqsfFHVFH5yP2po,3544
|
|
71
71
|
cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
|
|
72
72
|
cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
|
|
73
73
|
cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
|
|
74
|
+
cloe_nessy/pipeline/actions/transform_convert_timestamp.py,sha256=je6H-mtNeokU9W_-RCWaRCFvMhk4oQL9s60FVBrl8Po,3090
|
|
74
75
|
cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
|
|
75
76
|
cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
|
|
76
77
|
cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
|
|
@@ -94,6 +95,6 @@ cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEv
|
|
|
94
95
|
cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
|
|
95
96
|
cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
96
97
|
cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
|
|
97
|
-
cloe_nessy-0.3.13.
|
|
98
|
-
cloe_nessy-0.3.13.
|
|
99
|
-
cloe_nessy-0.3.13.
|
|
98
|
+
cloe_nessy-0.3.13.5b0.dist-info/METADATA,sha256=PBFKdmm5_n8bAarqbddj81pIjctNxHIgQPw72Lru01M,3328
|
|
99
|
+
cloe_nessy-0.3.13.5b0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
100
|
+
cloe_nessy-0.3.13.5b0.dist-info/RECORD,,
|
|
File without changes
|