cloe-nessy 0.3.13.4b0__py3-none-any.whl → 0.3.13.6b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloe_nessy/models/column.py +2 -1
- cloe_nessy/models/templates/create_table.sql.j2 +8 -0
- cloe_nessy/pipeline/actions/transform_convert_timestamp.py +87 -0
- cloe_nessy/pipeline/actions/transform_join.py +38 -9
- cloe_nessy/pipeline/actions/transform_union.py +2 -2
- {cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/METADATA +3 -2
- {cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/RECORD +8 -7
- {cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/WHEEL +0 -0
cloe_nessy/models/column.py
CHANGED
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
+    "decimal",
     "integer",
     "int",
     "smallint",
@@ -43,7 +44,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
        base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-
+        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>,\s]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
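Taken together, the two changes accept `decimal` (including parameterized forms like `decimal(10,2)`) and complete the character filter whose old version is truncated in this render. A minimal standalone sketch of the validation logic; the helper name and the unknown-type branch are illustrative assumptions, not the package's actual code:

```python
import re

# Illustrative subset of COLUMN_DATA_TYPE_LIST; "decimal" is newly accepted.
ALLOWED_TYPES = {"string", "decimal", "integer", "int", "smallint"}

def validate_data_type(raw: str) -> str:
    """Hypothetical re-implementation of the validator's core checks."""
    val = raw.lower()
    base_data_types = re.findall(r"\b[a-z]+\b", val)
    # The completed regex: reject anything outside lowercase letters, digits,
    # parentheses, angle brackets, commas, and whitespace.
    forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>,\s]+", val)
    if forbidden_characters:
        raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
    for base in base_data_types:
        if base not in ALLOWED_TYPES:  # assumed behavior of the surrounding validator
            raise ValueError(f"Unsupported data type: {base}")
    return val

validate_data_type("decimal(10,2)")    # passes
# validate_data_type("decimal(10;2)")  # would raise: ';' is a forbidden character
```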
cloe_nessy/models/templates/create_table.sql.j2
CHANGED
@@ -13,6 +13,14 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
+{% if table.properties %}
+TBLPROPERTIES (
+{%- for key, value in table.properties.items() %}
+{%- if not loop.first %}, {% endif -%}
+'{{key}}' = '{{value}}'
+{%- endfor -%}
+)
+{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
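The added block emits a TBLPROPERTIES clause for tables that define properties. A quick way to see the expected output is to render just that block with plain Jinja2; the FakeTable stand-in below is an assumption for illustration, not the package's table model:

```python
from jinja2 import Template

# Just the new TBLPROPERTIES block, collapsed onto one line for brevity.
snippet = Template(
    "{% if table.properties %}"
    "TBLPROPERTIES ("
    "{%- for key, value in table.properties.items() %}"
    "{%- if not loop.first %}, {% endif -%}"
    "'{{key}}' = '{{value}}'"
    "{%- endfor -%}"
    ")"
    "{% endif %}"
)

class FakeTable:
    # Hypothetical stand-in for the model consumed by create_table.sql.j2.
    properties = {"delta.enableChangeDataFeed": "true", "quality": "silver"}

print(snippet.render(table=FakeTable()))
# TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true', 'quality' = 'silver')
```

The loop.first guard places the comma between entries rather than after the last one, which keeps the generated SQL valid for any number of properties.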
cloe_nessy/pipeline/actions/transform_convert_timestamp.py
ADDED
@@ -0,0 +1,87 @@
+from typing import Any
+
+from pyspark.errors.exceptions.base import IllegalArgumentException
+from pyspark.sql import functions as F
+
+from ...pipeline import PipelineAction, PipelineContext
+
+
+class TransformConvertTimestampAction(PipelineAction):
+    """This class implements a Transform action for an ETL pipeline.
+
+    This action performs timestamp based conversions.
+
+    Example:
+        ```yaml
+        Convert Timestamp:
+            action: TRANSFORM_CONVERT_TIMESTAMP
+            options:
+                column: my_timestamp_column
+                source_format: unixtime
+                target_format: yyyy-MM-dd HH:mm:ss
+        ```
+    """
+
+    name: str = "TRANSFORM_CONVERT_TIMESTAMP"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        column: str = "",
+        source_format: str = "",
+        target_format: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Converts a column from a given source format to a new format.
+
+        Args:
+            context: Context in which this Action is executed.
+            column: The column that should be converted.
+            source_format: Initial format type of the column.
+            target_format: Desired format type of the column. This also supports
+                passing a format string like 'yyyy-MM-dd HH:mm:ss'.
+
+        Raises:
+            ValueError: If no column, source_format and target_format are provided.
+            ValueError: If source_format or target_format are not supported.
+
+        Returns:
+            PipelineContext: Context after the execution of this Action.
+        """
+        if not column:
+            raise ValueError("No column provided.")
+        if not source_format:
+            raise ValueError("No source_format provided.")
+        if not target_format:
+            raise ValueError("No target_format provided.")
+        if context.data is None:
+            raise ValueError("Context DataFrame is required.")
+        df = context.data
+
+        match source_format:
+            # convert always to timestamp first
+            case "unixtime":
+                df = df.withColumn(column, F.from_unixtime(F.col(column)))
+            case "unixtime_ms":
+                df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
+            case "string":
+                df = df.withColumn(column, F.to_timestamp(F.col(column)))
+            case "timestamp":
+                pass
+            case _:
+                raise ValueError(f"Unknown source_format {source_format}")
+
+        match target_format:
+            # convert from timestamp to desired output format
+            case "timestamp":
+                pass
+            case "unixtime":
+                df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
+            case _:
+                try:
+                    df = df.withColumn(column, F.date_format(F.col(column), target_format))
+                except IllegalArgumentException as e:
+                    raise ValueError(f"Invalid target_format {target_format}") from e
+
+        return context.from_existing(data=df)
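The action runs two match statements in sequence: the first normalizes the source column to a Spark timestamp, the second formats that timestamp to the target. Here is roughly what the YAML example from the docstring does, expressed as plain PySpark (assumes a local SparkSession; the data is illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1700000000,)], ["my_timestamp_column"])

# source_format "unixtime": normalize to a timestamp first
df = df.withColumn("my_timestamp_column", F.from_unixtime(F.col("my_timestamp_column")))
# target_format "yyyy-MM-dd HH:mm:ss" is not a known keyword, so the action
# falls through to the date_format branch
df = df.withColumn("my_timestamp_column", F.date_format(F.col("my_timestamp_column"), "yyyy-MM-dd HH:mm:ss"))

df.show(truncate=False)  # e.g. 2023-11-14 22:13:20, depending on the session timezone
```

Worth noting: the unixtime target branch relies on F.to_unix_timestamp, which was only added in Spark 3.5, so that path needs a correspondingly recent PySpark.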
cloe_nessy/pipeline/actions/transform_join.py
CHANGED
@@ -13,20 +13,49 @@ class TransformJoinAction(PipelineAction):
     from [PySpark
     documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)
 
-
-
-
-
-
-
-
-
-
+    Examples:
+        === "Simple Column Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: id
+                    how: inner
+            ```
+
+        === "Multiple Columns Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: [customer_id, order_date]
+                    how: left
+            ```
+
+        === "Dictionary Join (Different Column Names)"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on:
+                        customer_id: cust_id
+                        order_date: date
+                    how: inner
+            ```
 
     !!! note "Referencing a DataFrame from another step"
         The `joined_data` parameter is a reference to the DataFrame from another step.
         The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
         for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
+
+    !!! tip "Dictionary Join Syntax"
+        When using a dictionary for `join_on`, the keys represent columns
+        from the DataFrame in context and the values represent columns from
+        the DataFrame in `joined_data` This is useful when joining tables
+        with different column names for the same logical entity.
     """
 
     name: str = "TRANSFORM_JOIN"
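The diff only touches the docstring, so the implementation of the dictionary form is not visible here. One plausible reading of "keys are context columns, values are joined_data columns" in plain PySpark is the following sketch; the helper name and the reduce-based condition are hypothetical:

```python
from functools import reduce

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F

def join_on_mapping(left: DataFrame, right: DataFrame, mapping: dict[str, str], how: str = "inner") -> DataFrame:
    """Hypothetical helper: one equality condition per key/value pair."""
    condition = reduce(
        lambda acc, kv: acc & (F.col(f"l.{kv[0]}") == F.col(f"r.{kv[1]}")),
        mapping.items(),
        F.lit(True),
    )
    return left.alias("l").join(right.alias("r"), on=condition, how=how)

spark = SparkSession.builder.master("local[1]").getOrCreate()
customers = spark.createDataFrame([(1, "2024-01-01")], ["customer_id", "order_date"])
orders = spark.createDataFrame([(1, "2024-01-01", 9.99)], ["cust_id", "date", "amount"])
join_on_mapping(customers, orders, {"customer_id": "cust_id", "order_date": "date"}).show()
```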
cloe_nessy/pipeline/actions/transform_union.py
CHANGED
@@ -22,8 +22,8 @@ class TransformUnionAction(PipelineAction):
         action: TRANSFORM_UNION
         options:
             union_data:
-                - ((step:
-                - ((step:
+                - ((step:Filter First Table))
+                - ((step:SQL Transform Second Table))
     ```
     !!! note "Referencing a DataFrame from another step"
         The `union_data` parameter is a reference to the DataFrame from another step.
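This change fills in concrete step names in the docstring's union_data example. For reference, the action's effect corresponds to folding the referenced DataFrames into the context DataFrame, something like the sketch below; whether the real action matches columns by name (unionByName) or by position is not visible in this diff:

```python
from functools import reduce

from pyspark.sql import DataFrame, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

base = spark.createDataFrame([(1, "a")], ["id", "val"])     # DataFrame in context
first = spark.createDataFrame([(2, "b")], ["id", "val"])    # ((step:Filter First Table))
second = spark.createDataFrame([(3, "c")], ["id", "val"])   # ((step:SQL Transform Second Table))

# Fold the referenced DataFrames into the context DataFrame.
result = reduce(DataFrame.unionByName, [first, second], base)
result.show()  # rows (1, a), (2, b), (3, c)
```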
{cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 0.3.13.
+Version: 0.3.13.6b0
 Summary: Your friendly datalake monster.
 Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
@@ -16,7 +16,8 @@ Requires-Python: <3.13,>=3.11
 Requires-Dist: azure-identity<2.0.0,>=1.19.0
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
 Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
-Requires-Dist:
+Requires-Dist: delta-spark>=3.3.2
+Requires-Dist: fsspec<2025.7.1,>=2025.7.0
 Requires-Dist: httpx<1.0.0,>=0.27.2
 Requires-Dist: jinja2<4.0.0,>=3.1.4
 Requires-Dist: matplotlib<4.0.0,>=3.9.2
{cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/RECORD
CHANGED
@@ -38,7 +38,7 @@ cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZum
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=-FmWEJ1Oq1njSopjc0R7GmT64mLSmALkm8PkHNzy9Y8,327
 cloe_nessy/models/catalog.py,sha256=ayC1sMp4cNLAZtu0ICVV3Us6-o4hn8U9tpzzvxC9RAs,177
-cloe_nessy/models/column.py,sha256=
+cloe_nessy/models/column.py,sha256=8wR7E8PRhUc0dwM83IIlpz7kBncZim7J5FvQzd8R_Us,2012
 cloe_nessy/models/constraint.py,sha256=hsFlhn4n928z81O3dl3v5bMetewPWzMjkJK3_4kASSM,178
 cloe_nessy/models/foreign_key.py,sha256=DwRVHs9sShqqPV-NL7ow_3AmPPWX0Od26yZn_I565pU,1001
 cloe_nessy/models/schema.py,sha256=cNSrH7K4hLRrkg1E6fW6DUIBMZdR2A5B21POj5iQ4GA,3429
@@ -50,7 +50,7 @@ cloe_nessy/models/adapter/unity_catalog_adapter.py,sha256=a-14Ys-AevVYQd0xeJU1sy
 cloe_nessy/models/mixins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/models/mixins/read_instance_mixin.py,sha256=j5Y4aNWOh1jlskEaxNooZFJgPyxRmik00gAVLJnAaRs,4507
 cloe_nessy/models/mixins/template_loader_mixin.py,sha256=5MXhEGBFlq3dwZvINEyBowSlipNnVun2H_TmhI_fsS4,549
-cloe_nessy/models/templates/create_table.sql.j2,sha256=
+cloe_nessy/models/templates/create_table.sql.j2,sha256=z-NNUJ61wqMZyNaKGBsrj6gqogo6CtEaS1rWoa8hUbw,1877
 cloe_nessy/models/templates/create_volume.sql.j2,sha256=XIUf1cHcvAxcGTyhzUiv4xpQ1cfDw_ra3_FKmOuLoBs,289
 cloe_nessy/object_manager/__init__.py,sha256=3sle0vNpPwBOkycxA3XVS9m4XZf5LD3Qd4NGxdqcHno,186
 cloe_nessy/object_manager/table_manager.py,sha256=suHx56TYXagaJ2dVkvTP7vwSI4xgTqXNkHYBbYh2pd4,13913
@@ -71,6 +71,7 @@ cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=i8fQceV63eAqx_x0ANisCkX
 cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
 cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
 cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
+cloe_nessy/pipeline/actions/transform_convert_timestamp.py,sha256=je6H-mtNeokU9W_-RCWaRCFvMhk4oQL9s60FVBrl8Po,3090
 cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
 cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
 cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
@@ -78,12 +79,12 @@ cloe_nessy/pipeline/actions/transform_filter.py,sha256=Nz_ggRfKIcNzYFfFOsgq1Qeat
 cloe_nessy/pipeline/actions/transform_generic_sql.py,sha256=_naWfmPdYAUKjPNeHu5qJAohOL7DHCSYz_kwoeRv3OI,2741
 cloe_nessy/pipeline/actions/transform_group_aggregate.py,sha256=KUHeeP-RIDi34dpbsPEJkzea5zFJA6MuyjNpOsFud9o,4045
 cloe_nessy/pipeline/actions/transform_hash_columns.py,sha256=heRjBA-Gfu-nmNHOjTYlipEpKY8oNPAHAY40vjJk3aI,8383
-cloe_nessy/pipeline/actions/transform_join.py,sha256=
+cloe_nessy/pipeline/actions/transform_join.py,sha256=BjMn2h_Trq8l1n9R4QB55v1pAm0a9ft1vMLDBnHKG6g,4790
 cloe_nessy/pipeline/actions/transform_json_normalize.py,sha256=petF7pnNq1EKc8MqVdG0weFALAHNILSe_eAu4Z5XxIo,4833
 cloe_nessy/pipeline/actions/transform_rename_columns.py,sha256=4zJcPCONMU4C67qeuzsrX3AORRRHoq_selUI7FJyeg0,1952
 cloe_nessy/pipeline/actions/transform_replace_values.py,sha256=1OPHTrjcphfyGepcO7ozYfeqfwA18pjlyHpVKUS_AAU,2049
 cloe_nessy/pipeline/actions/transform_select_columns.py,sha256=-GhSEsb7iNnZIsYRm3BG9BX4_qUDJMbpj1DsKPY046w,4574
-cloe_nessy/pipeline/actions/transform_union.py,sha256=
+cloe_nessy/pipeline/actions/transform_union.py,sha256=SZtEzh567CIExUj9yMEgshE28h4dXKT7Wr2TDj4zB4k,2718
 cloe_nessy/pipeline/actions/write_catalog_table.py,sha256=j7gRuG3Fedh8JgevIFBbHKock3laJVq4l6Mx3CGU5eo,2676
 cloe_nessy/pipeline/actions/write_delta_append.py,sha256=fuL29SK9G5K14ycckU3iPexeK0XNXUfQscCwhXHxbKA,2498
 cloe_nessy/pipeline/actions/write_delta_merge.py,sha256=gh3oD0ZGjDq0hw56NiRimK4HHCruDofqqdzFFgYLve8,5085
@@ -94,6 +95,6 @@ cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEv
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.13.
-cloe_nessy-0.3.13.
-cloe_nessy-0.3.13.
+cloe_nessy-0.3.13.6b0.dist-info/METADATA,sha256=Dn1ZY7mxS1qLW4vJAcOF-aYA9crg-4W7iMPTitd2ogs,3328
+cloe_nessy-0.3.13.6b0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cloe_nessy-0.3.13.6b0.dist-info/RECORD,,
{cloe_nessy-0.3.13.4b0.dist-info → cloe_nessy-0.3.13.6b0.dist-info}/WHEEL
File without changes