cloe-nessy 0.3.13.3b0__py3-none-any.whl → 0.3.13.5b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -197,7 +197,9 @@ class DeltaMergeWriter(BaseDeltaWriter):
         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
 
         delta_table = self.table_manager.get_delta_table(
-            table=table, location=storage_path, spark=data_frame.sparkSession
+            table=table,
+            location=storage_path,
+            spark=data_frame.sparkSession,
         )
 
         match_conditions = self._build_match_conditions(data_frame, config)
@@ -5,6 +5,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
+    "decimal",
     "integer",
     "int",
     "smallint",
@@ -43,7 +44,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)
+        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>,\s]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
@@ -1,66 +1,94 @@
-import pathlib
+from pathlib import Path
 from typing import Any
 
-from ...models import Schema
+from ...models import Table
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
 
 class ReadMetadataYAMLAction(PipelineAction):
-    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+    """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
     Example:
-        ```yaml
-        Read Schema Metadata:
-            action: READ_METADATA_YAML_ACTION
-            options:
-                path: excel_file_folder/excel_files_june/
-                file_name: sales_schema.yml
-                table_name: sales
-        ```
+        === "Managed Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+            ```
+        === "External Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+                    storage_path: abfs://external_storage/sales_data/sales_table
+            ```
     """
 
     name: str = "READ_METADATA_YAML_ACTION"
 
-    @staticmethod
     def run(
+        self,
         context: PipelineContext,
         *,
-        path: str | None = None,
-        file_name: str | None = None,
-        table_name: str | None = None,
+        file_path: str | None = None,
+        catalog_name: str | None = None,
+        schema_name: str | None = None,
+        storage_path: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
+        """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
         Args:
             context: The context in which this Action is executed.
-            path: The path to the data contract directory.
-            file_name: The name of the file that defines the schema.
-            table_name: The name of the table for which to retrieve metadata.
+            file_path: The path to the file that defines the table.
+            catalog_name: The name of the catalog for the table.
+            schema_name: The name of the schema for the table.
+            storage_path: The storage path for the table, if applicable. If not
+                provided, the table will be considered a managed table.
 
         Raises:
-            ValueError: If any issues occur while reading the schema, such as an invalid schema,
-                missing file, or missing path.
+            ValueError: If any issues occur while reading the table metadata, such as an invalid table,
+                missing file, missing path, or missing catalog/schema names.
 
         Returns:
             The context after the execution of this Action, containing the table metadata.
         """
-        if not path:
-            raise ValueError("No path provided. Please specify path to schema metadata.")
-        if not file_name:
-            raise ValueError("No file_name provided. Please specify file name.")
-        if not table_name:
-            raise ValueError("No table_name provided. Please specify table name.")
+        missing_params = []
+        if not file_path:
+            missing_params.append("file_path")
+        if not catalog_name:
+            missing_params.append("catalog_name")
+        if not schema_name:
+            missing_params.append("schema_name")
 
-        path_obj = pathlib.Path(path)
+        if missing_params:
+            raise ValueError(
+                f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
+            )
 
-        schema, errors = Schema.read_instance_from_file(path_obj / file_name)
+        final_file_path = Path(file_path) if file_path else Path()
+
+        table, errors = Table.read_instance_from_file(
+            final_file_path,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+        )
         if errors:
-            raise ValueError(f"Errors while reading schema metadata: {errors}")
-        if not schema:
-            raise ValueError("No schema found in metadata.")
+            raise ValueError(f"Errors while reading table metadata: {errors}")
+        if not table:
+            raise ValueError("No table found in metadata.")
 
-        table = schema.get_table_by_name(table_name=table_name)
+        if not table.storage_path and storage_path:
+            self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
+            table.storage_path = storage_path
+            table.is_external = True
 
+        self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
         return context.from_existing(table_metadata=table)
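For orientation, a minimal sketch of invoking the reworked action from Python. The import path follows the module location listed in the RECORD below, the option values are the placeholders from the docstring example, and both the pre-existing `PipelineContext` named `context` and the `table_metadata` attribute on the returned context are assumptions inferred from the `from_existing(table_metadata=table)` call above, not confirmed API.

```python
from cloe_nessy.pipeline.actions.read_metadata_yaml import ReadMetadataYAMLAction

action = ReadMetadataYAMLAction()

# External-table case: storage_path marks the table as external when the
# YAML definition itself does not set a storage path.
context = action.run(
    context,  # assumed: an existing PipelineContext from the surrounding pipeline
    file_path="metadata/schemas/bronze/sales_table.yml",
    catalog_name="production",
    schema_name="sales_data",
    storage_path="abfs://external_storage/sales_data/sales_table",
)

# Assumed attribute: the keyword handed to from_existing() above.
table = context.table_metadata
```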
@@ -0,0 +1,87 @@
+from typing import Any
+
+from pyspark.errors.exceptions.base import IllegalArgumentException
+from pyspark.sql import functions as F
+
+from ...pipeline import PipelineAction, PipelineContext
+
+
+class TransformConvertTimestampAction(PipelineAction):
+    """This class implements a Transform action for an ETL pipeline.
+
+    This action performs timestamp based conversions.
+
+    Example:
+        ```yaml
+        Convert Timestamp:
+            action: TRANSFORM_CONVERT_TIMESTAMP
+            options:
+                column: my_timestamp_column
+                source_format: unixtime
+                target_format: yyyy-MM-dd HH:mm:ss
+        ```
+    """
+
+    name: str = "TRANSFORM_CONVERT_TIMESTAMP"
+
+    def run(
+        self,
+        context: PipelineContext,
+        *,
+        column: str = "",
+        source_format: str = "",
+        target_format: str = "",
+        **_: Any,
+    ) -> PipelineContext:
+        """Converts a column from a given source format to a new format.
+
+        Args:
+            context: Context in which this Action is executed.
+            column: The column that should be converted.
+            source_format: Initial format type of the column.
+            target_format: Desired format type of the column. This also supports
+                passing a format string like 'yyyy-MM-dd HH:mm:ss'.
+
+        Raises:
+            ValueError: If no column, source_format and target_format are provided.
+            ValueError: If source_format or target_format are not supported.
+
+        Returns:
+            PipelineContext: Context after the execution of this Action.
+        """
+        if not column:
+            raise ValueError("No column provided.")
+        if not source_format:
+            raise ValueError("No source_format provided.")
+        if not target_format:
+            raise ValueError("No target_format provided.")
+        if context.data is None:
+            raise ValueError("Context DataFrame is required.")
+        df = context.data
+
+        match source_format:
+            # convert always to timestamp first
+            case "unixtime":
+                df = df.withColumn(column, F.from_unixtime(F.col(column)))
+            case "unixtime_ms":
+                df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
+            case "string":
+                df = df.withColumn(column, F.to_timestamp(F.col(column)))
+            case "timestamp":
+                pass
+            case _:
+                raise ValueError(f"Unknown source_format {source_format}")
+
+        match target_format:
+            # convert from timestamp to desired output format
+            case "timestamp":
+                pass
+            case "unixtime":
+                df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
+            case _:
+                try:
+                    df = df.withColumn(column, F.date_format(F.col(column), target_format))
+                except IllegalArgumentException as e:
+                    raise ValueError(f"Invalid target_format {target_format}") from e
+
+        return context.from_existing(data=df)
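The action boils down to a two-step PySpark conversion: normalize the source column to a timestamp, then render it in the target format. Below is a standalone sketch of the same chain for the docstring's `unixtime` to `yyyy-MM-dd HH:mm:ss` case, using hypothetical sample data rather than anything from the package.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical sample data; the column name mirrors the docstring example.
df = spark.createDataFrame([(1700000000,)], ["my_timestamp_column"])

# source_format "unixtime": seconds since the epoch -> timestamp.
df = df.withColumn("my_timestamp_column", F.from_unixtime(F.col("my_timestamp_column")))

# target_format "yyyy-MM-dd HH:mm:ss": render the timestamp in the requested pattern.
df = df.withColumn("my_timestamp_column", F.date_format(F.col("my_timestamp_column"), "yyyy-MM-dd HH:mm:ss"))

df.show(truncate=False)
```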
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cloe-nessy
-Version: 0.3.13.3b0
+Version: 0.3.13.5b0
 Summary: Your friendly datalake monster.
 Project-URL: homepage, https://initions.com/
 Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
@@ -16,6 +16,7 @@ Requires-Python: <3.13,>=3.11
 Requires-Dist: azure-identity<2.0.0,>=1.19.0
 Requires-Dist: cloe-logging[databricks,log-analytics]<0.4,>=0.3.8
 Requires-Dist: databricks-sdk<1.0.0,>=0.36.0
+Requires-Dist: delta-spark>=3.3.2
 Requires-Dist: fsspec<2025.6.0,>=2025.5.1
 Requires-Dist: httpx<1.0.0,>=0.27.2
 Requires-Dist: jinja2<4.0.0,>=3.1.4
@@ -30,7 +30,7 @@ cloe_nessy/integration/writer/file_writer.py,sha256=SUDbN13ZzDhbM8DpOGFgM_Gkg70T
 cloe_nessy/integration/writer/writer.py,sha256=elFPLFrWR-qVE9qnBtzzzhyRALLQcRVuOsPS0rNmRt4,1741
 cloe_nessy/integration/writer/delta_writer/__init__.py,sha256=h2CT6Hllmk0nodlek27uqwniCzVZKMkYcPGyG9K2Z24,164
 cloe_nessy/integration/writer/delta_writer/delta_append_writer.py,sha256=TbpW-j87_H9dcUza34uR6VWslJez406y3_5N1ip0SnM,4740
-cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=3LQsrPNq9Xi71NLFqbB1Qk6tcheNPoQX3ngvCuIKwaw,10147
+cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py,sha256=zhqPIPfAJTzSLFgBUCwFesUW7CcF1zCPRU-N_8yYjok,10172
 cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py,sha256=kiacqQ2FYQSzakJqZ9-ZHH3os4X7--QuER_2xx9y21k,971
 cloe_nessy/integration/writer/delta_writer/delta_writer_base.py,sha256=upUtDZMzwYFU0kzmkelVgkpFToXkrypcR3h_jvGjz14,8596
 cloe_nessy/integration/writer/delta_writer/exceptions.py,sha256=xPmGiYV0xQXauln5Oh34E5vbm0rVcs6xCh-SJSb2bw0,107
@@ -38,7 +38,7 @@ cloe_nessy/logging/__init__.py,sha256=ySVCVbdyR3Dno_tl2ZfiER_7EVaDoQMHVkNyfdMZum
 cloe_nessy/logging/logger_mixin.py,sha256=9iy7BF6drYme-f7Rrt_imbVBRgVqQ89xjcP1X5aMtfY,7467
 cloe_nessy/models/__init__.py,sha256=-FmWEJ1Oq1njSopjc0R7GmT64mLSmALkm8PkHNzy9Y8,327
 cloe_nessy/models/catalog.py,sha256=ayC1sMp4cNLAZtu0ICVV3Us6-o4hn8U9tpzzvxC9RAs,177
-cloe_nessy/models/column.py,sha256=53fBwRnino72XKACsHZpN9QfCBqqSXyKLHZlM0huumg,1988
+cloe_nessy/models/column.py,sha256=8wR7E8PRhUc0dwM83IIlpz7kBncZim7J5FvQzd8R_Us,2012
 cloe_nessy/models/constraint.py,sha256=hsFlhn4n928z81O3dl3v5bMetewPWzMjkJK3_4kASSM,178
 cloe_nessy/models/foreign_key.py,sha256=DwRVHs9sShqqPV-NL7ow_3AmPPWX0Od26yZn_I565pU,1001
 cloe_nessy/models/schema.py,sha256=cNSrH7K4hLRrkg1E6fW6DUIBMZdR2A5B21POj5iQ4GA,3429
@@ -67,10 +67,11 @@ cloe_nessy/pipeline/actions/read_api.py,sha256=RBv5XeHtjTXuCP09Fqo6JNx6iIhQQI-nu
 cloe_nessy/pipeline/actions/read_catalog_table.py,sha256=oXbqbc6BfR82dSIGclwzWiTN8EVmpFjNIYLKm4qOU50,2754
 cloe_nessy/pipeline/actions/read_excel.py,sha256=Mhl3r_2Hqk2XN7Fl5WqqAyE4JdnwSiivbhWMglyBtkE,7961
 cloe_nessy/pipeline/actions/read_files.py,sha256=N9bFgtG1tovhp2JayxE5YiN9PiO2lgG2-6h_Y6tD2eU,5220
-cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=3ZDy9qiDYtM1oDQzHPC23hLOvHjhdk5zg1wVHE60m9k,2295
+cloe_nessy/pipeline/actions/read_metadata_yaml.py,sha256=i8fQceV63eAqx_x0ANisCkXWfMHyhqsfFHVFH5yP2po,3544
 cloe_nessy/pipeline/actions/transform_change_datatype.py,sha256=24Tn6R3TvUkWCh8V6naLdyNbCbqvyPOOoer-hy_Ebq4,2077
 cloe_nessy/pipeline/actions/transform_clean_column_names.py,sha256=-CEdcXb7Fz5DQNitGlJ8EVBE_LzxfsInyCIO-D7b4iY,3042
 cloe_nessy/pipeline/actions/transform_concat_columns.py,sha256=Nk8YbhxDnFZsWzW9Dj5Yl76Uq6VrcMlevQPHGms65L8,3777
+cloe_nessy/pipeline/actions/transform_convert_timestamp.py,sha256=je6H-mtNeokU9W_-RCWaRCFvMhk4oQL9s60FVBrl8Po,3090
 cloe_nessy/pipeline/actions/transform_decode.py,sha256=JajMwHREtxa8u_1Q3RZDBVMjncoSel-WzQFVTO0MREg,4455
 cloe_nessy/pipeline/actions/transform_deduplication.py,sha256=E0ypz9qkHMSatNfnHekP-E6svQVL149M4PV02M03drg,5099
 cloe_nessy/pipeline/actions/transform_distinct.py,sha256=c7aBxANyqT4aKhm0cSELDtD-bP0Se9vxlBF0K4AgQWs,1976
@@ -94,6 +95,6 @@ cloe_nessy/settings/__init__.py,sha256=ZbkneO3WaKOxon7qHFHnou7EnBOSnBFyKMDZblIEv
 cloe_nessy/settings/settings.py,sha256=I4n129lrujriW-d8q4as2Kb4_kI932ModfZ5Ow_UpVM,3653
 cloe_nessy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 cloe_nessy/utils/file_and_directory_handler.py,sha256=r2EVt9xG81p6ScaJCwETC5an6pMT6WseB0jMOR-JlpU,602
-cloe_nessy-0.3.13.3b0.dist-info/METADATA,sha256=D5hiAAFKj9BZ6RA5SR9dJ7E4BFQhuNH7AXIT-ELLpP0,3294
-cloe_nessy-0.3.13.3b0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-cloe_nessy-0.3.13.3b0.dist-info/RECORD,,
+cloe_nessy-0.3.13.5b0.dist-info/METADATA,sha256=PBFKdmm5_n8bAarqbddj81pIjctNxHIgQPw72Lru01M,3328
+cloe_nessy-0.3.13.5b0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cloe_nessy-0.3.13.5b0.dist-info/RECORD,,