cloe-nessy 0.3.16.6b0__py3-none-any.whl → 0.3.17.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (32)
  1. cloe_nessy/integration/delta_loader/__init__.py +14 -0
  2. cloe_nessy/integration/delta_loader/delta_load_options.py +37 -0
  3. cloe_nessy/integration/delta_loader/delta_loader.py +165 -0
  4. cloe_nessy/integration/delta_loader/delta_loader_factory.py +53 -0
  5. cloe_nessy/integration/delta_loader/delta_loader_metadata_table.py +68 -0
  6. cloe_nessy/integration/delta_loader/strategies/__init__.py +9 -0
  7. cloe_nessy/integration/delta_loader/strategies/delta_cdf_loader.py +361 -0
  8. cloe_nessy/integration/delta_loader/strategies/delta_timestamp_loader.py +163 -0
  9. cloe_nessy/integration/reader/catalog_reader.py +33 -6
  10. cloe_nessy/integration/reader/file_reader.py +23 -0
  11. cloe_nessy/integration/writer/delta_writer/delta_table_operation_type.py +1 -1
  12. cloe_nessy/logging/logger_mixin.py +0 -1
  13. cloe_nessy/models/column.py +1 -1
  14. cloe_nessy/models/table.py +4 -3
  15. cloe_nessy/object_manager/table_manager.py +3 -1
  16. cloe_nessy/pipeline/actions/__init__.py +4 -0
  17. cloe_nessy/pipeline/actions/read_catalog_table.py +36 -3
  18. cloe_nessy/pipeline/actions/read_files.py +45 -3
  19. cloe_nessy/pipeline/actions/transform_convert_timestamp.py +97 -0
  20. cloe_nessy/pipeline/actions/transform_deduplication.py +7 -12
  21. cloe_nessy/pipeline/actions/transform_hash_columns.py +7 -7
  22. cloe_nessy/pipeline/actions/write_catalog_table.py +5 -0
  23. cloe_nessy/pipeline/actions/write_delta_append.py +15 -0
  24. cloe_nessy/pipeline/actions/write_delta_merge.py +23 -0
  25. cloe_nessy/pipeline/actions/write_file.py +6 -1
  26. cloe_nessy/pipeline/utils/__init__.py +5 -0
  27. cloe_nessy/pipeline/utils/delta_load_utils.py +36 -0
  28. cloe_nessy/utils/column_names.py +9 -0
  29. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/METADATA +3 -3
  30. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/RECORD +32 -20
  31. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/WHEEL +0 -0
  32. {cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/top_level.txt +0 -0

cloe_nessy/models/table.py

@@ -2,7 +2,6 @@ from pathlib import Path
  from typing import Any, Self

  import yaml
- import yaml.scanner
  from jinja2 import TemplateNotFound
  from pydantic import (
  Field,
@@ -11,6 +10,8 @@ from pydantic import (
  field_validator,
  model_validator,
  )
+ from yaml.parser import ParserError
+ from yaml.scanner import ScannerError

  from ..logging import LoggerMixin
  from ..utils.file_and_directory_handler import process_path
@@ -225,8 +226,8 @@ class Table(TemplateLoaderMixin, ReadInstancesMixin, LoggerMixin):
  errors += sub_errors
  except (
  ValidationError,
- yaml.parser.ParserError,
- yaml.scanner.ScannerError,
+ ParserError,
+ ScannerError,
  ) as e:
  instance = None
  errors.append(e)
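
For context, the explicit imports make the exception classes directly addressable instead of relying on `yaml.parser`/`yaml.scanner` being reachable as attributes of the top-level `yaml` module. A minimal standalone sketch of the same error-handling pattern (the YAML snippet is illustrative):

```python
import yaml
from yaml.parser import ParserError
from yaml.scanner import ScannerError

# Deliberately malformed YAML: the flow sequence is never closed.
raw = "name: [unclosed"

try:
    data = yaml.safe_load(raw)
except (ParserError, ScannerError) as exc:
    # ScannerError covers tokenization problems, ParserError covers grammar
    # problems; catching both mirrors the error-accumulation loop in Table.
    data = None
    print(f"YAML could not be parsed: {exc}")
```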

cloe_nessy/object_manager/table_manager.py

@@ -198,7 +198,9 @@ class TableManager(LoggerMixin):
  ValueError: If neither table nor location is provided, or if both are provided.
  """
  if (table is None and location is None) or (table is not None and location is not None):
- raise ValueError("Either table or location must be provided, but not both.")
+ raise ValueError(
+ f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
+ )

  if table is not None:
  location = str(table.storage_path)

cloe_nessy/pipeline/actions/__init__.py

@@ -9,7 +9,9 @@ from .read_metadata_yaml import ReadMetadataYAMLAction
  from .transform_change_datatype import TransformChangeDatatypeAction
  from .transform_clean_column_names import TransformCleanColumnNamesAction
  from .transform_concat_columns import TransformConcatColumnsAction
+ from .transform_convert_timestamp import TransformConvertTimestampAction
  from .transform_decode import TransformDecodeAction
+ from .transform_deduplication import TransformDeduplication
  from .transform_distinct import TransformDistinctAction
  from .transform_filter import TransformFilterAction
  from .transform_generic_sql import TransformSqlAction
@@ -45,7 +47,9 @@ __all__ = [
  "TransformChangeDatatypeAction",
  "TransformCleanColumnNamesAction",
  "TransformConcatColumnsAction",
+ "TransformConvertTimestampAction",
  "TransformDecodeAction",
+ "TransformDeduplication",
  "TransformDistinctAction",
  "TransformSqlAction",
  "TransformGroupAggregate",

cloe_nessy/pipeline/actions/read_catalog_table.py

@@ -1,8 +1,10 @@
  from typing import Any

+ from ...integration.delta_loader import DeltaLoadOptions
  from ...integration.reader import CatalogReader
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext
+ from ..utils import set_delta_load_info


  class ReadCatalogTableAction(PipelineAction):
@@ -21,6 +23,12 @@ class ReadCatalogTableAction(PipelineAction):
  options:
  table_identifier: my_catalog.business_schema.sales_table
  options: <options for the CatalogReader read method>
+ delta_load_options:
+ strategy: CDF
+ delta_load_identifier: my_delta_load_id
+ strategy_options:
+ deduplication_columns: ["id"]
+ enable_full_load: true
  ```
  """
@@ -32,6 +40,7 @@ class ReadCatalogTableAction(PipelineAction):
  *,
  table_identifier: str | None = None,
  options: dict[str, str] | None = None,
+ delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
  **_: Any,  # define kwargs to match the base class signature
  ) -> PipelineContext:
  """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
@@ -45,6 +54,8 @@ class ReadCatalogTableAction(PipelineAction):
  options: A dictionary of options for customizing
  the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
  behavior, such as filters or reading modes. Defaults to None.
+ delta_load_options: Options for delta loading, if applicable.
+ Configures the [`DeltaLoader`][cloe_nessy.integration.delta_loader].

  Raises:
  ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -53,13 +64,35 @@ class ReadCatalogTableAction(PipelineAction):
  An updated pipeline context containing the data read from the catalog table as a DataFrame.
  """
  if not options:
- options = dict()
+ options = {}
+
+ if not delta_load_options:
+ delta_load_options = {}

  if (table_metadata := context.table_metadata) and table_identifier is None:
  table_identifier = table_metadata.identifier
  if table_identifier is None:
  raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")

+ if isinstance(delta_load_options, dict):
+ delta_options_dict = delta_load_options
+ if delta_load_options:
+ delta_load_options = DeltaLoadOptions(**delta_load_options)
+ else:
+ delta_load_options = None
+ else:
+ delta_options_dict = delta_load_options.model_dump() if delta_load_options else {}
+
+ runtime_info = set_delta_load_info(
+ table_identifier=table_identifier,
+ delta_load_options=delta_options_dict,
+ runtime_info=context.runtime_info or {},
+ )
+
  table_reader = CatalogReader()
- df = table_reader.read(table_identifier=table_identifier, options=options)
- return context.from_existing(data=df)
+ df = table_reader.read(
+ table_identifier=table_identifier,
+ options=options,
+ delta_load_options=delta_load_options,
+ )
+ return context.from_existing(data=df, runtime_info=runtime_info)
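
The normalization above accepts `delta_load_options` either as a plain dict (as it arrives from a YAML pipeline definition) or as an already-built `DeltaLoadOptions` model. A rough sketch of the dict path, assuming `DeltaLoadOptions` is a pydantic model with the fields shown in the docstring example:

```python
from cloe_nessy.integration.delta_loader import DeltaLoadOptions

# Field values mirror the YAML example above; identifiers are illustrative.
raw_options = {
    "strategy": "CDF",
    "delta_load_identifier": "my_delta_load_id",
    "strategy_options": {"deduplication_columns": ["id"], "enable_full_load": True},
}

# Dict input is validated into the model before it reaches the CatalogReader ...
options = DeltaLoadOptions(**raw_options)

# ... while a plain-dict copy (model_dump) is what gets stored in runtime_info,
# so later writer actions can inspect it without importing the model.
stored = options.model_dump()
```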

cloe_nessy/pipeline/actions/read_files.py

@@ -1,8 +1,10 @@
  from typing import Any

+ from ...integration.delta_loader import DeltaLoadOptions
  from ...integration.reader import FileReader
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext
+ from ..utils import set_delta_load_info


  class ReadFilesAction(PipelineAction):
@@ -55,6 +57,24 @@ class ReadFilesAction(PipelineAction):
  Use the `extension` option to specify the extension of the files
  to read. Additionally, use the `spark_format` option to specify
  the format with which to read the files.
+
+ === "Read Delta Lake table with delta loading"
+ ```yaml
+ Read Delta Files:
+ action: READ_FILES
+ options:
+ location: /path/to/delta/table
+ spark_format: delta
+ delta_load_options:
+ strategy: CDF
+ delta_load_identifier: my_delta_files_load
+ strategy_options:
+ deduplication_columns: ["id"]
+ enable_full_load: false
+ ```
+ !!! note "Delta Loading for Files"
+ Use `delta_load_options` when reading Delta Lake tables to enable
+ incremental loading. This works with both CDF and timestamp strategies.
  """

  name: str = "READ_FILES"
@@ -70,6 +90,7 @@ class ReadFilesAction(PipelineAction):
  schema: str | None = None,
  add_metadata_column: bool = True,
  options: dict[str, str] | None = None,
+ delta_load_options: dict[Any, Any] | DeltaLoadOptions | None = None,
  **_: Any,
  ) -> PipelineContext:
  """Reads files from a specified location.
@@ -87,6 +108,8 @@ class ReadFilesAction(PipelineAction):
  add_metadata_column: Whether to include the `__metadata` column with
  file metadata in the DataFrame.
  options: Additional options passed to the reader.
+ delta_load_options: Options for delta loading, if applicable. When provided
+ for Delta format files, enables incremental loading using delta loader strategies.

  Raises:
  ValueError: If neither `extension` nor `spark_format` are provided, or if
@@ -105,6 +128,25 @@ class ReadFilesAction(PipelineAction):
  if (metadata := context.table_metadata) and schema is None:
  schema = metadata.schema

+ # Convert dict to DeltaLoadOptions if needed
+ if isinstance(delta_load_options, dict):
+ delta_load_options = DeltaLoadOptions(**delta_load_options)
+
+ # Set up runtime info for delta loading
+ runtime_info = context.runtime_info or {}
+ if delta_load_options:
+ # Convert DeltaLoadOptions to dict for runtime info storage
+ delta_options_dict = (
+ delta_load_options.model_dump()
+ if isinstance(delta_load_options, DeltaLoadOptions)
+ else delta_load_options
+ )
+ runtime_info = set_delta_load_info(
+ table_identifier=location,  # Use location as identifier for file-based delta loading
+ delta_load_options=delta_options_dict,
+ runtime_info=runtime_info,
+ )
+
  file_reader = FileReader()
  df = file_reader.read(
  location=location,
@@ -114,11 +156,11 @@ class ReadFilesAction(PipelineAction):
  search_subdirs=search_subdirs,
  options=options,
  add_metadata_column=add_metadata_column,
+ delta_load_options=delta_load_options,
  )

- runtime_info = context.runtime_info
-
- if add_metadata_column:
+ # Only process metadata column if it exists and wasn't using delta loading
+ if add_metadata_column and "__metadata" in df.columns:
  read_files_list = [x.file_path for x in df.select("__metadata.file_path").drop_duplicates().collect()]
  if runtime_info is None:
  runtime_info = {"read_files": read_files_list}

cloe_nessy/pipeline/actions/transform_convert_timestamp.py

@@ -0,0 +1,97 @@
+ from typing import Any
+
+ from pyspark.errors.exceptions.connect import IllegalArgumentException
+ from pyspark.sql import functions as F
+ from pyspark.sql.utils import AnalysisException
+
+ from ..pipeline_action import PipelineAction
+ from ..pipeline_context import PipelineContext
+
+
+ class TransformConvertTimestampAction(PipelineAction):
+ """This action performs timestamp based conversions.
+
+ Example:
+ ```yaml
+ Convert Timestamp:
+ action: TRANSFORM_CONVERT_TIMESTAMP
+ options:
+ columns:
+ - date
+ - creation_timestamp
+ - current_ts
+ source_format: unixtime_ms
+ target_format: timestamp
+ ```
+ """
+
+ name: str = "TRANSFORM_CONVERT_TIMESTAMP"
+
+ def run(
+ self,
+ context: PipelineContext,
+ *,
+ columns: list[str] | str | None = None,
+ source_format: str = "",
+ target_format: str = "",
+ **_: Any,
+ ) -> PipelineContext:
+ """Converts column(s) from a given source format to a new format.
+
+ Args:
+ context: Context in which this Action is executed.
+ columns: A column name or a list of column names that should be converted.
+ source_format: Initial format type of the column.
+ target_format: Desired format type of the column.
+ This also supports passing a format string like `yyyy-MM-dd HH:mm:ss`.
+
+ Raises:
+ ValueError: If no column, source_format or target_format are provided.
+ ValueError: If source_format or target_format are not supported.
+
+ Returns:
+ PipelineContext: Context after the execution of this Action.
+ """
+ if not columns:
+ raise ValueError("No column names provided.")
+ if not source_format:
+ raise ValueError("No source_format provided.")
+ if not target_format:
+ raise ValueError("No target_format provided.")
+ if context.data is None:
+ raise ValueError("Context DataFrame is required.")
+ df = context.data
+
+ columns = [columns] if isinstance(columns, str) else columns
+
+ match source_format:
+ # convert always to timestamp first
+ case "string" | "date" | "unixtime":
+ for column in columns:
+ df = df.withColumn(column, F.to_timestamp(F.col(column)))
+ case "unixtime_ms":
+ for column in columns:
+ df = df.withColumn(column, F.to_timestamp(F.col(column) / 1000))
+ case "timestamp":
+ pass
+ case _:
+ raise ValueError(f"Unknown source_format {source_format}")
+
+ match target_format:
+ # convert from timestamp to desired output type and format
+ case "timestamp":
+ pass
+ case "unixtime":
+ for column in columns:
+ df = df.withColumn(column, F.to_unix_timestamp(F.col(column)))
+ case "date":
+ for column in columns:
+ df = df.withColumn(column, F.to_date(F.col(column)))
+ case _:
+ try:
+ for column in columns:
+ df = df.withColumn(column, F.date_format(F.col(column), target_format))
+ except (IllegalArgumentException, AnalysisException) as e:
+ raise ValueError(f"Invalid target_format {target_format}") from e
+
+ return context.from_existing(data=df)
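
The conversions above reduce to standard PySpark functions: the `unixtime_ms` source branch divides by 1000 and casts via `to_timestamp`, and any unrecognized `target_format` string is handed to `date_format`. A small standalone sketch (Spark session and column names are illustrative):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# One row with milliseconds since the epoch, as the unixtime_ms case expects.
df = spark.createDataFrame([(1700000000123,)], ["creation_timestamp"])

# unixtime_ms -> timestamp: divide by 1000, then cast with to_timestamp.
df = df.withColumn("creation_timestamp", F.to_timestamp(F.col("creation_timestamp") / 1000))

# A custom target_format such as "yyyy-MM-dd HH:mm:ss" falls through to date_format.
df = df.withColumn("creation_ts_str", F.date_format("creation_timestamp", "yyyy-MM-dd HH:mm:ss"))
df.show(truncate=False)
```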

cloe_nessy/pipeline/actions/transform_deduplication.py

@@ -1,11 +1,10 @@
- import random
- import string
  from typing import Any

  import pyspark.sql.functions as F
  import pyspark.sql.types as T
  from pyspark.sql import Window

+ from ...utils.column_names import generate_unique_column_name
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext

@@ -105,18 +104,14 @@ class TransformDeduplication(PipelineAction):
  else:
  order_by_list = [F.col(col_name).asc() for col_name in order_by_columns]

- # create the window specification
- window_specification = Window.partitionBy(key_columns).orderBy(order_by_list)
-
- # generate a column name that is not in the input dataframe
- def generate_random_string(length):
- return "".join(random.choice(string.ascii_uppercase) for _ in range(length))
+ window_specification = (
+ Window.partitionBy(key_columns)
+ .orderBy(order_by_list)
+ .rowsBetween(Window.unboundedPreceding, Window.currentRow)
+ )

- row_number_col_name = generate_random_string(20)
- while row_number_col_name in context.data.columns:
- row_number_col_name = generate_random_string(20)
+ row_number_col_name = generate_unique_column_name(existing_columns=set(context.data.columns), prefix="row_num")

- # drop the duplicates
  df = (
  context.data.withColumn(row_number_col_name, F.row_number().over(window_specification))
  .filter(F.col(row_number_col_name) == 1)
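
The refactor swaps the ad-hoc random column name for `generate_unique_column_name` and pins the window frame to the one `row_number` requires. The keep-first-row-per-key pattern it implements, as a standalone sketch with illustrative data and column names:

```python
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "2024-01-01", "old"), (1, "2024-02-01", "new"), (2, "2024-01-15", "only")],
    ["id", "updated_at", "payload"],
)

# Rank rows per key (newest first) and keep only the top-ranked row per id.
win = (
    Window.partitionBy("id")
    .orderBy(F.col("updated_at").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
deduped = (
    df.withColumn("_row_num", F.row_number().over(win))
    .filter(F.col("_row_num") == 1)
    .drop("_row_num")
)
deduped.show()
```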

cloe_nessy/pipeline/actions/transform_hash_columns.py

@@ -132,13 +132,13 @@ class TransformHashColumnsAction(PipelineAction):
  action: TRANSFORM_HASH_COLUMNS
  options:
  hash_config:
- - hashed_column1:
- columns: ["column1", "column2"]
- algorithm: "sha2"
- bits: 224
- - hashed_column2:
- columns: ["column1"]
- algorithm: "crc32"
+ hashed_column1:
+ columns: ["column1", "column2"]
+ algorithm: "sha2"
+ bits: 224
+ hashed_column2:
+ columns: ["column1"]
+ algorithm: "crc32"
  ```

  Given a DataFrame `df` with the following structure:

cloe_nessy/pipeline/actions/write_catalog_table.py

@@ -1,5 +1,6 @@
  from typing import Any

+ from ...integration.delta_loader import consume_delta_load
  from ...integration.writer import CatalogWriter
  from ..pipeline_action import PipelineAction
  from ..pipeline_context import PipelineContext
@@ -63,6 +64,10 @@ class WriteCatalogTableAction(PipelineAction):
  if table_identifier is None:
  raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")

+ runtime_info = getattr(context, "runtime_info", None)
+ if runtime_info and runtime_info.get("is_delta_load"):
+ consume_delta_load(runtime_info)
+
  writer = CatalogWriter()
  writer.write_table(
  df=context.data,  # type: ignore

cloe_nessy/pipeline/actions/write_delta_append.py

@@ -1,5 +1,6 @@
  from typing import Any

+ from ...integration.delta_loader import consume_delta_load
  from ...integration.writer import DeltaAppendWriter
  from ...models.adapter import UnityCatalogAdapter
  from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,15 @@ class WriteDeltaAppendAction(PipelineAction):

  The WriteDeltaAppendAction appends a Dataframe to Delta Table.

+ Example:
+ ```yaml
+ Write Delta Append:
+ action: WRITE_DELTA_APPEND
+ options:
+ table_identifier: my_catalog.my_schema.my_table
+ ignore_empty_df: false
+ ```
+
  Returns:
  None.
  """
@@ -66,4 +76,9 @@ class WriteDeltaAppendAction(PipelineAction):
  ignore_empty_df=ignore_empty_df,
  options=options,
  )
+
+ runtime_info = getattr(context, "runtime_info", None)
+ if runtime_info and runtime_info.get("is_delta_load"):
+ consume_delta_load(runtime_info)
+
  return context.from_existing()

cloe_nessy/pipeline/actions/write_delta_merge.py

@@ -1,5 +1,6 @@
  from typing import Any

+ from ...integration.delta_loader import consume_delta_load
  from ...integration.writer import DeltaMergeWriter
  from ...models.adapter import UnityCatalogAdapter
  from ...pipeline import PipelineAction, PipelineContext
@@ -10,6 +11,24 @@ class WriteDeltaMergeAction(PipelineAction):

  The MergeIntoDeltaAction merges a Dataframe to Delta Table.

+ Example:
+ ```yaml
+ Write Delta Merge:
+ action: WRITE_DELTA_MERGE
+ options:
+ table_identifier: my_catalog.my_schema.my_table
+ key_columns:
+ - id
+ - customer_id
+ cols_to_update:
+ - name
+ - email
+ - updated_at
+ when_matched_update: true
+ when_not_matched_insert: true
+ use_partition_pruning: true
+ ```
+
  Returns:
  None.
  """
@@ -112,6 +131,10 @@ class WriteDeltaMergeAction(PipelineAction):
  ignore_empty_df=ignore_empty_df,
  )

+ runtime_info = getattr(context, "runtime_info", None)
+ if runtime_info and runtime_info.get("is_delta_load"):
+ consume_delta_load(runtime_info)
+
  if refresh_table:
  delta_merge_writer.table_manager.refresh_table(table_identifier=context.table_metadata.identifier)

cloe_nessy/pipeline/actions/write_file.py

@@ -1,5 +1,6 @@
  from typing import Any

+ from ...integration.delta_loader import consume_delta_load
  from ...integration.writer import FileWriter
  from ...pipeline import PipelineAction, PipelineContext

@@ -21,7 +22,7 @@ class WriteFileAction(PipelineAction):
  mode: "append"
  is_stream: False
  options:
- mergeSchema: "true"
+ mergeSchema: true
  ```
  """

@@ -91,4 +92,8 @@ class WriteFileAction(PipelineAction):
  options=options,
  )

+ runtime_info = getattr(context, "runtime_info", None)
+ if runtime_info and runtime_info.get("is_delta_load"):
+ consume_delta_load(runtime_info)
+
  return context.from_existing()

cloe_nessy/pipeline/utils/__init__.py

@@ -0,0 +1,5 @@
+ """Pipeline utility modules."""
+
+ from .delta_load_utils import set_delta_load_info
+
+ __all__ = ["set_delta_load_info"]

cloe_nessy/pipeline/utils/delta_load_utils.py

@@ -0,0 +1,36 @@
+ """Utilities for managing delta load information in pipeline runtime context."""
+
+ from typing import Any
+
+
+ def set_delta_load_info(
+ table_identifier: str,
+ delta_load_options: dict[str, Any],
+ runtime_info: dict[str, Any],
+ ) -> dict[str, Any]:
+ """Update the runtime information dictionary with delta load options for a specific table.
+
+ If delta load options are provided, this function marks the runtime as a delta load and
+ stores the options under the given table identifier within the 'delta_load_options' key
+ of the runtime_info dictionary.
+
+ The method uses `setdefault("delta_load_options", {})` to ensure that the 'delta_load_options'
+ key exists in the runtime_info dictionary. If the key is not present, it initializes it with
+ an empty dictionary. This prevents overwriting existing delta load options and allows
+ multiple tables' options to be stored without losing previous entries.
+
+ Args:
+ table_identifier: The identifier for the table (can be table name or file path).
+ delta_load_options: Options specific to the delta load for the table.
+ runtime_info: The runtime information dictionary to update.
+
+ Returns:
+ The updated runtime information dictionary with delta load details.
+ """
+ if not delta_load_options:
+ return runtime_info
+
+ runtime_info["is_delta_load"] = True
+ runtime_info.setdefault("delta_load_options", {})[table_identifier] = delta_load_options
+
+ return runtime_info
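
Because options are keyed by table identifier, several read actions in one pipeline can register their delta load settings without overwriting each other. A short usage sketch (identifiers and option values are illustrative):

```python
from cloe_nessy.pipeline.utils import set_delta_load_info

runtime_info: dict = {}

# A catalog read and a file-based read register their own options.
runtime_info = set_delta_load_info(
    table_identifier="my_catalog.sales.orders",
    delta_load_options={"strategy": "CDF", "delta_load_identifier": "orders_load"},
    runtime_info=runtime_info,
)
runtime_info = set_delta_load_info(
    table_identifier="/path/to/delta/table",  # file reads use the location as the key
    delta_load_options={"strategy": "CDF", "delta_load_identifier": "files_load"},
    runtime_info=runtime_info,
)

# runtime_info is now roughly:
# {"is_delta_load": True,
#  "delta_load_options": {"my_catalog.sales.orders": {...}, "/path/to/delta/table": {...}}}
```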

cloe_nessy/utils/column_names.py

@@ -0,0 +1,9 @@
+ import uuid
+
+
+ def generate_unique_column_name(existing_columns: set[str], prefix: str = "temp_col") -> str:
+ """Generate a unique column name that doesn't conflict with existing columns."""
+ base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+ while base_name in existing_columns:
+ base_name = f"{prefix}_{uuid.uuid4().hex[:8]}"
+ return base_name
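
A quick usage sketch of the helper as the deduplication action now uses it (the existing column names are illustrative):

```python
from cloe_nessy.utils.column_names import generate_unique_column_name

existing = {"id", "payload", "row_num_1a2b3c4d"}

# The 8-hex-character uuid suffix makes collisions unlikely; the while-loop
# in the helper re-rolls in the rare case one occurs.
tmp_col = generate_unique_column_name(existing_columns=existing, prefix="row_num")
assert tmp_col not in existing
print(tmp_col)  # e.g. "row_num_9f3e27ab"
```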

{cloe_nessy-0.3.16.6b0.dist-info → cloe_nessy-0.3.17.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: cloe-nessy
- Version: 0.3.16.6b0
+ Version: 0.3.17.0
  Summary: Your friendly datalake monster.
  Author-email: initions <ICSMC_EXT_PYPIORG@accenture.com>
  License: MIT
@@ -58,12 +58,12 @@ Extract-Transform-Load (ETL) Workflow.

  When you are contributing, please refer to our Contribution Guide in the *nessy*
  Docs
- [here](https://white-rock-0cabbc003.1.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!
+ [here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/Developer-Guide/)!

  ## Usage

  Please find the User Guide
- [here](https://white-rock-0cabbc003.1.azurestaticapps.net/tool_docs/nessy/User-Guide/)!
+ [here](https://yellow-mud-0b9177e03.2.azurestaticapps.net/tool_docs/nessy/User-Guide/)!

  ## Contact