cloe-nessy 0.3.16.3b0__py3-none-any.whl → 0.3.16.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -50,39 +50,3 @@ class CatalogReader(BaseReader):
             raise ReadOperationFailedError(
                 f"An error occurred while reading the table '{table_identifier}': {err}"
             ) from err
-
-    def read_stream(
-        self, table_identifier: str = "", *, options: dict[str, str] | None = None, **kwargs: Any
-    ) -> DataFrame:
-        """Reads a streaming table from the Unity Catalog.
-
-        Args:
-            table_identifier: The table identifier in the Unity Catalog in the format 'catalog.schema.table'.
-            options: PySpark options for the read stream operation.
-            **kwargs: Additional keyword arguments to maintain compatibility with the base class method.
-
-        Returns:
-            The Spark Streaming DataFrame containing the read data.
-
-        Raises:
-            ValueError: If the table_identifier is not provided, is not a string, or is not in the correct format.
-            Exception: For any other unexpected errors during streaming read operation.
-        """
-        if options is None:
-            options = {}
-        if not table_identifier:
-            raise ValueError("table_identifier is required")
-        if not isinstance(table_identifier, str):
-            raise ValueError("table_identifier must be a string")
-        if len(table_identifier.split(".")) != 3:
-            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
-
-        try:
-            df = self._spark.readStream.table(table_identifier, **options)
-            return df
-        except AnalysisException as err:
-            raise ValueError(f"Table not found or not streamable: {table_identifier}") from err
-        except Exception as err:
-            raise ReadOperationFailedError(
-                f"An error occurred while reading the stream from table '{table_identifier}': {err}"
-            ) from err
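
The streaming read path is removed from `CatalogReader` in this release. For callers that relied on `read_stream`, here is a minimal sketch of the equivalent call made directly with PySpark; it is not part of the package, and `spark` and the table identifier are placeholders.

```python
from pyspark.sql import DataFrame, SparkSession


def read_stream_table(spark: SparkSession, table_identifier: str, options: dict[str, str] | None = None) -> DataFrame:
    """Return a streaming DataFrame for a Unity Catalog table, mirroring the removed method."""
    if len(table_identifier.split(".")) != 3:
        raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
    # Plain PySpark equivalent of the removed readStream call.
    return spark.readStream.options(**(options or {})).table(table_identifier)
```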
@@ -5,7 +5,7 @@ class CatalogWriter:
     """A writer for Catalog tables."""
 
     @staticmethod
-    def write(
+    def write_table(
         df: DataFrame | None,
         table_identifier: str | None,
         partition_by: str | list[str] | None = None,
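
Only the method name changes here; the arguments stay the same. A hedged migration sketch for call sites follows; the import path and identifiers are assumptions, since they are not shown in this diff.

```python
from pyspark.sql import DataFrame

from cloe_nessy.integration.writer import CatalogWriter  # import path assumed


def save_sales(df: DataFrame) -> None:
    # Previously CatalogWriter.write(...); arguments are unchanged by the rename.
    CatalogWriter.write_table(
        df=df,
        table_identifier="my_catalog.my_schema.sales",  # placeholder identifier
        partition_by=["year", "month"],  # optional, as in the signature above
    )
```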
@@ -46,65 +46,3 @@ class CatalogWriter:
         if options is None:
             options = {}
         df.write.saveAsTable(table_identifier, mode=mode, partitionBy=partition_by, **options)
-
-    @staticmethod
-    def write_stream(
-        df: DataFrame | None,
-        table_identifier: str | None,
-        checkpoint_location: str | None = None,
-        trigger_dict: dict | None = None,
-        options: dict[str, str] | None = None,
-        mode: str = "append",
-        await_termination: bool = False,
-    ) -> None:
-        """Write a streaming DataFrame to a Unity Catalog table.
-
-        Args:
-            df: The streaming DataFrame to write.
-            table_identifier: The table identifier in the Unity Catalog in the
-                format 'catalog.schema.table'.
-            checkpoint_location: Location for checkpointing. Required for stream recovery.
-            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
-                Supported keys include:
-                - "processingTime": Specifies a time interval (e.g., "10 seconds") for micro-batch processing.
-                - "once": Processes all available data once and then stops.
-                - "continuous": Specifies a time interval (e.g., "1 second") for continuous processing.
-                - "availableNow": Processes all available data immediately and then stops.
-                If nothing is provided, the default is {"availableNow": True}.
-            options: PySpark options for the DataFrame streaming write operation.
-            mode: The write mode. For streaming, typically "append".
-            await_termination: If True, the function will wait for the streaming
-                query to finish before returning.
-
-        Raises:
-            ValueError: If the mode is not supported for streaming operations.
-            ValueError: If the table_identifier is not a string or not in the format 'catalog.schema.table'.
-            ValueError: If the DataFrame is None.
-            ValueError: If checkpoint_location is not provided.
-        """
-        if mode not in ("append", "complete", "update"):
-            raise ValueError("mode must be one of append, complete, update for streaming operations")
-        if not table_identifier:
-            raise ValueError("table_identifier is required")
-        elif not isinstance(table_identifier, str):
-            raise ValueError("table_identifier must be a string")
-        elif len(table_identifier.split(".")) != 3:
-            raise ValueError("table_identifier must be in the format 'catalog.schema.table'")
-        if not df:
-            raise ValueError("df is required, but was None.")
-        if not checkpoint_location:
-            raise ValueError("checkpoint_location is required for streaming operations")
-
-        if options is None:
-            options = {}
-        if trigger_dict is None:
-            trigger_dict = {"availableNow": True}
-
-        stream_writer = df.writeStream.format("delta").outputMode(mode)
-        stream_writer.options(**options).option("checkpointLocation", checkpoint_location)
-        stream_writer.trigger(**trigger_dict)
-
-        query = stream_writer.toTable(table_identifier)
-
-        if await_termination:
-            query.awaitTermination()
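
With `write_stream` gone, streaming writes have to be issued directly against PySpark. A minimal sketch that mirrors the removed defaults (Delta format, append output mode, availableNow trigger); the DataFrame, identifier, and checkpoint path are placeholders.

```python
from pyspark.sql import DataFrame


def write_stream_to_table(df: DataFrame, table_identifier: str, checkpoint_location: str) -> None:
    # Mirrors the removed defaults: delta format, append output mode, availableNow trigger.
    query = (
        df.writeStream.format("delta")
        .outputMode("append")
        .option("checkpointLocation", checkpoint_location)
        .trigger(availableNow=True)
        .toTable(table_identifier)
    )
    query.awaitTermination()  # optional, matches await_termination=True in the removed method
```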
@@ -196,11 +196,7 @@ class DeltaMergeWriter(BaseDeltaWriter):
 
         config = DeltaMergeConfig(dataframe_columns=data_frame.columns, **kwargs)
 
-        delta_table = self.table_manager.get_delta_table(
-            table=table,
-            location=storage_path,
-            spark=data_frame.sparkSession,
-        )
+        delta_table = self.table_manager.get_delta_table(location=storage_path, spark=data_frame.sparkSession)
 
         match_conditions = self._build_match_conditions(data_frame, config)
 
@@ -5,7 +5,6 @@ from pydantic import BaseModel, Field, field_validator, model_validator
 
 COLUMN_DATA_TYPE_LIST = {
     "string",
-    "decimal",
     "integer",
     "int",
     "smallint",
@@ -32,7 +31,7 @@ class Column(BaseModel):
     nullable: bool
     default_value: Any = None
     generated: str | None = None
-    business_properties: dict[str, Any] = Field(default_factory=dict)
+    properties: dict[str, Any] = Field(default_factory=dict)
     comment: str | None = None
 
     @field_validator("data_type", mode="before")
@@ -44,7 +43,7 @@ class Column(BaseModel):
         """
         val = raw.lower()
         base_data_types = re.findall(r"\b[a-z]+\b", val)
-        forbidden_characters = re.findall(r"[^a-z0-9\(\)\<\>,\s]+", val)
+        forbidden_characters = re.findall(r"[^a-z\<\>)]+", val)
 
         if forbidden_characters:
             raise ValueError(f"Forbidden characters in data type definition [ '{val}' ]: [' {forbidden_characters} ']")
@@ -43,7 +43,6 @@ class Schema(ReadInstancesMixin):
             raise FileNotFoundError("Schema file not found.")
 
         schema, schema_errors = super().read_instance_from_file(processed_instance_path)
-        table_errors: list[ValidationErrorType] = []
         if schema:
             schema.storage_path = "" if not schema.storage_path else schema.storage_path
             tables, table_errors = Table.read_instances_from_directory(
@@ -13,14 +13,6 @@ USING delta
 {% if table.storage_path %}
 LOCATION '{{ table.storage_path }}'
 {% endif %}
-{% if table.properties %}
-TBLPROPERTIES (
-    {%- for key, value in table.properties.items() %}
-    {%- if not loop.first %}, {% endif -%}
-    '{{key}}' = '{{value}}'
-    {%- endfor -%}
-)
-{% endif %}
 {% if table.partition_by -%}
 {%- if table.liquid_clustering -%} CLUSTER {%- else -%} PARTITIONED {%- endif %} BY (
 {%- for column in table.partition_by -%}
@@ -42,17 +34,3 @@ ALTER TABLE {{ table.escaped_identifier }} ADD CONSTRAINT {{constraint.name}} CH
 {%- if table.comment %}
 COMMENT ON TABLE {{ table.escaped_identifier }} IS '{{ table.comment }}';
 {%- endif %}
-
-{%- if table.business_properties %}
-{%- for tag_key, tag_value in table.business_properties.items() %}
-SET TAG ON TABLE {{ table.escaped_identifier }} "{{ tag_key }}"{% if tag_value %}="{{ tag_value }}"{% endif %};
-{%- endfor %}
-{%- endif %}
-
-{%- for column in table.columns %}
-{%- if column.business_properties %}
-{%- for tag_key, tag_value in column.business_properties.items() %}
-SET TAG ON COLUMN {{ table.escaped_identifier }}.`{{ column.name }}` "{{ tag_key }}"{% if tag_value %}="{{ tag_value }}"{% endif %};
-{%- endfor %}
-{%- endif %}
-{%- endfor %}
@@ -186,9 +186,6 @@ class TableManager(LoggerMixin):
     def get_delta_table(self, table: Table | None = None, location: str | None = None, spark=None) -> DeltaTable:
         """Get the DeltaTable object from the Table objects location or a location string.
 
-        For managed tables, uses the table identifier to access the DeltaTable.
-        For external tables or when a location is provided, uses the storage path.
-
         Args:
             table: A Table object representing the Delta table.
             location: A string representing the table location.
@@ -198,34 +195,17 @@ class TableManager(LoggerMixin):
             The DeltaTable object corresponding to the given Table object or location string.
 
         Raises:
-            ValueError: If neither table nor location is provided.
+            ValueError: If neither table nor location is provided, or if both are provided.
         """
-        if table is None and location is None:
-            self._console_logger.error("Invalid parameters: both table and location are None")
-            raise ValueError("Either table or location must be provided.")
-
-        spark_session = spark or self._spark
-
-        if table is not None and location is not None:
-            self._console_logger.info(
-                f"Both table ({table.identifier}) and location ({location}) provided. Using table object as priority."
+        if (table is None and location is None) or (table is not None and location is not None):
+            raise ValueError(
+                f"Either table or location must be provided, but not both. Table: {table}, location: {location}",
             )
 
         if table is not None:
-            if table.is_external is False:
-                self._console_logger.info(f"Getting DeltaTable object for managed table: {table.identifier}")
-                return DeltaTable.forName(spark_session, table.identifier)
-
-            table_location = str(table.storage_path)
-            self._console_logger.info(f"Getting DeltaTable object for external table location: {table_location}")
-            return DeltaTable.forPath(spark_session, table_location)
-
-        self._console_logger.info(f"No table object provided, using location: {location}")
-        if location is None:
-            self._console_logger.error("Location is None - this should not happen!")
-            raise ValueError("Location cannot be None when no table object is provided")
+            location = str(table.storage_path)
         self._console_logger.info(f"Getting DeltaTable object for location: {location}")
-        return DeltaTable.forPath(spark_session, str(location))
+        return DeltaTable.forPath(spark or self._spark, str(location))
 
     def table_exists(self, table: Table | None = None, table_identifier: str | None = None) -> bool:
         """Checks if a table exists in the catalog.
@@ -255,10 +235,9 @@ class TableManager(LoggerMixin):
             raise ValueError("Invalid table identifier format. Expected 'catalog.schema.table'.")
 
         query_result = self._spark.sql(
-            # Using both upper and lower case to ensure compatibility with case changes in Databricks
             f"""
             SELECT 1 FROM {catalog}.information_schema.tables
-            WHERE table_name in ('{table_name}', '{table_name.lower()}')
+            WHERE table_name = '{table_name}'
             AND table_schema = '{schema}'
             LIMIT 1""",
         )
@@ -15,23 +15,13 @@ class ReadCatalogTableAction(PipelineAction):
     into a DataFrame and returned as part of an updated `PipelineContext`.
 
     Example:
-        === "Batch Read"
-            ```yaml
-            Read Sales Table:
-                action: READ_CATALOG_TABLE
-                options:
-                    table_identifier: my_catalog.business_schema.sales_table
-                    options: <options for the CatalogReader read method>
-            ```
-        === "Streaming Read"
-            ```yaml
-            Read Sales Table Stream:
-                action: READ_CATALOG_TABLE
-                options:
-                    table_identifier: my_catalog.business_schema.sales_table
-                    stream: true
-                    options: <options for the CatalogReader read_stream method>
-            ```
+        ```yaml
+        Read Sales Table:
+            action: READ_CATALOG_TABLE
+            options:
+                table_identifier: my_catalog.business_schema.sales_table
+                options: <options for the CatalogReader read method>
+        ```
     """
 
     name: str = "READ_CATALOG_TABLE"
@@ -42,7 +32,6 @@ class ReadCatalogTableAction(PipelineAction):
         *,
         table_identifier: str | None = None,
         options: dict[str, str] | None = None,
-        stream: bool = False,
         **_: Any,  # define kwargs to match the base class signature
     ) -> PipelineContext:
         """Reads a table from Unity Catalog using a specified table identifier and optional reader configurations.
@@ -55,13 +44,7 @@ class ReadCatalogTableAction(PipelineAction):
                 identifier from the `table_metadata` in the `context`.
             options: A dictionary of options for customizing
                 the [`CatalogReader`][cloe_nessy.integration.reader.catalog_reader]
-                behavior, such as filters or reading modes.
-            stream: If True, the action will read the table as a stream.
-            checkpoint_location: The location for storing
-                checkpoints if streaming is enabled.
-            trigger_dict: A dictionary specifying the trigger
-                configuration for the streaming query, such as processing time or
-                continuous processing.
+                behavior, such as filters or reading modes. Defaults to None.
 
         Raises:
             ValueError: If neither `table_identifier` nor `table_metadata.identifier` in the `context` is provided.
@@ -78,10 +61,5 @@ class ReadCatalogTableAction(PipelineAction):
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
         table_reader = CatalogReader()
-        if stream:
-            context.runtime_info = (context.runtime_info or {}) | {"streaming": True}
-            df = table_reader.read_stream(table_identifier=table_identifier, options=options)
-        else:
-            df = table_reader.read(table_identifier=table_identifier, options=options)
-
-        return context.from_existing(data=df, runtime_info=context.runtime_info)
+        df = table_reader.read(table_identifier=table_identifier, options=options)
+        return context.from_existing(data=df)
@@ -1,94 +1,66 @@
-from pathlib import Path
+import pathlib
 from typing import Any
 
-from ...models import Table
+from ...models import Schema
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
 
 class ReadMetadataYAMLAction(PipelineAction):
-    """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
+    """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
 
     Example:
-        === "Managed Table"
-            ```yaml
-            Read Table Metadata:
-                action: READ_METADATA_YAML_ACTION
-                options:
-                    file_path: metadata/schemas/bronze/sales_table.yml
-                    catalog_name: production
-                    schema_name: sales_data
-            ```
-        === "External Table"
-            ```yaml
-            Read Table Metadata:
-                action: READ_METADATA_YAML_ACTION
-                options:
-                    file_path: metadata/schemas/bronze/sales_table.yml
-                    catalog_name: production
-                    schema_name: sales_data
-                    storage_path: abfs://external_storage/sales_data/sales_table
-            ```
+        ```yaml
+        Read Schema Metadata:
+            action: READ_METADATA_YAML_ACTION
+            options:
+                path: excel_file_folder/excel_files_june/
+                file_name: sales_schema.yml
+                table_name: sales
+        ```
     """
 
     name: str = "READ_METADATA_YAML_ACTION"
 
+    @staticmethod
     def run(
-        self,
         context: PipelineContext,
         *,
-        file_path: str | None = None,
-        catalog_name: str | None = None,
-        schema_name: str | None = None,
-        storage_path: str | None = None,
+        path: str | None = None,
+        file_name: str | None = None,
+        table_name: str | None = None,
         **_: Any,
     ) -> PipelineContext:
-        """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
+        """Reads schema metadata from a yaml file using the [`Schema`][cloe_nessy.models.schema] model.
 
         Args:
             context: The context in which this Action is executed.
-            file_path: The path to the file that defines the table.
-            catalog_name: The name of the catalog for the table.
-            schema_name: The name of the schema for the table.
-            storage_path: The storage path for the table, if applicable. If not
-                provided, the table will be considered a managed table.
+            path: The path to the data contract directory.
+            file_name: The name of the file that defines the schema.
+            table_name: The name of the table for which to retrieve metadata.
 
         Raises:
-            ValueError: If any issues occur while reading the table metadata, such as an invalid table,
-                missing file, missing path, or missing catalog/schema names.
+            ValueError: If any issues occur while reading the schema, such as an invalid schema,
+                missing file, or missing path.
 
         Returns:
             The context after the execution of this Action, containing the table metadata.
         """
-        missing_params = []
-        if not file_path:
-            missing_params.append("file_path")
-        if not catalog_name:
-            missing_params.append("catalog_name")
-        if not schema_name:
-            missing_params.append("schema_name")
+        if not path:
+            raise ValueError("No path provided. Please specify path to schema metadata.")
+        if not file_name:
+            raise ValueError("No file_name provided. Please specify file name.")
+        if not table_name:
+            raise ValueError("No table_name provided. Please specify table name.")
 
-        if missing_params:
-            raise ValueError(
-                f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
-            )
+        path_obj = pathlib.Path(path)
 
-        final_file_path = Path(file_path) if file_path else Path()
-
-        table, errors = Table.read_instance_from_file(
-            final_file_path,
-            catalog_name=catalog_name,
-            schema_name=schema_name,
-        )
+        schema, errors = Schema.read_instance_from_file(path_obj / file_name)
         if errors:
-            raise ValueError(f"Errors while reading table metadata: {errors}")
-        if not table:
-            raise ValueError("No table found in metadata.")
+            raise ValueError(f"Errors while reading schema metadata: {errors}")
+        if not schema:
+            raise ValueError("No schema found in metadata.")
 
-        if not table.storage_path and storage_path:
-            self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
-            table.storage_path = storage_path
-            table.is_external = True
+        table = schema.get_table_by_name(table_name=table_name)
 
-        self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
         return context.from_existing(table_metadata=table)
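
A hedged sketch of the reworked action's call shape: `run` is now a `@staticmethod` that loads a `Schema` file and selects a single table by name. The import path and the `context` value are assumptions, since neither is shown in this diff.

```python
from cloe_nessy.pipeline.actions import ReadMetadataYAMLAction  # import path assumed


def load_sales_metadata(context):
    # No action instance is needed any more; run() is a staticmethod.
    return ReadMetadataYAMLAction.run(
        context,                          # an existing PipelineContext (placeholder)
        path="metadata/schemas/bronze/",  # placeholder directory containing the schema yaml
        file_name="sales_schema.yml",     # placeholder schema file
        table_name="sales",               # selected via Schema.get_table_by_name
    )
```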
@@ -1,7 +1,5 @@
 from typing import Any
 
-from pyspark.sql import functions as F
-
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 from ..pipeline_step import PipelineStep
@@ -15,74 +13,20 @@ class TransformJoinAction(PipelineAction):
     from [PySpark
     documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)
 
-    Examples:
-        === "Simple Column Join"
-            ```yaml
-            Join Tables:
-                action: TRANSFORM_JOIN
-                options:
-                    joined_data: ((step:Transform First Table))
-                    join_on: id
-                    how: inner
-            ```
-
-        === "Multiple Columns Join"
-            ```yaml
-            Join Tables:
-                action: TRANSFORM_JOIN
-                options:
-                    joined_data: ((step:Transform First Table))
-                    join_on: [customer_id, order_date]
-                    how: left
-            ```
-
-        === "Dictionary Join (Different Column Names)"
-            ```yaml
-            Join Tables:
-                action: TRANSFORM_JOIN
-                options:
-                    joined_data: ((step:Transform First Table))
-                    join_on:
-                        customer_id: cust_id
-                        order_date: date
-                    how: inner
-            ```
-
-        === "Complex Join with Literals and Expressions"
-            ```yaml
-            Join Tables:
-                action: TRANSFORM_JOIN
-                options:
-                    joined_data: ((step:Load Conditions Table))
-                    join_condition: |
-                        left.material = right.material
-                        AND right.sales_org = '10'
-                        AND right.distr_chan = '10'
-                        AND right.knart = 'ZUVP'
-                        AND right.lovmkond <> 'X'
-                        AND right.sales_unit = 'ST'
-                        AND left.calday BETWEEN
-                            to_date(right.date_from, 'yyyyMMdd') AND
-                            to_date(right.date_to, 'yyyyMMdd')
-                    how: left
-            ```
+    Example:
+        ```yaml
+        Join Tables:
+            action: TRANSFORM_JOIN
+            options:
+                joined_data: ((step:Transform First Table))
+                join_on: id
+                how: anti
+        ```
 
     !!! note "Referencing a DataFrame from another step"
         The `joined_data` parameter is a reference to the DataFrame from another step.
         The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
         for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
-
-    !!! tip "Dictionary Join Syntax"
-        When using a dictionary for `join_on`, the keys represent columns
-        from the DataFrame in context and the values represent columns from
-        the DataFrame in `joined_data`. This is useful when joining tables
-        with different column names for the same logical entity.
-
-    !!! tip "Complex Join Conditions"
-        Use `join_condition` instead of `join_on` for complex joins with literals,
-        expressions, and multiple conditions. Reference columns using `left.column_name`
-        for the main DataFrame and `right.column_name` for the joined DataFrame.
-        Supports all PySpark functions and operators.
     """
 
     name: str = "TRANSFORM_JOIN"
@@ -93,7 +37,6 @@ class TransformJoinAction(PipelineAction):
         *,
         joined_data: PipelineStep | None = None,
         join_on: list[str] | str | dict[str, str] | None = None,
-        join_condition: str | None = None,
         how: str = "inner",
         **_: Any,
     ) -> PipelineContext:
@@ -106,17 +49,13 @@ class TransformJoinAction(PipelineAction):
             join_on: A string for the join column
                 name, a list of column names, or a dictionary mapping columns from the
                 left DataFrame to the right DataFrame. This defines the condition for the
-                join operation. Mutually exclusive with join_condition.
-            join_condition: A string containing a complex join expression with literals,
-                functions, and multiple conditions. Use 'left.' and 'right.' prefixes
-                to reference columns from respective DataFrames. Mutually exclusive with join_on.
+                join operation.
             how: The type of join to perform. Must be one of: inner, cross, outer,
                 full, fullouter, left, leftouter, right, rightouter, semi, anti, etc.
 
         Raises:
             ValueError: If no joined_data is provided.
-            ValueError: If neither join_on nor join_condition is provided.
-            ValueError: If both join_on and join_condition are provided.
+            ValueError: If no join_on is provided.
             ValueError: If the data from context is None.
             ValueError: If the data from the joined_data is None.
 
@@ -125,12 +64,8 @@ class TransformJoinAction(PipelineAction):
         """
         if joined_data is None or joined_data.result is None or joined_data.result.data is None:
             raise ValueError("No joined_data provided.")
-
-        if not join_on and not join_condition:
-            raise ValueError("Either join_on or join_condition must be provided.")
-
-        if join_on and join_condition:
-            raise ValueError("Cannot specify both join_on and join_condition. Use one or the other.")
+        if not join_on:
+            raise ValueError("No join_on provided.")
 
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")
@@ -138,25 +73,16 @@ class TransformJoinAction(PipelineAction):
         df_right = joined_data.result.data.alias("right")  # type: ignore
         df_left = context.data.alias("left")  # type: ignore
 
-        if join_condition:
-            try:
-                condition = F.expr(join_condition)
-            except Exception as e:
-                # this will not raise an error in most cases, because the evaluation of the expression is lazy
-                raise ValueError(f"Failed to parse join condition '{join_condition}': {str(e)}") from e
-            df = df_left.join(df_right, on=condition, how=how)  # type: ignore
-
-        if join_on:
-            if isinstance(join_on, str):
-                join_condition_list = [join_on]
-            elif isinstance(join_on, list):
-                join_condition_list = join_on
-            else:
-                join_condition_list = [
-                    df_left[left_column] == df_right[right_column]  # type: ignore
-                    for left_column, right_column in join_on.items()
-                ]
-
-            df = df_left.join(df_right, on=join_condition_list, how=how)  # type: ignore
+        if isinstance(join_on, str):
+            join_condition = [join_on]
+        elif isinstance(join_on, list):
+            join_condition = join_on
+        else:
+            join_condition = [
+                df_left[left_column] == df_right[right_column]  # type: ignore
+                for left_column, right_column in join_on.items()
+            ]
+
+        df = df_left.join(df_right, on=join_condition, how=how)  # type: ignore
 
         return context.from_existing(data=df)  # type: ignore
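
With `join_condition` removed from the action, expression-based joins can still be written directly with PySpark outside the pipeline. An illustrative sketch using the column names from the removed docstring example; the DataFrames are placeholders.

```python
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def join_with_expression(df_left: DataFrame, df_right: DataFrame) -> DataFrame:
    left = df_left.alias("left")
    right = df_right.alias("right")
    # A complex condition expressed as SQL, referencing the aliased DataFrames.
    condition = F.expr(
        "left.material = right.material AND left.calday BETWEEN "
        "to_date(right.date_from, 'yyyyMMdd') AND to_date(right.date_to, 'yyyyMMdd')"
    )
    return left.join(right, on=condition, how="left")
```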
@@ -22,8 +22,8 @@ class TransformUnionAction(PipelineAction):
             action: TRANSFORM_UNION
             options:
                 union_data:
-                    - ((step:Filter First Table))
-                    - ((step:SQL Transform Second Table))
+                    - ((step: Filter First Table))
+                    - ((step: SQL Transform Second Table))
         ```
     !!! note "Referencing a DataFrame from another step"
         The `union_data` parameter is a reference to the DataFrame from another step.