cloe-nessy 0.3.17.0__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- cloe_nessy/clients/api_client/__init__.py +10 -1
- cloe_nessy/clients/api_client/api_client.py +19 -8
- cloe_nessy/clients/api_client/api_response.py +7 -4
- cloe_nessy/clients/api_client/pagination_config.py +84 -0
- cloe_nessy/clients/api_client/pagination_strategy.py +500 -0
- cloe_nessy/integration/delta_loader/delta_loader.py +1 -1
- cloe_nessy/integration/reader/__init__.py +2 -2
- cloe_nessy/integration/reader/api_reader.py +463 -72
- cloe_nessy/integration/reader/catalog_reader.py +49 -10
- cloe_nessy/integration/reader/excel_reader.py +3 -3
- cloe_nessy/integration/reader/file_reader.py +3 -1
- cloe_nessy/integration/reader/reader.py +1 -1
- cloe_nessy/integration/writer/catalog_writer.py +64 -2
- cloe_nessy/integration/writer/delta_writer/delta_merge_writer.py +5 -1
- cloe_nessy/models/column.py +3 -2
- cloe_nessy/models/schema.py +1 -0
- cloe_nessy/models/templates/create_table.sql.j2 +22 -0
- cloe_nessy/object_manager/table_manager.py +29 -7
- cloe_nessy/pipeline/actions/__init__.py +1 -1
- cloe_nessy/pipeline/actions/read_api.py +272 -75
- cloe_nessy/pipeline/actions/read_catalog_table.py +73 -10
- cloe_nessy/pipeline/actions/read_excel.py +1 -1
- cloe_nessy/pipeline/actions/read_metadata_yaml.py +61 -33
- cloe_nessy/pipeline/actions/transform_decode.py +2 -1
- cloe_nessy/pipeline/actions/transform_join.py +98 -24
- cloe_nessy/pipeline/actions/transform_union.py +2 -2
- cloe_nessy/pipeline/actions/write_catalog_table.py +66 -21
- cloe_nessy/pipeline/actions/write_delta_merge.py +1 -0
- cloe_nessy/pipeline/pipeline_config.py +2 -0
- cloe_nessy/pipeline/pipeline_context.py +1 -1
- cloe_nessy/pipeline/pipeline_parsing_service.py +104 -39
- cloe_nessy/pipeline/pipeline_step.py +2 -0
- cloe_nessy/session/__init__.py +2 -1
- cloe_nessy/session/pyspark_compat.py +15 -0
- cloe_nessy/session/session_manager.py +1 -1
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/METADATA +19 -19
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/RECORD +38 -36
- {cloe_nessy-0.3.17.0.dist-info → cloe_nessy-0.3.19.dist-info}/WHEEL +1 -2
- cloe_nessy-0.3.17.0.dist-info/top_level.txt +0 -1

cloe_nessy/pipeline/actions/read_metadata_yaml.py

````diff
--- a/cloe_nessy/pipeline/actions/read_metadata_yaml.py
+++ b/cloe_nessy/pipeline/actions/read_metadata_yaml.py
@@ -1,66 +1,94 @@
-import
+from pathlib import Path
 from typing import Any
 
-from ...models import
+from ...models import Table
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
 
 class ReadMetadataYAMLAction(PipelineAction):
-    """Reads
+    """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
     Example:
-
-
-
-
-
-
-
-
+        === "Managed Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+            ```
+        === "External Table"
+            ```yaml
+            Read Table Metadata:
+                action: READ_METADATA_YAML_ACTION
+                options:
+                    file_path: metadata/schemas/bronze/sales_table.yml
+                    catalog_name: production
+                    schema_name: sales_data
+                    storage_path: abfs://external_storage/sales_data/sales_table
+            ```
     """
 
     name: str = "READ_METADATA_YAML_ACTION"
 
-    @staticmethod
     def run(
+        self,
         context: PipelineContext,
         *,
-
-
-
+        file_path: str | None = None,
+        catalog_name: str | None = None,
+        schema_name: str | None = None,
+        storage_path: str | None = None,
        **_: Any,
     ) -> PipelineContext:
-        """Reads
+        """Reads table metadata from a yaml file using the [`Table`][cloe_nessy.models.table] model.
 
         Args:
             context: The context in which this Action is executed.
-
-
-
+            file_path: The path to the file that defines the table.
+            catalog_name: The name of the catalog for the table.
+            schema_name: The name of the schema for the table.
+            storage_path: The storage path for the table, if applicable. If not
+                provided, the table will be considered a managed table.
 
         Raises:
-            ValueError: If any issues occur while reading the
-                missing file, or missing
+            ValueError: If any issues occur while reading the table metadata, such as an invalid table,
+                missing file, missing path, or missing catalog/schema names.
 
         Returns:
             The context after the execution of this Action, containing the table metadata.
         """
-
-
-
-
-
-
+        missing_params = []
+        if not file_path:
+            missing_params.append("file_path")
+        if not catalog_name:
+            missing_params.append("catalog_name")
+        if not schema_name:
+            missing_params.append("schema_name")
 
-
+        if missing_params:
+            raise ValueError(
+                f"Missing required parameters: {', '.join(missing_params)}. Please specify all required parameters."
+            )
 
-
+        final_file_path = Path(file_path) if file_path else Path()
+
+        table, errors = Table.read_instance_from_file(
+            final_file_path,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+        )
         if errors:
-            raise ValueError(f"Errors while reading
-        if not
-            raise ValueError("No
+            raise ValueError(f"Errors while reading table metadata: {errors}")
+        if not table:
+            raise ValueError("No table found in metadata.")
 
-        table
+        if not table.storage_path and storage_path:
+            self._console_logger.info(f"Setting storage path for table [ '{table.name}' ] to [ '{storage_path}' ]")
+            table.storage_path = storage_path
+            table.is_external = True
 
+        self._console_logger.info(f"Table [ '{table.name}' ] metadata read successfully from [ '{file_path}' ]")
         return context.from_existing(table_metadata=table)
````
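
With 0.3.19, `run` becomes an instance method whose options are explicit keyword-only parameters. Below is a minimal sketch of invoking it directly rather than from pipeline YAML; `ctx` stands in for a real `PipelineContext`, so this is illustrative rather than copy-paste runnable:

```python
from cloe_nessy.pipeline.actions.read_metadata_yaml import ReadMetadataYAMLAction

action = ReadMetadataYAMLAction()
ctx = action.run(
    ctx,  # an existing PipelineContext
    file_path="metadata/schemas/bronze/sales_table.yml",
    catalog_name="production",
    schema_name="sales_data",
    storage_path="abfs://external_storage/sales_data/sales_table",  # omit for a managed table
)
table = ctx.table_metadata  # the parsed Table, now marked is_external
```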

cloe_nessy/pipeline/actions/transform_decode.py

```diff
--- a/cloe_nessy/pipeline/actions/transform_decode.py
+++ b/cloe_nessy/pipeline/actions/transform_decode.py
@@ -1,8 +1,9 @@
 from typing import Any
 
-from pyspark.sql import DataFrame
 from pyspark.sql.functions import col, from_json, schema_of_json, unbase64
 
+from cloe_nessy.session import DataFrame
+
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
```
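
`transform_decode` now imports `DataFrame` from `cloe_nessy.session` instead of `pyspark.sql`, which lines up with the new `session/pyspark_compat.py` module in the file list. That module's contents are not shown in this diff; one plausible shape for such a shim, offered purely as an assumption, is:

```python
# Hypothetical sketch of a pyspark compat module; NOT the actual contents of
# cloe_nessy/session/pyspark_compat.py, which this diff does not show.
try:
    # On Spark Connect runtimes (e.g. Databricks Connect), DataFrames are
    # instances of the connect DataFrame class, not pyspark.sql.DataFrame.
    from pyspark.sql.connect.dataframe import DataFrame
except ImportError:
    # Fall back to the classic PySpark DataFrame.
    from pyspark.sql import DataFrame

__all__ = ["DataFrame"]
```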

cloe_nessy/pipeline/actions/transform_join.py

````diff
--- a/cloe_nessy/pipeline/actions/transform_join.py
+++ b/cloe_nessy/pipeline/actions/transform_join.py
@@ -1,5 +1,7 @@
 from typing import Any
 
+from pyspark.sql import functions as F
+
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 from ..pipeline_step import PipelineStep
@@ -13,20 +15,74 @@ class TransformJoinAction(PipelineAction):
     from [PySpark
     documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html)
 
-
-
-
-
-
-
-
-
-
+    Examples:
+        === "Simple Column Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: id
+                    how: inner
+            ```
+
+        === "Multiple Columns Join"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on: [customer_id, order_date]
+                    how: left
+            ```
+
+        === "Dictionary Join (Different Column Names)"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Transform First Table))
+                    join_on:
+                        customer_id: cust_id
+                        order_date: date
+                    how: inner
+            ```
+
+        === "Complex Join with Literals and Expressions"
+            ```yaml
+            Join Tables:
+                action: TRANSFORM_JOIN
+                options:
+                    joined_data: ((step:Load Conditions Table))
+                    join_condition: |
+                        left.material = right.material
+                        AND right.sales_org = '10'
+                        AND right.distr_chan = '10'
+                        AND right.knart = 'ZUVP'
+                        AND right.lovmkond <> 'X'
+                        AND right.sales_unit = 'ST'
+                        AND left.calday BETWEEN
+                            to_date(right.date_from, 'yyyyMMdd') AND
+                            to_date(right.date_to, 'yyyyMMdd')
+                    how: left
+            ```
 
     !!! note "Referencing a DataFrame from another step"
         The `joined_data` parameter is a reference to the DataFrame from another step.
         The DataFrame is accessed using the `result` attribute of the PipelineStep. The syntax
         for referencing the DataFrame is `((step:Step Name))`, mind the double parentheses.
+
+    !!! tip "Dictionary Join Syntax"
+        When using a dictionary for `join_on`, the keys represent columns
+        from the DataFrame in context and the values represent columns from
+        the DataFrame in `joined_data`. This is useful when joining tables
+        with different column names for the same logical entity.
+
+    !!! tip "Complex Join Conditions"
+        Use `join_condition` instead of `join_on` for complex joins with literals,
+        expressions, and multiple conditions. Reference columns using `left.column_name`
+        for the main DataFrame and `right.column_name` for the joined DataFrame.
+        Supports all PySpark functions and operators.
     """
 
     name: str = "TRANSFORM_JOIN"
@@ -37,6 +93,7 @@ class TransformJoinAction(PipelineAction):
         *,
         joined_data: PipelineStep | None = None,
         join_on: list[str] | str | dict[str, str] | None = None,
+        join_condition: str | None = None,
         how: str = "inner",
         **_: Any,
     ) -> PipelineContext:
@@ -49,13 +106,17 @@ class TransformJoinAction(PipelineAction):
             join_on: A string for the join column
                 name, a list of column names, or a dictionary mapping columns from the
                 left DataFrame to the right DataFrame. This defines the condition for the
-                join operation.
+                join operation. Mutually exclusive with join_condition.
+            join_condition: A string containing a complex join expression with literals,
+                functions, and multiple conditions. Use 'left.' and 'right.' prefixes
+                to reference columns from respective DataFrames. Mutually exclusive with join_on.
             how: The type of join to perform. Must be one of: inner, cross, outer,
                 full, fullouter, left, leftouter, right, rightouter, semi, anti, etc.
 
         Raises:
             ValueError: If no joined_data is provided.
-            ValueError: If
+            ValueError: If neither join_on nor join_condition is provided.
+            ValueError: If both join_on and join_condition are provided.
             ValueError: If the data from context is None.
             ValueError: If the data from the joined_data is None.
 
@@ -64,8 +125,12 @@ class TransformJoinAction(PipelineAction):
         """
         if joined_data is None or joined_data.result is None or joined_data.result.data is None:
             raise ValueError("No joined_data provided.")
-
-
+
+        if not join_on and not join_condition:
+            raise ValueError("Either join_on or join_condition must be provided.")
+
+        if join_on and join_condition:
+            raise ValueError("Cannot specify both join_on and join_condition. Use one or the other.")
 
         if context.data is None:
             raise ValueError("Data from the context is required for the operation.")
@@ -73,16 +138,25 @@ class TransformJoinAction(PipelineAction):
         df_right = joined_data.result.data.alias("right")  # type: ignore
         df_left = context.data.alias("left")  # type: ignore
 
-        if
-
-
-
-
-
-
-
-
-
+        if join_condition:
+            try:
+                condition = F.expr(join_condition)
+            except Exception as e:
+                # this will not raise an error in most cases, because the evaluation of the expression is lazy
+                raise ValueError(f"Failed to parse join condition '{join_condition}': {str(e)}") from e
+            df = df_left.join(df_right, on=condition, how=how)  # type: ignore
+
+        if join_on:
+            if isinstance(join_on, str):
+                join_condition_list = [join_on]
+            elif isinstance(join_on, list):
+                join_condition_list = join_on
+            else:
+                join_condition_list = [
+                    df_left[left_column] == df_right[right_column]  # type: ignore
+                    for left_column, right_column in join_on.items()
+                ]
+
+            df = df_left.join(df_right, on=join_condition_list, how=how)  # type: ignore
 
         return context.from_existing(data=df)  # type: ignore
````
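
Stripped of the pipeline plumbing, the new `join_condition` path is plain PySpark: alias both DataFrames, parse the expression with `F.expr`, and pass the resulting column as the join condition. A self-contained sketch (table and column values are illustrative only):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

left = spark.createDataFrame(
    [("M1", "20240315")], ["material", "calday"]
).alias("left")
right = spark.createDataFrame(
    [("M1", "10", "20240101", "20241231")],
    ["material", "sales_org", "date_from", "date_to"],
).alias("right")

# F.expr only parses the string here; most errors surface when the plan
# actually runs, which is why the action's try/except rarely fires.
condition = F.expr(
    "left.material = right.material"
    " AND right.sales_org = '10'"
    " AND to_date(left.calday, 'yyyyMMdd')"
    " BETWEEN to_date(right.date_from, 'yyyyMMdd')"
    " AND to_date(right.date_to, 'yyyyMMdd')"
)
left.join(right, on=condition, how="left").show()
```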

cloe_nessy/pipeline/actions/transform_union.py

````diff
--- a/cloe_nessy/pipeline/actions/transform_union.py
+++ b/cloe_nessy/pipeline/actions/transform_union.py
@@ -22,8 +22,8 @@ class TransformUnionAction(PipelineAction):
             action: TRANSFORM_UNION
             options:
                 union_data:
-                    - ((step:
-                    - ((step:
+                    - ((step:Filter First Table))
+                    - ((step:SQL Transform Second Table))
         ```
     !!! note "Referencing a DataFrame from another step"
         The `union_data` parameter is a reference to the DataFrame from another step.
````

cloe_nessy/pipeline/actions/write_catalog_table.py

````diff
--- a/cloe_nessy/pipeline/actions/write_catalog_table.py
+++ b/cloe_nessy/pipeline/actions/write_catalog_table.py
@@ -2,6 +2,7 @@ from typing import Any
 
 from ...integration.delta_loader import consume_delta_load
 from ...integration.writer import CatalogWriter
+from ...object_manager import TableManager
 from ..pipeline_action import PipelineAction
 from ..pipeline_context import PipelineContext
 
@@ -9,17 +10,31 @@ from ..pipeline_context import PipelineContext
 class WriteCatalogTableAction(PipelineAction):
     """Writes a DataFrame to a specified catalog table using [CatalogWriter][cloe_nessy.integration.writer.CatalogWriter].
 
-
-
-
-
-
-            table_identifier: my_catalog.business_schema.sales_table
-            mode: append
-            partition_by: day
+    Examples:
+        === "Batch Write"
+            ```yaml
+            Write Table to Catalog:
+                action: WRITE_CATALOG_TABLE
                 options:
-
-
+                    table_identifier: my_catalog.business_schema.sales_table
+                    mode: append
+                    partition_by: day
+                    options:
+                        mergeSchema: true
+            ```
+        === "Streaming Write"
+            ```yaml
+            Write Table to Catalog Stream:
+                action: WRITE_CATALOG_TABLE
+                options:
+                    table_identifier: my_catalog.business_schema.sales_table
+                    mode: append
+                    checkpoint_location: /path/to/checkpoint
+                    trigger_dict:
+                        processingTime: 10 seconds
+                    options:
+                        mergeSchema: true
+            ```
     """
 
     name: str = "WRITE_CATALOG_TABLE"
@@ -32,6 +47,9 @@ class WriteCatalogTableAction(PipelineAction):
         mode: str = "append",
         partition_by: str | list[str] | None = None,
         options: dict[str, str] | None = None,
+        checkpoint_location: str | None = None,
+        trigger_dict: dict | None = None,
+        await_termination: bool = False,
         **_: Any,
     ) -> PipelineContext:
         """Writes a DataFrame to a specified catalog table.
@@ -44,7 +62,11 @@ class WriteCatalogTableAction(PipelineAction):
             mode: The write mode. One of 'append', 'overwrite', 'error',
                 'errorifexists', or 'ignore'.
             partition_by: Names of the partitioning columns.
-
+            checkpoint_location: Location for checkpointing.
+            trigger_dict: A dictionary specifying the trigger configuration for the streaming query.
+            await_termination: If True, the function will wait for the streaming
+                query to finish before returning.
+            options: Additional options for the DataFrame write operation.
 
         Raises:
             ValueError: If the table name is not specified or cannot be inferred from
@@ -55,25 +77,48 @@ class WriteCatalogTableAction(PipelineAction):
         """
         if not options:
             options = dict()
-
-
-
+        streaming = context.runtime_info and context.runtime_info.get("streaming")
+        if streaming and not checkpoint_location:
+            raise ValueError("Checkpoint location must be specified for streaming writes.")
+        if (
+            partition_by is None
+            and context.table_metadata is not None
+            and hasattr(context.table_metadata, "partition_by")
+            and not context.table_metadata.liquid_clustering
+        ):
+            partition_by = context.table_metadata.partition_by  # type: ignore
 
         if (table_metadata := context.table_metadata) and table_identifier is None:
             table_identifier = table_metadata.identifier
         if table_identifier is None:
             raise ValueError("Table name must be specified or a valid Table object with identifier must be set.")
 
+        if table_metadata:
+            manager = TableManager()
+            manager.create_table(table=table_metadata, ignore_if_exists=True, replace=False)
+
         runtime_info = getattr(context, "runtime_info", None)
         if runtime_info and runtime_info.get("is_delta_load"):
             consume_delta_load(runtime_info)
 
         writer = CatalogWriter()
-
-
-
-
-
-
-
+
+        if streaming:
+            writer.write_stream(
+                df=context.data,  # type: ignore
+                table_identifier=table_identifier,
+                checkpoint_location=checkpoint_location,
+                trigger_dict=trigger_dict,
+                options=options,
+                mode=mode,
+                await_termination=await_termination,
+            )
+        else:
+            writer.write(
+                df=context.data,  # type: ignore
+                table_identifier=table_identifier,
+                mode=mode,
+                partition_by=partition_by,
+                options=options,
+            )
         return context.from_existing()
````
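
The streaming branch delegates to `CatalogWriter.write_stream`, whose internals this diff does not show. In plain Structured Streaming terms the new options map roughly as sketched below (a hedged approximation with illustrative table names, not the writer's actual implementation):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.readStream.table("my_catalog.bronze.sales_events")  # any streaming source

query = (
    df.writeStream.outputMode("append")                    # mode
    .option("checkpointLocation", "/path/to/checkpoint")   # checkpoint_location
    .options(mergeSchema="true")                           # options
    .trigger(processingTime="10 seconds")                  # trigger_dict
    .toTable("my_catalog.business_schema.sales_table")
)
query.awaitTermination()  # only when await_termination=True
```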

cloe_nessy/pipeline/actions/write_delta_merge.py

```diff
--- a/cloe_nessy/pipeline/actions/write_delta_merge.py
+++ b/cloe_nessy/pipeline/actions/write_delta_merge.py
@@ -117,6 +117,7 @@ class WriteDeltaMergeAction(PipelineAction):
 
         delta_merge_writer.write(
             table_identifier=context.table_metadata.identifier,
+            table=context.table_metadata,
             storage_path=str(context.table_metadata.storage_path),
             data_frame=context.data,
             key_columns=key_columns,
```

cloe_nessy/pipeline/pipeline_config.py

```diff
--- a/cloe_nessy/pipeline/pipeline_config.py
+++ b/cloe_nessy/pipeline/pipeline_config.py
@@ -83,6 +83,7 @@ class PipelineStepConfig(PipelineConfigBaseModel):
     context: str | None = None
     table_metadata: str | None = None
     options: dict = Field(default_factory=dict)
+    env: dict = Field(default_factory=dict)
 
 
 class PipelineConfig(PipelineConfigBaseModel):
@@ -90,3 +91,4 @@ class PipelineConfig(PipelineConfigBaseModel):
 
     name: str
     steps: OrderedDict[str, PipelineStepConfig]
+    env: dict[str, str] = Field(default_factory=dict)
```