Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, C
|
|
|
5
5
|
import re
|
|
6
6
|
|
|
7
7
|
import polars as pl
|
|
8
|
-
|
|
8
|
+
from polars._typing import (CsvEncoding)
|
|
9
9
|
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
10
10
|
|
|
11
11
|
from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
|
|
@@ -20,13 +20,12 @@ from flowfile_frame.expr import Expr, Column, lit, col
|
|
|
20
20
|
from flowfile_frame.selectors import Selector
|
|
21
21
|
from flowfile_frame.group_frame import GroupByFrame
|
|
22
22
|
from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
|
|
23
|
-
ensure_inputs_as_iterable
|
|
23
|
+
ensure_inputs_as_iterable, generate_node_id,
|
|
24
|
+
set_node_id, data as node_id_data)
|
|
24
25
|
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
|
|
25
26
|
from flowfile_frame.utils import _check_if_convertible_to_code
|
|
26
27
|
from flowfile_frame.config import logger
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
node_id_counter = 0
|
|
28
|
+
from flowfile_frame.cloud_storage.frame_helpers import add_write_ff_to_cloud_storage
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
def can_be_expr(param: inspect.Parameter) -> bool:
|
|
@@ -102,23 +101,17 @@ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
|
|
|
102
101
|
def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
|
|
103
102
|
group_expr: pl.Expr | None = None) -> None:
|
|
104
103
|
if method_name is None:
|
|
105
|
-
raise
|
|
104
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the method")
|
|
106
105
|
if polars_expr is None:
|
|
107
|
-
raise
|
|
106
|
+
raise NotImplementedError("Cannot create polars expressions with lambda function")
|
|
108
107
|
method_ref = getattr(pl.LazyFrame, method_name)
|
|
109
108
|
if method_ref is None:
|
|
110
109
|
raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
|
|
111
110
|
if method_name == 'group_by':
|
|
112
111
|
if group_expr is None:
|
|
113
|
-
raise
|
|
112
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
114
113
|
if not all(isinstance(ge, pl.Expr) for ge in group_expr):
|
|
115
|
-
raise
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def generate_node_id() -> int:
|
|
119
|
-
global node_id_counter
|
|
120
|
-
node_id_counter += 1
|
|
121
|
-
return node_id_counter
|
|
114
|
+
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
122
115
|
|
|
123
116
|
|
|
124
117
|
@add_lazyframe_methods
|
|
@@ -181,38 +174,41 @@ class FlowFrame:
|
|
|
181
174
|
flow_graph = create_flow_graph()
|
|
182
175
|
|
|
183
176
|
flow_id = flow_graph.flow_id
|
|
184
|
-
# Convert data to a polars DataFrame/
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
177
|
+
# Convert data to a polars DataFrame/LazyFrame
|
|
178
|
+
if isinstance(data, pl.LazyFrame):
|
|
179
|
+
flow_graph.add_dependency_on_polars_lazy_frame(data.lazy(), node_id)
|
|
180
|
+
else:
|
|
181
|
+
try:
|
|
182
|
+
# Use polars to convert from various types
|
|
183
|
+
pl_df = pl.DataFrame(
|
|
184
|
+
data,
|
|
185
|
+
schema=schema,
|
|
186
|
+
schema_overrides=schema_overrides,
|
|
187
|
+
strict=strict,
|
|
188
|
+
orient=orient,
|
|
189
|
+
infer_schema_length=infer_schema_length,
|
|
190
|
+
nan_to_null=nan_to_null,
|
|
191
|
+
)
|
|
192
|
+
pl_data = pl_df.lazy()
|
|
193
|
+
except Exception as e:
|
|
194
|
+
raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
|
|
195
|
+
# Create a FlowDataEngine to get data in the right format for manual input
|
|
196
|
+
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
197
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
198
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
199
|
+
# Create a manual input node
|
|
200
|
+
input_node = input_schema.NodeManualInput(
|
|
201
|
+
flow_id=flow_id,
|
|
202
|
+
node_id=node_id,
|
|
203
|
+
raw_data_format=raw_data_format,
|
|
204
|
+
pos_x=100,
|
|
205
|
+
pos_y=100,
|
|
206
|
+
is_setup=True,
|
|
207
|
+
description=description,
|
|
195
208
|
)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
# Create a FlowDataEngine to get data in the right format for manual input
|
|
200
|
-
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
201
|
-
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
202
|
-
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
203
|
-
# Create a manual input node
|
|
204
|
-
input_node = input_schema.NodeManualInput(
|
|
205
|
-
flow_id=flow_id,
|
|
206
|
-
node_id=node_id,
|
|
207
|
-
raw_data_format=raw_data_format,
|
|
208
|
-
pos_x=100,
|
|
209
|
-
pos_y=100,
|
|
210
|
-
is_setup=True,
|
|
211
|
-
description=description,
|
|
212
|
-
)
|
|
213
|
-
# Add to graph
|
|
214
|
-
flow_graph.add_manual_input(input_node)
|
|
215
|
-
# Return new frame
|
|
209
|
+
# Add to graph
|
|
210
|
+
flow_graph.add_manual_input(input_node)
|
|
211
|
+
# Return new frame
|
|
216
212
|
return FlowFrame(
|
|
217
213
|
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
218
214
|
flow_graph=flow_graph,
|
|
@@ -221,70 +217,92 @@ class FlowFrame:
|
|
|
221
217
|
)
|
|
222
218
|
|
|
223
219
|
def __new__(
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
return cls.create_from_any_type(
|
|
241
|
-
data=data,
|
|
242
|
-
schema=schema,
|
|
243
|
-
schema_overrides=schema_overrides,
|
|
244
|
-
strict=strict,
|
|
245
|
-
orient=orient,
|
|
246
|
-
infer_schema_length=infer_schema_length,
|
|
247
|
-
nan_to_null=nan_to_null,
|
|
248
|
-
flow_graph=flow_graph,
|
|
249
|
-
node_id=node_id,
|
|
250
|
-
parent_node_id=parent_node_id,
|
|
251
|
-
)
|
|
220
|
+
cls,
|
|
221
|
+
data: pl.LazyFrame | FrameInitTypes = None,
|
|
222
|
+
schema: SchemaDefinition | None = None,
|
|
223
|
+
*,
|
|
224
|
+
schema_overrides: SchemaDict | None = None,
|
|
225
|
+
strict: bool = True,
|
|
226
|
+
orient: Orientation | None = None,
|
|
227
|
+
infer_schema_length: int | None = 100,
|
|
228
|
+
nan_to_null: bool = False,
|
|
229
|
+
flow_graph: Optional[FlowGraph] = None,
|
|
230
|
+
node_id: Optional[int] = None,
|
|
231
|
+
parent_node_id: Optional[int] = None,
|
|
232
|
+
**kwargs, # Accept and ignore any other kwargs for API compatibility
|
|
233
|
+
) -> "FlowFrame":
|
|
234
|
+
"""
|
|
235
|
+
Unified constructor for FlowFrame.
|
|
252
236
|
|
|
253
|
-
|
|
254
|
-
|
|
237
|
+
- If `flow_graph` and `node_id` are provided, it creates a lightweight Python
|
|
238
|
+
wrapper around an existing node in the graph.
|
|
239
|
+
- Otherwise, it creates a new source node in a new or existing graph
|
|
240
|
+
from the provided data.
|
|
241
|
+
"""
|
|
242
|
+
# --- Path 1: Internal Wrapper Creation ---
|
|
243
|
+
# This path is taken by methods like .join(), .sort(), etc., which provide an existing graph.
|
|
244
|
+
if flow_graph is not None and node_id is not None:
|
|
245
|
+
instance = super().__new__(cls)
|
|
246
|
+
instance.data = data
|
|
247
|
+
instance.flow_graph = flow_graph
|
|
248
|
+
instance.node_id = node_id
|
|
249
|
+
instance.parent_node_id = parent_node_id
|
|
250
|
+
return instance
|
|
251
|
+
elif flow_graph is not None and not isinstance(data, pl.LazyFrame):
|
|
252
|
+
instance = cls.create_from_any_type(data=data, schema=schema, schema_overrides=schema_overrides,
|
|
253
|
+
strict=strict, orient=orient, infer_schema_length=infer_schema_length,
|
|
254
|
+
nan_to_null=nan_to_null, flow_graph=flow_graph, node_id=node_id,
|
|
255
|
+
parent_node_id=parent_node_id
|
|
256
|
+
)
|
|
257
|
+
return instance
|
|
258
|
+
|
|
259
|
+
source_graph = create_flow_graph()
|
|
260
|
+
source_node_id = generate_node_id()
|
|
255
261
|
|
|
256
|
-
def __init__(
|
|
257
|
-
self,
|
|
258
|
-
data: pl.LazyFrame | FrameInitTypes = None,
|
|
259
|
-
schema: SchemaDefinition | None = None,
|
|
260
|
-
*,
|
|
261
|
-
schema_overrides: SchemaDict | None = None,
|
|
262
|
-
strict: bool = True,
|
|
263
|
-
orient: Orientation | None = None,
|
|
264
|
-
infer_schema_length: int | None = 100,
|
|
265
|
-
nan_to_null: bool = False,
|
|
266
|
-
flow_graph=None,
|
|
267
|
-
node_id=None,
|
|
268
|
-
parent_node_id=None,
|
|
269
|
-
):
|
|
270
|
-
"""Initialize the FlowFrame with data and graph references."""
|
|
271
262
|
if data is None:
|
|
272
263
|
data = pl.LazyFrame()
|
|
273
264
|
if not isinstance(data, pl.LazyFrame):
|
|
274
|
-
return
|
|
275
|
-
|
|
276
|
-
self.node_id = node_id or generate_node_id()
|
|
277
|
-
self.parent_node_id = parent_node_id
|
|
278
265
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
266
|
+
description = "Data imported from Python object"
|
|
267
|
+
try:
|
|
268
|
+
pl_df = pl.DataFrame(
|
|
269
|
+
data, schema=schema, schema_overrides=schema_overrides,
|
|
270
|
+
strict=strict, orient=orient, infer_schema_length=infer_schema_length,
|
|
271
|
+
nan_to_null=nan_to_null
|
|
272
|
+
)
|
|
273
|
+
pl_data = pl_df.lazy()
|
|
274
|
+
except Exception as e:
|
|
275
|
+
raise ValueError(f"Could not convert data to a Polars DataFrame: {e}")
|
|
276
|
+
|
|
277
|
+
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
278
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
279
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
280
|
+
input_node = input_schema.NodeManualInput(
|
|
281
|
+
flow_id=source_graph.flow_id, node_id=source_node_id,
|
|
282
|
+
raw_data_format=raw_data_format, pos_x=100, pos_y=100,
|
|
283
|
+
is_setup=True, description=description
|
|
284
|
+
)
|
|
285
|
+
source_graph.add_manual_input(input_node)
|
|
286
286
|
else:
|
|
287
|
-
|
|
287
|
+
source_graph.add_dependency_on_polars_lazy_frame(data, source_node_id)
|
|
288
|
+
|
|
289
|
+
final_data = source_graph.get_node(source_node_id).get_resulting_data().data_frame
|
|
290
|
+
return cls(
|
|
291
|
+
data=final_data,
|
|
292
|
+
flow_graph=source_graph,
|
|
293
|
+
node_id=source_node_id,
|
|
294
|
+
parent_node_id=parent_node_id
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
def __init__(self, *args, **kwargs):
|
|
298
|
+
"""
|
|
299
|
+
The __init__ method is intentionally left empty.
|
|
300
|
+
All initialization logic is handled in the `__new__` method to support
|
|
301
|
+
the flexible factory pattern and prevent state from being overwritten.
|
|
302
|
+
Python automatically calls __init__ after __new__, so this empty
|
|
303
|
+
method catches that call and safely does nothing.
|
|
304
|
+
"""
|
|
305
|
+
pass
|
|
288
306
|
|
|
289
307
|
def __repr__(self):
|
|
290
308
|
return str(self.data)
|
|
@@ -535,18 +553,18 @@ class FlowFrame:
|
|
|
535
553
|
self.flow_graph.add_polars_code(polars_code_settings)
|
|
536
554
|
|
|
537
555
|
def join(
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
556
|
+
self,
|
|
557
|
+
other,
|
|
558
|
+
on: List[str | Column] | str | Column = None,
|
|
559
|
+
how: str = "inner",
|
|
560
|
+
left_on: List[str | Column] | str | Column = None,
|
|
561
|
+
right_on: List[str | Column] | str | Column = None,
|
|
562
|
+
suffix: str = "_right",
|
|
563
|
+
validate: str = None,
|
|
564
|
+
nulls_equal: bool = False,
|
|
565
|
+
coalesce: bool = None,
|
|
566
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
|
|
567
|
+
description: str = None,
|
|
550
568
|
):
|
|
551
569
|
"""
|
|
552
570
|
Add a join operation to the Logical Plan.
|
|
@@ -591,27 +609,87 @@ class FlowFrame:
|
|
|
591
609
|
FlowFrame
|
|
592
610
|
New FlowFrame with join operation applied.
|
|
593
611
|
"""
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
612
|
+
# Step 1: Determine if we need to use Polars code
|
|
613
|
+
use_polars_code = self._should_use_polars_code_for_join(
|
|
614
|
+
maintain_order, coalesce, nulls_equal, validate, suffix
|
|
615
|
+
)
|
|
616
|
+
# Step 2: Ensure both FlowFrames are in the same graph
|
|
617
|
+
self._ensure_same_graph(other)
|
|
618
|
+
|
|
619
|
+
# Step 3: Generate new node ID
|
|
620
|
+
new_node_id = generate_node_id()
|
|
621
|
+
|
|
622
|
+
# Step 4: Parse and validate join columns
|
|
623
|
+
left_columns, right_columns = self._parse_join_columns(
|
|
624
|
+
on, left_on, right_on, how
|
|
625
|
+
)
|
|
626
|
+
|
|
627
|
+
# Step 5: Validate column lists have same length (except for cross join)
|
|
628
|
+
if how != 'cross' and left_columns is not None and right_columns is not None:
|
|
629
|
+
if len(left_columns) != len(right_columns):
|
|
630
|
+
raise ValueError(
|
|
631
|
+
f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
|
|
632
|
+
)
|
|
599
633
|
|
|
634
|
+
# Step 6: Create join mappings if not using Polars code
|
|
600
635
|
join_mappings = None
|
|
636
|
+
if not use_polars_code and how != 'cross':
|
|
637
|
+
join_mappings, use_polars_code = _create_join_mappings(
|
|
638
|
+
left_columns or [], right_columns or []
|
|
639
|
+
)
|
|
640
|
+
|
|
641
|
+
# Step 7: Execute join based on approach
|
|
642
|
+
if use_polars_code or suffix != '_right':
|
|
643
|
+
return self._execute_polars_code_join(
|
|
644
|
+
other, new_node_id, on, left_on, right_on, left_columns, right_columns,
|
|
645
|
+
how, suffix, validate, nulls_equal, coalesce, maintain_order, description
|
|
646
|
+
)
|
|
647
|
+
elif join_mappings or how == 'cross':
|
|
648
|
+
return self._execute_native_join(
|
|
649
|
+
other, new_node_id, join_mappings, how, description
|
|
650
|
+
)
|
|
651
|
+
else:
|
|
652
|
+
raise ValueError("Could not execute join")
|
|
653
|
+
|
|
654
|
+
def _should_use_polars_code_for_join(
|
|
655
|
+
self, maintain_order, coalesce, nulls_equal, validate, suffix
|
|
656
|
+
) -> bool:
|
|
657
|
+
"""Determine if we should use Polars code instead of native join."""
|
|
658
|
+
return not (
|
|
659
|
+
maintain_order is None and
|
|
660
|
+
coalesce is None and
|
|
661
|
+
nulls_equal is False and
|
|
662
|
+
validate is None and
|
|
663
|
+
suffix == '_right'
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
def _ensure_same_graph(self, other: "FlowFrame") -> None:
|
|
667
|
+
"""Ensure both FlowFrames are in the same graph, combining if necessary."""
|
|
601
668
|
if self.flow_graph.flow_id != other.flow_graph.flow_id:
|
|
602
|
-
combined_graph, node_mappings = combine_flow_graphs_with_mapping(
|
|
669
|
+
combined_graph, node_mappings = combine_flow_graphs_with_mapping(
|
|
670
|
+
self.flow_graph, other.flow_graph
|
|
671
|
+
)
|
|
672
|
+
|
|
603
673
|
new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
|
|
604
674
|
new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
|
|
675
|
+
|
|
605
676
|
if new_other_node_id is None or new_self_node_id is None:
|
|
606
677
|
raise ValueError("Cannot remap the nodes")
|
|
678
|
+
|
|
607
679
|
self.node_id = new_self_node_id
|
|
608
680
|
other.node_id = new_other_node_id
|
|
609
681
|
self.flow_graph = combined_graph
|
|
610
682
|
other.flow_graph = combined_graph
|
|
611
|
-
|
|
612
|
-
node_id_counter += len(combined_graph.nodes)
|
|
613
|
-
new_node_id = generate_node_id()
|
|
683
|
+
node_id_data["c"] = node_id_data["c"] + len(combined_graph.nodes)
|
|
614
684
|
|
|
685
|
+
def _parse_join_columns(
|
|
686
|
+
self,
|
|
687
|
+
on: List[str | Column] | str | Column,
|
|
688
|
+
left_on: List[str | Column] | str | Column,
|
|
689
|
+
right_on: List[str | Column] | str | Column,
|
|
690
|
+
how: str
|
|
691
|
+
) -> tuple[List[str] | None, List[str] | None]:
|
|
692
|
+
"""Parse and validate join column specifications."""
|
|
615
693
|
if on is not None:
|
|
616
694
|
left_columns = right_columns = _normalize_columns_to_list(on)
|
|
617
695
|
elif left_on is not None and right_on is not None:
|
|
@@ -623,93 +701,180 @@ class FlowFrame:
|
|
|
623
701
|
else:
|
|
624
702
|
raise ValueError("Must specify either 'on' or both 'left_on' and 'right_on'")
|
|
625
703
|
|
|
626
|
-
|
|
627
|
-
if how != 'cross' and len(left_columns) != len(right_columns):
|
|
628
|
-
raise ValueError(
|
|
629
|
-
f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
|
|
630
|
-
)
|
|
631
|
-
if not use_polars_code:
|
|
632
|
-
join_mappings, use_polars_code = _create_join_mappings(
|
|
633
|
-
left_columns or [], right_columns or []
|
|
634
|
-
)
|
|
635
|
-
|
|
636
|
-
if use_polars_code or suffix != '_right':
|
|
704
|
+
return left_columns, right_columns
|
|
637
705
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
706
|
+
def _execute_polars_code_join(
|
|
707
|
+
self,
|
|
708
|
+
other: "FlowFrame",
|
|
709
|
+
new_node_id: int,
|
|
710
|
+
on: List[str | Column] | str | Column,
|
|
711
|
+
left_on: List[str | Column] | str | Column,
|
|
712
|
+
right_on: List[str | Column] | str | Column,
|
|
713
|
+
left_columns: List[str] | None,
|
|
714
|
+
right_columns: List[str] | None,
|
|
715
|
+
how: str,
|
|
716
|
+
suffix: str,
|
|
717
|
+
validate: str,
|
|
718
|
+
nulls_equal: bool,
|
|
719
|
+
coalesce: bool,
|
|
720
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
721
|
+
description: str,
|
|
722
|
+
) -> "FlowFrame":
|
|
723
|
+
"""Execute join using Polars code approach."""
|
|
724
|
+
# Build the code arguments
|
|
725
|
+
code_kwargs = self._build_polars_join_kwargs(
|
|
726
|
+
on, left_on, right_on, left_columns, right_columns,
|
|
727
|
+
how, suffix, validate, nulls_equal, coalesce, maintain_order
|
|
728
|
+
)
|
|
656
729
|
|
|
657
|
-
|
|
730
|
+
kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
|
|
731
|
+
code = f"input_df_1.join({kwargs_str})"
|
|
658
732
|
|
|
659
|
-
|
|
660
|
-
|
|
733
|
+
# Add the Polars code node
|
|
734
|
+
self._add_polars_code(
|
|
735
|
+
new_node_id, code, description,
|
|
736
|
+
depending_on_ids=[self.node_id, other.node_id]
|
|
737
|
+
)
|
|
661
738
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
else:
|
|
666
|
-
join_input = transform_schema.JoinInput(
|
|
667
|
-
join_mapping=join_mappings,
|
|
668
|
-
left_select=left_select.renames,
|
|
669
|
-
right_select=right_select.renames,
|
|
670
|
-
how=how,
|
|
671
|
-
)
|
|
739
|
+
# Add connections
|
|
740
|
+
self._add_connection(self.node_id, new_node_id, "main")
|
|
741
|
+
other._add_connection(other.node_id, new_node_id, "main")
|
|
672
742
|
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
depending_on_ids=[self.node_id, other.node_id],
|
|
681
|
-
description=description or f"Join with {how} strategy",
|
|
682
|
-
auto_generate_selection=True,
|
|
683
|
-
verify_integrity=True,
|
|
684
|
-
)
|
|
743
|
+
# Create and return result frame
|
|
744
|
+
return FlowFrame(
|
|
745
|
+
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
746
|
+
flow_graph=self.flow_graph,
|
|
747
|
+
node_id=new_node_id,
|
|
748
|
+
parent_node_id=self.node_id,
|
|
749
|
+
)
|
|
685
750
|
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
751
|
+
def _build_polars_join_kwargs(
|
|
752
|
+
self,
|
|
753
|
+
on: List[str | Column] | str | Column,
|
|
754
|
+
left_on: List[str | Column] | str | Column,
|
|
755
|
+
right_on: List[str | Column] | str | Column,
|
|
756
|
+
left_columns: List[str] | None,
|
|
757
|
+
right_columns: List[str] | None,
|
|
758
|
+
how: str,
|
|
759
|
+
suffix: str,
|
|
760
|
+
validate: str,
|
|
761
|
+
nulls_equal: bool,
|
|
762
|
+
coalesce: bool,
|
|
763
|
+
maintain_order: Literal[None, "left", "right", "left_right", "right_left"],
|
|
764
|
+
) -> dict:
|
|
765
|
+
"""Build kwargs dictionary for Polars join code."""
|
|
766
|
+
|
|
767
|
+
def format_column_list(cols):
|
|
768
|
+
if cols is None:
|
|
769
|
+
return None
|
|
770
|
+
return "[" + ', '.join(
|
|
771
|
+
f"'{v}'" if isinstance(v, str) else str(v)
|
|
772
|
+
for v in _normalize_columns_to_list(cols)
|
|
773
|
+
) + "]"
|
|
774
|
+
|
|
775
|
+
return {
|
|
776
|
+
"other": "input_df_2",
|
|
777
|
+
"how": _to_string_val(how),
|
|
778
|
+
"on": format_column_list(on) if on else None,
|
|
779
|
+
"left_on": format_column_list(left_columns) if left_on else None,
|
|
780
|
+
"right_on": format_column_list(right_columns) if right_on else None,
|
|
781
|
+
"suffix": _to_string_val(suffix),
|
|
782
|
+
"validate": _to_string_val(validate),
|
|
783
|
+
"nulls_equal": nulls_equal,
|
|
784
|
+
"coalesce": coalesce,
|
|
785
|
+
"maintain_order": _to_string_val(maintain_order)
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
def _execute_native_join(
    self,
    other: "FlowFrame",
    new_node_id: int,
    join_mappings: List | None,
    how: str,
    description: str,
) -> "FlowFrame":
    """Run the join through FlowFile's own join nodes and wrap the result.

    Args:
        other: The right-hand frame of the join.
        new_node_id: Identifier for the join node that gets added to the graph.
        join_mappings: Key mappings handed to ``JoinInput``; not used when
            ``how`` is ``'cross'``.
        how: Join strategy; ``'cross'`` selects the cross-join node type.
        description: Description passed on to the node that is created.

    Returns:
        FlowFrame: Frame built from the new node's resulting data, with this
        frame's node as parent.
    """
    is_cross = how == 'cross'

    # Column selections derived from each side's data.
    left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
    right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)

    if is_cross:
        join_input = transform_schema.CrossJoinInput(
            left_select=left_select.renames,
            right_select=right_select.renames,
        )
    else:
        join_input = transform_schema.JoinInput(
            join_mapping=join_mappings,
            left_select=left_select.renames,
            right_select=right_select.renames,
            how=how,
        )

    join_input.auto_rename()
    # Right-side columns flagged as join keys are switched off in the output.
    for right_column in right_select.renames:
        if not right_column.join_key:
            continue
        right_column.keep = False

    add_node = self._add_cross_join_node if is_cross else self._add_regular_join_node
    add_node(new_node_id, join_input, description, other)

    # Wire both inputs into the new node: this frame as "main", other as "right".
    self._add_connection(self.node_id, new_node_id, "main")
    other._add_connection(other.node_id, new_node_id, "right")

    joined_data = self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame
    return FlowFrame(
        data=joined_data,
        flow_graph=self.flow_graph,
        node_id=new_node_id,
        parent_node_id=self.node_id,
    )
|
|
836
|
+
|
|
837
|
+
def _add_cross_join_node(
    self,
    new_node_id: int,
    join_input: "transform_schema.CrossJoinInput",
    description: str,
    other: "FlowFrame",
) -> None:
    """Add a cross join node to the graph.

    Args:
        new_node_id: Identifier assigned to the new node.
        join_input: Cross-join configuration stored on the node.
        description: Node description; when falsy, the default
            ``"Join with cross strategy"`` is used.
        other: Right-hand frame; its node id is recorded as a dependency
            alongside this frame's node id.
    """
    cross_join_settings = input_schema.NodeCrossJoin(
        flow_id=self.flow_graph.flow_id,
        node_id=new_node_id,
        cross_join_input=join_input,
        is_setup=True,
        depending_on_ids=[self.node_id, other.node_id],
        # Plain literal: the original f-string had no placeholders.
        description=description or "Join with cross strategy",
        auto_generate_selection=True,
        verify_integrity=True,
    )
    self.flow_graph.add_cross_join(cross_join_settings)
|
|
856
|
+
|
|
857
|
+
def _add_regular_join_node(
    self,
    new_node_id: int,
    join_input: "transform_schema.JoinInput",
    description: str,
    other: "FlowFrame",
) -> None:
    """Register a non-cross join node on this frame's flow graph.

    Args:
        new_node_id: Identifier assigned to the new node.
        join_input: Join configuration stored on the node; its ``how`` value
            is used in the fallback description.
        description: Node description; when falsy, a default naming the join
            strategy is generated.
        other: Right-hand frame; its node id is recorded as a dependency
            alongside this frame's node id.
    """
    graph = self.flow_graph
    node_description = description or f"Join with {join_input.how} strategy"
    upstream_ids = [self.node_id, other.node_id]

    graph.add_join(
        input_schema.NodeJoin(
            flow_id=graph.flow_id,
            node_id=new_node_id,
            join_input=join_input,
            auto_generate_selection=True,
            verify_integrity=True,
            pos_x=200,
            pos_y=150,
            is_setup=True,
            depending_on_ids=upstream_ids,
            description=node_description,
        )
    )
|
|
713
878
|
|
|
714
879
|
def _add_number_of_records(self, new_node_id: int, description: str = None) -> "FlowFrame":
|
|
715
880
|
node_number_of_records = input_schema.NodeRecordCount(
|
|
@@ -923,7 +1088,7 @@ class FlowFrame:
|
|
|
923
1088
|
|
|
924
1089
|
def write_parquet(
|
|
925
1090
|
self,
|
|
926
|
-
path: str|os.PathLike,
|
|
1091
|
+
path: str | os.PathLike,
|
|
927
1092
|
*,
|
|
928
1093
|
description: str = None,
|
|
929
1094
|
convert_to_absolute_path: bool = True,
|
|
@@ -1093,6 +1258,117 @@ class FlowFrame:
|
|
|
1093
1258
|
|
|
1094
1259
|
return self._create_child_frame(new_node_id)
|
|
1095
1260
|
|
|
1261
|
+
def write_parquet_to_cloud_storage(
    self,
    path: str,
    connection_name: Optional[str] = None,
    compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
    description: Optional[str] = None,
) -> "FlowFrame":
    """
    Add a cloud-storage write step that stores this frame as Parquet.

    Args:
        path (str): Destination path in cloud storage for the Parquet file.
        connection_name (Optional[str], optional): Name of a user-created storage
            connection. If None, uses the default connection. Defaults to None.
        compression (Literal["snappy", "gzip", "brotli", "lz4", "zstd"], optional):
            Compression algorithm for the Parquet file. Defaults to "snappy".
        description (Optional[str], optional): Description of this operation for the ETL graph.

    Returns:
        FlowFrame: A new child data frame representing the written data.
    """
    write_options = dict(
        flow_graph=self.flow_graph,
        connection_name=connection_name,
        depends_on_node_id=self.node_id,
        parquet_compression=compression,
        file_format="parquet",
        description=description,
    )
    writer_node_id = add_write_ff_to_cloud_storage(path, **write_options)
    return self._create_child_frame(writer_node_id)
|
|
1289
|
+
|
|
1290
|
+
def write_csv_to_cloud_storage(
    self,
    path: str,
    connection_name: Optional[str] = None,
    delimiter: str = ";",
    encoding: CsvEncoding = "utf8",
    description: Optional[str] = None,
) -> "FlowFrame":
    """
    Add a cloud-storage write step that stores this frame as CSV.

    Args:
        path (str): Destination path in cloud storage for the CSV file.
        connection_name (Optional[str], optional): Name of a user-created storage
            connection. If None, uses the default connection. Defaults to None.
        delimiter (str, optional): Character separating fields in the CSV file.
            Defaults to ";".
        encoding (CsvEncoding, optional): Character encoding for the CSV file.
            Defaults to "utf8".
        description (Optional[str], optional): Description of this operation for the ETL graph.

    Returns:
        FlowFrame: A new child data frame representing the written data.
    """
    write_options = dict(
        flow_graph=self.flow_graph,
        connection_name=connection_name,
        depends_on_node_id=self.node_id,
        csv_delimiter=delimiter,
        csv_encoding=encoding,
        file_format="csv",
        description=description,
    )
    writer_node_id = add_write_ff_to_cloud_storage(path, **write_options)
    return self._create_child_frame(writer_node_id)
|
|
1321
|
+
|
|
1322
|
+
def write_delta(
    self,
    path: str,
    connection_name: Optional[str] = None,
    write_mode: Literal["overwrite", "append"] = "overwrite",
    description: Optional[str] = None,
) -> "FlowFrame":
    """
    Add a cloud-storage write step that stores this frame in Delta Lake format.

    Args:
        path (str): Destination path in cloud storage for the Delta table.
        connection_name (Optional[str], optional): Name of a user-created storage
            connection. If None, uses the default connection. Defaults to None.
        write_mode (Literal["overwrite", "append"], optional): Write mode for the
            Delta table. "overwrite" replaces existing data, "append" adds to
            existing data. Defaults to "overwrite".
        description (Optional[str], optional): Description of this operation for the ETL graph.

    Returns:
        FlowFrame: A new child data frame representing the written data.
    """
    write_options = dict(
        flow_graph=self.flow_graph,
        connection_name=connection_name,
        depends_on_node_id=self.node_id,
        write_mode=write_mode,
        file_format="delta",
        description=description,
    )
    writer_node_id = add_write_ff_to_cloud_storage(path, **write_options)
    return self._create_child_frame(writer_node_id)
|
|
1348
|
+
|
|
1349
|
+
def write_json_to_cloud_storage(
    self,
    path: str,
    connection_name: Optional[str] = None,
    description: Optional[str] = None,
) -> "FlowFrame":
    """
    Add a cloud-storage write step that stores this frame as JSON.

    Args:
        path (str): Destination path in cloud storage for the JSON file.
        connection_name (Optional[str], optional): Name of a user-created storage
            connection. If None, uses the default connection. Defaults to None.
        description (Optional[str], optional): Description of this operation for the ETL graph.

    Returns:
        FlowFrame: A new child data frame representing the written data.
    """
    write_options = dict(
        flow_graph=self.flow_graph,
        connection_name=connection_name,
        depends_on_node_id=self.node_id,
        file_format="json",
        description=description,
    )
    writer_node_id = add_write_ff_to_cloud_storage(path, **write_options)
    return self._create_child_frame(writer_node_id)
|
|
1371
|
+
|
|
1096
1372
|
def group_by(self, *by, description: str = None, maintain_order=False, **named_by) -> GroupByFrame:
|
|
1097
1373
|
"""
|
|
1098
1374
|
Start a group by operation.
|
|
@@ -1124,7 +1400,6 @@ class FlowFrame:
|
|
|
1124
1400
|
by_cols.append(col(col_expr).alias(new_name))
|
|
1125
1401
|
elif isinstance(col_expr, Expr):
|
|
1126
1402
|
by_cols.append(col_expr.alias(new_name))
|
|
1127
|
-
|
|
1128
1403
|
# Create a GroupByFrame
|
|
1129
1404
|
return GroupByFrame(
|
|
1130
1405
|
node_id=new_node_id,
|
|
@@ -1141,7 +1416,7 @@ class FlowFrame:
|
|
|
1141
1416
|
self.flow_graph.apply_layout()
|
|
1142
1417
|
self.flow_graph.save_flow(file_path)
|
|
1143
1418
|
|
|
1144
|
-
def collect(self, *args, **kwargs):
|
|
1419
|
+
def collect(self, *args, **kwargs) -> pl.DataFrame:
|
|
1145
1420
|
"""Collect lazy data into memory."""
|
|
1146
1421
|
if hasattr(self.data, "collect"):
|
|
1147
1422
|
return self.data.collect(*args, **kwargs)
|
|
@@ -1463,8 +1738,7 @@ class FlowFrame:
|
|
|
1463
1738
|
combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
|
|
1464
1739
|
for f in [self] + other:
|
|
1465
1740
|
f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
|
|
1466
|
-
|
|
1467
|
-
node_id_counter += len(combined_graph.nodes)
|
|
1741
|
+
node_id_data["c"] = node_id_data["c"] + len(combined_graph.nodes)
|
|
1468
1742
|
else:
|
|
1469
1743
|
combined_graph = self.flow_graph
|
|
1470
1744
|
new_node_id = generate_node_id()
|
|
@@ -1659,7 +1933,6 @@ class FlowFrame:
|
|
|
1659
1933
|
all_input_expr_objects: List[Expr] = []
|
|
1660
1934
|
pure_polars_expr_strings_for_wc: List[str] = []
|
|
1661
1935
|
collected_raw_definitions: List[str] = []
|
|
1662
|
-
|
|
1663
1936
|
has_exprs_or_named_exprs = bool(exprs or named_exprs)
|
|
1664
1937
|
if has_exprs_or_named_exprs:
|
|
1665
1938
|
actual_exprs_to_process: List[Expr] = []
|