Flowfile 0.3.1.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +2 -1
- flowfile/api.py +5 -3
- flowfile/web/__init__.py +3 -0
- flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
- flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
- flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
- flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
- flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
- flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
- flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
- flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
- flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
- flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
- flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
- flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
- flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
- flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
- flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
- flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
- flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
- flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
- flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
- flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
- flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
- flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
- flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
- flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
- flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
- flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
- flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
- flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
- flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
- flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
- flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
- flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
- flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
- flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
- flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
- flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -3
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +97 -88
- flowfile_core/configs/__init__.py +15 -4
- flowfile_core/configs/node_store/nodes.py +2 -4
- flowfile_core/configs/settings.py +5 -3
- flowfile_core/configs/utils.py +18 -0
- flowfile_core/flowfile/FlowfileFlow.py +84 -29
- flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +55 -18
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +34 -2
- flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
- flowfile_core/flowfile/flow_graph_utils.py +320 -0
- flowfile_core/flowfile/flow_node/flow_node.py +2 -1
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
- flowfile_core/flowfile/utils.py +34 -3
- flowfile_core/main.py +2 -3
- flowfile_core/routes/secrets.py +1 -1
- flowfile_core/schemas/input_schema.py +12 -14
- flowfile_core/schemas/transform_schema.py +25 -47
- flowfile_frame/__init__.py +11 -4
- flowfile_frame/adding_expr.py +280 -0
- flowfile_frame/config.py +9 -0
- flowfile_frame/expr.py +301 -83
- flowfile_frame/expr.pyi +2174 -0
- flowfile_frame/expr_name.py +258 -0
- flowfile_frame/flow_frame.py +616 -627
- flowfile_frame/flow_frame.pyi +336 -0
- flowfile_frame/flow_frame_methods.py +617 -0
- flowfile_frame/group_frame.py +89 -42
- flowfile_frame/join.py +1 -2
- flowfile_frame/lazy.py +704 -0
- flowfile_frame/lazy_methods.py +201 -0
- flowfile_frame/list_name_space.py +324 -0
- flowfile_frame/selectors.py +3 -0
- flowfile_frame/series.py +70 -0
- flowfile_frame/utils.py +80 -4
- flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
- flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
- /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
- /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -1,14 +1,17 @@
|
|
|
1
|
-
import
|
|
1
|
+
import inspect
|
|
2
2
|
import os
|
|
3
|
-
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
|
|
4
|
-
from pathlib import Path
|
|
3
|
+
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin
|
|
5
4
|
|
|
6
5
|
import re
|
|
6
|
+
|
|
7
7
|
import polars as pl
|
|
8
|
-
from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation
|
|
9
8
|
|
|
10
|
-
|
|
9
|
+
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
10
|
+
|
|
11
|
+
from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
|
|
12
|
+
from collections.abc import Iterator
|
|
11
13
|
from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
|
|
14
|
+
from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
|
|
12
15
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
13
16
|
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
14
17
|
from flowfile_core.schemas import input_schema, transform_schema
|
|
@@ -16,12 +19,36 @@ from flowfile_core.schemas import input_schema, transform_schema
|
|
|
16
19
|
from flowfile_frame.expr import Expr, Column, lit, col
|
|
17
20
|
from flowfile_frame.selectors import Selector
|
|
18
21
|
from flowfile_frame.group_frame import GroupByFrame
|
|
19
|
-
from flowfile_frame.utils import _parse_inputs_as_iterable, create_flow_graph
|
|
22
|
+
from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
|
|
23
|
+
ensure_inputs_as_iterable)
|
|
20
24
|
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
|
|
25
|
+
from flowfile_frame.utils import _check_if_convertible_to_code
|
|
26
|
+
from flowfile_frame.config import logger
|
|
27
|
+
|
|
21
28
|
|
|
22
29
|
node_id_counter = 0
|
|
23
30
|
|
|
24
31
|
|
|
32
|
+
def can_be_expr(param: inspect.Parameter) -> bool:
    """Check whether a parameter's annotation allows a ``pl.Expr``.

    Parameters
    ----------
    param : inspect.Parameter
        The parameter whose ``annotation`` is inspected.

    Returns
    -------
    bool
        ``False`` when the parameter has no annotation. Otherwise ``True`` when
        the annotation is ``pl.Expr`` itself or a union with ``pl.Expr`` as one
        of its direct members.
    """
    import types  # local import: only needed to recognise PEP 604 unions

    annotation = param.annotation
    if annotation is inspect.Parameter.empty:
        return False

    # ``Union[A, B]`` reports ``typing.Union`` as its origin, while ``A | B``
    # reports ``types.UnionType`` (Python 3.10+). Both must be unpacked.
    union_origins = (Union, getattr(types, "UnionType", Union))
    if get_origin(annotation) in union_origins:
        candidates = get_args(annotation)
    else:
        candidates = (annotation,)
    return any(t in (pl.Expr, pl.expr.expr.Expr) for t in candidates)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _contains_lambda_pattern(text: str) -> bool:
|
|
43
|
+
return "<lambda> at" in text
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_method_name_from_code(code: str) -> Optional[str]:
    """Extract the name of the first method called on ``input_df`` in ``code``.

    Parameters
    ----------
    code : str
        Generated code text, e.g. ``"input_df.sort(pl.col('a'))"``.

    Returns
    -------
    Optional[str]
        The text between the first ``"input_df."`` and the next ``"("`` (or the
        next ``"input_df."``, whichever bounds it), with surrounding whitespace
        removed. ``None`` when ``code`` does not contain ``"input_df."``.
    """
    segments = code.split("input_df.")
    if len(segments) < 2:
        return None
    # The result is used as an attribute name, so stray whitespace around it
    # (``input_df.sort (...)``) can never be meaningful.
    return segments[1].split("(", 1)[0].strip()
|
|
50
|
+
|
|
51
|
+
|
|
25
52
|
def _to_string_val(v) -> str:
|
|
26
53
|
if isinstance(v, str):
|
|
27
54
|
return f"'{v}'"
|
|
@@ -29,12 +56,72 @@ def _to_string_val(v) -> str:
|
|
|
29
56
|
return v
|
|
30
57
|
|
|
31
58
|
|
|
59
|
+
def _extract_expr_parts(expr_obj) -> tuple[str, str]:
    """
    Split an expression into its code string and its supporting definitions.

    Parameters
    ----------
    expr_obj : Expr
        The expression object to extract parts from. Any other object is
        accepted and simply stringified.

    Returns
    -------
    tuple[str, str]
        ``(pure_expr_str, raw_definitions_str)``. For an ``Expr`` the first item
        is its ``_repr_str`` and the second is its ``_function_sources`` entries,
        de-duplicated in first-seen order and joined by blank lines (empty when
        there are none). For a non-``Expr`` the result is ``(str(expr_obj), "")``.
    """
    if not isinstance(expr_obj, Expr):
        return str(expr_obj), ""

    expression_text = expr_obj._repr_str

    # dict.fromkeys keeps the first occurrence of each source, in order.
    sources = getattr(expr_obj, '_function_sources', None)
    distinct_sources = list(dict.fromkeys(sources)) if sources else []

    return expression_text, "\n\n".join(distinct_sources)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
|
|
103
|
+
group_expr: pl.Expr | None = None) -> None:
|
|
104
|
+
if method_name is None:
|
|
105
|
+
raise NotImplemented("Cannot create a polars lambda expression without the method")
|
|
106
|
+
if polars_expr is None:
|
|
107
|
+
raise NotImplemented("Cannot create polars expressions with lambda function")
|
|
108
|
+
method_ref = getattr(pl.LazyFrame, method_name)
|
|
109
|
+
if method_ref is None:
|
|
110
|
+
raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
|
|
111
|
+
if method_name == 'group_by':
|
|
112
|
+
if group_expr is None:
|
|
113
|
+
raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
|
|
114
|
+
if not all(isinstance(ge, pl.Expr) for ge in group_expr):
|
|
115
|
+
raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
|
|
116
|
+
|
|
117
|
+
|
|
32
118
|
def generate_node_id() -> int:
    """Advance the module-level ``node_id_counter`` by one and return the new value."""
    global node_id_counter
    node_id_counter = node_id_counter + 1
    return node_id_counter
|
|
36
122
|
|
|
37
123
|
|
|
124
|
+
@add_lazyframe_methods
|
|
38
125
|
class FlowFrame:
|
|
39
126
|
"""Main class that wraps FlowDataEngine and maintains the ETL graph."""
|
|
40
127
|
flow_graph: FlowGraph
|
|
@@ -89,13 +176,11 @@ class FlowFrame:
|
|
|
89
176
|
# Extract flow-specific parameters
|
|
90
177
|
node_id = node_id or generate_node_id()
|
|
91
178
|
description = "Data imported from Python object"
|
|
92
|
-
|
|
93
179
|
# Create a new flow graph if none is provided
|
|
94
180
|
if flow_graph is None:
|
|
95
181
|
flow_graph = create_flow_graph()
|
|
96
182
|
|
|
97
183
|
flow_id = flow_graph.flow_id
|
|
98
|
-
|
|
99
184
|
# Convert data to a polars DataFrame/LazyFrame
|
|
100
185
|
try:
|
|
101
186
|
# Use polars to convert from various types
|
|
@@ -110,25 +195,23 @@ class FlowFrame:
|
|
|
110
195
|
)
|
|
111
196
|
pl_data = pl_df.lazy()
|
|
112
197
|
except Exception as e:
|
|
113
|
-
raise ValueError(f"Could not
|
|
114
|
-
|
|
198
|
+
raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
|
|
115
199
|
# Create a FlowDataEngine to get data in the right format for manual input
|
|
116
200
|
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
117
|
-
|
|
201
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
202
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
118
203
|
# Create a manual input node
|
|
119
204
|
input_node = input_schema.NodeManualInput(
|
|
120
205
|
flow_id=flow_id,
|
|
121
206
|
node_id=node_id,
|
|
122
|
-
|
|
207
|
+
raw_data_format=raw_data_format,
|
|
123
208
|
pos_x=100,
|
|
124
209
|
pos_y=100,
|
|
125
210
|
is_setup=True,
|
|
126
211
|
description=description,
|
|
127
212
|
)
|
|
128
|
-
|
|
129
213
|
# Add to graph
|
|
130
214
|
flow_graph.add_manual_input(input_node)
|
|
131
|
-
|
|
132
215
|
# Return new frame
|
|
133
216
|
return FlowFrame(
|
|
134
217
|
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
@@ -152,7 +235,6 @@ class FlowFrame:
|
|
|
152
235
|
parent_node_id=None,
|
|
153
236
|
):
|
|
154
237
|
"""Create a new FlowFrame instance."""
|
|
155
|
-
|
|
156
238
|
# If data is not a LazyFrame, use the factory method
|
|
157
239
|
if data is not None and not isinstance(data, pl.LazyFrame):
|
|
158
240
|
return cls.create_from_any_type(
|
|
@@ -168,7 +250,6 @@ class FlowFrame:
|
|
|
168
250
|
parent_node_id=parent_node_id,
|
|
169
251
|
)
|
|
170
252
|
|
|
171
|
-
# Otherwise create the instance normally
|
|
172
253
|
instance = super().__new__(cls)
|
|
173
254
|
return instance
|
|
174
255
|
|
|
@@ -187,7 +268,6 @@ class FlowFrame:
|
|
|
187
268
|
parent_node_id=None,
|
|
188
269
|
):
|
|
189
270
|
"""Initialize the FlowFrame with data and graph references."""
|
|
190
|
-
|
|
191
271
|
if data is None:
|
|
192
272
|
data = pl.LazyFrame()
|
|
193
273
|
if not isinstance(data, pl.LazyFrame):
|
|
@@ -219,205 +299,235 @@ class FlowFrame:
|
|
|
219
299
|
def _create_child_frame(self, new_node_id):
    """Helper method to create a new FlowFrame that's a child of this one.

    Connects this frame's node to ``new_node_id`` in the flow graph, then wraps
    the resulting data of the new node in a ``FlowFrame`` that shares the same
    graph and records this frame's node as its parent.

    Raises
    ------
    ValueError
        If building the child frame raises ``AttributeError``. The original
        error is attached as the cause so the failing attribute stays visible.
    """
    self._add_connection(self.node_id, new_node_id)
    try:
        return FlowFrame(
            data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
            flow_graph=self.flow_graph,
            node_id=new_node_id,
            parent_node_id=self.node_id,
        )
    except AttributeError as err:
        raise ValueError('Could not execute the function') from err
|
|
228
311
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
description: str = None,
|
|
238
|
-
):
|
|
312
|
+
@staticmethod
|
|
313
|
+
def _generate_sort_polars_code(
|
|
314
|
+
pure_sort_expr_strs: List[str],
|
|
315
|
+
descending_values: List[bool],
|
|
316
|
+
nulls_last_values: List[bool],
|
|
317
|
+
multithreaded: bool,
|
|
318
|
+
maintain_order: bool,
|
|
319
|
+
) -> str:
|
|
239
320
|
"""
|
|
240
|
-
|
|
321
|
+
Generates the `input_df.sort(...)` Polars code string using pure expression strings.
|
|
322
|
+
"""
|
|
323
|
+
kwargs_for_code: Dict[str, Any] = {}
|
|
324
|
+
if any(descending_values):
|
|
325
|
+
kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
|
|
326
|
+
if any(nulls_last_values):
|
|
327
|
+
kwargs_for_code["nulls_last"] = nulls_last_values[0] if len(nulls_last_values) == 1 else nulls_last_values
|
|
328
|
+
if not multithreaded:
|
|
329
|
+
kwargs_for_code["multithreaded"] = multithreaded
|
|
330
|
+
if maintain_order:
|
|
331
|
+
kwargs_for_code["maintain_order"] = maintain_order
|
|
241
332
|
|
|
242
|
-
|
|
243
|
-
-----------
|
|
244
|
-
by : Expr, str, or list of Expr/str
|
|
245
|
-
Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
|
|
246
|
-
*more_by : Expr or str
|
|
247
|
-
Additional columns to sort by, specified as positional arguments.
|
|
248
|
-
descending : bool or list of bool, default False
|
|
249
|
-
Sort in descending order. When sorting by multiple columns, can be specified per column.
|
|
250
|
-
nulls_last : bool or list of bool, default False
|
|
251
|
-
Place null values last; can specify a single boolean or a sequence for per-column control.
|
|
252
|
-
multithreaded : bool, default True
|
|
253
|
-
Sort using multiple threads.
|
|
254
|
-
maintain_order : bool, default False
|
|
255
|
-
Whether the order should be maintained if elements are equal.
|
|
256
|
-
description : str, optional
|
|
257
|
-
Description of this operation for the ETL graph.
|
|
333
|
+
kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())
|
|
258
334
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
335
|
+
by_arg_for_code = pure_sort_expr_strs[0] if len(
|
|
336
|
+
pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
|
|
337
|
+
return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
|
|
338
|
+
|
|
339
|
+
def sort(
        self,
        by: Union[List[Union[Expr, str]], Expr, str],
        *more_by: Union[Expr, str],
        descending: Union[bool, List[bool]] = False,
        nulls_last: Union[bool, List[bool]] = False,
        multithreaded: bool = True,
        maintain_order: bool = False,
        description: Optional[str] = None,
) -> "FlowFrame":
    """
    Sort the dataframe by the given columns.

    Plain column names and unaltered ``Column`` objects sorted with default
    options are added to the graph as a native sort node. Anything else (other
    expressions, literals, ``nulls_last``, ``maintain_order`` or
    ``multithreaded=False``) is added as a Polars-code node.

    Parameters
    ----------
    by : Expr, str, or list of Expr/str
        Column(s) to sort by. Strings are treated as column names.
    *more_by : Expr or str
        Additional columns to sort by, specified as positional arguments.
    descending : bool or list of bool, default False
        Sort in descending order; a sequence gives one flag per sort key.
    nulls_last : bool or list of bool, default False
        Place null values last; a sequence gives one flag per sort key.
    multithreaded : bool, default True
        Sort using multiple threads.
    maintain_order : bool, default False
        Whether the order should be maintained if elements are equal.
    description : str, optional
        Description of this operation for the ETL graph.

    Returns
    -------
    FlowFrame
        A child frame bound to the newly added node.

    Raises
    ------
    ValueError
        If ``descending`` or ``nulls_last`` is a sequence whose length differs
        from the number of sort keys.
    """
    sort_keys: list = list(_parse_inputs_as_iterable((by,)))
    new_node_id = generate_node_id()
    if more_by:
        sort_keys.extend(_parse_inputs_as_iterable(more_by))

    all_processed_expr_objects: List[Expr] = []
    pure_polars_expr_strings_for_sort: List[str] = []
    collected_raw_definitions: List[str] = []
    column_names_for_native_node: List[str] = []

    # The native sort node only models column name + direction, so every other
    # option forces the Polars-code path.
    use_polars_code_path = bool(maintain_order or not multithreaded)

    if isinstance(nulls_last, (list, tuple)):
        if any(nulls_last):
            use_polars_code_path = True
    elif nulls_last is not False:
        use_polars_code_path = True

    for expr_input in sort_keys:
        current_expr_obj: Expr
        is_simple_col_for_native = False

        if isinstance(expr_input, str):
            current_expr_obj = col(expr_input)
            column_names_for_native_node.append(expr_input)
            is_simple_col_for_native = True
        elif isinstance(expr_input, Column):
            current_expr_obj = expr_input
            # Type ignore below due to simplified Column stub
            if not expr_input._select_input.is_altered:  # type: ignore
                column_names_for_native_node.append(expr_input.column_name)  # type: ignore
                is_simple_col_for_native = True
        elif isinstance(expr_input, Expr):
            current_expr_obj = expr_input
        else:  # Convert other types to lit
            current_expr_obj = lit(expr_input)

        # Anything but a plain name or an unaltered Column needs Polars code.
        if not is_simple_col_for_native:
            use_polars_code_path = True

        all_processed_expr_objects.append(current_expr_obj)

        pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
        pure_polars_expr_strings_for_sort.append(pure_expr_str)

        if raw_defs_str:
            # Definitions are kept unique here, in first-seen order.
            if raw_defs_str not in collected_raw_definitions:
                collected_raw_definitions.append(raw_defs_str)
            use_polars_code_path = True

    key_count = len(all_processed_expr_objects)
    # Tuples are normalised like lists; otherwise a tuple would be replicated
    # once per key as if it were a scalar flag.
    desc_values = list(descending) if isinstance(descending, (list, tuple)) else [descending] * key_count
    null_last_values = list(nulls_last) if isinstance(nulls_last, (list, tuple)) else [nulls_last] * key_count

    if len(desc_values) != key_count:
        raise ValueError("Length of 'descending' does not match the number of sort expressions.")
    if len(null_last_values) != key_count:
        raise ValueError("Length of 'nulls_last' does not match the number of sort expressions.")

    if use_polars_code_path:
        polars_operation_code = self._generate_sort_polars_code(
            pure_polars_expr_strings_for_sort, desc_values, null_last_values, multithreaded, maintain_order
        )

        final_code_for_node: str
        if collected_raw_definitions:
            definitions_section = "\n\n".join(collected_raw_definitions)
            # "\\#" yields the same two characters the original "\#" literal
            # produced, without relying on an invalid escape sequence.
            # NOTE(review): the leading backslash may have been meant as "\n" —
            # confirm against the code that splits on this marker.
            final_code_for_node = definitions_section + \
                                  "\\#─────SPLIT─────\n\n" + \
                                  f"output_df = {polars_operation_code}"
        else:
            final_code_for_node = polars_operation_code

        pl_expressions_for_fallback = [e.expr for e in all_processed_expr_objects if
                                       hasattr(e, 'expr') and e.expr is not None]
        kwargs_for_fallback = {
            "descending": desc_values[0] if len(desc_values) == 1 else desc_values,
            "nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
            "multithreaded": multithreaded, "maintain_order": maintain_order}

        self._add_polars_code(new_node_id, final_code_for_node, description, method_name="sort",
                              convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
                              polars_expr=pl_expressions_for_fallback,
                              kwargs_expr=kwargs_for_fallback)
    else:
        sort_inputs_for_node = [
            transform_schema.SortByInput(column=col_name_for_native, how="desc" if is_desc else "asc")
            # type: ignore
            for col_name_for_native, is_desc in zip(column_names_for_native_node, desc_values)
        ]
        sort_settings = input_schema.NodeSort(
            flow_id=self.flow_graph.flow_id, node_id=new_node_id, sort_input=sort_inputs_for_node,  # type: ignore
            pos_x=200, pos_y=150, is_setup=True, depending_on_id=self.node_id,
            description=description or f"Sort by {', '.join(column_names_for_native_node)}")
        self.flow_graph.add_sort(sort_settings)

    return self._create_child_frame(new_node_id)
|
|
356
461
|
|
|
357
|
-
def
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
kwargs["descending"] = descending_values
|
|
391
|
-
|
|
392
|
-
# Only add nulls_last if it's non-default
|
|
393
|
-
if any(nl for nl in nulls_last_values):
|
|
394
|
-
if len(nulls_last_values) == 1:
|
|
395
|
-
kwargs["nulls_last"] = nulls_last_values[0]
|
|
462
|
+
def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
|
|
463
|
+
depending_on_ids: List[str] | None = None, convertable_to_code: bool = True,
|
|
464
|
+
method_name: str = None, polars_expr: Expr | List[Expr] | None = None,
|
|
465
|
+
group_expr: Expr | List[Expr] | None = None,
|
|
466
|
+
kwargs_expr: Dict | None = None,
|
|
467
|
+
group_kwargs: Dict | None = None, ):
|
|
468
|
+
polars_code_for_node: str
|
|
469
|
+
if not convertable_to_code or _contains_lambda_pattern(code):
|
|
470
|
+
|
|
471
|
+
effective_method_name = get_method_name_from_code(
|
|
472
|
+
code) if method_name is None and "input_df." in code else method_name
|
|
473
|
+
|
|
474
|
+
pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
|
|
475
|
+
group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
|
|
476
|
+
|
|
477
|
+
_check_ok_for_serialization(polars_expr=pl_expr_list, method_name=effective_method_name,
|
|
478
|
+
group_expr=group_expr_list)
|
|
479
|
+
|
|
480
|
+
current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
|
|
481
|
+
result_lazyframe_or_expr: Any
|
|
482
|
+
|
|
483
|
+
if effective_method_name == "group_by":
|
|
484
|
+
group_kwargs = {} if group_kwargs is None else group_kwargs
|
|
485
|
+
if not group_expr_list:
|
|
486
|
+
raise ValueError("group_expr is required for group_by method in serialization fallback.")
|
|
487
|
+
target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
|
|
488
|
+
if not pl_expr_list:
|
|
489
|
+
raise ValueError(
|
|
490
|
+
"Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback.")
|
|
491
|
+
result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
|
|
492
|
+
elif effective_method_name:
|
|
493
|
+
result_lazyframe_or_expr = getattr(self.data, effective_method_name)(*pl_expr_list,
|
|
494
|
+
**current_kwargs_expr)
|
|
396
495
|
else:
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
496
|
+
raise ValueError(
|
|
497
|
+
"Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback.")
|
|
498
|
+
try:
|
|
499
|
+
if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
|
|
500
|
+
serialized_value_for_code = result_lazyframe_or_expr.serialize(format='json')
|
|
501
|
+
polars_code_for_node = "\n".join([
|
|
502
|
+
f"serialized_value = r'''{serialized_value_for_code}'''",
|
|
503
|
+
"buffer = BytesIO(serialized_value.encode('utf-8'))",
|
|
504
|
+
"output_df = pl.LazyFrame.deserialize(buffer, format='json')",
|
|
505
|
+
])
|
|
506
|
+
logger.warning(
|
|
507
|
+
f"Transformation '{effective_method_name}' uses non-serializable elements. "
|
|
508
|
+
"Falling back to serializing the resulting Polars LazyFrame object."
|
|
509
|
+
"This will result in a breaking graph when using the the ui."
|
|
510
|
+
)
|
|
511
|
+
else:
|
|
512
|
+
logger.error(
|
|
513
|
+
f"Fallback for non-convertible code for method '{effective_method_name}' "
|
|
514
|
+
f"resulted in a '{type(result_lazyframe_or_expr).__name__}' instead of a Polars LazyFrame. "
|
|
515
|
+
"This type cannot be persisted as a LazyFrame node via this fallback."
|
|
516
|
+
)
|
|
517
|
+
return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
|
|
518
|
+
except Exception as e:
|
|
519
|
+
logger.warning(
|
|
520
|
+
f"Critical error: Could not serialize the result of operation '{effective_method_name}' "
|
|
521
|
+
f"during fallback for non-convertible code. Error: {e}."
|
|
522
|
+
"When using a lambda function, consider defining the function first"
|
|
523
|
+
)
|
|
524
|
+
return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
|
|
412
525
|
else:
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
|
|
416
|
-
depending_on_ids: List[str] | None = None):
|
|
526
|
+
polars_code_for_node = code
|
|
417
527
|
polars_code_settings = input_schema.NodePolarsCode(
|
|
418
528
|
flow_id=self.flow_graph.flow_id,
|
|
419
529
|
node_id=new_node_id,
|
|
420
|
-
polars_code_input=transform_schema.PolarsCodeInput(polars_code=
|
|
530
|
+
polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code_for_node),
|
|
421
531
|
is_setup=True,
|
|
422
532
|
depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
|
|
423
533
|
description=description,
|
|
@@ -458,14 +568,17 @@ class FlowFrame:
|
|
|
458
568
|
validate : {"1:1", "1:m", "m:1", "m:m"}, optional
|
|
459
569
|
Validate join relationship.
|
|
460
570
|
nulls_equal:
|
|
461
|
-
Join on null values. By default null values will never produce matches.
|
|
571
|
+
Join on null values. By default, null values will never produce matches.
|
|
462
572
|
coalesce:
|
|
463
573
|
None: -> join specific.
|
|
464
574
|
True: -> Always coalesce join columns.
|
|
465
575
|
False: -> Never coalesce join columns.
|
|
466
576
|
maintain_order:
|
|
467
|
-
Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
|
|
468
|
-
|
|
577
|
+
Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
|
|
578
|
+
setting this parameter, as your code may break in a future release.
|
|
579
|
+
Not specifying any ordering can improve performance Supported for inner, left, right and full joins
|
|
580
|
+
None: No specific ordering is desired. The ordering might differ across Polars versions or even between
|
|
581
|
+
different runs.
|
|
469
582
|
left: Preserves the order of the left DataFrame.
|
|
470
583
|
right: Preserves the order of the right DataFrame.
|
|
471
584
|
left_right: First preserves the order of the left DataFrame, then the right.
|
|
@@ -478,14 +591,27 @@ class FlowFrame:
|
|
|
478
591
|
FlowFrame
|
|
479
592
|
New FlowFrame with join operation applied.
|
|
480
593
|
"""
|
|
481
|
-
new_node_id = generate_node_id()
|
|
482
|
-
print('new node id', new_node_id)
|
|
483
594
|
use_polars_code = not(maintain_order is None and
|
|
484
595
|
coalesce is None and
|
|
485
596
|
nulls_equal is False and
|
|
486
597
|
validate is None and
|
|
487
598
|
suffix == '_right')
|
|
599
|
+
|
|
488
600
|
join_mappings = None
|
|
601
|
+
if self.flow_graph.flow_id != other.flow_graph.flow_id:
|
|
602
|
+
combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
|
|
603
|
+
new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
|
|
604
|
+
new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
|
|
605
|
+
if new_other_node_id is None or new_self_node_id is None:
|
|
606
|
+
raise ValueError("Cannot remap the nodes")
|
|
607
|
+
self.node_id = new_self_node_id
|
|
608
|
+
other.node_id = new_other_node_id
|
|
609
|
+
self.flow_graph = combined_graph
|
|
610
|
+
other.flow_graph = combined_graph
|
|
611
|
+
global node_id_counter
|
|
612
|
+
node_id_counter += len(combined_graph.nodes)
|
|
613
|
+
new_node_id = generate_node_id()
|
|
614
|
+
|
|
489
615
|
if on is not None:
|
|
490
616
|
left_columns = right_columns = _normalize_columns_to_list(on)
|
|
491
617
|
elif left_on is not None and right_on is not None:
|
|
@@ -504,10 +630,11 @@ class FlowFrame:
|
|
|
504
630
|
)
|
|
505
631
|
if not use_polars_code:
|
|
506
632
|
join_mappings, use_polars_code = _create_join_mappings(
|
|
507
|
-
left_columns, right_columns
|
|
633
|
+
left_columns or [], right_columns or []
|
|
508
634
|
)
|
|
509
635
|
|
|
510
636
|
if use_polars_code or suffix != '_right':
|
|
637
|
+
|
|
511
638
|
_on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
|
|
512
639
|
_left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
|
|
513
640
|
_right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
|
|
@@ -527,31 +654,50 @@ class FlowFrame:
|
|
|
527
654
|
parent_node_id=self.node_id,
|
|
528
655
|
)
|
|
529
656
|
|
|
530
|
-
elif join_mappings:
|
|
657
|
+
elif join_mappings or how == 'cross':
|
|
658
|
+
|
|
531
659
|
left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
|
|
532
660
|
right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
|
|
533
661
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
662
|
+
if how == 'cross':
|
|
663
|
+
join_input = transform_schema.CrossJoinInput(left_select=left_select.renames,
|
|
664
|
+
right_select=right_select.renames,)
|
|
665
|
+
else:
|
|
666
|
+
join_input = transform_schema.JoinInput(
|
|
667
|
+
join_mapping=join_mappings,
|
|
668
|
+
left_select=left_select.renames,
|
|
669
|
+
right_select=right_select.renames,
|
|
670
|
+
how=how,
|
|
671
|
+
)
|
|
672
|
+
|
|
540
673
|
join_input.auto_rename()
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
674
|
+
if how == 'cross':
|
|
675
|
+
cross_join_settings = input_schema.NodeCrossJoin(
|
|
676
|
+
flow_id=self.flow_graph.flow_id,
|
|
677
|
+
node_id=new_node_id,
|
|
678
|
+
cross_join_input=join_input,
|
|
679
|
+
is_setup=True,
|
|
680
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
681
|
+
description=description or f"Join with {how} strategy",
|
|
682
|
+
auto_generate_selection=True,
|
|
683
|
+
verify_integrity=True,
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
self.flow_graph.add_cross_join(cross_join_settings)
|
|
687
|
+
else:
|
|
688
|
+
join_settings = input_schema.NodeJoin(
|
|
689
|
+
flow_id=self.flow_graph.flow_id,
|
|
690
|
+
node_id=new_node_id,
|
|
691
|
+
join_input=join_input,
|
|
692
|
+
auto_generate_selection=True,
|
|
693
|
+
verify_integrity=True,
|
|
694
|
+
pos_x=200,
|
|
695
|
+
pos_y=150,
|
|
696
|
+
is_setup=True,
|
|
697
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
698
|
+
description=description or f"Join with {how} strategy",
|
|
699
|
+
)
|
|
700
|
+
self.flow_graph.add_join(join_settings)
|
|
555
701
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
556
702
|
other._add_connection(other.node_id, new_node_id, "right")
|
|
557
703
|
result_frame = FlowFrame(
|
|
@@ -578,40 +724,68 @@ class FlowFrame:
|
|
|
578
724
|
self.flow_graph.add_record_count(node_number_of_records)
|
|
579
725
|
return self._create_child_frame(new_node_id)
|
|
580
726
|
|
|
581
|
-
def select(self, *columns, description: str = None):
|
|
727
|
+
def select(self, *columns: Union[str, Expr, Selector], description: Optional[str] = None) -> "FlowFrame":
|
|
582
728
|
"""
|
|
583
729
|
Select columns from the frame.
|
|
584
|
-
|
|
585
|
-
Args:
|
|
586
|
-
*columns: Column names or expressions
|
|
587
|
-
description: Description of the step, this will be shown in the flowfile file
|
|
588
|
-
|
|
589
|
-
Returns:
|
|
590
|
-
A new FlowFrame with selected columns
|
|
591
730
|
"""
|
|
592
|
-
|
|
593
|
-
columns = _parse_inputs_as_iterable(columns)
|
|
731
|
+
columns_iterable = list(_parse_inputs_as_iterable(columns))
|
|
594
732
|
new_node_id = generate_node_id()
|
|
595
|
-
existing_columns = self.columns
|
|
596
733
|
|
|
597
|
-
if (len(
|
|
598
|
-
and str(
|
|
734
|
+
if (len(columns_iterable) == 1 and isinstance(columns_iterable[0], Expr)
|
|
735
|
+
and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"):
|
|
599
736
|
return self._add_number_of_records(new_node_id, description)
|
|
600
737
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
738
|
+
all_input_expr_objects: List[Expr] = []
|
|
739
|
+
pure_polars_expr_strings_for_select: List[str] = []
|
|
740
|
+
collected_raw_definitions: List[str] = []
|
|
741
|
+
selected_col_names_for_native: List[str] = [] # For native node
|
|
742
|
+
|
|
743
|
+
can_use_native_node = True
|
|
744
|
+
|
|
745
|
+
if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == '*':
|
|
746
|
+
effective_columns_iterable = [col(c_name) for c_name in self.columns]
|
|
747
|
+
else:
|
|
748
|
+
effective_columns_iterable = columns_iterable
|
|
749
|
+
for expr_input in effective_columns_iterable:
|
|
750
|
+
current_expr_obj = expr_input
|
|
751
|
+
is_simple_col_for_native = False
|
|
752
|
+
|
|
753
|
+
if isinstance(expr_input, str):
|
|
754
|
+
current_expr_obj = col(expr_input)
|
|
755
|
+
selected_col_names_for_native.append(expr_input)
|
|
756
|
+
is_simple_col_for_native = True
|
|
757
|
+
elif isinstance(expr_input, Column) and not expr_input._select_input.is_altered: # type: ignore
|
|
758
|
+
selected_col_names_for_native.append(expr_input.column_name) # type: ignore
|
|
759
|
+
is_simple_col_for_native = True
|
|
760
|
+
elif isinstance(expr_input, Selector): # Selectors imply Polars code path
|
|
761
|
+
can_use_native_node = False
|
|
762
|
+
# current_expr_obj = expr_input # Already an Expr-like via selector
|
|
763
|
+
elif not isinstance(expr_input, Expr): # Includes Column
|
|
764
|
+
current_expr_obj = lit(expr_input)
|
|
765
|
+
|
|
766
|
+
all_input_expr_objects.append(current_expr_obj) # type: ignore
|
|
767
|
+
|
|
768
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
769
|
+
|
|
770
|
+
pure_polars_expr_strings_for_select.append(pure_expr_str)
|
|
771
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
772
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
773
|
+
|
|
774
|
+
if not is_simple_col_for_native and not isinstance(expr_input, Selector):
|
|
775
|
+
can_use_native_node = False # Complex expressions require Polars code
|
|
776
|
+
if collected_raw_definitions: # Has to use Polars code if there are definitions
|
|
777
|
+
can_use_native_node = False
|
|
778
|
+
if can_use_native_node:
|
|
779
|
+
select_inputs_for_node = [transform_schema.SelectInput(old_name=name) for name in
|
|
780
|
+
selected_col_names_for_native]
|
|
781
|
+
existing_cols = self.columns
|
|
782
|
+
dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_cols if
|
|
783
|
+
c not in selected_col_names_for_native]
|
|
784
|
+
select_inputs_for_node.extend(dropped_columns)
|
|
611
785
|
select_settings = input_schema.NodeSelect(
|
|
612
786
|
flow_id=self.flow_graph.flow_id,
|
|
613
787
|
node_id=new_node_id,
|
|
614
|
-
select_input=
|
|
788
|
+
select_input=select_inputs_for_node,
|
|
615
789
|
keep_missing=False,
|
|
616
790
|
pos_x=200,
|
|
617
791
|
pos_y=100,
|
|
@@ -619,60 +793,97 @@ class FlowFrame:
|
|
|
619
793
|
depending_on_id=self.node_id,
|
|
620
794
|
description=description
|
|
621
795
|
)
|
|
622
|
-
|
|
623
|
-
# Add to graph
|
|
624
796
|
self.flow_graph.add_select(select_settings)
|
|
625
|
-
return self._create_child_frame(new_node_id)
|
|
626
|
-
|
|
627
797
|
else:
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
print('warning this cannot be converted to flowfile frontend. Make sure you use the flowfile expr')
|
|
637
|
-
is_readable = False
|
|
638
|
-
elif isinstance(col_, str) and col_ in self.columns:
|
|
639
|
-
col_expr = Column(col_)
|
|
640
|
-
readable_exprs.append(col_expr)
|
|
641
|
-
else:
|
|
642
|
-
lit_expr = lit(col_)
|
|
643
|
-
readable_exprs.append(lit_expr)
|
|
644
|
-
if is_readable:
|
|
645
|
-
code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
|
|
798
|
+
polars_operation_code = f"input_df.select([{', '.join(pure_polars_expr_strings_for_select)}])"
|
|
799
|
+
final_code_for_node: str
|
|
800
|
+
if collected_raw_definitions:
|
|
801
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
802
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
803
|
+
final_code_for_node = definitions_section + \
|
|
804
|
+
"\#─────SPLIT─────\n\n" + \
|
|
805
|
+
f"output_df = {polars_operation_code}"
|
|
646
806
|
else:
|
|
647
|
-
|
|
807
|
+
final_code_for_node = polars_operation_code
|
|
648
808
|
|
|
649
|
-
|
|
650
|
-
|
|
809
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
810
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
811
|
+
self._add_polars_code(new_node_id, final_code_for_node, description,
|
|
812
|
+
method_name="select",
|
|
813
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
814
|
+
polars_expr=pl_expressions_for_fallback)
|
|
815
|
+
|
|
816
|
+
return self._create_child_frame(new_node_id)
|
|
651
817
|
|
|
652
|
-
def filter(self,
|
|
818
|
+
def filter(self, *predicates: Union[Expr, Any], flowfile_formula: Optional[str] = None,
|
|
819
|
+
description: Optional[str] = None, **constraints: Any) -> "FlowFrame":
|
|
653
820
|
"""
|
|
654
821
|
Filter rows based on a predicate.
|
|
655
|
-
|
|
656
|
-
Args:
|
|
657
|
-
predicate: Filter condition
|
|
658
|
-
flowfile_formula: Native support in frontend
|
|
659
|
-
description: Description of the step that is performed
|
|
660
|
-
Returns:
|
|
661
|
-
A new FlowFrame with filtered rows
|
|
662
822
|
"""
|
|
823
|
+
if (len(predicates) > 0 or len(constraints) > 0) and flowfile_formula:
|
|
824
|
+
raise ValueError("You can only use one of the following: predicates, constraints or flowfile_formula")
|
|
825
|
+
available_columns = self.columns
|
|
663
826
|
new_node_id = generate_node_id()
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
827
|
+
if len(predicates) > 0 or len(constraints) > 0:
|
|
828
|
+
all_input_expr_objects: List[Expr] = []
|
|
829
|
+
pure_polars_expr_strings: List[str] = []
|
|
830
|
+
collected_raw_definitions: List[str] = []
|
|
831
|
+
|
|
832
|
+
processed_predicates = []
|
|
833
|
+
for pred_item in predicates:
|
|
834
|
+
if isinstance(pred_item, (tuple, list, Iterator)):
|
|
835
|
+
# If it's a sequence, extend the processed_predicates with its elements
|
|
836
|
+
processed_predicates.extend(list(pred_item))
|
|
837
|
+
else:
|
|
838
|
+
# Otherwise, just add the item
|
|
839
|
+
processed_predicates.append(pred_item)
|
|
840
|
+
|
|
841
|
+
for pred_input in processed_predicates: # Loop over the processed_predicates
|
|
842
|
+
# End of the new/modified section
|
|
843
|
+
current_expr_obj = None # Initialize current_expr_obj
|
|
844
|
+
if isinstance(pred_input, Expr):
|
|
845
|
+
current_expr_obj = pred_input
|
|
846
|
+
elif isinstance(pred_input, str) and pred_input in available_columns:
|
|
847
|
+
current_expr_obj = col(pred_input)
|
|
848
|
+
else:
|
|
849
|
+
current_expr_obj = lit(pred_input)
|
|
850
|
+
|
|
851
|
+
all_input_expr_objects.append(current_expr_obj)
|
|
852
|
+
|
|
853
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
854
|
+
pure_polars_expr_strings.append(f"({pure_expr_str})")
|
|
855
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
856
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
857
|
+
|
|
858
|
+
for k, v_val in constraints.items():
|
|
859
|
+
constraint_expr_obj = (col(k) == lit(v_val))
|
|
860
|
+
all_input_expr_objects.append(constraint_expr_obj)
|
|
861
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(
|
|
862
|
+
constraint_expr_obj) # Constraint exprs are unlikely to have defs
|
|
863
|
+
pure_polars_expr_strings.append(f"({pure_expr_str})")
|
|
864
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions: # Should be rare here
|
|
865
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
866
|
+
|
|
867
|
+
filter_conditions_str = " & ".join(pure_polars_expr_strings) if pure_polars_expr_strings else "pl.lit(True)"
|
|
868
|
+
polars_operation_code = f"input_df.filter({filter_conditions_str})"
|
|
869
|
+
|
|
870
|
+
final_code_for_node: str
|
|
871
|
+
if collected_raw_definitions:
|
|
872
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions)) # Order-preserving unique
|
|
873
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
874
|
+
final_code_for_node = definitions_section + \
|
|
875
|
+
"\#─────SPLIT─────\n\n" + \
|
|
876
|
+
f"output_df = {polars_operation_code}"
|
|
669
877
|
else:
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
878
|
+
final_code_for_node = polars_operation_code
|
|
879
|
+
|
|
880
|
+
convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
|
|
881
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
882
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
883
|
+
self._add_polars_code(new_node_id, final_code_for_node, description, method_name="filter",
|
|
884
|
+
convertable_to_code=convertable_to_code,
|
|
885
|
+
polars_expr=pl_expressions_for_fallback)
|
|
674
886
|
elif flowfile_formula:
|
|
675
|
-
# Create node settings
|
|
676
887
|
filter_settings = input_schema.NodeFilter(
|
|
677
888
|
flow_id=self.flow_graph.flow_id,
|
|
678
889
|
node_id=new_node_id,
|
|
@@ -686,8 +897,10 @@ class FlowFrame:
|
|
|
686
897
|
depending_on_id=self.node_id,
|
|
687
898
|
description=description
|
|
688
899
|
)
|
|
689
|
-
|
|
690
900
|
self.flow_graph.add_filter(filter_settings)
|
|
901
|
+
else:
|
|
902
|
+
logger.info("Filter called with no arguments; creating a pass-through Polars code node.")
|
|
903
|
+
self._add_polars_code(new_node_id, "output_df = input_df", description or "No-op filter", method_name=None)
|
|
691
904
|
|
|
692
905
|
return self._create_child_frame(new_node_id)
|
|
693
906
|
|
|
@@ -772,7 +985,7 @@ class FlowFrame:
|
|
|
772
985
|
if convert_to_absolute_path:
|
|
773
986
|
output_settings.directory = output_settings.abs_file_path
|
|
774
987
|
except Exception as e:
|
|
775
|
-
|
|
988
|
+
logger.warning(f"Could not determine absolute path for {file_str}: {e}")
|
|
776
989
|
|
|
777
990
|
if not use_polars_code:
|
|
778
991
|
node_output = input_schema.NodeOutput(
|
|
@@ -800,7 +1013,7 @@ class FlowFrame:
|
|
|
800
1013
|
|
|
801
1014
|
# Use sink_parquet for LazyFrames
|
|
802
1015
|
code = f"input_df.sink_parquet({args_str})"
|
|
803
|
-
|
|
1016
|
+
logger.debug(f"Generated Polars Code: {code}")
|
|
804
1017
|
self._add_polars_code(new_node_id, code, description)
|
|
805
1018
|
|
|
806
1019
|
return self._create_child_frame(new_node_id)
|
|
@@ -848,7 +1061,7 @@ class FlowFrame:
|
|
|
848
1061
|
if convert_to_absolute_path:
|
|
849
1062
|
output_settings.directory = output_settings.abs_file_path
|
|
850
1063
|
except Exception as e:
|
|
851
|
-
|
|
1064
|
+
logger.warning(f"Could not determine absolute path for {file_str}: {e}")
|
|
852
1065
|
|
|
853
1066
|
if not use_polars_code:
|
|
854
1067
|
node_output = input_schema.NodeOutput(
|
|
@@ -881,7 +1094,7 @@ class FlowFrame:
|
|
|
881
1094
|
args_str += f", {kwargs_repr}"
|
|
882
1095
|
|
|
883
1096
|
code = f"input_df.collect().write_csv({args_str})"
|
|
884
|
-
|
|
1097
|
+
logger.debug(f"Generated Polars Code: {code}")
|
|
885
1098
|
self._add_polars_code(new_node_id, code, description)
|
|
886
1099
|
|
|
887
1100
|
return self._create_child_frame(new_node_id)
|
|
@@ -934,10 +1147,10 @@ class FlowFrame:
|
|
|
934
1147
|
self.flow_graph.apply_layout()
|
|
935
1148
|
self.flow_graph.save_flow(file_path)
|
|
936
1149
|
|
|
937
|
-
def collect(self):
|
|
1150
|
+
def collect(self, *args, **kwargs):
|
|
938
1151
|
"""Collect lazy data into memory."""
|
|
939
1152
|
if hasattr(self.data, "collect"):
|
|
940
|
-
return self.data.collect()
|
|
1153
|
+
return self.data.collect(*args, **kwargs)
|
|
941
1154
|
return self.data
|
|
942
1155
|
|
|
943
1156
|
def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
|
|
@@ -946,7 +1159,7 @@ class FlowFrame:
|
|
|
946
1159
|
input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
|
|
947
1160
|
function=transform_schema.FunctionInput(
|
|
948
1161
|
function=flowfile_formula,
|
|
949
|
-
field=transform_schema.FieldInput(name=output_column_name)),
|
|
1162
|
+
field=transform_schema.FieldInput(name=output_column_name, data_type='Auto')),
|
|
950
1163
|
description=description))
|
|
951
1164
|
self.flow_graph.add_formula(function_settings)
|
|
952
1165
|
return self._create_child_frame(new_node_id)
|
|
@@ -1241,16 +1454,27 @@ class FlowFrame:
|
|
|
1241
1454
|
FlowFrame
|
|
1242
1455
|
A new FlowFrame with the concatenated data
|
|
1243
1456
|
"""
|
|
1244
|
-
new_node_id = generate_node_id()
|
|
1245
|
-
|
|
1246
1457
|
# Convert single FlowFrame to list
|
|
1247
1458
|
if isinstance(other, FlowFrame):
|
|
1248
1459
|
others = [other]
|
|
1249
1460
|
else:
|
|
1250
1461
|
others = other
|
|
1251
|
-
|
|
1462
|
+
all_graphs = []
|
|
1463
|
+
all_graph_ids = []
|
|
1464
|
+
for g in [self.flow_graph] + [f.flow_graph for f in others]:
|
|
1465
|
+
if g.flow_id not in all_graph_ids:
|
|
1466
|
+
all_graph_ids.append(g.flow_id)
|
|
1467
|
+
all_graphs.append(g)
|
|
1468
|
+
if len(all_graphs) > 1:
|
|
1469
|
+
combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
|
|
1470
|
+
for f in [self] + other:
|
|
1471
|
+
f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
|
|
1472
|
+
global node_id_counter
|
|
1473
|
+
node_id_counter += len(combined_graph.nodes)
|
|
1474
|
+
else:
|
|
1475
|
+
combined_graph = self.flow_graph
|
|
1476
|
+
new_node_id = generate_node_id()
|
|
1252
1477
|
use_native = how == "diagonal_relaxed" and parallel and not rechunk
|
|
1253
|
-
|
|
1254
1478
|
if use_native:
|
|
1255
1479
|
# Create union input for the transform schema
|
|
1256
1480
|
union_input = transform_schema.UnionInput(
|
|
@@ -1284,7 +1508,6 @@ class FlowFrame:
|
|
|
1284
1508
|
input_vars.append(f"input_df_{i+2}")
|
|
1285
1509
|
|
|
1286
1510
|
frames_list = f"[{', '.join(input_vars)}]"
|
|
1287
|
-
|
|
1288
1511
|
code = f"""
|
|
1289
1512
|
# Perform concat operation
|
|
1290
1513
|
output_df = pl.concat(
|
|
@@ -1294,19 +1517,20 @@ class FlowFrame:
|
|
|
1294
1517
|
parallel={parallel}
|
|
1295
1518
|
)
|
|
1296
1519
|
"""
|
|
1297
|
-
|
|
1520
|
+
self.flow_graph = combined_graph
|
|
1298
1521
|
|
|
1299
1522
|
# Add polars code node with dependencies on all input frames
|
|
1300
1523
|
depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
|
|
1301
1524
|
self._add_polars_code(
|
|
1302
1525
|
new_node_id, code, description, depending_on_ids=depending_on_ids
|
|
1303
1526
|
)
|
|
1304
|
-
|
|
1305
1527
|
# Add connections to ensure all frames are available
|
|
1306
1528
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
1529
|
+
|
|
1307
1530
|
for other_frame in others:
|
|
1308
|
-
other_frame._add_connection(other_frame.node_id, new_node_id, "main")
|
|
1309
1531
|
|
|
1532
|
+
other_frame.flow_graph = combined_graph
|
|
1533
|
+
other_frame._add_connection(other_frame.node_id, new_node_id, "main")
|
|
1310
1534
|
# Create and return the new frame
|
|
1311
1535
|
return FlowFrame(
|
|
1312
1536
|
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
@@ -1343,7 +1567,7 @@ class FlowFrame:
|
|
|
1343
1567
|
return False, None
|
|
1344
1568
|
|
|
1345
1569
|
# Extract the output name
|
|
1346
|
-
output_name = expr.
|
|
1570
|
+
output_name = expr.column_name
|
|
1347
1571
|
|
|
1348
1572
|
if ".over(" not in expr._repr_str:
|
|
1349
1573
|
# Simple cumulative count can be implemented as a record ID with offset=1
|
|
@@ -1426,62 +1650,70 @@ class FlowFrame:
|
|
|
1426
1650
|
return False, None
|
|
1427
1651
|
|
|
1428
1652
|
def with_columns(
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1653
|
+
self,
|
|
1654
|
+
*exprs: Union[Expr, Iterable[Expr], Any], # Allow Any for implicit lit conversion
|
|
1655
|
+
flowfile_formulas: Optional[List[str]] = None,
|
|
1656
|
+
output_column_names: Optional[List[str]] = None,
|
|
1657
|
+
description: Optional[str] = None,
|
|
1658
|
+
**named_exprs: Union[Expr, Any], # Allow Any for implicit lit conversion
|
|
1435
1659
|
) -> "FlowFrame":
|
|
1436
1660
|
"""
|
|
1437
|
-
Add
|
|
1438
|
-
|
|
1439
|
-
Parameters
|
|
1440
|
-
----------
|
|
1441
|
-
exprs : Expr or List[Expr], optional
|
|
1442
|
-
Expressions to evaluate as new columns
|
|
1443
|
-
flowfile_formulas : List[str], optional
|
|
1444
|
-
Alternative approach using flowfile formula syntax
|
|
1445
|
-
output_column_names : List[str], optional
|
|
1446
|
-
Column names for the flowfile formulas
|
|
1447
|
-
description : str, optional
|
|
1448
|
-
Description of this operation for the ETL graph
|
|
1449
|
-
|
|
1450
|
-
Returns
|
|
1451
|
-
-------
|
|
1452
|
-
FlowFrame
|
|
1453
|
-
A new FlowFrame with the columns added
|
|
1454
|
-
|
|
1455
|
-
Raises
|
|
1456
|
-
------
|
|
1457
|
-
ValueError
|
|
1458
|
-
If neither exprs nor flowfile_formulas with output_column_names are provided,
|
|
1459
|
-
or if the lengths of flowfile_formulas and output_column_names don't match
|
|
1661
|
+
Add or replace columns in the DataFrame.
|
|
1460
1662
|
"""
|
|
1461
|
-
|
|
1462
|
-
new_node_id = generate_node_id()
|
|
1463
|
-
exprs_iterable = _parse_inputs_as_iterable((exprs,))
|
|
1663
|
+
new_node_id = generate_node_id()
|
|
1464
1664
|
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1665
|
+
all_input_expr_objects: List[Expr] = []
|
|
1666
|
+
pure_polars_expr_strings_for_wc: List[str] = []
|
|
1667
|
+
collected_raw_definitions: List[str] = []
|
|
1668
|
+
|
|
1669
|
+
has_exprs_or_named_exprs = bool(exprs or named_exprs)
|
|
1670
|
+
if has_exprs_or_named_exprs:
|
|
1671
|
+
actual_exprs_to_process: List[Expr] = []
|
|
1672
|
+
temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
|
|
1673
|
+
|
|
1674
|
+
for item in temp_exprs_iterable:
|
|
1675
|
+
if isinstance(item, Expr):
|
|
1676
|
+
actual_exprs_to_process.append(item)
|
|
1677
|
+
else: # auto-lit for non-Expr positional args
|
|
1678
|
+
actual_exprs_to_process.append(lit(item))
|
|
1679
|
+
|
|
1680
|
+
for name, val_expr in named_exprs.items():
|
|
1681
|
+
if isinstance(val_expr, Expr):
|
|
1682
|
+
actual_exprs_to_process.append(val_expr.alias(name)) # type: ignore # Assuming Expr has alias
|
|
1683
|
+
else: # auto-lit for named args and then alias
|
|
1684
|
+
actual_exprs_to_process.append(lit(val_expr).alias(name)) # type: ignore
|
|
1685
|
+
|
|
1686
|
+
if len(actual_exprs_to_process) == 1 and isinstance(actual_exprs_to_process[0], Expr):
|
|
1687
|
+
pass
|
|
1688
|
+
|
|
1689
|
+
for current_expr_obj in actual_exprs_to_process:
|
|
1690
|
+
all_input_expr_objects.append(current_expr_obj)
|
|
1691
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
1692
|
+
pure_polars_expr_strings_for_wc.append(pure_expr_str) # with_columns takes individual expressions
|
|
1693
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
1694
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
1695
|
+
|
|
1696
|
+
polars_operation_code = f"input_df.with_columns([{', '.join(pure_polars_expr_strings_for_wc)}])"
|
|
1697
|
+
|
|
1698
|
+
final_code_for_node: str
|
|
1699
|
+
if collected_raw_definitions:
|
|
1700
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
1701
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
1702
|
+
final_code_for_node = definitions_section + \
|
|
1703
|
+
"\n#─────SPLIT─────\n\n" + \
|
|
1704
|
+
f"output_df = {polars_operation_code}"
|
|
1705
|
+
else:
|
|
1706
|
+
final_code_for_node = polars_operation_code
|
|
1477
1707
|
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1708
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
1709
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
1710
|
+
self._add_polars_code(new_node_id, final_code_for_node, description, method_name='with_columns',
|
|
1711
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
1712
|
+
polars_expr=pl_expressions_for_fallback)
|
|
1482
1713
|
return self._create_child_frame(new_node_id)
|
|
1483
1714
|
|
|
1484
1715
|
elif flowfile_formulas is not None and output_column_names is not None:
|
|
1716
|
+
|
|
1485
1717
|
if len(output_column_names) != len(flowfile_formulas):
|
|
1486
1718
|
raise ValueError(
|
|
1487
1719
|
"Length of both the formulas and the output columns names must be identical"
|
|
@@ -1494,9 +1726,7 @@ class FlowFrame:
|
|
|
1494
1726
|
ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
|
|
1495
1727
|
return ff
|
|
1496
1728
|
else:
|
|
1497
|
-
raise ValueError(
|
|
1498
|
-
"Either exprs or flowfile_formulas with output_column_names must be provided"
|
|
1499
|
-
)
|
|
1729
|
+
raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")
|
|
1500
1730
|
|
|
1501
1731
|
def with_row_index(
|
|
1502
1732
|
self, name: str = "index", offset: int = 0, description: str = None
|
|
@@ -1584,26 +1814,27 @@ class FlowFrame:
|
|
|
1584
1814
|
|
|
1585
1815
|
if isinstance(columns, (list, tuple)):
|
|
1586
1816
|
all_columns.extend(
|
|
1587
|
-
[col.
|
|
1817
|
+
[col.column_name if isinstance(col, Column) else col for col in columns]
|
|
1588
1818
|
)
|
|
1589
1819
|
else:
|
|
1590
|
-
all_columns.append(columns.
|
|
1820
|
+
all_columns.append(columns.column_name if isinstance(columns, Column) else columns)
|
|
1591
1821
|
|
|
1592
1822
|
if more_columns:
|
|
1593
1823
|
for col in more_columns:
|
|
1594
|
-
all_columns.append(col.
|
|
1824
|
+
all_columns.append(col.column_name if isinstance(col, Column) else col)
|
|
1595
1825
|
|
|
1596
1826
|
if len(all_columns) == 1:
|
|
1597
|
-
|
|
1827
|
+
|
|
1828
|
+
columns_str = stringify_values(all_columns[0])
|
|
1598
1829
|
else:
|
|
1599
|
-
columns_str = "[" + ", ".join([
|
|
1830
|
+
columns_str = "[" + ", ".join([ stringify_values(col) for col in all_columns]) + "]"
|
|
1600
1831
|
|
|
1601
1832
|
code = f"""
|
|
1602
1833
|
# Explode columns into multiple rows
|
|
1603
1834
|
output_df = input_df.explode({columns_str})
|
|
1604
1835
|
"""
|
|
1605
1836
|
|
|
1606
|
-
cols_desc = ", ".join(all_columns)
|
|
1837
|
+
cols_desc = ", ".join(str(s) for s in all_columns)
|
|
1607
1838
|
desc = description or f"Explode column(s): {cols_desc}"
|
|
1608
1839
|
|
|
1609
1840
|
# Add polars code node
|
|
@@ -1646,7 +1877,7 @@ class FlowFrame:
|
|
|
1646
1877
|
new_node_id = generate_node_id()
|
|
1647
1878
|
|
|
1648
1879
|
if isinstance(column, Column):
|
|
1649
|
-
column_name = column.
|
|
1880
|
+
column_name = column.column_name
|
|
1650
1881
|
else:
|
|
1651
1882
|
column_name = column
|
|
1652
1883
|
|
|
@@ -1730,7 +1961,7 @@ class FlowFrame:
|
|
|
1730
1961
|
if col_expr._select_input.is_altered:
|
|
1731
1962
|
can_use_native = False
|
|
1732
1963
|
break
|
|
1733
|
-
processed_subset.append(col_expr.
|
|
1964
|
+
processed_subset.append(col_expr.column_name)
|
|
1734
1965
|
else:
|
|
1735
1966
|
can_use_native = False
|
|
1736
1967
|
break
|
|
@@ -1818,276 +2049,34 @@ class FlowFrame:
|
|
|
1818
2049
|
"""Get the number of columns."""
|
|
1819
2050
|
return self.data.width
|
|
1820
2051
|
|
|
2052
|
+
def __contains__(self, key):
|
|
2053
|
+
"""This special method enables the 'in' operator to work with FlowFrame objects."""
|
|
2054
|
+
return key in self.data
|
|
1821
2055
|
|
|
1822
|
-
def
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
"profile",
|
|
1827
|
-
"describe",
|
|
1828
|
-
"explain",
|
|
1829
|
-
"show_graph",
|
|
1830
|
-
"serialize",
|
|
1831
|
-
"fetch",
|
|
1832
|
-
"get_meta",
|
|
1833
|
-
"columns",
|
|
1834
|
-
"dtypes",
|
|
1835
|
-
"schema",
|
|
1836
|
-
"estimated_size",
|
|
1837
|
-
"n_chunks",
|
|
1838
|
-
"is_empty",
|
|
1839
|
-
"chunk_lengths",
|
|
1840
|
-
"optimization_toggle",
|
|
1841
|
-
"set_polars_options",
|
|
1842
|
-
"collect_schema"
|
|
1843
|
-
]
|
|
1844
|
-
|
|
1845
|
-
already_implemented = set(dir(FlowFrame))
|
|
1846
|
-
|
|
1847
|
-
for method_name in delegate_methods:
|
|
1848
|
-
if method_name not in already_implemented and hasattr(
|
|
1849
|
-
pl.LazyFrame, method_name
|
|
1850
|
-
):
|
|
1851
|
-
# Create a simple delegate method
|
|
1852
|
-
def make_delegate(name):
|
|
1853
|
-
def delegate_method(self, *args, **kwargs):
|
|
1854
|
-
return getattr(self.data, name)(*args, **kwargs)
|
|
1855
|
-
|
|
1856
|
-
# Set docstring and name
|
|
1857
|
-
delegate_method.__doc__ = (
|
|
1858
|
-
f"See pl.LazyFrame.{name} for full documentation."
|
|
1859
|
-
)
|
|
1860
|
-
delegate_method.__name__ = name
|
|
1861
|
-
return delegate_method
|
|
1862
|
-
|
|
1863
|
-
# Add the method to the class
|
|
1864
|
-
setattr(FlowFrame, method_name, make_delegate(method_name))
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
_add_delegated_methods()
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
def sum(expr):
|
|
1871
|
-
"""Sum aggregation function."""
|
|
1872
|
-
if isinstance(expr, str):
|
|
1873
|
-
expr = col(expr)
|
|
1874
|
-
return expr.sum()
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
def mean(expr):
|
|
1878
|
-
"""Mean aggregation function."""
|
|
1879
|
-
if isinstance(expr, str):
|
|
1880
|
-
expr = col(expr)
|
|
1881
|
-
return expr.mean()
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
def min(expr):
|
|
1885
|
-
"""Min aggregation function."""
|
|
1886
|
-
if isinstance(expr, str):
|
|
1887
|
-
expr = col(expr)
|
|
1888
|
-
return expr.min()
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
def max(expr):
|
|
1892
|
-
"""Max aggregation function."""
|
|
1893
|
-
if isinstance(expr, str):
|
|
1894
|
-
expr = col(expr)
|
|
1895
|
-
return expr.max()
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
def count(expr):
|
|
1899
|
-
"""Count aggregation function."""
|
|
1900
|
-
if isinstance(expr, str):
|
|
1901
|
-
expr = col(expr)
|
|
1902
|
-
return expr.count()
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
def read_csv(file_path, *, flow_graph: FlowGraph = None, separator: str = ';',
|
|
1906
|
-
convert_to_absolute_path: bool = True,
|
|
1907
|
-
description: str = None, **options):
|
|
1908
|
-
"""
|
|
1909
|
-
Read a CSV file into a FlowFrame.
|
|
1910
|
-
|
|
1911
|
-
Args:
|
|
1912
|
-
file_path: Path to CSV file
|
|
1913
|
-
flow_graph: if you want to add it to an existing graph
|
|
1914
|
-
separator: Single byte character to use as separator in the file.
|
|
1915
|
-
convert_to_absolute_path: If the path needs to be set to a fixed location
|
|
1916
|
-
description: if you want to add a readable name in the frontend (advised)
|
|
1917
|
-
**options: Options for polars.read_csv
|
|
1918
|
-
|
|
1919
|
-
Returns:
|
|
1920
|
-
A FlowFrame with the CSV data
|
|
1921
|
-
"""
|
|
1922
|
-
# Create new node ID
|
|
1923
|
-
node_id = generate_node_id()
|
|
1924
|
-
if flow_graph is None:
|
|
1925
|
-
flow_graph = create_flow_graph()
|
|
1926
|
-
|
|
1927
|
-
flow_id = flow_graph.flow_id
|
|
1928
|
-
|
|
1929
|
-
has_headers = options.get('has_header', True)
|
|
1930
|
-
encoding = options.get('encoding', 'utf-8')
|
|
1931
|
-
|
|
1932
|
-
if '~' in file_path:
|
|
1933
|
-
file_path = os.path.expanduser(file_path)
|
|
1934
|
-
|
|
1935
|
-
received_table = input_schema.ReceivedTable(
|
|
1936
|
-
file_type='csv',
|
|
1937
|
-
path=file_path,
|
|
1938
|
-
name=Path(file_path).name,
|
|
1939
|
-
delimiter=separator,
|
|
1940
|
-
has_headers=has_headers,
|
|
1941
|
-
encoding=encoding
|
|
1942
|
-
)
|
|
1943
|
-
|
|
1944
|
-
if convert_to_absolute_path:
|
|
1945
|
-
received_table.path = received_table.abs_file_path
|
|
1946
|
-
|
|
1947
|
-
read_node = input_schema.NodeRead(
|
|
1948
|
-
flow_id=flow_id,
|
|
1949
|
-
node_id=node_id,
|
|
1950
|
-
received_file=received_table,
|
|
1951
|
-
pos_x=100,
|
|
1952
|
-
pos_y=100,
|
|
1953
|
-
is_setup=True
|
|
1954
|
-
)
|
|
1955
|
-
|
|
1956
|
-
flow_graph.add_read(read_node)
|
|
1957
|
-
|
|
1958
|
-
return FlowFrame(
|
|
1959
|
-
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
1960
|
-
flow_graph=flow_graph,
|
|
1961
|
-
node_id=node_id
|
|
1962
|
-
)
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
|
|
1966
|
-
convert_to_absolute_path: bool = True, **options) -> FlowFrame:
|
|
1967
|
-
"""
|
|
1968
|
-
Read a Parquet file into a FlowFrame.
|
|
1969
|
-
|
|
1970
|
-
Args:
|
|
1971
|
-
file_path: Path to Parquet file
|
|
1972
|
-
flow_graph: if you want to add it to an existing graph
|
|
1973
|
-
description: if you want to add a readable name in the frontend (advised)
|
|
1974
|
-
convert_to_absolute_path: If the path needs to be set to a fixed location
|
|
1975
|
-
**options: Options for polars.read_parquet
|
|
1976
|
-
|
|
1977
|
-
Returns:
|
|
1978
|
-
A FlowFrame with the Parquet data
|
|
1979
|
-
"""
|
|
1980
|
-
if '~' in file_path:
|
|
1981
|
-
file_path = os.path.expanduser(file_path)
|
|
1982
|
-
node_id = generate_node_id()
|
|
1983
|
-
|
|
1984
|
-
if flow_graph is None:
|
|
1985
|
-
flow_graph = create_flow_graph()
|
|
1986
|
-
|
|
1987
|
-
flow_id = flow_graph.flow_id
|
|
1988
|
-
|
|
1989
|
-
received_table = input_schema.ReceivedTable(
|
|
1990
|
-
file_type='parquet',
|
|
1991
|
-
path=file_path,
|
|
1992
|
-
name=Path(file_path).name,
|
|
1993
|
-
)
|
|
1994
|
-
if convert_to_absolute_path:
|
|
1995
|
-
received_table.path = received_table.abs_file_path
|
|
1996
|
-
|
|
1997
|
-
read_node = input_schema.NodeRead(
|
|
1998
|
-
flow_id=flow_id,
|
|
1999
|
-
node_id=node_id,
|
|
2000
|
-
received_file=received_table,
|
|
2001
|
-
pos_x=100,
|
|
2002
|
-
pos_y=100,
|
|
2003
|
-
is_setup=True,
|
|
2004
|
-
description=description
|
|
2005
|
-
)
|
|
2006
|
-
|
|
2007
|
-
flow_graph.add_read(read_node)
|
|
2008
|
-
|
|
2009
|
-
return FlowFrame(
|
|
2010
|
-
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
2011
|
-
flow_graph=flow_graph,
|
|
2012
|
-
node_id=node_id
|
|
2013
|
-
)
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
|
|
2017
|
-
"""
|
|
2018
|
-
Create a FlowFrame from a dictionary or list of dictionaries.
|
|
2019
|
-
|
|
2020
|
-
Args:
|
|
2021
|
-
data: Dictionary of lists or list of dictionaries
|
|
2022
|
-
flow_graph: if you want to add it to an existing graph
|
|
2023
|
-
description: if you want to add a readable name in the frontend (advised)
|
|
2024
|
-
Returns:
|
|
2025
|
-
A FlowFrame with the data
|
|
2026
|
-
"""
|
|
2027
|
-
# Create new node ID
|
|
2028
|
-
node_id = generate_node_id()
|
|
2029
|
-
|
|
2030
|
-
if not flow_graph:
|
|
2031
|
-
flow_graph = create_flow_graph()
|
|
2032
|
-
flow_id = flow_graph.flow_id
|
|
2033
|
-
|
|
2034
|
-
input_node = input_schema.NodeManualInput(
|
|
2035
|
-
flow_id=flow_id,
|
|
2036
|
-
node_id=node_id,
|
|
2037
|
-
raw_data=FlowDataEngine(data).to_pylist(),
|
|
2038
|
-
pos_x=100,
|
|
2039
|
-
pos_y=100,
|
|
2040
|
-
is_setup=True,
|
|
2041
|
-
description=description
|
|
2042
|
-
)
|
|
2043
|
-
|
|
2044
|
-
# Add to graph
|
|
2045
|
-
flow_graph.add_manual_input(input_node)
|
|
2046
|
-
|
|
2047
|
-
# Return new frame
|
|
2048
|
-
return FlowFrame(
|
|
2049
|
-
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
2050
|
-
flow_graph=flow_graph,
|
|
2051
|
-
node_id=node_id
|
|
2052
|
-
)
|
|
2056
|
+
def __bool__(self):
|
|
2057
|
+
"""This special method determines how the object behaves in boolean contexts.
|
|
2058
|
+
Returns True if the FlowFrame contains any data, False otherwise."""
|
|
2059
|
+
return bool(self.data)
|
|
2053
2060
|
|
|
2061
|
+
@staticmethod
|
|
2062
|
+
def _comparison_error(operator: str) -> pl.lazyframe.frame.NoReturn:
|
|
2063
|
+
msg = f'"{operator!r}" comparison not supported for LazyFrame objects'
|
|
2064
|
+
raise TypeError(msg)
|
|
2054
2065
|
|
|
2055
|
-
def
|
|
2056
|
-
|
|
2057
|
-
rechunk: bool = False,
|
|
2058
|
-
parallel: bool = True,
|
|
2059
|
-
description: str = None) -> 'FlowFrame':
|
|
2060
|
-
"""
|
|
2061
|
-
Concatenate multiple FlowFrames into one.
|
|
2066
|
+
def __eq__(self, other: object) -> pl.lazyframe.frame.NoReturn:
|
|
2067
|
+
self._comparison_error("==")
|
|
2062
2068
|
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
frames : List[FlowFrame]
|
|
2066
|
-
List of FlowFrames to concatenate
|
|
2067
|
-
how : str, default 'vertical'
|
|
2068
|
-
How to combine the FlowFrames (see concat method documentation)
|
|
2069
|
-
rechunk : bool, default False
|
|
2070
|
-
Whether to ensure contiguous memory in result
|
|
2071
|
-
parallel : bool, default True
|
|
2072
|
-
Whether to use parallel processing for the operation
|
|
2073
|
-
description : str, optional
|
|
2074
|
-
Description of this operation
|
|
2069
|
+
def __ne__(self, other: object) -> pl.lazyframe.frame.NoReturn:
|
|
2070
|
+
self._comparison_error("!=")
|
|
2075
2071
|
|
|
2076
|
-
|
|
2077
|
-
|
|
2078
|
-
FlowFrame
|
|
2079
|
-
A new FlowFrame with the concatenated data
|
|
2080
|
-
"""
|
|
2081
|
-
if not frames:
|
|
2082
|
-
raise ValueError("No frames provided to concat_frames")
|
|
2072
|
+
def __gt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
|
|
2073
|
+
self._comparison_error(">")
|
|
2083
2074
|
|
|
2084
|
-
|
|
2085
|
-
|
|
2075
|
+
def __lt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
|
|
2076
|
+
self._comparison_error("<")
|
|
2086
2077
|
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
remaining_frames = frames[1:]
|
|
2078
|
+
def __ge__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
|
|
2079
|
+
self._comparison_error(">=")
|
|
2090
2080
|
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
description=description)
|
|
2081
|
+
def __le__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
|
|
2082
|
+
self._comparison_error("<=")
|