Flowfile: flowfile-0.3.2-py3-none-any.whl → flowfile-0.3.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +3 -2
- flowfile/web/__init__.py +3 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
- flowfile_core/configs/__init__.py +15 -4
- flowfile_core/configs/settings.py +5 -3
- flowfile_core/configs/utils.py +18 -0
- flowfile_core/flowfile/FlowfileFlow.py +13 -18
- flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
- flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
- flowfile_core/flowfile/flow_node/flow_node.py +2 -1
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
- flowfile_core/flowfile/utils.py +34 -3
- flowfile_core/main.py +2 -3
- flowfile_core/routes/secrets.py +1 -1
- flowfile_core/schemas/input_schema.py +10 -4
- flowfile_core/schemas/transform_schema.py +25 -47
- flowfile_frame/__init__.py +11 -4
- flowfile_frame/adding_expr.py +280 -0
- flowfile_frame/config.py +9 -0
- flowfile_frame/expr.py +301 -83
- flowfile_frame/expr.pyi +2174 -0
- flowfile_frame/expr_name.py +258 -0
- flowfile_frame/flow_frame.py +584 -1002
- flowfile_frame/flow_frame.pyi +368 -0
- flowfile_frame/flow_frame_methods.py +617 -0
- flowfile_frame/group_frame.py +89 -42
- flowfile_frame/join.py +1 -2
- flowfile_frame/lazy.py +704 -0
- flowfile_frame/lazy_methods.py +201 -0
- flowfile_frame/list_name_space.py +324 -0
- flowfile_frame/selectors.py +3 -0
- flowfile_frame/series.py +70 -0
- flowfile_frame/utils.py +80 -4
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
- /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
- /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
import
|
|
1
|
+
import inspect
|
|
2
2
|
import os
|
|
3
|
-
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
|
|
4
|
-
from pathlib import Path
|
|
3
|
+
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin
|
|
5
4
|
|
|
6
|
-
import io
|
|
7
5
|
import re
|
|
6
|
+
|
|
8
7
|
import polars as pl
|
|
9
|
-
from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
|
|
10
|
-
Sequence, CsvEncoding)
|
|
11
8
|
|
|
12
|
-
|
|
9
|
+
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
10
|
+
|
|
11
|
+
from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
|
|
12
|
+
from collections.abc import Iterator
|
|
13
13
|
from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
|
|
14
14
|
from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
|
|
15
15
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
|
|
@@ -19,19 +19,35 @@ from flowfile_core.schemas import input_schema, transform_schema
|
|
|
19
19
|
from flowfile_frame.expr import Expr, Column, lit, col
|
|
20
20
|
from flowfile_frame.selectors import Selector
|
|
21
21
|
from flowfile_frame.group_frame import GroupByFrame
|
|
22
|
-
from flowfile_frame.utils import _parse_inputs_as_iterable, create_flow_graph
|
|
22
|
+
from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
|
|
23
|
+
ensure_inputs_as_iterable)
|
|
23
24
|
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
|
|
25
|
+
from flowfile_frame.utils import _check_if_convertible_to_code
|
|
26
|
+
from flowfile_frame.config import logger
|
|
27
|
+
|
|
24
28
|
|
|
25
29
|
node_id_counter = 0
|
|
26
30
|
|
|
27
31
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
+
def can_be_expr(param: inspect.Parameter) -> bool:
|
|
33
|
+
"""Check if a parameter can be of type pl.Expr"""
|
|
34
|
+
if param.annotation == inspect.Parameter.empty:
|
|
35
|
+
return False
|
|
36
|
+
|
|
37
|
+
# Check direct match or in Union args
|
|
38
|
+
types = get_args(param.annotation) if get_origin(param.annotation) is Union else [param.annotation]
|
|
39
|
+
return any(t in (pl.Expr, pl.expr.expr.Expr) for t in types)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _contains_lambda_pattern(text: str) -> bool:
|
|
43
|
+
return "<lambda> at" in text
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_method_name_from_code(code: str) -> str | None:
|
|
47
|
+
split_code = code.split("input_df.")
|
|
48
|
+
if len(split_code) > 1:
|
|
49
|
+
return split_code[1].split("(")[0]
|
|
32
50
|
|
|
33
|
-
# Create and export the logger
|
|
34
|
-
logger = logging.getLogger('flow_frame')
|
|
35
51
|
|
|
36
52
|
def _to_string_val(v) -> str:
|
|
37
53
|
if isinstance(v, str):
|
|
@@ -40,12 +56,72 @@ def _to_string_val(v) -> str:
|
|
|
40
56
|
return v
|
|
41
57
|
|
|
42
58
|
|
|
59
|
+
def _extract_expr_parts(expr_obj) -> tuple[str, str]:
|
|
60
|
+
"""
|
|
61
|
+
Extract the pure expression string and any raw definitions (including function sources) from an Expr object.
|
|
62
|
+
|
|
63
|
+
Parameters
|
|
64
|
+
----------
|
|
65
|
+
expr_obj : Expr
|
|
66
|
+
The expression object to extract parts from
|
|
67
|
+
|
|
68
|
+
Returns
|
|
69
|
+
-------
|
|
70
|
+
tuple[str, str]
|
|
71
|
+
A tuple of (pure_expr_str, raw_definitions_str)
|
|
72
|
+
"""
|
|
73
|
+
if not isinstance(expr_obj, Expr):
|
|
74
|
+
# If it's not an Expr, just return its string representation
|
|
75
|
+
return str(expr_obj), ""
|
|
76
|
+
|
|
77
|
+
# Get the basic representation
|
|
78
|
+
pure_expr_str = expr_obj._repr_str
|
|
79
|
+
|
|
80
|
+
# Collect all definitions (function sources)
|
|
81
|
+
raw_definitions = []
|
|
82
|
+
|
|
83
|
+
# Add function sources if any
|
|
84
|
+
if hasattr(expr_obj, '_function_sources') and expr_obj._function_sources:
|
|
85
|
+
# Remove duplicates while preserving order
|
|
86
|
+
unique_sources = []
|
|
87
|
+
seen = set()
|
|
88
|
+
for source in expr_obj._function_sources:
|
|
89
|
+
if source not in seen:
|
|
90
|
+
seen.add(source)
|
|
91
|
+
unique_sources.append(source)
|
|
92
|
+
|
|
93
|
+
if unique_sources:
|
|
94
|
+
raw_definitions.extend(unique_sources)
|
|
95
|
+
|
|
96
|
+
# Join all definitions
|
|
97
|
+
raw_defs_str = "\n\n".join(raw_definitions) if raw_definitions else ""
|
|
98
|
+
|
|
99
|
+
return pure_expr_str, raw_defs_str
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
|
|
103
|
+
group_expr: pl.Expr | None = None) -> None:
|
|
104
|
+
if method_name is None:
|
|
105
|
+
raise NotImplemented("Cannot create a polars lambda expression without the method")
|
|
106
|
+
if polars_expr is None:
|
|
107
|
+
raise NotImplemented("Cannot create polars expressions with lambda function")
|
|
108
|
+
method_ref = getattr(pl.LazyFrame, method_name)
|
|
109
|
+
if method_ref is None:
|
|
110
|
+
raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
|
|
111
|
+
if method_name == 'group_by':
|
|
112
|
+
if group_expr is None:
|
|
113
|
+
raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
|
|
114
|
+
if not all(isinstance(ge, pl.Expr) for ge in group_expr):
|
|
115
|
+
raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
|
|
116
|
+
|
|
117
|
+
|
|
43
118
|
def generate_node_id() -> int:
|
|
44
119
|
global node_id_counter
|
|
45
120
|
node_id_counter += 1
|
|
46
121
|
return node_id_counter
|
|
47
122
|
|
|
48
123
|
|
|
124
|
+
@add_lazyframe_methods
|
|
49
125
|
class FlowFrame:
|
|
50
126
|
"""Main class that wraps FlowDataEngine and maintains the ETL graph."""
|
|
51
127
|
flow_graph: FlowGraph
|
|
@@ -100,13 +176,11 @@ class FlowFrame:
|
|
|
100
176
|
# Extract flow-specific parameters
|
|
101
177
|
node_id = node_id or generate_node_id()
|
|
102
178
|
description = "Data imported from Python object"
|
|
103
|
-
|
|
104
179
|
# Create a new flow graph if none is provided
|
|
105
180
|
if flow_graph is None:
|
|
106
181
|
flow_graph = create_flow_graph()
|
|
107
182
|
|
|
108
183
|
flow_id = flow_graph.flow_id
|
|
109
|
-
|
|
110
184
|
# Convert data to a polars DataFrame/LazyFrame
|
|
111
185
|
try:
|
|
112
186
|
# Use polars to convert from various types
|
|
@@ -121,25 +195,23 @@ class FlowFrame:
|
|
|
121
195
|
)
|
|
122
196
|
pl_data = pl_df.lazy()
|
|
123
197
|
except Exception as e:
|
|
124
|
-
raise ValueError(f"Could not
|
|
125
|
-
|
|
198
|
+
raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
|
|
126
199
|
# Create a FlowDataEngine to get data in the right format for manual input
|
|
127
200
|
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
128
|
-
|
|
201
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
202
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
129
203
|
# Create a manual input node
|
|
130
204
|
input_node = input_schema.NodeManualInput(
|
|
131
205
|
flow_id=flow_id,
|
|
132
206
|
node_id=node_id,
|
|
133
|
-
|
|
207
|
+
raw_data_format=raw_data_format,
|
|
134
208
|
pos_x=100,
|
|
135
209
|
pos_y=100,
|
|
136
210
|
is_setup=True,
|
|
137
211
|
description=description,
|
|
138
212
|
)
|
|
139
|
-
|
|
140
213
|
# Add to graph
|
|
141
214
|
flow_graph.add_manual_input(input_node)
|
|
142
|
-
|
|
143
215
|
# Return new frame
|
|
144
216
|
return FlowFrame(
|
|
145
217
|
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
@@ -163,7 +235,6 @@ class FlowFrame:
|
|
|
163
235
|
parent_node_id=None,
|
|
164
236
|
):
|
|
165
237
|
"""Create a new FlowFrame instance."""
|
|
166
|
-
|
|
167
238
|
# If data is not a LazyFrame, use the factory method
|
|
168
239
|
if data is not None and not isinstance(data, pl.LazyFrame):
|
|
169
240
|
return cls.create_from_any_type(
|
|
@@ -179,7 +250,6 @@ class FlowFrame:
|
|
|
179
250
|
parent_node_id=parent_node_id,
|
|
180
251
|
)
|
|
181
252
|
|
|
182
|
-
# Otherwise create the instance normally
|
|
183
253
|
instance = super().__new__(cls)
|
|
184
254
|
return instance
|
|
185
255
|
|
|
@@ -198,7 +268,6 @@ class FlowFrame:
|
|
|
198
268
|
parent_node_id=None,
|
|
199
269
|
):
|
|
200
270
|
"""Initialize the FlowFrame with data and graph references."""
|
|
201
|
-
|
|
202
271
|
if data is None:
|
|
203
272
|
data = pl.LazyFrame()
|
|
204
273
|
if not isinstance(data, pl.LazyFrame):
|
|
@@ -230,205 +299,235 @@ class FlowFrame:
|
|
|
230
299
|
def _create_child_frame(self, new_node_id):
|
|
231
300
|
"""Helper method to create a new FlowFrame that's a child of this one"""
|
|
232
301
|
self._add_connection(self.node_id, new_node_id)
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
302
|
+
try:
|
|
303
|
+
return FlowFrame(
|
|
304
|
+
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
305
|
+
flow_graph=self.flow_graph,
|
|
306
|
+
node_id=new_node_id,
|
|
307
|
+
parent_node_id=self.node_id,
|
|
308
|
+
)
|
|
309
|
+
except AttributeError:
|
|
310
|
+
raise ValueError('Could not execute the function')
|
|
239
311
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
description: str = None,
|
|
249
|
-
):
|
|
312
|
+
@staticmethod
|
|
313
|
+
def _generate_sort_polars_code(
|
|
314
|
+
pure_sort_expr_strs: List[str],
|
|
315
|
+
descending_values: List[bool],
|
|
316
|
+
nulls_last_values: List[bool],
|
|
317
|
+
multithreaded: bool,
|
|
318
|
+
maintain_order: bool,
|
|
319
|
+
) -> str:
|
|
250
320
|
"""
|
|
251
|
-
|
|
321
|
+
Generates the `input_df.sort(...)` Polars code string using pure expression strings.
|
|
322
|
+
"""
|
|
323
|
+
kwargs_for_code: Dict[str, Any] = {}
|
|
324
|
+
if any(descending_values):
|
|
325
|
+
kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
|
|
326
|
+
if any(nulls_last_values):
|
|
327
|
+
kwargs_for_code["nulls_last"] = nulls_last_values[0] if len(nulls_last_values) == 1 else nulls_last_values
|
|
328
|
+
if not multithreaded:
|
|
329
|
+
kwargs_for_code["multithreaded"] = multithreaded
|
|
330
|
+
if maintain_order:
|
|
331
|
+
kwargs_for_code["maintain_order"] = maintain_order
|
|
252
332
|
|
|
253
|
-
|
|
254
|
-
-----------
|
|
255
|
-
by : Expr, str, or list of Expr/str
|
|
256
|
-
Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
|
|
257
|
-
*more_by : Expr or str
|
|
258
|
-
Additional columns to sort by, specified as positional arguments.
|
|
259
|
-
descending : bool or list of bool, default False
|
|
260
|
-
Sort in descending order. When sorting by multiple columns, can be specified per column.
|
|
261
|
-
nulls_last : bool or list of bool, default False
|
|
262
|
-
Place null values last; can specify a single boolean or a sequence for per-column control.
|
|
263
|
-
multithreaded : bool, default True
|
|
264
|
-
Sort using multiple threads.
|
|
265
|
-
maintain_order : bool, default False
|
|
266
|
-
Whether the order should be maintained if elements are equal.
|
|
267
|
-
description : str, optional
|
|
268
|
-
Description of this operation for the ETL graph.
|
|
333
|
+
kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())
|
|
269
334
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
335
|
+
by_arg_for_code = pure_sort_expr_strs[0] if len(
|
|
336
|
+
pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
|
|
337
|
+
return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
|
|
338
|
+
|
|
339
|
+
def sort(
|
|
340
|
+
self,
|
|
341
|
+
by: Union[List[Union[Expr, str]], Expr, str],
|
|
342
|
+
*more_by: Union[Expr, str],
|
|
343
|
+
descending: Union[bool, List[bool]] = False,
|
|
344
|
+
nulls_last: Union[bool, List[bool]] = False,
|
|
345
|
+
multithreaded: bool = True,
|
|
346
|
+
maintain_order: bool = False,
|
|
347
|
+
description: Optional[str] = None,
|
|
348
|
+
) -> "FlowFrame":
|
|
349
|
+
"""
|
|
350
|
+
Sort the dataframe by the given columns.
|
|
274
351
|
"""
|
|
275
|
-
|
|
352
|
+
initial_by_args = list(_parse_inputs_as_iterable((by,)))
|
|
276
353
|
new_node_id = generate_node_id()
|
|
277
|
-
|
|
354
|
+
|
|
355
|
+
sort_expressions_input: list = initial_by_args
|
|
278
356
|
if more_by:
|
|
279
|
-
|
|
357
|
+
sort_expressions_input.extend(list(_parse_inputs_as_iterable(more_by)))
|
|
280
358
|
|
|
281
|
-
|
|
282
|
-
|
|
359
|
+
all_processed_expr_objects: List[Expr] = []
|
|
360
|
+
pure_polars_expr_strings_for_sort: List[str] = []
|
|
361
|
+
collected_raw_definitions: List[str] = []
|
|
362
|
+
column_names_for_native_node: List[str] = []
|
|
283
363
|
|
|
284
|
-
|
|
285
|
-
for expr in sort_expressions:
|
|
286
|
-
if not isinstance(expr, (str, Column)) or (
|
|
287
|
-
isinstance(expr, Column) and expr._select_input.is_altered
|
|
288
|
-
):
|
|
289
|
-
needs_polars_code = True
|
|
290
|
-
break
|
|
364
|
+
use_polars_code_path = False
|
|
291
365
|
|
|
292
|
-
# Also need polars code if we're using maintain_order or multithreaded params
|
|
293
366
|
if maintain_order or not multithreaded:
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
if
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
needs_polars_code = True
|
|
317
|
-
else:
|
|
318
|
-
nulls_last_values = [nulls_last] * len(sort_expressions)
|
|
319
|
-
# Non-default nulls_last needs polars code
|
|
320
|
-
if nulls_last:
|
|
321
|
-
needs_polars_code = True
|
|
322
|
-
|
|
323
|
-
if needs_polars_code:
|
|
324
|
-
# Generate polars code for complex cases
|
|
325
|
-
code = self._generate_sort_polars_code(
|
|
326
|
-
sort_expressions,
|
|
327
|
-
descending_values,
|
|
328
|
-
nulls_last_values,
|
|
329
|
-
multithreaded,
|
|
330
|
-
maintain_order,
|
|
331
|
-
)
|
|
332
|
-
self._add_polars_code(new_node_id, code, description)
|
|
333
|
-
else:
|
|
334
|
-
# Use native implementation for simple cases
|
|
335
|
-
sort_inputs = []
|
|
336
|
-
for i, expr in enumerate(sort_expressions):
|
|
337
|
-
# Convert expr to column name
|
|
338
|
-
if isinstance(expr, Column):
|
|
339
|
-
column_name = expr.name
|
|
340
|
-
elif isinstance(expr, str):
|
|
341
|
-
column_name = expr
|
|
367
|
+
use_polars_code_path = True
|
|
368
|
+
|
|
369
|
+
is_nulls_last_list = isinstance(nulls_last, (list, tuple))
|
|
370
|
+
if is_nulls_last_list and any(val for val in nulls_last if val is not False):
|
|
371
|
+
use_polars_code_path = True
|
|
372
|
+
elif not is_nulls_last_list and nulls_last is not False:
|
|
373
|
+
use_polars_code_path = True
|
|
374
|
+
|
|
375
|
+
for expr_input in sort_expressions_input:
|
|
376
|
+
current_expr_obj: Expr
|
|
377
|
+
is_simple_col_for_native = False
|
|
378
|
+
|
|
379
|
+
if isinstance(expr_input, str):
|
|
380
|
+
current_expr_obj = col(expr_input)
|
|
381
|
+
column_names_for_native_node.append(expr_input)
|
|
382
|
+
is_simple_col_for_native = True
|
|
383
|
+
elif isinstance(expr_input, Column):
|
|
384
|
+
current_expr_obj = expr_input
|
|
385
|
+
# Type ignore below due to simplified Column stub
|
|
386
|
+
if not expr_input._select_input.is_altered: # type: ignore
|
|
387
|
+
column_names_for_native_node.append(expr_input.column_name) # type: ignore
|
|
388
|
+
is_simple_col_for_native = True
|
|
342
389
|
else:
|
|
343
|
-
|
|
390
|
+
use_polars_code_path = True # Altered Column implies complex expression
|
|
391
|
+
elif isinstance(expr_input, Expr):
|
|
392
|
+
current_expr_obj = expr_input
|
|
393
|
+
use_polars_code_path = True # General Expr implies complex expression
|
|
394
|
+
else: # Convert other types to lit
|
|
395
|
+
current_expr_obj = lit(expr_input)
|
|
396
|
+
use_polars_code_path = True # Literal might be part of a complex sort for Polars code
|
|
397
|
+
|
|
398
|
+
all_processed_expr_objects.append(current_expr_obj)
|
|
399
|
+
|
|
400
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
401
|
+
pure_polars_expr_strings_for_sort.append(pure_expr_str)
|
|
402
|
+
|
|
403
|
+
if raw_defs_str:
|
|
404
|
+
if raw_defs_str not in collected_raw_definitions:
|
|
405
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
406
|
+
use_polars_code_path = True
|
|
407
|
+
|
|
408
|
+
if not is_simple_col_for_native: # If it wasn't a simple string or unaltered Column
|
|
409
|
+
use_polars_code_path = True
|
|
410
|
+
|
|
411
|
+
desc_values = list(descending) if isinstance(descending, list) else [descending] * len(
|
|
412
|
+
all_processed_expr_objects)
|
|
413
|
+
null_last_values = list(nulls_last) if isinstance(nulls_last, list) else [nulls_last] * len(
|
|
414
|
+
all_processed_expr_objects)
|
|
415
|
+
|
|
416
|
+
if len(desc_values) != len(all_processed_expr_objects):
|
|
417
|
+
raise ValueError("Length of 'descending' does not match the number of sort expressions.")
|
|
418
|
+
if len(null_last_values) != len(all_processed_expr_objects):
|
|
419
|
+
raise ValueError("Length of 'nulls_last' does not match the number of sort expressions.")
|
|
420
|
+
|
|
421
|
+
if use_polars_code_path:
|
|
422
|
+
polars_operation_code = self._generate_sort_polars_code(
|
|
423
|
+
pure_polars_expr_strings_for_sort, desc_values, null_last_values, multithreaded, maintain_order
|
|
424
|
+
)
|
|
344
425
|
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
426
|
+
final_code_for_node: str
|
|
427
|
+
if collected_raw_definitions:
|
|
428
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions)) # Order-preserving unique
|
|
429
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
430
|
+
final_code_for_node = definitions_section + \
|
|
431
|
+
"\#─────SPLIT─────\n\n" + \
|
|
432
|
+
f"output_df = {polars_operation_code}"
|
|
433
|
+
else:
|
|
434
|
+
final_code_for_node = polars_operation_code
|
|
435
|
+
|
|
436
|
+
pl_expressions_for_fallback = [e.expr for e in all_processed_expr_objects if
|
|
437
|
+
hasattr(e, 'expr') and e.expr is not None]
|
|
438
|
+
kwargs_for_fallback = {
|
|
439
|
+
"descending": desc_values[0] if len(desc_values) == 1 else desc_values,
|
|
440
|
+
"nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
|
|
441
|
+
"multithreaded": multithreaded, "maintain_order": maintain_order}
|
|
442
|
+
|
|
443
|
+
self._add_polars_code(new_node_id, final_code_for_node, description, method_name="sort",
|
|
444
|
+
convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
|
|
445
|
+
polars_expr=pl_expressions_for_fallback,
|
|
446
|
+
kwargs_expr=kwargs_for_fallback)
|
|
447
|
+
else:
|
|
448
|
+
sort_inputs_for_node = []
|
|
449
|
+
for i, col_name_for_native in enumerate(column_names_for_native_node):
|
|
450
|
+
sort_inputs_for_node.append(
|
|
451
|
+
transform_schema.SortByInput(column=col_name_for_native, how="desc" if desc_values[i] else "asc")
|
|
452
|
+
# type: ignore
|
|
351
453
|
)
|
|
352
|
-
|
|
353
454
|
sort_settings = input_schema.NodeSort(
|
|
354
|
-
flow_id=self.flow_graph.flow_id,
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
pos_x=200,
|
|
358
|
-
pos_y=150,
|
|
359
|
-
is_setup=True,
|
|
360
|
-
depending_on_id=self.node_id,
|
|
361
|
-
description=description
|
|
362
|
-
or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
|
|
363
|
-
)
|
|
455
|
+
flow_id=self.flow_graph.flow_id, node_id=new_node_id, sort_input=sort_inputs_for_node, # type: ignore
|
|
456
|
+
pos_x=200, pos_y=150, is_setup=True, depending_on_id=self.node_id,
|
|
457
|
+
description=description or f"Sort by {', '.join(column_names_for_native_node)}")
|
|
364
458
|
self.flow_graph.add_sort(sort_settings)
|
|
365
459
|
|
|
366
460
|
return self._create_child_frame(new_node_id)
|
|
367
461
|
|
|
368
|
-
def
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
kwargs["descending"] = descending_values
|
|
402
|
-
|
|
403
|
-
# Only add nulls_last if it's non-default
|
|
404
|
-
if any(nl for nl in nulls_last_values):
|
|
405
|
-
if len(nulls_last_values) == 1:
|
|
406
|
-
kwargs["nulls_last"] = nulls_last_values[0]
|
|
462
|
+
def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
|
|
463
|
+
depending_on_ids: List[str] | None = None, convertable_to_code: bool = True,
|
|
464
|
+
method_name: str = None, polars_expr: Expr | List[Expr] | None = None,
|
|
465
|
+
group_expr: Expr | List[Expr] | None = None,
|
|
466
|
+
kwargs_expr: Dict | None = None,
|
|
467
|
+
group_kwargs: Dict | None = None, ):
|
|
468
|
+
polars_code_for_node: str
|
|
469
|
+
if not convertable_to_code or _contains_lambda_pattern(code):
|
|
470
|
+
|
|
471
|
+
effective_method_name = get_method_name_from_code(
|
|
472
|
+
code) if method_name is None and "input_df." in code else method_name
|
|
473
|
+
|
|
474
|
+
pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
|
|
475
|
+
group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
|
|
476
|
+
|
|
477
|
+
_check_ok_for_serialization(polars_expr=pl_expr_list, method_name=effective_method_name,
|
|
478
|
+
group_expr=group_expr_list)
|
|
479
|
+
|
|
480
|
+
current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
|
|
481
|
+
result_lazyframe_or_expr: Any
|
|
482
|
+
|
|
483
|
+
if effective_method_name == "group_by":
|
|
484
|
+
group_kwargs = {} if group_kwargs is None else group_kwargs
|
|
485
|
+
if not group_expr_list:
|
|
486
|
+
raise ValueError("group_expr is required for group_by method in serialization fallback.")
|
|
487
|
+
target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
|
|
488
|
+
if not pl_expr_list:
|
|
489
|
+
raise ValueError(
|
|
490
|
+
"Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback.")
|
|
491
|
+
result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
|
|
492
|
+
elif effective_method_name:
|
|
493
|
+
result_lazyframe_or_expr = getattr(self.data, effective_method_name)(*pl_expr_list,
|
|
494
|
+
**current_kwargs_expr)
|
|
407
495
|
else:
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
496
|
+
raise ValueError(
|
|
497
|
+
"Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback.")
|
|
498
|
+
try:
|
|
499
|
+
if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
|
|
500
|
+
serialized_value_for_code = result_lazyframe_or_expr.serialize(format='json')
|
|
501
|
+
polars_code_for_node = "\n".join([
|
|
502
|
+
f"serialized_value = r'''{serialized_value_for_code}'''",
|
|
503
|
+
"buffer = BytesIO(serialized_value.encode('utf-8'))",
|
|
504
|
+
"output_df = pl.LazyFrame.deserialize(buffer, format='json')",
|
|
505
|
+
])
|
|
506
|
+
logger.warning(
|
|
507
|
+
f"Transformation '{effective_method_name}' uses non-serializable elements. "
|
|
508
|
+
"Falling back to serializing the resulting Polars LazyFrame object."
|
|
509
|
+
"This will result in a breaking graph when using the the ui."
|
|
510
|
+
)
|
|
511
|
+
else:
|
|
512
|
+
logger.error(
|
|
513
|
+
f"Fallback for non-convertible code for method '{effective_method_name}' "
|
|
514
|
+
f"resulted in a '{type(result_lazyframe_or_expr).__name__}' instead of a Polars LazyFrame. "
|
|
515
|
+
"This type cannot be persisted as a LazyFrame node via this fallback."
|
|
516
|
+
)
|
|
517
|
+
return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
|
|
518
|
+
except Exception as e:
|
|
519
|
+
logger.warning(
|
|
520
|
+
f"Critical error: Could not serialize the result of operation '{effective_method_name}' "
|
|
521
|
+
f"during fallback for non-convertible code. Error: {e}."
|
|
522
|
+
"When using a lambda function, consider defining the function first"
|
|
523
|
+
)
|
|
524
|
+
return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
|
|
423
525
|
else:
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
|
|
427
|
-
depending_on_ids: List[str] | None = None):
|
|
526
|
+
polars_code_for_node = code
|
|
428
527
|
polars_code_settings = input_schema.NodePolarsCode(
|
|
429
528
|
flow_id=self.flow_graph.flow_id,
|
|
430
529
|
node_id=new_node_id,
|
|
431
|
-
polars_code_input=transform_schema.PolarsCodeInput(polars_code=
|
|
530
|
+
polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code_for_node),
|
|
432
531
|
is_setup=True,
|
|
433
532
|
depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
|
|
434
533
|
description=description,
|
|
@@ -469,14 +568,17 @@ class FlowFrame:
|
|
|
469
568
|
validate : {"1:1", "1:m", "m:1", "m:m"}, optional
|
|
470
569
|
Validate join relationship.
|
|
471
570
|
nulls_equal:
|
|
472
|
-
Join on null values. By default null values will never produce matches.
|
|
571
|
+
Join on null values. By default, null values will never produce matches.
|
|
473
572
|
coalesce:
|
|
474
573
|
None: -> join specific.
|
|
475
574
|
True: -> Always coalesce join columns.
|
|
476
575
|
False: -> Never coalesce join columns.
|
|
477
576
|
maintain_order:
|
|
478
|
-
Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
|
|
479
|
-
|
|
577
|
+
Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
|
|
578
|
+
setting this parameter, as your code may break in a future release.
|
|
579
|
+
Not specifying any ordering can improve performance Supported for inner, left, right and full joins
|
|
580
|
+
None: No specific ordering is desired. The ordering might differ across Polars versions or even between
|
|
581
|
+
different runs.
|
|
480
582
|
left: Preserves the order of the left DataFrame.
|
|
481
583
|
right: Preserves the order of the right DataFrame.
|
|
482
584
|
left_right: First preserves the order of the left DataFrame, then the right.
|
|
@@ -494,6 +596,7 @@ class FlowFrame:
|
|
|
494
596
|
nulls_equal is False and
|
|
495
597
|
validate is None and
|
|
496
598
|
suffix == '_right')
|
|
599
|
+
|
|
497
600
|
join_mappings = None
|
|
498
601
|
if self.flow_graph.flow_id != other.flow_graph.flow_id:
|
|
499
602
|
combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
|
|
@@ -508,6 +611,7 @@ class FlowFrame:
|
|
|
508
611
|
global node_id_counter
|
|
509
612
|
node_id_counter += len(combined_graph.nodes)
|
|
510
613
|
new_node_id = generate_node_id()
|
|
614
|
+
|
|
511
615
|
if on is not None:
|
|
512
616
|
left_columns = right_columns = _normalize_columns_to_list(on)
|
|
513
617
|
elif left_on is not None and right_on is not None:
|
|
@@ -526,10 +630,11 @@ class FlowFrame:
|
|
|
526
630
|
)
|
|
527
631
|
if not use_polars_code:
|
|
528
632
|
join_mappings, use_polars_code = _create_join_mappings(
|
|
529
|
-
left_columns, right_columns
|
|
633
|
+
left_columns or [], right_columns or []
|
|
530
634
|
)
|
|
531
635
|
|
|
532
636
|
if use_polars_code or suffix != '_right':
|
|
637
|
+
|
|
533
638
|
_on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
|
|
534
639
|
_left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
|
|
535
640
|
_right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
|
|
@@ -549,31 +654,50 @@ class FlowFrame:
|
|
|
549
654
|
parent_node_id=self.node_id,
|
|
550
655
|
)
|
|
551
656
|
|
|
552
|
-
elif join_mappings:
|
|
657
|
+
elif join_mappings or how == 'cross':
|
|
658
|
+
|
|
553
659
|
left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
|
|
554
660
|
right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
|
|
555
661
|
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
662
|
+
if how == 'cross':
|
|
663
|
+
join_input = transform_schema.CrossJoinInput(left_select=left_select.renames,
|
|
664
|
+
right_select=right_select.renames,)
|
|
665
|
+
else:
|
|
666
|
+
join_input = transform_schema.JoinInput(
|
|
667
|
+
join_mapping=join_mappings,
|
|
668
|
+
left_select=left_select.renames,
|
|
669
|
+
right_select=right_select.renames,
|
|
670
|
+
how=how,
|
|
671
|
+
)
|
|
672
|
+
|
|
562
673
|
join_input.auto_rename()
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
674
|
+
if how == 'cross':
|
|
675
|
+
cross_join_settings = input_schema.NodeCrossJoin(
|
|
676
|
+
flow_id=self.flow_graph.flow_id,
|
|
677
|
+
node_id=new_node_id,
|
|
678
|
+
cross_join_input=join_input,
|
|
679
|
+
is_setup=True,
|
|
680
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
681
|
+
description=description or f"Join with {how} strategy",
|
|
682
|
+
auto_generate_selection=True,
|
|
683
|
+
verify_integrity=True,
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
self.flow_graph.add_cross_join(cross_join_settings)
|
|
687
|
+
else:
|
|
688
|
+
join_settings = input_schema.NodeJoin(
|
|
689
|
+
flow_id=self.flow_graph.flow_id,
|
|
690
|
+
node_id=new_node_id,
|
|
691
|
+
join_input=join_input,
|
|
692
|
+
auto_generate_selection=True,
|
|
693
|
+
verify_integrity=True,
|
|
694
|
+
pos_x=200,
|
|
695
|
+
pos_y=150,
|
|
696
|
+
is_setup=True,
|
|
697
|
+
depending_on_ids=[self.node_id, other.node_id],
|
|
698
|
+
description=description or f"Join with {how} strategy",
|
|
699
|
+
)
|
|
700
|
+
self.flow_graph.add_join(join_settings)
|
|
577
701
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
578
702
|
other._add_connection(other.node_id, new_node_id, "right")
|
|
579
703
|
result_frame = FlowFrame(
|
|
@@ -600,38 +724,65 @@ class FlowFrame:
|
|
|
600
724
|
self.flow_graph.add_record_count(node_number_of_records)
|
|
601
725
|
return self._create_child_frame(new_node_id)
|
|
602
726
|
|
|
603
|
-
def select(self, *columns, description: str = None):
|
|
727
|
+
def select(self, *columns: Union[str, Expr, Selector], description: Optional[str] = None) -> "FlowFrame":
|
|
604
728
|
"""
|
|
605
729
|
Select columns from the frame.
|
|
606
|
-
|
|
607
|
-
Args:
|
|
608
|
-
*columns: Column names or expressions
|
|
609
|
-
description: Description of the step, this will be shown in the flowfile file
|
|
610
|
-
|
|
611
|
-
Returns:
|
|
612
|
-
A new FlowFrame with selected columns
|
|
613
730
|
"""
|
|
614
|
-
|
|
615
|
-
columns = _parse_inputs_as_iterable(columns)
|
|
731
|
+
columns_iterable = list(_parse_inputs_as_iterable(columns))
|
|
616
732
|
new_node_id = generate_node_id()
|
|
617
|
-
existing_columns = self.columns
|
|
618
733
|
|
|
619
|
-
if (len(
|
|
620
|
-
and str(
|
|
734
|
+
if (len(columns_iterable) == 1 and isinstance(columns_iterable[0], Expr)
|
|
735
|
+
and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"):
|
|
621
736
|
return self._add_number_of_records(new_node_id, description)
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
737
|
+
|
|
738
|
+
all_input_expr_objects: List[Expr] = []
|
|
739
|
+
pure_polars_expr_strings_for_select: List[str] = []
|
|
740
|
+
collected_raw_definitions: List[str] = []
|
|
741
|
+
selected_col_names_for_native: List[transform_schema.SelectInput] = [] # For native node
|
|
742
|
+
|
|
743
|
+
can_use_native_node = True
|
|
744
|
+
if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == '*':
|
|
745
|
+
effective_columns_iterable = [col(c_name) for c_name in self.columns]
|
|
746
|
+
else:
|
|
747
|
+
effective_columns_iterable = columns_iterable
|
|
748
|
+
for expr_input in effective_columns_iterable:
|
|
749
|
+
current_expr_obj = expr_input
|
|
750
|
+
is_simple_col_for_native = False
|
|
751
|
+
|
|
752
|
+
if isinstance(expr_input, str):
|
|
753
|
+
current_expr_obj = col(expr_input)
|
|
754
|
+
selected_col_names_for_native.append(transform_schema.SelectInput(old_name=expr_input))
|
|
755
|
+
is_simple_col_for_native = True
|
|
756
|
+
elif isinstance(expr_input, Column):
|
|
757
|
+
selected_col_names_for_native.append(expr_input.to_select_input())
|
|
758
|
+
is_simple_col_for_native = True
|
|
759
|
+
elif isinstance(expr_input, Selector):
|
|
760
|
+
can_use_native_node = False
|
|
761
|
+
elif not isinstance(expr_input, Expr):
|
|
762
|
+
current_expr_obj = lit(expr_input)
|
|
763
|
+
|
|
764
|
+
all_input_expr_objects.append(current_expr_obj) # type: ignore
|
|
765
|
+
|
|
766
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
767
|
+
|
|
768
|
+
pure_polars_expr_strings_for_select.append(pure_expr_str)
|
|
769
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
770
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
771
|
+
|
|
772
|
+
if not is_simple_col_for_native and not isinstance(expr_input, Selector):
|
|
773
|
+
can_use_native_node = False
|
|
774
|
+
if collected_raw_definitions: # Has to use Polars code if there are definitions
|
|
775
|
+
can_use_native_node = False
|
|
776
|
+
if can_use_native_node:
|
|
777
|
+
existing_cols = self.columns
|
|
778
|
+
selected_col_names = {select_col.old_name for select_col in selected_col_names_for_native}
|
|
779
|
+
dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_cols if
|
|
780
|
+
c not in selected_col_names]
|
|
781
|
+
selected_col_names_for_native.extend(dropped_columns)
|
|
631
782
|
select_settings = input_schema.NodeSelect(
|
|
632
783
|
flow_id=self.flow_graph.flow_id,
|
|
633
784
|
node_id=new_node_id,
|
|
634
|
-
select_input=
|
|
785
|
+
select_input=selected_col_names_for_native,
|
|
635
786
|
keep_missing=False,
|
|
636
787
|
pos_x=200,
|
|
637
788
|
pos_y=100,
|
|
@@ -639,60 +790,97 @@ class FlowFrame:
|
|
|
639
790
|
depending_on_id=self.node_id,
|
|
640
791
|
description=description
|
|
641
792
|
)
|
|
642
|
-
|
|
643
|
-
# Add to graph
|
|
644
793
|
self.flow_graph.add_select(select_settings)
|
|
645
|
-
return self._create_child_frame(new_node_id)
|
|
646
|
-
|
|
647
794
|
else:
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
print('warning this cannot be converted to flowfile frontend. Make sure you use the flowfile expr')
|
|
657
|
-
is_readable = False
|
|
658
|
-
elif isinstance(col_, str) and col_ in self.columns:
|
|
659
|
-
col_expr = Column(col_)
|
|
660
|
-
readable_exprs.append(col_expr)
|
|
661
|
-
else:
|
|
662
|
-
lit_expr = lit(col_)
|
|
663
|
-
readable_exprs.append(lit_expr)
|
|
664
|
-
if is_readable:
|
|
665
|
-
code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
|
|
795
|
+
polars_operation_code = f"input_df.select([{', '.join(pure_polars_expr_strings_for_select)}])"
|
|
796
|
+
final_code_for_node: str
|
|
797
|
+
if collected_raw_definitions:
|
|
798
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
799
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
800
|
+
final_code_for_node = definitions_section + \
|
|
801
|
+
"\#─────SPLIT─────\n\n" + \
|
|
802
|
+
f"output_df = {polars_operation_code}"
|
|
666
803
|
else:
|
|
667
|
-
|
|
804
|
+
final_code_for_node = polars_operation_code
|
|
668
805
|
|
|
669
|
-
|
|
670
|
-
|
|
806
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
807
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
808
|
+
self._add_polars_code(new_node_id, final_code_for_node, description,
|
|
809
|
+
method_name="select",
|
|
810
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
811
|
+
polars_expr=pl_expressions_for_fallback)
|
|
671
812
|
|
|
672
|
-
|
|
813
|
+
return self._create_child_frame(new_node_id)
|
|
814
|
+
|
|
815
|
+
def filter(self, *predicates: Union[Expr, Any], flowfile_formula: Optional[str] = None,
|
|
816
|
+
description: Optional[str] = None, **constraints: Any) -> "FlowFrame":
|
|
673
817
|
"""
|
|
674
818
|
Filter rows based on a predicate.
|
|
675
|
-
|
|
676
|
-
Args:
|
|
677
|
-
predicate: Filter condition
|
|
678
|
-
flowfile_formula: Native support in frontend
|
|
679
|
-
description: Description of the step that is performed
|
|
680
|
-
Returns:
|
|
681
|
-
A new FlowFrame with filtered rows
|
|
682
819
|
"""
|
|
820
|
+
if (len(predicates) > 0 or len(constraints) > 0) and flowfile_formula:
|
|
821
|
+
raise ValueError("You can only use one of the following: predicates, constraints or flowfile_formula")
|
|
822
|
+
available_columns = self.columns
|
|
683
823
|
new_node_id = generate_node_id()
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
824
|
+
if len(predicates) > 0 or len(constraints) > 0:
|
|
825
|
+
all_input_expr_objects: List[Expr] = []
|
|
826
|
+
pure_polars_expr_strings: List[str] = []
|
|
827
|
+
collected_raw_definitions: List[str] = []
|
|
828
|
+
|
|
829
|
+
processed_predicates = []
|
|
830
|
+
for pred_item in predicates:
|
|
831
|
+
if isinstance(pred_item, (tuple, list, Iterator)):
|
|
832
|
+
# If it's a sequence, extend the processed_predicates with its elements
|
|
833
|
+
processed_predicates.extend(list(pred_item))
|
|
834
|
+
else:
|
|
835
|
+
# Otherwise, just add the item
|
|
836
|
+
processed_predicates.append(pred_item)
|
|
837
|
+
|
|
838
|
+
for pred_input in processed_predicates: # Loop over the processed_predicates
|
|
839
|
+
# End of the new/modified section
|
|
840
|
+
current_expr_obj = None # Initialize current_expr_obj
|
|
841
|
+
if isinstance(pred_input, Expr):
|
|
842
|
+
current_expr_obj = pred_input
|
|
843
|
+
elif isinstance(pred_input, str) and pred_input in available_columns:
|
|
844
|
+
current_expr_obj = col(pred_input)
|
|
845
|
+
else:
|
|
846
|
+
current_expr_obj = lit(pred_input)
|
|
847
|
+
|
|
848
|
+
all_input_expr_objects.append(current_expr_obj)
|
|
849
|
+
|
|
850
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
851
|
+
pure_polars_expr_strings.append(f"({pure_expr_str})")
|
|
852
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
853
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
854
|
+
|
|
855
|
+
for k, v_val in constraints.items():
|
|
856
|
+
constraint_expr_obj = (col(k) == lit(v_val))
|
|
857
|
+
all_input_expr_objects.append(constraint_expr_obj)
|
|
858
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(
|
|
859
|
+
constraint_expr_obj) # Constraint exprs are unlikely to have defs
|
|
860
|
+
pure_polars_expr_strings.append(f"({pure_expr_str})")
|
|
861
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions: # Should be rare here
|
|
862
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
863
|
+
|
|
864
|
+
filter_conditions_str = " & ".join(pure_polars_expr_strings) if pure_polars_expr_strings else "pl.lit(True)"
|
|
865
|
+
polars_operation_code = f"input_df.filter({filter_conditions_str})"
|
|
866
|
+
|
|
867
|
+
final_code_for_node: str
|
|
868
|
+
if collected_raw_definitions:
|
|
869
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions)) # Order-preserving unique
|
|
870
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
871
|
+
final_code_for_node = definitions_section + \
|
|
872
|
+
"\#─────SPLIT─────\n\n" + \
|
|
873
|
+
f"output_df = {polars_operation_code}"
|
|
689
874
|
else:
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
875
|
+
final_code_for_node = polars_operation_code
|
|
876
|
+
|
|
877
|
+
convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
|
|
878
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
879
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
880
|
+
self._add_polars_code(new_node_id, final_code_for_node, description, method_name="filter",
|
|
881
|
+
convertable_to_code=convertable_to_code,
|
|
882
|
+
polars_expr=pl_expressions_for_fallback)
|
|
694
883
|
elif flowfile_formula:
|
|
695
|
-
# Create node settings
|
|
696
884
|
filter_settings = input_schema.NodeFilter(
|
|
697
885
|
flow_id=self.flow_graph.flow_id,
|
|
698
886
|
node_id=new_node_id,
|
|
@@ -706,8 +894,10 @@ class FlowFrame:
|
|
|
706
894
|
depending_on_id=self.node_id,
|
|
707
895
|
description=description
|
|
708
896
|
)
|
|
709
|
-
|
|
710
897
|
self.flow_graph.add_filter(filter_settings)
|
|
898
|
+
else:
|
|
899
|
+
logger.info("Filter called with no arguments; creating a pass-through Polars code node.")
|
|
900
|
+
self._add_polars_code(new_node_id, "output_df = input_df", description or "No-op filter", method_name=None)
|
|
711
901
|
|
|
712
902
|
return self._create_child_frame(new_node_id)
|
|
713
903
|
|
|
@@ -792,7 +982,7 @@ class FlowFrame:
|
|
|
792
982
|
if convert_to_absolute_path:
|
|
793
983
|
output_settings.directory = output_settings.abs_file_path
|
|
794
984
|
except Exception as e:
|
|
795
|
-
|
|
985
|
+
logger.warning(f"Could not determine absolute path for {file_str}: {e}")
|
|
796
986
|
|
|
797
987
|
if not use_polars_code:
|
|
798
988
|
node_output = input_schema.NodeOutput(
|
|
@@ -820,7 +1010,7 @@ class FlowFrame:
|
|
|
820
1010
|
|
|
821
1011
|
# Use sink_parquet for LazyFrames
|
|
822
1012
|
code = f"input_df.sink_parquet({args_str})"
|
|
823
|
-
|
|
1013
|
+
logger.debug(f"Generated Polars Code: {code}")
|
|
824
1014
|
self._add_polars_code(new_node_id, code, description)
|
|
825
1015
|
|
|
826
1016
|
return self._create_child_frame(new_node_id)
|
|
@@ -868,7 +1058,7 @@ class FlowFrame:
|
|
|
868
1058
|
if convert_to_absolute_path:
|
|
869
1059
|
output_settings.directory = output_settings.abs_file_path
|
|
870
1060
|
except Exception as e:
|
|
871
|
-
|
|
1061
|
+
logger.warning(f"Could not determine absolute path for {file_str}: {e}")
|
|
872
1062
|
|
|
873
1063
|
if not use_polars_code:
|
|
874
1064
|
node_output = input_schema.NodeOutput(
|
|
@@ -901,7 +1091,7 @@ class FlowFrame:
|
|
|
901
1091
|
args_str += f", {kwargs_repr}"
|
|
902
1092
|
|
|
903
1093
|
code = f"input_df.collect().write_csv({args_str})"
|
|
904
|
-
|
|
1094
|
+
logger.debug(f"Generated Polars Code: {code}")
|
|
905
1095
|
self._add_polars_code(new_node_id, code, description)
|
|
906
1096
|
|
|
907
1097
|
return self._create_child_frame(new_node_id)
|
|
@@ -954,10 +1144,10 @@ class FlowFrame:
|
|
|
954
1144
|
self.flow_graph.apply_layout()
|
|
955
1145
|
self.flow_graph.save_flow(file_path)
|
|
956
1146
|
|
|
957
|
-
def collect(self):
|
|
1147
|
+
def collect(self, *args, **kwargs):
|
|
958
1148
|
"""Collect lazy data into memory."""
|
|
959
1149
|
if hasattr(self.data, "collect"):
|
|
960
|
-
return self.data.collect()
|
|
1150
|
+
return self.data.collect(*args, **kwargs)
|
|
961
1151
|
return self.data
|
|
962
1152
|
|
|
963
1153
|
def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
|
|
@@ -1278,9 +1468,10 @@ class FlowFrame:
|
|
|
1278
1468
|
f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
|
|
1279
1469
|
global node_id_counter
|
|
1280
1470
|
node_id_counter += len(combined_graph.nodes)
|
|
1471
|
+
else:
|
|
1472
|
+
combined_graph = self.flow_graph
|
|
1281
1473
|
new_node_id = generate_node_id()
|
|
1282
1474
|
use_native = how == "diagonal_relaxed" and parallel and not rechunk
|
|
1283
|
-
|
|
1284
1475
|
if use_native:
|
|
1285
1476
|
# Create union input for the transform schema
|
|
1286
1477
|
union_input = transform_schema.UnionInput(
|
|
@@ -1314,7 +1505,6 @@ class FlowFrame:
|
|
|
1314
1505
|
input_vars.append(f"input_df_{i+2}")
|
|
1315
1506
|
|
|
1316
1507
|
frames_list = f"[{', '.join(input_vars)}]"
|
|
1317
|
-
|
|
1318
1508
|
code = f"""
|
|
1319
1509
|
# Perform concat operation
|
|
1320
1510
|
output_df = pl.concat(
|
|
@@ -1324,19 +1514,20 @@ class FlowFrame:
|
|
|
1324
1514
|
parallel={parallel}
|
|
1325
1515
|
)
|
|
1326
1516
|
"""
|
|
1327
|
-
|
|
1517
|
+
self.flow_graph = combined_graph
|
|
1328
1518
|
|
|
1329
1519
|
# Add polars code node with dependencies on all input frames
|
|
1330
1520
|
depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
|
|
1331
1521
|
self._add_polars_code(
|
|
1332
1522
|
new_node_id, code, description, depending_on_ids=depending_on_ids
|
|
1333
1523
|
)
|
|
1334
|
-
|
|
1335
1524
|
# Add connections to ensure all frames are available
|
|
1336
1525
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
1526
|
+
|
|
1337
1527
|
for other_frame in others:
|
|
1338
|
-
other_frame._add_connection(other_frame.node_id, new_node_id, "main")
|
|
1339
1528
|
|
|
1529
|
+
other_frame.flow_graph = combined_graph
|
|
1530
|
+
other_frame._add_connection(other_frame.node_id, new_node_id, "main")
|
|
1340
1531
|
# Create and return the new frame
|
|
1341
1532
|
return FlowFrame(
|
|
1342
1533
|
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
@@ -1373,7 +1564,7 @@ class FlowFrame:
|
|
|
1373
1564
|
return False, None
|
|
1374
1565
|
|
|
1375
1566
|
# Extract the output name
|
|
1376
|
-
output_name = expr.
|
|
1567
|
+
output_name = expr.column_name
|
|
1377
1568
|
|
|
1378
1569
|
if ".over(" not in expr._repr_str:
|
|
1379
1570
|
# Simple cumulative count can be implemented as a record ID with offset=1
|
|
@@ -1456,62 +1647,70 @@ class FlowFrame:
|
|
|
1456
1647
|
return False, None
|
|
1457
1648
|
|
|
1458
1649
|
def with_columns(
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1650
|
+
self,
|
|
1651
|
+
*exprs: Union[Expr, Iterable[Expr], Any], # Allow Any for implicit lit conversion
|
|
1652
|
+
flowfile_formulas: Optional[List[str]] = None,
|
|
1653
|
+
output_column_names: Optional[List[str]] = None,
|
|
1654
|
+
description: Optional[str] = None,
|
|
1655
|
+
**named_exprs: Union[Expr, Any], # Allow Any for implicit lit conversion
|
|
1465
1656
|
) -> "FlowFrame":
|
|
1466
1657
|
"""
|
|
1467
|
-
Add
|
|
1468
|
-
|
|
1469
|
-
Parameters
|
|
1470
|
-
----------
|
|
1471
|
-
exprs : Expr or List[Expr], optional
|
|
1472
|
-
Expressions to evaluate as new columns
|
|
1473
|
-
flowfile_formulas : List[str], optional
|
|
1474
|
-
Alternative approach using flowfile formula syntax
|
|
1475
|
-
output_column_names : List[str], optional
|
|
1476
|
-
Column names for the flowfile formulas
|
|
1477
|
-
description : str, optional
|
|
1478
|
-
Description of this operation for the ETL graph
|
|
1479
|
-
|
|
1480
|
-
Returns
|
|
1481
|
-
-------
|
|
1482
|
-
FlowFrame
|
|
1483
|
-
A new FlowFrame with the columns added
|
|
1484
|
-
|
|
1485
|
-
Raises
|
|
1486
|
-
------
|
|
1487
|
-
ValueError
|
|
1488
|
-
If neither exprs nor flowfile_formulas with output_column_names are provided,
|
|
1489
|
-
or if the lengths of flowfile_formulas and output_column_names don't match
|
|
1658
|
+
Add or replace columns in the DataFrame.
|
|
1490
1659
|
"""
|
|
1491
|
-
|
|
1492
|
-
new_node_id = generate_node_id()
|
|
1493
|
-
exprs_iterable = _parse_inputs_as_iterable((exprs,))
|
|
1660
|
+
new_node_id = generate_node_id()
|
|
1494
1661
|
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1662
|
+
all_input_expr_objects: List[Expr] = []
|
|
1663
|
+
pure_polars_expr_strings_for_wc: List[str] = []
|
|
1664
|
+
collected_raw_definitions: List[str] = []
|
|
1665
|
+
|
|
1666
|
+
has_exprs_or_named_exprs = bool(exprs or named_exprs)
|
|
1667
|
+
if has_exprs_or_named_exprs:
|
|
1668
|
+
actual_exprs_to_process: List[Expr] = []
|
|
1669
|
+
temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
|
|
1670
|
+
|
|
1671
|
+
for item in temp_exprs_iterable:
|
|
1672
|
+
if isinstance(item, Expr):
|
|
1673
|
+
actual_exprs_to_process.append(item)
|
|
1674
|
+
else: # auto-lit for non-Expr positional args
|
|
1675
|
+
actual_exprs_to_process.append(lit(item))
|
|
1676
|
+
|
|
1677
|
+
for name, val_expr in named_exprs.items():
|
|
1678
|
+
if isinstance(val_expr, Expr):
|
|
1679
|
+
actual_exprs_to_process.append(val_expr.alias(name)) # type: ignore # Assuming Expr has alias
|
|
1680
|
+
else: # auto-lit for named args and then alias
|
|
1681
|
+
actual_exprs_to_process.append(lit(val_expr).alias(name)) # type: ignore
|
|
1682
|
+
|
|
1683
|
+
if len(actual_exprs_to_process) == 1 and isinstance(actual_exprs_to_process[0], Expr):
|
|
1684
|
+
pass
|
|
1685
|
+
|
|
1686
|
+
for current_expr_obj in actual_exprs_to_process:
|
|
1687
|
+
all_input_expr_objects.append(current_expr_obj)
|
|
1688
|
+
pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
|
|
1689
|
+
pure_polars_expr_strings_for_wc.append(pure_expr_str) # with_columns takes individual expressions
|
|
1690
|
+
if raw_defs_str and raw_defs_str not in collected_raw_definitions:
|
|
1691
|
+
collected_raw_definitions.append(raw_defs_str)
|
|
1692
|
+
|
|
1693
|
+
polars_operation_code = f"input_df.with_columns([{', '.join(pure_polars_expr_strings_for_wc)}])"
|
|
1694
|
+
|
|
1695
|
+
final_code_for_node: str
|
|
1696
|
+
if collected_raw_definitions:
|
|
1697
|
+
unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
|
|
1698
|
+
definitions_section = "\n\n".join(unique_raw_definitions)
|
|
1699
|
+
final_code_for_node = definitions_section + \
|
|
1700
|
+
"\n#─────SPLIT─────\n\n" + \
|
|
1701
|
+
f"output_df = {polars_operation_code}"
|
|
1702
|
+
else:
|
|
1703
|
+
final_code_for_node = polars_operation_code
|
|
1507
1704
|
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1705
|
+
pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
|
|
1706
|
+
isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
|
|
1707
|
+
self._add_polars_code(new_node_id, final_code_for_node, description, method_name='with_columns',
|
|
1708
|
+
convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
|
|
1709
|
+
polars_expr=pl_expressions_for_fallback)
|
|
1512
1710
|
return self._create_child_frame(new_node_id)
|
|
1513
1711
|
|
|
1514
1712
|
elif flowfile_formulas is not None and output_column_names is not None:
|
|
1713
|
+
|
|
1515
1714
|
if len(output_column_names) != len(flowfile_formulas):
|
|
1516
1715
|
raise ValueError(
|
|
1517
1716
|
"Length of both the formulas and the output columns names must be identical"
|
|
@@ -1524,9 +1723,7 @@ class FlowFrame:
|
|
|
1524
1723
|
ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
|
|
1525
1724
|
return ff
|
|
1526
1725
|
else:
|
|
1527
|
-
raise ValueError(
|
|
1528
|
-
"Either exprs or flowfile_formulas with output_column_names must be provided"
|
|
1529
|
-
)
|
|
1726
|
+
raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")
|
|
1530
1727
|
|
|
1531
1728
|
def with_row_index(
|
|
1532
1729
|
self, name: str = "index", offset: int = 0, description: str = None
|
|
@@ -1614,26 +1811,27 @@ class FlowFrame:
|
|
|
1614
1811
|
|
|
1615
1812
|
if isinstance(columns, (list, tuple)):
|
|
1616
1813
|
all_columns.extend(
|
|
1617
|
-
[col.
|
|
1814
|
+
[col.column_name if isinstance(col, Column) else col for col in columns]
|
|
1618
1815
|
)
|
|
1619
1816
|
else:
|
|
1620
|
-
all_columns.append(columns.
|
|
1817
|
+
all_columns.append(columns.column_name if isinstance(columns, Column) else columns)
|
|
1621
1818
|
|
|
1622
1819
|
if more_columns:
|
|
1623
1820
|
for col in more_columns:
|
|
1624
|
-
all_columns.append(col.
|
|
1821
|
+
all_columns.append(col.column_name if isinstance(col, Column) else col)
|
|
1625
1822
|
|
|
1626
1823
|
if len(all_columns) == 1:
|
|
1627
|
-
|
|
1824
|
+
|
|
1825
|
+
columns_str = stringify_values(all_columns[0])
|
|
1628
1826
|
else:
|
|
1629
|
-
columns_str = "[" + ", ".join([
|
|
1827
|
+
columns_str = "[" + ", ".join([ stringify_values(col) for col in all_columns]) + "]"
|
|
1630
1828
|
|
|
1631
1829
|
code = f"""
|
|
1632
1830
|
# Explode columns into multiple rows
|
|
1633
1831
|
output_df = input_df.explode({columns_str})
|
|
1634
1832
|
"""
|
|
1635
1833
|
|
|
1636
|
-
cols_desc = ", ".join(all_columns)
|
|
1834
|
+
cols_desc = ", ".join(str(s) for s in all_columns)
|
|
1637
1835
|
desc = description or f"Explode column(s): {cols_desc}"
|
|
1638
1836
|
|
|
1639
1837
|
# Add polars code node
|
|
@@ -1676,7 +1874,7 @@ class FlowFrame:
|
|
|
1676
1874
|
new_node_id = generate_node_id()
|
|
1677
1875
|
|
|
1678
1876
|
if isinstance(column, Column):
|
|
1679
|
-
column_name = column.
|
|
1877
|
+
column_name = column.column_name
|
|
1680
1878
|
else:
|
|
1681
1879
|
column_name = column
|
|
1682
1880
|
|
|
@@ -1760,7 +1958,7 @@ class FlowFrame:
|
|
|
1760
1958
|
if col_expr._select_input.is_altered:
|
|
1761
1959
|
can_use_native = False
|
|
1762
1960
|
break
|
|
1763
|
-
processed_subset.append(col_expr.
|
|
1961
|
+
processed_subset.append(col_expr.column_name)
|
|
1764
1962
|
else:
|
|
1765
1963
|
can_use_native = False
|
|
1766
1964
|
break
|
|
@@ -1848,650 +2046,34 @@ class FlowFrame:
         """Get the number of columns."""
         return self.data.width
 
+    def __contains__(self, key):
+        """This special method enables the 'in' operator to work with FlowFrame objects."""
+        return key in self.data
 
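Illustration (not part of the diff): `__contains__` simply forwards membership tests to the wrapped frame in `self.data`, so `in` follows whatever semantics that object defines (for polars eager frames this is a column-name check; behavior for other wrapped objects is an assumption):

    if "price" in df:        # delegates to: "price" in df.data
        df = df.select("price")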
-def _add_delegated_methods():
-
-
-    delegate_methods = [
-        "profile",
-        "describe",
-        "explain",
-        "show_graph",
-        "serialize",
-        "fetch",
-        "get_meta",
-        "columns",
-        "dtypes",
-        "schema",
-        "estimated_size",
-        "n_chunks",
-        "is_empty",
-        "chunk_lengths",
-        "optimization_toggle",
-        "set_polars_options",
-        "collect_schema"
-    ]
-
-    already_implemented = set(dir(FlowFrame))
-
-    for method_name in delegate_methods:
-        if method_name not in already_implemented and hasattr(
-            pl.LazyFrame, method_name
-        ):
-            # Create a simple delegate method
-            def make_delegate(name):
-                def delegate_method(self, *args, **kwargs):
-                    return getattr(self.data, name)(*args, **kwargs)
-
-                # Set docstring and name
-                delegate_method.__doc__ = (
-                    f"See pl.LazyFrame.{name} for full documentation."
-                )
-                delegate_method.__name__ = name
-                return delegate_method
-
-            # Add the method to the class
-            setattr(FlowFrame, method_name, make_delegate(method_name))
-
-
-_add_delegated_methods()
-
-
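Illustration (not part of the diff): the removed `_add_delegated_methods` relies on a closure factory so that each generated method captures its own name. A minimal standalone sketch of that pattern (class and method names here are invented for demonstration):

    import polars as pl

    class Wrapper:
        def __init__(self, data: pl.LazyFrame):
            self.data = data

    def make_delegate(name):
        # Binding `name` through a factory avoids the classic late-binding
        # bug of defining the inner function directly inside the loop.
        def delegate(self, *args, **kwargs):
            return getattr(self.data, name)(*args, **kwargs)
        delegate.__name__ = name
        return delegate

    for method_name in ("explain", "collect_schema"):
        setattr(Wrapper, method_name, make_delegate(method_name))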
-def sum(expr):
-    """Sum aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.sum()
-
-
-def mean(expr):
-    """Mean aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.mean()
-
-
-def min(expr):
-    """Min aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.min()
-
-
-def max(expr):
-    """Max aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.max()
-
-
-def count(expr):
-    """Count aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.count()
-
-
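Illustration (not part of the diff): the five removed helpers share one coercion idiom — accept a column name or an expression, promote strings, then call the matching expression method. A generic sketch of the idiom, using `pl.col` as a stand-in for the module's own `col`:

    import polars as pl

    def make_agg(method: str):
        def agg(expr):
            if isinstance(expr, str):
                expr = pl.col(expr)          # promote bare column names
            return getattr(expr, method)()   # expr.sum(), expr.mean(), ...
        agg.__name__ = method
        return agg

    sum_agg, mean_agg = make_agg("sum"), make_agg("mean")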
-def read_csv(
-    source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
-    *,
-    flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
-    separator: str = ',',
-    convert_to_absolute_path: bool = True,
-    description: Optional[str] = None,
-    has_header: bool = True,
-    new_columns: Optional[List[str]] = None,
-    comment_prefix: Optional[str] = None,
-    quote_char: Optional[str] = '"',
-    skip_rows: int = 0,
-    skip_lines: int = 0,
-    schema: Optional[SchemaDict] = None,
-    schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
-    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
-    missing_utf8_is_empty_string: bool = False,
-    ignore_errors: bool = False,
-    try_parse_dates: bool = False,
-    infer_schema: bool = True,
-    infer_schema_length: Optional[int] = 100,
-    n_rows: Optional[int] = None,
-    encoding: CsvEncoding = 'utf8',
-    low_memory: bool = False,
-    rechunk: bool = False,
-    storage_options: Optional[Dict[str, Any]] = None,
-    skip_rows_after_header: int = 0,
-    row_index_name: Optional[str] = None,
-    row_index_offset: int = 0,
-    eol_char: str = '\n',
-    raise_if_empty: bool = True,
-    truncate_ragged_lines: bool = False,
-    decimal_comma: bool = False,
-    glob: bool = True,
-    cache: bool = True,
-    with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
-    **other_options: Any
-) -> FlowFrame:
-    """
-    Read a CSV file into a FlowFrame.
-
-    This function uses the native FlowGraph implementation when the parameters
-    fall within the supported range, and falls back to using Polars' scan_csv implementation
-    for more advanced features.
-
-    Args:
-        source: Path(s) to CSV file(s), or a file-like object.
-        flow_graph: if you want to add it to an existing graph
-        separator: Single byte character to use as separator in the file.
-        convert_to_absolute_path: If the path needs to be set to a fixed location
-        description: if you want to add a readable name in the frontend (advised)
-
-        # Polars.scan_csv aligned parameters
-        has_header: Indicate if the first row of the dataset is a header or not.
-        new_columns: Rename columns after selection.
-        comment_prefix: String that indicates a comment line if found at beginning of line.
-        quote_char: Character used for quoting. None to disable.
-        skip_rows: Start reading after this many rows.
-        skip_lines: Skip this many lines by newline char only.
-        schema: Schema to use when reading the CSV.
-        schema_overrides: Schema overrides for specific columns.
-        null_values: Values to interpret as null.
-        missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
-        ignore_errors: Try to keep reading lines if some parsing errors occur.
-        try_parse_dates: Try to automatically parse dates.
-        infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
-        infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
-        n_rows: Stop reading after this many rows.
-        encoding: Character encoding to use.
-        low_memory: Reduce memory usage at the cost of performance.
-        rechunk: Ensure data is in contiguous memory layout after parsing.
-        storage_options: Options for fsspec for cloud storage.
-        skip_rows_after_header: Skip rows after header.
-        row_index_name: Name of the row index column.
-        row_index_offset: Start value for the row index.
-        eol_char: End of line character.
-        raise_if_empty: Raise error if file is empty.
-        truncate_ragged_lines: Truncate lines with too many values.
-        decimal_comma: Parse floats with decimal comma.
-        glob: Use glob pattern for file path (if source is a string).
-        cache: Cache the result after reading (Polars default True).
-        with_column_names: Apply a function over the column names.
-        other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
-
-    Returns:
-        A FlowFrame with the CSV data.
-    """
-    node_id = generate_node_id()  # Assuming generate_node_id is defined
-    if flow_graph is None:
-        flow_graph = create_flow_graph()  # Assuming create_flow_graph is defined
-    flow_id = flow_graph.flow_id
-
-    current_source_path_for_native = None
-    if isinstance(source, (str, os.PathLike)):
-        current_source_path_for_native = str(source)
-        if '~' in current_source_path_for_native:
-            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
-    elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
-        current_source_path_for_native = str(source[0]) if source else None
-        if current_source_path_for_native and '~' in current_source_path_for_native:
-            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
-    elif isinstance(source, (io.BytesIO, io.StringIO)):
-        logger.warning("Read from bytes io from csv not supported, converting data to raw data")
-        return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
-    actual_infer_schema_length: Optional[int]
-    if not infer_schema:
-        actual_infer_schema_length = 0
-    else:
-        actual_infer_schema_length = infer_schema_length
-    can_use_native = (
-        current_source_path_for_native is not None and
-        comment_prefix is None and
-        skip_lines == 0 and
-        schema is None and
-        schema_overrides is None and
-        null_values is None and
-        not missing_utf8_is_empty_string and
-        not try_parse_dates and
-        n_rows is None and
-        not low_memory and
-        not rechunk and
-        storage_options is None and
-        skip_rows_after_header == 0 and
-        row_index_name is None and
-        row_index_offset == 0 and
-        eol_char == '\n' and
-        not decimal_comma and
-        new_columns is None and
-        glob is True
-    )
-    if can_use_native and current_source_path_for_native:
-        received_table = input_schema.ReceivedTable(
-            file_type='csv',
-            path=current_source_path_for_native,
-            name=Path(current_source_path_for_native).name,
-            delimiter=separator,
-            has_headers=has_header,
-            encoding=encoding,
-            starting_from_line=skip_rows,
-            quote_char=quote_char if quote_char is not None else '"',
-            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
-            truncate_ragged_lines=truncate_ragged_lines,
-            ignore_errors=ignore_errors,
-            row_delimiter=eol_char
-        )
-        if convert_to_absolute_path:
-            try:
-                received_table.set_absolute_filepath()
-                received_table.path = received_table.abs_file_path
-            except Exception as e:
-                print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
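Illustration (not part of the diff): the removed `read_csv` routes between two implementations, as its docstring above describes. A hedged usage sketch of calls that would take each path; the module alias and file names are assumptions:

    import flowfile_frame as ff

    # Plain parameters: every can_use_native condition holds, so a
    # native FlowGraph read node is created.
    df = ff.read_csv("sales.csv", separator=";", has_header=True)

    # n_rows is not None fails the check, so a pl.scan_csv(...) string
    # is generated and attached as a polars-code node instead.
    df = ff.read_csv("sales.csv", n_rows=1000)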
+    def __bool__(self):
+        """This special method determines how the object behaves in boolean contexts.
+        Returns True if the FlowFrame contains any data, False otherwise."""
+        return bool(self.data)
 
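Illustration (not part of the diff): `__bool__` forwards truthiness to `bool(self.data)`, so the docstring's "contains any data" claim holds only where the wrapped object defines truthiness that way; a bare `if frame:` otherwise follows the underlying frame's behavior:

    if df:                        # evaluates bool(df.data)
        print("frame is truthy")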
-        read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
-        read_node = input_schema.NodeRead(
-            flow_id=flow_id,
-            node_id=node_id,
-            received_file=received_table,
-            pos_x=100,
-            pos_y=100,
-            is_setup=True,
-            description=read_node_description
-        )
-        flow_graph.add_read(read_node)
-        result_frame = FlowFrame(
-            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-            flow_graph=flow_graph,
-            node_id=node_id
-        )
-        return result_frame
-    else:
-        polars_source_arg = source
-        polars_code = _build_polars_code_args(
-            source=polars_source_arg,
-            separator=separator,
-            has_header=has_header,
-            new_columns=new_columns,
-            comment_prefix=comment_prefix,
-            quote_char=quote_char,
-            skip_rows=skip_rows,
-            skip_lines=skip_lines,
-            schema=schema,
-            schema_overrides=schema_overrides,
-            null_values=null_values,
-            missing_utf8_is_empty_string=missing_utf8_is_empty_string,
-            ignore_errors=ignore_errors,
-            try_parse_dates=try_parse_dates,
-            infer_schema_length=actual_infer_schema_length,
-            n_rows=n_rows,
-            encoding=encoding,
-            low_memory=low_memory,
-            rechunk=rechunk,
-            storage_options=storage_options,
-            skip_rows_after_header=skip_rows_after_header,
-            row_index_name=row_index_name,
-            row_index_offset=row_index_offset,
-            eol_char=eol_char,
-            raise_if_empty=raise_if_empty,
-            truncate_ragged_lines=truncate_ragged_lines,
-            decimal_comma=decimal_comma,
-            glob=glob,
-            cache=cache,
-            with_column_names=with_column_names,
-            **other_options
-        )
-        polars_code_node_description = description or "Read CSV with Polars scan_csv"
-        if isinstance(source, (str, os.PathLike)):
-            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
-        elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
-            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
-
-        # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
-        polars_code_settings = input_schema.NodePolarsCode(
-            flow_id=flow_id,
-            node_id=node_id,
-            polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
-            is_setup=True,
-            description=polars_code_node_description
-        )
-        flow_graph.add_polars_code(polars_code_settings)
-        return FlowFrame(
-            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-            flow_graph=flow_graph,
-            node_id=node_id,
-        )
-
-
-def _build_polars_code_args(
-    source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
-    separator: str,
-    has_header: bool,
-    new_columns: Optional[List[str]],
-    comment_prefix: Optional[str],
-    quote_char: Optional[str],
-    skip_rows: int,
-    skip_lines: int,
-    schema: Optional[SchemaDict],
-    schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
-    null_values: Optional[Union[str, List[str], Dict[str, str]]],
-    missing_utf8_is_empty_string: bool,
-    ignore_errors: bool,
-    try_parse_dates: bool,
-    infer_schema_length: Optional[int],
-    n_rows: Optional[int],
-    encoding: CsvEncoding,
-    low_memory: bool,
-    rechunk: bool,
-    storage_options: Optional[Dict[str, Any]],
-    skip_rows_after_header: int,
-    row_index_name: Optional[str],
-    row_index_offset: int,
-    eol_char: str,
-    raise_if_empty: bool,
-    truncate_ragged_lines: bool,
-    decimal_comma: bool,
-    glob: bool,
-    cache: bool,
-    with_column_names: Optional[Callable[[List[str]], List[str]]],
-    **other_options: Any
-) -> str:
-    source_repr: str
-    if isinstance(source, (str, Path)):
-        source_repr = repr(str(source))
-    elif isinstance(source, list):
-        source_repr = repr([str(p) for p in source])
-    elif isinstance(source, bytes):
-        source_repr = "source_bytes_obj"
-    elif hasattr(source, 'read'):
-        source_repr = "source_file_like_obj"
-    else:
-        source_repr = repr(source)
-
-    param_mapping = {
-        'has_header': (True, lambda x: str(x)),
-        'separator': (',', lambda x: repr(str(x))),
-        'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
-        'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
-        'skip_rows': (0, str),
-        'skip_lines': (0, str),
-        'schema': (None, lambda x: repr(x) if x is not None else 'None'),
-        'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
-        'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
-        'missing_utf8_is_empty_string': (False, str),
-        'ignore_errors': (False, str),
-        'cache': (True, str),
-        'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
-        'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
-        'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
-        'encoding': ('utf8', lambda x: repr(str(x))),
-        'low_memory': (False, str),
-        'rechunk': (False, str),
-        'skip_rows_after_header': (0, str),
-        'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
-        'row_index_offset': (0, str),
-        'try_parse_dates': (False, str),
-        'eol_char': ('\n', lambda x: repr(str(x))),
-        'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
-        'raise_if_empty': (True, str),
-        'truncate_ragged_lines': (False, str),
-        'decimal_comma': (False, str),
-        'glob': (True, str),
-        'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
-    }
-
-    all_vars = locals()
-    kwargs_list = []
-
-    for param_name_key, (default_value, format_func) in param_mapping.items():
-        value = all_vars.get(param_name_key)
-        formatted_value = format_func(value)
-        kwargs_list.append(f"{param_name_key}={formatted_value}")
-
-    if other_options:
-        for k, v in other_options.items():
-            kwargs_list.append(f"{k}={repr(v)}")
-
-    kwargs_str = ",\n ".join(kwargs_list)
-
-    if kwargs_str:
-        polars_code = f"output_df = pl.scan_csv(\n {source_repr},\n {kwargs_str}\n)"
-    else:
-        polars_code = f"output_df = pl.scan_csv({source_repr})"
-
-    return polars_code
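Illustration (not part of the diff): `_build_polars_code_args` (deleted above) renders keyword arguments through a `name -> (default, formatter)` table. A minimal sketch of the same table-driven technique, extended with the obvious refinement of skipping values still at their default:

    param_mapping = {
        "separator": (",", lambda x: repr(str(x))),
        "has_header": (True, str),
        "n_rows": (None, lambda x: str(x) if x is not None else "None"),
    }

    def render_kwargs(values: dict) -> str:
        parts = []
        for name, (default, fmt) in param_mapping.items():
            value = values.get(name, default)
            if value != default:          # omit arguments left at the default
                parts.append(f"{name}={fmt(value)}")
        return ", ".join(parts)

    print(render_kwargs({"separator": ";"}))   # separator=';'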
-
-
-def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
-                 convert_to_absolute_path: bool = True, **options) -> FlowFrame:
-    """
-    Read a Parquet file into a FlowFrame.
-
-    Args:
-        file_path: Path to Parquet file
-        flow_graph: if you want to add it to an existing graph
-        description: if you want to add a readable name in the frontend (advised)
-        convert_to_absolute_path: If the path needs to be set to a fixed location
-        **options: Options for polars.read_parquet
-
-    Returns:
-        A FlowFrame with the Parquet data
-    """
-    if '~' in file_path:
-        file_path = os.path.expanduser(file_path)
-    node_id = generate_node_id()
-
-    if flow_graph is None:
-        flow_graph = create_flow_graph()
-
-    flow_id = flow_graph.flow_id
-
-    received_table = input_schema.ReceivedTable(
-        file_type='parquet',
-        path=file_path,
-        name=Path(file_path).name,
-    )
-    if convert_to_absolute_path:
-        received_table.path = received_table.abs_file_path
-
-    read_node = input_schema.NodeRead(
-        flow_id=flow_id,
-        node_id=node_id,
-        received_file=received_table,
-        pos_x=100,
-        pos_y=100,
-        is_setup=True,
-        description=description
-    )
-
-    flow_graph.add_read(read_node)
-
-    return FlowFrame(
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
-
-
-def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
-    """
-    Create a FlowFrame from a dictionary or list of dictionaries.
-
-    Args:
-        data: Dictionary of lists or list of dictionaries
-        flow_graph: if you want to add it to an existing graph
-        description: if you want to add a readable name in the frontend (advised)
-    Returns:
-        A FlowFrame with the data
-    """
-    # Create new node ID
-    node_id = generate_node_id()
-
-    if not flow_graph:
-        flow_graph = create_flow_graph()
-    flow_id = flow_graph.flow_id
-
-    input_node = input_schema.NodeManualInput(
-        flow_id=flow_id,
-        node_id=node_id,
-        raw_data=FlowDataEngine(data).to_pylist(),
-        pos_x=100,
-        pos_y=100,
-        is_setup=True,
-        description=description
-    )
-
-    # Add to graph
-    flow_graph.add_manual_input(input_node)
-
-    # Return new frame
-    return FlowFrame(
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
-
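Illustration (not part of the diff): the removed `from_dict` wraps the input in a manual-input node, so the seed data travels with the graph. A hedged usage sketch; the module alias and column names are assumptions:

    import flowfile_frame as ff

    df = ff.from_dict(
        {"city": ["NY", "LA"], "population": [8.4, 3.9]},
        description="seed data",
    )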
-
-def concat(frames: List['FlowFrame'],
-           how: str = 'vertical',
-           rechunk: bool = False,
-           parallel: bool = True,
-           description: str = None) -> 'FlowFrame':
-    """
-    Concatenate multiple FlowFrames into one.
-
-    Parameters
-    ----------
-    frames : List[FlowFrame]
-        List of FlowFrames to concatenate
-    how : str, default 'vertical'
-        How to combine the FlowFrames (see concat method documentation)
-    rechunk : bool, default False
-        Whether to ensure contiguous memory in result
-    parallel : bool, default True
-        Whether to use parallel processing for the operation
-    description : str, optional
-        Description of this operation
-
-    Returns
-    -------
-    FlowFrame
-        A new FlowFrame with the concatenated data
-    """
-    if not frames:
-        raise ValueError("No frames provided to concat_frames")
-
-    if len(frames) == 1:
-        return frames[0]
-
-    # Use first frame's concat method with remaining frames
-    first_frame = frames[0]
-    remaining_frames = frames[1:]
-
-    return first_frame.concat(remaining_frames, how=how,
-                              rechunk=rechunk, parallel=parallel,
-                              description=description)
-
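Illustration (not part of the diff): the removed module-level `concat` validates its input and then defers to the first frame's own `concat` method, so the two spellings below are equivalent (frames `a` and `b` assumed to exist):

    combined = concat([a, b], how="vertical")   # module-level helper
    combined = a.concat([b], how="vertical")    # what it delegates to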
-
-def scan_csv(
-    source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
-    *,
-    flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
-    separator: str = ',',
-    convert_to_absolute_path: bool = True,
-    description: Optional[str] = None,
-    has_header: bool = True,
-    new_columns: Optional[List[str]] = None,
-    comment_prefix: Optional[str] = None,
-    quote_char: Optional[str] = '"',
-    skip_rows: int = 0,
-    skip_lines: int = 0,
-    schema: Optional[SchemaDict] = None,
-    schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
-    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
-    missing_utf8_is_empty_string: bool = False,
-    ignore_errors: bool = False,
-    try_parse_dates: bool = False,
-    infer_schema: bool = True,
-    infer_schema_length: Optional[int] = 100,
-    n_rows: Optional[int] = None,
-    encoding: CsvEncoding = 'utf8',
-    low_memory: bool = False,
-    rechunk: bool = False,
-    storage_options: Optional[Dict[str, Any]] = None,
-    skip_rows_after_header: int = 0,
-    row_index_name: Optional[str] = None,
-    row_index_offset: int = 0,
-    eol_char: str = '\n',
-    raise_if_empty: bool = True,
-    truncate_ragged_lines: bool = False,
-    decimal_comma: bool = False,
-    glob: bool = True,
-    cache: bool = True,
-    with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
-    **other_options: Any
-) -> FlowFrame:
-    """
-    Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
+    @staticmethod
+    def _comparison_error(operator: str) -> pl.lazyframe.frame.NoReturn:
+        msg = f'"{operator!r}" comparison not supported for LazyFrame objects'
+        raise TypeError(msg)
 
-
-
+    def __eq__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("==")
 
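Illustration (not part of the diff): the new comparison dunders copy `pl.LazyFrame`'s stance that frame comparisons are ambiguous, so every operator routes to `_comparison_error` and raises rather than guessing element-wise semantics:

    try:
        df == other          # ==, !=, <, <=, >, >= all behave the same
    except TypeError as exc:
        print(exc)           # comparison not supported for LazyFrame objects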
-
-    """
-    return read_csv(
-        source=source,
-        flow_graph=flow_graph,
-        separator=separator,
-        convert_to_absolute_path=convert_to_absolute_path,
-        description=description,
-        has_header=has_header,
-        new_columns=new_columns,
-        comment_prefix=comment_prefix,
-        quote_char=quote_char,
-        skip_rows=skip_rows,
-        skip_lines=skip_lines,
-        schema=schema,
-        schema_overrides=schema_overrides,
-        null_values=null_values,
-        missing_utf8_is_empty_string=missing_utf8_is_empty_string,
-        ignore_errors=ignore_errors,
-        try_parse_dates=try_parse_dates,
-        infer_schema=infer_schema,
-        infer_schema_length=infer_schema_length,
-        n_rows=n_rows,
-        encoding=encoding,
-        low_memory=low_memory,
-        rechunk=rechunk,
-        storage_options=storage_options,
-        skip_rows_after_header=skip_rows_after_header,
-        row_index_name=row_index_name,
-        row_index_offset=row_index_offset,
-        eol_char=eol_char,
-        raise_if_empty=raise_if_empty,
-        truncate_ragged_lines=truncate_ragged_lines,
-        decimal_comma=decimal_comma,
-        glob=glob,
-        cache=cache,
-        with_column_names=with_column_names,
-        **other_options
-    )
+    def __ne__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("!=")
 
+    def __gt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error(">")
 
-def scan_parquet(
-    file_path,
-    *,
-    flow_graph: FlowGraph = None,
-    description: str = None,
-    convert_to_absolute_path: bool = True,
-    **options
-) -> FlowFrame:
-    """
-    Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
+    def __lt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("<")
 
-
-    """
+    def __ge__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error(">=")
 
-
-
-    return read_parquet(
-        file_path=file_path,
-        flow_graph=flow_graph,
-        description=description,
-        convert_to_absolute_path=convert_to_absolute_path,
-        **options
-    )
+    def __le__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("<=")