Flowfile 0.3.2-py3-none-any.whl → 0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile has been flagged as possibly problematic.

Files changed (46)
  1. flowfile/__init__.py +2 -1
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -1
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +587 -1002
  32. flowfile_frame/flow_frame.pyi +336 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile_frame/flow_frame.py

@@ -1,15 +1,15 @@
- import logging
+ import inspect
  import os
- from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
- from pathlib import Path
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin

- import io
  import re
+
  import polars as pl
- from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
-                             Sequence, CsvEncoding)

- # Assume these imports are correct from your original context
+ from flowfile_frame.lazy_methods import add_lazyframe_methods
+
+ from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
+ from collections.abc import Iterator
  from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
  from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
@@ -19,19 +19,35 @@ from flowfile_core.schemas import input_schema, transform_schema
  from flowfile_frame.expr import Expr, Column, lit, col
  from flowfile_frame.selectors import Selector
  from flowfile_frame.group_frame import GroupByFrame
- from flowfile_frame.utils import _parse_inputs_as_iterable, create_flow_graph
+ from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
+                                   ensure_inputs_as_iterable)
  from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
+ from flowfile_frame.utils import _check_if_convertible_to_code
+ from flowfile_frame.config import logger
+

  node_id_counter = 0


- logging.basicConfig(
-     level=logging.INFO,
-     format='[%(levelname)s] %(message)s'
- )
+ def can_be_expr(param: inspect.Parameter) -> bool:
+     """Check if a parameter can be of type pl.Expr"""
+     if param.annotation == inspect.Parameter.empty:
+         return False
+
+     # Check direct match or in Union args
+     types = get_args(param.annotation) if get_origin(param.annotation) is Union else [param.annotation]
+     return any(t in (pl.Expr, pl.expr.expr.Expr) for t in types)
+
+
+ def _contains_lambda_pattern(text: str) -> bool:
+     return "<lambda> at" in text
+
+
+ def get_method_name_from_code(code: str) -> str | None:
+     split_code = code.split("input_df.")
+     if len(split_code) > 1:
+         return split_code[1].split("(")[0]

- # Create and export the logger
- logger = logging.getLogger('flow_frame')

  def _to_string_val(v) -> str:
      if isinstance(v, str):
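The three helpers added above feed the serialization fallback that appears later in this diff: get_method_name_from_code recovers the LazyFrame method from a generated code string, and _contains_lambda_pattern spots reprs of anonymous functions that cannot be round-tripped as code. A minimal standalone sketch of their behavior, assuming the definitions from this hunk are in scope (example is a hypothetical function used only for the demonstration):

    import inspect
    from typing import Union

    import polars as pl

    def example(a: Union[pl.Expr, str], b: int) -> None:
        """Hypothetical signature, used only to exercise can_be_expr."""

    params = inspect.signature(example).parameters
    print(can_be_expr(params["a"]))  # True: pl.Expr appears among the Union args
    print(can_be_expr(params["b"]))  # False: int is not pl.Expr, nor a Union containing it
    print(get_method_name_from_code("input_df.sort(pl.col('a'))"))  # 'sort'
    print(_contains_lambda_pattern("<function <lambda> at 0x10>"))  # True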
@@ -40,12 +56,72 @@ def _to_string_val(v) -> str:
      return v


+ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
+     """
+     Extract the pure expression string and any raw definitions (including function sources) from an Expr object.
+
+     Parameters
+     ----------
+     expr_obj : Expr
+         The expression object to extract parts from
+
+     Returns
+     -------
+     tuple[str, str]
+         A tuple of (pure_expr_str, raw_definitions_str)
+     """
+     if not isinstance(expr_obj, Expr):
+         # If it's not an Expr, just return its string representation
+         return str(expr_obj), ""
+
+     # Get the basic representation
+     pure_expr_str = expr_obj._repr_str
+
+     # Collect all definitions (function sources)
+     raw_definitions = []
+
+     # Add function sources if any
+     if hasattr(expr_obj, '_function_sources') and expr_obj._function_sources:
+         # Remove duplicates while preserving order
+         unique_sources = []
+         seen = set()
+         for source in expr_obj._function_sources:
+             if source not in seen:
+                 seen.add(source)
+                 unique_sources.append(source)
+
+         if unique_sources:
+             raw_definitions.extend(unique_sources)
+
+     # Join all definitions
+     raw_defs_str = "\n\n".join(raw_definitions) if raw_definitions else ""
+
+     return pure_expr_str, raw_defs_str
+
+
+ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
+                                 group_expr: pl.Expr | None = None) -> None:
+     if method_name is None:
+         raise NotImplementedError("Cannot create a polars lambda expression without the method")
+     if polars_expr is None:
+         raise NotImplementedError("Cannot create polars expressions with lambda function")
+     method_ref = getattr(pl.LazyFrame, method_name, None)
+     if method_ref is None:
+         raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
+     if method_name == 'group_by':
+         if group_expr is None:
+             raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
+         if not all(isinstance(ge, pl.Expr) for ge in group_expr):
+             raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
+
+
  def generate_node_id() -> int:
      global node_id_counter
      node_id_counter += 1
      return node_id_counter


+ @add_lazyframe_methods
  class FlowFrame:
      """Main class that wraps FlowDataEngine and maintains the ETL graph."""
      flow_graph: FlowGraph
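_extract_expr_parts is what lets generated node code reference user-defined functions: the expression string stays compact, while function sources are emitted once in a separate definitions block. A sketch of the contract, under the assumption that expr is a flowfile_frame Expr carrying captured sources in _function_sources (the attribute read above):

    # Assumed input: expr._repr_str == "pl.col('value').map_elements(double)" and
    # expr._function_sources holds the source text of `double` (possibly repeated).
    pure, defs = _extract_expr_parts(expr)
    # pure -> "pl.col('value').map_elements(double)"  (the compact expression string)
    # defs -> the source of `double`, deduplicated in order, entries joined by blank lines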
@@ -100,13 +176,11 @@ class FlowFrame:
          # Extract flow-specific parameters
          node_id = node_id or generate_node_id()
          description = "Data imported from Python object"
-
          # Create a new flow graph if none is provided
          if flow_graph is None:
              flow_graph = create_flow_graph()

          flow_id = flow_graph.flow_id
-
          # Convert data to a polars DataFrame/LazyFrame
          try:
              # Use polars to convert from various types
@@ -121,25 +195,23 @@ class FlowFrame:
              )
              pl_data = pl_df.lazy()
          except Exception as e:
              raise ValueError(f"Could not convert data to a polars DataFrame: {e}")
-
          # Create a FlowDataEngine to get data in the right format for manual input
          flow_table = FlowDataEngine(raw_data=pl_data)
-
+         raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
+                                                columns=[c.get_minimal_field_info() for c in flow_table.schema])
          # Create a manual input node
          input_node = input_schema.NodeManualInput(
              flow_id=flow_id,
              node_id=node_id,
-             raw_data=flow_table.to_pylist(),  # Convert to list of dicts
+             raw_data_format=raw_data_format,
              pos_x=100,
              pos_y=100,
              is_setup=True,
              description=description,
          )
-
          # Add to graph
          flow_graph.add_manual_input(input_node)
-
          # Return new frame
          return FlowFrame(
              data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
@@ -163,7 +235,6 @@ class FlowFrame:
          parent_node_id=None,
      ):
          """Create a new FlowFrame instance."""
-
          # If data is not a LazyFrame, use the factory method
          if data is not None and not isinstance(data, pl.LazyFrame):
              return cls.create_from_any_type(
@@ -179,7 +250,6 @@ class FlowFrame:
              parent_node_id=parent_node_id,
          )

-         # Otherwise create the instance normally
          instance = super().__new__(cls)
          return instance

@@ -198,7 +268,6 @@ class FlowFrame:
          parent_node_id=None,
      ):
          """Initialize the FlowFrame with data and graph references."""
-
          if data is None:
              data = pl.LazyFrame()
          if not isinstance(data, pl.LazyFrame):
@@ -230,205 +299,235 @@ class FlowFrame:
      def _create_child_frame(self, new_node_id):
          """Helper method to create a new FlowFrame that's a child of this one"""
          self._add_connection(self.node_id, new_node_id)
-         return FlowFrame(
-             data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
-             flow_graph=self.flow_graph,
-             node_id=new_node_id,
-             parent_node_id=self.node_id,
-         )
+         try:
+             return FlowFrame(
+                 data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
+                 flow_graph=self.flow_graph,
+                 node_id=new_node_id,
+                 parent_node_id=self.node_id,
+             )
+         except AttributeError:
+             raise ValueError('Could not execute the function')

-     def sort(
-         self,
-         by: List[Expr | str] | Expr | str,
-         *more_by,
-         descending: bool | List[bool] = False,
-         nulls_last: bool = False,
-         multithreaded: bool = True,
-         maintain_order: bool = False,
-         description: str = None,
-     ):
+     @staticmethod
+     def _generate_sort_polars_code(
+             pure_sort_expr_strs: List[str],
+             descending_values: List[bool],
+             nulls_last_values: List[bool],
+             multithreaded: bool,
+             maintain_order: bool,
+     ) -> str:
          """
-         Sort the dataframe by the given columns.
+         Generates the `input_df.sort(...)` Polars code string using pure expression strings.
+         """
+         kwargs_for_code: Dict[str, Any] = {}
+         if any(descending_values):
+             kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
+         if any(nulls_last_values):
+             kwargs_for_code["nulls_last"] = nulls_last_values[0] if len(nulls_last_values) == 1 else nulls_last_values
+         if not multithreaded:
+             kwargs_for_code["multithreaded"] = multithreaded
+         if maintain_order:
+             kwargs_for_code["maintain_order"] = maintain_order

-         Parameters:
-         -----------
-         by : Expr, str, or list of Expr/str
-             Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
-         *more_by : Expr or str
-             Additional columns to sort by, specified as positional arguments.
-         descending : bool or list of bool, default False
-             Sort in descending order. When sorting by multiple columns, can be specified per column.
-         nulls_last : bool or list of bool, default False
-             Place null values last; can specify a single boolean or a sequence for per-column control.
-         multithreaded : bool, default True
-             Sort using multiple threads.
-         maintain_order : bool, default False
-             Whether the order should be maintained if elements are equal.
-         description : str, optional
-             Description of this operation for the ETL graph.
+         kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())

-         Returns:
-         --------
-         FlowFrame
-             A new FlowFrame with sorted data.
+         by_arg_for_code = pure_sort_expr_strs[0] if len(
+             pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
+         return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
+
+     def sort(
+             self,
+             by: Union[List[Union[Expr, str]], Expr, str],
+             *more_by: Union[Expr, str],
+             descending: Union[bool, List[bool]] = False,
+             nulls_last: Union[bool, List[bool]] = False,
+             multithreaded: bool = True,
+             maintain_order: bool = False,
+             description: Optional[str] = None,
+     ) -> "FlowFrame":
+         """
+         Sort the dataframe by the given columns.
          """
-         by = list(_parse_inputs_as_iterable((by,)))
+         initial_by_args = list(_parse_inputs_as_iterable((by,)))
          new_node_id = generate_node_id()
-         sort_expressions = by
+
+         sort_expressions_input: list = initial_by_args
          if more_by:
-             sort_expressions.extend(more_by)
+             sort_expressions_input.extend(list(_parse_inputs_as_iterable(more_by)))

-         # Determine if we need to use polars code fallback
-         needs_polars_code = False
+         all_processed_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_sort: List[str] = []
+         collected_raw_definitions: List[str] = []
+         column_names_for_native_node: List[str] = []

-         # Check for any expressions that are not simple columns
-         for expr in sort_expressions:
-             if not isinstance(expr, (str, Column)) or (
-                 isinstance(expr, Column) and expr._select_input.is_altered
-             ):
-                 needs_polars_code = True
-                 break
+         use_polars_code_path = False

-         # Also need polars code if we're using maintain_order or multithreaded params
          if maintain_order or not multithreaded:
-             needs_polars_code = True
-
-         # Standardize descending parameter
-         if isinstance(descending, (list, tuple)):
-             # Ensure descending list has the same length as sort_expressions
-             if len(descending) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of descending ({len(descending)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             descending_values = descending
-         else:
-             descending_values = [descending] * len(sort_expressions)
-
-         # Standardize nulls_last parameter
-         if isinstance(nulls_last, (list, tuple)):
-             if len(nulls_last) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of nulls_last ({len(nulls_last)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             nulls_last_values = nulls_last
-             # Any non-default nulls_last needs polars code
-             if any(val is not False for val in nulls_last_values):
-                 needs_polars_code = True
-         else:
-             nulls_last_values = [nulls_last] * len(sort_expressions)
-             # Non-default nulls_last needs polars code
-             if nulls_last:
-                 needs_polars_code = True
-
-         if needs_polars_code:
-             # Generate polars code for complex cases
-             code = self._generate_sort_polars_code(
-                 sort_expressions,
-                 descending_values,
-                 nulls_last_values,
-                 multithreaded,
-                 maintain_order,
-             )
-             self._add_polars_code(new_node_id, code, description)
-         else:
-             # Use native implementation for simple cases
-             sort_inputs = []
-             for i, expr in enumerate(sort_expressions):
-                 # Convert expr to column name
-                 if isinstance(expr, Column):
-                     column_name = expr.name
-                 elif isinstance(expr, str):
-                     column_name = expr
+             use_polars_code_path = True
+
+         is_nulls_last_list = isinstance(nulls_last, (list, tuple))
+         if is_nulls_last_list and any(val for val in nulls_last if val is not False):
+             use_polars_code_path = True
+         elif not is_nulls_last_list and nulls_last is not False:
+             use_polars_code_path = True
+
+         for expr_input in sort_expressions_input:
+             current_expr_obj: Expr
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 column_names_for_native_node.append(expr_input)
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column):
+                 current_expr_obj = expr_input
+                 # Type ignore below due to simplified Column stub
+                 if not expr_input._select_input.is_altered:  # type: ignore
+                     column_names_for_native_node.append(expr_input.column_name)  # type: ignore
+                     is_simple_col_for_native = True
                  else:
-                     column_name = str(expr)
+                     use_polars_code_path = True  # Altered Column implies complex expression
+             elif isinstance(expr_input, Expr):
+                 current_expr_obj = expr_input
+                 use_polars_code_path = True  # General Expr implies complex expression
+             else:  # Convert other types to lit
+                 current_expr_obj = lit(expr_input)
+                 use_polars_code_path = True  # Literal might be part of a complex sort for Polars code
+
+             all_processed_expr_objects.append(current_expr_obj)
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+             pure_polars_expr_strings_for_sort.append(pure_expr_str)
+
+             if raw_defs_str:
+                 if raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+                 use_polars_code_path = True
+
+             if not is_simple_col_for_native:  # If it wasn't a simple string or unaltered Column
+                 use_polars_code_path = True
+
+         desc_values = list(descending) if isinstance(descending, list) else [descending] * len(
+             all_processed_expr_objects)
+         null_last_values = list(nulls_last) if isinstance(nulls_last, list) else [nulls_last] * len(
+             all_processed_expr_objects)
+
+         if len(desc_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'descending' does not match the number of sort expressions.")
+         if len(null_last_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'nulls_last' does not match the number of sort expressions.")
+
+         if use_polars_code_path:
+             polars_operation_code = self._generate_sort_polars_code(
+                 pure_polars_expr_strings_for_sort, desc_values, null_last_values, multithreaded, maintain_order
+             )

-                 # Create SortByInput with appropriate settings
-                 sort_inputs.append(
-                     transform_schema.SortByInput(
-                         column=column_name,
-                         how="desc" if descending_values[i] else "asc",
-                     )
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
+             else:
+                 final_code_for_node = polars_operation_code
+
+             pl_expressions_for_fallback = [e.expr for e in all_processed_expr_objects if
+                                            hasattr(e, 'expr') and e.expr is not None]
+             kwargs_for_fallback = {
+                 "descending": desc_values[0] if len(desc_values) == 1 else desc_values,
+                 "nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
+                 "multithreaded": multithreaded, "maintain_order": maintain_order}
+
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="sort",
+                                   convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback,
+                                   kwargs_expr=kwargs_for_fallback)
+         else:
+             sort_inputs_for_node = []
+             for i, col_name_for_native in enumerate(column_names_for_native_node):
+                 sort_inputs_for_node.append(
+                     transform_schema.SortByInput(column=col_name_for_native, how="desc" if desc_values[i] else "asc")
+                     # type: ignore
                  )
-
              sort_settings = input_schema.NodeSort(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 sort_input=sort_inputs,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_id=self.node_id,
-                 description=description
-                 or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
-             )
+                 flow_id=self.flow_graph.flow_id, node_id=new_node_id, sort_input=sort_inputs_for_node,  # type: ignore
+                 pos_x=200, pos_y=150, is_setup=True, depending_on_id=self.node_id,
+                 description=description or f"Sort by {', '.join(column_names_for_native_node)}")
              self.flow_graph.add_sort(sort_settings)

          return self._create_child_frame(new_node_id)

-     def _generate_sort_polars_code(
-         self,
-         sort_expressions: list,
-         descending_values: list,
-         nulls_last_values: list,
-         multithreaded: bool,
-         maintain_order: bool,
-     ) -> str:
-         """Generate Polars code for sort operations that need fallback."""
-         # Format expressions for code
-         expr_strs = []
-         for expr in sort_expressions:
-             if isinstance(expr, (Expr, Column)):
-                 expr_strs.append(str(expr))
-             elif isinstance(expr, str):
-                 expr_strs.append(f"'{expr}'")
-             else:
-                 expr_strs.append(str(expr))
-
-         # Format parameters
-         if len(sort_expressions) == 1:
-             by_arg = expr_strs[0]
-         else:
-             by_arg = f"[{', '.join(expr_strs)}]"
-
-         # Build kwargs
-         kwargs = {}
-
-         # Only add descending if it's non-default
-         if any(d for d in descending_values):
-             if len(descending_values) == 1:
-                 kwargs["descending"] = descending_values[0]
-             else:
-                 kwargs["descending"] = descending_values
-
-         # Only add nulls_last if it's non-default
-         if any(nl for nl in nulls_last_values):
-             if len(nulls_last_values) == 1:
-                 kwargs["nulls_last"] = nulls_last_values[0]
+     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
+                          depending_on_ids: List[str] | None = None, convertable_to_code: bool = True,
+                          method_name: str = None, polars_expr: Expr | List[Expr] | None = None,
+                          group_expr: Expr | List[Expr] | None = None,
+                          kwargs_expr: Dict | None = None,
+                          group_kwargs: Dict | None = None, ):
+         polars_code_for_node: str
+         if not convertable_to_code or _contains_lambda_pattern(code):
+
+             effective_method_name = get_method_name_from_code(
+                 code) if method_name is None and "input_df." in code else method_name
+
+             pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
+             group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
+
+             _check_ok_for_serialization(polars_expr=pl_expr_list, method_name=effective_method_name,
+                                         group_expr=group_expr_list)
+
+             current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
+             result_lazyframe_or_expr: Any
+
+             if effective_method_name == "group_by":
+                 group_kwargs = {} if group_kwargs is None else group_kwargs
+                 if not group_expr_list:
+                     raise ValueError("group_expr is required for group_by method in serialization fallback.")
+                 target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
+                 if not pl_expr_list:
+                     raise ValueError(
+                         "Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback.")
+                 result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
+             elif effective_method_name:
+                 result_lazyframe_or_expr = getattr(self.data, effective_method_name)(*pl_expr_list,
+                                                                                      **current_kwargs_expr)
              else:
-                 kwargs["nulls_last"] = nulls_last_values
-
-         # Add other parameters if they're non-default
-         if not multithreaded:
-             kwargs["multithreaded"] = multithreaded
-
-         if maintain_order:
-             kwargs["maintain_order"] = maintain_order
-
-         # Build kwargs string
-         kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
-
-         # Build final code
-         if kwargs_str:
-             return f"input_df.sort({by_arg}, {kwargs_str})"
+                 raise ValueError(
+                     "Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback.")
+             try:
+                 if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
+                     serialized_value_for_code = result_lazyframe_or_expr.serialize(format='json')
+                     polars_code_for_node = "\n".join([
+                         f"serialized_value = r'''{serialized_value_for_code}'''",
+                         "buffer = BytesIO(serialized_value.encode('utf-8'))",
+                         "output_df = pl.LazyFrame.deserialize(buffer, format='json')",
+                     ])
+                     logger.warning(
+                         f"Transformation '{effective_method_name}' uses non-serializable elements. "
+                         "Falling back to serializing the resulting Polars LazyFrame object. "
+                         "This will result in a breaking graph when using the UI."
+                     )
+                 else:
+                     logger.error(
+                         f"Fallback for non-convertible code for method '{effective_method_name}' "
+                         f"resulted in a '{type(result_lazyframe_or_expr).__name__}' instead of a Polars LazyFrame. "
+                         "This type cannot be persisted as a LazyFrame node via this fallback."
+                     )
+                     return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
+             except Exception as e:
+                 logger.warning(
+                     f"Critical error: Could not serialize the result of operation '{effective_method_name}' "
+                     f"during fallback for non-convertible code. Error: {e}. "
+                     "When using a lambda function, consider defining the function first."
+                 )
+                 return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
          else:
-             return f"input_df.sort({by_arg})"
-
-     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
-                          depending_on_ids: List[str] | None = None):
+             polars_code_for_node = code
          polars_code_settings = input_schema.NodePolarsCode(
              flow_id=self.flow_graph.flow_id,
              node_id=new_node_id,
-             polars_code_input=transform_schema.PolarsCodeInput(polars_code=code),
+             polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code_for_node),
              is_setup=True,
              depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
              description=description,
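The static _generate_sort_polars_code helper only emits keyword arguments that differ from the Polars defaults, which keeps the generated node code minimal. A quick check against the definition above (the expression strings are assumed to be pre-rendered):

    code = FlowFrame._generate_sort_polars_code(
        pure_sort_expr_strs=["pl.col('a').abs()", "'b'"],
        descending_values=[True, False],
        nulls_last_values=[False, False],
        multithreaded=True,
        maintain_order=False,
    )
    print(code)
    # input_df.sort([pl.col('a').abs(), 'b'], descending=[True, False])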
@@ -469,14 +568,17 @@ class FlowFrame:
          validate : {"1:1", "1:m", "m:1", "m:m"}, optional
              Validate join relationship.
          nulls_equal:
-             Join on null values. By default null values will never produce matches.
+             Join on null values. By default, null values will never produce matches.
          coalesce:
              None: -> join specific.
              True: -> Always coalesce join columns.
              False: -> Never coalesce join columns.
          maintain_order:
-             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly setting this parameter, as your code may break in a future release. Not specifying any ordering can improve performance Supported for inner, left, right and full joins
-             None: No specific ordering is desired. The ordering might differ across Polars versions or even between different runs.
+             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
+             setting this parameter, as your code may break in a future release.
+             Not specifying any ordering can improve performance. Supported for inner, left, right and full joins.
+             None: No specific ordering is desired. The ordering might differ across Polars versions or even between
+             different runs.
          left: Preserves the order of the left DataFrame.
          right: Preserves the order of the right DataFrame.
          left_right: First preserves the order of the left DataFrame, then the right.
@@ -494,6 +596,7 @@ class FlowFrame:
                           nulls_equal is False and
                           validate is None and
                           suffix == '_right')
+
          join_mappings = None
          if self.flow_graph.flow_id != other.flow_graph.flow_id:
              combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
@@ -508,6 +611,7 @@ class FlowFrame:
              global node_id_counter
              node_id_counter += len(combined_graph.nodes)
          new_node_id = generate_node_id()
+
          if on is not None:
              left_columns = right_columns = _normalize_columns_to_list(on)
          elif left_on is not None and right_on is not None:
@@ -526,10 +630,11 @@ class FlowFrame:
              )
          if not use_polars_code:
              join_mappings, use_polars_code = _create_join_mappings(
-                 left_columns, right_columns
+                 left_columns or [], right_columns or []
              )

          if use_polars_code or suffix != '_right':
+
              _on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
              _left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
              _right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
@@ -549,31 +654,50 @@ class FlowFrame:
              parent_node_id=self.node_id,
          )

-         elif join_mappings:
+         elif join_mappings or how == 'cross':
+
              left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
              right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)

-             join_input = transform_schema.JoinInput(
-                 join_mapping=join_mappings,
-                 left_select=left_select.renames,
-                 right_select=right_select.renames,
-                 how=how,
-             )
+             if how == 'cross':
+                 join_input = transform_schema.CrossJoinInput(left_select=left_select.renames,
+                                                              right_select=right_select.renames,)
+             else:
+                 join_input = transform_schema.JoinInput(
+                     join_mapping=join_mappings,
+                     left_select=left_select.renames,
+                     right_select=right_select.renames,
+                     how=how,
+                 )
+
              join_input.auto_rename()
-             # Create node settings
-             join_settings = input_schema.NodeJoin(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 join_input=join_input,
-                 auto_generate_selection=True,
-                 verify_integrity=True,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_ids=[self.node_id, other.node_id],
-                 description=description or f"Join with {how} strategy",
-             )
-             self.flow_graph.add_join(join_settings)
+             if how == 'cross':
+                 cross_join_settings = input_schema.NodeCrossJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     cross_join_input=join_input,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                 )
+
+                 self.flow_graph.add_cross_join(cross_join_settings)
+             else:
+                 join_settings = input_schema.NodeJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     join_input=join_input,
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                     pos_x=200,
+                     pos_y=150,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                 )
+                 self.flow_graph.add_join(join_settings)
          self._add_connection(self.node_id, new_node_id, "main")
          other._add_connection(other.node_id, new_node_id, "right")
          result_frame = FlowFrame(
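Cross joins, which previously could only fall through to the generic join path, now get their own CrossJoinInput/NodeCrossJoin settings and are registered via add_cross_join. A hypothetical usage sketch (frame construction goes through create_from_any_type, shown earlier in this diff):

    left = FlowFrame({"a": [1, 2]})
    right = FlowFrame({"b": ["x", "y"]})
    pairs = left.join(right, how="cross")  # routed to add_cross_join, not add_join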
@@ -600,38 +724,68 @@ class FlowFrame:
          self.flow_graph.add_record_count(node_number_of_records)
          return self._create_child_frame(new_node_id)

-     def select(self, *columns, description: str = None):
+     def select(self, *columns: Union[str, Expr, Selector], description: Optional[str] = None) -> "FlowFrame":
          """
          Select columns from the frame.
-
-         Args:
-             *columns: Column names or expressions
-             description: Description of the step, this will be shown in the flowfile file
-
-         Returns:
-             A new FlowFrame with selected columns
          """
-         # Create new node ID
-         columns = _parse_inputs_as_iterable(columns)
+         columns_iterable = list(_parse_inputs_as_iterable(columns))
          new_node_id = generate_node_id()
-         existing_columns = self.columns

-         if (len(columns) == 1 and isinstance(columns[0], Expr)
-                 and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
+         if (len(columns_iterable) == 1 and isinstance(columns_iterable[0], Expr)
+                 and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"):
              return self._add_number_of_records(new_node_id, description)
-         if all(isinstance(col_, (str, Column)) for col_ in columns):
-
-             select_inputs = [
-                 transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
-                 for col_ in columns
-             ]
-             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_columns if
-                                c not in [s.old_name for s in select_inputs]]
-             select_inputs.extend(dropped_columns)
+
+         all_input_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_select: List[str] = []
+         collected_raw_definitions: List[str] = []
+         selected_col_names_for_native: List[str] = []  # For native node
+
+         can_use_native_node = True
+
+         if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == '*':
+             effective_columns_iterable = [col(c_name) for c_name in self.columns]
+         else:
+             effective_columns_iterable = columns_iterable
+         for expr_input in effective_columns_iterable:
+             current_expr_obj = expr_input
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 selected_col_names_for_native.append(expr_input)
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column) and not expr_input._select_input.is_altered:  # type: ignore
+                 selected_col_names_for_native.append(expr_input.column_name)  # type: ignore
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Selector):  # Selectors imply Polars code path
+                 can_use_native_node = False
+                 # current_expr_obj = expr_input  # Already an Expr-like via selector
+             elif not isinstance(expr_input, Expr):  # Includes Column
+                 current_expr_obj = lit(expr_input)
+
+             all_input_expr_objects.append(current_expr_obj)  # type: ignore
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+
+             pure_polars_expr_strings_for_select.append(pure_expr_str)
+             if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                 collected_raw_definitions.append(raw_defs_str)
+
+             if not is_simple_col_for_native and not isinstance(expr_input, Selector):
+                 can_use_native_node = False  # Complex expressions require Polars code
+         if collected_raw_definitions:  # Has to use Polars code if there are definitions
+             can_use_native_node = False
+         if can_use_native_node:
+             select_inputs_for_node = [transform_schema.SelectInput(old_name=name) for name in
+                                       selected_col_names_for_native]
+             existing_cols = self.columns
+             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_cols if
+                                c not in selected_col_names_for_native]
+             select_inputs_for_node.extend(dropped_columns)
              select_settings = input_schema.NodeSelect(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
-                 select_input=select_inputs,
+                 select_input=select_inputs_for_node,
                  keep_missing=False,
                  pos_x=200,
                  pos_y=100,
@@ -639,60 +793,97 @@ class FlowFrame:
                  depending_on_id=self.node_id,
                  description=description
              )
-
-             # Add to graph
              self.flow_graph.add_select(select_settings)
-             return self._create_child_frame(new_node_id)
-
          else:
-             readable_exprs = []
-             is_readable: bool = True
-             for col_ in columns:
-                 if isinstance(col_, Expr):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, Selector):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, pl.expr.Expr):
-                     print('warning this cannot be converted to flowfile frontend. Make sure you use the flowfile expr')
-                     is_readable = False
-                 elif isinstance(col_, str) and col_ in self.columns:
-                     col_expr = Column(col_)
-                     readable_exprs.append(col_expr)
-                 else:
-                     lit_expr = lit(col_)
-                     readable_exprs.append(lit_expr)
-             if is_readable:
-                 code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
+             polars_operation_code = f"input_df.select([{', '.join(pure_polars_expr_strings_for_select)}])"
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 raise ValueError('Not supported')
+                 final_code_for_node = polars_operation_code

-             self._add_polars_code(new_node_id, code, description)
-             return self._create_child_frame(new_node_id)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description,
+                                   method_name="select",
+                                   convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback)

-     def filter(self, predicate: Expr | Any = None, *, flowfile_formula: str = None, description: str = None):
+         return self._create_child_frame(new_node_id)
+
+     def filter(self, *predicates: Union[Expr, Any], flowfile_formula: Optional[str] = None,
+                description: Optional[str] = None, **constraints: Any) -> "FlowFrame":
          """
          Filter rows based on a predicate.
-
-         Args:
-             predicate: Filter condition
-             flowfile_formula: Native support in frontend
-             description: Description of the step that is performed
-         Returns:
-             A new FlowFrame with filtered rows
          """
+         if (len(predicates) > 0 or len(constraints) > 0) and flowfile_formula:
+             raise ValueError("You can only use one of the following: predicates, constraints or flowfile_formula")
+         available_columns = self.columns
          new_node_id = generate_node_id()
-         # Create new node ID
-         if predicate:
-             # we use for now the fallback on polars code.
-             if isinstance(predicate, Expr):
-                 predicate_expr = predicate
+         if len(predicates) > 0 or len(constraints) > 0:
+             all_input_expr_objects: List[Expr] = []
+             pure_polars_expr_strings: List[str] = []
+             collected_raw_definitions: List[str] = []
+
+             processed_predicates = []
+             for pred_item in predicates:
+                 if isinstance(pred_item, (tuple, list, Iterator)):
+                     # If it's a sequence, extend the processed_predicates with its elements
+                     processed_predicates.extend(list(pred_item))
+                 else:
+                     # Otherwise, just add the item
+                     processed_predicates.append(pred_item)
+
+             for pred_input in processed_predicates:  # Loop over the processed_predicates
+                 # End of the new/modified section
+                 current_expr_obj = None  # Initialize current_expr_obj
+                 if isinstance(pred_input, Expr):
+                     current_expr_obj = pred_input
+                 elif isinstance(pred_input, str) and pred_input in available_columns:
+                     current_expr_obj = col(pred_input)
+                 else:
+                     current_expr_obj = lit(pred_input)
+
+                 all_input_expr_objects.append(current_expr_obj)
+
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+
+             for k, v_val in constraints.items():
+                 constraint_expr_obj = (col(k) == lit(v_val))
+                 all_input_expr_objects.append(constraint_expr_obj)
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(
+                     constraint_expr_obj)  # Constraint exprs are unlikely to have defs
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:  # Should be rare here
+                     collected_raw_definitions.append(raw_defs_str)
+
+             filter_conditions_str = " & ".join(pure_polars_expr_strings) if pure_polars_expr_strings else "pl.lit(True)"
+             polars_operation_code = f"input_df.filter({filter_conditions_str})"
+
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 predicate_expr = lit(predicate)
-             code = f"input_df.filter({str(predicate_expr)})"
-             self._add_polars_code(new_node_id, code, description)
-
+                 final_code_for_node = polars_operation_code
+
+             convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="filter",
+                                   convertable_to_code=convertable_to_code,
+                                   polars_expr=pl_expressions_for_fallback)
          elif flowfile_formula:
-             # Create node settings
              filter_settings = input_schema.NodeFilter(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
@@ -706,8 +897,10 @@ class FlowFrame:
                  depending_on_id=self.node_id,
                  description=description
              )
-
              self.flow_graph.add_filter(filter_settings)
+         else:
+             logger.info("Filter called with no arguments; creating a pass-through Polars code node.")
+             self._add_polars_code(new_node_id, "output_df = input_df", description or "No-op filter", method_name=None)

          return self._create_child_frame(new_node_id)

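filter now mirrors the Polars calling convention: any number of positional predicates plus keyword equality constraints, all AND-ed together. A sketch with assumed column names:

    adults_nl = df.filter(col("age") > 21, country="NL")
    # generated node code, roughly:
    #   input_df.filter((pl.col('age') > 21) & (pl.col('country') == pl.lit('NL')))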
@@ -792,7 +985,7 @@ class FlowFrame:
              if convert_to_absolute_path:
                  output_settings.directory = output_settings.abs_file_path
          except Exception as e:
-             print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+             logger.warning(f"Could not determine absolute path for {file_str}: {e}")

          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -820,7 +1013,7 @@ class FlowFrame:

              # Use sink_parquet for LazyFrames
              code = f"input_df.sink_parquet({args_str})"
-             print(f"Generated Polars Code: {code}")
+             logger.debug(f"Generated Polars Code: {code}")
              self._add_polars_code(new_node_id, code, description)

          return self._create_child_frame(new_node_id)
@@ -868,7 +1061,7 @@ class FlowFrame:
              if convert_to_absolute_path:
                  output_settings.directory = output_settings.abs_file_path
          except Exception as e:
-             print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+             logger.warning(f"Could not determine absolute path for {file_str}: {e}")

          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -901,7 +1094,7 @@ class FlowFrame:
                  args_str += f", {kwargs_repr}"

              code = f"input_df.collect().write_csv({args_str})"
-             print(f"Generated Polars Code: {code}")
+             logger.debug(f"Generated Polars Code: {code}")
              self._add_polars_code(new_node_id, code, description)

          return self._create_child_frame(new_node_id)
@@ -954,10 +1147,10 @@ class FlowFrame:
          self.flow_graph.apply_layout()
          self.flow_graph.save_flow(file_path)

-     def collect(self):
+     def collect(self, *args, **kwargs):
          """Collect lazy data into memory."""
          if hasattr(self.data, "collect"):
-             return self.data.collect()
+             return self.data.collect(*args, **kwargs)
          return self.data

      def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
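collect is now a pure pass-through, so callers can forward any keyword the installed Polars version accepts, for example (flag availability depends on the Polars version):

    result = df.collect(streaming=True)  # forwarded verbatim to pl.LazyFrame.collect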
@@ -1278,9 +1471,10 @@ class FlowFrame:
                  f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
              global node_id_counter
              node_id_counter += len(combined_graph.nodes)
+         else:
+             combined_graph = self.flow_graph
          new_node_id = generate_node_id()
          use_native = how == "diagonal_relaxed" and parallel and not rechunk
-
          if use_native:
              # Create union input for the transform schema
              union_input = transform_schema.UnionInput(
@@ -1314,7 +1508,6 @@ class FlowFrame:
                  input_vars.append(f"input_df_{i+2}")

              frames_list = f"[{', '.join(input_vars)}]"
-
              code = f"""
  # Perform concat operation
  output_df = pl.concat(
@@ -1324,19 +1517,20 @@ class FlowFrame:
      parallel={parallel}
  )
  """
-
+             self.flow_graph = combined_graph

              # Add polars code node with dependencies on all input frames
              depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
              self._add_polars_code(
                  new_node_id, code, description, depending_on_ids=depending_on_ids
              )
-
              # Add connections to ensure all frames are available
              self._add_connection(self.node_id, new_node_id, "main")
+
              for other_frame in others:
-                 other_frame._add_connection(other_frame.node_id, new_node_id, "main")

+                 other_frame.flow_graph = combined_graph
+                 other_frame._add_connection(other_frame.node_id, new_node_id, "main")
          # Create and return the new frame
          return FlowFrame(
              data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
@@ -1373,7 +1567,7 @@ class FlowFrame:
              return False, None

          # Extract the output name
-         output_name = expr.name
+         output_name = expr.column_name

          if ".over(" not in expr._repr_str:
              # Simple cumulative count can be implemented as a record ID with offset=1
@@ -1456,62 +1650,70 @@ class FlowFrame:
          return False, None

      def with_columns(
-         self,
-         exprs: Expr | List[Expr | None] = None,
-         *,
-         flowfile_formulas: Optional[List[str]] = None,
-         output_column_names: Optional[List[str]] = None,
-         description: Optional[str] = None,
+             self,
+             *exprs: Union[Expr, Iterable[Expr], Any],  # Allow Any for implicit lit conversion
+             flowfile_formulas: Optional[List[str]] = None,
+             output_column_names: Optional[List[str]] = None,
+             description: Optional[str] = None,
+             **named_exprs: Union[Expr, Any],  # Allow Any for implicit lit conversion
      ) -> "FlowFrame":
          """
-         Add multiple columns to the DataFrame.
-
-         Parameters
-         ----------
-         exprs : Expr or List[Expr], optional
-             Expressions to evaluate as new columns
-         flowfile_formulas : List[str], optional
-             Alternative approach using flowfile formula syntax
-         output_column_names : List[str], optional
-             Column names for the flowfile formulas
-         description : str, optional
-             Description of this operation for the ETL graph
-
-         Returns
-         -------
-         FlowFrame
-             A new FlowFrame with the columns added
-
-         Raises
-         ------
-         ValueError
-             If neither exprs nor flowfile_formulas with output_column_names are provided,
-             or if the lengths of flowfile_formulas and output_column_names don't match
+         Add or replace columns in the DataFrame.
          """
-         if exprs is not None:
-             new_node_id = generate_node_id()
-             exprs_iterable = _parse_inputs_as_iterable((exprs,))
+         new_node_id = generate_node_id()

-             if len(exprs_iterable) == 1:
-                 detected, result = self._detect_cum_count_record_id(
-                     exprs_iterable[0], new_node_id, description
-                 )
-                 if detected:
-                     return result
-             all_expressions = []
-             for expression in exprs_iterable:
-                 if not isinstance(expression, (Expr, Column)):
-                     all_expressions.append(lit(expression))
-                 else:
-                     all_expressions.append(expression)
+         all_input_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_wc: List[str] = []
+         collected_raw_definitions: List[str] = []
+
+         has_exprs_or_named_exprs = bool(exprs or named_exprs)
+         if has_exprs_or_named_exprs:
+             actual_exprs_to_process: List[Expr] = []
+             temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
+
+             for item in temp_exprs_iterable:
+                 if isinstance(item, Expr):
+                     actual_exprs_to_process.append(item)
+                 else:  # auto-lit for non-Expr positional args
+                     actual_exprs_to_process.append(lit(item))
+
+             for name, val_expr in named_exprs.items():
+                 if isinstance(val_expr, Expr):
+                     actual_exprs_to_process.append(val_expr.alias(name))  # type: ignore # Assuming Expr has alias
+                 else:  # auto-lit for named args and then alias
+                     actual_exprs_to_process.append(lit(val_expr).alias(name))  # type: ignore
+
+             if len(actual_exprs_to_process) == 1 and isinstance(actual_exprs_to_process[0], Expr):
+                 pass
+
+             for current_expr_obj in actual_exprs_to_process:
+                 all_input_expr_objects.append(current_expr_obj)
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                 pure_polars_expr_strings_for_wc.append(pure_expr_str)  # with_columns takes individual expressions
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+
+             polars_operation_code = f"input_df.with_columns([{', '.join(pure_polars_expr_strings_for_wc)}])"
+
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
+             else:
+                 final_code_for_node = polars_operation_code

-             code = (
-                 f"input_df.with_columns({', '.join(str(e) for e in all_expressions)})"
-             )
-             self._add_polars_code(new_node_id, code, description)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name='with_columns',
+                                   convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback)
              return self._create_child_frame(new_node_id)

          elif flowfile_formulas is not None and output_column_names is not None:
+
              if len(output_column_names) != len(flowfile_formulas):
                  raise ValueError(
                      "Length of both the formulas and the output columns names must be identical"
@@ -1524,9 +1726,7 @@ class FlowFrame:
                  ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
              return ff
          else:
-             raise ValueError(
-                 "Either exprs or flowfile_formulas with output_column_names must be provided"
-             )
+             raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")

      def with_row_index(
          self, name: str = "index", offset: int = 0, description: str = None
@@ -1614,26 +1814,27 @@ class FlowFrame:

          if isinstance(columns, (list, tuple)):
              all_columns.extend(
-                 [col.name if isinstance(col, Column) else col for col in columns]
+                 [col.column_name if isinstance(col, Column) else col for col in columns]
              )
          else:
-             all_columns.append(columns.name if isinstance(columns, Column) else columns)
+             all_columns.append(columns.column_name if isinstance(columns, Column) else columns)

          if more_columns:
              for col in more_columns:
-                 all_columns.append(col.name if isinstance(col, Column) else col)
+                 all_columns.append(col.column_name if isinstance(col, Column) else col)

          if len(all_columns) == 1:
-             columns_str = f"'{all_columns[0]}'"
+
+             columns_str = stringify_values(all_columns[0])
          else:
-             columns_str = "[" + ", ".join([f"'{col}'" for col in all_columns]) + "]"
+             columns_str = "[" + ", ".join([stringify_values(col) for col in all_columns]) + "]"

          code = f"""
  # Explode columns into multiple rows
  output_df = input_df.explode({columns_str})
  """

-         cols_desc = ", ".join(all_columns)
+         cols_desc = ", ".join(str(s) for s in all_columns)
          desc = description or f"Explode column(s): {cols_desc}"

          # Add polars code node
@@ -1676,7 +1877,7 @@ class FlowFrame:
1676
1877
  new_node_id = generate_node_id()
1677
1878
 
1678
1879
  if isinstance(column, Column):
1679
- column_name = column.name
1880
+ column_name = column.column_name
1680
1881
  else:
1681
1882
  column_name = column
1682
1883
 
@@ -1760,7 +1961,7 @@ class FlowFrame:
1760
1961
  if col_expr._select_input.is_altered:
1761
1962
  can_use_native = False
1762
1963
  break
1763
- processed_subset.append(col_expr.name)
1964
+ processed_subset.append(col_expr.column_name)
1764
1965
  else:
1765
1966
  can_use_native = False
1766
1967
  break
@@ -1848,650 +2049,34 @@ class FlowFrame:
1848
2049
  """Get the number of columns."""
1849
2050
  return self.data.width
1850
2051
 
2052
+ def __contains__(self, key):
2053
+ """This special method enables the 'in' operator to work with FlowFrame objects."""
2054
+ return key in self.data
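
__contains__ simply forwards to the wrapped frame, so membership tests read naturally, assuming the underlying polars frame supports the in operator (polars frames that implement __contains__ check column names):

    import flowfile_frame as ff

    df = ff.from_dict({"a": [1], "b": [2]})
    assert "a" in df
    assert "z" not in df
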
1851
2055
 
1852
- def _add_delegated_methods():
1853
- """Add delegated methods from polars LazyFrame."""
1854
- delegate_methods = [
1855
- "collect_async",
1856
- "profile",
1857
- "describe",
1858
- "explain",
1859
- "show_graph",
1860
- "serialize",
1861
- "fetch",
1862
- "get_meta",
1863
- "columns",
1864
- "dtypes",
1865
- "schema",
1866
- "estimated_size",
1867
- "n_chunks",
1868
- "is_empty",
1869
- "chunk_lengths",
1870
- "optimization_toggle",
1871
- "set_polars_options",
1872
- "collect_schema"
1873
- ]
1874
-
1875
- already_implemented = set(dir(FlowFrame))
1876
-
1877
- for method_name in delegate_methods:
1878
- if method_name not in already_implemented and hasattr(
1879
- pl.LazyFrame, method_name
1880
- ):
1881
- # Create a simple delegate method
1882
- def make_delegate(name):
1883
- def delegate_method(self, *args, **kwargs):
1884
- return getattr(self.data, name)(*args, **kwargs)
1885
-
1886
- # Set docstring and name
1887
- delegate_method.__doc__ = (
1888
- f"See pl.LazyFrame.{name} for full documentation."
1889
- )
1890
- delegate_method.__name__ = name
1891
- return delegate_method
1892
-
1893
- # Add the method to the class
1894
- setattr(FlowFrame, method_name, make_delegate(method_name))
1895
-
1896
-
1897
- _add_delegated_methods()
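
One detail of this removed block deserves a note before it goes: the make_delegate factory exists because Python closures late-bind loop variables, so building the delegate inline over method_name would leave every generated method calling the last name in the list. A self-contained illustration of the pitfall the factory avoided:

    # Late binding: both lambdas see the final value of the loop variable.
    fns = [lambda: name for name in ("a", "b")]
    print([f() for f in fns])               # ['b', 'b']

    # Per-iteration binding via a factory, as make_delegate did:
    def make(name):
        return lambda: name

    fns = [make(name) for name in ("a", "b")]
    print([f() for f in fns])               # ['a', 'b']
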
1898
-
1899
-
1900
- def sum(expr):
1901
- """Sum aggregation function."""
1902
- if isinstance(expr, str):
1903
- expr = col(expr)
1904
- return expr.sum()
1905
-
1906
-
1907
- def mean(expr):
1908
- """Mean aggregation function."""
1909
- if isinstance(expr, str):
1910
- expr = col(expr)
1911
- return expr.mean()
1912
-
1913
-
1914
- def min(expr):
1915
- """Min aggregation function."""
1916
- if isinstance(expr, str):
1917
- expr = col(expr)
1918
- return expr.min()
1919
-
1920
-
1921
- def max(expr):
1922
- """Max aggregation function."""
1923
- if isinstance(expr, str):
1924
- expr = col(expr)
1925
- return expr.max()
1926
-
1927
-
1928
- def count(expr):
1929
- """Count aggregation function."""
1930
- if isinstance(expr, str):
1931
- expr = col(expr)
1932
- return expr.count()
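
These removed module-level helpers all followed one pattern: promote a bare string to col(...), then call the matching expression method. Note that they shadowed the sum, min, and max builtins inside this module, a good reason to use them behind an import alias. Usage as it would have looked, with select assumed available on FlowFrame:

    import flowfile_frame as ff

    df = ff.from_dict({"a": [1, 2, 3]})
    totals = df.select(ff.sum("a"))     # equivalent to df.select(ff.col("a").sum())
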
1933
-
1934
-
1935
- def read_csv(
1936
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
1937
- *,
1938
- flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
1939
- separator: str = ',',
1940
- convert_to_absolute_path: bool = True,
1941
- description: Optional[str] = None,
1942
- has_header: bool = True,
1943
- new_columns: Optional[List[str]] = None,
1944
- comment_prefix: Optional[str] = None,
1945
- quote_char: Optional[str] = '"',
1946
- skip_rows: int = 0,
1947
- skip_lines: int = 0,
1948
- schema: Optional[SchemaDict] = None,
1949
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
1950
- null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
1951
- missing_utf8_is_empty_string: bool = False,
1952
- ignore_errors: bool = False,
1953
- try_parse_dates: bool = False,
1954
- infer_schema: bool = True,
1955
- infer_schema_length: Optional[int] = 100,
1956
- n_rows: Optional[int] = None,
1957
- encoding: CsvEncoding = 'utf8',
1958
- low_memory: bool = False,
1959
- rechunk: bool = False,
1960
- storage_options: Optional[Dict[str, Any]] = None,
1961
- skip_rows_after_header: int = 0,
1962
- row_index_name: Optional[str] = None,
1963
- row_index_offset: int = 0,
1964
- eol_char: str = '\n',
1965
- raise_if_empty: bool = True,
1966
- truncate_ragged_lines: bool = False,
1967
- decimal_comma: bool = False,
1968
- glob: bool = True,
1969
- cache: bool = True,
1970
- with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
1971
- **other_options: Any
1972
- ) -> FlowFrame:
1973
- """
1974
- Read a CSV file into a FlowFrame.
1975
-
1976
- This function uses the native FlowGraph implementation when the parameters
1977
- fall within the supported range, and falls back to using Polars' scan_csv implementation
1978
- for more advanced features.
1979
-
1980
- Args:
1981
- source: Path(s) to CSV file(s), or a file-like object.
1982
- flow_graph: if you want to add it to an existing graph
1983
- separator: Single byte character to use as separator in the file.
1984
- convert_to_absolute_path: If the path needs to be set to a fixed location
1985
- description: if you want to add a readable name in the frontend (advised)
1986
-
1987
- # Polars.scan_csv aligned parameters
1988
- has_header: Indicate if the first row of the dataset is a header or not.
1989
- new_columns: Rename columns after selection.
1990
- comment_prefix: String that indicates a comment line if found at beginning of line.
1991
- quote_char: Character used for quoting. None to disable.
1992
- skip_rows: Start reading after this many rows.
1993
- skip_lines: Skip this many lines by newline char only.
1994
- schema: Schema to use when reading the CSV.
1995
- schema_overrides: Schema overrides for specific columns.
1996
- null_values: Values to interpret as null.
1997
- missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
1998
- ignore_errors: Try to keep reading lines if some parsing errors occur.
1999
- try_parse_dates: Try to automatically parse dates.
2000
- infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
2001
- infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
2002
- n_rows: Stop reading after this many rows.
2003
- encoding: Character encoding to use.
2004
- low_memory: Reduce memory usage at the cost of performance.
2005
- rechunk: Ensure data is in contiguous memory layout after parsing.
2006
- storage_options: Options for fsspec for cloud storage.
2007
- skip_rows_after_header: Skip rows after header.
2008
- row_index_name: Name of the row index column.
2009
- row_index_offset: Start value for the row index.
2010
- eol_char: End of line character.
2011
- raise_if_empty: Raise error if file is empty.
2012
- truncate_ragged_lines: Truncate lines with too many values.
2013
- decimal_comma: Parse floats with decimal comma.
2014
- glob: Use glob pattern for file path (if source is a string).
2015
- cache: Cache the result after reading (Polars default True).
2016
- with_column_names: Apply a function over the column names.
2017
- other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
2018
-
2019
- Returns:
2020
- A FlowFrame with the CSV data.
2021
- """
2022
- node_id = generate_node_id() # Assuming generate_node_id is defined
2023
- if flow_graph is None:
2024
- flow_graph = create_flow_graph() # Assuming create_flow_graph is defined
2025
- flow_id = flow_graph.flow_id
2026
-
2027
- current_source_path_for_native = None
2028
- if isinstance(source, (str, os.PathLike)):
2029
- current_source_path_for_native = str(source)
2030
- if '~' in current_source_path_for_native:
2031
- current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
2032
- elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
2033
- current_source_path_for_native = str(source[0]) if source else None
2034
- if current_source_path_for_native and '~' in current_source_path_for_native:
2035
- current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
2036
- elif isinstance(source, (io.BytesIO, io.StringIO)):
2037
- logger.warning("Read from bytes io from csv not supported, converting data to raw data")
2038
- return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
2039
- actual_infer_schema_length: Optional[int]
2040
- if not infer_schema:
2041
- actual_infer_schema_length = 0
2042
- else:
2043
- actual_infer_schema_length = infer_schema_length
2044
- can_use_native = (
2045
- current_source_path_for_native is not None and
2046
- comment_prefix is None and
2047
- skip_lines == 0 and
2048
- schema is None and
2049
- schema_overrides is None and
2050
- null_values is None and
2051
- not missing_utf8_is_empty_string and
2052
- not try_parse_dates and
2053
- n_rows is None and
2054
- not low_memory and
2055
- not rechunk and
2056
- storage_options is None and
2057
- skip_rows_after_header == 0 and
2058
- row_index_name is None and
2059
- row_index_offset == 0 and
2060
- eol_char == '\n' and
2061
- not decimal_comma and
2062
- new_columns is None and
2063
- glob is True
2064
- )
2065
- if can_use_native and current_source_path_for_native:
2066
- received_table = input_schema.ReceivedTable(
2067
- file_type='csv',
2068
- path=current_source_path_for_native,
2069
- name=Path(current_source_path_for_native).name,
2070
- delimiter=separator,
2071
- has_headers=has_header,
2072
- encoding=encoding,
2073
- starting_from_line=skip_rows,
2074
- quote_char=quote_char if quote_char is not None else '"',
2075
- infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
2076
- truncate_ragged_lines=truncate_ragged_lines,
2077
- ignore_errors=ignore_errors,
2078
- row_delimiter=eol_char
2079
- )
2080
- if convert_to_absolute_path:
2081
- try:
2082
- received_table.set_absolute_filepath()
2083
- received_table.path = received_table.abs_file_path
2084
- except Exception as e:
2085
- print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
2056
+ def __bool__(self):
2057
+ """This special method determines how the object behaves in boolean contexts.
2058
+ Returns True if the FlowFrame contains any data, False otherwise."""
2059
+ return bool(self.data)
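
__bool__ delegates to bool(self.data); whether that actually reflects "contains any data" depends on the truthiness semantics of the wrapped frame, which this diff does not show (a wrapped object without its own __bool__ would always be truthy). A sketch of the intent only:

    import flowfile_frame as ff

    df = ff.from_dict({"a": [1]})
    if df:                      # calls FlowFrame.__bool__ -> bool(self.data)
        print("frame is truthy")
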
2086
2060
 
2087
- read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
2088
- read_node = input_schema.NodeRead(
2089
- flow_id=flow_id,
2090
- node_id=node_id,
2091
- received_file=received_table,
2092
- pos_x=100,
2093
- pos_y=100,
2094
- is_setup=True,
2095
- description=read_node_description
2096
- )
2097
- flow_graph.add_read(read_node)
2098
- result_frame = FlowFrame(
2099
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2100
- flow_graph=flow_graph,
2101
- node_id=node_id
2102
- )
2103
- return result_frame
2104
- else:
2105
- polars_source_arg = source
2106
- polars_code = _build_polars_code_args(
2107
- source=polars_source_arg,
2108
- separator=separator,
2109
- has_header=has_header,
2110
- new_columns=new_columns,
2111
- comment_prefix=comment_prefix,
2112
- quote_char=quote_char,
2113
- skip_rows=skip_rows,
2114
- skip_lines=skip_lines,
2115
- schema=schema,
2116
- schema_overrides=schema_overrides,
2117
- null_values=null_values,
2118
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
2119
- ignore_errors=ignore_errors,
2120
- try_parse_dates=try_parse_dates,
2121
- infer_schema_length=actual_infer_schema_length,
2122
- n_rows=n_rows,
2123
- encoding=encoding,
2124
- low_memory=low_memory,
2125
- rechunk=rechunk,
2126
- storage_options=storage_options,
2127
- skip_rows_after_header=skip_rows_after_header,
2128
- row_index_name=row_index_name,
2129
- row_index_offset=row_index_offset,
2130
- eol_char=eol_char,
2131
- raise_if_empty=raise_if_empty,
2132
- truncate_ragged_lines=truncate_ragged_lines,
2133
- decimal_comma=decimal_comma,
2134
- glob=glob,
2135
- cache=cache,
2136
- with_column_names=with_column_names,
2137
- **other_options
2138
- )
2139
- polars_code_node_description = description or "Read CSV with Polars scan_csv"
2140
- if isinstance(source, (str, os.PathLike)):
2141
- polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
2142
- elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
2143
- polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
2144
-
2145
- # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
2146
- polars_code_settings = input_schema.NodePolarsCode(
2147
- flow_id=flow_id,
2148
- node_id=node_id,
2149
- polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
2150
- is_setup=True,
2151
- description=polars_code_node_description
2152
- )
2153
- flow_graph.add_polars_code(polars_code_settings)
2154
- return FlowFrame(
2155
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2156
- flow_graph=flow_graph,
2157
- node_id=node_id,
2158
- )
2159
-
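
The removed read_csv makes the native/fallback split concrete: the long can_use_native conjunction admits only a path-like source with near-default parameters, and anything outside that subset (a schema, n_rows, row-index options, and so on) is rewritten into a generated pl.scan_csv polars-code node instead. Illustrative calls against that logic, with the file path hypothetical:

    import flowfile_frame as ff

    # Stays on the native read node: all parameters are inside the supported subset.
    df = ff.read_csv("data.csv", separator=";", has_header=True)

    # Falls back to a scan_csv code node: can_use_native requires n_rows is None.
    head_df = ff.read_csv("data.csv", n_rows=100)
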
2160
- def _build_polars_code_args(
2161
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
2162
- separator: str,
2163
- has_header: bool,
2164
- new_columns: Optional[List[str]],
2165
- comment_prefix: Optional[str],
2166
- quote_char: Optional[str],
2167
- skip_rows: int,
2168
- skip_lines: int,
2169
- schema: Optional[SchemaDict],
2170
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
2171
- null_values: Optional[Union[str, List[str], Dict[str, str]]],
2172
- missing_utf8_is_empty_string: bool,
2173
- ignore_errors: bool,
2174
- try_parse_dates: bool,
2175
- infer_schema_length: Optional[int],
2176
- n_rows: Optional[int],
2177
- encoding: CsvEncoding,
2178
- low_memory: bool,
2179
- rechunk: bool,
2180
- storage_options: Optional[Dict[str, Any]],
2181
- skip_rows_after_header: int,
2182
- row_index_name: Optional[str],
2183
- row_index_offset: int,
2184
- eol_char: str,
2185
- raise_if_empty: bool,
2186
- truncate_ragged_lines: bool,
2187
- decimal_comma: bool,
2188
- glob: bool,
2189
- cache: bool,
2190
- with_column_names: Optional[Callable[[List[str]], List[str]]],
2191
- **other_options: Any
2192
- ) -> str:
2193
- source_repr: str
2194
- if isinstance(source, (str, Path)):
2195
- source_repr = repr(str(source))
2196
- elif isinstance(source, list):
2197
- source_repr = repr([str(p) for p in source])
2198
- elif isinstance(source, bytes):
2199
- source_repr = "source_bytes_obj"
2200
- elif hasattr(source, 'read'):
2201
- source_repr = "source_file_like_obj"
2202
- else:
2203
- source_repr = repr(source)
2204
-
2205
- param_mapping = {
2206
- 'has_header': (True, lambda x: str(x)),
2207
- 'separator': (',', lambda x: repr(str(x))),
2208
- 'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
2209
- 'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
2210
- 'skip_rows': (0, str),
2211
- 'skip_lines': (0, str),
2212
- 'schema': (None, lambda x: repr(x) if x is not None else 'None'),
2213
- 'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
2214
- 'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
2215
- 'missing_utf8_is_empty_string': (False, str),
2216
- 'ignore_errors': (False, str),
2217
- 'cache': (True, str),
2218
- 'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
2219
- 'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
2220
- 'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
2221
- 'encoding': ('utf8', lambda x: repr(str(x))),
2222
- 'low_memory': (False, str),
2223
- 'rechunk': (False, str),
2224
- 'skip_rows_after_header': (0, str),
2225
- 'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
2226
- 'row_index_offset': (0, str),
2227
- 'try_parse_dates': (False, str),
2228
- 'eol_char': ('\n', lambda x: repr(str(x))),
2229
- 'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
2230
- 'raise_if_empty': (True, str),
2231
- 'truncate_ragged_lines': (False, str),
2232
- 'decimal_comma': (False, str),
2233
- 'glob': (True, str),
2234
- 'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
2235
- }
2236
-
2237
- all_vars = locals()
2238
- kwargs_list = []
2239
-
2240
- for param_name_key, (default_value, format_func) in param_mapping.items():
2241
- value = all_vars.get(param_name_key)
2242
- formatted_value = format_func(value)
2243
- kwargs_list.append(f"{param_name_key}={formatted_value}")
2244
-
2245
- if other_options:
2246
- for k, v in other_options.items():
2247
- kwargs_list.append(f"{k}={repr(v)}")
2248
-
2249
- kwargs_str = ",\n ".join(kwargs_list)
2250
-
2251
- if kwargs_str:
2252
- polars_code = f"output_df = pl.scan_csv(\n {source_repr},\n {kwargs_str}\n)"
2253
- else:
2254
- polars_code = f"output_df = pl.scan_csv({source_repr})"
2255
-
2256
- return polars_code
2257
-
2258
-
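
Two quirks of the removed builder are worth recording. First, the default values stored in param_mapping were never compared against the incoming values, so the generated call spelled out every mapped keyword even when it matched the default. Second, non-path sources were replaced by placeholder identifiers such as source_file_like_obj that the emitted code never defines. The generated string had this shape (illustrative values):

    # output_df = pl.scan_csv(
    #     'data.csv',
    #     has_header=True,
    #     separator=';',
    #     comment_prefix=None,
    #     ...                  # every keyword in param_mapping, defaults included
    # )
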
2259
- def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
2260
- convert_to_absolute_path: bool = True, **options) -> FlowFrame:
2261
- """
2262
- Read a Parquet file into a FlowFrame.
2263
-
2264
- Args:
2265
- file_path: Path to Parquet file
2266
- flow_graph: if you want to add it to an existing graph
2267
- description: if you want to add a readable name in the frontend (advised)
2268
- convert_to_absolute_path: If the path needs to be set to a fixed location
2269
- **options: Options for polars.read_parquet
2270
-
2271
- Returns:
2272
- A FlowFrame with the Parquet data
2273
- """
2274
- if '~' in file_path:
2275
- file_path = os.path.expanduser(file_path)
2276
- node_id = generate_node_id()
2277
-
2278
- if flow_graph is None:
2279
- flow_graph = create_flow_graph()
2280
-
2281
- flow_id = flow_graph.flow_id
2282
-
2283
- received_table = input_schema.ReceivedTable(
2284
- file_type='parquet',
2285
- path=file_path,
2286
- name=Path(file_path).name,
2287
- )
2288
- if convert_to_absolute_path:
2289
- received_table.path = received_table.abs_file_path
2290
-
2291
- read_node = input_schema.NodeRead(
2292
- flow_id=flow_id,
2293
- node_id=node_id,
2294
- received_file=received_table,
2295
- pos_x=100,
2296
- pos_y=100,
2297
- is_setup=True,
2298
- description=description
2299
- )
2300
-
2301
- flow_graph.add_read(read_node)
2302
-
2303
- return FlowFrame(
2304
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2305
- flow_graph=flow_graph,
2306
- node_id=node_id
2307
- )
2308
-
2309
-
2310
- def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
2311
- """
2312
- Create a FlowFrame from a dictionary or list of dictionaries.
2313
-
2314
- Args:
2315
- data: Dictionary of lists or list of dictionaries
2316
- flow_graph: if you want to add it to an existing graph
2317
- description: if you want to add a readable name in the frontend (advised)
2318
- Returns:
2319
- A FlowFrame with the data
2320
- """
2321
- # Create new node ID
2322
- node_id = generate_node_id()
2323
-
2324
- if not flow_graph:
2325
- flow_graph = create_flow_graph()
2326
- flow_id = flow_graph.flow_id
2327
-
2328
- input_node = input_schema.NodeManualInput(
2329
- flow_id=flow_id,
2330
- node_id=node_id,
2331
- raw_data=FlowDataEngine(data).to_pylist(),
2332
- pos_x=100,
2333
- pos_y=100,
2334
- is_setup=True,
2335
- description=description
2336
- )
2337
-
2338
- # Add to graph
2339
- flow_graph.add_manual_input(input_node)
2340
-
2341
- # Return new frame
2342
- return FlowFrame(
2343
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2344
- flow_graph=flow_graph,
2345
- node_id=node_id
2346
- )
2347
-
2348
-
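
As removed here, from_dict was the eager entry point: it pushed the data through FlowDataEngine(data).to_pylist() into a manual-input node, so the payload is materialized up front rather than scanned lazily. Usage as it would have looked:

    import flowfile_frame as ff

    df = ff.from_dict({"name": ["Alice", "Bob"], "age": [30, 25]})
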
2349
- def concat(frames: List['FlowFrame'],
2350
- how: str = 'vertical',
2351
- rechunk: bool = False,
2352
- parallel: bool = True,
2353
- description: str = None) -> 'FlowFrame':
2354
- """
2355
- Concatenate multiple FlowFrames into one.
2356
-
2357
- Parameters
2358
- ----------
2359
- frames : List[FlowFrame]
2360
- List of FlowFrames to concatenate
2361
- how : str, default 'vertical'
2362
- How to combine the FlowFrames (see concat method documentation)
2363
- rechunk : bool, default False
2364
- Whether to ensure contiguous memory in result
2365
- parallel : bool, default True
2366
- Whether to use parallel processing for the operation
2367
- description : str, optional
2368
- Description of this operation
2369
-
2370
- Returns
2371
- -------
2372
- FlowFrame
2373
- A new FlowFrame with the concatenated data
2374
- """
2375
- if not frames:
2376
- raise ValueError("No frames provided to concat_frames")
2377
-
2378
- if len(frames) == 1:
2379
- return frames[0]
2380
-
2381
- # Use first frame's concat method with remaining frames
2382
- first_frame = frames[0]
2383
- remaining_frames = frames[1:]
2384
-
2385
- return first_frame.concat(remaining_frames, how=how,
2386
- rechunk=rechunk, parallel=parallel,
2387
- description=description)
2388
-
2389
-
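
concat was a thin wrapper: validate the list, short-circuit a single frame, then delegate to frames[0].concat(frames[1:], ...). (Its empty-list error still said "concat_frames", a leftover from an earlier name.) Usage as it would have looked:

    import flowfile_frame as ff

    df_a = ff.from_dict({"a": [1]})
    df_b = ff.from_dict({"a": [2]})
    combined = ff.concat([df_a, df_b], how="vertical")
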
2390
- def scan_csv(
2391
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
2392
- *,
2393
- flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
2394
- separator: str = ',',
2395
- convert_to_absolute_path: bool = True,
2396
- description: Optional[str] = None,
2397
- has_header: bool = True,
2398
- new_columns: Optional[List[str]] = None,
2399
- comment_prefix: Optional[str] = None,
2400
- quote_char: Optional[str] = '"',
2401
- skip_rows: int = 0,
2402
- skip_lines: int = 0,
2403
- schema: Optional[SchemaDict] = None,
2404
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
2405
- null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
2406
- missing_utf8_is_empty_string: bool = False,
2407
- ignore_errors: bool = False,
2408
- try_parse_dates: bool = False,
2409
- infer_schema: bool = True,
2410
- infer_schema_length: Optional[int] = 100,
2411
- n_rows: Optional[int] = None,
2412
- encoding: CsvEncoding = 'utf8',
2413
- low_memory: bool = False,
2414
- rechunk: bool = False,
2415
- storage_options: Optional[Dict[str, Any]] = None,
2416
- skip_rows_after_header: int = 0,
2417
- row_index_name: Optional[str] = None,
2418
- row_index_offset: int = 0,
2419
- eol_char: str = '\n',
2420
- raise_if_empty: bool = True,
2421
- truncate_ragged_lines: bool = False,
2422
- decimal_comma: bool = False,
2423
- glob: bool = True,
2424
- cache: bool = True,
2425
- with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
2426
- **other_options: Any
2427
- ) -> FlowFrame:
2428
- """
2429
- Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
2061
+ @staticmethod
2062
+ def _comparison_error(operator: str) -> pl.lazyframe.frame.NoReturn:
2063
+ msg = f'"{operator!r}" comparison not supported for LazyFrame objects'
2064
+ raise TypeError(msg)
2430
2065
 
2431
- This method is the same as read_csv but is provided for compatibility with
2432
- the polars API where scan_csv returns a LazyFrame.
2066
+ def __eq__(self, other: object) -> pl.lazyframe.frame.NoReturn:
2067
+ self._comparison_error("==")
2433
2068
 
2434
- See read_csv for full documentation.
2435
- """
2436
- return read_csv(
2437
- source=source,
2438
- flow_graph=flow_graph,
2439
- separator=separator,
2440
- convert_to_absolute_path=convert_to_absolute_path,
2441
- description=description,
2442
- has_header=has_header,
2443
- new_columns=new_columns,
2444
- comment_prefix=comment_prefix,
2445
- quote_char=quote_char,
2446
- skip_rows=skip_rows,
2447
- skip_lines=skip_lines,
2448
- schema=schema,
2449
- schema_overrides=schema_overrides,
2450
- null_values=null_values,
2451
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
2452
- ignore_errors=ignore_errors,
2453
- try_parse_dates=try_parse_dates,
2454
- infer_schema=infer_schema,
2455
- infer_schema_length=infer_schema_length,
2456
- n_rows=n_rows,
2457
- encoding=encoding,
2458
- low_memory=low_memory,
2459
- rechunk=rechunk,
2460
- storage_options=storage_options,
2461
- skip_rows_after_header=skip_rows_after_header,
2462
- row_index_name=row_index_name,
2463
- row_index_offset=row_index_offset,
2464
- eol_char=eol_char,
2465
- raise_if_empty=raise_if_empty,
2466
- truncate_ragged_lines=truncate_ragged_lines,
2467
- decimal_comma=decimal_comma,
2468
- glob=glob,
2469
- cache=cache,
2470
- with_column_names=with_column_names,
2471
- **other_options
2472
- )
2069
+ def __ne__(self, other: object) -> pl.lazyframe.frame.NoReturn:
2070
+ self._comparison_error("!=")
2473
2071
 
2072
+ def __gt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
2073
+ self._comparison_error(">")
2474
2074
 
2475
- def scan_parquet(
2476
- file_path,
2477
- *,
2478
- flow_graph: FlowGraph = None,
2479
- description: str = None,
2480
- convert_to_absolute_path: bool = True,
2481
- **options
2482
- ) -> FlowFrame:
2483
- """
2484
- Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
2075
+ def __lt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
2076
+ self._comparison_error("<")
2485
2077
 
2486
- This method is the same as read_parquet but is provided for compatibility with
2487
- the polars API where scan_parquet returns a LazyFrame.
2078
+ def __ge__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
2079
+ self._comparison_error(">=")
2488
2080
 
2489
- See read_parquet for full documentation.
2490
- """
2491
- return read_parquet(
2492
- file_path=file_path,
2493
- flow_graph=flow_graph,
2494
- description=description,
2495
- convert_to_absolute_path=convert_to_absolute_path,
2496
- **options
2497
- )
2081
+ def __le__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
2082
+ self._comparison_error("<=")