Flowfile 0.3.2__py3-none-any.whl → 0.3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. flowfile/__init__.py +3 -2
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +584 -1002
  32. flowfile_frame/flow_frame.pyi +368 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
@@ -1,15 +1,15 @@
- import logging
+ import inspect
  import os
- from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
- from pathlib import Path
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin
 
- import io
  import re
+
  import polars as pl
- from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
-                             Sequence, CsvEncoding)
 
- # Assume these imports are correct from your original context
+ from flowfile_frame.lazy_methods import add_lazyframe_methods
+
+ from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
+ from collections.abc import Iterator
  from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
  from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
@@ -19,19 +19,35 @@ from flowfile_core.schemas import input_schema, transform_schema
  from flowfile_frame.expr import Expr, Column, lit, col
  from flowfile_frame.selectors import Selector
  from flowfile_frame.group_frame import GroupByFrame
- from flowfile_frame.utils import _parse_inputs_as_iterable, create_flow_graph
+ from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
+                                   ensure_inputs_as_iterable)
  from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
+ from flowfile_frame.utils import _check_if_convertible_to_code
+ from flowfile_frame.config import logger
+
 
  node_id_counter = 0
 
 
- logging.basicConfig(
-     level=logging.INFO,
-     format='[%(levelname)s] %(message)s'
- )
+ def can_be_expr(param: inspect.Parameter) -> bool:
+     """Check if a parameter can be of type pl.Expr"""
+     if param.annotation == inspect.Parameter.empty:
+         return False
+
+     # Check direct match or in Union args
+     types = get_args(param.annotation) if get_origin(param.annotation) is Union else [param.annotation]
+     return any(t in (pl.Expr, pl.expr.expr.Expr) for t in types)
+
+
+ def _contains_lambda_pattern(text: str) -> bool:
+     return "<lambda> at" in text
+
+
+ def get_method_name_from_code(code: str) -> str | None:
+     split_code = code.split("input_df.")
+     if len(split_code) > 1:
+         return split_code[1].split("(")[0]
 
- # Create and export the logger
- logger = logging.getLogger('flow_frame')
 
  def _to_string_val(v) -> str:
      if isinstance(v, str):
@@ -40,12 +56,72 @@ def _to_string_val(v) -> str:
      return v
 
 
+ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
+     """
+     Extract the pure expression string and any raw definitions (including function sources) from an Expr object.
+
+     Parameters
+     ----------
+     expr_obj : Expr
+         The expression object to extract parts from
+
+     Returns
+     -------
+     tuple[str, str]
+         A tuple of (pure_expr_str, raw_definitions_str)
+     """
+     if not isinstance(expr_obj, Expr):
+         # If it's not an Expr, just return its string representation
+         return str(expr_obj), ""
+
+     # Get the basic representation
+     pure_expr_str = expr_obj._repr_str
+
+     # Collect all definitions (function sources)
+     raw_definitions = []
+
+     # Add function sources if any
+     if hasattr(expr_obj, '_function_sources') and expr_obj._function_sources:
+         # Remove duplicates while preserving order
+         unique_sources = []
+         seen = set()
+         for source in expr_obj._function_sources:
+             if source not in seen:
+                 seen.add(source)
+                 unique_sources.append(source)
+
+         if unique_sources:
+             raw_definitions.extend(unique_sources)
+
+     # Join all definitions
+     raw_defs_str = "\n\n".join(raw_definitions) if raw_definitions else ""
+
+     return pure_expr_str, raw_defs_str
+
+
+ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
+                                 group_expr: pl.Expr | None = None) -> None:
+     if method_name is None:
+         raise NotImplementedError("Cannot create a polars lambda expression without the method")
+     if polars_expr is None:
+         raise NotImplementedError("Cannot create polars expressions with lambda function")
+     method_ref = getattr(pl.LazyFrame, method_name)
+     if method_ref is None:
+         raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
+     if method_name == 'group_by':
+         if group_expr is None:
+             raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
+         if not all(isinstance(ge, pl.Expr) for ge in group_expr):
+             raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
+
+
  def generate_node_id() -> int:
      global node_id_counter
      node_id_counter += 1
      return node_id_counter
 
 
+ @add_lazyframe_methods
  class FlowFrame:
      """Main class that wraps FlowDataEngine and maintains the ETL graph."""
      flow_graph: FlowGraph
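The new `can_be_expr` helper above inspects a parameter's annotation, unwrapping `Union` types via `get_args`/`get_origin`, to decide whether the parameter may receive a `pl.Expr`. A minimal, self-contained sketch of the same check (`accepts_expr` and `my_transform` are illustrative names, not part of the package):

```python
import inspect
from typing import Union, get_args, get_origin

import polars as pl

def accepts_expr(param: inspect.Parameter) -> bool:
    # Same logic as the can_be_expr() helper introduced in the diff:
    # unwrap Union annotations and test for pl.Expr membership.
    if param.annotation is inspect.Parameter.empty:
        return False
    types = get_args(param.annotation) if get_origin(param.annotation) is Union else [param.annotation]
    return any(t is pl.Expr for t in types)

def my_transform(value: Union[pl.Expr, str], repeat: int = 1): ...

sig = inspect.signature(my_transform)
print(accepts_expr(sig.parameters["value"]))   # True
print(accepts_expr(sig.parameters["repeat"]))  # False
```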
@@ -100,13 +176,11 @@ class FlowFrame:
          # Extract flow-specific parameters
          node_id = node_id or generate_node_id()
          description = "Data imported from Python object"
-
          # Create a new flow graph if none is provided
          if flow_graph is None:
              flow_graph = create_flow_graph()
 
          flow_id = flow_graph.flow_id
-
          # Convert data to a polars DataFrame/LazyFrame
          try:
              # Use polars to convert from various types
@@ -121,25 +195,23 @@ class FlowFrame:
              )
              pl_data = pl_df.lazy()
          except Exception as e:
-             raise ValueError(f"Could not convert data to a polars DataFrame: {e}")
-
+             raise ValueError(f"Could not convert data to a polars DataFrame: {e}")
          # Create a FlowDataEngine to get data in the right format for manual input
          flow_table = FlowDataEngine(raw_data=pl_data)
-
+         raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
+                                                columns=[c.get_minimal_field_info() for c in flow_table.schema])
          # Create a manual input node
          input_node = input_schema.NodeManualInput(
              flow_id=flow_id,
              node_id=node_id,
-             raw_data=flow_table.to_pylist(),  # Convert to list of dicts
+             raw_data_format=raw_data_format,
              pos_x=100,
              pos_y=100,
              is_setup=True,
              description=description,
          )
-
          # Add to graph
          flow_graph.add_manual_input(input_node)
-
          # Return new frame
          return FlowFrame(
              data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
@@ -163,7 +235,6 @@ class FlowFrame:
          parent_node_id=None,
      ):
          """Create a new FlowFrame instance."""
-
          # If data is not a LazyFrame, use the factory method
          if data is not None and not isinstance(data, pl.LazyFrame):
              return cls.create_from_any_type(
@@ -179,7 +250,6 @@ class FlowFrame:
                  parent_node_id=parent_node_id,
              )
 
-         # Otherwise create the instance normally
          instance = super().__new__(cls)
          return instance
 
@@ -198,7 +268,6 @@ class FlowFrame:
          parent_node_id=None,
      ):
          """Initialize the FlowFrame with data and graph references."""
-
          if data is None:
              data = pl.LazyFrame()
          if not isinstance(data, pl.LazyFrame):
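For plain Python objects, the factory path in the hunks above reduces to a standard Polars conversion before the manual-input node is created. A minimal sketch of that conversion step, outside of Flowfile:

```python
import polars as pl

# FlowFrame.create_from_any_type() effectively does this for raw data:
pl_df = pl.DataFrame({"a": [1, 2, 3]})  # convert arbitrary tabular input
pl_data = pl_df.lazy()                  # FlowFrame stores a LazyFrame internally
```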
@@ -230,205 +299,235 @@ class FlowFrame:
      def _create_child_frame(self, new_node_id):
          """Helper method to create a new FlowFrame that's a child of this one"""
          self._add_connection(self.node_id, new_node_id)
-         return FlowFrame(
-             data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
-             flow_graph=self.flow_graph,
-             node_id=new_node_id,
-             parent_node_id=self.node_id,
-         )
+         try:
+             return FlowFrame(
+                 data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
+                 flow_graph=self.flow_graph,
+                 node_id=new_node_id,
+                 parent_node_id=self.node_id,
+             )
+         except AttributeError:
+             raise ValueError('Could not execute the function')
 
-     def sort(
-         self,
-         by: List[Expr | str] | Expr | str,
-         *more_by,
-         descending: bool | List[bool] = False,
-         nulls_last: bool = False,
-         multithreaded: bool = True,
-         maintain_order: bool = False,
-         description: str = None,
-     ):
+     @staticmethod
+     def _generate_sort_polars_code(
+         pure_sort_expr_strs: List[str],
+         descending_values: List[bool],
+         nulls_last_values: List[bool],
+         multithreaded: bool,
+         maintain_order: bool,
+     ) -> str:
          """
-         Sort the dataframe by the given columns.
+         Generates the `input_df.sort(...)` Polars code string using pure expression strings.
+         """
+         kwargs_for_code: Dict[str, Any] = {}
+         if any(descending_values):
+             kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
+         if any(nulls_last_values):
+             kwargs_for_code["nulls_last"] = nulls_last_values[0] if len(nulls_last_values) == 1 else nulls_last_values
+         if not multithreaded:
+             kwargs_for_code["multithreaded"] = multithreaded
+         if maintain_order:
+             kwargs_for_code["maintain_order"] = maintain_order
 
-         Parameters:
-         -----------
-         by : Expr, str, or list of Expr/str
-             Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
-         *more_by : Expr or str
-             Additional columns to sort by, specified as positional arguments.
-         descending : bool or list of bool, default False
-             Sort in descending order. When sorting by multiple columns, can be specified per column.
-         nulls_last : bool or list of bool, default False
-             Place null values last; can specify a single boolean or a sequence for per-column control.
-         multithreaded : bool, default True
-             Sort using multiple threads.
-         maintain_order : bool, default False
-             Whether the order should be maintained if elements are equal.
-         description : str, optional
-             Description of this operation for the ETL graph.
+         kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())
 
-         Returns:
-         --------
-         FlowFrame
-             A new FlowFrame with sorted data.
+         by_arg_for_code = pure_sort_expr_strs[0] if len(
+             pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
+         return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
+
+     def sort(
+         self,
+         by: Union[List[Union[Expr, str]], Expr, str],
+         *more_by: Union[Expr, str],
+         descending: Union[bool, List[bool]] = False,
+         nulls_last: Union[bool, List[bool]] = False,
+         multithreaded: bool = True,
+         maintain_order: bool = False,
+         description: Optional[str] = None,
+     ) -> "FlowFrame":
+         """
+         Sort the dataframe by the given columns.
          """
-         by = list(_parse_inputs_as_iterable((by,)))
+         initial_by_args = list(_parse_inputs_as_iterable((by,)))
          new_node_id = generate_node_id()
-         sort_expressions = by
+
+         sort_expressions_input: list = initial_by_args
          if more_by:
-             sort_expressions.extend(more_by)
+             sort_expressions_input.extend(list(_parse_inputs_as_iterable(more_by)))
 
-         # Determine if we need to use polars code fallback
-         needs_polars_code = False
+         all_processed_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_sort: List[str] = []
+         collected_raw_definitions: List[str] = []
+         column_names_for_native_node: List[str] = []
 
-         # Check for any expressions that are not simple columns
-         for expr in sort_expressions:
-             if not isinstance(expr, (str, Column)) or (
-                 isinstance(expr, Column) and expr._select_input.is_altered
-             ):
-                 needs_polars_code = True
-                 break
+         use_polars_code_path = False
 
-         # Also need polars code if we're using maintain_order or multithreaded params
          if maintain_order or not multithreaded:
-             needs_polars_code = True
-
-         # Standardize descending parameter
-         if isinstance(descending, (list, tuple)):
-             # Ensure descending list has the same length as sort_expressions
-             if len(descending) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of descending ({len(descending)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             descending_values = descending
-         else:
-             descending_values = [descending] * len(sort_expressions)
-
-         # Standardize nulls_last parameter
-         if isinstance(nulls_last, (list, tuple)):
-             if len(nulls_last) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of nulls_last ({len(nulls_last)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             nulls_last_values = nulls_last
-             # Any non-default nulls_last needs polars code
-             if any(val is not False for val in nulls_last_values):
-                 needs_polars_code = True
-         else:
-             nulls_last_values = [nulls_last] * len(sort_expressions)
-             # Non-default nulls_last needs polars code
-             if nulls_last:
-                 needs_polars_code = True
-
-         if needs_polars_code:
-             # Generate polars code for complex cases
-             code = self._generate_sort_polars_code(
-                 sort_expressions,
-                 descending_values,
-                 nulls_last_values,
-                 multithreaded,
-                 maintain_order,
-             )
-             self._add_polars_code(new_node_id, code, description)
-         else:
-             # Use native implementation for simple cases
-             sort_inputs = []
-             for i, expr in enumerate(sort_expressions):
-                 # Convert expr to column name
-                 if isinstance(expr, Column):
-                     column_name = expr.name
-                 elif isinstance(expr, str):
-                     column_name = expr
+             use_polars_code_path = True
+
+         is_nulls_last_list = isinstance(nulls_last, (list, tuple))
+         if is_nulls_last_list and any(val for val in nulls_last if val is not False):
+             use_polars_code_path = True
+         elif not is_nulls_last_list and nulls_last is not False:
+             use_polars_code_path = True
+
+         for expr_input in sort_expressions_input:
+             current_expr_obj: Expr
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 column_names_for_native_node.append(expr_input)
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column):
+                 current_expr_obj = expr_input
+                 # Type ignore below due to simplified Column stub
+                 if not expr_input._select_input.is_altered:  # type: ignore
+                     column_names_for_native_node.append(expr_input.column_name)  # type: ignore
+                     is_simple_col_for_native = True
                  else:
-                     column_name = str(expr)
+                     use_polars_code_path = True  # Altered Column implies complex expression
+             elif isinstance(expr_input, Expr):
+                 current_expr_obj = expr_input
+                 use_polars_code_path = True  # General Expr implies complex expression
+             else:  # Convert other types to lit
+                 current_expr_obj = lit(expr_input)
+                 use_polars_code_path = True  # Literal might be part of a complex sort for Polars code
+
+             all_processed_expr_objects.append(current_expr_obj)
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+             pure_polars_expr_strings_for_sort.append(pure_expr_str)
+
+             if raw_defs_str:
+                 if raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+                 use_polars_code_path = True
+
+             if not is_simple_col_for_native:  # If it wasn't a simple string or unaltered Column
+                 use_polars_code_path = True
+
+         desc_values = list(descending) if isinstance(descending, list) else [descending] * len(
+             all_processed_expr_objects)
+         null_last_values = list(nulls_last) if isinstance(nulls_last, list) else [nulls_last] * len(
+             all_processed_expr_objects)
+
+         if len(desc_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'descending' does not match the number of sort expressions.")
+         if len(null_last_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'nulls_last' does not match the number of sort expressions.")
+
+         if use_polars_code_path:
+             polars_operation_code = self._generate_sort_polars_code(
+                 pure_polars_expr_strings_for_sort, desc_values, null_last_values, multithreaded, maintain_order
+             )
 
-                 # Create SortByInput with appropriate settings
-                 sort_inputs.append(
-                     transform_schema.SortByInput(
-                         column=column_name,
-                         how="desc" if descending_values[i] else "asc",
-                     )
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
+             else:
+                 final_code_for_node = polars_operation_code
+
+             pl_expressions_for_fallback = [e.expr for e in all_processed_expr_objects if
+                                            hasattr(e, 'expr') and e.expr is not None]
+             kwargs_for_fallback = {
+                 "descending": desc_values[0] if len(desc_values) == 1 else desc_values,
+                 "nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
+                 "multithreaded": multithreaded, "maintain_order": maintain_order}
+
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="sort",
+                                   convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback,
+                                   kwargs_expr=kwargs_for_fallback)
+         else:
+             sort_inputs_for_node = []
+             for i, col_name_for_native in enumerate(column_names_for_native_node):
+                 sort_inputs_for_node.append(
+                     transform_schema.SortByInput(column=col_name_for_native, how="desc" if desc_values[i] else "asc")
+                     # type: ignore
                  )
-
              sort_settings = input_schema.NodeSort(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 sort_input=sort_inputs,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_id=self.node_id,
-                 description=description
-                 or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
-             )
+                 flow_id=self.flow_graph.flow_id, node_id=new_node_id, sort_input=sort_inputs_for_node,  # type: ignore
+                 pos_x=200, pos_y=150, is_setup=True, depending_on_id=self.node_id,
+                 description=description or f"Sort by {', '.join(column_names_for_native_node)}")
              self.flow_graph.add_sort(sort_settings)
 
          return self._create_child_frame(new_node_id)
 
-     def _generate_sort_polars_code(
-         self,
-         sort_expressions: list,
-         descending_values: list,
-         nulls_last_values: list,
-         multithreaded: bool,
-         maintain_order: bool,
-     ) -> str:
-         """Generate Polars code for sort operations that need fallback."""
-         # Format expressions for code
-         expr_strs = []
-         for expr in sort_expressions:
-             if isinstance(expr, (Expr, Column)):
-                 expr_strs.append(str(expr))
-             elif isinstance(expr, str):
-                 expr_strs.append(f"'{expr}'")
-             else:
-                 expr_strs.append(str(expr))
-
-         # Format parameters
-         if len(sort_expressions) == 1:
-             by_arg = expr_strs[0]
-         else:
-             by_arg = f"[{', '.join(expr_strs)}]"
-
-         # Build kwargs
-         kwargs = {}
-
-         # Only add descending if it's non-default
-         if any(d for d in descending_values):
-             if len(descending_values) == 1:
-                 kwargs["descending"] = descending_values[0]
-             else:
-                 kwargs["descending"] = descending_values
-
-         # Only add nulls_last if it's non-default
-         if any(nl for nl in nulls_last_values):
-             if len(nulls_last_values) == 1:
-                 kwargs["nulls_last"] = nulls_last_values[0]
+     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
+                          depending_on_ids: List[str] | None = None, convertable_to_code: bool = True,
+                          method_name: str = None, polars_expr: Expr | List[Expr] | None = None,
+                          group_expr: Expr | List[Expr] | None = None,
+                          kwargs_expr: Dict | None = None,
+                          group_kwargs: Dict | None = None, ):
+         polars_code_for_node: str
+         if not convertable_to_code or _contains_lambda_pattern(code):
+
+             effective_method_name = get_method_name_from_code(
+                 code) if method_name is None and "input_df." in code else method_name
+
+             pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
+             group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
+
+             _check_ok_for_serialization(polars_expr=pl_expr_list, method_name=effective_method_name,
+                                         group_expr=group_expr_list)
+
+             current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
+             result_lazyframe_or_expr: Any
+
+             if effective_method_name == "group_by":
+                 group_kwargs = {} if group_kwargs is None else group_kwargs
+                 if not group_expr_list:
+                     raise ValueError("group_expr is required for group_by method in serialization fallback.")
+                 target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
+                 if not pl_expr_list:
+                     raise ValueError(
+                         "Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback.")
+                 result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
+             elif effective_method_name:
+                 result_lazyframe_or_expr = getattr(self.data, effective_method_name)(*pl_expr_list,
+                                                                                      **current_kwargs_expr)
              else:
-                 kwargs["nulls_last"] = nulls_last_values
-
-         # Add other parameters if they're non-default
-         if not multithreaded:
-             kwargs["multithreaded"] = multithreaded
-
-         if maintain_order:
-             kwargs["maintain_order"] = maintain_order
-
-         # Build kwargs string
-         kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
-
-         # Build final code
-         if kwargs_str:
-             return f"input_df.sort({by_arg}, {kwargs_str})"
+                 raise ValueError(
+                     "Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback.")
+             try:
+                 if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
+                     serialized_value_for_code = result_lazyframe_or_expr.serialize(format='json')
+                     polars_code_for_node = "\n".join([
+                         f"serialized_value = r'''{serialized_value_for_code}'''",
+                         "buffer = BytesIO(serialized_value.encode('utf-8'))",
+                         "output_df = pl.LazyFrame.deserialize(buffer, format='json')",
+                     ])
+                     logger.warning(
+                         f"Transformation '{effective_method_name}' uses non-serializable elements. "
+                         "Falling back to serializing the resulting Polars LazyFrame object. "
+                         "This will result in a breaking graph when using the UI."
+                     )
+                 else:
+                     logger.error(
+                         f"Fallback for non-convertible code for method '{effective_method_name}' "
+                         f"resulted in a '{type(result_lazyframe_or_expr).__name__}' instead of a Polars LazyFrame. "
+                         "This type cannot be persisted as a LazyFrame node via this fallback."
+                     )
+                     return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
+             except Exception as e:
+                 logger.warning(
+                     f"Critical error: Could not serialize the result of operation '{effective_method_name}' "
+                     f"during fallback for non-convertible code. Error: {e}. "
+                     "When using a lambda function, consider defining the function first"
+                 )
+                 return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
          else:
-             return f"input_df.sort({by_arg})"
-
-     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
-                          depending_on_ids: List[str] | None = None):
+             polars_code_for_node = code
          polars_code_settings = input_schema.NodePolarsCode(
              flow_id=self.flow_graph.flow_id,
              node_id=new_node_id,
-             polars_code_input=transform_schema.PolarsCodeInput(polars_code=code),
+             polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code_for_node),
              is_setup=True,
              depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
              description=description,
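The serialization fallback in `_add_polars_code` above embeds a JSON-serialized LazyFrame plan into the generated node code and rebuilds it with `pl.LazyFrame.deserialize`. A minimal sketch of that round-trip in plain Polars (assuming a Polars version where `serialize`/`deserialize` accept `format='json'`; plans containing Python lambdas may not serialize, which is exactly what the warning above covers):

```python
from io import BytesIO

import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3]}).filter(pl.col("a") > 1)

# Serialize the logical plan, then rebuild it from a buffer --
# the same round-trip the generated node code performs.
serialized_value = lf.serialize(format="json")
buffer = BytesIO(serialized_value.encode("utf-8"))
output_df = pl.LazyFrame.deserialize(buffer, format="json")

print(output_df.collect())
```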
@@ -469,14 +568,17 @@ class FlowFrame:
          validate : {"1:1", "1:m", "m:1", "m:m"}, optional
              Validate join relationship.
          nulls_equal:
-             Join on null values. By default null values will never produce matches.
+             Join on null values. By default, null values will never produce matches.
          coalesce:
              None: -> join specific.
              True: -> Always coalesce join columns.
              False: -> Never coalesce join columns.
          maintain_order:
-             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly setting this parameter, as your code may break in a future release. Not specifying any ordering can improve performance Supported for inner, left, right and full joins
-             None: No specific ordering is desired. The ordering might differ across Polars versions or even between different runs.
+             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
+             setting this parameter, as your code may break in a future release.
+             Not specifying any ordering can improve performance. Supported for inner, left, right and full joins.
+             None: No specific ordering is desired. The ordering might differ across Polars versions or even between
+             different runs.
              left: Preserves the order of the left DataFrame.
              right: Preserves the order of the right DataFrame.
              left_right: First preserves the order of the left DataFrame, then the right.
@@ -494,6 +596,7 @@ class FlowFrame:
                          nulls_equal is False and
                          validate is None and
                          suffix == '_right')
+
          join_mappings = None
          if self.flow_graph.flow_id != other.flow_graph.flow_id:
              combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
@@ -508,6 +611,7 @@ class FlowFrame:
              global node_id_counter
              node_id_counter += len(combined_graph.nodes)
          new_node_id = generate_node_id()
+
          if on is not None:
              left_columns = right_columns = _normalize_columns_to_list(on)
          elif left_on is not None and right_on is not None:
@@ -526,10 +630,11 @@ class FlowFrame:
              )
          if not use_polars_code:
              join_mappings, use_polars_code = _create_join_mappings(
-                 left_columns, right_columns
+                 left_columns or [], right_columns or []
              )
 
          if use_polars_code or suffix != '_right':
+
              _on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
              _left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
              _right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
@@ -549,31 +654,50 @@ class FlowFrame:
                  parent_node_id=self.node_id,
              )
 
-         elif join_mappings:
+         elif join_mappings or how == 'cross':
+
              left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
              right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
 
-             join_input = transform_schema.JoinInput(
-                 join_mapping=join_mappings,
-                 left_select=left_select.renames,
-                 right_select=right_select.renames,
-                 how=how,
-             )
+             if how == 'cross':
+                 join_input = transform_schema.CrossJoinInput(left_select=left_select.renames,
+                                                              right_select=right_select.renames,)
+             else:
+                 join_input = transform_schema.JoinInput(
+                     join_mapping=join_mappings,
+                     left_select=left_select.renames,
+                     right_select=right_select.renames,
+                     how=how,
+                 )
+
              join_input.auto_rename()
-             # Create node settings
-             join_settings = input_schema.NodeJoin(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 join_input=join_input,
-                 auto_generate_selection=True,
-                 verify_integrity=True,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_ids=[self.node_id, other.node_id],
-                 description=description or f"Join with {how} strategy",
-             )
-             self.flow_graph.add_join(join_settings)
+             if how == 'cross':
+                 cross_join_settings = input_schema.NodeCrossJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     cross_join_input=join_input,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                 )
+
+                 self.flow_graph.add_cross_join(cross_join_settings)
+             else:
+                 join_settings = input_schema.NodeJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     join_input=join_input,
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                     pos_x=200,
+                     pos_y=150,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                 )
+                 self.flow_graph.add_join(join_settings)
          self._add_connection(self.node_id, new_node_id, "main")
          other._add_connection(other.node_id, new_node_id, "right")
          result_frame = FlowFrame(
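The branch above routes `how='cross'` to a dedicated cross-join node. In Polars itself a cross join takes no key columns; a minimal sketch of the operation the new node represents:

```python
import polars as pl

left = pl.LazyFrame({"size": ["S", "M", "L"]})
right = pl.LazyFrame({"color": ["red", "blue"]})

# A cross join has no join keys: every row of `left` is paired with
# every row of `right` (3 x 2 = 6 rows here).
combined = left.join(right, how="cross")
print(combined.collect())
```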
@@ -600,38 +724,65 @@ class FlowFrame:
          self.flow_graph.add_record_count(node_number_of_records)
          return self._create_child_frame(new_node_id)
 
-     def select(self, *columns, description: str = None):
+     def select(self, *columns: Union[str, Expr, Selector], description: Optional[str] = None) -> "FlowFrame":
          """
          Select columns from the frame.
-
-         Args:
-             *columns: Column names or expressions
-             description: Description of the step, this will be shown in the flowfile file
-
-         Returns:
-             A new FlowFrame with selected columns
          """
-         # Create new node ID
-         columns = _parse_inputs_as_iterable(columns)
+         columns_iterable = list(_parse_inputs_as_iterable(columns))
          new_node_id = generate_node_id()
-         existing_columns = self.columns
 
-         if (len(columns) == 1 and isinstance(columns[0], Expr)
-                 and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
+         if (len(columns_iterable) == 1 and isinstance(columns_iterable[0], Expr)
+                 and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"):
              return self._add_number_of_records(new_node_id, description)
-         if all(isinstance(col_, (str, Column)) for col_ in columns):
-
-             select_inputs = [
-                 transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
-                 for col_ in columns
-             ]
-             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_columns if
-                                c not in [s.old_name for s in select_inputs]]
-             select_inputs.extend(dropped_columns)
+
+         all_input_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_select: List[str] = []
+         collected_raw_definitions: List[str] = []
+         selected_col_names_for_native: List[transform_schema.SelectInput] = []  # For native node
+
+         can_use_native_node = True
+         if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == '*':
+             effective_columns_iterable = [col(c_name) for c_name in self.columns]
+         else:
+             effective_columns_iterable = columns_iterable
+         for expr_input in effective_columns_iterable:
+             current_expr_obj = expr_input
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 selected_col_names_for_native.append(transform_schema.SelectInput(old_name=expr_input))
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column):
+                 selected_col_names_for_native.append(expr_input.to_select_input())
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Selector):
+                 can_use_native_node = False
+             elif not isinstance(expr_input, Expr):
+                 current_expr_obj = lit(expr_input)
+
+             all_input_expr_objects.append(current_expr_obj)  # type: ignore
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+
+             pure_polars_expr_strings_for_select.append(pure_expr_str)
+             if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                 collected_raw_definitions.append(raw_defs_str)
+
+             if not is_simple_col_for_native and not isinstance(expr_input, Selector):
+                 can_use_native_node = False
+         if collected_raw_definitions:  # Has to use Polars code if there are definitions
+             can_use_native_node = False
+         if can_use_native_node:
+             existing_cols = self.columns
+             selected_col_names = {select_col.old_name for select_col in selected_col_names_for_native}
+             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_cols if
+                                c not in selected_col_names]
+             selected_col_names_for_native.extend(dropped_columns)
              select_settings = input_schema.NodeSelect(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
-                 select_input=select_inputs,
+                 select_input=selected_col_names_for_native,
                  keep_missing=False,
                  pos_x=200,
                  pos_y=100,
@@ -639,60 +790,97 @@ class FlowFrame:
                  depending_on_id=self.node_id,
                  description=description
              )
-
-             # Add to graph
              self.flow_graph.add_select(select_settings)
-             return self._create_child_frame(new_node_id)
-
          else:
-             readable_exprs = []
-             is_readable: bool = True
-             for col_ in columns:
-                 if isinstance(col_, Expr):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, Selector):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, pl.expr.Expr):
-                     print('warning this cannot be converted to flowfile frontend. Make sure you use the flowfile expr')
-                     is_readable = False
-                 elif isinstance(col_, str) and col_ in self.columns:
-                     col_expr = Column(col_)
-                     readable_exprs.append(col_expr)
-                 else:
-                     lit_expr = lit(col_)
-                     readable_exprs.append(lit_expr)
-             if is_readable:
-                 code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
+             polars_operation_code = f"input_df.select([{', '.join(pure_polars_expr_strings_for_select)}])"
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 raise ValueError('Not supported')
+                 final_code_for_node = polars_operation_code
 
-             self._add_polars_code(new_node_id, code, description)
-             return self._create_child_frame(new_node_id)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description,
+                                   method_name="select",
+                                   convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback)
 
-     def filter(self, predicate: Expr | Any = None, *, flowfile_formula: str = None, description: str = None):
+         return self._create_child_frame(new_node_id)
+
+     def filter(self, *predicates: Union[Expr, Any], flowfile_formula: Optional[str] = None,
+                description: Optional[str] = None, **constraints: Any) -> "FlowFrame":
          """
          Filter rows based on a predicate.
-
-         Args:
-             predicate: Filter condition
-             flowfile_formula: Native support in frontend
-             description: Description of the step that is performed
-         Returns:
-             A new FlowFrame with filtered rows
          """
+         if (len(predicates) > 0 or len(constraints) > 0) and flowfile_formula:
+             raise ValueError("You can only use one of the following: predicates, constraints or flowfile_formula")
+         available_columns = self.columns
          new_node_id = generate_node_id()
-         # Create new node ID
-         if predicate:
-             # we use for now the fallback on polars code.
-             if isinstance(predicate, Expr):
-                 predicate_expr = predicate
+         if len(predicates) > 0 or len(constraints) > 0:
+             all_input_expr_objects: List[Expr] = []
+             pure_polars_expr_strings: List[str] = []
+             collected_raw_definitions: List[str] = []
+
+             processed_predicates = []
+             for pred_item in predicates:
+                 if isinstance(pred_item, (tuple, list, Iterator)):
+                     # If it's a sequence, extend the processed_predicates with its elements
+                     processed_predicates.extend(list(pred_item))
+                 else:
+                     # Otherwise, just add the item
+                     processed_predicates.append(pred_item)
+
+             for pred_input in processed_predicates:  # Loop over the processed_predicates
+                 current_expr_obj = None  # Initialize current_expr_obj
+                 if isinstance(pred_input, Expr):
+                     current_expr_obj = pred_input
+                 elif isinstance(pred_input, str) and pred_input in available_columns:
+                     current_expr_obj = col(pred_input)
+                 else:
+                     current_expr_obj = lit(pred_input)
+
+                 all_input_expr_objects.append(current_expr_obj)
+
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+
+             for k, v_val in constraints.items():
+                 constraint_expr_obj = (col(k) == lit(v_val))
+                 all_input_expr_objects.append(constraint_expr_obj)
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(
+                     constraint_expr_obj)  # Constraint exprs are unlikely to have defs
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:  # Should be rare here
+                     collected_raw_definitions.append(raw_defs_str)
+
+             filter_conditions_str = " & ".join(pure_polars_expr_strings) if pure_polars_expr_strings else "pl.lit(True)"
+             polars_operation_code = f"input_df.filter({filter_conditions_str})"
+
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 predicate_expr = lit(predicate)
-             code = f"input_df.filter({str(predicate_expr)})"
-             self._add_polars_code(new_node_id, code, description)
-
+                 final_code_for_node = polars_operation_code
+
+             convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="filter",
+                                   convertable_to_code=convertable_to_code,
+                                   polars_expr=pl_expressions_for_fallback)
         elif flowfile_formula:
-             # Create node settings
              filter_settings = input_schema.NodeFilter(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
@@ -706,8 +894,10 @@ class FlowFrame:
                  depending_on_id=self.node_id,
                  description=description
              )
-
              self.flow_graph.add_filter(filter_settings)
+         else:
+             logger.info("Filter called with no arguments; creating a pass-through Polars code node.")
+             self._add_polars_code(new_node_id, "output_df = input_df", description or "No-op filter", method_name=None)
 
          return self._create_child_frame(new_node_id)
 
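The reworked `filter` signature mirrors `pl.LazyFrame.filter`: positional predicates and keyword constraints are ANDed together, which is also how the generated `input_df.filter(...)` string joins the parenthesized expression parts with `&`. A minimal sketch of the equivalent behaviour in plain Polars:

```python
import polars as pl

lf = pl.LazyFrame({
    "city": ["Amsterdam", "Utrecht", "Amsterdam"],
    "sales": [10, 25, 40],
})

# Positional predicates and keyword constraints are combined with AND,
# matching the " & ".join(...) string the node code generates.
a = lf.filter(pl.col("sales") > 15, city="Amsterdam")
b = lf.filter((pl.col("sales") > 15) & (pl.col("city") == "Amsterdam"))

assert a.collect().equals(b.collect())
```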
@@ -792,7 +982,7 @@ class FlowFrame:
              if convert_to_absolute_path:
                  output_settings.directory = output_settings.abs_file_path
          except Exception as e:
-             print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+             logger.warning(f"Could not determine absolute path for {file_str}: {e}")
 
          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -820,7 +1010,7 @@ class FlowFrame:
 
              # Use sink_parquet for LazyFrames
              code = f"input_df.sink_parquet({args_str})"
-             print(f"Generated Polars Code: {code}")
+             logger.debug(f"Generated Polars Code: {code}")
              self._add_polars_code(new_node_id, code, description)
 
          return self._create_child_frame(new_node_id)
@@ -868,7 +1058,7 @@ class FlowFrame:
              if convert_to_absolute_path:
                  output_settings.directory = output_settings.abs_file_path
          except Exception as e:
-             print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+             logger.warning(f"Could not determine absolute path for {file_str}: {e}")
 
          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -901,7 +1091,7 @@ class FlowFrame:
                  args_str += f", {kwargs_repr}"
 
              code = f"input_df.collect().write_csv({args_str})"
-             print(f"Generated Polars Code: {code}")
+             logger.debug(f"Generated Polars Code: {code}")
              self._add_polars_code(new_node_id, code, description)
 
          return self._create_child_frame(new_node_id)
@@ -954,10 +1144,10 @@ class FlowFrame:
          self.flow_graph.apply_layout()
          self.flow_graph.save_flow(file_path)
 
-     def collect(self):
+     def collect(self, *args, **kwargs):
          """Collect lazy data into memory."""
          if hasattr(self.data, "collect"):
-             return self.data.collect()
+             return self.data.collect(*args, **kwargs)
          return self.data
 
      def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
@@ -1278,9 +1468,10 @@ class FlowFrame:
                  f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
              global node_id_counter
              node_id_counter += len(combined_graph.nodes)
+         else:
+             combined_graph = self.flow_graph
          new_node_id = generate_node_id()
          use_native = how == "diagonal_relaxed" and parallel and not rechunk
-
          if use_native:
              # Create union input for the transform schema
              union_input = transform_schema.UnionInput(
@@ -1314,7 +1505,6 @@ class FlowFrame:
                  input_vars.append(f"input_df_{i+2}")
 
              frames_list = f"[{', '.join(input_vars)}]"
-
              code = f"""
  # Perform concat operation
  output_df = pl.concat(
@@ -1324,19 +1514,20 @@ class FlowFrame:
      parallel={parallel}
  )
  """
-
+             self.flow_graph = combined_graph
 
              # Add polars code node with dependencies on all input frames
              depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
              self._add_polars_code(
                  new_node_id, code, description, depending_on_ids=depending_on_ids
              )
-
              # Add connections to ensure all frames are available
              self._add_connection(self.node_id, new_node_id, "main")
+
              for other_frame in others:
-                 other_frame._add_connection(other_frame.node_id, new_node_id, "main")
 
+                 other_frame.flow_graph = combined_graph
+                 other_frame._add_connection(other_frame.node_id, new_node_id, "main")
              # Create and return the new frame
              return FlowFrame(
                  data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
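The generated code above is a plain `pl.concat` call; `diagonal_relaxed` unions frames with mismatched schemas, filling missing columns with nulls and relaxing dtypes where needed. A minimal sketch with Polars only:

```python
import polars as pl

df1 = pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]})
df2 = pl.LazyFrame({"a": [3], "c": [True]})

# "diagonal_relaxed" aligns the differing schemas instead of raising.
output_df = pl.concat([df1, df2], how="diagonal_relaxed", rechunk=False, parallel=True)
print(output_df.collect())
```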
@@ -1373,7 +1564,7 @@ class FlowFrame:
              return False, None
 
          # Extract the output name
-         output_name = expr.name
+         output_name = expr.column_name
 
          if ".over(" not in expr._repr_str:
              # Simple cumulative count can be implemented as a record ID with offset=1
@@ -1456,62 +1647,70 @@ class FlowFrame:
          return False, None
 
      def with_columns(
-         self,
-         exprs: Expr | List[Expr | None] = None,
-         *,
-         flowfile_formulas: Optional[List[str]] = None,
-         output_column_names: Optional[List[str]] = None,
-         description: Optional[str] = None,
+         self,
+         *exprs: Union[Expr, Iterable[Expr], Any],  # Allow Any for implicit lit conversion
+         flowfile_formulas: Optional[List[str]] = None,
+         output_column_names: Optional[List[str]] = None,
+         description: Optional[str] = None,
+         **named_exprs: Union[Expr, Any],  # Allow Any for implicit lit conversion
      ) -> "FlowFrame":
          """
-         Add multiple columns to the DataFrame.
-
-         Parameters
-         ----------
-         exprs : Expr or List[Expr], optional
-             Expressions to evaluate as new columns
-         flowfile_formulas : List[str], optional
-             Alternative approach using flowfile formula syntax
-         output_column_names : List[str], optional
-             Column names for the flowfile formulas
-         description : str, optional
-             Description of this operation for the ETL graph
-
-         Returns
-         -------
-         FlowFrame
-             A new FlowFrame with the columns added
-
-         Raises
-         ------
-         ValueError
-             If neither exprs nor flowfile_formulas with output_column_names are provided,
-             or if the lengths of flowfile_formulas and output_column_names don't match
+         Add or replace columns in the DataFrame.
          """
-         if exprs is not None:
-             new_node_id = generate_node_id()
-             exprs_iterable = _parse_inputs_as_iterable((exprs,))
+         new_node_id = generate_node_id()
 
-             if len(exprs_iterable) == 1:
-                 detected, result = self._detect_cum_count_record_id(
-                     exprs_iterable[0], new_node_id, description
-                 )
-                 if detected:
-                     return result
-             all_expressions = []
-             for expression in exprs_iterable:
-                 if not isinstance(expression, (Expr, Column)):
-                     all_expressions.append(lit(expression))
-                 else:
-                     all_expressions.append(expression)
+         all_input_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_wc: List[str] = []
+         collected_raw_definitions: List[str] = []
+
+         has_exprs_or_named_exprs = bool(exprs or named_exprs)
+         if has_exprs_or_named_exprs:
+             actual_exprs_to_process: List[Expr] = []
+             temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
+
+             for item in temp_exprs_iterable:
+                 if isinstance(item, Expr):
+                     actual_exprs_to_process.append(item)
+                 else:  # auto-lit for non-Expr positional args
+                     actual_exprs_to_process.append(lit(item))
+
+             for name, val_expr in named_exprs.items():
+                 if isinstance(val_expr, Expr):
+                     actual_exprs_to_process.append(val_expr.alias(name))  # type: ignore # Assuming Expr has alias
+                 else:  # auto-lit for named args and then alias
+                     actual_exprs_to_process.append(lit(val_expr).alias(name))  # type: ignore
+
+             if len(actual_exprs_to_process) == 1 and isinstance(actual_exprs_to_process[0], Expr):
+                 pass
+
+             for current_expr_obj in actual_exprs_to_process:
+                 all_input_expr_objects.append(current_expr_obj)
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                 pure_polars_expr_strings_for_wc.append(pure_expr_str)  # with_columns takes individual expressions
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+
+             polars_operation_code = f"input_df.with_columns([{', '.join(pure_polars_expr_strings_for_wc)}])"
+
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
+             else:
+                 final_code_for_node = polars_operation_code
 
-             code = (
-                 f"input_df.with_columns({', '.join(str(e) for e in all_expressions)})"
-             )
-             self._add_polars_code(new_node_id, code, description)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name='with_columns',
+                                   convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback)
              return self._create_child_frame(new_node_id)
 
          elif flowfile_formulas is not None and output_column_names is not None:
+
              if len(output_column_names) != len(flowfile_formulas):
                  raise ValueError(
                      "Length of both the formulas and the output columns names must be identical"
@@ -1524,9 +1723,7 @@ class FlowFrame:
                  ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
              return ff
          else:
-             raise ValueError(
-                 "Either exprs or flowfile_formulas with output_column_names must be provided"
-             )
+             raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")
 
      def with_row_index(
          self, name: str = "index", offset: int = 0, description: str = None
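The reworked `with_columns` mirrors Polars' keyword-expression form: named arguments are aliased to their keyword, and plain values are wrapped with `lit(...)`, as the auto-lit/alias handling above shows. A minimal sketch in plain Polars:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2], "b": [10, 20]})

out = lf.with_columns(
    pl.col("a") * 2,                   # positional expression
    total=pl.col("a") + pl.col("b"),   # named expression -> alias "total"
    source="manual",                   # plain value -> pl.lit("manual").alias("source")
)
print(out.collect())
```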
@@ -1614,26 +1811,27 @@ class FlowFrame:
 
          if isinstance(columns, (list, tuple)):
              all_columns.extend(
-                 [col.name if isinstance(col, Column) else col for col in columns]
+                 [col.column_name if isinstance(col, Column) else col for col in columns]
              )
          else:
-             all_columns.append(columns.name if isinstance(columns, Column) else columns)
+             all_columns.append(columns.column_name if isinstance(columns, Column) else columns)
 
          if more_columns:
              for col in more_columns:
-                 all_columns.append(col.name if isinstance(col, Column) else col)
+                 all_columns.append(col.column_name if isinstance(col, Column) else col)
 
          if len(all_columns) == 1:
-             columns_str = f"'{all_columns[0]}'"
+
+             columns_str = stringify_values(all_columns[0])
          else:
-             columns_str = "[" + ", ".join([f"'{col}'" for col in all_columns]) + "]"
+             columns_str = "[" + ", ".join([stringify_values(col) for col in all_columns]) + "]"
 
          code = f"""
  # Explode columns into multiple rows
  output_df = input_df.explode({columns_str})
  """
 
-         cols_desc = ", ".join(all_columns)
+         cols_desc = ", ".join(str(s) for s in all_columns)
          desc = description or f"Explode column(s): {cols_desc}"
 
          # Add polars code node
@@ -1676,7 +1874,7 @@ class FlowFrame:
          new_node_id = generate_node_id()
 
          if isinstance(column, Column):
-             column_name = column.name
+             column_name = column.column_name
          else:
              column_name = column
 
@@ -1760,7 +1958,7 @@ class FlowFrame:
                  if col_expr._select_input.is_altered:
                      can_use_native = False
                      break
-                 processed_subset.append(col_expr.name)
+                 processed_subset.append(col_expr.column_name)
              else:
                  can_use_native = False
                  break
@@ -1848,650 +2046,34 @@ class FlowFrame:
1848
2046
  """Get the number of columns."""
1849
2047
  return self.data.width
1850
2048
 
2049
+ def __contains__(self, key):
2050
+ """This special method enables the 'in' operator to work with FlowFrame objects."""
2051
+ return key in self.data
1851
2052
 
- def _add_delegated_methods():
- """Add delegated methods from polars LazyFrame."""
- delegate_methods = [
- "collect_async",
- "profile",
- "describe",
- "explain",
- "show_graph",
- "serialize",
- "fetch",
- "get_meta",
- "columns",
- "dtypes",
- "schema",
- "estimated_size",
- "n_chunks",
- "is_empty",
- "chunk_lengths",
- "optimization_toggle",
- "set_polars_options",
- "collect_schema"
- ]
-
- already_implemented = set(dir(FlowFrame))
-
- for method_name in delegate_methods:
- if method_name not in already_implemented and hasattr(
- pl.LazyFrame, method_name
- ):
- # Create a simple delegate method
- def make_delegate(name):
- def delegate_method(self, *args, **kwargs):
- return getattr(self.data, name)(*args, **kwargs)
-
- # Set docstring and name
- delegate_method.__doc__ = (
- f"See pl.LazyFrame.{name} for full documentation."
- )
- delegate_method.__name__ = name
- return delegate_method
-
- # Add the method to the class
- setattr(FlowFrame, method_name, make_delegate(method_name))
-
-
- _add_delegated_methods()
-
-
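The removed `_add_delegated_methods` relied on a standard closure idiom: routing the loop variable through the `make_delegate(name)` factory freezes `name` per method, avoiding the late-binding bug where every generated delegate would resolve to the last name in the list. The same pattern, self-contained:

    import polars as pl

    class Wrapper:
        def __init__(self, data: pl.LazyFrame):
            self.data = data

    def make_delegate(name):
        # The factory call binds `name` now; a closure written directly
        # inside the loop would see only the loop's final value.
        def delegate(self, *args, **kwargs):
            return getattr(self.data, name)(*args, **kwargs)
        delegate.__name__ = name
        delegate.__doc__ = f"See pl.LazyFrame.{name} for full documentation."
        return delegate

    for method_name in ("explain", "collect_schema"):
        if not hasattr(Wrapper, method_name):
            setattr(Wrapper, method_name, make_delegate(method_name))

    w = Wrapper(pl.LazyFrame({"a": [1, 2]}))
    print(w.explain())  # delegates to pl.LazyFrame.explain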
- def sum(expr):
- """Sum aggregation function."""
- if isinstance(expr, str):
- expr = col(expr)
- return expr.sum()
-
-
- def mean(expr):
- """Mean aggregation function."""
- if isinstance(expr, str):
- expr = col(expr)
- return expr.mean()
-
-
- def min(expr):
- """Min aggregation function."""
- if isinstance(expr, str):
- expr = col(expr)
- return expr.min()
-
-
- def max(expr):
- """Max aggregation function."""
- if isinstance(expr, str):
- expr = col(expr)
- return expr.max()
-
-
- def count(expr):
- """Count aggregation function."""
- if isinstance(expr, str):
- expr = col(expr)
- return expr.count()
-
-
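All five removed module-level helpers shared one normalization pattern: accept either a bare column name or an expression, coerce strings through `col`, then dispatch to the matching expression method. The equivalent shape in plain Polars, which these wrappers mirrored:

    import polars as pl

    def my_sum(expr: "str | pl.Expr") -> pl.Expr:
        if isinstance(expr, str):      # coerce a name into an expression
            expr = pl.col(expr)
        return expr.sum()

    lf = pl.LazyFrame({"a": [1, 2, 3]})
    print(lf.select(my_sum("a")).collect())  # single row containing 6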
- def read_csv(
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
- *,
- flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
- separator: str = ',',
- convert_to_absolute_path: bool = True,
- description: Optional[str] = None,
- has_header: bool = True,
- new_columns: Optional[List[str]] = None,
- comment_prefix: Optional[str] = None,
- quote_char: Optional[str] = '"',
- skip_rows: int = 0,
- skip_lines: int = 0,
- schema: Optional[SchemaDict] = None,
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
- null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
- missing_utf8_is_empty_string: bool = False,
- ignore_errors: bool = False,
- try_parse_dates: bool = False,
- infer_schema: bool = True,
- infer_schema_length: Optional[int] = 100,
- n_rows: Optional[int] = None,
- encoding: CsvEncoding = 'utf8',
- low_memory: bool = False,
- rechunk: bool = False,
- storage_options: Optional[Dict[str, Any]] = None,
- skip_rows_after_header: int = 0,
- row_index_name: Optional[str] = None,
- row_index_offset: int = 0,
- eol_char: str = '\n',
- raise_if_empty: bool = True,
- truncate_ragged_lines: bool = False,
- decimal_comma: bool = False,
- glob: bool = True,
- cache: bool = True,
- with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
- **other_options: Any
- ) -> FlowFrame:
- """
- Read a CSV file into a FlowFrame.
-
- This function uses the native FlowGraph implementation when the parameters
- fall within the supported range, and falls back to using Polars' scan_csv implementation
- for more advanced features.
-
- Args:
- source: Path(s) to CSV file(s), or a file-like object.
- flow_graph: if you want to add it to an existing graph
- separator: Single byte character to use as separator in the file.
- convert_to_absolute_path: If the path needs to be set to a fixed location
- description: if you want to add a readable name in the frontend (advised)
-
- # Polars.scan_csv aligned parameters
- has_header: Indicate if the first row of the dataset is a header or not.
- new_columns: Rename columns after selection.
- comment_prefix: String that indicates a comment line if found at beginning of line.
- quote_char: Character used for quoting. None to disable.
- skip_rows: Start reading after this many rows.
- skip_lines: Skip this many lines by newline char only.
- schema: Schema to use when reading the CSV.
- schema_overrides: Schema overrides for specific columns.
- null_values: Values to interpret as null.
- missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
- ignore_errors: Try to keep reading lines if some parsing errors occur.
- try_parse_dates: Try to automatically parse dates.
- infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
- infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
- n_rows: Stop reading after this many rows.
- encoding: Character encoding to use.
- low_memory: Reduce memory usage at the cost of performance.
- rechunk: Ensure data is in contiguous memory layout after parsing.
- storage_options: Options for fsspec for cloud storage.
- skip_rows_after_header: Skip rows after header.
- row_index_name: Name of the row index column.
- row_index_offset: Start value for the row index.
- eol_char: End of line character.
- raise_if_empty: Raise error if file is empty.
- truncate_ragged_lines: Truncate lines with too many values.
- decimal_comma: Parse floats with decimal comma.
- glob: Use glob pattern for file path (if source is a string).
- cache: Cache the result after reading (Polars default True).
- with_column_names: Apply a function over the column names.
- other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
-
- Returns:
- A FlowFrame with the CSV data.
- """
- node_id = generate_node_id() # Assuming generate_node_id is defined
- if flow_graph is None:
- flow_graph = create_flow_graph() # Assuming create_flow_graph is defined
- flow_id = flow_graph.flow_id
-
- current_source_path_for_native = None
- if isinstance(source, (str, os.PathLike)):
- current_source_path_for_native = str(source)
- if '~' in current_source_path_for_native:
- current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
- elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
- current_source_path_for_native = str(source[0]) if source else None
- if current_source_path_for_native and '~' in current_source_path_for_native:
- current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
- elif isinstance(source, (io.BytesIO, io.StringIO)):
- logger.warning("Read from bytes io from csv not supported, converting data to raw data")
- return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
- actual_infer_schema_length: Optional[int]
- if not infer_schema:
- actual_infer_schema_length = 0
- else:
- actual_infer_schema_length = infer_schema_length
- can_use_native = (
- current_source_path_for_native is not None and
- comment_prefix is None and
- skip_lines == 0 and
- schema is None and
- schema_overrides is None and
- null_values is None and
- not missing_utf8_is_empty_string and
- not try_parse_dates and
- n_rows is None and
- not low_memory and
- not rechunk and
- storage_options is None and
- skip_rows_after_header == 0 and
- row_index_name is None and
- row_index_offset == 0 and
- eol_char == '\n' and
- not decimal_comma and
- new_columns is None and
- glob is True
- )
- if can_use_native and current_source_path_for_native:
- received_table = input_schema.ReceivedTable(
- file_type='csv',
- path=current_source_path_for_native,
- name=Path(current_source_path_for_native).name,
- delimiter=separator,
- has_headers=has_header,
- encoding=encoding,
- starting_from_line=skip_rows,
- quote_char=quote_char if quote_char is not None else '"',
- infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
- truncate_ragged_lines=truncate_ragged_lines,
- ignore_errors=ignore_errors,
- row_delimiter=eol_char
- )
- if convert_to_absolute_path:
- try:
- received_table.set_absolute_filepath()
- received_table.path = received_table.abs_file_path
- except Exception as e:
- print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
+ def __bool__(self):
+ """This special method determines how the object behaves in boolean contexts.
+ Returns True if the FlowFrame contains any data, False otherwise."""
+ return bool(self.data)
 
- read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
- read_node = input_schema.NodeRead(
- flow_id=flow_id,
- node_id=node_id,
- received_file=received_table,
- pos_x=100,
- pos_y=100,
- is_setup=True,
- description=read_node_description
- )
- flow_graph.add_read(read_node)
- result_frame = FlowFrame(
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
- flow_graph=flow_graph,
- node_id=node_id
- )
- return result_frame
- else:
- polars_source_arg = source
- polars_code = _build_polars_code_args(
- source=polars_source_arg,
- separator=separator,
- has_header=has_header,
- new_columns=new_columns,
- comment_prefix=comment_prefix,
- quote_char=quote_char,
- skip_rows=skip_rows,
- skip_lines=skip_lines,
- schema=schema,
- schema_overrides=schema_overrides,
- null_values=null_values,
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
- ignore_errors=ignore_errors,
- try_parse_dates=try_parse_dates,
- infer_schema_length=actual_infer_schema_length,
- n_rows=n_rows,
- encoding=encoding,
- low_memory=low_memory,
- rechunk=rechunk,
- storage_options=storage_options,
- skip_rows_after_header=skip_rows_after_header,
- row_index_name=row_index_name,
- row_index_offset=row_index_offset,
- eol_char=eol_char,
- raise_if_empty=raise_if_empty,
- truncate_ragged_lines=truncate_ragged_lines,
- decimal_comma=decimal_comma,
- glob=glob,
- cache=cache,
- with_column_names=with_column_names,
- **other_options
- )
- polars_code_node_description = description or "Read CSV with Polars scan_csv"
- if isinstance(source, (str, os.PathLike)):
- polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
- elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
- polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
-
- # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
- polars_code_settings = input_schema.NodePolarsCode(
- flow_id=flow_id,
- node_id=node_id,
- polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
- is_setup=True,
- description=polars_code_node_description
- )
- flow_graph.add_polars_code(polars_code_settings)
- return FlowFrame(
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
- flow_graph=flow_graph,
- node_id=node_id,
- )
-
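The removed `read_csv` is worth reading as a dispatcher: when the source is a real path and every advanced option is at its default (the long `can_use_native` conjunction), it builds a native FlowGraph read node; otherwise it generates `pl.scan_csv` code for a polars-code node; BytesIO/StringIO sources are eagerly materialized via `from_dict`. The gate, condensed to its shape (hypothetical helper, not the package's API):

    def choose_backend(path, **overrides) -> str:
        defaults = {"schema": None, "n_rows": None, "new_columns": None}
        all_default = all(overrides.get(k, v) == v for k, v in defaults.items())
        if path is None:                       # file-like source: read eagerly
            return "materialize-eagerly"
        return "native-read-node" if all_default else "generated-scan_csv-code"

    print(choose_backend("data.csv"))              # native-read-node
    print(choose_backend("data.csv", n_rows=100))  # generated-scan_csv-code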
- def _build_polars_code_args(
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
- separator: str,
- has_header: bool,
- new_columns: Optional[List[str]],
- comment_prefix: Optional[str],
- quote_char: Optional[str],
- skip_rows: int,
- skip_lines: int,
- schema: Optional[SchemaDict],
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
- null_values: Optional[Union[str, List[str], Dict[str, str]]],
- missing_utf8_is_empty_string: bool,
- ignore_errors: bool,
- try_parse_dates: bool,
- infer_schema_length: Optional[int],
- n_rows: Optional[int],
- encoding: CsvEncoding,
- low_memory: bool,
- rechunk: bool,
- storage_options: Optional[Dict[str, Any]],
- skip_rows_after_header: int,
- row_index_name: Optional[str],
- row_index_offset: int,
- eol_char: str,
- raise_if_empty: bool,
- truncate_ragged_lines: bool,
- decimal_comma: bool,
- glob: bool,
- cache: bool,
- with_column_names: Optional[Callable[[List[str]], List[str]]],
- **other_options: Any
- ) -> str:
- source_repr: str
- if isinstance(source, (str, Path)):
- source_repr = repr(str(source))
- elif isinstance(source, list):
- source_repr = repr([str(p) for p in source])
- elif isinstance(source, bytes):
- source_repr = "source_bytes_obj"
- elif hasattr(source, 'read'):
- source_repr = "source_file_like_obj"
- else:
- source_repr = repr(source)
-
- param_mapping = {
- 'has_header': (True, lambda x: str(x)),
- 'separator': (',', lambda x: repr(str(x))),
- 'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
- 'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
- 'skip_rows': (0, str),
- 'skip_lines': (0, str),
- 'schema': (None, lambda x: repr(x) if x is not None else 'None'),
- 'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
- 'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
- 'missing_utf8_is_empty_string': (False, str),
- 'ignore_errors': (False, str),
- 'cache': (True, str),
- 'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
- 'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
- 'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
- 'encoding': ('utf8', lambda x: repr(str(x))),
- 'low_memory': (False, str),
- 'rechunk': (False, str),
- 'skip_rows_after_header': (0, str),
- 'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
- 'row_index_offset': (0, str),
- 'try_parse_dates': (False, str),
- 'eol_char': ('\n', lambda x: repr(str(x))),
- 'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
- 'raise_if_empty': (True, str),
- 'truncate_ragged_lines': (False, str),
- 'decimal_comma': (False, str),
- 'glob': (True, str),
- 'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
- }
-
- all_vars = locals()
- kwargs_list = []
-
- for param_name_key, (default_value, format_func) in param_mapping.items():
- value = all_vars.get(param_name_key)
- formatted_value = format_func(value)
- kwargs_list.append(f"{param_name_key}={formatted_value}")
-
- if other_options:
- for k, v in other_options.items():
- kwargs_list.append(f"{k}={repr(v)}")
-
- kwargs_str = ",\n ".join(kwargs_list)
-
- if kwargs_str:
- polars_code = f"output_df = pl.scan_csv(\n {source_repr},\n {kwargs_str}\n)"
- else:
- polars_code = f"output_df = pl.scan_csv({source_repr})"
-
- return polars_code
-
-
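`_build_polars_code_args` was a small source-code generator: a table mapping each parameter to a `(default, formatter)` pair, where the formatter turns the runtime value back into Python source, and the results are joined into a single `pl.scan_csv(...)` call string. The pattern in miniature (illustrative names only); note that, like the original, it emits defaults too rather than only overrides:

    param_mapping = {
        "separator": (",", lambda x: repr(str(x))),
        "has_header": (True, str),
        "n_rows": (None, lambda x: str(x) if x is not None else "None"),
    }

    def build_call(source, **params) -> str:
        kwargs = [
            f"{name}={fmt(params.get(name, default))}"
            for name, (default, fmt) in param_mapping.items()
        ]
        return f"output_df = pl.scan_csv({source!r}, {', '.join(kwargs)})"

    print(build_call("data.csv", separator=";"))
    # output_df = pl.scan_csv('data.csv', separator=';', has_header=True, n_rows=None)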
- def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
- convert_to_absolute_path: bool = True, **options) -> FlowFrame:
- """
- Read a Parquet file into a FlowFrame.
-
- Args:
- file_path: Path to Parquet file
- flow_graph: if you want to add it to an existing graph
- description: if you want to add a readable name in the frontend (advised)
- convert_to_absolute_path: If the path needs to be set to a fixed location
- **options: Options for polars.read_parquet
-
- Returns:
- A FlowFrame with the Parquet data
- """
- if '~' in file_path:
- file_path = os.path.expanduser(file_path)
- node_id = generate_node_id()
-
- if flow_graph is None:
- flow_graph = create_flow_graph()
-
- flow_id = flow_graph.flow_id
-
- received_table = input_schema.ReceivedTable(
- file_type='parquet',
- path=file_path,
- name=Path(file_path).name,
- )
- if convert_to_absolute_path:
- received_table.path = received_table.abs_file_path
-
- read_node = input_schema.NodeRead(
- flow_id=flow_id,
- node_id=node_id,
- received_file=received_table,
- pos_x=100,
- pos_y=100,
- is_setup=True,
- description=description
- )
-
- flow_graph.add_read(read_node)
-
- return FlowFrame(
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
- flow_graph=flow_graph,
- node_id=node_id
- )
-
-
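Unlike `read_csv`, the removed `read_parquet` took the native path unconditionally: it always built a ReceivedTable/NodeRead pair, and the `**options` it accepted were never forwarded anywhere, a dead parameter visible in the body above. Its call shape was simply (hypothetical path):

    frame = read_parquet("data/sales.parquet", description="load sales")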
- def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
- """
- Create a FlowFrame from a dictionary or list of dictionaries.
-
- Args:
- data: Dictionary of lists or list of dictionaries
- flow_graph: if you want to add it to an existing graph
- description: if you want to add a readable name in the frontend (advised)
- Returns:
- A FlowFrame with the data
- """
- # Create new node ID
- node_id = generate_node_id()
-
- if not flow_graph:
- flow_graph = create_flow_graph()
- flow_id = flow_graph.flow_id
-
- input_node = input_schema.NodeManualInput(
- flow_id=flow_id,
- node_id=node_id,
- raw_data=FlowDataEngine(data).to_pylist(),
- pos_x=100,
- pos_y=100,
- is_setup=True,
- description=description
- )
-
- # Add to graph
- flow_graph.add_manual_input(input_node)
-
- # Return new frame
- return FlowFrame(
- data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
- flow_graph=flow_graph,
- node_id=node_id
- )
-
-
- def concat(frames: List['FlowFrame'],
- how: str = 'vertical',
- rechunk: bool = False,
- parallel: bool = True,
- description: str = None) -> 'FlowFrame':
- """
- Concatenate multiple FlowFrames into one.
-
- Parameters
- ----------
- frames : List[FlowFrame]
- List of FlowFrames to concatenate
- how : str, default 'vertical'
- How to combine the FlowFrames (see concat method documentation)
- rechunk : bool, default False
- Whether to ensure contiguous memory in result
- parallel : bool, default True
- Whether to use parallel processing for the operation
- description : str, optional
- Description of this operation
-
- Returns
- -------
- FlowFrame
- A new FlowFrame with the concatenated data
- """
- if not frames:
- raise ValueError("No frames provided to concat_frames")
-
- if len(frames) == 1:
- return frames[0]
-
- # Use first frame's concat method with remaining frames
- first_frame = frames[0]
- remaining_frames = frames[1:]
-
- return first_frame.concat(remaining_frames, how=how,
- rechunk=rechunk, parallel=parallel,
- description=description)
-
-
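The module-level `concat` was a thin reduction to the first frame's own `concat` method, with two early exits: an empty list raises (the error message still said "concat_frames", apparently a leftover from an earlier name), and a single frame is returned untouched. Usage, per the removed signature, assuming `frame_a` and `frame_b` are FlowFrames:

    combined = concat([frame_a, frame_b], how="vertical", rechunk=False)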
- def scan_csv(
- source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
- *,
- flow_graph: Optional[Any] = None, # Using Any for FlowGraph placeholder
- separator: str = ',',
- convert_to_absolute_path: bool = True,
- description: Optional[str] = None,
- has_header: bool = True,
- new_columns: Optional[List[str]] = None,
- comment_prefix: Optional[str] = None,
- quote_char: Optional[str] = '"',
- skip_rows: int = 0,
- skip_lines: int = 0,
- schema: Optional[SchemaDict] = None,
- schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
- null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
- missing_utf8_is_empty_string: bool = False,
- ignore_errors: bool = False,
- try_parse_dates: bool = False,
- infer_schema: bool = True,
- infer_schema_length: Optional[int] = 100,
- n_rows: Optional[int] = None,
- encoding: CsvEncoding = 'utf8',
- low_memory: bool = False,
- rechunk: bool = False,
- storage_options: Optional[Dict[str, Any]] = None,
- skip_rows_after_header: int = 0,
- row_index_name: Optional[str] = None,
- row_index_offset: int = 0,
- eol_char: str = '\n',
- raise_if_empty: bool = True,
- truncate_ragged_lines: bool = False,
- decimal_comma: bool = False,
- glob: bool = True,
- cache: bool = True,
- with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
- **other_options: Any
- ) -> FlowFrame:
- """
- Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
+ @staticmethod
+ def _comparison_error(operator: str) -> pl.lazyframe.frame.NoReturn:
+ msg = f'"{operator!r}" comparison not supported for LazyFrame objects'
+ raise TypeError(msg)
 
- This method is the same as read_csv but is provided for compatibility with
- the polars API where scan_csv returns a LazyFrame.
+ def __eq__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error("==")
 
- See read_csv for full documentation.
- """
- return read_csv(
- source=source,
- flow_graph=flow_graph,
- separator=separator,
- convert_to_absolute_path=convert_to_absolute_path,
- description=description,
- has_header=has_header,
- new_columns=new_columns,
- comment_prefix=comment_prefix,
- quote_char=quote_char,
- skip_rows=skip_rows,
- skip_lines=skip_lines,
- schema=schema,
- schema_overrides=schema_overrides,
- null_values=null_values,
- missing_utf8_is_empty_string=missing_utf8_is_empty_string,
- ignore_errors=ignore_errors,
- try_parse_dates=try_parse_dates,
- infer_schema=infer_schema,
- infer_schema_length=infer_schema_length,
- n_rows=n_rows,
- encoding=encoding,
- low_memory=low_memory,
- rechunk=rechunk,
- storage_options=storage_options,
- skip_rows_after_header=skip_rows_after_header,
- row_index_name=row_index_name,
- row_index_offset=row_index_offset,
- eol_char=eol_char,
- raise_if_empty=raise_if_empty,
- truncate_ragged_lines=truncate_ragged_lines,
- decimal_comma=decimal_comma,
- glob=glob,
- cache=cache,
- with_column_names=with_column_names,
- **other_options
- )
+ def __ne__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error("!=")
 
+ def __gt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error(">")
 
- def scan_parquet(
- file_path,
- *,
- flow_graph: FlowGraph = None,
- description: str = None,
- convert_to_absolute_path: bool = True,
- **options
- ) -> FlowFrame:
- """
- Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
+ def __lt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error("<")
 
- This method is the same as read_parquet but is provided for compatibility with
- the polars API where scan_parquet returns a LazyFrame.
+ def __ge__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error(">=")
 
- See read_parquet for full documentation.
- """
- return read_parquet(
- file_path=file_path,
- flow_graph=flow_graph,
- description=description,
- convert_to_absolute_path=convert_to_absolute_path,
- **options
- )
+ def __le__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+ self._comparison_error("<=")
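The comparison dunders added at the end mirror Polars' LazyFrame semantics: equality and ordering operators raise immediately instead of silently falling back to object identity, pushing comparisons down to the expression level. One general Python side effect worth knowing: overriding `__eq__` without redefining `__hash__` also makes instances unhashable. Expected behavior, assuming `frame` is a FlowFrame:

    try:
        frame == frame
    except TypeError as exc:
        print(exc)  # "'=='" comparison not supported for LazyFrame objects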