Flowfile 0.3.1.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (100)
  1. flowfile/__init__.py +2 -1
  2. flowfile/api.py +5 -3
  3. flowfile/web/__init__.py +3 -0
  4. flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
  5. flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
  6. flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
  7. flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
  8. flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
  9. flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
  10. flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
  11. flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
  12. flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
  13. flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
  14. flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
  15. flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
  18. flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
  19. flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
  20. flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
  22. flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
  23. flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
  24. flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
  26. flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
  27. flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
  28. flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
  29. flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
  31. flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
  33. flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
  34. flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
  35. flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
  37. flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
  38. flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
  39. flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
  40. flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
  41. flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
  42. flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
  43. flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
  44. flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
  45. flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
  49. flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -3
  52. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +97 -88
  53. flowfile_core/configs/__init__.py +15 -4
  54. flowfile_core/configs/node_store/nodes.py +2 -4
  55. flowfile_core/configs/settings.py +5 -3
  56. flowfile_core/configs/utils.py +18 -0
  57. flowfile_core/flowfile/FlowfileFlow.py +84 -29
  58. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +55 -18
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  61. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  62. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +34 -2
  63. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  64. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  65. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  66. flowfile_core/flowfile/flow_graph_utils.py +320 -0
  67. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  68. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  69. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
  70. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  71. flowfile_core/flowfile/utils.py +34 -3
  72. flowfile_core/main.py +2 -3
  73. flowfile_core/routes/secrets.py +1 -1
  74. flowfile_core/schemas/input_schema.py +12 -14
  75. flowfile_core/schemas/transform_schema.py +25 -47
  76. flowfile_frame/__init__.py +11 -4
  77. flowfile_frame/adding_expr.py +280 -0
  78. flowfile_frame/config.py +9 -0
  79. flowfile_frame/expr.py +301 -83
  80. flowfile_frame/expr.pyi +2174 -0
  81. flowfile_frame/expr_name.py +258 -0
  82. flowfile_frame/flow_frame.py +616 -627
  83. flowfile_frame/flow_frame.pyi +336 -0
  84. flowfile_frame/flow_frame_methods.py +617 -0
  85. flowfile_frame/group_frame.py +89 -42
  86. flowfile_frame/join.py +1 -2
  87. flowfile_frame/lazy.py +704 -0
  88. flowfile_frame/lazy_methods.py +201 -0
  89. flowfile_frame/list_name_space.py +324 -0
  90. flowfile_frame/selectors.py +3 -0
  91. flowfile_frame/series.py +70 -0
  92. flowfile_frame/utils.py +80 -4
  93. flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
  94. flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
  95. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
  96. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
  99. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  100. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
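The line-by-line hunks below are from flowfile_frame/flow_frame.py (entry 82 above), the largest rewrite in this release. One rename from the list worth flagging for consumers: flowfile_core's secrets module moved (entries 99 and 100). A minimal compatibility sketch, assuming the names exported by the module are unchanged by the rename (the alias secrets_module is ours, not the package's):

# Compatibility sketch (illustrative, not from the release):
try:
    from flowfile_core.secret_manager import secret_manager as secrets_module  # 0.3.3+
except ImportError:
    from flowfile_core.secrets import secrets as secrets_module  # <= 0.3.1.2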
@@ -1,14 +1,17 @@
- import uuid
+ import inspect
  import os
- from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
- from pathlib import Path
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable, get_args, get_origin

  import re
+
  import polars as pl
- from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation

- # Assume these imports are correct from your original context
+ from flowfile_frame.lazy_methods import add_lazyframe_methods
+
+ from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
+ from collections.abc import Iterator
  from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
+ from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
  from flowfile_core.schemas import input_schema, transform_schema
@@ -16,12 +19,36 @@ from flowfile_core.schemas import input_schema, transform_schema
  from flowfile_frame.expr import Expr, Column, lit, col
  from flowfile_frame.selectors import Selector
  from flowfile_frame.group_frame import GroupByFrame
- from flowfile_frame.utils import _parse_inputs_as_iterable, create_flow_graph
+ from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
+                                   ensure_inputs_as_iterable)
  from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
+ from flowfile_frame.utils import _check_if_convertible_to_code
+ from flowfile_frame.config import logger
+

  node_id_counter = 0


+ def can_be_expr(param: inspect.Parameter) -> bool:
+     """Check if a parameter can be of type pl.Expr"""
+     if param.annotation == inspect.Parameter.empty:
+         return False
+
+     # Check direct match or in Union args
+     types = get_args(param.annotation) if get_origin(param.annotation) is Union else [param.annotation]
+     return any(t in (pl.Expr, pl.expr.expr.Expr) for t in types)
+
+
+ def _contains_lambda_pattern(text: str) -> bool:
+     return "<lambda> at" in text
+
+
+ def get_method_name_from_code(code: str) -> str | None:
+     split_code = code.split("input_df.")
+     if len(split_code) > 1:
+         return split_code[1].split("(")[0]
+
+
  def _to_string_val(v) -> str:
      if isinstance(v, str):
          return f"'{v}'"
@@ -29,12 +56,72 @@
      return v


+ def _extract_expr_parts(expr_obj) -> tuple[str, str]:
+     """
+     Extract the pure expression string and any raw definitions (including function sources) from an Expr object.
+
+     Parameters
+     ----------
+     expr_obj : Expr
+         The expression object to extract parts from
+
+     Returns
+     -------
+     tuple[str, str]
+         A tuple of (pure_expr_str, raw_definitions_str)
+     """
+     if not isinstance(expr_obj, Expr):
+         # If it's not an Expr, just return its string representation
+         return str(expr_obj), ""
+
+     # Get the basic representation
+     pure_expr_str = expr_obj._repr_str
+
+     # Collect all definitions (function sources)
+     raw_definitions = []
+
+     # Add function sources if any
+     if hasattr(expr_obj, '_function_sources') and expr_obj._function_sources:
+         # Remove duplicates while preserving order
+         unique_sources = []
+         seen = set()
+         for source in expr_obj._function_sources:
+             if source not in seen:
+                 seen.add(source)
+                 unique_sources.append(source)
+
+         if unique_sources:
+             raw_definitions.extend(unique_sources)
+
+     # Join all definitions
+     raw_defs_str = "\n\n".join(raw_definitions) if raw_definitions else ""
+
+     return pure_expr_str, raw_defs_str
+
+
+ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr | None = None,
+                                 group_expr: pl.Expr | None = None) -> None:
+     if method_name is None:
+         raise NotImplemented("Cannot create a polars lambda expression without the method")
+     if polars_expr is None:
+         raise NotImplemented("Cannot create polars expressions with lambda function")
+     method_ref = getattr(pl.LazyFrame, method_name)
+     if method_ref is None:
+         raise ModuleNotFoundError(f"Could not find the method {method_name} in polars lazyframe")
+     if method_name == 'group_by':
+         if group_expr is None:
+             raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
+         if not all(isinstance(ge, pl.Expr) for ge in group_expr):
+             raise NotImplemented("Cannot create a polars lambda expression without the groupby expression")
+
+
  def generate_node_id() -> int:
      global node_id_counter
      node_id_counter += 1
      return node_id_counter


+ @add_lazyframe_methods
  class FlowFrame:
      """Main class that wraps FlowDataEngine and maintains the ETL graph."""
      flow_graph: FlowGraph
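A quick illustration of the new module-level helpers above (the apply_fn function is our example, not package code):

# Sketch: behaviour of the helpers defined in the hunk above.
import inspect
import polars as pl

def apply_fn(expr: pl.Expr, times: int) -> pl.Expr:
    return expr * times

params = inspect.signature(apply_fn).parameters
can_be_expr(params["expr"])    # True: annotated directly as pl.Expr
can_be_expr(params["times"])   # False: int is not pl.Expr or a Union containing it
get_method_name_from_code("input_df.sort(pl.col('a'))")  # -> "sort"
get_method_name_from_code("pl.concat(frames)")           # -> None (no "input_df." prefix)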
@@ -89,13 +176,11 @@
          # Extract flow-specific parameters
          node_id = node_id or generate_node_id()
          description = "Data imported from Python object"
-
          # Create a new flow graph if none is provided
          if flow_graph is None:
              flow_graph = create_flow_graph()

          flow_id = flow_graph.flow_id
-
          # Convert data to a polars DataFrame/LazyFrame
          try:
              # Use polars to convert from various types
@@ -110,25 +195,23 @@
              )
              pl_data = pl_df.lazy()
          except Exception as e:
-             raise ValueError(f"Could not convert data to a polars DataFrame: {e}")
-
+             raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
          # Create a FlowDataEngine to get data in the right format for manual input
          flow_table = FlowDataEngine(raw_data=pl_data)
-
+         raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
+                                                columns=[c.get_minimal_field_info() for c in flow_table.schema])
          # Create a manual input node
          input_node = input_schema.NodeManualInput(
              flow_id=flow_id,
              node_id=node_id,
-             raw_data=flow_table.to_pylist(),  # Convert to list of dicts
+             raw_data_format=raw_data_format,
              pos_x=100,
              pos_y=100,
              is_setup=True,
              description=description,
          )
-
          # Add to graph
          flow_graph.add_manual_input(input_node)
-
          # Return new frame
          return FlowFrame(
              data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
@@ -152,7 +235,6 @@
          parent_node_id=None,
      ):
          """Create a new FlowFrame instance."""
-
          # If data is not a LazyFrame, use the factory method
          if data is not None and not isinstance(data, pl.LazyFrame):
              return cls.create_from_any_type(
@@ -168,7 +250,6 @@
                  parent_node_id=parent_node_id,
              )

-         # Otherwise create the instance normally
          instance = super().__new__(cls)
          return instance

@@ -187,7 +268,6 @@
          parent_node_id=None,
      ):
          """Initialize the FlowFrame with data and graph references."""
-
          if data is None:
              data = pl.LazyFrame()
          if not isinstance(data, pl.LazyFrame):
@@ -219,205 +299,235 @@
      def _create_child_frame(self, new_node_id):
          """Helper method to create a new FlowFrame that's a child of this one"""
          self._add_connection(self.node_id, new_node_id)
-         return FlowFrame(
-             data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
-             flow_graph=self.flow_graph,
-             node_id=new_node_id,
-             parent_node_id=self.node_id,
-         )
+         try:
+             return FlowFrame(
+                 data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
+                 flow_graph=self.flow_graph,
+                 node_id=new_node_id,
+                 parent_node_id=self.node_id,
+             )
+         except AttributeError:
+             raise ValueError('Could not execute the function')

-     def sort(
-         self,
-         by: List[Expr | str] | Expr | str,
-         *more_by,
-         descending: bool | List[bool] = False,
-         nulls_last: bool = False,
-         multithreaded: bool = True,
-         maintain_order: bool = False,
-         description: str = None,
-     ):
+     @staticmethod
+     def _generate_sort_polars_code(
+         pure_sort_expr_strs: List[str],
+         descending_values: List[bool],
+         nulls_last_values: List[bool],
+         multithreaded: bool,
+         maintain_order: bool,
+     ) -> str:
          """
-         Sort the dataframe by the given columns.
+         Generates the `input_df.sort(...)` Polars code string using pure expression strings.
+         """
+         kwargs_for_code: Dict[str, Any] = {}
+         if any(descending_values):
+             kwargs_for_code["descending"] = descending_values[0] if len(descending_values) == 1 else descending_values
+         if any(nulls_last_values):
+             kwargs_for_code["nulls_last"] = nulls_last_values[0] if len(nulls_last_values) == 1 else nulls_last_values
+         if not multithreaded:
+             kwargs_for_code["multithreaded"] = multithreaded
+         if maintain_order:
+             kwargs_for_code["maintain_order"] = maintain_order

-         Parameters:
-         -----------
-         by : Expr, str, or list of Expr/str
-             Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
-         *more_by : Expr or str
-             Additional columns to sort by, specified as positional arguments.
-         descending : bool or list of bool, default False
-             Sort in descending order. When sorting by multiple columns, can be specified per column.
-         nulls_last : bool or list of bool, default False
-             Place null values last; can specify a single boolean or a sequence for per-column control.
-         multithreaded : bool, default True
-             Sort using multiple threads.
-         maintain_order : bool, default False
-             Whether the order should be maintained if elements are equal.
-         description : str, optional
-             Description of this operation for the ETL graph.
+         kwargs_str_for_code = ", ".join(f"{k}={repr(v)}" for k, v in kwargs_for_code.items())

-         Returns:
-         --------
-         FlowFrame
-             A new FlowFrame with sorted data.
+         by_arg_for_code = pure_sort_expr_strs[0] if len(
+             pure_sort_expr_strs) == 1 else f"[{', '.join(pure_sort_expr_strs)}]"
+         return f"input_df.sort({by_arg_for_code}{', ' + kwargs_str_for_code if kwargs_str_for_code else ''})"
+
+     def sort(
+         self,
+         by: Union[List[Union[Expr, str]], Expr, str],
+         *more_by: Union[Expr, str],
+         descending: Union[bool, List[bool]] = False,
+         nulls_last: Union[bool, List[bool]] = False,
+         multithreaded: bool = True,
+         maintain_order: bool = False,
+         description: Optional[str] = None,
+     ) -> "FlowFrame":
+         """
+         Sort the dataframe by the given columns.
          """
-         by = list(_parse_inputs_as_iterable((by,)))
+         initial_by_args = list(_parse_inputs_as_iterable((by,)))
          new_node_id = generate_node_id()
-         sort_expressions = by
+
+         sort_expressions_input: list = initial_by_args
          if more_by:
-             sort_expressions.extend(more_by)
+             sort_expressions_input.extend(list(_parse_inputs_as_iterable(more_by)))

-         # Determine if we need to use polars code fallback
-         needs_polars_code = False
+         all_processed_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_sort: List[str] = []
+         collected_raw_definitions: List[str] = []
+         column_names_for_native_node: List[str] = []

-         # Check for any expressions that are not simple columns
-         for expr in sort_expressions:
-             if not isinstance(expr, (str, Column)) or (
-                 isinstance(expr, Column) and expr._select_input.is_altered
-             ):
-                 needs_polars_code = True
-                 break
+         use_polars_code_path = False

-         # Also need polars code if we're using maintain_order or multithreaded params
          if maintain_order or not multithreaded:
-             needs_polars_code = True
-
-         # Standardize descending parameter
-         if isinstance(descending, (list, tuple)):
-             # Ensure descending list has the same length as sort_expressions
-             if len(descending) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of descending ({len(descending)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             descending_values = descending
-         else:
-             descending_values = [descending] * len(sort_expressions)
-
-         # Standardize nulls_last parameter
-         if isinstance(nulls_last, (list, tuple)):
-             if len(nulls_last) != len(sort_expressions):
-                 raise ValueError(
-                     f"Length of nulls_last ({len(nulls_last)}) must match number of sort columns ({len(sort_expressions)})"
-                 )
-             nulls_last_values = nulls_last
-             # Any non-default nulls_last needs polars code
-             if any(val is not False for val in nulls_last_values):
-                 needs_polars_code = True
-         else:
-             nulls_last_values = [nulls_last] * len(sort_expressions)
-             # Non-default nulls_last needs polars code
-             if nulls_last:
-                 needs_polars_code = True
-
-         if needs_polars_code:
-             # Generate polars code for complex cases
-             code = self._generate_sort_polars_code(
-                 sort_expressions,
-                 descending_values,
-                 nulls_last_values,
-                 multithreaded,
-                 maintain_order,
-             )
-             self._add_polars_code(new_node_id, code, description)
-         else:
-             # Use native implementation for simple cases
-             sort_inputs = []
-             for i, expr in enumerate(sort_expressions):
-                 # Convert expr to column name
-                 if isinstance(expr, Column):
-                     column_name = expr.name
-                 elif isinstance(expr, str):
-                     column_name = expr
+             use_polars_code_path = True
+
+         is_nulls_last_list = isinstance(nulls_last, (list, tuple))
+         if is_nulls_last_list and any(val for val in nulls_last if val is not False):
+             use_polars_code_path = True
+         elif not is_nulls_last_list and nulls_last is not False:
+             use_polars_code_path = True
+
+         for expr_input in sort_expressions_input:
+             current_expr_obj: Expr
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 column_names_for_native_node.append(expr_input)
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column):
+                 current_expr_obj = expr_input
+                 # Type ignore below due to simplified Column stub
+                 if not expr_input._select_input.is_altered:  # type: ignore
+                     column_names_for_native_node.append(expr_input.column_name)  # type: ignore
+                     is_simple_col_for_native = True
                  else:
-                     column_name = str(expr)
+                     use_polars_code_path = True  # Altered Column implies complex expression
+             elif isinstance(expr_input, Expr):
+                 current_expr_obj = expr_input
+                 use_polars_code_path = True  # General Expr implies complex expression
+             else:  # Convert other types to lit
+                 current_expr_obj = lit(expr_input)
+                 use_polars_code_path = True  # Literal might be part of a complex sort for Polars code
+
+             all_processed_expr_objects.append(current_expr_obj)
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+             pure_polars_expr_strings_for_sort.append(pure_expr_str)
+
+             if raw_defs_str:
+                 if raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+                 use_polars_code_path = True
+
+             if not is_simple_col_for_native:  # If it wasn't a simple string or unaltered Column
+                 use_polars_code_path = True
+
+         desc_values = list(descending) if isinstance(descending, list) else [descending] * len(
+             all_processed_expr_objects)
+         null_last_values = list(nulls_last) if isinstance(nulls_last, list) else [nulls_last] * len(
+             all_processed_expr_objects)
+
+         if len(desc_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'descending' does not match the number of sort expressions.")
+         if len(null_last_values) != len(all_processed_expr_objects):
+             raise ValueError("Length of 'nulls_last' does not match the number of sort expressions.")
+
+         if use_polars_code_path:
+             polars_operation_code = self._generate_sort_polars_code(
+                 pure_polars_expr_strings_for_sort, desc_values, null_last_values, multithreaded, maintain_order
+             )

-             # Create SortByInput with appropriate settings
-             sort_inputs.append(
-                 transform_schema.SortByInput(
-                     column=column_name,
-                     how="desc" if descending_values[i] else "asc",
-                 )
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
+             else:
+                 final_code_for_node = polars_operation_code
+
+             pl_expressions_for_fallback = [e.expr for e in all_processed_expr_objects if
+                                            hasattr(e, 'expr') and e.expr is not None]
+             kwargs_for_fallback = {
+                 "descending": desc_values[0] if len(desc_values) == 1 else desc_values,
+                 "nulls_last": null_last_values[0] if len(null_last_values) == 1 else null_last_values,
+                 "multithreaded": multithreaded, "maintain_order": maintain_order}
+
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="sort",
+                                   convertable_to_code=_check_if_convertible_to_code(all_processed_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback,
+                                   kwargs_expr=kwargs_for_fallback)
+         else:
+             sort_inputs_for_node = []
+             for i, col_name_for_native in enumerate(column_names_for_native_node):
+                 sort_inputs_for_node.append(
+                     transform_schema.SortByInput(column=col_name_for_native, how="desc" if desc_values[i] else "asc")
+                     # type: ignore
                  )
-
              sort_settings = input_schema.NodeSort(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 sort_input=sort_inputs,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_id=self.node_id,
-                 description=description
-                 or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
-             )
+                 flow_id=self.flow_graph.flow_id, node_id=new_node_id, sort_input=sort_inputs_for_node,  # type: ignore
+                 pos_x=200, pos_y=150, is_setup=True, depending_on_id=self.node_id,
+                 description=description or f"Sort by {', '.join(column_names_for_native_node)}")
             self.flow_graph.add_sort(sort_settings)

          return self._create_child_frame(new_node_id)

-     def _generate_sort_polars_code(
-         self,
-         sort_expressions: list,
-         descending_values: list,
-         nulls_last_values: list,
-         multithreaded: bool,
-         maintain_order: bool,
-     ) -> str:
-         """Generate Polars code for sort operations that need fallback."""
-         # Format expressions for code
-         expr_strs = []
-         for expr in sort_expressions:
-             if isinstance(expr, (Expr, Column)):
-                 expr_strs.append(str(expr))
-             elif isinstance(expr, str):
-                 expr_strs.append(f"'{expr}'")
-             else:
-                 expr_strs.append(str(expr))
-
-         # Format parameters
-         if len(sort_expressions) == 1:
-             by_arg = expr_strs[0]
-         else:
-             by_arg = f"[{', '.join(expr_strs)}]"
-
-         # Build kwargs
-         kwargs = {}
-
-         # Only add descending if it's non-default
-         if any(d for d in descending_values):
-             if len(descending_values) == 1:
-                 kwargs["descending"] = descending_values[0]
-             else:
-                 kwargs["descending"] = descending_values
-
-         # Only add nulls_last if it's non-default
-         if any(nl for nl in nulls_last_values):
-             if len(nulls_last_values) == 1:
-                 kwargs["nulls_last"] = nulls_last_values[0]
+     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
+                          depending_on_ids: List[str] | None = None, convertable_to_code: bool = True,
+                          method_name: str = None, polars_expr: Expr | List[Expr] | None = None,
+                          group_expr: Expr | List[Expr] | None = None,
+                          kwargs_expr: Dict | None = None,
+                          group_kwargs: Dict | None = None, ):
+         polars_code_for_node: str
+         if not convertable_to_code or _contains_lambda_pattern(code):
+
+             effective_method_name = get_method_name_from_code(
+                 code) if method_name is None and "input_df." in code else method_name
+
+             pl_expr_list = ensure_inputs_as_iterable(polars_expr) if polars_expr is not None else []
+             group_expr_list = ensure_inputs_as_iterable(group_expr) if group_expr is not None else []
+
+             _check_ok_for_serialization(polars_expr=pl_expr_list, method_name=effective_method_name,
+                                         group_expr=group_expr_list)
+
+             current_kwargs_expr = kwargs_expr if kwargs_expr is not None else {}
+             result_lazyframe_or_expr: Any
+
+             if effective_method_name == "group_by":
+                 group_kwargs = {} if group_kwargs is None else group_kwargs
+                 if not group_expr_list:
+                     raise ValueError("group_expr is required for group_by method in serialization fallback.")
+                 target_obj = getattr(self.data, effective_method_name)(*group_expr_list, **group_kwargs)
+                 if not pl_expr_list:
+                     raise ValueError(
+                         "Aggregation expressions (polars_expr) are required for group_by().agg() in serialization fallback.")
+                 result_lazyframe_or_expr = target_obj.agg(*pl_expr_list, **current_kwargs_expr)
+             elif effective_method_name:
+                 result_lazyframe_or_expr = getattr(self.data, effective_method_name)(*pl_expr_list,
+                                                                                      **current_kwargs_expr)
              else:
-                 kwargs["nulls_last"] = nulls_last_values
-
-         # Add other parameters if they're non-default
-         if not multithreaded:
-             kwargs["multithreaded"] = multithreaded
-
-         if maintain_order:
-             kwargs["maintain_order"] = maintain_order
-
-         # Build kwargs string
-         kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
-
-         # Build final code
-         if kwargs_str:
-             return f"input_df.sort({by_arg}, {kwargs_str})"
+                 raise ValueError(
+                     "Cannot execute Polars operation: method_name is missing and could not be inferred for serialization fallback.")
+             try:
+                 if isinstance(result_lazyframe_or_expr, pl.LazyFrame):
+                     serialized_value_for_code = result_lazyframe_or_expr.serialize(format='json')
+                     polars_code_for_node = "\n".join([
+                         f"serialized_value = r'''{serialized_value_for_code}'''",
+                         "buffer = BytesIO(serialized_value.encode('utf-8'))",
+                         "output_df = pl.LazyFrame.deserialize(buffer, format='json')",
+                     ])
+                     logger.warning(
+                         f"Transformation '{effective_method_name}' uses non-serializable elements. "
+                         "Falling back to serializing the resulting Polars LazyFrame object."
+                         "This will result in a breaking graph when using the the ui."
+                     )
+                 else:
+                     logger.error(
+                         f"Fallback for non-convertible code for method '{effective_method_name}' "
+                         f"resulted in a '{type(result_lazyframe_or_expr).__name__}' instead of a Polars LazyFrame. "
+                         "This type cannot be persisted as a LazyFrame node via this fallback."
+                     )
+                     return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
+             except Exception as e:
+                 logger.warning(
+                     f"Critical error: Could not serialize the result of operation '{effective_method_name}' "
+                     f"during fallback for non-convertible code. Error: {e}."
+                     "When using a lambda function, consider defining the function first"
+                 )
+                 return FlowFrame(result_lazyframe_or_expr, flow_graph=self.flow_graph, node_id=new_node_id)
          else:
-             return f"input_df.sort({by_arg})"
-
-     def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
-                          depending_on_ids: List[str] | None = None):
+             polars_code_for_node = code
          polars_code_settings = input_schema.NodePolarsCode(
              flow_id=self.flow_graph.flow_id,
              node_id=new_node_id,
-             polars_code_input=transform_schema.PolarsCodeInput(polars_code=code),
+             polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code_for_node),
              is_setup=True,
              depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
              description=description,
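A worked call of the new static code generator shows which kwargs survive into the generated string (inputs are illustrative):

# Only non-default kwargs are emitted, per the guards in the helper above.
FlowFrame._generate_sort_polars_code(
    pure_sort_expr_strs=["pl.col('a')", "pl.col('b')"],
    descending_values=[True, False],
    nulls_last_values=[False, False],
    multithreaded=True,
    maintain_order=False,
)
# -> "input_df.sort([pl.col('a'), pl.col('b')], descending=[True, False])"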
@@ -458,14 +568,17 @@
          validate : {"1:1", "1:m", "m:1", "m:m"}, optional
              Validate join relationship.
          nulls_equal:
-             Join on null values. By default null values will never produce matches.
+             Join on null values. By default, null values will never produce matches.
          coalesce:
              None: -> join specific.
              True: -> Always coalesce join columns.
              False: -> Never coalesce join columns.
          maintain_order:
-             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly setting this parameter, as your code may break in a future release. Not specifying any ordering can improve performance Supported for inner, left, right and full joins
-             None: No specific ordering is desired. The ordering might differ across Polars versions or even between different runs.
+             Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly
+             setting this parameter, as your code may break in a future release.
+             Not specifying any ordering can improve performance Supported for inner, left, right and full joins
+             None: No specific ordering is desired. The ordering might differ across Polars versions or even between
+             different runs.
              left: Preserves the order of the left DataFrame.
              right: Preserves the order of the right DataFrame.
              left_right: First preserves the order of the left DataFrame, then the right.
@@ -478,14 +591,27 @@
          FlowFrame
              New FlowFrame with join operation applied.
          """
-         new_node_id = generate_node_id()
-         print('new node id', new_node_id)
          use_polars_code = not(maintain_order is None and
                                coalesce is None and
                                nulls_equal is False and
                                validate is None and
                                suffix == '_right')
+
          join_mappings = None
+         if self.flow_graph.flow_id != other.flow_graph.flow_id:
+             combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
+             new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
+             new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
+             if new_other_node_id is None or new_self_node_id is None:
+                 raise ValueError("Cannot remap the nodes")
+             self.node_id = new_self_node_id
+             other.node_id = new_other_node_id
+             self.flow_graph = combined_graph
+             other.flow_graph = combined_graph
+             global node_id_counter
+             node_id_counter += len(combined_graph.nodes)
+         new_node_id = generate_node_id()
+
          if on is not None:
              left_columns = right_columns = _normalize_columns_to_list(on)
          elif left_on is not None and right_on is not None:
@@ -504,10 +630,11 @@
              )
          if not use_polars_code:
              join_mappings, use_polars_code = _create_join_mappings(
-                 left_columns, right_columns
+                 left_columns or [], right_columns or []
              )

          if use_polars_code or suffix != '_right':
+
              _on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
              _left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
              _right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
@@ -527,31 +654,50 @@
              parent_node_id=self.node_id,
          )

-         elif join_mappings:
+         elif join_mappings or how == 'cross':
+
              left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
              right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)

-             join_input = transform_schema.JoinInput(
-                 join_mapping=join_mappings,
-                 left_select=left_select.renames,
-                 right_select=right_select.renames,
-                 how=how,
-             )
+             if how == 'cross':
+                 join_input = transform_schema.CrossJoinInput(left_select=left_select.renames,
+                                                              right_select=right_select.renames,)
+             else:
+                 join_input = transform_schema.JoinInput(
+                     join_mapping=join_mappings,
+                     left_select=left_select.renames,
+                     right_select=right_select.renames,
+                     how=how,
+                 )
+
              join_input.auto_rename()
-             # Create node settings
-             join_settings = input_schema.NodeJoin(
-                 flow_id=self.flow_graph.flow_id,
-                 node_id=new_node_id,
-                 join_input=join_input,
-                 auto_generate_selection=True,
-                 verify_integrity=True,
-                 pos_x=200,
-                 pos_y=150,
-                 is_setup=True,
-                 depending_on_ids=[self.node_id, other.node_id],
-                 description=description or f"Join with {how} strategy",
-             )
-             self.flow_graph.add_join(join_settings)
+             if how == 'cross':
+                 cross_join_settings = input_schema.NodeCrossJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     cross_join_input=join_input,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                 )
+
+                 self.flow_graph.add_cross_join(cross_join_settings)
+             else:
+                 join_settings = input_schema.NodeJoin(
+                     flow_id=self.flow_graph.flow_id,
+                     node_id=new_node_id,
+                     join_input=join_input,
+                     auto_generate_selection=True,
+                     verify_integrity=True,
+                     pos_x=200,
+                     pos_y=150,
+                     is_setup=True,
+                     depending_on_ids=[self.node_id, other.node_id],
+                     description=description or f"Join with {how} strategy",
+                 )
+                 self.flow_graph.add_join(join_settings)
          self._add_connection(self.node_id, new_node_id, "main")
          other._add_connection(other.node_id, new_node_id, "right")
          result_frame = FlowFrame(
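The join hunks above change two behaviours: frames whose flow graphs differ are merged through combine_flow_graphs_with_mapping() with node ids remapped before the join node is wired, and how='cross' now builds a dedicated NodeCrossJoin. A usage sketch, assuming FlowFrame accepts plain Python data as in create_from_any_type above:

# Usage sketch; dict construction per create_from_any_type shown earlier.
left = FlowFrame({"id": [1, 2], "x": ["a", "b"]})     # its own flow graph
right = FlowFrame({"id": [2, 3], "y": [10.0, 20.0]})  # a second flow graph

inner = left.join(right, on="id", how="inner")  # graphs merged, native NodeJoin
pairs = left.join(right, how="cross")           # new NodeCrossJoin path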
@@ -578,40 +724,68 @@
          self.flow_graph.add_record_count(node_number_of_records)
          return self._create_child_frame(new_node_id)

-     def select(self, *columns, description: str = None):
+     def select(self, *columns: Union[str, Expr, Selector], description: Optional[str] = None) -> "FlowFrame":
          """
          Select columns from the frame.
-
-         Args:
-             *columns: Column names or expressions
-             description: Description of the step, this will be shown in the flowfile file
-
-         Returns:
-             A new FlowFrame with selected columns
          """
-         # Create new node ID
-         columns = _parse_inputs_as_iterable(columns)
+         columns_iterable = list(_parse_inputs_as_iterable(columns))
          new_node_id = generate_node_id()
-         existing_columns = self.columns

-         if (len(columns) == 1 and isinstance(columns[0], Expr)
-                 and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
+         if (len(columns_iterable) == 1 and isinstance(columns_iterable[0], Expr)
+                 and str(columns_iterable[0]) == "pl.Expr(len()).alias('number_of_records')"):
              return self._add_number_of_records(new_node_id, description)

-         # Handle simple column names
-         if all(isinstance(col_, (str, Column)) for col_ in columns):
-             # Create select inputs
-             select_inputs = [
-                 transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
-                 for col_ in columns
-             ]
-             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_columns if
-                                c not in [s.old_name for s in select_inputs]]
-             select_inputs.extend(dropped_columns)
+         all_input_expr_objects: List[Expr] = []
+         pure_polars_expr_strings_for_select: List[str] = []
+         collected_raw_definitions: List[str] = []
+         selected_col_names_for_native: List[str] = []  # For native node
+
+         can_use_native_node = True
+
+         if len(columns_iterable) == 1 and isinstance(columns_iterable[0], str) and columns_iterable[0] == '*':
+             effective_columns_iterable = [col(c_name) for c_name in self.columns]
+         else:
+             effective_columns_iterable = columns_iterable
+         for expr_input in effective_columns_iterable:
+             current_expr_obj = expr_input
+             is_simple_col_for_native = False
+
+             if isinstance(expr_input, str):
+                 current_expr_obj = col(expr_input)
+                 selected_col_names_for_native.append(expr_input)
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Column) and not expr_input._select_input.is_altered:  # type: ignore
+                 selected_col_names_for_native.append(expr_input.column_name)  # type: ignore
+                 is_simple_col_for_native = True
+             elif isinstance(expr_input, Selector):  # Selectors imply Polars code path
+                 can_use_native_node = False
+                 # current_expr_obj = expr_input  # Already an Expr-like via selector
+             elif not isinstance(expr_input, Expr):  # Includes Column
+                 current_expr_obj = lit(expr_input)
+
+             all_input_expr_objects.append(current_expr_obj)  # type: ignore
+
+             pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+
+             pure_polars_expr_strings_for_select.append(pure_expr_str)
+             if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                 collected_raw_definitions.append(raw_defs_str)
+
+             if not is_simple_col_for_native and not isinstance(expr_input, Selector):
+                 can_use_native_node = False  # Complex expressions require Polars code
+         if collected_raw_definitions:  # Has to use Polars code if there are definitions
+             can_use_native_node = False
+         if can_use_native_node:
+             select_inputs_for_node = [transform_schema.SelectInput(old_name=name) for name in
+                                       selected_col_names_for_native]
+             existing_cols = self.columns
+             dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_cols if
+                                c not in selected_col_names_for_native]
+             select_inputs_for_node.extend(dropped_columns)
              select_settings = input_schema.NodeSelect(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
-                 select_input=select_inputs,
+                 select_input=select_inputs_for_node,
                  keep_missing=False,
                  pos_x=200,
                  pos_y=100,
@@ -619,60 +793,97 @@
                  is_setup=True,
                  depending_on_id=self.node_id,
                  description=description
              )
-
-             # Add to graph
              self.flow_graph.add_select(select_settings)
-             return self._create_child_frame(new_node_id)
-
          else:
-             readable_exprs = []
-             is_readable: bool = True
-             for col_ in columns:
-                 if isinstance(col_, Expr):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, Selector):
-                     readable_exprs.append(col_)
-                 elif isinstance(col_, pl.expr.Expr):
-                     print('warning this cannot be converted to flowfile frontend. Make sure you use the flowfile expr')
-                     is_readable = False
-                 elif isinstance(col_, str) and col_ in self.columns:
-                     col_expr = Column(col_)
-                     readable_exprs.append(col_expr)
-                 else:
-                     lit_expr = lit(col_)
-                     readable_exprs.append(lit_expr)
-             if is_readable:
-                 code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
+             polars_operation_code = f"input_df.select([{', '.join(pure_polars_expr_strings_for_select)}])"
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 raise ValueError('Not supported')
+                 final_code_for_node = polars_operation_code

-             self._add_polars_code(new_node_id, code, description)
-             return self._create_child_frame(new_node_id)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description,
+                                   method_name="select",
+                                   convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                   polars_expr=pl_expressions_for_fallback)
+
+         return self._create_child_frame(new_node_id)

-     def filter(self, predicate: Expr | Any = None, *, flowfile_formula: str = None, description: str = None):
+     def filter(self, *predicates: Union[Expr, Any], flowfile_formula: Optional[str] = None,
+                description: Optional[str] = None, **constraints: Any) -> "FlowFrame":
          """
          Filter rows based on a predicate.
-
-         Args:
-             predicate: Filter condition
-             flowfile_formula: Native support in frontend
-             description: Description of the step that is performed
-         Returns:
-             A new FlowFrame with filtered rows
          """
+         if (len(predicates) > 0 or len(constraints) > 0) and flowfile_formula:
+             raise ValueError("You can only use one of the following: predicates, constraints or flowfile_formula")
+         available_columns = self.columns
          new_node_id = generate_node_id()
-         # Create new node ID
-         if predicate:
-             # we use for now the fallback on polars code.
-             if isinstance(predicate, Expr):
-                 predicate_expr = predicate
+         if len(predicates) > 0 or len(constraints) > 0:
+             all_input_expr_objects: List[Expr] = []
+             pure_polars_expr_strings: List[str] = []
+             collected_raw_definitions: List[str] = []
+
+             processed_predicates = []
+             for pred_item in predicates:
+                 if isinstance(pred_item, (tuple, list, Iterator)):
+                     # If it's a sequence, extend the processed_predicates with its elements
+                     processed_predicates.extend(list(pred_item))
+                 else:
+                     # Otherwise, just add the item
+                     processed_predicates.append(pred_item)
+
+             for pred_input in processed_predicates:  # Loop over the processed_predicates
+                 # End of the new/modified section
+                 current_expr_obj = None  # Initialize current_expr_obj
+                 if isinstance(pred_input, Expr):
+                     current_expr_obj = pred_input
+                 elif isinstance(pred_input, str) and pred_input in available_columns:
+                     current_expr_obj = col(pred_input)
+                 else:
+                     current_expr_obj = lit(pred_input)
+
+                 all_input_expr_objects.append(current_expr_obj)
+
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                     collected_raw_definitions.append(raw_defs_str)
+
+             for k, v_val in constraints.items():
+                 constraint_expr_obj = (col(k) == lit(v_val))
+                 all_input_expr_objects.append(constraint_expr_obj)
+                 pure_expr_str, raw_defs_str = _extract_expr_parts(
+                     constraint_expr_obj)  # Constraint exprs are unlikely to have defs
+                 pure_polars_expr_strings.append(f"({pure_expr_str})")
+                 if raw_defs_str and raw_defs_str not in collected_raw_definitions:  # Should be rare here
+                     collected_raw_definitions.append(raw_defs_str)
+
+             filter_conditions_str = " & ".join(pure_polars_expr_strings) if pure_polars_expr_strings else "pl.lit(True)"
+             polars_operation_code = f"input_df.filter({filter_conditions_str})"
+
+             final_code_for_node: str
+             if collected_raw_definitions:
+                 unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))  # Order-preserving unique
+                 definitions_section = "\n\n".join(unique_raw_definitions)
+                 final_code_for_node = definitions_section + \
+                                       "\n#─────SPLIT─────\n\n" + \
+                                       f"output_df = {polars_operation_code}"
              else:
-                 predicate_expr = lit(predicate)
-             code = f"input_df.filter({str(predicate_expr)})"
-             self._add_polars_code(new_node_id, code, description)
-
+                 final_code_for_node = polars_operation_code
+
+             convertable_to_code = _check_if_convertible_to_code(all_input_expr_objects)
+             pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                            isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+             self._add_polars_code(new_node_id, final_code_for_node, description, method_name="filter",
+                                   convertable_to_code=convertable_to_code,
+                                   polars_expr=pl_expressions_for_fallback)
          elif flowfile_formula:
-             # Create node settings
              filter_settings = input_schema.NodeFilter(
                  flow_id=self.flow_graph.flow_id,
                  node_id=new_node_id,
@@ -686,8 +897,10 @@
                  depending_on_id=self.node_id,
                  description=description
              )
-
              self.flow_graph.add_filter(filter_settings)
+         else:
+             logger.info("Filter called with no arguments; creating a pass-through Polars code node.")
+             self._add_polars_code(new_node_id, "output_df = input_df", description or "No-op filter", method_name=None)

          return self._create_child_frame(new_node_id)

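filter() now takes any number of positional predicates plus keyword constraints; each constraint k=v becomes col(k) == lit(v), and all parts are joined with & into one generated Polars call. A sketch of the resulting node code (df is an assumed FlowFrame):

# Sketch: combined predicates and constraints in the generated code.
df.filter(col("x") > 0, y="a")
# polars_operation_code becomes, roughly:
#   input_df.filter((pl.col('x') > 0) & (pl.col('y') == 'a'))
# Mixing predicates/constraints with flowfile_formula now raises ValueError.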
@@ -772,7 +985,7 @@
                  if convert_to_absolute_path:
                      output_settings.directory = output_settings.abs_file_path
              except Exception as e:
-                 print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+                 logger.warning(f"Could not determine absolute path for {file_str}: {e}")

          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -800,7 +1013,7 @@

          # Use sink_parquet for LazyFrames
          code = f"input_df.sink_parquet({args_str})"
-         print(f"Generated Polars Code: {code}")
+         logger.debug(f"Generated Polars Code: {code}")
          self._add_polars_code(new_node_id, code, description)

          return self._create_child_frame(new_node_id)
@@ -848,7 +1061,7 @@
                  if convert_to_absolute_path:
                      output_settings.directory = output_settings.abs_file_path
              except Exception as e:
-                 print(f"Warning: Could not determine absolute path for {file_str}: {e}")
+                 logger.warning(f"Could not determine absolute path for {file_str}: {e}")

          if not use_polars_code:
              node_output = input_schema.NodeOutput(
@@ -881,7 +1094,7 @@
              args_str += f", {kwargs_repr}"

          code = f"input_df.collect().write_csv({args_str})"
-         print(f"Generated Polars Code: {code}")
+         logger.debug(f"Generated Polars Code: {code}")
          self._add_polars_code(new_node_id, code, description)

          return self._create_child_frame(new_node_id)
@@ -934,10 +1147,10 @@
          self.flow_graph.apply_layout()
          self.flow_graph.save_flow(file_path)

-     def collect(self):
+     def collect(self, *args, **kwargs):
          """Collect lazy data into memory."""
          if hasattr(self.data, "collect"):
-             return self.data.collect()
+             return self.data.collect(*args, **kwargs)
          return self.data

      def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
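collect() now forwards its arguments to the underlying polars LazyFrame.collect, e.g. (df is an assumed FlowFrame; kwarg availability depends on the installed polars version):

# Argument forwarding sketch; kwargs depend on your polars version.
df.collect()                      # unchanged default collection
df.collect(no_optimization=True)  # arguments now reach pl.LazyFrame.collect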
@@ -946,7 +1159,7 @@
              input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
                                       function=transform_schema.FunctionInput(
                                           function=flowfile_formula,
-                                          field=transform_schema.FieldInput(name=output_column_name)),
+                                          field=transform_schema.FieldInput(name=output_column_name, data_type='Auto')),
                                       description=description))
          self.flow_graph.add_formula(function_settings)
          return self._create_child_frame(new_node_id)
@@ -1241,16 +1454,27 @@
          FlowFrame
              A new FlowFrame with the concatenated data
          """
-         new_node_id = generate_node_id()
-
          # Convert single FlowFrame to list
          if isinstance(other, FlowFrame):
              others = [other]
          else:
              others = other
-
+         all_graphs = []
+         all_graph_ids = []
+         for g in [self.flow_graph] + [f.flow_graph for f in others]:
+             if g.flow_id not in all_graph_ids:
+                 all_graph_ids.append(g.flow_id)
+                 all_graphs.append(g)
+         if len(all_graphs) > 1:
+             combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
+             for f in [self] + other:
+                 f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
+             global node_id_counter
+             node_id_counter += len(combined_graph.nodes)
+         else:
+             combined_graph = self.flow_graph
+         new_node_id = generate_node_id()
          use_native = how == "diagonal_relaxed" and parallel and not rechunk
-
          if use_native:
              # Create union input for the transform schema
              union_input = transform_schema.UnionInput(
@@ -1284,7 +1508,6 @@
              input_vars.append(f"input_df_{i+2}")

          frames_list = f"[{', '.join(input_vars)}]"
-
          code = f"""
  # Perform concat operation
  output_df = pl.concat(
@@ -1294,19 +1517,20 @@
      parallel={parallel}
  )
  """
-
+         self.flow_graph = combined_graph

          # Add polars code node with dependencies on all input frames
          depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
          self._add_polars_code(
              new_node_id, code, description, depending_on_ids=depending_on_ids
          )
-
          # Add connections to ensure all frames are available
          self._add_connection(self.node_id, new_node_id, "main")
+
          for other_frame in others:
-             other_frame._add_connection(other_frame.node_id, new_node_id, "main")

+             other_frame.flow_graph = combined_graph
+             other_frame._add_connection(other_frame.node_id, new_node_id, "main")
          # Create and return the new frame
          return FlowFrame(
              data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
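Concatenation gets the same cross-graph handling as join(): distinct flow graphs are merged first and node ids reassigned. The method name is not visible in these hunks, so the call below is hypothetical; the diagonal_relaxed native path and the generated pl.concat(...) fallback are from the hunks above:

# Hypothetical call shape (method name not shown in the hunks).
combined = frame_a.concat([frame_b, frame_c], how="diagonal_relaxed")
# how="diagonal_relaxed" with default parallel/rechunk uses the native
# UnionInput node; other settings fall back to generated pl.concat code.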
@@ -1343,7 +1567,7 @@ class FlowFrame:
1343
1567
  return False, None
1344
1568
 
1345
1569
  # Extract the output name
1346
- output_name = expr.name
1570
+ output_name = expr.column_name
1347
1571
 
1348
1572
  if ".over(" not in expr._repr_str:
1349
1573
  # Simple cumulative count can be implemented as a record ID with offset=1
@@ -1426,62 +1650,70 @@ class FlowFrame:
             return False, None
 
     def with_columns(
-        self,
-        exprs: Expr | List[Expr | None] = None,
-        *,
-        flowfile_formulas: Optional[List[str]] = None,
-        output_column_names: Optional[List[str]] = None,
-        description: Optional[str] = None,
+        self,
+        *exprs: Union[Expr, Iterable[Expr], Any],  # Allow Any for implicit lit conversion
+        flowfile_formulas: Optional[List[str]] = None,
+        output_column_names: Optional[List[str]] = None,
+        description: Optional[str] = None,
+        **named_exprs: Union[Expr, Any],  # Allow Any for implicit lit conversion
     ) -> "FlowFrame":
         """
-        Add multiple columns to the DataFrame.
-
-        Parameters
-        ----------
-        exprs : Expr or List[Expr], optional
-            Expressions to evaluate as new columns
-        flowfile_formulas : List[str], optional
-            Alternative approach using flowfile formula syntax
-        output_column_names : List[str], optional
-            Column names for the flowfile formulas
-        description : str, optional
-            Description of this operation for the ETL graph
-
-        Returns
-        -------
-        FlowFrame
-            A new FlowFrame with the columns added
-
-        Raises
-        ------
-        ValueError
-            If neither exprs nor flowfile_formulas with output_column_names are provided,
-            or if the lengths of flowfile_formulas and output_column_names don't match
+        Add or replace columns in the DataFrame.
         """
-        if exprs is not None:
-            new_node_id = generate_node_id()
-            exprs_iterable = _parse_inputs_as_iterable((exprs,))
+        new_node_id = generate_node_id()
 
-            if len(exprs_iterable) == 1:
-                detected, result = self._detect_cum_count_record_id(
-                    exprs_iterable[0], new_node_id, description
-                )
-                if detected:
-                    return result
-            all_expressions = []
-            for expression in exprs_iterable:
-                if not isinstance(expression, (Expr, Column)):
-                    all_expressions.append(lit(expression))
-                else:
-                    all_expressions.append(expression)
+        all_input_expr_objects: List[Expr] = []
+        pure_polars_expr_strings_for_wc: List[str] = []
+        collected_raw_definitions: List[str] = []
+
+        has_exprs_or_named_exprs = bool(exprs or named_exprs)
+        if has_exprs_or_named_exprs:
+            actual_exprs_to_process: List[Expr] = []
+            temp_exprs_iterable = list(_parse_inputs_as_iterable(exprs))
+
+            for item in temp_exprs_iterable:
+                if isinstance(item, Expr):
+                    actual_exprs_to_process.append(item)
+                else:  # auto-lit for non-Expr positional args
+                    actual_exprs_to_process.append(lit(item))
+
+            for name, val_expr in named_exprs.items():
+                if isinstance(val_expr, Expr):
+                    actual_exprs_to_process.append(val_expr.alias(name))  # type: ignore  # Assuming Expr has alias
+                else:  # auto-lit for named args and then alias
+                    actual_exprs_to_process.append(lit(val_expr).alias(name))  # type: ignore
+
+            if len(actual_exprs_to_process) == 1 and isinstance(actual_exprs_to_process[0], Expr):
+                pass
+
+            for current_expr_obj in actual_exprs_to_process:
+                all_input_expr_objects.append(current_expr_obj)
+                pure_expr_str, raw_defs_str = _extract_expr_parts(current_expr_obj)
+                pure_polars_expr_strings_for_wc.append(pure_expr_str)  # with_columns takes individual expressions
+                if raw_defs_str and raw_defs_str not in collected_raw_definitions:
+                    collected_raw_definitions.append(raw_defs_str)
+
+            polars_operation_code = f"input_df.with_columns([{', '.join(pure_polars_expr_strings_for_wc)}])"
+
+            final_code_for_node: str
+            if collected_raw_definitions:
+                unique_raw_definitions = list(dict.fromkeys(collected_raw_definitions))
+                definitions_section = "\n\n".join(unique_raw_definitions)
+                final_code_for_node = definitions_section + \
+                    "\n#─────SPLIT─────\n\n" + \
+                    f"output_df = {polars_operation_code}"
+            else:
+                final_code_for_node = polars_operation_code
 
-            code = (
-                f"input_df.with_columns({', '.join(str(e) for e in all_expressions)})"
-            )
-            self._add_polars_code(new_node_id, code, description)
+            pl_expressions_for_fallback = [e.expr for e in all_input_expr_objects if
+                                           isinstance(e, Expr) and hasattr(e, 'expr') and e.expr is not None]
+            self._add_polars_code(new_node_id, final_code_for_node, description, method_name='with_columns',
+                                  convertable_to_code=_check_if_convertible_to_code(all_input_expr_objects),
+                                  polars_expr=pl_expressions_for_fallback)
             return self._create_child_frame(new_node_id)
 
         elif flowfile_formulas is not None and output_column_names is not None:
+
             if len(output_column_names) != len(flowfile_formulas):
                 raise ValueError(
                     "Length of both the formulas and the output columns names must be identical"
@@ -1494,9 +1726,7 @@ class FlowFrame:
                 ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
             return ff
         else:
-            raise ValueError(
-                "Either exprs or flowfile_formulas with output_column_names must be provided"
-            )
+            raise ValueError("Either exprs/named_exprs or flowfile_formulas with output_column_names must be provided")
 
     def with_row_index(
         self, name: str = "index", offset: int = 0, description: str = None
@@ -1584,26 +1814,27 @@ class FlowFrame:
 
         if isinstance(columns, (list, tuple)):
             all_columns.extend(
-                [col.name if isinstance(col, Column) else col for col in columns]
+                [col.column_name if isinstance(col, Column) else col for col in columns]
             )
         else:
-            all_columns.append(columns.name if isinstance(columns, Column) else columns)
+            all_columns.append(columns.column_name if isinstance(columns, Column) else columns)
 
         if more_columns:
             for col in more_columns:
-                all_columns.append(col.name if isinstance(col, Column) else col)
+                all_columns.append(col.column_name if isinstance(col, Column) else col)
 
         if len(all_columns) == 1:
-            columns_str = f"'{all_columns[0]}'"
+
+            columns_str = stringify_values(all_columns[0])
         else:
-            columns_str = "[" + ", ".join([f"'{col}'" for col in all_columns]) + "]"
+            columns_str = "[" + ", ".join([stringify_values(col) for col in all_columns]) + "]"
 
         code = f"""
         # Explode columns into multiple rows
         output_df = input_df.explode({columns_str})
         """
 
-        cols_desc = ", ".join(all_columns)
+        cols_desc = ", ".join(str(s) for s in all_columns)
         desc = description or f"Explode column(s): {cols_desc}"
 
         # Add polars code node
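Replacing the hard-coded single quotes with `stringify_values` keeps the generated `explode(...)` call valid when a selector is not a plain string. A sketch under the assumption that the helper behaves like `repr` for this purpose (its real definition lives elsewhere in the package):

    def stringify_values(v):  # hypothetical stand-in for the real helper
        return repr(v)

    print(stringify_values("tags"))  # 'tags'  -- strings still come out quoted
    print(stringify_values(0))       # 0       -- the old code would have produced '0'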
@@ -1646,7 +1877,7 @@ class FlowFrame:
         new_node_id = generate_node_id()
 
         if isinstance(column, Column):
-            column_name = column.name
+            column_name = column.column_name
         else:
             column_name = column
 
@@ -1730,7 +1961,7 @@ class FlowFrame:
                     if col_expr._select_input.is_altered:
                         can_use_native = False
                         break
-                    processed_subset.append(col_expr.name)
+                    processed_subset.append(col_expr.column_name)
                 else:
                     can_use_native = False
                     break
@@ -1818,276 +2049,34 @@ class FlowFrame:
         """Get the number of columns."""
         return self.data.width
 
+    def __contains__(self, key):
+        """This special method enables the 'in' operator to work with FlowFrame objects."""
+        return key in self.data
 
-def _add_delegated_methods():
-    """Add delegated methods from polars LazyFrame."""
-    delegate_methods = [
-        "collect_async",
-        "profile",
-        "describe",
-        "explain",
-        "show_graph",
-        "serialize",
-        "fetch",
-        "get_meta",
-        "columns",
-        "dtypes",
-        "schema",
-        "estimated_size",
-        "n_chunks",
-        "is_empty",
-        "chunk_lengths",
-        "optimization_toggle",
-        "set_polars_options",
-        "collect_schema"
-    ]
-
-    already_implemented = set(dir(FlowFrame))
-
-    for method_name in delegate_methods:
-        if method_name not in already_implemented and hasattr(
-            pl.LazyFrame, method_name
-        ):
-            # Create a simple delegate method
-            def make_delegate(name):
-                def delegate_method(self, *args, **kwargs):
-                    return getattr(self.data, name)(*args, **kwargs)
-
-                # Set docstring and name
-                delegate_method.__doc__ = (
-                    f"See pl.LazyFrame.{name} for full documentation."
-                )
-                delegate_method.__name__ = name
-                return delegate_method
-
-            # Add the method to the class
-            setattr(FlowFrame, method_name, make_delegate(method_name))
-
-
-_add_delegated_methods()
-
-
-def sum(expr):
-    """Sum aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.sum()
-
-
-def mean(expr):
-    """Mean aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.mean()
-
-
-def min(expr):
-    """Min aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.min()
-
-
-def max(expr):
-    """Max aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.max()
-
-
-def count(expr):
-    """Count aggregation function."""
-    if isinstance(expr, str):
-        expr = col(expr)
-    return expr.count()
-
-
-def read_csv(file_path, *, flow_graph: FlowGraph = None, separator: str = ';',
-             convert_to_absolute_path: bool = True,
-             description: str = None, **options):
-    """
-    Read a CSV file into a FlowFrame.
-
-    Args:
-        file_path: Path to CSV file
-        flow_graph: if you want to add it to an existing graph
-        separator: Single byte character to use as separator in the file.
-        convert_to_absolute_path: If the path needs to be set to a fixed location
-        description: if you want to add a readable name in the frontend (advised)
-        **options: Options for polars.read_csv
-
-    Returns:
-        A FlowFrame with the CSV data
-    """
-    # Create new node ID
-    node_id = generate_node_id()
-    if flow_graph is None:
-        flow_graph = create_flow_graph()
-
-    flow_id = flow_graph.flow_id
-
-    has_headers = options.get('has_header', True)
-    encoding = options.get('encoding', 'utf-8')
-
-    if '~' in file_path:
-        file_path = os.path.expanduser(file_path)
-
-    received_table = input_schema.ReceivedTable(
-        file_type='csv',
-        path=file_path,
-        name=Path(file_path).name,
-        delimiter=separator,
-        has_headers=has_headers,
-        encoding=encoding
-    )
-
-    if convert_to_absolute_path:
-        received_table.path = received_table.abs_file_path
-
-    read_node = input_schema.NodeRead(
-        flow_id=flow_id,
-        node_id=node_id,
-        received_file=received_table,
-        pos_x=100,
-        pos_y=100,
-        is_setup=True
-    )
-
-    flow_graph.add_read(read_node)
-
-    return FlowFrame(
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
-
-
-def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
-                 convert_to_absolute_path: bool = True, **options) -> FlowFrame:
-    """
-    Read a Parquet file into a FlowFrame.
-
-    Args:
-        file_path: Path to Parquet file
-        flow_graph: if you want to add it to an existing graph
-        description: if you want to add a readable name in the frontend (advised)
-        convert_to_absolute_path: If the path needs to be set to a fixed location
-        **options: Options for polars.read_parquet
-
-    Returns:
-        A FlowFrame with the Parquet data
-    """
-    if '~' in file_path:
-        file_path = os.path.expanduser(file_path)
-    node_id = generate_node_id()
-
-    if flow_graph is None:
-        flow_graph = create_flow_graph()
-
-    flow_id = flow_graph.flow_id
-
-    received_table = input_schema.ReceivedTable(
-        file_type='parquet',
-        path=file_path,
-        name=Path(file_path).name,
-    )
-    if convert_to_absolute_path:
-        received_table.path = received_table.abs_file_path
-
-    read_node = input_schema.NodeRead(
-        flow_id=flow_id,
-        node_id=node_id,
-        received_file=received_table,
-        pos_x=100,
-        pos_y=100,
-        is_setup=True,
-        description=description
-    )
-
-    flow_graph.add_read(read_node)
-
-    return FlowFrame(
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
-
-
-def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
-    """
-    Create a FlowFrame from a dictionary or list of dictionaries.
-
-    Args:
-        data: Dictionary of lists or list of dictionaries
-        flow_graph: if you want to add it to an existing graph
-        description: if you want to add a readable name in the frontend (advised)
-    Returns:
-        A FlowFrame with the data
-    """
-    # Create new node ID
-    node_id = generate_node_id()
-
-    if not flow_graph:
-        flow_graph = create_flow_graph()
-    flow_id = flow_graph.flow_id
-
-    input_node = input_schema.NodeManualInput(
-        flow_id=flow_id,
-        node_id=node_id,
-        raw_data=FlowDataEngine(data).to_pylist(),
-        pos_x=100,
-        pos_y=100,
-        is_setup=True,
-        description=description
-    )
-
-    # Add to graph
-    flow_graph.add_manual_input(input_node)
-
-    # Return new frame
-    return FlowFrame(
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
+    def __bool__(self):
+        """This special method determines how the object behaves in boolean contexts.
+        Returns True if the FlowFrame contains any data, False otherwise."""
+        return bool(self.data)
 
+    @staticmethod
+    def _comparison_error(operator: str) -> pl.lazyframe.frame.NoReturn:
+        msg = f'"{operator!r}" comparison not supported for LazyFrame objects'
+        raise TypeError(msg)
 
-def concat(frames: List['FlowFrame'],
-           how: str = 'vertical',
-           rechunk: bool = False,
-           parallel: bool = True,
-           description: str = None) -> 'FlowFrame':
-    """
-    Concatenate multiple FlowFrames into one.
+    def __eq__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("==")
 
-    Parameters
-    ----------
-    frames : List[FlowFrame]
-        List of FlowFrames to concatenate
-    how : str, default 'vertical'
-        How to combine the FlowFrames (see concat method documentation)
-    rechunk : bool, default False
-        Whether to ensure contiguous memory in result
-    parallel : bool, default True
-        Whether to use parallel processing for the operation
-    description : str, optional
-        Description of this operation
+    def __ne__(self, other: object) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("!=")
 
-    Returns
-    -------
-    FlowFrame
-        A new FlowFrame with the concatenated data
-    """
-    if not frames:
-        raise ValueError("No frames provided to concat_frames")
+    def __gt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error(">")
 
-    if len(frames) == 1:
-        return frames[0]
+    def __lt__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("<")
 
-    # Use first frame's concat method with remaining frames
-    first_frame = frames[0]
-    remaining_frames = frames[1:]
+    def __ge__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error(">=")
 
-    return first_frame.concat(remaining_frames, how=how,
-                              rechunk=rechunk, parallel=parallel,
-                              description=description)
+    def __le__(self, other: Any) -> pl.lazyframe.frame.NoReturn:
+        self._comparison_error("<=")