Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,2093 @@
1
+ import uuid
2
+ import os
3
+ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
4
+ from pathlib import Path
5
+
6
+ import re
7
+ import polars as pl
8
+ from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation
9
+
10
+ # Flowfile core imports (graph, data engine, node, and schema definitions)
11
+ from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
12
+ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
13
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
14
+ from flowfile_core.schemas import input_schema, transform_schema
15
+
16
+ from flowfile_frame.expr import Expr, Column, lit, col
17
+ from flowfile_frame.selectors import Selector
18
+ from flowfile_frame.group_frame import GroupByFrame
19
+ from flowfile_frame.utils import _parse_inputs_as_iterable, create_etl_graph
20
+ from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
21
+
22
+ node_id_counter = 0
23
+
24
+
25
+ def _to_string_val(v):
+     # Quote strings so they survive code generation; pass other values through unchanged.
+     if isinstance(v, str):
+         return f"'{v}'"
+     return v
30
+
31
+
32
+ def generate_node_id() -> int:
33
+ global node_id_counter
34
+ node_id_counter += 1
35
+ return node_id_counter
36
+
37
+
38
+ class FlowFrame:
39
+ """Main class that wraps FlowDataEngine and maintains the ETL graph."""
40
+ flow_graph: FlowGraph
41
+ data: pl.LazyFrame
42
+
43
+ @staticmethod
44
+ def create_from_any_type(
45
+ data: FrameInitTypes = None,
46
+ schema: SchemaDefinition | None = None,
47
+ *,
48
+ schema_overrides: SchemaDict | None = None,
49
+ strict: bool = True,
50
+ orient: Orientation | None = None,
51
+ infer_schema_length: int | None = 100,
52
+ nan_to_null: bool = False,
53
+ flow_graph=None,
54
+ node_id=None,
55
+ parent_node_id=None,
56
+ ):
57
+ """
58
+ Naive implementation that creates the frame from any supported input type: the data is first converted to a polars frame and then loaded into the graph as a manual input node.
60
+
61
+ Parameters
62
+ ----------
63
+ data : FrameInitTypes
64
+ Data to initialize the frame with
65
+ schema : SchemaDefinition, optional
66
+ Schema definition for the data
67
+ schema_overrides : pl.SchemaDict, optional
68
+ Schema overrides for specific columns
69
+ strict : bool, default True
70
+ Whether to enforce the schema strictly
71
+ orient : pl.Orientation, optional
72
+ Orientation of the data
73
+ infer_schema_length : int, default 100
74
+ Number of rows to use for schema inference
75
+ nan_to_null : bool, default False
76
+ Whether to convert NaN values to null
77
+ flow_graph : FlowGraph, optional
78
+ Existing ETL graph to add nodes to
79
+ node_id : int, optional
80
+ ID for the new node
81
+ parent_node_id : int, optional
82
+ ID of the parent node
83
+
84
+ Returns
85
+ -------
86
+ FlowFrame
87
+ A new FlowFrame with the data loaded as a manual input node
88
+ """
89
+ # Extract flow-specific parameters
90
+ node_id = node_id or generate_node_id()
91
+ description = "Data imported from Python object"
92
+
93
+ # Create a new flow graph if none is provided
94
+ if flow_graph is None:
95
+ flow_graph = create_etl_graph()
96
+
97
+ flow_id = flow_graph.flow_id
98
+
99
+ # Convert data to a polars DataFrame/LazyFrame
100
+ try:
101
+ # Use polars to convert from various types
102
+ pl_df = pl.DataFrame(
103
+ data,
104
+ schema=schema,
105
+ schema_overrides=schema_overrides,
106
+ strict=strict,
107
+ orient=orient,
108
+ infer_schema_length=infer_schema_length,
109
+ nan_to_null=nan_to_null,
110
+ )
111
+ pl_data = pl_df.lazy()
112
+ except Exception as e:
113
+ raise ValueError(f"Could not convert data to a polars DataFrame: {e}")
114
+
115
+ # Create a FlowDataEngine to get data in the right format for manual input
116
+ flow_table = FlowDataEngine(raw_data=pl_data)
117
+
118
+ # Create a manual input node
119
+ input_node = input_schema.NodeManualInput(
120
+ flow_id=flow_id,
121
+ node_id=node_id,
122
+ raw_data=flow_table.to_pylist(), # Convert to list of dicts
123
+ pos_x=100,
124
+ pos_y=100,
125
+ is_setup=True,
126
+ description=description,
127
+ )
128
+
129
+ # Add to graph
130
+ flow_graph.add_manual_input(input_node)
131
+
132
+ # Return new frame
133
+ return FlowFrame(
134
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
135
+ flow_graph=flow_graph,
136
+ node_id=node_id,
137
+ parent_node_id=parent_node_id,
138
+ )
139
+
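A minimal usage sketch of this constructor path (illustrative, not from the package docs; it assumes flowfile_core is installed alongside so a default flow graph can be created):

from flowfile_frame.flow_frame import FlowFrame

# A dict (or any other non-LazyFrame input) is routed through create_from_any_type
# and becomes a manual-input node in a freshly created flow graph.
df = FlowFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
print(df.node_id, df.flow_graph.flow_id)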
140
+ def __new__(
141
+ cls,
142
+ data: pl.LazyFrame | FrameInitTypes = None,
143
+ schema: SchemaDefinition | None = None,
144
+ *,
145
+ schema_overrides: SchemaDict | None = None,
146
+ strict: bool = True,
147
+ orient: Orientation | None = None,
148
+ infer_schema_length: int | None = 100,
149
+ nan_to_null: bool = False,
150
+ flow_graph=None,
151
+ node_id=None,
152
+ parent_node_id=None,
153
+ ):
154
+ """Create a new FlowFrame instance."""
155
+
156
+ # If data is not a LazyFrame, use the factory method
157
+ if data is not None and not isinstance(data, pl.LazyFrame):
158
+ return cls.create_from_any_type(
159
+ data=data,
160
+ schema=schema,
161
+ schema_overrides=schema_overrides,
162
+ strict=strict,
163
+ orient=orient,
164
+ infer_schema_length=infer_schema_length,
165
+ nan_to_null=nan_to_null,
166
+ flow_graph=flow_graph,
167
+ node_id=node_id,
168
+ parent_node_id=parent_node_id,
169
+ )
170
+
171
+ # Otherwise create the instance normally
172
+ instance = super().__new__(cls)
173
+ return instance
174
+
175
+ def __init__(
176
+ self,
177
+ data: pl.LazyFrame | FrameInitTypes = None,
178
+ schema: SchemaDefinition | None = None,
179
+ *,
180
+ schema_overrides: SchemaDict | None = None,
181
+ strict: bool = True,
182
+ orient: Orientation | None = None,
183
+ infer_schema_length: int | None = 100,
184
+ nan_to_null: bool = False,
185
+ flow_graph=None,
186
+ node_id=None,
187
+ parent_node_id=None,
188
+ ):
189
+ """Initialize the FlowFrame with data and graph references."""
190
+
191
+ if data is None:
192
+ data = pl.LazyFrame()
193
+ if not isinstance(data, pl.LazyFrame):
194
+ return
195
+
196
+ self.node_id = node_id or generate_node_id()
197
+ self.parent_node_id = parent_node_id
198
+
199
+ # Initialize graph
200
+ if flow_graph is None:
201
+ flow_graph = create_etl_graph()
202
+ self.flow_graph = flow_graph
203
+ # Set up data
204
+ if isinstance(data, FlowDataEngine):
205
+ self.data = data.data_frame
206
+ else:
207
+ self.data = data
208
+
209
+ def __repr__(self):
210
+ return str(self.data)
211
+
212
+ def _add_connection(self, from_id, to_id, input_type: input_schema.InputType = "main"):
213
+ """Helper method to add a connection between nodes"""
214
+ connection = input_schema.NodeConnection.create_from_simple_input(
215
+ from_id=from_id, to_id=to_id, input_type=input_type
216
+ )
217
+ add_connection(self.flow_graph, connection)
218
+
219
+ def _create_child_frame(self, new_node_id):
220
+ """Helper method to create a new FlowFrame that's a child of this one"""
221
+ self._add_connection(self.node_id, new_node_id)
222
+ return FlowFrame(
223
+ data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
224
+ flow_graph=self.flow_graph,
225
+ node_id=new_node_id,
226
+ parent_node_id=self.node_id,
227
+ )
228
+
229
+ def sort(
230
+ self,
231
+ by: List[Expr | str] | Expr | str,
232
+ *more_by,
233
+ descending: bool | List[bool] = False,
234
+ nulls_last: bool = False,
235
+ multithreaded: bool = True,
236
+ maintain_order: bool = False,
237
+ description: str = None,
238
+ ):
239
+ """
240
+ Sort the dataframe by the given columns.
241
+
242
+ Parameters:
243
+ -----------
244
+ by : Expr, str, or list of Expr/str
245
+ Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
246
+ *more_by : Expr or str
247
+ Additional columns to sort by, specified as positional arguments.
248
+ descending : bool or list of bool, default False
249
+ Sort in descending order. When sorting by multiple columns, can be specified per column.
250
+ nulls_last : bool or list of bool, default False
251
+ Place null values last; can specify a single boolean or a sequence for per-column control.
252
+ multithreaded : bool, default True
253
+ Sort using multiple threads.
254
+ maintain_order : bool, default False
255
+ Whether the order should be maintained if elements are equal.
256
+ description : str, optional
257
+ Description of this operation for the ETL graph.
258
+
259
+ Returns:
260
+ --------
261
+ FlowFrame
262
+ A new FlowFrame with sorted data.
263
+ """
264
+ by = list(_parse_inputs_as_iterable((by,)))
265
+ new_node_id = generate_node_id()
266
+ sort_expressions = by
267
+ if more_by:
268
+ sort_expressions.extend(more_by)
269
+
270
+ # Determine if we need to use polars code fallback
271
+ needs_polars_code = False
272
+
273
+ # Check for any expressions that are not simple columns
274
+ for expr in sort_expressions:
275
+ if not isinstance(expr, (str, Column)) or (
276
+ isinstance(expr, Column) and expr._select_input.is_altered
277
+ ):
278
+ needs_polars_code = True
279
+ break
280
+
281
+ # Also need polars code if we're using maintain_order or multithreaded params
282
+ if maintain_order or not multithreaded:
283
+ needs_polars_code = True
284
+
285
+ # Standardize descending parameter
286
+ if isinstance(descending, (list, tuple)):
287
+ # Ensure descending list has the same length as sort_expressions
288
+ if len(descending) != len(sort_expressions):
289
+ raise ValueError(
290
+ f"Length of descending ({len(descending)}) must match number of sort columns ({len(sort_expressions)})"
291
+ )
292
+ descending_values = descending
293
+ else:
294
+ descending_values = [descending] * len(sort_expressions)
295
+
296
+ # Standardize nulls_last parameter
297
+ if isinstance(nulls_last, (list, tuple)):
298
+ if len(nulls_last) != len(sort_expressions):
299
+ raise ValueError(
300
+ f"Length of nulls_last ({len(nulls_last)}) must match number of sort columns ({len(sort_expressions)})"
301
+ )
302
+ nulls_last_values = nulls_last
303
+ # Any non-default nulls_last needs polars code
304
+ if any(val is not False for val in nulls_last_values):
305
+ needs_polars_code = True
306
+ else:
307
+ nulls_last_values = [nulls_last] * len(sort_expressions)
308
+ # Non-default nulls_last needs polars code
309
+ if nulls_last:
310
+ needs_polars_code = True
311
+
312
+ if needs_polars_code:
313
+ # Generate polars code for complex cases
314
+ code = self._generate_sort_polars_code(
315
+ sort_expressions,
316
+ descending_values,
317
+ nulls_last_values,
318
+ multithreaded,
319
+ maintain_order,
320
+ )
321
+ self._add_polars_code(new_node_id, code, description)
322
+ else:
323
+ # Use native implementation for simple cases
324
+ sort_inputs = []
325
+ for i, expr in enumerate(sort_expressions):
326
+ # Convert expr to column name
327
+ if isinstance(expr, Column):
328
+ column_name = expr.name
329
+ elif isinstance(expr, str):
330
+ column_name = expr
331
+ else:
332
+ column_name = str(expr)
333
+
334
+ # Create SortByInput with appropriate settings
335
+ sort_inputs.append(
336
+ transform_schema.SortByInput(
337
+ column=column_name,
338
+ how="desc" if descending_values[i] else "asc",
339
+ )
340
+ )
341
+
342
+ sort_settings = input_schema.NodeSort(
343
+ flow_id=self.flow_graph.flow_id,
344
+ node_id=new_node_id,
345
+ sort_input=sort_inputs,
346
+ pos_x=200,
347
+ pos_y=150,
348
+ is_setup=True,
349
+ depending_on_id=self.node_id,
350
+ description=description
351
+ or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
352
+ )
353
+ self.flow_graph.add_sort(sort_settings)
354
+
355
+ return self._create_child_frame(new_node_id)
356
+
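A usage sketch for sort (illustrative): plain column names with per-column descending flags stay on the native Sort node, while nulls_last or expression inputs trigger the Polars-code fallback described above.

from flowfile_frame.flow_frame import FlowFrame

df = FlowFrame({"region": ["b", "a", "a"], "sales": [10, 30, 20]})
by_region = df.sort("region", "sales", descending=[False, True])  # native Sort node
by_sales = df.sort("sales", nulls_last=True)                      # generated Polars-code node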
357
+ def _generate_sort_polars_code(
358
+ self,
359
+ sort_expressions: list,
360
+ descending_values: list,
361
+ nulls_last_values: list,
362
+ multithreaded: bool,
363
+ maintain_order: bool,
364
+ ) -> str:
365
+ """Generate Polars code for sort operations that need fallback."""
366
+ # Format expressions for code
367
+ expr_strs = []
368
+ for expr in sort_expressions:
369
+ if isinstance(expr, (Expr, Column)):
370
+ expr_strs.append(str(expr))
371
+ elif isinstance(expr, str):
372
+ expr_strs.append(f"'{expr}'")
373
+ else:
374
+ expr_strs.append(str(expr))
375
+
376
+ # Format parameters
377
+ if len(sort_expressions) == 1:
378
+ by_arg = expr_strs[0]
379
+ else:
380
+ by_arg = f"[{', '.join(expr_strs)}]"
381
+
382
+ # Build kwargs
383
+ kwargs = {}
384
+
385
+ # Only add descending if it's non-default
386
+ if any(d for d in descending_values):
387
+ if len(descending_values) == 1:
388
+ kwargs["descending"] = descending_values[0]
389
+ else:
390
+ kwargs["descending"] = descending_values
391
+
392
+ # Only add nulls_last if it's non-default
393
+ if any(nl for nl in nulls_last_values):
394
+ if len(nulls_last_values) == 1:
395
+ kwargs["nulls_last"] = nulls_last_values[0]
396
+ else:
397
+ kwargs["nulls_last"] = nulls_last_values
398
+
399
+ # Add other parameters if they're non-default
400
+ if not multithreaded:
401
+ kwargs["multithreaded"] = multithreaded
402
+
403
+ if maintain_order:
404
+ kwargs["maintain_order"] = maintain_order
405
+
406
+ # Build kwargs string
407
+ kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())
408
+
409
+ # Build final code
410
+ if kwargs_str:
411
+ return f"input_df.sort({by_arg}, {kwargs_str})"
412
+ else:
413
+ return f"input_df.sort({by_arg})"
414
+
415
+ def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
416
+ depending_on_ids: List[str] | None = None):
417
+ polars_code_settings = input_schema.NodePolarsCode(
418
+ flow_id=self.flow_graph.flow_id,
419
+ node_id=new_node_id,
420
+ polars_code_input=transform_schema.PolarsCodeInput(polars_code=code),
421
+ is_setup=True,
422
+ depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
423
+ description=description,
424
+ )
425
+ self.flow_graph.add_polars_code(polars_code_settings)
426
+
427
+ def join(
428
+ self,
429
+ other,
430
+ on: List[str | Column] | str | Column = None,
431
+ how: str = "inner",
432
+ left_on: List[str | Column] | str | Column = None,
433
+ right_on: List[str | Column] | str | Column = None,
434
+ suffix: str = "_right",
435
+ validate: str = None,
436
+ nulls_equal: bool = False,
437
+ coalesce: bool = None,
438
+ maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
439
+ description: str = None,
440
+ ):
441
+ """
442
+ Add a join operation to the Logical Plan.
443
+
444
+ Parameters
445
+ ----------
446
+ other : FlowFrame
447
+ Other DataFrame.
448
+ on : str or list of str, optional
449
+ Name(s) of the join columns in both DataFrames.
450
+ how : {'inner', 'left', 'outer', 'semi', 'anti', 'cross'}, default 'inner'
451
+ Join strategy.
452
+ left_on : str or list of str, optional
453
+ Name(s) of the left join column(s).
454
+ right_on : str or list of str, optional
455
+ Name(s) of the right join column(s).
456
+ suffix : str, default "_right"
457
+ Suffix to add to columns with a duplicate name.
458
+ validate : {"1:1", "1:m", "m:1", "m:m"}, optional
459
+ Validate join relationship.
460
+ nulls_equal:
461
+ Join on null values. By default null values will never produce matches.
462
+ coalesce:
463
+ None: -> join specific.
464
+ True: -> Always coalesce join columns.
465
+ False: -> Never coalesce join columns.
466
+ maintain_order:
467
+ Which DataFrame row order to preserve, if any. Do not rely on any observed ordering without explicitly setting this parameter, as your code may break in a future release. Not specifying any ordering can improve performance. Supported for inner, left, right and full joins.
468
+ None: No specific ordering is desired. The ordering might differ across Polars versions or even between different runs.
469
+ left: Preserves the order of the left DataFrame.
470
+ right: Preserves the order of the right DataFrame.
471
+ left_right: First preserves the order of the left DataFrame, then the right.
472
+ right_left: First preserves the order of the right DataFrame, then the left.
473
+ description : str, optional
474
+ Description of the join operation for the ETL graph.
475
+
476
+ Returns
477
+ -------
478
+ FlowFrame
479
+ New FlowFrame with join operation applied.
480
+ """
481
+ new_node_id = generate_node_id()
483
+ use_polars_code = not(maintain_order is None and
484
+ coalesce is None and
485
+ nulls_equal is False and
486
+ validate is None and
487
+ suffix == '_right')
488
+ join_mappings = None
489
+ if on is not None:
490
+ left_columns = right_columns = _normalize_columns_to_list(on)
491
+ elif left_on is not None and right_on is not None:
492
+ left_columns = _normalize_columns_to_list(left_on)
493
+ right_columns = _normalize_columns_to_list(right_on)
494
+ elif how == 'cross' and left_on is None and right_on is None and on is None:
495
+ left_columns = None
496
+ right_columns = None
497
+ else:
498
+ raise ValueError("Must specify either 'on' or both 'left_on' and 'right_on'")
499
+
500
+ # Ensure left and right column lists have same length
501
+ if how != 'cross' and len(left_columns) != len(right_columns):
502
+ raise ValueError(
503
+ f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
504
+ )
505
+ if not use_polars_code:
506
+ join_mappings, use_polars_code = _create_join_mappings(
507
+ left_columns, right_columns
508
+ )
509
+
510
+ if use_polars_code or suffix != '_right':
511
+ _on = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
512
+ _left = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
513
+ _right = "["+', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
514
+ code_kwargs = {"other": "input_df_2", "how": _to_string_val(how), "on": _on, "left_on": _left,
515
+ "right_on": _right, "suffix": _to_string_val(suffix), "validate": _to_string_val(validate),
516
+ "nulls_equal": nulls_equal, "coalesce": coalesce,
517
+ "maintain_order": _to_string_val(maintain_order)}
518
+ kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
519
+ code = f"input_df_1.join({kwargs_str})"
520
+ self._add_polars_code(new_node_id, code, description, depending_on_ids=[self.node_id, other.node_id])
521
+ self._add_connection(self.node_id, new_node_id, "main")
522
+ other._add_connection(other.node_id, new_node_id, "main")
523
+ result_frame = FlowFrame(
524
+ data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
525
+ flow_graph=self.flow_graph,
526
+ node_id=new_node_id,
527
+ parent_node_id=self.node_id,
528
+ )
529
+
530
+ elif join_mappings:
531
+ left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
532
+ right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
533
+
534
+ join_input = transform_schema.JoinInput(
535
+ join_mapping=join_mappings,
536
+ left_select=left_select.renames,
537
+ right_select=right_select.renames,
538
+ how=how,
539
+ )
540
+ join_input.auto_rename()
541
+ # Create node settings
542
+ join_settings = input_schema.NodeJoin(
543
+ flow_id=self.flow_graph.flow_id,
544
+ node_id=new_node_id,
545
+ join_input=join_input,
546
+ auto_generate_selection=True,
547
+ verify_integrity=True,
548
+ pos_x=200,
549
+ pos_y=150,
550
+ is_setup=True,
551
+ depending_on_ids=[self.node_id, other.node_id],
552
+ description=description or f"Join with {how} strategy",
553
+ )
554
+ self.flow_graph.add_join(join_settings)
555
+ self._add_connection(self.node_id, new_node_id, "main")
556
+ other._add_connection(other.node_id, new_node_id, "right")
557
+ result_frame = FlowFrame(
558
+ data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
559
+ flow_graph=self.flow_graph,
560
+ node_id=new_node_id,
561
+ parent_node_id=self.node_id,
562
+ )
563
+ else:
564
+ raise ValueError("Could not execute join")
565
+
566
+ return result_frame
567
+
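An illustrative join sketch; it assumes both frames live in the same flow graph (the connections above are added to self.flow_graph), which is why the second frame is created with flow_graph=orders.flow_graph:

from flowfile_frame.flow_frame import FlowFrame

orders = FlowFrame({"customer_id": [1, 2], "amount": [50, 75]})
customers = FlowFrame({"customer_id": [1, 2], "name": ["a", "b"]},
                      flow_graph=orders.flow_graph)
joined = orders.join(customers, on="customer_id", how="left")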
568
+ def _add_number_of_records(self, new_node_id: int, description: str = None) -> "FlowFrame":
569
+ node_number_of_records = input_schema.NodeRecordCount(
570
+ flow_id=self.flow_graph.flow_id,
571
+ node_id=new_node_id,
572
+ pos_x=200,
573
+ pos_y=100,
574
+ is_setup=True,
575
+ depending_on_id=self.node_id,
576
+ description=description
577
+ )
578
+ self.flow_graph.add_record_count(node_number_of_records)
579
+ return self._create_child_frame(new_node_id)
580
+
581
+ def select(self, *columns, description: str = None):
582
+ """
583
+ Select columns from the frame.
584
+
585
+ Args:
586
+ *columns: Column names or expressions
587
+ description: Description of the step; this will be shown in the flowfile file
588
+
589
+ Returns:
590
+ A new FlowFrame with selected columns
591
+ """
592
+ # Create new node ID
593
+ columns = _parse_inputs_as_iterable(columns)
594
+ new_node_id = generate_node_id()
595
+ existing_columns = self.columns
596
+
597
+ if (len(columns) == 1 and isinstance(columns[0], Expr)
598
+ and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
599
+ return self._add_number_of_records(new_node_id, description)
600
+
601
+ # Handle simple column names
602
+ if all(isinstance(col_, (str, Column)) for col_ in columns):
603
+ # Create select inputs
604
+ select_inputs = [
605
+ transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
606
+ for col_ in columns
607
+ ]
608
+ dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_columns if
609
+ c not in [s.old_name for s in select_inputs]]
610
+ select_inputs.extend(dropped_columns)
611
+ select_settings = input_schema.NodeSelect(
612
+ flow_id=self.flow_graph.flow_id,
613
+ node_id=new_node_id,
614
+ select_input=select_inputs,
615
+ keep_missing=False,
616
+ pos_x=200,
617
+ pos_y=100,
618
+ is_setup=True,
619
+ depending_on_id=self.node_id,
620
+ description=description
621
+ )
622
+
623
+ # Add to graph
624
+ self.flow_graph.add_select(select_settings)
625
+ return self._create_child_frame(new_node_id)
626
+
627
+ else:
628
+ readable_exprs = []
629
+ is_readable: bool = True
630
+ for col_ in columns:
631
+ if isinstance(col_, Expr):
632
+ readable_exprs.append(col_)
633
+ elif isinstance(col_, Selector):
634
+ readable_exprs.append(col_)
635
+ elif isinstance(col_, pl.expr.Expr):
636
+ print('Warning: this expression cannot be represented in the Flowfile frontend. Use a flowfile_frame expression instead of a raw polars expression.')
637
+ is_readable = False
638
+ elif isinstance(col_, str) and col_ in self.columns:
639
+ col_expr = Column(col_)
640
+ readable_exprs.append(col_expr)
641
+ else:
642
+ lit_expr = lit(col_)
643
+ readable_exprs.append(lit_expr)
644
+ if is_readable:
645
+ code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
646
+ else:
647
+ raise ValueError('Not supported')
648
+
649
+ self._add_polars_code(new_node_id, code, description)
650
+ return self._create_child_frame(new_node_id)
651
+
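A short select sketch (illustrative; it assumes flowfile_frame expressions support arithmetic and alias like their polars counterparts): plain names use the native Select node, other expressions fall back to a generated Polars-code node.

from flowfile_frame.flow_frame import FlowFrame
from flowfile_frame.expr import col

df = FlowFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
subset = df.select("a", "b")                               # native Select node
derived = df.select((col("a") + col("b")).alias("total"))  # generated Polars-code node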
652
+ def filter(self, predicate: Expr | Any = None, *, flowfile_formula: str = None, description: str = None):
653
+ """
654
+ Filter rows based on a predicate.
655
+
656
+ Args:
657
+ predicate: Filter condition
658
+ flowfile_formula: A Flowfile formula string with native support in the frontend
659
+ description: Description of the step that is performed
660
+ Returns:
661
+ A new FlowFrame with filtered rows
662
+ """
663
+ new_node_id = generate_node_id()
664
+ # Create new node ID
665
+ if predicate:
666
+ # we use for now the fallback on polars code.
667
+ if isinstance(predicate, Expr):
668
+ predicate_expr = predicate
669
+ else:
670
+ predicate_expr = lit(predicate)
671
+ code = f"input_df.filter({str(predicate_expr)})"
672
+ self._add_polars_code(new_node_id, code, description)
673
+
674
+ elif flowfile_formula:
675
+ # Create node settings
676
+ filter_settings = input_schema.NodeFilter(
677
+ flow_id=self.flow_graph.flow_id,
678
+ node_id=new_node_id,
679
+ filter_input=transform_schema.FilterInput(
680
+ advanced_filter=flowfile_formula,
681
+ filter_type="advanced"
682
+ ),
683
+ pos_x=200,
684
+ pos_y=150,
685
+ is_setup=True,
686
+ depending_on_id=self.node_id,
687
+ description=description
688
+ )
689
+
690
+ self.flow_graph.add_filter(filter_settings)
691
+
692
+ return self._create_child_frame(new_node_id)
693
+
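A filter sketch (illustrative; comparison operators on flowfile_frame expressions are assumed to behave like polars). Expression predicates run through a generated Polars-code node; a flowfile_formula string would instead create a native Filter node.

from flowfile_frame.flow_frame import FlowFrame
from flowfile_frame.expr import col

df = FlowFrame({"amount": [5, 50, 500]})
large = df.filter(col("amount") > 100)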
694
+ def sink_csv(self,
695
+ file: str,
696
+ *args,
697
+ separator: str = ",",
698
+ encoding: str = "utf-8",
699
+ description: str = None):
700
+ """
701
+ Write the data to a CSV file.
702
+
703
+ Args:
704
+ file: Path or filename for the CSV file
705
+ separator: Field delimiter to use, defaults to ','
706
+ encoding: File encoding, defaults to 'utf-8'
707
+ description: Description of this operation for the ETL graph
708
+
709
+ Returns:
710
+ Self for method chaining
711
+ """
712
+ return self.write_csv(file, *args, separator=separator, encoding=encoding, description=description)
713
+
714
+ def write_parquet(
715
+ self,
716
+ path: str|os.PathLike,
717
+ *,
718
+ description: str = None,
719
+ convert_to_absolute_path: bool = True,
720
+ **kwargs: Any,
721
+ ) -> "FlowFrame":
722
+ """
723
+ Write the data to a Parquet file. Creates a standard Output node if only
724
+ 'path' and standard options are provided. Falls back to a Polars Code node
725
+ if other keyword arguments are used.
726
+
727
+ Args:
728
+ path: Path (string or pathlib.Path) or filename for the Parquet file.
729
+ Note: Writable file-like objects are not supported when using advanced options
730
+ that trigger the Polars Code node fallback.
731
+ description: Description of this operation for the ETL graph.
732
+ convert_to_absolute_path: If the path needs to be set to a fixed location.
733
+ **kwargs: Additional keyword arguments for polars.DataFrame.sink_parquet/write_parquet.
734
+ If any kwargs other than 'description' or 'convert_to_absolute_path' are provided,
735
+ a Polars Code node will be created instead of a standard Output node.
736
+ Complex objects like IO streams or credential provider functions are NOT
737
+ supported via this method's Polars Code fallback.
738
+
739
+ Returns:
740
+ Self for method chaining (new FlowFrame pointing to the output node).
741
+ """
742
+ new_node_id = generate_node_id()
743
+
744
+ is_path_input = isinstance(path, (str, os.PathLike))
745
+ if isinstance(path, os.PathLike):
746
+ file_str = str(path)
747
+ elif isinstance(path, str):
748
+ file_str = path
749
+ else:
750
+ file_str = path
751
+ is_path_input = False
752
+ if "~" in file_str:
753
+ file_str = os.path.expanduser(file_str)
754
+ file_name = file_str.split(os.sep)[-1]
755
+ use_polars_code = bool(kwargs.items()) or not is_path_input
756
+
757
+ output_parquet_table = input_schema.OutputParquetTable(
758
+ file_type="parquet"
759
+ )
760
+ output_settings = input_schema.OutputSettings(
761
+ file_type='parquet',
762
+ name=file_name,
763
+ directory=file_str if is_path_input else str(file_str),
764
+ output_parquet_table=output_parquet_table,
765
+ output_csv_table=input_schema.OutputCsvTable(),
766
+ output_excel_table=input_schema.OutputExcelTable()
767
+ )
768
+
769
+ if is_path_input:
770
+ try:
771
+ output_settings.set_absolute_filepath()
772
+ if convert_to_absolute_path:
773
+ output_settings.directory = output_settings.abs_file_path
774
+ except Exception as e:
775
+ print(f"Warning: Could not determine absolute path for {file_str}: {e}")
776
+
777
+ if not use_polars_code:
778
+ node_output = input_schema.NodeOutput(
779
+ flow_id=self.flow_graph.flow_id,
780
+ node_id=new_node_id,
781
+ output_settings=output_settings,
782
+ depending_on_id=self.node_id,
783
+ description=description
784
+ )
785
+ self.flow_graph.add_output(node_output)
786
+ else:
787
+ if not is_path_input:
788
+ raise TypeError(
789
+ f"Input 'path' must be a string or Path-like object when using advanced "
790
+ f"write_parquet options (kwargs={kwargs.items()}), got {type(path)}."
791
+ " File-like objects are not supported with the Polars Code fallback."
792
+ )
793
+
794
+ # Use the potentially converted absolute path string
795
+ path_arg_repr = repr(output_settings.directory)
796
+ kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items())
797
+ args_str = f"path={path_arg_repr}"
798
+ if kwargs_repr:
799
+ args_str += f", {kwargs_repr}"
800
+
801
+ # Use sink_parquet for LazyFrames
802
+ code = f"input_df.sink_parquet({args_str})"
803
+ print(f"Generated Polars Code: {code}")
804
+ self._add_polars_code(new_node_id, code, description)
805
+
806
+ return self._create_child_frame(new_node_id)
807
+
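A write_parquet sketch mirroring the docstring above (paths are illustrative): no extra kwargs gives a standard Output node, while any extra kwarg switches to a generated sink_parquet code node.

from flowfile_frame.flow_frame import FlowFrame

df = FlowFrame({"a": [1, 2, 3]})
df.write_parquet("output/data.parquet")                      # standard Output node
df.write_parquet("output/data.parquet", compression="zstd")  # Polars-code node (sink_parquet)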
808
+ def write_csv(
809
+ self,
810
+ file: str | os.PathLike,
811
+ *,
812
+ separator: str = ",",
813
+ encoding: str = "utf-8",
814
+ description: str = None,
815
+ convert_to_absolute_path: bool = True,
816
+ **kwargs: Any,
817
+ ) -> "FlowFrame":
818
+
819
+ new_node_id = generate_node_id()
820
+
821
+ is_path_input = isinstance(file, (str, os.PathLike))
822
+ if isinstance(file, os.PathLike):
823
+ file_str = str(file)
824
+ elif isinstance(file, str):
825
+ file_str = file
826
+ else:
827
+ file_str = file
828
+ is_path_input = False
829
+ if "~" in file_str:
830
+ file_str = os.path.expanduser(file_str)
831
+ file_name = file_str.split(os.sep)[-1] if is_path_input else "output.csv"
832
+
833
+ use_polars_code = bool(kwargs) or not is_path_input
834
+
835
+ output_settings = input_schema.OutputSettings(
836
+ file_type='csv',
837
+ name=file_name,
838
+ directory=file_str if is_path_input else str(file_str),
839
+ output_csv_table=input_schema.OutputCsvTable(
840
+ file_type="csv", delimiter=separator, encoding=encoding),
841
+ output_excel_table=input_schema.OutputExcelTable(),
842
+ output_parquet_table=input_schema.OutputParquetTable()
843
+ )
844
+
845
+ if is_path_input:
846
+ try:
847
+ output_settings.set_absolute_filepath()
848
+ if convert_to_absolute_path:
849
+ output_settings.directory = output_settings.abs_file_path
850
+ except Exception as e:
851
+ print(f"Warning: Could not determine absolute path for {file_str}: {e}")
852
+
853
+ if not use_polars_code:
854
+ node_output = input_schema.NodeOutput(
855
+ flow_id=self.flow_graph.flow_id,
856
+ node_id=new_node_id,
857
+ output_settings=output_settings,
858
+ depending_on_id=self.node_id,
859
+ description=description
860
+ )
861
+ self.flow_graph.add_output(node_output)
862
+ else:
863
+ if not is_path_input:
864
+ raise TypeError(
865
+ f"Input 'file' must be a string or Path-like object when using advanced "
866
+ f"write_csv options (kwargs={kwargs}), got {type(file)}."
867
+ " File-like objects are not supported with the Polars Code fallback."
868
+ )
869
+
870
+ path_arg_repr = repr(output_settings.directory)
871
+
872
+ all_kwargs_for_code = {
873
+ 'separator': separator,
874
+ 'encoding': encoding,
875
+ **kwargs # Add the extra kwargs
876
+ }
877
+ kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in all_kwargs_for_code.items())
878
+
879
+ args_str = f"file={path_arg_repr}"
880
+ if kwargs_repr:
881
+ args_str += f", {kwargs_repr}"
882
+
883
+ code = f"input_df.collect().write_csv({args_str})"
884
+ print(f"Generated Polars Code: {code}")
885
+ self._add_polars_code(new_node_id, code, description)
886
+
887
+ return self._create_child_frame(new_node_id)
888
+
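A write_csv sketch (illustrative): separator and encoding are handled by the native Output node, while any additional kwarg falls back to a collect().write_csv(...) code node.

from flowfile_frame.flow_frame import FlowFrame

df = FlowFrame({"a": [1, 2], "b": ["x", "y"]})
df.write_csv("output/data.csv", separator=";")    # native Output node
df.write_csv("output/data.csv", null_value="NA")  # generated Polars-code node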
889
+ def group_by(self, *by, description: str = None, maintain_order=False, **named_by) -> GroupByFrame:
890
+ """
891
+ Start a group by operation.
892
+
893
+ Parameters:
894
+ *by: Column names or expressions to group by
895
+ description: add optional description to this step for the frontend
896
+ maintain_order: Keep groups in the order they appear in the data
897
+ **named_by: Additional columns to group by with custom names
898
+
899
+ Returns:
900
+ GroupByFrame object for aggregations
901
+ """
902
+ # Process positional arguments
903
+ new_node_id = generate_node_id()
904
+ by_cols = []
905
+ for col_expr in by:
906
+ if isinstance(col_expr, str):
907
+ by_cols.append(col_expr)
908
+ elif isinstance(col_expr, Expr):
909
+ by_cols.append(col_expr)
910
+ elif isinstance(col_expr, Selector):
911
+ by_cols.append(col_expr)
912
+ elif isinstance(col_expr, (list, tuple)):
913
+ by_cols.extend(col_expr)
914
+
915
+ for new_name, col_expr in named_by.items():
916
+ if isinstance(col_expr, str):
917
+ by_cols.append(col(col_expr).alias(new_name))
918
+ elif isinstance(col_expr, Expr):
919
+ by_cols.append(col_expr.alias(new_name))
920
+
921
+ # Create a GroupByFrame
922
+ return GroupByFrame(
923
+ node_id=new_node_id,
924
+ parent_frame=self, by_cols=by_cols, maintain_order=maintain_order, description=description
925
+ )
926
+
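A group_by sketch, assuming GroupByFrame exposes an agg method and that flowfile_frame expressions provide aggregations such as sum, analogous to polars (both are assumptions about code not shown here):

from flowfile_frame.flow_frame import FlowFrame
from flowfile_frame.expr import col

df = FlowFrame({"category": ["x", "x", "y"], "value": [1, 2, 3]})
totals = df.group_by("category").agg(col("value").sum())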
927
+ def to_graph(self):
928
+ """Get the underlying ETL graph."""
929
+ return self.flow_graph
930
+
931
+ def save_graph(self, file_path: str, auto_arrange: bool = True):
932
+ """Save the graph """
933
+ if auto_arrange:
934
+ self.flow_graph.apply_layout()
935
+ self.flow_graph.save_flow(file_path)
936
+
937
+ def collect(self):
938
+ """Collect lazy data into memory."""
939
+ if hasattr(self.data, "collect"):
940
+ return self.data.collect()
941
+ return self.data
942
+
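A small end-to-end sketch of materializing results and persisting the graph (the file name and extension are illustrative):

from flowfile_frame.flow_frame import FlowFrame

df = FlowFrame({"a": [1, 2, 3]}).head(2)
result = df.collect()                  # polars DataFrame with the first two rows
df.save_graph("my_pipeline.flowfile")  # writes the ETL graph, auto-arranged by default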
943
+ def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
944
+ new_node_id = generate_node_id()
945
+ function_settings = (
946
+ input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
947
+ function=transform_schema.FunctionInput(
948
+ function=flowfile_formula,
949
+ field=transform_schema.FieldInput(name=output_column_name)),
950
+ description=description))
951
+ self.flow_graph.add_formula(function_settings)
952
+ return self._create_child_frame(new_node_id)
953
+
954
+ def head(self, n: int, description: str = None):
955
+ new_node_id = generate_node_id()
956
+ settings = input_schema.NodeSample(flow_id=self.flow_graph.flow_id,
957
+ node_id=new_node_id,
958
+ depending_on_id=self.node_id,
959
+ sample_size=n,
960
+ description=description
961
+ )
962
+ self.flow_graph.add_sample(settings)
963
+ return self._create_child_frame(new_node_id)
964
+
965
+ def limit(self, n: int, description: str = None):
966
+ return self.head(n, description)
967
+
968
+ def cache(self) -> "FlowFrame":
969
+ setting_input = self.get_node_settings().setting_input
970
+ setting_input.cache_results = True
971
+ self.data.cache()
972
+ return self
973
+
974
+ def get_node_settings(self) -> FlowNode:
975
+ return self.flow_graph.get_node(self.node_id)
976
+
977
+ def pivot(self,
978
+ on: str | list[str],
979
+ *,
980
+ index: str | list[str] | None = None,
981
+ values: str | list[str] | None = None,
982
+ aggregate_function: str | None = "first",
983
+ maintain_order: bool = True,
984
+ sort_columns: bool = False,
985
+ separator: str = '_',
986
+ description: str = None) -> "FlowFrame":
987
+ """
988
+ Pivot a DataFrame from long to wide format.
989
+
990
+ Parameters
991
+ ----------
992
+ on: str | list[str]
993
+ Column values to use as column names in the pivoted DataFrame
994
+ index: str | list[str] | None
995
+ Column(s) to use as index/row identifiers in the pivoted DataFrame
996
+ values: str | list[str] | None
997
+ Column(s) that contain the values of the pivoted DataFrame
998
+ aggregate_function: str | None
999
+ Function to aggregate values if there are duplicate entries.
1000
+ Options: 'first', 'last', 'min', 'max', 'sum', 'mean', 'median', 'count'
1001
+ maintain_order: bool
1002
+ Whether to maintain the order of the columns/rows as they appear in the source
1003
+ sort_columns: bool
1004
+ Whether to sort the output columns
1005
+ separator: str
1006
+ Separator to use when joining column levels in the pivoted DataFrame
1007
+ description: str
1008
+ Description of this operation for the ETL graph
1009
+
1010
+ Returns
1011
+ -------
1012
+ FlowFrame
1013
+ A new FlowFrame with pivoted data
1014
+ """
1015
+ new_node_id = generate_node_id()
1016
+
1017
+ # Handle input standardization
1018
+ on_value = on[0] if isinstance(on, list) and len(on) == 1 else on
1019
+
1020
+ # Create index_columns list
1021
+ if index is None:
1022
+ index_columns = []
1023
+ elif isinstance(index, str):
1024
+ index_columns = [index]
1025
+ else:
1026
+ index_columns = list(index)
1027
+
1028
+ # Set values column
1029
+ if values is None:
1030
+ raise ValueError("Values parameter must be specified for pivot operation")
1031
+
1032
+ value_col = values if isinstance(values, str) else values[0]
1033
+
1034
+ # Set valid aggregations
1035
+ valid_aggs = ['first', 'last', 'min', 'max', 'sum', 'mean', 'median', 'count']
1036
+ if aggregate_function not in valid_aggs:
1037
+ raise ValueError(f"Invalid aggregate_function: {aggregate_function}. "
1038
+ f"Must be one of: {', '.join(valid_aggs)}")
1039
+
1040
+ # Check if we can use the native implementation
1041
+ can_use_native = (
1042
+ isinstance(on_value, str) and
1043
+ isinstance(value_col, str) and
1044
+ aggregate_function in valid_aggs
1045
+ )
1046
+
1047
+ if can_use_native:
1048
+ # Create pivot input for native implementation
1049
+ pivot_input = transform_schema.PivotInput(
1050
+ index_columns=index_columns,
1051
+ pivot_column=on_value,
1052
+ value_col=value_col,
1053
+ aggregations=[aggregate_function]
1054
+ )
1055
+
1056
+ # Create node settings
1057
+ pivot_settings = input_schema.NodePivot(
1058
+ flow_id=self.flow_graph.flow_id,
1059
+ node_id=new_node_id,
1060
+ pivot_input=pivot_input,
1061
+ pos_x=200,
1062
+ pos_y=150,
1063
+ is_setup=True,
1064
+ depending_on_id=self.node_id,
1065
+ description=description or f"Pivot {value_col} by {on_value}"
1066
+ )
1067
+
1068
+ # Add to graph using native implementation
1069
+ self.flow_graph.add_pivot(pivot_settings)
1070
+ else:
1071
+ # Fall back to polars code for complex cases
1072
+ # Generate proper polars code
1073
+ on_repr = repr(on)
1074
+ index_repr = repr(index)
1075
+ values_repr = repr(values)
1076
+
1077
+ code = f"""
1078
+ # Perform pivot operation
1079
+ result = input_df.pivot(
1080
+ on={on_repr},
1081
+ index={index_repr},
1082
+ values={values_repr},
1083
+ aggregate_function='{aggregate_function}',
1084
+ maintain_order={maintain_order},
1085
+ sort_columns={sort_columns},
1086
+ separator="{separator}"
1087
+ )
1088
+ result
1089
+ """
1090
+ # Generate description if not provided
1091
+ if description is None:
1092
+ on_str = on if isinstance(on, str) else ", ".join(on if isinstance(on, list) else [on])
1093
+ values_str = values if isinstance(values, str) else ", ".join(
1094
+ values if isinstance(values, list) else [values])
1095
+ description = f"Pivot {values_str} by {on_str}"
1096
+
1097
+ # Add polars code node
1098
+ self._add_polars_code(new_node_id, code, description)
1099
+
1100
+ return self._create_child_frame(new_node_id)
1101
+
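A pivot sketch (illustrative data): a single string for on and values together with a supported aggregate_function uses the native Pivot node.

from flowfile_frame.flow_frame import FlowFrame

sales = FlowFrame({
    "store": ["a", "a", "b"],
    "month": ["jan", "feb", "jan"],
    "amount": [100, 120, 90],
})
wide = sales.pivot("month", index="store", values="amount", aggregate_function="sum")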
1102
+ def unpivot(self,
1103
+ on: list[str | Selector] | str | None | Selector = None,
1104
+ *,
1105
+ index: list[str] | str | None = None,
1106
+ variable_name: str = "variable",
1107
+ value_name: str = "value",
1108
+ description: str = None) -> "FlowFrame":
1109
+ """
1110
+ Unpivot a DataFrame from wide to long format.
1111
+
1112
+ Parameters
1113
+ ----------
1114
+ on : list[str | Selector] | str | None | Selector
1115
+ Column(s) to unpivot (become values in the value column)
1116
+ If None, all columns not in index will be used
1117
+ index : list[str] | str | None
1118
+ Column(s) to use as identifier variables (stay as columns)
1119
+ variable_name : str, optional
1120
+ Name to give to the variable column, by default "variable"
1121
+ value_name : str, optional
1122
+ Name to give to the value column, by default "value"
1123
+ description : str, optional
1124
+ Description of this operation for the ETL graph
1125
+
1126
+ Returns
1127
+ -------
1128
+ FlowFrame
1129
+ A new FlowFrame with unpivoted data
1130
+ """
1131
+ new_node_id = generate_node_id()
1132
+
1133
+ # Standardize inputs
1134
+ if index is None:
1135
+ index_columns = []
1136
+ elif isinstance(index, str):
1137
+ index_columns = [index]
1138
+ else:
1139
+ index_columns = list(index)
1140
+ can_use_native = True
1141
+ if on is None:
1142
+ value_columns = []
1143
+ elif isinstance(on, (str, Selector)):
1144
+ if isinstance(on, Selector):
1145
+ can_use_native = False
1146
+ value_columns = [on]
1147
+ elif isinstance(on, Iterable):
1148
+ value_columns = list(on)
1149
+ if isinstance(value_columns[0], Iterable):
1150
+ can_use_native = False
1151
+ else:
1152
+ value_columns = [on]
1153
+
1154
+ if can_use_native:
1155
+ can_use_native = (variable_name == "variable" and value_name == "value")
1156
+ if can_use_native:
1157
+ unpivot_input = transform_schema.UnpivotInput(
1158
+ index_columns=index_columns,
1159
+ value_columns=value_columns,
1160
+ data_type_selector=None,
1161
+ data_type_selector_mode='column'
1162
+ )
1163
+
1164
+ # Create node settings
1165
+ unpivot_settings = input_schema.NodeUnpivot(
1166
+ flow_id=self.flow_graph.flow_id,
1167
+ node_id=new_node_id,
1168
+ unpivot_input=unpivot_input,
1169
+ pos_x=200,
1170
+ pos_y=150,
1171
+ is_setup=True,
1172
+ depending_on_id=self.node_id,
1173
+ description=description or "Unpivot data from wide to long format"
1174
+ )
1175
+
1176
+ # Add to graph using native implementation
1177
+ self.flow_graph.add_unpivot(unpivot_settings)
1178
+ else:
1179
+ # Fall back to polars code for complex cases
1180
+
1181
+ # Generate proper polars code
1182
+ on_repr = repr(on)
1183
+ index_repr = repr(index)
1184
+
1185
+ # Using unpivot() method to match polars API
1186
+ code = f"""
1187
+ # Perform unpivot operation
1188
+ output_df = input_df.unpivot(
1189
+ on={on_repr},
1190
+ index={index_repr},
1191
+ variable_name="{variable_name}",
1192
+ value_name="{value_name}"
1193
+ )
1194
+ output_df
1195
+ """
1196
+ # Generate description if not provided
1197
+ if description is None:
1198
+ index_str = ", ".join(index_columns) if index_columns else "none"
1199
+ value_str = ", ".join(value_columns) if value_columns else "all non-index columns"
1200
+ description = f"Unpivot data with index: {index_str} and value cols: {value_str}"
1201
+
1202
+ # Add polars code node
1203
+ self._add_polars_code(new_node_id, code, description)
1204
+
1205
+ return self._create_child_frame(new_node_id)
1206
+
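An unpivot sketch (illustrative data), turning a wide frame like the pivot example back into long format:

from flowfile_frame.flow_frame import FlowFrame

wide = FlowFrame({"store": ["a", "b"], "jan": [100, 90], "feb": [120, 80]})
long_df = wide.unpivot(on=["jan", "feb"], index="store")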
1207
+ def concat(
1208
+ self,
1209
+ other: "FlowFrame" | List["FlowFrame"],
1210
+ how: str = "vertical",
1211
+ rechunk: bool = False,
1212
+ parallel: bool = True,
1213
+ description: str = None,
1214
+ ) -> "FlowFrame":
1215
+ """
1216
+ Combine multiple FlowFrames into a single FlowFrame.
1217
+
1218
+ This is equivalent to Polars' concat operation with various joining strategies.
1219
+
1220
+ Parameters
1221
+ ----------
1222
+ other : FlowFrame or List[FlowFrame]
1223
+ One or more FlowFrames to concatenate with this one
1224
+ how : str, default 'vertical'
1225
+ How to combine the FlowFrames:
1226
+ - 'vertical': Stack frames on top of each other (equivalent to 'union all')
1227
+ - 'vertical_relaxed': Same as vertical but coerces columns to common supertypes
1228
+ - 'diagonal': Union of column schemas, filling missing values with null
1229
+ - 'diagonal_relaxed': Same as diagonal but coerces columns to common supertypes
1230
+ - 'horizontal': Stack horizontally (column-wise concat)
1231
+ - 'align', 'align_full', 'align_left', 'align_right': Auto-determine key columns
1232
+ rechunk : bool, default False
1233
+ Whether to ensure contiguous memory in result
1234
+ parallel : bool, default True
1235
+ Whether to use parallel processing for the operation
1236
+ description : str, optional
1237
+ Description of this operation for the ETL graph
1238
+
1239
+ Returns
1240
+ -------
1241
+ FlowFrame
1242
+ A new FlowFrame with the concatenated data
1243
+ """
1244
+ new_node_id = generate_node_id()
1245
+
1246
+ # Convert single FlowFrame to list
1247
+ if isinstance(other, FlowFrame):
1248
+ others = [other]
1249
+ else:
1250
+ others = other
1251
+
1252
+ use_native = how == "diagonal_relaxed" and parallel and not rechunk
1253
+
1254
+ if use_native:
1255
+ # Create union input for the transform schema
1256
+ union_input = transform_schema.UnionInput(
1257
+ mode="relaxed" # This maps to diagonal_relaxed in polars
1258
+ )
1259
+
1260
+ # Create node settings
1261
+ union_settings = input_schema.NodeUnion(
1262
+ flow_id=self.flow_graph.flow_id,
1263
+ node_id=new_node_id,
1264
+ union_input=union_input,
1265
+ pos_x=200,
1266
+ pos_y=150,
1267
+ is_setup=True,
1268
+ depending_on_ids=[self.node_id] + [frame.node_id for frame in others],
1269
+ description=description or "Concatenate dataframes",
1270
+ )
1271
+
1272
+ # Add to graph
1273
+ self.flow_graph.add_union(union_settings)
1274
+
1275
+ # Add connections
1276
+ self._add_connection(self.node_id, new_node_id, "main")
1277
+ for other_frame in others:
1278
+ other_frame._add_connection(other_frame.node_id, new_node_id, "main")
1279
+ else:
1280
+ # Fall back to Polars code for other cases
1281
+ # Create a list of input dataframes for the code
1282
+ input_vars = ["input_df_1"]
1283
+ for i in range(len(others)):
1284
+ input_vars.append(f"input_df_{i+2}")
1285
+
1286
+ frames_list = f"[{', '.join(input_vars)}]"
1287
+
1288
+ code = f"""
1289
+ # Perform concat operation
1290
+ output_df = pl.concat(
1291
+ {frames_list},
1292
+ how='{how}',
1293
+ rechunk={rechunk},
1294
+ parallel={parallel}
1295
+ )
1296
+ """
1297
+
1298
+
1299
+ # Add polars code node with dependencies on all input frames
1300
+ depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
1301
+ self._add_polars_code(
1302
+ new_node_id, code, description, depending_on_ids=depending_on_ids
1303
+ )
1304
+
1305
+ # Add connections to ensure all frames are available
1306
+ self._add_connection(self.node_id, new_node_id, "main")
1307
+ for other_frame in others:
1308
+ other_frame._add_connection(other_frame.node_id, new_node_id, "main")
1309
+
1310
+ # Create and return the new frame
1311
+ return FlowFrame(
1312
+ data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
1313
+ flow_graph=self.flow_graph,
1314
+ node_id=new_node_id,
1315
+ parent_node_id=self.node_id,
1316
+ )
1317
+
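+ # Usage sketch (illustrative only; `df1`, `df2` and `df3` are assumed FlowFrames
+ # created elsewhere, e.g. with from_dict below):
+ #
+ #     combined = df1.concat(df2, how="diagonal_relaxed")              # native union node
+ #     stacked = df1.concat([df2, df3], how="vertical", rechunk=True)  # Polars-code fallback
+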
1318
+ def _detect_cum_count_record_id(
1319
+ self, expr: Any, new_node_id: int, description: Optional[str] = None
1320
+ ) -> Tuple[bool, Optional["FlowFrame"]]:
1321
+ """
1322
+ Detect if the expression is a cum_count operation and use record_id if possible.
1323
+
1324
+ Parameters
1325
+ ----------
1326
+ expr : Any
1327
+ Expression to analyze
1328
+ new_node_id : int
1329
+ Node ID to use if creating a record_id node
1330
+ description : str, optional
1331
+ Description to use for the new node
1332
+
1333
+ Returns
1334
+ -------
1335
+ Tuple[bool, Optional[FlowFrame]]
1336
+ A tuple containing:
1337
+ - bool: Whether a cum_count expression was detected and optimized
1338
+ - Optional[FlowFrame]: The new FlowFrame if detection was successful, otherwise None
1339
+ """
1340
+ # Check if this is a cum_count operation
1341
+ if (not isinstance(expr, Expr) or not expr._repr_str
1342
+ or "cum_count" not in expr._repr_str or not hasattr(expr, "name")):
1343
+ return False, None
1344
+
1345
+ # Extract the output name
1346
+ output_name = expr.name
1347
+
1348
+ if ".over(" not in expr._repr_str:
1349
+ # Simple cumulative count can be implemented as a record ID with offset=1
1350
+ record_id_input = transform_schema.RecordIdInput(
1351
+ output_column_name=output_name,
1352
+ offset=1,
1353
+ group_by=False,
1354
+ group_by_columns=[],
1355
+ )
1356
+
1357
+ # Create node settings
1358
+ record_id_settings = input_schema.NodeRecordId(
1359
+ flow_id=self.flow_graph.flow_id,
1360
+ node_id=new_node_id,
1361
+ record_id_input=record_id_input,
1362
+ pos_x=200,
1363
+ pos_y=150,
1364
+ is_setup=True,
1365
+ depending_on_id=self.node_id,
1366
+ description=description or f"Add cumulative count as '{output_name}'",
1367
+ )
1368
+
1369
+ # Add to graph using native implementation
1370
+ self.flow_graph.add_record_id(record_id_settings)
1371
+ return True, self._create_child_frame(new_node_id)
1372
+
1373
+ # Check for windowed/partitioned cum_count
1374
+ elif ".over(" in expr._repr_str:
1375
+ # Try to extract partition columns from different patterns
1376
+ partition_columns = []
1377
+
1378
+ # Case 1: Simple string column - .over('column')
1379
+ simple_match = re.search(r'\.over\([\'"]([^\'"]+)[\'"]\)', expr._repr_str)
1380
+ if simple_match:
1381
+ partition_columns = [simple_match.group(1)]
1382
+
1383
+ # Case 2: List of column strings - .over(['col1', 'col2'])
1384
+ list_match = re.search(r"\.over\(\[(.*?)\]", expr._repr_str)
1385
+ if list_match:
1386
+ items = list_match.group(1).split(",")
1387
+ for item in items:
1388
+ # Extract string column names from quoted strings
1389
+ col_match = re.search(r'[\'"]([^\'"]+)[\'"]', item.strip())
1390
+ if col_match:
1391
+ partition_columns.append(col_match.group(1))
1392
+
1393
+ # Case 3: pl.col expressions - .over(pl.col('category'), pl.col('abc'))
1394
+ col_matches = re.finditer(r'pl\.col\([\'"]([^\'"]+)[\'"]\)', expr._repr_str)
1395
+ for match in col_matches:
1396
+ partition_columns.append(match.group(1))
1397
+
1398
+ # If we found partition columns, create a grouped record ID
1399
+ if partition_columns:
1400
+ # Use grouped record ID implementation
1401
+ record_id_input = transform_schema.RecordIdInput(
1402
+ output_column_name=output_name,
1403
+ offset=1,
1404
+ group_by=True,
1405
+ group_by_columns=partition_columns,
1406
+ )
1407
+
1408
+ # Create node settings
1409
+ record_id_settings = input_schema.NodeRecordId(
1410
+ flow_id=self.flow_graph.flow_id,
1411
+ node_id=new_node_id,
1412
+ record_id_input=record_id_input,
1413
+ pos_x=200,
1414
+ pos_y=150,
1415
+ is_setup=True,
1416
+ depending_on_id=self.node_id,
1417
+ description=description
1418
+ or f"Add grouped cumulative count as '{output_name}' by {', '.join(partition_columns)}",
1419
+ )
1420
+
1421
+ # Add to graph using native implementation
1422
+ self.flow_graph.add_record_id(record_id_settings)
1423
+ return True, self._create_child_frame(new_node_id)
1424
+
1425
+ # Not a cum_count we can optimize
1426
+ return False, None
1427
+
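+ # Sketch of the expressions the detection above recognises (assuming the Expr
+ # wrapper mirrors Polars' cum_count/over API; column names are illustrative):
+ #
+ #     col("a").cum_count()                          # plain cum_count  -> record-id node, offset 1
+ #     col("a").cum_count().over("category")         # .over('col')     -> grouped record-id node
+ #     col("a").cum_count().over(["cat", "region"])  # .over([...])     -> grouped record-id node
+ #     col("a").cum_count().over(pl.col("cat"))      # .over(pl.col())  -> grouped record-id node
+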
1428
+ def with_columns(
1429
+ self,
1430
+ exprs: Expr | List[Expr] | None = None,
1431
+ *,
1432
+ flowfile_formulas: Optional[List[str]] = None,
1433
+ output_column_names: Optional[List[str]] = None,
1434
+ description: Optional[str] = None,
1435
+ ) -> "FlowFrame":
1436
+ """
1437
+ Add multiple columns to the DataFrame.
1438
+
1439
+ Parameters
1440
+ ----------
1441
+ exprs : Expr or List[Expr], optional
1442
+ Expressions to evaluate as new columns
1443
+ flowfile_formulas : List[str], optional
1444
+ Alternative approach using flowfile formula syntax
1445
+ output_column_names : List[str], optional
1446
+ Column names for the flowfile formulas
1447
+ description : str, optional
1448
+ Description of this operation for the ETL graph
1449
+
1450
+ Returns
1451
+ -------
1452
+ FlowFrame
1453
+ A new FlowFrame with the columns added
1454
+
1455
+ Raises
1456
+ ------
1457
+ ValueError
1458
+ If neither exprs nor flowfile_formulas with output_column_names are provided,
1459
+ or if the lengths of flowfile_formulas and output_column_names don't match
1460
+ """
1461
+ if exprs is not None:
1462
+ new_node_id = generate_node_id()
1463
+ exprs_iterable = _parse_inputs_as_iterable((exprs,))
1464
+
1465
+ if len(exprs_iterable) == 1:
1466
+ detected, result = self._detect_cum_count_record_id(
1467
+ exprs_iterable[0], new_node_id, description
1468
+ )
1469
+ if detected:
1470
+ return result
1471
+ all_expressions = []
1472
+ for expression in exprs_iterable:
1473
+ if not isinstance(expression, (Expr, Column)):
1474
+ all_expressions.append(lit(expression))
1475
+ else:
1476
+ all_expressions.append(expression)
1477
+
1478
+ code = (
1479
+ f"input_df.with_columns({', '.join(str(e) for e in all_expressions)})"
1480
+ )
1481
+ self._add_polars_code(new_node_id, code, description)
1482
+ return self._create_child_frame(new_node_id)
1483
+
1484
+ elif flowfile_formulas is not None and output_column_names is not None:
1485
+ if len(output_column_names) != len(flowfile_formulas):
1486
+ raise ValueError(
1487
+ "Length of both the formulas and the output columns names must be identical"
1488
+ )
1489
+
1490
+ if len(flowfile_formulas) == 1:
1491
+ return self._with_flowfile_formula(flowfile_formulas[0], output_column_names[0], description)
1492
+ ff = self
1493
+ for i, (flowfile_formula, output_column_name) in enumerate(zip(flowfile_formulas, output_column_names)):
1494
+ ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}" if description else None)
1495
+ return ff
1496
+ else:
1497
+ raise ValueError(
1498
+ "Either exprs or flowfile_formulas with output_column_names must be provided"
1499
+ )
1500
+
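+ # Usage sketch (illustrative; `df` is an assumed FlowFrame and the col()/Expr
+ # wrappers are assumed to mirror Polars expressions):
+ #
+ #     df.with_columns(col("price") * 2)                                    # expression path
+ #     df.with_columns(flowfile_formulas=[...], output_column_names=["x"])  # formula strings path
+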
1501
+ def with_row_index(
1502
+ self, name: str = "index", offset: int = 0, description: str = None
1503
+ ) -> "FlowFrame":
1504
+ """
1505
+ Add a row index as the first column in the DataFrame.
1506
+
1507
+ Parameters
1508
+ ----------
1509
+ name : str, default "index"
1510
+ Name of the index column.
1511
+ offset : int, default 0
1512
+ Start the index at this offset. Cannot be negative.
1513
+ description : str, optional
1514
+ Description of this operation for the ETL graph
1515
+
1516
+ Returns
1517
+ -------
1518
+ FlowFrame
1519
+ A new FlowFrame with the row index column added
1520
+ """
1521
+ new_node_id = generate_node_id()
1522
+
1523
+ # Check if we can use the native record_id implementation
1524
+ if name == "record_id" or (offset == 1 and name != "index"):
1525
+ # Create RecordIdInput - no grouping needed
1526
+ record_id_input = transform_schema.RecordIdInput(
1527
+ output_column_name=name,
1528
+ offset=offset,
1529
+ group_by=False,
1530
+ group_by_columns=[],
1531
+ )
1532
+
1533
+ # Create node settings
1534
+ record_id_settings = input_schema.NodeRecordId(
1535
+ flow_id=self.flow_graph.flow_id,
1536
+ node_id=new_node_id,
1537
+ record_id_input=record_id_input,
1538
+ pos_x=200,
1539
+ pos_y=150,
1540
+ is_setup=True,
1541
+ depending_on_id=self.node_id,
1542
+ description=description or f"Add row index column '{name}'",
1543
+ )
1544
+
1545
+ # Add to graph
1546
+ self.flow_graph.add_record_id(record_id_settings)
1547
+ else:
1548
+ # Use the polars code approach for other cases
1549
+ code = f"input_df.with_row_index(name='{name}', offset={offset})"
1550
+ self._add_polars_code(
1551
+ new_node_id, code, description or f"Add row index column '{name}'"
1552
+ )
1553
+
1554
+ return self._create_child_frame(new_node_id)
1555
+
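+ # Usage sketch (illustrative; `df` is an assumed FlowFrame):
+ #
+ #     df.with_row_index()                            # Polars-code path, "index" starting at 0
+ #     df.with_row_index(name="record_id", offset=1)  # native record-id node
+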
1556
+ def explode(
1557
+ self,
1558
+ columns: str | Column | Iterable[str | Column],
1559
+ *more_columns: str | Column,
1560
+ description: str = None,
1561
+ ) -> "FlowFrame":
1562
+ """
1563
+ Explode the dataframe to long format by exploding the given columns.
1564
+
1565
+ The underlying columns being exploded must be of the List or Array data type.
1566
+
1567
+ Parameters
1568
+ ----------
1569
+ columns : str, Column, or Sequence[str | Column]
1570
+ Column names, expressions, or a sequence of them to explode
1571
+ *more_columns : str or Column
1572
+ Additional columns to explode, specified as positional arguments
1573
+ description : str, optional
1574
+ Description of this operation for the ETL graph
1575
+
1576
+ Returns
1577
+ -------
1578
+ FlowFrame
1579
+ A new FlowFrame with exploded rows
1580
+ """
1581
+ new_node_id = generate_node_id()
1582
+
1583
+ all_columns = []
1584
+
1585
+ if isinstance(columns, (list, tuple)):
1586
+ all_columns.extend(
1587
+ [col.name if isinstance(col, Column) else col for col in columns]
1588
+ )
1589
+ else:
1590
+ all_columns.append(columns.name if isinstance(columns, Column) else columns)
1591
+
1592
+ if more_columns:
1593
+ for col in more_columns:
1594
+ all_columns.append(col.name if isinstance(col, Column) else col)
1595
+
1596
+ if len(all_columns) == 1:
1597
+ columns_str = f"'{all_columns[0]}'"
1598
+ else:
1599
+ columns_str = "[" + ", ".join([f"'{col}'" for col in all_columns]) + "]"
1600
+
1601
+ code = f"""
1602
+ # Explode columns into multiple rows
1603
+ output_df = input_df.explode({columns_str})
1604
+ """
1605
+
1606
+ cols_desc = ", ".join(all_columns)
1607
+ desc = description or f"Explode column(s): {cols_desc}"
1608
+
1609
+ # Add polars code node
1610
+ self._add_polars_code(new_node_id, code, desc)
1611
+
1612
+ return self._create_child_frame(new_node_id)
1613
+
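+ # Usage sketch (illustrative; `df` is an assumed FlowFrame with List-typed columns):
+ #
+ #     df.explode("tags")             # one output row per element of "tags"
+ #     df.explode("tags", "scores")   # explode several list columns at once
+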
1614
+ def text_to_rows(
1615
+ self,
1616
+ column: str | Column,
1617
+ output_column: str = None,
1618
+ delimiter: str = None,
1619
+ split_by_column: str = None,
1620
+ description: str = None,
1621
+ ) -> "FlowFrame":
1622
+ """
1623
+ Split text in a column into multiple rows.
1624
+
1625
+ This is equivalent to splitting the strings and then exploding the result in Polars.
1626
+
1627
+ Parameters
1628
+ ----------
1629
+ column : str or Column
1630
+ Column containing text to split
1631
+ output_column : str, optional
1632
+ Column name for the split values (defaults to input column name)
1633
+ delimiter : str, optional
1634
+ String delimiter to split text on when using a fixed value
1635
+ split_by_column : str, optional
1636
+ Alternative: column name containing the delimiter for each row
1637
+ If provided, this overrides the delimiter parameter
1638
+ description : str, optional
1639
+ Description of this operation for the ETL graph
1640
+
1641
+ Returns
1642
+ -------
1643
+ FlowFrame
1644
+ A new FlowFrame with text split into multiple rows
1645
+ """
1646
+ new_node_id = generate_node_id()
1647
+
1648
+ if isinstance(column, Column):
1649
+ column_name = column.name
1650
+ else:
1651
+ column_name = column
1652
+
1653
+ output_column = output_column or column_name
1654
+
1655
+ text_to_rows_input = transform_schema.TextToRowsInput(
1656
+ column_to_split=column_name,
1657
+ output_column_name=output_column,
1658
+ split_by_fixed_value=split_by_column is None,
1659
+ split_fixed_value=delimiter,
1660
+ split_by_column=split_by_column,
1661
+ )
1662
+
1663
+ # Create node settings
1664
+ text_to_rows_settings = input_schema.NodeTextToRows(
1665
+ flow_id=self.flow_graph.flow_id,
1666
+ node_id=new_node_id,
1667
+ text_to_rows_input=text_to_rows_input,
1668
+ pos_x=200,
1669
+ pos_y=150,
1670
+ is_setup=True,
1671
+ depending_on_id=self.node_id,
1672
+ description=description or f"Split text in '{column_name}' to rows",
1673
+ )
1674
+
1675
+ # Add to graph
1676
+ self.flow_graph.add_text_to_rows(text_to_rows_settings)
1677
+
1678
+ return self._create_child_frame(new_node_id)
1679
+
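+ # Usage sketch (illustrative; `df` is an assumed FlowFrame with a string column "tags"):
+ #
+ #     df.text_to_rows("tags", delimiter=",")                               # fixed delimiter
+ #     df.text_to_rows("tags", split_by_column="sep", output_column="tag")  # per-row delimiter
+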
1680
+ def unique(
1681
+ self,
1682
+ subset: Union[str, "Expr", List[Union[str, "Expr"]]] = None,
1683
+ *,
1684
+ keep: Literal["first", "last", "any", "none"] = "any",
1685
+ maintain_order: bool = False,
1686
+ description: str = None,
1687
+ ) -> "FlowFrame":
1688
+ """
1689
+ Drop duplicate rows from this dataframe.
1690
+
1691
+ Parameters
1692
+ ----------
1693
+ subset : str, Expr, list of str or Expr, optional
1694
+ Column name(s) or selector(s) to consider when identifying duplicate rows.
1695
+ If set to None (default), use all columns.
1696
+ keep : {'first', 'last', 'any', 'none'}, default 'any'
1697
+ Which of the duplicate rows to keep.
1698
+ * 'any': Does not give any guarantee of which row is kept.
1699
+ This allows more optimizations.
1700
+ * 'none': Don't keep duplicate rows.
1701
+ * 'first': Keep first unique row.
1702
+ * 'last': Keep last unique row.
1703
+ maintain_order : bool, default False
1704
+ Keep the same order as the original DataFrame. This is more expensive
1705
+ to compute. Setting this to True prevents the query from running on
1706
+ the streaming engine.
1707
+ description : str, optional
1708
+ Description of this operation for the ETL graph.
1709
+
1710
+ Returns
1711
+ -------
1712
+ FlowFrame
1713
+ DataFrame with unique rows.
1714
+ """
1715
+ new_node_id = generate_node_id()
1716
+
1717
+ processed_subset = None
1718
+ can_use_native = True
1719
+ if subset is not None:
1720
+ # Convert to list if single item
1721
+ if not isinstance(subset, (list, tuple)):
1722
+ subset = [subset]
1723
+
1724
+ # Extract column names
1725
+ processed_subset = []
1726
+ for col_expr in subset:
1727
+ if isinstance(col_expr, str):
1728
+ processed_subset.append(col_expr)
1729
+ elif isinstance(col_expr, Column):
1730
+ if col_expr._select_input.is_altered:
1731
+ can_use_native = False
1732
+ break
1733
+ processed_subset.append(col_expr.name)
1734
+ else:
1735
+ can_use_native = False
1736
+ break
1737
+
1738
+ # Determine if we can use the native implementation
1739
+ can_use_native = (
1740
+ can_use_native
1741
+ and keep in ["any", "first", "last", "none"]
1742
+ and not maintain_order
1743
+ )
1744
+
1745
+ if can_use_native:
1746
+ # Use the native NodeUnique implementation
1747
+ unique_input = transform_schema.UniqueInput(
1748
+ columns=processed_subset, strategy=keep
1749
+ )
1750
+
1751
+ # Create node settings
1752
+ unique_settings = input_schema.NodeUnique(
1753
+ flow_id=self.flow_graph.flow_id,
1754
+ node_id=new_node_id,
1755
+ unique_input=unique_input,
1756
+ pos_x=200,
1757
+ pos_y=150,
1758
+ is_setup=True,
1759
+ depending_on_id=self.node_id,
1760
+ description=description or f"Get unique rows (strategy: {keep})",
1761
+ )
1762
+
1763
+ # Add to graph using native implementation
1764
+ self.flow_graph.add_unique(unique_settings)
1765
+ else:
1766
+ # Generate polars code for more complex cases
1767
+ if subset is None:
1768
+ subset_str = "None"
1769
+ elif isinstance(subset, (list, tuple)):
1770
+ # Format each item in the subset list
1771
+ items = []
1772
+ for item in subset:
1773
+ if isinstance(item, str):
1774
+ items.append(f'"{item}"')
1775
+ else:
1776
+ # For expressions, use their string representation
1777
+ items.append(str(item))
1778
+ subset_str = f"[{', '.join(items)}]"
1779
+ else:
1780
+ # Single item that's not a string
1781
+ subset_str = str(subset)
1782
+
1783
+ code = f"""
1784
+ # Remove duplicate rows
1785
+ output_df = input_df.unique(
1786
+ subset={subset_str},
1787
+ keep='{keep}',
1788
+ maintain_order={maintain_order}
1789
+ )
1790
+ """
1791
+
1792
+ # Create descriptive text based on parameters
1793
+ subset_desc = "all columns" if subset is None else f"columns: {subset_str}"
1794
+ desc = description or f"Get unique rows using {subset_desc}, keeping {keep}"
1795
+
1796
+ # Add polars code node
1797
+ self._add_polars_code(new_node_id, code, desc)
1798
+
1799
+ return self._create_child_frame(new_node_id)
1800
+
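+ # Usage sketch (illustrative; `df` is an assumed FlowFrame):
+ #
+ #     df.unique(subset=["customer_id"], keep="first")                       # native unique node
+ #     df.unique(subset=["customer_id"], keep="first", maintain_order=True)  # Polars-code fallback
+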
1801
+ @property
1802
+ def columns(self) -> List[str]:
1803
+ """Get the column names."""
1804
+ return self.data.columns
1805
+
1806
+ @property
1807
+ def dtypes(self) -> List[pl.DataType]:
1808
+ """Get the column data types."""
1809
+ return self.data.dtypes
1810
+
1811
+ @property
1812
+ def schema(self) -> pl.schema.Schema:
1813
+ """Get an ordered mapping of column names to their data type."""
1814
+ return self.data.schema
1815
+
1816
+ @property
1817
+ def width(self) -> int:
1818
+ """Get the number of columns."""
1819
+ return self.data.width
1820
+
1821
+
1822
+ def _add_delegated_methods():
1823
+ """Add delegated methods from polars LazyFrame."""
1824
+ delegate_methods = [
1825
+ "collect_async",
1826
+ "profile",
1827
+ "describe",
1828
+ "explain",
1829
+ "show_graph",
1830
+ "serialize",
1831
+ "fetch",
1832
+ "get_meta",
1833
+ "columns",
1834
+ "dtypes",
1835
+ "schema",
1836
+ "estimated_size",
1837
+ "n_chunks",
1838
+ "is_empty",
1839
+ "chunk_lengths",
1840
+ "optimization_toggle",
1841
+ "set_polars_options",
1842
+ "collect_schema"
1843
+ ]
1844
+
1845
+ already_implemented = set(dir(FlowFrame))
1846
+
1847
+ for method_name in delegate_methods:
1848
+ if method_name not in already_implemented and hasattr(
1849
+ pl.LazyFrame, method_name
1850
+ ):
1851
+ # Create a simple delegate method
1852
+ def make_delegate(name):
1853
+ def delegate_method(self, *args, **kwargs):
1854
+ return getattr(self.data, name)(*args, **kwargs)
1855
+
1856
+ # Set docstring and name
1857
+ delegate_method.__doc__ = (
1858
+ f"See pl.LazyFrame.{name} for full documentation."
1859
+ )
1860
+ delegate_method.__name__ = name
1861
+ return delegate_method
1862
+
1863
+ # Add the method to the class
1864
+ setattr(FlowFrame, method_name, make_delegate(method_name))
1865
+
1866
+
1867
+ _add_delegated_methods()
1868
+
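+ # The delegation above forwards selected LazyFrame methods to the underlying
+ # pl.LazyFrame, so they can be called directly on a FlowFrame, e.g. (sketch):
+ #
+ #     df.explain()   # forwarded to pl.LazyFrame.explain
+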
1869
+
1870
+ def sum(expr):
1871
+ """Sum aggregation function."""
1872
+ if isinstance(expr, str):
1873
+ expr = col(expr)
1874
+ return expr.sum()
1875
+
1876
+
1877
+ def mean(expr):
1878
+ """Mean aggregation function."""
1879
+ if isinstance(expr, str):
1880
+ expr = col(expr)
1881
+ return expr.mean()
1882
+
1883
+
1884
+ def min(expr):
1885
+ """Min aggregation function."""
1886
+ if isinstance(expr, str):
1887
+ expr = col(expr)
1888
+ return expr.min()
1889
+
1890
+
1891
+ def max(expr):
1892
+ """Max aggregation function."""
1893
+ if isinstance(expr, str):
1894
+ expr = col(expr)
1895
+ return expr.max()
1896
+
1897
+
1898
+ def count(expr):
1899
+ """Count aggregation function."""
1900
+ if isinstance(expr, str):
1901
+ expr = col(expr)
1902
+ return expr.count()
1903
+
1904
+
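+ # These helpers wrap a column name into an expression before aggregating, e.g. (sketch):
+ #
+ #     sum("sales")       # equivalent to col("sales").sum()
+ #     max(col("price"))  # an existing expression passes through unchanged
+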
1905
+ def read_csv(file_path, *, flow_graph: FlowGraph = None, separator: str = ';',
1906
+ convert_to_absolute_path: bool = True,
1907
+ description: str = None, **options):
1908
+ """
1909
+ Read a CSV file into a FlowFrame.
1910
+
1911
+ Args:
1912
+ file_path: Path to CSV file
1913
+ flow_graph: if you want to add it to an existing graph
1914
+ separator: Single byte character to use as separator in the file.
1915
+ convert_to_absolute_path: Whether to store the path as an absolute path (a fixed location)
1916
+ description: if you want to add a readable name in the frontend (advised)
1917
+ **options: Options for polars.read_csv
1918
+
1919
+ Returns:
1920
+ A FlowFrame with the CSV data
1921
+ """
1922
+ # Create new node ID
1923
+ node_id = generate_node_id()
1924
+ if flow_graph is None:
1925
+ flow_graph = create_etl_graph()
1926
+
1927
+ flow_id = flow_graph.flow_id
1928
+
1929
+ has_headers = options.get('has_header', True)
1930
+ encoding = options.get('encoding', 'utf-8')
1931
+
1932
+ if '~' in file_path:
1933
+ file_path = os.path.expanduser(file_path)
1934
+
1935
+ received_table = input_schema.ReceivedTable(
1936
+ file_type='csv',
1937
+ path=file_path,
1938
+ name=Path(file_path).name,
1939
+ delimiter=separator,
1940
+ has_headers=has_headers,
1941
+ encoding=encoding
1942
+ )
1943
+
1944
+ if convert_to_absolute_path:
1945
+ received_table.path = received_table.abs_file_path
1946
+
1947
+ read_node = input_schema.NodeRead(
1948
+ flow_id=flow_id,
1949
+ node_id=node_id,
1950
+ received_file=received_table,
1951
+ pos_x=100,
1952
+ pos_y=100,
1953
+ is_setup=True,
+ description=description
1954
+ )
1955
+
1956
+ flow_graph.add_read(read_node)
1957
+
1958
+ return FlowFrame(
1959
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
1960
+ flow_graph=flow_graph,
1961
+ node_id=node_id
1962
+ )
1963
+
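+ # Usage sketch (illustrative; file paths are assumptions):
+ #
+ #     df = read_csv("sales.csv", separator=",", description="Load raw sales")
+ #     df = read_csv("~/data/sales.csv")   # "~" is expanded; the default separator is ';'
+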
1964
+
1965
+ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
1966
+ convert_to_absolute_path: bool = True, **options) -> FlowFrame:
1967
+ """
1968
+ Read a Parquet file into a FlowFrame.
1969
+
1970
+ Args:
1971
+ file_path: Path to Parquet file
1972
+ flow_graph: if you want to add it to an existing graph
1973
+ description: if you want to add a readable name in the frontend (advised)
1974
+ convert_to_absolute_path: Whether to store the path as an absolute path (a fixed location)
1975
+ **options: Options for polars.read_parquet
1976
+
1977
+ Returns:
1978
+ A FlowFrame with the Parquet data
1979
+ """
1980
+ if '~' in file_path:
1981
+ file_path = os.path.expanduser(file_path)
1982
+ node_id = generate_node_id()
1983
+
1984
+ if flow_graph is None:
1985
+ flow_graph = create_etl_graph()
1986
+
1987
+ flow_id = flow_graph.flow_id
1988
+
1989
+ received_table = input_schema.ReceivedTable(
1990
+ file_type='parquet',
1991
+ path=file_path,
1992
+ name=Path(file_path).name,
1993
+ )
1994
+ if convert_to_absolute_path:
1995
+ received_table.path = received_table.abs_file_path
1996
+
1997
+ read_node = input_schema.NodeRead(
1998
+ flow_id=flow_id,
1999
+ node_id=node_id,
2000
+ received_file=received_table,
2001
+ pos_x=100,
2002
+ pos_y=100,
2003
+ is_setup=True,
2004
+ description=description
2005
+ )
2006
+
2007
+ flow_graph.add_read(read_node)
2008
+
2009
+ return FlowFrame(
2010
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2011
+ flow_graph=flow_graph,
2012
+ node_id=node_id
2013
+ )
2014
+
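+ # Usage sketch (illustrative; the file path is an assumption):
+ #
+ #     df = read_parquet("events.parquet", description="Load events")
+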
2015
+
2016
+ def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
2017
+ """
2018
+ Create a FlowFrame from a dictionary or list of dictionaries.
2019
+
2020
+ Args:
2021
+ data: Dictionary of lists or list of dictionaries
2022
+ flow_graph: if you want to add it to an existing graph
2023
+ description: if you want to add a readable name in the frontend (advised)
2024
+ Returns:
2025
+ A FlowFrame with the data
2026
+ """
2027
+ # Create new node ID
2028
+ node_id = generate_node_id()
2029
+
2030
+ if not flow_graph:
2031
+ flow_graph = create_etl_graph()
2032
+ flow_id = flow_graph.flow_id
2033
+
2034
+ input_node = input_schema.NodeManualInput(
2035
+ flow_id=flow_id,
2036
+ node_id=node_id,
2037
+ raw_data=FlowDataEngine(data).to_pylist(),
2038
+ pos_x=100,
2039
+ pos_y=100,
2040
+ is_setup=True,
2041
+ description=description
2042
+ )
2043
+
2044
+ # Add to graph
2045
+ flow_graph.add_manual_input(input_node)
2046
+
2047
+ # Return new frame
2048
+ return FlowFrame(
2049
+ data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
2050
+ flow_graph=flow_graph,
2051
+ node_id=node_id
2052
+ )
2053
+
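+ # Usage sketch (illustrative):
+ #
+ #     df = from_dict({"id": [1, 2, 3], "name": ["a", "b", "c"]}, description="Manual input")
+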
2054
+
2055
+ def concat(frames: List['FlowFrame'],
2056
+ how: str = 'vertical',
2057
+ rechunk: bool = False,
2058
+ parallel: bool = True,
2059
+ description: str = None) -> 'FlowFrame':
2060
+ """
2061
+ Concatenate multiple FlowFrames into one.
2062
+
2063
+ Parameters
2064
+ ----------
2065
+ frames : List[FlowFrame]
2066
+ List of FlowFrames to concatenate
2067
+ how : str, default 'vertical'
2068
+ How to combine the FlowFrames (see concat method documentation)
2069
+ rechunk : bool, default False
2070
+ Whether to ensure contiguous memory in result
2071
+ parallel : bool, default True
2072
+ Whether to use parallel processing for the operation
2073
+ description : str, optional
2074
+ Description of this operation
2075
+
2076
+ Returns
2077
+ -------
2078
+ FlowFrame
2079
+ A new FlowFrame with the concatenated data
2080
+ """
2081
+ if not frames:
2082
+ raise ValueError("No frames provided to concat_frames")
2083
+
2084
+ if len(frames) == 1:
2085
+ return frames[0]
2086
+
2087
+ # Use first frame's concat method with remaining frames
2088
+ first_frame = frames[0]
2089
+ remaining_frames = frames[1:]
2090
+
2091
+ return first_frame.concat(remaining_frames, how=how,
2092
+ rechunk=rechunk, parallel=parallel,
2093
+ description=description)
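+
+ # Usage sketch (illustrative; `df1`, `df2` and `df3` are assumed FlowFrames):
+ #
+ #     combined = concat([df1, df2, df3], how="diagonal_relaxed")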