Flowfile 0.3.3.2__py3-none-any.whl → 0.3.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile has been flagged as possibly problematic.

Files changed (76)
  1. flowfile/__init__.py +1 -1
  2. flowfile/__main__.py +0 -2
  3. flowfile/api.py +94 -46
  4. flowfile/web/__init__.py +4 -1
  5. flowfile/web/static/assets/{AirbyteReader-2b1cf2d8.js → AirbyteReader-e08044e5.js} +8 -8
  6. flowfile/web/static/assets/{CrossJoin-cc3ab73c.js → CrossJoin-dfcf7351.js} +8 -8
  7. flowfile/web/static/assets/{DatabaseConnectionSettings-307c4652.js → DatabaseConnectionSettings-b2afb1d7.js} +2 -2
  8. flowfile/web/static/assets/{DatabaseManager-69faa6e1.js → DatabaseManager-824a49b2.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseReader-e4134cd0.js → DatabaseReader-a48124d8.js} +9 -9
  10. flowfile/web/static/assets/{DatabaseWriter-d32d75b1.js → DatabaseWriter-b47cbae2.js} +9 -9
  11. flowfile/web/static/assets/{ExploreData-5eb48389.js → ExploreData-fdfc45a4.js} +5 -5
  12. flowfile/web/static/assets/{ExternalSource-29489051.js → ExternalSource-861b0e71.js} +6 -6
  13. flowfile/web/static/assets/{Filter-031332bb.js → Filter-f87bb897.js} +8 -9
  14. flowfile/web/static/assets/{Formula-3b900540.js → Formula-1e2ed720.js} +9 -10
  15. flowfile/web/static/assets/{FuzzyMatch-dee31153.js → FuzzyMatch-b6cc4fdd.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-ca74eb47.js → GraphSolver-6a371f4c.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-081b6591.js → GroupBy-f7b7f472.js} +6 -6
  18. flowfile/web/static/assets/{Join-b467376f.js → Join-eec38203.js} +9 -9
  19. flowfile/web/static/assets/{ManualInput-ffffb80a.js → ManualInput-9aaa46fb.js} +5 -5
  20. flowfile/web/static/assets/{Output-9a87d4ba.js → Output-3b2ca045.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-ee3e6093.js → Pivot-a4f5d88f.js} +6 -6
  22. flowfile/web/static/assets/PolarsCode-49ce444f.js +306 -0
  23. flowfile/web/static/assets/{Read-67fee3a0.js → Read-07acdc9a.js} +6 -6
  24. flowfile/web/static/assets/{RecordCount-a2acd02d.js → RecordCount-6a21da56.js} +5 -5
  25. flowfile/web/static/assets/{RecordId-0c8bcd77.js → RecordId-949bdc17.js} +6 -6
  26. flowfile/web/static/assets/{Sample-60594a3a.js → Sample-7afca6e1.js} +5 -5
  27. flowfile/web/static/assets/{SecretManager-bbcec2ac.js → SecretManager-b41c029d.js} +2 -2
  28. flowfile/web/static/assets/{Select-9540e6ca.js → Select-32b28406.js} +8 -8
  29. flowfile/web/static/assets/{SettingsSection-48f28104.js → SettingsSection-a0f15a05.js} +1 -1
  30. flowfile/web/static/assets/{Sort-6dbe3633.js → Sort-fc6ba0e2.js} +6 -6
  31. flowfile/web/static/assets/{TextToRows-27aab4a8.js → TextToRows-23127596.js} +8 -8
  32. flowfile/web/static/assets/{UnavailableFields-8143044b.js → UnavailableFields-c42880a3.js} +2 -2
  33. flowfile/web/static/assets/{Union-52460248.js → Union-39eecc6c.js} +5 -5
  34. flowfile/web/static/assets/{Unique-f6962644.js → Unique-a0e8fe61.js} +8 -8
  35. flowfile/web/static/assets/{Unpivot-1ff1e938.js → Unpivot-1e2d43f0.js} +5 -5
  36. flowfile/web/static/assets/{api-3b345d92.js → api-44ca9e9c.js} +1 -1
  37. flowfile/web/static/assets/{designer-2394122a.css → designer-186f2e71.css} +64 -9
  38. flowfile/web/static/assets/{designer-4736134f.js → designer-267d44f1.js} +2835 -105
  39. flowfile/web/static/assets/{documentation-b9545eba.js → documentation-6c0810a2.js} +1 -1
  40. flowfile/web/static/assets/{dropDown-d5a4014c.js → dropDown-52790b15.js} +1 -1
  41. flowfile/web/static/assets/{dropDownGeneric-1f4e32ec.js → dropDownGeneric-60f56a8a.js} +2 -2
  42. flowfile/web/static/assets/{fullEditor-f4791c23.js → fullEditor-e272b506.js} +2 -3
  43. flowfile/web/static/assets/{genericNodeSettings-1d456350.js → genericNodeSettings-4bdcf98e.js} +3 -3
  44. flowfile/web/static/assets/{index-f25c9283.js → index-e235a8bc.js} +12 -12
  45. flowfile/web/static/assets/{nodeTitle-cad6fd9d.js → nodeTitle-fc3fc4b7.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-01f07e2c.js → secretApi-cdc2a3fd.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-f46a4e3f.js → selectDynamic-96aa82cd.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-eb98fc8b.js → vue-codemirror.esm-25e75a08.js} +610 -29
  49. flowfile/web/static/assets/{vue-content-loader.es-860c0380.js → vue-content-loader.es-6c4b1c24.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.3.2.dist-info → flowfile-0.3.4.1.dist-info}/METADATA +16 -3
  52. {flowfile-0.3.3.2.dist-info → flowfile-0.3.4.1.dist-info}/RECORD +73 -73
  53. flowfile_core/flowfile/code_generator/__init__.py +0 -0
  54. flowfile_core/flowfile/code_generator/code_generator.py +723 -0
  55. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1 -1
  56. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +1 -1
  57. flowfile_core/flowfile/{FlowfileFlow.py → flow_graph.py} +3 -3
  58. flowfile_core/flowfile/flow_graph_utils.py +1 -1
  59. flowfile_core/flowfile/handler.py +1 -1
  60. flowfile_core/flowfile/manage/open_flowfile.py +1 -1
  61. flowfile_core/flowfile/util/calculate_layout.py +1 -1
  62. flowfile_core/routes/routes.py +11 -1
  63. flowfile_core/schemas/input_schema.py +2 -1
  64. flowfile_frame/adapters.py +1 -1
  65. flowfile_frame/flow_frame.py +1 -4
  66. flowfile_frame/flow_frame.pyi +1 -1
  67. flowfile_frame/flow_frame_methods.py +1 -1
  68. flowfile_frame/lazy.py +1 -1
  69. flowfile_frame/utils.py +1 -1
  70. flowfile/readme.md +0 -127
  71. flowfile/web/static/assets/PolarsCode-03921254.js +0 -2865
  72. flowfile/web/static/assets/PopOver-3bdf8951.js +0 -577
  73. /flowfile/web/static/assets/{PopOver-bccfde04.css → vue-codemirror-bccfde04.css} +0 -0
  74. {flowfile-0.3.3.2.dist-info → flowfile-0.3.4.1.dist-info}/LICENSE +0 -0
  75. {flowfile-0.3.3.2.dist-info → flowfile-0.3.4.1.dist-info}/WHEEL +0 -0
  76. {flowfile-0.3.3.2.dist-info → flowfile-0.3.4.1.dist-info}/entry_points.txt +0 -0
flowfile_core/flowfile/code_generator/code_generator.py (new file)
@@ -0,0 +1,723 @@
+ from typing import List, Dict, Optional, Set, Tuple, Any
+ from collections import defaultdict
+ import polars as pl
+
+ from flowfile_core.flowfile.flow_graph import FlowGraph
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, convert_pl_type_to_string
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+ from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
+ from flowfile_core.schemas import input_schema, transform_schema
+ from flowfile_core.configs import logger
+
+
+ class FlowGraphToPolarsConverter:
+     """
+     Converts a FlowGraph into executable Polars code.
+
+     This class takes a FlowGraph instance and generates standalone Python code
+     that uses only Polars, without any Flowfile dependencies.
+     """
+     flow_graph: FlowGraph
+     node_var_mapping: Dict[int, str]
+     imports: Set[str]
+     code_lines: List[str]
+     output_nodes: List[Tuple[int, str]] = []
+     last_node_var: Optional[str] = None
+
+     def __init__(self, flow_graph: FlowGraph):
+         self.flow_graph = flow_graph
+         self.node_var_mapping: Dict[int, str] = {}  # Maps node_id to variable name
+         self.imports: Set[str] = {"import polars as pl"}
+         self.code_lines: List[str] = []
+         self.output_nodes = []
+         self.last_node_var = None
+
+     def convert(self) -> str:
+         """
+         Main method to convert the FlowGraph to Polars code.
+
+         Returns:
+             str: Complete Python code that can be executed standalone
+         """
+         # Get execution order
+         execution_order = determine_execution_order(
+             all_nodes=[node for node in self.flow_graph.nodes if node.is_correct],
+             flow_starts=self.flow_graph._flow_starts + self.flow_graph.get_implicit_starter_nodes()
+         )
+
+         # Generate code for each node in order, so every node's input
+         # variables are defined before they are referenced
+         for node in execution_order:
+             self._generate_node_code(node)
+
+         # Combine everything
+         return self._build_final_code()
+
+     def handle_output_node(self, node: FlowNode, var_name: str) -> None:
+         settings = node.setting_input
+         if hasattr(settings, 'is_flow_output') and settings.is_flow_output:
+             self.output_nodes.append((node.node_id, var_name))
+
+     def _generate_node_code(self, node: FlowNode) -> None:
+         """Generate Polars code for a specific node."""
+         node_type = node.node_type
+         settings = node.setting_input
+         # Skip placeholder nodes
+         if isinstance(settings, input_schema.NodePromise):
+             self._add_comment(f"# Skipping uninitialized node: {node.node_id}")
+             return
+         # Create variable name for this node's output
+         var_name = f"df_{node.node_id}"
+         self.node_var_mapping[node.node_id] = var_name
+         self.handle_output_node(node, var_name)
+         if node.node_template.output > 0:
+             self.last_node_var = var_name
+         # Get input variable names
+         input_vars = self._get_input_vars(node)
+         # Route to appropriate handler based on node type
+         handler = getattr(self, f"_handle_{node_type}", None)
+         if handler:
+             handler(settings, var_name, input_vars)
+         else:
+             self._add_comment(f"# TODO: Implement handler for node type: {node_type}")
+             raise Exception(f"No handler implemented for node type: {node_type}")
+
+     def _get_input_vars(self, node: FlowNode) -> Dict[str, str]:
+         """Get input variable names for a node."""
+         input_vars = {}
+
+         if node.node_inputs.main_inputs:
+             if len(node.node_inputs.main_inputs) == 1:
+                 input_vars['main'] = self.node_var_mapping.get(
+                     node.node_inputs.main_inputs[0].node_id, 'df'
+                 )
+             else:
+                 for i, input_node in enumerate(node.node_inputs.main_inputs):
+                     input_vars[f'main_{i}'] = self.node_var_mapping.get(
+                         input_node.node_id, f'df_{i}'
+                     )
+
+         if node.node_inputs.left_input:
+             input_vars['left'] = self.node_var_mapping.get(
+                 node.node_inputs.left_input.node_id, 'df_left'
+             )
+
+         if node.node_inputs.right_input:
+             input_vars['right'] = self.node_var_mapping.get(
+                 node.node_inputs.right_input.node_id, 'df_right'
+             )
+
+         return input_vars
+
+     def _handle_csv_read(self, file_settings: input_schema.ReceivedTable, var_name: str):
+         if file_settings.encoding.lower() in ('utf-8', 'utf8'):
+             encoding = "utf8-lossy"
+             self._add_code(f"{var_name} = pl.scan_csv(")
+             self._add_code(f' "{file_settings.abs_file_path}",')
+             self._add_code(f' separator="{file_settings.delimiter}",')
+             self._add_code(f' has_header={file_settings.has_headers},')
+             self._add_code(f' ignore_errors={file_settings.ignore_errors},')
+             self._add_code(f' encoding="{encoding}",')
+             self._add_code(f' skip_rows={file_settings.starting_from_line},')
+             self._add_code(")")
+         else:
+             self._add_code(f"{var_name} = pl.read_csv(")
+             self._add_code(f' "{file_settings.abs_file_path}",')
+             self._add_code(f' separator="{file_settings.delimiter}",')
+             self._add_code(f' has_header={file_settings.has_headers},')
+             self._add_code(f' ignore_errors={file_settings.ignore_errors},')
+             if file_settings.encoding:
+                 self._add_code(f' encoding="{file_settings.encoding}",')
+             self._add_code(f' skip_rows={file_settings.starting_from_line},')
+             self._add_code(").lazy()")
+
+     def _handle_read(self, settings: input_schema.NodeRead, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle file reading nodes."""
+         file_settings = settings.received_file
+
+         if file_settings.file_type == 'csv':
+             self._handle_csv_read(file_settings, var_name)
+
+         elif file_settings.file_type == 'parquet':
+             self._add_code(f'{var_name} = pl.scan_parquet("{file_settings.abs_file_path}")')
+
+         elif file_settings.file_type in ('xlsx', 'excel'):
+             self._add_code(f"{var_name} = pl.read_excel(")
+             self._add_code(f' "{file_settings.abs_file_path}",')
+             if file_settings.sheet_name:
+                 self._add_code(f' sheet_name="{file_settings.sheet_name}",')
+             self._add_code(").lazy()")
+
+         self._add_code("")
+
+     @staticmethod
+     def _generate_pl_schema_with_typing(flowfile_schema: List[FlowfileColumn]) -> str:
+         polars_schema_str = "pl.Schema([" + ", ".join(f'("{flowfile_column.column_name}", pl.{flowfile_column.data_type})'
+                                                       for flowfile_column in flowfile_schema) + "])"
+         return polars_schema_str
+
+     def get_manual_schema_input(self, flowfile_schema: List[FlowfileColumn]) -> str:
+         polars_schema_str = self._generate_pl_schema_with_typing(flowfile_schema)
+         is_valid_pl_schema = self._validate_pl_schema(polars_schema_str)
+         if is_valid_pl_schema:
+             return polars_schema_str
+         else:
+             return "[" + ", ".join([f'"{c.name}"' for c in flowfile_schema]) + "]"
+
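+     # Note: the schema string below is round-tripped through eval() purely as
+     # a validity check, with a namespace restricted to the Polars module; on
+     # failure the generator falls back to a plain list of column names.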
+     @staticmethod
+     def _validate_pl_schema(pl_schema_str: str) -> bool:
+         try:
+             _globals = {"pl": pl}
+             eval(pl_schema_str, _globals)
+             return True
+         except Exception as e:
+             logger.error(f"Invalid Polars schema: {e}")
+             return False
+
+     def _handle_manual_input(self, settings: input_schema.NodeManualInput, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle manual data input nodes."""
+         if settings.raw_data_format:
+             data = settings.raw_data_format.data
+             flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
+             schema = self.get_manual_schema_input(flowfile_schema)
+             self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
+         else:
+             self._add_code(f"{var_name} = pl.LazyFrame({settings.raw_data})")
+         self._add_code("")
+
+     def _handle_filter(self, settings: input_schema.NodeFilter, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle filter nodes."""
+         input_df = input_vars.get('main', 'df')
+
+         if settings.filter_input.filter_type == 'advanced':
+             # Parse the advanced filter expression
+             self.imports.add(
+                 "from polars_expr_transformer.process.polars_expr_transformer import simple_function_to_expr"
+             )
+             self._add_code(f"{var_name} = {input_df}.filter(")
+             self._add_code(f'simple_function_to_expr("{settings.filter_input.advanced_filter}")')
+             self._add_code(")")
+         else:
+             # Handle basic filter
+             basic = settings.filter_input.basic_filter
+             filter_expr = self._create_basic_filter_expr(basic)
+             self._add_code(f"{var_name} = {input_df}.filter({filter_expr})")
+         self._add_code("")
+
+     def _handle_record_count(self, settings: input_schema.NodeRecordCount, var_name: str, input_vars: Dict[str, str]):
+         input_df = input_vars.get('main', 'df')
+         self._add_code(f"{var_name} = {input_df}.select(pl.len().alias('number_of_records'))")
+
+     def _handle_graph_solver(self, settings: input_schema.NodeGraphSolver, var_name: str, input_vars: Dict[str, str]):
+         input_df = input_vars.get('main', 'df')
+         from_col_name = settings.graph_solver_input.col_from
+         to_col_name = settings.graph_solver_input.col_to
+         output_col_name = settings.graph_solver_input.output_column_name
+         self._add_code(f'{var_name} = {input_df}.with_columns(graph_solver(pl.col("{from_col_name}"), '
+                        f'pl.col("{to_col_name}"))'
+                        f'.alias("{output_col_name}"))')
+         self._add_code("")
+         self.imports.add("from polars_grouper import graph_solver")
+
+     def _handle_select(self, settings: input_schema.NodeSelect, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle select/rename nodes."""
+         input_df = input_vars.get('main', 'df')
+         # Get columns to keep and renames
+         select_exprs = []
+         for select_input in settings.select_input:
+             if select_input.keep and select_input.is_available:
+                 if select_input.old_name != select_input.new_name:
+                     expr = f'pl.col("{select_input.old_name}").alias("{select_input.new_name}")'
+                 else:
+                     expr = f'pl.col("{select_input.old_name}")'
+
+                 if (select_input.data_type_change or select_input.is_altered) and select_input.data_type:
+                     polars_dtype = self._get_polars_dtype(select_input.data_type)
+                     expr = f'{expr}.cast({polars_dtype})'
+
+                 select_exprs.append(expr)
+
+         if select_exprs:
+             self._add_code(f"{var_name} = {input_df}.select([")
+             for expr in select_exprs:
+                 self._add_code(f" {expr},")
+             self._add_code("])")
+         else:
+             self._add_code(f"{var_name} = {input_df}")
+         self._add_code("")
+
+     def _handle_join(self, settings: input_schema.NodeJoin, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle join nodes."""
+         left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
+         right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
+
+         # Extract join keys
+         left_on = [jm.left_col for jm in settings.join_input.join_mapping]
+         right_on = [jm.right_col for jm in settings.join_input.join_mapping]
+
+         self._add_code(f"{var_name} = {left_df}.join(")
+         self._add_code(f" {right_df},")
+         self._add_code(f" left_on={left_on},")
+         self._add_code(f" right_on={right_on},")
+         self._add_code(f' how="{settings.join_input.how}"')
+         self._add_code(")")
+         self._add_code("")
+
+     def _handle_group_by(self, settings: input_schema.NodeGroupBy, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle group by nodes."""
+         input_df = input_vars.get('main', 'df')
+
+         # Separate groupby columns from aggregation columns
+         group_cols = []
+         agg_exprs = []
+
+         for agg_col in settings.groupby_input.agg_cols:
+             if agg_col.agg == 'groupby':
+                 group_cols.append(agg_col.old_name)
+             else:
+                 agg_func = self._get_agg_function(agg_col.agg)
+                 expr = f'pl.col("{agg_col.old_name}").{agg_func}().alias("{agg_col.new_name}")'
+                 agg_exprs.append(expr)
+
+         self._add_code(f"{var_name} = {input_df}.group_by({group_cols}).agg([")
+         for expr in agg_exprs:
+             self._add_code(f" {expr},")
+         self._add_code("])")
+         self._add_code("")
+
+     def _handle_formula(self, settings: input_schema.NodeFormula, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle formula/expression nodes."""
+         input_df = input_vars.get('main', 'df')
+         self.imports.add("from polars_expr_transformer.process.polars_expr_transformer import simple_function_to_expr")
+
+         # Convert SQL-like formula to Polars expression
+         formula = settings.function.function
+         col_name = settings.function.field.name
+         self._add_code(f"{var_name} = {input_df}.with_columns([")
+         self._add_code(f'simple_function_to_expr({repr(formula)}).alias("{col_name}")')
+         if settings.function.field.data_type not in (None, "Auto"):
+             output_type = convert_pl_type_to_string(cast_str_to_polars_type(settings.function.field.data_type))
+             if output_type[:3] != "pl.":
+                 output_type = "pl." + output_type
+             self._add_code(f' .cast({output_type})')
+
+         self._add_code("])")
+         self._add_code("")
+
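+     # Polars' pivot() requires at least one index column. When no index is
+     # configured, a constant "__temp_index__" column is added so the pivot can
+     # run, and dropped again afterwards (see _handle_pivot_no_index below).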
+     def _handle_pivot_no_index(self, settings: input_schema.NodePivot, var_name: str, input_df: str, agg_func: str):
+         pivot_input = settings.pivot_input
+
+         self._add_code(f'{var_name} = ({input_df}.collect()')
+         self._add_code(' .with_columns(pl.lit(1).alias("__temp_index__"))')
+         self._add_code(' .pivot(')
+         self._add_code(f' values="{pivot_input.value_col}",')
+         self._add_code(f' index=["__temp_index__"],')
+         self._add_code(f' columns="{pivot_input.pivot_column}",')
+         self._add_code(f' aggregate_function="{agg_func}"')
+         self._add_code(" )")
+         self._add_code(' .drop("__temp_index__")')
+         self._add_code(").lazy()")
+         self._add_code("")
+
+     def _handle_pivot(self, settings: input_schema.NodePivot, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle pivot nodes."""
+         input_df = input_vars.get('main', 'df')
+         pivot_input = settings.pivot_input
+         if len(pivot_input.aggregations) > 1:
+             logger.error("Multiple aggregations are not convertible to polars code. "
+                          "Taking the first value")
+         if len(pivot_input.aggregations) > 0:
+             agg_func = pivot_input.aggregations[0]
+         else:
+             agg_func = 'first'
+         if len(settings.pivot_input.index_columns) == 0:
+             self._handle_pivot_no_index(settings, var_name, input_df, agg_func)
+         else:
+             # Generate pivot code
+             self._add_code(f"{var_name} = {input_df}.collect().pivot(")
+             self._add_code(f" values='{pivot_input.value_col}',")
+             self._add_code(f" index={pivot_input.index_columns},")
+             self._add_code(f" columns='{pivot_input.pivot_column}',")
+             self._add_code(f" aggregate_function='{agg_func}'")
+             self._add_code(").lazy()")
+             self._add_code("")
+
+     def _handle_unpivot(self, settings: input_schema.NodeUnpivot, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle unpivot nodes."""
+         input_df = input_vars.get('main', 'df')
+         unpivot_input = settings.unpivot_input
+
+         self._add_code(f"{var_name} = {input_df}.unpivot(")
+
+         if unpivot_input.index_columns:
+             self._add_code(f" index={unpivot_input.index_columns},")
+
+         if unpivot_input.value_columns:
+             self._add_code(f" on={unpivot_input.value_columns},")
+
+         self._add_code(" variable_name='variable',")
+         self._add_code(" value_name='value'")
+         self._add_code(")")
+         self._add_code("")
+
+     def _handle_union(self, settings: input_schema.NodeUnion, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle union nodes."""
+         # Get all input LazyFrames
+         dfs = []
+         if 'main' in input_vars:
+             dfs.append(input_vars['main'])
+         else:
+             # Multiple main inputs
+             for key, df_var in input_vars.items():
+                 if key.startswith('main'):
+                     dfs.append(df_var)
+
+         if settings.union_input.mode == 'relaxed':
+             how = 'diagonal_relaxed'
+         else:
+             how = 'diagonal'
+
+         self._add_code(f"{var_name} = pl.concat([")
+         for df in dfs:
+             self._add_code(f" {df},")
+         self._add_code(f"], how='{how}')")
+         self._add_code("")
+
+     def _handle_sort(self, settings: input_schema.NodeSort, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle sort nodes."""
+         input_df = input_vars.get('main', 'df')
+
+         sort_cols = []
+         descending = []
+
+         for sort_input in settings.sort_input:
+             sort_cols.append(f'"{sort_input.column}"')
+             descending.append(sort_input.how == 'desc')
+
+         self._add_code(f"{var_name} = {input_df}.sort([{', '.join(sort_cols)}], descending={descending})")
+         self._add_code("")
+
+     def _handle_sample(self, settings: input_schema.NodeSample, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle sample nodes."""
+         input_df = input_vars.get('main', 'df')
+         self._add_code(f"{var_name} = {input_df}.head(n={settings.sample_size})")
+         self._add_code("")
+
+     def _handle_unique(self, settings: input_schema.NodeUnique, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle unique/distinct nodes."""
+         input_df = input_vars.get('main', 'df')
+
+         if settings.unique_input.columns:
+             self._add_code(f"{var_name} = {input_df}.unique(subset={settings.unique_input.columns}, keep='{settings.unique_input.strategy}')")
+         else:
+             self._add_code(f"{var_name} = {input_df}.unique(keep='{settings.unique_input.strategy}')")
+         self._add_code("")
+
+     def _handle_text_to_rows(self, settings: input_schema.NodeTextToRows, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle text to rows (explode) nodes."""
+         input_df = input_vars.get('main', 'df')
+         text_input = settings.text_to_rows_input
+
+         # First split the column
+         split_expr = f'pl.col("{text_input.column_to_split}").str.split("{text_input.split_fixed_value}")'
+         if text_input.output_column_name and text_input.output_column_name != text_input.column_to_split:
+             split_expr = f'{split_expr}.alias("{text_input.output_column_name}")'
+             explode_col = text_input.output_column_name
+         else:
+             explode_col = text_input.column_to_split
+
+         self._add_code(f"{var_name} = {input_df}.with_columns({split_expr}).explode('{explode_col}')")
+         self._add_code("")
+
+     # .with_columns(
+     #     (pl.cum_count(record_id_settings.output_column_name)
+     #      .over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
+     #     .alias(record_id_settings.output_column_name)
+     # )
+     def _handle_record_id(self, settings: input_schema.NodeRecordId, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle record ID nodes."""
+         input_df = input_vars.get('main', 'df')
+         record_input = settings.record_id_input
+         if record_input.group_by and record_input.group_by_columns:
+
+             # Row number within groups
+             self._add_code(f"{var_name} = ({input_df}")
+             self._add_code(f" .with_columns(pl.lit(1).alias('{record_input.output_column_name}'))")
+             self._add_code(f" .with_columns([")
+             self._add_code(f" (pl.cum_count('{record_input.output_column_name}').over({record_input.group_by_columns}) + {record_input.offset} - 1)")
+             self._add_code(f" .alias('{record_input.output_column_name}')")
+             self._add_code("])")
+             self._add_code(f".select(['{record_input.output_column_name}'] + [col for col in {input_df}.columns if col != '{record_input.output_column_name}'])")
+             self._add_code(")")
+         else:
+             # Simple row number
+             self._add_code(f"{var_name} = {input_df}.with_row_count(name='{record_input.output_column_name}', offset={record_input.offset})")
+         self._add_code("")
+
+     def _handle_cross_join(self, settings: input_schema.NodeCrossJoin, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle cross join nodes."""
+         left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
+         right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
+
+         self._add_code(f"{var_name} = {left_df}.join({right_df}, how='cross')")
+         self._add_code("")
+
+     def _handle_output(self, settings: input_schema.NodeOutput, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle output nodes."""
+         input_df = input_vars.get('main', 'df')
+         output_settings = settings.output_settings
+
+         if output_settings.file_type == 'csv':
+             self._add_code(f'{input_df}.sink_csv(')
+             self._add_code(f' "{output_settings.abs_file_path}",')
+             self._add_code(f' separator="{output_settings.output_csv_table.delimiter}"')
+             self._add_code(')')
+
+         elif output_settings.file_type == 'parquet':
+             self._add_code(f'{input_df}.sink_parquet("{output_settings.abs_file_path}")')
+
+         elif output_settings.file_type == 'excel':
+             self._add_code(f'{input_df}.collect().write_excel(')
+             self._add_code(f' "{output_settings.abs_file_path}",')
+             self._add_code(f' worksheet="{output_settings.output_excel_table.sheet_name}"')
+             self._add_code(')')
+
+         self._add_code("")
+
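+     # Custom Polars code snippets may be either a bare expression or a block
+     # of statements. Expressions are returned directly from the generated
+     # wrapper function; statement blocks are replayed inside it, returning the
+     # target of the last assignment when the snippet has no explicit return.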
+     def _handle_polars_code(self, settings: input_schema.NodePolarsCode, var_name: str, input_vars: Dict[str, str]) -> None:
+         """Handle custom Polars code nodes."""
+         code = settings.polars_code_input.polars_code.strip()
+         # Determine function parameters based on number of inputs
+         if len(input_vars) == 0:
+             params = ""
+             args = ""
+         elif len(input_vars) == 1:
+             params = "input_df: pl.LazyFrame"
+             input_df = list(input_vars.values())[0]
+             args = input_df
+         else:
+             # Multiple inputs
+             param_list = []
+             arg_list = []
+             i = 1
+             for key in sorted(input_vars.keys()):
+                 if key.startswith('main'):
+                     param_list.append(f"input_df_{i}: pl.LazyFrame")
+                     arg_list.append(input_vars[key])
+                     i += 1
+             params = ", ".join(param_list)
+             args = ", ".join(arg_list)
+
+         # Check if the code is just an expression (no assignment); '=' signs
+         # that belong to comparison operators do not count as assignments
+         stripped_code = code.replace('==', '').replace('!=', '').replace('>=', '').replace('<=', '')
+         is_expression = '=' not in stripped_code
+
+         # Wrap the code in a function
+         self._add_code(f"# Custom Polars code")
+         self._add_code(f"def _polars_code_{var_name.replace('df_', '')}({params}):")
+
+         # Handle the code based on its structure
+         if is_expression:
+             # It's just an expression, return it directly
+             self._add_code(f" return {code}")
+         else:
+             # It contains assignments
+             for line in code.split('\n'):
+                 if line.strip():
+                     self._add_code(f" {line}")
+
+             # If no explicit return, try to detect what to return
+             if 'return' not in code:
+                 # Try to find the last assignment
+                 lines = [l.strip() for l in code.split('\n') if l.strip() and '=' in l]
+                 if lines:
+                     last_assignment = lines[-1]
+                     if '=' in last_assignment:
+                         output_var = last_assignment.split('=')[0].strip()
+                         self._add_code(f" return {output_var}")
+
+         self._add_code("")
+
+         # Call the function
+         self._add_code(f"{var_name} = _polars_code_{var_name.replace('df_', '')}({args})")
+         self._add_code("")
+
+     # Helper methods
+
+     def _add_code(self, line: str) -> None:
+         """Add a line of code."""
+         self.code_lines.append(line)
+
+     def _add_comment(self, comment: str) -> None:
+         """Add a comment line."""
+         self.code_lines.append(comment)
+
+     def _parse_filter_expression(self, expr: str) -> str:
+         """Parse Flowfile filter expression to Polars expression."""
+         # This is a simplified parser - you'd need more sophisticated parsing
+         # Handle patterns like [column]>value or [column]="value"
+
+         import re
+
+         # Pattern: [column_name]operator"value" or [column_name]operatorvalue
+         pattern = r'\[([^\]]+)\]([><=!]+)"?([^"]*)"?'
+
+         def replace_expr(match):
+             col, op, val = match.groups()
+
+             # Map operators
+             op_map = {
+                 '=': '==',
+                 '!=': '!=',
+                 '>': '>',
+                 '<': '<',
+                 '>=': '>=',
+                 '<=': '<='
+             }
+
+             polars_op = op_map.get(op, op)
+
+             # Check if value is numeric
+             try:
+                 float(val)
+                 return f'pl.col("{col}") {polars_op} {val}'
+             except ValueError:
+                 return f'pl.col("{col}") {polars_op} "{val}"'
+
+         return re.sub(pattern, replace_expr, expr)
+
+     def _create_basic_filter_expr(self, basic: transform_schema.BasicFilter) -> str:
+         """Create Polars expression from basic filter."""
+         col = f'pl.col("{basic.field}")'
+
+         if basic.filter_type == 'equals':
+             return f'{col} == "{basic.filter_value}"'
+         elif basic.filter_type == 'not_equals':
+             return f'{col} != "{basic.filter_value}"'
+         elif basic.filter_type == 'greater':
+             return f'{col} > {basic.filter_value}'
+         elif basic.filter_type == 'less':
+             return f'{col} < {basic.filter_value}'
+         elif basic.filter_type == 'in':
+             values = basic.filter_value.split(',')
+             # col already holds the pl.col(...) expression string
+             return f'{col}.is_in({values})'
+         return col
+
+     def _get_polars_dtype(self, dtype_str: str) -> str:
+         """Convert Flowfile dtype string to Polars dtype."""
+         dtype_map = {
+             'String': 'pl.Utf8',
+             'Integer': 'pl.Int64',
+             'Double': 'pl.Float64',
+             'Boolean': 'pl.Boolean',
+             'Date': 'pl.Date',
+             'Datetime': 'pl.Datetime',
+             'Float32': 'pl.Float32',
+             'Float64': 'pl.Float64',
+             'Int32': 'pl.Int32',
+             'Int64': 'pl.Int64',
+             'Utf8': 'pl.Utf8',
+         }
+         return dtype_map.get(dtype_str, 'pl.Utf8')
+
+     def _get_agg_function(self, agg: str) -> str:
+         """Get Polars aggregation function name."""
+         agg_map = {
+             'avg': 'mean',
+             'average': 'mean',
+             'concat': 'str.concat',
+         }
+         return agg_map.get(agg, agg)
+
+     def _sql_to_polars_expr(self, sql_expr: str) -> str:
+         """Convert SQL-like expression to Polars expression."""
+         # This is a very simplified converter
+         # In practice, you'd want a proper SQL parser
+
+         # Replace column references
+         import re
+
+         # Pattern for column names (simplified)
+         col_pattern = r'\b([a-zA-Z_][a-zA-Z0-9_]*)\b'
+
+         def replace_col(match):
+             col_name = match.group(1)
+             # Skip SQL keywords
+             keywords = {'CASE', 'WHEN', 'THEN', 'ELSE', 'END', 'AND', 'OR', 'NOT', 'IN', 'AS'}
+             if col_name.upper() in keywords:
+                 return col_name
+             return f'pl.col("{col_name}")'
+
+         result = re.sub(col_pattern, replace_col, sql_expr)
+
+         # Handle CASE WHEN
+         if 'CASE' in result:
+             # This would need proper parsing
+             result = "pl.when(...).then(...).otherwise(...)"
+
+         return result
+
+     def add_return_code(self, lines: List[str]) -> None:
+         if self.output_nodes:
+             # Return marked output nodes
+             if len(self.output_nodes) == 1:
+                 # Single output
+                 _, var_name = self.output_nodes[0]
+                 lines.append(f" return {var_name}")
+             else:
+                 # Multiple outputs - return as dictionary
+                 lines.append(" return {")
+                 for node_id, var_name in self.output_nodes:
+                     lines.append(f' "node_{node_id}": {var_name},')
+                 lines.append(" }")
+         elif self.last_node_var:
+             lines.append(f" return {self.last_node_var}")
+         else:
+             lines.append(" return None")
+
+     def _build_final_code(self) -> str:
+         """Build the final Python code."""
+         lines = []
+
+         # Add imports
+         lines.extend(sorted(self.imports))
+         lines.append("")
+         lines.append("")
+
+         # Add main function
+         lines.append("def run_etl_pipeline():")
+         lines.append(' """')
+         lines.append(f' ETL Pipeline: {self.flow_graph.__name__}')
+         lines.append(' Generated from Flowfile')
+         lines.append(' """')
+         lines.append(" ")
+
+         # Add the generated code
+         for line in self.code_lines:
+             if line:
+                 lines.append(f" {line}")
+             else:
+                 lines.append("")
+         # Add main block
+         lines.append("")
+         self.add_return_code(lines)
+         lines.append("")
+         lines.append("")
+         lines.append('if __name__ == "__main__":')
+         lines.append(" pipeline_output = run_etl_pipeline()")
+
+         return "\n".join(lines)
+
+
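+ # Roughly, for a two-node flow (manual input -> filter), the generated script
+ # has this shape:
+ #
+ #     import polars as pl
+ #
+ #     def run_etl_pipeline():
+ #         df_1 = pl.LazyFrame(...)
+ #         df_2 = df_1.filter(...)
+ #         return df_2
+ #
+ #     if __name__ == "__main__":
+ #         pipeline_output = run_etl_pipeline()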
+ # Example usage function
+ def export_flow_to_polars(flow_graph: FlowGraph) -> str:
+     """
+     Export a FlowGraph to standalone Polars code.
+
+     Args:
+         flow_graph: The FlowGraph instance to convert
+
+     Returns:
+         str: Python code that can be executed standalone
+     """
+     converter = FlowGraphToPolarsConverter(flow_graph)
+     return converter.convert()
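
Example usage of the new code generator (a minimal sketch; `flow_graph` is assumed to be an already-constructed FlowGraph instance, since flow loading happens outside this module):

    from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars

    # Convert the graph to standalone Polars code and write it out as a
    # script that runs without any Flowfile dependencies.
    polars_code = export_flow_to_polars(flow_graph)  # flow_graph: FlowGraph (assumed)
    with open("pipeline.py", "w") as f:
        f.write(polars_code)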