Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (145)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
@@ -1,5 +1,4 @@
1
- from typing import List, Dict, Optional, Set, Tuple, Any
2
- from collections import defaultdict
1
+ from typing import List, Dict, Optional, Set, Tuple
3
2
  import polars as pl
4
3
 
5
4
  from flowfile_core.flowfile.flow_graph import FlowGraph
@@ -131,6 +130,42 @@ class FlowGraphToPolarsConverter:
131
130
  self._add_code(f' skip_rows={file_settings.starting_from_line},')
132
131
  self._add_code(").lazy()")
133
132
 
133
+ def _handle_cloud_storage_reader(self, settings: input_schema.NodeCloudStorageReader, var_name: str, input_vars: Dict[str, str]):
134
+ cloud_read_settings = settings.cloud_storage_settings
135
+ self.imports.add(
136
+ "import flowfile as ff"
137
+ )
138
+ if cloud_read_settings.file_format == "csv":
139
+ self._add_code(f"{var_name} = ff.scan_csv_from_cloud_storage(")
140
+ self._add_code(f' "{cloud_read_settings.resource_path}",')
141
+ self._add_code(f' connection_name="{cloud_read_settings.connection_name}",')
142
+ self._add_code(f' scan_mode="{cloud_read_settings.scan_mode}",')
143
+ self._add_code(f' delimiter="{cloud_read_settings.csv_delimiter}",')
144
+ self._add_code(f' has_header={cloud_read_settings.csv_has_header},')
145
+ self._add_code(f' encoding="{cloud_read_settings.csv_encoding}",')
146
+
147
+ elif cloud_read_settings.file_format == "parquet":
148
+ self._add_code(f"{var_name} = ff.scan_parquet_from_cloud_storage(")
149
+ self._add_code(f' "{cloud_read_settings.resource_path}",')
150
+ self._add_code(f' connection_name="{cloud_read_settings.connection_name}",')
151
+ self._add_code(f' scan_mode="{cloud_read_settings.scan_mode}",')
152
+
153
+ elif cloud_read_settings.file_format == "json":
154
+ self._add_code(f"{var_name} = ff.scan_json_from_cloud_storage(")
155
+ self._add_code(f' "{cloud_read_settings.resource_path}",')
156
+ self._add_code(f' connection_name="{cloud_read_settings.connection_name}",')
157
+ self._add_code(f' scan_mode="{cloud_read_settings.scan_mode}",')
158
+
159
+ elif cloud_read_settings.file_format == "delta":
160
+ self._add_code(f"{var_name} = ff.scan_delta(")
161
+ self._add_code(f' "{cloud_read_settings.resource_path}",')
162
+ self._add_code(f' connection_name="{cloud_read_settings.connection_name}",')
163
+ self._add_code(f' scan_mode="{cloud_read_settings.scan_mode}",')
164
+ self._add_code(f' version_id={cloud_read_settings.delta_version},')
165
+ else:
166
+ return
167
+ self._add_code(").data")
168
+
134
169
  def _handle_read(self, settings: input_schema.NodeRead, var_name: str, input_vars: Dict[str, str]) -> None:
135
170
  """Handle file reading nodes."""
136
171
  file_settings = settings.received_file
@@ -176,13 +211,10 @@ class FlowGraphToPolarsConverter:
176
211
 
177
212
  def _handle_manual_input(self, settings: input_schema.NodeManualInput, var_name: str, input_vars: Dict[str, str]) -> None:
178
213
  """Handle manual data input nodes."""
179
- if settings.raw_data_format:
180
- data = settings.raw_data_format.data
181
- flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
182
- schema = self.get_manual_schema_input(flowfile_schema)
183
- self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
184
- else:
185
- self._add_code(f"{var_name} = pl.LazyFrame({settings.raw_data})")
214
+ data = settings.raw_data_format.data
215
+ flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
216
+ schema = self.get_manual_schema_input(flowfile_schema)
217
+ self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
186
218
  self._add_code("")
187
219
 
188
220
  def _handle_filter(self, settings: input_schema.NodeFilter, var_name: str, input_vars: Dict[str, str]) -> None:
@@ -247,21 +279,410 @@ class FlowGraphToPolarsConverter:
247
279
  self._add_code("")
248
280
 
249
281
  def _handle_join(self, settings: input_schema.NodeJoin, var_name: str, input_vars: Dict[str, str]) -> None:
250
- """Handle join nodes."""
282
+ """Handle join nodes by routing to appropriate join type handler.
283
+
284
+ This is the main entry point for processing join operations. It determines
285
+ the type of join and delegates to the appropriate handler method.
286
+
287
+ Args:
288
+ settings: NodeJoin settings containing join configuration
289
+ var_name: Name of the variable to store the joined DataFrame
290
+ input_vars: Dictionary mapping input names to DataFrame variable names
291
+
292
+ Returns:
293
+ None: Modifies internal state by adding generated code
294
+ """
251
295
  left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
252
296
  right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
253
297
 
254
- # Extract join keys
298
+ # Ensure left and right DataFrames are distinct
299
+ if left_df == right_df:
300
+ right_df = "df_right"
301
+ self._add_code(f"{right_df} = {left_df}")
302
+
303
+ if settings.join_input.how in ("semi", "anti"):
304
+ self._handle_semi_anti_join(settings, var_name, left_df, right_df)
305
+ else:
306
+ self._handle_standard_join(settings, var_name, left_df, right_df)
307
+
308
+ def _handle_semi_anti_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
309
+ right_df: str) -> None:
310
+ """Handle semi and anti joins which only return rows from the left DataFrame.
311
+
312
+ Semi joins return rows from left DataFrame that have matches in right.
313
+ Anti joins return rows from left DataFrame that have no matches in right.
314
+ These joins are simpler as they don't require column management from right DataFrame.
315
+
316
+ Args:
317
+ settings: NodeJoin settings containing join configuration
318
+ var_name: Name of the variable to store the result
319
+ left_df: Variable name of the left DataFrame
320
+ right_df: Variable name of the right DataFrame
321
+
322
+ Returns:
323
+ None: Modifies internal state by adding generated code
324
+ """
255
325
  left_on = [jm.left_col for jm in settings.join_input.join_mapping]
256
326
  right_on = [jm.right_col for jm in settings.join_input.join_mapping]
257
327
 
258
- self._add_code(f"{var_name} = {left_df}.join(")
259
- self._add_code(f" {right_df},")
260
- self._add_code(f" left_on={left_on},")
261
- self._add_code(f" right_on={right_on},")
262
- self._add_code(f' how="{settings.join_input.how}"')
328
+ self._add_code(f"{var_name} = ({left_df}.join(")
329
+ self._add_code(f" {right_df},")
330
+ self._add_code(f" left_on={left_on},")
331
+ self._add_code(f" right_on={right_on},")
332
+ self._add_code(f' how="{settings.join_input.how}"')
333
+ self._add_code(" )")
334
+ self._add_code(")")
335
+
336
+ def _handle_standard_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
337
+ right_df: str) -> None:
338
+ """Handle standard joins (left, right, inner, outer) with full column management.
339
+
340
+ Standard joins may include columns from both DataFrames and require careful
341
+ management of column names, duplicates, and transformations. This method
342
+ orchestrates the complete join process including pre/post transformations.
343
+
344
+ Process:
345
+ 1. Auto-rename columns to avoid conflicts
346
+ 2. Extract join keys
347
+ 3. Apply pre-join transformations (renames, drops)
348
+ 4. Handle join-specific key transformations
349
+ 5. Execute join with post-processing
350
+
351
+ Args:
352
+ settings: NodeJoin settings containing join configuration
353
+ var_name: Name of the variable to store the result
354
+ left_df: Variable name of the left DataFrame
355
+ right_df: Variable name of the right DataFrame
356
+
357
+ Returns:
358
+ None: Modifies internal state by adding generated code
359
+ """
360
+ settings.join_input.auto_rename()
361
+
362
+ # Get join keys
363
+ left_on, right_on = self._get_join_keys(settings)
364
+
365
+ # Apply pre-join transformations
366
+ left_df, right_df = self._apply_pre_join_transformations(settings, left_df, right_df)
367
+
368
+ # Handle join-specific key transformations
369
+ left_on, right_on, reverse_action, after_join_drop_cols = self._handle_join_key_transformations(
370
+ settings, left_df, right_df, left_on, right_on
371
+ )
372
+
373
+ # Execute the join
374
+ self._execute_join_with_post_processing(
375
+ settings, var_name, left_df, right_df, left_on, right_on,
376
+ after_join_drop_cols, reverse_action
377
+ )
378
+
379
+ def _get_join_keys(self, settings: input_schema.NodeJoin) -> Tuple[List[str], List[str]]:
380
+ """Extract join keys based on join type.
381
+
382
+ Different join types require different handling of join keys:
383
+ - For outer/right joins: Uses renamed column names for right DataFrame
384
+ - For other joins: Uses original column names from join mapping
385
+
386
+ Args:
387
+ settings: NodeJoin settings containing join configuration
388
+
389
+ Returns:
390
+ Tuple[List[str], List[str]]: Lists of (left_on, right_on) column names
391
+ """
392
+ left_on = [jm.left_col for jm in settings.join_input.get_names_for_table_rename()]
393
+
394
+ if settings.join_input.how in ("outer", "right"):
395
+ right_on = [jm.right_col for jm in settings.join_input.get_names_for_table_rename()]
396
+ else:
397
+ right_on = [jm.right_col for jm in settings.join_input.join_mapping]
398
+
399
+ return left_on, right_on
400
+
401
+ def _apply_pre_join_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str) -> Tuple[
402
+ str, str]:
403
+ """Apply column renames and drops before the join operation.
404
+
405
+ Pre-join transformations prepare DataFrames by:
406
+ - Renaming columns according to user specifications
407
+ - Dropping columns marked as not to keep (except join keys)
408
+ - Special handling for right/outer joins where join keys may need preservation
409
+
410
+ Args:
411
+ settings: NodeJoin settings containing column rename/drop specifications
412
+ left_df: Variable name of the left DataFrame
413
+ right_df: Variable name of the right DataFrame
414
+
415
+ Returns:
416
+ Tuple[str, str]: The same DataFrame variable names (left_df, right_df)
417
+ Note: DataFrames are modified via generated code, not new variables
418
+ """
419
+ # Calculate renames and drops
420
+ right_renames = {
421
+ column.old_name: column.new_name
422
+ for column in settings.join_input.right_select.renames
423
+ if
424
+ column.old_name != column.new_name and not column.join_key or settings.join_input.how in ("outer", "right")
425
+ }
426
+
427
+ left_renames = {
428
+ column.old_name: column.new_name
429
+ for column in settings.join_input.left_select.renames
430
+ if column.old_name != column.new_name
431
+ }
432
+
433
+ left_drop_columns = [
434
+ column.old_name for column in settings.join_input.left_select.renames
435
+ if not column.keep and not column.join_key
436
+ ]
437
+
438
+ right_drop_columns = [
439
+ column.old_name for column in settings.join_input.right_select.renames
440
+ if not column.keep and not column.join_key
441
+ ]
442
+
443
+ # Apply transformations
444
+ if right_renames:
445
+ self._add_code(f"{right_df} = {right_df}.rename({right_renames})")
446
+ if left_renames:
447
+ self._add_code(f"{left_df} = {left_df}.rename({left_renames})")
448
+ if left_drop_columns:
449
+ self._add_code(f"{left_df} = {left_df}.drop({left_drop_columns})")
450
+ if right_drop_columns:
451
+ self._add_code(f"{right_df} = {right_df}.drop({right_drop_columns})")
452
+
453
+ return left_df, right_df
454
+
455
+ def _handle_join_key_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str,
456
+ left_on: List[str], right_on: List[str]) \
457
+ -> Tuple[List[str], List[str], Optional[Dict], List[str]]:
458
+ """Route to appropriate join-specific key transformation handler.
459
+
460
+ Different join types require different strategies for handling join keys
461
+ to avoid conflicts and preserve necessary columns.
462
+
463
+ Args:
464
+ settings: NodeJoin settings containing join configuration
465
+ left_df: Variable name of the left DataFrame
466
+ right_df: Variable name of the right DataFrame
467
+ left_on: List of left DataFrame column names to join on
468
+ right_on: List of right DataFrame column names to join on
469
+
470
+ Returns:
471
+ Tuple containing:
472
+ - left_on: Potentially modified list of left join columns
473
+ - right_on: Potentially modified list of right join columns
474
+ - reverse_action: Dictionary for renaming columns after join (or None)
475
+ - after_join_drop_cols: List of columns to drop after join
476
+ """
477
+ join_type = settings.join_input.how
478
+
479
+ if join_type in ("left", "inner"):
480
+ return self._handle_left_inner_join_keys(settings, right_df, left_on, right_on)
481
+ elif join_type == "right":
482
+ return self._handle_right_join_keys(settings, left_df, left_on, right_on)
483
+ elif join_type == "outer":
484
+ return self._handle_outer_join_keys(settings, right_df, left_on, right_on)
485
+ else:
486
+ return left_on, right_on, None, []
487
+
488
+ def _handle_left_inner_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
489
+ left_on: List[str], right_on: List[str]) -> Tuple[
490
+ List[str], List[str], Dict, List[str]]:
491
+ """Handle key transformations for left and inner joins.
492
+
493
+ For left/inner joins:
494
+ - Join keys from left DataFrame are preserved
495
+ - Right DataFrame join keys are temporarily renamed with __DROP__ prefix
496
+ - After join, these temporary columns can be renamed back if needed
497
+
498
+ Args:
499
+ settings: NodeJoin settings containing join configuration
500
+ right_df: Variable name of the right DataFrame
501
+ left_on: List of left DataFrame column names to join on
502
+ right_on: List of right DataFrame column names to join on
503
+
504
+ Returns:
505
+ Tuple containing:
506
+ - left_on: Unchanged left join columns
507
+ - right_on: Unchanged right join columns
508
+ - reverse_action: Mapping to rename __DROP__ columns after join
509
+ - after_join_drop_cols: Left join keys marked for dropping
510
+ """
511
+ left_join_keys_to_keep = [jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep]
512
+
513
+ join_key_duplication_command = [
514
+ f'pl.col("{rjk.old_name}").alias("__DROP__{rjk.new_name}__DROP__")'
515
+ for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
516
+ ]
517
+
518
+ reverse_action = {
519
+ f"__DROP__{rjk.new_name}__DROP__": rjk.new_name
520
+ for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
521
+ }
522
+
523
+ if join_key_duplication_command:
524
+ self._add_code(f"{right_df} = {right_df}.with_columns([{', '.join(join_key_duplication_command)}])")
525
+
526
+ after_join_drop_cols = [
527
+ k.new_name for k in settings.join_input.left_select.join_key_selects
528
+ if not k.keep
529
+ ]
530
+
531
+ return left_on, right_on, reverse_action, after_join_drop_cols
532
+
533
+ def _handle_right_join_keys(self, settings: input_schema.NodeJoin, left_df: str,
534
+ left_on: List[str], right_on: List[str]) -> Tuple[
535
+ List[str], List[str], None, List[str]]:
536
+ """Handle key transformations for right joins.
537
+
538
+ For right joins:
539
+ - Join keys from right DataFrame are preserved
540
+ - Left DataFrame join keys are prefixed with __jk_ to avoid conflicts
541
+ - Polars appends "_right" suffix to conflicting column names
542
+
543
+ Args:
544
+ settings: NodeJoin settings containing join configuration
545
+ left_df: Variable name of the left DataFrame
546
+ left_on: List of left DataFrame column names to join on
547
+ right_on: List of right DataFrame column names to join on
548
+
549
+ Returns:
550
+ Tuple containing:
551
+ - left_on: Modified left join columns with __jk_ prefix where needed
552
+ - right_on: Unchanged right join columns
553
+ - reverse_action: None (no post-join renaming needed)
554
+ - after_join_drop_cols: Right join keys marked for dropping
555
+ """
556
+ join_key_duplication_command = [
557
+ f'pl.col("{ljk.new_name}").alias("__jk_{ljk.new_name}")'
558
+ for ljk in settings.join_input.left_select.join_key_selects if ljk.keep
559
+ ]
560
+
561
+ # Update left_on keys
562
+ for position, left_on_key in enumerate(left_on):
563
+ left_on_select = settings.join_input.left_select.get_select_input_on_new_name(left_on_key)
564
+ if left_on_select and left_on_select.keep:
565
+ left_on[position] = f"__jk_{left_on_select.new_name}"
566
+
567
+ if join_key_duplication_command:
568
+ self._add_code(f"{left_df} = {left_df}.with_columns([{', '.join(join_key_duplication_command)}])")
569
+
570
+ # Calculate columns to drop after join
571
+ left_join_keys_keep = {jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep}
572
+ after_join_drop_cols_right = [
573
+ jk.new_name if jk.new_name not in left_join_keys_keep else jk.new_name + "_right"
574
+ for jk in settings.join_input.right_select.join_key_selects if not jk.keep
575
+ ]
576
+ after_join_drop_cols = list(set(after_join_drop_cols_right))
577
+
578
+ return left_on, right_on, None, after_join_drop_cols
579
+
580
+ def _handle_outer_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
581
+ left_on: List[str], right_on: List[str]) -> Tuple[
582
+ List[str], List[str], Dict, List[str]]:
583
+ """Handle key transformations for outer joins.
584
+
585
+ For outer joins:
586
+ - Both left and right join keys may need to be preserved
587
+ - Right DataFrame join keys are prefixed with __jk_ when they conflict
588
+ - Post-join renaming reverses the __jk_ prefix
589
+
590
+ Args:
591
+ settings: NodeJoin settings containing join configuration
592
+ right_df: Variable name of the right DataFrame
593
+ left_on: List of left DataFrame column names to join on
594
+ right_on: List of right DataFrame column names to join on
595
+
596
+ Returns:
597
+ Tuple containing:
598
+ - left_on: Unchanged left join columns
599
+ - right_on: Modified right join columns with __jk_ prefix where needed
600
+ - reverse_action: Mapping to remove __jk_ prefix after join
601
+ - after_join_drop_cols: Combined list of columns to drop from both sides
602
+ """
603
+ left_join_keys = {jk.new_name for jk in settings.join_input.left_select.join_key_selects}
604
+
605
+ join_keys_to_keep_and_rename = [
606
+ rjk for rjk in settings.join_input.right_select.join_key_selects
607
+ if rjk.keep and rjk.new_name in left_join_keys
608
+ ]
609
+
610
+ join_key_rename_command = {
611
+ rjk.new_name: f"__jk_{rjk.new_name}"
612
+ for rjk in join_keys_to_keep_and_rename
613
+ }
614
+
615
+ # Update right_on keys
616
+ for position, right_on_key in enumerate(right_on):
617
+ right_on_select = settings.join_input.right_select.get_select_input_on_new_name(right_on_key)
618
+ if right_on_select and right_on_select.keep and right_on_select.new_name in left_join_keys:
619
+ right_on[position] = f"__jk_{right_on_select.new_name}"
620
+
621
+ if join_key_rename_command:
622
+ self._add_code(f"{right_df} = {right_df}.rename({join_key_rename_command})")
623
+
624
+ reverse_action = {f"__jk_{rjk.new_name}": rjk.new_name for rjk in join_keys_to_keep_and_rename}
625
+
626
+ # Calculate columns to drop after join
627
+ after_join_drop_cols_left = [
628
+ jk.new_name for jk in settings.join_input.left_select.join_key_selects if not jk.keep
629
+ ]
630
+ after_join_drop_cols_right = [
631
+ jk.new_name if jk.new_name not in left_join_keys else jk.new_name + "_right"
632
+ for jk in settings.join_input.right_select.join_key_selects if not jk.keep
633
+ ]
634
+ after_join_drop_cols = after_join_drop_cols_left + after_join_drop_cols_right
635
+
636
+ return left_on, right_on, reverse_action, after_join_drop_cols
637
+
638
+ def _execute_join_with_post_processing(self, settings: input_schema.NodeJoin, var_name: str,
639
+ left_df: str, right_df: str, left_on: List[str], right_on: List[str],
640
+ after_join_drop_cols: List[str], reverse_action: Optional[Dict]) -> None:
641
+ """Execute the join operation and apply post-processing steps.
642
+
643
+ Generates the actual join code with any necessary post-processing:
644
+ 1. Executes the join operation
645
+ 2. For right joins: Collects to eager mode (Polars requirement)
646
+ 3. Drops unnecessary columns
647
+ 4. Renames temporary columns back to final names
648
+ 5. For right joins: Converts back to lazy mode
649
+
650
+ Args:
651
+ settings: NodeJoin settings containing join configuration
652
+ var_name: Name of the variable to store the result
653
+ left_df: Variable name of the left DataFrame
654
+ right_df: Variable name of the right DataFrame
655
+ left_on: List of left DataFrame column names to join on
656
+ right_on: List of right DataFrame column names to join on
657
+ after_join_drop_cols: List of columns to drop after join
658
+ reverse_action: Dictionary for renaming columns after join (or None)
659
+
660
+ Returns:
661
+ None: Modifies internal state by adding generated code
662
+ """
663
+ self._add_code(f"{var_name} = ({left_df}.join(")
664
+ self._add_code(f" {right_df},")
665
+ self._add_code(f" left_on={left_on},")
666
+ self._add_code(f" right_on={right_on},")
667
+ self._add_code(f' how="{settings.join_input.how}"')
668
+ self._add_code(" )")
669
+
670
+ # Handle right join special case
671
+ if settings.join_input.how == 'right':
672
+ self._add_code(".collect()") # Right join needs to be collected first cause of issue with rename
673
+
674
+ # Apply post-join transformations
675
+ if after_join_drop_cols:
676
+ self._add_code(f".drop({after_join_drop_cols})")
677
+
678
+ if reverse_action:
679
+ self._add_code(f".rename({reverse_action})")
680
+
681
+ # Convert back to lazy for right joins
682
+ if settings.join_input.how == 'right':
683
+ self._add_code(f".lazy()")
684
+
263
685
  self._add_code(")")
264
- self._add_code("")
265
686
 
266
687
  def _handle_group_by(self, settings: input_schema.NodeGroupBy, var_name: str, input_vars: Dict[str, str]) -> None:
267
688
  """Handle group by nodes."""
@@ -462,6 +883,40 @@ class FlowGraphToPolarsConverter:
462
883
  self._add_code(f"{var_name} = {left_df}.join({right_df}, how='cross')")
463
884
  self._add_code("")
464
885
 
886
+ def _handle_cloud_storage_writer(self, settings: input_schema.NodeCloudStorageWriter, var_name: str, input_vars: Dict[str, str]) -> None:
887
+ """Handle cloud storage writer nodes."""
888
+ input_df = input_vars.get('main', 'df')
889
+ # def write_csv_to_cloud_storage(self, path: str, connection_name: typing.Optional[str] = None, delimiter: str = ';', encoding: typing.Literal['utf8', 'utf8-lossy'] = 'utf8', description: Optional[str] = None) -> 'FlowFrame': ...
890
+
891
+ output_settings = settings.cloud_storage_settings
892
+ self.imports.add("import flowfile as ff")
893
+ self._add_code(f"(ff.FlowFrame({input_df})")
894
+ if output_settings.file_format == "csv":
895
+ self._add_code(f' .write_csv_to_cloud_storage(')
896
+ self._add_code(f' path="{output_settings.resource_path}",')
897
+ self._add_code(f' connection_name="{output_settings.connection_name}",')
898
+ self._add_code(f' delimiter="{output_settings.csv_delimiter}",')
899
+ self._add_code(f' encoding="{output_settings.csv_encoding}",')
900
+ self._add_code(f' description="{settings.description}"')
901
+ elif output_settings.file_format == "parquet":
902
+ self._add_code(f' .write_parquet_to_cloud_storage(')
903
+ self._add_code(f' path="{output_settings.resource_path}",')
904
+ self._add_code(f' connection_name="{output_settings.connection_name}",')
905
+ self._add_code(f' description="{settings.description}"')
906
+ elif output_settings.file_format == "json":
907
+ self._add_code(f' .write_json_to_cloud_storage(')
908
+ self._add_code(f' path="{output_settings.resource_path}",')
909
+ self._add_code(f' connection_name="{output_settings.connection_name}",')
910
+ self._add_code(f' description="{settings.description}"')
911
+ elif output_settings.file_format == "delta":
912
+ self._add_code(f' .write_delta(')
913
+ self._add_code(f' path="{output_settings.resource_path}",')
914
+ self._add_code(f' write_mode="{output_settings.write_mode}",')
915
+ self._add_code(f' connection_name="{output_settings.connection_name}",')
916
+ self._add_code(f' description="{settings.description}"')
917
+ self._add_code(' )')
918
+ self._add_code(')')
919
+
465
920
  def _handle_output(self, settings: input_schema.NodeOutput, var_name: str, input_vars: Dict[str, str]) -> None:
466
921
  """Handle output nodes."""
467
922
  input_df = input_vars.get('main', 'df')
@@ -7,4 +7,4 @@ class Connection:
7
7
  group: str # e.g. source-faker
8
8
  name: str # e.g. source-faker-100000
9
9
  config_setting: Any
10
- type: str = None # e.g. airbyte
10
+ type: str = None