Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (98)
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
  5. flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
  13. flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
  14. flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
  15. flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
  19. flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
  21. flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
  24. flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
  27. flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
  29. flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
  31. flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
  34. flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
  35. flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
  37. flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
  38. flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
  39. flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
  40. flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
  44. flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
  45. flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
  52. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
  53. flowfile_core/__init__.py +1 -0
  54. flowfile_core/auth/jwt.py +39 -0
  55. flowfile_core/configs/node_store/nodes.py +1 -0
  56. flowfile_core/configs/settings.py +6 -5
  57. flowfile_core/flowfile/code_generator/code_generator.py +71 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
  60. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  61. flowfile_core/flowfile/flow_graph.py +619 -191
  62. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  63. flowfile_core/flowfile/flow_node/flow_node.py +500 -89
  64. flowfile_core/flowfile/flow_node/models.py +125 -20
  65. flowfile_core/flowfile/handler.py +2 -33
  66. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  67. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  68. flowfile_core/flowfile/utils.py +36 -5
  69. flowfile_core/main.py +32 -13
  70. flowfile_core/routes/cloud_connections.py +7 -11
  71. flowfile_core/routes/logs.py +2 -6
  72. flowfile_core/routes/public.py +1 -0
  73. flowfile_core/routes/routes.py +127 -51
  74. flowfile_core/routes/secrets.py +72 -14
  75. flowfile_core/schemas/__init__.py +8 -0
  76. flowfile_core/schemas/input_schema.py +92 -64
  77. flowfile_core/schemas/output_model.py +19 -3
  78. flowfile_core/schemas/schemas.py +144 -11
  79. flowfile_core/schemas/transform_schema.py +82 -17
  80. flowfile_frame/__init__.py +9 -1
  81. flowfile_frame/cloud_storage/__init__.py +0 -0
  82. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  83. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  84. flowfile_frame/expr.py +28 -1
  85. flowfile_frame/expr.pyi +76 -61
  86. flowfile_frame/flow_frame.py +232 -110
  87. flowfile_frame/flow_frame.pyi +140 -91
  88. flowfile_frame/flow_frame_methods.py +150 -12
  89. flowfile_frame/group_frame.py +3 -0
  90. flowfile_frame/utils.py +25 -3
  91. test_utils/s3/data_generator.py +1 -0
  92. test_utils/s3/demo_data_generator.py +186 -0
  93. test_utils/s3/fixtures.py +6 -1
  94. flowfile_core/schemas/defaults.py +0 -9
  95. flowfile_core/schemas/models.py +0 -193
  96. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -2,7 +2,7 @@ import datetime
2
2
  import pickle
3
3
  import polars as pl
4
4
  import fastexcel
5
- import copy
5
+ import re
6
6
  from fastapi.exceptions import HTTPException
7
7
  from time import time
8
8
  from functools import partial
@@ -11,6 +11,7 @@ from uuid import uuid1
11
11
  from copy import deepcopy
12
12
  from pyarrow.parquet import ParquetFile
13
13
  from flowfile_core.configs import logger
14
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
14
15
  from flowfile_core.configs.flow_logger import FlowLogger
15
16
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
16
17
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
@@ -23,8 +24,10 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
23
24
  get_calamine_xlsx_data_types
24
25
  from flowfile_core.flowfile.sources import external_sources
25
26
  from flowfile_core.schemas import input_schema, schemas, transform_schema
26
- from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
27
- from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal, FullCloudStorageConnection,
27
+ from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
28
+ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
29
+ CloudStorageWriteSettingsInternal,
30
+ FullCloudStorageConnection,
28
31
  get_cloud_storage_write_settings_worker_interface, AuthMethod)
29
32
  from flowfile_core.flowfile.utils import snake_case_to_camel_case
30
33
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
@@ -45,6 +48,21 @@ from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layou
45
48
 
46
49
  def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
47
50
  end_row: int, end_column: int, has_headers: bool):
51
+ """Calculates the schema of an XLSX file by reading a sample of rows.
52
+
53
+ Args:
54
+ engine: The engine to use for reading ('openpyxl' or 'calamine').
55
+ file_path: The path to the XLSX file.
56
+ sheet_name: The name of the sheet to read.
57
+ start_row: The starting row for data reading.
58
+ start_column: The starting column for data reading.
59
+ end_row: The ending row for data reading.
60
+ end_column: The ending column for data reading.
61
+ has_headers: A boolean indicating if the file has a header row.
62
+
63
+ Returns:
64
+ A list of FlowfileColumn objects representing the schema.
65
+ """
48
66
  try:
49
67
  logger.info('Starting to calculate the schema')
50
68
  if engine == 'openpyxl':
@@ -67,26 +85,69 @@ def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int
67
85
 
68
86
 
69
87
  def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
88
+ """Logs a warning message listing all nodes that will be skipped during execution.
89
+
90
+ Args:
91
+ flow_logger: The logger instance for the flow.
92
+ nodes: A list of FlowNode objects to be skipped.
93
+ """
70
94
  if len(nodes) > 0:
71
95
  msg = "\n".join(str(node) for node in nodes)
72
96
  flow_logger.warning(f'skipping nodes:\n{msg}')
73
97
 
74
98
 
75
99
  def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
100
+ """Logs an informational message showing the determined execution order of nodes.
101
+
102
+ Args:
103
+ flow_logger: The logger instance for the flow.
104
+ nodes: A list of FlowNode objects in the order they will be executed.
105
+ """
76
106
  msg = "\n".join(str(node) for node in nodes)
77
107
  flow_logger.info(f'execution order:\n{msg}')
78
108
 
79
109
 
80
110
  def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
81
111
  end_row: int, end_column: int, has_headers: bool):
112
+ """Creates a partially applied function for lazy calculation of an XLSX schema.
113
+
114
+ Args:
115
+ engine: The engine to use for reading.
116
+ file_path: The path to the XLSX file.
117
+ sheet_name: The name of the sheet.
118
+ start_row: The starting row.
119
+ start_column: The starting column.
120
+ end_row: The ending row.
121
+ end_column: The ending column.
122
+ has_headers: A boolean indicating if the file has headers.
123
+
124
+ Returns:
125
+ A callable function that, when called, will execute `get_xlsx_schema`.
126
+ """
82
127
  return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
83
128
  start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
84
129
 
85
130
 
86
- def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
131
+ def get_cloud_connection_settings(connection_name: str,
132
+ user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
133
+ """Retrieves cloud storage connection settings, falling back to environment variables if needed.
134
+
135
+ Args:
136
+ connection_name: The name of the saved connection.
137
+ user_id: The ID of the user owning the connection.
138
+ auth_mode: The authentication method specified by the user.
139
+
140
+ Returns:
141
+ A FullCloudStorageConnection object with the connection details.
142
+
143
+ Raises:
144
+ HTTPException: If the connection settings cannot be found.
145
+ """
87
146
  cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
88
- if cloud_connection_settings is None and auth_mode == "aws-cli":
147
+ if cloud_connection_settings is None and auth_mode in ("env_vars", "auto"):
89
148
  # If the auth mode needs no stored credentials (env_vars, auto or aws-cli), we do not need connection settings
149
+ cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="env_vars")
150
+ elif cloud_connection_settings is None and auth_mode == "aws-cli":
90
151
  cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
91
152
  if cloud_connection_settings is None:
92
153
  raise HTTPException(status_code=400, detail="Cloud connection settings not found")
@@ -94,18 +155,10 @@ def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode:
94
155
 
95
156
 
96
157
  class FlowGraph:
97
- """
98
- FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
99
- on data. It allows you to create a Directed Acyclic Graph (DAG) where each
100
- node represents a step in the ETL pipeline.
158
+ """A class representing a Directed Acyclic Graph (DAG) for data processing pipelines.
101
159
 
102
- The class offers methods to add transformations and data sources, as well as
103
- methods to run the transformations and generate results.
104
-
105
- Attributes:
106
- _input_cols (set): A set that stores the input columns for the transformations.
107
- _output_cols (set): A set that stores the output columns from the transformations.
108
- """
160
+ It manages nodes, connections, and the execution of the entire flow.
161
+ """
109
162
  uuid: str
110
163
  depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
111
164
  _flow_id: int
@@ -127,13 +180,27 @@ class FlowGraph:
127
180
  flow_settings: schemas.FlowSettings = None
128
181
  flow_logger: FlowLogger
129
182
 
130
- def __init__(self, flow_id: int,
131
- flow_settings: schemas.FlowSettings,
183
+ def __init__(self,
184
+ flow_settings: schemas.FlowSettings | schemas.FlowGraphConfig,
132
185
  name: str = None, input_cols: List[str] = None,
133
186
  output_cols: List[str] = None,
134
187
  path_ref: str = None,
135
188
  input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
136
189
  cache_results: bool = False):
190
+ """Initializes a new FlowGraph instance.
191
+
192
+ Args:
193
+ flow_settings: The configuration settings for the flow.
194
+ name: The name of the flow.
195
+ input_cols: A list of input column names.
196
+ output_cols: A list of output column names.
197
+ path_ref: An optional path to an initial data source.
198
+ input_flow: An optional existing data object to start the flow with.
199
+ cache_results: A global flag to enable or disable result caching.
200
+ """
201
+ if isinstance(flow_settings, schemas.FlowGraphConfig):
202
+ flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
203
+
137
204
  self.flow_settings = flow_settings
138
205
  self.uuid = str(uuid1())
139
206
  self.nodes_completed = 0
@@ -141,8 +208,8 @@ class FlowGraph:
141
208
  self.end_datetime = None
142
209
  self.latest_run_info = None
143
210
  self.node_results = []
144
- self._flow_id = flow_id
145
- self.flow_logger = FlowLogger(flow_id)
211
+ self._flow_id = flow_settings.flow_id
212
+ self.flow_logger = FlowLogger(flow_settings.flow_id)
146
213
  self._flow_starts: List[FlowNode] = []
147
214
  self._results = None
148
215
  self.schema = None
@@ -160,7 +227,13 @@ class FlowGraph:
160
227
  self.add_datasource(input_file=input_flow)
161
228
 
162
229
  def add_node_promise(self, node_promise: input_schema.NodePromise):
230
+ """Adds a placeholder node to the graph that is not yet fully configured.
163
231
 
232
+ Useful for building the graph structure before all settings are available.
233
+
234
+ Args:
235
+ node_promise: A promise object containing basic node information.
236
+ """
164
237
  def placeholder(n: FlowNode = None):
165
238
  if n is None:
166
239
  return FlowDataEngine()
@@ -169,10 +242,75 @@ class FlowGraph:
169
242
  self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
170
243
  setting_input=node_promise)
171
244
 
172
- def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
245
+ def print_tree(self, show_schema=False, show_descriptions=False):
246
+ """
247
+ Print flow_graph as a tree.
173
248
  """
174
- Calculates and applies a layered layout to all nodes in the graph.
175
- Updates the pos_x and pos_y attributes of the node setting inputs.
249
+ max_node_id = max(self._node_db.keys())
250
+
251
+ tree = ""
252
+ tabs = 0
253
+ tab_counter = 0
254
+ for node in self.nodes:
255
+ tab_counter += 1
256
+ node_input = node.setting_input
257
+ operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
258
+
259
+ if operation == "Formula":
260
+ operation = "With Columns"
261
+
262
+ tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
263
+
264
+ if show_descriptions & show_schema:
265
+ raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
266
+ if show_descriptions:
267
+ tree += ": " + str(node_input.description)
268
+ elif show_schema:
269
+ tree += " -> ["
270
+ if operation == "Manual Input":
271
+ schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
272
+ tree += schema
273
+ elif operation == "With Columns":
274
+ tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
275
+ tree += schema + tree_with_col_schema
276
+ elif operation == "Filter":
277
+ index = node_input.filter_input.advanced_filter.find("]")
278
+ filtered_column = str(node_input.filter_input.advanced_filter[1:index])
279
+ schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
280
+ tree += schema
281
+ elif operation == "Group By":
282
+ for col in node_input.groupby_input.agg_cols:
283
+ schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
284
+ tree += schema
285
+ tree += "]"
286
+ else:
287
+ if operation == "Manual Input":
288
+ tree += ": " + str(node_input.raw_data_format.data)
289
+ elif operation == "With Columns":
290
+ tree += ": " + str(node_input.function)
291
+ elif operation == "Filter":
292
+ tree += ": " + str(node_input.filter_input.advanced_filter)
293
+ elif operation == "Group By":
294
+ tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
295
+ tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
296
+
297
+ if node_input.node_id < max_node_id:
298
+ tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
299
+ print("\n"*2)
300
+
301
+ return print(tree)
302
+
303
+
304
+
305
+ def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
306
+ """Calculates and applies a layered layout to all nodes in the graph.
307
+
308
+ This updates their x and y positions for UI rendering.
309
+
310
+ Args:
311
+ y_spacing: The vertical spacing between layers.
312
+ x_spacing: The horizontal spacing between nodes in the same layer.
313
+ initial_y: The initial y-position for the first layer.
176
314
  """
177
315
  self.flow_logger.info("Applying layered layout...")
178
316
  start_time = time()
@@ -199,7 +337,7 @@ class FlowGraph:
199
337
  else:
200
338
  self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
201
339
  elif node:
202
- self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
340
+ self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
203
341
  # else: Node not found, already warned by calculate_layered_layout
204
342
 
205
343
  end_time = time()
@@ -207,51 +345,20 @@ class FlowGraph:
207
345
 
208
346
  except Exception as e:
209
347
  self.flow_logger.error(f"Error applying layout: {e}")
210
- raise # Optional: re-raise the exception
211
-
212
- def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
213
- node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
214
- self.add_explore_data(node_analysis)
215
-
216
- def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
217
- sample_size: int = 10000
218
-
219
- def analysis_preparation(flowfile_table: FlowDataEngine):
220
- if flowfile_table.number_of_records <= 0:
221
- number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
222
- else:
223
- number_of_records = flowfile_table.number_of_records
224
- if number_of_records > sample_size:
225
- flowfile_table = flowfile_table.get_sample(sample_size, random=True)
226
- external_sampler = ExternalDfFetcher(
227
- lf=flowfile_table.data_frame,
228
- file_ref="__gf_walker"+node.hash,
229
- wait_on_completion=True,
230
- node_id=node.node_id,
231
- flow_id=self.flow_id,
232
- )
233
- node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
234
- return flowfile_table
235
-
236
- def schema_callback():
237
- node = self.get_node(node_analysis.node_id)
238
- if len(node.all_inputs) == 1:
239
- input_node = node.all_inputs[0]
240
- return input_node.schema
241
- else:
242
- return [FlowfileColumn.from_input('col_1', 'na')]
243
-
244
- self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
245
- function=analysis_preparation,
246
- setting_input=node_analysis, schema_callback=schema_callback)
247
- node = self.get_node(node_analysis.node_id)
348
+ raise # Optional: re-raise the exception
248
349
 
249
350
  @property
250
351
  def flow_id(self) -> int:
352
+ """Gets the unique identifier of the flow."""
251
353
  return self._flow_id
252
354
 
253
355
  @flow_id.setter
254
356
  def flow_id(self, new_id: int):
357
+ """Sets the unique identifier for the flow and updates all child nodes.
358
+
359
+ Args:
360
+ new_id: The new flow ID.
361
+ """
255
362
  self._flow_id = new_id
256
363
  for node in self.nodes:
257
364
  if hasattr(node.setting_input, 'flow_id'):
@@ -259,23 +366,35 @@ class FlowGraph:
259
366
  self.flow_settings.flow_id = new_id
260
367
 
261
368
  def __repr__(self):
262
- """
263
- Official string representation of the FlowGraph class.
264
- """
369
+ """Provides the official string representation of the FlowGraph instance."""
265
370
  settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
266
371
  return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
267
372
 
268
373
  def get_nodes_overview(self):
374
+ """Gets a list of dictionary representations for all nodes in the graph."""
269
375
  output = []
270
376
  for v in self._node_db.values():
271
377
  output.append(v.get_repr())
272
378
  return output
273
379
 
274
380
  def remove_from_output_cols(self, columns: List[str]):
381
+ """Removes specified columns from the list of expected output columns.
382
+
383
+ Args:
384
+ columns: A list of column names to remove.
385
+ """
275
386
  cols = set(columns)
276
387
  self._output_cols = [c for c in self._output_cols if c not in cols]
277
388
 
278
- def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
389
+ def get_node(self, node_id: Union[int, str] = None) -> FlowNode | None:
390
+ """Retrieves a node from the graph by its ID.
391
+
392
+ Args:
393
+ node_id: The ID of the node to retrieve. If None, retrieves the last added node.
394
+
395
+ Returns:
396
+ The FlowNode object, or None if not found.
397
+ """
279
398
  if node_id is None:
280
399
  node_id = self._node_ids[-1]
281
400
  node = self._node_db.get(node_id)
@@ -283,6 +402,12 @@ class FlowGraph:
283
402
  return node
284
403
 
285
404
  def add_pivot(self, pivot_settings: input_schema.NodePivot):
405
+ """Adds a pivot node to the graph.
406
+
407
+ Args:
408
+ pivot_settings: The settings for the pivot operation.
409
+ """
410
+
286
411
  def _func(fl: FlowDataEngine):
287
412
  return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
288
413
 
@@ -302,6 +427,11 @@ class FlowGraph:
302
427
  node.schema_callback = schema_callback
303
428
 
304
429
  def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
430
+ """Adds an unpivot node to the graph.
431
+
432
+ Args:
433
+ unpivot_settings: The settings for the unpivot operation.
434
+ """
305
435
 
306
436
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
307
437
  return fl.unpivot(unpivot_settings.unpivot_input)
@@ -313,6 +443,12 @@ class FlowGraph:
313
443
  input_node_ids=[unpivot_settings.depending_on_id])
314
444
 
315
445
  def add_union(self, union_settings: input_schema.NodeUnion):
446
+ """Adds a union node to combine multiple data streams.
447
+
448
+ Args:
449
+ union_settings: The settings for the union operation.
450
+ """
451
+
316
452
  def _func(*flowfile_tables: FlowDataEngine):
317
453
  dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
318
454
  return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
@@ -323,7 +459,59 @@ class FlowGraph:
323
459
  setting_input=union_settings,
324
460
  input_node_ids=union_settings.depending_on_ids)
325
461
 
462
+ def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
463
+ """Adds a data exploration/analysis node based on a node promise.
464
+
465
+ Args:
466
+ node_promise: The promise representing the node to be analyzed.
467
+ """
468
+ node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
469
+ self.add_explore_data(node_analysis)
470
+
471
+ def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
472
+ """Adds a specialized node for data exploration and visualization.
473
+
474
+ Args:
475
+ node_analysis: The settings for the data exploration node.
476
+ """
477
+ sample_size: int = 10000
478
+
479
+ def analysis_preparation(flowfile_table: FlowDataEngine):
480
+ if flowfile_table.number_of_records <= 0:
481
+ number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
482
+ else:
483
+ number_of_records = flowfile_table.number_of_records
484
+ if number_of_records > sample_size:
485
+ flowfile_table = flowfile_table.get_sample(sample_size, random=True)
486
+ external_sampler = ExternalDfFetcher(
487
+ lf=flowfile_table.data_frame,
488
+ file_ref="__gf_walker"+node.hash,
489
+ wait_on_completion=True,
490
+ node_id=node.node_id,
491
+ flow_id=self.flow_id,
492
+ )
493
+ node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
494
+ return flowfile_table
495
+
496
+ def schema_callback():
497
+ node = self.get_node(node_analysis.node_id)
498
+ if len(node.all_inputs) == 1:
499
+ input_node = node.all_inputs[0]
500
+ return input_node.schema
501
+ else:
502
+ return [FlowfileColumn.from_input('col_1', 'na')]
503
+
504
+ self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
505
+ function=analysis_preparation,
506
+ setting_input=node_analysis, schema_callback=schema_callback)
507
+ node = self.get_node(node_analysis.node_id)
508
+
326
509
  def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
510
+ """Adds a group-by aggregation node to the graph.
511
+
512
+ Args:
513
+ group_by_settings: The settings for the group-by operation.
514
+ """
327
515
 
328
516
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
329
517
  return fl.do_group_by(group_by_settings.groupby_input, False)
@@ -337,6 +525,7 @@ class FlowGraph:
337
525
  node = self.get_node(group_by_settings.node_id)
338
526
 
339
527
  def schema_callback():
528
+
340
529
  output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
341
530
  depends_on = node.node_inputs.main_inputs[0]
342
531
  input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
@@ -348,22 +537,13 @@ class FlowGraph:
348
537
 
349
538
  node.schema_callback = schema_callback
350
539
 
351
- def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
352
- col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
353
- schema = depends_on.schema
354
- col_exist = depends_on.get_flow_file_column_schema(col_name)
355
- if col_exist is None:
356
- new_schema = schema + [col_output]
357
- else:
358
- new_schema = []
359
- for s in self.schema:
360
- if s.name == col_name:
361
- new_schema.append(col_output)
362
- else:
363
- new_schema.append(s)
364
- return new_schema
365
-
366
540
  def add_filter(self, filter_settings: input_schema.NodeFilter):
541
+ """Adds a filter node to the graph.
542
+
543
+ Args:
544
+ filter_settings: The settings for the filter operation.
545
+ """
546
+
367
547
  is_advanced = filter_settings.filter_input.filter_type == 'advanced'
368
548
  if is_advanced:
369
549
  predicate = filter_settings.filter_input.advanced_filter
@@ -397,6 +577,12 @@ class FlowGraph:
397
577
  )
398
578
 
399
579
  def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
580
+ """Adds a record count node to the graph.
581
+
582
+ Args:
583
+ node_number_of_records: The settings for the record count operation.
584
+ """
585
+
400
586
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
401
587
  return fl.get_record_count()
402
588
 
@@ -407,9 +593,14 @@ class FlowGraph:
407
593
  input_node_ids=[node_number_of_records.depending_on_id])
408
594
 
409
595
  def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
596
+ """Adds a node that executes custom Polars code.
597
+
598
+ Args:
599
+ node_polars_code: The settings for the Polars code node.
600
+ """
601
+
410
602
  def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
411
603
  return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
412
-
413
604
  self.add_node_step(node_id=node_polars_code.node_id,
414
605
  function=_func,
415
606
  node_type='polars_code',
@@ -422,7 +613,31 @@ class FlowGraph:
422
613
  node = self.get_node(node_id=node_polars_code.node_id)
423
614
  node.results.errors = str(e)
424
615
 
616
+ def add_dependency_on_polars_lazy_frame(self,
617
+ lazy_frame: pl.LazyFrame,
618
+ node_id: int):
619
+ """Adds a special node that directly injects a Polars LazyFrame into the graph.
620
+
621
+ Note: This is intended for backend use and will not work in the UI editor.
622
+
623
+ Args:
624
+ lazy_frame: The Polars LazyFrame to inject.
625
+ node_id: The ID for the new node.
626
+ """
627
+ def _func():
628
+ return FlowDataEngine(lazy_frame)
629
+ node_promise = input_schema.NodePromise(flow_id=self.flow_id,
630
+ node_id=node_id, node_type="polars_lazy_frame",
631
+ is_setup=True)
632
+ self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=_func,
633
+ setting_input=node_promise)
634
+
425
635
  def add_unique(self, unique_settings: input_schema.NodeUnique):
636
+ """Adds a node to find and remove duplicate rows.
637
+
638
+ Args:
639
+ unique_settings: The settings for the unique operation.
640
+ """
426
641
 
427
642
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
428
643
  return fl.make_unique(unique_settings.unique_input)
@@ -435,6 +650,16 @@ class FlowGraph:
435
650
  input_node_ids=[unique_settings.depending_on_id])
436
651
 
437
652
  def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
653
+ """Adds a node that solves graph-like problems within the data.
654
+
655
+ This node can be used for operations like finding network paths,
656
+ calculating connected components, or performing other graph algorithms
657
+ on relational data that represents nodes and edges.
658
+
659
+ Args:
660
+ graph_solver_settings: The settings object defining the graph inputs
661
+ and the specific algorithm to apply.
662
+ """
438
663
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
439
664
  return fl.solve_graph(graph_solver_settings.graph_solver_input)
440
665
 
@@ -445,6 +670,12 @@ class FlowGraph:
445
670
  input_node_ids=[graph_solver_settings.depending_on_id])
446
671
 
447
672
  def add_formula(self, function_settings: input_schema.NodeFormula):
673
+ """Adds a node that applies a formula to create or modify a column.
674
+
675
+ Args:
676
+ function_settings: The settings for the formula operation.
677
+ """
678
+
448
679
  error = ""
449
680
  if function_settings.function.field.data_type not in (None, "Auto"):
450
681
  output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
@@ -476,6 +707,14 @@ class FlowGraph:
476
707
  return True, ""
477
708
 
478
709
  def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
710
+ """Adds a cross join node to the graph.
711
+
712
+ Args:
713
+ cross_join_settings: The settings for the cross join operation.
714
+
715
+ Returns:
716
+ The `FlowGraph` instance for method chaining.
717
+ """
479
718
 
480
719
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
481
720
  for left_select in cross_join_settings.cross_join_input.left_select.renames:
@@ -497,6 +736,15 @@ class FlowGraph:
497
736
  return self
498
737
 
499
738
  def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
739
+ """Adds a join node to combine two data streams based on key columns.
740
+
741
+ Args:
742
+ join_settings: The settings for the join operation.
743
+
744
+ Returns:
745
+ The `FlowGraph` instance for method chaining.
746
+ """
747
+
500
748
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
501
749
  for left_select in join_settings.join_input.left_select.renames:
502
750
  left_select.is_available = True if left_select.old_name in main.schema else False
@@ -517,6 +765,15 @@ class FlowGraph:
517
765
  return self
518
766
 
519
767
  def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
768
+ """Adds a fuzzy matching node to join data on approximate string matches.
769
+
770
+ Args:
771
+ fuzzy_settings: The settings for the fuzzy match operation.
772
+
773
+ Returns:
774
+ The `FlowGraph` instance for method chaining.
775
+ """
776
+
520
777
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
521
778
  f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
522
779
  flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
@@ -541,6 +798,18 @@ class FlowGraph:
541
798
  return self
542
799
 
543
800
  def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
801
+ """Adds a node that splits cell values into multiple rows.
802
+
803
+ This is useful for un-nesting data where a single field contains multiple
804
+ values separated by a delimiter.
805
+
806
+ Args:
807
+ node_text_to_rows: The settings object that specifies the column to split
808
+ and the delimiter to use.
809
+
810
+ Returns:
811
+ The `FlowGraph` instance for method chaining.
812
+ """
544
813
  def _func(table: FlowDataEngine) -> FlowDataEngine:
545
814
  return table.split(node_text_to_rows.text_to_rows_input)
546
815
 
@@ -552,6 +821,15 @@ class FlowGraph:
552
821
  return self
553
822
 
554
823
  def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
824
+ """Adds a node to sort the data based on one or more columns.
825
+
826
+ Args:
827
+ sort_settings: The settings for the sort operation.
828
+
829
+ Returns:
830
+ The `FlowGraph` instance for method chaining.
831
+ """
832
+
555
833
  def _func(table: FlowDataEngine) -> FlowDataEngine:
556
834
  return table.do_sort(sort_settings.sort_input)
557
835
 
@@ -563,6 +841,14 @@ class FlowGraph:
563
841
  return self
564
842
 
565
843
  def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
844
+ """Adds a node to take a random or top-N sample of the data.
845
+
846
+ Args:
847
+ sample_settings: The settings object specifying the size of the sample.
848
+
849
+ Returns:
850
+ The `FlowGraph` instance for method chaining.
851
+ """
566
852
  def _func(table: FlowDataEngine) -> FlowDataEngine:
567
853
  return table.get_sample(sample_settings.sample_size)
568
854
 
@@ -575,6 +861,15 @@ class FlowGraph:
575
861
  return self
576
862
 
577
863
  def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
864
+ """Adds a node to create a new column with a unique ID for each record.
865
+
866
+ Args:
867
+ record_id_settings: The settings object specifying the name of the
868
+ new record ID column.
869
+
870
+ Returns:
871
+ The `FlowGraph` instance for method chaining.
872
+ """
578
873
 
579
874
  def _func(table: FlowDataEngine) -> FlowDataEngine:
580
875
  return table.add_record_id(record_id_settings.record_id_input)
@@ -588,6 +883,15 @@ class FlowGraph:
588
883
  return self
589
884
 
590
885
  def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
886
+ """Adds a node to select, rename, reorder, or drop columns.
887
+
888
+ Args:
889
+ select_settings: The settings for the select operation.
890
+
891
+ Returns:
892
+ The `FlowGraph` instance for method chaining.
893
+ """
894
+
591
895
  select_cols = select_settings.select_input
592
896
  drop_cols = tuple(s.old_name for s in select_settings.select_input)
593
897
 
@@ -621,9 +925,18 @@ class FlowGraph:
621
925
 
622
926
  @property
623
927
  def graph_has_functions(self) -> bool:
928
+ """Checks if the graph has any nodes."""
624
929
  return len(self._node_ids) > 0
625
930
 
626
931
  def delete_node(self, node_id: Union[int, str]):
932
+ """Deletes a node from the graph and updates all its connections.
933
+
934
+ Args:
935
+ node_id: The ID of the node to delete.
936
+
937
+ Raises:
938
+ Exception: If the node with the given ID does not exist.
939
+ """
627
940
  logger.info(f"Starting deletion of node with ID: {node_id}")
628
941
 
629
942
  node = self._node_db.get(node_id)
@@ -656,6 +969,7 @@ class FlowGraph:
656
969
 
657
970
  @property
658
971
  def graph_has_input_data(self) -> bool:
972
+ """Checks if the graph has an initial input data source."""
659
973
  return self._input_data is not None
660
974
 
661
975
  def add_node_step(self,
@@ -670,6 +984,24 @@ class FlowGraph:
670
984
  cache_results: bool = None,
671
985
  schema_callback: Callable = None,
672
986
  input_node_ids: List[int] = None) -> FlowNode:
987
+ """The core method for adding or updating a node in the graph.
988
+
989
+ Args:
990
+ node_id: The unique ID for the node.
991
+ function: The core processing function for the node.
992
+ input_columns: A list of input column names required by the function.
993
+ output_schema: A predefined schema for the node's output.
994
+ node_type: A string identifying the type of node (e.g., 'filter', 'join').
995
+ drop_columns: A list of columns to be dropped after the function executes.
996
+ renew_schema: If True, the schema is recalculated after execution.
997
+ setting_input: A configuration object containing settings for the node.
998
+ cache_results: If True, the node's results are cached for future runs.
999
+ schema_callback: A function that dynamically calculates the output schema.
1000
+ input_node_ids: A list of IDs for the nodes that this node depends on.
1001
+
1002
+ Returns:
1003
+ The created or updated FlowNode object.
1004
+ """
673
1005
  existing_node = self.get_node(node_id)
674
1006
  if existing_node is not None:
675
1007
  if existing_node.node_type != node_type:
@@ -686,9 +1018,8 @@ class FlowGraph:
686
1018
  if (
687
1019
  input_nodes is not None or
688
1020
  function.__name__ in ('placeholder', 'analysis_preparation') or
689
- node_type == "cloud_storage_reader"
1021
+ node_type in ("cloud_storage_reader", "polars_lazy_frame", "input_data")
690
1022
  ):
691
-
692
1023
  if not existing_node:
693
1024
  node = FlowNode(node_id=node_id,
694
1025
  function=function,
@@ -709,8 +1040,6 @@ class FlowGraph:
709
1040
  setting_input=setting_input,
710
1041
  schema_callback=schema_callback)
711
1042
  node = existing_node
712
- elif node_type == 'input_data':
713
- node = None
714
1043
  else:
715
1044
  raise Exception("No data initialized")
716
1045
  self._node_db[node_id] = node
@@ -718,6 +1047,11 @@ class FlowGraph:
718
1047
  return node
719
1048
 
720
1049
  def add_include_cols(self, include_columns: List[str]):
1050
+ """Adds columns to both the input and output column lists.
1051
+
1052
+ Args:
1053
+ include_columns: A list of column names to include.
1054
+ """
721
1055
  for column in include_columns:
722
1056
  if column not in self._input_cols:
723
1057
  self._input_cols.append(column)
@@ -726,6 +1060,12 @@ class FlowGraph:
726
1060
  return self
727
1061
 
728
1062
  def add_output(self, output_file: input_schema.NodeOutput):
1063
+ """Adds an output node to write the final data to a destination.
1064
+
1065
+ Args:
1066
+ output_file: The settings for the output file.
1067
+ """
1068
+
729
1069
  def _func(df: FlowDataEngine):
730
1070
  output_file.output_settings.populate_abs_file_path()
731
1071
  execute_remote = self.execution_location != 'local'
@@ -747,7 +1087,12 @@ class FlowGraph:
747
1087
  input_node_ids=[input_node_id])
748
1088
 
749
1089
  def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
750
- logger.info("Adding database reader")
1090
+ """Adds a node to write data to a database.
1091
+
1092
+ Args:
1093
+ node_database_writer: The settings for the database writer node.
1094
+ """
1095
+
751
1096
  node_type = 'database_writer'
752
1097
  database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
753
1098
  database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
@@ -795,6 +1140,12 @@ class FlowGraph:
795
1140
  node = self.get_node(node_database_writer.node_id)
796
1141
 
797
1142
  def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
1143
+ """Adds a node to read data from a database.
1144
+
1145
+ Args:
1146
+ node_database_reader: The settings for the database reader node.
1147
+ """
1148
+
798
1149
  logger.info("Adding database reader")
799
1150
  node_type = 'database_reader'
800
1151
  database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
@@ -868,15 +1219,27 @@ class FlowGraph:
868
1219
  self._node_ids.append(node_database_reader.node_id)
869
1220
 
870
1221
  def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
1222
+ """Adds a node that reads data from a SQL source.
1223
+
1224
+ This is a convenience alias for `add_external_source`.
1225
+
1226
+ Args:
1227
+ external_source_input: The settings for the external SQL source node.
1228
+ """
871
1229
  logger.info('Adding sql source')
872
1230
  self.add_external_source(external_source_input)
873
1231
 
874
1232
  def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
1233
+ """Adds a node to write data to a cloud storage provider.
875
1234
 
876
- node_type = "cloud_storage_writer"
1235
+ Args:
1236
+ node_cloud_storage_writer: The settings for the cloud storage writer node.
1237
+ """
877
1238
 
1239
+ node_type = "cloud_storage_writer"
878
1240
  def _func(df: FlowDataEngine):
879
1241
  df.lazy = True
1242
+ execute_remote = self.execution_location != 'local'
880
1243
  cloud_connection_settings = get_cloud_connection_settings(
881
1244
  connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
882
1245
  user_id=node_cloud_storage_writer.user_id,
@@ -888,15 +1251,22 @@ class FlowGraph:
888
1251
  aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
889
1252
  **CloudStorageReader.get_storage_options(cloud_connection_settings)
890
1253
  )
891
- settings = get_cloud_storage_write_settings_worker_interface(
892
- write_settings=node_cloud_storage_writer.cloud_storage_settings,
893
- connection=full_cloud_storage_connection,
894
- lf=df.data_frame,
895
- flowfile_node_id=node_cloud_storage_writer.node_id,
896
- flowfile_flow_id=self.flow_id)
897
- external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
898
- node._fetch_cached_df = external_database_writer
899
- external_database_writer.get_result()
1254
+ if execute_remote:
1255
+ settings = get_cloud_storage_write_settings_worker_interface(
1256
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1257
+ connection=full_cloud_storage_connection,
1258
+ lf=df.data_frame,
1259
+ flowfile_node_id=node_cloud_storage_writer.node_id,
1260
+ flowfile_flow_id=self.flow_id)
1261
+ external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
1262
+ node._fetch_cached_df = external_database_writer
1263
+ external_database_writer.get_result()
1264
+ else:
1265
+ cloud_storage_write_settings_internal = CloudStorageWriteSettingsInternal(
1266
+ connection=full_cloud_storage_connection,
1267
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1268
+ )
1269
+ df.to_cloud_storage_obj(cloud_storage_write_settings_internal)
900
1270
  return df
901
1271
 
902
1272
  def schema_callback():
@@ -919,12 +1289,10 @@ class FlowGraph:
919
1289
  node = self.get_node(node_cloud_storage_writer.node_id)
920
1290
 
921
1291
  def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
922
- """
923
- Adds a cloud storage read node to the flow graph.
1292
+ """Adds a cloud storage read node to the flow graph.
1293
+
924
1294
  Args:
925
- node_cloud_storage_reader (input_schema.NodeCloudStorageReader):
926
- The settings for the cloud storage read node.
927
- Returns:
1295
+ node_cloud_storage_reader: The settings for the cloud storage read node.
928
1296
  """
929
1297
  node_type = "cloud_storage_reader"
930
1298
  logger.info("Adding cloud storage reader")
@@ -953,6 +1321,11 @@ class FlowGraph:
953
1321
 
954
1322
  def add_external_source(self,
955
1323
  external_source_input: input_schema.NodeExternalSource):
1324
+ """Adds a node for a custom external data source.
1325
+
1326
+ Args:
1327
+ external_source_input: The settings for the external source node.
1328
+ """
956
1329
 
957
1330
  node_type = 'external_source'
958
1331
  external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
@@ -1009,6 +1382,12 @@ class FlowGraph:
1009
1382
  setting_input=external_source_input)
1010
1383
 
1011
1384
  def add_read(self, input_file: input_schema.NodeRead):
1385
+ """Adds a node to read data from a local file (e.g., CSV, Parquet, Excel).
1386
+
1387
+ Args:
1388
+ input_file: The settings for the read operation.
1389
+ """
1390
+
1012
1391
  if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
1013
1392
  sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
1014
1393
  input_file.received_file.sheet_name = sheet_name
@@ -1077,7 +1456,18 @@ class FlowGraph:
1077
1456
  node.schema_callback = schema_callback
1078
1457
  return self
1079
1458
 
1080
- def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
1459
+ def add_datasource(self, input_file: Union[input_schema.NodeDatasource, input_schema.NodeManualInput]) -> "FlowGraph":
1460
+ """Adds a data source node to the graph.
1461
+
1462
+ This method serves as a factory for creating starting nodes, handling both
1463
+ file-based sources and direct manual data entry.
1464
+
1465
+ Args:
1466
+ input_file: The configuration object for the data source.
1467
+
1468
+ Returns:
1469
+ The `FlowGraph` instance for method chaining.
1470
+ """
1081
1471
  if isinstance(input_file, input_schema.NodeManualInput):
1082
1472
  input_data = FlowDataEngine(input_file.raw_data_format)
1083
1473
  ref = 'manual_input'
@@ -1103,29 +1493,35 @@ class FlowGraph:
1103
1493
  return self
1104
1494
 
1105
1495
  def add_manual_input(self, input_file: input_schema.NodeManualInput):
1496
+ """Adds a node for manual data entry.
1497
+
1498
+ This is a convenience alias for `add_datasource`.
1499
+
1500
+ Args:
1501
+ input_file: The settings and data for the manual input node.
1502
+ """
1106
1503
  self.add_datasource(input_file)
1107
1504
 
1108
1505
  @property
1109
1506
  def nodes(self) -> List[FlowNode]:
1110
- return list(self._node_db.values())
1507
+ """Gets a list of all FlowNode objects in the graph."""
1111
1508
 
1112
- def check_for_missed_cols(self, expected_cols: List):
1113
- not_filled_cols = set(expected_cols) - set(self._output_cols)
1114
- cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
1115
- self._output_cols += cols_available
1116
-
1117
- @property
1118
- def input_data_columns(self) -> List[str] | None:
1119
- if self._input_cols:
1120
- return list(set([col for col in self._input_cols if
1121
- col in [table_col.name for table_col in self._input_data.schema]]))
1509
+ return list(self._node_db.values())
1122
1510
 
1123
1511
  @property
1124
- def execution_mode(self) -> str:
1512
+ def execution_mode(self) -> schemas.ExecutionModeLiteral:
1513
+ """Gets the current execution mode ('Development' or 'Performance')."""
1125
1514
  return self.flow_settings.execution_mode
1126
1515
 
1127
1516
  def get_implicit_starter_nodes(self) -> List[FlowNode]:
1128
- """Ensures that nodes that can be a start (e.g. polars code), will be a starting node"""
1517
+ """Finds nodes that can act as starting points but are not explicitly defined as such.
1518
+
1519
+ Some nodes, like the Polars Code node, can function without an input. This
1520
+ method identifies such nodes if they have no incoming connections.
1521
+
1522
+ Returns:
1523
+ A list of `FlowNode` objects that are implicit starting nodes.
1524
+ """
1129
1525
  starting_node_ids = [node.node_id for node in self._flow_starts]
1130
1526
  implicit_starting_nodes = []
1131
1527
  for node in self.nodes:
@@ -1135,17 +1531,39 @@ class FlowGraph:
1135
1531
 
1136
1532
  @execution_mode.setter
1137
1533
  def execution_mode(self, mode: schemas.ExecutionModeLiteral):
1534
+ """Sets the execution mode for the flow.
1535
+
1536
+ Args:
1537
+ mode: The execution mode to set.
1538
+ """
1138
1539
  self.flow_settings.execution_mode = mode
1139
1540
 
1140
1541
  @property
1141
1542
  def execution_location(self) -> schemas.ExecutionLocationsLiteral:
1543
+ """Gets the current execution location."""
1142
1544
  return self.flow_settings.execution_location
1143
1545
 
1144
1546
  @execution_location.setter
1145
1547
  def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
1548
+ """Sets the execution location for the flow.
1549
+
1550
+ Args:
1551
+ execution_location: The execution location to set.
1552
+ """
1146
1553
  self.flow_settings.execution_location = execution_location
1147
1554
 
1148
- def run_graph(self):
1555
+ def run_graph(self) -> RunInformation | None:
1556
+ """Executes the entire data flow graph from start to finish.
1557
+
1558
+ It determines the correct execution order, runs each node,
1559
+ collects results, and handles errors and cancellations.
1560
+
1561
+ Returns:
1562
+ A RunInformation object summarizing the execution results.
1563
+
1564
+ Raises:
1565
+ Exception: If the flow is already running.
1566
+ """
1149
1567
  if self.flow_settings.is_running:
1150
1568
  raise Exception('Flow is already running')
1151
1569
  try:
@@ -1167,6 +1585,8 @@ class FlowGraph:
1167
1585
  skip_node_message(self.flow_logger, skip_nodes)
1168
1586
  execution_order_message(self.flow_logger, execution_order)
1169
1587
  performance_mode = self.flow_settings.execution_mode == 'Performance'
1588
+ if self.flow_settings.execution_location == 'local':
1589
+ OFFLOAD_TO_WORKER.value = False
1170
1590
  for node in execution_order:
1171
1591
  node_logger = self.flow_logger.get_node_logger(node.node_id)
1172
1592
  if self.flow_settings.is_canceled:
@@ -1215,6 +1635,11 @@ class FlowGraph:
1215
1635
  self.flow_settings.is_running = False
1216
1636
 
1217
1637
  def get_run_info(self) -> RunInformation:
1638
+ """Gets a summary of the most recent graph execution.
1639
+
1640
+ Returns:
1641
+ A RunInformation object with details about the last run.
1642
+ """
1218
1643
  if self.latest_run_info is None:
1219
1644
  node_results = self.node_results
1220
1645
  success = all(nr.success for nr in node_results)
@@ -1234,6 +1659,11 @@ class FlowGraph:
1234
1659
 
1235
1660
  @property
1236
1661
  def node_connections(self) -> List[Tuple[int, int]]:
1662
+ """Computes and returns a list of all connections in the graph.
1663
+
1664
+ Returns:
1665
+ A list of tuples, where each tuple is a (source_id, target_id) pair.
1666
+ """
1237
1667
  connections = set()
1238
1668
  for node in self.nodes:
1239
1669
  outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
@@ -1245,28 +1675,30 @@ class FlowGraph:
1245
1675
  connections.add(node_connection)
1246
1676
  return list(connections)
1247
1677
 
1248
- def get_schema(self) -> List[FlowfileColumn]:
1249
- if self.schema is None:
1250
- if len(self._node_ids) > 0:
1251
- self.schema = self._node_db[self._node_ids[0]].schema
1252
- return self.schema
1678
+ def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1679
+ """Retrieves all data needed to render a node in the UI.
1253
1680
 
1254
- def get_example_data(self, node_id: int) -> TableExample | None:
1255
- node = self._node_db[node_id]
1256
- return node.get_table_example(include_data=True)
1681
+ Args:
1682
+ node_id: The ID of the node.
1683
+ include_example: Whether to include data samples in the result.
1257
1684
 
1258
- def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1685
+ Returns:
1686
+ A NodeData object, or None if the node is not found.
1687
+ """
1259
1688
  node = self._node_db[node_id]
1260
1689
  return node.get_node_data(flow_id=self.flow_id, include_example=include_example)
1261
1690
 
1262
1691
  def get_node_storage(self) -> schemas.FlowInformation:
1692
+ """Serializes the entire graph's state into a storable format.
1263
1693
 
1694
+ Returns:
1695
+ A FlowInformation object representing the complete graph.
1696
+ """
1264
1697
  node_information = {node.node_id: node.get_node_information() for
1265
1698
  node in self.nodes if node.is_setup and node.is_correct}
1266
1699
 
1267
1700
  return schemas.FlowInformation(flow_id=self.flow_id,
1268
1701
  flow_name=self.__name__,
1269
- storage_location=self.flow_settings.path,
1270
1702
  flow_settings=self.flow_settings,
1271
1703
  data=node_information,
1272
1704
  node_starts=[v.node_id for v in self._flow_starts],
@@ -1274,6 +1706,8 @@ class FlowGraph:
1274
1706
  )
1275
1707
 
1276
1708
  def cancel(self):
1709
+ """Cancels an ongoing graph execution."""
1710
+
1277
1711
  if not self.flow_settings.is_running:
1278
1712
  return
1279
1713
  self.flow_settings.is_canceled = True
@@ -1281,15 +1715,30 @@ class FlowGraph:
1281
1715
  node.cancel()
1282
1716
 
1283
1717
  def close_flow(self):
1718
+ """Performs cleanup operations, such as clearing node caches."""
1719
+
1284
1720
  for node in self.nodes:
1285
1721
  node.remove_cache()
1286
1722
 
1287
1723
  def save_flow(self, flow_path: str):
1724
+ """Saves the current state of the flow graph to a file.
1725
+
1726
+ Args:
1727
+ flow_path: The path where the flow file will be saved.
1728
+ """
1288
1729
  with open(flow_path, 'wb') as f:
1289
1730
  pickle.dump(self.get_node_storage(), f)
1290
1731
  self.flow_settings.path = flow_path
1291
1732
 
1292
- def get_frontend_data(self):
1733
+ def get_frontend_data(self) -> dict:
1734
+ """Formats the graph structure into a JSON-like dictionary for a specific legacy frontend.
1735
+
1736
+ This method transforms the graph's state into a format compatible with the
1737
+ Drawflow.js library.
1738
+
1739
+ Returns:
1740
+ A dictionary representing the graph in Drawflow format.
1741
+ """
1293
1742
  result = {
1294
1743
  'Home': {
1295
1744
  "data": {}
@@ -1360,6 +1809,11 @@ class FlowGraph:
1360
1809
  return result
1361
1810
 
1362
1811
  def get_vue_flow_input(self) -> schemas.VueFlowInput:
1812
+ """Formats the graph's nodes and edges into a schema suitable for the VueFlow frontend.
1813
+
1814
+ Returns:
1815
+ A VueFlowInput object.
1816
+ """
1363
1817
  edges: List[schemas.NodeEdge] = []
1364
1818
  nodes: List[schemas.NodeInput] = []
1365
1819
  for node in self.nodes:
@@ -1368,11 +1822,19 @@ class FlowGraph:
1368
1822
  return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)
1369
1823
 
1370
1824
  def reset(self):
1825
+ """Forces a deep reset on all nodes in the graph."""
1826
+
1371
1827
  for node in self.nodes:
1372
1828
  node.reset(True)
1373
1829
 
1374
1830
  def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
1375
- """Copy an existing node with potentially new settings."""
1831
+ """Creates a copy of an existing node.
1832
+
1833
+ Args:
1834
+ new_node_settings: The promise containing new settings (like ID and position).
1835
+ existing_setting_input: The settings object from the node being copied.
1836
+ node_type: The type of the node being copied.
1837
+ """
1376
1838
  self.add_node_promise(new_node_settings)
1377
1839
 
1378
1840
  if isinstance(existing_setting_input, input_schema.NodePromise):
@@ -1383,69 +1845,26 @@ class FlowGraph:
1383
1845
  )
1384
1846
  getattr(self, f"add_{node_type}")(combined_settings)
1385
1847
 
1848
+ def generate_code(self):
1849
+ """Generates code for the flow graph.
1850
+ This method exports the flow graph to a Polars-compatible format.
1851
+ """
1852
+ from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars
1853
+ print(export_flow_to_polars(self))
1386
1854
 
1387
- def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
1388
- """
1389
- Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
1855
+
1856
+ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1857
+ """Merges settings from an existing object with new settings from a NodePromise.
1858
+
1859
+ Typically used when copying a node to apply a new ID and position.
1390
1860
 
1391
1861
  Args:
1392
- *flow_graphs: Multiple FlowGraph instances to combine
1862
+ setting_input: The original settings object.
1863
+ new_settings: The NodePromise with new positional and ID data.
1393
1864
 
1394
1865
  Returns:
1395
- A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
1396
-
1397
- Raises:
1398
- ValueError: If any flow_ids overlap
1866
+ A new settings object with the merged properties.
1399
1867
  """
1400
- # Validate flow IDs are unique
1401
- _validate_unique_flow_ids(flow_graphs)
1402
-
1403
- # Create ID mapping for all nodes
1404
- node_id_mapping = _create_node_id_mapping(flow_graphs)
1405
-
1406
- # Remap and combine nodes
1407
- all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
1408
-
1409
- # Create a new combined flow graph
1410
- combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
1411
- # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
1412
-
1413
-
1414
- def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
1415
- """Ensure all flow graphs have unique flow_ids."""
1416
- all_flow_ids = [fg.flow_id for fg in flow_graphs]
1417
- if len(all_flow_ids) != len(set(all_flow_ids)):
1418
- raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
1419
-
1420
-
1421
- def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
1422
- """Create a mapping from original node IDs to new unique node IDs."""
1423
- node_id_mapping: Dict[int, Dict[int, int]] = {}
1424
- next_node_id = 0
1425
-
1426
- for fg in flow_graphs:
1427
- node_id_mapping[fg.flow_id] = {}
1428
- for node in fg.nodes:
1429
- node_id_mapping[fg.flow_id][node.node_id] = next_node_id
1430
- next_node_id += 1
1431
-
1432
- return node_id_mapping
1433
-
1434
-
1435
- def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
1436
- node_id_mapping: Dict[int, Dict[int, int]]) -> List:
1437
- """Create new nodes with remapped IDs."""
1438
- all_nodes = []
1439
- for fg in flow_graphs:
1440
- for node in fg.nodes:
1441
- new_node = copy.deepcopy(node)
1442
- new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
1443
- all_nodes.append(new_node)
1444
- return all_nodes
1445
-
1446
-
1447
- def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1448
- """Combine excopy_nodeisting settings with new settings from a NodePromise."""
1449
1868
  copied_setting_input = deepcopy(setting_input)
1450
1869
 
1451
1870
  # Update only attributes that exist on new_settings
@@ -1464,7 +1883,13 @@ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings:
1464
1883
  return copied_setting_input
1465
1884
 
1466
1885
 
1467
- def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
1886
+ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection) -> None:
1887
+ """Adds a connection between two nodes in the flow graph.
1888
+
1889
+ Args:
1890
+ flow: The FlowGraph instance to modify.
1891
+ node_connection: An object defining the source and target of the connection.
1892
+ """
1468
1893
  logger.info('adding a connection')
1469
1894
  from_node = flow.get_node(node_connection.output_connection.node_id)
1470
1895
  to_node = flow.get_node(node_connection.input_connection.node_id)
@@ -1476,7 +1901,12 @@ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection
1476
1901
 
1477
1902
 
1478
1903
  def delete_connection(graph, node_connection: input_schema.NodeConnection):
1479
- """Delete the connection between two nodes."""
1904
+ """Deletes a connection between two nodes in the flow graph.
1905
+
1906
+ Args:
1907
+ graph: The FlowGraph instance to modify.
1908
+ node_connection: An object defining the connection to be removed.
1909
+ """
1480
1910
  from_node = graph.get_node(node_connection.output_connection.node_id)
1481
1911
  to_node = graph.get_node(node_connection.input_connection.node_id)
1482
1912
  connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
@@ -1492,6 +1922,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
1492
1922
  to_node.delete_input_node(
1493
1923
  node_connection.output_connection.node_id,
1494
1924
  connection_type=node_connection.input_connection.connection_class,
1495
- )
1496
-
1497
-
1925
+ )