Flowfile 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +4 -2
  4. flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
  5. flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
  13. flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
  14. flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
  15. flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
  19. flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
  21. flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
  24. flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
  27. flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
  29. flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
  31. flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
  34. flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
  35. flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
  37. flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
  38. flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
  39. flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
  40. flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
  44. flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
  45. flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/METADATA +2 -2
  52. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/RECORD +100 -98
  53. flowfile_core/__init__.py +1 -0
  54. flowfile_core/auth/jwt.py +39 -0
  55. flowfile_core/configs/node_store/nodes.py +1 -0
  56. flowfile_core/configs/settings.py +6 -5
  57. flowfile_core/configs/utils.py +5 -0
  58. flowfile_core/database/connection.py +1 -3
  59. flowfile_core/flowfile/code_generator/code_generator.py +71 -0
  60. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -2
  61. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +598 -310
  62. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  63. flowfile_core/flowfile/flow_graph.py +620 -192
  64. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  65. flowfile_core/flowfile/flow_node/flow_node.py +510 -89
  66. flowfile_core/flowfile/flow_node/models.py +125 -20
  67. flowfile_core/flowfile/handler.py +2 -33
  68. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  69. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  70. flowfile_core/flowfile/utils.py +36 -5
  71. flowfile_core/main.py +32 -13
  72. flowfile_core/routes/cloud_connections.py +7 -11
  73. flowfile_core/routes/logs.py +2 -6
  74. flowfile_core/routes/public.py +1 -0
  75. flowfile_core/routes/routes.py +127 -51
  76. flowfile_core/routes/secrets.py +72 -14
  77. flowfile_core/schemas/__init__.py +8 -0
  78. flowfile_core/schemas/input_schema.py +92 -64
  79. flowfile_core/schemas/output_model.py +19 -3
  80. flowfile_core/schemas/schemas.py +144 -11
  81. flowfile_core/schemas/transform_schema.py +82 -17
  82. flowfile_core/utils/arrow_reader.py +8 -3
  83. flowfile_core/utils/validate_setup.py +0 -2
  84. flowfile_frame/__init__.py +9 -1
  85. flowfile_frame/cloud_storage/__init__.py +0 -0
  86. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  87. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  88. flowfile_frame/expr.py +42 -1
  89. flowfile_frame/expr.pyi +76 -61
  90. flowfile_frame/flow_frame.py +233 -111
  91. flowfile_frame/flow_frame.pyi +137 -91
  92. flowfile_frame/flow_frame_methods.py +150 -12
  93. flowfile_frame/group_frame.py +3 -0
  94. flowfile_frame/utils.py +25 -3
  95. test_utils/s3/data_generator.py +1 -0
  96. test_utils/s3/demo_data_generator.py +186 -0
  97. test_utils/s3/fixtures.py +6 -1
  98. flowfile_core/schemas/defaults.py +0 -9
  99. flowfile_core/schemas/models.py +0 -193
  100. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/LICENSE +0 -0
  101. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/WHEEL +0 -0
  102. {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/entry_points.txt +0 -0
@@ -2,7 +2,7 @@ import datetime
2
2
  import pickle
3
3
  import polars as pl
4
4
  import fastexcel
5
- import copy
5
+ import re
6
6
  from fastapi.exceptions import HTTPException
7
7
  from time import time
8
8
  from functools import partial
@@ -11,6 +11,7 @@ from uuid import uuid1
11
11
  from copy import deepcopy
12
12
  from pyarrow.parquet import ParquetFile
13
13
  from flowfile_core.configs import logger
14
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
14
15
  from flowfile_core.configs.flow_logger import FlowLogger
15
16
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
16
17
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
@@ -23,8 +24,10 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
23
24
  get_calamine_xlsx_data_types
24
25
  from flowfile_core.flowfile.sources import external_sources
25
26
  from flowfile_core.schemas import input_schema, schemas, transform_schema
26
- from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
27
- from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal, FullCloudStorageConnection,
27
+ from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
28
+ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
29
+ CloudStorageWriteSettingsInternal,
30
+ FullCloudStorageConnection,
28
31
  get_cloud_storage_write_settings_worker_interface, AuthMethod)
29
32
  from flowfile_core.flowfile.utils import snake_case_to_camel_case
30
33
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
@@ -45,6 +48,21 @@ from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layou
45
48
 
46
49
  def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
47
50
  end_row: int, end_column: int, has_headers: bool):
51
+ """Calculates the schema of an XLSX file by reading a sample of rows.
52
+
53
+ Args:
54
+ engine: The engine to use for reading ('openpyxl' or 'calamine').
55
+ file_path: The path to the XLSX file.
56
+ sheet_name: The name of the sheet to read.
57
+ start_row: The starting row for data reading.
58
+ start_column: The starting column for data reading.
59
+ end_row: The ending row for data reading.
60
+ end_column: The ending column for data reading.
61
+ has_headers: A boolean indicating if the file has a header row.
62
+
63
+ Returns:
64
+ A list of FlowfileColumn objects representing the schema.
65
+ """
48
66
  try:
49
67
  logger.info('Starting to calculate the schema')
50
68
  if engine == 'openpyxl':
@@ -67,26 +85,69 @@ def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int
67
85
 
68
86
 
69
87
  def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
88
+ """Logs a warning message listing all nodes that will be skipped during execution.
89
+
90
+ Args:
91
+ flow_logger: The logger instance for the flow.
92
+ nodes: A list of FlowNode objects to be skipped.
93
+ """
70
94
  if len(nodes) > 0:
71
95
  msg = "\n".join(str(node) for node in nodes)
72
96
  flow_logger.warning(f'skipping nodes:\n{msg}')
73
97
 
74
98
 
75
99
  def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
100
+ """Logs an informational message showing the determined execution order of nodes.
101
+
102
+ Args:
103
+ flow_logger: The logger instance for the flow.
104
+ nodes: A list of FlowNode objects in the order they will be executed.
105
+ """
76
106
  msg = "\n".join(str(node) for node in nodes)
77
107
  flow_logger.info(f'execution order:\n{msg}')
78
108
 
79
109
 
80
110
  def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
81
111
  end_row: int, end_column: int, has_headers: bool):
112
+ """Creates a partially applied function for lazy calculation of an XLSX schema.
113
+
114
+ Args:
115
+ engine: The engine to use for reading.
116
+ file_path: The path to the XLSX file.
117
+ sheet_name: The name of the sheet.
118
+ start_row: The starting row.
119
+ start_column: The starting column.
120
+ end_row: The ending row.
121
+ end_column: The ending column.
122
+ has_headers: A boolean indicating if the file has headers.
123
+
124
+ Returns:
125
+ A callable function that, when called, will execute `get_xlsx_schema`.
126
+ """
82
127
  return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
83
128
  start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
84
129
 
85
130
 
86
- def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
131
+ def get_cloud_connection_settings(connection_name: str,
132
+ user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
133
+ """Retrieves cloud storage connection settings, falling back to environment variables if needed.
134
+
135
+ Args:
136
+ connection_name: The name of the saved connection.
137
+ user_id: The ID of the user owning the connection.
138
+ auth_mode: The authentication method specified by the user.
139
+
140
+ Returns:
141
+ A FullCloudStorageConnection object with the connection details.
142
+
143
+ Raises:
144
+ HTTPException: If the connection settings cannot be found.
145
+ """
87
146
  cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
88
- if cloud_connection_settings is None and auth_mode == "aws-cli":
147
+ if cloud_connection_settings is None and auth_mode in ("env_vars", "auto"):
89
148
  # If the auth mode is aws-cli, we do not need connection settings
149
+ cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="env_vars")
150
+ elif cloud_connection_settings is None and auth_mode == "aws-cli":
90
151
  cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
91
152
  if cloud_connection_settings is None:
92
153
  raise HTTPException(status_code=400, detail="Cloud connection settings not found")
@@ -94,18 +155,10 @@ def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode:
94
155
 
95
156
 
96
157
  class FlowGraph:
97
- """
98
- FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
99
- on data. It allows you to create a Directed Acyclic Graph (DAG) where each
100
- node represents a step in the ETL pipeline.
101
-
102
- The class offers methods to add transformations and data sources, as well as
103
- methods to run the transformations and generate results.
158
+ """A class representing a Directed Acyclic Graph (DAG) for data processing pipelines.
104
159
 
105
- Attributes:
106
- _input_cols (set): A set that stores the input columns for the transformations.
107
- _output_cols (set): A set that stores the output columns from the transformations.
108
- """
160
+ It manages nodes, connections, and the execution of the entire flow.
161
+ """
109
162
  uuid: str
110
163
  depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
111
164
  _flow_id: int
@@ -127,13 +180,27 @@ class FlowGraph:
127
180
  flow_settings: schemas.FlowSettings = None
128
181
  flow_logger: FlowLogger
129
182
 
130
- def __init__(self, flow_id: int,
131
- flow_settings: schemas.FlowSettings,
183
+ def __init__(self,
184
+ flow_settings: schemas.FlowSettings | schemas.FlowGraphConfig,
132
185
  name: str = None, input_cols: List[str] = None,
133
186
  output_cols: List[str] = None,
134
187
  path_ref: str = None,
135
188
  input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
136
189
  cache_results: bool = False):
190
+ """Initializes a new FlowGraph instance.
191
+
192
+ Args:
193
+ flow_settings: The configuration settings for the flow.
194
+ name: The name of the flow.
195
+ input_cols: A list of input column names.
196
+ output_cols: A list of output column names.
197
+ path_ref: An optional path to an initial data source.
198
+ input_flow: An optional existing data object to start the flow with.
199
+ cache_results: A global flag to enable or disable result caching.
200
+ """
201
+ if isinstance(flow_settings, schemas.FlowGraphConfig):
202
+ flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
203
+
137
204
  self.flow_settings = flow_settings
138
205
  self.uuid = str(uuid1())
139
206
  self.nodes_completed = 0
@@ -141,8 +208,8 @@ class FlowGraph:
141
208
  self.end_datetime = None
142
209
  self.latest_run_info = None
143
210
  self.node_results = []
144
- self._flow_id = flow_id
145
- self.flow_logger = FlowLogger(flow_id)
211
+ self._flow_id = flow_settings.flow_id
212
+ self.flow_logger = FlowLogger(flow_settings.flow_id)
146
213
  self._flow_starts: List[FlowNode] = []
147
214
  self._results = None
148
215
  self.schema = None
@@ -160,7 +227,13 @@ class FlowGraph:
160
227
  self.add_datasource(input_file=input_flow)
161
228
 
162
229
  def add_node_promise(self, node_promise: input_schema.NodePromise):
230
+ """Adds a placeholder node to the graph that is not yet fully configured.
231
+
232
+ Useful for building the graph structure before all settings are available.
163
233
 
234
+ Args:
235
+ node_promise: A promise object containing basic node information.
236
+ """
164
237
  def placeholder(n: FlowNode = None):
165
238
  if n is None:
166
239
  return FlowDataEngine()
@@ -169,10 +242,73 @@ class FlowGraph:
169
242
  self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
170
243
  setting_input=node_promise)
171
244
 
172
- def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
245
+ def print_tree(self, show_schema=False, show_descriptions=False):
173
246
  """
174
- Calculates and applies a layered layout to all nodes in the graph.
175
- Updates the pos_x and pos_y attributes of the node setting inputs.
247
+ Print flow_graph as a tree.
248
+ """
249
+ max_node_id = max(self._node_db.keys())
250
+
251
+ tree = ""
252
+ tabs = 0
253
+ tab_counter = 0
254
+ for node in self.nodes:
255
+ tab_counter += 1
256
+ node_input = node.setting_input
257
+ operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
258
+
259
+ if operation == "Formula":
260
+ operation = "With Columns"
261
+
262
+ tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
263
+
264
+ if show_descriptions & show_schema:
265
+ raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
266
+ if show_descriptions:
267
+ tree += ": " + str(node_input.description)
268
+ elif show_schema:
269
+ tree += " -> ["
270
+ if operation == "Manual Input":
271
+ schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
272
+ tree += schema
273
+ elif operation == "With Columns":
274
+ tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
275
+ tree += schema + tree_with_col_schema
276
+ elif operation == "Filter":
277
+ index = node_input.filter_input.advanced_filter.find("]")
278
+ filtered_column = str(node_input.filter_input.advanced_filter[1:index])
279
+ schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
280
+ tree += schema
281
+ elif operation == "Group By":
282
+ for col in node_input.groupby_input.agg_cols:
283
+ schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
284
+ tree += schema
285
+ tree += "]"
286
+ else:
287
+ if operation == "Manual Input":
288
+ tree += ": " + str(node_input.raw_data_format.data)
289
+ elif operation == "With Columns":
290
+ tree += ": " + str(node_input.function)
291
+ elif operation == "Filter":
292
+ tree += ": " + str(node_input.filter_input.advanced_filter)
293
+ elif operation == "Group By":
294
+ tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
295
+ tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
296
+
297
+ if node_input.node_id < max_node_id:
298
+ tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
299
+ print("\n"*2)
300
+
301
+ return print(tree)
302
+
303
+ def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
304
+ """Calculates and applies a layered layout to all nodes in the graph.
305
+
306
+ This updates their x and y positions for UI rendering.
307
+
308
+ Args:
309
+ y_spacing: The vertical spacing between layers.
310
+ x_spacing: The horizontal spacing between nodes in the same layer.
311
+ initial_y: The initial y-position for the first layer.
176
312
  """
177
313
  self.flow_logger.info("Applying layered layout...")
178
314
  start_time = time()
@@ -199,7 +335,7 @@ class FlowGraph:
199
335
  else:
200
336
  self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
201
337
  elif node:
202
- self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
338
+ self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
203
339
  # else: Node not found, already warned by calculate_layered_layout
204
340
 
205
341
  end_time = time()
@@ -207,51 +343,20 @@ class FlowGraph:
207
343
 
208
344
  except Exception as e:
209
345
  self.flow_logger.error(f"Error applying layout: {e}")
210
- raise # Optional: re-raise the exception
211
-
212
- def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
213
- node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
214
- self.add_explore_data(node_analysis)
215
-
216
- def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
217
- sample_size: int = 10000
218
-
219
- def analysis_preparation(flowfile_table: FlowDataEngine):
220
- if flowfile_table.number_of_records <= 0:
221
- number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
222
- else:
223
- number_of_records = flowfile_table.number_of_records
224
- if number_of_records > sample_size:
225
- flowfile_table = flowfile_table.get_sample(sample_size, random=True)
226
- external_sampler = ExternalDfFetcher(
227
- lf=flowfile_table.data_frame,
228
- file_ref="__gf_walker"+node.hash,
229
- wait_on_completion=True,
230
- node_id=node.node_id,
231
- flow_id=self.flow_id,
232
- )
233
- node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
234
- return flowfile_table
235
-
236
- def schema_callback():
237
- node = self.get_node(node_analysis.node_id)
238
- if len(node.all_inputs) == 1:
239
- input_node = node.all_inputs[0]
240
- return input_node.schema
241
- else:
242
- return [FlowfileColumn.from_input('col_1', 'na')]
243
-
244
- self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
245
- function=analysis_preparation,
246
- setting_input=node_analysis, schema_callback=schema_callback)
247
- node = self.get_node(node_analysis.node_id)
346
+ raise # Optional: re-raise the exception
248
347
 
249
348
  @property
250
349
  def flow_id(self) -> int:
350
+ """Gets the unique identifier of the flow."""
251
351
  return self._flow_id
252
352
 
253
353
  @flow_id.setter
254
354
  def flow_id(self, new_id: int):
355
+ """Sets the unique identifier for the flow and updates all child nodes.
356
+
357
+ Args:
358
+ new_id: The new flow ID.
359
+ """
255
360
  self._flow_id = new_id
256
361
  for node in self.nodes:
257
362
  if hasattr(node.setting_input, 'flow_id'):
@@ -259,23 +364,35 @@ class FlowGraph:
259
364
  self.flow_settings.flow_id = new_id
260
365
 
261
366
  def __repr__(self):
262
- """
263
- Official string representation of the FlowGraph class.
264
- """
367
+ """Provides the official string representation of the FlowGraph instance."""
265
368
  settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
266
369
  return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
267
370
 
268
371
  def get_nodes_overview(self):
372
+ """Gets a list of dictionary representations for all nodes in the graph."""
269
373
  output = []
270
374
  for v in self._node_db.values():
271
375
  output.append(v.get_repr())
272
376
  return output
273
377
 
274
378
  def remove_from_output_cols(self, columns: List[str]):
379
+ """Removes specified columns from the list of expected output columns.
380
+
381
+ Args:
382
+ columns: A list of column names to remove.
383
+ """
275
384
  cols = set(columns)
276
385
  self._output_cols = [c for c in self._output_cols if c not in cols]
277
386
 
278
- def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
387
+ def get_node(self, node_id: Union[int, str] = None) -> FlowNode | None:
388
+ """Retrieves a node from the graph by its ID.
389
+
390
+ Args:
391
+ node_id: The ID of the node to retrieve. If None, retrieves the last added node.
392
+
393
+ Returns:
394
+ The FlowNode object, or None if not found.
395
+ """
279
396
  if node_id is None:
280
397
  node_id = self._node_ids[-1]
281
398
  node = self._node_db.get(node_id)
@@ -283,6 +400,12 @@ class FlowGraph:
283
400
  return node
284
401
 
285
402
  def add_pivot(self, pivot_settings: input_schema.NodePivot):
403
+ """Adds a pivot node to the graph.
404
+
405
+ Args:
406
+ pivot_settings: The settings for the pivot operation.
407
+ """
408
+
286
409
  def _func(fl: FlowDataEngine):
287
410
  return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
288
411
 
@@ -302,6 +425,11 @@ class FlowGraph:
302
425
  node.schema_callback = schema_callback
303
426
 
304
427
  def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
428
+ """Adds an unpivot node to the graph.
429
+
430
+ Args:
431
+ unpivot_settings: The settings for the unpivot operation.
432
+ """
305
433
 
306
434
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
307
435
  return fl.unpivot(unpivot_settings.unpivot_input)
@@ -313,6 +441,12 @@ class FlowGraph:
313
441
  input_node_ids=[unpivot_settings.depending_on_id])
314
442
 
315
443
  def add_union(self, union_settings: input_schema.NodeUnion):
444
+ """Adds a union node to combine multiple data streams.
445
+
446
+ Args:
447
+ union_settings: The settings for the union operation.
448
+ """
449
+
316
450
  def _func(*flowfile_tables: FlowDataEngine):
317
451
  dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
318
452
  return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
@@ -323,7 +457,60 @@ class FlowGraph:
323
457
  setting_input=union_settings,
324
458
  input_node_ids=union_settings.depending_on_ids)
325
459
 
460
+ def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
461
+ """Adds a data exploration/analysis node based on a node promise.
462
+
463
+ Args:
464
+ node_promise: The promise representing the node to be analyzed.
465
+ """
466
+ node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
467
+ self.add_explore_data(node_analysis)
468
+
469
+ def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
470
+ """Adds a specialized node for data exploration and visualization.
471
+
472
+ Args:
473
+ node_analysis: The settings for the data exploration node.
474
+ """
475
+ sample_size: int = 10000
476
+
477
+ def analysis_preparation(flowfile_table: FlowDataEngine):
478
+ if flowfile_table.number_of_records <= 0:
479
+ number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
480
+ else:
481
+ number_of_records = flowfile_table.number_of_records
482
+ if number_of_records > sample_size:
483
+ flowfile_table = flowfile_table.get_sample(sample_size, random=True)
484
+ external_sampler = ExternalDfFetcher(
485
+ lf=flowfile_table.data_frame,
486
+ file_ref="__gf_walker"+node.hash,
487
+ wait_on_completion=True,
488
+ node_id=node.node_id,
489
+ flow_id=self.flow_id,
490
+ )
491
+ node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref,
492
+ n=min(sample_size, number_of_records))
493
+ return flowfile_table
494
+
495
+ def schema_callback():
496
+ node = self.get_node(node_analysis.node_id)
497
+ if len(node.all_inputs) == 1:
498
+ input_node = node.all_inputs[0]
499
+ return input_node.schema
500
+ else:
501
+ return [FlowfileColumn.from_input('col_1', 'na')]
502
+
503
+ self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
504
+ function=analysis_preparation,
505
+ setting_input=node_analysis, schema_callback=schema_callback)
506
+ node = self.get_node(node_analysis.node_id)
507
+
326
508
  def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
509
+ """Adds a group-by aggregation node to the graph.
510
+
511
+ Args:
512
+ group_by_settings: The settings for the group-by operation.
513
+ """
327
514
 
328
515
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
329
516
  return fl.do_group_by(group_by_settings.groupby_input, False)
@@ -337,6 +524,7 @@ class FlowGraph:
337
524
  node = self.get_node(group_by_settings.node_id)
338
525
 
339
526
  def schema_callback():
527
+
340
528
  output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
341
529
  depends_on = node.node_inputs.main_inputs[0]
342
530
  input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
@@ -348,22 +536,13 @@ class FlowGraph:
348
536
 
349
537
  node.schema_callback = schema_callback
350
538
 
351
- def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
352
- col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
353
- schema = depends_on.schema
354
- col_exist = depends_on.get_flow_file_column_schema(col_name)
355
- if col_exist is None:
356
- new_schema = schema + [col_output]
357
- else:
358
- new_schema = []
359
- for s in self.schema:
360
- if s.name == col_name:
361
- new_schema.append(col_output)
362
- else:
363
- new_schema.append(s)
364
- return new_schema
365
-
366
539
  def add_filter(self, filter_settings: input_schema.NodeFilter):
540
+ """Adds a filter node to the graph.
541
+
542
+ Args:
543
+ filter_settings: The settings for the filter operation.
544
+ """
545
+
367
546
  is_advanced = filter_settings.filter_input.filter_type == 'advanced'
368
547
  if is_advanced:
369
548
  predicate = filter_settings.filter_input.advanced_filter
@@ -397,6 +576,12 @@ class FlowGraph:
397
576
  )
398
577
 
399
578
  def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
579
+ """Adds a record count node to the graph.
580
+
581
+ Args:
582
+ node_number_of_records: The settings for the record count operation.
583
+ """
584
+
400
585
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
401
586
  return fl.get_record_count()
402
587
 
@@ -407,9 +592,14 @@ class FlowGraph:
407
592
  input_node_ids=[node_number_of_records.depending_on_id])
408
593
 
409
594
  def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
595
+ """Adds a node that executes custom Polars code.
596
+
597
+ Args:
598
+ node_polars_code: The settings for the Polars code node.
599
+ """
600
+
410
601
  def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
411
602
  return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
412
-
413
603
  self.add_node_step(node_id=node_polars_code.node_id,
414
604
  function=_func,
415
605
  node_type='polars_code',
@@ -422,7 +612,31 @@ class FlowGraph:
422
612
  node = self.get_node(node_id=node_polars_code.node_id)
423
613
  node.results.errors = str(e)
424
614
 
615
+ def add_dependency_on_polars_lazy_frame(self,
616
+ lazy_frame: pl.LazyFrame,
617
+ node_id: int):
618
+ """Adds a special node that directly injects a Polars LazyFrame into the graph.
619
+
620
+ Note: This is intended for backend use and will not work in the UI editor.
621
+
622
+ Args:
623
+ lazy_frame: The Polars LazyFrame to inject.
624
+ node_id: The ID for the new node.
625
+ """
626
+ def _func():
627
+ return FlowDataEngine(lazy_frame)
628
+ node_promise = input_schema.NodePromise(flow_id=self.flow_id,
629
+ node_id=node_id, node_type="polars_lazy_frame",
630
+ is_setup=True)
631
+ self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=_func,
632
+ setting_input=node_promise)
633
+
425
634
  def add_unique(self, unique_settings: input_schema.NodeUnique):
635
+ """Adds a node to find and remove duplicate rows.
636
+
637
+ Args:
638
+ unique_settings: The settings for the unique operation.
639
+ """
426
640
 
427
641
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
428
642
  return fl.make_unique(unique_settings.unique_input)
@@ -435,6 +649,16 @@ class FlowGraph:
435
649
  input_node_ids=[unique_settings.depending_on_id])
436
650
 
437
651
  def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
652
+ """Adds a node that solves graph-like problems within the data.
653
+
654
+ This node can be used for operations like finding network paths,
655
+ calculating connected components, or performing other graph algorithms
656
+ on relational data that represents nodes and edges.
657
+
658
+ Args:
659
+ graph_solver_settings: The settings object defining the graph inputs
660
+ and the specific algorithm to apply.
661
+ """
438
662
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
439
663
  return fl.solve_graph(graph_solver_settings.graph_solver_input)
440
664
 
@@ -445,6 +669,12 @@ class FlowGraph:
445
669
  input_node_ids=[graph_solver_settings.depending_on_id])
446
670
 
447
671
  def add_formula(self, function_settings: input_schema.NodeFormula):
672
+ """Adds a node that applies a formula to create or modify a column.
673
+
674
+ Args:
675
+ function_settings: The settings for the formula operation.
676
+ """
677
+
448
678
  error = ""
449
679
  if function_settings.function.field.data_type not in (None, "Auto"):
450
680
  output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
@@ -476,6 +706,14 @@ class FlowGraph:
476
706
  return True, ""
477
707
 
478
708
  def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
709
+ """Adds a cross join node to the graph.
710
+
711
+ Args:
712
+ cross_join_settings: The settings for the cross join operation.
713
+
714
+ Returns:
715
+ The `FlowGraph` instance for method chaining.
716
+ """
479
717
 
480
718
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
481
719
  for left_select in cross_join_settings.cross_join_input.left_select.renames:
@@ -497,6 +735,15 @@ class FlowGraph:
497
735
  return self
498
736
 
499
737
  def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
738
+ """Adds a join node to combine two data streams based on key columns.
739
+
740
+ Args:
741
+ join_settings: The settings for the join operation.
742
+
743
+ Returns:
744
+ The `FlowGraph` instance for method chaining.
745
+ """
746
+
500
747
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
501
748
  for left_select in join_settings.join_input.left_select.renames:
502
749
  left_select.is_available = True if left_select.old_name in main.schema else False
@@ -517,6 +764,15 @@ class FlowGraph:
517
764
  return self
518
765
 
519
766
  def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
767
+ """Adds a fuzzy matching node to join data on approximate string matches.
768
+
769
+ Args:
770
+ fuzzy_settings: The settings for the fuzzy match operation.
771
+
772
+ Returns:
773
+ The `FlowGraph` instance for method chaining.
774
+ """
775
+
520
776
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
521
777
  f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
522
778
  flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
@@ -541,6 +797,18 @@ class FlowGraph:
541
797
  return self
542
798
 
543
799
  def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
800
+ """Adds a node that splits cell values into multiple rows.
801
+
802
+ This is useful for un-nesting data where a single field contains multiple
803
+ values separated by a delimiter.
804
+
805
+ Args:
806
+ node_text_to_rows: The settings object that specifies the column to split
807
+ and the delimiter to use.
808
+
809
+ Returns:
810
+ The `FlowGraph` instance for method chaining.
811
+ """
544
812
  def _func(table: FlowDataEngine) -> FlowDataEngine:
545
813
  return table.split(node_text_to_rows.text_to_rows_input)
546
814
 
@@ -552,6 +820,15 @@ class FlowGraph:
552
820
  return self
553
821
 
554
822
  def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
823
+ """Adds a node to sort the data based on one or more columns.
824
+
825
+ Args:
826
+ sort_settings: The settings for the sort operation.
827
+
828
+ Returns:
829
+ The `FlowGraph` instance for method chaining.
830
+ """
831
+
555
832
  def _func(table: FlowDataEngine) -> FlowDataEngine:
556
833
  return table.do_sort(sort_settings.sort_input)
557
834
 
@@ -563,6 +840,14 @@ class FlowGraph:
563
840
  return self
564
841
 
565
842
  def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
843
+ """Adds a node to take a random or top-N sample of the data.
844
+
845
+ Args:
846
+ sample_settings: The settings object specifying the size of the sample.
847
+
848
+ Returns:
849
+ The `FlowGraph` instance for method chaining.
850
+ """
566
851
  def _func(table: FlowDataEngine) -> FlowDataEngine:
567
852
  return table.get_sample(sample_settings.sample_size)
568
853
 
@@ -575,6 +860,15 @@ class FlowGraph:
575
860
  return self
576
861
 
577
862
  def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
863
+ """Adds a node to create a new column with a unique ID for each record.
864
+
865
+ Args:
866
+ record_id_settings: The settings object specifying the name of the
867
+ new record ID column.
868
+
869
+ Returns:
870
+ The `FlowGraph` instance for method chaining.
871
+ """
578
872
 
579
873
  def _func(table: FlowDataEngine) -> FlowDataEngine:
580
874
  return table.add_record_id(record_id_settings.record_id_input)
@@ -588,6 +882,15 @@ class FlowGraph:
588
882
  return self
589
883
 
590
884
  def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
885
+ """Adds a node to select, rename, reorder, or drop columns.
886
+
887
+ Args:
888
+ select_settings: The settings for the select operation.
889
+
890
+ Returns:
891
+ The `FlowGraph` instance for method chaining.
892
+ """
893
+
591
894
  select_cols = select_settings.select_input
592
895
  drop_cols = tuple(s.old_name for s in select_settings.select_input)
593
896
 
@@ -621,9 +924,18 @@ class FlowGraph:
621
924
 
622
925
  @property
623
926
  def graph_has_functions(self) -> bool:
927
+ """Checks if the graph has any nodes."""
624
928
  return len(self._node_ids) > 0
625
929
 
626
930
  def delete_node(self, node_id: Union[int, str]):
931
+ """Deletes a node from the graph and updates all its connections.
932
+
933
+ Args:
934
+ node_id: The ID of the node to delete.
935
+
936
+ Raises:
937
+ Exception: If the node with the given ID does not exist.
938
+ """
627
939
  logger.info(f"Starting deletion of node with ID: {node_id}")
628
940
 
629
941
  node = self._node_db.get(node_id)
@@ -656,6 +968,7 @@ class FlowGraph:
656
968
 
657
969
  @property
658
970
  def graph_has_input_data(self) -> bool:
971
+ """Checks if the graph has an initial input data source."""
659
972
  return self._input_data is not None
660
973
 
661
974
  def add_node_step(self,
@@ -670,6 +983,24 @@ class FlowGraph:
670
983
  cache_results: bool = None,
671
984
  schema_callback: Callable = None,
672
985
  input_node_ids: List[int] = None) -> FlowNode:
986
+ """The core method for adding or updating a node in the graph.
987
+
988
+ Args:
989
+ node_id: The unique ID for the node.
990
+ function: The core processing function for the node.
991
+ input_columns: A list of input column names required by the function.
992
+ output_schema: A predefined schema for the node's output.
993
+ node_type: A string identifying the type of node (e.g., 'filter', 'join').
994
+ drop_columns: A list of columns to be dropped after the function executes.
995
+ renew_schema: If True, the schema is recalculated after execution.
996
+ setting_input: A configuration object containing settings for the node.
997
+ cache_results: If True, the node's results are cached for future runs.
998
+ schema_callback: A function that dynamically calculates the output schema.
999
+ input_node_ids: A list of IDs for the nodes that this node depends on.
1000
+
1001
+ Returns:
1002
+ The created or updated FlowNode object.
1003
+ """
673
1004
  existing_node = self.get_node(node_id)
674
1005
  if existing_node is not None:
675
1006
  if existing_node.node_type != node_type:
@@ -686,9 +1017,8 @@ class FlowGraph:
686
1017
  if (
687
1018
  input_nodes is not None or
688
1019
  function.__name__ in ('placeholder', 'analysis_preparation') or
689
- node_type == "cloud_storage_reader"
1020
+ node_type in ("cloud_storage_reader", "polars_lazy_frame", "input_data")
690
1021
  ):
691
-
692
1022
  if not existing_node:
693
1023
  node = FlowNode(node_id=node_id,
694
1024
  function=function,
@@ -709,8 +1039,6 @@ class FlowGraph:
709
1039
  setting_input=setting_input,
710
1040
  schema_callback=schema_callback)
711
1041
  node = existing_node
712
- elif node_type == 'input_data':
713
- node = None
714
1042
  else:
715
1043
  raise Exception("No data initialized")
716
1044
  self._node_db[node_id] = node
@@ -718,6 +1046,11 @@ class FlowGraph:
718
1046
  return node
719
1047
 
720
1048
  def add_include_cols(self, include_columns: List[str]):
1049
+ """Adds columns to both the input and output column lists.
1050
+
1051
+ Args:
1052
+ include_columns: A list of column names to include.
1053
+ """
721
1054
  for column in include_columns:
722
1055
  if column not in self._input_cols:
723
1056
  self._input_cols.append(column)
@@ -726,6 +1059,12 @@ class FlowGraph:
726
1059
  return self
727
1060
 
728
1061
  def add_output(self, output_file: input_schema.NodeOutput):
1062
+ """Adds an output node to write the final data to a destination.
1063
+
1064
+ Args:
1065
+ output_file: The settings for the output file.
1066
+ """
1067
+
729
1068
  def _func(df: FlowDataEngine):
730
1069
  output_file.output_settings.populate_abs_file_path()
731
1070
  execute_remote = self.execution_location != 'local'
@@ -747,7 +1086,12 @@ class FlowGraph:
747
1086
  input_node_ids=[input_node_id])
748
1087
 
749
1088
  def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
750
- logger.info("Adding database reader")
1089
+ """Adds a node to write data to a database.
1090
+
1091
+ Args:
1092
+ node_database_writer: The settings for the database writer node.
1093
+ """
1094
+
751
1095
  node_type = 'database_writer'
752
1096
  database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
753
1097
  database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
@@ -795,6 +1139,12 @@ class FlowGraph:
795
1139
  node = self.get_node(node_database_writer.node_id)
796
1140
 
797
1141
  def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
1142
+ """Adds a node to read data from a database.
1143
+
1144
+ Args:
1145
+ node_database_reader: The settings for the database reader node.
1146
+ """
1147
+
798
1148
  logger.info("Adding database reader")
799
1149
  node_type = 'database_reader'
800
1150
  database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
@@ -868,15 +1218,27 @@ class FlowGraph:
868
1218
  self._node_ids.append(node_database_reader.node_id)
869
1219
 
870
1220
  def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
1221
+ """Adds a node that reads data from a SQL source.
1222
+
1223
+ This is a convenience alias for `add_external_source`.
1224
+
1225
+ Args:
1226
+ external_source_input: The settings for the external SQL source node.
1227
+ """
871
1228
  logger.info('Adding sql source')
872
1229
  self.add_external_source(external_source_input)
873
1230
 
874
1231
  def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
1232
+ """Adds a node to write data to a cloud storage provider.
875
1233
 
876
- node_type = "cloud_storage_writer"
1234
+ Args:
1235
+ node_cloud_storage_writer: The settings for the cloud storage writer node.
1236
+ """
877
1237
 
1238
+ node_type = "cloud_storage_writer"
878
1239
  def _func(df: FlowDataEngine):
879
1240
  df.lazy = True
1241
+ execute_remote = self.execution_location != 'local'
880
1242
  cloud_connection_settings = get_cloud_connection_settings(
881
1243
  connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
882
1244
  user_id=node_cloud_storage_writer.user_id,
@@ -888,15 +1250,22 @@ class FlowGraph:
888
1250
  aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
889
1251
  **CloudStorageReader.get_storage_options(cloud_connection_settings)
890
1252
  )
891
- settings = get_cloud_storage_write_settings_worker_interface(
892
- write_settings=node_cloud_storage_writer.cloud_storage_settings,
893
- connection=full_cloud_storage_connection,
894
- lf=df.data_frame,
895
- flowfile_node_id=node_cloud_storage_writer.node_id,
896
- flowfile_flow_id=self.flow_id)
897
- external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
898
- node._fetch_cached_df = external_database_writer
899
- external_database_writer.get_result()
1253
+ if execute_remote:
1254
+ settings = get_cloud_storage_write_settings_worker_interface(
1255
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1256
+ connection=full_cloud_storage_connection,
1257
+ lf=df.data_frame,
1258
+ flowfile_node_id=node_cloud_storage_writer.node_id,
1259
+ flowfile_flow_id=self.flow_id)
1260
+ external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
1261
+ node._fetch_cached_df = external_database_writer
1262
+ external_database_writer.get_result()
1263
+ else:
1264
+ cloud_storage_write_settings_internal = CloudStorageWriteSettingsInternal(
1265
+ connection=full_cloud_storage_connection,
1266
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1267
+ )
1268
+ df.to_cloud_storage_obj(cloud_storage_write_settings_internal)
900
1269
  return df
901
1270
 
902
1271
  def schema_callback():
@@ -919,12 +1288,10 @@ class FlowGraph:
919
1288
  node = self.get_node(node_cloud_storage_writer.node_id)
920
1289
 
921
1290
  def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
922
- """
923
- Adds a cloud storage read node to the flow graph.
1291
+ """Adds a cloud storage read node to the flow graph.
1292
+
924
1293
  Args:
925
- node_cloud_storage_reader (input_schema.NodeCloudStorageReader):
926
- The settings for the cloud storage read node.
927
- Returns:
1294
+ node_cloud_storage_reader: The settings for the cloud storage read node.
928
1295
  """
929
1296
  node_type = "cloud_storage_reader"
930
1297
  logger.info("Adding cloud storage reader")
@@ -953,6 +1320,11 @@ class FlowGraph:
953
1320
 
954
1321
  def add_external_source(self,
955
1322
  external_source_input: input_schema.NodeExternalSource):
1323
+ """Adds a node for a custom external data source.
1324
+
1325
+ Args:
1326
+ external_source_input: The settings for the external source node.
1327
+ """
956
1328
 
957
1329
  node_type = 'external_source'
958
1330
  external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
@@ -1009,6 +1381,12 @@ class FlowGraph:
1009
1381
  setting_input=external_source_input)
1010
1382
 
1011
1383
  def add_read(self, input_file: input_schema.NodeRead):
1384
+ """Adds a node to read data from a local file (e.g., CSV, Parquet, Excel).
1385
+
1386
+ Args:
1387
+ input_file: The settings for the read operation.
1388
+ """
1389
+
1012
1390
  if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
1013
1391
  sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
1014
1392
  input_file.received_file.sheet_name = sheet_name
@@ -1077,7 +1455,18 @@ class FlowGraph:
1077
1455
  node.schema_callback = schema_callback
1078
1456
  return self
1079
1457
 
1080
- def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
1458
+ def add_datasource(self, input_file: Union[input_schema.NodeDatasource, input_schema.NodeManualInput]) -> "FlowGraph":
1459
+ """Adds a data source node to the graph.
1460
+
1461
+ This method serves as a factory for creating starting nodes, handling both
1462
+ file-based sources and direct manual data entry.
1463
+
1464
+ Args:
1465
+ input_file: The configuration object for the data source.
1466
+
1467
+ Returns:
1468
+ The `FlowGraph` instance for method chaining.
1469
+ """
1081
1470
  if isinstance(input_file, input_schema.NodeManualInput):
1082
1471
  input_data = FlowDataEngine(input_file.raw_data_format)
1083
1472
  ref = 'manual_input'
@@ -1103,29 +1492,35 @@ class FlowGraph:
1103
1492
  return self
1104
1493
 
1105
1494
  def add_manual_input(self, input_file: input_schema.NodeManualInput):
1495
+ """Adds a node for manual data entry.
1496
+
1497
+ This is a convenience alias for `add_datasource`.
1498
+
1499
+ Args:
1500
+ input_file: The settings and data for the manual input node.
1501
+ """
1106
1502
  self.add_datasource(input_file)
1107
1503
 
1108
1504
  @property
1109
1505
  def nodes(self) -> List[FlowNode]:
1110
- return list(self._node_db.values())
1506
+ """Gets a list of all FlowNode objects in the graph."""
1111
1507
 
1112
- def check_for_missed_cols(self, expected_cols: List):
1113
- not_filled_cols = set(expected_cols) - set(self._output_cols)
1114
- cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
1115
- self._output_cols += cols_available
1116
-
1117
- @property
1118
- def input_data_columns(self) -> List[str] | None:
1119
- if self._input_cols:
1120
- return list(set([col for col in self._input_cols if
1121
- col in [table_col.name for table_col in self._input_data.schema]]))
1508
+ return list(self._node_db.values())
1122
1509
 
1123
1510
  @property
1124
- def execution_mode(self) -> str:
1511
+ def execution_mode(self) -> schemas.ExecutionModeLiteral:
1512
+ """Gets the current execution mode ('Development' or 'Performance')."""
1125
1513
  return self.flow_settings.execution_mode
1126
1514
 
1127
1515
  def get_implicit_starter_nodes(self) -> List[FlowNode]:
1128
- """Ensures that nodes that can be a start (e.g. polars code), will be a starting node"""
1516
+ """Finds nodes that can act as starting points but are not explicitly defined as such.
1517
+
1518
+ Some nodes, like the Polars Code node, can function without an input. This
1519
+ method identifies such nodes if they have no incoming connections.
1520
+
1521
+ Returns:
1522
+ A list of `FlowNode` objects that are implicit starting nodes.
1523
+ """
1129
1524
  starting_node_ids = [node.node_id for node in self._flow_starts]
1130
1525
  implicit_starting_nodes = []
1131
1526
  for node in self.nodes:
@@ -1135,17 +1530,39 @@ class FlowGraph:
1135
1530
 
1136
1531
  @execution_mode.setter
1137
1532
  def execution_mode(self, mode: schemas.ExecutionModeLiteral):
1533
+ """Sets the execution mode for the flow.
1534
+
1535
+ Args:
1536
+ mode: The execution mode to set.
1537
+ """
1138
1538
  self.flow_settings.execution_mode = mode
1139
1539
 
1140
1540
  @property
1141
1541
  def execution_location(self) -> schemas.ExecutionLocationsLiteral:
1542
+ """Gets the current execution location."""
1142
1543
  return self.flow_settings.execution_location
1143
1544
 
1144
1545
  @execution_location.setter
1145
1546
  def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
1547
+ """Sets the execution location for the flow.
1548
+
1549
+ Args:
1550
+ execution_location: The execution location to set.
1551
+ """
1146
1552
  self.flow_settings.execution_location = execution_location
1147
1553
 
1148
- def run_graph(self):
1554
+ def run_graph(self) -> RunInformation | None:
1555
+ """Executes the entire data flow graph from start to finish.
1556
+
1557
+ It determines the correct execution order, runs each node,
1558
+ collects results, and handles errors and cancellations.
1559
+
1560
+ Returns:
1561
+ A RunInformation object summarizing the execution results.
1562
+
1563
+ Raises:
1564
+ Exception: If the flow is already running.
1565
+ """
1149
1566
  if self.flow_settings.is_running:
1150
1567
  raise Exception('Flow is already running')
1151
1568
  try:
@@ -1163,10 +1580,13 @@ class FlowGraph:
1163
1580
  execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
1164
1581
  node not in skip_nodes],
1165
1582
  flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
1166
-
1167
1583
  skip_node_message(self.flow_logger, skip_nodes)
1168
1584
  execution_order_message(self.flow_logger, execution_order)
1169
1585
  performance_mode = self.flow_settings.execution_mode == 'Performance'
1586
+ if self.flow_settings.execution_location == 'local':
1587
+ OFFLOAD_TO_WORKER.value = False
1588
+ elif self.flow_settings.execution_location == 'remote':
1589
+ OFFLOAD_TO_WORKER.value = True
1170
1590
  for node in execution_order:
1171
1591
  node_logger = self.flow_logger.get_node_logger(node.node_id)
1172
1592
  if self.flow_settings.is_canceled:
@@ -1215,6 +1635,11 @@ class FlowGraph:
1215
1635
  self.flow_settings.is_running = False
1216
1636
 
1217
1637
  def get_run_info(self) -> RunInformation:
1638
+ """Gets a summary of the most recent graph execution.
1639
+
1640
+ Returns:
1641
+ A RunInformation object with details about the last run.
1642
+ """
1218
1643
  if self.latest_run_info is None:
1219
1644
  node_results = self.node_results
1220
1645
  success = all(nr.success for nr in node_results)
@@ -1234,6 +1659,11 @@ class FlowGraph:
1234
1659
 
1235
1660
  @property
1236
1661
  def node_connections(self) -> List[Tuple[int, int]]:
1662
+ """Computes and returns a list of all connections in the graph.
1663
+
1664
+ Returns:
1665
+ A list of tuples, where each tuple is a (source_id, target_id) pair.
1666
+ """
1237
1667
  connections = set()
1238
1668
  for node in self.nodes:
1239
1669
  outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
@@ -1245,28 +1675,30 @@ class FlowGraph:
1245
1675
  connections.add(node_connection)
1246
1676
  return list(connections)
1247
1677
 
1248
- def get_schema(self) -> List[FlowfileColumn]:
1249
- if self.schema is None:
1250
- if len(self._node_ids) > 0:
1251
- self.schema = self._node_db[self._node_ids[0]].schema
1252
- return self.schema
1678
+ def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1679
+ """Retrieves all data needed to render a node in the UI.
1253
1680
 
1254
- def get_example_data(self, node_id: int) -> TableExample | None:
1255
- node = self._node_db[node_id]
1256
- return node.get_table_example(include_data=True)
1681
+ Args:
1682
+ node_id: The ID of the node.
1683
+ include_example: Whether to include data samples in the result.
1257
1684
 
1258
- def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1685
+ Returns:
1686
+ A NodeData object, or None if the node is not found.
1687
+ """
1259
1688
  node = self._node_db[node_id]
1260
1689
  return node.get_node_data(flow_id=self.flow_id, include_example=include_example)
1261
1690
 
1262
1691
  def get_node_storage(self) -> schemas.FlowInformation:
1692
+ """Serializes the entire graph's state into a storable format.
1263
1693
 
1694
+ Returns:
1695
+ A FlowInformation object representing the complete graph.
1696
+ """
1264
1697
  node_information = {node.node_id: node.get_node_information() for
1265
1698
  node in self.nodes if node.is_setup and node.is_correct}
1266
1699
 
1267
1700
  return schemas.FlowInformation(flow_id=self.flow_id,
1268
1701
  flow_name=self.__name__,
1269
- storage_location=self.flow_settings.path,
1270
1702
  flow_settings=self.flow_settings,
1271
1703
  data=node_information,
1272
1704
  node_starts=[v.node_id for v in self._flow_starts],
@@ -1274,6 +1706,8 @@ class FlowGraph:
1274
1706
  )
1275
1707
 
1276
1708
  def cancel(self):
1709
+ """Cancels an ongoing graph execution."""
1710
+
1277
1711
  if not self.flow_settings.is_running:
1278
1712
  return
1279
1713
  self.flow_settings.is_canceled = True
@@ -1281,15 +1715,30 @@ class FlowGraph:
1281
1715
  node.cancel()
1282
1716
 
1283
1717
  def close_flow(self):
1718
+ """Performs cleanup operations, such as clearing node caches."""
1719
+
1284
1720
  for node in self.nodes:
1285
1721
  node.remove_cache()
1286
1722
 
1287
1723
  def save_flow(self, flow_path: str):
1724
+ """Saves the current state of the flow graph to a file.
1725
+
1726
+ Args:
1727
+ flow_path: The path where the flow file will be saved.
1728
+ """
1288
1729
  with open(flow_path, 'wb') as f:
1289
1730
  pickle.dump(self.get_node_storage(), f)
1290
1731
  self.flow_settings.path = flow_path
1291
1732
 
1292
- def get_frontend_data(self):
1733
+ def get_frontend_data(self) -> dict:
1734
+ """Formats the graph structure into a JSON-like dictionary for a specific legacy frontend.
1735
+
1736
+ This method transforms the graph's state into a format compatible with the
1737
+ Drawflow.js library.
1738
+
1739
+ Returns:
1740
+ A dictionary representing the graph in Drawflow format.
1741
+ """
1293
1742
  result = {
1294
1743
  'Home': {
1295
1744
  "data": {}
@@ -1360,6 +1809,11 @@ class FlowGraph:
1360
1809
  return result
1361
1810
 
1362
1811
  def get_vue_flow_input(self) -> schemas.VueFlowInput:
1812
+ """Formats the graph's nodes and edges into a schema suitable for the VueFlow frontend.
1813
+
1814
+ Returns:
1815
+ A VueFlowInput object.
1816
+ """
1363
1817
  edges: List[schemas.NodeEdge] = []
1364
1818
  nodes: List[schemas.NodeInput] = []
1365
1819
  for node in self.nodes:
@@ -1368,11 +1822,19 @@ class FlowGraph:
1368
1822
  return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)
1369
1823
 
1370
1824
  def reset(self):
1825
+ """Forces a deep reset on all nodes in the graph."""
1826
+
1371
1827
  for node in self.nodes:
1372
1828
  node.reset(True)
1373
1829
 
1374
1830
  def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
1375
- """Copy an existing node with potentially new settings."""
1831
+ """Creates a copy of an existing node.
1832
+
1833
+ Args:
1834
+ new_node_settings: The promise containing new settings (like ID and position).
1835
+ existing_setting_input: The settings object from the node being copied.
1836
+ node_type: The type of the node being copied.
1837
+ """
1376
1838
  self.add_node_promise(new_node_settings)
1377
1839
 
1378
1840
  if isinstance(existing_setting_input, input_schema.NodePromise):
@@ -1383,69 +1845,26 @@ class FlowGraph:
1383
1845
  )
1384
1846
  getattr(self, f"add_{node_type}")(combined_settings)
1385
1847
 
1848
+ def generate_code(self):
1849
+ """Generates code for the flow graph.
1850
+ This method exports the flow graph to a Polars-compatible format.
1851
+ """
1852
+ from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars
1853
+ print(export_flow_to_polars(self))
1386
1854
 
1387
- def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
1388
- """
1389
- Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
1855
+
1856
+ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1857
+ """Merges settings from an existing object with new settings from a NodePromise.
1858
+
1859
+ Typically used when copying a node to apply a new ID and position.
1390
1860
 
1391
1861
  Args:
1392
- *flow_graphs: Multiple FlowGraph instances to combine
1862
+ setting_input: The original settings object.
1863
+ new_settings: The NodePromise with new positional and ID data.
1393
1864
 
1394
1865
  Returns:
1395
- A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
1396
-
1397
- Raises:
1398
- ValueError: If any flow_ids overlap
1866
+ A new settings object with the merged properties.
1399
1867
  """
1400
- # Validate flow IDs are unique
1401
- _validate_unique_flow_ids(flow_graphs)
1402
-
1403
- # Create ID mapping for all nodes
1404
- node_id_mapping = _create_node_id_mapping(flow_graphs)
1405
-
1406
- # Remap and combine nodes
1407
- all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
1408
-
1409
- # Create a new combined flow graph
1410
- combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
1411
- # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
1412
-
1413
-
1414
- def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
1415
- """Ensure all flow graphs have unique flow_ids."""
1416
- all_flow_ids = [fg.flow_id for fg in flow_graphs]
1417
- if len(all_flow_ids) != len(set(all_flow_ids)):
1418
- raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
1419
-
1420
-
1421
- def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
1422
- """Create a mapping from original node IDs to new unique node IDs."""
1423
- node_id_mapping: Dict[int, Dict[int, int]] = {}
1424
- next_node_id = 0
1425
-
1426
- for fg in flow_graphs:
1427
- node_id_mapping[fg.flow_id] = {}
1428
- for node in fg.nodes:
1429
- node_id_mapping[fg.flow_id][node.node_id] = next_node_id
1430
- next_node_id += 1
1431
-
1432
- return node_id_mapping
1433
-
1434
-
1435
- def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
1436
- node_id_mapping: Dict[int, Dict[int, int]]) -> List:
1437
- """Create new nodes with remapped IDs."""
1438
- all_nodes = []
1439
- for fg in flow_graphs:
1440
- for node in fg.nodes:
1441
- new_node = copy.deepcopy(node)
1442
- new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
1443
- all_nodes.append(new_node)
1444
- return all_nodes
1445
-
1446
-
1447
- def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1448
- """Combine excopy_nodeisting settings with new settings from a NodePromise."""
1449
1868
  copied_setting_input = deepcopy(setting_input)
1450
1869
 
1451
1870
  # Update only attributes that exist on new_settings
@@ -1464,7 +1883,13 @@ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings:
1464
1883
  return copied_setting_input
1465
1884
 
1466
1885
 
1467
- def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
1886
+ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection) -> None:
1887
+ """Adds a connection between two nodes in the flow graph.
1888
+
1889
+ Args:
1890
+ flow: The FlowGraph instance to modify.
1891
+ node_connection: An object defining the source and target of the connection.
1892
+ """
1468
1893
  logger.info('adding a connection')
1469
1894
  from_node = flow.get_node(node_connection.output_connection.node_id)
1470
1895
  to_node = flow.get_node(node_connection.input_connection.node_id)
@@ -1476,7 +1901,12 @@ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection
1476
1901
 
1477
1902
 
1478
1903
  def delete_connection(graph, node_connection: input_schema.NodeConnection):
1479
- """Delete the connection between two nodes."""
1904
+ """Deletes a connection between two nodes in the flow graph.
1905
+
1906
+ Args:
1907
+ graph: The FlowGraph instance to modify.
1908
+ node_connection: An object defining the connection to be removed.
1909
+ """
1480
1910
  from_node = graph.get_node(node_connection.output_connection.node_id)
1481
1911
  to_node = graph.get_node(node_connection.input_connection.node_id)
1482
1912
  connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
@@ -1492,6 +1922,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
1492
1922
  to_node.delete_input_node(
1493
1923
  node_connection.output_connection.node_id,
1494
1924
  connection_type=node_connection.input_connection.connection_class,
1495
- )
1496
-
1497
-
1925
+ )