Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic; see the registry's advisory page for more details.

Files changed (145):
  1. flowfile/__init__.py +27 -6
  2. flowfile/api.py +1 -0
  3. flowfile/web/__init__.py +2 -2
  4. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  5. flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
  6. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  7. flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
  8. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  9. flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
  10. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
  11. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
  14. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
  15. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
  16. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
  17. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
  20. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
  21. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
  22. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
  23. flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
  24. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  25. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
  26. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  27. flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
  28. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
  29. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
  30. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
  31. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
  32. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
  33. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
  34. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
  35. flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
  36. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
  37. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
  38. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
  39. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
  40. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
  41. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
  42. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
  43. flowfile/web/static/assets/api-6ef0dcef.js +80 -0
  44. flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
  45. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  46. flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
  47. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
  48. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
  49. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
  50. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
  51. flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
  52. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
  53. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
  54. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
  55. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
  56. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
  57. flowfile/web/static/index.html +1 -1
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
  59. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
  60. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
  61. flowfile_core/__init__.py +3 -0
  62. flowfile_core/auth/jwt.py +39 -0
  63. flowfile_core/configs/node_store/nodes.py +9 -6
  64. flowfile_core/configs/settings.py +6 -5
  65. flowfile_core/database/connection.py +63 -15
  66. flowfile_core/database/init_db.py +0 -1
  67. flowfile_core/database/models.py +49 -2
  68. flowfile_core/flowfile/code_generator/code_generator.py +472 -17
  69. flowfile_core/flowfile/connection_manager/models.py +1 -1
  70. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  71. flowfile_core/flowfile/extensions.py +1 -1
  72. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  73. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  74. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
  75. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  76. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  77. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  78. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  79. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
  80. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  81. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  82. flowfile_core/flowfile/flow_graph.py +718 -253
  83. flowfile_core/flowfile/flow_graph_utils.py +2 -2
  84. flowfile_core/flowfile/flow_node/flow_node.py +563 -117
  85. flowfile_core/flowfile/flow_node/models.py +154 -20
  86. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  87. flowfile_core/flowfile/handler.py +2 -33
  88. flowfile_core/flowfile/manage/open_flowfile.py +1 -2
  89. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  90. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  91. flowfile_core/flowfile/util/calculate_layout.py +0 -2
  92. flowfile_core/flowfile/utils.py +35 -26
  93. flowfile_core/main.py +35 -15
  94. flowfile_core/routes/cloud_connections.py +77 -0
  95. flowfile_core/routes/logs.py +2 -7
  96. flowfile_core/routes/public.py +1 -0
  97. flowfile_core/routes/routes.py +130 -90
  98. flowfile_core/routes/secrets.py +72 -14
  99. flowfile_core/schemas/__init__.py +8 -0
  100. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  101. flowfile_core/schemas/input_schema.py +121 -71
  102. flowfile_core/schemas/output_model.py +19 -3
  103. flowfile_core/schemas/schemas.py +150 -12
  104. flowfile_core/schemas/transform_schema.py +175 -35
  105. flowfile_core/utils/utils.py +40 -1
  106. flowfile_core/utils/validate_setup.py +41 -0
  107. flowfile_frame/__init__.py +9 -1
  108. flowfile_frame/cloud_storage/frame_helpers.py +39 -0
  109. flowfile_frame/cloud_storage/secret_manager.py +73 -0
  110. flowfile_frame/expr.py +28 -1
  111. flowfile_frame/expr.pyi +76 -61
  112. flowfile_frame/flow_frame.py +481 -208
  113. flowfile_frame/flow_frame.pyi +140 -91
  114. flowfile_frame/flow_frame_methods.py +160 -22
  115. flowfile_frame/group_frame.py +3 -0
  116. flowfile_frame/utils.py +25 -3
  117. flowfile_worker/external_sources/s3_source/main.py +216 -0
  118. flowfile_worker/external_sources/s3_source/models.py +142 -0
  119. flowfile_worker/funcs.py +51 -6
  120. flowfile_worker/models.py +22 -2
  121. flowfile_worker/routes.py +40 -38
  122. flowfile_worker/utils.py +1 -1
  123. test_utils/s3/commands.py +46 -0
  124. test_utils/s3/data_generator.py +292 -0
  125. test_utils/s3/demo_data_generator.py +186 -0
  126. test_utils/s3/fixtures.py +214 -0
  127. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  128. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  129. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  130. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  131. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  132. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  133. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  134. flowfile_core/schemas/defaults.py +0 -9
  135. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  136. flowfile_core/schemas/models.py +0 -193
  137. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  138. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  139. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  140. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  141. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
  142. {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
  143. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
  144. {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  145. {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
@@ -2,7 +2,7 @@ import datetime
2
2
  import pickle
3
3
  import polars as pl
4
4
  import fastexcel
5
- import copy
5
+ import re
6
6
  from fastapi.exceptions import HTTPException
7
7
  from time import time
8
8
  from functools import partial
@@ -11,37 +11,58 @@ from uuid import uuid1
11
11
  from copy import deepcopy
12
12
  from pyarrow.parquet import ParquetFile
13
13
  from flowfile_core.configs import logger
14
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
14
15
  from flowfile_core.configs.flow_logger import FlowLogger
15
16
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
16
- from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
17
17
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
18
18
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
19
19
  pre_calculate_pivot_schema)
20
+ from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
20
21
  from flowfile_core.utils.arrow_reader import get_read_top_n
21
22
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
22
23
  from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes, \
23
24
  get_calamine_xlsx_data_types
24
25
  from flowfile_core.flowfile.sources import external_sources
25
26
  from flowfile_core.schemas import input_schema, schemas, transform_schema
26
- from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
27
- from flowfile_core.flowfile.utils import snake_case_to_camel_case, _handle_raw_data
27
+ from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
28
+ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
29
+ CloudStorageWriteSettingsInternal,
30
+ FullCloudStorageConnection,
31
+ get_cloud_storage_write_settings_worker_interface, AuthMethod)
32
+ from flowfile_core.flowfile.utils import snake_case_to_camel_case
28
33
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
29
34
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
30
35
  from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
31
36
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
32
- from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalAirbyteFetcher,
33
- ExternalDatabaseFetcher,
37
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
34
38
  ExternalDatabaseWriter,
35
- ExternalDfFetcher)
39
+ ExternalDfFetcher,
40
+ ExternalCloudWriter)
36
41
  from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
37
42
  from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
38
43
  from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
39
- from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
44
+ from flowfile_core.flowfile.database_connection_manager.db_connections import (get_local_database_connection,
45
+ get_local_cloud_connection)
40
46
  from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layout
41
47
 
42
48
 
43
49
  def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
44
50
  end_row: int, end_column: int, has_headers: bool):
51
+ """Calculates the schema of an XLSX file by reading a sample of rows.
52
+
53
+ Args:
54
+ engine: The engine to use for reading ('openpyxl' or 'calamine').
55
+ file_path: The path to the XLSX file.
56
+ sheet_name: The name of the sheet to read.
57
+ start_row: The starting row for data reading.
58
+ start_column: The starting column for data reading.
59
+ end_row: The ending row for data reading.
60
+ end_column: The ending column for data reading.
61
+ has_headers: A boolean indicating if the file has a header row.
62
+
63
+ Returns:
64
+ A list of FlowfileColumn objects representing the schema.
65
+ """
45
66
  try:
46
67
  logger.info('Starting to calculate the schema')
47
68
  if engine == 'openpyxl':
@@ -64,35 +85,80 @@ def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int
64
85
 
65
86
 
66
87
  def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
88
+ """Logs a warning message listing all nodes that will be skipped during execution.
89
+
90
+ Args:
91
+ flow_logger: The logger instance for the flow.
92
+ nodes: A list of FlowNode objects to be skipped.
93
+ """
67
94
  if len(nodes) > 0:
68
95
  msg = "\n".join(str(node) for node in nodes)
69
96
  flow_logger.warning(f'skipping nodes:\n{msg}')
70
97
 
71
98
 
72
99
  def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
100
+ """Logs an informational message showing the determined execution order of nodes.
101
+
102
+ Args:
103
+ flow_logger: The logger instance for the flow.
104
+ nodes: A list of FlowNode objects in the order they will be executed.
105
+ """
73
106
  msg = "\n".join(str(node) for node in nodes)
74
107
  flow_logger.info(f'execution order:\n{msg}')
75
108
 
76
109
 
77
110
  def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
78
111
  end_row: int, end_column: int, has_headers: bool):
112
+ """Creates a partially applied function for lazy calculation of an XLSX schema.
113
+
114
+ Args:
115
+ engine: The engine to use for reading.
116
+ file_path: The path to the XLSX file.
117
+ sheet_name: The name of the sheet.
118
+ start_row: The starting row.
119
+ start_column: The starting column.
120
+ end_row: The ending row.
121
+ end_column: The ending column.
122
+ has_headers: A boolean indicating if the file has headers.
123
+
124
+ Returns:
125
+ A callable function that, when called, will execute `get_xlsx_schema`.
126
+ """
79
127
  return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
80
128
  start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
81
129
 
82
130
 
83
- class FlowGraph:
131
+ def get_cloud_connection_settings(connection_name: str,
132
+ user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
133
+ """Retrieves cloud storage connection settings, falling back to environment variables if needed.
134
+
135
+ Args:
136
+ connection_name: The name of the saved connection.
137
+ user_id: The ID of the user owning the connection.
138
+ auth_mode: The authentication method specified by the user.
139
+
140
+ Returns:
141
+ A FullCloudStorageConnection object with the connection details.
142
+
143
+ Raises:
144
+ HTTPException: If the connection settings cannot be found.
84
145
  """
85
- FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
86
- on data. It allows you to create a Directed Acyclic Graph (DAG) where each
87
- node represents a step in the ETL pipeline.
146
+ cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
147
+ if cloud_connection_settings is None and auth_mode in ("env_vars", "auto"):
148
+ # If the auth mode is aws-cli, we do not need connection settings
149
+ cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="env_vars")
150
+ elif cloud_connection_settings is None and auth_mode == "aws-cli":
151
+ cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
152
+ if cloud_connection_settings is None:
153
+ raise HTTPException(status_code=400, detail="Cloud connection settings not found")
154
+ return cloud_connection_settings
88
155
 
89
- The class offers methods to add transformations and data sources, as well as
90
- methods to run the transformations and generate results.
91
156
 
92
- Attributes:
93
- _input_cols (set): A set that stores the input columns for the transformations.
94
- _output_cols (set): A set that stores the output columns from the transformations.
95
- """
157
+ class FlowGraph:
158
+ """A class representing a Directed Acyclic Graph (DAG) for data processing pipelines.
159
+
160
+ It manages nodes, connections, and the execution of the entire flow.
161
+ """
96
162
  uuid: str
97
163
  depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
98
164
  _flow_id: int
@@ -114,13 +180,27 @@ class FlowGraph:
114
180
  flow_settings: schemas.FlowSettings = None
115
181
  flow_logger: FlowLogger
116
182
 
117
- def __init__(self, flow_id: int,
118
- flow_settings: schemas.FlowSettings,
183
+ def __init__(self,
184
+ flow_settings: schemas.FlowSettings | schemas.FlowGraphConfig,
119
185
  name: str = None, input_cols: List[str] = None,
120
186
  output_cols: List[str] = None,
121
187
  path_ref: str = None,
122
188
  input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
123
189
  cache_results: bool = False):
190
+ """Initializes a new FlowGraph instance.
191
+
192
+ Args:
193
+ flow_settings: The configuration settings for the flow.
194
+ name: The name of the flow.
195
+ input_cols: A list of input column names.
196
+ output_cols: A list of output column names.
197
+ path_ref: An optional path to an initial data source.
198
+ input_flow: An optional existing data object to start the flow with.
199
+ cache_results: A global flag to enable or disable result caching.
200
+ """
201
+ if isinstance(flow_settings, schemas.FlowGraphConfig):
202
+ flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
203
+
124
204
  self.flow_settings = flow_settings
125
205
  self.uuid = str(uuid1())
126
206
  self.nodes_completed = 0
@@ -128,8 +208,8 @@ class FlowGraph:
128
208
  self.end_datetime = None
129
209
  self.latest_run_info = None
130
210
  self.node_results = []
131
- self._flow_id = flow_id
132
- self.flow_logger = FlowLogger(flow_id)
211
+ self._flow_id = flow_settings.flow_id
212
+ self.flow_logger = FlowLogger(flow_settings.flow_id)
133
213
  self._flow_starts: List[FlowNode] = []
134
214
  self._results = None
135
215
  self.schema = None
@@ -147,7 +227,13 @@ class FlowGraph:
147
227
  self.add_datasource(input_file=input_flow)
148
228
 
149
229
  def add_node_promise(self, node_promise: input_schema.NodePromise):
230
+ """Adds a placeholder node to the graph that is not yet fully configured.
231
+
232
+ Useful for building the graph structure before all settings are available.
150
233
 
234
+ Args:
235
+ node_promise: A promise object containing basic node information.
236
+ """
151
237
  def placeholder(n: FlowNode = None):
152
238
  if n is None:
153
239
  return FlowDataEngine()
@@ -156,10 +242,75 @@ class FlowGraph:
156
242
  self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
157
243
  setting_input=node_promise)
158
244
 
159
- def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
245
+ def print_tree(self, show_schema=False, show_descriptions=False):
246
+ """
247
+ Print flow_graph as a tree.
160
248
  """
161
- Calculates and applies a layered layout to all nodes in the graph.
162
- Updates the pos_x and pos_y attributes of the node setting inputs.
249
+ max_node_id = max(self._node_db.keys())
250
+
251
+ tree = ""
252
+ tabs = 0
253
+ tab_counter = 0
254
+ for node in self.nodes:
255
+ tab_counter += 1
256
+ node_input = node.setting_input
257
+ operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
258
+
259
+ if operation == "Formula":
260
+ operation = "With Columns"
261
+
262
+ tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
263
+
264
+ if show_descriptions & show_schema:
265
+ raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
266
+ if show_descriptions:
267
+ tree += ": " + str(node_input.description)
268
+ elif show_schema:
269
+ tree += " -> ["
270
+ if operation == "Manual Input":
271
+ schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
272
+ tree += schema
273
+ elif operation == "With Columns":
274
+ tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
275
+ tree += schema + tree_with_col_schema
276
+ elif operation == "Filter":
277
+ index = node_input.filter_input.advanced_filter.find("]")
278
+ filtered_column = str(node_input.filter_input.advanced_filter[1:index])
279
+ schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
280
+ tree += schema
281
+ elif operation == "Group By":
282
+ for col in node_input.groupby_input.agg_cols:
283
+ schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
284
+ tree += schema
285
+ tree += "]"
286
+ else:
287
+ if operation == "Manual Input":
288
+ tree += ": " + str(node_input.raw_data_format.data)
289
+ elif operation == "With Columns":
290
+ tree += ": " + str(node_input.function)
291
+ elif operation == "Filter":
292
+ tree += ": " + str(node_input.filter_input.advanced_filter)
293
+ elif operation == "Group By":
294
+ tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
295
+ tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
296
+
297
+ if node_input.node_id < max_node_id:
298
+ tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
299
+ print("\n"*2)
300
+
301
+ return print(tree)
302
+
303
+
304
+
305
+ def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
306
+ """Calculates and applies a layered layout to all nodes in the graph.
307
+
308
+ This updates their x and y positions for UI rendering.
309
+
310
+ Args:
311
+ y_spacing: The vertical spacing between layers.
312
+ x_spacing: The horizontal spacing between nodes in the same layer.
313
+ initial_y: The initial y-position for the first layer.
163
314
  """
164
315
  self.flow_logger.info("Applying layered layout...")
165
316
  start_time = time()
@@ -186,7 +337,7 @@ class FlowGraph:
186
337
  else:
187
338
  self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
188
339
  elif node:
189
- self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
340
+ self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
190
341
  # else: Node not found, already warned by calculate_layered_layout
191
342
 
192
343
  end_time = time()
@@ -194,51 +345,20 @@ class FlowGraph:
194
345
 
195
346
  except Exception as e:
196
347
  self.flow_logger.error(f"Error applying layout: {e}")
197
- raise # Optional: re-raise the exception
198
-
199
- def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
200
- node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
201
- self.add_explore_data(node_analysis)
202
-
203
- def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
204
- sample_size: int = 10000
205
-
206
- def analysis_preparation(flowfile_table: FlowDataEngine):
207
- if flowfile_table.number_of_records <= 0:
208
- number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
209
- else:
210
- number_of_records = flowfile_table.number_of_records
211
- if number_of_records > sample_size:
212
- flowfile_table = flowfile_table.get_sample(sample_size, random=True)
213
- external_sampler = ExternalDfFetcher(
214
- lf=flowfile_table.data_frame,
215
- file_ref="__gf_walker"+node.hash,
216
- wait_on_completion=True,
217
- node_id=node.node_id,
218
- flow_id=self.flow_id,
219
- )
220
- node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
221
- return flowfile_table
222
-
223
- def schema_callback():
224
- node = self.get_node(node_analysis.node_id)
225
- if len(node.all_inputs) == 1:
226
- input_node = node.all_inputs[0]
227
- return input_node.schema
228
- else:
229
- return [FlowfileColumn.from_input('col_1', 'na')]
230
-
231
- self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
232
- function=analysis_preparation,
233
- setting_input=node_analysis, schema_callback=schema_callback)
234
- node = self.get_node(node_analysis.node_id)
348
+ raise # Optional: re-raise the exception
235
349
 
236
350
  @property
237
351
  def flow_id(self) -> int:
352
+ """Gets the unique identifier of the flow."""
238
353
  return self._flow_id
239
354
 
240
355
  @flow_id.setter
241
356
  def flow_id(self, new_id: int):
357
+ """Sets the unique identifier for the flow and updates all child nodes.
358
+
359
+ Args:
360
+ new_id: The new flow ID.
361
+ """
242
362
  self._flow_id = new_id
243
363
  for node in self.nodes:
244
364
  if hasattr(node.setting_input, 'flow_id'):
@@ -246,23 +366,35 @@ class FlowGraph:
246
366
  self.flow_settings.flow_id = new_id
247
367
 
248
368
  def __repr__(self):
249
- """
250
- Official string representation of the FlowGraph class.
251
- """
369
+ """Provides the official string representation of the FlowGraph instance."""
252
370
  settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
253
371
  return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
254
372
 
255
373
  def get_nodes_overview(self):
374
+ """Gets a list of dictionary representations for all nodes in the graph."""
256
375
  output = []
257
376
  for v in self._node_db.values():
258
377
  output.append(v.get_repr())
259
378
  return output
260
379
 
261
380
  def remove_from_output_cols(self, columns: List[str]):
381
+ """Removes specified columns from the list of expected output columns.
382
+
383
+ Args:
384
+ columns: A list of column names to remove.
385
+ """
262
386
  cols = set(columns)
263
387
  self._output_cols = [c for c in self._output_cols if c not in cols]
264
388
 
265
- def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
389
+ def get_node(self, node_id: Union[int, str] = None) -> FlowNode | None:
390
+ """Retrieves a node from the graph by its ID.
391
+
392
+ Args:
393
+ node_id: The ID of the node to retrieve. If None, retrieves the last added node.
394
+
395
+ Returns:
396
+ The FlowNode object, or None if not found.
397
+ """
266
398
  if node_id is None:
267
399
  node_id = self._node_ids[-1]
268
400
  node = self._node_db.get(node_id)
@@ -270,6 +402,12 @@ class FlowGraph:
270
402
  return node
271
403
 
272
404
  def add_pivot(self, pivot_settings: input_schema.NodePivot):
405
+ """Adds a pivot node to the graph.
406
+
407
+ Args:
408
+ pivot_settings: The settings for the pivot operation.
409
+ """
410
+
273
411
  def _func(fl: FlowDataEngine):
274
412
  return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
275
413
 
@@ -289,6 +427,11 @@ class FlowGraph:
289
427
  node.schema_callback = schema_callback
290
428
 
291
429
  def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
430
+ """Adds an unpivot node to the graph.
431
+
432
+ Args:
433
+ unpivot_settings: The settings for the unpivot operation.
434
+ """
292
435
 
293
436
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
294
437
  return fl.unpivot(unpivot_settings.unpivot_input)
@@ -300,6 +443,12 @@ class FlowGraph:
300
443
  input_node_ids=[unpivot_settings.depending_on_id])
301
444
 
302
445
  def add_union(self, union_settings: input_schema.NodeUnion):
446
+ """Adds a union node to combine multiple data streams.
447
+
448
+ Args:
449
+ union_settings: The settings for the union operation.
450
+ """
451
+
303
452
  def _func(*flowfile_tables: FlowDataEngine):
304
453
  dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
305
454
  return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
@@ -310,7 +459,59 @@ class FlowGraph:
310
459
  setting_input=union_settings,
311
460
  input_node_ids=union_settings.depending_on_ids)
312
461
 
462
+ def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
463
+ """Adds a data exploration/analysis node based on a node promise.
464
+
465
+ Args:
466
+ node_promise: The promise representing the node to be analyzed.
467
+ """
468
+ node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
469
+ self.add_explore_data(node_analysis)
470
+
471
+ def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
472
+ """Adds a specialized node for data exploration and visualization.
473
+
474
+ Args:
475
+ node_analysis: The settings for the data exploration node.
476
+ """
477
+ sample_size: int = 10000
478
+
479
+ def analysis_preparation(flowfile_table: FlowDataEngine):
480
+ if flowfile_table.number_of_records <= 0:
481
+ number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
482
+ else:
483
+ number_of_records = flowfile_table.number_of_records
484
+ if number_of_records > sample_size:
485
+ flowfile_table = flowfile_table.get_sample(sample_size, random=True)
486
+ external_sampler = ExternalDfFetcher(
487
+ lf=flowfile_table.data_frame,
488
+ file_ref="__gf_walker"+node.hash,
489
+ wait_on_completion=True,
490
+ node_id=node.node_id,
491
+ flow_id=self.flow_id,
492
+ )
493
+ node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
494
+ return flowfile_table
495
+
496
+ def schema_callback():
497
+ node = self.get_node(node_analysis.node_id)
498
+ if len(node.all_inputs) == 1:
499
+ input_node = node.all_inputs[0]
500
+ return input_node.schema
501
+ else:
502
+ return [FlowfileColumn.from_input('col_1', 'na')]
503
+
504
+ self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
505
+ function=analysis_preparation,
506
+ setting_input=node_analysis, schema_callback=schema_callback)
507
+ node = self.get_node(node_analysis.node_id)
508
+
313
509
  def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
510
+ """Adds a group-by aggregation node to the graph.
511
+
512
+ Args:
513
+ group_by_settings: The settings for the group-by operation.
514
+ """
314
515
 
315
516
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
316
517
  return fl.do_group_by(group_by_settings.groupby_input, False)
@@ -324,6 +525,7 @@ class FlowGraph:
324
525
  node = self.get_node(group_by_settings.node_id)
325
526
 
326
527
  def schema_callback():
528
+
327
529
  output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
328
530
  depends_on = node.node_inputs.main_inputs[0]
329
531
  input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
@@ -335,22 +537,13 @@ class FlowGraph:
335
537
 
336
538
  node.schema_callback = schema_callback
337
539
 
338
- def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
339
- col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
340
- schema = depends_on.schema
341
- col_exist = depends_on.get_flow_file_column_schema(col_name)
342
- if col_exist is None:
343
- new_schema = schema + [col_output]
344
- else:
345
- new_schema = []
346
- for s in self.schema:
347
- if s.name == col_name:
348
- new_schema.append(col_output)
349
- else:
350
- new_schema.append(s)
351
- return new_schema
352
-
353
540
  def add_filter(self, filter_settings: input_schema.NodeFilter):
541
+ """Adds a filter node to the graph.
542
+
543
+ Args:
544
+ filter_settings: The settings for the filter operation.
545
+ """
546
+
354
547
  is_advanced = filter_settings.filter_input.filter_type == 'advanced'
355
548
  if is_advanced:
356
549
  predicate = filter_settings.filter_input.advanced_filter
@@ -384,6 +577,12 @@ class FlowGraph:
384
577
  )
385
578
 
386
579
  def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
580
+ """Adds a filter node to the graph.
581
+
582
+ Args:
583
+ node_number_of_records: The settings for the record count operation.
584
+ """
585
+
387
586
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
388
587
  return fl.get_record_count()
389
588
 
@@ -394,9 +593,14 @@ class FlowGraph:
394
593
  input_node_ids=[node_number_of_records.depending_on_id])
395
594
 
396
595
  def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
596
+ """Adds a node that executes custom Polars code.
597
+
598
+ Args:
599
+ node_polars_code: The settings for the Polars code node.
600
+ """
601
+
397
602
  def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
398
603
  return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
399
-
400
604
  self.add_node_step(node_id=node_polars_code.node_id,
401
605
  function=_func,
402
606
  node_type='polars_code',
@@ -409,7 +613,31 @@ class FlowGraph:
409
613
  node = self.get_node(node_id=node_polars_code.node_id)
410
614
  node.results.errors = str(e)
411
615
 
616
+ def add_dependency_on_polars_lazy_frame(self,
617
+ lazy_frame: pl.LazyFrame,
618
+ node_id: int):
619
+ """Adds a special node that directly injects a Polars LazyFrame into the graph.
620
+
621
+ Note: This is intended for backend use and will not work in the UI editor.
622
+
623
+ Args:
624
+ lazy_frame: The Polars LazyFrame to inject.
625
+ node_id: The ID for the new node.
626
+ """
627
+ def _func():
628
+ return FlowDataEngine(lazy_frame)
629
+ node_promise = input_schema.NodePromise(flow_id=self.flow_id,
630
+ node_id=node_id, node_type="polars_lazy_frame",
631
+ is_setup=True)
632
+ self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=_func,
633
+ setting_input=node_promise)
634
+
412
635
  def add_unique(self, unique_settings: input_schema.NodeUnique):
636
+ """Adds a node to find and remove duplicate rows.
637
+
638
+ Args:
639
+ unique_settings: The settings for the unique operation.
640
+ """
413
641
 
414
642
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
415
643
  return fl.make_unique(unique_settings.unique_input)
@@ -422,6 +650,16 @@ class FlowGraph:
422
650
  input_node_ids=[unique_settings.depending_on_id])
423
651
 
424
652
  def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
653
+ """Adds a node that solves graph-like problems within the data.
654
+
655
+ This node can be used for operations like finding network paths,
656
+ calculating connected components, or performing other graph algorithms
657
+ on relational data that represents nodes and edges.
658
+
659
+ Args:
660
+ graph_solver_settings: The settings object defining the graph inputs
661
+ and the specific algorithm to apply.
662
+ """
425
663
  def _func(fl: FlowDataEngine) -> FlowDataEngine:
426
664
  return fl.solve_graph(graph_solver_settings.graph_solver_input)
427
665
 
@@ -432,6 +670,12 @@ class FlowGraph:
432
670
  input_node_ids=[graph_solver_settings.depending_on_id])
433
671
 
434
672
  def add_formula(self, function_settings: input_schema.NodeFormula):
673
+ """Adds a node that applies a formula to create or modify a column.
674
+
675
+ Args:
676
+ function_settings: The settings for the formula operation.
677
+ """
678
+
435
679
  error = ""
436
680
  if function_settings.function.field.data_type not in (None, "Auto"):
437
681
  output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
@@ -463,6 +707,14 @@ class FlowGraph:
463
707
  return True, ""
464
708
 
465
709
  def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
710
+ """Adds a cross join node to the graph.
711
+
712
+ Args:
713
+ cross_join_settings: The settings for the cross join operation.
714
+
715
+ Returns:
716
+ The `FlowGraph` instance for method chaining.
717
+ """
466
718
 
467
719
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
468
720
  for left_select in cross_join_settings.cross_join_input.left_select.renames:
@@ -484,6 +736,15 @@ class FlowGraph:
484
736
  return self
485
737
 
486
738
  def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
739
+ """Adds a join node to combine two data streams based on key columns.
740
+
741
+ Args:
742
+ join_settings: The settings for the join operation.
743
+
744
+ Returns:
745
+ The `FlowGraph` instance for method chaining.
746
+ """
747
+
487
748
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
488
749
  for left_select in join_settings.join_input.left_select.renames:
489
750
  left_select.is_available = True if left_select.old_name in main.schema else False
@@ -504,6 +765,15 @@ class FlowGraph:
504
765
  return self
505
766
 
506
767
  def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
768
+ """Adds a fuzzy matching node to join data on approximate string matches.
769
+
770
+ Args:
771
+ fuzzy_settings: The settings for the fuzzy match operation.
772
+
773
+ Returns:
774
+ The `FlowGraph` instance for method chaining.
775
+ """
776
+
507
777
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
508
778
  f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
509
779
  flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
@@ -528,6 +798,18 @@ class FlowGraph:
528
798
  return self
529
799
 
530
800
  def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
801
+ """Adds a node that splits cell values into multiple rows.
802
+
803
+ This is useful for un-nesting data where a single field contains multiple
804
+ values separated by a delimiter.
805
+
806
+ Args:
807
+ node_text_to_rows: The settings object that specifies the column to split
808
+ and the delimiter to use.
809
+
810
+ Returns:
811
+ The `FlowGraph` instance for method chaining.
812
+ """
531
813
  def _func(table: FlowDataEngine) -> FlowDataEngine:
532
814
  return table.split(node_text_to_rows.text_to_rows_input)
533
815
 
@@ -539,6 +821,15 @@ class FlowGraph:
539
821
  return self
540
822
 
541
823
  def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
824
+ """Adds a node to sort the data based on one or more columns.
825
+
826
+ Args:
827
+ sort_settings: The settings for the sort operation.
828
+
829
+ Returns:
830
+ The `FlowGraph` instance for method chaining.
831
+ """
832
+
542
833
  def _func(table: FlowDataEngine) -> FlowDataEngine:
543
834
  return table.do_sort(sort_settings.sort_input)
544
835
 
@@ -550,6 +841,14 @@ class FlowGraph:
550
841
  return self
551
842
 
552
843
  def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
844
+ """Adds a node to take a random or top-N sample of the data.
845
+
846
+ Args:
847
+ sample_settings: The settings object specifying the size of the sample.
848
+
849
+ Returns:
850
+ The `FlowGraph` instance for method chaining.
851
+ """
553
852
  def _func(table: FlowDataEngine) -> FlowDataEngine:
554
853
  return table.get_sample(sample_settings.sample_size)
555
854
 
@@ -562,6 +861,15 @@ class FlowGraph:
562
861
  return self
563
862
 
564
863
  def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
864
+ """Adds a node to create a new column with a unique ID for each record.
865
+
866
+ Args:
867
+ record_id_settings: The settings object specifying the name of the
868
+ new record ID column.
869
+
870
+ Returns:
871
+ The `FlowGraph` instance for method chaining.
872
+ """
565
873
 
566
874
  def _func(table: FlowDataEngine) -> FlowDataEngine:
567
875
  return table.add_record_id(record_id_settings.record_id_input)
@@ -575,6 +883,15 @@ class FlowGraph:
575
883
  return self
576
884
 
577
885
  def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
886
+ """Adds a node to select, rename, reorder, or drop columns.
887
+
888
+ Args:
889
+ select_settings: The settings for the select operation.
890
+
891
+ Returns:
892
+ The `FlowGraph` instance for method chaining.
893
+ """
894
+
578
895
  select_cols = select_settings.select_input
579
896
  drop_cols = tuple(s.old_name for s in select_settings.select_input)
580
897
 
@@ -608,9 +925,18 @@ class FlowGraph:
608
925
 
609
926
  @property
610
927
  def graph_has_functions(self) -> bool:
928
+ """Checks if the graph has any nodes."""
611
929
  return len(self._node_ids) > 0
612
930
 
613
931
  def delete_node(self, node_id: Union[int, str]):
932
+ """Deletes a node from the graph and updates all its connections.
933
+
934
+ Args:
935
+ node_id: The ID of the node to delete.
936
+
937
+ Raises:
938
+ Exception: If the node with the given ID does not exist.
939
+ """
614
940
  logger.info(f"Starting deletion of node with ID: {node_id}")
615
941
 
616
942
  node = self._node_db.get(node_id)
@@ -643,6 +969,7 @@ class FlowGraph:
643
969
 
644
970
  @property
645
971
  def graph_has_input_data(self) -> bool:
972
+ """Checks if the graph has an initial input data source."""
646
973
  return self._input_data is not None
647
974
 
648
975
  def add_node_step(self,
@@ -656,7 +983,25 @@ class FlowGraph:
656
983
  setting_input: Any = None,
657
984
  cache_results: bool = None,
658
985
  schema_callback: Callable = None,
659
- input_node_ids: List[int] = None):
986
+ input_node_ids: List[int] = None) -> FlowNode:
987
+ """The core method for adding or updating a node in the graph.
988
+
989
+ Args:
990
+ node_id: The unique ID for the node.
991
+ function: The core processing function for the node.
992
+ input_columns: A list of input column names required by the function.
993
+ output_schema: A predefined schema for the node's output.
994
+ node_type: A string identifying the type of node (e.g., 'filter', 'join').
995
+ drop_columns: A list of columns to be dropped after the function executes.
996
+ renew_schema: If True, the schema is recalculated after execution.
997
+ setting_input: A configuration object containing settings for the node.
998
+ cache_results: If True, the node's results are cached for future runs.
999
+ schema_callback: A function that dynamically calculates the output schema.
1000
+ input_node_ids: A list of IDs for the nodes that this node depends on.
1001
+
1002
+ Returns:
1003
+ The created or updated FlowNode object.
1004
+ """
660
1005
  existing_node = self.get_node(node_id)
661
1006
  if existing_node is not None:
662
1007
  if existing_node.node_type != node_type:
@@ -668,15 +1013,13 @@ class FlowGraph:
668
1013
  input_nodes = [self.get_node(node_id) for node_id in input_node_ids]
669
1014
  else:
670
1015
  input_nodes = None
671
- if cache_results is None:
672
- if hasattr(setting_input, 'cache_results'):
673
- cache_results = getattr(setting_input, 'cache_results')
674
- cache_results = False if cache_results is None else cache_results
675
1016
  if isinstance(input_columns, str):
676
1017
  input_columns = [input_columns]
677
-
678
- if input_nodes is not None or function.__name__ in ('placeholder', 'analysis_preparation'):
679
-
1018
+ if (
1019
+ input_nodes is not None or
1020
+ function.__name__ in ('placeholder', 'analysis_preparation') or
1021
+ node_type in ("cloud_storage_reader", "polars_lazy_frame", "input_data")
1022
+ ):
680
1023
  if not existing_node:
681
1024
  node = FlowNode(node_id=node_id,
682
1025
  function=function,
@@ -697,14 +1040,18 @@ class FlowGraph:
697
1040
  setting_input=setting_input,
698
1041
  schema_callback=schema_callback)
699
1042
  node = existing_node
700
- elif node_type == 'input_data':
701
- node = None
702
1043
  else:
703
1044
  raise Exception("No data initialized")
704
1045
  self._node_db[node_id] = node
705
1046
  self._node_ids.append(node_id)
1047
+ return node
706
1048
 
707
1049
  def add_include_cols(self, include_columns: List[str]):
1050
+ """Adds columns to both the input and output column lists.
1051
+
1052
+ Args:
1053
+ include_columns: A list of column names to include.
1054
+ """
708
1055
  for column in include_columns:
709
1056
  if column not in self._input_cols:
710
1057
  self._input_cols.append(column)
@@ -713,6 +1060,12 @@ class FlowGraph:
713
1060
  return self
714
1061
 
715
1062
  def add_output(self, output_file: input_schema.NodeOutput):
1063
+ """Adds an output node to write the final data to a destination.
1064
+
1065
+ Args:
1066
+ output_file: The settings for the output file.
1067
+ """
1068
+
716
1069
  def _func(df: FlowDataEngine):
717
1070
  output_file.output_settings.populate_abs_file_path()
718
1071
  execute_remote = self.execution_location != 'local'
@@ -734,7 +1087,12 @@ class FlowGraph:
734
1087
  input_node_ids=[input_node_id])
735
1088
 
736
1089
  def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
737
- logger.info("Adding database reader")
1090
+ """Adds a node to write data to a database.
1091
+
1092
+ Args:
1093
+ node_database_writer: The settings for the database writer node.
1094
+ """
1095
+
738
1096
  node_type = 'database_writer'
739
1097
  database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
740
1098
  database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
@@ -782,6 +1140,12 @@ class FlowGraph:
782
1140
  node = self.get_node(node_database_writer.node_id)
783
1141
 
784
1142
  def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
1143
+ """Adds a node to read data from a database.
1144
+
1145
+ Args:
1146
+ node_database_reader: The settings for the database reader node.
1147
+ """
1148
+
785
1149
  logger.info("Adding database reader")
786
1150
  node_type = 'database_reader'
787
1151
  database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
@@ -854,80 +1218,129 @@ class FlowGraph:
854
1218
  self._flow_starts.append(node)
855
1219
  self._node_ids.append(node_database_reader.node_id)
856
1220
 
857
- def add_airbyte_reader(self, external_source_input: input_schema.NodeAirbyteReader):
858
- logger.info('Adding airbyte reader')
859
- node_type = 'airbyte_reader'
860
- source_settings: input_schema.AirbyteReader = external_source_input.source_settings
861
- airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
862
- node_id=external_source_input.node_id)
1221
+ def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
1222
+ """Adds a node that reads data from a SQL source.
863
1223
 
864
- logger.info("Airbyte settings created")
865
- airbyte_settings.fields = source_settings.fields
866
- external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
1224
+ This is a convenience alias for `add_external_source`.
867
1225
 
868
- def _func():
869
- logger.info('Calling external source')
870
- external_fetcher = ExternalAirbyteFetcher(airbyte_settings, wait_on_completion=False)
871
- node._fetch_cached_df = external_fetcher
872
- fl = FlowDataEngine(external_fetcher.get_result())
873
- external_source_input.source_settings.fields = [c.get_minimal_field_info() for c in fl.schema]
874
- return fl
1226
+ Args:
1227
+ external_source_input: The settings for the external SQL source node.
1228
+ """
1229
+ logger.info('Adding sql source')
1230
+ self.add_external_source(external_source_input)
1231
+
1232
+ def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
1233
+ """Adds a node to write data to a cloud storage provider.
1234
+
1235
+ Args:
1236
+ node_cloud_storage_writer: The settings for the cloud storage writer node.
1237
+ """
1238
+
1239
+ node_type = "cloud_storage_writer"
1240
+ def _func(df: FlowDataEngine):
1241
+ df.lazy = True
1242
+ execute_remote = self.execution_location != 'local'
1243
+ cloud_connection_settings = get_cloud_connection_settings(
1244
+ connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
1245
+ user_id=node_cloud_storage_writer.user_id,
1246
+ auth_mode=node_cloud_storage_writer.cloud_storage_settings.auth_mode
1247
+ )
1248
+ full_cloud_storage_connection = FullCloudStorageConnection(
1249
+ storage_type=cloud_connection_settings.storage_type,
1250
+ auth_method=cloud_connection_settings.auth_method,
1251
+ aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
1252
+ **CloudStorageReader.get_storage_options(cloud_connection_settings)
1253
+ )
1254
+ if execute_remote:
1255
+ settings = get_cloud_storage_write_settings_worker_interface(
1256
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1257
+ connection=full_cloud_storage_connection,
1258
+ lf=df.data_frame,
1259
+ flowfile_node_id=node_cloud_storage_writer.node_id,
1260
+ flowfile_flow_id=self.flow_id)
1261
+ external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
1262
+ node._fetch_cached_df = external_database_writer
1263
+ external_database_writer.get_result()
1264
+ else:
1265
+ cloud_storage_write_settings_internal = CloudStorageWriteSettingsInternal(
1266
+ connection=full_cloud_storage_connection,
1267
+ write_settings=node_cloud_storage_writer.cloud_storage_settings,
1268
+ )
1269
+ df.to_cloud_storage_obj(cloud_storage_write_settings_internal)
1270
+ return df
875
1271
 
876
1272
  def schema_callback():
877
- return [FlowfileColumn.from_input(f.name, f.data_type) for f in external_source.schema]
1273
+ logger.info("Starting to run the schema callback for cloud storage writer")
1274
+ if self.get_node(node_cloud_storage_writer.node_id).is_correct:
1275
+ return self.get_node(node_cloud_storage_writer.node_id).node_inputs.main_inputs[0].schema
1276
+ else:
1277
+ return [FlowfileColumn.from_input(column_name="__error__", data_type="String")]
878
1278
 
879
- node = self.get_node(external_source_input.node_id)
880
- if node:
881
- node.node_type = node_type
882
- node.name = node_type
883
- node.function = _func
884
- node.setting_input = external_source_input
885
- node.node_settings.cache_results = external_source_input.cache_results
886
- if external_source_input.node_id not in set(start_node.node_id for start_node in self._flow_starts):
887
- self._flow_starts.append(node)
888
- node.schema_callback = schema_callback
889
- else:
890
- node = FlowNode(external_source_input.node_id, function=_func,
891
- setting_input=external_source_input,
892
- name=node_type, node_type=node_type, parent_uuid=self.uuid,
893
- schema_callback=schema_callback)
894
- self._node_db[external_source_input.node_id] = node
895
- self._flow_starts.append(node)
896
- self._node_ids.append(external_source_input.node_id)
897
- if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
898
- logger.info('Using provided schema in the node')
1279
+ self.add_node_step(
1280
+ node_id=node_cloud_storage_writer.node_id,
1281
+ function=_func,
1282
+ input_columns=[],
1283
+ node_type=node_type,
1284
+ setting_input=node_cloud_storage_writer,
1285
+ schema_callback=schema_callback,
1286
+ input_node_ids=[node_cloud_storage_writer.depending_on_id]
1287
+ )
899
1288
 
1289
+ node = self.get_node(node_cloud_storage_writer.node_id)
900
1290
 
901
- def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
902
- logger.info('Adding sql source')
903
- self.add_external_source(external_source_input)
1291
+ def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
1292
+ """Adds a cloud storage read node to the flow graph.
1293
+
1294
+ Args:
1295
+ node_cloud_storage_reader: The settings for the cloud storage read node.
1296
+ """
1297
+ node_type = "cloud_storage_reader"
1298
+ logger.info("Adding cloud storage reader")
1299
+ cloud_storage_read_settings = node_cloud_storage_reader.cloud_storage_settings
1300
+
1301
+ def _func():
1302
+ logger.info("Starting to run the schema callback for cloud storage reader")
1303
+ self.flow_logger.info("Starting to run the schema callback for cloud storage reader")
1304
+ settings = CloudStorageReadSettingsInternal(read_settings=cloud_storage_read_settings,
1305
+ connection=get_cloud_connection_settings(
1306
+ connection_name=cloud_storage_read_settings.connection_name,
1307
+ user_id=node_cloud_storage_reader.user_id,
1308
+ auth_mode=cloud_storage_read_settings.auth_mode
1309
+ ))
1310
+ fl = FlowDataEngine.from_cloud_storage_obj(settings)
1311
+ return fl
1312
+
1313
+ node = self.add_node_step(node_id=node_cloud_storage_reader.node_id,
1314
+ function=_func,
1315
+ cache_results=node_cloud_storage_reader.cache_results,
1316
+ setting_input=node_cloud_storage_reader,
1317
+ node_type=node_type,
1318
+ )
1319
+ if node_cloud_storage_reader.node_id not in set(start_node.node_id for start_node in self._flow_starts):
1320
+ self._flow_starts.append(node)
904
1321
 
905
1322
  def add_external_source(self,
906
- external_source_input: input_schema.NodeExternalSource | input_schema.NodeAirbyteReader):
907
-
908
- custom_source_type = external_source_input.identifier != 'airbyte'
909
- if custom_source_type:
910
- node_type = 'external_source'
911
- external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
912
- source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
913
- model_validate(external_source_input.source_settings))
914
- if hasattr(external_source_script, 'initial_getter'):
915
- initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
916
- else:
917
- initial_getter = None
918
- data_getter = external_source_script.getter(source_settings)
919
- external_source = data_source_factory(source_type='custom',
920
- data_getter=data_getter,
921
- initial_data_getter=initial_getter,
922
- orientation=external_source_input.source_settings.orientation,
923
- schema=None)
1323
+ external_source_input: input_schema.NodeExternalSource):
1324
+ """Adds a node for a custom external data source.
1325
+
1326
+ Args:
1327
+ external_source_input: The settings for the external source node.
1328
+ """
1329
+
1330
+ node_type = 'external_source'
1331
+ external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
1332
+ source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
1333
+ model_validate(external_source_input.source_settings))
1334
+ if hasattr(external_source_script, 'initial_getter'):
1335
+ initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
924
1336
  else:
925
- node_type = 'airbyte_reader'
926
- source_settings: input_schema.AirbyteReader = external_source_input.source_settings
927
- airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
928
- node_id=external_source_input.node_id)
929
- airbyte_settings.fields = source_settings.fields
930
- external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
1337
+ initial_getter = None
1338
+ data_getter = external_source_script.getter(source_settings)
1339
+ external_source = data_source_factory(source_type='custom',
1340
+ data_getter=data_getter,
1341
+ initial_data_getter=initial_getter,
1342
+ orientation=external_source_input.source_settings.orientation,
1343
+ schema=None)
931
1344
 
932
1345
  def _func():
933
1346
  logger.info('Calling external source')
@@ -969,6 +1382,12 @@ class FlowGraph:
969
1382
  setting_input=external_source_input)
970
1383
 
971
1384
  def add_read(self, input_file: input_schema.NodeRead):
1385
+ """Adds a node to read data from a local file (e.g., CSV, Parquet, Excel).
1386
+
1387
+ Args:
1388
+ input_file: The settings for the read operation.
1389
+ """
1390
+
972
1391
  if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
973
1392
  sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
974
1393
  input_file.received_file.sheet_name = sheet_name
@@ -984,8 +1403,8 @@ class FlowGraph:
984
1403
  input_data = FlowDataEngine.create_from_path(input_file.received_file)
985
1404
  else:
986
1405
  input_data = FlowDataEngine.create_from_path_worker(input_file.received_file,
987
- node_id=input_file.node_id,
988
- flow_id=self.flow_id)
1406
+ node_id=input_file.node_id,
1407
+ flow_id=self.flow_id)
989
1408
  input_data.name = input_file.received_file.name
990
1409
  return input_data
991
1410
 
@@ -1037,9 +1456,19 @@ class FlowGraph:
1037
1456
  node.schema_callback = schema_callback
1038
1457
  return self
1039
1458
 
1040
- def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
1459
+ def add_datasource(self, input_file: Union[input_schema.NodeDatasource, input_schema.NodeManualInput]) -> "FlowGraph":
1460
+ """Adds a data source node to the graph.
1461
+
1462
+ This method serves as a factory for creating starting nodes, handling both
1463
+ file-based sources and direct manual data entry.
1464
+
1465
+ Args:
1466
+ input_file: The configuration object for the data source.
1467
+
1468
+ Returns:
1469
+ The `FlowGraph` instance for method chaining.
1470
+ """
1041
1471
  if isinstance(input_file, input_schema.NodeManualInput):
1042
- _handle_raw_data(input_file)
1043
1472
  input_data = FlowDataEngine(input_file.raw_data_format)
1044
1473
  ref = 'manual_input'
1045
1474
  else:
@@ -1051,10 +1480,8 @@ class FlowGraph:
1051
1480
  node.name = ref
1052
1481
  node.function = input_data
1053
1482
  node.setting_input = input_file
1054
-
1055
1483
  if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
1056
1484
  self._flow_starts.append(node)
1057
-
1058
1485
  else:
1059
1486
  input_data.collect()
1060
1487
  node = FlowNode(input_file.node_id, function=input_data,
@@ -1066,29 +1493,35 @@ class FlowGraph:
1066
1493
  return self
1067
1494
 
1068
1495
  def add_manual_input(self, input_file: input_schema.NodeManualInput):
1496
+ """Adds a node for manual data entry.
1497
+
1498
+ This is a convenience alias for `add_datasource`.
1499
+
1500
+ Args:
1501
+ input_file: The settings and data for the manual input node.
1502
+ """
1069
1503
  self.add_datasource(input_file)
1070
1504
 
1071
1505
  @property
1072
1506
  def nodes(self) -> List[FlowNode]:
1073
- return list(self._node_db.values())
1074
-
1075
- def check_for_missed_cols(self, expected_cols: List):
1076
- not_filled_cols = set(expected_cols) - set(self._output_cols)
1077
- cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
1078
- self._output_cols += cols_available
1507
+ """Gets a list of all FlowNode objects in the graph."""
1079
1508
 
1080
- @property
1081
- def input_data_columns(self) -> List[str] | None:
1082
- if self._input_cols:
1083
- return list(set([col for col in self._input_cols if
1084
- col in [table_col.name for table_col in self._input_data.schema]]))
1509
+ return list(self._node_db.values())
1085
1510
 
1086
1511
  @property
1087
- def execution_mode(self) -> str:
1512
+ def execution_mode(self) -> schemas.ExecutionModeLiteral:
1513
+ """Gets the current execution mode ('Development' or 'Performance')."""
1088
1514
  return self.flow_settings.execution_mode
1089
1515
 
1090
1516
  def get_implicit_starter_nodes(self) -> List[FlowNode]:
1091
- """Ensures that nodes that can be a start (e.g. polars code), will be a starting node"""
1517
+ """Finds nodes that can act as starting points but are not explicitly defined as such.
1518
+
1519
+ Some nodes, like the Polars Code node, can function without an input. This
1520
+ method identifies such nodes if they have no incoming connections.
1521
+
1522
+ Returns:
1523
+ A list of `FlowNode` objects that are implicit starting nodes.
1524
+ """
1092
1525
  starting_node_ids = [node.node_id for node in self._flow_starts]
1093
1526
  implicit_starting_nodes = []
1094
1527
  for node in self.nodes:
@@ -1098,17 +1531,39 @@ class FlowGraph:
1098
1531
 
1099
1532
  @execution_mode.setter
1100
1533
  def execution_mode(self, mode: schemas.ExecutionModeLiteral):
1534
+ """Sets the execution mode for the flow.
1535
+
1536
+ Args:
1537
+ mode: The execution mode to set.
1538
+ """
1101
1539
  self.flow_settings.execution_mode = mode
1102
1540
 
1103
1541
  @property
1104
1542
  def execution_location(self) -> schemas.ExecutionLocationsLiteral:
1543
+ """Gets the current execution location."""
1105
1544
  return self.flow_settings.execution_location
1106
1545
 
1107
1546
  @execution_location.setter
1108
1547
  def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
1548
+ """Sets the execution location for the flow.
1549
+
1550
+ Args:
1551
+ execution_location: The execution location to set.
1552
+ """
1109
1553
  self.flow_settings.execution_location = execution_location
1110
1554
 
1111
- def run_graph(self):
1555
+ def run_graph(self) -> RunInformation | None:
1556
+ """Executes the entire data flow graph from start to finish.
1557
+
1558
+ It determines the correct execution order, runs each node,
1559
+ collects results, and handles errors and cancellations.
1560
+
1561
+ Returns:
1562
+ A RunInformation object summarizing the execution results.
1563
+
1564
+ Raises:
1565
+ Exception: If the flow is already running.
1566
+ """
1112
1567
  if self.flow_settings.is_running:
1113
1568
  raise Exception('Flow is already running')
1114
1569
  try:
@@ -1130,6 +1585,8 @@ class FlowGraph:
1130
1585
  skip_node_message(self.flow_logger, skip_nodes)
1131
1586
  execution_order_message(self.flow_logger, execution_order)
1132
1587
  performance_mode = self.flow_settings.execution_mode == 'Performance'
1588
+ if self.flow_settings.execution_location == 'local':
1589
+ OFFLOAD_TO_WORKER.value = False
1133
1590
  for node in execution_order:
1134
1591
  node_logger = self.flow_logger.get_node_logger(node.node_id)
1135
1592
  if self.flow_settings.is_canceled:
@@ -1178,6 +1635,11 @@ class FlowGraph:
1178
1635
  self.flow_settings.is_running = False
1179
1636
 
1180
1637
  def get_run_info(self) -> RunInformation:
1638
+ """Gets a summary of the most recent graph execution.
1639
+
1640
+ Returns:
1641
+ A RunInformation object with details about the last run.
1642
+ """
1181
1643
  if self.latest_run_info is None:
1182
1644
  node_results = self.node_results
1183
1645
  success = all(nr.success for nr in node_results)
@@ -1197,6 +1659,11 @@ class FlowGraph:
1197
1659
 
1198
1660
  @property
1199
1661
  def node_connections(self) -> List[Tuple[int, int]]:
1662
+ """Computes and returns a list of all connections in the graph.
1663
+
1664
+ Returns:
1665
+ A list of tuples, where each tuple is a (source_id, target_id) pair.
1666
+ """
1200
1667
  connections = set()
1201
1668
  for node in self.nodes:
1202
1669
  outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
@@ -1208,28 +1675,30 @@ class FlowGraph:
1208
1675
  connections.add(node_connection)
1209
1676
  return list(connections)
1210
1677
 
1211
- def get_schema(self) -> List[FlowfileColumn]:
1212
- if self.schema is None:
1213
- if len(self._node_ids) > 0:
1214
- self.schema = self._node_db[self._node_ids[0]].schema
1215
- return self.schema
1678
+ def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1679
+ """Retrieves all data needed to render a node in the UI.
1216
1680
 
1217
- def get_example_data(self, node_id: int) -> TableExample | None:
1218
- node = self._node_db[node_id]
1219
- return node.get_table_example(include_data=True)
1681
+ Args:
1682
+ node_id: The ID of the node.
1683
+ include_example: Whether to include data samples in the result.
1220
1684
 
1221
- def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
1685
+ Returns:
1686
+ A NodeData object, or None if the node is not found.
1687
+ """
1222
1688
  node = self._node_db[node_id]
1223
1689
  return node.get_node_data(flow_id=self.flow_id, include_example=include_example)
1224
1690
 
1225
1691
  def get_node_storage(self) -> schemas.FlowInformation:
1692
+ """Serializes the entire graph's state into a storable format.
1226
1693
 
1694
+ Returns:
1695
+ A FlowInformation object representing the complete graph.
1696
+ """
1227
1697
  node_information = {node.node_id: node.get_node_information() for
1228
1698
  node in self.nodes if node.is_setup and node.is_correct}
1229
1699
 
1230
1700
  return schemas.FlowInformation(flow_id=self.flow_id,
1231
1701
  flow_name=self.__name__,
1232
- storage_location=self.flow_settings.path,
1233
1702
  flow_settings=self.flow_settings,
1234
1703
  data=node_information,
1235
1704
  node_starts=[v.node_id for v in self._flow_starts],
@@ -1237,6 +1706,8 @@ class FlowGraph:
1237
1706
  )
1238
1707
 
1239
1708
  def cancel(self):
1709
+ """Cancels an ongoing graph execution."""
1710
+
1240
1711
  if not self.flow_settings.is_running:
1241
1712
  return
1242
1713
  self.flow_settings.is_canceled = True
@@ -1244,15 +1715,30 @@ class FlowGraph:
1244
1715
  node.cancel()
1245
1716
 
1246
1717
  def close_flow(self):
1718
+ """Performs cleanup operations, such as clearing node caches."""
1719
+
1247
1720
  for node in self.nodes:
1248
1721
  node.remove_cache()
1249
1722
 
1250
1723
  def save_flow(self, flow_path: str):
1724
+ """Saves the current state of the flow graph to a file.
1725
+
1726
+ Args:
1727
+ flow_path: The path where the flow file will be saved.
1728
+ """
1251
1729
  with open(flow_path, 'wb') as f:
1252
1730
  pickle.dump(self.get_node_storage(), f)
1253
1731
  self.flow_settings.path = flow_path
1254
1732
 
1255
- def get_frontend_data(self):
1733
+ def get_frontend_data(self) -> dict:
1734
+ """Formats the graph structure into a JSON-like dictionary for a specific legacy frontend.
1735
+
1736
+ This method transforms the graph's state into a format compatible with the
1737
+ Drawflow.js library.
1738
+
1739
+ Returns:
1740
+ A dictionary representing the graph in Drawflow format.
1741
+ """
1256
1742
  result = {
1257
1743
  'Home': {
1258
1744
  "data": {}
@@ -1323,6 +1809,11 @@ class FlowGraph:
1323
1809
  return result
1324
1810
 
1325
1811
  def get_vue_flow_input(self) -> schemas.VueFlowInput:
1812
+ """Formats the graph's nodes and edges into a schema suitable for the VueFlow frontend.
1813
+
1814
+ Returns:
1815
+ A VueFlowInput object.
1816
+ """
1326
1817
  edges: List[schemas.NodeEdge] = []
1327
1818
  nodes: List[schemas.NodeInput] = []
1328
1819
  for node in self.nodes:
@@ -1331,11 +1822,19 @@ class FlowGraph:
1331
1822
  return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)
1332
1823
 
1333
1824
  def reset(self):
1825
+ """Forces a deep reset on all nodes in the graph."""
1826
+
1334
1827
  for node in self.nodes:
1335
1828
  node.reset(True)
1336
1829
 
1337
1830
  def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
1338
- """Copy an existing node with potentially new settings."""
1831
+ """Creates a copy of an existing node.
1832
+
1833
+ Args:
1834
+ new_node_settings: The promise containing new settings (like ID and position).
1835
+ existing_setting_input: The settings object from the node being copied.
1836
+ node_type: The type of the node being copied.
1837
+ """
1339
1838
  self.add_node_promise(new_node_settings)
1340
1839
 
1341
1840
  if isinstance(existing_setting_input, input_schema.NodePromise):
@@ -1346,69 +1845,26 @@ class FlowGraph:
1346
1845
  )
1347
1846
  getattr(self, f"add_{node_type}")(combined_settings)
1348
1847
 
1848
+ def generate_code(self):
1849
+ """Generates code for the flow graph.
1850
+ This method exports the flow graph to a Polars-compatible format.
1851
+ """
1852
+ from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars
1853
+ print(export_flow_to_polars(self))
1349
1854
 
1350
- def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
1351
- """
1352
- Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
1855
+
1856
+ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1857
+ """Merges settings from an existing object with new settings from a NodePromise.
1858
+
1859
+ Typically used when copying a node to apply a new ID and position.
1353
1860
 
1354
1861
  Args:
1355
- *flow_graphs: Multiple FlowGraph instances to combine
1862
+ setting_input: The original settings object.
1863
+ new_settings: The NodePromise with new positional and ID data.
1356
1864
 
1357
1865
  Returns:
1358
- A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
1359
-
1360
- Raises:
1361
- ValueError: If any flow_ids overlap
1866
+ A new settings object with the merged properties.
1362
1867
  """
1363
- # Validate flow IDs are unique
1364
- _validate_unique_flow_ids(flow_graphs)
1365
-
1366
- # Create ID mapping for all nodes
1367
- node_id_mapping = _create_node_id_mapping(flow_graphs)
1368
-
1369
- # Remap and combine nodes
1370
- all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
1371
-
1372
- # Create a new combined flow graph
1373
- combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
1374
- # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
1375
-
1376
-
1377
- def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
1378
- """Ensure all flow graphs have unique flow_ids."""
1379
- all_flow_ids = [fg.flow_id for fg in flow_graphs]
1380
- if len(all_flow_ids) != len(set(all_flow_ids)):
1381
- raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
1382
-
1383
-
1384
- def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
1385
- """Create a mapping from original node IDs to new unique node IDs."""
1386
- node_id_mapping: Dict[int, Dict[int, int]] = {}
1387
- next_node_id = 0
1388
-
1389
- for fg in flow_graphs:
1390
- node_id_mapping[fg.flow_id] = {}
1391
- for node in fg.nodes:
1392
- node_id_mapping[fg.flow_id][node.node_id] = next_node_id
1393
- next_node_id += 1
1394
-
1395
- return node_id_mapping
1396
-
1397
-
1398
- def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
1399
- node_id_mapping: Dict[int, Dict[int, int]]) -> List:
1400
- """Create new nodes with remapped IDs."""
1401
- all_nodes = []
1402
- for fg in flow_graphs:
1403
- for node in fg.nodes:
1404
- new_node = copy.deepcopy(node)
1405
- new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
1406
- all_nodes.append(new_node)
1407
- return all_nodes
1408
-
1409
-
1410
- def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
1411
- """Combine excopy_nodeisting settings with new settings from a NodePromise."""
1412
1868
  copied_setting_input = deepcopy(setting_input)
1413
1869
 
1414
1870
  # Update only attributes that exist on new_settings
@@ -1427,7 +1883,13 @@ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings:
1427
1883
  return copied_setting_input
1428
1884
 
1429
1885
 
1430
- def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
1886
+ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection) -> None:
1887
+ """Adds a connection between two nodes in the flow graph.
1888
+
1889
+ Args:
1890
+ flow: The FlowGraph instance to modify.
1891
+ node_connection: An object defining the source and target of the connection.
1892
+ """
1431
1893
  logger.info('adding a connection')
1432
1894
  from_node = flow.get_node(node_connection.output_connection.node_id)
1433
1895
  to_node = flow.get_node(node_connection.input_connection.node_id)
@@ -1439,7 +1901,12 @@ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection
1439
1901
 
1440
1902
 
1441
1903
  def delete_connection(graph, node_connection: input_schema.NodeConnection):
1442
- """Delete the connection between two nodes."""
1904
+ """Deletes a connection between two nodes in the flow graph.
1905
+
1906
+ Args:
1907
+ graph: The FlowGraph instance to modify.
1908
+ node_connection: An object defining the connection to be removed.
1909
+ """
1443
1910
  from_node = graph.get_node(node_connection.output_connection.node_id)
1444
1911
  to_node = graph.get_node(node_connection.input_connection.node_id)
1445
1912
  connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
@@ -1455,6 +1922,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
1455
1922
  to_node.delete_input_node(
1456
1923
  node_connection.output_connection.node_id,
1457
1924
  connection_type=node_connection.input_connection.connection_class,
1458
- )
1459
-
1460
-
1925
+ )