Flowfile 0.3.6-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
--- flowfile_core/flowfile/flow_graph.py (0.3.6)
+++ flowfile_core/flowfile/flow_graph.py (0.3.7)
@@ -2,7 +2,7 @@ import datetime
 import pickle
 import polars as pl
 import fastexcel
-import
+import re
 from fastapi.exceptions import HTTPException
 from time import time
 from functools import partial
@@ -11,6 +11,7 @@ from uuid import uuid1
 from copy import deepcopy
 from pyarrow.parquet import ParquetFile
 from flowfile_core.configs import logger
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.configs.flow_logger import FlowLogger
 from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
@@ -23,8 +24,10 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
     get_calamine_xlsx_data_types
 from flowfile_core.flowfile.sources import external_sources
 from flowfile_core.schemas import input_schema, schemas, transform_schema
-from flowfile_core.schemas.output_model import
-from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
+from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
+from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
+                                                         CloudStorageWriteSettingsInternal,
+                                                         FullCloudStorageConnection,
                                                          get_cloud_storage_write_settings_worker_interface, AuthMethod)
 from flowfile_core.flowfile.utils import snake_case_to_camel_case
 from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
@@ -45,6 +48,21 @@ from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layou
 
 def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
                     end_row: int, end_column: int, has_headers: bool):
+    """Calculates the schema of an XLSX file by reading a sample of rows.
+
+    Args:
+        engine: The engine to use for reading ('openpyxl' or 'calamine').
+        file_path: The path to the XLSX file.
+        sheet_name: The name of the sheet to read.
+        start_row: The starting row for data reading.
+        start_column: The starting column for data reading.
+        end_row: The ending row for data reading.
+        end_column: The ending column for data reading.
+        has_headers: A boolean indicating if the file has a header row.
+
+    Returns:
+        A list of FlowfileColumn objects representing the schema.
+    """
     try:
         logger.info('Starting to calculate the schema')
         if engine == 'openpyxl':
@@ -67,26 +85,69 @@ def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int
 
 
 def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
+    """Logs a warning message listing all nodes that will be skipped during execution.
+
+    Args:
+        flow_logger: The logger instance for the flow.
+        nodes: A list of FlowNode objects to be skipped.
+    """
     if len(nodes) > 0:
         msg = "\n".join(str(node) for node in nodes)
         flow_logger.warning(f'skipping nodes:\n{msg}')
 
 
 def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
+    """Logs an informational message showing the determined execution order of nodes.
+
+    Args:
+        flow_logger: The logger instance for the flow.
+        nodes: A list of FlowNode objects in the order they will be executed.
+    """
     msg = "\n".join(str(node) for node in nodes)
     flow_logger.info(f'execution order:\n{msg}')
 
 
 def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
                              end_row: int, end_column: int, has_headers: bool):
+    """Creates a partially applied function for lazy calculation of an XLSX schema.
+
+    Args:
+        engine: The engine to use for reading.
+        file_path: The path to the XLSX file.
+        sheet_name: The name of the sheet.
+        start_row: The starting row.
+        start_column: The starting column.
+        end_row: The ending row.
+        end_column: The ending column.
+        has_headers: A boolean indicating if the file has headers.
+
+    Returns:
+        A callable function that, when called, will execute `get_xlsx_schema`.
+    """
     return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
                    start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
 
 
-def get_cloud_connection_settings(connection_name: str,
+def get_cloud_connection_settings(connection_name: str,
+                                  user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
+    """Retrieves cloud storage connection settings, falling back to environment variables if needed.
+
+    Args:
+        connection_name: The name of the saved connection.
+        user_id: The ID of the user owning the connection.
+        auth_mode: The authentication method specified by the user.
+
+    Returns:
+        A FullCloudStorageConnection object with the connection details.
+
+    Raises:
+        HTTPException: If the connection settings cannot be found.
+    """
     cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
-    if cloud_connection_settings is None and auth_mode
+    if cloud_connection_settings is None and auth_mode in ("env_vars", "auto"):
         # If the auth mode is aws-cli, we do not need connection settings
+        cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="env_vars")
+    elif cloud_connection_settings is None and auth_mode == "aws-cli":
         cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
     if cloud_connection_settings is None:
         raise HTTPException(status_code=400, detail="Cloud connection settings not found")
@@ -94,18 +155,10 @@ def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode:
 
 
 class FlowGraph:
-    """
-    FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
-    on data. It allows you to create a Directed Acyclic Graph (DAG) where each
-    node represents a step in the ETL pipeline.
+    """A class representing a Directed Acyclic Graph (DAG) for data processing pipelines.
 
-
-
-
-    Attributes:
-        _input_cols (set): A set that stores the input columns for the transformations.
-        _output_cols (set): A set that stores the output columns from the transformations.
-    """
+    It manages nodes, connections, and the execution of the entire flow.
+    """
     uuid: str
     depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
     _flow_id: int
@@ -127,13 +180,27 @@ class FlowGraph:
     flow_settings: schemas.FlowSettings = None
     flow_logger: FlowLogger
 
-    def __init__(self,
-                 flow_settings: schemas.FlowSettings,
+    def __init__(self,
+                 flow_settings: schemas.FlowSettings | schemas.FlowGraphConfig,
                  name: str = None, input_cols: List[str] = None,
                  output_cols: List[str] = None,
                  path_ref: str = None,
                  input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
                  cache_results: bool = False):
+        """Initializes a new FlowGraph instance.
+
+        Args:
+            flow_settings: The configuration settings for the flow.
+            name: The name of the flow.
+            input_cols: A list of input column names.
+            output_cols: A list of output column names.
+            path_ref: An optional path to an initial data source.
+            input_flow: An optional existing data object to start the flow with.
+            cache_results: A global flag to enable or disable result caching.
+        """
+        if isinstance(flow_settings, schemas.FlowGraphConfig):
+            flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
+
         self.flow_settings = flow_settings
         self.uuid = str(uuid1())
         self.nodes_completed = 0
@@ -141,8 +208,8 @@ class FlowGraph:
         self.end_datetime = None
         self.latest_run_info = None
         self.node_results = []
-        self._flow_id = flow_id
-        self.flow_logger = FlowLogger(flow_id)
+        self._flow_id = flow_settings.flow_id
+        self.flow_logger = FlowLogger(flow_settings.flow_id)
         self._flow_starts: List[FlowNode] = []
         self._results = None
         self.schema = None
@@ -160,7 +227,13 @@ class FlowGraph:
             self.add_datasource(input_file=input_flow)
 
     def add_node_promise(self, node_promise: input_schema.NodePromise):
+        """Adds a placeholder node to the graph that is not yet fully configured.
 
+        Useful for building the graph structure before all settings are available.
+
+        Args:
+            node_promise: A promise object containing basic node information.
+        """
         def placeholder(n: FlowNode = None):
             if n is None:
                 return FlowDataEngine()
@@ -169,10 +242,75 @@ class FlowGraph:
         self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
                            setting_input=node_promise)
 
-    def
+    def print_tree(self, show_schema=False, show_descriptions=False):
+        """
+        Print flow_graph as a tree.
         """
-
-
+        max_node_id = max(self._node_db.keys())
+
+        tree = ""
+        tabs = 0
+        tab_counter = 0
+        for node in self.nodes:
+            tab_counter += 1
+            node_input = node.setting_input
+            operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
+
+            if operation == "Formula":
+                operation = "With Columns"
+
+            tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
+
+            if show_descriptions & show_schema:
+                raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
+            if show_descriptions:
+                tree += ": " + str(node_input.description)
+            elif show_schema:
+                tree += " -> ["
+                if operation == "Manual Input":
+                    schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
+                    tree += schema
+                elif operation == "With Columns":
+                    tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
+                    tree += schema + tree_with_col_schema
+                elif operation == "Filter":
+                    index = node_input.filter_input.advanced_filter.find("]")
+                    filtered_column = str(node_input.filter_input.advanced_filter[1:index])
+                    schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
+                    tree += schema
+                elif operation == "Group By":
+                    for col in node_input.groupby_input.agg_cols:
+                        schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
+                    tree += schema
+                tree += "]"
+            else:
+                if operation == "Manual Input":
+                    tree += ": " + str(node_input.raw_data_format.data)
+                elif operation == "With Columns":
+                    tree += ": " + str(node_input.function)
+                elif operation == "Filter":
+                    tree += ": " + str(node_input.filter_input.advanced_filter)
+                elif operation == "Group By":
+                    tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
+                    tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
+
+            if node_input.node_id < max_node_id:
+                tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
+        print("\n"*2)
+
+        return print(tree)
+
+
+
+    def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
+        """Calculates and applies a layered layout to all nodes in the graph.
+
+        This updates their x and y positions for UI rendering.
+
+        Args:
+            y_spacing: The vertical spacing between layers.
+            x_spacing: The horizontal spacing between nodes in the same layer.
+            initial_y: The initial y-position for the first layer.
         """
         self.flow_logger.info("Applying layered layout...")
         start_time = time()
@@ -199,7 +337,7 @@ class FlowGraph:
                 else:
                     self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
             elif node:
-
+                self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
             # else: Node not found, already warned by calculate_layered_layout
 
         end_time = time()
@@ -207,51 +345,20 @@ class FlowGraph:
 
         except Exception as e:
             self.flow_logger.error(f"Error applying layout: {e}")
-            raise
-
-    def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
-        node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
-        self.add_explore_data(node_analysis)
-
-    def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
-        sample_size: int = 10000
-
-        def analysis_preparation(flowfile_table: FlowDataEngine):
-            if flowfile_table.number_of_records <= 0:
-                number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
-            else:
-                number_of_records = flowfile_table.number_of_records
-            if number_of_records > sample_size:
-                flowfile_table = flowfile_table.get_sample(sample_size, random=True)
-            external_sampler = ExternalDfFetcher(
-                lf=flowfile_table.data_frame,
-                file_ref="__gf_walker"+node.hash,
-                wait_on_completion=True,
-                node_id=node.node_id,
-                flow_id=self.flow_id,
-            )
-            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
-            return flowfile_table
-
-        def schema_callback():
-            node = self.get_node(node_analysis.node_id)
-            if len(node.all_inputs) == 1:
-                input_node = node.all_inputs[0]
-                return input_node.schema
-            else:
-                return [FlowfileColumn.from_input('col_1', 'na')]
-
-        self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
-                           function=analysis_preparation,
-                           setting_input=node_analysis, schema_callback=schema_callback)
-        node = self.get_node(node_analysis.node_id)
+            raise  # Optional: re-raise the exception
 
     @property
     def flow_id(self) -> int:
+        """Gets the unique identifier of the flow."""
         return self._flow_id
 
     @flow_id.setter
     def flow_id(self, new_id: int):
+        """Sets the unique identifier for the flow and updates all child nodes.
+
+        Args:
+            new_id: The new flow ID.
+        """
         self._flow_id = new_id
         for node in self.nodes:
             if hasattr(node.setting_input, 'flow_id'):
@@ -259,23 +366,35 @@ class FlowGraph:
                 self.flow_settings.flow_id = new_id
 
     def __repr__(self):
-        """
-        Official string representation of the FlowGraph class.
-        """
+        """Provides the official string representation of the FlowGraph instance."""
        settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
        return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
 
     def get_nodes_overview(self):
+        """Gets a list of dictionary representations for all nodes in the graph."""
        output = []
        for v in self._node_db.values():
            output.append(v.get_repr())
        return output
 
     def remove_from_output_cols(self, columns: List[str]):
+        """Removes specified columns from the list of expected output columns.
+
+        Args:
+            columns: A list of column names to remove.
+        """
         cols = set(columns)
         self._output_cols = [c for c in self._output_cols if c not in cols]
 
-    def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
+    def get_node(self, node_id: Union[int, str] = None) -> FlowNode | None:
+        """Retrieves a node from the graph by its ID.
+
+        Args:
+            node_id: The ID of the node to retrieve. If None, retrieves the last added node.
+
+        Returns:
+            The FlowNode object, or None if not found.
+        """
         if node_id is None:
             node_id = self._node_ids[-1]
         node = self._node_db.get(node_id)
@@ -283,6 +402,12 @@ class FlowGraph:
         return node
 
     def add_pivot(self, pivot_settings: input_schema.NodePivot):
+        """Adds a pivot node to the graph.
+
+        Args:
+            pivot_settings: The settings for the pivot operation.
+        """
+
         def _func(fl: FlowDataEngine):
             return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
 
@@ -302,6 +427,11 @@ class FlowGraph:
         node.schema_callback = schema_callback
 
     def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
+        """Adds an unpivot node to the graph.
+
+        Args:
+            unpivot_settings: The settings for the unpivot operation.
+        """
 
         def _func(fl: FlowDataEngine) -> FlowDataEngine:
             return fl.unpivot(unpivot_settings.unpivot_input)
@@ -313,6 +443,12 @@ class FlowGraph:
                            input_node_ids=[unpivot_settings.depending_on_id])
 
     def add_union(self, union_settings: input_schema.NodeUnion):
+        """Adds a union node to combine multiple data streams.
+
+        Args:
+            union_settings: The settings for the union operation.
+        """
+
         def _func(*flowfile_tables: FlowDataEngine):
             dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
             return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
@@ -323,7 +459,59 @@ class FlowGraph:
                            setting_input=union_settings,
                            input_node_ids=union_settings.depending_on_ids)
 
+    def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
+        """Adds a data exploration/analysis node based on a node promise.
+
+        Args:
+            node_promise: The promise representing the node to be analyzed.
+        """
+        node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
+        self.add_explore_data(node_analysis)
+
+    def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
+        """Adds a specialized node for data exploration and visualization.
+
+        Args:
+            node_analysis: The settings for the data exploration node.
+        """
+        sample_size: int = 10000
+
+        def analysis_preparation(flowfile_table: FlowDataEngine):
+            if flowfile_table.number_of_records <= 0:
+                number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
+            else:
+                number_of_records = flowfile_table.number_of_records
+            if number_of_records > sample_size:
+                flowfile_table = flowfile_table.get_sample(sample_size, random=True)
+            external_sampler = ExternalDfFetcher(
+                lf=flowfile_table.data_frame,
+                file_ref="__gf_walker"+node.hash,
+                wait_on_completion=True,
+                node_id=node.node_id,
+                flow_id=self.flow_id,
+            )
+            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
+            return flowfile_table
+
+        def schema_callback():
+            node = self.get_node(node_analysis.node_id)
+            if len(node.all_inputs) == 1:
+                input_node = node.all_inputs[0]
+                return input_node.schema
+            else:
+                return [FlowfileColumn.from_input('col_1', 'na')]
+
+        self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
+                           function=analysis_preparation,
+                           setting_input=node_analysis, schema_callback=schema_callback)
+        node = self.get_node(node_analysis.node_id)
+
     def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
+        """Adds a group-by aggregation node to the graph.
+
+        Args:
+            group_by_settings: The settings for the group-by operation.
+        """
 
         def _func(fl: FlowDataEngine) -> FlowDataEngine:
             return fl.do_group_by(group_by_settings.groupby_input, False)
@@ -337,6 +525,7 @@ class FlowGraph:
         node = self.get_node(group_by_settings.node_id)
 
         def schema_callback():
+
             output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
             depends_on = node.node_inputs.main_inputs[0]
             input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
@@ -348,22 +537,13 @@ class FlowGraph:
 
         node.schema_callback = schema_callback
 
-    def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
-        col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
-        schema = depends_on.schema
-        col_exist = depends_on.get_flow_file_column_schema(col_name)
-        if col_exist is None:
-            new_schema = schema + [col_output]
-        else:
-            new_schema = []
-            for s in self.schema:
-                if s.name == col_name:
-                    new_schema.append(col_output)
-                else:
-                    new_schema.append(s)
-        return new_schema
-
     def add_filter(self, filter_settings: input_schema.NodeFilter):
+        """Adds a filter node to the graph.
+
+        Args:
+            filter_settings: The settings for the filter operation.
+        """
+
         is_advanced = filter_settings.filter_input.filter_type == 'advanced'
         if is_advanced:
             predicate = filter_settings.filter_input.advanced_filter
@@ -397,6 +577,12 @@ class FlowGraph:
         )
 
     def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
+        """Adds a filter node to the graph.
+
+        Args:
+            node_number_of_records: The settings for the record count operation.
+        """
+
         def _func(fl: FlowDataEngine) -> FlowDataEngine:
             return fl.get_record_count()
 
@@ -407,9 +593,14 @@ class FlowGraph:
                            input_node_ids=[node_number_of_records.depending_on_id])
 
     def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
+        """Adds a node that executes custom Polars code.
+
+        Args:
+            node_polars_code: The settings for the Polars code node.
+        """
+
         def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
             return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
-
         self.add_node_step(node_id=node_polars_code.node_id,
                            function=_func,
                            node_type='polars_code',
@@ -422,7 +613,31 @@ class FlowGraph:
             node = self.get_node(node_id=node_polars_code.node_id)
             node.results.errors = str(e)
 
+    def add_dependency_on_polars_lazy_frame(self,
+                                            lazy_frame: pl.LazyFrame,
+                                            node_id: int):
+        """Adds a special node that directly injects a Polars LazyFrame into the graph.
+
+        Note: This is intended for backend use and will not work in the UI editor.
+
+        Args:
+            lazy_frame: The Polars LazyFrame to inject.
+            node_id: The ID for the new node.
+        """
+        def _func():
+            return FlowDataEngine(lazy_frame)
+        node_promise = input_schema.NodePromise(flow_id=self.flow_id,
+                                                node_id=node_id, node_type="polars_lazy_frame",
+                                                is_setup=True)
+        self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=_func,
+                           setting_input=node_promise)
+
     def add_unique(self, unique_settings: input_schema.NodeUnique):
+        """Adds a node to find and remove duplicate rows.
+
+        Args:
+            unique_settings: The settings for the unique operation.
+        """
 
         def _func(fl: FlowDataEngine) -> FlowDataEngine:
             return fl.make_unique(unique_settings.unique_input)
@@ -435,6 +650,16 @@ class FlowGraph:
                            input_node_ids=[unique_settings.depending_on_id])
 
     def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
+        """Adds a node that solves graph-like problems within the data.
+
+        This node can be used for operations like finding network paths,
+        calculating connected components, or performing other graph algorithms
+        on relational data that represents nodes and edges.
+
+        Args:
+            graph_solver_settings: The settings object defining the graph inputs
+                and the specific algorithm to apply.
+        """
         def _func(fl: FlowDataEngine) -> FlowDataEngine:
             return fl.solve_graph(graph_solver_settings.graph_solver_input)
 
@@ -445,6 +670,12 @@ class FlowGraph:
                            input_node_ids=[graph_solver_settings.depending_on_id])
 
     def add_formula(self, function_settings: input_schema.NodeFormula):
+        """Adds a node that applies a formula to create or modify a column.
+
+        Args:
+            function_settings: The settings for the formula operation.
+        """
+
         error = ""
         if function_settings.function.field.data_type not in (None, "Auto"):
             output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
@@ -476,6 +707,14 @@ class FlowGraph:
         return True, ""
 
     def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
+        """Adds a cross join node to the graph.
+
+        Args:
+            cross_join_settings: The settings for the cross join operation.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
 
         def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
             for left_select in cross_join_settings.cross_join_input.left_select.renames:
@@ -497,6 +736,15 @@ class FlowGraph:
         return self
 
     def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
+        """Adds a join node to combine two data streams based on key columns.
+
+        Args:
+            join_settings: The settings for the join operation.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
+
         def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
             for left_select in join_settings.join_input.left_select.renames:
                 left_select.is_available = True if left_select.old_name in main.schema else False
@@ -517,6 +765,15 @@ class FlowGraph:
         return self
 
     def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
+        """Adds a fuzzy matching node to join data on approximate string matches.
+
+        Args:
+            fuzzy_settings: The settings for the fuzzy match operation.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
+
         def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
             f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
                                       flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
@@ -541,6 +798,18 @@ class FlowGraph:
         return self
 
     def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
+        """Adds a node that splits cell values into multiple rows.
+
+        This is useful for un-nesting data where a single field contains multiple
+        values separated by a delimiter.
+
+        Args:
+            node_text_to_rows: The settings object that specifies the column to split
+                and the delimiter to use.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
         def _func(table: FlowDataEngine) -> FlowDataEngine:
             return table.split(node_text_to_rows.text_to_rows_input)
 
@@ -552,6 +821,15 @@ class FlowGraph:
         return self
 
     def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
+        """Adds a node to sort the data based on one or more columns.
+
+        Args:
+            sort_settings: The settings for the sort operation.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
+
         def _func(table: FlowDataEngine) -> FlowDataEngine:
             return table.do_sort(sort_settings.sort_input)
 
@@ -563,6 +841,14 @@ class FlowGraph:
         return self
 
     def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
+        """Adds a node to take a random or top-N sample of the data.
+
+        Args:
+            sample_settings: The settings object specifying the size of the sample.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
         def _func(table: FlowDataEngine) -> FlowDataEngine:
             return table.get_sample(sample_settings.sample_size)
 
@@ -575,6 +861,15 @@ class FlowGraph:
         return self
 
     def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
+        """Adds a node to create a new column with a unique ID for each record.
+
+        Args:
+            record_id_settings: The settings object specifying the name of the
+                new record ID column.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
 
         def _func(table: FlowDataEngine) -> FlowDataEngine:
             return table.add_record_id(record_id_settings.record_id_input)
@@ -588,6 +883,15 @@ class FlowGraph:
         return self
 
     def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
+        """Adds a node to select, rename, reorder, or drop columns.
+
+        Args:
+            select_settings: The settings for the select operation.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
+
         select_cols = select_settings.select_input
         drop_cols = tuple(s.old_name for s in select_settings.select_input)
 
@@ -621,9 +925,18 @@ class FlowGraph:
 
     @property
     def graph_has_functions(self) -> bool:
+        """Checks if the graph has any nodes."""
         return len(self._node_ids) > 0
 
     def delete_node(self, node_id: Union[int, str]):
+        """Deletes a node from the graph and updates all its connections.
+
+        Args:
+            node_id: The ID of the node to delete.
+
+        Raises:
+            Exception: If the node with the given ID does not exist.
+        """
         logger.info(f"Starting deletion of node with ID: {node_id}")
 
         node = self._node_db.get(node_id)
@@ -656,6 +969,7 @@ class FlowGraph:
 
     @property
     def graph_has_input_data(self) -> bool:
+        """Checks if the graph has an initial input data source."""
         return self._input_data is not None
 
     def add_node_step(self,
@@ -670,6 +984,24 @@ class FlowGraph:
                       cache_results: bool = None,
                       schema_callback: Callable = None,
                       input_node_ids: List[int] = None) -> FlowNode:
+        """The core method for adding or updating a node in the graph.
+
+        Args:
+            node_id: The unique ID for the node.
+            function: The core processing function for the node.
+            input_columns: A list of input column names required by the function.
+            output_schema: A predefined schema for the node's output.
+            node_type: A string identifying the type of node (e.g., 'filter', 'join').
+            drop_columns: A list of columns to be dropped after the function executes.
+            renew_schema: If True, the schema is recalculated after execution.
+            setting_input: A configuration object containing settings for the node.
+            cache_results: If True, the node's results are cached for future runs.
+            schema_callback: A function that dynamically calculates the output schema.
+            input_node_ids: A list of IDs for the nodes that this node depends on.
+
+        Returns:
+            The created or updated FlowNode object.
+        """
         existing_node = self.get_node(node_id)
         if existing_node is not None:
             if existing_node.node_type != node_type:
@@ -686,9 +1018,8 @@ class FlowGraph:
         if (
                 input_nodes is not None or
                 function.__name__ in ('placeholder', 'analysis_preparation') or
-                node_type
+                node_type in ("cloud_storage_reader", "polars_lazy_frame", "input_data")
         ):
-
             if not existing_node:
                 node = FlowNode(node_id=node_id,
                                 function=function,
@@ -709,8 +1040,6 @@ class FlowGraph:
                             setting_input=setting_input,
                             schema_callback=schema_callback)
                 node = existing_node
-        elif node_type == 'input_data':
-            node = None
         else:
             raise Exception("No data initialized")
         self._node_db[node_id] = node
@@ -718,6 +1047,11 @@ class FlowGraph:
         return node
 
     def add_include_cols(self, include_columns: List[str]):
+        """Adds columns to both the input and output column lists.
+
+        Args:
+            include_columns: A list of column names to include.
+        """
         for column in include_columns:
             if column not in self._input_cols:
                 self._input_cols.append(column)
@@ -726,6 +1060,12 @@ class FlowGraph:
         return self
 
     def add_output(self, output_file: input_schema.NodeOutput):
+        """Adds an output node to write the final data to a destination.
+
+        Args:
+            output_file: The settings for the output file.
+        """
+
         def _func(df: FlowDataEngine):
             output_file.output_settings.populate_abs_file_path()
             execute_remote = self.execution_location != 'local'
@@ -747,7 +1087,12 @@ class FlowGraph:
                            input_node_ids=[input_node_id])
 
     def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
-
+        """Adds a node to write data to a database.
+
+        Args:
+            node_database_writer: The settings for the database writer node.
+        """
+
         node_type = 'database_writer'
         database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
         database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
@@ -795,6 +1140,12 @@ class FlowGraph:
         node = self.get_node(node_database_writer.node_id)
 
     def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
+        """Adds a node to read data from a database.
+
+        Args:
+            node_database_reader: The settings for the database reader node.
+        """
+
         logger.info("Adding database reader")
         node_type = 'database_reader'
         database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
@@ -868,15 +1219,27 @@ class FlowGraph:
         self._node_ids.append(node_database_reader.node_id)
 
     def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
+        """Adds a node that reads data from a SQL source.
+
+        This is a convenience alias for `add_external_source`.
+
+        Args:
+            external_source_input: The settings for the external SQL source node.
+        """
         logger.info('Adding sql source')
         self.add_external_source(external_source_input)
 
     def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
+        """Adds a node to write data to a cloud storage provider.
 
-
+        Args:
+            node_cloud_storage_writer: The settings for the cloud storage writer node.
+        """
 
+        node_type = "cloud_storage_writer"
         def _func(df: FlowDataEngine):
             df.lazy = True
+            execute_remote = self.execution_location != 'local'
             cloud_connection_settings = get_cloud_connection_settings(
                 connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
                 user_id=node_cloud_storage_writer.user_id,
@@ -888,15 +1251,22 @@ class FlowGraph:
                 aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
                 **CloudStorageReader.get_storage_options(cloud_connection_settings)
             )
-
-
-
-
-
-
-
-
-
+            if execute_remote:
+                settings = get_cloud_storage_write_settings_worker_interface(
+                    write_settings=node_cloud_storage_writer.cloud_storage_settings,
+                    connection=full_cloud_storage_connection,
+                    lf=df.data_frame,
+                    flowfile_node_id=node_cloud_storage_writer.node_id,
+                    flowfile_flow_id=self.flow_id)
+                external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
+                node._fetch_cached_df = external_database_writer
+                external_database_writer.get_result()
+            else:
+                cloud_storage_write_settings_internal = CloudStorageWriteSettingsInternal(
+                    connection=full_cloud_storage_connection,
+                    write_settings=node_cloud_storage_writer.cloud_storage_settings,
+                )
+                df.to_cloud_storage_obj(cloud_storage_write_settings_internal)
             return df
 
         def schema_callback():
@@ -919,12 +1289,10 @@ class FlowGraph:
         node = self.get_node(node_cloud_storage_writer.node_id)
 
     def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
-        """
-
+        """Adds a cloud storage read node to the flow graph.
+
         Args:
-            node_cloud_storage_reader
-                The settings for the cloud storage read node.
-        Returns:
+            node_cloud_storage_reader: The settings for the cloud storage read node.
         """
         node_type = "cloud_storage_reader"
         logger.info("Adding cloud storage reader")
@@ -953,6 +1321,11 @@ class FlowGraph:
 
     def add_external_source(self,
                             external_source_input: input_schema.NodeExternalSource):
+        """Adds a node for a custom external data source.
+
+        Args:
+            external_source_input: The settings for the external source node.
+        """
 
         node_type = 'external_source'
         external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
@@ -1009,6 +1382,12 @@ class FlowGraph:
                            setting_input=external_source_input)
 
     def add_read(self, input_file: input_schema.NodeRead):
+        """Adds a node to read data from a local file (e.g., CSV, Parquet, Excel).
+
+        Args:
+            input_file: The settings for the read operation.
+        """
+
         if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
             sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
             input_file.received_file.sheet_name = sheet_name
@@ -1077,7 +1456,18 @@ class FlowGraph:
         node.schema_callback = schema_callback
         return self
 
-    def add_datasource(self, input_file: input_schema.NodeDatasource
+    def add_datasource(self, input_file: Union[input_schema.NodeDatasource, input_schema.NodeManualInput]) -> "FlowGraph":
+        """Adds a data source node to the graph.
+
+        This method serves as a factory for creating starting nodes, handling both
+        file-based sources and direct manual data entry.
+
+        Args:
+            input_file: The configuration object for the data source.
+
+        Returns:
+            The `FlowGraph` instance for method chaining.
+        """
         if isinstance(input_file, input_schema.NodeManualInput):
             input_data = FlowDataEngine(input_file.raw_data_format)
             ref = 'manual_input'
@@ -1103,29 +1493,35 @@ class FlowGraph:
         return self
 
     def add_manual_input(self, input_file: input_schema.NodeManualInput):
+        """Adds a node for manual data entry.
+
+        This is a convenience alias for `add_datasource`.
+
+        Args:
+            input_file: The settings and data for the manual input node.
+        """
         self.add_datasource(input_file)
 
     @property
     def nodes(self) -> List[FlowNode]:
-
+        """Gets a list of all FlowNode objects in the graph."""
 
-
-        not_filled_cols = set(expected_cols) - set(self._output_cols)
-        cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
-        self._output_cols += cols_available
-
-    @property
-    def input_data_columns(self) -> List[str] | None:
-        if self._input_cols:
-            return list(set([col for col in self._input_cols if
-                             col in [table_col.name for table_col in self._input_data.schema]]))
+        return list(self._node_db.values())
 
     @property
-    def execution_mode(self) ->
+    def execution_mode(self) -> schemas.ExecutionModeLiteral:
+        """Gets the current execution mode ('Development' or 'Performance')."""
         return self.flow_settings.execution_mode
 
     def get_implicit_starter_nodes(self) -> List[FlowNode]:
-        """
+        """Finds nodes that can act as starting points but are not explicitly defined as such.
+
+        Some nodes, like the Polars Code node, can function without an input. This
+        method identifies such nodes if they have no incoming connections.
+
+        Returns:
+            A list of `FlowNode` objects that are implicit starting nodes.
+        """
         starting_node_ids = [node.node_id for node in self._flow_starts]
         implicit_starting_nodes = []
         for node in self.nodes:
@@ -1135,17 +1531,39 @@ class FlowGraph:
 
     @execution_mode.setter
     def execution_mode(self, mode: schemas.ExecutionModeLiteral):
+        """Sets the execution mode for the flow.
+
+        Args:
+            mode: The execution mode to set.
+        """
         self.flow_settings.execution_mode = mode
 
     @property
     def execution_location(self) -> schemas.ExecutionLocationsLiteral:
+        """Gets the current execution location."""
         return self.flow_settings.execution_location
 
     @execution_location.setter
     def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
+        """Sets the execution location for the flow.
+
+        Args:
+            execution_location: The execution location to set.
+        """
         self.flow_settings.execution_location = execution_location
 
-    def run_graph(self):
+    def run_graph(self) -> RunInformation | None:
+        """Executes the entire data flow graph from start to finish.
+
+        It determines the correct execution order, runs each node,
+        collects results, and handles errors and cancellations.
+
+        Returns:
+            A RunInformation object summarizing the execution results.
+
+        Raises:
+            Exception: If the flow is already running.
+        """
         if self.flow_settings.is_running:
             raise Exception('Flow is already running')
         try:
@@ -1167,6 +1585,8 @@ class FlowGraph:
         skip_node_message(self.flow_logger, skip_nodes)
         execution_order_message(self.flow_logger, execution_order)
         performance_mode = self.flow_settings.execution_mode == 'Performance'
+        if self.flow_settings.execution_location == 'local':
+            OFFLOAD_TO_WORKER.value = False
         for node in execution_order:
             node_logger = self.flow_logger.get_node_logger(node.node_id)
             if self.flow_settings.is_canceled:
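
The two added lines are the behavioral change in this hunk: when the flow is pinned to local execution, offloading to the worker process is switched off globally. `OFFLOAD_TO_WORKER` is defined elsewhere in the package; the sketch below is a hypothetical stand-in that shows the mutable-flag pattern its `.value` access implies:

    from dataclasses import dataclass

    @dataclass
    class MutableFlag:
        # Stand-in for the real OFFLOAD_TO_WORKER holder, which this diff doesn't show.
        value: bool = True

    OFFLOAD_TO_WORKER = MutableFlag()

    # Mirrors the new run_graph logic: local execution keeps computation in-process.
    execution_location = 'local'
    if execution_location == 'local':
        OFFLOAD_TO_WORKER.value = False
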
@@ -1215,6 +1635,11 @@ class FlowGraph:
         self.flow_settings.is_running = False
 
     def get_run_info(self) -> RunInformation:
+        """Gets a summary of the most recent graph execution.
+
+        Returns:
+            A RunInformation object with details about the last run.
+        """
         if self.latest_run_info is None:
             node_results = self.node_results
             success = all(nr.success for nr in node_results)
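
A short inspection sketch, assuming `graph` has just finished a run; the per-node `success` flags are taken from the `all(nr.success ...)` aggregation visible above:

    run_info = graph.get_run_info()
    failed = [nr for nr in graph.node_results if not nr.success]
    print(f'last run ok: {len(failed) == 0} ({len(failed)} failed node(s))')
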
@@ -1234,6 +1659,11 @@ class FlowGraph:
 
     @property
     def node_connections(self) -> List[Tuple[int, int]]:
+        """Computes and returns a list of all connections in the graph.
+
+        Returns:
+            A list of tuples, where each tuple is a (source_id, target_id) pair.
+        """
         connections = set()
         for node in self.nodes:
             outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
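
Because `node_connections` returns plain `(source_id, target_id)` pairs, downstream tooling can rebuild the DAG without touching `FlowNode` internals. An illustrative sketch using Kahn's algorithm; the real execution-order logic inside `run_graph` is not shown in this diff:

    from collections import defaultdict, deque
    from typing import Dict, List, Tuple

    def topological_order(connections: List[Tuple[int, int]]) -> List[int]:
        # Kahn's algorithm over the pairs returned by FlowGraph.node_connections.
        incoming: Dict[int, int] = defaultdict(int)
        outgoing: Dict[int, List[int]] = defaultdict(list)
        nodes = set()
        for src, dst in connections:
            outgoing[src].append(dst)
            incoming[dst] += 1
            nodes.update((src, dst))
        queue = deque(sorted(n for n in nodes if incoming[n] == 0))
        order: List[int] = []
        while queue:
            current = queue.popleft()
            order.append(current)
            for nxt in outgoing[current]:
                incoming[nxt] -= 1
                if incoming[nxt] == 0:
                    queue.append(nxt)
        return order

    print(topological_order([(1, 2), (1, 3), (2, 4), (3, 4)]))  # -> [1, 2, 3, 4]
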
@@ -1245,28 +1675,30 @@ class FlowGraph:
                 connections.add(node_connection)
         return list(connections)
 
-    def
-
-        if len(self._node_ids) > 0:
-            self.schema = self._node_db[self._node_ids[0]].schema
-            return self.schema
+    def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
+        """Retrieves all data needed to render a node in the UI.
 
-
-
-
+        Args:
+            node_id: The ID of the node.
+            include_example: Whether to include data samples in the result.
 
-
+        Returns:
+            A NodeData object, or None if the node is not found.
+        """
         node = self._node_db[node_id]
         return node.get_node_data(flow_id=self.flow_id, include_example=include_example)
 
     def get_node_storage(self) -> schemas.FlowInformation:
+        """Serializes the entire graph's state into a storable format.
 
+        Returns:
+            A FlowInformation object representing the complete graph.
+        """
         node_information = {node.node_id: node.get_node_information() for
                             node in self.nodes if node.is_setup and node.is_correct}
 
         return schemas.FlowInformation(flow_id=self.flow_id,
                                        flow_name=self.__name__,
-                                       storage_location=self.flow_settings.path,
                                        flow_settings=self.flow_settings,
                                        data=node_information,
                                        node_starts=[v.node_id for v in self._flow_starts],
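
Note that `storage_location` is no longer passed to `FlowInformation`; the path is now tracked only on `flow_settings` (see `save_flow` below). A small usage sketch, assuming `graph` is a populated `FlowGraph` containing a node with ID 1:

    # Fetch UI render data without the potentially expensive example rows.
    node_data = graph.get_node_data(node_id=1, include_example=False)

    # Snapshot the whole graph; only nodes that are set up and valid are included.
    flow_info = graph.get_node_storage()
    print(flow_info.flow_name)   # attribute assumed from the flow_name kwarg above
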
@@ -1274,6 +1706,8 @@ class FlowGraph:
                                        )
 
     def cancel(self):
+        """Cancels an ongoing graph execution."""
+
         if not self.flow_settings.is_running:
             return
         self.flow_settings.is_canceled = True
@@ -1281,15 +1715,30 @@ class FlowGraph:
             node.cancel()
 
     def close_flow(self):
+        """Performs cleanup operations, such as clearing node caches."""
+
         for node in self.nodes:
             node.remove_cache()
 
     def save_flow(self, flow_path: str):
+        """Saves the current state of the flow graph to a file.
+
+        Args:
+            flow_path: The path where the flow file will be saved.
+        """
         with open(flow_path, 'wb') as f:
             pickle.dump(self.get_node_storage(), f)
         self.flow_settings.path = flow_path
 
-    def get_frontend_data(self):
+    def get_frontend_data(self) -> dict:
+        """Formats the graph structure into a JSON-like dictionary for a specific legacy frontend.
+
+        This method transforms the graph's state into a format compatible with the
+        Drawflow.js library.
+
+        Returns:
+            A dictionary representing the graph in Drawflow format.
+        """
         result = {
             'Home': {
                 "data": {}
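
`save_flow` is a plain pickle of the `FlowInformation` snapshot, so a round trip is symmetric. Sketch, assuming the reader side uses `pickle.load` (the loader itself is not part of this diff, and the file name is illustrative):

    import pickle

    graph.save_flow('my_flow.flowfile')        # also updates flow_settings.path

    with open('my_flow.flowfile', 'rb') as f:
        restored = pickle.load(f)              # a schemas.FlowInformation object
    print(restored.flow_id)
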
@@ -1360,6 +1809,11 @@ class FlowGraph:
         return result
 
     def get_vue_flow_input(self) -> schemas.VueFlowInput:
+        """Formats the graph's nodes and edges into a schema suitable for the VueFlow frontend.
+
+        Returns:
+            A VueFlowInput object.
+        """
         edges: List[schemas.NodeEdge] = []
         nodes: List[schemas.NodeInput] = []
         for node in self.nodes:
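
The `node_edges` and `node_inputs` field names are confirmed by the return statement in the next hunk. Quick sketch:

    flow_input = graph.get_vue_flow_input()
    print(len(flow_input.node_inputs), 'nodes and', len(flow_input.node_edges), 'edges')
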
@@ -1368,11 +1822,19 @@ class FlowGraph:
         return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)
 
     def reset(self):
+        """Forces a deep reset on all nodes in the graph."""
+
         for node in self.nodes:
             node.reset(True)
 
     def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
-        """
+        """Creates a copy of an existing node.
+
+        Args:
+            new_node_settings: The promise containing new settings (like ID and position).
+            existing_setting_input: The settings object from the node being copied.
+            node_type: The type of the node being copied.
+        """
         self.add_node_promise(new_node_settings)
 
         if isinstance(existing_setting_input, input_schema.NodePromise):
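
A hypothetical `copy_node` call; the `NodePromise` constructor fields and the settings accessor shown here are assumptions for illustration, not taken from this diff:

    promise = input_schema.NodePromise(flow_id=graph.flow_id, node_id=99,
                                       pos_x=120, pos_y=80)    # field names assumed
    existing = graph.get_node(3).setting_input                 # accessor assumed
    graph.copy_node(promise, existing, node_type='filter')
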
@@ -1383,69 +1845,26 @@ class FlowGraph:
         )
         getattr(self, f"add_{node_type}")(combined_settings)
 
+    def generate_code(self):
+        """Generates code for the flow graph.
+
+        This method exports the flow graph to a Polars-compatible format.
+        """
+        from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars
+        print(export_flow_to_polars(self))
 
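
The new `generate_code` simply prints the exporter's output; calling the exporter directly gives you the generated Polars code as a value. Sketch, assuming `export_flow_to_polars` returns printable source text (consistent with the `print(...)` above):

    from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars

    polars_code = export_flow_to_polars(graph)   # `graph` is a FlowGraph
    print(polars_code)
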
-
-
-
+
+def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
+    """Merges settings from an existing object with new settings from a NodePromise.
+
+    Typically used when copying a node to apply a new ID and position.
 
     Args:
-
+        setting_input: The original settings object.
+        new_settings: The NodePromise with new positional and ID data.
 
     Returns:
-        A new
-
-    Raises:
-        ValueError: If any flow_ids overlap
+        A new settings object with the merged properties.
     """
-    # Validate flow IDs are unique
-    _validate_unique_flow_ids(flow_graphs)
-
-    # Create ID mapping for all nodes
-    node_id_mapping = _create_node_id_mapping(flow_graphs)
-
-    # Remap and combine nodes
-    all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
-
-    # Create a new combined flow graph
-    combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
-    # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
-
-
-def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
-    """Ensure all flow graphs have unique flow_ids."""
-    all_flow_ids = [fg.flow_id for fg in flow_graphs]
-    if len(all_flow_ids) != len(set(all_flow_ids)):
-        raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
-
-
-def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
-    """Create a mapping from original node IDs to new unique node IDs."""
-    node_id_mapping: Dict[int, Dict[int, int]] = {}
-    next_node_id = 0
-
-    for fg in flow_graphs:
-        node_id_mapping[fg.flow_id] = {}
-        for node in fg.nodes:
-            node_id_mapping[fg.flow_id][node.node_id] = next_node_id
-            next_node_id += 1
-
-    return node_id_mapping
-
-
-def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
-                 node_id_mapping: Dict[int, Dict[int, int]]) -> List:
-    """Create new nodes with remapped IDs."""
-    all_nodes = []
-    for fg in flow_graphs:
-        for node in fg.nodes:
-            new_node = copy.deepcopy(node)
-            new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
-            all_nodes.append(new_node)
-    return all_nodes
-
-
-def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
-    """Combine excopy_nodeisting settings with new settings from a NodePromise."""
     copied_setting_input = deepcopy(setting_input)
 
     # Update only attributes that exist on new_settings
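
The merge itself (shown partially above and in the next hunk) is a deepcopy followed by selective attribute overwrites. An illustrative re-implementation of that pattern; the attribute names in the tuple are assumptions, not taken from this diff:

    from copy import deepcopy

    def merge_settings(setting_input, new_settings):
        copied = deepcopy(setting_input)
        # Overwrite only what the promise actually carries (names assumed).
        for attr in ('node_id', 'pos_x', 'pos_y'):
            if hasattr(new_settings, attr) and hasattr(copied, attr):
                setattr(copied, attr, getattr(new_settings, attr))
        return copied
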
@@ -1464,7 +1883,13 @@ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings:
     return copied_setting_input
 
 
-def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
+def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection) -> None:
+    """Adds a connection between two nodes in the flow graph.
+
+    Args:
+        flow: The FlowGraph instance to modify.
+        node_connection: An object defining the source and target of the connection.
+    """
     logger.info('adding a connection')
     from_node = flow.get_node(node_connection.output_connection.node_id)
     to_node = flow.get_node(node_connection.input_connection.node_id)
@@ -1476,7 +1901,12 @@ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection
 
 
 def delete_connection(graph, node_connection: input_schema.NodeConnection):
-    """
+    """Deletes a connection between two nodes in the flow graph.
+
+    Args:
+        graph: The FlowGraph instance to modify.
+        node_connection: An object defining the connection to be removed.
+    """
     from_node = graph.get_node(node_connection.output_connection.node_id)
    to_node = graph.get_node(node_connection.input_connection.node_id)
     connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
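
Both handlers resolve the two endpoints with `graph.get_node(...)` before mutating anything. Usage sketch, assuming `conn` is an already-built `input_schema.NodeConnection` (its constructor is not shown in this diff):

    add_connection(graph, conn)      # wires output -> input
    delete_connection(graph, conn)   # symmetric removal of the same edge
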
@@ -1492,6 +1922,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
         to_node.delete_input_node(
             node_connection.output_connection.node_id,
             connection_type=node_connection.input_connection.connection_class,
-        )
-
-
+        )