Flowfile 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +27 -6
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +4 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/RECORD +100 -98
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -2
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +598 -310
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +620 -192
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +510 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +42 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +233 -111
- flowfile_frame/flow_frame.pyi +137 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.8.dist-info}/entry_points.txt +0 -0
|
@@ -2,7 +2,7 @@ import datetime
|
|
|
2
2
|
import pickle
|
|
3
3
|
import polars as pl
|
|
4
4
|
import fastexcel
|
|
5
|
-
import
|
|
5
|
+
import re
|
|
6
6
|
from fastapi.exceptions import HTTPException
|
|
7
7
|
from time import time
|
|
8
8
|
from functools import partial
|
|
@@ -11,6 +11,7 @@ from uuid import uuid1
|
|
|
11
11
|
from copy import deepcopy
|
|
12
12
|
from pyarrow.parquet import ParquetFile
|
|
13
13
|
from flowfile_core.configs import logger
|
|
14
|
+
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
|
|
14
15
|
from flowfile_core.configs.flow_logger import FlowLogger
|
|
15
16
|
from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
|
|
16
17
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
|
|
@@ -23,8 +24,10 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
|
|
|
23
24
|
get_calamine_xlsx_data_types
|
|
24
25
|
from flowfile_core.flowfile.sources import external_sources
|
|
25
26
|
from flowfile_core.schemas import input_schema, schemas, transform_schema
|
|
26
|
-
from flowfile_core.schemas.output_model import
|
|
27
|
-
from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
|
|
27
|
+
from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
|
|
28
|
+
from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSettingsInternal,
|
|
29
|
+
CloudStorageWriteSettingsInternal,
|
|
30
|
+
FullCloudStorageConnection,
|
|
28
31
|
get_cloud_storage_write_settings_worker_interface, AuthMethod)
|
|
29
32
|
from flowfile_core.flowfile.utils import snake_case_to_camel_case
|
|
30
33
|
from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
|
|
@@ -45,6 +48,21 @@ from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layou
|
|
|
45
48
|
|
|
46
49
|
def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
|
|
47
50
|
end_row: int, end_column: int, has_headers: bool):
|
|
51
|
+
"""Calculates the schema of an XLSX file by reading a sample of rows.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
engine: The engine to use for reading ('openpyxl' or 'calamine').
|
|
55
|
+
file_path: The path to the XLSX file.
|
|
56
|
+
sheet_name: The name of the sheet to read.
|
|
57
|
+
start_row: The starting row for data reading.
|
|
58
|
+
start_column: The starting column for data reading.
|
|
59
|
+
end_row: The ending row for data reading.
|
|
60
|
+
end_column: The ending column for data reading.
|
|
61
|
+
has_headers: A boolean indicating if the file has a header row.
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
A list of FlowfileColumn objects representing the schema.
|
|
65
|
+
"""
|
|
48
66
|
try:
|
|
49
67
|
logger.info('Starting to calculate the schema')
|
|
50
68
|
if engine == 'openpyxl':
|
|
@@ -67,26 +85,69 @@ def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int
|
|
|
67
85
|
|
|
68
86
|
|
|
69
87
|
def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
|
|
88
|
+
"""Logs a warning message listing all nodes that will be skipped during execution.
|
|
89
|
+
|
|
90
|
+
Args:
|
|
91
|
+
flow_logger: The logger instance for the flow.
|
|
92
|
+
nodes: A list of FlowNode objects to be skipped.
|
|
93
|
+
"""
|
|
70
94
|
if len(nodes) > 0:
|
|
71
95
|
msg = "\n".join(str(node) for node in nodes)
|
|
72
96
|
flow_logger.warning(f'skipping nodes:\n{msg}')
|
|
73
97
|
|
|
74
98
|
|
|
75
99
|
def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
|
|
100
|
+
"""Logs an informational message showing the determined execution order of nodes.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
flow_logger: The logger instance for the flow.
|
|
104
|
+
nodes: A list of FlowNode objects in the order they will be executed.
|
|
105
|
+
"""
|
|
76
106
|
msg = "\n".join(str(node) for node in nodes)
|
|
77
107
|
flow_logger.info(f'execution order:\n{msg}')
|
|
78
108
|
|
|
79
109
|
|
|
80
110
|
def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
|
|
81
111
|
end_row: int, end_column: int, has_headers: bool):
|
|
112
|
+
"""Creates a partially applied function for lazy calculation of an XLSX schema.
|
|
113
|
+
|
|
114
|
+
Args:
|
|
115
|
+
engine: The engine to use for reading.
|
|
116
|
+
file_path: The path to the XLSX file.
|
|
117
|
+
sheet_name: The name of the sheet.
|
|
118
|
+
start_row: The starting row.
|
|
119
|
+
start_column: The starting column.
|
|
120
|
+
end_row: The ending row.
|
|
121
|
+
end_column: The ending column.
|
|
122
|
+
has_headers: A boolean indicating if the file has headers.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
A callable function that, when called, will execute `get_xlsx_schema`.
|
|
126
|
+
"""
|
|
82
127
|
return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
|
|
83
128
|
start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
|
|
84
129
|
|
|
85
130
|
|
|
86
|
-
def get_cloud_connection_settings(connection_name: str,
|
|
131
|
+
def get_cloud_connection_settings(connection_name: str,
|
|
132
|
+
user_id: int, auth_mode: AuthMethod) -> FullCloudStorageConnection:
|
|
133
|
+
"""Retrieves cloud storage connection settings, falling back to environment variables if needed.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
connection_name: The name of the saved connection.
|
|
137
|
+
user_id: The ID of the user owning the connection.
|
|
138
|
+
auth_mode: The authentication method specified by the user.
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
A FullCloudStorageConnection object with the connection details.
|
|
142
|
+
|
|
143
|
+
Raises:
|
|
144
|
+
HTTPException: If the connection settings cannot be found.
|
|
145
|
+
"""
|
|
87
146
|
cloud_connection_settings = get_local_cloud_connection(connection_name, user_id)
|
|
88
|
-
if cloud_connection_settings is None and auth_mode
|
|
147
|
+
if cloud_connection_settings is None and auth_mode in ("env_vars", "auto"):
|
|
89
148
|
# If the auth mode is aws-cli, we do not need connection settings
|
|
149
|
+
cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="env_vars")
|
|
150
|
+
elif cloud_connection_settings is None and auth_mode == "aws-cli":
|
|
90
151
|
cloud_connection_settings = FullCloudStorageConnection(storage_type="s3", auth_method="aws-cli")
|
|
91
152
|
if cloud_connection_settings is None:
|
|
92
153
|
raise HTTPException(status_code=400, detail="Cloud connection settings not found")
|
|
@@ -94,18 +155,10 @@ def get_cloud_connection_settings(connection_name: str, user_id: int, auth_mode:
|
|
|
94
155
|
|
|
95
156
|
|
|
96
157
|
class FlowGraph:
|
|
97
|
-
"""
|
|
98
|
-
FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
|
|
99
|
-
on data. It allows you to create a Directed Acyclic Graph (DAG) where each
|
|
100
|
-
node represents a step in the ETL pipeline.
|
|
101
|
-
|
|
102
|
-
The class offers methods to add transformations and data sources, as well as
|
|
103
|
-
methods to run the transformations and generate results.
|
|
158
|
+
"""A class representing a Directed Acyclic Graph (DAG) for data processing pipelines.
|
|
104
159
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
_output_cols (set): A set that stores the output columns from the transformations.
|
|
108
|
-
"""
|
|
160
|
+
It manages nodes, connections, and the execution of the entire flow.
|
|
161
|
+
"""
|
|
109
162
|
uuid: str
|
|
110
163
|
depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
|
|
111
164
|
_flow_id: int
|
|
@@ -127,13 +180,27 @@ class FlowGraph:
|
|
|
127
180
|
flow_settings: schemas.FlowSettings = None
|
|
128
181
|
flow_logger: FlowLogger
|
|
129
182
|
|
|
130
|
-
def __init__(self,
|
|
131
|
-
flow_settings: schemas.FlowSettings,
|
|
183
|
+
def __init__(self,
|
|
184
|
+
flow_settings: schemas.FlowSettings | schemas.FlowGraphConfig,
|
|
132
185
|
name: str = None, input_cols: List[str] = None,
|
|
133
186
|
output_cols: List[str] = None,
|
|
134
187
|
path_ref: str = None,
|
|
135
188
|
input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
|
|
136
189
|
cache_results: bool = False):
|
|
190
|
+
"""Initializes a new FlowGraph instance.
|
|
191
|
+
|
|
192
|
+
Args:
|
|
193
|
+
flow_settings: The configuration settings for the flow.
|
|
194
|
+
name: The name of the flow.
|
|
195
|
+
input_cols: A list of input column names.
|
|
196
|
+
output_cols: A list of output column names.
|
|
197
|
+
path_ref: An optional path to an initial data source.
|
|
198
|
+
input_flow: An optional existing data object to start the flow with.
|
|
199
|
+
cache_results: A global flag to enable or disable result caching.
|
|
200
|
+
"""
|
|
201
|
+
if isinstance(flow_settings, schemas.FlowGraphConfig):
|
|
202
|
+
flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
|
|
203
|
+
|
|
137
204
|
self.flow_settings = flow_settings
|
|
138
205
|
self.uuid = str(uuid1())
|
|
139
206
|
self.nodes_completed = 0
|
|
@@ -141,8 +208,8 @@ class FlowGraph:
|
|
|
141
208
|
self.end_datetime = None
|
|
142
209
|
self.latest_run_info = None
|
|
143
210
|
self.node_results = []
|
|
144
|
-
self._flow_id = flow_id
|
|
145
|
-
self.flow_logger = FlowLogger(flow_id)
|
|
211
|
+
self._flow_id = flow_settings.flow_id
|
|
212
|
+
self.flow_logger = FlowLogger(flow_settings.flow_id)
|
|
146
213
|
self._flow_starts: List[FlowNode] = []
|
|
147
214
|
self._results = None
|
|
148
215
|
self.schema = None
|
|
@@ -160,7 +227,13 @@ class FlowGraph:
|
|
|
160
227
|
self.add_datasource(input_file=input_flow)
|
|
161
228
|
|
|
162
229
|
def add_node_promise(self, node_promise: input_schema.NodePromise):
|
|
230
|
+
"""Adds a placeholder node to the graph that is not yet fully configured.
|
|
231
|
+
|
|
232
|
+
Useful for building the graph structure before all settings are available.
|
|
163
233
|
|
|
234
|
+
Args:
|
|
235
|
+
node_promise: A promise object containing basic node information.
|
|
236
|
+
"""
|
|
164
237
|
def placeholder(n: FlowNode = None):
|
|
165
238
|
if n is None:
|
|
166
239
|
return FlowDataEngine()
|
|
@@ -169,10 +242,73 @@ class FlowGraph:
|
|
|
169
242
|
self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
|
|
170
243
|
setting_input=node_promise)
|
|
171
244
|
|
|
172
|
-
def
|
|
245
|
+
def print_tree(self, show_schema=False, show_descriptions=False):
|
|
173
246
|
"""
|
|
174
|
-
|
|
175
|
-
|
|
247
|
+
Print flow_graph as a tree.
|
|
248
|
+
"""
|
|
249
|
+
max_node_id = max(self._node_db.keys())
|
|
250
|
+
|
|
251
|
+
tree = ""
|
|
252
|
+
tabs = 0
|
|
253
|
+
tab_counter = 0
|
|
254
|
+
for node in self.nodes:
|
|
255
|
+
tab_counter += 1
|
|
256
|
+
node_input = node.setting_input
|
|
257
|
+
operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
|
|
258
|
+
|
|
259
|
+
if operation == "Formula":
|
|
260
|
+
operation = "With Columns"
|
|
261
|
+
|
|
262
|
+
tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
|
|
263
|
+
|
|
264
|
+
if show_descriptions & show_schema:
|
|
265
|
+
raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
|
|
266
|
+
if show_descriptions:
|
|
267
|
+
tree += ": " + str(node_input.description)
|
|
268
|
+
elif show_schema:
|
|
269
|
+
tree += " -> ["
|
|
270
|
+
if operation == "Manual Input":
|
|
271
|
+
schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
|
|
272
|
+
tree += schema
|
|
273
|
+
elif operation == "With Columns":
|
|
274
|
+
tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
|
|
275
|
+
tree += schema + tree_with_col_schema
|
|
276
|
+
elif operation == "Filter":
|
|
277
|
+
index = node_input.filter_input.advanced_filter.find("]")
|
|
278
|
+
filtered_column = str(node_input.filter_input.advanced_filter[1:index])
|
|
279
|
+
schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
|
|
280
|
+
tree += schema
|
|
281
|
+
elif operation == "Group By":
|
|
282
|
+
for col in node_input.groupby_input.agg_cols:
|
|
283
|
+
schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
|
|
284
|
+
tree += schema
|
|
285
|
+
tree += "]"
|
|
286
|
+
else:
|
|
287
|
+
if operation == "Manual Input":
|
|
288
|
+
tree += ": " + str(node_input.raw_data_format.data)
|
|
289
|
+
elif operation == "With Columns":
|
|
290
|
+
tree += ": " + str(node_input.function)
|
|
291
|
+
elif operation == "Filter":
|
|
292
|
+
tree += ": " + str(node_input.filter_input.advanced_filter)
|
|
293
|
+
elif operation == "Group By":
|
|
294
|
+
tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
|
|
295
|
+
tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
|
|
296
|
+
|
|
297
|
+
if node_input.node_id < max_node_id:
|
|
298
|
+
tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
|
|
299
|
+
print("\n"*2)
|
|
300
|
+
|
|
301
|
+
return print(tree)
|
|
302
|
+
|
|
303
|
+
def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
|
|
304
|
+
"""Calculates and applies a layered layout to all nodes in the graph.
|
|
305
|
+
|
|
306
|
+
This updates their x and y positions for UI rendering.
|
|
307
|
+
|
|
308
|
+
Args:
|
|
309
|
+
y_spacing: The vertical spacing between layers.
|
|
310
|
+
x_spacing: The horizontal spacing between nodes in the same layer.
|
|
311
|
+
initial_y: The initial y-position for the first layer.
|
|
176
312
|
"""
|
|
177
313
|
self.flow_logger.info("Applying layered layout...")
|
|
178
314
|
start_time = time()
|
|
@@ -199,7 +335,7 @@ class FlowGraph:
|
|
|
199
335
|
else:
|
|
200
336
|
self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
|
|
201
337
|
elif node:
|
|
202
|
-
|
|
338
|
+
self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
|
|
203
339
|
# else: Node not found, already warned by calculate_layered_layout
|
|
204
340
|
|
|
205
341
|
end_time = time()
|
|
@@ -207,51 +343,20 @@ class FlowGraph:
|
|
|
207
343
|
|
|
208
344
|
except Exception as e:
|
|
209
345
|
self.flow_logger.error(f"Error applying layout: {e}")
|
|
210
|
-
raise
|
|
211
|
-
|
|
212
|
-
def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
|
|
213
|
-
node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
|
|
214
|
-
self.add_explore_data(node_analysis)
|
|
215
|
-
|
|
216
|
-
def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
|
|
217
|
-
sample_size: int = 10000
|
|
218
|
-
|
|
219
|
-
def analysis_preparation(flowfile_table: FlowDataEngine):
|
|
220
|
-
if flowfile_table.number_of_records <= 0:
|
|
221
|
-
number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
|
|
222
|
-
else:
|
|
223
|
-
number_of_records = flowfile_table.number_of_records
|
|
224
|
-
if number_of_records > sample_size:
|
|
225
|
-
flowfile_table = flowfile_table.get_sample(sample_size, random=True)
|
|
226
|
-
external_sampler = ExternalDfFetcher(
|
|
227
|
-
lf=flowfile_table.data_frame,
|
|
228
|
-
file_ref="__gf_walker"+node.hash,
|
|
229
|
-
wait_on_completion=True,
|
|
230
|
-
node_id=node.node_id,
|
|
231
|
-
flow_id=self.flow_id,
|
|
232
|
-
)
|
|
233
|
-
node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
|
|
234
|
-
return flowfile_table
|
|
235
|
-
|
|
236
|
-
def schema_callback():
|
|
237
|
-
node = self.get_node(node_analysis.node_id)
|
|
238
|
-
if len(node.all_inputs) == 1:
|
|
239
|
-
input_node = node.all_inputs[0]
|
|
240
|
-
return input_node.schema
|
|
241
|
-
else:
|
|
242
|
-
return [FlowfileColumn.from_input('col_1', 'na')]
|
|
243
|
-
|
|
244
|
-
self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
|
|
245
|
-
function=analysis_preparation,
|
|
246
|
-
setting_input=node_analysis, schema_callback=schema_callback)
|
|
247
|
-
node = self.get_node(node_analysis.node_id)
|
|
346
|
+
raise # Optional: re-raise the exception
|
|
248
347
|
|
|
249
348
|
@property
|
|
250
349
|
def flow_id(self) -> int:
|
|
350
|
+
"""Gets the unique identifier of the flow."""
|
|
251
351
|
return self._flow_id
|
|
252
352
|
|
|
253
353
|
@flow_id.setter
|
|
254
354
|
def flow_id(self, new_id: int):
|
|
355
|
+
"""Sets the unique identifier for the flow and updates all child nodes.
|
|
356
|
+
|
|
357
|
+
Args:
|
|
358
|
+
new_id: The new flow ID.
|
|
359
|
+
"""
|
|
255
360
|
self._flow_id = new_id
|
|
256
361
|
for node in self.nodes:
|
|
257
362
|
if hasattr(node.setting_input, 'flow_id'):
|
|
@@ -259,23 +364,35 @@ class FlowGraph:
|
|
|
259
364
|
self.flow_settings.flow_id = new_id
|
|
260
365
|
|
|
261
366
|
def __repr__(self):
|
|
262
|
-
"""
|
|
263
|
-
Official string representation of the FlowGraph class.
|
|
264
|
-
"""
|
|
367
|
+
"""Provides the official string representation of the FlowGraph instance."""
|
|
265
368
|
settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
|
|
266
369
|
return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
|
|
267
370
|
|
|
268
371
|
def get_nodes_overview(self):
|
|
372
|
+
"""Gets a list of dictionary representations for all nodes in the graph."""
|
|
269
373
|
output = []
|
|
270
374
|
for v in self._node_db.values():
|
|
271
375
|
output.append(v.get_repr())
|
|
272
376
|
return output
|
|
273
377
|
|
|
274
378
|
def remove_from_output_cols(self, columns: List[str]):
|
|
379
|
+
"""Removes specified columns from the list of expected output columns.
|
|
380
|
+
|
|
381
|
+
Args:
|
|
382
|
+
columns: A list of column names to remove.
|
|
383
|
+
"""
|
|
275
384
|
cols = set(columns)
|
|
276
385
|
self._output_cols = [c for c in self._output_cols if c not in cols]
|
|
277
386
|
|
|
278
|
-
def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
|
|
387
|
+
def get_node(self, node_id: Union[int, str] = None) -> FlowNode | None:
|
|
388
|
+
"""Retrieves a node from the graph by its ID.
|
|
389
|
+
|
|
390
|
+
Args:
|
|
391
|
+
node_id: The ID of the node to retrieve. If None, retrieves the last added node.
|
|
392
|
+
|
|
393
|
+
Returns:
|
|
394
|
+
The FlowNode object, or None if not found.
|
|
395
|
+
"""
|
|
279
396
|
if node_id is None:
|
|
280
397
|
node_id = self._node_ids[-1]
|
|
281
398
|
node = self._node_db.get(node_id)
|
|
@@ -283,6 +400,12 @@ class FlowGraph:
|
|
|
283
400
|
return node
|
|
284
401
|
|
|
285
402
|
def add_pivot(self, pivot_settings: input_schema.NodePivot):
|
|
403
|
+
"""Adds a pivot node to the graph.
|
|
404
|
+
|
|
405
|
+
Args:
|
|
406
|
+
pivot_settings: The settings for the pivot operation.
|
|
407
|
+
"""
|
|
408
|
+
|
|
286
409
|
def _func(fl: FlowDataEngine):
|
|
287
410
|
return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
|
|
288
411
|
|
|
@@ -302,6 +425,11 @@ class FlowGraph:
|
|
|
302
425
|
node.schema_callback = schema_callback
|
|
303
426
|
|
|
304
427
|
def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
|
|
428
|
+
"""Adds an unpivot node to the graph.
|
|
429
|
+
|
|
430
|
+
Args:
|
|
431
|
+
unpivot_settings: The settings for the unpivot operation.
|
|
432
|
+
"""
|
|
305
433
|
|
|
306
434
|
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
307
435
|
return fl.unpivot(unpivot_settings.unpivot_input)
|
|
@@ -313,6 +441,12 @@ class FlowGraph:
|
|
|
313
441
|
input_node_ids=[unpivot_settings.depending_on_id])
|
|
314
442
|
|
|
315
443
|
def add_union(self, union_settings: input_schema.NodeUnion):
|
|
444
|
+
"""Adds a union node to combine multiple data streams.
|
|
445
|
+
|
|
446
|
+
Args:
|
|
447
|
+
union_settings: The settings for the union operation.
|
|
448
|
+
"""
|
|
449
|
+
|
|
316
450
|
def _func(*flowfile_tables: FlowDataEngine):
|
|
317
451
|
dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
|
|
318
452
|
return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
|
|
@@ -323,7 +457,60 @@ class FlowGraph:
|
|
|
323
457
|
setting_input=union_settings,
|
|
324
458
|
input_node_ids=union_settings.depending_on_ids)
|
|
325
459
|
|
|
460
|
+
def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
|
|
461
|
+
"""Adds a data exploration/analysis node based on a node promise.
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
node_promise: The promise representing the node to be analyzed.
|
|
465
|
+
"""
|
|
466
|
+
node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
|
|
467
|
+
self.add_explore_data(node_analysis)
|
|
468
|
+
|
|
469
|
+
def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
|
|
470
|
+
"""Adds a specialized node for data exploration and visualization.
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
node_analysis: The settings for the data exploration node.
|
|
474
|
+
"""
|
|
475
|
+
sample_size: int = 10000
|
|
476
|
+
|
|
477
|
+
def analysis_preparation(flowfile_table: FlowDataEngine):
|
|
478
|
+
if flowfile_table.number_of_records <= 0:
|
|
479
|
+
number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
|
|
480
|
+
else:
|
|
481
|
+
number_of_records = flowfile_table.number_of_records
|
|
482
|
+
if number_of_records > sample_size:
|
|
483
|
+
flowfile_table = flowfile_table.get_sample(sample_size, random=True)
|
|
484
|
+
external_sampler = ExternalDfFetcher(
|
|
485
|
+
lf=flowfile_table.data_frame,
|
|
486
|
+
file_ref="__gf_walker"+node.hash,
|
|
487
|
+
wait_on_completion=True,
|
|
488
|
+
node_id=node.node_id,
|
|
489
|
+
flow_id=self.flow_id,
|
|
490
|
+
)
|
|
491
|
+
node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref,
|
|
492
|
+
n=min(sample_size, number_of_records))
|
|
493
|
+
return flowfile_table
|
|
494
|
+
|
|
495
|
+
def schema_callback():
|
|
496
|
+
node = self.get_node(node_analysis.node_id)
|
|
497
|
+
if len(node.all_inputs) == 1:
|
|
498
|
+
input_node = node.all_inputs[0]
|
|
499
|
+
return input_node.schema
|
|
500
|
+
else:
|
|
501
|
+
return [FlowfileColumn.from_input('col_1', 'na')]
|
|
502
|
+
|
|
503
|
+
self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
|
|
504
|
+
function=analysis_preparation,
|
|
505
|
+
setting_input=node_analysis, schema_callback=schema_callback)
|
|
506
|
+
node = self.get_node(node_analysis.node_id)
|
|
507
|
+
|
|
326
508
|
def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
|
|
509
|
+
"""Adds a group-by aggregation node to the graph.
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
group_by_settings: The settings for the group-by operation.
|
|
513
|
+
"""
|
|
327
514
|
|
|
328
515
|
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
329
516
|
return fl.do_group_by(group_by_settings.groupby_input, False)
|
|
@@ -337,6 +524,7 @@ class FlowGraph:
|
|
|
337
524
|
node = self.get_node(group_by_settings.node_id)
|
|
338
525
|
|
|
339
526
|
def schema_callback():
|
|
527
|
+
|
|
340
528
|
output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
|
|
341
529
|
depends_on = node.node_inputs.main_inputs[0]
|
|
342
530
|
input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
|
|
@@ -348,22 +536,13 @@ class FlowGraph:
|
|
|
348
536
|
|
|
349
537
|
node.schema_callback = schema_callback
|
|
350
538
|
|
|
351
|
-
def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
|
|
352
|
-
col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
|
|
353
|
-
schema = depends_on.schema
|
|
354
|
-
col_exist = depends_on.get_flow_file_column_schema(col_name)
|
|
355
|
-
if col_exist is None:
|
|
356
|
-
new_schema = schema + [col_output]
|
|
357
|
-
else:
|
|
358
|
-
new_schema = []
|
|
359
|
-
for s in self.schema:
|
|
360
|
-
if s.name == col_name:
|
|
361
|
-
new_schema.append(col_output)
|
|
362
|
-
else:
|
|
363
|
-
new_schema.append(s)
|
|
364
|
-
return new_schema
|
|
365
|
-
|
|
366
539
|
def add_filter(self, filter_settings: input_schema.NodeFilter):
|
|
540
|
+
"""Adds a filter node to the graph.
|
|
541
|
+
|
|
542
|
+
Args:
|
|
543
|
+
filter_settings: The settings for the filter operation.
|
|
544
|
+
"""
|
|
545
|
+
|
|
367
546
|
is_advanced = filter_settings.filter_input.filter_type == 'advanced'
|
|
368
547
|
if is_advanced:
|
|
369
548
|
predicate = filter_settings.filter_input.advanced_filter
|
|
@@ -397,6 +576,12 @@ class FlowGraph:
|
|
|
397
576
|
)
|
|
398
577
|
|
|
399
578
|
def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
|
|
579
|
+
"""Adds a record count node to the graph.
|
|
580
|
+
|
|
581
|
+
Args:
|
|
582
|
+
node_number_of_records: The settings for the record count operation.
|
|
583
|
+
"""
|
|
584
|
+
|
|
400
585
|
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
401
586
|
return fl.get_record_count()
|
|
402
587
|
|
|
@@ -407,9 +592,14 @@ class FlowGraph:
|
|
|
407
592
|
input_node_ids=[node_number_of_records.depending_on_id])
|
|
408
593
|
|
|
409
594
|
def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
|
|
595
|
+
"""Adds a node that executes custom Polars code.
|
|
596
|
+
|
|
597
|
+
Args:
|
|
598
|
+
node_polars_code: The settings for the Polars code node.
|
|
599
|
+
"""
|
|
600
|
+
|
|
410
601
|
def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
|
|
411
602
|
return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
|
|
412
|
-
|
|
413
603
|
self.add_node_step(node_id=node_polars_code.node_id,
|
|
414
604
|
function=_func,
|
|
415
605
|
node_type='polars_code',
|
|
@@ -422,7 +612,31 @@ class FlowGraph:
|
|
|
422
612
|
node = self.get_node(node_id=node_polars_code.node_id)
|
|
423
613
|
node.results.errors = str(e)
|
|
424
614
|
|
|
615
|
+
def add_dependency_on_polars_lazy_frame(self,
|
|
616
|
+
lazy_frame: pl.LazyFrame,
|
|
617
|
+
node_id: int):
|
|
618
|
+
"""Adds a special node that directly injects a Polars LazyFrame into the graph.
|
|
619
|
+
|
|
620
|
+
Note: This is intended for backend use and will not work in the UI editor.
|
|
621
|
+
|
|
622
|
+
Args:
|
|
623
|
+
lazy_frame: The Polars LazyFrame to inject.
|
|
624
|
+
node_id: The ID for the new node.
|
|
625
|
+
"""
|
|
626
|
+
def _func():
|
|
627
|
+
return FlowDataEngine(lazy_frame)
|
|
628
|
+
node_promise = input_schema.NodePromise(flow_id=self.flow_id,
|
|
629
|
+
node_id=node_id, node_type="polars_lazy_frame",
|
|
630
|
+
is_setup=True)
|
|
631
|
+
self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=_func,
|
|
632
|
+
setting_input=node_promise)
|
|
633
|
+
|
|
425
634
|
def add_unique(self, unique_settings: input_schema.NodeUnique):
|
|
635
|
+
"""Adds a node to find and remove duplicate rows.
|
|
636
|
+
|
|
637
|
+
Args:
|
|
638
|
+
unique_settings: The settings for the unique operation.
|
|
639
|
+
"""
|
|
426
640
|
|
|
427
641
|
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
428
642
|
return fl.make_unique(unique_settings.unique_input)
|
|
@@ -435,6 +649,16 @@ class FlowGraph:
|
|
|
435
649
|
input_node_ids=[unique_settings.depending_on_id])
|
|
436
650
|
|
|
437
651
|
def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
|
|
652
|
+
"""Adds a node that solves graph-like problems within the data.
|
|
653
|
+
|
|
654
|
+
This node can be used for operations like finding network paths,
|
|
655
|
+
calculating connected components, or performing other graph algorithms
|
|
656
|
+
on relational data that represents nodes and edges.
|
|
657
|
+
|
|
658
|
+
Args:
|
|
659
|
+
graph_solver_settings: The settings object defining the graph inputs
|
|
660
|
+
and the specific algorithm to apply.
|
|
661
|
+
"""
|
|
438
662
|
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
439
663
|
return fl.solve_graph(graph_solver_settings.graph_solver_input)
|
|
440
664
|
|
|
@@ -445,6 +669,12 @@ class FlowGraph:
|
|
|
445
669
|
input_node_ids=[graph_solver_settings.depending_on_id])
|
|
446
670
|
|
|
447
671
|
def add_formula(self, function_settings: input_schema.NodeFormula):
|
|
672
|
+
"""Adds a node that applies a formula to create or modify a column.
|
|
673
|
+
|
|
674
|
+
Args:
|
|
675
|
+
function_settings: The settings for the formula operation.
|
|
676
|
+
"""
|
|
677
|
+
|
|
448
678
|
error = ""
|
|
449
679
|
if function_settings.function.field.data_type not in (None, "Auto"):
|
|
450
680
|
output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
|
|
@@ -476,6 +706,14 @@ class FlowGraph:
|
|
|
476
706
|
return True, ""
|
|
477
707
|
|
|
478
708
|
def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
|
|
709
|
+
"""Adds a cross join node to the graph.
|
|
710
|
+
|
|
711
|
+
Args:
|
|
712
|
+
cross_join_settings: The settings for the cross join operation.
|
|
713
|
+
|
|
714
|
+
Returns:
|
|
715
|
+
The `FlowGraph` instance for method chaining.
|
|
716
|
+
"""
|
|
479
717
|
|
|
480
718
|
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
481
719
|
for left_select in cross_join_settings.cross_join_input.left_select.renames:
|
|
@@ -497,6 +735,15 @@ class FlowGraph:
|
|
|
497
735
|
return self
|
|
498
736
|
|
|
499
737
|
def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
|
|
738
|
+
"""Adds a join node to combine two data streams based on key columns.
|
|
739
|
+
|
|
740
|
+
Args:
|
|
741
|
+
join_settings: The settings for the join operation.
|
|
742
|
+
|
|
743
|
+
Returns:
|
|
744
|
+
The `FlowGraph` instance for method chaining.
|
|
745
|
+
"""
|
|
746
|
+
|
|
500
747
|
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
501
748
|
for left_select in join_settings.join_input.left_select.renames:
|
|
502
749
|
left_select.is_available = True if left_select.old_name in main.schema else False
|
|
@@ -517,6 +764,15 @@ class FlowGraph:
|
|
|
517
764
|
return self
|
|
518
765
|
|
|
519
766
|
def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
|
|
767
|
+
"""Adds a fuzzy matching node to join data on approximate string matches.
|
|
768
|
+
|
|
769
|
+
Args:
|
|
770
|
+
fuzzy_settings: The settings for the fuzzy match operation.
|
|
771
|
+
|
|
772
|
+
Returns:
|
|
773
|
+
The `FlowGraph` instance for method chaining.
|
|
774
|
+
"""
|
|
775
|
+
|
|
520
776
|
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
521
777
|
f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
|
|
522
778
|
flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
|
|
@@ -541,6 +797,18 @@ class FlowGraph:
|
|
|
541
797
|
return self
|
|
542
798
|
|
|
543
799
|
def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
|
|
800
|
+
"""Adds a node that splits cell values into multiple rows.
|
|
801
|
+
|
|
802
|
+
This is useful for un-nesting data where a single field contains multiple
|
|
803
|
+
values separated by a delimiter.
|
|
804
|
+
|
|
805
|
+
Args:
|
|
806
|
+
node_text_to_rows: The settings object that specifies the column to split
|
|
807
|
+
and the delimiter to use.
|
|
808
|
+
|
|
809
|
+
Returns:
|
|
810
|
+
The `FlowGraph` instance for method chaining.
|
|
811
|
+
"""
|
|
544
812
|
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
545
813
|
return table.split(node_text_to_rows.text_to_rows_input)
|
|
546
814
|
|
|
@@ -552,6 +820,15 @@ class FlowGraph:
|
|
|
552
820
|
return self
|
|
553
821
|
|
|
554
822
|
def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
|
|
823
|
+
"""Adds a node to sort the data based on one or more columns.
|
|
824
|
+
|
|
825
|
+
Args:
|
|
826
|
+
sort_settings: The settings for the sort operation.
|
|
827
|
+
|
|
828
|
+
Returns:
|
|
829
|
+
The `FlowGraph` instance for method chaining.
|
|
830
|
+
"""
|
|
831
|
+
|
|
555
832
|
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
556
833
|
return table.do_sort(sort_settings.sort_input)
|
|
557
834
|
|
|
@@ -563,6 +840,14 @@ class FlowGraph:
|
|
|
563
840
|
return self
|
|
564
841
|
|
|
565
842
|
def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
|
|
843
|
+
"""Adds a node to take a random or top-N sample of the data.
|
|
844
|
+
|
|
845
|
+
Args:
|
|
846
|
+
sample_settings: The settings object specifying the size of the sample.
|
|
847
|
+
|
|
848
|
+
Returns:
|
|
849
|
+
The `FlowGraph` instance for method chaining.
|
|
850
|
+
"""
|
|
566
851
|
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
567
852
|
return table.get_sample(sample_settings.sample_size)
|
|
568
853
|
|
|
@@ -575,6 +860,15 @@ class FlowGraph:
|
|
|
575
860
|
return self
|
|
576
861
|
|
|
577
862
|
def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
|
|
863
|
+
"""Adds a node to create a new column with a unique ID for each record.
|
|
864
|
+
|
|
865
|
+
Args:
|
|
866
|
+
record_id_settings: The settings object specifying the name of the
|
|
867
|
+
new record ID column.
|
|
868
|
+
|
|
869
|
+
Returns:
|
|
870
|
+
The `FlowGraph` instance for method chaining.
|
|
871
|
+
"""
|
|
578
872
|
|
|
579
873
|
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
580
874
|
return table.add_record_id(record_id_settings.record_id_input)
|
|
@@ -588,6 +882,15 @@ class FlowGraph:
|
|
|
588
882
|
return self
|
|
589
883
|
|
|
590
884
|
def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
|
|
885
|
+
"""Adds a node to select, rename, reorder, or drop columns.
|
|
886
|
+
|
|
887
|
+
Args:
|
|
888
|
+
select_settings: The settings for the select operation.
|
|
889
|
+
|
|
890
|
+
Returns:
|
|
891
|
+
The `FlowGraph` instance for method chaining.
|
|
892
|
+
"""
|
|
893
|
+
|
|
591
894
|
select_cols = select_settings.select_input
|
|
592
895
|
drop_cols = tuple(s.old_name for s in select_settings.select_input)
|
|
593
896
|
|
|
@@ -621,9 +924,18 @@ class FlowGraph:
|
|
|
621
924
|
|
|
622
925
|
@property
|
|
623
926
|
def graph_has_functions(self) -> bool:
|
|
927
|
+
"""Checks if the graph has any nodes."""
|
|
624
928
|
return len(self._node_ids) > 0
|
|
625
929
|
|
|
626
930
|
def delete_node(self, node_id: Union[int, str]):
|
|
931
|
+
"""Deletes a node from the graph and updates all its connections.
|
|
932
|
+
|
|
933
|
+
Args:
|
|
934
|
+
node_id: The ID of the node to delete.
|
|
935
|
+
|
|
936
|
+
Raises:
|
|
937
|
+
Exception: If the node with the given ID does not exist.
|
|
938
|
+
"""
|
|
627
939
|
logger.info(f"Starting deletion of node with ID: {node_id}")
|
|
628
940
|
|
|
629
941
|
node = self._node_db.get(node_id)
|
|
@@ -656,6 +968,7 @@ class FlowGraph:
|
|
|
656
968
|
|
|
657
969
|
@property
|
|
658
970
|
def graph_has_input_data(self) -> bool:
|
|
971
|
+
"""Checks if the graph has an initial input data source."""
|
|
659
972
|
return self._input_data is not None
|
|
660
973
|
|
|
661
974
|
def add_node_step(self,
|
|
@@ -670,6 +983,24 @@ class FlowGraph:
|
|
|
670
983
|
cache_results: bool = None,
|
|
671
984
|
schema_callback: Callable = None,
|
|
672
985
|
input_node_ids: List[int] = None) -> FlowNode:
|
|
986
|
+
"""The core method for adding or updating a node in the graph.
|
|
987
|
+
|
|
988
|
+
Args:
|
|
989
|
+
node_id: The unique ID for the node.
|
|
990
|
+
function: The core processing function for the node.
|
|
991
|
+
input_columns: A list of input column names required by the function.
|
|
992
|
+
output_schema: A predefined schema for the node's output.
|
|
993
|
+
node_type: A string identifying the type of node (e.g., 'filter', 'join').
|
|
994
|
+
drop_columns: A list of columns to be dropped after the function executes.
|
|
995
|
+
renew_schema: If True, the schema is recalculated after execution.
|
|
996
|
+
setting_input: A configuration object containing settings for the node.
|
|
997
|
+
cache_results: If True, the node's results are cached for future runs.
|
|
998
|
+
schema_callback: A function that dynamically calculates the output schema.
|
|
999
|
+
input_node_ids: A list of IDs for the nodes that this node depends on.
|
|
1000
|
+
|
|
1001
|
+
Returns:
|
|
1002
|
+
The created or updated FlowNode object.
|
|
1003
|
+
"""
|
|
673
1004
|
existing_node = self.get_node(node_id)
|
|
674
1005
|
if existing_node is not None:
|
|
675
1006
|
if existing_node.node_type != node_type:
|
|
@@ -686,9 +1017,8 @@ class FlowGraph:
|
|
|
686
1017
|
if (
|
|
687
1018
|
input_nodes is not None or
|
|
688
1019
|
function.__name__ in ('placeholder', 'analysis_preparation') or
|
|
689
|
-
node_type
|
|
1020
|
+
node_type in ("cloud_storage_reader", "polars_lazy_frame", "input_data")
|
|
690
1021
|
):
|
|
691
|
-
|
|
692
1022
|
if not existing_node:
|
|
693
1023
|
node = FlowNode(node_id=node_id,
|
|
694
1024
|
function=function,
|
|
@@ -709,8 +1039,6 @@ class FlowGraph:
|
|
|
709
1039
|
setting_input=setting_input,
|
|
710
1040
|
schema_callback=schema_callback)
|
|
711
1041
|
node = existing_node
|
|
712
|
-
elif node_type == 'input_data':
|
|
713
|
-
node = None
|
|
714
1042
|
else:
|
|
715
1043
|
raise Exception("No data initialized")
|
|
716
1044
|
self._node_db[node_id] = node
|
|
@@ -718,6 +1046,11 @@ class FlowGraph:
|
|
|
718
1046
|
return node
|
|
719
1047
|
|
|
720
1048
|
def add_include_cols(self, include_columns: List[str]):
|
|
1049
|
+
"""Adds columns to both the input and output column lists.
|
|
1050
|
+
|
|
1051
|
+
Args:
|
|
1052
|
+
include_columns: A list of column names to include.
|
|
1053
|
+
"""
|
|
721
1054
|
for column in include_columns:
|
|
722
1055
|
if column not in self._input_cols:
|
|
723
1056
|
self._input_cols.append(column)
|
|
@@ -726,6 +1059,12 @@ class FlowGraph:
|
|
|
726
1059
|
return self
|
|
727
1060
|
|
|
728
1061
|
def add_output(self, output_file: input_schema.NodeOutput):
|
|
1062
|
+
"""Adds an output node to write the final data to a destination.
|
|
1063
|
+
|
|
1064
|
+
Args:
|
|
1065
|
+
output_file: The settings for the output file.
|
|
1066
|
+
"""
|
|
1067
|
+
|
|
729
1068
|
def _func(df: FlowDataEngine):
|
|
730
1069
|
output_file.output_settings.populate_abs_file_path()
|
|
731
1070
|
execute_remote = self.execution_location != 'local'
|
|
@@ -747,7 +1086,12 @@ class FlowGraph:
|
|
|
747
1086
|
input_node_ids=[input_node_id])
|
|
748
1087
|
|
|
749
1088
|
def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
|
|
750
|
-
|
|
1089
|
+
"""Adds a node to write data to a database.
|
|
1090
|
+
|
|
1091
|
+
Args:
|
|
1092
|
+
node_database_writer: The settings for the database writer node.
|
|
1093
|
+
"""
|
|
1094
|
+
|
|
751
1095
|
node_type = 'database_writer'
|
|
752
1096
|
database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
|
|
753
1097
|
database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
|
|
@@ -795,6 +1139,12 @@ class FlowGraph:
|
|
|
795
1139
|
node = self.get_node(node_database_writer.node_id)
|
|
796
1140
|
|
|
797
1141
|
def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
|
|
1142
|
+
"""Adds a node to read data from a database.
|
|
1143
|
+
|
|
1144
|
+
Args:
|
|
1145
|
+
node_database_reader: The settings for the database reader node.
|
|
1146
|
+
"""
|
|
1147
|
+
|
|
798
1148
|
logger.info("Adding database reader")
|
|
799
1149
|
node_type = 'database_reader'
|
|
800
1150
|
database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
|
|
@@ -868,15 +1218,27 @@ class FlowGraph:
|
|
|
868
1218
|
self._node_ids.append(node_database_reader.node_id)
|
|
869
1219
|
|
|
870
1220
|
def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
|
|
1221
|
+
"""Adds a node that reads data from a SQL source.
|
|
1222
|
+
|
|
1223
|
+
This is a convenience alias for `add_external_source`.
|
|
1224
|
+
|
|
1225
|
+
Args:
|
|
1226
|
+
external_source_input: The settings for the external SQL source node.
|
|
1227
|
+
"""
|
|
871
1228
|
logger.info('Adding sql source')
|
|
872
1229
|
self.add_external_source(external_source_input)
|
|
873
1230
|
|
|
874
1231
|
def add_cloud_storage_writer(self, node_cloud_storage_writer: input_schema.NodeCloudStorageWriter) -> None:
|
|
1232
|
+
"""Adds a node to write data to a cloud storage provider.
|
|
875
1233
|
|
|
876
|
-
|
|
1234
|
+
Args:
|
|
1235
|
+
node_cloud_storage_writer: The settings for the cloud storage writer node.
|
|
1236
|
+
"""
|
|
877
1237
|
|
|
1238
|
+
node_type = "cloud_storage_writer"
|
|
878
1239
|
def _func(df: FlowDataEngine):
|
|
879
1240
|
df.lazy = True
|
|
1241
|
+
execute_remote = self.execution_location != 'local'
|
|
880
1242
|
cloud_connection_settings = get_cloud_connection_settings(
|
|
881
1243
|
connection_name=node_cloud_storage_writer.cloud_storage_settings.connection_name,
|
|
882
1244
|
user_id=node_cloud_storage_writer.user_id,
|
|
@@ -888,15 +1250,22 @@ class FlowGraph:
|
|
|
888
1250
|
aws_allow_unsafe_html=cloud_connection_settings.aws_allow_unsafe_html,
|
|
889
1251
|
**CloudStorageReader.get_storage_options(cloud_connection_settings)
|
|
890
1252
|
)
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
1253
|
+
if execute_remote:
|
|
1254
|
+
settings = get_cloud_storage_write_settings_worker_interface(
|
|
1255
|
+
write_settings=node_cloud_storage_writer.cloud_storage_settings,
|
|
1256
|
+
connection=full_cloud_storage_connection,
|
|
1257
|
+
lf=df.data_frame,
|
|
1258
|
+
flowfile_node_id=node_cloud_storage_writer.node_id,
|
|
1259
|
+
flowfile_flow_id=self.flow_id)
|
|
1260
|
+
external_database_writer = ExternalCloudWriter(settings, wait_on_completion=False)
|
|
1261
|
+
node._fetch_cached_df = external_database_writer
|
|
1262
|
+
external_database_writer.get_result()
|
|
1263
|
+
else:
|
|
1264
|
+
cloud_storage_write_settings_internal = CloudStorageWriteSettingsInternal(
|
|
1265
|
+
connection=full_cloud_storage_connection,
|
|
1266
|
+
write_settings=node_cloud_storage_writer.cloud_storage_settings,
|
|
1267
|
+
)
|
|
1268
|
+
df.to_cloud_storage_obj(cloud_storage_write_settings_internal)
|
|
900
1269
|
return df
|
|
901
1270
|
|
|
902
1271
|
def schema_callback():
|
|
@@ -919,12 +1288,10 @@ class FlowGraph:
|
|
|
919
1288
|
node = self.get_node(node_cloud_storage_writer.node_id)
|
|
920
1289
|
|
|
921
1290
|
def add_cloud_storage_reader(self, node_cloud_storage_reader: input_schema.NodeCloudStorageReader) -> None:
|
|
922
|
-
"""
|
|
923
|
-
|
|
1291
|
+
"""Adds a cloud storage read node to the flow graph.
|
|
1292
|
+
|
|
924
1293
|
Args:
|
|
925
|
-
node_cloud_storage_reader
|
|
926
|
-
The settings for the cloud storage read node.
|
|
927
|
-
Returns:
|
|
1294
|
+
node_cloud_storage_reader: The settings for the cloud storage read node.
|
|
928
1295
|
"""
|
|
929
1296
|
node_type = "cloud_storage_reader"
|
|
930
1297
|
logger.info("Adding cloud storage reader")
|
|
@@ -953,6 +1320,11 @@ class FlowGraph:
|
|
|
953
1320
|
|
|
954
1321
|
def add_external_source(self,
|
|
955
1322
|
external_source_input: input_schema.NodeExternalSource):
|
|
1323
|
+
"""Adds a node for a custom external data source.
|
|
1324
|
+
|
|
1325
|
+
Args:
|
|
1326
|
+
external_source_input: The settings for the external source node.
|
|
1327
|
+
"""
|
|
956
1328
|
|
|
957
1329
|
node_type = 'external_source'
|
|
958
1330
|
external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
|
|
@@ -1009,6 +1381,12 @@ class FlowGraph:
|
|
|
1009
1381
|
setting_input=external_source_input)
|
|
1010
1382
|
|
|
1011
1383
|
def add_read(self, input_file: input_schema.NodeRead):
|
|
1384
|
+
"""Adds a node to read data from a local file (e.g., CSV, Parquet, Excel).
|
|
1385
|
+
|
|
1386
|
+
Args:
|
|
1387
|
+
input_file: The settings for the read operation.
|
|
1388
|
+
"""
|
|
1389
|
+
|
|
1012
1390
|
if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
|
|
1013
1391
|
sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
|
|
1014
1392
|
input_file.received_file.sheet_name = sheet_name
|
|
@@ -1077,7 +1455,18 @@ class FlowGraph:
|
|
|
1077
1455
|
node.schema_callback = schema_callback
|
|
1078
1456
|
return self
|
|
1079
1457
|
|
|
1080
|
-
def add_datasource(self, input_file: input_schema.NodeDatasource
|
|
1458
|
+
def add_datasource(self, input_file: Union[input_schema.NodeDatasource, input_schema.NodeManualInput]) -> "FlowGraph":
|
|
1459
|
+
"""Adds a data source node to the graph.
|
|
1460
|
+
|
|
1461
|
+
This method serves as a factory for creating starting nodes, handling both
|
|
1462
|
+
file-based sources and direct manual data entry.
|
|
1463
|
+
|
|
1464
|
+
Args:
|
|
1465
|
+
input_file: The configuration object for the data source.
|
|
1466
|
+
|
|
1467
|
+
Returns:
|
|
1468
|
+
The `FlowGraph` instance for method chaining.
|
|
1469
|
+
"""
|
|
1081
1470
|
if isinstance(input_file, input_schema.NodeManualInput):
|
|
1082
1471
|
input_data = FlowDataEngine(input_file.raw_data_format)
|
|
1083
1472
|
ref = 'manual_input'
|
|
@@ -1103,29 +1492,35 @@ class FlowGraph:
|
|
|
1103
1492
|
return self
|
|
1104
1493
|
|
|
1105
1494
|
def add_manual_input(self, input_file: input_schema.NodeManualInput):
|
|
1495
|
+
"""Adds a node for manual data entry.
|
|
1496
|
+
|
|
1497
|
+
This is a convenience alias for `add_datasource`.
|
|
1498
|
+
|
|
1499
|
+
Args:
|
|
1500
|
+
input_file: The settings and data for the manual input node.
|
|
1501
|
+
"""
|
|
1106
1502
|
self.add_datasource(input_file)
|
|
1107
1503
|
|
|
1108
1504
|
@property
|
|
1109
1505
|
def nodes(self) -> List[FlowNode]:
|
|
1110
|
-
|
|
1506
|
+
"""Gets a list of all FlowNode objects in the graph."""
|
|
1111
1507
|
|
|
1112
|
-
|
|
1113
|
-
not_filled_cols = set(expected_cols) - set(self._output_cols)
|
|
1114
|
-
cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
|
|
1115
|
-
self._output_cols += cols_available
|
|
1116
|
-
|
|
1117
|
-
@property
|
|
1118
|
-
def input_data_columns(self) -> List[str] | None:
|
|
1119
|
-
if self._input_cols:
|
|
1120
|
-
return list(set([col for col in self._input_cols if
|
|
1121
|
-
col in [table_col.name for table_col in self._input_data.schema]]))
|
|
1508
|
+
return list(self._node_db.values())
|
|
1122
1509
|
|
|
1123
1510
|
@property
|
|
1124
|
-
def execution_mode(self) ->
|
|
1511
|
+
def execution_mode(self) -> schemas.ExecutionModeLiteral:
|
|
1512
|
+
"""Gets the current execution mode ('Development' or 'Performance')."""
|
|
1125
1513
|
return self.flow_settings.execution_mode
|
|
1126
1514
|
|
|
1127
1515
|
def get_implicit_starter_nodes(self) -> List[FlowNode]:
|
|
1128
|
-
"""
|
|
1516
|
+
"""Finds nodes that can act as starting points but are not explicitly defined as such.
|
|
1517
|
+
|
|
1518
|
+
Some nodes, like the Polars Code node, can function without an input. This
|
|
1519
|
+
method identifies such nodes if they have no incoming connections.
|
|
1520
|
+
|
|
1521
|
+
Returns:
|
|
1522
|
+
A list of `FlowNode` objects that are implicit starting nodes.
|
|
1523
|
+
"""
|
|
1129
1524
|
starting_node_ids = [node.node_id for node in self._flow_starts]
|
|
1130
1525
|
implicit_starting_nodes = []
|
|
1131
1526
|
for node in self.nodes:
|
|
@@ -1135,17 +1530,39 @@ class FlowGraph:
|
|
|
1135
1530
|
|
|
1136
1531
|
@execution_mode.setter
|
|
1137
1532
|
def execution_mode(self, mode: schemas.ExecutionModeLiteral):
|
|
1533
|
+
"""Sets the execution mode for the flow.
|
|
1534
|
+
|
|
1535
|
+
Args:
|
|
1536
|
+
mode: The execution mode to set.
|
|
1537
|
+
"""
|
|
1138
1538
|
self.flow_settings.execution_mode = mode
|
|
1139
1539
|
|
|
1140
1540
|
@property
|
|
1141
1541
|
def execution_location(self) -> schemas.ExecutionLocationsLiteral:
|
|
1542
|
+
"""Gets the current execution location."""
|
|
1142
1543
|
return self.flow_settings.execution_location
|
|
1143
1544
|
|
|
1144
1545
|
@execution_location.setter
|
|
1145
1546
|
def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
|
|
1547
|
+
"""Sets the execution location for the flow.
|
|
1548
|
+
|
|
1549
|
+
Args:
|
|
1550
|
+
execution_location: The execution location to set.
|
|
1551
|
+
"""
|
|
1146
1552
|
self.flow_settings.execution_location = execution_location
|
|
1147
1553
|
|
|
1148
|
-
def run_graph(self):
|
|
1554
|
+
def run_graph(self) -> RunInformation | None:
|
|
1555
|
+
"""Executes the entire data flow graph from start to finish.
|
|
1556
|
+
|
|
1557
|
+
It determines the correct execution order, runs each node,
|
|
1558
|
+
collects results, and handles errors and cancellations.
|
|
1559
|
+
|
|
1560
|
+
Returns:
|
|
1561
|
+
A RunInformation object summarizing the execution results, or None.
|
|
1562
|
+
|
|
1563
|
+
Raises:
|
|
1564
|
+
Exception: If the flow is already running.
|
|
1565
|
+
"""
|
|
1149
1566
|
if self.flow_settings.is_running:
|
|
1150
1567
|
raise Exception('Flow is already running')
|
|
1151
1568
|
try:
|
|
@@ -1163,10 +1580,13 @@ class FlowGraph:
|
|
|
1163
1580
|
execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
|
|
1164
1581
|
node not in skip_nodes],
|
|
1165
1582
|
flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
|
|
1166
|
-
|
|
1167
1583
|
skip_node_message(self.flow_logger, skip_nodes)
|
|
1168
1584
|
execution_order_message(self.flow_logger, execution_order)
|
|
1169
1585
|
performance_mode = self.flow_settings.execution_mode == 'Performance'
|
|
1586
|
+
if self.flow_settings.execution_location == 'local':
|
|
1587
|
+
OFFLOAD_TO_WORKER.value = False
|
|
1588
|
+
elif self.flow_settings.execution_location == 'remote':
|
|
1589
|
+
OFFLOAD_TO_WORKER.value = True
|
|
1170
1590
|
for node in execution_order:
|
|
1171
1591
|
node_logger = self.flow_logger.get_node_logger(node.node_id)
|
|
1172
1592
|
if self.flow_settings.is_canceled:
|
|
@@ -1215,6 +1635,11 @@ class FlowGraph:
|
|
|
1215
1635
|
self.flow_settings.is_running = False
|
|
1216
1636
|
|
|
1217
1637
|
def get_run_info(self) -> RunInformation:
|
|
1638
|
+
"""Gets a summary of the most recent graph execution.
|
|
1639
|
+
|
|
1640
|
+
Returns:
|
|
1641
|
+
A RunInformation object with details about the last run.
|
|
1642
|
+
"""
|
|
1218
1643
|
if self.latest_run_info is None:
|
|
1219
1644
|
node_results = self.node_results
|
|
1220
1645
|
success = all(nr.success for nr in node_results)
|
|
@@ -1234,6 +1659,11 @@ class FlowGraph:
|
|
|
1234
1659
|
|
|
1235
1660
|
@property
|
|
1236
1661
|
def node_connections(self) -> List[Tuple[int, int]]:
|
|
1662
|
+
"""Computes and returns a list of all connections in the graph.
|
|
1663
|
+
|
|
1664
|
+
Returns:
|
|
1665
|
+
A list of tuples, where each tuple is a (source_id, target_id) pair.
|
|
1666
|
+
"""
|
|
1237
1667
|
connections = set()
|
|
1238
1668
|
for node in self.nodes:
|
|
1239
1669
|
outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
|
|
@@ -1245,28 +1675,30 @@ class FlowGraph:
|
|
|
1245
1675
|
connections.add(node_connection)
|
|
1246
1676
|
return list(connections)
|
|
1247
1677
|
|
|
1248
|
-
def
|
|
1249
|
-
|
|
1250
|
-
if len(self._node_ids) > 0:
|
|
1251
|
-
self.schema = self._node_db[self._node_ids[0]].schema
|
|
1252
|
-
return self.schema
|
|
1678
|
+
def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
|
|
1679
|
+
"""Retrieves all data needed to render a node in the UI.
|
|
1253
1680
|
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1681
|
+
Args:
|
|
1682
|
+
node_id: The ID of the node.
|
|
1683
|
+
include_example: Whether to include data samples in the result.
|
|
1257
1684
|
|
|
1258
|
-
|
|
1685
|
+
Returns:
|
|
1686
|
+
A NodeData object. The node is looked up by direct indexing, so an unknown node_id raises (presumably KeyError) rather than returning None.
|
|
1687
|
+
"""
|
|
1259
1688
|
node = self._node_db[node_id]
|
|
1260
1689
|
return node.get_node_data(flow_id=self.flow_id, include_example=include_example)
|
|
1261
1690
|
|
|
1262
1691
|
def get_node_storage(self) -> schemas.FlowInformation:
|
|
1692
|
+
"""Serializes the entire graph's state into a storable format.
|
|
1263
1693
|
|
|
1694
|
+
Returns:
|
|
1695
|
+
A FlowInformation object representing the complete graph.
|
|
1696
|
+
"""
|
|
1264
1697
|
node_information = {node.node_id: node.get_node_information() for
|
|
1265
1698
|
node in self.nodes if node.is_setup and node.is_correct}
|
|
1266
1699
|
|
|
1267
1700
|
return schemas.FlowInformation(flow_id=self.flow_id,
|
|
1268
1701
|
flow_name=self.__name__,
|
|
1269
|
-
storage_location=self.flow_settings.path,
|
|
1270
1702
|
flow_settings=self.flow_settings,
|
|
1271
1703
|
data=node_information,
|
|
1272
1704
|
node_starts=[v.node_id for v in self._flow_starts],
|
|
@@ -1274,6 +1706,8 @@ class FlowGraph:
|
|
|
1274
1706
|
)
|
|
1275
1707
|
|
|
1276
1708
|
def cancel(self):
|
|
1709
|
+
"""Cancels an ongoing graph execution."""
|
|
1710
|
+
|
|
1277
1711
|
if not self.flow_settings.is_running:
|
|
1278
1712
|
return
|
|
1279
1713
|
self.flow_settings.is_canceled = True
|
|
@@ -1281,15 +1715,30 @@ class FlowGraph:
|
|
|
1281
1715
|
node.cancel()
|
|
1282
1716
|
|
|
1283
1717
|
def close_flow(self):
|
|
1718
|
+
"""Performs cleanup operations, such as clearing node caches."""
|
|
1719
|
+
|
|
1284
1720
|
for node in self.nodes:
|
|
1285
1721
|
node.remove_cache()
|
|
1286
1722
|
|
|
1287
1723
|
def save_flow(self, flow_path: str):
|
|
1724
|
+
"""Saves the current state of the flow graph to a file.
|
|
1725
|
+
|
|
1726
|
+
Args:
|
|
1727
|
+
flow_path: The path where the flow file will be saved.
|
|
1728
|
+
"""
|
|
1288
1729
|
with open(flow_path, 'wb') as f:
|
|
1289
1730
|
pickle.dump(self.get_node_storage(), f)
|
|
1290
1731
|
self.flow_settings.path = flow_path
|
|
1291
1732
|
|
|
1292
|
-
def get_frontend_data(self):
|
|
1733
|
+
def get_frontend_data(self) -> dict:
|
|
1734
|
+
"""Formats the graph structure into a JSON-like dictionary for a specific legacy frontend.
|
|
1735
|
+
|
|
1736
|
+
This method transforms the graph's state into a format compatible with the
|
|
1737
|
+
Drawflow.js library.
|
|
1738
|
+
|
|
1739
|
+
Returns:
|
|
1740
|
+
A dictionary representing the graph in Drawflow format.
|
|
1741
|
+
"""
|
|
1293
1742
|
result = {
|
|
1294
1743
|
'Home': {
|
|
1295
1744
|
"data": {}
|
|
@@ -1360,6 +1809,11 @@ class FlowGraph:
|
|
|
1360
1809
|
return result
|
|
1361
1810
|
|
|
1362
1811
|
def get_vue_flow_input(self) -> schemas.VueFlowInput:
|
|
1812
|
+
"""Formats the graph's nodes and edges into a schema suitable for the VueFlow frontend.
|
|
1813
|
+
|
|
1814
|
+
Returns:
|
|
1815
|
+
A VueFlowInput object.
|
|
1816
|
+
"""
|
|
1363
1817
|
edges: List[schemas.NodeEdge] = []
|
|
1364
1818
|
nodes: List[schemas.NodeInput] = []
|
|
1365
1819
|
for node in self.nodes:
|
|
@@ -1368,11 +1822,19 @@ class FlowGraph:
|
|
|
1368
1822
|
return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)
|
|
1369
1823
|
|
|
1370
1824
|
def reset(self):
|
|
1825
|
+
"""Forces a deep reset on all nodes in the graph."""
|
|
1826
|
+
|
|
1371
1827
|
for node in self.nodes:
|
|
1372
1828
|
node.reset(True)
|
|
1373
1829
|
|
|
1374
1830
|
def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
|
|
1375
|
-
"""
|
|
1831
|
+
"""Creates a copy of an existing node.
|
|
1832
|
+
|
|
1833
|
+
Args:
|
|
1834
|
+
new_node_settings: The promise containing new settings (like ID and position).
|
|
1835
|
+
existing_setting_input: The settings object from the node being copied.
|
|
1836
|
+
node_type: The type of the node being copied.
|
|
1837
|
+
"""
|
|
1376
1838
|
self.add_node_promise(new_node_settings)
|
|
1377
1839
|
|
|
1378
1840
|
if isinstance(existing_setting_input, input_schema.NodePromise):
|
|
@@ -1383,69 +1845,26 @@ class FlowGraph:
|
|
|
1383
1845
|
)
|
|
1384
1846
|
getattr(self, f"add_{node_type}")(combined_settings)
|
|
1385
1847
|
|
|
1848
|
+
def generate_code(self):
|
|
1849
|
+
"""Generates Polars code for the flow graph and prints it to stdout.
|
|
1850
|
+
This method exports the flow graph to a Polars-compatible format.
|
|
1851
|
+
"""
|
|
1852
|
+
from flowfile_core.flowfile.code_generator.code_generator import export_flow_to_polars
|
|
1853
|
+
print(export_flow_to_polars(self))
|
|
1386
1854
|
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1855
|
+
|
|
1856
|
+
def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
|
|
1857
|
+
"""Merges settings from an existing object with new settings from a NodePromise.
|
|
1858
|
+
|
|
1859
|
+
Typically used when copying a node to apply a new ID and position.
|
|
1390
1860
|
|
|
1391
1861
|
Args:
|
|
1392
|
-
|
|
1862
|
+
setting_input: The original settings object.
|
|
1863
|
+
new_settings: The NodePromise with new positional and ID data.
|
|
1393
1864
|
|
|
1394
1865
|
Returns:
|
|
1395
|
-
A new
|
|
1396
|
-
|
|
1397
|
-
Raises:
|
|
1398
|
-
ValueError: If any flow_ids overlap
|
|
1866
|
+
A new settings object with the merged properties.
|
|
1399
1867
|
"""
|
|
1400
|
-
# Validate flow IDs are unique
|
|
1401
|
-
_validate_unique_flow_ids(flow_graphs)
|
|
1402
|
-
|
|
1403
|
-
# Create ID mapping for all nodes
|
|
1404
|
-
node_id_mapping = _create_node_id_mapping(flow_graphs)
|
|
1405
|
-
|
|
1406
|
-
# Remap and combine nodes
|
|
1407
|
-
all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
|
|
1408
|
-
|
|
1409
|
-
# Create a new combined flow graph
|
|
1410
|
-
combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
|
|
1411
|
-
# return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
|
|
1415
|
-
"""Ensure all flow graphs have unique flow_ids."""
|
|
1416
|
-
all_flow_ids = [fg.flow_id for fg in flow_graphs]
|
|
1417
|
-
if len(all_flow_ids) != len(set(all_flow_ids)):
|
|
1418
|
-
raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
|
|
1422
|
-
"""Create a mapping from original node IDs to new unique node IDs."""
|
|
1423
|
-
node_id_mapping: Dict[int, Dict[int, int]] = {}
|
|
1424
|
-
next_node_id = 0
|
|
1425
|
-
|
|
1426
|
-
for fg in flow_graphs:
|
|
1427
|
-
node_id_mapping[fg.flow_id] = {}
|
|
1428
|
-
for node in fg.nodes:
|
|
1429
|
-
node_id_mapping[fg.flow_id][node.node_id] = next_node_id
|
|
1430
|
-
next_node_id += 1
|
|
1431
|
-
|
|
1432
|
-
return node_id_mapping
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
|
|
1436
|
-
node_id_mapping: Dict[int, Dict[int, int]]) -> List:
|
|
1437
|
-
"""Create new nodes with remapped IDs."""
|
|
1438
|
-
all_nodes = []
|
|
1439
|
-
for fg in flow_graphs:
|
|
1440
|
-
for node in fg.nodes:
|
|
1441
|
-
new_node = copy.deepcopy(node)
|
|
1442
|
-
new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
|
|
1443
|
-
all_nodes.append(new_node)
|
|
1444
|
-
return all_nodes
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
|
|
1448
|
-
"""Combine excopy_nodeisting settings with new settings from a NodePromise."""
|
|
1449
1868
|
copied_setting_input = deepcopy(setting_input)
|
|
1450
1869
|
|
|
1451
1870
|
# Update only attributes that exist on new_settings
|
|
@@ -1464,7 +1883,13 @@ def combine_existing_settings_and_new_settings(setting_input: Any, new_settings:
|
|
|
1464
1883
|
return copied_setting_input
|
|
1465
1884
|
|
|
1466
1885
|
|
|
1467
|
-
def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
|
|
1886
|
+
def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection) -> None:
|
|
1887
|
+
"""Adds a connection between two nodes in the flow graph.
|
|
1888
|
+
|
|
1889
|
+
Args:
|
|
1890
|
+
flow: The FlowGraph instance to modify.
|
|
1891
|
+
node_connection: An object defining the source and target of the connection.
|
|
1892
|
+
"""
|
|
1468
1893
|
logger.info('adding a connection')
|
|
1469
1894
|
from_node = flow.get_node(node_connection.output_connection.node_id)
|
|
1470
1895
|
to_node = flow.get_node(node_connection.input_connection.node_id)
|
|
@@ -1476,7 +1901,12 @@ def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection
|
|
|
1476
1901
|
|
|
1477
1902
|
|
|
1478
1903
|
def delete_connection(graph, node_connection: input_schema.NodeConnection):
|
|
1479
|
-
"""
|
|
1904
|
+
"""Deletes a connection between two nodes in the flow graph.
|
|
1905
|
+
|
|
1906
|
+
Args:
|
|
1907
|
+
graph: The FlowGraph instance to modify.
|
|
1908
|
+
node_connection: An object defining the connection to be removed.
|
|
1909
|
+
"""
|
|
1480
1910
|
from_node = graph.get_node(node_connection.output_connection.node_id)
|
|
1481
1911
|
to_node = graph.get_node(node_connection.input_connection.node_id)
|
|
1482
1912
|
connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
|
|
@@ -1492,6 +1922,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
|
|
|
1492
1922
|
to_node.delete_input_node(
|
|
1493
1923
|
node_connection.output_connection.node_id,
|
|
1494
1924
|
connection_type=node_connection.input_connection.connection_class,
|
|
1495
|
-
)
|
|
1496
|
-
|
|
1497
|
-
|
|
1925
|
+
)
|