Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
from typing import List, Union, Callable, Any, Optional, Generator, Literal
|
|
3
2
|
from flowfile_core.configs import logger
|
|
4
3
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
|
|
@@ -6,6 +5,7 @@ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEng
|
|
|
6
5
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
7
6
|
from flowfile_core.schemas import input_schema, schemas
|
|
8
7
|
from flowfile_core.configs.flow_logger import NodeLogger
|
|
8
|
+
from flowfile_core.configs.settings import SINGLE_FILE_MODE
|
|
9
9
|
|
|
10
10
|
from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
|
|
11
11
|
from flowfile_core.flowfile.utils import get_hash
|
|
@@ -13,13 +13,19 @@ from flowfile_core.configs.node_store import nodes as node_interface
|
|
|
13
13
|
from flowfile_core.flowfile.setting_generator import setting_generator, setting_updator
|
|
14
14
|
from time import sleep
|
|
15
15
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import (
|
|
16
|
-
ExternalDfFetcher, ExternalSampler, results_exists, get_external_df_result,
|
|
16
|
+
ExternalDfFetcher, ExternalSampler, results_exists, get_external_df_result,
|
|
17
|
+
ExternalDatabaseFetcher, ExternalDatabaseWriter, ExternalCloudWriter)
|
|
17
18
|
from flowfile_core.flowfile.flow_node.models import (NodeStepSettings, NodeStepInputs, NodeSchemaInformation,
|
|
18
19
|
NodeStepStats, NodeResults)
|
|
19
20
|
from flowfile_core.flowfile.flow_node.schema_callback import SingleExecutionFuture
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class FlowNode:
|
|
24
|
+
"""Represents a single node in a data flow graph.
|
|
25
|
+
|
|
26
|
+
This class manages the node's state, its data processing function,
|
|
27
|
+
and its connections to other nodes within the graph.
|
|
28
|
+
"""
|
|
23
29
|
parent_uuid: str
|
|
24
30
|
node_type: str
|
|
25
31
|
node_template: node_interface.NodeTemplate
|
|
@@ -31,15 +37,66 @@ class FlowNode:
|
|
|
31
37
|
results: NodeResults
|
|
32
38
|
node_information: Optional[schemas.NodeInformation] = None
|
|
33
39
|
leads_to_nodes: List["FlowNode"] = [] # list with target flows, after execution the step will trigger those step(s)
|
|
40
|
+
user_provided_schema_callback: Optional[Callable] = None # user provided callback function for schema calculation
|
|
34
41
|
_setting_input: Any = None
|
|
35
42
|
_hash: Optional[str] = None # host this for caching results
|
|
36
43
|
_function: Callable = None # the function that needs to be executed when triggered
|
|
44
|
+
_name: str = None # name of the node, used for display
|
|
37
45
|
_schema_callback: Optional[SingleExecutionFuture] = None # Function that calculates the schema without executing
|
|
38
46
|
_state_needs_reset: bool = False
|
|
39
|
-
_fetch_cached_df: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter] = None
|
|
40
|
-
_cache_progress: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter] = None
|
|
47
|
+
_fetch_cached_df: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter | ExternalCloudWriter] = None
|
|
48
|
+
_cache_progress: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter | ExternalCloudWriter] = None
|
|
49
|
+
|
|
50
|
+
def __init__(self, node_id: Union[str, int], function: Callable,
|
|
51
|
+
parent_uuid: str,
|
|
52
|
+
setting_input: Any,
|
|
53
|
+
name: str,
|
|
54
|
+
node_type: str,
|
|
55
|
+
input_columns: List[str] = None,
|
|
56
|
+
output_schema: List[FlowfileColumn] = None,
|
|
57
|
+
drop_columns: List[str] = None,
|
|
58
|
+
renew_schema: bool = True,
|
|
59
|
+
pos_x: float = 0,
|
|
60
|
+
pos_y: float = 0,
|
|
61
|
+
schema_callback: Callable = None,
|
|
62
|
+
):
|
|
63
|
+
"""Initializes a FlowNode instance.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
node_id: Unique identifier for the node.
|
|
67
|
+
function: The core data processing function for the node.
|
|
68
|
+
parent_uuid: The UUID of the parent flow.
|
|
69
|
+
setting_input: The configuration/settings object for the node.
|
|
70
|
+
name: The name of the node.
|
|
71
|
+
node_type: The type identifier of the node (e.g., 'join', 'filter').
|
|
72
|
+
input_columns: List of column names expected as input.
|
|
73
|
+
output_schema: The schema of the columns to be added.
|
|
74
|
+
drop_columns: List of column names to be dropped.
|
|
75
|
+
renew_schema: Flag to indicate if the schema should be renewed.
|
|
76
|
+
pos_x: The x-coordinate on the canvas.
|
|
77
|
+
pos_y: The y-coordinate on the canvas.
|
|
78
|
+
schema_callback: A custom function to calculate the output schema.
|
|
79
|
+
"""
|
|
80
|
+
self._name = None
|
|
81
|
+
self.parent_uuid = parent_uuid
|
|
82
|
+
self.post_init()
|
|
83
|
+
self.active = True
|
|
84
|
+
self.node_information.id = node_id
|
|
85
|
+
self.node_type = node_type
|
|
86
|
+
self.node_settings.renew_schema = renew_schema
|
|
87
|
+
self.update_node(function=function,
|
|
88
|
+
input_columns=input_columns,
|
|
89
|
+
output_schema=output_schema,
|
|
90
|
+
drop_columns=drop_columns,
|
|
91
|
+
setting_input=setting_input,
|
|
92
|
+
name=name,
|
|
93
|
+
pos_x=pos_x,
|
|
94
|
+
pos_y=pos_y,
|
|
95
|
+
schema_callback=schema_callback,
|
|
96
|
+
)
|
|
41
97
|
|
|
42
98
|
def post_init(self):
|
|
99
|
+
"""Initializes or resets the node's attributes to their default states."""
|
|
43
100
|
self.node_inputs = NodeStepInputs()
|
|
44
101
|
self.node_stats = NodeStepStats()
|
|
45
102
|
self.node_settings = NodeStepSettings()
|
|
@@ -53,19 +110,65 @@ class FlowNode:
|
|
|
53
110
|
self._state_needs_reset = False
|
|
54
111
|
|
|
55
112
|
@property
|
|
56
|
-
def state_needs_reset(self):
|
|
113
|
+
def state_needs_reset(self) -> bool:
|
|
114
|
+
"""Checks if the node's state needs to be reset.
|
|
115
|
+
|
|
116
|
+
Returns:
|
|
117
|
+
True if a reset is required, False otherwise.
|
|
118
|
+
"""
|
|
57
119
|
return self._state_needs_reset
|
|
58
120
|
|
|
59
121
|
@state_needs_reset.setter
|
|
60
122
|
def state_needs_reset(self, v: bool):
|
|
123
|
+
"""Sets the flag indicating that the node's state needs to be reset.
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
v: The boolean value to set.
|
|
127
|
+
"""
|
|
61
128
|
self._state_needs_reset = v
|
|
62
129
|
|
|
130
|
+
@staticmethod
|
|
131
|
+
def create_schema_callback_from_function(f: Callable) -> Callable[[], List[FlowfileColumn]]:
|
|
132
|
+
"""Wraps a node's function to create a schema callback that extracts the schema.
|
|
133
|
+
|
|
134
|
+
Args:
|
|
135
|
+
f: The node's core function that returns a FlowDataEngine instance.
|
|
136
|
+
|
|
137
|
+
Returns:
|
|
138
|
+
A callable that, when executed, returns the output schema.
|
|
139
|
+
"""
|
|
140
|
+
def schema_callback() -> List[FlowfileColumn]:
|
|
141
|
+
try:
|
|
142
|
+
logger.info('Executing the schema callback function based on the node function')
|
|
143
|
+
return f().schema
|
|
144
|
+
except Exception as e:
|
|
145
|
+
logger.warning(f'Error with the schema callback: {e}')
|
|
146
|
+
return []
|
|
147
|
+
return schema_callback
|
|
148
|
+
|
|
63
149
|
@property
|
|
64
|
-
def schema_callback(self):
|
|
150
|
+
def schema_callback(self) -> SingleExecutionFuture:
|
|
151
|
+
"""Gets the schema callback function, creating one if it doesn't exist.
|
|
152
|
+
|
|
153
|
+
The callback is used for predicting the output schema without full execution.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
A SingleExecutionFuture instance wrapping the schema function.
|
|
157
|
+
"""
|
|
158
|
+
if self._schema_callback is None:
|
|
159
|
+
if self.user_provided_schema_callback is not None:
|
|
160
|
+
self.schema_callback = self.user_provided_schema_callback
|
|
161
|
+
elif self.is_start:
|
|
162
|
+
self.schema_callback = self.create_schema_callback_from_function(self._function)
|
|
65
163
|
return self._schema_callback
|
|
66
164
|
|
|
67
165
|
@schema_callback.setter
|
|
68
166
|
def schema_callback(self, f: Callable):
|
|
167
|
+
"""Sets the schema callback function for the node.
|
|
168
|
+
|
|
169
|
+
Args:
|
|
170
|
+
f: The function to be used for schema calculation.
|
|
171
|
+
"""
|
|
69
172
|
if f is None:
|
|
70
173
|
return
|
|
71
174
|
|
|
@@ -76,13 +179,27 @@ class FlowNode:
|
|
|
76
179
|
return []
|
|
77
180
|
|
|
78
181
|
self._schema_callback = SingleExecutionFuture(f, error_callback)
|
|
79
|
-
self._schema_callback.start()
|
|
80
182
|
|
|
81
183
|
@property
|
|
82
184
|
def is_start(self) -> bool:
|
|
185
|
+
"""Determines if the node is a starting node in the flow.
|
|
186
|
+
|
|
187
|
+
A starting node requires no inputs.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
True if the node is a start node, False otherwise.
|
|
191
|
+
"""
|
|
83
192
|
return not self.has_input and self.node_template.input == 0
|
|
84
193
|
|
|
85
194
|
def get_input_type(self, node_id: int) -> List:
|
|
195
|
+
"""Gets the type of connection ('main', 'left', 'right') for a given input node ID.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
node_id: The ID of the input node.
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
A list of connection types for that node ID.
|
|
202
|
+
"""
|
|
86
203
|
relation_type = []
|
|
87
204
|
if node_id in [n.node_id for n in self.node_inputs.main_inputs]:
|
|
88
205
|
relation_type.append('main')
|
|
@@ -92,36 +209,6 @@ class FlowNode:
|
|
|
92
209
|
relation_type.append('right')
|
|
93
210
|
return list(set(relation_type))
|
|
94
211
|
|
|
95
|
-
def __init__(self, node_id: Union[str, int], function: Callable,
|
|
96
|
-
parent_uuid: str,
|
|
97
|
-
setting_input: Any,
|
|
98
|
-
name: str,
|
|
99
|
-
node_type: str,
|
|
100
|
-
input_columns: List[str] = None,
|
|
101
|
-
output_schema: List[FlowfileColumn] = None,
|
|
102
|
-
drop_columns: List[str] = None,
|
|
103
|
-
renew_schema: bool = True,
|
|
104
|
-
pos_x: float = 0,
|
|
105
|
-
pos_y: float = 0,
|
|
106
|
-
schema_callback: Callable = None,
|
|
107
|
-
):
|
|
108
|
-
self.parent_uuid = parent_uuid
|
|
109
|
-
self.post_init()
|
|
110
|
-
self.active = True
|
|
111
|
-
self.node_information.id = node_id
|
|
112
|
-
self.node_type = node_type
|
|
113
|
-
self.node_settings.renew_schema = renew_schema
|
|
114
|
-
self.update_node(function=function,
|
|
115
|
-
input_columns=input_columns,
|
|
116
|
-
output_schema=output_schema,
|
|
117
|
-
drop_columns=drop_columns,
|
|
118
|
-
setting_input=setting_input,
|
|
119
|
-
name=name,
|
|
120
|
-
pos_x=pos_x,
|
|
121
|
-
pos_y=pos_y,
|
|
122
|
-
schema_callback=schema_callback,
|
|
123
|
-
)
|
|
124
|
-
|
|
125
212
|
def update_node(self,
|
|
126
213
|
function: Callable,
|
|
127
214
|
input_columns: List[str] = None,
|
|
@@ -133,13 +220,28 @@ class FlowNode:
|
|
|
133
220
|
pos_y: float = 0,
|
|
134
221
|
schema_callback: Callable = None,
|
|
135
222
|
):
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
223
|
+
"""Updates the properties of the node.
|
|
224
|
+
|
|
225
|
+
This is called during initialization and when settings are changed.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
function: The new core data processing function.
|
|
229
|
+
input_columns: The new list of input columns.
|
|
230
|
+
output_schema: The new schema of added columns.
|
|
231
|
+
drop_columns: The new list of dropped columns.
|
|
232
|
+
name: The new name for the node.
|
|
233
|
+
setting_input: The new settings object.
|
|
234
|
+
pos_x: The new x-coordinate.
|
|
235
|
+
pos_y: The new y-coordinate.
|
|
236
|
+
schema_callback: The new custom schema callback function.
|
|
237
|
+
"""
|
|
238
|
+
self.user_provided_schema_callback = schema_callback
|
|
239
|
+
self.node_information.y_position = int(pos_y)
|
|
240
|
+
self.node_information.x_position = int(pos_x)
|
|
140
241
|
self.node_information.setting_input = setting_input
|
|
141
242
|
self.name = self.node_type if name is None else name
|
|
142
243
|
self._function = function
|
|
244
|
+
|
|
143
245
|
self.node_schema.input_columns = [] if input_columns is None else input_columns
|
|
144
246
|
self.node_schema.output_columns = [] if output_schema is None else output_schema
|
|
145
247
|
self.node_schema.drop_columns = [] if drop_columns is None else drop_columns
|
|
@@ -147,7 +249,6 @@ class FlowNode:
|
|
|
147
249
|
if hasattr(setting_input, 'cache_results'):
|
|
148
250
|
self.node_settings.cache_results = setting_input.cache_results
|
|
149
251
|
|
|
150
|
-
self.setting_input = setting_input
|
|
151
252
|
self.results.errors = None
|
|
152
253
|
self.add_lead_to_in_depend_source()
|
|
153
254
|
_ = self.hash
|
|
@@ -155,51 +256,102 @@ class FlowNode:
|
|
|
155
256
|
if self.node_template is None:
|
|
156
257
|
raise Exception(f'Node template {self.node_type} not found')
|
|
157
258
|
self.node_default = node_interface.node_defaults.get(self.node_type)
|
|
259
|
+
self.setting_input = setting_input # wait until the end so that the hash is calculated correctly
|
|
158
260
|
|
|
159
261
|
@property
|
|
160
|
-
def name(self):
|
|
262
|
+
def name(self) -> str:
|
|
263
|
+
"""Gets the name of the node.
|
|
264
|
+
|
|
265
|
+
Returns:
|
|
266
|
+
The node's name.
|
|
267
|
+
"""
|
|
161
268
|
return self._name
|
|
162
269
|
|
|
163
270
|
@name.setter
|
|
164
271
|
def name(self, name: str):
|
|
272
|
+
"""Sets the name of the node.
|
|
273
|
+
|
|
274
|
+
Args:
|
|
275
|
+
name: The new name.
|
|
276
|
+
"""
|
|
165
277
|
self._name = name
|
|
166
278
|
self.__name__ = name
|
|
167
279
|
|
|
168
280
|
@property
|
|
169
|
-
def setting_input(self):
|
|
281
|
+
def setting_input(self) -> Any:
|
|
282
|
+
"""Gets the node's specific configuration settings.
|
|
283
|
+
|
|
284
|
+
Returns:
|
|
285
|
+
The settings object.
|
|
286
|
+
"""
|
|
170
287
|
return self._setting_input
|
|
171
288
|
|
|
172
289
|
@setting_input.setter
|
|
173
290
|
def setting_input(self, setting_input: Any):
|
|
291
|
+
"""Sets the node's configuration and triggers a reset if necessary.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
setting_input: The new settings object.
|
|
295
|
+
"""
|
|
296
|
+
is_manual_input = (self.node_type == 'manual_input' and
|
|
297
|
+
isinstance(setting_input, input_schema.NodeManualInput) and
|
|
298
|
+
isinstance(self._setting_input, input_schema.NodeManualInput)
|
|
299
|
+
)
|
|
300
|
+
if is_manual_input:
|
|
301
|
+
_ = self.hash
|
|
174
302
|
self._setting_input = setting_input
|
|
175
303
|
self.set_node_information()
|
|
176
|
-
if
|
|
177
|
-
if self.hash != self.calculate_hash(setting_input) or not self.node_stats.
|
|
178
|
-
self.function =
|
|
304
|
+
if is_manual_input:
|
|
305
|
+
if self.hash != self.calculate_hash(setting_input) or not self.node_stats.has_run_with_current_setup:
|
|
306
|
+
self.function = FlowDataEngine(setting_input.raw_data_format)
|
|
179
307
|
self.reset()
|
|
180
308
|
self.get_predicted_schema()
|
|
181
309
|
elif self._setting_input is not None:
|
|
182
310
|
self.reset()
|
|
183
311
|
|
|
184
312
|
@property
|
|
185
|
-
def node_id(self):
|
|
313
|
+
def node_id(self) -> Union[str, int]:
|
|
314
|
+
"""Gets the unique identifier of the node.
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
The node's ID.
|
|
318
|
+
"""
|
|
186
319
|
return self.node_information.id
|
|
187
320
|
|
|
188
321
|
@property
|
|
189
|
-
def left_input(self):
|
|
322
|
+
def left_input(self) -> Optional["FlowNode"]:
|
|
323
|
+
"""Gets the node connected to the left input port.
|
|
324
|
+
|
|
325
|
+
Returns:
|
|
326
|
+
The left input FlowNode, or None.
|
|
327
|
+
"""
|
|
190
328
|
return self.node_inputs.left_input
|
|
191
329
|
|
|
192
330
|
@property
|
|
193
|
-
def right_input(self):
|
|
331
|
+
def right_input(self) -> Optional["FlowNode"]:
|
|
332
|
+
"""Gets the node connected to the right input port.
|
|
333
|
+
|
|
334
|
+
Returns:
|
|
335
|
+
The right input FlowNode, or None.
|
|
336
|
+
"""
|
|
194
337
|
return self.node_inputs.right_input
|
|
195
338
|
|
|
196
339
|
@property
|
|
197
340
|
def main_input(self) -> List["FlowNode"]:
|
|
341
|
+
"""Gets the list of nodes connected to the main input port(s).
|
|
342
|
+
|
|
343
|
+
Returns:
|
|
344
|
+
A list of main input FlowNodes.
|
|
345
|
+
"""
|
|
198
346
|
return self.node_inputs.main_inputs
|
|
199
347
|
|
|
200
348
|
@property
|
|
201
|
-
def is_correct(self):
|
|
202
|
-
|
|
349
|
+
def is_correct(self) -> bool:
|
|
350
|
+
"""Checks if the node's input connections satisfy its template requirements.
|
|
351
|
+
|
|
352
|
+
Returns:
|
|
353
|
+
True if connections are valid, False otherwise.
|
|
354
|
+
"""
|
|
203
355
|
if isinstance(self.setting_input, input_schema.NodePromise):
|
|
204
356
|
return False
|
|
205
357
|
return (self.node_template.input == len(self.node_inputs.get_all_inputs()) or
|
|
@@ -207,6 +359,10 @@ class FlowNode:
|
|
|
207
359
|
(self.node_template.multi and self.node_template.can_be_start))
|
|
208
360
|
|
|
209
361
|
def set_node_information(self):
|
|
362
|
+
"""Populates the `node_information` attribute with the current state.
|
|
363
|
+
|
|
364
|
+
This includes the node's connections, settings, and position.
|
|
365
|
+
"""
|
|
210
366
|
logger.info('setting node information')
|
|
211
367
|
node_information = self.node_information
|
|
212
368
|
node_information.left_input_id = self.node_inputs.left_input.node_id if self.left_input else None
|
|
@@ -221,43 +377,76 @@ class FlowNode:
|
|
|
221
377
|
node_information.type = self.node_type
|
|
222
378
|
|
|
223
379
|
def get_node_information(self) -> schemas.NodeInformation:
|
|
380
|
+
"""Updates and returns the node's information object.
|
|
381
|
+
|
|
382
|
+
Returns:
|
|
383
|
+
The `NodeInformation` object for this node.
|
|
384
|
+
"""
|
|
224
385
|
self.set_node_information()
|
|
225
386
|
return self.node_information
|
|
226
387
|
|
|
227
388
|
@property
|
|
228
|
-
def function(self):
|
|
389
|
+
def function(self) -> Callable:
|
|
390
|
+
"""Gets the core processing function of the node.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
The callable function.
|
|
394
|
+
"""
|
|
229
395
|
return self._function
|
|
230
396
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
397
|
+
@function.setter
|
|
398
|
+
def function(self, function: Callable):
|
|
399
|
+
"""Sets the core processing function of the node.
|
|
400
|
+
|
|
401
|
+
Args:
|
|
402
|
+
function: The new callable function.
|
|
403
|
+
"""
|
|
404
|
+
self._function = function
|
|
239
405
|
|
|
240
406
|
@property
|
|
241
407
|
def all_inputs(self) -> List["FlowNode"]:
|
|
408
|
+
"""Gets a list of all nodes connected to any input port.
|
|
409
|
+
|
|
410
|
+
Returns:
|
|
411
|
+
A list of all input FlowNodes.
|
|
412
|
+
"""
|
|
242
413
|
return self.node_inputs.get_all_inputs()
|
|
243
414
|
|
|
244
|
-
def calculate_hash(self, setting_input: Any):
|
|
415
|
+
def calculate_hash(self, setting_input: Any) -> str:
|
|
416
|
+
"""Calculates a hash based on settings and input node hashes.
|
|
417
|
+
|
|
418
|
+
Args:
|
|
419
|
+
setting_input: The node's settings object to be included in the hash.
|
|
420
|
+
|
|
421
|
+
Returns:
|
|
422
|
+
A string hash value.
|
|
423
|
+
"""
|
|
245
424
|
depends_on_hashes = [_node.hash for _node in self.all_inputs]
|
|
246
425
|
node_data_hash = get_hash(setting_input)
|
|
247
426
|
return get_hash(depends_on_hashes + [node_data_hash, self.parent_uuid])
|
|
248
427
|
|
|
249
428
|
@property
|
|
250
|
-
def hash(self):
|
|
429
|
+
def hash(self) -> str:
|
|
430
|
+
"""Gets the cached hash for the node, calculating it if it doesn't exist.
|
|
431
|
+
|
|
432
|
+
Returns:
|
|
433
|
+
The string hash value.
|
|
434
|
+
"""
|
|
251
435
|
if not self._hash:
|
|
252
436
|
self._hash = self.calculate_hash(self.setting_input)
|
|
253
437
|
return self._hash
|
|
254
438
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
439
|
+
def add_node_connection(self, from_node: "FlowNode",
|
|
440
|
+
insert_type: Literal['main', 'left', 'right'] = 'main') -> None:
|
|
441
|
+
"""Adds a connection from a source node to this node.
|
|
442
|
+
|
|
443
|
+
Args:
|
|
444
|
+
from_node: The node to connect from.
|
|
445
|
+
insert_type: The type of input to connect to ('main', 'left', 'right').
|
|
259
446
|
|
|
260
|
-
|
|
447
|
+
Raises:
|
|
448
|
+
Exception: If the insert_type is invalid.
|
|
449
|
+
"""
|
|
261
450
|
from_node.leads_to_nodes.append(self)
|
|
262
451
|
if insert_type == 'main':
|
|
263
452
|
if self.node_template.input <= 2 or self.node_inputs.main_inputs is None:
|
|
@@ -276,22 +465,41 @@ class FlowNode:
|
|
|
276
465
|
self.reset()
|
|
277
466
|
from_node.reset()
|
|
278
467
|
|
|
279
|
-
def evaluate_nodes(self, deep: bool = False):
|
|
468
|
+
def evaluate_nodes(self, deep: bool = False) -> None:
|
|
469
|
+
"""Triggers a state reset for all directly connected downstream nodes.
|
|
470
|
+
|
|
471
|
+
Args:
|
|
472
|
+
deep: If True, the reset propagates recursively through the entire downstream graph.
|
|
473
|
+
"""
|
|
280
474
|
for node in self.leads_to_nodes:
|
|
281
475
|
self.print(f'resetting node: {node.node_id}')
|
|
282
476
|
node.reset(deep)
|
|
283
477
|
|
|
284
|
-
def get_flow_file_column_schema(self, col_name: str) -> FlowfileColumn:
|
|
478
|
+
def get_flow_file_column_schema(self, col_name: str) -> FlowfileColumn | None:
|
|
479
|
+
"""Retrieves the schema for a specific column from the output schema.
|
|
480
|
+
|
|
481
|
+
Args:
|
|
482
|
+
col_name: The name of the column.
|
|
483
|
+
|
|
484
|
+
Returns:
|
|
485
|
+
The FlowfileColumn object for that column, or None if not found.
|
|
486
|
+
"""
|
|
285
487
|
for s in self.schema:
|
|
286
488
|
if s.column_name == col_name:
|
|
287
489
|
return s
|
|
288
490
|
|
|
289
|
-
def get_predicted_schema(self, force: bool = False):
|
|
290
|
-
"""
|
|
291
|
-
|
|
292
|
-
|
|
491
|
+
def get_predicted_schema(self, force: bool = False) -> List[FlowfileColumn] | None:
|
|
492
|
+
"""Predicts the output schema of the node without full execution.
|
|
493
|
+
|
|
494
|
+
It uses the schema_callback or infers from predicted data.
|
|
495
|
+
|
|
496
|
+
Args:
|
|
497
|
+
force: If True, forces recalculation even if a predicted schema exists.
|
|
498
|
+
|
|
499
|
+
Returns:
|
|
500
|
+
A list of FlowfileColumn objects representing the predicted schema.
|
|
293
501
|
"""
|
|
294
|
-
if self.node_schema.predicted_schema
|
|
502
|
+
if self.node_schema.predicted_schema and not force:
|
|
295
503
|
return self.node_schema.predicted_schema
|
|
296
504
|
if self.schema_callback is not None and (self.node_schema.predicted_schema is None or force):
|
|
297
505
|
self.print('Getting the data from a schema callback')
|
|
@@ -299,7 +507,7 @@ class FlowNode:
|
|
|
299
507
|
# Force the schema callback to reset, so that it will be executed again
|
|
300
508
|
self.schema_callback.reset()
|
|
301
509
|
schema = self.schema_callback()
|
|
302
|
-
if schema is not None:
|
|
510
|
+
if schema is not None and len(schema) > 0:
|
|
303
511
|
self.print('Calculating the schema based on the schema callback')
|
|
304
512
|
self.node_schema.predicted_schema = schema
|
|
305
513
|
return self.node_schema.predicted_schema
|
|
@@ -311,6 +519,11 @@ class FlowNode:
|
|
|
311
519
|
|
|
312
520
|
@property
|
|
313
521
|
def is_setup(self) -> bool:
|
|
522
|
+
"""Checks if the node has been properly configured and is ready for execution.
|
|
523
|
+
|
|
524
|
+
Returns:
|
|
525
|
+
True if the node is set up, False otherwise.
|
|
526
|
+
"""
|
|
314
527
|
if not self.node_information.is_setup:
|
|
315
528
|
if self.function.__name__ != 'placeholder':
|
|
316
529
|
self.node_information.is_setup = True
|
|
@@ -318,16 +531,31 @@ class FlowNode:
|
|
|
318
531
|
return self.node_information.is_setup
|
|
319
532
|
|
|
320
533
|
def print(self, v: Any):
|
|
534
|
+
"""Helper method to log messages with node context.
|
|
535
|
+
|
|
536
|
+
Args:
|
|
537
|
+
v: The message or value to log.
|
|
538
|
+
"""
|
|
321
539
|
logger.info(f'{self.node_type}, node_id: {self.node_id}: {v}')
|
|
322
540
|
|
|
323
|
-
def get_resulting_data(self) -> FlowDataEngine:
|
|
541
|
+
def get_resulting_data(self) -> FlowDataEngine | None:
|
|
542
|
+
"""Executes the node's function to produce the actual output data.
|
|
543
|
+
|
|
544
|
+
Handles both regular functions and external data sources.
|
|
545
|
+
|
|
546
|
+
Returns:
|
|
547
|
+
A FlowDataEngine instance containing the result, or None on error.
|
|
548
|
+
|
|
549
|
+
Raises:
|
|
550
|
+
Exception: Propagates exceptions from the node's function execution.
|
|
551
|
+
"""
|
|
324
552
|
if self.is_setup:
|
|
325
553
|
if self.results.resulting_data is None and self.results.errors is None:
|
|
326
554
|
self.print('getting resulting data')
|
|
327
555
|
try:
|
|
328
556
|
if isinstance(self.function, FlowDataEngine):
|
|
329
557
|
fl: FlowDataEngine = self.function
|
|
330
|
-
elif self.node_type
|
|
558
|
+
elif self.node_type == 'external_source':
|
|
331
559
|
fl: FlowDataEngine = self.function()
|
|
332
560
|
fl.collect_external()
|
|
333
561
|
self.node_settings.streamable = False
|
|
@@ -342,11 +570,19 @@ class FlowNode:
|
|
|
342
570
|
except Exception as e:
|
|
343
571
|
self.results.resulting_data = FlowDataEngine()
|
|
344
572
|
self.results.errors = str(e)
|
|
345
|
-
self.node_stats.
|
|
573
|
+
self.node_stats.has_run_with_current_setup = False
|
|
574
|
+
self.node_stats.has_completed_last_run = False
|
|
346
575
|
raise e
|
|
347
576
|
return self.results.resulting_data
|
|
348
577
|
|
|
349
|
-
def _predicted_data_getter(self) -> FlowDataEngine|None:
|
|
578
|
+
def _predicted_data_getter(self) -> FlowDataEngine | None:
|
|
579
|
+
"""Internal helper to get a predicted data result.
|
|
580
|
+
|
|
581
|
+
This calls the function with predicted data from input nodes.
|
|
582
|
+
|
|
583
|
+
Returns:
|
|
584
|
+
A FlowDataEngine instance with predicted data, or an empty one on error.
|
|
585
|
+
"""
|
|
350
586
|
try:
|
|
351
587
|
fl = self._function(*[v.get_predicted_resulting_data() for v in self.all_inputs])
|
|
352
588
|
return fl
|
|
@@ -363,8 +599,16 @@ class FlowNode:
|
|
|
363
599
|
logger.warning(e)
|
|
364
600
|
|
|
365
601
|
def get_predicted_resulting_data(self) -> FlowDataEngine:
|
|
602
|
+
"""Creates a `FlowDataEngine` instance based on the predicted schema.
|
|
603
|
+
|
|
604
|
+
This avoids executing the node's full logic.
|
|
605
|
+
|
|
606
|
+
Returns:
|
|
607
|
+
A FlowDataEngine instance with a schema but no data.
|
|
608
|
+
"""
|
|
366
609
|
if self.needs_run(False) and self.schema_callback is not None or self.node_schema.result_schema is not None:
|
|
367
610
|
self.print('Getting data based on the schema')
|
|
611
|
+
|
|
368
612
|
_s = self.schema_callback() if self.node_schema.result_schema is None else self.node_schema.result_schema
|
|
369
613
|
return FlowDataEngine.create_from_schema(_s)
|
|
370
614
|
else:
|
|
@@ -375,17 +619,28 @@ class FlowNode:
|
|
|
375
619
|
return fl
|
|
376
620
|
|
|
377
621
|
def add_lead_to_in_depend_source(self):
|
|
622
|
+
"""Ensures this node is registered in the `leads_to_nodes` list of its inputs."""
|
|
378
623
|
for input_node in self.all_inputs:
|
|
379
624
|
if self.node_id not in [n.node_id for n in input_node.leads_to_nodes]:
|
|
380
625
|
input_node.leads_to_nodes.append(self)
|
|
381
626
|
|
|
382
627
|
def get_all_dependent_nodes(self) -> Generator["FlowNode", None, None]:
|
|
628
|
+
"""Yields all downstream nodes recursively.
|
|
629
|
+
|
|
630
|
+
Returns:
|
|
631
|
+
A generator of all dependent FlowNode objects.
|
|
632
|
+
"""
|
|
383
633
|
for node in self.leads_to_nodes:
|
|
384
634
|
yield node
|
|
385
635
|
for n in node.get_all_dependent_nodes():
|
|
386
636
|
yield n
|
|
387
637
|
|
|
388
638
|
def get_all_dependent_node_ids(self) -> Generator[int, None, None]:
|
|
639
|
+
"""Yields the IDs of all downstream nodes recursively.
|
|
640
|
+
|
|
641
|
+
Returns:
|
|
642
|
+
A generator of all dependent node IDs.
|
|
643
|
+
"""
|
|
389
644
|
for node in self.leads_to_nodes:
|
|
390
645
|
yield node.node_id
|
|
391
646
|
for n in node.get_all_dependent_node_ids():
|
|
@@ -393,6 +648,13 @@ class FlowNode:
|
|
|
393
648
|
|
|
394
649
|
@property
|
|
395
650
|
def schema(self) -> List[FlowfileColumn]:
|
|
651
|
+
"""Gets the definitive output schema of the node.
|
|
652
|
+
|
|
653
|
+
If not already run, it falls back to the predicted schema.
|
|
654
|
+
|
|
655
|
+
Returns:
|
|
656
|
+
A list of FlowfileColumn objects.
|
|
657
|
+
"""
|
|
396
658
|
try:
|
|
397
659
|
if self.is_setup and self.results.errors is None:
|
|
398
660
|
if self.node_schema.result_schema is not None and len(self.node_schema.result_schema) > 0:
|
|
@@ -405,31 +667,42 @@ class FlowNode:
|
|
|
405
667
|
return self.node_schema.result_schema
|
|
406
668
|
else:
|
|
407
669
|
return []
|
|
408
|
-
except:
|
|
670
|
+
except Exception as e:
|
|
671
|
+
logger.error(e)
|
|
409
672
|
return []
|
|
410
673
|
|
|
411
|
-
def load_from_cache(self) -> FlowDataEngine:
|
|
412
|
-
if results_exists(self.hash):
|
|
413
|
-
try:
|
|
414
|
-
return FlowDataEngine(self._fetch_cached_df.get_result())
|
|
415
|
-
except Exception as e:
|
|
416
|
-
logger.error(e)
|
|
417
|
-
|
|
418
674
|
def remove_cache(self):
|
|
675
|
+
"""Removes cached results for this node.
|
|
676
|
+
|
|
677
|
+
Note: Currently not fully implemented.
|
|
678
|
+
"""
|
|
679
|
+
|
|
419
680
|
if results_exists(self.hash):
|
|
420
681
|
logger.warning('Not implemented')
|
|
421
682
|
|
|
422
683
|
def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
|
|
423
684
|
execution_location: schemas.ExecutionLocationsLiteral = "auto") -> bool:
|
|
424
|
-
if
|
|
685
|
+
"""Determines if the node needs to be executed.
|
|
686
|
+
|
|
687
|
+
The decision is based on its run state, caching settings, and execution mode.
|
|
688
|
+
|
|
689
|
+
Args:
|
|
690
|
+
performance_mode: True if the flow is in performance mode.
|
|
691
|
+
node_logger: The logger instance for this node.
|
|
692
|
+
execution_location: The target execution location.
|
|
693
|
+
|
|
694
|
+
Returns:
|
|
695
|
+
True if the node should be run, False otherwise.
|
|
696
|
+
"""
|
|
697
|
+
if execution_location == "local" or SINGLE_FILE_MODE:
|
|
425
698
|
return False
|
|
699
|
+
|
|
426
700
|
flow_logger = logger if node_logger is None else node_logger
|
|
427
701
|
cache_result_exists = results_exists(self.hash)
|
|
428
|
-
if not self.node_stats.
|
|
702
|
+
if not self.node_stats.has_run_with_current_setup:
|
|
429
703
|
flow_logger.info('Node has not run, needs to run')
|
|
430
704
|
return True
|
|
431
705
|
if self.node_settings.cache_results and cache_result_exists:
|
|
432
|
-
|
|
433
706
|
return False
|
|
434
707
|
elif self.node_settings.cache_results and not cache_result_exists:
|
|
435
708
|
return True
|
|
@@ -439,9 +712,34 @@ class FlowNode:
|
|
|
439
712
|
return True
|
|
440
713
|
|
|
441
714
|
def __call__(self, *args, **kwargs):
|
|
715
|
+
"""Makes the node instance callable, acting as an alias for execute_node."""
|
|
442
716
|
self.execute_node(*args, **kwargs)
|
|
443
717
|
|
|
718
|
+
def execute_full_local(self, performance_mode: bool = False) -> None:
|
|
719
|
+
"""Executes the node's logic locally, including example data generation.
|
|
720
|
+
|
|
721
|
+
Args:
|
|
722
|
+
performance_mode: If True, skips generating example data.
|
|
723
|
+
|
|
724
|
+
Raises:
|
|
725
|
+
Exception: Propagates exceptions from the execution.
|
|
726
|
+
"""
|
|
727
|
+
if self.results.resulting_data is None and not performance_mode:
|
|
728
|
+
self.results.resulting_data = self.get_resulting_data()
|
|
729
|
+
self.results.example_data_generator = lambda: self.get_resulting_data().get_sample(100).to_arrow()
|
|
730
|
+
self.node_schema.result_schema = self.results.resulting_data.schema
|
|
731
|
+
self.node_stats.has_completed_last_run = True
|
|
732
|
+
|
|
444
733
|
def execute_local(self, flow_id: int, performance_mode: bool = False):
|
|
734
|
+
"""Executes the node's logic locally.
|
|
735
|
+
|
|
736
|
+
Args:
|
|
737
|
+
flow_id: The ID of the parent flow.
|
|
738
|
+
performance_mode: If True, skips generating example data.
|
|
739
|
+
|
|
740
|
+
Raises:
|
|
741
|
+
Exception: Propagates exceptions from the execution.
|
|
742
|
+
"""
|
|
445
743
|
try:
|
|
446
744
|
resulting_data = self.get_resulting_data()
|
|
447
745
|
if not performance_mode:
|
|
@@ -449,23 +747,32 @@ class FlowNode:
|
|
|
449
747
|
wait_on_completion=True, node_id=self.node_id, flow_id=flow_id)
|
|
450
748
|
self.store_example_data_generator(external_sampler)
|
|
451
749
|
if self.results.errors is None and not self.node_stats.is_canceled:
|
|
452
|
-
self.node_stats.
|
|
750
|
+
self.node_stats.has_run_with_current_setup = True
|
|
453
751
|
self.node_schema.result_schema = resulting_data.schema
|
|
454
752
|
|
|
455
753
|
except Exception as e:
|
|
456
754
|
logger.warning(f"Error with step {self.__name__}")
|
|
457
755
|
logger.error(str(e))
|
|
458
756
|
self.results.errors = str(e)
|
|
459
|
-
self.node_stats.
|
|
757
|
+
self.node_stats.has_run_with_current_setup = False
|
|
758
|
+
self.node_stats.has_completed_last_run = False
|
|
460
759
|
raise e
|
|
461
760
|
|
|
462
|
-
if self.node_stats.
|
|
761
|
+
if self.node_stats.has_run_with_current_setup:
|
|
463
762
|
for step in self.leads_to_nodes:
|
|
464
763
|
if not self.node_settings.streamable:
|
|
465
764
|
step.node_settings.streamable = self.node_settings.streamable
|
|
466
765
|
|
|
467
766
|
def execute_remote(self, performance_mode: bool = False, node_logger: NodeLogger = None):
|
|
468
|
-
|
|
767
|
+
"""Executes the node's logic remotely or handles cached results.
|
|
768
|
+
|
|
769
|
+
Args:
|
|
770
|
+
performance_mode: If True, skips generating example data.
|
|
771
|
+
node_logger: The logger for this node execution.
|
|
772
|
+
|
|
773
|
+
Raises:
|
|
774
|
+
Exception: If the node_logger is not provided or if execution fails.
|
|
775
|
+
"""
|
|
469
776
|
if node_logger is None:
|
|
470
777
|
raise Exception('Node logger is not defined')
|
|
471
778
|
if self.node_settings.cache_results and results_exists(self.hash):
|
|
@@ -477,7 +784,7 @@ class FlowNode:
|
|
|
477
784
|
node_logger.warning('Failed to read the cache, rerunning the code')
|
|
478
785
|
if self.node_type == 'output':
|
|
479
786
|
self.results.resulting_data = self.get_resulting_data()
|
|
480
|
-
self.node_stats.
|
|
787
|
+
self.node_stats.has_run_with_current_setup = True
|
|
481
788
|
return
|
|
482
789
|
try:
|
|
483
790
|
self.get_resulting_data()
|
|
@@ -498,7 +805,7 @@ class FlowNode:
|
|
|
498
805
|
)
|
|
499
806
|
if not performance_mode:
|
|
500
807
|
self.store_example_data_generator(external_df_fetcher)
|
|
501
|
-
self.node_stats.
|
|
808
|
+
self.node_stats.has_run_with_current_setup = True
|
|
502
809
|
|
|
503
810
|
except Exception as e:
|
|
504
811
|
node_logger.error('Error with external process')
|
|
@@ -522,11 +829,15 @@ class FlowNode:
|
|
|
522
829
|
self._fetch_cached_df = None
|
|
523
830
|
|
|
524
831
|
def prepare_before_run(self):
|
|
832
|
+
"""Resets results and errors before a new execution."""
|
|
833
|
+
|
|
525
834
|
self.results.errors = None
|
|
526
835
|
self.results.resulting_data = None
|
|
527
836
|
self.results.example_data = None
|
|
528
837
|
|
|
529
838
|
def cancel(self):
|
|
839
|
+
"""Cancels an ongoing external process if one is running."""
|
|
840
|
+
|
|
530
841
|
if self._fetch_cached_df is not None:
|
|
531
842
|
self._fetch_cached_df.cancel()
|
|
532
843
|
self.node_stats.is_canceled = True
|
|
@@ -536,15 +847,29 @@ class FlowNode:
|
|
|
536
847
|
|
|
537
848
|
def execute_node(self, run_location: schemas.ExecutionLocationsLiteral, reset_cache: bool = False,
|
|
538
849
|
performance_mode: bool = False, retry: bool = True, node_logger: NodeLogger = None):
|
|
850
|
+
"""Orchestrates the execution, handling location, caching, and retries.
|
|
851
|
+
|
|
852
|
+
Args:
|
|
853
|
+
run_location: The location for execution ('local', 'remote').
|
|
854
|
+
reset_cache: If True, forces removal of any existing cache.
|
|
855
|
+
performance_mode: If True, optimizes for speed over diagnostics.
|
|
856
|
+
retry: If True, allows retrying execution on recoverable errors.
|
|
857
|
+
node_logger: The logger for this node execution.
|
|
858
|
+
|
|
859
|
+
Raises:
|
|
860
|
+
Exception: If the node_logger is not defined.
|
|
861
|
+
"""
|
|
539
862
|
if node_logger is None:
|
|
540
863
|
raise Exception('Flow logger is not defined')
|
|
541
864
|
# node_logger = flow_logger.get_node_logger(self.node_id)
|
|
542
865
|
if reset_cache:
|
|
543
866
|
self.remove_cache()
|
|
544
|
-
self.node_stats.
|
|
867
|
+
self.node_stats.has_run_with_current_setup = False
|
|
868
|
+
self.node_stats.has_completed_last_run = False
|
|
545
869
|
if self.is_setup:
|
|
546
870
|
node_logger.info(f'Starting to run {self.__name__}')
|
|
547
|
-
if self.needs_run(performance_mode, node_logger, run_location)
|
|
871
|
+
if (self.needs_run(performance_mode, node_logger, run_location) or self.node_template.node_group == "output"
|
|
872
|
+
and not (run_location == 'local' or SINGLE_FILE_MODE)):
|
|
548
873
|
self.prepare_before_run()
|
|
549
874
|
try:
|
|
550
875
|
if ((run_location == 'remote' or (self.node_default.transform_type == 'wide')
|
|
@@ -572,16 +897,30 @@ class FlowNode:
|
|
|
572
897
|
performance_mode=performance_mode, retry=False,
|
|
573
898
|
node_logger=node_logger)
|
|
574
899
|
else:
|
|
575
|
-
self.node_stats.has_run = False
|
|
576
900
|
self.results.errors = str(e)
|
|
577
901
|
node_logger.error(f'Error with running the node: {e}')
|
|
578
|
-
|
|
902
|
+
elif ((run_location == 'local' or SINGLE_FILE_MODE) and (not self.node_stats.has_run_with_current_setup
|
|
903
|
+
or self.node_template.node_group == "output")):
|
|
904
|
+
try:
|
|
905
|
+
node_logger.info('Executing fully locally')
|
|
906
|
+
self.execute_full_local(performance_mode)
|
|
907
|
+
except Exception as e:
|
|
908
|
+
self.results.errors = str(e)
|
|
909
|
+
node_logger.error(f'Error with running the node: {e}')
|
|
910
|
+
self.node_stats.error = str(e)
|
|
911
|
+
self.node_stats.has_completed_last_run = False
|
|
912
|
+
self.node_stats.has_run_with_current_setup = True
|
|
579
913
|
else:
|
|
580
914
|
node_logger.info('Node has already run, not running the node')
|
|
581
915
|
else:
|
|
582
916
|
node_logger.warning(f'Node {self.__name__} is not setup, cannot run the node')
|
|
583
917
|
|
|
584
918
|
def store_example_data_generator(self, external_df_fetcher: ExternalDfFetcher | ExternalSampler):
|
|
919
|
+
"""Stores a generator function for fetching a sample of the result data.
|
|
920
|
+
|
|
921
|
+
Args:
|
|
922
|
+
external_df_fetcher: The process that generated the sample data.
|
|
923
|
+
"""
|
|
585
924
|
if external_df_fetcher.status is not None:
|
|
586
925
|
file_ref = external_df_fetcher.status.file_ref
|
|
587
926
|
self.results.example_data_path = file_ref
|
|
@@ -590,23 +929,48 @@ class FlowNode:
|
|
|
590
929
|
logger.error('Could not get the sample data, the external process is not ready')
|
|
591
930
|
|
|
592
931
|
def needs_reset(self) -> bool:
|
|
932
|
+
"""Checks if the node's hash has changed, indicating an outdated state.
|
|
933
|
+
|
|
934
|
+
Returns:
|
|
935
|
+
True if the calculated hash differs from the stored hash.
|
|
936
|
+
"""
|
|
593
937
|
return self._hash != self.calculate_hash(self.setting_input)
|
|
594
938
|
|
|
595
939
|
def reset(self, deep: bool = False):
|
|
940
|
+
"""Resets the node's execution state and schema information.
|
|
941
|
+
|
|
942
|
+
This also triggers a reset on all downstream nodes.
|
|
943
|
+
|
|
944
|
+
Args:
|
|
945
|
+
deep: If True, forces a reset even if the hash hasn't changed.
|
|
946
|
+
"""
|
|
596
947
|
needs_reset = self.needs_reset() or deep
|
|
597
948
|
if needs_reset:
|
|
598
949
|
logger.info(f'{self.node_id}: Node needs reset')
|
|
599
|
-
self.node_stats.
|
|
950
|
+
self.node_stats.has_run_with_current_setup = False
|
|
600
951
|
self.results.reset()
|
|
601
|
-
if self.
|
|
602
|
-
self.
|
|
952
|
+
if self.is_correct:
|
|
953
|
+
self._schema_callback = None # Ensure the schema callback is reset
|
|
954
|
+
if self.schema_callback:
|
|
955
|
+
logger.info(f'{self.node_id}: Resetting the schema callback')
|
|
956
|
+
self.schema_callback.start()
|
|
603
957
|
self.node_schema.result_schema = None
|
|
604
958
|
self.node_schema.predicted_schema = None
|
|
605
959
|
self._hash = None
|
|
606
960
|
self.node_information.is_setup = None
|
|
961
|
+
self.results.errors = None
|
|
607
962
|
self.evaluate_nodes()
|
|
963
|
+
_ = self.hash # Recalculate the hash after reset
|
|
608
964
|
|
|
609
965
|
def delete_lead_to_node(self, node_id: int) -> bool:
|
|
966
|
+
"""Removes a connection to a specific downstream node.
|
|
967
|
+
|
|
968
|
+
Args:
|
|
969
|
+
node_id: The ID of the downstream node to disconnect.
|
|
970
|
+
|
|
971
|
+
Returns:
|
|
972
|
+
True if the connection was found and removed, False otherwise.
|
|
973
|
+
"""
|
|
610
974
|
logger.info(f'Deleting lead to node: {node_id}')
|
|
611
975
|
for i, lead_to_node in enumerate(self.leads_to_nodes):
|
|
612
976
|
logger.info(f'Checking lead to node: {lead_to_node.node_id}')
|
|
@@ -618,7 +982,16 @@ class FlowNode:
|
|
|
618
982
|
|
|
619
983
|
def delete_input_node(self, node_id: int, connection_type: input_schema.InputConnectionClass = 'input-0',
|
|
620
984
|
complete: bool = False) -> bool:
|
|
621
|
-
|
|
985
|
+
"""Removes a connection from a specific input node.
|
|
986
|
+
|
|
987
|
+
Args:
|
|
988
|
+
node_id: The ID of the input node to disconnect.
|
|
989
|
+
connection_type: The specific input handle (e.g., 'input-0', 'input-1').
|
|
990
|
+
complete: If True, tries to delete from all input types.
|
|
991
|
+
|
|
992
|
+
Returns:
|
|
993
|
+
True if a connection was found and removed, False otherwise.
|
|
994
|
+
"""
|
|
622
995
|
deleted: bool = False
|
|
623
996
|
if connection_type == 'input-0':
|
|
624
997
|
for i, node in enumerate(self.node_inputs.main_inputs):
|
|
@@ -641,17 +1014,32 @@ class FlowNode:
|
|
|
641
1014
|
self.reset()
|
|
642
1015
|
return deleted
|
|
643
1016
|
|
|
644
|
-
def __repr__(self):
|
|
1017
|
+
def __repr__(self) -> str:
|
|
1018
|
+
"""Provides a string representation of the FlowNode instance.
|
|
1019
|
+
|
|
1020
|
+
Returns:
|
|
1021
|
+
A string showing the node's ID and type.
|
|
1022
|
+
"""
|
|
645
1023
|
return f"Node id: {self.node_id} ({self.node_type})"
|
|
646
1024
|
|
|
647
|
-
def _get_readable_schema(self):
|
|
1025
|
+
def _get_readable_schema(self) -> List[dict] | None:
|
|
1026
|
+
"""Helper to get a simplified, dictionary representation of the output schema.
|
|
1027
|
+
|
|
1028
|
+
Returns:
|
|
1029
|
+
A list of dictionaries, each with 'column_name' and 'data_type'.
|
|
1030
|
+
"""
|
|
648
1031
|
if self.is_setup:
|
|
649
1032
|
output = []
|
|
650
1033
|
for s in self.schema:
|
|
651
1034
|
output.append(dict(column_name=s.column_name, data_type=s.data_type))
|
|
652
1035
|
return output
|
|
653
1036
|
|
|
654
|
-
def get_repr(self):
|
|
1037
|
+
def get_repr(self) -> dict:
|
|
1038
|
+
"""Gets a detailed dictionary representation of the node's state.
|
|
1039
|
+
|
|
1040
|
+
Returns:
|
|
1041
|
+
A dictionary containing key information about the node.
|
|
1042
|
+
"""
|
|
655
1043
|
return dict(FlowNode=
|
|
656
1044
|
dict(node_id=self.node_id,
|
|
657
1045
|
step_name=self.__name__,
|
|
@@ -659,33 +1047,70 @@ class FlowNode:
|
|
|
659
1047
|
output_schema=self._get_readable_schema()))
|
|
660
1048
|
|
|
661
1049
|
@property
|
|
662
|
-
def number_of_leads_to_nodes(self) -> int:
|
|
1050
|
+
def number_of_leads_to_nodes(self) -> int | None:
|
|
1051
|
+
"""Counts the number of downstream node connections.
|
|
1052
|
+
|
|
1053
|
+
Returns:
|
|
1054
|
+
The number of nodes this node leads to.
|
|
1055
|
+
"""
|
|
663
1056
|
if self.is_setup:
|
|
664
1057
|
return len(self.leads_to_nodes)
|
|
665
1058
|
|
|
666
1059
|
@property
|
|
667
1060
|
def has_next_step(self) -> bool:
|
|
1061
|
+
"""Checks if this node has any downstream connections.
|
|
1062
|
+
|
|
1063
|
+
Returns:
|
|
1064
|
+
True if it has at least one downstream node.
|
|
1065
|
+
"""
|
|
668
1066
|
return len(self.leads_to_nodes) > 0
|
|
669
1067
|
|
|
670
1068
|
@property
|
|
671
1069
|
def has_input(self) -> bool:
|
|
1070
|
+
"""Checks if this node has any input connections.
|
|
1071
|
+
|
|
1072
|
+
Returns:
|
|
1073
|
+
True if it has at least one input node.
|
|
1074
|
+
"""
|
|
672
1075
|
return len(self.all_inputs) > 0
|
|
673
1076
|
|
|
674
1077
|
@property
|
|
675
1078
|
def singular_input(self) -> bool:
|
|
1079
|
+
"""Checks if the node template specifies exactly one input.
|
|
1080
|
+
|
|
1081
|
+
Returns:
|
|
1082
|
+
True if the node is a single-input type.
|
|
1083
|
+
"""
|
|
676
1084
|
return self.node_template.input == 1
|
|
677
1085
|
|
|
678
1086
|
@property
|
|
679
1087
|
def singular_main_input(self) -> "FlowNode":
|
|
1088
|
+
"""Gets the input node, assuming it is a single-input type.
|
|
1089
|
+
|
|
1090
|
+
Returns:
|
|
1091
|
+
The single input FlowNode, or None.
|
|
1092
|
+
"""
|
|
680
1093
|
if self.singular_input:
|
|
681
1094
|
return self.all_inputs[0]
|
|
682
1095
|
|
|
683
1096
|
def get_table_example(self, include_data: bool = False) -> TableExample | None:
|
|
1097
|
+
"""Generates a `TableExample` model summarizing the node's output.
|
|
1098
|
+
|
|
1099
|
+
This can optionally include a sample of the data.
|
|
1100
|
+
|
|
1101
|
+
Args:
|
|
1102
|
+
include_data: If True, includes a data sample in the result.
|
|
1103
|
+
|
|
1104
|
+
Returns:
|
|
1105
|
+
A `TableExample` object, or None if the node is not set up.
|
|
1106
|
+
"""
|
|
684
1107
|
self.print('Getting a table example')
|
|
685
|
-
if self.
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
1108
|
+
if self.is_setup and include_data and self.node_stats.has_completed_last_run:
|
|
1109
|
+
|
|
1110
|
+
if self.node_template.node_group == 'output':
|
|
1111
|
+
self.print('getting the table example')
|
|
1112
|
+
return self.main_input[0].get_table_example(include_data)
|
|
1113
|
+
|
|
689
1114
|
logger.info('getting the table example since the node has run')
|
|
690
1115
|
example_data_getter = self.results.example_data_generator
|
|
691
1116
|
if example_data_getter is not None:
|
|
@@ -714,13 +1139,19 @@ class FlowNode:
|
|
|
714
1139
|
table_schema=schema, columns=columns,
|
|
715
1140
|
data=[])
|
|
716
1141
|
|
|
717
|
-
def calculate_settings_out_select(self):
|
|
718
|
-
pass
|
|
719
|
-
|
|
720
1142
|
def get_node_data(self, flow_id: int, include_example: bool = False) -> NodeData:
|
|
1143
|
+
"""Gathers all necessary data for representing the node in the UI.
|
|
1144
|
+
|
|
1145
|
+
Args:
|
|
1146
|
+
flow_id: The ID of the parent flow.
|
|
1147
|
+
include_example: If True, includes data samples.
|
|
1148
|
+
|
|
1149
|
+
Returns:
|
|
1150
|
+
A `NodeData` object.
|
|
1151
|
+
"""
|
|
721
1152
|
node = NodeData(flow_id=flow_id,
|
|
722
1153
|
node_id=self.node_id,
|
|
723
|
-
has_run=self.node_stats.
|
|
1154
|
+
has_run=self.node_stats.has_run_with_current_setup,
|
|
724
1155
|
setting_input=self.setting_input,
|
|
725
1156
|
flow_type=self.node_type)
|
|
726
1157
|
if self.main_input:
|
|
@@ -737,15 +1168,30 @@ class FlowNode:
|
|
|
737
1168
|
return node
|
|
738
1169
|
|
|
739
1170
|
def get_output_data(self) -> TableExample:
|
|
1171
|
+
"""Gets the full output data sample for this node.
|
|
1172
|
+
|
|
1173
|
+
Returns:
|
|
1174
|
+
A `TableExample` object with data.
|
|
1175
|
+
"""
|
|
740
1176
|
return self.get_table_example(True)
|
|
741
1177
|
|
|
742
1178
|
def get_node_input(self) -> schemas.NodeInput:
|
|
1179
|
+
"""Creates a `NodeInput` schema object for representing this node in the UI.
|
|
1180
|
+
|
|
1181
|
+
Returns:
|
|
1182
|
+
A `NodeInput` object.
|
|
1183
|
+
"""
|
|
743
1184
|
return schemas.NodeInput(pos_y=self.setting_input.pos_y,
|
|
744
1185
|
pos_x=self.setting_input.pos_x,
|
|
745
1186
|
id=self.node_id,
|
|
746
1187
|
**self.node_template.__dict__)
|
|
747
1188
|
|
|
748
1189
|
def get_edge_input(self) -> List[schemas.NodeEdge]:
|
|
1190
|
+
"""Generates `NodeEdge` objects for all input connections to this node.
|
|
1191
|
+
|
|
1192
|
+
Returns:
|
|
1193
|
+
A list of `NodeEdge` objects.
|
|
1194
|
+
"""
|
|
749
1195
|
edges = []
|
|
750
1196
|
if self.node_inputs.main_inputs is not None:
|
|
751
1197
|
for i, main_input in enumerate(self.node_inputs.main_inputs):
|