Flowfile 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +4 -3
- flowfile/api.py +1 -1
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-d7c2c028.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-d467329f.js} +11 -78
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-071b8b00.js} +12 -79
- flowfile/web/static/assets/{CloudStorageWriter-49c9a4b2.css → CloudStorageWriter-b0ee067f.css} +24 -24
- flowfile/web/static/assets/ContextMenu-2dea5e27.js +41 -0
- flowfile/web/static/assets/{SettingsSection-9c836ecc.css → ContextMenu-4c74eef1.css} +0 -21
- flowfile/web/static/assets/ContextMenu-63cfa99b.css +26 -0
- flowfile/web/static/assets/ContextMenu-785554c4.js +41 -0
- flowfile/web/static/assets/ContextMenu-a51e19ea.js +41 -0
- flowfile/web/static/assets/ContextMenu-c13f91d0.css +26 -0
- flowfile/web/static/assets/{CrossJoin-41efa4cb.css → CrossJoin-1119d18e.css} +18 -18
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-cf68ec7a.js} +14 -84
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-435c5dd8.js} +3 -3
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-349e33a8.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-8075bd28.js} +14 -114
- flowfile/web/static/assets/{DatabaseReader-f50c6558.css → DatabaseReader-ae61773c.css} +0 -27
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-3e2dda89.js} +13 -74
- flowfile/web/static/assets/{ExploreData-5bdae813.css → ExploreData-2d0cf4db.css} +8 -14
- flowfile/web/static/assets/ExploreData-76ec698c.js +192 -0
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-609a265c.js} +8 -79
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-97cff793.js} +12 -85
- flowfile/web/static/assets/{Filter-a9d08ba1.css → Filter-f62091b3.css} +3 -3
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-09de0ec9.js} +18 -85
- flowfile/web/static/assets/{Formula-29f19d21.css → Formula-bb96803d.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-6857de82.css → FuzzyMatch-1010f966.css} +42 -42
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-bdf70248.js} +16 -87
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-0b5a0e05.js} +13 -159
- flowfile/web/static/assets/GraphSolver-f0cb7bfb.css +22 -0
- flowfile/web/static/assets/{Unique-b5615727.css → GroupBy-b9505323.css} +8 -8
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-eaddadde.js} +12 -75
- flowfile/web/static/assets/{Join-4e49a274.js → Join-3313371b.js} +15 -85
- flowfile/web/static/assets/{Join-f45eff22.css → Join-fd79b451.css} +20 -20
- flowfile/web/static/assets/{ManualInput-a71b52c6.css → ManualInput-3246a08d.css} +20 -20
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-e8bfc0be.js} +11 -82
- flowfile/web/static/assets/{Output-81e3e917.js → Output-7303bb09.js} +13 -243
- flowfile/web/static/assets/Output-ddc9079f.css +37 -0
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-3b1c54ef.js} +14 -138
- flowfile/web/static/assets/Pivot-cf333e3d.css +22 -0
- flowfile/web/static/assets/PivotValidation-3bb36c8f.js +61 -0
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +13 -0
- flowfile/web/static/assets/PivotValidation-c46cd420.css +13 -0
- flowfile/web/static/assets/PivotValidation-eaa819c0.js +61 -0
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-aa12e25d.js} +13 -80
- flowfile/web/static/assets/Read-6b17491f.css +62 -0
- flowfile/web/static/assets/Read-a2bfc618.js +243 -0
- flowfile/web/static/assets/RecordCount-aa0dc082.js +53 -0
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-48ee1a3b.js} +8 -80
- flowfile/web/static/assets/SQLQueryComponent-36cef432.css +27 -0
- flowfile/web/static/assets/SQLQueryComponent-e149dbf2.js +38 -0
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-f06cb97a.js} +8 -77
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-37f34886.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-b60e6c47.js} +11 -85
- flowfile/web/static/assets/SettingsSection-2e4d03c4.css +21 -0
- flowfile/web/static/assets/SettingsSection-5c696bee.css +20 -0
- flowfile/web/static/assets/SettingsSection-70e5a7b1.js +53 -0
- flowfile/web/static/assets/SettingsSection-71e6b7e3.css +21 -0
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-75b6cf4f.js} +2 -40
- flowfile/web/static/assets/SettingsSection-e57a672e.js +45 -0
- flowfile/web/static/assets/{GroupBy-ab1ea74b.css → Sort-3643d625.css} +8 -8
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-51b1ee4d.js} +12 -97
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-26835f8f.js} +14 -83
- flowfile/web/static/assets/{TextToRows-c92d1ec2.css → TextToRows-5d2c1190.css} +9 -9
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-88a4cd0c.js} +2 -2
- flowfile/web/static/assets/Union-4d0088eb.js +77 -0
- flowfile/web/static/assets/{Union-8d9ac7f9.css → Union-af6c3d9b.css} +6 -6
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-7d554a62.js} +22 -91
- flowfile/web/static/assets/{Sort-7ccfa0fe.css → Unique-f9fb0809.css} +8 -8
- flowfile/web/static/assets/Unpivot-1e422df3.css +30 -0
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-4668595c.js} +12 -166
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +13 -0
- flowfile/web/static/assets/UnpivotValidation-d4f0e0e8.js +51 -0
- flowfile/web/static/assets/{ExploreData-40476474.js → VueGraphicWalker-5324d566.js} +4 -264
- flowfile/web/static/assets/VueGraphicWalker-ed5ab88b.css +6 -0
- flowfile/web/static/assets/{api-6ef0dcef.js → api-271ed117.js} +1 -1
- flowfile/web/static/assets/{api-a0abbdc7.js → api-31e4fea6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-091bdc3f.css} +819 -184
- flowfile/web/static/assets/{designer-13eabd83.js → designer-bf3d9487.js} +2214 -680
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d0a1cea.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-025888df.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-1df991ec.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-d3b2b2ac.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-d0518598.js} +210 -31
- flowfile/web/static/assets/{Output-48f81019.css → outputCsv-9cc59e0b.css} +0 -143
- flowfile/web/static/assets/outputCsv-d8457527.js +86 -0
- flowfile/web/static/assets/outputExcel-b41305c0.css +102 -0
- flowfile/web/static/assets/outputExcel-be89153e.js +56 -0
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +4 -0
- flowfile/web/static/assets/outputParquet-fabb445a.js +31 -0
- flowfile/web/static/assets/readCsv-bca3ed53.css +52 -0
- flowfile/web/static/assets/readCsv-e8359522.js +178 -0
- flowfile/web/static/assets/readExcel-dabaf51b.js +203 -0
- flowfile/web/static/assets/readExcel-e1b381ea.css +64 -0
- flowfile/web/static/assets/readParquet-cee068e2.css +19 -0
- flowfile/web/static/assets/readParquet-e0771ef2.js +26 -0
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-ce823eee.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-5476546e.js} +7 -7
- flowfile/web/static/assets/{selectDynamic-b062bc9b.css → selectDynamic-aa913ff4.css} +16 -16
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-9ed00d50.js} +29 -33
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-7bca2d9b.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.8.dist-info → flowfile-0.3.10.dist-info}/METADATA +2 -1
- {flowfile-0.3.8.dist-info → flowfile-0.3.10.dist-info}/RECORD +147 -117
- flowfile_core/configs/flow_logger.py +5 -13
- flowfile_core/configs/node_store/nodes.py +303 -44
- flowfile_core/configs/settings.py +6 -3
- flowfile_core/database/connection.py +5 -21
- flowfile_core/fileExplorer/funcs.py +239 -121
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +33 -10
- flowfile_core/flowfile/flow_graph.py +223 -118
- flowfile_core/flowfile/flow_node/flow_node.py +56 -19
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/handler.py +22 -3
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +72 -16
- flowfile_core/flowfile/setting_generator/settings.py +2 -2
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/main.py +4 -1
- flowfile_core/routes/routes.py +59 -10
- flowfile_core/schemas/input_schema.py +0 -1
- flowfile_core/schemas/output_model.py +5 -2
- flowfile_core/schemas/schemas.py +48 -3
- flowfile_core/schemas/transform_schema.py +28 -38
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/flow_frame.py +33 -4
- flowfile_frame/flow_frame.pyi +2 -0
- flowfile_worker/__init__.py +6 -35
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/main.py +5 -2
- flowfile_worker/models.py +3 -1
- flowfile_worker/routes.py +47 -5
- shared/__init__.py +15 -0
- shared/storage_config.py +243 -0
- flowfile/web/static/assets/GraphSolver-17fd26db.css +0 -68
- flowfile/web/static/assets/Pivot-f415e85f.css +0 -35
- flowfile/web/static/assets/Read-80dc1675.css +0 -197
- flowfile/web/static/assets/Read-c4059daf.js +0 -701
- flowfile/web/static/assets/RecordCount-c2b5e095.js +0 -122
- flowfile/web/static/assets/Union-f2aefdc9.js +0 -146
- flowfile/web/static/assets/Unpivot-246e9bbd.css +0 -77
- flowfile/web/static/assets/nodeTitle-988d9efe.js +0 -227
- flowfile/web/static/assets/nodeTitle-f4b12bcb.css +0 -134
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.8.dist-info → flowfile-0.3.10.dist-info}/LICENSE +0 -0
- {flowfile-0.3.8.dist-info → flowfile-0.3.10.dist-info}/WHEEL +0 -0
- {flowfile-0.3.8.dist-info → flowfile-0.3.10.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
|
@@ -1,27 +1,30 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import pickle
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
|
|
3
6
|
import polars as pl
|
|
7
|
+
|
|
4
8
|
import fastexcel
|
|
5
|
-
import re
|
|
6
9
|
from fastapi.exceptions import HTTPException
|
|
7
10
|
from time import time
|
|
8
11
|
from functools import partial
|
|
9
|
-
from typing import List, Dict, Union, Callable, Any, Optional, Tuple
|
|
12
|
+
from typing import List, Dict, Union, Callable, Any, Optional, Tuple, Literal
|
|
10
13
|
from uuid import uuid1
|
|
11
14
|
from copy import deepcopy
|
|
12
15
|
from pyarrow.parquet import ParquetFile
|
|
13
16
|
from flowfile_core.configs import logger
|
|
14
|
-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
|
|
15
17
|
from flowfile_core.configs.flow_logger import FlowLogger
|
|
16
18
|
from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
|
|
17
19
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
|
|
18
|
-
|
|
19
|
-
pre_calculate_pivot_schema)
|
|
20
|
+
|
|
20
21
|
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
|
|
21
22
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
22
23
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
|
|
23
|
-
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes,
|
|
24
|
-
|
|
24
|
+
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import (get_open_xlsx_datatypes,
|
|
25
|
+
get_calamine_xlsx_data_types)
|
|
26
|
+
|
|
27
|
+
from flowfile_core.flowfile.schema_callbacks import (calculate_fuzzy_match_schema, pre_calculate_pivot_schema)
|
|
25
28
|
from flowfile_core.flowfile.sources import external_sources
|
|
26
29
|
from flowfile_core.schemas import input_schema, schemas, transform_schema
|
|
27
30
|
from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
|
|
@@ -32,7 +35,11 @@ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSetting
|
|
|
32
35
|
from flowfile_core.flowfile.utils import snake_case_to_camel_case
|
|
33
36
|
from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
|
|
34
37
|
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
35
|
-
from flowfile_core.flowfile.util.execution_orderer import
|
|
38
|
+
from flowfile_core.flowfile.util.execution_orderer import compute_execution_plan
|
|
39
|
+
from flowfile_core.flowfile.graph_tree.graph_tree import (add_un_drawn_nodes, build_flow_paths,
|
|
40
|
+
build_node_info, calculate_depth,
|
|
41
|
+
define_node_connections, draw_merged_paths,
|
|
42
|
+
draw_standalone_paths, group_nodes_by_depth)
|
|
36
43
|
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
37
44
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
|
|
38
45
|
ExternalDatabaseWriter,
|
|
@@ -172,12 +179,10 @@ class FlowGraph:
|
|
|
172
179
|
schema: Optional[List[FlowfileColumn]] = None
|
|
173
180
|
has_over_row_function: bool = False
|
|
174
181
|
_flow_starts: List[Union[int, str]] = None
|
|
175
|
-
node_results: List[NodeResult] = None
|
|
176
182
|
latest_run_info: Optional[RunInformation] = None
|
|
177
183
|
start_datetime: datetime = None
|
|
178
184
|
end_datetime: datetime = None
|
|
179
|
-
|
|
180
|
-
flow_settings: schemas.FlowSettings = None
|
|
185
|
+
_flow_settings: schemas.FlowSettings = None
|
|
181
186
|
flow_logger: FlowLogger
|
|
182
187
|
|
|
183
188
|
def __init__(self,
|
|
@@ -201,13 +206,11 @@ class FlowGraph:
|
|
|
201
206
|
if isinstance(flow_settings, schemas.FlowGraphConfig):
|
|
202
207
|
flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
|
|
203
208
|
|
|
204
|
-
self.
|
|
209
|
+
self._flow_settings = flow_settings
|
|
205
210
|
self.uuid = str(uuid1())
|
|
206
|
-
self.nodes_completed = 0
|
|
207
211
|
self.start_datetime = None
|
|
208
212
|
self.end_datetime = None
|
|
209
213
|
self.latest_run_info = None
|
|
210
|
-
self.node_results = []
|
|
211
214
|
self._flow_id = flow_settings.flow_id
|
|
212
215
|
self.flow_logger = FlowLogger(flow_settings.flow_id)
|
|
213
216
|
self._flow_starts: List[FlowNode] = []
|
|
@@ -226,6 +229,19 @@ class FlowGraph:
|
|
|
226
229
|
elif input_flow is not None:
|
|
227
230
|
self.add_datasource(input_file=input_flow)
|
|
228
231
|
|
|
232
|
+
@property
|
|
233
|
+
def flow_settings(self) -> schemas.FlowSettings:
|
|
234
|
+
return self._flow_settings
|
|
235
|
+
|
|
236
|
+
@flow_settings.setter
|
|
237
|
+
def flow_settings(self, flow_settings: schemas.FlowSettings):
|
|
238
|
+
if (
|
|
239
|
+
(self._flow_settings.execution_location != flow_settings.execution_location) or
|
|
240
|
+
(self._flow_settings.execution_mode != flow_settings.execution_mode)
|
|
241
|
+
):
|
|
242
|
+
self.reset()
|
|
243
|
+
self._flow_settings = flow_settings
|
|
244
|
+
|
|
229
245
|
def add_node_promise(self, node_promise: input_schema.NodePromise):
|
|
230
246
|
"""Adds a placeholder node to the graph that is not yet fully configured.
|
|
231
247
|
|
|
@@ -242,64 +258,6 @@ class FlowGraph:
|
|
|
242
258
|
self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
|
|
243
259
|
setting_input=node_promise)
|
|
244
260
|
|
|
245
|
-
def print_tree(self, show_schema=False, show_descriptions=False):
|
|
246
|
-
"""
|
|
247
|
-
Print flow_graph as a tree.
|
|
248
|
-
"""
|
|
249
|
-
max_node_id = max(self._node_db.keys())
|
|
250
|
-
|
|
251
|
-
tree = ""
|
|
252
|
-
tabs = 0
|
|
253
|
-
tab_counter = 0
|
|
254
|
-
for node in self.nodes:
|
|
255
|
-
tab_counter += 1
|
|
256
|
-
node_input = node.setting_input
|
|
257
|
-
operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
|
|
258
|
-
|
|
259
|
-
if operation == "Formula":
|
|
260
|
-
operation = "With Columns"
|
|
261
|
-
|
|
262
|
-
tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
|
|
263
|
-
|
|
264
|
-
if show_descriptions & show_schema:
|
|
265
|
-
raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
|
|
266
|
-
if show_descriptions:
|
|
267
|
-
tree += ": " + str(node_input.description)
|
|
268
|
-
elif show_schema:
|
|
269
|
-
tree += " -> ["
|
|
270
|
-
if operation == "Manual Input":
|
|
271
|
-
schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
|
|
272
|
-
tree += schema
|
|
273
|
-
elif operation == "With Columns":
|
|
274
|
-
tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
|
|
275
|
-
tree += schema + tree_with_col_schema
|
|
276
|
-
elif operation == "Filter":
|
|
277
|
-
index = node_input.filter_input.advanced_filter.find("]")
|
|
278
|
-
filtered_column = str(node_input.filter_input.advanced_filter[1:index])
|
|
279
|
-
schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
|
|
280
|
-
tree += schema
|
|
281
|
-
elif operation == "Group By":
|
|
282
|
-
for col in node_input.groupby_input.agg_cols:
|
|
283
|
-
schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
|
|
284
|
-
tree += schema
|
|
285
|
-
tree += "]"
|
|
286
|
-
else:
|
|
287
|
-
if operation == "Manual Input":
|
|
288
|
-
tree += ": " + str(node_input.raw_data_format.data)
|
|
289
|
-
elif operation == "With Columns":
|
|
290
|
-
tree += ": " + str(node_input.function)
|
|
291
|
-
elif operation == "Filter":
|
|
292
|
-
tree += ": " + str(node_input.filter_input.advanced_filter)
|
|
293
|
-
elif operation == "Group By":
|
|
294
|
-
tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
|
|
295
|
-
tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
|
|
296
|
-
|
|
297
|
-
if node_input.node_id < max_node_id:
|
|
298
|
-
tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
|
|
299
|
-
print("\n"*2)
|
|
300
|
-
|
|
301
|
-
return print(tree)
|
|
302
|
-
|
|
303
261
|
def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
|
|
304
262
|
"""Calculates and applies a layered layout to all nodes in the graph.
|
|
305
263
|
|
|
@@ -368,6 +326,86 @@ class FlowGraph:
|
|
|
368
326
|
settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
|
|
369
327
|
return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
|
|
370
328
|
|
|
329
|
+
def print_tree(self):
|
|
330
|
+
"""Print flow_graph as a visual tree structure, showing the DAG relationships with ASCII art."""
|
|
331
|
+
if not self._node_db:
|
|
332
|
+
self.flow_logger.info("Empty flow graph")
|
|
333
|
+
return
|
|
334
|
+
|
|
335
|
+
# Build node information
|
|
336
|
+
node_info = build_node_info(self.nodes)
|
|
337
|
+
|
|
338
|
+
# Calculate depths for all nodes
|
|
339
|
+
for node_id in node_info:
|
|
340
|
+
calculate_depth(node_id, node_info)
|
|
341
|
+
|
|
342
|
+
# Group nodes by depth
|
|
343
|
+
depth_groups, max_depth = group_nodes_by_depth(node_info)
|
|
344
|
+
|
|
345
|
+
# Sort nodes within each depth group
|
|
346
|
+
for depth in depth_groups:
|
|
347
|
+
depth_groups[depth].sort()
|
|
348
|
+
|
|
349
|
+
# Create the main flow visualization
|
|
350
|
+
lines = ["=" * 80, "Flow Graph Visualization", "=" * 80, ""]
|
|
351
|
+
|
|
352
|
+
# Track which nodes connect to what
|
|
353
|
+
merge_points = define_node_connections(node_info)
|
|
354
|
+
|
|
355
|
+
# Build the flow paths
|
|
356
|
+
|
|
357
|
+
# Find the maximum label length for each depth level
|
|
358
|
+
max_label_length = {}
|
|
359
|
+
for depth in range(max_depth + 1):
|
|
360
|
+
if depth in depth_groups:
|
|
361
|
+
max_len = max(len(node_info[nid].label) for nid in depth_groups[depth])
|
|
362
|
+
max_label_length[depth] = max_len
|
|
363
|
+
|
|
364
|
+
# Draw the paths
|
|
365
|
+
drawn_nodes = set()
|
|
366
|
+
merge_drawn = set()
|
|
367
|
+
|
|
368
|
+
# Group paths by their merge points
|
|
369
|
+
paths_by_merge = {}
|
|
370
|
+
standalone_paths = []
|
|
371
|
+
|
|
372
|
+
# Build flow paths
|
|
373
|
+
paths = build_flow_paths(node_info, self._flow_starts, merge_points)
|
|
374
|
+
|
|
375
|
+
# Define paths to merge and standalone paths
|
|
376
|
+
for path in paths:
|
|
377
|
+
if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
|
|
378
|
+
merge_id = path[-1]
|
|
379
|
+
if merge_id not in paths_by_merge:
|
|
380
|
+
paths_by_merge[merge_id] = []
|
|
381
|
+
paths_by_merge[merge_id].append(path)
|
|
382
|
+
else:
|
|
383
|
+
standalone_paths.append(path)
|
|
384
|
+
|
|
385
|
+
# Draw merged paths
|
|
386
|
+
draw_merged_paths(node_info, merge_points, paths_by_merge, merge_drawn, drawn_nodes, lines)
|
|
387
|
+
|
|
388
|
+
# Draw standlone paths
|
|
389
|
+
draw_standalone_paths(drawn_nodes, standalone_paths, lines, node_info)
|
|
390
|
+
|
|
391
|
+
# Add undrawn nodes
|
|
392
|
+
add_un_drawn_nodes(drawn_nodes, node_info, lines)
|
|
393
|
+
|
|
394
|
+
try:
|
|
395
|
+
skip_nodes, ordered_nodes = compute_execution_plan(
|
|
396
|
+
nodes=self.nodes,
|
|
397
|
+
flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
|
|
398
|
+
if ordered_nodes:
|
|
399
|
+
for i, node in enumerate(ordered_nodes, 1):
|
|
400
|
+
lines.append(f" {i:3d}. {node_info[node.node_id].label}")
|
|
401
|
+
except Exception as e:
|
|
402
|
+
lines.append(f" Could not determine execution order: {e}")
|
|
403
|
+
|
|
404
|
+
# Print everything
|
|
405
|
+
output = "\n".join(lines)
|
|
406
|
+
|
|
407
|
+
print(output)
|
|
408
|
+
|
|
371
409
|
def get_nodes_overview(self):
|
|
372
410
|
"""Gets a list of dictionary representations for all nodes in the graph."""
|
|
373
411
|
output = []
|
|
@@ -774,26 +812,34 @@ class FlowGraph:
|
|
|
774
812
|
"""
|
|
775
813
|
|
|
776
814
|
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
777
|
-
|
|
815
|
+
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
816
|
+
if self.execution_location == "local":
|
|
817
|
+
return main.fuzzy_join(fuzzy_match_input=deepcopy(fuzzy_settings.join_input),
|
|
818
|
+
other=right,
|
|
819
|
+
node_logger=self.flow_logger.get_node_logger(fuzzy_settings.node_id))
|
|
820
|
+
|
|
821
|
+
f = main.start_fuzzy_join(fuzzy_match_input=deepcopy(fuzzy_settings.join_input), other=right, file_ref=node.hash,
|
|
778
822
|
flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
|
|
779
823
|
logger.info("Started the fuzzy match action")
|
|
780
|
-
node._fetch_cached_df = f
|
|
824
|
+
node._fetch_cached_df = f # Add to the node so it can be cancelled and fetch later if needed
|
|
781
825
|
return FlowDataEngine(f.get_result())
|
|
782
826
|
|
|
783
|
-
self.add_node_step(node_id=fuzzy_settings.node_id,
|
|
784
|
-
function=_func,
|
|
785
|
-
input_columns=[],
|
|
786
|
-
node_type='fuzzy_match',
|
|
787
|
-
setting_input=fuzzy_settings)
|
|
788
|
-
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
789
|
-
|
|
790
827
|
def schema_callback():
|
|
791
|
-
|
|
828
|
+
fm_input_copy = deepcopy(fuzzy_settings.join_input) # Deepcopy create an unique object per func
|
|
829
|
+
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
830
|
+
return calculate_fuzzy_match_schema(fm_input_copy,
|
|
792
831
|
left_schema=node.node_inputs.main_inputs[0].schema,
|
|
793
832
|
right_schema=node.node_inputs.right_input.schema
|
|
794
833
|
)
|
|
795
834
|
|
|
796
|
-
|
|
835
|
+
self.add_node_step(node_id=fuzzy_settings.node_id,
|
|
836
|
+
function=_func,
|
|
837
|
+
input_columns=[],
|
|
838
|
+
node_type='fuzzy_match',
|
|
839
|
+
setting_input=fuzzy_settings,
|
|
840
|
+
input_node_ids=fuzzy_settings.depending_on_ids,
|
|
841
|
+
schema_callback=schema_callback)
|
|
842
|
+
|
|
797
843
|
return self
|
|
798
844
|
|
|
799
845
|
def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
|
|
@@ -1549,8 +1595,70 @@ class FlowGraph:
|
|
|
1549
1595
|
Args:
|
|
1550
1596
|
execution_location: The execution location to set.
|
|
1551
1597
|
"""
|
|
1598
|
+
if self.flow_settings.execution_location != execution_location:
|
|
1599
|
+
self.reset()
|
|
1552
1600
|
self.flow_settings.execution_location = execution_location
|
|
1553
1601
|
|
|
1602
|
+
def validate_if_node_can_be_fetched(self, node_id: int) -> None:
|
|
1603
|
+
flow_node = self._node_db.get(node_id)
|
|
1604
|
+
if not flow_node:
|
|
1605
|
+
raise Exception("Node not found found")
|
|
1606
|
+
skip_nodes, execution_order = compute_execution_plan(
|
|
1607
|
+
nodes=self.nodes, flow_starts=self._flow_starts+self.get_implicit_starter_nodes()
|
|
1608
|
+
)
|
|
1609
|
+
if flow_node.node_id in [skip_node.node_id for skip_node in skip_nodes]:
|
|
1610
|
+
raise Exception("Node can not be executed because it does not have it's inputs")
|
|
1611
|
+
|
|
1612
|
+
def create_initial_run_information(self, number_of_nodes: int,
|
|
1613
|
+
run_type: Literal["fetch_one", "full_run"]):
|
|
1614
|
+
return RunInformation(
|
|
1615
|
+
flow_id=self.flow_id, start_time=datetime.datetime.now(), end_time=None,
|
|
1616
|
+
success=None, number_of_nodes=number_of_nodes, node_step_result=[],
|
|
1617
|
+
run_type=run_type
|
|
1618
|
+
)
|
|
1619
|
+
|
|
1620
|
+
def trigger_fetch_node(self, node_id: int) -> RunInformation | None:
|
|
1621
|
+
"""Executes a specific node in the graph by its ID."""
|
|
1622
|
+
if self.flow_settings.is_running:
|
|
1623
|
+
raise Exception("Flow is already running")
|
|
1624
|
+
flow_node = self.get_node(node_id)
|
|
1625
|
+
self.flow_settings.is_running = True
|
|
1626
|
+
self.flow_settings.is_canceled = False
|
|
1627
|
+
self.flow_logger.clear_log_file()
|
|
1628
|
+
self.latest_run_info = self.create_initial_run_information(1, "fetch_one")
|
|
1629
|
+
node_logger = self.flow_logger.get_node_logger(flow_node.node_id)
|
|
1630
|
+
node_result = NodeResult(node_id=flow_node.node_id, node_name=flow_node.name)
|
|
1631
|
+
logger.info(f'Starting to run: node {flow_node.node_id}, start time: {node_result.start_timestamp}')
|
|
1632
|
+
try:
|
|
1633
|
+
self.latest_run_info.node_step_result.append(node_result)
|
|
1634
|
+
flow_node.execute_node(run_location=self.flow_settings.execution_location,
|
|
1635
|
+
performance_mode=False,
|
|
1636
|
+
node_logger=node_logger,
|
|
1637
|
+
optimize_for_downstream=False,
|
|
1638
|
+
reset_cache=True)
|
|
1639
|
+
node_result.error = str(flow_node.results.errors)
|
|
1640
|
+
if self.flow_settings.is_canceled:
|
|
1641
|
+
node_result.success = None
|
|
1642
|
+
node_result.success = None
|
|
1643
|
+
node_result.is_running = False
|
|
1644
|
+
node_result.success = flow_node.results.errors is None
|
|
1645
|
+
node_result.end_timestamp = time()
|
|
1646
|
+
node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
|
|
1647
|
+
node_result.is_running = False
|
|
1648
|
+
self.latest_run_info.nodes_completed += 1
|
|
1649
|
+
self.latest_run_info.end_time = datetime.datetime.now()
|
|
1650
|
+
self.flow_settings.is_running = False
|
|
1651
|
+
return self.get_run_info()
|
|
1652
|
+
except Exception as e:
|
|
1653
|
+
node_result.error = 'Node did not run'
|
|
1654
|
+
node_result.success = False
|
|
1655
|
+
node_result.end_timestamp = time()
|
|
1656
|
+
node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
|
|
1657
|
+
node_result.is_running = False
|
|
1658
|
+
node_logger.error(f'Error in node {flow_node.node_id}: {e}')
|
|
1659
|
+
finally:
|
|
1660
|
+
self.flow_settings.is_running = False
|
|
1661
|
+
|
|
1554
1662
|
def run_graph(self) -> RunInformation | None:
|
|
1555
1663
|
"""Executes the entire data flow graph from start to finish.
|
|
1556
1664
|
|
|
@@ -1566,27 +1674,23 @@ class FlowGraph:
|
|
|
1566
1674
|
if self.flow_settings.is_running:
|
|
1567
1675
|
raise Exception('Flow is already running')
|
|
1568
1676
|
try:
|
|
1677
|
+
|
|
1569
1678
|
self.flow_settings.is_running = True
|
|
1570
1679
|
self.flow_settings.is_canceled = False
|
|
1571
1680
|
self.flow_logger.clear_log_file()
|
|
1572
|
-
self.nodes_completed = 0
|
|
1573
|
-
self.node_results = []
|
|
1574
|
-
self.start_datetime = datetime.datetime.now()
|
|
1575
|
-
self.end_datetime = None
|
|
1576
|
-
self.latest_run_info = None
|
|
1577
1681
|
self.flow_logger.info('Starting to run flowfile flow...')
|
|
1578
|
-
|
|
1579
|
-
skip_nodes
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1682
|
+
|
|
1683
|
+
skip_nodes, execution_order = compute_execution_plan(
|
|
1684
|
+
nodes=self.nodes,
|
|
1685
|
+
flow_starts=self._flow_starts+self.get_implicit_starter_nodes()
|
|
1686
|
+
)
|
|
1687
|
+
|
|
1688
|
+
self.latest_run_info = self.create_initial_run_information(len(execution_order), "full_run")
|
|
1689
|
+
|
|
1583
1690
|
skip_node_message(self.flow_logger, skip_nodes)
|
|
1584
1691
|
execution_order_message(self.flow_logger, execution_order)
|
|
1585
1692
|
performance_mode = self.flow_settings.execution_mode == 'Performance'
|
|
1586
|
-
|
|
1587
|
-
OFFLOAD_TO_WORKER.value = False
|
|
1588
|
-
elif self.flow_settings.execution_location == 'remote':
|
|
1589
|
-
OFFLOAD_TO_WORKER.value = True
|
|
1693
|
+
|
|
1590
1694
|
for node in execution_order:
|
|
1591
1695
|
node_logger = self.flow_logger.get_node_logger(node.node_id)
|
|
1592
1696
|
if self.flow_settings.is_canceled:
|
|
@@ -1596,7 +1700,7 @@ class FlowGraph:
|
|
|
1596
1700
|
node_logger.info(f'Skipping node {node.node_id}')
|
|
1597
1701
|
continue
|
|
1598
1702
|
node_result = NodeResult(node_id=node.node_id, node_name=node.name)
|
|
1599
|
-
self.
|
|
1703
|
+
self.latest_run_info.node_step_result.append(node_result)
|
|
1600
1704
|
logger.info(f'Starting to run: node {node.node_id}, start time: {node_result.start_timestamp}')
|
|
1601
1705
|
node.execute_node(run_location=self.flow_settings.execution_location,
|
|
1602
1706
|
performance_mode=performance_mode,
|
|
@@ -1622,7 +1726,7 @@ class FlowGraph:
|
|
|
1622
1726
|
if not node_result.success:
|
|
1623
1727
|
skip_nodes.extend(list(node.get_all_dependent_nodes()))
|
|
1624
1728
|
node_logger.info(f'Completed node with success: {node_result.success}')
|
|
1625
|
-
self.nodes_completed += 1
|
|
1729
|
+
self.latest_run_info.nodes_completed += 1
|
|
1626
1730
|
self.flow_logger.info('Flow completed!')
|
|
1627
1731
|
self.end_datetime = datetime.datetime.now()
|
|
1628
1732
|
self.flow_settings.is_running = False
|
|
@@ -1634,28 +1738,23 @@ class FlowGraph:
|
|
|
1634
1738
|
finally:
|
|
1635
1739
|
self.flow_settings.is_running = False
|
|
1636
1740
|
|
|
1637
|
-
def get_run_info(self) -> RunInformation:
|
|
1741
|
+
def get_run_info(self) -> RunInformation | None:
|
|
1638
1742
|
"""Gets a summary of the most recent graph execution.
|
|
1639
1743
|
|
|
1640
1744
|
Returns:
|
|
1641
1745
|
A RunInformation object with details about the last run.
|
|
1642
1746
|
"""
|
|
1747
|
+
is_running = self.flow_settings.is_running
|
|
1643
1748
|
if self.latest_run_info is None:
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
self.latest_run_info = RunInformation(start_time=self.start_datetime, end_time=self.end_datetime,
|
|
1654
|
-
success=all(nr.success for nr in node_results),
|
|
1655
|
-
node_step_result=node_results, flow_id=self.flow_id,
|
|
1656
|
-
nodes_completed=self.nodes_completed,
|
|
1657
|
-
number_of_nodes=len(self.nodes))
|
|
1658
|
-
return self.latest_run_info
|
|
1749
|
+
return
|
|
1750
|
+
|
|
1751
|
+
elif not is_running and self.latest_run_info.success is not None:
|
|
1752
|
+
return self.latest_run_info
|
|
1753
|
+
|
|
1754
|
+
run_info = self.latest_run_info
|
|
1755
|
+
if not is_running:
|
|
1756
|
+
run_info.success = all(nr.success for nr in run_info.node_step_result)
|
|
1757
|
+
return run_info
|
|
1659
1758
|
|
|
1660
1759
|
@property
|
|
1661
1760
|
def node_connections(self) -> List[Tuple[int, int]]:
|
|
@@ -1726,8 +1825,14 @@ class FlowGraph:
|
|
|
1726
1825
|
Args:
|
|
1727
1826
|
flow_path: The path where the flow file will be saved.
|
|
1728
1827
|
"""
|
|
1729
|
-
|
|
1730
|
-
|
|
1828
|
+
logger.info("Saving flow to %s", flow_path)
|
|
1829
|
+
os.makedirs(os.path.dirname(flow_path), exist_ok=True)
|
|
1830
|
+
try:
|
|
1831
|
+
with open(flow_path, 'wb') as f:
|
|
1832
|
+
pickle.dump(self.get_node_storage(), f)
|
|
1833
|
+
except Exception as e:
|
|
1834
|
+
logger.error(f"Error saving flow: {e}")
|
|
1835
|
+
|
|
1731
1836
|
self.flow_settings.path = flow_path
|
|
1732
1837
|
|
|
1733
1838
|
def get_frontend_data(self) -> dict:
|
|
@@ -1922,4 +2027,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
|
|
|
1922
2027
|
to_node.delete_input_node(
|
|
1923
2028
|
node_connection.output_connection.node_id,
|
|
1924
2029
|
connection_type=node_connection.input_connection.connection_class,
|
|
1925
|
-
)
|
|
2030
|
+
)
|
|
@@ -5,7 +5,6 @@ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEng
|
|
|
5
5
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
6
6
|
from flowfile_core.schemas import input_schema, schemas
|
|
7
7
|
from flowfile_core.configs.flow_logger import NodeLogger
|
|
8
|
-
from flowfile_core.configs.settings import SINGLE_FILE_MODE, OFFLOAD_TO_WORKER
|
|
9
8
|
|
|
10
9
|
from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
|
|
11
10
|
from flowfile_core.flowfile.utils import get_hash
|
|
@@ -13,7 +12,7 @@ from flowfile_core.configs.node_store import nodes as node_interface
|
|
|
13
12
|
from flowfile_core.flowfile.setting_generator import setting_generator, setting_updator
|
|
14
13
|
from time import sleep
|
|
15
14
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations import (
|
|
16
|
-
ExternalDfFetcher, ExternalSampler, results_exists, get_external_df_result,
|
|
15
|
+
ExternalDfFetcher, ExternalSampler, clear_task_from_worker, results_exists, get_external_df_result,
|
|
17
16
|
ExternalDatabaseFetcher, ExternalDatabaseWriter, ExternalCloudWriter)
|
|
18
17
|
from flowfile_core.flowfile.flow_node.models import (NodeStepSettings, NodeStepInputs, NodeSchemaInformation,
|
|
19
18
|
NodeStepStats, NodeResults)
|
|
@@ -679,9 +678,10 @@ class FlowNode:
|
|
|
679
678
|
|
|
680
679
|
if results_exists(self.hash):
|
|
681
680
|
logger.warning('Not implemented')
|
|
681
|
+
clear_task_from_worker(self.hash)
|
|
682
682
|
|
|
683
683
|
def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
|
|
684
|
-
execution_location: schemas.ExecutionLocationsLiteral = "
|
|
684
|
+
execution_location: schemas.ExecutionLocationsLiteral = "remote") -> bool:
|
|
685
685
|
"""Determines if the node needs to be executed.
|
|
686
686
|
|
|
687
687
|
The decision is based on its run state, caching settings, and execution mode.
|
|
@@ -694,7 +694,7 @@ class FlowNode:
|
|
|
694
694
|
Returns:
|
|
695
695
|
True if the node should be run, False otherwise.
|
|
696
696
|
"""
|
|
697
|
-
if execution_location == "local"
|
|
697
|
+
if execution_location == "local":
|
|
698
698
|
return False
|
|
699
699
|
|
|
700
700
|
flow_logger = logger if node_logger is None else node_logger
|
|
@@ -724,6 +724,8 @@ class FlowNode:
|
|
|
724
724
|
Raises:
|
|
725
725
|
Exception: Propagates exceptions from the execution.
|
|
726
726
|
"""
|
|
727
|
+
self.clear_table_example()
|
|
728
|
+
|
|
727
729
|
def example_data_generator():
|
|
728
730
|
example_data = None
|
|
729
731
|
|
|
@@ -736,6 +738,7 @@ class FlowNode:
|
|
|
736
738
|
resulting_data = self.get_resulting_data()
|
|
737
739
|
|
|
738
740
|
if not performance_mode:
|
|
741
|
+
self.node_stats.has_run_with_current_setup = True
|
|
739
742
|
self.results.example_data_generator = example_data_generator()
|
|
740
743
|
self.node_schema.result_schema = self.results.resulting_data.schema
|
|
741
744
|
self.node_stats.has_completed_last_run = True
|
|
@@ -855,8 +858,12 @@ class FlowNode:
|
|
|
855
858
|
logger.warning('No external process to cancel')
|
|
856
859
|
self.node_stats.is_canceled = True
|
|
857
860
|
|
|
858
|
-
def execute_node(self, run_location: schemas.ExecutionLocationsLiteral,
|
|
859
|
-
|
|
861
|
+
def execute_node(self, run_location: schemas.ExecutionLocationsLiteral,
|
|
862
|
+
reset_cache: bool = False,
|
|
863
|
+
performance_mode: bool = False,
|
|
864
|
+
retry: bool = True,
|
|
865
|
+
node_logger: NodeLogger = None,
|
|
866
|
+
optimize_for_downstream: bool = True):
|
|
860
867
|
"""Orchestrates the execution, handling location, caching, and retries.
|
|
861
868
|
|
|
862
869
|
Args:
|
|
@@ -865,25 +872,33 @@ class FlowNode:
|
|
|
865
872
|
performance_mode: If True, optimizes for speed over diagnostics.
|
|
866
873
|
retry: If True, allows retrying execution on recoverable errors.
|
|
867
874
|
node_logger: The logger for this node execution.
|
|
875
|
+
optimize_for_downstream: If true, operations that shuffle the order of rows are fully cached and provided as
|
|
876
|
+
input to downstream steps
|
|
868
877
|
|
|
869
878
|
Raises:
|
|
870
879
|
Exception: If the node_logger is not defined.
|
|
871
880
|
"""
|
|
872
881
|
if node_logger is None:
|
|
873
882
|
raise Exception('Flow logger is not defined')
|
|
874
|
-
#
|
|
883
|
+
# TODO: Simplify which route is being picked there are many duplicate checks
|
|
884
|
+
|
|
875
885
|
if reset_cache:
|
|
876
886
|
self.remove_cache()
|
|
877
887
|
self.node_stats.has_run_with_current_setup = False
|
|
878
888
|
self.node_stats.has_completed_last_run = False
|
|
889
|
+
|
|
879
890
|
if self.is_setup:
|
|
880
891
|
node_logger.info(f'Starting to run {self.__name__}')
|
|
881
892
|
if (self.needs_run(performance_mode, node_logger, run_location) or self.node_template.node_group == "output"
|
|
882
|
-
and not (run_location == 'local'
|
|
893
|
+
and not (run_location == 'local')):
|
|
894
|
+
self.clear_table_example()
|
|
883
895
|
self.prepare_before_run()
|
|
896
|
+
self.reset()
|
|
884
897
|
try:
|
|
885
|
-
if ((run_location == 'remote' or
|
|
886
|
-
|
|
898
|
+
if (((run_location == 'remote' or
|
|
899
|
+
(self.node_default.transform_type == 'wide' and optimize_for_downstream) and
|
|
900
|
+
not run_location == 'local'))
|
|
901
|
+
or self.node_settings.cache_results):
|
|
887
902
|
node_logger.info('Running the node remotely')
|
|
888
903
|
if self.node_settings.cache_results:
|
|
889
904
|
performance_mode = False
|
|
@@ -908,8 +923,14 @@ class FlowNode:
|
|
|
908
923
|
node_logger=node_logger)
|
|
909
924
|
else:
|
|
910
925
|
self.results.errors = str(e)
|
|
911
|
-
|
|
912
|
-
|
|
926
|
+
if "Connection refused" in str(e) and "/submit_query/" in str(e):
|
|
927
|
+
node_logger.warning("There was an issue connecting to the remote worker, "
|
|
928
|
+
"ensure the worker process is running, "
|
|
929
|
+
"or change the settings to, so it executes locally")
|
|
930
|
+
node_logger.error("Could not execute in the remote worker. (Re)start the worker service, or change settings to local settings.")
|
|
931
|
+
else:
|
|
932
|
+
node_logger.error(f'Error with running the node: {e}')
|
|
933
|
+
elif ((run_location == 'local') and
|
|
913
934
|
(not self.node_stats.has_run_with_current_setup or self.node_template.node_group == "output")):
|
|
914
935
|
try:
|
|
915
936
|
node_logger.info('Executing fully locally')
|
|
@@ -919,7 +940,7 @@ class FlowNode:
|
|
|
919
940
|
node_logger.error(f'Error with running the node: {e}')
|
|
920
941
|
self.node_stats.error = str(e)
|
|
921
942
|
self.node_stats.has_completed_last_run = False
|
|
922
|
-
|
|
943
|
+
|
|
923
944
|
else:
|
|
924
945
|
node_logger.info('Node has already run, not running the node')
|
|
925
946
|
else:
|
|
@@ -959,16 +980,16 @@ class FlowNode:
|
|
|
959
980
|
logger.info(f'{self.node_id}: Node needs reset')
|
|
960
981
|
self.node_stats.has_run_with_current_setup = False
|
|
961
982
|
self.results.reset()
|
|
962
|
-
if self.is_correct:
|
|
963
|
-
self._schema_callback = None # Ensure the schema callback is reset
|
|
964
|
-
if self.schema_callback:
|
|
965
|
-
logger.info(f'{self.node_id}: Resetting the schema callback')
|
|
966
|
-
self.schema_callback.start()
|
|
967
983
|
self.node_schema.result_schema = None
|
|
968
984
|
self.node_schema.predicted_schema = None
|
|
969
985
|
self._hash = None
|
|
970
986
|
self.node_information.is_setup = None
|
|
971
987
|
self.results.errors = None
|
|
988
|
+
if self.is_correct:
|
|
989
|
+
self._schema_callback = None # Ensure the schema callback is reset
|
|
990
|
+
if self.schema_callback:
|
|
991
|
+
logger.info(f'{self.node_id}: Resetting the schema callback')
|
|
992
|
+
self.schema_callback.start()
|
|
972
993
|
self.evaluate_nodes()
|
|
973
994
|
_ = self.hash # Recalculate the hash after reset
|
|
974
995
|
|
|
@@ -1103,6 +1124,17 @@ class FlowNode:
|
|
|
1103
1124
|
if self.singular_input:
|
|
1104
1125
|
return self.all_inputs[0]
|
|
1105
1126
|
|
|
1127
|
+
def clear_table_example(self) -> None:
|
|
1128
|
+
"""
|
|
1129
|
+
Clear the table example in the results so that it clears the existing results
|
|
1130
|
+
Returns:
|
|
1131
|
+
None
|
|
1132
|
+
"""
|
|
1133
|
+
|
|
1134
|
+
self.results.example_data = None
|
|
1135
|
+
self.results.example_data_generator = None
|
|
1136
|
+
self.results.example_data_path = None
|
|
1137
|
+
|
|
1106
1138
|
def get_table_example(self, include_data: bool = False) -> TableExample | None:
|
|
1107
1139
|
"""Generates a `TableExample` model summarizing the node's output.
|
|
1108
1140
|
|
|
@@ -1131,10 +1163,15 @@ class FlowNode:
|
|
|
1131
1163
|
data = []
|
|
1132
1164
|
schema = [FileColumn.model_validate(c.get_column_repr()) for c in self.schema]
|
|
1133
1165
|
fl = self.get_resulting_data()
|
|
1166
|
+
has_example_data = self.results.example_data_generator is not None
|
|
1167
|
+
|
|
1134
1168
|
return TableExample(node_id=self.node_id,
|
|
1135
1169
|
name=str(self.node_id), number_of_records=999,
|
|
1136
1170
|
number_of_columns=fl.number_of_fields,
|
|
1137
|
-
table_schema=schema, columns=fl.columns, data=data
|
|
1171
|
+
table_schema=schema, columns=fl.columns, data=data,
|
|
1172
|
+
has_example_data=has_example_data,
|
|
1173
|
+
has_run_with_current_setup=self.node_stats.has_run_with_current_setup
|
|
1174
|
+
)
|
|
1138
1175
|
else:
|
|
1139
1176
|
logger.warning('getting the table example but the node has not run')
|
|
1140
1177
|
try:
|