Flowfile 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +4 -3
- flowfile/api.py +1 -0
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
- flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
- flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
- flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
- flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
- flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
- flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
- flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
- flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
- flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
- {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/RECORD +81 -83
- flowfile_core/configs/settings.py +4 -2
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
- flowfile_core/flowfile/flow_graph.py +128 -87
- flowfile_core/flowfile/flow_node/flow_node.py +16 -11
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
- flowfile_core/flowfile/setting_generator/settings.py +2 -1
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/schemas/schemas.py +46 -3
- flowfile_core/schemas/transform_schema.py +27 -38
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/flow_frame.py +33 -4
- flowfile_frame/flow_frame.pyi +2 -0
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/models.py +3 -1
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
- {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
- {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
|
@@ -2,7 +2,6 @@ import datetime
|
|
|
2
2
|
import pickle
|
|
3
3
|
import polars as pl
|
|
4
4
|
import fastexcel
|
|
5
|
-
import re
|
|
6
5
|
from fastapi.exceptions import HTTPException
|
|
7
6
|
from time import time
|
|
8
7
|
from functools import partial
|
|
@@ -11,17 +10,17 @@ from uuid import uuid1
|
|
|
11
10
|
from copy import deepcopy
|
|
12
11
|
from pyarrow.parquet import ParquetFile
|
|
13
12
|
from flowfile_core.configs import logger
|
|
14
|
-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
|
|
15
13
|
from flowfile_core.configs.flow_logger import FlowLogger
|
|
16
14
|
from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
|
|
17
15
|
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
|
|
18
|
-
|
|
19
|
-
pre_calculate_pivot_schema)
|
|
16
|
+
|
|
20
17
|
from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
|
|
21
18
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
22
19
|
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
|
|
23
|
-
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes,
|
|
24
|
-
|
|
20
|
+
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import (get_open_xlsx_datatypes,
|
|
21
|
+
get_calamine_xlsx_data_types)
|
|
22
|
+
|
|
23
|
+
from flowfile_core.flowfile.schema_callbacks import (calculate_fuzzy_match_schema, pre_calculate_pivot_schema)
|
|
25
24
|
from flowfile_core.flowfile.sources import external_sources
|
|
26
25
|
from flowfile_core.schemas import input_schema, schemas, transform_schema
|
|
27
26
|
from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
|
|
@@ -32,7 +31,11 @@ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSetting
|
|
|
32
31
|
from flowfile_core.flowfile.utils import snake_case_to_camel_case
|
|
33
32
|
from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
|
|
34
33
|
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
35
|
-
from flowfile_core.flowfile.util.execution_orderer import
|
|
34
|
+
from flowfile_core.flowfile.util.execution_orderer import compute_execution_plan
|
|
35
|
+
from flowfile_core.flowfile.graph_tree.graph_tree import (add_un_drawn_nodes, build_flow_paths,
|
|
36
|
+
build_node_info, calculate_depth,
|
|
37
|
+
define_node_connections, draw_merged_paths,
|
|
38
|
+
draw_standalone_paths, group_nodes_by_depth)
|
|
36
39
|
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
37
40
|
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
|
|
38
41
|
ExternalDatabaseWriter,
|
|
@@ -177,7 +180,7 @@ class FlowGraph:
|
|
|
177
180
|
start_datetime: datetime = None
|
|
178
181
|
end_datetime: datetime = None
|
|
179
182
|
nodes_completed: int = 0
|
|
180
|
-
|
|
183
|
+
_flow_settings: schemas.FlowSettings = None
|
|
181
184
|
flow_logger: FlowLogger
|
|
182
185
|
|
|
183
186
|
def __init__(self,
|
|
@@ -201,7 +204,7 @@ class FlowGraph:
|
|
|
201
204
|
if isinstance(flow_settings, schemas.FlowGraphConfig):
|
|
202
205
|
flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
|
|
203
206
|
|
|
204
|
-
self.
|
|
207
|
+
self._flow_settings = flow_settings
|
|
205
208
|
self.uuid = str(uuid1())
|
|
206
209
|
self.nodes_completed = 0
|
|
207
210
|
self.start_datetime = None
|
|
@@ -226,6 +229,19 @@ class FlowGraph:
|
|
|
226
229
|
elif input_flow is not None:
|
|
227
230
|
self.add_datasource(input_file=input_flow)
|
|
228
231
|
|
|
232
|
+
@property
|
|
233
|
+
def flow_settings(self) -> schemas.FlowSettings:
|
|
234
|
+
return self._flow_settings
|
|
235
|
+
|
|
236
|
+
@flow_settings.setter
|
|
237
|
+
def flow_settings(self, flow_settings: schemas.FlowSettings):
|
|
238
|
+
if (
|
|
239
|
+
(self._flow_settings.execution_location != flow_settings.execution_location) or
|
|
240
|
+
(self._flow_settings.execution_mode != flow_settings.execution_mode)
|
|
241
|
+
):
|
|
242
|
+
self.reset()
|
|
243
|
+
self._flow_settings = flow_settings
|
|
244
|
+
|
|
229
245
|
def add_node_promise(self, node_promise: input_schema.NodePromise):
|
|
230
246
|
"""Adds a placeholder node to the graph that is not yet fully configured.
|
|
231
247
|
|
|
@@ -242,64 +258,6 @@ class FlowGraph:
|
|
|
242
258
|
self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
|
|
243
259
|
setting_input=node_promise)
|
|
244
260
|
|
|
245
|
-
def print_tree(self, show_schema=False, show_descriptions=False):
|
|
246
|
-
"""
|
|
247
|
-
Print flow_graph as a tree.
|
|
248
|
-
"""
|
|
249
|
-
max_node_id = max(self._node_db.keys())
|
|
250
|
-
|
|
251
|
-
tree = ""
|
|
252
|
-
tabs = 0
|
|
253
|
-
tab_counter = 0
|
|
254
|
-
for node in self.nodes:
|
|
255
|
-
tab_counter += 1
|
|
256
|
-
node_input = node.setting_input
|
|
257
|
-
operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
|
|
258
|
-
|
|
259
|
-
if operation == "Formula":
|
|
260
|
-
operation = "With Columns"
|
|
261
|
-
|
|
262
|
-
tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
|
|
263
|
-
|
|
264
|
-
if show_descriptions & show_schema:
|
|
265
|
-
raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
|
|
266
|
-
if show_descriptions:
|
|
267
|
-
tree += ": " + str(node_input.description)
|
|
268
|
-
elif show_schema:
|
|
269
|
-
tree += " -> ["
|
|
270
|
-
if operation == "Manual Input":
|
|
271
|
-
schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
|
|
272
|
-
tree += schema
|
|
273
|
-
elif operation == "With Columns":
|
|
274
|
-
tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
|
|
275
|
-
tree += schema + tree_with_col_schema
|
|
276
|
-
elif operation == "Filter":
|
|
277
|
-
index = node_input.filter_input.advanced_filter.find("]")
|
|
278
|
-
filtered_column = str(node_input.filter_input.advanced_filter[1:index])
|
|
279
|
-
schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
|
|
280
|
-
tree += schema
|
|
281
|
-
elif operation == "Group By":
|
|
282
|
-
for col in node_input.groupby_input.agg_cols:
|
|
283
|
-
schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
|
|
284
|
-
tree += schema
|
|
285
|
-
tree += "]"
|
|
286
|
-
else:
|
|
287
|
-
if operation == "Manual Input":
|
|
288
|
-
tree += ": " + str(node_input.raw_data_format.data)
|
|
289
|
-
elif operation == "With Columns":
|
|
290
|
-
tree += ": " + str(node_input.function)
|
|
291
|
-
elif operation == "Filter":
|
|
292
|
-
tree += ": " + str(node_input.filter_input.advanced_filter)
|
|
293
|
-
elif operation == "Group By":
|
|
294
|
-
tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
|
|
295
|
-
tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
|
|
296
|
-
|
|
297
|
-
if node_input.node_id < max_node_id:
|
|
298
|
-
tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
|
|
299
|
-
print("\n"*2)
|
|
300
|
-
|
|
301
|
-
return print(tree)
|
|
302
|
-
|
|
303
261
|
def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
|
|
304
262
|
"""Calculates and applies a layered layout to all nodes in the graph.
|
|
305
263
|
|
|
@@ -368,6 +326,86 @@ class FlowGraph:
|
|
|
368
326
|
settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
|
|
369
327
|
return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
|
|
370
328
|
|
|
329
|
+
def print_tree(self):
|
|
330
|
+
"""Print flow_graph as a visual tree structure, showing the DAG relationships with ASCII art."""
|
|
331
|
+
if not self._node_db:
|
|
332
|
+
self.flow_logger.info("Empty flow graph")
|
|
333
|
+
return
|
|
334
|
+
|
|
335
|
+
# Build node information
|
|
336
|
+
node_info = build_node_info(self.nodes)
|
|
337
|
+
|
|
338
|
+
# Calculate depths for all nodes
|
|
339
|
+
for node_id in node_info:
|
|
340
|
+
calculate_depth(node_id, node_info)
|
|
341
|
+
|
|
342
|
+
# Group nodes by depth
|
|
343
|
+
depth_groups, max_depth = group_nodes_by_depth(node_info)
|
|
344
|
+
|
|
345
|
+
# Sort nodes within each depth group
|
|
346
|
+
for depth in depth_groups:
|
|
347
|
+
depth_groups[depth].sort()
|
|
348
|
+
|
|
349
|
+
# Create the main flow visualization
|
|
350
|
+
lines = ["=" * 80, "Flow Graph Visualization", "=" * 80, ""]
|
|
351
|
+
|
|
352
|
+
# Track which nodes connect to what
|
|
353
|
+
merge_points = define_node_connections(node_info)
|
|
354
|
+
|
|
355
|
+
# Build the flow paths
|
|
356
|
+
|
|
357
|
+
# Find the maximum label length for each depth level
|
|
358
|
+
max_label_length = {}
|
|
359
|
+
for depth in range(max_depth + 1):
|
|
360
|
+
if depth in depth_groups:
|
|
361
|
+
max_len = max(len(node_info[nid].label) for nid in depth_groups[depth])
|
|
362
|
+
max_label_length[depth] = max_len
|
|
363
|
+
|
|
364
|
+
# Draw the paths
|
|
365
|
+
drawn_nodes = set()
|
|
366
|
+
merge_drawn = set()
|
|
367
|
+
|
|
368
|
+
# Group paths by their merge points
|
|
369
|
+
paths_by_merge = {}
|
|
370
|
+
standalone_paths = []
|
|
371
|
+
|
|
372
|
+
# Build flow paths
|
|
373
|
+
paths = build_flow_paths(node_info, self._flow_starts, merge_points)
|
|
374
|
+
|
|
375
|
+
# Define paths to merge and standalone paths
|
|
376
|
+
for path in paths:
|
|
377
|
+
if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
|
|
378
|
+
merge_id = path[-1]
|
|
379
|
+
if merge_id not in paths_by_merge:
|
|
380
|
+
paths_by_merge[merge_id] = []
|
|
381
|
+
paths_by_merge[merge_id].append(path)
|
|
382
|
+
else:
|
|
383
|
+
standalone_paths.append(path)
|
|
384
|
+
|
|
385
|
+
# Draw merged paths
|
|
386
|
+
draw_merged_paths(node_info, merge_points, paths_by_merge, merge_drawn, drawn_nodes, lines)
|
|
387
|
+
|
|
388
|
+
# Draw standlone paths
|
|
389
|
+
draw_standalone_paths(drawn_nodes, standalone_paths, lines, node_info)
|
|
390
|
+
|
|
391
|
+
# Add undrawn nodes
|
|
392
|
+
add_un_drawn_nodes(drawn_nodes, node_info, lines)
|
|
393
|
+
|
|
394
|
+
try:
|
|
395
|
+
skip_nodes, ordered_nodes = compute_execution_plan(
|
|
396
|
+
nodes=self.nodes,
|
|
397
|
+
flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
|
|
398
|
+
if ordered_nodes:
|
|
399
|
+
for i, node in enumerate(ordered_nodes, 1):
|
|
400
|
+
lines.append(f" {i:3d}. {node_info[node.node_id].label}")
|
|
401
|
+
except Exception as e:
|
|
402
|
+
lines.append(f" Could not determine execution order: {e}")
|
|
403
|
+
|
|
404
|
+
# Print everything
|
|
405
|
+
output = "\n".join(lines)
|
|
406
|
+
|
|
407
|
+
print(output)
|
|
408
|
+
|
|
371
409
|
def get_nodes_overview(self):
|
|
372
410
|
"""Gets a list of dictionary representations for all nodes in the graph."""
|
|
373
411
|
output = []
|
|
@@ -774,26 +812,34 @@ class FlowGraph:
|
|
|
774
812
|
"""
|
|
775
813
|
|
|
776
814
|
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
815
|
+
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
816
|
+
if self.execution_location == "local":
|
|
817
|
+
return main.fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input,
|
|
818
|
+
other=right,
|
|
819
|
+
node_logger=self.flow_logger.get_node_logger(fuzzy_settings.node_id))
|
|
820
|
+
|
|
777
821
|
f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
|
|
778
822
|
flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
|
|
779
823
|
logger.info("Started the fuzzy match action")
|
|
780
|
-
node._fetch_cached_df = f
|
|
824
|
+
node._fetch_cached_df = f # Add to the node so it can be cancelled and fetch later if needed
|
|
781
825
|
return FlowDataEngine(f.get_result())
|
|
782
826
|
|
|
783
|
-
self.add_node_step(node_id=fuzzy_settings.node_id,
|
|
784
|
-
function=_func,
|
|
785
|
-
input_columns=[],
|
|
786
|
-
node_type='fuzzy_match',
|
|
787
|
-
setting_input=fuzzy_settings)
|
|
788
|
-
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
789
|
-
|
|
790
827
|
def schema_callback():
|
|
791
|
-
|
|
828
|
+
fm_input_copy = deepcopy(fuzzy_settings.join_input) # Deepcopy create an unique object per func
|
|
829
|
+
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
830
|
+
return calculate_fuzzy_match_schema(fm_input_copy,
|
|
792
831
|
left_schema=node.node_inputs.main_inputs[0].schema,
|
|
793
832
|
right_schema=node.node_inputs.right_input.schema
|
|
794
833
|
)
|
|
795
834
|
|
|
796
|
-
|
|
835
|
+
self.add_node_step(node_id=fuzzy_settings.node_id,
|
|
836
|
+
function=_func,
|
|
837
|
+
input_columns=[],
|
|
838
|
+
node_type='fuzzy_match',
|
|
839
|
+
setting_input=fuzzy_settings,
|
|
840
|
+
input_node_ids=fuzzy_settings.depending_on_ids,
|
|
841
|
+
schema_callback=schema_callback)
|
|
842
|
+
|
|
797
843
|
return self
|
|
798
844
|
|
|
799
845
|
def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
|
|
@@ -1549,6 +1595,8 @@ class FlowGraph:
|
|
|
1549
1595
|
Args:
|
|
1550
1596
|
execution_location: The execution location to set.
|
|
1551
1597
|
"""
|
|
1598
|
+
if self.flow_settings.execution_location != execution_location:
|
|
1599
|
+
self.reset()
|
|
1552
1600
|
self.flow_settings.execution_location = execution_location
|
|
1553
1601
|
|
|
1554
1602
|
def run_graph(self) -> RunInformation | None:
|
|
@@ -1575,18 +1623,11 @@ class FlowGraph:
|
|
|
1575
1623
|
self.end_datetime = None
|
|
1576
1624
|
self.latest_run_info = None
|
|
1577
1625
|
self.flow_logger.info('Starting to run flowfile flow...')
|
|
1578
|
-
skip_nodes =
|
|
1579
|
-
|
|
1580
|
-
execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
|
|
1581
|
-
node not in skip_nodes],
|
|
1582
|
-
flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
|
|
1626
|
+
skip_nodes, execution_order = compute_execution_plan(nodes=self.nodes, flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
|
|
1627
|
+
|
|
1583
1628
|
skip_node_message(self.flow_logger, skip_nodes)
|
|
1584
1629
|
execution_order_message(self.flow_logger, execution_order)
|
|
1585
1630
|
performance_mode = self.flow_settings.execution_mode == 'Performance'
|
|
1586
|
-
if self.flow_settings.execution_location == 'local':
|
|
1587
|
-
OFFLOAD_TO_WORKER.value = False
|
|
1588
|
-
elif self.flow_settings.execution_location == 'remote':
|
|
1589
|
-
OFFLOAD_TO_WORKER.value = True
|
|
1590
1631
|
for node in execution_order:
|
|
1591
1632
|
node_logger = self.flow_logger.get_node_logger(node.node_id)
|
|
1592
1633
|
if self.flow_settings.is_canceled:
|
|
@@ -1922,4 +1963,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
|
|
|
1922
1963
|
to_node.delete_input_node(
|
|
1923
1964
|
node_connection.output_connection.node_id,
|
|
1924
1965
|
connection_type=node_connection.input_connection.connection_class,
|
|
1925
|
-
)
|
|
1966
|
+
)
|
|
@@ -5,7 +5,6 @@ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEng
|
|
|
5
5
|
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
6
6
|
from flowfile_core.schemas import input_schema, schemas
|
|
7
7
|
from flowfile_core.configs.flow_logger import NodeLogger
|
|
8
|
-
from flowfile_core.configs.settings import SINGLE_FILE_MODE, OFFLOAD_TO_WORKER
|
|
9
8
|
|
|
10
9
|
from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
|
|
11
10
|
from flowfile_core.flowfile.utils import get_hash
|
|
@@ -681,7 +680,7 @@ class FlowNode:
|
|
|
681
680
|
logger.warning('Not implemented')
|
|
682
681
|
|
|
683
682
|
def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
|
|
684
|
-
execution_location: schemas.ExecutionLocationsLiteral = "
|
|
683
|
+
execution_location: schemas.ExecutionLocationsLiteral = "worker") -> bool:
|
|
685
684
|
"""Determines if the node needs to be executed.
|
|
686
685
|
|
|
687
686
|
The decision is based on its run state, caching settings, and execution mode.
|
|
@@ -694,7 +693,7 @@ class FlowNode:
|
|
|
694
693
|
Returns:
|
|
695
694
|
True if the node should be run, False otherwise.
|
|
696
695
|
"""
|
|
697
|
-
if execution_location == "local"
|
|
696
|
+
if execution_location == "local":
|
|
698
697
|
return False
|
|
699
698
|
|
|
700
699
|
flow_logger = logger if node_logger is None else node_logger
|
|
@@ -879,7 +878,7 @@ class FlowNode:
|
|
|
879
878
|
if self.is_setup:
|
|
880
879
|
node_logger.info(f'Starting to run {self.__name__}')
|
|
881
880
|
if (self.needs_run(performance_mode, node_logger, run_location) or self.node_template.node_group == "output"
|
|
882
|
-
and not (run_location == 'local'
|
|
881
|
+
and not (run_location == 'local')):
|
|
883
882
|
self.prepare_before_run()
|
|
884
883
|
try:
|
|
885
884
|
if ((run_location == 'remote' or (self.node_default.transform_type == 'wide')
|
|
@@ -908,8 +907,14 @@ class FlowNode:
|
|
|
908
907
|
node_logger=node_logger)
|
|
909
908
|
else:
|
|
910
909
|
self.results.errors = str(e)
|
|
911
|
-
|
|
912
|
-
|
|
910
|
+
if "Connection refused" in str(e) and "/submit_query/" in str(e):
|
|
911
|
+
node_logger.warning("There was an issue connecting to the remote worker, "
|
|
912
|
+
"ensure the worker process is running, "
|
|
913
|
+
"or change the settings to, so it executes locally")
|
|
914
|
+
node_logger.error("Could not execute in the remote worker. (Re)start the worker service, or change settings to local settings.")
|
|
915
|
+
else:
|
|
916
|
+
node_logger.error(f'Error with running the node: {e}')
|
|
917
|
+
elif ((run_location == 'local') and
|
|
913
918
|
(not self.node_stats.has_run_with_current_setup or self.node_template.node_group == "output")):
|
|
914
919
|
try:
|
|
915
920
|
node_logger.info('Executing fully locally')
|
|
@@ -959,16 +964,16 @@ class FlowNode:
|
|
|
959
964
|
logger.info(f'{self.node_id}: Node needs reset')
|
|
960
965
|
self.node_stats.has_run_with_current_setup = False
|
|
961
966
|
self.results.reset()
|
|
962
|
-
if self.is_correct:
|
|
963
|
-
self._schema_callback = None # Ensure the schema callback is reset
|
|
964
|
-
if self.schema_callback:
|
|
965
|
-
logger.info(f'{self.node_id}: Resetting the schema callback')
|
|
966
|
-
self.schema_callback.start()
|
|
967
967
|
self.node_schema.result_schema = None
|
|
968
968
|
self.node_schema.predicted_schema = None
|
|
969
969
|
self._hash = None
|
|
970
970
|
self.node_information.is_setup = None
|
|
971
971
|
self.results.errors = None
|
|
972
|
+
if self.is_correct:
|
|
973
|
+
self._schema_callback = None # Ensure the schema callback is reset
|
|
974
|
+
if self.schema_callback:
|
|
975
|
+
logger.info(f'{self.node_id}: Resetting the schema callback')
|
|
976
|
+
self.schema_callback.start()
|
|
972
977
|
self.evaluate_nodes()
|
|
973
978
|
_ = self.hash # Recalculate the hash after reset
|
|
974
979
|
|
|
@@ -108,14 +108,12 @@ class NodeStepSettings:
|
|
|
108
108
|
streamable: If True, the node can process data in a streaming fashion.
|
|
109
109
|
setup_errors: If True, indicates a non-blocking error occurred during setup.
|
|
110
110
|
breaking_setup_errors: If True, indicates an error occurred that prevents execution.
|
|
111
|
-
execute_location: The preferred location for execution ('auto', 'local', 'remote').
|
|
112
111
|
"""
|
|
113
112
|
cache_results: bool = False
|
|
114
113
|
renew_schema: bool = True
|
|
115
114
|
streamable: bool = True
|
|
116
115
|
setup_errors: bool = False
|
|
117
116
|
breaking_setup_errors: bool = False
|
|
118
|
-
execute_location: schemas.ExecutionLocationsLiteral = 'auto'
|
|
119
117
|
|
|
120
118
|
|
|
121
119
|
class NodeStepInputs:
|
|
@@ -1,71 +1,166 @@
|
|
|
1
|
-
|
|
2
1
|
from typing import Callable, Any, Optional, Generic, TypeVar
|
|
3
2
|
from concurrent.futures import ThreadPoolExecutor, Future
|
|
3
|
+
import threading
|
|
4
4
|
from flowfile_core.configs import logger
|
|
5
5
|
|
|
6
|
-
|
|
7
6
|
T = TypeVar('T')
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
class SingleExecutionFuture(Generic[T]):
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
|
|
10
|
+
"""Thread-safe single execution of a function with result caching.
|
|
11
|
+
|
|
12
|
+
Ensures a function is executed at most once even when called from multiple threads.
|
|
13
|
+
Subsequent calls return the cached result.
|
|
14
|
+
"""
|
|
15
|
+
|
|
14
16
|
func: Callable[[], T]
|
|
15
17
|
on_error: Optional[Callable[[Exception], Any]]
|
|
16
|
-
|
|
17
|
-
|
|
18
|
+
_lock: threading.RLock
|
|
19
|
+
_executor: Optional[ThreadPoolExecutor]
|
|
20
|
+
_future: Optional[Future[T]]
|
|
21
|
+
_result_value: Optional[T]
|
|
22
|
+
_exception: Optional[Exception]
|
|
23
|
+
_has_completed: bool
|
|
24
|
+
_has_started: bool
|
|
18
25
|
|
|
19
26
|
def __init__(
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
27
|
+
self,
|
|
28
|
+
func: Callable[[], T],
|
|
29
|
+
on_error: Optional[Callable[[Exception], Any]] = None
|
|
23
30
|
) -> None:
|
|
24
31
|
"""Initialize with function and optional error handler."""
|
|
25
|
-
self.executor = ThreadPoolExecutor(max_workers=1)
|
|
26
|
-
self.future = None
|
|
27
32
|
self.func = func
|
|
28
33
|
self.on_error = on_error
|
|
29
|
-
|
|
30
|
-
|
|
34
|
+
|
|
35
|
+
# Thread safety
|
|
36
|
+
self._lock = threading.RLock() # RLock allows re-entrant locking
|
|
37
|
+
|
|
38
|
+
# Execution state
|
|
39
|
+
self._executor = None
|
|
40
|
+
self._future = None
|
|
41
|
+
self._result_value = None
|
|
42
|
+
self._exception = None
|
|
43
|
+
self._has_completed = False
|
|
44
|
+
self._has_started = False
|
|
45
|
+
|
|
46
|
+
def _ensure_executor(self) -> ThreadPoolExecutor:
|
|
47
|
+
"""Ensure executor exists, creating if necessary."""
|
|
48
|
+
if self._executor is None or self._executor._shutdown:
|
|
49
|
+
self._executor = ThreadPoolExecutor(max_workers=1)
|
|
50
|
+
return self._executor
|
|
31
51
|
|
|
32
52
|
def start(self) -> None:
|
|
33
53
|
"""Start the function execution if not already started."""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
54
|
+
with self._lock:
|
|
55
|
+
if self._has_started:
|
|
56
|
+
logger.info("Function already started or completed")
|
|
57
|
+
return
|
|
58
|
+
|
|
59
|
+
logger.info("Starting single executor function")
|
|
60
|
+
executor: ThreadPoolExecutor = self._ensure_executor()
|
|
61
|
+
self._future = executor.submit(self._func_wrapper)
|
|
62
|
+
self._has_started = True
|
|
63
|
+
|
|
64
|
+
def _func_wrapper(self) -> T:
|
|
65
|
+
"""Wrapper to capture the result or exception."""
|
|
66
|
+
try:
|
|
67
|
+
result: T = self.func()
|
|
68
|
+
with self._lock:
|
|
69
|
+
self._result_value = result
|
|
70
|
+
self._has_completed = True
|
|
71
|
+
return result
|
|
72
|
+
except Exception as e:
|
|
73
|
+
with self._lock:
|
|
74
|
+
self._exception = e
|
|
75
|
+
self._has_completed = True
|
|
76
|
+
raise
|
|
37
77
|
|
|
38
78
|
def cleanup(self) -> None:
|
|
39
|
-
"""Clean up resources by
|
|
40
|
-
self.
|
|
41
|
-
|
|
79
|
+
"""Clean up resources by shutting down the executor."""
|
|
80
|
+
with self._lock:
|
|
81
|
+
if self._executor and not self._executor._shutdown:
|
|
82
|
+
self._executor.shutdown(wait=False)
|
|
42
83
|
|
|
43
84
|
def __call__(self) -> Optional[T]:
|
|
44
85
|
"""Execute function if not running and return its result."""
|
|
45
|
-
|
|
46
|
-
return
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
if self.
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
86
|
+
with self._lock:
|
|
87
|
+
# If already completed, return cached result or raise cached exception
|
|
88
|
+
if self._has_completed:
|
|
89
|
+
if self._exception:
|
|
90
|
+
if self.on_error:
|
|
91
|
+
return self.on_error(self._exception)
|
|
92
|
+
else:
|
|
93
|
+
raise self._exception
|
|
94
|
+
return self._result_value
|
|
95
|
+
|
|
96
|
+
# Start if not already started
|
|
97
|
+
if not self._has_started:
|
|
98
|
+
self.start()
|
|
99
|
+
|
|
100
|
+
# Wait for completion outside the lock to avoid blocking other threads
|
|
101
|
+
if self._future:
|
|
102
|
+
try:
|
|
103
|
+
result: T = self._future.result()
|
|
104
|
+
logger.info("Function completed successfully")
|
|
105
|
+
return result
|
|
106
|
+
except Exception as e:
|
|
107
|
+
logger.error(f"Function raised exception: {e}")
|
|
108
|
+
if self.on_error:
|
|
109
|
+
return self.on_error(e)
|
|
110
|
+
else:
|
|
111
|
+
raise
|
|
112
|
+
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
def reset(self) -> None:
|
|
116
|
+
"""Reset the execution state, allowing the function to be run again."""
|
|
117
|
+
with self._lock:
|
|
118
|
+
logger.info("Resetting single execution future")
|
|
119
|
+
|
|
120
|
+
# Cancel any pending execution
|
|
121
|
+
if self._future and not self._future.done():
|
|
122
|
+
self._future.cancel()
|
|
62
123
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
124
|
+
# Clean up old executor
|
|
125
|
+
if self._executor and not self._executor._shutdown:
|
|
126
|
+
self._executor.shutdown(wait=False)
|
|
127
|
+
|
|
128
|
+
# Reset state
|
|
129
|
+
self._executor = None
|
|
130
|
+
self._future = None
|
|
131
|
+
self._result_value = None
|
|
132
|
+
self._exception = None
|
|
133
|
+
self._has_completed = False
|
|
134
|
+
self._has_started = False
|
|
135
|
+
|
|
136
|
+
def is_running(self) -> bool:
|
|
137
|
+
"""Check if the function is currently executing."""
|
|
138
|
+
with self._lock:
|
|
139
|
+
return bool(
|
|
140
|
+
self._has_started and
|
|
141
|
+
not self._has_completed and
|
|
142
|
+
self._future is not None and
|
|
143
|
+
not self._future.done()
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
def is_completed(self) -> bool:
|
|
147
|
+
"""Check if the function has completed execution."""
|
|
148
|
+
with self._lock:
|
|
149
|
+
return self._has_completed
|
|
150
|
+
|
|
151
|
+
def get_result(self) -> Optional[T]:
|
|
152
|
+
"""Get the cached result without triggering execution."""
|
|
153
|
+
with self._lock:
|
|
154
|
+
if self._exception:
|
|
155
|
+
if self.on_error:
|
|
156
|
+
return self.on_error(self._exception)
|
|
157
|
+
else:
|
|
158
|
+
raise self._exception
|
|
159
|
+
return self._result_value
|
|
68
160
|
|
|
69
161
|
def __del__(self) -> None:
|
|
70
162
|
"""Ensure executor is shut down on deletion."""
|
|
71
|
-
|
|
163
|
+
try:
|
|
164
|
+
self.cleanup()
|
|
165
|
+
except Exception:
|
|
166
|
+
pass
|