Flowfile 0.3.7-py3-none-any.whl → 0.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +4 -3
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +2 -0
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
- flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
- flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
- flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
- flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
- flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
- flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
- flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
- flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
- flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
- flowfile_core/configs/settings.py +4 -2
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
- flowfile_core/flowfile/flow_graph.py +129 -88
- flowfile_core/flowfile/flow_node/flow_node.py +30 -15
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
- flowfile_core/flowfile/setting_generator/settings.py +2 -1
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/schemas/schemas.py +46 -3
- flowfile_core/schemas/transform_schema.py +27 -38
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/expr.py +14 -0
- flowfile_frame/flow_frame.py +34 -5
- flowfile_frame/flow_frame.pyi +5 -6
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/models.py +3 -1
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
flowfile_core/flowfile/flow_graph.py:

```diff
@@ -2,7 +2,6 @@ import datetime
 import pickle
 import polars as pl
 import fastexcel
-import re
 from fastapi.exceptions import HTTPException
 from time import time
 from functools import partial
@@ -11,17 +10,17 @@ from uuid import uuid1
 from copy import deepcopy
 from pyarrow.parquet import ParquetFile
 from flowfile_core.configs import logger
-from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.configs.flow_logger import FlowLogger
 from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
-
-                                                     pre_calculate_pivot_schema)
+
 from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
 from flowfile_core.utils.arrow_reader import get_read_top_n
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
-from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes,
-
+from flowfile_core.flowfile.flow_data_engine.read_excel_tables import (get_open_xlsx_datatypes,
+                                                                       get_calamine_xlsx_data_types)
+
+from flowfile_core.flowfile.schema_callbacks import (calculate_fuzzy_match_schema, pre_calculate_pivot_schema)
 from flowfile_core.flowfile.sources import external_sources
 from flowfile_core.schemas import input_schema, schemas, transform_schema
 from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
@@ -32,7 +31,11 @@ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSetting
 from flowfile_core.flowfile.utils import snake_case_to_camel_case
 from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
-from flowfile_core.flowfile.util.execution_orderer import
+from flowfile_core.flowfile.util.execution_orderer import compute_execution_plan
+from flowfile_core.flowfile.graph_tree.graph_tree import (add_un_drawn_nodes, build_flow_paths,
+                                                          build_node_info, calculate_depth,
+                                                          define_node_connections, draw_merged_paths,
+                                                          draw_standalone_paths, group_nodes_by_depth)
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
                                                                                                   ExternalDatabaseWriter,
```
```diff
@@ -177,7 +180,7 @@ class FlowGraph:
     start_datetime: datetime = None
     end_datetime: datetime = None
     nodes_completed: int = 0
-
+    _flow_settings: schemas.FlowSettings = None
     flow_logger: FlowLogger

     def __init__(self,
@@ -201,7 +204,7 @@ class FlowGraph:
         if isinstance(flow_settings, schemas.FlowGraphConfig):
             flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)

-        self.
+        self._flow_settings = flow_settings
         self.uuid = str(uuid1())
         self.nodes_completed = 0
         self.start_datetime = None
@@ -226,6 +229,19 @@ class FlowGraph:
         elif input_flow is not None:
             self.add_datasource(input_file=input_flow)

+    @property
+    def flow_settings(self) -> schemas.FlowSettings:
+        return self._flow_settings
+
+    @flow_settings.setter
+    def flow_settings(self, flow_settings: schemas.FlowSettings):
+        if (
+                (self._flow_settings.execution_location != flow_settings.execution_location) or
+                (self._flow_settings.execution_mode != flow_settings.execution_mode)
+        ):
+            self.reset()
+        self._flow_settings = flow_settings
+
     def add_node_promise(self, node_promise: input_schema.NodePromise):
         """Adds a placeholder node to the graph that is not yet fully configured.

```
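The new flow_settings property is the behavioral core of this hunk: reassigning the settings now invalidates cached results whenever the execution location or mode changes. A minimal, self-contained sketch of that reset-on-change property pattern (illustrative names, not Flowfile's actual classes):

```python
from dataclasses import dataclass


@dataclass
class Settings:
    execution_location: str = "worker"
    execution_mode: str = "Development"


class Graph:
    def __init__(self, settings: Settings):
        self._settings = settings
        self._results = {"node_1": "cached output"}  # stands in for per-node results

    def reset(self) -> None:
        self._results.clear()

    @property
    def settings(self) -> Settings:
        return self._settings

    @settings.setter
    def settings(self, new: Settings) -> None:
        # Invalidate only when a run-relevant field actually changes.
        if (self._settings.execution_location != new.execution_location
                or self._settings.execution_mode != new.execution_mode):
            self.reset()
        self._settings = new


g = Graph(Settings())
g.settings = Settings(execution_location="local")  # run location changed -> reset()
assert g._results == {}
```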
flowfile_core/flowfile/flow_graph.py (continued):

```diff
@@ -242,66 +258,6 @@ class FlowGraph:
         self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
                            setting_input=node_promise)

-    def print_tree(self, show_schema=False, show_descriptions=False):
-        """
-        Print flow_graph as a tree.
-        """
-        max_node_id = max(self._node_db.keys())
-
-        tree = ""
-        tabs = 0
-        tab_counter = 0
-        for node in self.nodes:
-            tab_counter += 1
-            node_input = node.setting_input
-            operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
-
-            if operation == "Formula":
-                operation = "With Columns"
-
-            tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
-
-            if show_descriptions & show_schema:
-                raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
-            if show_descriptions:
-                tree += ": " + str(node_input.description)
-            elif show_schema:
-                tree += " -> ["
-                if operation == "Manual Input":
-                    schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
-                    tree += schema
-                elif operation == "With Columns":
-                    tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
-                    tree += schema + tree_with_col_schema
-                elif operation == "Filter":
-                    index = node_input.filter_input.advanced_filter.find("]")
-                    filtered_column = str(node_input.filter_input.advanced_filter[1:index])
-                    schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
-                    tree += schema
-                elif operation == "Group By":
-                    for col in node_input.groupby_input.agg_cols:
-                        schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
-                    tree += schema
-                tree += "]"
-            else:
-                if operation == "Manual Input":
-                    tree += ": " + str(node_input.raw_data_format.data)
-                elif operation == "With Columns":
-                    tree += ": " + str(node_input.function)
-                elif operation == "Filter":
-                    tree += ": " + str(node_input.filter_input.advanced_filter)
-                elif operation == "Group By":
-                    tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
-                    tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
-
-            if node_input.node_id < max_node_id:
-                tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
-        print("\n"*2)
-
-        return print(tree)
-
-
-
     def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
         """Calculates and applies a layered layout to all nodes in the graph.

```
```diff
@@ -370,6 +326,86 @@ class FlowGraph:
         settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
         return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"

+    def print_tree(self):
+        """Print flow_graph as a visual tree structure, showing the DAG relationships with ASCII art."""
+        if not self._node_db:
+            self.flow_logger.info("Empty flow graph")
+            return
+
+        # Build node information
+        node_info = build_node_info(self.nodes)
+
+        # Calculate depths for all nodes
+        for node_id in node_info:
+            calculate_depth(node_id, node_info)
+
+        # Group nodes by depth
+        depth_groups, max_depth = group_nodes_by_depth(node_info)
+
+        # Sort nodes within each depth group
+        for depth in depth_groups:
+            depth_groups[depth].sort()
+
+        # Create the main flow visualization
+        lines = ["=" * 80, "Flow Graph Visualization", "=" * 80, ""]
+
+        # Track which nodes connect to what
+        merge_points = define_node_connections(node_info)
+
+        # Build the flow paths
+
+        # Find the maximum label length for each depth level
+        max_label_length = {}
+        for depth in range(max_depth + 1):
+            if depth in depth_groups:
+                max_len = max(len(node_info[nid].label) for nid in depth_groups[depth])
+                max_label_length[depth] = max_len
+
+        # Draw the paths
+        drawn_nodes = set()
+        merge_drawn = set()
+
+        # Group paths by their merge points
+        paths_by_merge = {}
+        standalone_paths = []
+
+        # Build flow paths
+        paths = build_flow_paths(node_info, self._flow_starts, merge_points)
+
+        # Define paths to merge and standalone paths
+        for path in paths:
+            if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
+                merge_id = path[-1]
+                if merge_id not in paths_by_merge:
+                    paths_by_merge[merge_id] = []
+                paths_by_merge[merge_id].append(path)
+            else:
+                standalone_paths.append(path)
+
+        # Draw merged paths
+        draw_merged_paths(node_info, merge_points, paths_by_merge, merge_drawn, drawn_nodes, lines)
+
+        # Draw standlone paths
+        draw_standalone_paths(drawn_nodes, standalone_paths, lines, node_info)
+
+        # Add undrawn nodes
+        add_un_drawn_nodes(drawn_nodes, node_info, lines)
+
+        try:
+            skip_nodes, ordered_nodes = compute_execution_plan(
+                nodes=self.nodes,
+                flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
+            if ordered_nodes:
+                for i, node in enumerate(ordered_nodes, 1):
+                    lines.append(f" {i:3d}. {node_info[node.node_id].label}")
+        except Exception as e:
+            lines.append(f" Could not determine execution order: {e}")
+
+        # Print everything
+        output = "\n".join(lines)
+
+        print(output)
+
     def get_nodes_overview(self):
         """Gets a list of dictionary representations for all nodes in the graph."""
         output = []
```
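The rewritten print_tree delegates the layout math to the new graph_tree module (+250 lines in the file list above). To make the approach concrete, here is a rough, self-contained sketch of the depth-layering step that helpers like calculate_depth and group_nodes_by_depth perform; the toy inputs dict and the function bodies are illustrative assumptions, not the shipped implementations:

```python
from collections import defaultdict

# Toy DAG: node id -> ids of its upstream inputs (illustrative only).
inputs = {1: [], 2: [], 3: [1, 2], 4: [3]}


def calculate_depth(node_id, inputs, memo):
    """Depth = longest path from any source node, memoized so each node is visited once."""
    if node_id not in memo:
        parents = inputs[node_id]
        memo[node_id] = 0 if not parents else 1 + max(
            calculate_depth(p, inputs, memo) for p in parents)
    return memo[node_id]


def group_nodes_by_depth(inputs):
    groups = defaultdict(list)
    memo = {}
    for node_id in inputs:
        groups[calculate_depth(node_id, inputs, memo)].append(node_id)
    return dict(groups), max(groups)


depth_groups, max_depth = group_nodes_by_depth(inputs)
print(depth_groups, max_depth)  # {0: [1, 2], 1: [3], 2: [4]} 2
```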
flowfile_core/flowfile/flow_graph.py (continued):

```diff
@@ -490,7 +526,8 @@ class FlowGraph:
                 node_id=node.node_id,
                 flow_id=self.flow_id,
             )
-            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref
+            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref,
+                                                                  n=min(sample_size, number_of_records))
             return flowfile_table

         def schema_callback():
```
```diff
@@ -775,26 +812,34 @@ class FlowGraph:
         """

         def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
+            node = self.get_node(node_id=fuzzy_settings.node_id)
+            if self.execution_location == "local":
+                return main.fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input,
+                                       other=right,
+                                       node_logger=self.flow_logger.get_node_logger(fuzzy_settings.node_id))
+
             f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
                                       flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
             logger.info("Started the fuzzy match action")
-            node._fetch_cached_df = f
+            node._fetch_cached_df = f  # Add to the node so it can be cancelled and fetch later if needed
             return FlowDataEngine(f.get_result())

-        self.add_node_step(node_id=fuzzy_settings.node_id,
-                           function=_func,
-                           input_columns=[],
-                           node_type='fuzzy_match',
-                           setting_input=fuzzy_settings)
-        node = self.get_node(node_id=fuzzy_settings.node_id)
-
         def schema_callback():
-
+            fm_input_copy = deepcopy(fuzzy_settings.join_input)  # Deepcopy create an unique object per func
+            node = self.get_node(node_id=fuzzy_settings.node_id)
+            return calculate_fuzzy_match_schema(fm_input_copy,
                                                 left_schema=node.node_inputs.main_inputs[0].schema,
                                                 right_schema=node.node_inputs.right_input.schema
                                                 )

-
+        self.add_node_step(node_id=fuzzy_settings.node_id,
+                           function=_func,
+                           input_columns=[],
+                           node_type='fuzzy_match',
+                           setting_input=fuzzy_settings,
+                           input_node_ids=fuzzy_settings.depending_on_ids,
+                           schema_callback=schema_callback)
+
         return self

     def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
```
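The "Deepcopy create an unique object per func" comment appears to guard against the schema callback sharing one mutable join-settings object with the node: a callback that modifies its copy cannot corrupt the settings the node will run with. A toy illustration of the hazard this avoids, under that assumption (hypothetical names throughout):

```python
from copy import deepcopy

# Hypothetical stand-in for a mutable join-settings object shared with the node.
settings = {"left_select": ["city", "city_key"]}


def make_schema_callback(shared_settings):
    def schema_callback():
        local = deepcopy(shared_settings)   # without this, the pop below would
        local["left_select"].pop()          # mutate the node's own settings
        return local
    return schema_callback


cb = make_schema_callback(settings)
cb()
assert settings["left_select"] == ["city", "city_key"]  # original untouched
```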
flowfile_core/flowfile/flow_graph.py (continued):

```diff
@@ -1550,6 +1595,8 @@ class FlowGraph:
         Args:
             execution_location: The execution location to set.
         """
+        if self.flow_settings.execution_location != execution_location:
+            self.reset()
         self.flow_settings.execution_location = execution_location

     def run_graph(self) -> RunInformation | None:
@@ -1576,17 +1623,11 @@ class FlowGraph:
         self.end_datetime = None
         self.latest_run_info = None
         self.flow_logger.info('Starting to run flowfile flow...')
-        skip_nodes =
-        skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
-        execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
-                                                               node not in skip_nodes],
-                                                    flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
+        skip_nodes, execution_order = compute_execution_plan(nodes=self.nodes, flow_starts=self._flow_starts+self.get_implicit_starter_nodes())

         skip_node_message(self.flow_logger, skip_nodes)
         execution_order_message(self.flow_logger, execution_order)
         performance_mode = self.flow_settings.execution_mode == 'Performance'
-        if self.flow_settings.execution_location == 'local':
-            OFFLOAD_TO_WORKER.value = False
         for node in execution_order:
             node_logger = self.flow_logger.get_node_logger(node.node_id)
             if self.flow_settings.is_canceled:
```
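run_graph now gets both the skip set and the ordering from a single compute_execution_plan call, which print_tree reuses too (execution_orderer.py gains 9 lines in the file list). A rough, self-contained sketch of what such a helper computes, using an assumed toy representation (dict of downstream ids); the real signature and skip rules in execution_orderer.py may differ:

```python
from collections import deque


def compute_execution_plan(nodes, flow_starts):
    """Toy stand-in: nodes maps id -> downstream ids; returns (skipped ids, run order)."""
    reachable = set()
    queue = deque(flow_starts)
    while queue:                        # BFS from the start nodes
        nid = queue.popleft()
        if nid not in reachable:
            reachable.add(nid)
            queue.extend(nodes[nid])

    skip_nodes = [nid for nid in nodes if nid not in reachable]

    # Kahn's algorithm over the reachable subgraph yields a valid run order.
    indegree = {nid: 0 for nid in reachable}
    for nid in reachable:
        for child in nodes[nid]:
            indegree[child] += 1
    ready = deque(sorted(n for n, d in indegree.items() if d == 0))
    order = []
    while ready:
        nid = ready.popleft()
        order.append(nid)
        for child in nodes[nid]:
            indegree[child] -= 1
            if indegree[child] == 0:
                ready.append(child)
    return skip_nodes, order


nodes = {1: [3], 2: [3], 3: [4], 4: [], 5: [4]}
print(compute_execution_plan(nodes, flow_starts=[1, 2]))  # ([5], [1, 2, 3, 4])
```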
flowfile_core/flowfile/flow_graph.py (continued):

```diff
@@ -1922,4 +1963,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
     to_node.delete_input_node(
         node_connection.output_connection.node_id,
         connection_type=node_connection.input_connection.connection_class,
-    )
+    )
```
flowfile_core/flowfile/flow_node/flow_node.py:

```diff
@@ -5,7 +5,6 @@ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEng
 from flowfile_core.utils.arrow_reader import get_read_top_n
 from flowfile_core.schemas import input_schema, schemas
 from flowfile_core.configs.flow_logger import NodeLogger
-from flowfile_core.configs.settings import SINGLE_FILE_MODE

 from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
 from flowfile_core.flowfile.utils import get_hash
@@ -681,7 +680,7 @@ class FlowNode:
         logger.warning('Not implemented')

     def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
-                  execution_location: schemas.ExecutionLocationsLiteral = "
+                  execution_location: schemas.ExecutionLocationsLiteral = "worker") -> bool:
         """Determines if the node needs to be executed.

         The decision is based on its run state, caching settings, and execution mode.
@@ -694,7 +693,7 @@ class FlowNode:
         Returns:
             True if the node should be run, False otherwise.
         """
-        if execution_location == "local"
+        if execution_location == "local":
             return False

         flow_logger = logger if node_logger is None else node_logger
@@ -724,9 +723,19 @@ class FlowNode:
         Raises:
             Exception: Propagates exceptions from the execution.
         """
-
-
-
+        def example_data_generator():
+            example_data = None
+
+            def get_example_data():
+                nonlocal example_data
+                if example_data is None:
+                    example_data = resulting_data.get_sample(100).to_arrow()
+                return example_data
+            return get_example_data
+        resulting_data = self.get_resulting_data()
+
+        if not performance_mode:
+            self.results.example_data_generator = example_data_generator()
         self.node_schema.result_schema = self.results.resulting_data.schema
         self.node_stats.has_completed_last_run = True

```
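The example_data_generator closure added here is a lazy memoizer: the 100-row Arrow sample is only materialized the first time it is requested, and nonlocal caches it for every later call. The same pattern in isolation, with a toy payload instead of an Arrow table:

```python
def make_lazy(compute):
    cached = None

    def get():
        nonlocal cached
        if cached is None:      # only the first call pays the cost
            cached = compute()
        return cached
    return get


calls = []
sample = make_lazy(lambda: calls.append("materialized") or [1, 2, 3])
sample()
sample()
assert sample() == [1, 2, 3]
assert calls == ["materialized"]  # the expensive step ran exactly once
```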
flowfile_core/flowfile/flow_node/flow_node.py (continued):

```diff
@@ -869,7 +878,7 @@ class FlowNode:
         if self.is_setup:
             node_logger.info(f'Starting to run {self.__name__}')
             if (self.needs_run(performance_mode, node_logger, run_location) or self.node_template.node_group == "output"
-                    and not (run_location == 'local'
+                    and not (run_location == 'local')):
                 self.prepare_before_run()
                 try:
                     if ((run_location == 'remote' or (self.node_default.transform_type == 'wide')
@@ -898,9 +907,15 @@ class FlowNode:
                                            node_logger=node_logger)
                     else:
                         self.results.errors = str(e)
-
-
-
+                        if "Connection refused" in str(e) and "/submit_query/" in str(e):
+                            node_logger.warning("There was an issue connecting to the remote worker, "
+                                                "ensure the worker process is running, "
+                                                "or change the settings to, so it executes locally")
+                            node_logger.error("Could not execute in the remote worker. (Re)start the worker service, or change settings to local settings.")
+                        else:
+                            node_logger.error(f'Error with running the node: {e}')
+            elif ((run_location == 'local') and
+                  (not self.node_stats.has_run_with_current_setup or self.node_template.node_group == "output")):
                 try:
                     node_logger.info('Executing fully locally')
                     self.execute_full_local(performance_mode)
@@ -949,16 +964,16 @@ class FlowNode:
         logger.info(f'{self.node_id}: Node needs reset')
         self.node_stats.has_run_with_current_setup = False
         self.results.reset()
-        if self.is_correct:
-            self._schema_callback = None  # Ensure the schema callback is reset
-            if self.schema_callback:
-                logger.info(f'{self.node_id}: Resetting the schema callback')
-                self.schema_callback.start()
         self.node_schema.result_schema = None
         self.node_schema.predicted_schema = None
         self._hash = None
         self.node_information.is_setup = None
         self.results.errors = None
+        if self.is_correct:
+            self._schema_callback = None  # Ensure the schema callback is reset
+            if self.schema_callback:
+                logger.info(f'{self.node_id}: Resetting the schema callback')
+                self.schema_callback.start()
         self.evaluate_nodes()
         _ = self.hash  # Recalculate the hash after reset
```
flowfile_core/flowfile/flow_node/models.py:

```diff
@@ -108,14 +108,12 @@ class NodeStepSettings:
         streamable: If True, the node can process data in a streaming fashion.
         setup_errors: If True, indicates a non-blocking error occurred during setup.
         breaking_setup_errors: If True, indicates an error occurred that prevents execution.
-        execute_location: The preferred location for execution ('auto', 'local', 'remote').
     """
     cache_results: bool = False
     renew_schema: bool = True
     streamable: bool = True
     setup_errors: bool = False
     breaking_setup_errors: bool = False
-    execute_location: schemas.ExecutionLocationsLiteral = 'auto'


 class NodeStepInputs:
```
flowfile_core/flowfile/flow_node/schema_callback.py:

```diff
@@ -1,71 +1,166 @@
-
 from typing import Callable, Any, Optional, Generic, TypeVar
 from concurrent.futures import ThreadPoolExecutor, Future
+import threading
 from flowfile_core.configs import logger

-
 T = TypeVar('T')


 class SingleExecutionFuture(Generic[T]):
-    """
-
-
+    """Thread-safe single execution of a function with result caching.
+
+    Ensures a function is executed at most once even when called from multiple threads.
+    Subsequent calls return the cached result.
+    """
+
     func: Callable[[], T]
     on_error: Optional[Callable[[Exception], Any]]
-
-
+    _lock: threading.RLock
+    _executor: Optional[ThreadPoolExecutor]
+    _future: Optional[Future[T]]
+    _result_value: Optional[T]
+    _exception: Optional[Exception]
+    _has_completed: bool
+    _has_started: bool

     def __init__(
-
-
-
+        self,
+        func: Callable[[], T],
+        on_error: Optional[Callable[[Exception], Any]] = None
     ) -> None:
         """Initialize with function and optional error handler."""
-        self.executor = ThreadPoolExecutor(max_workers=1)
-        self.future = None
         self.func = func
         self.on_error = on_error
-
-
+
+        # Thread safety
+        self._lock = threading.RLock()  # RLock allows re-entrant locking
+
+        # Execution state
+        self._executor = None
+        self._future = None
+        self._result_value = None
+        self._exception = None
+        self._has_completed = False
+        self._has_started = False
+
+    def _ensure_executor(self) -> ThreadPoolExecutor:
+        """Ensure executor exists, creating if necessary."""
+        if self._executor is None or self._executor._shutdown:
+            self._executor = ThreadPoolExecutor(max_workers=1)
+        return self._executor

     def start(self) -> None:
         """Start the function execution if not already started."""
-
-
-
+        with self._lock:
+            if self._has_started:
+                logger.info("Function already started or completed")
+                return
+
+            logger.info("Starting single executor function")
+            executor: ThreadPoolExecutor = self._ensure_executor()
+            self._future = executor.submit(self._func_wrapper)
+            self._has_started = True
+
+    def _func_wrapper(self) -> T:
+        """Wrapper to capture the result or exception."""
+        try:
+            result: T = self.func()
+            with self._lock:
+                self._result_value = result
+                self._has_completed = True
+            return result
+        except Exception as e:
+            with self._lock:
+                self._exception = e
+                self._has_completed = True
+            raise

     def cleanup(self) -> None:
-        """Clean up resources by
-        self.
-
+        """Clean up resources by shutting down the executor."""
+        with self._lock:
+            if self._executor and not self._executor._shutdown:
+                self._executor.shutdown(wait=False)

     def __call__(self) -> Optional[T]:
         """Execute function if not running and return its result."""
-
-        return
-
-
-
-
-
-
-
-
-        if self.
-
-
-
-
+        with self._lock:
+            # If already completed, return cached result or raise cached exception
+            if self._has_completed:
+                if self._exception:
+                    if self.on_error:
+                        return self.on_error(self._exception)
+                    else:
+                        raise self._exception
+                return self._result_value
+
+            # Start if not already started
+            if not self._has_started:
+                self.start()
+
+        # Wait for completion outside the lock to avoid blocking other threads
+        if self._future:
+            try:
+                result: T = self._future.result()
+                logger.info("Function completed successfully")
+                return result
+            except Exception as e:
+                logger.error(f"Function raised exception: {e}")
+                if self.on_error:
+                    return self.on_error(e)
+                else:
+                    raise
+
+        return None
+
+    def reset(self) -> None:
+        """Reset the execution state, allowing the function to be run again."""
+        with self._lock:
+            logger.info("Resetting single execution future")
+
+            # Cancel any pending execution
+            if self._future and not self._future.done():
+                self._future.cancel()

-
-
-
-
+            # Clean up old executor
+            if self._executor and not self._executor._shutdown:
+                self._executor.shutdown(wait=False)
+
+            # Reset state
+            self._executor = None
+            self._future = None
+            self._result_value = None
+            self._exception = None
+            self._has_completed = False
+            self._has_started = False
+
+    def is_running(self) -> bool:
+        """Check if the function is currently executing."""
+        with self._lock:
+            return bool(
+                self._has_started and
+                not self._has_completed and
+                self._future is not None and
+                not self._future.done()
+            )
+
+    def is_completed(self) -> bool:
+        """Check if the function has completed execution."""
+        with self._lock:
+            return self._has_completed
+
+    def get_result(self) -> Optional[T]:
+        """Get the cached result without triggering execution."""
+        with self._lock:
+            if self._exception:
+                if self.on_error:
+                    return self.on_error(self._exception)
+                else:
+                    raise self._exception
+            return self._result_value

     def __del__(self) -> None:
         """Ensure executor is shut down on deletion."""
-
+        try:
+            self.cleanup()
+        except Exception:
+            pass
```
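SingleExecutionFuture backs the schema callbacks that flow_node.py resets and restarts above: start() submits the function to a one-worker executor, the call operator blocks for (or returns) the cached result, and reset() re-arms it. A hedged usage sketch against the class as shown in this diff; the import path is inferred from this wheel's file layout, and the sleep is a stand-in for a real schema computation:

```python
import time

# Path inferred from flowfile_core/flowfile/flow_node/schema_callback.py above.
from flowfile_core.flowfile.flow_node.schema_callback import SingleExecutionFuture


def predict_schema():
    time.sleep(0.1)  # stand-in for an expensive schema prediction
    return ["id: Int64", "name: String"]


callback = SingleExecutionFuture(predict_schema, on_error=lambda exc: None)
callback.start()                # submit to the background executor (idempotent)
print(callback())               # blocks for the result: ['id: Int64', 'name: String']
print(callback())               # cached: the function does not run a second time
print(callback.is_completed())  # True
callback.reset()                # re-arm, e.g. after FlowNode.reset() restarts it
```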