Flowfile 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +2 -0
  4. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  5. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  13. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  14. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  15. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  19. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  21. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  24. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  27. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  29. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  31. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  34. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  35. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  37. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  38. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  39. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  40. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  41. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  42. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  43. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  44. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  45. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  46. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  47. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  48. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  49. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  50. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  51. flowfile/web/static/index.html +1 -1
  52. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  53. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
  54. flowfile_core/configs/settings.py +4 -2
  55. flowfile_core/configs/utils.py +5 -0
  56. flowfile_core/database/connection.py +1 -3
  57. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  61. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  64. flowfile_core/flowfile/flow_graph.py +129 -88
  65. flowfile_core/flowfile/flow_node/flow_node.py +30 -15
  66. flowfile_core/flowfile/flow_node/models.py +0 -2
  67. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  68. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  69. flowfile_core/flowfile/graph_tree/models.py +15 -0
  70. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  71. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  72. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  73. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  74. flowfile_core/flowfile/util/node_skipper.py +8 -0
  75. flowfile_core/schemas/schemas.py +46 -3
  76. flowfile_core/schemas/transform_schema.py +27 -38
  77. flowfile_core/utils/arrow_reader.py +8 -3
  78. flowfile_core/utils/validate_setup.py +0 -2
  79. flowfile_frame/__init__.py +1 -4
  80. flowfile_frame/expr.py +14 -0
  81. flowfile_frame/flow_frame.py +34 -5
  82. flowfile_frame/flow_frame.pyi +5 -6
  83. flowfile_worker/funcs.py +7 -3
  84. flowfile_worker/models.py +3 -1
  85. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  86. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  87. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  88. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  89. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  90. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  91. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  92. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  93. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
@@ -2,7 +2,6 @@ import datetime
2
2
  import pickle
3
3
  import polars as pl
4
4
  import fastexcel
5
- import re
6
5
  from fastapi.exceptions import HTTPException
7
6
  from time import time
8
7
  from functools import partial
@@ -11,17 +10,17 @@ from uuid import uuid1
11
10
  from copy import deepcopy
12
11
  from pyarrow.parquet import ParquetFile
13
12
  from flowfile_core.configs import logger
14
- from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
15
13
  from flowfile_core.configs.flow_logger import FlowLogger
16
14
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
17
15
  from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
18
- from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
19
- pre_calculate_pivot_schema)
16
+
20
17
  from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import CloudStorageReader
21
18
  from flowfile_core.utils.arrow_reader import get_read_top_n
22
19
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
23
- from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes, \
24
- get_calamine_xlsx_data_types
20
+ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import (get_open_xlsx_datatypes,
21
+ get_calamine_xlsx_data_types)
22
+
23
+ from flowfile_core.flowfile.schema_callbacks import (calculate_fuzzy_match_schema, pre_calculate_pivot_schema)
25
24
  from flowfile_core.flowfile.sources import external_sources
26
25
  from flowfile_core.schemas import input_schema, schemas, transform_schema
27
26
  from flowfile_core.schemas.output_model import NodeData, NodeResult, RunInformation
@@ -32,7 +31,11 @@ from flowfile_core.schemas.cloud_storage_schemas import (CloudStorageReadSetting
32
31
  from flowfile_core.flowfile.utils import snake_case_to_camel_case
33
32
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
34
33
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
35
- from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
34
+ from flowfile_core.flowfile.util.execution_orderer import compute_execution_plan
35
+ from flowfile_core.flowfile.graph_tree.graph_tree import (add_un_drawn_nodes, build_flow_paths,
36
+ build_node_info, calculate_depth,
37
+ define_node_connections, draw_merged_paths,
38
+ draw_standalone_paths, group_nodes_by_depth)
36
39
  from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
37
40
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalDatabaseFetcher,
38
41
  ExternalDatabaseWriter,
@@ -177,7 +180,7 @@ class FlowGraph:
177
180
  start_datetime: datetime = None
178
181
  end_datetime: datetime = None
179
182
  nodes_completed: int = 0
180
- flow_settings: schemas.FlowSettings = None
183
+ _flow_settings: schemas.FlowSettings = None
181
184
  flow_logger: FlowLogger
182
185
 
183
186
  def __init__(self,
@@ -201,7 +204,7 @@ class FlowGraph:
201
204
  if isinstance(flow_settings, schemas.FlowGraphConfig):
202
205
  flow_settings = schemas.FlowSettings.from_flow_settings_input(flow_settings)
203
206
 
204
- self.flow_settings = flow_settings
207
+ self._flow_settings = flow_settings
205
208
  self.uuid = str(uuid1())
206
209
  self.nodes_completed = 0
207
210
  self.start_datetime = None
@@ -226,6 +229,19 @@ class FlowGraph:
226
229
  elif input_flow is not None:
227
230
  self.add_datasource(input_file=input_flow)
228
231
 
232
+ @property
233
+ def flow_settings(self) -> schemas.FlowSettings:
234
+ return self._flow_settings
235
+
236
+ @flow_settings.setter
237
+ def flow_settings(self, flow_settings: schemas.FlowSettings):
238
+ if (
239
+ (self._flow_settings.execution_location != flow_settings.execution_location) or
240
+ (self._flow_settings.execution_mode != flow_settings.execution_mode)
241
+ ):
242
+ self.reset()
243
+ self._flow_settings = flow_settings
244
+
229
245
  def add_node_promise(self, node_promise: input_schema.NodePromise):
230
246
  """Adds a placeholder node to the graph that is not yet fully configured.
231
247
 
@@ -242,66 +258,6 @@ class FlowGraph:
242
258
  self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
243
259
  setting_input=node_promise)
244
260
 
245
- def print_tree(self, show_schema=False, show_descriptions=False):
246
- """
247
- Print flow_graph as a tree.
248
- """
249
- max_node_id = max(self._node_db.keys())
250
-
251
- tree = ""
252
- tabs = 0
253
- tab_counter = 0
254
- for node in self.nodes:
255
- tab_counter += 1
256
- node_input = node.setting_input
257
- operation = str(self._node_db[node_input.node_id]).split("(")[1][:-1].replace("_", " ").title()
258
-
259
- if operation == "Formula":
260
- operation = "With Columns"
261
-
262
- tree += str(operation) + " (id=" + str(node_input.node_id) + ")"
263
-
264
- if show_descriptions & show_schema:
265
- raise ValueError('show_descriptions and show_schema cannot be True simultaneously')
266
- if show_descriptions:
267
- tree += ": " + str(node_input.description)
268
- elif show_schema:
269
- tree += " -> ["
270
- if operation == "Manual Input":
271
- schema = ", ".join([str(i.name) + ": " + str(i.data_type) for i in node_input.raw_data_format.columns])
272
- tree += schema
273
- elif operation == "With Columns":
274
- tree_with_col_schema = ", " + node_input.function.field.name + ": " + node_input.function.field.data_type
275
- tree += schema + tree_with_col_schema
276
- elif operation == "Filter":
277
- index = node_input.filter_input.advanced_filter.find("]")
278
- filtered_column = str(node_input.filter_input.advanced_filter[1:index])
279
- schema = re.sub('({str(filtered_column)}: [A-Za-z0-9]+\,\s)', "", schema)
280
- tree += schema
281
- elif operation == "Group By":
282
- for col in node_input.groupby_input.agg_cols:
283
- schema = re.sub(str(col.old_name) + ': [a-z0-9]+\, ', "", schema)
284
- tree += schema
285
- tree += "]"
286
- else:
287
- if operation == "Manual Input":
288
- tree += ": " + str(node_input.raw_data_format.data)
289
- elif operation == "With Columns":
290
- tree += ": " + str(node_input.function)
291
- elif operation == "Filter":
292
- tree += ": " + str(node_input.filter_input.advanced_filter)
293
- elif operation == "Group By":
294
- tree += ": groupby=[" + ", ".join([col.old_name for col in node_input.groupby_input.agg_cols if col.agg == "groupby"]) + "], "
295
- tree += "agg=[" + ", ".join([str(col.agg) + "(" + str(col.old_name) + ")" for col in node_input.groupby_input.agg_cols if col.agg != "groupby"]) + "]"
296
-
297
- if node_input.node_id < max_node_id:
298
- tree += "\n" + "# " + " "*3*(tabs-1) + "|___ "
299
- print("\n"*2)
300
-
301
- return print(tree)
302
-
303
-
304
-
305
261
  def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
306
262
  """Calculates and applies a layered layout to all nodes in the graph.
307
263
 
@@ -370,6 +326,86 @@ class FlowGraph:
370
326
  settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
371
327
  return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
372
328
 
329
+ def print_tree(self):
330
+ """Print flow_graph as a visual tree structure, showing the DAG relationships with ASCII art."""
331
+ if not self._node_db:
332
+ self.flow_logger.info("Empty flow graph")
333
+ return
334
+
335
+ # Build node information
336
+ node_info = build_node_info(self.nodes)
337
+
338
+ # Calculate depths for all nodes
339
+ for node_id in node_info:
340
+ calculate_depth(node_id, node_info)
341
+
342
+ # Group nodes by depth
343
+ depth_groups, max_depth = group_nodes_by_depth(node_info)
344
+
345
+ # Sort nodes within each depth group
346
+ for depth in depth_groups:
347
+ depth_groups[depth].sort()
348
+
349
+ # Create the main flow visualization
350
+ lines = ["=" * 80, "Flow Graph Visualization", "=" * 80, ""]
351
+
352
+ # Track which nodes connect to what
353
+ merge_points = define_node_connections(node_info)
354
+
355
+ # Build the flow paths
356
+
357
+ # Find the maximum label length for each depth level
358
+ max_label_length = {}
359
+ for depth in range(max_depth + 1):
360
+ if depth in depth_groups:
361
+ max_len = max(len(node_info[nid].label) for nid in depth_groups[depth])
362
+ max_label_length[depth] = max_len
363
+
364
+ # Draw the paths
365
+ drawn_nodes = set()
366
+ merge_drawn = set()
367
+
368
+ # Group paths by their merge points
369
+ paths_by_merge = {}
370
+ standalone_paths = []
371
+
372
+ # Build flow paths
373
+ paths = build_flow_paths(node_info, self._flow_starts, merge_points)
374
+
375
+ # Define paths to merge and standalone paths
376
+ for path in paths:
377
+ if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
378
+ merge_id = path[-1]
379
+ if merge_id not in paths_by_merge:
380
+ paths_by_merge[merge_id] = []
381
+ paths_by_merge[merge_id].append(path)
382
+ else:
383
+ standalone_paths.append(path)
384
+
385
+ # Draw merged paths
386
+ draw_merged_paths(node_info, merge_points, paths_by_merge, merge_drawn, drawn_nodes, lines)
387
+
388
+ # Draw standalone paths
389
+ draw_standalone_paths(drawn_nodes, standalone_paths, lines, node_info)
390
+
391
+ # Add undrawn nodes
392
+ add_un_drawn_nodes(drawn_nodes, node_info, lines)
393
+
394
+ try:
395
+ skip_nodes, ordered_nodes = compute_execution_plan(
396
+ nodes=self.nodes,
397
+ flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
398
+ if ordered_nodes:
399
+ for i, node in enumerate(ordered_nodes, 1):
400
+ lines.append(f" {i:3d}. {node_info[node.node_id].label}")
401
+ except Exception as e:
402
+ lines.append(f" Could not determine execution order: {e}")
403
+
404
+ # Print everything
405
+ output = "\n".join(lines)
406
+
407
+ print(output)
408
+
373
409
  def get_nodes_overview(self):
374
410
  """Gets a list of dictionary representations for all nodes in the graph."""
375
411
  output = []
@@ -490,7 +526,8 @@ class FlowGraph:
490
526
  node_id=node.node_id,
491
527
  flow_id=self.flow_id,
492
528
  )
493
- node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
529
+ node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref,
530
+ n=min(sample_size, number_of_records))
494
531
  return flowfile_table
495
532
 
496
533
  def schema_callback():
@@ -775,26 +812,34 @@ class FlowGraph:
775
812
  """
776
813
 
777
814
  def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
815
+ node = self.get_node(node_id=fuzzy_settings.node_id)
816
+ if self.execution_location == "local":
817
+ return main.fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input,
818
+ other=right,
819
+ node_logger=self.flow_logger.get_node_logger(fuzzy_settings.node_id))
820
+
778
821
  f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
779
822
  flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
780
823
  logger.info("Started the fuzzy match action")
781
- node._fetch_cached_df = f
824
+ node._fetch_cached_df = f # Add to the node so it can be cancelled and fetched later if needed
782
825
  return FlowDataEngine(f.get_result())
783
826
 
784
- self.add_node_step(node_id=fuzzy_settings.node_id,
785
- function=_func,
786
- input_columns=[],
787
- node_type='fuzzy_match',
788
- setting_input=fuzzy_settings)
789
- node = self.get_node(node_id=fuzzy_settings.node_id)
790
-
791
827
  def schema_callback():
792
- return calculate_fuzzy_match_schema(fuzzy_settings.join_input,
828
+ fm_input_copy = deepcopy(fuzzy_settings.join_input) # Deepcopy creates a unique object per func
829
+ node = self.get_node(node_id=fuzzy_settings.node_id)
830
+ return calculate_fuzzy_match_schema(fm_input_copy,
793
831
  left_schema=node.node_inputs.main_inputs[0].schema,
794
832
  right_schema=node.node_inputs.right_input.schema
795
833
  )
796
834
 
797
- node.schema_callback = schema_callback
835
+ self.add_node_step(node_id=fuzzy_settings.node_id,
836
+ function=_func,
837
+ input_columns=[],
838
+ node_type='fuzzy_match',
839
+ setting_input=fuzzy_settings,
840
+ input_node_ids=fuzzy_settings.depending_on_ids,
841
+ schema_callback=schema_callback)
842
+
798
843
  return self
799
844
 
800
845
  def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
@@ -1550,6 +1595,8 @@ class FlowGraph:
1550
1595
  Args:
1551
1596
  execution_location: The execution location to set.
1552
1597
  """
1598
+ if self.flow_settings.execution_location != execution_location:
1599
+ self.reset()
1553
1600
  self.flow_settings.execution_location = execution_location
1554
1601
 
1555
1602
  def run_graph(self) -> RunInformation | None:
@@ -1576,17 +1623,11 @@ class FlowGraph:
1576
1623
  self.end_datetime = None
1577
1624
  self.latest_run_info = None
1578
1625
  self.flow_logger.info('Starting to run flowfile flow...')
1579
- skip_nodes = [node for node in self.nodes if not node.is_correct]
1580
- skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
1581
- execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
1582
- node not in skip_nodes],
1583
- flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
1626
+ skip_nodes, execution_order = compute_execution_plan(nodes=self.nodes, flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
1584
1627
 
1585
1628
  skip_node_message(self.flow_logger, skip_nodes)
1586
1629
  execution_order_message(self.flow_logger, execution_order)
1587
1630
  performance_mode = self.flow_settings.execution_mode == 'Performance'
1588
- if self.flow_settings.execution_location == 'local':
1589
- OFFLOAD_TO_WORKER.value = False
1590
1631
  for node in execution_order:
1591
1632
  node_logger = self.flow_logger.get_node_logger(node.node_id)
1592
1633
  if self.flow_settings.is_canceled:
@@ -1922,4 +1963,4 @@ def delete_connection(graph, node_connection: input_schema.NodeConnection):
1922
1963
  to_node.delete_input_node(
1923
1964
  node_connection.output_connection.node_id,
1924
1965
  connection_type=node_connection.input_connection.connection_class,
1925
- )
1966
+ )
@@ -5,7 +5,6 @@ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEng
5
5
  from flowfile_core.utils.arrow_reader import get_read_top_n
6
6
  from flowfile_core.schemas import input_schema, schemas
7
7
  from flowfile_core.configs.flow_logger import NodeLogger
8
- from flowfile_core.configs.settings import SINGLE_FILE_MODE
9
8
 
10
9
  from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
11
10
  from flowfile_core.flowfile.utils import get_hash
@@ -681,7 +680,7 @@ class FlowNode:
681
680
  logger.warning('Not implemented')
682
681
 
683
682
  def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
684
- execution_location: schemas.ExecutionLocationsLiteral = "auto") -> bool:
683
+ execution_location: schemas.ExecutionLocationsLiteral = "worker") -> bool:
685
684
  """Determines if the node needs to be executed.
686
685
 
687
686
  The decision is based on its run state, caching settings, and execution mode.
@@ -694,7 +693,7 @@ class FlowNode:
694
693
  Returns:
695
694
  True if the node should be run, False otherwise.
696
695
  """
697
- if execution_location == "local" or SINGLE_FILE_MODE:
696
+ if execution_location == "local":
698
697
  return False
699
698
 
700
699
  flow_logger = logger if node_logger is None else node_logger
@@ -724,9 +723,19 @@ class FlowNode:
724
723
  Raises:
725
724
  Exception: Propagates exceptions from the execution.
726
725
  """
727
- if self.results.resulting_data is None and not performance_mode:
728
- self.results.resulting_data = self.get_resulting_data()
729
- self.results.example_data_generator = lambda: self.get_resulting_data().get_sample(100).to_arrow()
726
+ def example_data_generator():
727
+ example_data = None
728
+
729
+ def get_example_data():
730
+ nonlocal example_data
731
+ if example_data is None:
732
+ example_data = resulting_data.get_sample(100).to_arrow()
733
+ return example_data
734
+ return get_example_data
735
+ resulting_data = self.get_resulting_data()
736
+
737
+ if not performance_mode:
738
+ self.results.example_data_generator = example_data_generator()
730
739
  self.node_schema.result_schema = self.results.resulting_data.schema
731
740
  self.node_stats.has_completed_last_run = True
732
741
 
@@ -869,7 +878,7 @@ class FlowNode:
869
878
  if self.is_setup:
870
879
  node_logger.info(f'Starting to run {self.__name__}')
871
880
  if (self.needs_run(performance_mode, node_logger, run_location) or self.node_template.node_group == "output"
872
- and not (run_location == 'local' or SINGLE_FILE_MODE)):
881
+ and not (run_location == 'local')):
873
882
  self.prepare_before_run()
874
883
  try:
875
884
  if ((run_location == 'remote' or (self.node_default.transform_type == 'wide')
@@ -898,9 +907,15 @@ class FlowNode:
898
907
  node_logger=node_logger)
899
908
  else:
900
909
  self.results.errors = str(e)
901
- node_logger.error(f'Error with running the node: {e}')
902
- elif ((run_location == 'local' or SINGLE_FILE_MODE) and (not self.node_stats.has_run_with_current_setup
903
- or self.node_template.node_group == "output")):
910
+ if "Connection refused" in str(e) and "/submit_query/" in str(e):
911
+ node_logger.warning("There was an issue connecting to the remote worker, "
912
+ "ensure the worker process is running, "
913
+ "or change the settings to, so it executes locally")
914
+ node_logger.error("Could not execute in the remote worker. (Re)start the worker service, or change settings to local settings.")
915
+ else:
916
+ node_logger.error(f'Error with running the node: {e}')
917
+ elif ((run_location == 'local') and
918
+ (not self.node_stats.has_run_with_current_setup or self.node_template.node_group == "output")):
904
919
  try:
905
920
  node_logger.info('Executing fully locally')
906
921
  self.execute_full_local(performance_mode)
@@ -949,16 +964,16 @@ class FlowNode:
949
964
  logger.info(f'{self.node_id}: Node needs reset')
950
965
  self.node_stats.has_run_with_current_setup = False
951
966
  self.results.reset()
952
- if self.is_correct:
953
- self._schema_callback = None # Ensure the schema callback is reset
954
- if self.schema_callback:
955
- logger.info(f'{self.node_id}: Resetting the schema callback')
956
- self.schema_callback.start()
957
967
  self.node_schema.result_schema = None
958
968
  self.node_schema.predicted_schema = None
959
969
  self._hash = None
960
970
  self.node_information.is_setup = None
961
971
  self.results.errors = None
972
+ if self.is_correct:
973
+ self._schema_callback = None # Ensure the schema callback is reset
974
+ if self.schema_callback:
975
+ logger.info(f'{self.node_id}: Resetting the schema callback')
976
+ self.schema_callback.start()
962
977
  self.evaluate_nodes()
963
978
  _ = self.hash # Recalculate the hash after reset
964
979
 
@@ -108,14 +108,12 @@ class NodeStepSettings:
108
108
  streamable: If True, the node can process data in a streaming fashion.
109
109
  setup_errors: If True, indicates a non-blocking error occurred during setup.
110
110
  breaking_setup_errors: If True, indicates an error occurred that prevents execution.
111
- execute_location: The preferred location for execution ('auto', 'local', 'remote').
112
111
  """
113
112
  cache_results: bool = False
114
113
  renew_schema: bool = True
115
114
  streamable: bool = True
116
115
  setup_errors: bool = False
117
116
  breaking_setup_errors: bool = False
118
- execute_location: schemas.ExecutionLocationsLiteral = 'auto'
119
117
 
120
118
 
121
119
  class NodeStepInputs:
@@ -1,71 +1,166 @@
1
-
2
1
  from typing import Callable, Any, Optional, Generic, TypeVar
3
2
  from concurrent.futures import ThreadPoolExecutor, Future
3
+ import threading
4
4
  from flowfile_core.configs import logger
5
5
 
6
-
7
6
  T = TypeVar('T')
8
7
 
9
8
 
10
9
  class SingleExecutionFuture(Generic[T]):
11
- """Single execution of a function in a separate thread with caching of the result."""
12
- executor: ThreadPoolExecutor
13
- future: Optional[Future[T]]
10
+ """Thread-safe single execution of a function with result caching.
11
+
12
+ Ensures a function is executed at most once even when called from multiple threads.
13
+ Subsequent calls return the cached result.
14
+ """
15
+
14
16
  func: Callable[[], T]
15
17
  on_error: Optional[Callable[[Exception], Any]]
16
- result_value: Optional[T]
17
- has_run_at_least_once: bool = False # Indicates if the function has been run at least once
18
+ _lock: threading.RLock
19
+ _executor: Optional[ThreadPoolExecutor]
20
+ _future: Optional[Future[T]]
21
+ _result_value: Optional[T]
22
+ _exception: Optional[Exception]
23
+ _has_completed: bool
24
+ _has_started: bool
18
25
 
19
26
  def __init__(
20
- self,
21
- func: Callable[[], T],
22
- on_error: Optional[Callable[[Exception], Any]] = None
27
+ self,
28
+ func: Callable[[], T],
29
+ on_error: Optional[Callable[[Exception], Any]] = None
23
30
  ) -> None:
24
31
  """Initialize with function and optional error handler."""
25
- self.executor = ThreadPoolExecutor(max_workers=1)
26
- self.future = None
27
32
  self.func = func
28
33
  self.on_error = on_error
29
- self.result_value = None
30
- self.has_run_at_least_once = False
34
+
35
+ # Thread safety
36
+ self._lock = threading.RLock() # RLock allows re-entrant locking
37
+
38
+ # Execution state
39
+ self._executor = None
40
+ self._future = None
41
+ self._result_value = None
42
+ self._exception = None
43
+ self._has_completed = False
44
+ self._has_started = False
45
+
46
+ def _ensure_executor(self) -> ThreadPoolExecutor:
47
+ """Ensure executor exists, creating if necessary."""
48
+ if self._executor is None or self._executor._shutdown:
49
+ self._executor = ThreadPoolExecutor(max_workers=1)
50
+ return self._executor
31
51
 
32
52
  def start(self) -> None:
33
53
  """Start the function execution if not already started."""
34
- if not self.future:
35
- logger.info("single executor function started")
36
- self.future = self.executor.submit(self.func)
54
+ with self._lock:
55
+ if self._has_started:
56
+ logger.info("Function already started or completed")
57
+ return
58
+
59
+ logger.info("Starting single executor function")
60
+ executor: ThreadPoolExecutor = self._ensure_executor()
61
+ self._future = executor.submit(self._func_wrapper)
62
+ self._has_started = True
63
+
64
+ def _func_wrapper(self) -> T:
65
+ """Wrapper to capture the result or exception."""
66
+ try:
67
+ result: T = self.func()
68
+ with self._lock:
69
+ self._result_value = result
70
+ self._has_completed = True
71
+ return result
72
+ except Exception as e:
73
+ with self._lock:
74
+ self._exception = e
75
+ self._has_completed = True
76
+ raise
37
77
 
38
78
  def cleanup(self) -> None:
39
- """Clean up resources by clearing the future and shutting down the executor."""
40
- self.has_run_at_least_once = True
41
- self.executor.shutdown(wait=False)
79
+ """Clean up resources by shutting down the executor."""
80
+ with self._lock:
81
+ if self._executor and not self._executor._shutdown:
82
+ self._executor.shutdown(wait=False)
42
83
 
43
84
  def __call__(self) -> Optional[T]:
44
85
  """Execute function if not running and return its result."""
45
- if self.result_value:
46
- return self.result_value
47
- if not self.future:
48
- self.start()
49
- else:
50
- logger.info("Function already running or did complete")
51
- try:
52
- self.result_value = self.future.result()
53
- logger.info("Done with the function")
54
- return self.result_value
55
- except Exception as e:
56
- if self.on_error:
57
- return self.on_error(e)
58
- else:
59
- raise e
60
- finally:
61
- self.cleanup()
86
+ with self._lock:
87
+ # If already completed, return cached result or raise cached exception
88
+ if self._has_completed:
89
+ if self._exception:
90
+ if self.on_error:
91
+ return self.on_error(self._exception)
92
+ else:
93
+ raise self._exception
94
+ return self._result_value
95
+
96
+ # Start if not already started
97
+ if not self._has_started:
98
+ self.start()
99
+
100
+ # Wait for completion outside the lock to avoid blocking other threads
101
+ if self._future:
102
+ try:
103
+ result: T = self._future.result()
104
+ logger.info("Function completed successfully")
105
+ return result
106
+ except Exception as e:
107
+ logger.error(f"Function raised exception: {e}")
108
+ if self.on_error:
109
+ return self.on_error(e)
110
+ else:
111
+ raise
112
+
113
+ return None
114
+
115
+ def reset(self) -> None:
116
+ """Reset the execution state, allowing the function to be run again."""
117
+ with self._lock:
118
+ logger.info("Resetting single execution future")
119
+
120
+ # Cancel any pending execution
121
+ if self._future and not self._future.done():
122
+ self._future.cancel()
62
123
 
63
- def reset(self):
64
- """Reset the future and result value."""
65
- logger.info("Resetting the future and result value")
66
- self.result_value = None
67
- self.future = None
124
+ # Clean up old executor
125
+ if self._executor and not self._executor._shutdown:
126
+ self._executor.shutdown(wait=False)
127
+
128
+ # Reset state
129
+ self._executor = None
130
+ self._future = None
131
+ self._result_value = None
132
+ self._exception = None
133
+ self._has_completed = False
134
+ self._has_started = False
135
+
136
+ def is_running(self) -> bool:
137
+ """Check if the function is currently executing."""
138
+ with self._lock:
139
+ return bool(
140
+ self._has_started and
141
+ not self._has_completed and
142
+ self._future is not None and
143
+ not self._future.done()
144
+ )
145
+
146
+ def is_completed(self) -> bool:
147
+ """Check if the function has completed execution."""
148
+ with self._lock:
149
+ return self._has_completed
150
+
151
+ def get_result(self) -> Optional[T]:
152
+ """Get the cached result without triggering execution."""
153
+ with self._lock:
154
+ if self._exception:
155
+ if self.on_error:
156
+ return self.on_error(self._exception)
157
+ else:
158
+ raise self._exception
159
+ return self._result_value
68
160
 
69
161
  def __del__(self) -> None:
70
162
  """Ensure executor is shut down on deletion."""
71
- self.cleanup()
163
+ try:
164
+ self.cleanup()
165
+ except Exception:
166
+ pass