Flowfile 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +2 -0
  4. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  5. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  13. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  14. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  15. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  19. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  21. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  24. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  27. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  29. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  31. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  34. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  35. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  37. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  38. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  39. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  40. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  41. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  42. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  43. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  44. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  45. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  46. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  47. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  48. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  49. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  50. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  51. flowfile/web/static/index.html +1 -1
  52. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  53. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
  54. flowfile_core/configs/settings.py +4 -2
  55. flowfile_core/configs/utils.py +5 -0
  56. flowfile_core/database/connection.py +1 -3
  57. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  61. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  64. flowfile_core/flowfile/flow_graph.py +129 -88
  65. flowfile_core/flowfile/flow_node/flow_node.py +30 -15
  66. flowfile_core/flowfile/flow_node/models.py +0 -2
  67. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  68. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  69. flowfile_core/flowfile/graph_tree/models.py +15 -0
  70. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  71. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  72. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  73. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  74. flowfile_core/flowfile/util/node_skipper.py +8 -0
  75. flowfile_core/schemas/schemas.py +46 -3
  76. flowfile_core/schemas/transform_schema.py +27 -38
  77. flowfile_core/utils/arrow_reader.py +8 -3
  78. flowfile_core/utils/validate_setup.py +0 -2
  79. flowfile_frame/__init__.py +1 -4
  80. flowfile_frame/expr.py +14 -0
  81. flowfile_frame/flow_frame.py +34 -5
  82. flowfile_frame/flow_frame.pyi +5 -6
  83. flowfile_worker/funcs.py +7 -3
  84. flowfile_worker/models.py +3 -1
  85. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  86. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  87. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  88. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  89. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  90. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  91. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  92. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  93. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0

flowfile_core/flowfile/graph_tree/graph_tree.py (new file)
@@ -0,0 +1,250 @@
+ from pydantic import BaseModel
+
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+ from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo
+
+
+ def calculate_depth(node_id: int, node_info: dict[int, BranchInfo], visited: set = None) -> int:
+     """Calculates the depth of each node."""
+
+     if visited is None:
+         visited = set()
+     if node_id in visited:
+         return node_info[node_id].depth
+     visited.add(node_id)
+
+     max_input_depth = -1
+     inputs = node_info[node_id].inputs
+
+     for main_id in inputs.main:
+         max_input_depth = max(max_input_depth, calculate_depth(main_id, node_info, visited))
+     if inputs.left:
+         max_input_depth = max(max_input_depth, calculate_depth(inputs.left, node_info, visited))
+     if inputs.right:
+         max_input_depth = max(max_input_depth, calculate_depth(inputs.right, node_info, visited))
+
+     node_info[node_id].depth = max_input_depth + 1
+     return node_info[node_id].depth
+
+
+ # Trace paths from each root
+ def trace_path(node_id: int, node_info: dict[int, BranchInfo], merge_points: dict[int, list[int]],
+                current_path: list[int] | None = None):
+     """Define the trace of each node path"""
+     if current_path is None:
+         current_path = []
+
+     current_path = current_path + [node_id]
+     outputs = node_info[node_id].outputs
+
+     if not outputs:
+         # End of path
+         return [current_path]
+
+     # If this node has multiple outputs or connects to a merge point, branch
+     all_paths = []
+     for output_id in outputs:
+         if output_id in merge_points and len(merge_points[output_id]) > 1:
+             # This is a merge point, end this path here
+             all_paths.append(current_path + [output_id])
+         else:
+             # Continue the path
+             all_paths.extend(trace_path(output_id, node_info, merge_points, current_path))
+     return all_paths
+
+
+ def build_node_info(nodes: list[FlowNode]) -> dict[int, BranchInfo]:
+     """Builds node information used to construct the graph tree."""
+
+     node_info = {}
+     for node in nodes:
+         node_id = node.node_id
+
+         # Get node label
+         operation = node.node_type.replace("_", " ").title() if node.node_type else "Unknown"
+         label = f"{operation} (id={node_id})"
+         if hasattr(node, 'setting_input') and hasattr(node.setting_input, 'description'):
+             if node.setting_input.description:
+                 desc = node.setting_input.description
+                 if len(desc) > 20: # Truncate long descriptions
+                     desc = desc[:17] + "..."
+                 label = f"{operation} ({node_id}): {desc}"
+
+         # Get inputs and outputs
+         inputs = InputInfo(
+             main=[n.node_id for n in (node.node_inputs.main_inputs or [])],
+             left=node.node_inputs.left_input.node_id if node.node_inputs.left_input else None,
+             right=node.node_inputs.right_input.node_id if node.node_inputs.right_input else None
+         )
+         outputs = [n.node_id for n in node.leads_to_nodes]
+
+         node_info[node_id] = BranchInfo(
+             label=label,
+             short_label=f"{operation} ({node_id})",
+             inputs=inputs,
+             outputs=outputs,
+             depth=0
+         )
+
+     return node_info
+
+
+ def group_nodes_by_depth(node_info: dict[int, BranchInfo]) -> tuple[dict[int, list[int]], int]:
+     """Groups each node by depth"""
+     depth_groups = {}
+     max_depth = 0
+     for node_id, info in node_info.items():
+         depth = info.depth
+         max_depth = max(max_depth, depth)
+         if depth not in depth_groups:
+             depth_groups[depth] = []
+         depth_groups[depth].append(node_id)
+
+     return depth_groups, max_depth
+
+
+ def define_node_connections(node_info: dict[int, BranchInfo]) -> dict[int, list[int]]:
+     """Defines node connections to merge"""
+     merge_points = {} # target_id -> list of source_ids
+     for node_id, info in node_info.items():
+         for output_id in info.outputs:
+             if output_id not in merge_points:
+                 merge_points[output_id] = []
+             merge_points[output_id].append(node_id)
+
+     return merge_points
+
+
+ def build_flow_paths(node_info: dict[int, BranchInfo], flow_starts: list[FlowNode],
+                      merge_points: dict[int, list[int]]):
+     """Build the flow paths to be drawn"""
+
+
+     # Find all root nodes (no inputs)
+     root_nodes = [nid for nid, info in node_info.items()
+                   if not info.inputs.main and not info.inputs.left and not info.inputs.right]
+
+     if not root_nodes and flow_starts:
+         root_nodes = [n.node_id for n in flow_starts]
+     paths = [] # List of paths through the graph
+
+     # Get all paths
+     for root_id in root_nodes:
+         paths.extend(trace_path(root_id, node_info, merge_points))
+
+     return paths
+
+
+ def group_paths(paths: list, merge_points: dict):
+     """Groups each node path."""
+     paths_by_merge = {}
+     standalone_paths = []
+
+     for path in paths:
+         if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
+             merge_id = path[-1]
+             if merge_id not in paths_by_merge:
+                 paths_by_merge[merge_id] = []
+             paths_by_merge[merge_id].append(path)
+         else:
+             standalone_paths.append(path)
+     return paths_by_merge, standalone_paths
+
+
+ def draw_merged_paths(node_info: dict[int, BranchInfo],
+                       merge_points: dict[int, list[int]],
+                       paths_by_merge: dict[int, list[list[int]]],
+                       merge_drawn: set,
+                       drawn_nodes: set,
+                       lines: list[str]):
+     """Draws paths for each node that merges."""
+     for merge_id, merge_paths in paths_by_merge.items():
+         if merge_id in merge_drawn:
+             continue
+         merge_info = node_info[merge_id]
+         sources = merge_points[merge_id]
+
+         # Draw each source path leading to the merge
+         for i, source_id in enumerate(sources):
+             # Find the path containing this source
+             source_path = None
+             for path in merge_paths:
+                 if source_id in path:
+                     source_path = path[:path.index(source_id) + 1]
+                     break
+
+             if source_path:
+                 # Build the line for this path
+                 line_parts = []
+                 for j, nid in enumerate(source_path):
+                     if j == 0:
+                         line_parts.append(node_info[nid].label)
+                     else:
+                         line_parts.append(f" ──> {node_info[nid].short_label}")
+
+                 # Add the merge arrow
+                 if i == 0:
+                     # First source
+                     line = "".join(line_parts) + " ─────┐"
+                     lines.append(line)
+                 elif i == len(sources) - 1:
+                     # Last source
+                     line = "".join(line_parts) + " ─────┴──> " + merge_info.label
+                     lines.append(line)
+
+                     # Continue with the rest of the path after merge
+                     remaining = node_info[merge_id].outputs
+                     while remaining:
+                         next_id = remaining[0]
+                         lines[-1] += f" ──> {node_info[next_id].label}"
+                         remaining = node_info[next_id].outputs
+                         drawn_nodes.add(next_id)
+                 else:
+                     # Middle sources
+                     line = "".join(line_parts) + " ─────┤"
+                     lines.append(line)
+
+                 for nid in source_path:
+                     drawn_nodes.add(nid)
+
+         drawn_nodes.add(merge_id)
+         merge_drawn.add(merge_id)
+         lines.append("") # Add spacing between merge groups
+     return paths_by_merge
+
+
+ def draw_standalone_paths(drawn_nodes: set[int], standalone_paths: list[list[int]], lines: list[str],
+                           node_info: dict[int, BranchInfo]):
+     """ Draws paths that do not merge."""
+     # Draw standalone paths
+     for path in standalone_paths:
+         if all(nid in drawn_nodes for nid in path):
+             continue
+
+         line_parts = []
+         for i, node_id in enumerate(path):
+             if node_id not in drawn_nodes:
+                 if i == 0:
+                     line_parts.append(node_info[node_id].label)
+                 else:
+                     line_parts.append(f" ──> {node_info[node_id].short_label}")
+                 drawn_nodes.add(node_id)
+
+         if line_parts:
+             lines.append("".join(line_parts))
+
+
+ def add_un_drawn_nodes(drawn_nodes: set[int], node_info: dict[int, BranchInfo], lines: list[str]):
+     """Adds isolated nodes if exists."""
+     # Add any remaining undrawn nodes
+
+     for node_id in node_info:
+         if node_id not in drawn_nodes:
+             lines.append(node_info[node_id].label + " (isolated)")
+
+     lines.append("")
+     lines.append("=" * 80)
+     lines.append("Execution Order")
+     lines.append("=" * 80)

flowfile_core/flowfile/graph_tree/models.py (new file)
@@ -0,0 +1,15 @@
+ from pydantic import BaseModel
+
+
+ class InputInfo(BaseModel):
+     main: list[int]
+     right: int | None = None
+     left: int | None = None
+
+
+ class BranchInfo(BaseModel):
+     label: str
+     short_label: str
+     inputs: InputInfo
+     outputs: list[int]
+     depth: int
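
Note: a quick way to see how the two new graph_tree modules fit together is to build the node_info mapping by hand and let calculate_depth walk it. A minimal sketch, assuming flowfile 0.3.9 is installed and the module paths match the file listing above; the node ids and labels are invented for illustration.

    from flowfile_core.flowfile.graph_tree.graph_tree import calculate_depth
    from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo

    # Hypothetical three-node flow: two readers feeding one join.
    node_info = {
        1: BranchInfo(label="Read (id=1)", short_label="Read (1)",
                      inputs=InputInfo(main=[]), outputs=[3], depth=0),
        2: BranchInfo(label="Read (id=2)", short_label="Read (2)",
                      inputs=InputInfo(main=[]), outputs=[3], depth=0),
        3: BranchInfo(label="Join (id=3)", short_label="Join (3)",
                      inputs=InputInfo(main=[], left=1, right=2), outputs=[], depth=0),
    }
    for node_id in node_info:
        calculate_depth(node_id, node_info)

    assert node_info[1].depth == 0 and node_info[2].depth == 0
    assert node_info[3].depth == 1  # one level below its deepest input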

flowfile_core/flowfile/manage/compatibility_enhancements.py
@@ -48,7 +48,7 @@ def ensure_compatibility(flow_storage_obj: schemas.FlowInformation, flow_path: s
          setattr(flow_storage_obj, 'flow_settings', flow_settings)
          flow_storage_obj = schemas.FlowInformation.model_validate(flow_storage_obj)
      elif not hasattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location'):
-         setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location', 'auto')
+         setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location', "remote")
      elif not hasattr(flow_storage_obj.flow_settings, 'is_running'):
          setattr(flow_storage_obj.flow_settings, 'is_running', False)
          setattr(flow_storage_obj.flow_settings, 'is_canceled', False)

flowfile_core/flowfile/schema_callbacks.py (renamed from flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py)
@@ -1,25 +1,72 @@
  
  from typing import List
- from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
- from flowfile_core.schemas import transform_schema
- from flowfile_core.schemas import input_schema
+
  from polars import datatypes
  import polars as pl
+
+ from pl_fuzzy_frame_match.output_column_name_utils import set_name_in_fuzzy_mappings
+ from pl_fuzzy_frame_match.pre_process import rename_fuzzy_right_mapping
+
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import fetch_unique_values
  from flowfile_core.configs.flow_logger import main_logger
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
+ from flowfile_core.schemas import transform_schema
+ from flowfile_core.schemas import input_schema
+
+
+ def _ensure_all_columns_have_select(left_cols: List[str],
+                                     right_cols: List[str],
+                                     fuzzy_match_input: transform_schema.FuzzyMatchInput):
+     """
+     Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
+     statements.
+     Args:
+         left_cols (List[str]): List of column names in the left FlowDataEngine.
+         right_cols (List[str]): List of column names in the right FlowDataEngine.
+         fuzzy_match_input (FuzzyMatchInput): Fuzzy match input configuration containing select statements.
+
+     Returns:
+         None
+     """
+     right_cols_in_select = {c.old_name for c in fuzzy_match_input.right_select.renames}
+     left_cols_in_select = {c.old_name for c in fuzzy_match_input.left_select.renames}
  
+     fuzzy_match_input.left_select.renames.extend(
+         [transform_schema.SelectInput(col) for col in left_cols if col not in left_cols_in_select])
+     fuzzy_match_input.right_select.renames.extend(
+         [transform_schema.SelectInput(col) for col in right_cols if col not in right_cols_in_select]
+     )
  
- def calculate_uniqueness(a: float, b: float) -> float:
-     return ((pow(a + 0.5, 2) + pow(b + 0.5, 2)) / 2 - pow(0.5, 2)) + 0.5 * abs(a - b)
+
+ def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: transform_schema.JoinInputs) -> None:
+     """
+     Ensure that the select columns in the fuzzy match input match the order of the incoming columns.
+     This function modifies the join_inputs object in-place.
+
+     Returns:
+         None
+     """
+     select_map = {select.new_name: select for select in join_inputs.renames}
+     ordered_renames = [select_map[col] for col in col_order if col in select_map]
+     join_inputs.renames = ordered_renames
  
  
  def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
                                   left_schema: List[FlowfileColumn],
                                   right_schema: List[FlowfileColumn]):
-     print('calculating fuzzy match schema')
+     _ensure_all_columns_have_select(left_cols=[col.column_name for col in left_schema],
+                                     right_cols=[col.column_name for col in right_schema],
+                                     fuzzy_match_input=fm_input)
+     _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in left_schema],
+                                           join_inputs=fm_input.left_select)
+     _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in right_schema],
+                                           join_inputs=fm_input.right_select)
      left_schema_dict, right_schema_dict = ({ls.name: ls for ls in left_schema}, {rs.name: rs for rs in right_schema})
      fm_input.auto_rename()
  
+     right_renames = {column.old_name: column.new_name for column in fm_input.right_select.renames}
+     new_join_mapping = rename_fuzzy_right_mapping(fm_input.join_mapping, right_renames)
+
      output_schema = []
      for column in fm_input.left_select.renames:
          column_schema = left_schema_dict.get(column.old_name)
@@ -31,9 +78,9 @@ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
          if column_schema and column.keep:
              output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
                                                             example_values=column_schema.example_values))
-
-     for i, fm in enumerate(fm_input.join_mapping):
-         output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
+     set_name_in_fuzzy_mappings(new_join_mapping)
+     output_schema.extend([FlowfileColumn.from_input(fuzzy_mapping.output_column_name, 'Float64')
+                           for fuzzy_mapping in new_join_mapping])
      return output_schema
  
  
@@ -71,7 +118,8 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
      val_column_schema = get_schema_of_column(node_input_schema, pivot_input.value_col)
      if output_fields is not None and len(output_fields) > 0:
          return index_columns_schema+[FlowfileColumn(PlType(column_name=output_field.name,
-                                                            pl_datatype=output_field.data_type)) for output_field in output_fields]
+                                                            pl_datatype=output_field.data_type)) for output_field in
+                                      output_fields]
  
      else:
          max_unique_vals = 200
@@ -84,7 +132,11 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
                               f' Max unique values: {max_unique_vals}')
          pl_output_fields = []
          for val in unique_vals:
-             for agg in pivot_input.aggregations:
-                 output_type = get_output_data_type_pivot(val_column_schema, agg)
-                 pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
+             if len(pivot_input.aggregations) == 1:
+                 output_type = get_output_data_type_pivot(val_column_schema, pivot_input.aggregations[0])
+                 pl_output_fields.append(PlType(column_name=str(val), pl_datatype=output_type))
+             else:
+                 for agg in pivot_input.aggregations:
+                     output_type = get_output_data_type_pivot(val_column_schema, agg)
+                     pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
      return index_columns_schema + [FlowfileColumn(pl_output_field) for pl_output_field in pl_output_fields]
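
Note: the last hunk changes pivot output naming. With exactly one aggregation the unique value itself becomes the column name; with several, each column keeps the value_agg suffix as before. A standalone sketch of the naming rule (values are illustrative; this mirrors the branching above rather than calling the actual helper):

    def pivot_output_names(unique_vals: list, aggregations: list) -> list:
        # Mirrors the branching introduced in pre_calculate_pivot_schema.
        if len(aggregations) == 1:
            return [str(val) for val in unique_vals]
        return [f'{val}_{agg}' for val in unique_vals for agg in aggregations]

    print(pivot_output_names(["NL", "US"], ["sum"]))        # ['NL', 'US']
    print(pivot_output_names(["NL", "US"], ["sum", "max"])) # ['NL_sum', 'NL_max', 'US_sum', 'US_max']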

flowfile_core/flowfile/setting_generator/settings.py
@@ -4,6 +4,7 @@ from typing import Callable, Iterable
  from functools import wraps
  from flowfile_core.schemas.output_model import NodeData
  from flowfile_core.flowfile.setting_generator.setting_generator import SettingGenerator, SettingUpdator
+ from pl_fuzzy_frame_match.models import FuzzyMapping
  
  setting_generator = SettingGenerator()
  setting_updator = SettingUpdator()
@@ -135,7 +136,7 @@ def cross_join(node_data: NodeData):
  
  
  def check_if_fuzzy_match_is_valid(left_columns: Iterable[str], right_columns: Iterable[str],
-                                   fuzzy_map: transform_schema.FuzzyMap) -> bool:
+                                   fuzzy_map: FuzzyMapping) -> bool:
      if fuzzy_map.left_col not in left_columns:
          return False
      if fuzzy_map.right_col not in right_columns:

flowfile_core/flowfile/util/execution_orderer.py
@@ -2,6 +2,15 @@ from typing import List, Dict, Set
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
  from flowfile_core.configs import logger
  from collections import deque, defaultdict
+ from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip
+
+ def compute_execution_plan(nodes: List[FlowNode], flow_starts: List[FlowNode] = None):
+     """ Computes the execution order after finding the nodes to skip on the execution step."""
+     skip_nodes = determine_nodes_to_skip(nodes=nodes)
+     computed_execution_order = determine_execution_order(all_nodes=[node for node in nodes if node not in skip_nodes],
+                                                          flow_starts=flow_starts)
+     return skip_nodes, computed_execution_order
+
  
  
  def determine_execution_order(all_nodes: List[FlowNode], flow_starts: List[FlowNode] = None) -> List[FlowNode]:

flowfile_core/flowfile/util/node_skipper.py (new file)
@@ -0,0 +1,8 @@
+ from typing import List
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+ def determine_nodes_to_skip(nodes : List[FlowNode]) -> List[FlowNode]:
+     """ Finds nodes to skip on the execution step. """
+     skip_nodes = [node for node in nodes if not node.is_correct]
+     skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
+     return skip_nodes
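
Note: the skip rule is deliberately simple: a node whose settings are invalid is skipped, and so are its direct successors (the propagation is a single hop). Since determine_nodes_to_skip only reads is_correct and leads_to_nodes, its behaviour can be sketched with stand-in objects, no real FlowNode required:

    from dataclasses import dataclass, field
    from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip

    @dataclass
    class StubNode:
        # Stand-in exposing only the attributes determine_nodes_to_skip reads.
        name: str
        is_correct: bool = True
        leads_to_nodes: list = field(default_factory=list)

    write = StubNode("write")
    bad_filter = StubNode("filter", is_correct=False, leads_to_nodes=[write])
    read = StubNode("read", leads_to_nodes=[bad_filter])

    skipped = determine_nodes_to_skip([read, bad_filter, write])
    print([n.name for n in skipped])  # ['filter', 'write']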

flowfile_core/schemas/schemas.py
@@ -1,8 +1,35 @@
  from typing import Optional, List, Dict, Tuple, Any, Literal, Annotated
  from pydantic import BaseModel, field_validator, ConfigDict, Field, StringConstraints
  from flowfile_core.flowfile.utils import create_unique_id
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
  ExecutionModeLiteral = Literal['Development', 'Performance']
- ExecutionLocationsLiteral = Literal['auto', 'local', 'remote']
+ ExecutionLocationsLiteral = Literal['local', 'remote']
+
+
+ def get_global_execution_location() -> ExecutionLocationsLiteral:
+     """
+     Calculates the default execution location based on the global settings
+     Returns
+     -------
+     ExecutionLocationsLiteral where the current
+     """
+     if OFFLOAD_TO_WORKER:
+         return "remote"
+     return "local"
+
+
+ def is_valid_execution_location_in_current_global_settings(execution_location: ExecutionLocationsLiteral) -> bool:
+     return not (get_global_execution_location() == "local" and execution_location == "remote")
+
+
+ def get_prio_execution_location(local_execution_location: ExecutionLocationsLiteral,
+                                 global_execution_location: ExecutionLocationsLiteral) -> ExecutionLocationsLiteral:
+     if local_execution_location == global_execution_location:
+         return local_execution_location
+     elif global_execution_location == "local" and local_execution_location == "remote":
+         return "local"
+     else:
+         return local_execution_location
  
  
  class FlowGraphConfig(BaseModel):
@@ -16,7 +43,7 @@ class FlowGraphConfig(BaseModel):
          name (str): The name of the flow.
          path (str): The file path associated with the flow.
          execution_mode (ExecutionModeLiteral): The mode of execution ('Development' or 'Performance').
-         execution_location (ExecutionLocationsLiteral): The location for execution ('auto', 'local', 'remote').
+         execution_location (ExecutionLocationsLiteral): The location for execution ('local', 'remote').
      """
      flow_id: int = Field(default_factory=create_unique_id, description="Unique identifier for the flow.")
      description: Optional[str] = None
@@ -24,7 +51,23 @@ class FlowGraphConfig(BaseModel):
      name: str = ''
      path: str = ''
      execution_mode: ExecutionModeLiteral = 'Performance'
-     execution_location: ExecutionLocationsLiteral = "auto"
+     execution_location: ExecutionLocationsLiteral = Field(default_factory=get_global_execution_location)
+
+     @field_validator('execution_location', mode='before')
+     def validate_and_set_execution_location(cls, v: Optional[ExecutionLocationsLiteral]) -> ExecutionLocationsLiteral:
+         """
+         Validates and sets the execution location.
+         1. **If `None` is provided**: It defaults to the location determined by global settings.
+         2. **If a value is provided**: It checks if the value is compatible with the global
+            settings. If not (e.g., requesting 'remote' when only 'local' is possible),
+            it corrects the value to a compatible one.
+         """
+         if v is None:
+             return get_global_execution_location()
+         if v == "auto":
+             return get_global_execution_location()
+
+         return get_prio_execution_location(v, get_global_execution_location())
  
  
  class FlowSettings(FlowGraphConfig):
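
Note: taken together, these helpers retire 'auto': a missing or 'auto' value falls back to the global default, and a requested 'remote' is downgraded to 'local' when the global settings cannot offload to a worker. A small sketch, assuming the module is importable as flowfile_core.schemas.schemas per the file listing:

    from flowfile_core.schemas.schemas import get_prio_execution_location

    print(get_prio_execution_location("remote", "remote"))  # 'remote'
    print(get_prio_execution_location("local", "remote"))   # 'local' (local preference wins)
    print(get_prio_execution_location("remote", "local"))   # 'local' (downgraded: no worker available)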

flowfile_core/schemas/transform_schema.py
@@ -6,6 +6,8 @@ from copy import deepcopy
  
  from typing import NamedTuple
  
+ from pl_fuzzy_frame_match.models import FuzzyMapping
+
  
  def get_func_type_mapping(func: str):
      """Infers the output data type of common aggregation functions."""
@@ -158,6 +160,19 @@ class SelectInputs:
          """Gets a list of original column names to select from the source DataFrame."""
          return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
  
+     def has_drop_cols(self) -> bool:
+         """Checks if any column is marked to be dropped from the selection."""
+         return any(not v.keep for v in self.renames)
+
+     @property
+     def drop_columns(self) -> List[SelectInput]:
+         """Returns a list of column names that are marked to be dropped from the selection."""
+         return [v for v in self.renames if not v.keep and v.is_available]
+
+     @property
+     def non_jk_drop_columns(self) -> List[SelectInput]:
+         return [v for v in self.renames if not v.keep and v.is_available and not v.join_key]
+
      def __add__(self, other: "SelectInput"):
          """Allows adding a SelectInput using the '+' operator."""
          self.renames.append(other)
@@ -225,32 +240,6 @@ class JoinMap:
      right_col: str
  
  
- @dataclass
- class FuzzyMap(JoinMap):
-     """Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
-     threshold_score: Optional[float] = 80.0
-     fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
-     perc_unique: Optional[float] = 0.0
-     output_column_name: Optional[str] = None
-     valid: Optional[bool] = True
-
-     def __init__(self, left_col: str, right_col: str = None, threshold_score: float = 80.0,
-                  fuzzy_type: FuzzyTypeLiteral = 'levenshtein', perc_unique: float = 0, output_column_name: str = None,
-                  _output_col_name: str = None, valid: bool = True, output_col_name: str = None):
-         if right_col is None:
-             right_col = left_col
-         self.valid = valid
-         self.left_col = left_col
-         self.right_col = right_col
-         self.threshold_score = threshold_score
-         self.fuzzy_type = fuzzy_type
-         self.perc_unique = perc_unique
-         self.output_column_name = output_column_name if output_column_name is not None else _output_col_name
-         self.output_column_name = self.output_column_name if self.output_column_name is not None else output_col_name
-         if self.output_column_name is None:
-             self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
-
-
  class JoinSelectMixin:
      """A mixin providing common methods for join-like operations that involve left and right inputs."""
      left_select: JoinInputs = None
@@ -430,32 +419,32 @@ class JoinInput(JoinSelectMixin):
  @dataclass
  class FuzzyMatchInput(JoinInput):
      """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
-     join_mapping: List[FuzzyMap]
+     join_mapping: List[FuzzyMapping]
      aggregate_output: bool = False
  
      @staticmethod
-     def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMap] | Tuple[str, str] | str) -> List[FuzzyMap]:
+     def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMapping] | Tuple[str, str] | str) -> List[FuzzyMapping]:
          if isinstance(fuzz_mapping, (tuple, list)):
              assert len(fuzz_mapping) > 0
              if all(isinstance(fm, dict) for fm in fuzz_mapping):
-                 fuzz_mapping = [FuzzyMap(**fm) for fm in fuzz_mapping]
+                 fuzz_mapping = [FuzzyMapping(**fm) for fm in fuzz_mapping]
  
-             if not isinstance(fuzz_mapping[0], FuzzyMap):
+             if not isinstance(fuzz_mapping[0], FuzzyMapping):
                  assert len(fuzz_mapping) <= 2
                  if len(fuzz_mapping) == 2:
                      assert isinstance(fuzz_mapping[0], str) and isinstance(fuzz_mapping[1], str)
-                     fuzz_mapping = [FuzzyMap(*fuzz_mapping)]
+                     fuzz_mapping = [FuzzyMapping(*fuzz_mapping)]
                  elif isinstance(fuzz_mapping[0], str):
-                     fuzz_mapping = [FuzzyMap(fuzz_mapping[0], fuzz_mapping[0])]
+                     fuzz_mapping = [FuzzyMapping(fuzz_mapping[0], fuzz_mapping[0])]
          elif isinstance(fuzz_mapping, str):
-             fuzz_mapping = [FuzzyMap(fuzz_mapping, fuzz_mapping)]
-         elif isinstance(fuzz_mapping, FuzzyMap):
+             fuzz_mapping = [FuzzyMapping(fuzz_mapping, fuzz_mapping)]
+         elif isinstance(fuzz_mapping, FuzzyMapping):
              fuzz_mapping = [fuzz_mapping]
          else:
              raise Exception('No valid join mapping as input')
          return fuzz_mapping
  
-     def __init__(self, join_mapping: List[FuzzyMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
+     def __init__(self, join_mapping: List[FuzzyMapping] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
                   right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
          self.join_mapping = self.parse_fuzz_mapping(join_mapping)
          self.left_select = self.parse_select(left_select)
@@ -463,9 +452,9 @@ class FuzzyMatchInput(JoinInput):
          self.how = how
          for jm in self.join_mapping:
  
-             if jm.right_col not in self.right_select.old_cols:
+             if jm.right_col not in {v.old_name for v in self.right_select.renames}:
                  self.right_select.append(SelectInput(jm.right_col, keep=False, join_key=True))
-             if jm.left_col not in self.left_select.old_cols:
+             if jm.left_col not in {v.old_name for v in self.left_select.renames}:
                  self.left_select.append(SelectInput(jm.left_col, keep=False, join_key=True))
          [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
          [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
@@ -476,7 +465,7 @@ class FuzzyMatchInput(JoinInput):
          return self.left_select.new_cols & self.right_select.new_cols
  
      @property
-     def fuzzy_maps(self) -> List[FuzzyMap]:
+     def fuzzy_maps(self) -> List[FuzzyMapping]:
          """Returns the final fuzzy mappings after applying all column renames."""
          new_mappings = []
          left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
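
Note: callers that previously built transform_schema.FuzzyMap now pass pl_fuzzy_frame_match.models.FuzzyMapping; the shorthand forms accepted by parse_fuzz_mapping are unchanged. A hedged sketch (column names are invented, and FuzzyMapping is assumed to take the same positional left/right columns the parser relies on above):

    from flowfile_core.schemas import transform_schema
    from pl_fuzzy_frame_match.models import FuzzyMapping

    # Tuple shorthand: parse_fuzz_mapping turns ("name", "company_name")
    # into [FuzzyMapping("name", "company_name")].
    fm_input = transform_schema.FuzzyMatchInput(
        join_mapping=("name", "company_name"),
        left_select=["name", "city"],
        right_select=["company_name", "country"],
    )

    # Equivalent explicit form:
    fm_input = transform_schema.FuzzyMatchInput(
        join_mapping=[FuzzyMapping("name", "company_name")],
        left_select=["name", "city"],
        right_select=["company_name", "country"],
    )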

flowfile_core/utils/arrow_reader.py
@@ -138,11 +138,16 @@ def collect_batches(reader: pa.ipc.RecordBatchFileReader, n: int) -> Tuple[List[
      rows_collected = 0
  
      for batch in iter_batches(reader, n, rows_collected):
-         batches.append(batch)
+
          rows_collected += batch.num_rows
          logger.debug(f"Collected batch: total rows now {rows_collected}")
          if rows_collected >= n:
+             if rows_collected > n:
+                 batches.append(batch.slice(0, n - (rows_collected - batch.num_rows)))
+             else:
+                 batches.append(batch)
              break
+         batches.append(batch)
  
      logger.info(f"Finished collecting {len(batches)} batches with {rows_collected} total rows")
      return batches, rows_collected
@@ -217,7 +222,7 @@ def read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> pa.Table:
  
      table = pa.Table.from_batches(batches) # type: ignore
      logger.info(f"Successfully read {rows_collected} rows from {file_path}")
-     return table
+     return table
  
  
  def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Callable[[], pa.Table]:
@@ -244,4 +249,4 @@ def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Calla
      >>> table = reader_func()
      """
      logger.info(f"Creating reader function for {file_path} with n={n}, strict={strict}")
-     return lambda: read_top_n(file_path, n, strict)
+     return lambda: read_top_n(file_path, n, strict)
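
Note: the collect_batches change fixes an over-read: previously every batch was appended whole, so the result could exceed n rows; now the final batch is sliced so the collected batches hold exactly n. The slice arithmetic in isolation (pyarrow only; numbers are illustrative):

    import pyarrow as pa

    n = 10
    rows_before = 8                                             # rows collected before the final batch
    batch = pa.record_batch([pa.array(range(6))], names=["x"])  # a 6-row batch arrives

    rows_collected = rows_before + batch.num_rows               # 14, overshoots n
    still_needed = n - (rows_collected - batch.num_rows)        # 10 - 8 = 2
    trimmed = batch.slice(0, still_needed)
    print(trimmed.num_rows)                                     # 2 -> totals land exactly on n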