Flowfile flowfile-0.3.7-py3-none-any.whl → flowfile-0.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +4 -3
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +2 -0
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
- flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
- flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
- flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
- flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
- flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
- flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
- flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
- flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
- flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
- flowfile_core/configs/settings.py +4 -2
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
- flowfile_core/flowfile/flow_graph.py +129 -88
- flowfile_core/flowfile/flow_node/flow_node.py +30 -15
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
- flowfile_core/flowfile/setting_generator/settings.py +2 -1
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/schemas/schemas.py +46 -3
- flowfile_core/schemas/transform_schema.py +27 -38
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/expr.py +14 -0
- flowfile_frame/flow_frame.py +34 -5
- flowfile_frame/flow_frame.pyi +5 -6
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/models.py +3 -1
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
flowfile_core/flowfile/graph_tree/graph_tree.py
ADDED

@@ -0,0 +1,250 @@
+from pydantic import BaseModel
+
+from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo
+
+
+def calculate_depth(node_id: int, node_info: dict[int, BranchInfo], visited: set = None) -> int:
+    """Calculates the depth of each node."""
+
+    if visited is None:
+        visited = set()
+    if node_id in visited:
+        return node_info[node_id].depth
+    visited.add(node_id)
+
+    max_input_depth = -1
+    inputs = node_info[node_id].inputs
+
+    for main_id in inputs.main:
+        max_input_depth = max(max_input_depth, calculate_depth(main_id, node_info, visited))
+    if inputs.left:
+        max_input_depth = max(max_input_depth, calculate_depth(inputs.left, node_info, visited))
+    if inputs.right:
+        max_input_depth = max(max_input_depth, calculate_depth(inputs.right, node_info, visited))
+
+    node_info[node_id].depth = max_input_depth + 1
+    return node_info[node_id].depth
+
+
+# Trace paths from each root
+def trace_path(node_id: int, node_info: dict[int, BranchInfo], merge_points: dict[int, list[int]],
+               current_path: list[int] | None = None):
+    """Define the trace of each node path"""
+    if current_path is None:
+        current_path = []
+
+    current_path = current_path + [node_id]
+    outputs = node_info[node_id].outputs
+
+    if not outputs:
+        # End of path
+        return [current_path]
+
+    # If this node has multiple outputs or connects to a merge point, branch
+    all_paths = []
+    for output_id in outputs:
+        if output_id in merge_points and len(merge_points[output_id]) > 1:
+            # This is a merge point, end this path here
+            all_paths.append(current_path + [output_id])
+        else:
+            # Continue the path
+            all_paths.extend(trace_path(output_id, node_info, merge_points, current_path))
+    return all_paths
+
+
+def build_node_info(nodes: list[FlowNode]) -> dict[int, BranchInfo]:
+    """Builds node information used to construct the graph tree."""
+
+    node_info = {}
+    for node in nodes:
+        node_id = node.node_id
+
+        # Get node label
+        operation = node.node_type.replace("_", " ").title() if node.node_type else "Unknown"
+        label = f"{operation} (id={node_id})"
+        if hasattr(node, 'setting_input') and hasattr(node.setting_input, 'description'):
+            if node.setting_input.description:
+                desc = node.setting_input.description
+                if len(desc) > 20:  # Truncate long descriptions
+                    desc = desc[:17] + "..."
+                label = f"{operation} ({node_id}): {desc}"
+
+        # Get inputs and outputs
+        inputs = InputInfo(
+            main=[n.node_id for n in (node.node_inputs.main_inputs or [])],
+            left=node.node_inputs.left_input.node_id if node.node_inputs.left_input else None,
+            right=node.node_inputs.right_input.node_id if node.node_inputs.right_input else None
+        )
+        outputs = [n.node_id for n in node.leads_to_nodes]
+
+        node_info[node_id] = BranchInfo(
+            label=label,
+            short_label=f"{operation} ({node_id})",
+            inputs=inputs,
+            outputs=outputs,
+            depth=0
+        )
+
+    return node_info
+
+
+def group_nodes_by_depth(node_info: dict[int, BranchInfo]) -> tuple[dict[int, list[int]], int]:
+    """Groups each node by depth"""
+    depth_groups = {}
+    max_depth = 0
+    for node_id, info in node_info.items():
+        depth = info.depth
+        max_depth = max(max_depth, depth)
+        if depth not in depth_groups:
+            depth_groups[depth] = []
+        depth_groups[depth].append(node_id)
+
+    return depth_groups, max_depth
+
+
+def define_node_connections(node_info: dict[int, BranchInfo]) -> dict[int, list[int]]:
+    """Defines node connections to merge"""
+    merge_points = {}  # target_id -> list of source_ids
+    for node_id, info in node_info.items():
+        for output_id in info.outputs:
+            if output_id not in merge_points:
+                merge_points[output_id] = []
+            merge_points[output_id].append(node_id)
+
+    return merge_points
+
+
+def build_flow_paths(node_info: dict[int, BranchInfo], flow_starts: list[FlowNode],
+                     merge_points: dict[int, list[int]]):
+    """Build the flow paths to be drawn"""
+
+    # Find all root nodes (no inputs)
+    root_nodes = [nid for nid, info in node_info.items()
+                  if not info.inputs.main and not info.inputs.left and not info.inputs.right]
+
+    if not root_nodes and flow_starts:
+        root_nodes = [n.node_id for n in flow_starts]
+    paths = []  # List of paths through the graph
+
+    # Get all paths
+    for root_id in root_nodes:
+        paths.extend(trace_path(root_id, node_info, merge_points))
+
+    return paths
+
+
+def group_paths(paths: list, merge_points: dict):
+    """Groups each node path."""
+    paths_by_merge = {}
+    standalone_paths = []
+
+    for path in paths:
+        if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
+            merge_id = path[-1]
+            if merge_id not in paths_by_merge:
+                paths_by_merge[merge_id] = []
+            paths_by_merge[merge_id].append(path)
+        else:
+            standalone_paths.append(path)
+    return paths_by_merge, standalone_paths
+
+
+def draw_merged_paths(node_info: dict[int, BranchInfo],
+                      merge_points: dict[int, list[int]],
+                      paths_by_merge: dict[int, list[list[int]]],
+                      merge_drawn: set,
+                      drawn_nodes: set,
+                      lines: list[str]):
+    """Draws paths for each node that merges."""
+    for merge_id, merge_paths in paths_by_merge.items():
+        if merge_id in merge_drawn:
+            continue
+        merge_info = node_info[merge_id]
+        sources = merge_points[merge_id]
+
+        # Draw each source path leading to the merge
+        for i, source_id in enumerate(sources):
+            # Find the path containing this source
+            source_path = None
+            for path in merge_paths:
+                if source_id in path:
+                    source_path = path[:path.index(source_id) + 1]
+                    break
+
+            if source_path:
+                # Build the line for this path
+                line_parts = []
+                for j, nid in enumerate(source_path):
+                    if j == 0:
+                        line_parts.append(node_info[nid].label)
+                    else:
+                        line_parts.append(f" ──> {node_info[nid].short_label}")
+
+                # Add the merge arrow
+                if i == 0:
+                    # First source
+                    line = "".join(line_parts) + " ─────┐"
+                    lines.append(line)
+                elif i == len(sources) - 1:
+                    # Last source
+                    line = "".join(line_parts) + " ─────┴──> " + merge_info.label
+                    lines.append(line)
+
+                    # Continue with the rest of the path after merge
+                    remaining = node_info[merge_id].outputs
+                    while remaining:
+                        next_id = remaining[0]
+                        lines[-1] += f" ──> {node_info[next_id].label}"
+                        remaining = node_info[next_id].outputs
+                        drawn_nodes.add(next_id)
+                else:
+                    # Middle sources
+                    line = "".join(line_parts) + " ─────┤"
+                    lines.append(line)
+
+                for nid in source_path:
+                    drawn_nodes.add(nid)
+
+        drawn_nodes.add(merge_id)
+        merge_drawn.add(merge_id)
+        lines.append("")  # Add spacing between merge groups
+    return paths_by_merge
+
+
+def draw_standalone_paths(drawn_nodes: set[int], standalone_paths: list[list[int]], lines: list[str],
+                          node_info: dict[int, BranchInfo]):
+    """ Draws paths that do not merge."""
+    # Draw standalone paths
+    for path in standalone_paths:
+        if all(nid in drawn_nodes for nid in path):
+            continue
+
+        line_parts = []
+        for i, node_id in enumerate(path):
+            if node_id not in drawn_nodes:
+                if i == 0:
+                    line_parts.append(node_info[node_id].label)
+                else:
+                    line_parts.append(f" ──> {node_info[node_id].short_label}")
+                drawn_nodes.add(node_id)
+
+        if line_parts:
+            lines.append("".join(line_parts))
+
+
+def add_un_drawn_nodes(drawn_nodes: set[int], node_info: dict[int, BranchInfo], lines: list[str]):
+    """Adds isolated nodes if exists."""
+    # Add any remaining undrawn nodes
+
+    for node_id in node_info:
+        if node_id not in drawn_nodes:
+            lines.append(node_info[node_id].label + " (isolated)")
+
+    lines.append("")
+    lines.append("=" * 80)
+    lines.append("Execution Order")
+    lines.append("=" * 80)
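Note (not part of the diff): a minimal sketch exercising the new graph_tree helpers on a hand-built node_info dict, a diamond where nodes 1 and 2 merge into 3, which feeds 4. BranchInfo and InputInfo come from the new models module shown next; the dict and the mk() helper stand in for what build_node_info() derives from real FlowNode objects.

    from flowfile_core.flowfile.graph_tree.graph_tree import (
        calculate_depth, define_node_connections, trace_path, group_paths)
    from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo

    def mk(label, nid, inputs, outputs):
        # Helper for this sketch only.
        return BranchInfo(label=f"{label} (id={nid})", short_label=f"{label} ({nid})",
                          inputs=inputs, outputs=outputs, depth=0)

    node_info = {
        1: mk("Read", 1, InputInfo(main=[]), [3]),
        2: mk("Manual Input", 2, InputInfo(main=[]), [3]),
        3: mk("Join", 3, InputInfo(main=[1, 2]), [4]),
        4: mk("Output", 4, InputInfo(main=[3]), []),
    }
    for nid in node_info:
        calculate_depth(nid, node_info)                # depths: 1 -> 0, 2 -> 0, 3 -> 1, 4 -> 2
    merge_points = define_node_connections(node_info)  # {3: [1, 2], 4: [3]}
    paths = trace_path(1, node_info, merge_points) + trace_path(2, node_info, merge_points)
    paths_by_merge, standalone = group_paths(paths, merge_points)
    # Both paths stop at the merge point: paths_by_merge == {3: [[1, 3], [2, 3]]}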
flowfile_core/flowfile/graph_tree/models.py
ADDED

@@ -0,0 +1,15 @@
+from pydantic import BaseModel
+
+
+class InputInfo(BaseModel):
+    main: list[int]
+    right: int | None = None
+    left: int | None = None
+
+
+class BranchInfo(BaseModel):
+    label: str
+    short_label: str
+    inputs: InputInfo
+    outputs: list[int]
+    depth: int
flowfile_core/flowfile/manage/compatibility_enhancements.py
CHANGED

@@ -48,7 +48,7 @@ def ensure_compatibility(flow_storage_obj: schemas.FlowInformation, flow_path: s
         setattr(flow_storage_obj, 'flow_settings', flow_settings)
         flow_storage_obj = schemas.FlowInformation.model_validate(flow_storage_obj)
     elif not hasattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location'):
-        setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location',
+        setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location', "remote")
     elif not hasattr(flow_storage_obj.flow_settings, 'is_running'):
         setattr(flow_storage_obj.flow_settings, 'is_running', False)
         setattr(flow_storage_obj.flow_settings, 'is_canceled', False)
flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py}
RENAMED
@@ -1,25 +1,72 @@
 
 from typing import List
-
-from flowfile_core.schemas import transform_schema
-from flowfile_core.schemas import input_schema
+
 from polars import datatypes
 import polars as pl
+
+from pl_fuzzy_frame_match.output_column_name_utils import set_name_in_fuzzy_mappings
+from pl_fuzzy_frame_match.pre_process import rename_fuzzy_right_mapping
+
 from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import fetch_unique_values
 from flowfile_core.configs.flow_logger import main_logger
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
+from flowfile_core.schemas import transform_schema
+from flowfile_core.schemas import input_schema
+
+
+def _ensure_all_columns_have_select(left_cols: List[str],
+                                    right_cols: List[str],
+                                    fuzzy_match_input: transform_schema.FuzzyMatchInput):
+    """
+    Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
+    statements.
+    Args:
+        left_cols (List[str]): List of column names in the left FlowDataEngine.
+        right_cols (List[str]): List of column names in the right FlowDataEngine.
+        fuzzy_match_input (FuzzyMatchInput): Fuzzy match input configuration containing select statements.
+
+    Returns:
+        None
+    """
+    right_cols_in_select = {c.old_name for c in fuzzy_match_input.right_select.renames}
+    left_cols_in_select = {c.old_name for c in fuzzy_match_input.left_select.renames}
 
+    fuzzy_match_input.left_select.renames.extend(
+        [transform_schema.SelectInput(col) for col in left_cols if col not in left_cols_in_select])
+    fuzzy_match_input.right_select.renames.extend(
+        [transform_schema.SelectInput(col) for col in right_cols if col not in right_cols_in_select]
+    )
 
-
-
+
+def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: transform_schema.JoinInputs) -> None:
+    """
+    Ensure that the select columns in the fuzzy match input match the order of the incoming columns.
+    This function modifies the join_inputs object in-place.
+
+    Returns:
+        None
+    """
+    select_map = {select.new_name: select for select in join_inputs.renames}
+    ordered_renames = [select_map[col] for col in col_order if col in select_map]
+    join_inputs.renames = ordered_renames
 
 
 def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
                                  left_schema: List[FlowfileColumn],
                                  right_schema: List[FlowfileColumn]):
-
+    _ensure_all_columns_have_select(left_cols=[col.column_name for col in left_schema],
+                                    right_cols=[col.column_name for col in right_schema],
+                                    fuzzy_match_input=fm_input)
+    _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in left_schema],
+                                          join_inputs=fm_input.left_select)
+    _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in right_schema],
+                                          join_inputs=fm_input.right_select)
     left_schema_dict, right_schema_dict = ({ls.name: ls for ls in left_schema}, {rs.name: rs for rs in right_schema})
     fm_input.auto_rename()
 
+    right_renames = {column.old_name: column.new_name for column in fm_input.right_select.renames}
+    new_join_mapping = rename_fuzzy_right_mapping(fm_input.join_mapping, right_renames)
+
     output_schema = []
     for column in fm_input.left_select.renames:
         column_schema = left_schema_dict.get(column.old_name)

@@ -31,9 +78,9 @@ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
         if column_schema and column.keep:
             output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
                                                            example_values=column_schema.example_values))
-
-
-
+    set_name_in_fuzzy_mappings(new_join_mapping)
+    output_schema.extend([FlowfileColumn.from_input(fuzzy_mapping.output_column_name, 'Float64')
+                          for fuzzy_mapping in new_join_mapping])
     return output_schema
 
 

@@ -71,7 +118,8 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
     val_column_schema = get_schema_of_column(node_input_schema, pivot_input.value_col)
     if output_fields is not None and len(output_fields) > 0:
         return index_columns_schema+[FlowfileColumn(PlType(column_name=output_field.name,
-                                                           pl_datatype=output_field.data_type)) for output_field in
+                                                           pl_datatype=output_field.data_type)) for output_field in
+                                     output_fields]
 
     else:
         max_unique_vals = 200

@@ -84,7 +132,11 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
                     f' Max unique values: {max_unique_vals}')
     pl_output_fields = []
     for val in unique_vals:
-
-        output_type = get_output_data_type_pivot(val_column_schema,
-        pl_output_fields.append(PlType(column_name=
+        if len(pivot_input.aggregations) == 1:
+            output_type = get_output_data_type_pivot(val_column_schema, pivot_input.aggregations[0])
+            pl_output_fields.append(PlType(column_name=str(val), pl_datatype=output_type))
+        else:
+            for agg in pivot_input.aggregations:
+                output_type = get_output_data_type_pivot(val_column_schema, agg)
+                pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
     return index_columns_schema + [FlowfileColumn(pl_output_field) for pl_output_field in pl_output_fields]
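A quick illustration (plain dicts standing in for SelectInput objects, hypothetical column names) of the reorder that _order_join_inputs_based_on_col_order performs in place: renames are rearranged to follow the incoming schema order, and entries whose new_name no longer appears in that schema are dropped. The pivot hunk above also changes output naming: a single aggregation yields columns named str(val), while multiple aggregations yield f'{val}_{agg}'.

    renames = [{"new_name": "city"}, {"new_name": "id"}, {"new_name": "stale"}]
    col_order = ["id", "name", "city"]  # order of the incoming columns
    select_map = {r["new_name"]: r for r in renames}
    ordered = [select_map[c] for c in col_order if c in select_map]
    # ordered == [{"new_name": "id"}, {"new_name": "city"}]:
    # "stale" is dropped (not in the incoming schema), "name" has no rename entry.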
flowfile_core/flowfile/setting_generator/settings.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Callable, Iterable
 from functools import wraps
 from flowfile_core.schemas.output_model import NodeData
 from flowfile_core.flowfile.setting_generator.setting_generator import SettingGenerator, SettingUpdator
+from pl_fuzzy_frame_match.models import FuzzyMapping
 
 setting_generator = SettingGenerator()
 setting_updator = SettingUpdator()

@@ -135,7 +136,7 @@ def cross_join(node_data: NodeData):
 
 
 def check_if_fuzzy_match_is_valid(left_columns: Iterable[str], right_columns: Iterable[str],
-                                  fuzzy_map:
+                                  fuzzy_map: FuzzyMapping) -> bool:
     if fuzzy_map.left_col not in left_columns:
         return False
     if fuzzy_map.right_col not in right_columns:
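The re-typed validator can be exercised directly; the column names here are hypothetical, and only the two membership checks visible in the hunk are asserted (the rest of the function sits outside it):

    from pl_fuzzy_frame_match.models import FuzzyMapping
    from flowfile_core.flowfile.setting_generator.settings import check_if_fuzzy_match_is_valid

    fm = FuzzyMapping("name", "full_name")
    check_if_fuzzy_match_is_valid(["name"], ["surname"], fm)     # False: right_col missing
    check_if_fuzzy_match_is_valid(["title"], ["full_name"], fm)  # False: left_col missing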
flowfile_core/flowfile/util/execution_orderer.py
CHANGED

@@ -2,6 +2,15 @@ from typing import List, Dict, Set
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_core.configs import logger
 from collections import deque, defaultdict
+from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip
+
+def compute_execution_plan(nodes: List[FlowNode], flow_starts: List[FlowNode] = None):
+    """ Computes the execution order after finding the nodes to skip on the execution step."""
+    skip_nodes = determine_nodes_to_skip(nodes=nodes)
+    computed_execution_order = determine_execution_order(all_nodes=[node for node in nodes if node not in skip_nodes],
+                                                         flow_starts=flow_starts)
+    return skip_nodes, computed_execution_order
+
 
 
 def determine_execution_order(all_nodes: List[FlowNode], flow_starts: List[FlowNode] = None) -> List[FlowNode]:
flowfile_core/flowfile/util/node_skipper.py
ADDED

@@ -0,0 +1,8 @@
+from typing import List
+from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+def determine_nodes_to_skip(nodes: List[FlowNode]) -> List[FlowNode]:
+    """ Finds nodes to skip on the execution step. """
+    skip_nodes = [node for node in nodes if not node.is_correct]
+    skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
+    return skip_nodes
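Taken together, the two new helpers split planning into a "skip" step and an "order" step. A minimal sketch with SimpleNamespace stand-ins for FlowNode, carrying only the attributes the skipper reads:

    from types import SimpleNamespace
    from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip

    c = SimpleNamespace(is_correct=True, leads_to_nodes=[])
    b = SimpleNamespace(is_correct=False, leads_to_nodes=[c])
    a = SimpleNamespace(is_correct=True, leads_to_nodes=[b])

    determine_nodes_to_skip([a, b, c])  # [b, c]: b fails is_correct, c consumes b
    # Note the comprehension is evaluated before extend(), so only the direct
    # successors of the initially invalid nodes are added here.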
flowfile_core/schemas/schemas.py
CHANGED
@@ -1,8 +1,35 @@
 from typing import Optional, List, Dict, Tuple, Any, Literal, Annotated
 from pydantic import BaseModel, field_validator, ConfigDict, Field, StringConstraints
 from flowfile_core.flowfile.utils import create_unique_id
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 ExecutionModeLiteral = Literal['Development', 'Performance']
-ExecutionLocationsLiteral = Literal['
+ExecutionLocationsLiteral = Literal['local', 'remote']
+
+
+def get_global_execution_location() -> ExecutionLocationsLiteral:
+    """
+    Calculates the default execution location based on the global settings
+    Returns
+    -------
+    ExecutionLocationsLiteral where the current
+    """
+    if OFFLOAD_TO_WORKER:
+        return "remote"
+    return "local"
+
+
+def is_valid_execution_location_in_current_global_settings(execution_location: ExecutionLocationsLiteral) -> bool:
+    return not (get_global_execution_location() == "local" and execution_location == "remote")
+
+
+def get_prio_execution_location(local_execution_location: ExecutionLocationsLiteral,
+                                global_execution_location: ExecutionLocationsLiteral) -> ExecutionLocationsLiteral:
+    if local_execution_location == global_execution_location:
+        return local_execution_location
+    elif global_execution_location == "local" and local_execution_location == "remote":
+        return "local"
+    else:
+        return local_execution_location
 
 
 class FlowGraphConfig(BaseModel):

@@ -16,7 +43,7 @@ class FlowGraphConfig(BaseModel):
         name (str): The name of the flow.
         path (str): The file path associated with the flow.
         execution_mode (ExecutionModeLiteral): The mode of execution ('Development' or 'Performance').
-        execution_location (ExecutionLocationsLiteral): The location for execution ('
+        execution_location (ExecutionLocationsLiteral): The location for execution ('local', 'remote').
     """
     flow_id: int = Field(default_factory=create_unique_id, description="Unique identifier for the flow.")
     description: Optional[str] = None

@@ -24,7 +51,23 @@ class FlowGraphConfig(BaseModel):
     name: str = ''
     path: str = ''
     execution_mode: ExecutionModeLiteral = 'Performance'
-    execution_location: ExecutionLocationsLiteral =
+    execution_location: ExecutionLocationsLiteral = Field(default_factory=get_global_execution_location)
+
+    @field_validator('execution_location', mode='before')
+    def validate_and_set_execution_location(cls, v: Optional[ExecutionLocationsLiteral]) -> ExecutionLocationsLiteral:
+        """
+        Validates and sets the execution location.
+        1. **If `None` is provided**: It defaults to the location determined by global settings.
+        2. **If a value is provided**: It checks if the value is compatible with the global
+           settings. If not (e.g., requesting 'remote' when only 'local' is possible),
+           it corrects the value to a compatible one.
+        """
+        if v is None:
+            return get_global_execution_location()
+        if v == "auto":
+            return get_global_execution_location()
+
+        return get_prio_execution_location(v, get_global_execution_location())
 
 
 class FlowSettings(FlowGraphConfig):
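The precedence rule added here is small enough to check by hand; 'remote' is only honored when the global settings allow it:

    from flowfile_core.schemas.schemas import get_prio_execution_location

    get_prio_execution_location("remote", "remote")  # 'remote'
    get_prio_execution_location("local", "remote")   # 'local'  (flow-level choice wins)
    get_prio_execution_location("remote", "local")   # 'local'  (downgraded to match global)
    get_prio_execution_location("local", "local")    # 'local'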
flowfile_core/schemas/transform_schema.py
CHANGED

@@ -6,6 +6,8 @@ from copy import deepcopy
 
 from typing import NamedTuple
 
+from pl_fuzzy_frame_match.models import FuzzyMapping
+
 
 def get_func_type_mapping(func: str):
     """Infers the output data type of common aggregation functions."""

@@ -158,6 +160,19 @@ class SelectInputs:
         """Gets a list of original column names to select from the source DataFrame."""
         return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
 
+    def has_drop_cols(self) -> bool:
+        """Checks if any column is marked to be dropped from the selection."""
+        return any(not v.keep for v in self.renames)
+
+    @property
+    def drop_columns(self) -> List[SelectInput]:
+        """Returns a list of column names that are marked to be dropped from the selection."""
+        return [v for v in self.renames if not v.keep and v.is_available]
+
+    @property
+    def non_jk_drop_columns(self) -> List[SelectInput]:
+        return [v for v in self.renames if not v.keep and v.is_available and not v.join_key]
+
     def __add__(self, other: "SelectInput"):
         """Allows adding a SelectInput using the '+' operator."""
         self.renames.append(other)

@@ -225,32 +240,6 @@ class JoinMap:
     right_col: str
 
 
-@dataclass
-class FuzzyMap(JoinMap):
-    """Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
-    threshold_score: Optional[float] = 80.0
-    fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
-    perc_unique: Optional[float] = 0.0
-    output_column_name: Optional[str] = None
-    valid: Optional[bool] = True
-
-    def __init__(self, left_col: str, right_col: str = None, threshold_score: float = 80.0,
-                 fuzzy_type: FuzzyTypeLiteral = 'levenshtein', perc_unique: float = 0, output_column_name: str = None,
-                 _output_col_name: str = None, valid: bool = True, output_col_name: str = None):
-        if right_col is None:
-            right_col = left_col
-        self.valid = valid
-        self.left_col = left_col
-        self.right_col = right_col
-        self.threshold_score = threshold_score
-        self.fuzzy_type = fuzzy_type
-        self.perc_unique = perc_unique
-        self.output_column_name = output_column_name if output_column_name is not None else _output_col_name
-        self.output_column_name = self.output_column_name if self.output_column_name is not None else output_col_name
-        if self.output_column_name is None:
-            self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
-
-
 class JoinSelectMixin:
     """A mixin providing common methods for join-like operations that involve left and right inputs."""
     left_select: JoinInputs = None

@@ -430,32 +419,32 @@ class JoinInput(JoinSelectMixin):
 @dataclass
 class FuzzyMatchInput(JoinInput):
     """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
-    join_mapping: List[
+    join_mapping: List[FuzzyMapping]
     aggregate_output: bool = False
 
     @staticmethod
-    def parse_fuzz_mapping(fuzz_mapping: List[
+    def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMapping] | Tuple[str, str] | str) -> List[FuzzyMapping]:
         if isinstance(fuzz_mapping, (tuple, list)):
             assert len(fuzz_mapping) > 0
             if all(isinstance(fm, dict) for fm in fuzz_mapping):
-                fuzz_mapping = [
+                fuzz_mapping = [FuzzyMapping(**fm) for fm in fuzz_mapping]
 
-            if not isinstance(fuzz_mapping[0],
+            if not isinstance(fuzz_mapping[0], FuzzyMapping):
                 assert len(fuzz_mapping) <= 2
                 if len(fuzz_mapping) == 2:
                     assert isinstance(fuzz_mapping[0], str) and isinstance(fuzz_mapping[1], str)
-                    fuzz_mapping = [
+                    fuzz_mapping = [FuzzyMapping(*fuzz_mapping)]
                 elif isinstance(fuzz_mapping[0], str):
-                    fuzz_mapping = [
+                    fuzz_mapping = [FuzzyMapping(fuzz_mapping[0], fuzz_mapping[0])]
         elif isinstance(fuzz_mapping, str):
-            fuzz_mapping = [
-        elif isinstance(fuzz_mapping,
+            fuzz_mapping = [FuzzyMapping(fuzz_mapping, fuzz_mapping)]
+        elif isinstance(fuzz_mapping, FuzzyMapping):
            fuzz_mapping = [fuzz_mapping]
         else:
            raise Exception('No valid join mapping as input')
         return fuzz_mapping
 
-    def __init__(self, join_mapping: List[
+    def __init__(self, join_mapping: List[FuzzyMapping] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
         self.join_mapping = self.parse_fuzz_mapping(join_mapping)
         self.left_select = self.parse_select(left_select)

@@ -463,9 +452,9 @@ class FuzzyMatchInput(JoinInput):
         self.how = how
         for jm in self.join_mapping:
 
-            if jm.right_col not in self.right_select.
+            if jm.right_col not in {v.old_name for v in self.right_select.renames}:
                 self.right_select.append(SelectInput(jm.right_col, keep=False, join_key=True))
-            if jm.left_col not in self.left_select.
+            if jm.left_col not in {v.old_name for v in self.left_select.renames}:
                 self.left_select.append(SelectInput(jm.left_col, keep=False, join_key=True))
         [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
         [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]

@@ -476,7 +465,7 @@ class FuzzyMatchInput(JoinInput):
         return self.left_select.new_cols & self.right_select.new_cols
 
     @property
-    def fuzzy_maps(self) -> List[
+    def fuzzy_maps(self) -> List[FuzzyMapping]:
         """Returns the final fuzzy mappings after applying all column renames."""
         new_mappings = []
         left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
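With the in-package FuzzyMap dataclass removed, parse_fuzz_mapping now normalizes every accepted shape to the external FuzzyMapping type. The shapes below follow the branches in the hunk; the dict keys are an assumption based on the left_col/right_col fields that the validator in setting_generator reads:

    from pl_fuzzy_frame_match.models import FuzzyMapping
    from flowfile_core.schemas.transform_schema import FuzzyMatchInput

    parse = FuzzyMatchInput.parse_fuzz_mapping
    parse("name")                                  # one column, matched to itself
    parse(("first_name", "given_name"))            # (left, right) pair of strings
    parse([{"left_col": "a", "right_col": "b"}])   # dicts -> FuzzyMapping(**fm); keys assumed
    parse(FuzzyMapping("a", "b"))                  # a single mapping is wrapped in a list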
flowfile_core/utils/arrow_reader.py
CHANGED

@@ -138,11 +138,16 @@ def collect_batches(reader: pa.ipc.RecordBatchFileReader, n: int) -> Tuple[List[
     rows_collected = 0
 
     for batch in iter_batches(reader, n, rows_collected):
-
+
         rows_collected += batch.num_rows
         logger.debug(f"Collected batch: total rows now {rows_collected}")
         if rows_collected >= n:
+            if rows_collected > n:
+                batches.append(batch.slice(0, n - (rows_collected - batch.num_rows)))
+            else:
+                batches.append(batch)
             break
+        batches.append(batch)
 
     logger.info(f"Finished collecting {len(batches)} batches with {rows_collected} total rows")
     return batches, rows_collected

@@ -217,7 +222,7 @@ def read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> pa.Table:
 
     table = pa.Table.from_batches(batches)  # type: ignore
     logger.info(f"Successfully read {rows_collected} rows from {file_path}")
-
+    return table
 
 
 def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Callable[[], pa.Table]:

@@ -244,4 +249,4 @@ def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Calla
     >>> table = reader_func()
     """
     logger.info(f"Creating reader function for {file_path} with n={n}, strict={strict}")
-    return lambda: read_top_n(file_path, n, strict)
+    return lambda: read_top_n(file_path, n, strict)
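The collect_batches fix slices the final batch so exactly n rows come back instead of overshooting. The arithmetic, checked in isolation with pyarrow only:

    import pyarrow as pa

    n = 1000
    batch = pa.RecordBatch.from_pydict({"x": list(range(500))})
    rows_collected = 700 + batch.num_rows          # 1200: this batch overshoots n
    keep = n - (rows_collected - batch.num_rows)   # 1000 - 700 = 300 rows still needed
    assert batch.slice(0, keep).num_rows == 300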