Flowfile 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +2 -0
  4. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  5. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  13. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  14. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  15. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  19. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  21. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  24. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  27. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  29. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  31. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  34. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  35. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  37. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  38. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  39. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  40. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  41. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  42. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  43. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  44. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  45. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  46. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  47. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  48. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  49. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  50. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  51. flowfile/web/static/index.html +1 -1
  52. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  53. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
  54. flowfile_core/configs/settings.py +4 -2
  55. flowfile_core/configs/utils.py +5 -0
  56. flowfile_core/database/connection.py +1 -3
  57. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  61. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  64. flowfile_core/flowfile/flow_graph.py +129 -88
  65. flowfile_core/flowfile/flow_node/flow_node.py +30 -15
  66. flowfile_core/flowfile/flow_node/models.py +0 -2
  67. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  68. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  69. flowfile_core/flowfile/graph_tree/models.py +15 -0
  70. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  71. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  72. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  73. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  74. flowfile_core/flowfile/util/node_skipper.py +8 -0
  75. flowfile_core/schemas/schemas.py +46 -3
  76. flowfile_core/schemas/transform_schema.py +27 -38
  77. flowfile_core/utils/arrow_reader.py +8 -3
  78. flowfile_core/utils/validate_setup.py +0 -2
  79. flowfile_frame/__init__.py +1 -4
  80. flowfile_frame/expr.py +14 -0
  81. flowfile_frame/flow_frame.py +34 -5
  82. flowfile_frame/flow_frame.pyi +5 -6
  83. flowfile_worker/funcs.py +7 -3
  84. flowfile_worker/models.py +3 -1
  85. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  86. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  87. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  88. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  89. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  90. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  91. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  92. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  93. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0

flowfile_core/flowfile/graph_tree/graph_tree.py (new file)
@@ -0,0 +1,250 @@
+ from pydantic import BaseModel
+
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+ from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo
+
+
+ def calculate_depth(node_id: int, node_info: dict[int, BranchInfo], visited: set = None) -> int:
+     """Calculates the depth of each node."""
+
+     if visited is None:
+         visited = set()
+     if node_id in visited:
+         return node_info[node_id].depth
+     visited.add(node_id)
+
+     max_input_depth = -1
+     inputs = node_info[node_id].inputs
+
+     for main_id in inputs.main:
+         max_input_depth = max(max_input_depth, calculate_depth(main_id, node_info, visited))
+     if inputs.left:
+         max_input_depth = max(max_input_depth, calculate_depth(inputs.left, node_info, visited))
+     if inputs.right:
+         max_input_depth = max(max_input_depth, calculate_depth(inputs.right, node_info, visited))
+
+     node_info[node_id].depth = max_input_depth + 1
+     return node_info[node_id].depth
+
+
+ # Trace paths from each root
+ def trace_path(node_id: int, node_info: dict[int, BranchInfo], merge_points: dict[int, list[int]],
+                current_path: list[int] | None = None):
+     """Define the trace of each node path"""
+     if current_path is None:
+         current_path = []
+
+     current_path = current_path + [node_id]
+     outputs = node_info[node_id].outputs
+
+     if not outputs:
+         # End of path
+         return [current_path]
+
+     # If this node has multiple outputs or connects to a merge point, branch
+     all_paths = []
+     for output_id in outputs:
+         if output_id in merge_points and len(merge_points[output_id]) > 1:
+             # This is a merge point, end this path here
+             all_paths.append(current_path + [output_id])
+         else:
+             # Continue the path
+             all_paths.extend(trace_path(output_id, node_info, merge_points, current_path))
+     return all_paths
+
+
+ def build_node_info(nodes: list[FlowNode]) -> dict[int, BranchInfo]:
+     """Builds node information used to construct the graph tree."""
+
+     node_info = {}
+     for node in nodes:
+         node_id = node.node_id
+
+         # Get node label
+         operation = node.node_type.replace("_", " ").title() if node.node_type else "Unknown"
+         label = f"{operation} (id={node_id})"
+         if hasattr(node, 'setting_input') and hasattr(node.setting_input, 'description'):
+             if node.setting_input.description:
+                 desc = node.setting_input.description
+                 if len(desc) > 20: # Truncate long descriptions
+                     desc = desc[:17] + "..."
+                 label = f"{operation} ({node_id}): {desc}"
+
+         # Get inputs and outputs
+         inputs = InputInfo(
+             main=[n.node_id for n in (node.node_inputs.main_inputs or [])],
+             left=node.node_inputs.left_input.node_id if node.node_inputs.left_input else None,
+             right=node.node_inputs.right_input.node_id if node.node_inputs.right_input else None
+         )
+         outputs = [n.node_id for n in node.leads_to_nodes]
+
+         node_info[node_id] = BranchInfo(
+             label=label,
+             short_label=f"{operation} ({node_id})",
+             inputs=inputs,
+             outputs=outputs,
+             depth=0
+         )
+
+     return node_info
+
+
+ def group_nodes_by_depth(node_info: dict[int, BranchInfo]) -> tuple[dict[int, list[int]], int]:
+     """Groups each node by depth"""
+     depth_groups = {}
+     max_depth = 0
+     for node_id, info in node_info.items():
+         depth = info.depth
+         max_depth = max(max_depth, depth)
+         if depth not in depth_groups:
+             depth_groups[depth] = []
+         depth_groups[depth].append(node_id)
+
+     return depth_groups, max_depth
+
+
+ def define_node_connections(node_info: dict[int, BranchInfo]) -> dict[int, list[int]]:
+     """Defines node connections to merge"""
+     merge_points = {} # target_id -> list of source_ids
+     for node_id, info in node_info.items():
+         for output_id in info.outputs:
+             if output_id not in merge_points:
+                 merge_points[output_id] = []
+             merge_points[output_id].append(node_id)
+
+     return merge_points
+
+
+ def build_flow_paths(node_info: dict[int, BranchInfo], flow_starts: list[FlowNode],
+                      merge_points: dict[int, list[int]]):
+     """Build the flow paths to be drawn"""
+
+
+     # Find all root nodes (no inputs)
+     root_nodes = [nid for nid, info in node_info.items()
+                   if not info.inputs.main and not info.inputs.left and not info.inputs.right]
+
+     if not root_nodes and flow_starts:
+         root_nodes = [n.node_id for n in flow_starts]
+     paths = [] # List of paths through the graph
+
+     # Get all paths
+     for root_id in root_nodes:
+         paths.extend(trace_path(root_id, node_info, merge_points))
+
+     return paths
+
+
+ def group_paths(paths: list, merge_points: dict):
+     """Groups each node path."""
+     paths_by_merge = {}
+     standalone_paths = []
+
+     for path in paths:
+         if len(path) > 1 and path[-1] in merge_points and len(merge_points[path[-1]]) > 1:
+             merge_id = path[-1]
+             if merge_id not in paths_by_merge:
+                 paths_by_merge[merge_id] = []
+             paths_by_merge[merge_id].append(path)
+         else:
+             standalone_paths.append(path)
+     return paths_by_merge, standalone_paths
+
+
+ def draw_merged_paths(node_info: dict[int, BranchInfo],
+                       merge_points: dict[int, list[int]],
+                       paths_by_merge: dict[int, list[list[int]]],
+                       merge_drawn: set,
+                       drawn_nodes: set,
+                       lines: list[str]):
+     """Draws paths for each node that merges."""
+     for merge_id, merge_paths in paths_by_merge.items():
+         if merge_id in merge_drawn:
+             continue
+         merge_info = node_info[merge_id]
+         sources = merge_points[merge_id]
+
+         # Draw each source path leading to the merge
+         for i, source_id in enumerate(sources):
+             # Find the path containing this source
+             source_path = None
+             for path in merge_paths:
+                 if source_id in path:
+                     source_path = path[:path.index(source_id) + 1]
+                     break
+
+             if source_path:
+                 # Build the line for this path
+                 line_parts = []
+                 for j, nid in enumerate(source_path):
+                     if j == 0:
+                         line_parts.append(node_info[nid].label)
+                     else:
+                         line_parts.append(f" ──> {node_info[nid].short_label}")
+
+                 # Add the merge arrow
+                 if i == 0:
+                     # First source
+                     line = "".join(line_parts) + " ─────┐"
+                     lines.append(line)
+                 elif i == len(sources) - 1:
+                     # Last source
+                     line = "".join(line_parts) + " ─────┴──> " + merge_info.label
+                     lines.append(line)
+
+                     # Continue with the rest of the path after merge
+                     remaining = node_info[merge_id].outputs
+                     while remaining:
+                         next_id = remaining[0]
+                         lines[-1] += f" ──> {node_info[next_id].label}"
+                         remaining = node_info[next_id].outputs
+                         drawn_nodes.add(next_id)
+                 else:
+                     # Middle sources
+                     line = "".join(line_parts) + " ─────┤"
+                     lines.append(line)
+
+                 for nid in source_path:
+                     drawn_nodes.add(nid)
+
+         drawn_nodes.add(merge_id)
+         merge_drawn.add(merge_id)
+         lines.append("") # Add spacing between merge groups
+     return paths_by_merge
+
+
+ def draw_standalone_paths(drawn_nodes: set[int], standalone_paths: list[list[int]], lines: list[str],
+                           node_info: dict[int, BranchInfo]):
+     """ Draws paths that do not merge."""
+     # Draw standalone paths
+     for path in standalone_paths:
+         if all(nid in drawn_nodes for nid in path):
+             continue
+
+         line_parts = []
+         for i, node_id in enumerate(path):
+             if node_id not in drawn_nodes:
+                 if i == 0:
+                     line_parts.append(node_info[node_id].label)
+                 else:
+                     line_parts.append(f" ──> {node_info[node_id].short_label}")
+                 drawn_nodes.add(node_id)
+
+         if line_parts:
+             lines.append("".join(line_parts))
+
+
+ def add_un_drawn_nodes(drawn_nodes: set[int], node_info: dict[int, BranchInfo], lines: list[str]):
+     """Adds isolated nodes if exists."""
+     # Add any remaining undrawn nodes
+
+     for node_id in node_info:
+         if node_id not in drawn_nodes:
+             lines.append(node_info[node_id].label + " (isolated)")
+
+     lines.append("")
+     lines.append("=" * 80)
+     lines.append("Execution Order")
+     lines.append("=" * 80)

flowfile_core/flowfile/graph_tree/models.py (new file)
@@ -0,0 +1,15 @@
+ from pydantic import BaseModel
+
+
+ class InputInfo(BaseModel):
+     main: list[int]
+     right: int | None = None
+     left: int | None = None
+
+
+ class BranchInfo(BaseModel):
+     label: str
+     short_label: str
+     inputs: InputInfo
+     outputs: list[int]
+     depth: int
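
Note: a quick way to see how the two new graph_tree modules fit together is to build the node_info mapping by hand and let calculate_depth walk it. A minimal sketch, assuming flowfile 0.3.9 is installed and the module paths match the file listing above; the node ids and labels are invented for illustration.

    from flowfile_core.flowfile.graph_tree.graph_tree import calculate_depth
    from flowfile_core.flowfile.graph_tree.models import BranchInfo, InputInfo

    # Hypothetical three-node flow: two readers feeding one join.
    node_info = {
        1: BranchInfo(label="Read (id=1)", short_label="Read (1)",
                      inputs=InputInfo(main=[]), outputs=[3], depth=0),
        2: BranchInfo(label="Read (id=2)", short_label="Read (2)",
                      inputs=InputInfo(main=[]), outputs=[3], depth=0),
        3: BranchInfo(label="Join (id=3)", short_label="Join (3)",
                      inputs=InputInfo(main=[], left=1, right=2), outputs=[], depth=0),
    }
    for node_id in node_info:
        calculate_depth(node_id, node_info)

    assert node_info[1].depth == 0 and node_info[2].depth == 0
    assert node_info[3].depth == 1  # one level below its deepest input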

flowfile_core/flowfile/manage/compatibility_enhancements.py
@@ -48,7 +48,7 @@ def ensure_compatibility(flow_storage_obj: schemas.FlowInformation, flow_path: s
          setattr(flow_storage_obj, 'flow_settings', flow_settings)
          flow_storage_obj = schemas.FlowInformation.model_validate(flow_storage_obj)
      elif not hasattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location'):
-         setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location', 'auto')
+         setattr(getattr(flow_storage_obj, 'flow_settings'), 'execution_location', "remote")
      elif not hasattr(flow_storage_obj.flow_settings, 'is_running'):
          setattr(flow_storage_obj.flow_settings, 'is_running', False)
          setattr(flow_storage_obj.flow_settings, 'is_canceled', False)

flowfile_core/flowfile/schema_callbacks.py (renamed from flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py)
@@ -1,25 +1,72 @@
  
  from typing import List
- from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
- from flowfile_core.schemas import transform_schema
- from flowfile_core.schemas import input_schema
+
  from polars import datatypes
  import polars as pl
+
+ from pl_fuzzy_frame_match.output_column_name_utils import set_name_in_fuzzy_mappings
+ from pl_fuzzy_frame_match.pre_process import rename_fuzzy_right_mapping
+
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import fetch_unique_values
  from flowfile_core.configs.flow_logger import main_logger
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn, PlType
+ from flowfile_core.schemas import transform_schema
+ from flowfile_core.schemas import input_schema
+
+
+ def _ensure_all_columns_have_select(left_cols: List[str],
+                                     right_cols: List[str],
+                                     fuzzy_match_input: transform_schema.FuzzyMatchInput):
+     """
+     Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
+     statements.
+     Args:
+         left_cols (List[str]): List of column names in the left FlowDataEngine.
+         right_cols (List[str]): List of column names in the right FlowDataEngine.
+         fuzzy_match_input (FuzzyMatchInput): Fuzzy match input configuration containing select statements.
+
+     Returns:
+         None
+     """
+     right_cols_in_select = {c.old_name for c in fuzzy_match_input.right_select.renames}
+     left_cols_in_select = {c.old_name for c in fuzzy_match_input.left_select.renames}
  
+     fuzzy_match_input.left_select.renames.extend(
+         [transform_schema.SelectInput(col) for col in left_cols if col not in left_cols_in_select])
+     fuzzy_match_input.right_select.renames.extend(
+         [transform_schema.SelectInput(col) for col in right_cols if col not in right_cols_in_select]
+     )
  
- def calculate_uniqueness(a: float, b: float) -> float:
-     return ((pow(a + 0.5, 2) + pow(b + 0.5, 2)) / 2 - pow(0.5, 2)) + 0.5 * abs(a - b)
+
+ def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: transform_schema.JoinInputs) -> None:
+     """
+     Ensure that the select columns in the fuzzy match input match the order of the incoming columns.
+     This function modifies the join_inputs object in-place.
+
+     Returns:
+         None
+     """
+     select_map = {select.new_name: select for select in join_inputs.renames}
+     ordered_renames = [select_map[col] for col in col_order if col in select_map]
+     join_inputs.renames = ordered_renames
  
  
  def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
                                   left_schema: List[FlowfileColumn],
                                   right_schema: List[FlowfileColumn]):
-     print('calculating fuzzy match schema')
+     _ensure_all_columns_have_select(left_cols=[col.column_name for col in left_schema],
+                                     right_cols=[col.column_name for col in right_schema],
+                                     fuzzy_match_input=fm_input)
+     _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in left_schema],
+                                           join_inputs=fm_input.left_select)
+     _order_join_inputs_based_on_col_order(col_order=[col.column_name for col in right_schema],
+                                           join_inputs=fm_input.right_select)
      left_schema_dict, right_schema_dict = ({ls.name: ls for ls in left_schema}, {rs.name: rs for rs in right_schema})
      fm_input.auto_rename()
  
+     right_renames = {column.old_name: column.new_name for column in fm_input.right_select.renames}
+     new_join_mapping = rename_fuzzy_right_mapping(fm_input.join_mapping, right_renames)
+
      output_schema = []
      for column in fm_input.left_select.renames:
          column_schema = left_schema_dict.get(column.old_name)
@@ -31,9 +78,9 @@ def calculate_fuzzy_match_schema(fm_input: transform_schema.FuzzyMatchInput,
          if column_schema and column.keep:
              output_schema.append(FlowfileColumn.from_input(column.new_name, column_schema.data_type,
                                                             example_values=column_schema.example_values))
-
-     for i, fm in enumerate(fm_input.join_mapping):
-         output_schema.append(FlowfileColumn.from_input(f'fuzzy_score_{i}', 'Float64'))
+     set_name_in_fuzzy_mappings(new_join_mapping)
+     output_schema.extend([FlowfileColumn.from_input(fuzzy_mapping.output_column_name, 'Float64')
+                           for fuzzy_mapping in new_join_mapping])
      return output_schema
  
  
@@ -71,7 +118,8 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
      val_column_schema = get_schema_of_column(node_input_schema, pivot_input.value_col)
      if output_fields is not None and len(output_fields) > 0:
          return index_columns_schema+[FlowfileColumn(PlType(column_name=output_field.name,
-                                                            pl_datatype=output_field.data_type)) for output_field in output_fields]
+                                                            pl_datatype=output_field.data_type)) for output_field in
+                                      output_fields]
  
      else:
          max_unique_vals = 200
@@ -84,7 +132,11 @@ def pre_calculate_pivot_schema(node_input_schema: List[FlowfileColumn],
                               f' Max unique values: {max_unique_vals}')
          pl_output_fields = []
          for val in unique_vals:
-             for agg in pivot_input.aggregations:
-                 output_type = get_output_data_type_pivot(val_column_schema, agg)
-                 pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
+             if len(pivot_input.aggregations) == 1:
+                 output_type = get_output_data_type_pivot(val_column_schema, pivot_input.aggregations[0])
+                 pl_output_fields.append(PlType(column_name=str(val), pl_datatype=output_type))
+             else:
+                 for agg in pivot_input.aggregations:
+                     output_type = get_output_data_type_pivot(val_column_schema, agg)
+                     pl_output_fields.append(PlType(column_name=f'{val}_{agg}', pl_datatype=output_type))
      return index_columns_schema + [FlowfileColumn(pl_output_field) for pl_output_field in pl_output_fields]
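
Note: the last hunk changes pivot output naming. With exactly one aggregation the unique value itself becomes the column name; with several, each column keeps the value_agg suffix as before. A standalone sketch of the naming rule (values are illustrative; this mirrors the branching above rather than calling the actual helper):

    def pivot_output_names(unique_vals: list, aggregations: list) -> list:
        # Mirrors the branching introduced in pre_calculate_pivot_schema.
        if len(aggregations) == 1:
            return [str(val) for val in unique_vals]
        return [f'{val}_{agg}' for val in unique_vals for agg in aggregations]

    print(pivot_output_names(["NL", "US"], ["sum"]))        # ['NL', 'US']
    print(pivot_output_names(["NL", "US"], ["sum", "max"])) # ['NL_sum', 'NL_max', 'US_sum', 'US_max']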

flowfile_core/flowfile/setting_generator/settings.py
@@ -4,6 +4,7 @@ from typing import Callable, Iterable
  from functools import wraps
  from flowfile_core.schemas.output_model import NodeData
  from flowfile_core.flowfile.setting_generator.setting_generator import SettingGenerator, SettingUpdator
+ from pl_fuzzy_frame_match.models import FuzzyMapping
  
  setting_generator = SettingGenerator()
  setting_updator = SettingUpdator()
@@ -135,7 +136,7 @@ def cross_join(node_data: NodeData):
  
  
  def check_if_fuzzy_match_is_valid(left_columns: Iterable[str], right_columns: Iterable[str],
-                                   fuzzy_map: transform_schema.FuzzyMap) -> bool:
+                                   fuzzy_map: FuzzyMapping) -> bool:
      if fuzzy_map.left_col not in left_columns:
          return False
      if fuzzy_map.right_col not in right_columns:

flowfile_core/flowfile/util/execution_orderer.py
@@ -2,6 +2,15 @@ from typing import List, Dict, Set
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
  from flowfile_core.configs import logger
  from collections import deque, defaultdict
+ from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip
+
+ def compute_execution_plan(nodes: List[FlowNode], flow_starts: List[FlowNode] = None):
+     """ Computes the execution order after finding the nodes to skip on the execution step."""
+     skip_nodes = determine_nodes_to_skip(nodes=nodes)
+     computed_execution_order = determine_execution_order(all_nodes=[node for node in nodes if node not in skip_nodes],
+                                                          flow_starts=flow_starts)
+     return skip_nodes, computed_execution_order
+
  
  
  def determine_execution_order(all_nodes: List[FlowNode], flow_starts: List[FlowNode] = None) -> List[FlowNode]:

flowfile_core/flowfile/util/node_skipper.py (new file)
@@ -0,0 +1,8 @@
+ from typing import List
+ from flowfile_core.flowfile.flow_node.flow_node import FlowNode
+
+ def determine_nodes_to_skip(nodes : List[FlowNode]) -> List[FlowNode]:
+     """ Finds nodes to skip on the execution step. """
+     skip_nodes = [node for node in nodes if not node.is_correct]
+     skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
+     return skip_nodes
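
Note: the skip rule is deliberately simple: a node whose settings are invalid is skipped, and so are its direct successors (the propagation is a single hop). Since determine_nodes_to_skip only reads is_correct and leads_to_nodes, its behaviour can be sketched with stand-in objects, no real FlowNode required:

    from dataclasses import dataclass, field
    from flowfile_core.flowfile.util.node_skipper import determine_nodes_to_skip

    @dataclass
    class StubNode:
        # Stand-in exposing only the attributes determine_nodes_to_skip reads.
        name: str
        is_correct: bool = True
        leads_to_nodes: list = field(default_factory=list)

    write = StubNode("write")
    bad_filter = StubNode("filter", is_correct=False, leads_to_nodes=[write])
    read = StubNode("read", leads_to_nodes=[bad_filter])

    skipped = determine_nodes_to_skip([read, bad_filter, write])
    print([n.name for n in skipped])  # ['filter', 'write']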

flowfile_core/schemas/schemas.py
@@ -1,8 +1,35 @@
  from typing import Optional, List, Dict, Tuple, Any, Literal, Annotated
  from pydantic import BaseModel, field_validator, ConfigDict, Field, StringConstraints
  from flowfile_core.flowfile.utils import create_unique_id
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
  ExecutionModeLiteral = Literal['Development', 'Performance']
- ExecutionLocationsLiteral = Literal['auto', 'local', 'remote']
+ ExecutionLocationsLiteral = Literal['local', 'remote']
+
+
+ def get_global_execution_location() -> ExecutionLocationsLiteral:
+     """
+     Calculates the default execution location based on the global settings
+     Returns
+     -------
+     ExecutionLocationsLiteral where the current
+     """
+     if OFFLOAD_TO_WORKER:
+         return "remote"
+     return "local"
+
+
+ def is_valid_execution_location_in_current_global_settings(execution_location: ExecutionLocationsLiteral) -> bool:
+     return not (get_global_execution_location() == "local" and execution_location == "remote")
+
+
+ def get_prio_execution_location(local_execution_location: ExecutionLocationsLiteral,
+                                 global_execution_location: ExecutionLocationsLiteral) -> ExecutionLocationsLiteral:
+     if local_execution_location == global_execution_location:
+         return local_execution_location
+     elif global_execution_location == "local" and local_execution_location == "remote":
+         return "local"
+     else:
+         return local_execution_location
  
  
  class FlowGraphConfig(BaseModel):
@@ -16,7 +43,7 @@ class FlowGraphConfig(BaseModel):
          name (str): The name of the flow.
          path (str): The file path associated with the flow.
          execution_mode (ExecutionModeLiteral): The mode of execution ('Development' or 'Performance').
-         execution_location (ExecutionLocationsLiteral): The location for execution ('auto', 'local', 'remote').
+         execution_location (ExecutionLocationsLiteral): The location for execution ('local', 'remote').
      """
      flow_id: int = Field(default_factory=create_unique_id, description="Unique identifier for the flow.")
      description: Optional[str] = None
@@ -24,7 +51,23 @@ class FlowGraphConfig(BaseModel):
      name: str = ''
      path: str = ''
      execution_mode: ExecutionModeLiteral = 'Performance'
-     execution_location: ExecutionLocationsLiteral = "auto"
+     execution_location: ExecutionLocationsLiteral = Field(default_factory=get_global_execution_location)
+
+     @field_validator('execution_location', mode='before')
+     def validate_and_set_execution_location(cls, v: Optional[ExecutionLocationsLiteral]) -> ExecutionLocationsLiteral:
+         """
+         Validates and sets the execution location.
+         1. **If `None` is provided**: It defaults to the location determined by global settings.
+         2. **If a value is provided**: It checks if the value is compatible with the global
+            settings. If not (e.g., requesting 'remote' when only 'local' is possible),
+            it corrects the value to a compatible one.
+         """
+         if v is None:
+             return get_global_execution_location()
+         if v == "auto":
+             return get_global_execution_location()
+
+         return get_prio_execution_location(v, get_global_execution_location())
  
  
  class FlowSettings(FlowGraphConfig):
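
Note: taken together, these helpers retire 'auto': a missing or 'auto' value falls back to the global default, and a requested 'remote' is downgraded to 'local' when the global settings cannot offload to a worker. A small sketch, assuming the module is importable as flowfile_core.schemas.schemas per the file listing:

    from flowfile_core.schemas.schemas import get_prio_execution_location

    print(get_prio_execution_location("remote", "remote"))  # 'remote'
    print(get_prio_execution_location("local", "remote"))   # 'local' (local preference wins)
    print(get_prio_execution_location("remote", "local"))   # 'local' (downgraded: no worker available)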

flowfile_core/schemas/transform_schema.py
@@ -6,6 +6,8 @@ from copy import deepcopy
  
  from typing import NamedTuple
  
+ from pl_fuzzy_frame_match.models import FuzzyMapping
+
  
  def get_func_type_mapping(func: str):
      """Infers the output data type of common aggregation functions."""
@@ -158,6 +160,19 @@ class SelectInputs:
          """Gets a list of original column names to select from the source DataFrame."""
          return [v.old_name for v in self.renames if v.keep or (v.join_key and include_join_key)]
  
+     def has_drop_cols(self) -> bool:
+         """Checks if any column is marked to be dropped from the selection."""
+         return any(not v.keep for v in self.renames)
+
+     @property
+     def drop_columns(self) -> List[SelectInput]:
+         """Returns a list of column names that are marked to be dropped from the selection."""
+         return [v for v in self.renames if not v.keep and v.is_available]
+
+     @property
+     def non_jk_drop_columns(self) -> List[SelectInput]:
+         return [v for v in self.renames if not v.keep and v.is_available and not v.join_key]
+
      def __add__(self, other: "SelectInput"):
          """Allows adding a SelectInput using the '+' operator."""
          self.renames.append(other)
@@ -225,32 +240,6 @@ class JoinMap:
      right_col: str
  
  
- @dataclass
- class FuzzyMap(JoinMap):
-     """Extends `JoinMap` with settings for fuzzy string matching, such as the algorithm and similarity threshold."""
-     threshold_score: Optional[float] = 80.0
-     fuzzy_type: Optional[FuzzyTypeLiteral] = 'levenshtein'
-     perc_unique: Optional[float] = 0.0
-     output_column_name: Optional[str] = None
-     valid: Optional[bool] = True
-
-     def __init__(self, left_col: str, right_col: str = None, threshold_score: float = 80.0,
-                  fuzzy_type: FuzzyTypeLiteral = 'levenshtein', perc_unique: float = 0, output_column_name: str = None,
-                  _output_col_name: str = None, valid: bool = True, output_col_name: str = None):
-         if right_col is None:
-             right_col = left_col
-         self.valid = valid
-         self.left_col = left_col
-         self.right_col = right_col
-         self.threshold_score = threshold_score
-         self.fuzzy_type = fuzzy_type
-         self.perc_unique = perc_unique
-         self.output_column_name = output_column_name if output_column_name is not None else _output_col_name
-         self.output_column_name = self.output_column_name if self.output_column_name is not None else output_col_name
-         if self.output_column_name is None:
-             self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
-
-
  class JoinSelectMixin:
      """A mixin providing common methods for join-like operations that involve left and right inputs."""
      left_select: JoinInputs = None
@@ -430,32 +419,32 @@ class JoinInput(JoinSelectMixin):
  @dataclass
  class FuzzyMatchInput(JoinInput):
      """Extends `JoinInput` with settings specific to fuzzy matching, such as the matching algorithm and threshold."""
-     join_mapping: List[FuzzyMap]
+     join_mapping: List[FuzzyMapping]
      aggregate_output: bool = False
  
      @staticmethod
-     def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMap] | Tuple[str, str] | str) -> List[FuzzyMap]:
+     def parse_fuzz_mapping(fuzz_mapping: List[FuzzyMapping] | Tuple[str, str] | str) -> List[FuzzyMapping]:
          if isinstance(fuzz_mapping, (tuple, list)):
              assert len(fuzz_mapping) > 0
              if all(isinstance(fm, dict) for fm in fuzz_mapping):
-                 fuzz_mapping = [FuzzyMap(**fm) for fm in fuzz_mapping]
+                 fuzz_mapping = [FuzzyMapping(**fm) for fm in fuzz_mapping]
  
-             if not isinstance(fuzz_mapping[0], FuzzyMap):
+             if not isinstance(fuzz_mapping[0], FuzzyMapping):
                  assert len(fuzz_mapping) <= 2
                  if len(fuzz_mapping) == 2:
                      assert isinstance(fuzz_mapping[0], str) and isinstance(fuzz_mapping[1], str)
-                     fuzz_mapping = [FuzzyMap(*fuzz_mapping)]
+                     fuzz_mapping = [FuzzyMapping(*fuzz_mapping)]
                  elif isinstance(fuzz_mapping[0], str):
-                     fuzz_mapping = [FuzzyMap(fuzz_mapping[0], fuzz_mapping[0])]
+                     fuzz_mapping = [FuzzyMapping(fuzz_mapping[0], fuzz_mapping[0])]
          elif isinstance(fuzz_mapping, str):
-             fuzz_mapping = [FuzzyMap(fuzz_mapping, fuzz_mapping)]
-         elif isinstance(fuzz_mapping, FuzzyMap):
+             fuzz_mapping = [FuzzyMapping(fuzz_mapping, fuzz_mapping)]
+         elif isinstance(fuzz_mapping, FuzzyMapping):
              fuzz_mapping = [fuzz_mapping]
          else:
              raise Exception('No valid join mapping as input')
          return fuzz_mapping
  
-     def __init__(self, join_mapping: List[FuzzyMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
+     def __init__(self, join_mapping: List[FuzzyMapping] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
                   right_select: List[SelectInput] | List[str], aggregate_output: bool = False, how: JoinStrategy = 'inner'):
          self.join_mapping = self.parse_fuzz_mapping(join_mapping)
          self.left_select = self.parse_select(left_select)
@@ -463,9 +452,9 @@ class FuzzyMatchInput(JoinInput):
          self.how = how
          for jm in self.join_mapping:
  
-             if jm.right_col not in self.right_select.old_cols:
+             if jm.right_col not in {v.old_name for v in self.right_select.renames}:
                  self.right_select.append(SelectInput(jm.right_col, keep=False, join_key=True))
-             if jm.left_col not in self.left_select.old_cols:
+             if jm.left_col not in {v.old_name for v in self.left_select.renames}:
                  self.left_select.append(SelectInput(jm.left_col, keep=False, join_key=True))
          [setattr(v, "join_key", v.old_name in self._left_join_keys) for v in self.left_select.renames]
          [setattr(v, "join_key", v.old_name in self._right_join_keys) for v in self.right_select.renames]
@@ -476,7 +465,7 @@ class FuzzyMatchInput(JoinInput):
          return self.left_select.new_cols & self.right_select.new_cols
  
      @property
-     def fuzzy_maps(self) -> List[FuzzyMap]:
+     def fuzzy_maps(self) -> List[FuzzyMapping]:
          """Returns the final fuzzy mappings after applying all column renames."""
          new_mappings = []
          left_rename_table, right_rename_table = self.left_select.rename_table, self.right_select.rename_table
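
Note: callers that previously built transform_schema.FuzzyMap now pass pl_fuzzy_frame_match.models.FuzzyMapping; the shorthand forms accepted by parse_fuzz_mapping are unchanged. A hedged sketch (column names are invented, and FuzzyMapping is assumed to take the same positional left/right columns the parser relies on above):

    from flowfile_core.schemas import transform_schema
    from pl_fuzzy_frame_match.models import FuzzyMapping

    # Tuple shorthand: parse_fuzz_mapping turns ("name", "company_name")
    # into [FuzzyMapping("name", "company_name")].
    fm_input = transform_schema.FuzzyMatchInput(
        join_mapping=("name", "company_name"),
        left_select=["name", "city"],
        right_select=["company_name", "country"],
    )

    # Equivalent explicit form:
    fm_input = transform_schema.FuzzyMatchInput(
        join_mapping=[FuzzyMapping("name", "company_name")],
        left_select=["name", "city"],
        right_select=["company_name", "country"],
    )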

flowfile_core/utils/arrow_reader.py
@@ -138,11 +138,16 @@ def collect_batches(reader: pa.ipc.RecordBatchFileReader, n: int) -> Tuple[List[
      rows_collected = 0
  
      for batch in iter_batches(reader, n, rows_collected):
-         batches.append(batch)
+
          rows_collected += batch.num_rows
          logger.debug(f"Collected batch: total rows now {rows_collected}")
          if rows_collected >= n:
+             if rows_collected > n:
+                 batches.append(batch.slice(0, n - (rows_collected - batch.num_rows)))
+             else:
+                 batches.append(batch)
              break
+         batches.append(batch)
  
      logger.info(f"Finished collecting {len(batches)} batches with {rows_collected} total rows")
      return batches, rows_collected
@@ -217,7 +222,7 @@ def read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> pa.Table:
  
      table = pa.Table.from_batches(batches) # type: ignore
      logger.info(f"Successfully read {rows_collected} rows from {file_path}")
-     return table
+     return table
  
  
  def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Callable[[], pa.Table]:
@@ -244,4 +249,4 @@ def get_read_top_n(file_path: str, n: int = 1000, strict: bool = False) -> Calla
      >>> table = reader_func()
      """
      logger.info(f"Creating reader function for {file_path} with n={n}, strict={strict}")
-     return lambda: read_top_n(file_path, n, strict)
+     return lambda: read_top_n(file_path, n, strict)
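
Note: the collect_batches change fixes an over-read: previously every batch was appended whole, so the result could exceed n rows; now the final batch is sliced so the collected batches hold exactly n. The slice arithmetic in isolation (pyarrow only; numbers are illustrative):

    import pyarrow as pa

    n = 10
    rows_before = 8                                             # rows collected before the final batch
    batch = pa.record_batch([pa.array(range(6))], names=["x"])  # a 6-row batch arrives

    rows_collected = rows_before + batch.num_rows               # 14, overshoots n
    still_needed = n - (rows_collected - batch.num_rows)        # 10 - 8 = 2
    trimmed = batch.slice(0, still_needed)
    print(trimmed.num_rows)                                     # 2 -> totals land exactly on n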