Flowfile 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +1 -0
  3. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  4. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  5. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  6. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  7. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  8. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  10. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  11. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  12. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  13. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  14. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  15. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  18. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  19. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  20. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  22. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  23. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  24. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  25. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  26. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  27. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  28. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  29. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  30. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  31. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  32. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  33. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  34. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  35. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  36. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  37. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  38. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  39. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  40. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  44. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  45. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  52. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/RECORD +81 -83
  53. flowfile_core/configs/settings.py +4 -2
  54. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  55. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  56. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  57. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  58. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  59. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  60. flowfile_core/flowfile/flow_graph.py +128 -87
  61. flowfile_core/flowfile/flow_node/flow_node.py +16 -11
  62. flowfile_core/flowfile/flow_node/models.py +0 -2
  63. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  64. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  65. flowfile_core/flowfile/graph_tree/models.py +15 -0
  66. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  67. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  68. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  69. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  70. flowfile_core/flowfile/util/node_skipper.py +8 -0
  71. flowfile_core/schemas/schemas.py +46 -3
  72. flowfile_core/schemas/transform_schema.py +27 -38
  73. flowfile_frame/__init__.py +1 -4
  74. flowfile_frame/flow_frame.py +33 -4
  75. flowfile_frame/flow_frame.pyi +2 -0
  76. flowfile_worker/funcs.py +7 -3
  77. flowfile_worker/models.py +3 -1
  78. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  79. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  80. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  81. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  82. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  83. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  84. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  85. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  86. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
@@ -6,6 +6,8 @@ from dataclasses import dataclass
6
6
  from math import ceil
7
7
  from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
8
8
 
9
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
10
+
9
11
  # Third-party imports
10
12
  from loky import Future
11
13
  import polars as pl
@@ -19,12 +21,12 @@ from pyarrow.parquet import ParquetFile
19
21
  from flowfile_core.configs import logger
20
22
  from flowfile_core.utils.utils import ensure_similarity_dicts
21
23
  from flowfile_core.configs.flow_logger import NodeLogger
22
- from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
23
24
  from flowfile_core.schemas import (
24
25
  cloud_storage_schemas,
25
26
  input_schema,
26
27
  transform_schema as transform_schemas
27
28
  )
29
+ from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
28
30
 
29
31
  # Local imports - Flow File Components
30
32
  from flowfile_core.flowfile.flow_data_engine import utils
@@ -64,6 +66,7 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
64
66
 
65
67
  T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
66
68
 
69
+
67
70
  def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
68
71
  """Temporarily renames join keys to avoid conflicts during a join.
69
72
 
@@ -1563,7 +1566,7 @@ class FlowDataEngine:
1563
1566
  return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
1564
1567
 
1565
1568
  def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
1566
- seed: int = None) -> "FlowDataEngine":
1569
+ seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
1567
1570
  """Gets a sample of rows from the DataFrame.
1568
1571
 
1569
1572
  Args:
@@ -1571,11 +1574,10 @@ class FlowDataEngine:
1571
1574
  random: If True, performs random sampling. If False, takes the first n_rows.
1572
1575
  shuffle: If True (and `random` is True), shuffles the data before sampling.
1573
1576
  seed: A random seed for reproducibility.
1574
-
1577
+ execution_location: Location which is used to calculate the size of the dataframe
1575
1578
  Returns:
1576
1579
  A new `FlowDataEngine` instance containing the sampled data.
1577
1580
  """
1578
- n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
1579
1581
  logging.info(f'Getting sample of {n_rows} rows')
1580
1582
 
1581
1583
  if random:
@@ -1583,12 +1585,17 @@ class FlowDataEngine:
1583
1585
  self.collect_external()
1584
1586
 
1585
1587
  if self.lazy and shuffle:
1586
- sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(n_rows,
1587
- seed=seed,
1588
- shuffle=shuffle)
1588
+ sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
1589
+ .sample(n_rows, seed=seed, shuffle=shuffle))
1589
1590
  elif shuffle:
1590
1591
  sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
1591
1592
  else:
1593
+ if execution_location is None:
1594
+ execution_location = get_global_execution_location()
1595
+ n_rows = min(n_rows, self.get_number_of_records(
1596
+ calculate_in_worker_process=execution_location == "remote")
1597
+ )
1598
+
1592
1599
  every_n_records = ceil(self.number_of_records / n_rows)
1593
1600
  sample_df = self.data_frame.gather_every(every_n_records)
1594
1601
  else:
@@ -1596,7 +1603,7 @@ class FlowDataEngine:
1596
1603
  self.collect(n_rows)
1597
1604
  sample_df = self.data_frame.head(n_rows)
1598
1605
 
1599
- return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
1606
+ return FlowDataEngine(sample_df, schema=self.schema)
1600
1607
 
1601
1608
  def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
1602
1609
  """Gets the first `n_rows` from the DataFrame.
@@ -1650,8 +1657,7 @@ class FlowDataEngine:
1650
1657
  An `ExternalFuzzyMatchFetcher` object that can be used to track the
1651
1658
  progress and retrieve the result of the fuzzy join.
1652
1659
  """
1653
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1654
- fuzzy_match_input=fuzzy_match_input)
1660
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1655
1661
  return ExternalFuzzyMatchFetcher(left_df, right_df,
1656
1662
  fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1657
1663
  file_ref=file_ref + '_fm',
@@ -1659,59 +1665,33 @@ class FlowDataEngine:
1659
1665
  flow_id=flow_id,
1660
1666
  node_id=node_id)
1661
1667
 
1662
- def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1663
- other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
1664
- node_id: int | str = -1) -> "FlowDataEngine":
1665
- """Performs a fuzzy join with another DataFrame.
1666
-
1667
- This method blocks until the fuzzy join operation is complete.
1668
-
1669
- Args:
1670
- fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
1671
- other: The right `FlowDataEngine` to join with.
1672
- file_ref: A reference string for temporary files.
1673
- flow_id: The flow ID for tracking.
1674
- node_id: The node ID for tracking.
1675
-
1676
- Returns:
1677
- A new `FlowDataEngine` instance with the result of the fuzzy join.
1678
- """
1679
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1680
- fuzzy_match_input=fuzzy_match_input)
1681
- f = ExternalFuzzyMatchFetcher(left_df, right_df,
1682
- fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1683
- file_ref=file_ref + '_fm',
1684
- wait_on_completion=True,
1685
- flow_id=flow_id,
1686
- node_id=node_id)
1687
- return FlowDataEngine(f.get_result())
1688
-
1689
- def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
1690
- fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
1691
- """Performs a simple fuzzy match between two DataFrames on a single column pair.
1692
-
1693
- This is a convenience method for a common fuzzy join scenario.
1694
-
1695
- Args:
1696
- right: The right `FlowDataEngine` to match against.
1697
- left_on: The column name from the left DataFrame to match on.
1698
- right_on: The column name from the right DataFrame to match on.
1699
- fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
1700
- threshold: The similarity score threshold (0.0 to 1.0) for a match.
1701
-
1702
- Returns:
1703
- A new `FlowDataEngine` with the matched data.
1704
- """
1705
- fuzzy_match_input = transform_schemas.FuzzyMatchInput(
1706
- [transform_schemas.FuzzyMap(
1707
- left_on, right_on,
1708
- fuzzy_type=fuzzy_method,
1709
- threshold_score=threshold
1710
- )],
1711
- left_select=self.columns,
1712
- right_select=right.columns
1713
- )
1714
- return self.do_fuzzy_join(fuzzy_match_input, right, str(id(self)))
1668
+ def fuzzy_join_external(self,
1669
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1670
+ other: "FlowDataEngine",
1671
+ file_ref: str = None,
1672
+ flow_id: int = -1,
1673
+ node_id: int = -1
1674
+ ):
1675
+ if file_ref is None:
1676
+ file_ref = str(id(self)) + '_' + str(id(other))
1677
+
1678
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1679
+ external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
1680
+ fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1681
+ file_ref=file_ref + '_fm',
1682
+ wait_on_completion=False,
1683
+ flow_id=flow_id,
1684
+ node_id=node_id)
1685
+ return FlowDataEngine(external_tracker.get_result())
1686
+
1687
+ def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1688
+ other: "FlowDataEngine",
1689
+ node_logger: NodeLogger = None) -> "FlowDataEngine":
1690
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1691
+ fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input.fuzzy_maps]
1692
+ return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
1693
+ logger=node_logger.logger if node_logger else logger)
1694
+ .lazy())
1715
1695
 
1716
1696
  def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
1717
1697
  auto_generate_selection: bool, verify_integrity: bool,
@@ -1733,11 +1713,12 @@ class FlowDataEngine:
1733
1713
  Exception: If `verify_integrity` is True and the join would result in
1734
1714
  an excessively large number of records.
1735
1715
  """
1716
+
1736
1717
  self.lazy = True
1718
+
1737
1719
  other.lazy = True
1738
1720
 
1739
1721
  verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
1740
-
1741
1722
  right_select = [v.old_name for v in cross_join_input.right_select.renames
1742
1723
  if (v.keep or v.join_key) and v.is_available]
1743
1724
  left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1746,26 +1727,14 @@ class FlowDataEngine:
1746
1727
  left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
1747
1728
  right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
1748
1729
 
1749
- if verify_integrity:
1750
- n_records = self.get_number_of_records() * other.get_number_of_records()
1751
- if n_records > 1_000_000_000:
1752
- raise Exception("Join will result in too many records, ending process")
1753
- else:
1754
- n_records = -1
1755
-
1756
1730
  joined_df = left.join(right, how='cross')
1757
1731
 
1758
1732
  cols_to_delete_after = [col.new_name for col in
1759
1733
  cross_join_input.left_select.renames + cross_join_input.left_select.renames
1760
1734
  if col.join_key and not col.keep and col.is_available]
1761
1735
 
1762
- if verify_integrity:
1763
- return FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1764
- number_of_records=n_records, streamable=False)
1765
- else:
1766
- fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1767
- number_of_records=0, streamable=False)
1768
- return fl
1736
+ fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
1737
+ return fl
1769
1738
 
1770
1739
  def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1771
1740
  verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
@@ -1901,7 +1870,7 @@ class FlowDataEngine:
1901
1870
  other.number_of_records = -1
1902
1871
  other = other.select_columns(self.columns)
1903
1872
 
1904
- if self.get_number_of_records() != other.get_number_of_records():
1873
+ if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
1905
1874
  raise Exception('Number of records is not equal')
1906
1875
 
1907
1876
  if self.columns != other.columns:
@@ -1937,6 +1906,18 @@ class FlowDataEngine:
1937
1906
  ).result
1938
1907
  return number_of_records
1939
1908
 
1909
+ def get_number_of_records_in_process(self, force_calculate: bool = False):
1910
+ """
1911
+ Get the number of records in the DataFrame in the local process.
1912
+
1913
+ args:
1914
+ force_calculate: If True, forces recalculation even if a value is cached.
1915
+
1916
+ Returns:
1917
+ The total number of records.
1918
+ """
1919
+ return self.get_number_of_records(force_calculate=force_calculate)
1920
+
1940
1921
  def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1941
1922
  calculate_in_worker_process: bool = False) -> int:
1942
1923
  """Gets the total number of records in the DataFrame.
@@ -1956,7 +1937,6 @@ class FlowDataEngine:
1956
1937
  """
1957
1938
  if self.is_future and not self.is_collected:
1958
1939
  return -1
1959
- calculate_in_worker_process = False if not OFFLOAD_TO_WORKER else calculate_in_worker_process
1960
1940
  if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
1961
1941
  if self._number_of_records_callback is not None:
1962
1942
  self._number_of_records_callback(self)
@@ -76,6 +76,67 @@ class FlowfileColumn:
76
76
  self.__sql_type = None
77
77
  self.__perc_unique = None
78
78
 
79
+ def __repr__(self):
80
+ """
81
+ Provides a concise, developer-friendly representation of the object.
82
+ Ideal for debugging and console inspection.
83
+ """
84
+ return (f"FlowfileColumn(name='{self.column_name}', "
85
+ f"type={self.data_type}, "
86
+ f"size={self.size}, "
87
+ f"nulls={self.number_of_empty_values})")
88
+
89
+ def __str__(self):
90
+ """
91
+ Provides a detailed, readable summary of the column's metadata.
92
+ It conditionally omits any attribute that is None, ensuring a clean output.
93
+ """
94
+ # --- Header (Always Shown) ---
95
+ header = f"<FlowfileColumn: '{self.column_name}'>"
96
+ lines = []
97
+
98
+ # --- Core Attributes (Conditionally Shown) ---
99
+ if self.data_type is not None:
100
+ lines.append(f" Type: {self.data_type}")
101
+ if self.size is not None:
102
+ lines.append(f" Non-Nulls: {self.size}")
103
+
104
+ # Calculate and display nulls if possible
105
+ if self.size is not None and self.number_of_empty_values is not None:
106
+ total_entries = self.size + self.number_of_empty_values
107
+ if total_entries > 0:
108
+ null_perc = (self.number_of_empty_values / total_entries) * 100
109
+ null_info = f"{self.number_of_empty_values} ({null_perc:.1f}%)"
110
+ else:
111
+ null_info = "0 (0.0%)"
112
+ lines.append(f" Nulls: {null_info}")
113
+
114
+ if self.number_of_unique_values is not None:
115
+ lines.append(f" Unique: {self.number_of_unique_values}")
116
+
117
+ # --- Conditional Stats Section ---
118
+ stats = []
119
+ if self.min_value is not None:
120
+ stats.append(f" Min: {self.min_value}")
121
+ if self.max_value is not None:
122
+ stats.append(f" Max: {self.max_value}")
123
+ if self.average_value is not None:
124
+ stats.append(f" Mean: {self.average_value}")
125
+
126
+ if stats:
127
+ lines.append(" Stats:")
128
+ lines.extend(stats)
129
+
130
+ # --- Conditional Examples Section ---
131
+ if self.example_values:
132
+ example_str = str(self.example_values)
133
+ # Truncate long example strings for cleaner display
134
+ if len(example_str) > 70:
135
+ example_str = example_str[:67] + '...'
136
+ lines.append(f" Examples: {example_str}")
137
+
138
+ return f"{header}\n" + "\n".join(lines)
139
+
79
140
  @classmethod
80
141
  def create_from_polars_type(cls, polars_type: PlType, **kwargs) -> "FlowfileColumn":
81
142
  for k, v in kwargs.items():
@@ -1,12 +1,49 @@
1
- from flowfile_core.schemas.transform_schema import FuzzyMatchInput
1
+ from flowfile_core.schemas.transform_schema import FuzzyMatchInput, SelectInput, JoinInputs
2
2
  from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
3
3
  import polars as pl
4
- from typing import TYPE_CHECKING, Tuple
4
+ from typing import TYPE_CHECKING, Tuple, List
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
8
8
 
9
9
 
10
+ def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: JoinInputs) -> None:
11
+ """
12
+ Ensure that the select columns in the fuzzy match input match the order of the incoming columns.
13
+ This function modifies the join_inputs object in-place.
14
+
15
+ Returns:
16
+ None
17
+ """
18
+ select_map = {select.new_name: select for select in join_inputs.renames}
19
+ ordered_renames = [select_map[col] for col in col_order if col in select_map]
20
+ join_inputs.renames = ordered_renames
21
+
22
+
23
+ def _ensure_all_columns_have_select(left: "FlowDataEngine",
24
+ right: "FlowDataEngine",
25
+ fuzzy_match_input: FuzzyMatchInput):
26
+ """
27
+ Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
28
+ statements.
29
+ Args:
30
+ left (FlowDataEngine):
31
+ right (FlowDataEngine):
32
+ fuzzy_match_input ():
33
+
34
+ Returns:
35
+ None
36
+ """
37
+ right_cols_in_select = {c.old_name for c in fuzzy_match_input.right_select.renames}
38
+ left_cols_in_select = {c.old_name for c in fuzzy_match_input.left_select.renames}
39
+
40
+ fuzzy_match_input.left_select.renames.extend(
41
+ [SelectInput(col) for col in left.columns if col not in left_cols_in_select])
42
+ fuzzy_match_input.right_select.renames.extend(
43
+ [SelectInput(col) for col in right.columns if col not in right_cols_in_select]
44
+ )
45
+
46
+
10
47
  def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
11
48
  fuzzy_match_input: FuzzyMatchInput) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
12
49
  """
@@ -19,14 +56,18 @@ def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
19
56
  Returns:
20
57
  Tuple[pl.LazyFrame, pl.LazyFrame]: Prepared left and right lazy frames
21
58
  """
22
-
23
59
  left.lazy = True
24
60
  right.lazy = True
61
+ _ensure_all_columns_have_select(left, right, fuzzy_match_input)
62
+ _order_join_inputs_based_on_col_order(left.columns, fuzzy_match_input.left_select)
63
+ _order_join_inputs_based_on_col_order(right.columns, fuzzy_match_input.right_select)
64
+
25
65
  verify_join_select_integrity(fuzzy_match_input, left_columns=left.columns, right_columns=right.columns)
26
66
  if not verify_join_map_integrity(fuzzy_match_input, left_columns=left.schema, right_columns=right.schema):
27
67
  raise Exception('Join is not valid by the data fields')
28
68
  fuzzy_match_input = fuzzy_match_input
29
69
  fuzzy_match_input.auto_rename()
70
+
30
71
  right_select = [v.old_name for v in fuzzy_match_input.right_select.renames if
31
72
  (v.keep or v.join_key) and v.is_available]
32
73
  left_select = [v.old_name for v in fuzzy_match_input.left_select.renames if
@@ -1,6 +1,6 @@
1
1
  from typing import Any, Optional, Literal
2
2
  from pydantic import BaseModel
3
- from flowfile_core.schemas.transform_schema import FuzzyMap
3
+ from pl_fuzzy_frame_match.models import FuzzyMapping
4
4
 
5
5
  OperationType = Literal['store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'store_sample']
6
6
 
@@ -20,8 +20,8 @@ class FuzzyJoinInput(BaseModel):
20
20
  cache_dir: Optional[str] = None
21
21
  left_df_operation: PolarsOperation
22
22
  right_df_operation: PolarsOperation
23
- fuzzy_maps: list[FuzzyMap]
24
- flowfile_node_id: int|str
23
+ fuzzy_maps: list[FuzzyMapping]
24
+ flowfile_node_id: int | str
25
25
  flowfile_flow_id: int
26
26
 
27
27
 
@@ -9,11 +9,12 @@ from uuid import uuid4
9
9
  import polars as pl
10
10
  import requests
11
11
 
12
+ from pl_fuzzy_frame_match.models import FuzzyMapping
13
+
12
14
  from flowfile_core.configs import logger
13
15
  from flowfile_core.configs.settings import WORKER_URL
14
16
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import (
15
17
  FuzzyJoinInput,
16
- FuzzyMap,
17
18
  OperationType,
18
19
  PolarsOperation,
19
20
  Status
@@ -53,7 +54,7 @@ def trigger_sample_operation(lf: pl.LazyFrame, file_ref: str, flow_id: int, node
53
54
 
54
55
 
55
56
  def trigger_fuzzy_match_operation(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
56
- fuzzy_maps: List[FuzzyMap],
57
+ fuzzy_maps: List[FuzzyMapping],
57
58
  file_ref: str,
58
59
  flow_id: int,
59
60
  node_id: int | str) -> Status:
@@ -122,6 +123,8 @@ def results_exists(file_ref: str):
122
123
  return False
123
124
  except requests.RequestException as e:
124
125
  logger.error(f"Failed to check results existence: {str(e)}")
126
+ if "Connection refused" in str(e):
127
+ logger.info("")
125
128
  return False
126
129
 
127
130