Flowfile 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +1 -0
  3. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  4. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  5. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  6. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  7. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  8. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  10. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  11. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  12. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  13. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  14. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  15. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  18. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  19. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  20. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  22. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  23. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  24. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  25. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  26. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  27. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  28. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  29. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  30. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  31. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  32. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  33. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  34. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  35. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  36. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  37. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  38. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  39. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  40. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  44. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  45. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  52. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/RECORD +81 -83
  53. flowfile_core/configs/settings.py +4 -2
  54. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  55. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  56. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  57. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  58. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  59. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  60. flowfile_core/flowfile/flow_graph.py +128 -87
  61. flowfile_core/flowfile/flow_node/flow_node.py +16 -11
  62. flowfile_core/flowfile/flow_node/models.py +0 -2
  63. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  64. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  65. flowfile_core/flowfile/graph_tree/models.py +15 -0
  66. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  67. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  68. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  69. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  70. flowfile_core/flowfile/util/node_skipper.py +8 -0
  71. flowfile_core/schemas/schemas.py +46 -3
  72. flowfile_core/schemas/transform_schema.py +27 -38
  73. flowfile_frame/__init__.py +1 -4
  74. flowfile_frame/flow_frame.py +33 -4
  75. flowfile_frame/flow_frame.pyi +2 -0
  76. flowfile_worker/funcs.py +7 -3
  77. flowfile_worker/models.py +3 -1
  78. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  79. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  80. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  81. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  82. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  83. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  84. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  85. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  86. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
@@ -6,6 +6,8 @@ from dataclasses import dataclass
6
6
  from math import ceil
7
7
  from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
8
8
 
9
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
10
+
9
11
  # Third-party imports
10
12
  from loky import Future
11
13
  import polars as pl
@@ -19,12 +21,12 @@ from pyarrow.parquet import ParquetFile
19
21
  from flowfile_core.configs import logger
20
22
  from flowfile_core.utils.utils import ensure_similarity_dicts
21
23
  from flowfile_core.configs.flow_logger import NodeLogger
22
- from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
23
24
  from flowfile_core.schemas import (
24
25
  cloud_storage_schemas,
25
26
  input_schema,
26
27
  transform_schema as transform_schemas
27
28
  )
29
+ from flowfile_core.schemas.schemas import ExecutionLocationsLiteral, get_global_execution_location
28
30
 
29
31
  # Local imports - Flow File Components
30
32
  from flowfile_core.flowfile.flow_data_engine import utils
@@ -64,6 +66,7 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
64
66
 
65
67
  T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
66
68
 
69
+
67
70
  def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
68
71
  """Temporarily renames join keys to avoid conflicts during a join.
69
72
 
@@ -1563,7 +1566,7 @@ class FlowDataEngine:
1563
1566
  return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
1564
1567
 
1565
1568
  def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
1566
- seed: int = None) -> "FlowDataEngine":
1569
+ seed: int = None, execution_location: Optional[ExecutionLocationsLiteral] = None) -> "FlowDataEngine":
1567
1570
  """Gets a sample of rows from the DataFrame.
1568
1571
 
1569
1572
  Args:
@@ -1571,11 +1574,10 @@ class FlowDataEngine:
1571
1574
  random: If True, performs random sampling. If False, takes the first n_rows.
1572
1575
  shuffle: If True (and `random` is True), shuffles the data before sampling.
1573
1576
  seed: A random seed for reproducibility.
1574
-
1577
+ execution_location: Location which is used to calculate the size of the dataframe
1575
1578
  Returns:
1576
1579
  A new `FlowDataEngine` instance containing the sampled data.
1577
1580
  """
1578
- n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
1579
1581
  logging.info(f'Getting sample of {n_rows} rows')
1580
1582
 
1581
1583
  if random:
@@ -1583,12 +1585,17 @@ class FlowDataEngine:
1583
1585
  self.collect_external()
1584
1586
 
1585
1587
  if self.lazy and shuffle:
1586
- sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(n_rows,
1587
- seed=seed,
1588
- shuffle=shuffle)
1588
+ sample_df = (self.data_frame.collect(engine="streaming" if self._streamable else "auto")
1589
+ .sample(n_rows, seed=seed, shuffle=shuffle))
1589
1590
  elif shuffle:
1590
1591
  sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
1591
1592
  else:
1593
+ if execution_location is None:
1594
+ execution_location = get_global_execution_location()
1595
+ n_rows = min(n_rows, self.get_number_of_records(
1596
+ calculate_in_worker_process=execution_location == "remote")
1597
+ )
1598
+
1592
1599
  every_n_records = ceil(self.number_of_records / n_rows)
1593
1600
  sample_df = self.data_frame.gather_every(every_n_records)
1594
1601
  else:
@@ -1596,7 +1603,7 @@ class FlowDataEngine:
1596
1603
  self.collect(n_rows)
1597
1604
  sample_df = self.data_frame.head(n_rows)
1598
1605
 
1599
- return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
1606
+ return FlowDataEngine(sample_df, schema=self.schema)
1600
1607
 
1601
1608
  def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
1602
1609
  """Gets the first `n_rows` from the DataFrame.
@@ -1650,8 +1657,7 @@ class FlowDataEngine:
1650
1657
  An `ExternalFuzzyMatchFetcher` object that can be used to track the
1651
1658
  progress and retrieve the result of the fuzzy join.
1652
1659
  """
1653
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1654
- fuzzy_match_input=fuzzy_match_input)
1660
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1655
1661
  return ExternalFuzzyMatchFetcher(left_df, right_df,
1656
1662
  fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1657
1663
  file_ref=file_ref + '_fm',
@@ -1659,59 +1665,33 @@ class FlowDataEngine:
1659
1665
  flow_id=flow_id,
1660
1666
  node_id=node_id)
1661
1667
 
1662
- def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1663
- other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
1664
- node_id: int | str = -1) -> "FlowDataEngine":
1665
- """Performs a fuzzy join with another DataFrame.
1666
-
1667
- This method blocks until the fuzzy join operation is complete.
1668
-
1669
- Args:
1670
- fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
1671
- other: The right `FlowDataEngine` to join with.
1672
- file_ref: A reference string for temporary files.
1673
- flow_id: The flow ID for tracking.
1674
- node_id: The node ID for tracking.
1675
-
1676
- Returns:
1677
- A new `FlowDataEngine` instance with the result of the fuzzy join.
1678
- """
1679
- left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
1680
- fuzzy_match_input=fuzzy_match_input)
1681
- f = ExternalFuzzyMatchFetcher(left_df, right_df,
1682
- fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1683
- file_ref=file_ref + '_fm',
1684
- wait_on_completion=True,
1685
- flow_id=flow_id,
1686
- node_id=node_id)
1687
- return FlowDataEngine(f.get_result())
1688
-
1689
- def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
1690
- fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
1691
- """Performs a simple fuzzy match between two DataFrames on a single column pair.
1692
-
1693
- This is a convenience method for a common fuzzy join scenario.
1694
-
1695
- Args:
1696
- right: The right `FlowDataEngine` to match against.
1697
- left_on: The column name from the left DataFrame to match on.
1698
- right_on: The column name from the right DataFrame to match on.
1699
- fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
1700
- threshold: The similarity score threshold (0.0 to 1.0) for a match.
1701
-
1702
- Returns:
1703
- A new `FlowDataEngine` with the matched data.
1704
- """
1705
- fuzzy_match_input = transform_schemas.FuzzyMatchInput(
1706
- [transform_schemas.FuzzyMap(
1707
- left_on, right_on,
1708
- fuzzy_type=fuzzy_method,
1709
- threshold_score=threshold
1710
- )],
1711
- left_select=self.columns,
1712
- right_select=right.columns
1713
- )
1714
- return self.do_fuzzy_join(fuzzy_match_input, right, str(id(self)))
1668
+ def fuzzy_join_external(self,
1669
+ fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1670
+ other: "FlowDataEngine",
1671
+ file_ref: str = None,
1672
+ flow_id: int = -1,
1673
+ node_id: int = -1
1674
+ ):
1675
+ if file_ref is None:
1676
+ file_ref = str(id(self)) + '_' + str(id(other))
1677
+
1678
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1679
+ external_tracker = ExternalFuzzyMatchFetcher(left_df, right_df,
1680
+ fuzzy_maps=fuzzy_match_input.fuzzy_maps,
1681
+ file_ref=file_ref + '_fm',
1682
+ wait_on_completion=False,
1683
+ flow_id=flow_id,
1684
+ node_id=node_id)
1685
+ return FlowDataEngine(external_tracker.get_result())
1686
+
1687
+ def fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
1688
+ other: "FlowDataEngine",
1689
+ node_logger: NodeLogger = None) -> "FlowDataEngine":
1690
+ left_df, right_df = prepare_for_fuzzy_match(left=self, right=other, fuzzy_match_input=fuzzy_match_input)
1691
+ fuzzy_mappings = [FuzzyMapping(**fm.__dict__) for fm in fuzzy_match_input.fuzzy_maps]
1692
+ return FlowDataEngine(fuzzy_match_dfs(left_df, right_df, fuzzy_maps=fuzzy_mappings,
1693
+ logger=node_logger.logger if node_logger else logger)
1694
+ .lazy())
1715
1695
 
1716
1696
  def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
1717
1697
  auto_generate_selection: bool, verify_integrity: bool,
@@ -1733,11 +1713,12 @@ class FlowDataEngine:
1733
1713
  Exception: If `verify_integrity` is True and the join would result in
1734
1714
  an excessively large number of records.
1735
1715
  """
1716
+
1736
1717
  self.lazy = True
1718
+
1737
1719
  other.lazy = True
1738
1720
 
1739
1721
  verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
1740
-
1741
1722
  right_select = [v.old_name for v in cross_join_input.right_select.renames
1742
1723
  if (v.keep or v.join_key) and v.is_available]
1743
1724
  left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1746,26 +1727,14 @@ class FlowDataEngine:
1746
1727
  left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
1747
1728
  right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
1748
1729
 
1749
- if verify_integrity:
1750
- n_records = self.get_number_of_records() * other.get_number_of_records()
1751
- if n_records > 1_000_000_000:
1752
- raise Exception("Join will result in too many records, ending process")
1753
- else:
1754
- n_records = -1
1755
-
1756
1730
  joined_df = left.join(right, how='cross')
1757
1731
 
1758
1732
  cols_to_delete_after = [col.new_name for col in
1759
1733
  cross_join_input.left_select.renames + cross_join_input.left_select.renames
1760
1734
  if col.join_key and not col.keep and col.is_available]
1761
1735
 
1762
- if verify_integrity:
1763
- return FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1764
- number_of_records=n_records, streamable=False)
1765
- else:
1766
- fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
1767
- number_of_records=0, streamable=False)
1768
- return fl
1736
+ fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False, streamable=False)
1737
+ return fl
1769
1738
 
1770
1739
  def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
1771
1740
  verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
@@ -1901,7 +1870,7 @@ class FlowDataEngine:
1901
1870
  other.number_of_records = -1
1902
1871
  other = other.select_columns(self.columns)
1903
1872
 
1904
- if self.get_number_of_records() != other.get_number_of_records():
1873
+ if self.get_number_of_records_in_process() != other.get_number_of_records_in_process():
1905
1874
  raise Exception('Number of records is not equal')
1906
1875
 
1907
1876
  if self.columns != other.columns:
@@ -1937,6 +1906,18 @@ class FlowDataEngine:
1937
1906
  ).result
1938
1907
  return number_of_records
1939
1908
 
1909
+ def get_number_of_records_in_process(self, force_calculate: bool = False):
1910
+ """
1911
+ Get the number of records in the DataFrame in the local process.
1912
+
1913
+ args:
1914
+ force_calculate: If True, forces recalculation even if a value is cached.
1915
+
1916
+ Returns:
1917
+ The total number of records.
1918
+ """
1919
+ return self.get_number_of_records(force_calculate=force_calculate)
1920
+
1940
1921
  def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1941
1922
  calculate_in_worker_process: bool = False) -> int:
1942
1923
  """Gets the total number of records in the DataFrame.
@@ -1956,7 +1937,6 @@ class FlowDataEngine:
1956
1937
  """
1957
1938
  if self.is_future and not self.is_collected:
1958
1939
  return -1
1959
- calculate_in_worker_process = False if not OFFLOAD_TO_WORKER else calculate_in_worker_process
1960
1940
  if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
1961
1941
  if self._number_of_records_callback is not None:
1962
1942
  self._number_of_records_callback(self)
@@ -76,6 +76,67 @@ class FlowfileColumn:
76
76
  self.__sql_type = None
77
77
  self.__perc_unique = None
78
78
 
79
+ def __repr__(self):
80
+ """
81
+ Provides a concise, developer-friendly representation of the object.
82
+ Ideal for debugging and console inspection.
83
+ """
84
+ return (f"FlowfileColumn(name='{self.column_name}', "
85
+ f"type={self.data_type}, "
86
+ f"size={self.size}, "
87
+ f"nulls={self.number_of_empty_values})")
88
+
89
+ def __str__(self):
90
+ """
91
+ Provides a detailed, readable summary of the column's metadata.
92
+ It conditionally omits any attribute that is None, ensuring a clean output.
93
+ """
94
+ # --- Header (Always Shown) ---
95
+ header = f"<FlowfileColumn: '{self.column_name}'>"
96
+ lines = []
97
+
98
+ # --- Core Attributes (Conditionally Shown) ---
99
+ if self.data_type is not None:
100
+ lines.append(f" Type: {self.data_type}")
101
+ if self.size is not None:
102
+ lines.append(f" Non-Nulls: {self.size}")
103
+
104
+ # Calculate and display nulls if possible
105
+ if self.size is not None and self.number_of_empty_values is not None:
106
+ total_entries = self.size + self.number_of_empty_values
107
+ if total_entries > 0:
108
+ null_perc = (self.number_of_empty_values / total_entries) * 100
109
+ null_info = f"{self.number_of_empty_values} ({null_perc:.1f}%)"
110
+ else:
111
+ null_info = "0 (0.0%)"
112
+ lines.append(f" Nulls: {null_info}")
113
+
114
+ if self.number_of_unique_values is not None:
115
+ lines.append(f" Unique: {self.number_of_unique_values}")
116
+
117
+ # --- Conditional Stats Section ---
118
+ stats = []
119
+ if self.min_value is not None:
120
+ stats.append(f" Min: {self.min_value}")
121
+ if self.max_value is not None:
122
+ stats.append(f" Max: {self.max_value}")
123
+ if self.average_value is not None:
124
+ stats.append(f" Mean: {self.average_value}")
125
+
126
+ if stats:
127
+ lines.append(" Stats:")
128
+ lines.extend(stats)
129
+
130
+ # --- Conditional Examples Section ---
131
+ if self.example_values:
132
+ example_str = str(self.example_values)
133
+ # Truncate long example strings for cleaner display
134
+ if len(example_str) > 70:
135
+ example_str = example_str[:67] + '...'
136
+ lines.append(f" Examples: {example_str}")
137
+
138
+ return f"{header}\n" + "\n".join(lines)
139
+
79
140
  @classmethod
80
141
  def create_from_polars_type(cls, polars_type: PlType, **kwargs) -> "FlowfileColumn":
81
142
  for k, v in kwargs.items():
@@ -1,12 +1,49 @@
1
- from flowfile_core.schemas.transform_schema import FuzzyMatchInput
1
+ from flowfile_core.schemas.transform_schema import FuzzyMatchInput, SelectInput, JoinInputs
2
2
  from flowfile_core.flowfile.flow_data_engine.join import verify_join_select_integrity, verify_join_map_integrity
3
3
  import polars as pl
4
- from typing import TYPE_CHECKING, Tuple
4
+ from typing import TYPE_CHECKING, Tuple, List
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
8
8
 
9
9
 
10
+ def _order_join_inputs_based_on_col_order(col_order: List[str], join_inputs: JoinInputs) -> None:
11
+ """
12
+ Ensure that the select columns in the fuzzy match input match the order of the incoming columns.
13
+ This function modifies the join_inputs object in-place.
14
+
15
+ Returns:
16
+ None
17
+ """
18
+ select_map = {select.new_name: select for select in join_inputs.renames}
19
+ ordered_renames = [select_map[col] for col in col_order if col in select_map]
20
+ join_inputs.renames = ordered_renames
21
+
22
+
23
+ def _ensure_all_columns_have_select(left: "FlowDataEngine",
24
+ right: "FlowDataEngine",
25
+ fuzzy_match_input: FuzzyMatchInput):
26
+ """
27
+ Ensure that all columns in the left and right FlowDataEngines are included in the fuzzy match input's select
28
+ statements.
29
+ Args:
30
+ left (FlowDataEngine):
31
+ right (FlowDataEngine):
32
+ fuzzy_match_input ():
33
+
34
+ Returns:
35
+ None
36
+ """
37
+ right_cols_in_select = {c.old_name for c in fuzzy_match_input.right_select.renames}
38
+ left_cols_in_select = {c.old_name for c in fuzzy_match_input.left_select.renames}
39
+
40
+ fuzzy_match_input.left_select.renames.extend(
41
+ [SelectInput(col) for col in left.columns if col not in left_cols_in_select])
42
+ fuzzy_match_input.right_select.renames.extend(
43
+ [SelectInput(col) for col in right.columns if col not in right_cols_in_select]
44
+ )
45
+
46
+
10
47
  def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
11
48
  fuzzy_match_input: FuzzyMatchInput) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
12
49
  """
@@ -19,14 +56,18 @@ def prepare_for_fuzzy_match(left: "FlowDataEngine", right: "FlowDataEngine",
19
56
  Returns:
20
57
  Tuple[pl.LazyFrame, pl.LazyFrame]: Prepared left and right lazy frames
21
58
  """
22
-
23
59
  left.lazy = True
24
60
  right.lazy = True
61
+ _ensure_all_columns_have_select(left, right, fuzzy_match_input)
62
+ _order_join_inputs_based_on_col_order(left.columns, fuzzy_match_input.left_select)
63
+ _order_join_inputs_based_on_col_order(right.columns, fuzzy_match_input.right_select)
64
+
25
65
  verify_join_select_integrity(fuzzy_match_input, left_columns=left.columns, right_columns=right.columns)
26
66
  if not verify_join_map_integrity(fuzzy_match_input, left_columns=left.schema, right_columns=right.schema):
27
67
  raise Exception('Join is not valid by the data fields')
28
68
  fuzzy_match_input = fuzzy_match_input
29
69
  fuzzy_match_input.auto_rename()
70
+
30
71
  right_select = [v.old_name for v in fuzzy_match_input.right_select.renames if
31
72
  (v.keep or v.join_key) and v.is_available]
32
73
  left_select = [v.old_name for v in fuzzy_match_input.left_select.renames if
@@ -1,6 +1,6 @@
1
1
  from typing import Any, Optional, Literal
2
2
  from pydantic import BaseModel
3
- from flowfile_core.schemas.transform_schema import FuzzyMap
3
+ from pl_fuzzy_frame_match.models import FuzzyMapping
4
4
 
5
5
  OperationType = Literal['store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'store_sample']
6
6
 
@@ -20,8 +20,8 @@ class FuzzyJoinInput(BaseModel):
20
20
  cache_dir: Optional[str] = None
21
21
  left_df_operation: PolarsOperation
22
22
  right_df_operation: PolarsOperation
23
- fuzzy_maps: list[FuzzyMap]
24
- flowfile_node_id: int|str
23
+ fuzzy_maps: list[FuzzyMapping]
24
+ flowfile_node_id: int | str
25
25
  flowfile_flow_id: int
26
26
 
27
27
 
@@ -9,11 +9,12 @@ from uuid import uuid4
9
9
  import polars as pl
10
10
  import requests
11
11
 
12
+ from pl_fuzzy_frame_match.models import FuzzyMapping
13
+
12
14
  from flowfile_core.configs import logger
13
15
  from flowfile_core.configs.settings import WORKER_URL
14
16
  from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import (
15
17
  FuzzyJoinInput,
16
- FuzzyMap,
17
18
  OperationType,
18
19
  PolarsOperation,
19
20
  Status
@@ -53,7 +54,7 @@ def trigger_sample_operation(lf: pl.LazyFrame, file_ref: str, flow_id: int, node
53
54
 
54
55
 
55
56
  def trigger_fuzzy_match_operation(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
56
- fuzzy_maps: List[FuzzyMap],
57
+ fuzzy_maps: List[FuzzyMapping],
57
58
  file_ref: str,
58
59
  flow_id: int,
59
60
  node_id: int | str) -> Status:
@@ -122,6 +123,8 @@ def results_exists(file_ref: str):
122
123
  return False
123
124
  except requests.RequestException as e:
124
125
  logger.error(f"Failed to check results existence: {str(e)}")
126
+ if "Connection refused" in str(e):
127
+ logger.info("")
125
128
  return False
126
129
 
127
130