flowfile-0.3.8-py3-none-any.whl → flowfile-0.3.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +1 -0
  3. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  4. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  5. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  6. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  7. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  8. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  10. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  11. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  12. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  13. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  14. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  15. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  18. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  19. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  20. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  22. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  23. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  24. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  25. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  26. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  27. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  28. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  29. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  30. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  31. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  32. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  33. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  34. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  35. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  36. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  37. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  38. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  39. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  40. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  41. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  42. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  43. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  44. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  45. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  49. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  52. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/RECORD +81 -83
  53. flowfile_core/configs/settings.py +4 -2
  54. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  55. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  56. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  57. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  58. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  59. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  60. flowfile_core/flowfile/flow_graph.py +128 -87
  61. flowfile_core/flowfile/flow_node/flow_node.py +16 -11
  62. flowfile_core/flowfile/flow_node/models.py +0 -2
  63. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  64. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  65. flowfile_core/flowfile/graph_tree/models.py +15 -0
  66. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  67. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  68. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  69. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  70. flowfile_core/flowfile/util/node_skipper.py +8 -0
  71. flowfile_core/schemas/schemas.py +46 -3
  72. flowfile_core/schemas/transform_schema.py +27 -38
  73. flowfile_frame/__init__.py +1 -4
  74. flowfile_frame/flow_frame.py +33 -4
  75. flowfile_frame/flow_frame.pyi +2 -0
  76. flowfile_worker/funcs.py +7 -3
  77. flowfile_worker/models.py +3 -1
  78. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  79. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  80. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  81. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  82. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  83. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  84. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  85. {flowfile-0.3.8.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  86. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
flowfile_frame/flow_frame.py CHANGED
@@ -5,11 +5,13 @@ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, C
  import re

  import polars as pl
- from polars._typing import (CsvEncoding)
  from flowfile_frame.lazy_methods import add_lazyframe_methods

- from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
+ from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
  from collections.abc import Iterator
+
+ from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+
  from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
  from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
  from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
@@ -20,8 +22,7 @@ from flowfile_frame.expr import Expr, Column, lit, col
  from flowfile_frame.selectors import Selector
  from flowfile_frame.group_frame import GroupByFrame
  from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
-                                   ensure_inputs_as_iterable, generate_node_id,
-                                   set_node_id, data as node_id_data)
+                                   ensure_inputs_as_iterable, generate_node_id, data as node_id_data)
  from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
  from flowfile_frame.utils import _check_if_convertible_to_code
  from flowfile_frame.config import logger
@@ -2109,6 +2110,34 @@ class FlowFrame:

          return self._create_child_frame(new_node_id)

+     def fuzzy_match(self,
+                     other: "FlowFrame",
+                     fuzzy_mappings: List[FuzzyMapping],
+                     description: str = None,
+                     ) -> "FlowFrame":
+         self._ensure_same_graph(other)
+
+         # Step 3: Generate new node ID
+         new_node_id = generate_node_id()
+         node_fuzzy_match = input_schema.NodeFuzzyMatch(flow_id=self.flow_graph.flow_id,
+                                                        node_id=new_node_id,
+                                                        join_input=
+                                                        transform_schema.FuzzyMatchInput(join_mapping=fuzzy_mappings,
+                                                                                         left_select=self.columns,
+                                                                                         right_select=other.columns),
+                                                        description=description or "Fuzzy match between two FlowFrames",
+                                                        depending_on_ids=[self.node_id, other.node_id],
+                                                        )
+         self.flow_graph.add_fuzzy_match(node_fuzzy_match)
+         self._add_connection(self.node_id, new_node_id, "main")
+         other._add_connection(other.node_id, new_node_id, "right")
+         return FlowFrame(
+             data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
+             flow_graph=self.flow_graph,
+             node_id=new_node_id,
+             parent_node_id=self.node_id,
+         )
+
      def text_to_rows(
          self,
          column: str | Column,
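
For orientation, here is a minimal usage sketch of the new FlowFrame.fuzzy_match API added in the hunk above. This is not taken from the package docs: it assumes `left` and `right` are FlowFrame instances registered on the same flow graph (the method calls self._ensure_same_graph(other) and wires them in as the "main" and "right" inputs of the new node), and that pl_fuzzy_frame_match exposes FuzzyMapping with the same constructor as the vendored dataclass deleted at the end of this diff; the column names and description are hypothetical.

# Sketch only; `left` and `right` are assumed FlowFrame instances built on
# the same FlowGraph, and the column names are hypothetical.
from pl_fuzzy_frame_match import FuzzyMapping

matched = left.fuzzy_match(
    other=right,
    fuzzy_mappings=[
        # Fields mirror the FuzzyMapping dataclass removed from
        # flowfile_worker/polars_fuzzy_match/models.py (shown below).
        FuzzyMapping(left_col="company_name", right_col="vendor_name",
                     threshold_score=80.0, fuzzy_type="levenshtein"),
    ],
    description="Fuzzy-join companies to vendors",
)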
flowfile_frame/flow_frame.pyi CHANGED
@@ -208,6 +208,8 @@ class FlowFrame:
      # Get the first row of the DataFrame.
      def first(self, description: Optional[str] = None) -> 'FlowFrame': ...

+     def fuzzy_match(self, other: FlowFrame, fuzzy_mappings: typing.List[flowfile_core.schemas.transform_schema.FuzzyMap], description: str = None) -> 'FlowFrame': ...
+
      # Take every nth row in the LazyFrame and return as a new LazyFrame.
      def gather_every(self, n: int, offset: int = 0, description: Optional[str] = None) -> 'FlowFrame': ...

flowfile_worker/funcs.py CHANGED
@@ -2,8 +2,9 @@ import polars as pl
  import io
  from typing import List, Dict, Callable
  from multiprocessing import Array, Value, Queue
- from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
- from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
+
+ from pl_fuzzy_frame_match import fuzzy_match_dfs, FuzzyMapping
+
  from flowfile_worker.flow_logger import get_worker_logger
  from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
  from flowfile_worker.external_sources.sql_source.main import write_df_to_database
@@ -33,7 +34,10 @@ def fuzzy_join_task(left_serializable_object: bytes, right_serializable_object:
      flowfile_logger.info("Starting fuzzy join operation")
      left_df = pl.LazyFrame.deserialize(io.BytesIO(left_serializable_object))
      right_df = pl.LazyFrame.deserialize(io.BytesIO(right_serializable_object))
-     fuzzy_match_result = fuzzy_match_dfs(left_df, right_df, fuzzy_maps, flowfile_logger)
+     fuzzy_match_result = fuzzy_match_dfs(left_df=left_df,
+                                          right_df=right_df,
+                                          fuzzy_maps=fuzzy_maps,
+                                          logger=flowfile_logger)
      flowfile_logger.info("Fuzzy join operation completed successfully")
      fuzzy_match_result.write_ipc(file_path)
      with progress.get_lock():
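
The hunk above is the whole migration on the worker side: the vendored matcher import is swapped for the external pl_fuzzy_frame_match package and the call becomes keyword-only. Below is a self-contained sketch of that call path, under the assumption that the package keeps the signature shown in this diff and the FuzzyMapping defaults of the deleted vendored model (levenshtein, threshold 80.0); the toy data is hypothetical.

import logging

import polars as pl
from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs

logger = logging.getLogger("flowfile_worker")

# Toy inputs; fuzzy_join_task deserializes its LazyFrames from bytes instead.
left_df = pl.LazyFrame({"name": ["Jon Smith", "Ann Lee"]})
right_df = pl.LazyFrame({"customer": ["John Smith", "Anne Lee"]})

# Per the deleted matcher's contract, the result is a materialized
# pl.DataFrame carrying one fuzzy score column per mapping.
result = fuzzy_match_dfs(
    left_df=left_df,
    right_df=right_df,
    fuzzy_maps=[FuzzyMapping(left_col="name", right_col="customer")],
    logger=logger,
)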
flowfile_worker/models.py CHANGED
@@ -1,7 +1,9 @@
  from pydantic import BaseModel
  from typing import Optional, Literal, Any
  from base64 import decodebytes
- from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
+
+ from pl_fuzzy_frame_match import FuzzyMapping
+
  from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
  from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings

flowfile_worker/polars_fuzzy_match/matcher.py DELETED
@@ -1,435 +0,0 @@
- import polars as pl
- from typing import List, Optional, Tuple
- import tempfile
- from logging import Logger
-
- from flowfile_worker.polars_fuzzy_match.process import calculate_and_parse_fuzzy, process_fuzzy_frames
- from flowfile_worker.polars_fuzzy_match.pre_process import pre_process_for_fuzzy_matching
- from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
- from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp
- from flowfile_worker.utils import collect_lazy_frame
- import polars_simed as ps
-
-
- HAS_POLARS_SIM = True
-
-
- def ensure_left_is_larger(left_df: pl.DataFrame,
-                           right_df: pl.DataFrame,
-                           left_col_name: str,
-                           right_col_name: str) -> tuple:
-     """
-     Ensures that the left dataframe is always the larger one.
-     If the right dataframe is larger, swaps them.
-
-     Args:
-         left_df: The left dataframe
-         right_df: The right dataframe
-         left_col_name: Column name for the left dataframe
-         right_col_name: Column name for the right dataframe
-
-     Returns:
-         tuple: (left_df, right_df, left_col_name, right_col_name)
-     """
-     left_frame_len = left_df.select(pl.len())[0, 0]
-     right_frame_len = right_df.select(pl.len())[0, 0]
-
-     # Swap dataframes if right is larger than left
-     if right_frame_len > left_frame_len:
-         return right_df, left_df, right_col_name, left_col_name
-
-     return left_df, right_df, left_col_name, right_col_name
-
-
- def split_dataframe(df: pl.DataFrame, max_chunk_size: int = 500_000) -> List[pl.DataFrame]:
-     """
-     Split a Polars DataFrame into multiple DataFrames with a maximum size.
-
-     Args:
-         df: The Polars DataFrame to split
-         max_chunk_size: Maximum number of rows per chunk (default: 500,000)
-
-     Returns:
-         List of Polars DataFrames, each containing at most max_chunk_size rows
-     """
-     total_rows = df.select(pl.len())[0, 0]
-
-     # If DataFrame is smaller than max_chunk_size, return it as is
-     if total_rows <= max_chunk_size:
-         return [df]
-
-     # Calculate number of chunks needed
-     num_chunks = (total_rows + max_chunk_size - 1) // max_chunk_size  # Ceiling division
-
-     chunks = []
-     for i in range(num_chunks):
-         start_idx = i * max_chunk_size
-         end_idx = min((i + 1) * max_chunk_size, total_rows)
-
-         # Extract chunk using slice
-         chunk = df.slice(start_idx, end_idx - start_idx)
-         chunks.append(chunk)
-
-     return chunks
-
-
- def cross_join_large_files(left_fuzzy_frame: pl.LazyFrame,
-                            right_fuzzy_frame: pl.LazyFrame,
-                            left_col_name: str,
-                            right_col_name: str,
-                            flowfile_logger: Logger,
-                            ) -> pl.LazyFrame:
-     if not HAS_POLARS_SIM:
-         raise Exception('The polars-sim library is required to perform this operation.')
-
-     left_df = collect_lazy_frame(left_fuzzy_frame)
-     right_df = collect_lazy_frame(right_fuzzy_frame)
-
-     left_df, right_df, left_col_name, right_col_name = ensure_left_is_larger(
-         left_df, right_df, left_col_name, right_col_name
-     )
-     left_chunks = split_dataframe(left_df, max_chunk_size=500_000)  # Reduced chunk size
-     flowfile_logger.info(f"Splitting left dataframe into {len(left_chunks)} chunks.")
-     df_matches = []
-
-     # Process each chunk combination with error handling
-     for i, left_chunk in enumerate(left_chunks):
-         chunk_matches = ps.join_sim(
-             left=left_chunk,
-             right=right_df,
-             left_on=left_col_name,
-             right_on=right_col_name,
-             top_n=100,
-             add_similarity=False,
-         )
-         flowfile_logger.info(f"Processed chunk {int(i)} with {len(chunk_matches)} matches.")
-         df_matches.append(chunk_matches)
-
-
-     # Combine all matches
-     if df_matches:
-         return pl.concat(df_matches).lazy()
-     else:
-         columns = list(set(left_df.columns).union(set(right_df.columns)))
-         return pl.DataFrame(schema={col: pl.Null for col in columns}).lazy()
-
-
- def cross_join_small_files(left_df: pl.LazyFrame, right_df: pl.LazyFrame) -> pl.LazyFrame:
-     return left_df.join(right_df, how='cross')
-
-
- def cross_join_filter_existing_fuzzy_results(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
-                                              existing_matches: pl.LazyFrame,
-                                              left_col_name: str, right_col_name: str):
-     """
-     Process and filter fuzzy matching results by joining dataframes using existing match indices.
-
-     This function takes previously identified fuzzy matches (existing_matches) and performs
-     a series of operations to create a refined dataset of matches between the left and right
-     dataframes, preserving index relationships.
-
-     Parameters:
-     -----------
-     left_df : pl.LazyFrame
-         The left dataframe containing records to be matched.
-     right_df : pl.LazyFrame
-         The right dataframe containing records to be matched against.
-     existing_matches : pl.LazyFrame
-         A dataframe containing the indices of already identified matches between
-         left_df and right_df, with columns '__left_index' and '__right_index'.
-     left_col_name : str
-         The column name from left_df to include in the result.
-     right_col_name : str
-         The column name from right_df to include in the result.
-
-     Returns:
-     --------
-     pl.LazyFrame
-         A dataframe containing the unique matches between left_df and right_df,
-         with index information for both dataframes preserved. The resulting dataframe
-         includes the specified columns from both dataframes along with their respective
-         index aggregations.
-
-     Notes:
-     ------
-     The function performs these operations:
-     1. Join existing matches with both dataframes using their respective indices
-     2. Select only the relevant columns and remove duplicates
-     3. Create aggregations that preserve the relationship between values and their indices
-     4. Join these aggregations back to create the final result set
-     """
-     joined_df = (existing_matches
-                  .select(['__left_index', '__right_index'])
-                  .join(left_df, on='__left_index')
-                  .join(right_df, on='__right_index')
-                  .select(left_col_name, right_col_name, '__left_index', '__right_index')
-                  )
-     return joined_df.group_by([left_col_name, right_col_name]).agg('__left_index', '__right_index')
-
-
- def cross_join_no_existing_fuzzy_results(left_df: pl.LazyFrame, right_df: pl.LazyFrame, left_col_name: str,
-                                          right_col_name: str, temp_dir_ref: str,
-                                          flowfile_logger: Logger) -> pl.LazyFrame:
-     """
-     Generate fuzzy matching results by performing a cross join between dataframes.
-
-     This function processes the input dataframes, determines the appropriate cross join method
-     based on the size of the resulting cartesian product, and returns the cross-joined results
-     for fuzzy matching when no existing matches are provided.
-
-     Parameters:
-     -----------
-     left_df : pl.LazyFrame
-         The left dataframe containing records to be matched.
-     right_df : pl.LazyFrame
-         The right dataframe containing records to be matched against.
-     left_col_name : str
-         The column name from left_df to use for fuzzy matching.
-     right_col_name : str
-         The column name from right_df to use for fuzzy matching.
-     temp_dir_ref : str
-         Reference to a temporary directory where intermediate results can be stored
-         during processing of large dataframes.
-
-     Returns:
-     --------
-     pl.LazyFrame
-         A dataframe containing the cross join results of left_df and right_df,
-         prepared for fuzzy matching operations.
-
-     Notes:
-     ------
-     The function performs these operations:
-     1. Processes input frames using the process_fuzzy_frames helper function
-     2. Calculates the size of the cartesian product to determine processing approach
-     3. Uses either cross_join_large_files or cross_join_small_files based on the size:
-        - For cartesian products > 100M but < 1T (or 10M without polars-sim), uses large file method
-        - For smaller products, uses the small file method
-     4. Raises an exception if the cartesian product exceeds the maximum allowed size
-
-     Raises:
-     -------
-     Exception
-         If the cartesian product of the two dataframes exceeds the maximum allowed size
-         (1 trillion with polars-sim, 100 million without).
-     """
-     (left_fuzzy_frame,
-      right_fuzzy_frame,
-      left_col_name,
-      right_col_name,
-      len_left_df,
-      len_right_df) = process_fuzzy_frames(left_df=left_df, right_df=right_df, left_col_name=left_col_name,
-                                           right_col_name=right_col_name, temp_dir_ref=temp_dir_ref)
-     cartesian_size = len_left_df * len_right_df
-     max_size = 100_000_000_000_000 if HAS_POLARS_SIM else 10_000_000
-     if cartesian_size > max_size:
-         flowfile_logger.error(f'The cartesian product of the two dataframes is too large to process: {cartesian_size}')
-         raise Exception('The cartesian product of the two dataframes is too large to process.')
-     if cartesian_size > 100_000_000:
-         flowfile_logger.info('Performing approximate fuzzy match for large dataframes to reduce memory usage.')
-         cross_join_frame = cross_join_large_files(left_fuzzy_frame, right_fuzzy_frame, left_col_name=left_col_name,
-                                                   right_col_name=right_col_name, flowfile_logger=flowfile_logger)
-     else:
-         cross_join_frame = cross_join_small_files(left_fuzzy_frame, right_fuzzy_frame)
-     return cross_join_frame
-
-
- def unique_df_large(_df: pl.DataFrame | pl.LazyFrame, cols: Optional[List[str]] = None) -> pl.DataFrame:
-     """
-     Efficiently compute unique rows in large dataframes by partitioning.
-
-     This function processes large dataframes by first partitioning them by a selected column,
-     then finding unique combinations within each partition before recombining the results.
-     This approach is more memory-efficient for large datasets than calling .unique() directly.
-
-     Parameters:
-     -----------
-     _df : pl.DataFrame | pl.LazyFrame
-         The input dataframe to process. Can be either a Polars DataFrame or LazyFrame.
-     cols : Optional[List[str]]
-         The list of columns to consider when finding unique rows. If None, all columns
-         are used. The first column in this list is used as the partition column.
-
-     Returns:
-     --------
-     pl.DataFrame
-         A dataframe containing only the unique rows from the input dataframe,
-         based on the specified columns.
-
-     Notes:
-     ------
-     The function performs these operations:
-     1. Converts LazyFrame to DataFrame if necessary
-     2. Partitions the dataframe by the first column in cols (or the first column of the dataframe if cols is None)
-     3. Applies the unique operation to each partition based on the remaining columns
-     4. Concatenates the results back into a single dataframe
-     5. Frees memory by deleting intermediate objects
-
-     This implementation uses tqdm to provide a progress bar during processing,
-     which is particularly helpful for large datasets where the operation may take time.
-     """
-     if isinstance(_df, pl.LazyFrame):
-         _df = collect_lazy_frame(_df)
-     from tqdm import tqdm
-     partition_col = cols[0] if cols is not None else _df.columns[0]
-     other_cols = cols[1:] if cols is not None else _df.columns[1:]
-     partitioned_df = _df.partition_by(partition_col)
-     df = pl.concat([partition.unique(other_cols) for partition in tqdm(partitioned_df)])
-     del partitioned_df, _df
-     return df
-
-
- def combine_matches(matching_dfs: List[pl.LazyFrame]):
-     all_matching_indexes = matching_dfs[-1].select('__left_index', '__right_index')
-     for matching_df in matching_dfs:
-         all_matching_indexes = all_matching_indexes.join(matching_df, on=['__left_index', '__right_index'])
-     return all_matching_indexes
-
-
- def add_index_column(df: pl.LazyFrame, column_name: str, tempdir: str):
-     return cache_polars_frame_to_temp(df.with_row_index(name=column_name), tempdir)
-
-
- def process_fuzzy_mapping(
-         fuzzy_map: FuzzyMapping,
-         left_df: pl.LazyFrame,
-         right_df: pl.LazyFrame,
-         existing_matches: Optional[pl.LazyFrame],
-         local_temp_dir_ref: str,
-         i: int,
-         flowfile_logger: Logger,
-         existing_number_of_matches: Optional[int] = None
- ) -> Tuple[pl.LazyFrame, int]:
-     """
-     Process a single fuzzy mapping to generate matching dataframes.
-
-     Args:
-         fuzzy_map: The fuzzy mapping configuration containing match columns and thresholds
-         left_df: Left dataframe with index column
-         right_df: Right dataframe with index column
-         existing_matches: Previously computed matches (or None)
-         local_temp_dir_ref: Temporary directory reference for caching interim results
-         i: Index of the current fuzzy mapping
-         flowfile_logger: Logger instance for progress tracking
-         existing_number_of_matches: Number of existing matches (if available)
-
-     Returns:
-         Tuple[pl.LazyFrame, int]: The final matching dataframe and the number of matches
-     """
-     # Determine join strategy based on existing matches
-     if existing_matches is not None:
-         existing_matches = existing_matches.select('__left_index', '__right_index')
-         flowfile_logger.info(f'Filtering existing fuzzy matches for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-         cross_join_frame = cross_join_filter_existing_fuzzy_results(
-             left_df=left_df,
-             right_df=right_df,
-             existing_matches=existing_matches,
-             left_col_name=fuzzy_map.left_col,
-             right_col_name=fuzzy_map.right_col
-         )
-     else:
-         flowfile_logger.info(f'Performing fuzzy match for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-         cross_join_frame = cross_join_no_existing_fuzzy_results(
-             left_df=left_df,
-             right_df=right_df,
-             left_col_name=fuzzy_map.left_col,
-             right_col_name=fuzzy_map.right_col,
-             temp_dir_ref=local_temp_dir_ref,
-             flowfile_logger=flowfile_logger
-         )
-
-     # Calculate fuzzy match scores
-     flowfile_logger.info(f'Calculating fuzzy match for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-     matching_df = calculate_and_parse_fuzzy(
-         mapping_table=cross_join_frame,
-         left_col_name=fuzzy_map.left_col,
-         right_col_name=fuzzy_map.right_col,
-         fuzzy_method=fuzzy_map.fuzzy_type,
-         th_score=fuzzy_map.reversed_threshold_score
-     )
-     if existing_matches is not None:
-         matching_df = matching_df.join(existing_matches, on=['__left_index', '__right_index'])
-     matching_df = cache_polars_frame_to_temp(matching_df, local_temp_dir_ref)
-     if existing_number_of_matches is None or existing_number_of_matches > 100_000_000:
-         existing_number_of_matches = matching_df.select(pl.len()).collect()[0, 0]
-     if existing_number_of_matches > 100_000_000:
-         return unique_df_large(matching_df.rename({'s': f'fuzzy_score_{i}'})).lazy(), existing_number_of_matches
-     else:
-         return matching_df.rename({'s': f'fuzzy_score_{i}'}).unique(), existing_number_of_matches
-
-
- def perform_all_fuzzy_matches(left_df: pl.LazyFrame,
-                               right_df: pl.LazyFrame,
-                               fuzzy_maps: List[FuzzyMapping],
-                               flowfile_logger: Logger,
-                               local_temp_dir_ref: str,
-                               ) -> List[pl.LazyFrame]:
-     matching_dfs = []
-     existing_matches = None
-     existing_number_of_matches = None
-     for i, fuzzy_map in enumerate(fuzzy_maps):
-         existing_matches, existing_number_of_matches = process_fuzzy_mapping(
-             fuzzy_map=fuzzy_map,
-             left_df=left_df,
-             right_df=right_df,
-             existing_matches=existing_matches,
-             local_temp_dir_ref=local_temp_dir_ref,
-             i=i,
-             flowfile_logger=flowfile_logger,
-             existing_number_of_matches=existing_number_of_matches
-         )
-         matching_dfs.append(existing_matches)
-     return matching_dfs
-
-
- def fuzzy_match_dfs(
-         left_df: pl.LazyFrame,
-         right_df: pl.LazyFrame,
-         fuzzy_maps: List[FuzzyMapping],
-         flowfile_logger: Logger
- ) -> pl.DataFrame:
-     """
-     Perform fuzzy matching between two dataframes using multiple fuzzy mapping configurations.
-
-     Args:
-         left_df: Left dataframe to be matched
-         right_df: Right dataframe to be matched
-         fuzzy_maps: List of fuzzy mapping configurations
-         flowfile_logger: Logger instance for tracking progress
-
-     Returns:
-         pl.DataFrame: The final matched dataframe with all fuzzy scores
-     """
-     left_df, right_df, fuzzy_maps = pre_process_for_fuzzy_matching(left_df, right_df, fuzzy_maps, flowfile_logger)
-
-     # Create a temporary directory for caching intermediate results
-     local_temp_dir = tempfile.TemporaryDirectory()
-     local_temp_dir_ref = local_temp_dir.name
-
-     # Add index columns to both dataframes
-     left_df = add_index_column(left_df, '__left_index', local_temp_dir_ref)
-     right_df = add_index_column(right_df, '__right_index', local_temp_dir_ref)
-
-     matching_dfs = perform_all_fuzzy_matches(left_df, right_df, fuzzy_maps, flowfile_logger, local_temp_dir_ref)
-
-     # Combine all matches
-     if len(matching_dfs) > 1:
-         flowfile_logger.info('Combining fuzzy matches')
-         all_matches_df = combine_matches(matching_dfs)
-     else:
-         flowfile_logger.info('Caching fuzzy matches')
-         all_matches_df = cache_polars_frame_to_temp(matching_dfs[0], local_temp_dir_ref)
-
-     # Join matches with original dataframes
-     flowfile_logger.info('Joining fuzzy matches with original dataframes')
-     output_df = collect_lazy_frame(
-         (left_df.join(all_matches_df, on='__left_index')
-          .join(right_df, on='__right_index')
-          .drop('__right_index', '__left_index'))
-     )
-
-     # Clean up temporary files
-     flowfile_logger.info('Cleaning up temporary files')
-     local_temp_dir.cleanup()
-
-     return output_df
flowfile_worker/polars_fuzzy_match/models.py DELETED
@@ -1,36 +0,0 @@
- from dataclasses import dataclass
- from typing import Optional, Literal
-
- FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
-
-
- @dataclass
- class JoinMap:
-     left_col: str
-     right_col: str
-
-
- @dataclass
- class FuzzyMapping(JoinMap):
-     threshold_score: float = 80.0
-     fuzzy_type: FuzzyTypeLiteral = 'levenshtein'
-     perc_unique: float = 0.0
-     output_column_name: Optional[str] = None
-     valid: bool = True
-
-     def __init__(self, left_col: str, right_col: str = None, threshold_score: float = 80.0,
-                  fuzzy_type: FuzzyTypeLiteral = 'levenshtein', perc_unique: float = 0, output_column_name: str = None,
-                  valid: bool = True):
-         if right_col is None:
-             right_col = left_col
-         self.valid = valid
-         self.left_col = left_col
-         self.right_col = right_col
-         self.threshold_score = threshold_score
-         self.fuzzy_type = fuzzy_type
-         self.perc_unique = perc_unique
-         self.output_col_name = output_column_name if output_column_name is not None else f'fuzzy_score_{left_col}_{right_col}'
-
-     @property
-     def reversed_threshold_score(self) -> float:
-         return ((int(self.threshold_score) - 100) * -1) / 100
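
One detail worth keeping in mind from the deleted model: callers pass a user-facing similarity threshold on a 0-100 scale, and reversed_threshold_score converted it into the 0-1 score the matcher compared against internally. A quick worked example of that conversion:

# Worked example of the deleted reversed_threshold_score property:
# a threshold_score of 80 (out of 100) becomes 0.2 on the internal scale.
threshold_score = 80.0
reversed_threshold = ((int(threshold_score) - 100) * -1) / 100
assert reversed_threshold == 0.2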