Flowfile 0.3.7-py3-none-any.whl → 0.3.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +4 -3
- flowfile/api.py +5 -2
- flowfile/web/__init__.py +2 -0
- flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
- flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
- flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
- flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
- flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
- flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
- flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
- flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
- flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
- flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
- flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
- flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
- flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
- flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
- flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
- flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
- flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
- flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
- flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
- flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
- flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
- flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
- flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
- flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
- flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
- flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
- flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
- flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
- flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
- flowfile_core/configs/settings.py +4 -2
- flowfile_core/configs/utils.py +5 -0
- flowfile_core/database/connection.py +1 -3
- flowfile_core/flowfile/code_generator/code_generator.py +36 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
- flowfile_core/flowfile/flow_graph.py +129 -88
- flowfile_core/flowfile/flow_node/flow_node.py +30 -15
- flowfile_core/flowfile/flow_node/models.py +0 -2
- flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
- flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
- flowfile_core/flowfile/graph_tree/models.py +15 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
- flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
- flowfile_core/flowfile/setting_generator/settings.py +2 -1
- flowfile_core/flowfile/util/execution_orderer.py +9 -0
- flowfile_core/flowfile/util/node_skipper.py +8 -0
- flowfile_core/schemas/schemas.py +46 -3
- flowfile_core/schemas/transform_schema.py +27 -38
- flowfile_core/utils/arrow_reader.py +8 -3
- flowfile_core/utils/validate_setup.py +0 -2
- flowfile_frame/__init__.py +1 -4
- flowfile_frame/expr.py +14 -0
- flowfile_frame/flow_frame.py +34 -5
- flowfile_frame/flow_frame.pyi +5 -6
- flowfile_worker/funcs.py +7 -3
- flowfile_worker/models.py +3 -1
- flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
- flowfile_worker/polars_fuzzy_match/models.py +0 -36
- flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
- flowfile_worker/polars_fuzzy_match/process.py +0 -86
- flowfile_worker/polars_fuzzy_match/utils.py +0 -50
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
- {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
- {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
flowfile_core/utils/validate_setup.py
CHANGED

@@ -34,8 +34,6 @@ def validate_setup():
         check_if_node_has_add_function_in_flow_graph(node)
         check_if_node_has_input_schema_definition(node)

-    print("All nodes have corresponding functions in FlowGraph and input schema definitions.")
-

 if __name__ == "__main__":
     validate_setup()
flowfile_frame/__init__.py
CHANGED

@@ -1,12 +1,9 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""

-from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
-
-OFFLOAD_TO_WORKER.value = False
-
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
+from pl_fuzzy_frame_match.models import FuzzyMapping  # noqa: F401

 from flowfile_frame.utils import create_flow_graph  # noqa: F401

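With this release, importing flowfile_frame no longer flips OFFLOAD_TO_WORKER off as an import-time side effect, and FuzzyMapping is re-exported from the external pl_fuzzy_frame_match package instead of the removed in-tree flowfile_worker.polars_fuzzy_match module. A minimal sketch of the resulting import surface; the constructor fields shown are taken from the deleted in-tree dataclass at the end of this diff and are assumed to carry over to the external package:

# Sketch: FuzzyMapping is now importable next to FlowFrame.
from flowfile_frame import FlowFrame, FuzzyMapping

# threshold_score and fuzzy_type mirror the removed dataclass's defaults
# (80.0 and 'levenshtein'); right_col defaults to left_col when omitted.
mapping = FuzzyMapping(left_col="name", right_col="customer_name",
                       threshold_score=85.0, fuzzy_type="jaro_winkler")
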
flowfile_frame/expr.py
CHANGED

@@ -490,6 +490,20 @@ class Expr:
         result.agg_func = "sum"
         return result

+    def unique_counts(self):
+        """
+        Return the number of unique values in the column.
+
+        Returns
+        -------
+        Expr
+            A new expression with the unique counts
+        """
+        result_expr = self.expr.unique_counts() if self.expr is not None else None
+        result = self._create_next_expr(method_name="unique_counts", result_expr=result_expr, is_complex=self.is_complex)
+        result.agg_func = "unique_counts"
+        return result
+
     def implode(self):
         result_expr = self.expr.implode() if self.expr is not None else None
         result = self._create_next_expr(method_name="implode", result_expr=result_expr, is_complex=self.is_complex)
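For context, the new wrapper delegates to Polars' Expr.unique_counts, which returns one count per distinct value (in order of first appearance) rather than a single count of distinct values (that is n_unique). A quick plain-Polars illustration of the wrapped behaviour:

import polars as pl

df = pl.DataFrame({"grp": ["a", "a", "b", "c", "c", "c"]})
# unique_counts yields a column of counts per distinct value: [2, 1, 3]
print(df.select(pl.col("grp").unique_counts()))
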
flowfile_frame/flow_frame.py
CHANGED

@@ -5,11 +5,13 @@ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, C
 import re

 import polars as pl
-from polars._typing import (CsvEncoding)
 from flowfile_frame.lazy_methods import add_lazyframe_methods

-from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
+from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
 from collections.abc import Iterator
+
+from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+
 from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
 from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

@@ -20,8 +22,7 @@ from flowfile_frame.expr import Expr, Column, lit, col
 from flowfile_frame.selectors import Selector
 from flowfile_frame.group_frame import GroupByFrame
 from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
-                                  ensure_inputs_as_iterable, generate_node_id,
-                                  set_node_id, data as node_id_data)
+                                  ensure_inputs_as_iterable, generate_node_id, data as node_id_data)
 from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
 from flowfile_frame.utils import _check_if_convertible_to_code
 from flowfile_frame.config import logger

@@ -565,7 +566,7 @@ class FlowFrame:
             coalesce: bool = None,
             maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
             description: str = None,
-    ):
+    ) -> "FlowFrame":
         """
         Add a join operation to the Logical Plan.


@@ -2109,6 +2110,34 @@ class FlowFrame:

         return self._create_child_frame(new_node_id)

+    def fuzzy_match(self,
+                    other: "FlowFrame",
+                    fuzzy_mappings: List[FuzzyMapping],
+                    description: str = None,
+                    ) -> "FlowFrame":
+        self._ensure_same_graph(other)
+
+        # Step 3: Generate new node ID
+        new_node_id = generate_node_id()
+        node_fuzzy_match = input_schema.NodeFuzzyMatch(flow_id=self.flow_graph.flow_id,
+                                                       node_id=new_node_id,
+                                                       join_input=
+                                                       transform_schema.FuzzyMatchInput(join_mapping=fuzzy_mappings,
+                                                                                        left_select=self.columns,
+                                                                                        right_select=other.columns),
+                                                       description=description or "Fuzzy match between two FlowFrames",
+                                                       depending_on_ids=[self.node_id, other.node_id],
+                                                       )
+        self.flow_graph.add_fuzzy_match(node_fuzzy_match)
+        self._add_connection(self.node_id, new_node_id, "main")
+        other._add_connection(other.node_id, new_node_id, "right")
+        return FlowFrame(
+            data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
+            flow_graph=self.flow_graph,
+            node_id=new_node_id,
+            parent_node_id=self.node_id,
+        )
+
     def text_to_rows(
             self,
             column: str | Column,
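The new fuzzy_match method registers a NodeFuzzyMatch in the flow graph and connects the calling frame to its "main" input and other to its "right" input. A hedged usage sketch, not from the package docs: it assumes both frames are built against the same graph (fuzzy_match calls _ensure_same_graph) using create_flow_graph, which flowfile_frame already re-exports:

from flowfile_frame import FlowFrame, FuzzyMapping, create_flow_graph

graph = create_flow_graph()
left = FlowFrame({"name": ["John Smith", "Jane Doe"]}, flow_graph=graph)
right = FlowFrame({"customer": ["Jon Smith", "Jane Doe"]}, flow_graph=graph)

# One FuzzyMapping per column pair; the resulting frame depends on both inputs.
matched = left.fuzzy_match(right,
                           fuzzy_mappings=[FuzzyMapping(left_col="name", right_col="customer")])
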
flowfile_frame/flow_frame.pyi
CHANGED

@@ -80,8 +80,8 @@ class FlowFrame:

     def __ne__(self, other: object) -> typing.NoReturn: ...

-    #
-    def __new__(cls, data: typing.Union[LazyFrame, collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable'), ForwardRef('torch.Tensor')] = None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType] = None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None = None, strict: bool = True, orient: typing.Optional[typing.Literal['col', 'row']] = None, infer_schema_length: int | None = 100, nan_to_null: bool = False, flow_graph: typing.Optional[flowfile_core.flowfile.flow_graph.FlowGraph] = None, node_id: typing.Optional[int] = None, parent_node_id: typing.Optional[int] = None,
+    # Unified constructor for FlowFrame.
+    def __new__(cls, data: typing.Union[LazyFrame, collections.abc.Mapping[str, typing.Union[collections.abc.Sequence[object], collections.abc.Mapping[str, collections.abc.Sequence[object]], ForwardRef('Series')]], collections.abc.Sequence[typing.Any], ForwardRef('np.ndarray[Any, Any]'), ForwardRef('pa.Table'), ForwardRef('pd.DataFrame'), ForwardRef('ArrowArrayExportable'), ForwardRef('ArrowStreamExportable'), ForwardRef('torch.Tensor')] = None, schema: typing.Union[collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]], collections.abc.Sequence[typing.Union[str, tuple[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType'), type[int], type[float], type[bool], type[str], type['date'], type['time'], type['datetime'], type['timedelta'], type[list[typing.Any]], type[tuple[typing.Any, ...]], type[bytes], type[object], type['Decimal'], type[None], NoneType]]]], NoneType] = None, schema_overrides: collections.abc.Mapping[str, typing.Union[ForwardRef('DataTypeClass'), ForwardRef('DataType')]] | None = None, strict: bool = True, orient: typing.Optional[typing.Literal['col', 'row']] = None, infer_schema_length: int | None = 100, nan_to_null: bool = False, flow_graph: typing.Optional[flowfile_core.flowfile.flow_graph.FlowGraph] = None, node_id: typing.Optional[int] = None, parent_node_id: typing.Optional[int] = None, **kwargs) -> Self: ...

     def __repr__(self) -> Any: ...

@@ -118,9 +118,6 @@
     # Execute join using Polars code approach.
     def _execute_polars_code_join(self, other: FlowFrame, new_node_id: int, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column], left_columns: typing.Optional[typing.List[str]], right_columns: typing.Optional[typing.List[str]], how: str, suffix: str, validate: str, nulls_equal: bool, coalesce: bool, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left'], description: str) -> 'FlowFrame': ...

-    # Internal constructor to create a FlowFrame instance that wraps an
-    def _from_existing_node(self, data: LazyFrame, flow_graph: FlowGraph, node_id: int, parent_node_id: typing.Optional[int] = None) -> 'FlowFrame': ...
-
     # Generates the `input_df.sort(...)` Polars code string using pure expression strings.
     def _generate_sort_polars_code(self, pure_sort_expr_strs: typing.List[str], descending_values: typing.List[bool], nulls_last_values: typing.List[bool], multithreaded: bool, maintain_order: bool) -> str: ...

@@ -211,6 +208,8 @@
     # Get the first row of the DataFrame.
     def first(self, description: Optional[str] = None) -> 'FlowFrame': ...

+    def fuzzy_match(self, other: FlowFrame, fuzzy_mappings: typing.List[flowfile_core.schemas.transform_schema.FuzzyMap], description: str = None) -> 'FlowFrame': ...
+
     # Take every nth row in the LazyFrame and return as a new LazyFrame.
     def gather_every(self, n: int, offset: int = 0, description: Optional[str] = None) -> 'FlowFrame': ...

@@ -231,7 +230,7 @@
     def interpolate(self, description: Optional[str] = None) -> 'FlowFrame': ...

     # Add a join operation to the Logical Plan.
-    def join(self, other, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, how: str = 'inner', left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, suffix: str = '_right', validate: str = None, nulls_equal: bool = False, coalesce: bool = None, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left'] = None, description: str = None) ->
+    def join(self, other, on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, how: str = 'inner', left_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, right_on: typing.Union[typing.List[str | flowfile_frame.expr.Column], str, flowfile_frame.expr.Column] = None, suffix: str = '_right', validate: str = None, nulls_equal: bool = False, coalesce: bool = None, maintain_order: typing.Literal[None, 'left', 'right', 'left_right', 'right_left'] = None, description: str = None) -> 'FlowFrame': ...

     # Perform an asof join.
     def join_asof(self, other: FlowFrame, left_on: str | None | Expr = None, right_on: str | None | Expr = None, on: str | None | Expr = None, by_left: str | Sequence[str] | None = None, by_right: str | Sequence[str] | None = None, by: str | Sequence[str] | None = None, strategy: AsofJoinStrategy = 'backward', suffix: str = '_right', tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, coalesce: bool = True, allow_exact_matches: bool = True, check_sortedness: bool = True, description: Optional[str] = None) -> 'FlowFrame': ...
flowfile_worker/funcs.py
CHANGED

@@ -2,8 +2,9 @@ import polars as pl
 import io
 from typing import List, Dict, Callable
 from multiprocessing import Array, Value, Queue
-
-from
+
+from pl_fuzzy_frame_match import fuzzy_match_dfs, FuzzyMapping
+
 from flowfile_worker.flow_logger import get_worker_logger
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
 from flowfile_worker.external_sources.sql_source.main import write_df_to_database

@@ -33,7 +34,10 @@ def fuzzy_join_task(left_serializable_object: bytes, right_serializable_object:
     flowfile_logger.info("Starting fuzzy join operation")
     left_df = pl.LazyFrame.deserialize(io.BytesIO(left_serializable_object))
     right_df = pl.LazyFrame.deserialize(io.BytesIO(right_serializable_object))
-    fuzzy_match_result = fuzzy_match_dfs(left_df,
+    fuzzy_match_result = fuzzy_match_dfs(left_df=left_df,
+                                         right_df=right_df,
+                                         fuzzy_maps=fuzzy_maps,
+                                         logger=flowfile_logger)
     flowfile_logger.info("Fuzzy join operation completed successfully")
     fuzzy_match_result.write_ipc(file_path)
     with progress.get_lock():
flowfile_worker/models.py
CHANGED

@@ -1,7 +1,9 @@
 from pydantic import BaseModel
 from typing import Optional, Literal, Any
 from base64 import decodebytes
-
+
+from pl_fuzzy_frame_match import FuzzyMapping
+
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
 from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings

flowfile_worker/polars_fuzzy_match/matcher.py
REMOVED

@@ -1,435 +0,0 @@
-import polars as pl
-from typing import List, Optional, Tuple
-import tempfile
-from logging import Logger
-
-from flowfile_worker.polars_fuzzy_match.process import calculate_and_parse_fuzzy, process_fuzzy_frames
-from flowfile_worker.polars_fuzzy_match.pre_process import pre_process_for_fuzzy_matching
-from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
-from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp
-from flowfile_worker.utils import collect_lazy_frame
-import polars_simed as ps
-
-
-HAS_POLARS_SIM = True
-
-
-def ensure_left_is_larger(left_df: pl.DataFrame,
-                          right_df: pl.DataFrame,
-                          left_col_name: str,
-                          right_col_name: str) -> tuple:
-    """
-    Ensures that the left dataframe is always the larger one.
-    If the right dataframe is larger, swaps them.
-
-    Args:
-        left_df: The left dataframe
-        right_df: The right dataframe
-        left_col_name: Column name for the left dataframe
-        right_col_name: Column name for the right dataframe
-
-    Returns:
-        tuple: (left_df, right_df, left_col_name, right_col_name)
-    """
-    left_frame_len = left_df.select(pl.len())[0, 0]
-    right_frame_len = right_df.select(pl.len())[0, 0]
-
-    # Swap dataframes if right is larger than left
-    if right_frame_len > left_frame_len:
-        return right_df, left_df, right_col_name, left_col_name
-
-    return left_df, right_df, left_col_name, right_col_name
-
-
-def split_dataframe(df: pl.DataFrame, max_chunk_size: int = 500_000) -> List[pl.DataFrame]:
-    """
-    Split a Polars DataFrame into multiple DataFrames with a maximum size.
-
-    Args:
-        df: The Polars DataFrame to split
-        max_chunk_size: Maximum number of rows per chunk (default: 500,000)
-
-    Returns:
-        List of Polars DataFrames, each containing at most max_chunk_size rows
-    """
-    total_rows = df.select(pl.len())[0, 0]
-
-    # If DataFrame is smaller than max_chunk_size, return it as is
-    if total_rows <= max_chunk_size:
-        return [df]
-
-    # Calculate number of chunks needed
-    num_chunks = (total_rows + max_chunk_size - 1) // max_chunk_size  # Ceiling division
-
-    chunks = []
-    for i in range(num_chunks):
-        start_idx = i * max_chunk_size
-        end_idx = min((i + 1) * max_chunk_size, total_rows)
-
-        # Extract chunk using slice
-        chunk = df.slice(start_idx, end_idx - start_idx)
-        chunks.append(chunk)
-
-    return chunks
-
-
-def cross_join_large_files(left_fuzzy_frame: pl.LazyFrame,
-                           right_fuzzy_frame: pl.LazyFrame,
-                           left_col_name: str,
-                           right_col_name: str,
-                           flowfile_logger: Logger,
-                           ) -> pl.LazyFrame:
-    if not HAS_POLARS_SIM:
-        raise Exception('The polars-sim library is required to perform this operation.')
-
-    left_df = collect_lazy_frame(left_fuzzy_frame)
-    right_df = collect_lazy_frame(right_fuzzy_frame)
-
-    left_df, right_df, left_col_name, right_col_name = ensure_left_is_larger(
-        left_df, right_df, left_col_name, right_col_name
-    )
-    left_chunks = split_dataframe(left_df, max_chunk_size=500_000)  # Reduced chunk size
-    flowfile_logger.info(f"Splitting left dataframe into {len(left_chunks)} chunks.")
-    df_matches = []
-
-    # Process each chunk combination with error handling
-    for i, left_chunk in enumerate(left_chunks):
-        chunk_matches = ps.join_sim(
-            left=left_chunk,
-            right=right_df,
-            left_on=left_col_name,
-            right_on=right_col_name,
-            top_n=100,
-            add_similarity=False,
-        )
-        flowfile_logger.info(f"Processed chunk {int(i)} with {len(chunk_matches)} matches.")
-        df_matches.append(chunk_matches)
-
-
-    # Combine all matches
-    if df_matches:
-        return pl.concat(df_matches).lazy()
-    else:
-        columns = list(set(left_df.columns).union(set(right_df.columns)))
-        return pl.DataFrame(schema={col: pl.Null for col in columns}).lazy()
-
-
-def cross_join_small_files(left_df: pl.LazyFrame, right_df: pl.LazyFrame) -> pl.LazyFrame:
-    return left_df.join(right_df, how='cross')
-
-
-def cross_join_filter_existing_fuzzy_results(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
-                                             existing_matches: pl.LazyFrame,
-                                             left_col_name: str, right_col_name: str):
-    """
-    Process and filter fuzzy matching results by joining dataframes using existing match indices.
-
-    This function takes previously identified fuzzy matches (existing_matches) and performs
-    a series of operations to create a refined dataset of matches between the left and right
-    dataframes, preserving index relationships.
-
-    Parameters:
-    -----------
-    left_df : pl.LazyFrame
-        The left dataframe containing records to be matched.
-    right_df : pl.LazyFrame
-        The right dataframe containing records to be matched against.
-    existing_matches : pl.LazyFrame
-        A dataframe containing the indices of already identified matches between
-        left_df and right_df, with columns '__left_index' and '__right_index'.
-    left_col_name : str
-        The column name from left_df to include in the result.
-    right_col_name : str
-        The column name from right_df to include in the result.
-
-    Returns:
-    --------
-    pl.LazyFrame
-        A dataframe containing the unique matches between left_df and right_df,
-        with index information for both dataframes preserved. The resulting dataframe
-        includes the specified columns from both dataframes along with their respective
-        index aggregations.
-
-    Notes:
-    ------
-    The function performs these operations:
-    1. Join existing matches with both dataframes using their respective indices
-    2. Select only the relevant columns and remove duplicates
-    3. Create aggregations that preserve the relationship between values and their indices
-    4. Join these aggregations back to create the final result set
-    """
-    joined_df = (existing_matches
-                 .select(['__left_index', '__right_index'])
-                 .join(left_df, on='__left_index')
-                 .join(right_df, on='__right_index')
-                 .select(left_col_name, right_col_name, '__left_index', '__right_index')
-                 )
-    return joined_df.group_by([left_col_name, right_col_name]).agg('__left_index', '__right_index')
-
-
-def cross_join_no_existing_fuzzy_results(left_df: pl.LazyFrame, right_df: pl.LazyFrame, left_col_name: str,
-                                         right_col_name: str, temp_dir_ref: str,
-                                         flowfile_logger: Logger) -> pl.LazyFrame:
-    """
-    Generate fuzzy matching results by performing a cross join between dataframes.
-
-    This function processes the input dataframes, determines the appropriate cross join method
-    based on the size of the resulting cartesian product, and returns the cross-joined results
-    for fuzzy matching when no existing matches are provided.
-
-    Parameters:
-    -----------
-    left_df : pl.LazyFrame
-        The left dataframe containing records to be matched.
-    right_df : pl.LazyFrame
-        The right dataframe containing records to be matched against.
-    left_col_name : str
-        The column name from left_df to use for fuzzy matching.
-    right_col_name : str
-        The column name from right_df to use for fuzzy matching.
-    temp_dir_ref : str
-        Reference to a temporary directory where intermediate results can be stored
-        during processing of large dataframes.
-
-    Returns:
-    --------
-    pl.LazyFrame
-        A dataframe containing the cross join results of left_df and right_df,
-        prepared for fuzzy matching operations.
-
-    Notes:
-    ------
-    The function performs these operations:
-    1. Processes input frames using the process_fuzzy_frames helper function
-    2. Calculates the size of the cartesian product to determine processing approach
-    3. Uses either cross_join_large_files or cross_join_small_files based on the size:
-       - For cartesian products > 100M but < 1T (or 10M without polars-sim), uses large file method
-       - For smaller products, uses the small file method
-    4. Raises an exception if the cartesian product exceeds the maximum allowed size
-
-    Raises:
-    -------
-    Exception
-        If the cartesian product of the two dataframes exceeds the maximum allowed size
-        (1 trillion with polars-sim, 100 million without).
-    """
-    (left_fuzzy_frame,
-     right_fuzzy_frame,
-     left_col_name,
-     right_col_name,
-     len_left_df,
-     len_right_df) = process_fuzzy_frames(left_df=left_df, right_df=right_df, left_col_name=left_col_name,
-                                          right_col_name=right_col_name, temp_dir_ref=temp_dir_ref)
-    cartesian_size = len_left_df * len_right_df
-    max_size = 100_000_000_000_000 if HAS_POLARS_SIM else 10_000_000
-    if cartesian_size > max_size:
-        flowfile_logger.error(f'The cartesian product of the two dataframes is too large to process: {cartesian_size}')
-        raise Exception('The cartesian product of the two dataframes is too large to process.')
-    if cartesian_size > 100_000_000:
-        flowfile_logger.info('Performing approximate fuzzy match for large dataframes to reduce memory usage.')
-        cross_join_frame = cross_join_large_files(left_fuzzy_frame, right_fuzzy_frame, left_col_name=left_col_name,
-                                                  right_col_name=right_col_name, flowfile_logger=flowfile_logger)
-    else:
-        cross_join_frame = cross_join_small_files(left_fuzzy_frame, right_fuzzy_frame)
-    return cross_join_frame
-
-
-def unique_df_large(_df: pl.DataFrame | pl.LazyFrame, cols: Optional[List[str]] = None) -> pl.DataFrame:
-    """
-    Efficiently compute unique rows in large dataframes by partitioning.
-
-    This function processes large dataframes by first partitioning them by a selected column,
-    then finding unique combinations within each partition before recombining the results.
-    This approach is more memory-efficient for large datasets than calling .unique() directly.
-
-    Parameters:
-    -----------
-    _df : pl.DataFrame | pl.LazyFrame
-        The input dataframe to process. Can be either a Polars DataFrame or LazyFrame.
-    cols : Optional[List[str]]
-        The list of columns to consider when finding unique rows. If None, all columns
-        are used. The first column in this list is used as the partition column.
-
-    Returns:
-    --------
-    pl.DataFrame
-        A dataframe containing only the unique rows from the input dataframe,
-        based on the specified columns.
-
-    Notes:
-    ------
-    The function performs these operations:
-    1. Converts LazyFrame to DataFrame if necessary
-    2. Partitions the dataframe by the first column in cols (or the first column of the dataframe if cols is None)
-    3. Applies the unique operation to each partition based on the remaining columns
-    4. Concatenates the results back into a single dataframe
-    5. Frees memory by deleting intermediate objects
-
-    This implementation uses tqdm to provide a progress bar during processing,
-    which is particularly helpful for large datasets where the operation may take time.
-    """
-    if isinstance(_df, pl.LazyFrame):
-        _df = collect_lazy_frame(_df)
-    from tqdm import tqdm
-    partition_col = cols[0] if cols is not None else _df.columns[0]
-    other_cols = cols[1:] if cols is not None else _df.columns[1:]
-    partitioned_df = _df.partition_by(partition_col)
-    df = pl.concat([partition.unique(other_cols) for partition in tqdm(partitioned_df)])
-    del partitioned_df, _df
-    return df
-
-
-def combine_matches(matching_dfs: List[pl.LazyFrame]):
-    all_matching_indexes = matching_dfs[-1].select('__left_index', '__right_index')
-    for matching_df in matching_dfs:
-        all_matching_indexes = all_matching_indexes.join(matching_df, on=['__left_index', '__right_index'])
-    return all_matching_indexes
-
-
-def add_index_column(df: pl.LazyFrame, column_name: str, tempdir: str):
-    return cache_polars_frame_to_temp(df.with_row_index(name=column_name), tempdir)
-
-
-def process_fuzzy_mapping(
-        fuzzy_map: FuzzyMapping,
-        left_df: pl.LazyFrame,
-        right_df: pl.LazyFrame,
-        existing_matches: Optional[pl.LazyFrame],
-        local_temp_dir_ref: str,
-        i: int,
-        flowfile_logger: Logger,
-        existing_number_of_matches: Optional[int] = None
-) -> Tuple[pl.LazyFrame, int]:
-    """
-    Process a single fuzzy mapping to generate matching dataframes.
-
-    Args:
-        fuzzy_map: The fuzzy mapping configuration containing match columns and thresholds
-        left_df: Left dataframe with index column
-        right_df: Right dataframe with index column
-        existing_matches: Previously computed matches (or None)
-        local_temp_dir_ref: Temporary directory reference for caching interim results
-        i: Index of the current fuzzy mapping
-        flowfile_logger: Logger instance for progress tracking
-        existing_number_of_matches: Number of existing matches (if available)
-
-    Returns:
-        Tuple[pl.LazyFrame, int]: The final matching dataframe and the number of matches
-    """
-    # Determine join strategy based on existing matches
-    if existing_matches is not None:
-        existing_matches = existing_matches.select('__left_index', '__right_index')
-        flowfile_logger.info(f'Filtering existing fuzzy matches for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-        cross_join_frame = cross_join_filter_existing_fuzzy_results(
-            left_df=left_df,
-            right_df=right_df,
-            existing_matches=existing_matches,
-            left_col_name=fuzzy_map.left_col,
-            right_col_name=fuzzy_map.right_col
-        )
-    else:
-        flowfile_logger.info(f'Performing fuzzy match for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-        cross_join_frame = cross_join_no_existing_fuzzy_results(
-            left_df=left_df,
-            right_df=right_df,
-            left_col_name=fuzzy_map.left_col,
-            right_col_name=fuzzy_map.right_col,
-            temp_dir_ref=local_temp_dir_ref,
-            flowfile_logger=flowfile_logger
-        )
-
-    # Calculate fuzzy match scores
-    flowfile_logger.info(f'Calculating fuzzy match for {fuzzy_map.left_col} and {fuzzy_map.right_col}')
-    matching_df = calculate_and_parse_fuzzy(
-        mapping_table=cross_join_frame,
-        left_col_name=fuzzy_map.left_col,
-        right_col_name=fuzzy_map.right_col,
-        fuzzy_method=fuzzy_map.fuzzy_type,
-        th_score=fuzzy_map.reversed_threshold_score
-    )
-    if existing_matches is not None:
-        matching_df = matching_df.join(existing_matches, on=['__left_index', '__right_index'])
-    matching_df = cache_polars_frame_to_temp(matching_df, local_temp_dir_ref)
-    if existing_number_of_matches is None or existing_number_of_matches > 100_000_000:
-        existing_number_of_matches = matching_df.select(pl.len()).collect()[0, 0]
-    if existing_number_of_matches > 100_000_000:
-        return unique_df_large(matching_df.rename({'s': f'fuzzy_score_{i}'})).lazy(), existing_number_of_matches
-    else:
-        return matching_df.rename({'s': f'fuzzy_score_{i}'}).unique(), existing_number_of_matches
-
-
-def perform_all_fuzzy_matches(left_df: pl.LazyFrame,
-                              right_df: pl.LazyFrame,
-                              fuzzy_maps: List[FuzzyMapping],
-                              flowfile_logger: Logger,
-                              local_temp_dir_ref: str,
-                              ) -> List[pl.LazyFrame]:
-    matching_dfs = []
-    existing_matches = None
-    existing_number_of_matches = None
-    for i, fuzzy_map in enumerate(fuzzy_maps):
-        existing_matches, existing_number_of_matches = process_fuzzy_mapping(
-            fuzzy_map=fuzzy_map,
-            left_df=left_df,
-            right_df=right_df,
-            existing_matches=existing_matches,
-            local_temp_dir_ref=local_temp_dir_ref,
-            i=i,
-            flowfile_logger=flowfile_logger,
-            existing_number_of_matches=existing_number_of_matches
-        )
-        matching_dfs.append(existing_matches)
-    return matching_dfs
-
-
-def fuzzy_match_dfs(
-        left_df: pl.LazyFrame,
-        right_df: pl.LazyFrame,
-        fuzzy_maps: List[FuzzyMapping],
-        flowfile_logger: Logger
-) -> pl.DataFrame:
-    """
-    Perform fuzzy matching between two dataframes using multiple fuzzy mapping configurations.
-
-    Args:
-        left_df: Left dataframe to be matched
-        right_df: Right dataframe to be matched
-        fuzzy_maps: List of fuzzy mapping configurations
-        flowfile_logger: Logger instance for tracking progress
-
-    Returns:
-        pl.DataFrame: The final matched dataframe with all fuzzy scores
-    """
-    left_df, right_df, fuzzy_maps = pre_process_for_fuzzy_matching(left_df, right_df, fuzzy_maps, flowfile_logger)
-
-    # Create a temporary directory for caching intermediate results
-    local_temp_dir = tempfile.TemporaryDirectory()
-    local_temp_dir_ref = local_temp_dir.name
-
-    # Add index columns to both dataframes
-    left_df = add_index_column(left_df, '__left_index', local_temp_dir_ref)
-    right_df = add_index_column(right_df, '__right_index', local_temp_dir_ref)
-
-    matching_dfs = perform_all_fuzzy_matches(left_df, right_df, fuzzy_maps, flowfile_logger, local_temp_dir_ref)
-
-    # Combine all matches
-    if len(matching_dfs) > 1:
-        flowfile_logger.info('Combining fuzzy matches')
-        all_matches_df = combine_matches(matching_dfs)
-    else:
-        flowfile_logger.info('Caching fuzzy matches')
-        all_matches_df = cache_polars_frame_to_temp(matching_dfs[0], local_temp_dir_ref)
-
-    # Join matches with original dataframes
-    flowfile_logger.info('Joining fuzzy matches with original dataframes')
-    output_df = collect_lazy_frame(
-        (left_df.join(all_matches_df, on='__left_index')
-         .join(right_df, on='__right_index')
-         .drop('__right_index', '__left_index'))
-    )
-
-    # Clean up temporary files
-    flowfile_logger.info('Cleaning up temporary files')
-    local_temp_dir.cleanup()
-
-    return output_df
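The removed matcher picked its candidate-generation strategy from the cartesian-product size of the two inputs: above 100M candidate pairs it switched to chunked approximate matching via polars_simed, and it aborted outright above a hard cap (1e14 with polars-sim available, 10M without). A small standalone sketch of that decision logic; the function name is illustrative, but the thresholds are copied from the deleted cross_join_no_existing_fuzzy_results:

def choose_cross_join_strategy(len_left: int, len_right: int,
                               has_polars_sim: bool = True) -> str:
    # Thresholds taken verbatim from the removed module.
    cartesian_size = len_left * len_right
    max_size = 100_000_000_000_000 if has_polars_sim else 10_000_000
    if cartesian_size > max_size:
        raise Exception('The cartesian product of the two dataframes is too large to process.')
    return "chunked_approximate" if cartesian_size > 100_000_000 else "plain_cross_join"

# 200_000 x 1_000 rows -> 2e8 candidate pairs -> chunked approximate path
assert choose_cross_join_strategy(200_000, 1_000) == "chunked_approximate"
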
flowfile_worker/polars_fuzzy_match/models.py
REMOVED

@@ -1,36 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional, Literal
-
-FuzzyTypeLiteral = Literal['levenshtein', 'jaro', 'jaro_winkler', 'hamming', 'damerau_levenshtein', 'indel']
-
-
-@dataclass
-class JoinMap:
-    left_col: str
-    right_col: str
-
-
-@dataclass
-class FuzzyMapping(JoinMap):
-    threshold_score: float = 80.0
-    fuzzy_type: FuzzyTypeLiteral = 'levenshtein'
-    perc_unique: float = 0.0
-    output_column_name: Optional[str] = None
-    valid: bool = True
-
-    def __init__(self, left_col: str, right_col: str = None, threshold_score: float = 80.0,
-                 fuzzy_type: FuzzyTypeLiteral = 'levenshtein', perc_unique: float = 0, output_column_name: str = None,
-                 valid: bool = True):
-        if right_col is None:
-            right_col = left_col
-        self.valid = valid
-        self.left_col = left_col
-        self.right_col = right_col
-        self.threshold_score = threshold_score
-        self.fuzzy_type = fuzzy_type
-        self.perc_unique = perc_unique
-        self.output_col_name = output_column_name if output_column_name is not None else f'fuzzy_score_{left_col}_{right_col}'
-
-    @property
-    def reversed_threshold_score(self) -> float:
-        return ((int(self.threshold_score) - 100) * -1) / 100
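The removed reversed_threshold_score property converted the user-facing percentage threshold into the 0-to-1 distance-style score used internally: for threshold_score = 80, ((80 - 100) * -1) / 100 = 0.2. A one-line check of that arithmetic:

# a threshold_score of 80.0 maps to an internal score of 0.2
assert ((int(80.0) - 100) * -1) / 100 == 0.2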
|