lionagi 0.15.14__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lionagi/libs/validate/fuzzy_match_keys.py +5 -182
- lionagi/libs/validate/string_similarity.py +6 -331
- lionagi/ln/__init__.py +56 -66
- lionagi/ln/_async_call.py +13 -10
- lionagi/ln/_hash.py +33 -8
- lionagi/ln/_list_call.py +2 -35
- lionagi/ln/_to_list.py +51 -28
- lionagi/ln/_utils.py +156 -0
- lionagi/ln/concurrency/__init__.py +39 -31
- lionagi/ln/concurrency/_compat.py +65 -0
- lionagi/ln/concurrency/cancel.py +92 -109
- lionagi/ln/concurrency/errors.py +17 -17
- lionagi/ln/concurrency/patterns.py +249 -206
- lionagi/ln/concurrency/primitives.py +257 -216
- lionagi/ln/concurrency/resource_tracker.py +42 -155
- lionagi/ln/concurrency/task.py +55 -73
- lionagi/ln/concurrency/throttle.py +3 -0
- lionagi/ln/concurrency/utils.py +1 -0
- lionagi/ln/fuzzy/__init__.py +15 -0
- lionagi/ln/{_extract_json.py → fuzzy/_extract_json.py} +22 -9
- lionagi/ln/{_fuzzy_json.py → fuzzy/_fuzzy_json.py} +14 -8
- lionagi/ln/fuzzy/_fuzzy_match.py +172 -0
- lionagi/ln/fuzzy/_fuzzy_validate.py +46 -0
- lionagi/ln/fuzzy/_string_similarity.py +332 -0
- lionagi/ln/{_models.py → types.py} +153 -4
- lionagi/operations/flow.py +2 -1
- lionagi/operations/operate/operate.py +26 -16
- lionagi/protocols/contracts.py +46 -0
- lionagi/protocols/generic/event.py +6 -6
- lionagi/protocols/generic/processor.py +9 -5
- lionagi/protocols/ids.py +82 -0
- lionagi/protocols/types.py +10 -12
- lionagi/utils.py +34 -64
- lionagi/version.py +1 -1
- {lionagi-0.15.14.dist-info → lionagi-0.16.0.dist-info}/METADATA +4 -2
- {lionagi-0.15.14.dist-info → lionagi-0.16.0.dist-info}/RECORD +38 -31
- lionagi/ln/_types.py +0 -146
- {lionagi-0.15.14.dist-info → lionagi-0.16.0.dist-info}/WHEEL +0 -0
- {lionagi-0.15.14.dist-info → lionagi-0.16.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,172 @@
|
|
1
|
+
from collections.abc import Sequence
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from typing import Any, ClassVar, Literal
|
4
|
+
|
5
|
+
from ..types import KeysDict, Params, Unset
|
6
|
+
from ._string_similarity import (
|
7
|
+
SIMILARITY_ALGO_MAP,
|
8
|
+
SIMILARITY_TYPE,
|
9
|
+
SimilarityFunc,
|
10
|
+
string_similarity,
|
11
|
+
)
|
12
|
+
|
13
|
+
__all__ = (
|
14
|
+
"fuzzy_match_keys",
|
15
|
+
"FuzzyMatchKeysParams",
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
HandleUnmatched = Literal["ignore", "raise", "remove", "fill", "force"]
|
20
|
+
|
21
|
+
|
22
|
+
def fuzzy_match_keys(
    d_: dict[str, Any],
    keys: Sequence[str] | KeysDict,
    /,
    *,
    similarity_algo: SIMILARITY_TYPE | SimilarityFunc = "jaro_winkler",
    similarity_threshold: float = 0.85,
    fuzzy_match: bool = True,
    handle_unmatched: HandleUnmatched = "ignore",
    fill_value: Any = None,
    fill_mapping: dict[str, Any] | None = None,
    strict: bool = False,
) -> dict[str, Any]:
    """
    Validate and correct dictionary keys based on expected keys using string similarity.

    Exact key matches are resolved first. When ``fuzzy_match`` is enabled, each
    remaining input key (in the insertion order of ``d_``) is then compared
    against the still-unclaimed expected keys, so results are reproducible
    across runs.

    Args:
        d_: The dictionary to validate and correct keys for.
        keys: Expected keys, given either as a sequence of strings (list,
            tuple, ...) or as a mapping whose keys are the expected names.
        similarity_algo: Name of a built-in similarity algorithm or a custom
            ``(str, str) -> float`` function.
        similarity_threshold: Minimum similarity score for fuzzy matching.
        fuzzy_match: If True, use fuzzy matching for key correction.
        handle_unmatched: Specifies how to handle unmatched keys:
            - "ignore": Keep unmatched keys in output.
            - "raise": Raise ValueError if unmatched keys exist.
            - "remove": Remove unmatched keys from output.
            - "fill": Fill missing expected keys and keep unmatched input keys.
            - "force": Fill missing expected keys and drop unmatched input keys.
        fill_value: Default value for filling missing expected keys.
        fill_mapping: Per-key values that take precedence over ``fill_value``.
        strict: If True, raise ValueError if any expected key was not matched
            by an input key (checked even after filling).

    Returns:
        A new dictionary with validated and corrected keys.

    Raises:
        ValueError: If the threshold is out of range, the algorithm name is
            unknown, or validation fails for the selected mode.
        TypeError: If ``d_`` is not a dict or ``keys`` is None.
        AttributeError: If ``keys`` is neither a non-string sequence nor an
            object exposing ``.keys()``.
    """
    # Input validation
    if not isinstance(d_, dict):
        raise TypeError("First argument must be a dictionary")
    if keys is None:
        raise TypeError("Keys argument cannot be None")
    if not 0.0 <= similarity_threshold <= 1.0:
        raise ValueError("similarity_threshold must be between 0.0 and 1.0")

    # Extract expected keys, preserving their declared order and dropping
    # duplicates. Any non-string sequence is accepted (not only ``list``).
    if isinstance(keys, Sequence) and not isinstance(keys, (str, bytes)):
        expected = list(dict.fromkeys(keys))
    else:
        # Mappings expose .keys(); anything else raises AttributeError here,
        # exactly as it did before.
        expected = list(dict.fromkeys(keys.keys()))
    if not expected:
        return d_.copy()  # Nothing to validate against
    fields_set = set(expected)

    corrected_out: dict[str, Any] = {}
    matched_expected: set = set()
    matched_input: set = set()

    # Resolve the similarity function
    if isinstance(similarity_algo, str):
        if similarity_algo not in SIMILARITY_ALGO_MAP:
            raise ValueError(
                f"Unknown similarity algorithm: {similarity_algo}"
            )
        similarity_func = SIMILARITY_ALGO_MAP[similarity_algo]
    else:
        similarity_func = similarity_algo

    # First pass: exact matches
    for key in d_:
        if key in fields_set:
            corrected_out[key] = d_[key]
            matched_expected.add(key)
            matched_input.add(key)

    # Second pass: fuzzy matching, in deterministic (insertion) order
    if fuzzy_match:
        remaining_expected = [k for k in expected if k not in matched_expected]
        for key in d_:
            if key in matched_input:
                continue
            if not remaining_expected:
                # string_similarity rejects an empty candidate list
                break

            match = string_similarity(
                key,
                remaining_expected,
                algorithm=similarity_func,
                threshold=similarity_threshold,
                return_most_similar=True,
            )
            if match:
                corrected_out[match] = d_[key]
                matched_expected.add(match)
                matched_input.add(key)
                remaining_expected.remove(match)

    # Whatever is left over on either side, in a stable order
    unmatched_input = [k for k in d_ if k not in matched_input]
    unmatched_expected = [k for k in expected if k not in matched_expected]

    if handle_unmatched == "raise" and unmatched_input:
        raise ValueError(f"Unmatched keys found: {set(unmatched_input)}")

    elif handle_unmatched == "ignore":
        for key in unmatched_input:
            corrected_out[key] = d_[key]

    elif handle_unmatched in ("fill", "force"):
        # Fill missing expected keys
        for key in unmatched_expected:
            if fill_mapping and key in fill_mapping:
                corrected_out[key] = fill_mapping[key]
            else:
                corrected_out[key] = fill_value

        # "fill" additionally keeps the unmatched original keys
        if handle_unmatched == "fill":
            for key in unmatched_input:
                corrected_out[key] = d_[key]

    # Strict mode looks at what the *input* failed to supply
    if strict and unmatched_expected:
        raise ValueError(
            f"Missing required keys: {set(unmatched_expected)}"
        )

    return corrected_out
|
152
|
+
|
153
|
+
|
154
|
+
@dataclass(slots=True, init=False, frozen=True)
class FuzzyMatchKeysParams(Params):
    """Frozen bundle of keyword options for ``fuzzy_match_keys``.

    An instance is callable: ``params(d_, keys)`` forwards to
    ``fuzzy_match_keys`` with the options returned by ``default_kw()``.
    NOTE(review): ``default_kw`` and construction come from ``Params``, which
    is defined outside this view; presumably fields left at ``Unset`` are
    omitted from the forwarded keywords -- confirm against ``Params``.
    """

    # Class-level configuration consumed by the ``Params`` base class.
    _none_as_sentinel: ClassVar[bool] = False
    _func: ClassVar[Any] = fuzzy_match_keys

    # Similarity settings
    similarity_algo: SIMILARITY_TYPE | SimilarityFunc = "jaro_winkler"
    similarity_threshold: float = 0.85

    # Matching behaviour
    fuzzy_match: bool = True
    handle_unmatched: HandleUnmatched = "ignore"

    # Fill / strictness options
    fill_value: Any = Unset
    fill_mapping: dict[str, Any] | Any = Unset
    strict: bool = False

    def __call__(
        self, d_: dict[str, Any], keys: Sequence[str] | KeysDict
    ) -> dict[str, Any]:
        """Run ``fuzzy_match_keys`` on ``d_``/``keys`` with the stored options."""
        options = self.default_kw()
        return fuzzy_match_keys(d_, keys, **options)
|
@@ -0,0 +1,46 @@
|
|
1
|
+
from pydantic import BaseModel
|
2
|
+
|
3
|
+
from lionagi._errors import ValidationError
|
4
|
+
|
5
|
+
from ._extract_json import extract_json
|
6
|
+
from ._fuzzy_match import FuzzyMatchKeysParams, fuzzy_match_keys
|
7
|
+
|
8
|
+
__all__ = ("fuzzy_validate_pydantic",)
|
9
|
+
|
10
|
+
|
11
|
+
def fuzzy_validate_pydantic(
    text,
    /,
    model_type: type[BaseModel],
    fuzzy_parse: bool = True,
    fuzzy_match: bool = False,
    fuzzy_match_params: FuzzyMatchKeysParams | dict | None = None,
):
    """Extract JSON from ``text`` and validate it into ``model_type``.

    Args:
        text: Raw input handed to ``extract_json``.
        model_type: Pydantic model class used for validation.
        fuzzy_parse: Forwarded to ``extract_json`` as ``fuzzy_parse``.
        fuzzy_match: If True, run ``fuzzy_match_keys`` against the model's
            field names before validating.
        fuzzy_match_params: Options for the key matching step. ``None`` uses
            ``handle_unmatched="remove"``; a dict is expanded as keyword
            arguments; a ``FuzzyMatchKeysParams`` instance is called directly.

    Returns:
        The result of ``model_type.model_validate`` on the extracted data.

    Raises:
        ValidationError: If JSON extraction or model validation fails.
        TypeError: If ``fuzzy_match_params`` has an unsupported type.
    """
    try:
        model_data = extract_json(text, fuzzy_parse=fuzzy_parse)
    except Exception as e:
        raise ValidationError(
            f"Failed to extract valid JSON from model response: {e}"
        ) from e

    if fuzzy_match:
        expected_fields = model_type.model_fields
        if fuzzy_match_params is None:
            model_data = fuzzy_match_keys(
                model_data, expected_fields, handle_unmatched="remove"
            )
        elif isinstance(fuzzy_match_params, dict):
            model_data = fuzzy_match_keys(
                model_data, expected_fields, **fuzzy_match_params
            )
        elif isinstance(fuzzy_match_params, FuzzyMatchKeysParams):
            model_data = fuzzy_match_params(model_data, expected_fields)
        else:
            # The message names the actual parameter so callers can find it.
            raise TypeError(
                "fuzzy_match_params must be a dict or FuzzyMatchKeysParams instance"
            )

    try:
        return model_type.model_validate(model_data)
    except Exception as e:
        raise ValidationError(f"Validation failed: {e}") from e
|
@@ -0,0 +1,332 @@
|
|
1
|
+
# Copyright (c) 2023 - 2025, HaiyangLi <quantocean.li at gmail dot com>
|
2
|
+
#
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
4
|
+
|
5
|
+
from collections.abc import Callable, Sequence
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from difflib import SequenceMatcher
|
8
|
+
from itertools import product
|
9
|
+
from typing import Literal
|
10
|
+
|
11
|
+
__all__ = ("string_similarity",)
|
12
|
+
|
13
|
+
|
14
|
+
def cosine_similarity(s1: str, s2: str) -> float:
    """Calculate a cosine-style similarity between two strings.

    Each string is reduced to its set of distinct characters; the score is the
    number of shared characters divided by the geometric mean of the two set
    sizes. Character order and repetition are therefore ignored.

    Args:
        s1: First input string
        s2: Second input string

    Returns:
        float: Similarity score between 0 and 1 (0.0 if either string is empty)
    """
    # A non-empty string always yields a non-empty character set, so this one
    # guard also protects the division below.
    if not s1 or not s2:
        return 0.0

    set1, set2 = set(s1), set(s2)
    shared = set1 & set2
    return len(shared) / ((len(set1) * len(set2)) ** 0.5)
|
34
|
+
|
35
|
+
|
36
|
+
def hamming_similarity(s1: str, s2: str) -> float:
    """Return the fraction of positions at which two strings agree.

    Only equal-length, non-empty strings are comparable; any other pair
    scores 0.0.

    Args:
        s1: First input string
        s2: Second input string

    Returns:
        float: Hamming similarity score between 0 and 1
    """
    if not (s1 and s2) or len(s1) != len(s2):
        return 0.0

    agreeing = [left == right for left, right in zip(s1, s2)].count(True)
    return agreeing / len(s1)
|
54
|
+
|
55
|
+
|
56
|
+
def jaro_distance(s: str, t: str) -> float:
    """Compute the Jaro similarity of two strings.

    Characters match when they are equal and lie within a sliding window of
    ``max(len(s), len(t)) // 2 - 1`` positions of each other (never less than
    zero); half of the out-of-order matches count as transpositions.

    Args:
        s: First input string
        t: Second input string

    Returns:
        float: Score between 0 and 1 (1.0 for two empty strings, 0.0 when
        exactly one is empty or nothing matches)
    """
    len_s, len_t = len(s), len(t)

    if not len_s and not len_t:
        return 1.0
    if not len_s or not len_t:
        return 0.0

    window = max(0, max(len_s, len_t) // 2 - 1)

    flagged_s = [False] * len_s
    flagged_t = [False] * len_t
    common = 0

    # Greedily pair each character of s with the first free equal character
    # of t inside the window.
    for i, ch in enumerate(s):
        lo = max(0, i - window)
        hi = min(i + window + 1, len_t)
        for j in range(lo, hi):
            if not flagged_t[j] and ch == t[j]:
                flagged_s[i] = flagged_t[j] = True
                common += 1
                break

    if common == 0:
        return 0.0

    # Walk the matched characters of both strings in order; every positional
    # disagreement is half a transposition.
    matched_s = [c for c, hit in zip(s, flagged_s) if hit]
    matched_t = [c for c, hit in zip(t, flagged_t) if hit]
    swaps = sum(a != b for a, b in zip(matched_s, matched_t)) // 2

    return (
        common / len_s
        + common / len_t
        + (common - swaps) / common
    ) / 3.0
|
116
|
+
|
117
|
+
|
118
|
+
def jaro_winkler_similarity(s: str, t: str, scaling: float = 0.1) -> float:
    """Compute the Jaro-Winkler similarity of two strings.

    The Jaro score is boosted in proportion to the length of the shared
    prefix, of which at most four characters are counted.

    Args:
        s: First input string
        t: Second input string
        scaling: Weight given to each shared prefix character

    Returns:
        float: Jaro-Winkler similarity score between 0 and 1

    Raises:
        ValueError: If scaling factor is not between 0 and 0.25
    """
    if not (0 <= scaling <= 0.25):
        raise ValueError("Scaling factor must be between 0 and 0.25")

    base = jaro_distance(s, t)

    # Count leading characters that agree, looking at no more than four.
    shared_prefix = 0
    for left, right in zip(s[:4], t[:4]):
        if left != right:
            break
        shared_prefix += 1

    return base + (shared_prefix * scaling * (1 - base))
|
147
|
+
|
148
|
+
|
149
|
+
def levenshtein_distance(a: str, b: str) -> int:
    """Calculate the Levenshtein (edit) distance between two strings.

    Uses the standard dynamic-programming recurrence but keeps only the
    previous and current rows, so memory is proportional to the shorter
    string instead of to the product of both lengths.

    Args:
        a: First input string
        b: Second input string

    Returns:
        int: Minimum number of single-character edits needed to change one
        string into the other
    """
    if not a:
        return len(b)
    if not b:
        return len(a)

    # The distance is symmetric; keep the shorter string along the row.
    if len(a) < len(b):
        a, b = b, a

    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            cost = 0 if char_a == char_b else 1
            current.append(
                min(
                    previous[j] + 1,  # deletion
                    current[j - 1] + 1,  # insertion
                    previous[j - 1] + cost,  # substitution
                )
            )
        previous = current

    return previous[-1]
|
182
|
+
|
183
|
+
|
184
|
+
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Turn the Levenshtein distance into a similarity score.

    The edit distance is normalised by the length of the longer string and
    subtracted from one.

    Args:
        s1: First input string
        s2: Second input string

    Returns:
        float: Score between 0 and 1 (1.0 for two empty strings, 0.0 when
        exactly one string is empty)
    """
    if not (s1 and s2):
        # Two empty strings are identical; one empty string shares nothing.
        return 0.0 if (s1 or s2) else 1.0

    edits = levenshtein_distance(s1, s2)
    longest = max(len(s1), len(s2))
    return 1 - (edits / longest)
|
204
|
+
|
205
|
+
|
206
|
+
def sequence_matcher_similarity(s1: str, s2: str) -> float:
    """Score two strings with ``difflib.SequenceMatcher.ratio``.

    Args:
        s1: First input string
        s2: Second input string

    Returns:
        float: Similarity score between 0 and 1
    """
    matcher = SequenceMatcher(None, s1, s2)  # None: no junk heuristic
    return matcher.ratio()
|
217
|
+
|
218
|
+
|
219
|
+
# Map of available similarity algorithms
|
220
|
+
SIMILARITY_ALGO_MAP = {
|
221
|
+
"jaro_winkler": jaro_winkler_similarity,
|
222
|
+
"levenshtein": levenshtein_similarity,
|
223
|
+
"sequence_matcher": sequence_matcher_similarity,
|
224
|
+
"hamming": hamming_similarity,
|
225
|
+
"cosine": cosine_similarity,
|
226
|
+
}
|
227
|
+
|
228
|
+
|
229
|
+
SIMILARITY_TYPE = Literal[
|
230
|
+
"jaro_winkler",
|
231
|
+
"levenshtein",
|
232
|
+
"sequence_matcher",
|
233
|
+
"hamming",
|
234
|
+
"cosine",
|
235
|
+
]
|
236
|
+
|
237
|
+
# Type alias for similarity functions
|
238
|
+
SimilarityFunc = Callable[[str, str], float]
|
239
|
+
|
240
|
+
|
241
|
+
@dataclass(frozen=True)
class MatchResult:
    """Immutable record of one candidate that passed the similarity threshold."""

    # Candidate string as supplied by the caller
    word: str
    # Similarity score computed for this candidate
    score: float
    # Position of the candidate in the input sequence
    index: int
|
248
|
+
|
249
|
+
|
250
|
+
def string_similarity(
    word: str,
    correct_words: Sequence[str],
    algorithm: SIMILARITY_TYPE | Callable[[str, str], float] = "jaro_winkler",
    threshold: float = 0.0,
    case_sensitive: bool = False,
    return_most_similar: bool = False,
) -> str | list[str] | None:
    """Rank candidate strings by similarity to ``word``.

    Args:
        word: The input string to find matches for
        correct_words: Candidate strings to compare against
        algorithm: Name of a built-in algorithm or a ``(str, str) -> float``
            callable
        threshold: Minimum similarity score (0.0 to 1.0) a candidate must reach
        case_sensitive: If False, both sides are lower-cased before scoring;
            if True, only the top-scoring candidates are kept
        return_most_similar: Return only the best candidate instead of a list

    Returns:
        The best candidate, a list of candidates ordered by descending score
        (ties by input position), or None if nothing reaches the threshold

    Raises:
        ValueError: If correct_words is empty, threshold is out of range, or
            algorithm is neither a known name nor a callable
    """
    if not correct_words:
        raise ValueError("correct_words must not be empty")

    if not 0.0 <= threshold <= 1.0:
        raise ValueError("threshold must be between 0.0 and 1.0")

    # Normalise everything to str; keep the originals for the return value.
    target = str(word)
    candidates = [str(item) for item in correct_words]

    if case_sensitive:
        normalized = list(candidates)
    else:
        target = target.lower()
        normalized = [item.lower() for item in candidates]

    # Resolve the scoring function
    if isinstance(algorithm, str):
        scorer = SIMILARITY_ALGO_MAP.get(algorithm)
        if scorer is None:
            raise ValueError(f"Unsupported algorithm: {algorithm}")
    elif callable(algorithm):
        scorer = algorithm
    else:
        raise ValueError(
            "algorithm must be a string specifying a built-in algorithm or a callable"
        )

    # Hamming only applies to equal-length strings, so unequal candidates are
    # skipped outright when that built-in is selected by name.
    equal_length_only = algorithm == "hamming"

    hits = []
    for position, (original, folded) in enumerate(zip(candidates, normalized)):
        if equal_length_only and len(folded) != len(target):
            continue

        score = scorer(target, folded)
        if score >= threshold:
            hits.append(MatchResult(original, score, position))

    if not hits:
        return None

    # Best score first; input position breaks ties for a stable order.
    hits.sort(key=lambda hit: (-hit.score, hit.index))

    # Case-sensitive lookups keep only the candidates tied for the top score.
    if case_sensitive:
        top_score = hits[0].score
        hits = [hit for hit in hits if hit.score == top_score]

    if return_most_similar:
        return hits[0].word

    return [hit.word for hit in hits]
|
@@ -1,11 +1,160 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
from dataclasses import dataclass, field
|
2
|
-
from
|
4
|
+
from enum import Enum as _Enum
|
5
|
+
from typing import Any, ClassVar, Final, Literal, TypeVar, Union
|
6
|
+
|
7
|
+
from typing_extensions import TypedDict, override
|
8
|
+
|
9
|
+
__all__ = (
|
10
|
+
"Undefined",
|
11
|
+
"Unset",
|
12
|
+
"MaybeUndefined",
|
13
|
+
"MaybeUnset",
|
14
|
+
"MaybeSentinel",
|
15
|
+
"SingletonType",
|
16
|
+
"UndefinedType",
|
17
|
+
"UnsetType",
|
18
|
+
"KeysDict",
|
19
|
+
"T",
|
20
|
+
"Enum",
|
21
|
+
"is_sentinel",
|
22
|
+
"not_sentinel",
|
23
|
+
"Params",
|
24
|
+
"DataClass",
|
25
|
+
)
|
26
|
+
|
27
|
+
T = TypeVar("T")
|
28
|
+
|
29
|
+
|
30
|
+
class _SingletonMeta(type):
|
31
|
+
"""Metaclass that guarantees exactly one instance per subclass.
|
32
|
+
|
33
|
+
This ensures that sentinel values maintain identity across the entire application,
|
34
|
+
allowing safe identity checks with 'is' operator.
|
35
|
+
"""
|
36
|
+
|
37
|
+
_cache: dict[type, SingletonType] = {}
|
38
|
+
|
39
|
+
def __call__(cls, *a, **kw):
|
40
|
+
if cls not in cls._cache:
|
41
|
+
cls._cache[cls] = super().__call__(*a, **kw)
|
42
|
+
return cls._cache[cls]
|
43
|
+
|
44
|
+
|
45
|
+
class SingletonType(metaclass=_SingletonMeta):
    """Common base for singleton sentinel classes.

    Copying (shallow or deep) returns the same object, so identity survives
    ``copy``/``deepcopy``. ``__bool__`` and ``__repr__`` are placeholders here
    and are expected to be supplied by each concrete sentinel.
    """

    __slots__: tuple[str, ...] = ()

    def __copy__(self):
        # Shallow copy is a no-op: there is only ever one instance.
        return self

    def __deepcopy__(self, memo):
        # Deep copy is a no-op for the same reason.
        return self

    # Stubs only -- concrete subclasses *must* override both.
    def __bool__(self) -> bool: ...
    def __repr__(self) -> str: ...
|
65
|
+
|
66
|
+
|
67
|
+
class UndefinedType(SingletonType):
    """Sentinel type meaning "this key or field does not exist at all".

    Typical uses:
        - A field that has never been set
        - A key that is absent from a mapping
        - A value that is conceptually undefined (not merely unset)

    Example:
        >>> d = {"a": 1}
        >>> d.get("b", Undefined) is Undefined
        True
    """

    __slots__ = ()

    def __repr__(self) -> Literal["Undefined"]:
        return "Undefined"

    def __str__(self) -> Literal["Undefined"]:
        return "Undefined"

    def __bool__(self) -> Literal[False]:
        # Sentinels are always falsy.
        return False

    def __reduce__(self):
        """Pickle by global name so unpickling yields the module-level ``Undefined``."""
        return "Undefined"
|
95
|
+
|
96
|
+
|
97
|
+
class UnsetType(SingletonType):
    """Sentinel type meaning "the slot exists but no value was supplied".

    Typical uses:
        - A parameter that exists but has not been given a value
        - Telling ``None`` apart from "not provided"
        - Optional API parameters that need explicit handling

    Example:
        >>> def func(param=Unset):
        ...     if param is not Unset:
        ...         # param was explicitly provided
        ...         process(param)
    """

    __slots__ = ()

    def __repr__(self) -> Literal["Unset"]:
        return "Unset"

    def __str__(self) -> Literal["Unset"]:
        return "Unset"

    def __bool__(self) -> Literal[False]:
        # Sentinels are always falsy.
        return False

    def __reduce__(self):
        """Pickle by global name so unpickling yields the module-level ``Unset``."""
        return "Unset"
|
126
|
+
|
127
|
+
|
128
|
+
Undefined: Final = UndefinedType()
|
129
|
+
"""A key or field entirely missing from a namespace"""
|
130
|
+
Unset: Final = UnsetType()
|
131
|
+
"""A key present but value not yet provided."""
|
132
|
+
|
133
|
+
MaybeUndefined = Union[T, UndefinedType]
|
134
|
+
MaybeUnset = Union[T, UnsetType]
|
135
|
+
MaybeSentinel = Union[T, UndefinedType, UnsetType]
|
136
|
+
|
137
|
+
|
138
|
+
def is_sentinel(value: Any) -> bool:
    """Return True when ``value`` is the ``Undefined`` or the ``Unset`` singleton."""
    return any(value is marker for marker in (Undefined, Unset))
|
141
|
+
|
142
|
+
|
143
|
+
def not_sentinel(value: Any) -> bool:
    """Return True when ``value`` is neither ``Undefined`` nor ``Unset``.

    Handy as a predicate for ``filter`` and comprehensions.
    """
    return not (value is Undefined or value is Unset)
|
146
|
+
|
147
|
+
|
148
|
+
class Enum(_Enum):
    """``enum.Enum`` base with a helper that lists the member values."""

    @classmethod
    def allowed(cls) -> tuple[str, ...]:
        """Return the values of all members, in definition order."""
        values = [member.value for member in cls]
        return tuple(values)
|
3
152
|
|
4
|
-
from typing_extensions import override
|
5
153
|
|
6
|
-
|
154
|
+
class KeysDict(TypedDict, total=False):
    """Typed mapping used where a dict of expected keys is accepted.

    Declared with ``total=False``, so the placeholder entry is optional.
    """

    # Placeholder entry standing in for an arbitrary key/type pair.
    key: Any
|
9
158
|
|
10
159
|
|
11
160
|
@dataclass(slots=True, frozen=True, init=False)
|