lionagi 0.15.14__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lionagi/adapters/async_postgres_adapter.py +1 -1
- lionagi/libs/nested/ninsert.py +3 -6
- lionagi/libs/nested/nmerge.py +1 -2
- lionagi/libs/validate/fuzzy_match_keys.py +5 -182
- lionagi/libs/validate/string_similarity.py +6 -331
- lionagi/ln/__init__.py +56 -66
- lionagi/ln/_async_call.py +13 -10
- lionagi/ln/_hash.py +33 -8
- lionagi/ln/_list_call.py +2 -35
- lionagi/ln/_to_list.py +51 -28
- lionagi/ln/_utils.py +156 -0
- lionagi/ln/concurrency/__init__.py +39 -31
- lionagi/ln/concurrency/_compat.py +65 -0
- lionagi/ln/concurrency/cancel.py +92 -109
- lionagi/ln/concurrency/errors.py +17 -17
- lionagi/ln/concurrency/patterns.py +249 -206
- lionagi/ln/concurrency/primitives.py +257 -216
- lionagi/ln/concurrency/resource_tracker.py +42 -155
- lionagi/ln/concurrency/task.py +55 -73
- lionagi/ln/concurrency/throttle.py +3 -0
- lionagi/ln/concurrency/utils.py +1 -0
- lionagi/ln/fuzzy/__init__.py +15 -0
- lionagi/ln/{_extract_json.py → fuzzy/_extract_json.py} +22 -9
- lionagi/ln/{_fuzzy_json.py → fuzzy/_fuzzy_json.py} +14 -8
- lionagi/ln/fuzzy/_fuzzy_match.py +172 -0
- lionagi/ln/fuzzy/_fuzzy_validate.py +46 -0
- lionagi/ln/fuzzy/_string_similarity.py +332 -0
- lionagi/ln/{_models.py → types.py} +153 -4
- lionagi/operations/ReAct/utils.py +1 -2
- lionagi/operations/flow.py +2 -1
- lionagi/operations/operate/operate.py +26 -16
- lionagi/protocols/action/function_calling.py +1 -4
- lionagi/protocols/contracts.py +46 -0
- lionagi/protocols/generic/element.py +1 -58
- lionagi/protocols/generic/event.py +6 -6
- lionagi/protocols/generic/processor.py +9 -5
- lionagi/protocols/graph/graph.py +1 -2
- lionagi/protocols/graph/node.py +2 -4
- lionagi/protocols/ids.py +82 -0
- lionagi/protocols/messages/instruction.py +1 -2
- lionagi/protocols/messages/manager.py +1 -2
- lionagi/protocols/messages/message.py +1 -4
- lionagi/protocols/types.py +10 -12
- lionagi/service/connections/providers/claude_code_.py +1 -2
- lionagi/service/resilience.py +1 -2
- lionagi/tools/memory/tools.py +2 -4
- lionagi/utils.py +34 -64
- lionagi/version.py +1 -1
- {lionagi-0.15.14.dist-info → lionagi-0.16.1.dist-info}/METADATA +4 -2
- {lionagi-0.15.14.dist-info → lionagi-0.16.1.dist-info}/RECORD +52 -45
- lionagi/ln/_types.py +0 -146
- {lionagi-0.15.14.dist-info → lionagi-0.16.1.dist-info}/WHEEL +0 -0
- {lionagi-0.15.14.dist-info → lionagi-0.16.1.dist-info}/licenses/LICENSE +0 -0
lionagi/adapters/async_postgres_adapter.py
CHANGED
@@ -89,7 +89,7 @@ class LionAGIAsyncPostgresAdapter(AsyncPostgresAdapter[T]):
|
|
89
89
|
sa.Column("id", sa.String, primary_key=True),
|
90
90
|
sa.Column("content", json_type),
|
91
91
|
sa.Column("node_metadata", json_type),
|
92
|
-
sa.Column("created_at", sa.
|
92
|
+
sa.Column("created_at", sa.Float),
|
93
93
|
sa.Column("embedding", json_type, nullable=True),
|
94
94
|
).create(sync_conn, checkfirst=True)
|
95
95
|
)
|
lionagi/libs/nested/ninsert.py
CHANGED
@@ -50,8 +50,7 @@ def ninsert(
|
|
50
50
|
if isinstance(part, int):
|
51
51
|
if isinstance(nested_structure, dict):
|
52
52
|
raise TypeError(
|
53
|
-
f"Unsupported key type: {type(part).__name__}."
|
54
|
-
"Only string keys are acceptable.",
|
53
|
+
f"Unsupported key type: {type(part).__name__}.Only string keys are acceptable.",
|
55
54
|
)
|
56
55
|
while len(nested_structure) <= part:
|
57
56
|
nested_structure.append(None)
|
@@ -67,8 +66,7 @@ def ninsert(
|
|
67
66
|
raise TypeError("Cannot use NoneType as a key in a dictionary")
|
68
67
|
if isinstance(part, (float, complex)):
|
69
68
|
raise TypeError(
|
70
|
-
f"Unsupported key type: {type(part).__name__}."
|
71
|
-
"Only string keys are acceptable.",
|
69
|
+
f"Unsupported key type: {type(part).__name__}.Only string keys are acceptable.",
|
72
70
|
)
|
73
71
|
if part not in nested_structure:
|
74
72
|
next_part = indices[i + 1]
|
@@ -77,8 +75,7 @@ def ninsert(
|
|
77
75
|
)
|
78
76
|
else:
|
79
77
|
raise TypeError(
|
80
|
-
f"Invalid container type: {type(nested_structure)} "
|
81
|
-
"encountered during insertion"
|
78
|
+
f"Invalid container type: {type(nested_structure)} encountered during insertion"
|
82
79
|
)
|
83
80
|
|
84
81
|
nested_structure = nested_structure[part]
|
lionagi/libs/nested/nmerge.py
CHANGED
@@ -51,8 +51,7 @@ def nmerge(
|
|
51
51
|
return _merge_sequences(nested_structure, sort_list, custom_sort)
|
52
52
|
else:
|
53
53
|
raise TypeError(
|
54
|
-
"All items in the input list must be of the same type, "
|
55
|
-
"either dict, list, or Iterable."
|
54
|
+
"All items in the input list must be of the same type, either dict, list, or Iterable."
|
56
55
|
)
|
57
56
|
|
58
57
|
|
lionagi/libs/validate/fuzzy_match_keys.py
CHANGED
@@ -1,184 +1,7 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
from collections.abc import Callable, Sequence
|
6
|
-
from typing import Any, Literal
|
7
|
-
|
8
|
-
from lionagi.utils import KeysDict, Params
|
9
|
-
|
10
|
-
from .string_similarity import (
|
11
|
-
SIMILARITY_ALGO_MAP,
|
12
|
-
SIMILARITY_TYPE,
|
13
|
-
string_similarity,
|
1
|
+
from lionagi.ln.fuzzy._fuzzy_match import (
|
2
|
+
FuzzyMatchKeysParams,
|
3
|
+
HandleUnmatched,
|
4
|
+
fuzzy_match_keys,
|
14
5
|
)
|
15
6
|
|
16
|
-
__all__ = (
|
17
|
-
"fuzzy_match_keys",
|
18
|
-
"FuzzyMatchKeysParams",
|
19
|
-
)
|
20
|
-
|
21
|
-
|
22
|
-
class FuzzyMatchKeysParams(Params):
|
23
|
-
similarity_algo: SIMILARITY_TYPE | Callable[[str, str], float] = (
|
24
|
-
"jaro_winkler"
|
25
|
-
)
|
26
|
-
similarity_threshold: float = 0.85
|
27
|
-
fuzzy_match: bool = True
|
28
|
-
handle_unmatched: Literal["ignore", "raise", "remove", "fill", "force"] = (
|
29
|
-
"ignore"
|
30
|
-
)
|
31
|
-
fill_value: Any = None
|
32
|
-
fill_mapping: dict[str, Any] | None = None
|
33
|
-
strict: bool = False
|
34
|
-
|
35
|
-
def __call__(
|
36
|
-
self, d_: dict[str, Any], keys: Sequence[str] | KeysDict
|
37
|
-
) -> dict[str, Any]:
|
38
|
-
return fuzzy_match_keys(
|
39
|
-
d_,
|
40
|
-
keys,
|
41
|
-
similarity_algo=self.similarity_algo,
|
42
|
-
similarity_threshold=self.similarity_threshold,
|
43
|
-
fuzzy_match=self.fuzzy_match,
|
44
|
-
handle_unmatched=self.handle_unmatched,
|
45
|
-
fill_value=self.fill_value,
|
46
|
-
fill_mapping=self.fill_mapping,
|
47
|
-
strict=self.strict,
|
48
|
-
)
|
49
|
-
|
50
|
-
|
51
|
-
def fuzzy_match_keys(
|
52
|
-
d_: dict[str, Any],
|
53
|
-
keys: Sequence[str] | KeysDict,
|
54
|
-
/,
|
55
|
-
*,
|
56
|
-
similarity_algo: (
|
57
|
-
SIMILARITY_TYPE | Callable[[str, str], float]
|
58
|
-
) = "jaro_winkler",
|
59
|
-
similarity_threshold: float = 0.85,
|
60
|
-
fuzzy_match: bool = True,
|
61
|
-
handle_unmatched: Literal[
|
62
|
-
"ignore", "raise", "remove", "fill", "force"
|
63
|
-
] = "ignore",
|
64
|
-
fill_value: Any = None,
|
65
|
-
fill_mapping: dict[str, Any] | None = None,
|
66
|
-
strict: bool = False,
|
67
|
-
) -> dict[str, Any]:
|
68
|
-
"""
|
69
|
-
Validate and correct dictionary keys based on expected keys using string similarity.
|
70
|
-
|
71
|
-
Args:
|
72
|
-
d_: The dictionary to validate and correct keys for.
|
73
|
-
keys: List of expected keys or dictionary mapping keys to types.
|
74
|
-
similarity_algo: String similarity algorithm to use or custom function.
|
75
|
-
similarity_threshold: Minimum similarity score for fuzzy matching.
|
76
|
-
fuzzy_match: If True, use fuzzy matching for key correction.
|
77
|
-
handle_unmatched: Specifies how to handle unmatched keys:
|
78
|
-
- "ignore": Keep unmatched keys in output.
|
79
|
-
- "raise": Raise ValueError if unmatched keys exist.
|
80
|
-
- "remove": Remove unmatched keys from output.
|
81
|
-
- "fill": Fill unmatched keys with default value/mapping.
|
82
|
-
- "force": Combine "fill" and "remove" behaviors.
|
83
|
-
fill_value: Default value for filling unmatched keys.
|
84
|
-
fill_mapping: Dictionary mapping unmatched keys to default values.
|
85
|
-
strict: If True, raise ValueError if any expected key is missing.
|
86
|
-
|
87
|
-
Returns:
|
88
|
-
A new dictionary with validated and corrected keys.
|
89
|
-
|
90
|
-
Raises:
|
91
|
-
ValueError: If validation fails based on specified parameters.
|
92
|
-
TypeError: If input types are invalid.
|
93
|
-
AttributeError: If key validation fails.
|
94
|
-
"""
|
95
|
-
# Input validation
|
96
|
-
if not isinstance(d_, dict):
|
97
|
-
raise TypeError("First argument must be a dictionary")
|
98
|
-
if keys is None:
|
99
|
-
raise TypeError("Keys argument cannot be None")
|
100
|
-
if not 0.0 <= similarity_threshold <= 1.0:
|
101
|
-
raise ValueError("similarity_threshold must be between 0.0 and 1.0")
|
102
|
-
|
103
|
-
# Extract expected keys
|
104
|
-
fields_set = set(keys) if isinstance(keys, list) else set(keys.keys())
|
105
|
-
if not fields_set:
|
106
|
-
return d_.copy() # Return copy of original if no expected keys
|
107
|
-
|
108
|
-
# Initialize output dictionary and tracking sets
|
109
|
-
corrected_out = {}
|
110
|
-
matched_expected = set()
|
111
|
-
matched_input = set()
|
112
|
-
|
113
|
-
# Get similarity function
|
114
|
-
if isinstance(similarity_algo, str):
|
115
|
-
if similarity_algo not in SIMILARITY_ALGO_MAP:
|
116
|
-
raise ValueError(
|
117
|
-
f"Unknown similarity algorithm: {similarity_algo}"
|
118
|
-
)
|
119
|
-
similarity_func = SIMILARITY_ALGO_MAP[similarity_algo]
|
120
|
-
else:
|
121
|
-
similarity_func = similarity_algo
|
122
|
-
|
123
|
-
# First pass: exact matches
|
124
|
-
for key in d_:
|
125
|
-
if key in fields_set:
|
126
|
-
corrected_out[key] = d_[key]
|
127
|
-
matched_expected.add(key)
|
128
|
-
matched_input.add(key)
|
129
|
-
|
130
|
-
# Second pass: fuzzy matching if enabled
|
131
|
-
if fuzzy_match:
|
132
|
-
remaining_input = set(d_.keys()) - matched_input
|
133
|
-
remaining_expected = fields_set - matched_expected
|
134
|
-
|
135
|
-
for key in remaining_input:
|
136
|
-
if not remaining_expected:
|
137
|
-
break
|
138
|
-
|
139
|
-
matches = string_similarity(
|
140
|
-
key,
|
141
|
-
list(remaining_expected),
|
142
|
-
algorithm=similarity_func,
|
143
|
-
threshold=similarity_threshold,
|
144
|
-
return_most_similar=True,
|
145
|
-
)
|
146
|
-
|
147
|
-
if matches:
|
148
|
-
match = matches
|
149
|
-
corrected_out[match] = d_[key]
|
150
|
-
matched_expected.add(match)
|
151
|
-
matched_input.add(key)
|
152
|
-
remaining_expected.remove(match)
|
153
|
-
elif handle_unmatched == "ignore":
|
154
|
-
corrected_out[key] = d_[key]
|
155
|
-
|
156
|
-
# Handle unmatched keys based on handle_unmatched parameter
|
157
|
-
unmatched_input = set(d_.keys()) - matched_input
|
158
|
-
unmatched_expected = fields_set - matched_expected
|
159
|
-
|
160
|
-
if handle_unmatched == "raise" and unmatched_input:
|
161
|
-
raise ValueError(f"Unmatched keys found: {unmatched_input}")
|
162
|
-
|
163
|
-
elif handle_unmatched == "ignore":
|
164
|
-
for key in unmatched_input:
|
165
|
-
corrected_out[key] = d_[key]
|
166
|
-
|
167
|
-
elif handle_unmatched in ("fill", "force"):
|
168
|
-
# Fill missing expected keys
|
169
|
-
for key in unmatched_expected:
|
170
|
-
if fill_mapping and key in fill_mapping:
|
171
|
-
corrected_out[key] = fill_mapping[key]
|
172
|
-
else:
|
173
|
-
corrected_out[key] = fill_value
|
174
|
-
|
175
|
-
# For "fill" mode, also keep unmatched original keys
|
176
|
-
if handle_unmatched == "fill":
|
177
|
-
for key in unmatched_input:
|
178
|
-
corrected_out[key] = d_[key]
|
179
|
-
|
180
|
-
# Check strict mode
|
181
|
-
if strict and unmatched_expected:
|
182
|
-
raise ValueError(f"Missing required keys: {unmatched_expected}")
|
183
|
-
|
184
|
-
return corrected_out
|
7
|
+
__all__ = ("fuzzy_match_keys", "FuzzyMatchKeysParams", "HandleUnmatched")
|
lionagi/libs/validate/string_similarity.py
CHANGED
@@ -1,332 +1,7 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
from lionagi.ln.fuzzy._string_similarity import (
|
2
|
+
SIMILARITY_TYPE,
|
3
|
+
SimilarityFunc,
|
4
|
+
string_similarity,
|
5
|
+
)
|
4
6
|
|
5
|
-
|
6
|
-
from dataclasses import dataclass
|
7
|
-
from difflib import SequenceMatcher
|
8
|
-
from itertools import product
|
9
|
-
from typing import Literal
|
10
|
-
|
11
|
-
__all__ = ("string_similarity",)
|
12
|
-
|
13
|
-
|
14
|
-
def cosine_similarity(s1: str, s2: str) -> float:
|
15
|
-
"""Calculate the cosine similarity between two strings.
|
16
|
-
|
17
|
-
Args:
|
18
|
-
s1: First input string
|
19
|
-
s2: Second input string
|
20
|
-
|
21
|
-
Returns:
|
22
|
-
float: Cosine similarity score between 0 and 1
|
23
|
-
"""
|
24
|
-
if not s1 or not s2:
|
25
|
-
return 0.0
|
26
|
-
|
27
|
-
set1, set2 = set(s1), set(s2)
|
28
|
-
intersection = set1.intersection(set2)
|
29
|
-
|
30
|
-
if not set1 or not set2:
|
31
|
-
return 0.0
|
32
|
-
|
33
|
-
return len(intersection) / ((len(set1) * len(set2)) ** 0.5)
|
34
|
-
|
35
|
-
|
36
|
-
def hamming_similarity(s1: str, s2: str) -> float:
|
37
|
-
"""Calculate the Hamming similarity between two strings.
|
38
|
-
|
39
|
-
The strings must be of equal length. Returns the proportion of positions
|
40
|
-
at which corresponding symbols are the same.
|
41
|
-
|
42
|
-
Args:
|
43
|
-
s1: First input string
|
44
|
-
s2: Second input string
|
45
|
-
|
46
|
-
Returns:
|
47
|
-
float: Hamming similarity score between 0 and 1
|
48
|
-
"""
|
49
|
-
if not s1 or not s2 or len(s1) != len(s2):
|
50
|
-
return 0.0
|
51
|
-
|
52
|
-
matches = sum(c1 == c2 for c1, c2 in zip(s1, s2))
|
53
|
-
return matches / len(s1)
|
54
|
-
|
55
|
-
|
56
|
-
def jaro_distance(s: str, t: str) -> float:
|
57
|
-
"""Calculate the Jaro distance between two strings.
|
58
|
-
|
59
|
-
Args:
|
60
|
-
s: First input string
|
61
|
-
t: Second input string
|
62
|
-
|
63
|
-
Returns:
|
64
|
-
float: Jaro distance score between 0 and 1
|
65
|
-
"""
|
66
|
-
s_len = len(s)
|
67
|
-
t_len = len(t)
|
68
|
-
|
69
|
-
if s_len == 0 and t_len == 0:
|
70
|
-
return 1.0
|
71
|
-
elif s_len == 0 or t_len == 0:
|
72
|
-
return 0.0
|
73
|
-
|
74
|
-
match_distance = (max(s_len, t_len) // 2) - 1
|
75
|
-
match_distance = max(0, match_distance) # Ensure non-negative
|
76
|
-
|
77
|
-
s_matches = [False] * s_len
|
78
|
-
t_matches = [False] * t_len
|
79
|
-
|
80
|
-
matches = 0
|
81
|
-
transpositions = 0
|
82
|
-
|
83
|
-
# Identify matches
|
84
|
-
for i in range(s_len):
|
85
|
-
start = max(0, i - match_distance)
|
86
|
-
end = min(i + match_distance + 1, t_len)
|
87
|
-
|
88
|
-
for j in range(start, end):
|
89
|
-
if t_matches[j] or s[i] != t[j]:
|
90
|
-
continue
|
91
|
-
s_matches[i] = t_matches[j] = True
|
92
|
-
matches += 1
|
93
|
-
break
|
94
|
-
|
95
|
-
if matches == 0:
|
96
|
-
return 0.0
|
97
|
-
|
98
|
-
# Count transpositions
|
99
|
-
k = 0
|
100
|
-
for i in range(s_len):
|
101
|
-
if not s_matches[i]:
|
102
|
-
continue
|
103
|
-
while not t_matches[k]:
|
104
|
-
k += 1
|
105
|
-
if s[i] != t[k]:
|
106
|
-
transpositions += 1
|
107
|
-
k += 1
|
108
|
-
|
109
|
-
transpositions //= 2
|
110
|
-
|
111
|
-
return (
|
112
|
-
matches / s_len
|
113
|
-
+ matches / t_len
|
114
|
-
+ (matches - transpositions) / matches
|
115
|
-
) / 3.0
|
116
|
-
|
117
|
-
|
118
|
-
def jaro_winkler_similarity(s: str, t: str, scaling: float = 0.1) -> float:
|
119
|
-
"""Calculate the Jaro-Winkler similarity between two strings.
|
120
|
-
|
121
|
-
Args:
|
122
|
-
s: First input string
|
123
|
-
t: Second input string
|
124
|
-
scaling: Scaling factor for common prefix adjustment
|
125
|
-
|
126
|
-
Returns:
|
127
|
-
float: Jaro-Winkler similarity score between 0 and 1
|
128
|
-
|
129
|
-
Raises:
|
130
|
-
ValueError: If scaling factor is not between 0 and 0.25
|
131
|
-
"""
|
132
|
-
if not 0 <= scaling <= 0.25:
|
133
|
-
raise ValueError("Scaling factor must be between 0 and 0.25")
|
134
|
-
|
135
|
-
jaro_sim = jaro_distance(s, t)
|
136
|
-
|
137
|
-
# Find length of common prefix (up to 4 chars)
|
138
|
-
prefix_len = 0
|
139
|
-
for s_char, t_char in zip(s, t):
|
140
|
-
if s_char != t_char:
|
141
|
-
break
|
142
|
-
prefix_len += 1
|
143
|
-
if prefix_len == 4:
|
144
|
-
break
|
145
|
-
|
146
|
-
return jaro_sim + (prefix_len * scaling * (1 - jaro_sim))
|
147
|
-
|
148
|
-
|
149
|
-
def levenshtein_distance(a: str, b: str) -> int:
|
150
|
-
"""Calculate the Levenshtein (edit) distance between two strings.
|
151
|
-
|
152
|
-
Args:
|
153
|
-
a: First input string
|
154
|
-
b: Second input string
|
155
|
-
|
156
|
-
Returns:
|
157
|
-
int: Minimum number of single-character edits needed to change one
|
158
|
-
string into the other
|
159
|
-
"""
|
160
|
-
if not a:
|
161
|
-
return len(b)
|
162
|
-
if not b:
|
163
|
-
return len(a)
|
164
|
-
|
165
|
-
m, n = len(a), len(b)
|
166
|
-
d = [[0] * (n + 1) for _ in range(m + 1)]
|
167
|
-
|
168
|
-
for i in range(m + 1):
|
169
|
-
d[i][0] = i
|
170
|
-
for j in range(n + 1):
|
171
|
-
d[0][j] = j
|
172
|
-
|
173
|
-
for i, j in product(range(1, m + 1), range(1, n + 1)):
|
174
|
-
cost = 0 if a[i - 1] == b[j - 1] else 1
|
175
|
-
d[i][j] = min(
|
176
|
-
d[i - 1][j] + 1, # deletion
|
177
|
-
d[i][j - 1] + 1, # insertion
|
178
|
-
d[i - 1][j - 1] + cost, # substitution
|
179
|
-
)
|
180
|
-
|
181
|
-
return d[m][n]
|
182
|
-
|
183
|
-
|
184
|
-
def levenshtein_similarity(s1: str, s2: str) -> float:
|
185
|
-
"""Calculate the Levenshtein similarity between two strings.
|
186
|
-
|
187
|
-
Converts Levenshtein distance to a similarity score between 0 and 1.
|
188
|
-
|
189
|
-
Args:
|
190
|
-
s1: First input string
|
191
|
-
s2: Second input string
|
192
|
-
|
193
|
-
Returns:
|
194
|
-
float: Levenshtein similarity score between 0 and 1
|
195
|
-
"""
|
196
|
-
if not s1 and not s2:
|
197
|
-
return 1.0
|
198
|
-
if not s1 or not s2:
|
199
|
-
return 0.0
|
200
|
-
|
201
|
-
distance = levenshtein_distance(s1, s2)
|
202
|
-
max_len = max(len(s1), len(s2))
|
203
|
-
return 1 - (distance / max_len)
|
204
|
-
|
205
|
-
|
206
|
-
def sequence_matcher_similarity(s1: str, s2: str) -> float:
|
207
|
-
"""Calculate similarity using Python's SequenceMatcher.
|
208
|
-
|
209
|
-
Args:
|
210
|
-
s1: First input string
|
211
|
-
s2: Second input string
|
212
|
-
|
213
|
-
Returns:
|
214
|
-
float: Similarity score between 0 and 1
|
215
|
-
"""
|
216
|
-
return SequenceMatcher(None, s1, s2).ratio()
|
217
|
-
|
218
|
-
|
219
|
-
# Map of available similarity algorithms
|
220
|
-
SIMILARITY_ALGO_MAP = {
|
221
|
-
"jaro_winkler": jaro_winkler_similarity,
|
222
|
-
"levenshtein": levenshtein_similarity,
|
223
|
-
"sequence_matcher": sequence_matcher_similarity,
|
224
|
-
"hamming": hamming_similarity,
|
225
|
-
"cosine": cosine_similarity,
|
226
|
-
}
|
227
|
-
|
228
|
-
|
229
|
-
SIMILARITY_TYPE = Literal[
|
230
|
-
"jaro_winkler",
|
231
|
-
"levenshtein",
|
232
|
-
"sequence_matcher",
|
233
|
-
"hamming",
|
234
|
-
"cosine",
|
235
|
-
]
|
236
|
-
|
237
|
-
# Type alias for similarity functions
|
238
|
-
SimilarityFunc = Callable[[str, str], float]
|
239
|
-
|
240
|
-
|
241
|
-
@dataclass(frozen=True)
|
242
|
-
class MatchResult:
|
243
|
-
"""Represents a string matching result."""
|
244
|
-
|
245
|
-
word: str
|
246
|
-
score: float
|
247
|
-
index: int
|
248
|
-
|
249
|
-
|
250
|
-
def string_similarity(
|
251
|
-
word: str,
|
252
|
-
correct_words: Sequence[str],
|
253
|
-
algorithm: SIMILARITY_TYPE | Callable[[str, str], float] = "jaro_winkler",
|
254
|
-
threshold: float = 0.0,
|
255
|
-
case_sensitive: bool = False,
|
256
|
-
return_most_similar: bool = False,
|
257
|
-
) -> str | list[str] | None:
|
258
|
-
"""Find similar strings using specified similarity algorithm.
|
259
|
-
|
260
|
-
Args:
|
261
|
-
word: The input string to find matches for
|
262
|
-
correct_words: List of strings to compare against
|
263
|
-
algorithm: Similarity algorithm to use
|
264
|
-
threshold: Minimum similarity score (0.0 to 1.0)
|
265
|
-
case_sensitive: Whether to consider case when matching
|
266
|
-
return_most_similar: Return only the most similar match
|
267
|
-
|
268
|
-
Returns:
|
269
|
-
Matching string(s) or None if no matches found
|
270
|
-
|
271
|
-
Raises:
|
272
|
-
ValueError: If correct_words is empty or threshold is invalid
|
273
|
-
"""
|
274
|
-
if not correct_words:
|
275
|
-
raise ValueError("correct_words must not be empty")
|
276
|
-
|
277
|
-
if not 0.0 <= threshold <= 1.0:
|
278
|
-
raise ValueError("threshold must be between 0.0 and 1.0")
|
279
|
-
|
280
|
-
# Convert inputs to strings
|
281
|
-
compare_word = str(word)
|
282
|
-
original_words = [str(w) for w in correct_words]
|
283
|
-
|
284
|
-
# Handle case sensitivity
|
285
|
-
if not case_sensitive:
|
286
|
-
compare_word = compare_word.lower()
|
287
|
-
compare_words = [w.lower() for w in original_words]
|
288
|
-
else:
|
289
|
-
compare_words = original_words.copy()
|
290
|
-
|
291
|
-
# Get scoring function
|
292
|
-
if isinstance(algorithm, str):
|
293
|
-
score_func = SIMILARITY_ALGO_MAP.get(algorithm)
|
294
|
-
if score_func is None:
|
295
|
-
raise ValueError(f"Unsupported algorithm: {algorithm}")
|
296
|
-
elif callable(algorithm):
|
297
|
-
score_func = algorithm
|
298
|
-
else:
|
299
|
-
raise ValueError(
|
300
|
-
"algorithm must be a string specifying a built-in algorithm or a callable"
|
301
|
-
)
|
302
|
-
|
303
|
-
# Calculate similarities
|
304
|
-
results = []
|
305
|
-
for idx, (orig_word, comp_word) in enumerate(
|
306
|
-
zip(original_words, compare_words)
|
307
|
-
):
|
308
|
-
# Skip different length strings for hamming similarity
|
309
|
-
if algorithm == "hamming" and len(comp_word) != len(compare_word):
|
310
|
-
continue
|
311
|
-
|
312
|
-
score = score_func(compare_word, comp_word)
|
313
|
-
if score >= threshold:
|
314
|
-
results.append(MatchResult(orig_word, score, idx))
|
315
|
-
|
316
|
-
# Return None if no matches
|
317
|
-
if not results:
|
318
|
-
return None
|
319
|
-
|
320
|
-
# Sort by score (descending) and index (ascending) for stable ordering
|
321
|
-
results.sort(key=lambda x: (-x.score, x.index))
|
322
|
-
|
323
|
-
# Filter exact matches for case sensitive comparisons
|
324
|
-
if case_sensitive:
|
325
|
-
max_score = results[0].score
|
326
|
-
results = [r for r in results if r.score == max_score]
|
327
|
-
|
328
|
-
# Return results
|
329
|
-
if return_most_similar:
|
330
|
-
return results[0].word
|
331
|
-
|
332
|
-
return [r.word for r in results]
|
7
|
+
__all__ = ("string_similarity", "SIMILARITY_TYPE", "SimilarityFunc")
|