lionherd-core 1.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. lionherd_core/__init__.py +84 -0
  2. lionherd_core/base/__init__.py +30 -0
  3. lionherd_core/base/_utils.py +295 -0
  4. lionherd_core/base/broadcaster.py +128 -0
  5. lionherd_core/base/element.py +300 -0
  6. lionherd_core/base/event.py +322 -0
  7. lionherd_core/base/eventbus.py +112 -0
  8. lionherd_core/base/flow.py +236 -0
  9. lionherd_core/base/graph.py +616 -0
  10. lionherd_core/base/node.py +212 -0
  11. lionherd_core/base/pile.py +811 -0
  12. lionherd_core/base/progression.py +261 -0
  13. lionherd_core/errors.py +104 -0
  14. lionherd_core/libs/__init__.py +2 -0
  15. lionherd_core/libs/concurrency/__init__.py +60 -0
  16. lionherd_core/libs/concurrency/_cancel.py +85 -0
  17. lionherd_core/libs/concurrency/_errors.py +80 -0
  18. lionherd_core/libs/concurrency/_patterns.py +238 -0
  19. lionherd_core/libs/concurrency/_primitives.py +253 -0
  20. lionherd_core/libs/concurrency/_priority_queue.py +135 -0
  21. lionherd_core/libs/concurrency/_resource_tracker.py +66 -0
  22. lionherd_core/libs/concurrency/_task.py +58 -0
  23. lionherd_core/libs/concurrency/_utils.py +61 -0
  24. lionherd_core/libs/schema_handlers/__init__.py +35 -0
  25. lionherd_core/libs/schema_handlers/_function_call_parser.py +122 -0
  26. lionherd_core/libs/schema_handlers/_minimal_yaml.py +88 -0
  27. lionherd_core/libs/schema_handlers/_schema_to_model.py +251 -0
  28. lionherd_core/libs/schema_handlers/_typescript.py +153 -0
  29. lionherd_core/libs/string_handlers/__init__.py +15 -0
  30. lionherd_core/libs/string_handlers/_extract_json.py +65 -0
  31. lionherd_core/libs/string_handlers/_fuzzy_json.py +103 -0
  32. lionherd_core/libs/string_handlers/_string_similarity.py +347 -0
  33. lionherd_core/libs/string_handlers/_to_num.py +63 -0
  34. lionherd_core/ln/__init__.py +45 -0
  35. lionherd_core/ln/_async_call.py +314 -0
  36. lionherd_core/ln/_fuzzy_match.py +166 -0
  37. lionherd_core/ln/_fuzzy_validate.py +151 -0
  38. lionherd_core/ln/_hash.py +141 -0
  39. lionherd_core/ln/_json_dump.py +347 -0
  40. lionherd_core/ln/_list_call.py +110 -0
  41. lionherd_core/ln/_to_dict.py +373 -0
  42. lionherd_core/ln/_to_list.py +190 -0
  43. lionherd_core/ln/_utils.py +156 -0
  44. lionherd_core/lndl/__init__.py +62 -0
  45. lionherd_core/lndl/errors.py +30 -0
  46. lionherd_core/lndl/fuzzy.py +321 -0
  47. lionherd_core/lndl/parser.py +427 -0
  48. lionherd_core/lndl/prompt.py +137 -0
  49. lionherd_core/lndl/resolver.py +323 -0
  50. lionherd_core/lndl/types.py +287 -0
  51. lionherd_core/protocols.py +181 -0
  52. lionherd_core/py.typed +0 -0
  53. lionherd_core/types/__init__.py +46 -0
  54. lionherd_core/types/_sentinel.py +131 -0
  55. lionherd_core/types/base.py +341 -0
  56. lionherd_core/types/operable.py +133 -0
  57. lionherd_core/types/spec.py +313 -0
  58. lionherd_core/types/spec_adapters/__init__.py +10 -0
  59. lionherd_core/types/spec_adapters/_protocol.py +125 -0
  60. lionherd_core/types/spec_adapters/pydantic_field.py +177 -0
  61. lionherd_core-1.0.0a3.dist-info/METADATA +502 -0
  62. lionherd_core-1.0.0a3.dist-info/RECORD +64 -0
  63. lionherd_core-1.0.0a3.dist-info/WHEEL +4 -0
  64. lionherd_core-1.0.0a3.dist-info/licenses/LICENSE +201 -0
lionherd_core/libs/schema_handlers/_typescript.py
@@ -0,0 +1,153 @@
+ # Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+ # SPDX-License-Identifier: Apache-2.0
+
+
+ def _type_map(json_type: str) -> str:
+     """Map JSON Schema types to TypeScript-like types."""
+     mapping = {
+         "string": "string",
+         "integer": "int",
+         "number": "float",
+         "boolean": "bool",
+         "array": "array",  # Will be handled specially
+         "object": "object",
+         "null": "null",
+     }
+     return mapping.get(json_type, json_type)
+
+
+ def _format_enum_union(enum_values: list) -> str:
+     """Format enum values as TypeScript union of literals."""
+     formatted = []
+     for val in enum_values:
+         if isinstance(val, str):
+             formatted.append(f'"{val}"')
+         elif val is None:
+             formatted.append("null")
+         else:
+             formatted.append(str(val))
+     return " | ".join(formatted)
+
+
+ def _extract_type_signature(field_spec: dict, required: bool) -> tuple[str, bool]:
+     """Extract TypeScript-style type signature from JSON Schema field."""
+     # Handle enums first (most specific)
+     if "enum" in field_spec:
+         type_sig = _format_enum_union(field_spec["enum"])
+         # Check if enum includes null
+         has_null = None in field_spec["enum"]
+         is_optional = not required or has_null
+         return type_sig, is_optional
+
+     # Handle anyOf (unions)
+     if "anyOf" in field_spec:
+         options = field_spec["anyOf"]
+         type_parts = []
+         has_null = False
+
+         for opt in options:
+             if opt.get("type") == "null":
+                 has_null = True
+                 continue
+
+             if "type" in opt:
+                 if opt["type"] == "array" and "items" in opt:
+                     item_type = _type_map(opt["items"].get("type", "any"))
+                     if "$ref" in opt["items"]:
+                         item_type = opt["items"]["$ref"].split("/")[-1]
+                     type_parts.append(f"{item_type}[]")
+                 else:
+                     type_parts.append(_type_map(opt["type"]))
+             elif "$ref" in opt:
+                 ref_name = opt["$ref"].split("/")[-1]
+                 type_parts.append(ref_name)
+             elif "enum" in opt:
+                 type_parts.append(_format_enum_union(opt["enum"]))
+
+         if has_null:
+             type_parts.append("null")
+
+         type_sig = " | ".join(type_parts) if type_parts else "any"
+         is_optional = not required or has_null
+         return type_sig, is_optional
+
+     # Handle arrays
+     if "type" in field_spec and field_spec["type"] == "array":
+         if "items" in field_spec:
+             items_spec = field_spec["items"]
+             if "type" in items_spec:
+                 item_type = _type_map(items_spec["type"])
+             elif "$ref" in items_spec:
+                 item_type = items_spec["$ref"].split("/")[-1]
+             elif "enum" in items_spec:
+                 item_type = f"({_format_enum_union(items_spec['enum'])})"
+             else:
+                 item_type = "any"
+         else:
+             item_type = "any"
+         type_sig = f"{item_type}[]"
+         is_optional = not required
+         return type_sig, is_optional
+
+     # Handle $ref
+     if "$ref" in field_spec:
+         ref_name = field_spec["$ref"].split("/")[-1]
+         is_optional = not required
+         return ref_name, is_optional
+
+     # Handle simple types
+     if "type" in field_spec:
+         base_type = field_spec["type"]
+         # Handle nullable simple types (type? suffix from simplification)
+         if isinstance(base_type, str) and base_type.endswith("?"):
+             type_sig = _type_map(base_type[:-1])
+             is_optional = True
+         else:
+             type_sig = _type_map(base_type)
+             is_optional = not required
+         return type_sig, is_optional
+
+     # Fallback
+     return "any", not required
+
+
+ def typescript_schema(schema: dict, indent: int = 0) -> str:
+     """Convert JSON Schema to TypeScript-style notation for optimal LLM comprehension."""
+     lines = []
+     prefix = " " * indent
+
+     if "properties" not in schema:
+         return ""
+
+     required_fields = set(schema.get("required", []))
+
+     for field_name, field_spec in schema["properties"].items():
+         is_required = field_name in required_fields
+
+         # Extract type signature
+         type_sig, is_optional = _extract_type_signature(field_spec, is_required)
+
+         # Add default value if present
+         default_str = ""
+         if "default" in field_spec:
+             default_val = field_spec["default"]
+             if isinstance(default_val, str):
+                 default_str = f' = "{default_val}"'
+             elif default_val is None:
+                 default_str = " = null"
+             elif isinstance(default_val, bool):
+                 default_str = f" = {'true' if default_val else 'false'}"
+             else:
+                 default_str = f" = {default_val}"
+
+         # Build field definition
+         optional_marker = "?" if is_optional else ""
+         field_def = f"{prefix}{field_name}{optional_marker}: {type_sig}{default_str}"
+
+         # Add description if present
+         if field_spec.get("description"):
+             field_def += f" - {field_spec['description']}"
+
+         lines.append(field_def)
+
+     return "\n".join(lines)
lionherd_core/libs/string_handlers/__init__.py
@@ -0,0 +1,15 @@
+ # Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+ # SPDX-License-Identifier: Apache-2.0
+
+ from ._extract_json import extract_json
+ from ._fuzzy_json import fuzzy_json
+ from ._string_similarity import SimilarityAlgo, string_similarity
+ from ._to_num import to_num
+
+ __all__ = (
+     "SimilarityAlgo",
+     "extract_json",
+     "fuzzy_json",
+     "string_similarity",
+     "to_num",
+ )
lionherd_core/libs/string_handlers/_extract_json.py
@@ -0,0 +1,65 @@
+ # Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+ # SPDX-License-Identifier: Apache-2.0
+
+ import contextlib
+ import re
+ from typing import Any
+
+ import orjson
+
+ from ._fuzzy_json import fuzzy_json
+
+ # Precompile the regex for extracting JSON code blocks
+ _JSON_BLOCK_PATTERN = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)
+
+
+ def extract_json(
+     input_data: str | list[str],
+     /,
+     *,
+     fuzzy_parse: bool = False,
+     return_one_if_single: bool = True,
+ ) -> Any | list[Any]:
+     """Extract and parse JSON content from a string or markdown code blocks.
+
+     Attempts direct JSON parsing first. If that fails, looks for JSON content
+     within markdown code blocks denoted by ```json.
+
+     Args:
+         input_data (str | list[str]): The input string or list of strings to parse.
+         fuzzy_parse (bool): If True, attempts fuzzy JSON parsing on failed attempts.
+         return_one_if_single (bool): If True and only one JSON block is found,
+             returns the parsed value directly instead of a one-element list.
+     """
+
+     # If input_data is a list, join into a single string
+     input_str = "\n".join(input_data) if isinstance(input_data, list) else input_data
+
+     # 1. Try direct parsing
+     with contextlib.suppress(Exception):
+         if fuzzy_parse:
+             return fuzzy_json(input_str)
+         return orjson.loads(input_str)
+
+     # 2. Attempt extracting JSON blocks from markdown
+     matches = _JSON_BLOCK_PATTERN.findall(input_str)
+     if not matches:
+         return []
+
+     # Single match: return the parsed value directly (not wrapped in a list)
+     if return_one_if_single and len(matches) == 1:
+         data_str = matches[0]
+         with contextlib.suppress(Exception):
+             if fuzzy_parse:
+                 return fuzzy_json(data_str)
+             return orjson.loads(data_str)
+         return []
+
+     # Multiple matches: parse each block, skipping any that fail
+     results: list[Any] = []
+     for m in matches:
+         with contextlib.suppress(Exception):
+             parsed = fuzzy_json(m) if fuzzy_parse else orjson.loads(m)
+             # Append valid JSON (dicts, lists, or primitives)
+             results.append(parsed)
+     return results
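A brief usage sketch of `extract_json` with illustrative inputs; the flat import path comes from the `string_handlers/__init__.py` hunk above.

```python
from lionherd_core.libs.string_handlers import extract_json

# One fenced block embedded in prose parses to the value itself,
# because return_one_if_single defaults to True.
text = 'Model reply:\n```json\n{"name": "lionherd", "ok": true}\n```\nDone.'
print(extract_json(text))  # {'name': 'lionherd', 'ok': True}

# Plain JSON with no fences takes the direct-parse path.
print(extract_json('[1, 2, 3]'))  # [1, 2, 3]
```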
lionherd_core/libs/string_handlers/_fuzzy_json.py
@@ -0,0 +1,103 @@
+ # Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+ # SPDX-License-Identifier: Apache-2.0
+
+ import contextlib
+ import re
+ from typing import Any
+
+ import orjson
+
+
+ def fuzzy_json(str_to_parse: str, /) -> dict[str, Any] | list[dict[str, Any]]:
+     """Parse JSON string with fuzzy error correction (quotes, spacing, brackets)."""
+     _check_valid_str(str_to_parse)
+
+     # 1. Direct attempt
+     with contextlib.suppress(orjson.JSONDecodeError):
+         return orjson.loads(str_to_parse)
+
+     # 2. Try cleaning: replace single quotes with double and normalize
+     cleaned = _clean_json_string(str_to_parse.replace("'", '"'))
+     with contextlib.suppress(orjson.JSONDecodeError):
+         return orjson.loads(cleaned)
+
+     # 3. Try fixing brackets
+     fixed = fix_json_string(cleaned)
+     with contextlib.suppress(orjson.JSONDecodeError):
+         return orjson.loads(fixed)
+
+     # If all attempts fail
+     raise ValueError("Invalid JSON string")
+
+
+ def _check_valid_str(str_to_parse: str, /):
+     if not isinstance(str_to_parse, str):
+         raise TypeError("Input must be a string")
+     if not str_to_parse.strip():
+         raise ValueError("Input string is empty")
+
+
+ def _clean_json_string(s: str) -> str:
+     """Basic normalization: replace unescaped single quotes, trim spaces, ensure keys are quoted."""
+     # Replace unescaped single quotes with double quotes;
+     # (?<!\\)' matches a single quote not preceded by a backslash
+     s = re.sub(r"(?<!\\)'", '"', s)
+     # Collapse multiple whitespaces
+     s = re.sub(r"\s+", " ", s)
+     # Remove trailing commas before closing brackets/braces
+     s = re.sub(r",\s*([}\]])", r"\1", s)
+     # Ensure keys are quoted
+     # This attempts to find patterns like { key: value } and turn them into {"key": value}
+     s = re.sub(r'([{,])\s*([^"\s]+)\s*:', r'\1"\2":', s)
+     return s.strip()
+
+
+ def fix_json_string(str_to_parse: str, /) -> str:
+     """Fix JSON string by balancing unmatched brackets."""
+     if not str_to_parse:
+         raise ValueError("Input string is empty")
+
+     brackets = {"{": "}", "[": "]"}
+     open_brackets = []
+     pos = 0
+     length = len(str_to_parse)
+
+     while pos < length:
+         char = str_to_parse[pos]
+
+         if char == "\\":
+             pos += 2  # Skip escaped chars
+             continue
+
+         if char == '"':
+             pos += 1
+             # Skip string content
+             while pos < length:
+                 if str_to_parse[pos] == "\\":
+                     pos += 2
+                     continue
+                 if str_to_parse[pos] == '"':
+                     pos += 1
+                     break
+                 pos += 1
+             continue
+
+         if char in brackets:
+             open_brackets.append(brackets[char])
+         elif char in brackets.values():
+             if not open_brackets:
+                 # Extra closing bracket: better to raise than guess
+                 raise ValueError("Extra closing bracket found.")
+             if open_brackets[-1] != char:
+                 # Mismatched bracket
+                 raise ValueError("Mismatched brackets.")
+             open_brackets.pop()
+
+         pos += 1
+
+     # Add missing closing brackets if any
+     if open_brackets:
+         str_to_parse += "".join(reversed(open_brackets))
+
+     return str_to_parse
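A sketch of the three-stage fallback on illustrative malformed inputs:

```python
from lionherd_core.libs.string_handlers import fuzzy_json

# Stage 2: single quotes normalized, trailing comma removed.
print(fuzzy_json("{'a': 1,}"))  # {'a': 1}

# Stage 3: missing closing brackets appended after cleaning.
print(fuzzy_json("{'name': 'lionherd', 'tags': ['core'"))
# {'name': 'lionherd', 'tags': ['core']}
```

Note that stage 3 only appends closers in nesting order; an input that still has a trailing comma before the appended bracket will fail all three stages and raise `ValueError`.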
lionherd_core/libs/string_handlers/_string_similarity.py
@@ -0,0 +1,347 @@
+ # Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+ # SPDX-License-Identifier: Apache-2.0
+
+ from collections.abc import Callable
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Literal
+
+ from lionherd_core.types.base import Enum
+
+ if TYPE_CHECKING:
+     from collections.abc import Sequence
+
+
+ __all__ = ("SimilarityAlgo", "string_similarity")
+
+
+ class SimilarityAlgo(Enum):
+     """String similarity algorithm names.
+
+     Provides type-safe enum values while maintaining string compatibility.
+     Use .allowed() to get all valid algorithm names as a tuple.
+     """
+
+     JARO_WINKLER = "jaro_winkler"
+     LEVENSHTEIN = "levenshtein"
+     SEQUENCE_MATCHER = "sequence_matcher"
+     HAMMING = "hamming"
+     COSINE = "cosine"
+
+
+ def cosine_similarity(s1: str, s2: str) -> float:
+     """Calculate the cosine similarity between two strings.
+
+     Args:
+         s1: First input string
+         s2: Second input string
+
+     Returns:
+         float: Cosine similarity score between 0 and 1
+     """
+     if not s1 or not s2:
+         return 0.0
+
+     set1, set2 = set(s1), set(s2)
+     intersection = set1.intersection(set2)
+
+     return len(intersection) / ((len(set1) * len(set2)) ** 0.5)
+
+
+ def hamming_similarity(s1: str, s2: str) -> float:
+     """Calculate the Hamming similarity between two strings.
+
+     The strings must be of equal length. Returns the proportion of positions
+     at which corresponding symbols are the same.
+
+     Args:
+         s1: First input string
+         s2: Second input string
+
+     Returns:
+         float: Hamming similarity score between 0 and 1
+     """
+     if not s1 or not s2 or len(s1) != len(s2):
+         return 0.0
+
+     matches = sum(c1 == c2 for c1, c2 in zip(s1, s2, strict=False))
+     return matches / len(s1)
+
+
+ def jaro_distance(s: str, t: str) -> float:
+     """Calculate the Jaro distance between two strings.
+
+     Args:
+         s: First input string
+         t: Second input string
+
+     Returns:
+         float: Jaro distance score between 0 and 1
+     """
+     s_len = len(s)
+     t_len = len(t)
+
+     if s_len == 0 and t_len == 0:
+         return 1.0
+     elif s_len == 0 or t_len == 0:
+         return 0.0
+
+     match_distance = (max(s_len, t_len) // 2) - 1
+     match_distance = max(0, match_distance)  # Ensure non-negative
+
+     s_matches = [False] * s_len
+     t_matches = [False] * t_len
+
+     matches = 0
+     transpositions = 0
+
+     # Identify matches
+     for i in range(s_len):
+         start = max(0, i - match_distance)
+         end = min(i + match_distance + 1, t_len)
+
+         for j in range(start, end):
+             if t_matches[j] or s[i] != t[j]:
+                 continue
+             s_matches[i] = t_matches[j] = True
+             matches += 1
+             break
+
+     if matches == 0:
+         return 0.0
+
+     # Count transpositions
+     k = 0
+     for i in range(s_len):
+         if not s_matches[i]:
+             continue
+         while not t_matches[k]:
+             k += 1
+         if s[i] != t[k]:
+             transpositions += 1
+         k += 1
+
+     transpositions //= 2
+
+     return (matches / s_len + matches / t_len + (matches - transpositions) / matches) / 3.0
+
+
+ def jaro_winkler_similarity(s: str, t: str, scaling: float = 0.1) -> float:
+     """Calculate the Jaro-Winkler similarity between two strings.
+
+     Args:
+         s: First input string
+         t: Second input string
+         scaling: Scaling factor for common prefix adjustment
+
+     Returns:
+         float: Jaro-Winkler similarity score between 0 and 1
+
+     Raises:
+         ValueError: If scaling factor is not between 0 and 0.25
+     """
+     if not 0 <= scaling <= 0.25:
+         raise ValueError("Scaling factor must be between 0 and 0.25")
+
+     jaro_sim = jaro_distance(s, t)
+
+     # Find length of common prefix (up to 4 chars)
+     prefix_len = 0
+     for s_char, t_char in zip(s, t, strict=False):
+         if s_char != t_char:
+             break
+         prefix_len += 1
+         if prefix_len == 4:
+             break
+
+     return jaro_sim + (prefix_len * scaling * (1 - jaro_sim))
+
+
+ def levenshtein_distance(a: str, b: str) -> int:
+     """Calculate the Levenshtein (edit) distance between two strings.
+
+     Args:
+         a: First input string
+         b: Second input string
+
+     Returns:
+         int: Minimum number of single-character edits needed to change one
+             string into the other
+     """
+     from itertools import product
+
+     if not a:
+         return len(b)
+     if not b:
+         return len(a)
+
+     m, n = len(a), len(b)
+     d = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(m + 1):
+         d[i][0] = i
+     for j in range(n + 1):
+         d[0][j] = j
+
+     for i, j in product(range(1, m + 1), range(1, n + 1)):
+         cost = 0 if a[i - 1] == b[j - 1] else 1
+         d[i][j] = min(
+             d[i - 1][j] + 1,  # deletion
+             d[i][j - 1] + 1,  # insertion
+             d[i - 1][j - 1] + cost,  # substitution
+         )
+
+     return d[m][n]
+
+
+ def levenshtein_similarity(s1: str, s2: str) -> float:
+     """Calculate the Levenshtein similarity between two strings.
+
+     Converts Levenshtein distance to a similarity score between 0 and 1.
+
+     Args:
+         s1: First input string
+         s2: Second input string
+
+     Returns:
+         float: Levenshtein similarity score between 0 and 1
+     """
+     if not s1 and not s2:
+         return 1.0
+     if not s1 or not s2:
+         return 0.0
+
+     distance = levenshtein_distance(s1, s2)
+     max_len = max(len(s1), len(s2))
+     return 1 - (distance / max_len)
+
+
+ def sequence_matcher_similarity(s1: str, s2: str) -> float:
+     """Calculate similarity using Python's SequenceMatcher.
+
+     Args:
+         s1: First input string
+         s2: Second input string
+
+     Returns:
+         float: Similarity score between 0 and 1
+     """
+     from difflib import SequenceMatcher
+
+     return SequenceMatcher(None, s1, s2).ratio()
+
+
+ # Map of available similarity algorithms
+ SIMILARITY_ALGO_MAP = {
+     "jaro_winkler": jaro_winkler_similarity,
+     "levenshtein": levenshtein_similarity,
+     "sequence_matcher": sequence_matcher_similarity,
+     "hamming": hamming_similarity,
+     "cosine": cosine_similarity,
+ }
+
+
+ SIMILARITY_TYPE = Literal[
+     "jaro_winkler",
+     "levenshtein",
+     "sequence_matcher",
+     "hamming",
+     "cosine",
+ ]
+
+ # Type alias for similarity functions
+ SimilarityFunc = Callable[[str, str], float]
+
+
+ @dataclass(frozen=True)
+ class MatchResult:
+     """Represents a string matching result."""
+
+     word: str
+     score: float
+     index: int
+
+
+ def string_similarity(
+     word: str,
+     correct_words: "Sequence[str]",
+     algorithm: SIMILARITY_TYPE | SimilarityAlgo | Callable[[str, str], float] = "jaro_winkler",
+     threshold: float = 0.0,
+     case_sensitive: bool = False,
+     return_most_similar: bool = False,
+ ) -> str | list[str] | None:
+     """Find similar strings using specified similarity algorithm.
+
+     Args:
+         word: The input string to find matches for
+         correct_words: List of strings to compare against
+         algorithm: Similarity algorithm to use
+         threshold: Minimum similarity score (0.0 to 1.0)
+         case_sensitive: Whether to consider case when matching
+         return_most_similar: Return only the most similar match
+
+     Returns:
+         Matching string(s) or None if no matches found
+
+     Raises:
+         ValueError: If correct_words is empty or threshold is invalid
+     """
+     if not correct_words:
+         raise ValueError("correct_words must not be empty")
+
+     if not 0.0 <= threshold <= 1.0:
+         raise ValueError("threshold must be between 0.0 and 1.0")
+
+     # Convert inputs to strings
+     compare_word = str(word)
+     original_words = [str(w) for w in correct_words]
+
+     # Handle case sensitivity
+     if not case_sensitive:
+         compare_word = compare_word.lower()
+         compare_words = [w.lower() for w in original_words]
+     else:
+         compare_words = original_words.copy()
+
+     # Get scoring function
+     algo_name: str | None = None
+     if isinstance(algorithm, SimilarityAlgo):
+         algo_name = algorithm.value
+         score_func = SIMILARITY_ALGO_MAP[algo_name]
+     elif isinstance(algorithm, str):
+         algo_name = algorithm
+         score_func = SIMILARITY_ALGO_MAP.get(algo_name)
+         if score_func is None:
+             raise ValueError(f"Unsupported algorithm: {algo_name}")
+     elif callable(algorithm):
+         score_func = algorithm
+     else:
+         raise ValueError("algorithm must be a string specifying a built-in algorithm or a callable")
+
+     # Calculate similarities
+     results = []
+     for idx, (orig_word, comp_word) in enumerate(zip(original_words, compare_words, strict=False)):
+         # Skip different length strings for hamming similarity
+         if algo_name == "hamming" and len(comp_word) != len(compare_word):
+             continue
+
+         score = score_func(compare_word, comp_word)  # type: ignore[operator]
+         if score >= threshold:
+             results.append(MatchResult(orig_word, score, idx))
+
+     # Return None if no matches
+     if not results:
+         return None
+
+     # Sort by score (descending) and index (ascending) for stable ordering
+     results.sort(key=lambda x: (-x.score, x.index))
+
+     # For case-sensitive matching, keep only the results tied at the top score
+     if case_sensitive:
+         max_score = results[0].score
+         results = [r for r in results if r.score == max_score]
+
+     # Return results
+     if return_most_similar:
+         return results[0].word
+
+     return [r.word for r in results]
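A usage sketch of `string_similarity` with an illustrative word list; the expected outputs follow from the algorithms above (hand-computed, so treat them as approximate):

```python
from lionherd_core.libs.string_handlers import SimilarityAlgo, string_similarity

words = ["apple", "apply", "ample", "maple"]

# All matches at or above the threshold, best first
# (jaro_winkler: apple ~0.95, apply ~0.91, ample ~0.81, maple ~0.78).
print(string_similarity("appel", words, algorithm="jaro_winkler", threshold=0.8))
# ['apple', 'apply', 'ample']

# Single best match via the enum form; score ties break by input order.
print(string_similarity("appel", words, algorithm=SimilarityAlgo.LEVENSHTEIN,
                        return_most_similar=True))
# 'apple'
```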