lionherd-core 1.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lionherd_core/__init__.py +84 -0
- lionherd_core/base/__init__.py +30 -0
- lionherd_core/base/_utils.py +295 -0
- lionherd_core/base/broadcaster.py +128 -0
- lionherd_core/base/element.py +300 -0
- lionherd_core/base/event.py +322 -0
- lionherd_core/base/eventbus.py +112 -0
- lionherd_core/base/flow.py +236 -0
- lionherd_core/base/graph.py +616 -0
- lionherd_core/base/node.py +212 -0
- lionherd_core/base/pile.py +811 -0
- lionherd_core/base/progression.py +261 -0
- lionherd_core/errors.py +104 -0
- lionherd_core/libs/__init__.py +2 -0
- lionherd_core/libs/concurrency/__init__.py +60 -0
- lionherd_core/libs/concurrency/_cancel.py +85 -0
- lionherd_core/libs/concurrency/_errors.py +80 -0
- lionherd_core/libs/concurrency/_patterns.py +238 -0
- lionherd_core/libs/concurrency/_primitives.py +253 -0
- lionherd_core/libs/concurrency/_priority_queue.py +135 -0
- lionherd_core/libs/concurrency/_resource_tracker.py +66 -0
- lionherd_core/libs/concurrency/_task.py +58 -0
- lionherd_core/libs/concurrency/_utils.py +61 -0
- lionherd_core/libs/schema_handlers/__init__.py +35 -0
- lionherd_core/libs/schema_handlers/_function_call_parser.py +122 -0
- lionherd_core/libs/schema_handlers/_minimal_yaml.py +88 -0
- lionherd_core/libs/schema_handlers/_schema_to_model.py +251 -0
- lionherd_core/libs/schema_handlers/_typescript.py +153 -0
- lionherd_core/libs/string_handlers/__init__.py +15 -0
- lionherd_core/libs/string_handlers/_extract_json.py +65 -0
- lionherd_core/libs/string_handlers/_fuzzy_json.py +103 -0
- lionherd_core/libs/string_handlers/_string_similarity.py +347 -0
- lionherd_core/libs/string_handlers/_to_num.py +63 -0
- lionherd_core/ln/__init__.py +45 -0
- lionherd_core/ln/_async_call.py +314 -0
- lionherd_core/ln/_fuzzy_match.py +166 -0
- lionherd_core/ln/_fuzzy_validate.py +151 -0
- lionherd_core/ln/_hash.py +141 -0
- lionherd_core/ln/_json_dump.py +347 -0
- lionherd_core/ln/_list_call.py +110 -0
- lionherd_core/ln/_to_dict.py +373 -0
- lionherd_core/ln/_to_list.py +190 -0
- lionherd_core/ln/_utils.py +156 -0
- lionherd_core/lndl/__init__.py +62 -0
- lionherd_core/lndl/errors.py +30 -0
- lionherd_core/lndl/fuzzy.py +321 -0
- lionherd_core/lndl/parser.py +427 -0
- lionherd_core/lndl/prompt.py +137 -0
- lionherd_core/lndl/resolver.py +323 -0
- lionherd_core/lndl/types.py +287 -0
- lionherd_core/protocols.py +181 -0
- lionherd_core/py.typed +0 -0
- lionherd_core/types/__init__.py +46 -0
- lionherd_core/types/_sentinel.py +131 -0
- lionherd_core/types/base.py +341 -0
- lionherd_core/types/operable.py +133 -0
- lionherd_core/types/spec.py +313 -0
- lionherd_core/types/spec_adapters/__init__.py +10 -0
- lionherd_core/types/spec_adapters/_protocol.py +125 -0
- lionherd_core/types/spec_adapters/pydantic_field.py +177 -0
- lionherd_core-1.0.0a3.dist-info/METADATA +502 -0
- lionherd_core-1.0.0a3.dist-info/RECORD +64 -0
- lionherd_core-1.0.0a3.dist-info/WHEEL +4 -0
- lionherd_core-1.0.0a3.dist-info/licenses/LICENSE +201 -0
+++ lionherd_core/libs/schema_handlers/_typescript.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+
+def _type_map(json_type: str) -> str:
+    """Map JSON Schema types to TypeScript-like types."""
+    mapping = {
+        "string": "string",
+        "integer": "int",
+        "number": "float",
+        "boolean": "bool",
+        "array": "array",  # Will be handled specially
+        "object": "object",
+        "null": "null",
+    }
+    return mapping.get(json_type, json_type)
+
+
+def _format_enum_union(enum_values: list) -> str:
+    """Format enum values as TypeScript union of literals."""
+    formatted = []
+    for val in enum_values:
+        if isinstance(val, str):
+            formatted.append(f'"{val}"')
+        elif val is None:
+            formatted.append("null")
+        else:
+            formatted.append(str(val))
+    return " | ".join(formatted)
+
+
+def _extract_type_signature(field_spec: dict, required: bool) -> tuple[str, bool]:
+    """Extract TypeScript-style type signature from JSON Schema field."""
+    # Handle enums first (most specific)
+    if "enum" in field_spec:
+        type_sig = _format_enum_union(field_spec["enum"])
+        # Check if enum includes null
+        has_null = None in field_spec["enum"]
+        is_optional = not required or has_null
+        return type_sig, is_optional
+
+    # Handle anyOf (unions)
+    if "anyOf" in field_spec:
+        options = field_spec["anyOf"]
+        type_parts = []
+        has_null = False
+
+        for opt in options:
+            if opt.get("type") == "null":
+                has_null = True
+                continue
+
+            if "type" in opt:
+                if opt["type"] == "array" and "items" in opt:
+                    item_type = _type_map(opt["items"].get("type", "any"))
+                    if "$ref" in opt["items"]:
+                        item_type = opt["items"]["$ref"].split("/")[-1]
+                    type_parts.append(f"{item_type}[]")
+                else:
+                    type_parts.append(_type_map(opt["type"]))
+            elif "$ref" in opt:
+                ref_name = opt["$ref"].split("/")[-1]
+                type_parts.append(ref_name)
+            elif "enum" in opt:
+                type_parts.append(_format_enum_union(opt["enum"]))
+
+        if has_null:
+            type_parts.append("null")
+
+        type_sig = " | ".join(type_parts) if type_parts else "any"
+        is_optional = not required or has_null
+        return type_sig, is_optional
+
+    # Handle arrays
+    if "type" in field_spec and field_spec["type"] == "array":
+        if "items" in field_spec:
+            items_spec = field_spec["items"]
+            if "type" in items_spec:
+                item_type = _type_map(items_spec["type"])
+            elif "$ref" in items_spec:
+                item_type = items_spec["$ref"].split("/")[-1]
+            elif "enum" in items_spec:
+                item_type = f"({_format_enum_union(items_spec['enum'])})"
+            else:
+                item_type = "any"
+        else:
+            item_type = "any"
+        type_sig = f"{item_type}[]"
+        is_optional = not required
+        return type_sig, is_optional
+
+    # Handle $ref
+    if "$ref" in field_spec:
+        ref_name = field_spec["$ref"].split("/")[-1]
+        is_optional = not required
+        return ref_name, is_optional
+
+    # Handle simple types
+    if "type" in field_spec:
+        base_type = field_spec["type"]
+        # Handle nullable simple types (type? suffix from simplification)
+        if isinstance(base_type, str) and base_type.endswith("?"):
+            type_sig = _type_map(base_type[:-1])
+            is_optional = True
+        else:
+            type_sig = _type_map(base_type)
+            is_optional = not required
+        return type_sig, is_optional
+
+    # Fallback
+    return "any", not required
+
+
+def typescript_schema(schema: dict, indent: int = 0) -> str:
+    """Convert JSON Schema to TypeScript-style notation for optimal LLM comprehension."""
+    lines = []
+    prefix = " " * indent
+
+    if "properties" not in schema:
+        return ""
+
+    required_fields = set(schema.get("required", []))
+
+    for field_name, field_spec in schema["properties"].items():
+        is_required = field_name in required_fields
+
+        # Extract type signature
+        type_sig, is_optional = _extract_type_signature(field_spec, is_required)
+
+        # Add default value if present
+        default_str = ""
+        if "default" in field_spec:
+            default_val = field_spec["default"]
+            if isinstance(default_val, str):
+                default_str = f' = "{default_val}"'
+            elif default_val is None:
+                default_str = " = null"
+            elif isinstance(default_val, bool):
+                default_str = f" = {'true' if default_val else 'false'}"
+            else:
+                default_str = f" = {default_val}"
+
+        # Build field definition
+        optional_marker = "?" if is_optional else ""
+        field_def = f"{prefix}{field_name}{optional_marker}: {type_sig}{default_str}"
+
+        # Add description if present
+        if field_spec.get("description"):
+            field_def += f" - {field_spec['description']}"
+
+        lines.append(field_def)
+
+    return "\n".join(lines)
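For orientation, a minimal sketch (not part of the published diff) of what typescript_schema produces; the schema, the $defs reference, and the direct import from the private _typescript module are assumptions for illustration:

from lionherd_core.libs.schema_handlers._typescript import typescript_schema

schema = {
    "properties": {
        "name": {"type": "string", "description": "Display name"},
        "retries": {"type": "integer", "default": 3},
        "mode": {"enum": ["fast", "safe", None]},
        "tags": {"type": "array", "items": {"type": "string"}},
        "owner": {"anyOf": [{"$ref": "#/$defs/User"}, {"type": "null"}]},
    },
    "required": ["name", "tags"],
}

print(typescript_schema(schema))
# name: string - Display name
# retries?: int = 3
# mode?: "fast" | "safe" | null
# tags: string[]
# owner?: User | null

Note how null membership (in an enum or anyOf) marks a field optional even when it is listed as required.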
+++ lionherd_core/libs/string_handlers/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+from ._extract_json import extract_json
+from ._fuzzy_json import fuzzy_json
+from ._string_similarity import SimilarityAlgo, string_similarity
+from ._to_num import to_num
+
+__all__ = (
+    "SimilarityAlgo",
+    "extract_json",
+    "fuzzy_json",
+    "string_similarity",
+    "to_num",
+)
+++ lionherd_core/libs/string_handlers/_extract_json.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+import re
+from typing import Any
+
+import orjson
+
+from ._fuzzy_json import fuzzy_json
+
+# Precompile the regex for extracting JSON code blocks
+_JSON_BLOCK_PATTERN = re.compile(r"```json\s*(.*?)\s*```", re.DOTALL)
+
+
+def extract_json(
+    input_data: str | list[str],
+    /,
+    *,
+    fuzzy_parse: bool = False,
+    return_one_if_single: bool = True,
+) -> Any | list[Any]:
+    """Extract and parse JSON content from a string or markdown code blocks.
+    Attempts direct JSON parsing first. If that fails, looks for JSON content
+    within markdown code blocks denoted by ```json.
+
+    Args:
+        input_data (str | list[str]): The input string or list of strings to parse.
+        fuzzy_parse (bool): If True, attempts fuzzy JSON parsing on failed attempts.
+        return_one_if_single (bool): If True and only one JSON object is found,
+            returns a dict instead of a list with one dict.
+    """
+
+    # If input_data is a list, join into a single string
+    input_str = "\n".join(input_data) if isinstance(input_data, list) else input_data
+
+    # 1. Try direct parsing
+
+    with contextlib.suppress(Exception):
+        if fuzzy_parse:
+            return fuzzy_json(input_str)
+        return orjson.loads(input_str)
+
+    # 2. Attempt extracting JSON blocks from markdown
+    matches = _JSON_BLOCK_PATTERN.findall(input_str)
+    if not matches:
+        return []
+
+    # If only one match, return single dict; if multiple, return list of dicts
+    if return_one_if_single and len(matches) == 1:
+        data_str = matches[0]
+        with contextlib.suppress(Exception):
+            if fuzzy_parse:
+                return fuzzy_json(data_str)
+            return orjson.loads(data_str)
+        return []
+
+    # Multiple matches
+    results: list[Any] = []
+    for m in matches:
+        with contextlib.suppress(Exception):
+            parsed = fuzzy_json(m) if fuzzy_parse else orjson.loads(m)
+            # Append valid JSON (dicts, lists, or primitives)
+            results.append(parsed)
+    return results
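A minimal sketch (not part of the published diff) of extract_json on an LLM-style reply; the inputs are invented for illustration, and the import works through the string_handlers __init__ shown above:

from lionherd_core.libs.string_handlers import extract_json

reply = """Here is the result:
```json
{"status": "ok", "count": 2}
```"""

extract_json(reply)                          # -> {'status': 'ok', 'count': 2}
extract_json("not json, no code block")      # -> []
extract_json('{"a": 1,}', fuzzy_parse=True)  # trailing comma repaired -> {'a': 1}

The first call fails direct parsing, falls through to the ```json block regex, and returns the single parsed object as a dict because return_one_if_single defaults to True.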
+++ lionherd_core/libs/string_handlers/_fuzzy_json.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+import contextlib
+import re
+from typing import Any
+
+import orjson
+
+
+def fuzzy_json(str_to_parse: str, /) -> dict[str, Any] | list[dict[str, Any]]:
+    """Parse JSON string with fuzzy error correction (quotes, spacing, brackets)."""
+    _check_valid_str(str_to_parse)
+
+    # 1. Direct attempt
+    with contextlib.suppress(orjson.JSONDecodeError):
+        return orjson.loads(str_to_parse)
+
+    # 2. Try cleaning: replace single quotes with double and normalize
+    cleaned = _clean_json_string(str_to_parse.replace("'", '"'))
+    with contextlib.suppress(orjson.JSONDecodeError):
+        return orjson.loads(cleaned)
+
+    # 3. Try fixing brackets
+    fixed = fix_json_string(cleaned)
+    with contextlib.suppress(orjson.JSONDecodeError):
+        return orjson.loads(fixed)
+
+    # If all attempts fail
+    raise ValueError("Invalid JSON string")
+
+
+def _check_valid_str(str_to_parse: str, /):
+    if not isinstance(str_to_parse, str):
+        raise TypeError("Input must be a string")
+    if not str_to_parse.strip():
+        raise ValueError("Input string is empty")
+
+
+def _clean_json_string(s: str) -> str:
+    """Basic normalization: replace unescaped single quotes, trim spaces, ensure keys are quoted."""
+    # Replace unescaped single quotes with double quotes
+    # (?<!\\)' matches a single quote not preceded by a backslash
+    s = re.sub(r"(?<!\\)'", '"', s)
+    # Collapse multiple whitespaces
+    s = re.sub(r"\s+", " ", s)
+    # Remove trailing commas before closing brackets/braces
+    s = re.sub(r",\s*([}\]])", r"\1", s)
+    # Ensure keys are quoted
+    # This attempts to find patterns like { key: value } and turn them into {"key": value}
+    s = re.sub(r'([{,])\s*([^"\s]+)\s*:', r'\1"\2":', s)
+    return s.strip()
+
+
+def fix_json_string(str_to_parse: str, /) -> str:
+    """Fix JSON string by balancing unmatched brackets."""
+    if not str_to_parse:
+        raise ValueError("Input string is empty")
+
+    brackets = {"{": "}", "[": "]"}
+    open_brackets = []
+    pos = 0
+    length = len(str_to_parse)
+
+    while pos < length:
+        char = str_to_parse[pos]
+
+        if char == "\\":
+            pos += 2  # Skip escaped chars
+            continue
+
+        if char == '"':
+            pos += 1
+            # skip string content
+            while pos < length:
+                if str_to_parse[pos] == "\\":
+                    pos += 2
+                    continue
+                if str_to_parse[pos] == '"':
+                    pos += 1
+                    break
+                pos += 1
+            continue
+
+        if char in brackets:
+            open_brackets.append(brackets[char])
+        elif char in brackets.values():
+            if not open_brackets:
+                # Extra closing bracket
+                # Better to raise error than guess
+                raise ValueError("Extra closing bracket found.")
+            if open_brackets[-1] != char:
+                # Mismatched bracket
+                raise ValueError("Mismatched brackets.")
+            open_brackets.pop()
+
+        pos += 1
+
+    # Add missing closing brackets if any
+    if open_brackets:
+        str_to_parse += "".join(reversed(open_brackets))
+
+    return str_to_parse
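A minimal sketch (not part of the published diff) tracing the three repair stages of fuzzy_json; inputs invented for illustration:

from lionherd_core.libs.string_handlers import fuzzy_json

fuzzy_json('{"a": 1}')             # stage 1: already valid JSON -> {'a': 1}
fuzzy_json("{'a': 1,}")            # stage 2: quotes and trailing comma normalized -> {'a': 1}
fuzzy_json('{"a": [1, 2')          # stage 3: missing "]}" appended by fix_json_string -> {'a': [1, 2]}
fuzzy_json("definitely not json")  # all three stages fail -> ValueError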
+++ lionherd_core/libs/string_handlers/_string_similarity.py
@@ -0,0 +1,347 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal
+
+from lionherd_core.types.base import Enum
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
+__all__ = ("SimilarityAlgo", "string_similarity")
+
+
+class SimilarityAlgo(Enum):
+    """String similarity algorithm names.
+
+    Provides type-safe enum values while maintaining string compatibility.
+    Use .allowed() to get all valid algorithm names as a tuple.
+    """
+
+    JARO_WINKLER = "jaro_winkler"
+    LEVENSHTEIN = "levenshtein"
+    SEQUENCE_MATCHER = "sequence_matcher"
+    HAMMING = "hamming"
+    COSINE = "cosine"
+
+
+def cosine_similarity(s1: str, s2: str) -> float:
+    """Calculate the cosine similarity between two strings.
+
+    Args:
+        s1: First input string
+        s2: Second input string
+
+    Returns:
+        float: Cosine similarity score between 0 and 1
+    """
+    if not s1 or not s2:
+        return 0.0
+
+    set1, set2 = set(s1), set(s2)
+    intersection = set1.intersection(set2)
+
+    return len(intersection) / ((len(set1) * len(set2)) ** 0.5)
+
+
+def hamming_similarity(s1: str, s2: str) -> float:
+    """Calculate the Hamming similarity between two strings.
+
+    The strings must be of equal length. Returns the proportion of positions
+    at which corresponding symbols are the same.
+
+    Args:
+        s1: First input string
+        s2: Second input string
+
+    Returns:
+        float: Hamming similarity score between 0 and 1
+    """
+    if not s1 or not s2 or len(s1) != len(s2):
+        return 0.0
+
+    matches = sum(c1 == c2 for c1, c2 in zip(s1, s2, strict=False))
+    return matches / len(s1)
+
+
+def jaro_distance(s: str, t: str) -> float:
+    """Calculate the Jaro distance between two strings.
+
+    Args:
+        s: First input string
+        t: Second input string
+
+    Returns:
+        float: Jaro distance score between 0 and 1
+    """
+    s_len = len(s)
+    t_len = len(t)
+
+    if s_len == 0 and t_len == 0:
+        return 1.0
+    elif s_len == 0 or t_len == 0:
+        return 0.0
+
+    match_distance = (max(s_len, t_len) // 2) - 1
+    match_distance = max(0, match_distance)  # Ensure non-negative
+
+    s_matches = [False] * s_len
+    t_matches = [False] * t_len
+
+    matches = 0
+    transpositions = 0
+
+    # Identify matches
+    for i in range(s_len):
+        start = max(0, i - match_distance)
+        end = min(i + match_distance + 1, t_len)
+
+        for j in range(start, end):
+            if t_matches[j] or s[i] != t[j]:
+                continue
+            s_matches[i] = t_matches[j] = True
+            matches += 1
+            break
+
+    if matches == 0:
+        return 0.0
+
+    # Count transpositions
+    k = 0
+    for i in range(s_len):
+        if not s_matches[i]:
+            continue
+        while not t_matches[k]:
+            k += 1
+        if s[i] != t[k]:
+            transpositions += 1
+        k += 1
+
+    transpositions //= 2
+
+    return (matches / s_len + matches / t_len + (matches - transpositions) / matches) / 3.0
+
+
+def jaro_winkler_similarity(s: str, t: str, scaling: float = 0.1) -> float:
+    """Calculate the Jaro-Winkler similarity between two strings.
+
+    Args:
+        s: First input string
+        t: Second input string
+        scaling: Scaling factor for common prefix adjustment
+
+    Returns:
+        float: Jaro-Winkler similarity score between 0 and 1
+
+    Raises:
+        ValueError: If scaling factor is not between 0 and 0.25
+    """
+    if not 0 <= scaling <= 0.25:
+        raise ValueError("Scaling factor must be between 0 and 0.25")
+
+    jaro_sim = jaro_distance(s, t)
+
+    # Find length of common prefix (up to 4 chars)
+    prefix_len = 0
+    for s_char, t_char in zip(s, t, strict=False):
+        if s_char != t_char:
+            break
+        prefix_len += 1
+        if prefix_len == 4:
+            break
+
+    return jaro_sim + (prefix_len * scaling * (1 - jaro_sim))
+
+
+def levenshtein_distance(a: str, b: str) -> int:
+    """Calculate the Levenshtein (edit) distance between two strings.
+
+    Args:
+        a: First input string
+        b: Second input string
+
+    Returns:
+        int: Minimum number of single-character edits needed to change one
+            string into the other
+    """
+    from itertools import product
+
+    if not a:
+        return len(b)
+    if not b:
+        return len(a)
+
+    m, n = len(a), len(b)
+    d = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        d[i][0] = i
+    for j in range(n + 1):
+        d[0][j] = j
+
+    for i, j in product(range(1, m + 1), range(1, n + 1)):
+        cost = 0 if a[i - 1] == b[j - 1] else 1
+        d[i][j] = min(
+            d[i - 1][j] + 1,  # deletion
+            d[i][j - 1] + 1,  # insertion
+            d[i - 1][j - 1] + cost,  # substitution
+        )
+
+    return d[m][n]
+
+
+def levenshtein_similarity(s1: str, s2: str) -> float:
+    """Calculate the Levenshtein similarity between two strings.
+
+    Converts Levenshtein distance to a similarity score between 0 and 1.
+
+    Args:
+        s1: First input string
+        s2: Second input string
+
+    Returns:
+        float: Levenshtein similarity score between 0 and 1
+    """
+    if not s1 and not s2:
+        return 1.0
+    if not s1 or not s2:
+        return 0.0
+
+    distance = levenshtein_distance(s1, s2)
+    max_len = max(len(s1), len(s2))
+    return 1 - (distance / max_len)
+
+
+def sequence_matcher_similarity(s1: str, s2: str) -> float:
+    """Calculate similarity using Python's SequenceMatcher.
+
+    Args:
+        s1: First input string
+        s2: Second input string
+
+    Returns:
+        float: Similarity score between 0 and 1
+    """
+    from difflib import SequenceMatcher
+
+    return SequenceMatcher(None, s1, s2).ratio()
+
+
+# Map of available similarity algorithms
+SIMILARITY_ALGO_MAP = {
+    "jaro_winkler": jaro_winkler_similarity,
+    "levenshtein": levenshtein_similarity,
+    "sequence_matcher": sequence_matcher_similarity,
+    "hamming": hamming_similarity,
+    "cosine": cosine_similarity,
+}
+
+
+SIMILARITY_TYPE = Literal[
+    "jaro_winkler",
+    "levenshtein",
+    "sequence_matcher",
+    "hamming",
+    "cosine",
+]
+
+# Type alias for similarity functions
+SimilarityFunc = Callable[[str, str], float]
+
+
+@dataclass(frozen=True)
+class MatchResult:
+    """Represents a string matching result."""
+
+    word: str
+    score: float
+    index: int
+
+
+def string_similarity(
+    word: str,
+    correct_words: "Sequence[str]",
+    algorithm: SIMILARITY_TYPE | SimilarityAlgo | Callable[[str, str], float] = "jaro_winkler",
+    threshold: float = 0.0,
+    case_sensitive: bool = False,
+    return_most_similar: bool = False,
+) -> str | list[str] | None:
+    """Find similar strings using specified similarity algorithm.
+
+    Args:
+        word: The input string to find matches for
+        correct_words: List of strings to compare against
+        algorithm: Similarity algorithm to use
+        threshold: Minimum similarity score (0.0 to 1.0)
+        case_sensitive: Whether to consider case when matching
+        return_most_similar: Return only the most similar match
+
+    Returns:
+        Matching string(s) or None if no matches found
+
+    Raises:
+        ValueError: If correct_words is empty or threshold is invalid
+    """
+    if not correct_words:
+        raise ValueError("correct_words must not be empty")
+
+    if not 0.0 <= threshold <= 1.0:
+        raise ValueError("threshold must be between 0.0 and 1.0")
+
+    # Convert inputs to strings
+    compare_word = str(word)
+    original_words = [str(w) for w in correct_words]
+
+    # Handle case sensitivity
+    if not case_sensitive:
+        compare_word = compare_word.lower()
+        compare_words = [w.lower() for w in original_words]
+    else:
+        compare_words = original_words.copy()
+
+    # Get scoring function
+    algo_name: str | None = None
+    if isinstance(algorithm, SimilarityAlgo):
+        algo_name = algorithm.value
+        score_func = SIMILARITY_ALGO_MAP[algo_name]
+    elif isinstance(algorithm, str):
+        algo_name = algorithm
+        score_func = SIMILARITY_ALGO_MAP.get(algo_name)
+        if score_func is None:
+            raise ValueError(f"Unsupported algorithm: {algo_name}")
+    elif callable(algorithm):
+        score_func = algorithm
+    else:
+        raise ValueError("algorithm must be a string specifying a built-in algorithm or a callable")
+
+    # Calculate similarities
+    results = []
+    for idx, (orig_word, comp_word) in enumerate(zip(original_words, compare_words, strict=False)):
+        # Skip different length strings for hamming similarity
+        if algo_name == "hamming" and len(comp_word) != len(compare_word):
+            continue
+
+        score = score_func(compare_word, comp_word)  # type: ignore[operator]
+        if score >= threshold:
+            results.append(MatchResult(orig_word, score, idx))
+
+    # Return None if no matches
+    if not results:
+        return None
+
+    # Sort by score (descending) and index (ascending) for stable ordering
+    results.sort(key=lambda x: (-x.score, x.index))
+
+    # For case-sensitive comparisons, keep only matches tied at the highest score
+    if case_sensitive:
+        max_score = results[0].score
+        results = [r for r in results if r.score == max_score]
+
+    # Return results
+    if return_most_similar:
+        return results[0].word
+
+    return [r.word for r in results]
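A minimal sketch (not part of the published diff) of typical string_similarity calls; the word list and the worked score are illustrative:

from lionherd_core.libs.string_handlers import SimilarityAlgo, string_similarity

words = ["apple", "apply", "ample", "maple"]

# Default algorithm is Jaro-Winkler, case-insensitive.
# jaro("appel", "apple") = (1 + 1 + 4/5) / 3 ≈ 0.933; the shared "app" prefix
# lifts it to 0.933 + 3 * 0.1 * (1 - 0.933) ≈ 0.953.
string_similarity("appel", words, return_most_similar=True)  # -> 'apple'

# With a threshold, every match at or above it comes back, sorted by score.
string_similarity("appel", words, threshold=0.9)  # -> ['apple', 'apply']

# Built-in algorithms can also be selected via the SimilarityAlgo enum.
string_similarity(
    "APPLE", words, algorithm=SimilarityAlgo.LEVENSHTEIN, return_most_similar=True
)  # -> 'apple' (matching is case-insensitive by default)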