krons-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kronos/__init__.py +0 -0
- kronos/core/__init__.py +145 -0
- kronos/core/broadcaster.py +116 -0
- kronos/core/element.py +225 -0
- kronos/core/event.py +316 -0
- kronos/core/eventbus.py +116 -0
- kronos/core/flow.py +356 -0
- kronos/core/graph.py +442 -0
- kronos/core/node.py +982 -0
- kronos/core/pile.py +575 -0
- kronos/core/processor.py +494 -0
- kronos/core/progression.py +296 -0
- kronos/enforcement/__init__.py +57 -0
- kronos/enforcement/common/__init__.py +34 -0
- kronos/enforcement/common/boolean.py +85 -0
- kronos/enforcement/common/choice.py +97 -0
- kronos/enforcement/common/mapping.py +118 -0
- kronos/enforcement/common/model.py +102 -0
- kronos/enforcement/common/number.py +98 -0
- kronos/enforcement/common/string.py +140 -0
- kronos/enforcement/context.py +129 -0
- kronos/enforcement/policy.py +80 -0
- kronos/enforcement/registry.py +153 -0
- kronos/enforcement/rule.py +312 -0
- kronos/enforcement/service.py +370 -0
- kronos/enforcement/validator.py +198 -0
- kronos/errors.py +146 -0
- kronos/operations/__init__.py +32 -0
- kronos/operations/builder.py +228 -0
- kronos/operations/flow.py +398 -0
- kronos/operations/node.py +101 -0
- kronos/operations/registry.py +92 -0
- kronos/protocols.py +414 -0
- kronos/py.typed +0 -0
- kronos/services/__init__.py +81 -0
- kronos/services/backend.py +286 -0
- kronos/services/endpoint.py +608 -0
- kronos/services/hook.py +471 -0
- kronos/services/imodel.py +465 -0
- kronos/services/registry.py +115 -0
- kronos/services/utilities/__init__.py +36 -0
- kronos/services/utilities/header_factory.py +87 -0
- kronos/services/utilities/rate_limited_executor.py +271 -0
- kronos/services/utilities/rate_limiter.py +180 -0
- kronos/services/utilities/resilience.py +414 -0
- kronos/session/__init__.py +41 -0
- kronos/session/exchange.py +258 -0
- kronos/session/message.py +60 -0
- kronos/session/session.py +411 -0
- kronos/specs/__init__.py +25 -0
- kronos/specs/adapters/__init__.py +0 -0
- kronos/specs/adapters/_utils.py +45 -0
- kronos/specs/adapters/dataclass_field.py +246 -0
- kronos/specs/adapters/factory.py +56 -0
- kronos/specs/adapters/pydantic_adapter.py +309 -0
- kronos/specs/adapters/sql_ddl.py +946 -0
- kronos/specs/catalog/__init__.py +36 -0
- kronos/specs/catalog/_audit.py +39 -0
- kronos/specs/catalog/_common.py +43 -0
- kronos/specs/catalog/_content.py +59 -0
- kronos/specs/catalog/_enforcement.py +70 -0
- kronos/specs/factory.py +120 -0
- kronos/specs/operable.py +314 -0
- kronos/specs/phrase.py +405 -0
- kronos/specs/protocol.py +140 -0
- kronos/specs/spec.py +506 -0
- kronos/types/__init__.py +60 -0
- kronos/types/_sentinel.py +311 -0
- kronos/types/base.py +369 -0
- kronos/types/db_types.py +260 -0
- kronos/types/identity.py +66 -0
- kronos/utils/__init__.py +40 -0
- kronos/utils/_hash.py +234 -0
- kronos/utils/_json_dump.py +392 -0
- kronos/utils/_lazy_init.py +63 -0
- kronos/utils/_to_list.py +165 -0
- kronos/utils/_to_num.py +85 -0
- kronos/utils/_utils.py +375 -0
- kronos/utils/concurrency/__init__.py +205 -0
- kronos/utils/concurrency/_async_call.py +333 -0
- kronos/utils/concurrency/_cancel.py +122 -0
- kronos/utils/concurrency/_errors.py +96 -0
- kronos/utils/concurrency/_patterns.py +363 -0
- kronos/utils/concurrency/_primitives.py +328 -0
- kronos/utils/concurrency/_priority_queue.py +135 -0
- kronos/utils/concurrency/_resource_tracker.py +110 -0
- kronos/utils/concurrency/_run_async.py +67 -0
- kronos/utils/concurrency/_task.py +95 -0
- kronos/utils/concurrency/_utils.py +79 -0
- kronos/utils/fuzzy/__init__.py +14 -0
- kronos/utils/fuzzy/_extract_json.py +90 -0
- kronos/utils/fuzzy/_fuzzy_json.py +288 -0
- kronos/utils/fuzzy/_fuzzy_match.py +149 -0
- kronos/utils/fuzzy/_string_similarity.py +187 -0
- kronos/utils/fuzzy/_to_dict.py +396 -0
- kronos/utils/sql/__init__.py +13 -0
- kronos/utils/sql/_sql_validation.py +142 -0
- krons-0.1.0.dist-info/METADATA +70 -0
- krons-0.1.0.dist-info/RECORD +101 -0
- krons-0.1.0.dist-info/WHEEL +4 -0
- krons-0.1.0.dist-info/licenses/LICENSE +201 -0
kronos/utils/fuzzy/_fuzzy_json.py
@@ -0,0 +1,288 @@
# Copyright (c) 2025 - 2026, HaiyangLi <quantocean.li at gmail dot com>
# SPDX-License-Identifier: Apache-2.0

from typing import Any

import orjson

# Security limit: Maximum input string size for JSON parsing.
# Large inputs can cause memory exhaustion during parsing and regex operations.
# Default: 10MB - generous for most use cases while preventing DoS.
MAX_JSON_INPUT_SIZE = 10 * 1024 * 1024  # 10 MB

__all__ = ("fuzzy_json",)


def fuzzy_json(
    str_to_parse: str,
    /,
    *,
    max_size: int = MAX_JSON_INPUT_SIZE,
) -> dict[str, Any] | list[dict[str, Any]]:
    """Parse JSON string with fuzzy error correction (quotes, spacing, brackets).

    Security Note:
        Input size is limited to prevent memory exhaustion attacks.
        Uses orjson which has built-in recursion depth limits (CVE-2024-27454 fixed).

    Limitation:
        This is designed for "nearly-valid" JSON from LLMs (single quotes, trailing
        commas, unquoted keys). It is NOT a security boundary for adversarial input.

    Args:
        str_to_parse: JSON string to parse
        max_size: Maximum allowed input size in bytes (default: 10MB)

    Returns:
        Either a dict or a list of dicts. Will NOT return primitive types
        (int, float, str, bool, None) or lists of primitives.

    Raises:
        TypeError: If input is not a string or parsed JSON is a primitive type.
        ValueError: If input is empty, exceeds size limit, or parsing fails.
    """
    _check_valid_str(str_to_parse, max_size=max_size)

    # Direct parse
    try:
        result = orjson.loads(str_to_parse)
        return _validate_return_type(result)
    except orjson.JSONDecodeError:
        pass

    # State-machine cleaning
    cleaned = _clean_json_string_safe(str_to_parse)
    try:
        result = orjson.loads(cleaned)
        return _validate_return_type(result)
    except orjson.JSONDecodeError:
        pass

    # Bracket balancing
    fixed = fix_json_string(cleaned)
    try:
        result = orjson.loads(fixed)
        return _validate_return_type(result)
    except orjson.JSONDecodeError:
        pass

    # If all attempts fail
    raise ValueError("Invalid JSON string")


def _check_valid_str(
    str_to_parse: str,
    /,
    *,
    max_size: int = MAX_JSON_INPUT_SIZE,
) -> None:
    """Validate input string for JSON parsing.

    Args:
        str_to_parse: Input string to validate
        max_size: Maximum allowed size in bytes

    Raises:
        TypeError: If input is not a string
        ValueError: If input is empty or exceeds size limit
    """
    if not isinstance(str_to_parse, str):
        raise TypeError("Input must be a string")
    if not str_to_parse.strip():
        raise ValueError("Input string is empty")
    if len(str_to_parse) > max_size:
        msg = (
            f"Input size ({len(str_to_parse)} bytes) exceeds maximum "
            f"({max_size} bytes). This limit prevents memory exhaustion."
        )
        raise ValueError(msg)


def _validate_return_type(result: Any) -> dict[str, Any] | list[dict[str, Any]]:
    """Validate that parsed JSON matches the declared return type.

    Args:
        result: The result from orjson.loads()

    Returns:
        The validated result (dict or list[dict])

    Raises:
        TypeError: If result is a primitive type or list of non-dict elements
    """
    if isinstance(result, dict):
        return result

    if isinstance(result, list):
        if not result:  # Empty list valid (vacuous truth)
            return result
        for i, item in enumerate(result):
            if not isinstance(item, dict):
                raise TypeError(
                    f"fuzzy_json returns dict or list[dict], got list with "
                    f"non-dict element at index {i}: {type(item).__name__}"
                )
        return result

    raise TypeError(
        f"fuzzy_json returns dict or list[dict], got primitive type: {type(result).__name__}"
    )


def _clean_json_string_safe(s: str) -> str:
    """State-machine JSON cleaner that preserves string content.

    Fixes common LLM output issues without corrupting quoted content:
    - Single quotes -> double quotes (with escaping)
    - Trailing commas before ] or }
    - Unquoted object keys
    """
    result: list[str] = []
    pos = 0
    length = len(s)

    while pos < length:
        char = s[pos]

        if char == "'":  # Convert single-quoted string to double-quoted
            result.append('"')
            pos += 1
            while pos < length:
                inner_char = s[pos]
                if inner_char == "\\":  # Escape sequence
                    if pos + 1 < length:
                        next_char = s[pos + 1]
                        if next_char == "'":  # \' -> apostrophe
                            result.append("'")
                            pos += 2
                            continue
                        result.append(inner_char)
                        result.append(next_char)
                        pos += 2
                        continue
                    result.append(inner_char)
                    pos += 1
                    continue
                if inner_char == "'":  # End single-quoted string
                    result.append('"')
                    pos += 1
                    break
                if inner_char == '"':  # Escape inner double quotes
                    result.append('\\"')
                    pos += 1
                    continue
                result.append(inner_char)
                pos += 1
            continue

        if char == '"':  # Pass through double-quoted strings
            result.append(char)
            pos += 1
            while pos < length:
                inner_char = s[pos]
                if inner_char == "\\":  # Copy escape sequences
                    result.append(inner_char)
                    if pos + 1 < length:
                        pos += 1
                        result.append(s[pos])
                    pos += 1
                    continue
                result.append(inner_char)
                if inner_char == '"':
                    pos += 1
                    break
                pos += 1
            continue

        if char in "{,":  # Handle unquoted keys and trailing commas
            if char == ",":  # Check for trailing comma
                lookahead = pos + 1
                while lookahead < length and s[lookahead] in " \t\n\r":
                    lookahead += 1
                if lookahead < length and s[lookahead] in "]}":
                    pos += 1  # Skip trailing comma
                    continue

            result.append(char)
            pos += 1

            while pos < length and s[pos] in " \t\n\r":  # Skip whitespace
                result.append(s[pos])
                pos += 1

            if pos < length and s[pos] not in "\"'{[":  # Check for unquoted key
                key_start = pos
                while pos < length and (s[pos].isalnum() or s[pos] == "_"):
                    pos += 1
                if pos < length and key_start < pos:
                    key_end = pos
                    while pos < length and s[pos] in " \t\n\r":
                        pos += 1
                    if pos < length and s[pos] == ":":
                        key = s[key_start:key_end]
                        result.append(f'"{key}"')
                        continue
                    else:
                        pos = key_start  # Not a key, restore
            continue

        result.append(char)
        pos += 1

    return "".join(result).strip()


def fix_json_string(str_to_parse: str, /) -> str:
    """Balance unmatched brackets in JSON string.

    Args:
        str_to_parse: JSON string with potentially missing closing brackets.

    Returns:
        JSON string with missing closing brackets appended.

    Raises:
        ValueError: If input is empty or has mismatched/extra brackets.
    """
    if not str_to_parse:
        raise ValueError("Input string is empty")

    brackets = {"{": "}", "[": "]"}
    open_brackets = []
    pos = 0
    length = len(str_to_parse)

    while pos < length:
        char = str_to_parse[pos]

        if char == "\\":
            pos += 2
            continue

        if char == '"':
            pos += 1
            while pos < length:  # Skip string content
                if str_to_parse[pos] == "\\":
                    pos += 2
                    continue
                if str_to_parse[pos] == '"':
                    pos += 1
                    break
                pos += 1
            continue

        if char in brackets:
            open_brackets.append(brackets[char])
        elif char in brackets.values():
            if not open_brackets:
                raise ValueError("Extra closing bracket found.")
            if open_brackets[-1] != char:
                raise ValueError("Mismatched brackets.")
            open_brackets.pop()

        pos += 1

    if open_brackets:  # Append missing closing brackets
        str_to_parse += "".join(reversed(open_brackets))

    return str_to_parse
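
A minimal usage sketch for fuzzy_json (not part of the wheel). It imports from the private module path listed in the RECORD above; whether kronos/utils/fuzzy/__init__.py re-exports the function publicly is not visible in this diff, so the direct import is an assumption.

# Sketch: repairing typical "nearly-valid" LLM JSON with fuzzy_json.
# Assumes the wheel is installed and the private path below is importable.
from kronos.utils.fuzzy._fuzzy_json import fuzzy_json

# Single quotes, an unquoted key, and a trailing comma are all repaired:
print(fuzzy_json("{'name': 'Ada', age: 36,}"))  # {'name': 'Ada', 'age': 36}

# A missing closing brace is balanced in the fix_json_string stage:
print(fuzzy_json('{"items": [1, 2]'))  # {'items': [1, 2]}

# Primitives are rejected by design:
try:
    fuzzy_json("42")
except TypeError as exc:
    print(exc)  # ... got primitive type: int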

kronos/utils/fuzzy/_fuzzy_match.py
@@ -0,0 +1,149 @@
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from kronos.types import KeysLike

from kronos.types._sentinel import Unset

from ._string_similarity import SimilarityAlgo, string_similarity

__all__ = ("fuzzy_match_keys",)


class HandleUnmatched(Enum):
    """Strategy for handling unmatched keys in fuzzy_match_keys."""

    IGNORE = "ignore"
    """Keep original keys as-is."""

    RAISE = "raise"
    """Raise ValueError on unmatched keys."""

    REMOVE = "remove"
    """Remove unmatched original keys."""

    FILL = "fill"
    """Fill missing expected keys, keep unmatched original keys."""

    FORCE = "force"
    """Fill missing expected keys, remove unmatched original keys."""


def fuzzy_match_keys(
    d_: dict[str, Any],
    keys: KeysLike,
    /,
    *,
    similarity_algo: SimilarityAlgo = SimilarityAlgo.JARO_WINKLER,
    similarity_threshold: float = 0.85,
    fuzzy_match: bool = True,
    handle_unmatched: HandleUnmatched = HandleUnmatched.IGNORE,
    fill_value: Any = Unset,
    fill_mapping: dict[str, Any] | None = None,
    strict: bool = False,
) -> dict[str, Any]:
    """Validate and correct dict keys using fuzzy string matching.

    Args:
        d_: Input dictionary to validate.
        keys: Expected keys (list, tuple, or object with .keys()).
        similarity_algo: Algorithm for string similarity comparison.
        similarity_threshold: Minimum similarity score (0.0-1.0).
        fuzzy_match: Enable fuzzy matching for non-exact matches.
        handle_unmatched: Strategy for unmatched keys (see HandleUnmatched).
        fill_value: Default value for missing expected keys.
        fill_mapping: Per-key values for specific missing keys.
        strict: Raise if any expected keys remain unmatched.

    Returns:
        Dictionary with keys corrected to match expected keys.

    Raises:
        TypeError: If d_ is not dict or keys is None.
        ValueError: If threshold out of range, or unmatched in RAISE mode.
    """
    # Input validation
    if not isinstance(d_, dict):
        raise TypeError("First argument must be a dictionary")
    if keys is None:
        raise TypeError("Keys argument cannot be None")
    if not 0.0 <= similarity_threshold <= 1.0:
        raise ValueError("similarity_threshold must be between 0.0 and 1.0")

    # Extract expected keys
    if isinstance(keys, (list, tuple)):
        fields_set = set(keys)
    elif hasattr(keys, "keys"):
        fields_set = set(keys.keys())
    else:
        fields_set = set(keys)
    if not fields_set:
        return d_.copy()

    corrected_out = {}
    matched_expected = set()
    matched_input = set()

    # Pass 1: exact matches
    for key in d_:
        if key in fields_set:
            corrected_out[key] = d_[key]
            matched_expected.add(key)
            matched_input.add(key)

    # Pass 2: fuzzy matching
    if fuzzy_match:
        remaining_input = set(d_.keys()) - matched_input
        remaining_expected = fields_set - matched_expected

        for key in remaining_input:
            if not remaining_expected:
                break

            matches = string_similarity(
                key,
                list(remaining_expected),
                algorithm=similarity_algo,
                threshold=similarity_threshold,
                return_most_similar=True,
            )

            if matches:
                match = matches if isinstance(matches, str) else matches[0] if matches else None
                if match:
                    corrected_out[match] = d_[key]
                    matched_expected.add(match)
                    matched_input.add(key)
                    remaining_expected.remove(match)
            elif handle_unmatched == HandleUnmatched.IGNORE:
                corrected_out[key] = d_[key]

    unmatched_input = set(d_.keys()) - matched_input
    unmatched_expected = fields_set - matched_expected

    if handle_unmatched == HandleUnmatched.RAISE and unmatched_input:
        raise ValueError(f"Unmatched keys found: {unmatched_input}")

    elif handle_unmatched == HandleUnmatched.IGNORE:
        for key in unmatched_input:
            corrected_out[key] = d_[key]

    elif handle_unmatched in (HandleUnmatched.FILL, HandleUnmatched.FORCE):
        for key in unmatched_expected:
            corrected_out[key] = (
                fill_mapping[key] if fill_mapping and key in fill_mapping else fill_value
            )

        if handle_unmatched == HandleUnmatched.FILL:
            for key in unmatched_input:
                corrected_out[key] = d_[key]

    if strict and unmatched_expected:
        raise ValueError(f"Missing required keys: {unmatched_expected}")

    return corrected_out
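
A minimal usage sketch for fuzzy_match_keys (not part of the wheel). It again imports the private module directly; note that fill_value defaults to the package's Unset sentinel, so the sketch passes an explicit None.

# Sketch: normalizing misspelled LLM output keys against a known schema.
from kronos.utils.fuzzy._fuzzy_match import HandleUnmatched, fuzzy_match_keys

llm_output = {"naem": "Ada", "Age": 36, "extra": True}
expected = ["name", "age", "email"]

# "naem" -> "name" and "Age" -> "age" clear the default Jaro-Winkler
# threshold of 0.85; FORCE drops "extra" and fills the missing "email".
fixed = fuzzy_match_keys(
    llm_output,
    expected,
    handle_unmatched=HandleUnmatched.FORCE,
    fill_value=None,
)
print(fixed)  # {'name': 'Ada', 'age': 36, 'email': None} (key order may vary)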

kronos/utils/fuzzy/_string_similarity.py
@@ -0,0 +1,187 @@
# Copyright (c) 2025 - 2026, HaiyangLi <quantocean.li at gmail dot com>
# SPDX-License-Identifier: Apache-2.0

"""String similarity algorithms with rapidfuzz backend.

Provides multiple similarity metrics (Jaro-Winkler, Levenshtein, Hamming,
cosine, SequenceMatcher) for fuzzy string matching and correction.
"""

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Literal

import rapidfuzz.distance as _rf_distance

if TYPE_CHECKING:
    from collections.abc import Sequence

__all__ = ("SimilarityAlgo", "string_similarity")


class SimilarityAlgo(Enum):
    """String similarity algorithms with associated scoring functions."""

    JARO_WINKLER = "jaro_winkler"
    LEVENSHTEIN = "levenshtein"
    SEQUENCE_MATCHER = "sequence_matcher"
    HAMMING = "hamming"
    COSINE = "cosine"

    @classmethod
    def from_potential_valid(cls, v: SimilarityAlgo | str) -> SimilarityAlgo:
        """Convert string to enum, passing through existing enum values."""
        if isinstance(v, cls):
            return v
        try:
            return cls(v)
        except ValueError as e:
            raise ValueError(f"Invalid similarity algorithm: {v}") from e

    @staticmethod
    def cosine_similarity(s1: str, s2: str) -> float:
        """Character-set cosine similarity. Returns 0.0 if either string empty."""
        if not s1 or not s2:
            return 0.0
        set1, set2 = set(s1), set(s2)
        return len(set1 & set2) / ((len(set1) * len(set2)) ** 0.5)

    @staticmethod
    def hamming_similarity(s1: str, s2: str) -> float:
        """Positional match ratio. Requires equal-length strings (else 0.0)."""
        if not s1 or not s2 or len(s1) != len(s2):
            return 0.0
        return sum(c1 == c2 for c1, c2 in zip(s1, s2, strict=False)) / len(s1)

    @staticmethod
    def jaro_winkler_similarity(s: str, t: str, scaling: float = 0.1) -> float:
        """Jaro-Winkler similarity with prefix boost (via rapidfuzz).

        Args:
            s: First string.
            t: Second string.
            scaling: Prefix weight, must be in [0, 0.25].

        Returns:
            Similarity score in [0, 1].

        Raises:
            ValueError: If scaling not in [0, 0.25].
        """
        if not 0 <= scaling <= 0.25:
            raise ValueError("Scaling factor must be between 0 and 0.25")
        return _rf_distance.JaroWinkler.similarity(s, t, prefix_weight=scaling)

    @staticmethod
    def sequence_matcher_similarity(s1: str, s2: str) -> float:
        """Similarity via difflib.SequenceMatcher (longest contiguous match)."""
        from difflib import SequenceMatcher

        return SequenceMatcher(None, s1, s2).ratio()


SIMILARITY_TYPE = Literal[
    "jaro_winkler",
    "levenshtein",
    "sequence_matcher",
    "hamming",
    "cosine",
]

SimilarityFunc = Callable[[str, str], float]
"""Type alias: (str, str) -> float similarity score."""


@dataclass(frozen=True, slots=True)
class MatchResult:
    """String match with score and original index."""

    word: str
    score: float
    index: int


def string_similarity(
    word: str,
    correct_words: Sequence[str],
    algorithm: SimilarityAlgo = SimilarityAlgo.JARO_WINKLER,
    threshold: float = 0.0,
    case_sensitive: bool = False,
    return_most_similar: bool = False,
) -> str | list[str] | None:
    """Find similar strings using specified similarity algorithm.

    Args:
        word: The input string to find matches for
        correct_words: List of strings to compare against
        algorithm: Similarity algorithm to use
        threshold: Minimum similarity score (0.0 to 1.0)
        case_sensitive: Whether to consider case when matching
        return_most_similar: Return only the most similar match

    Returns:
        Matching string(s) or None if no matches found

    Raises:
        ValueError: If correct_words is empty or threshold is invalid
    """
    if not correct_words:
        raise ValueError("correct_words must not be empty")

    if not 0.0 <= threshold <= 1.0:
        raise ValueError("threshold must be between 0.0 and 1.0")

    compare_word = str(word)
    original_words = [str(w) for w in correct_words]

    if not case_sensitive:
        compare_word = compare_word.lower()
        compare_words = [w.lower() for w in original_words]
    else:
        compare_words = original_words.copy()

    algo_name: str | None = None
    score_func: Callable[[str, str], float]
    match algorithm:
        case SimilarityAlgo.JARO_WINKLER:
            algo_name = "jaro_winkler"
            score_func = SimilarityAlgo.jaro_winkler_similarity
        case SimilarityAlgo.LEVENSHTEIN:
            algo_name = "levenshtein"
            # Normalized variant keeps scores in [0, 1] so the threshold
            # contract holds; raw Levenshtein.similarity returns a count.
            score_func = _rf_distance.Levenshtein.normalized_similarity
        case SimilarityAlgo.SEQUENCE_MATCHER:
            algo_name = "sequence_matcher"
            score_func = SimilarityAlgo.sequence_matcher_similarity
        case SimilarityAlgo.HAMMING:
            algo_name = "hamming"
            score_func = SimilarityAlgo.hamming_similarity
        case SimilarityAlgo.COSINE:
            algo_name = "cosine"
            score_func = SimilarityAlgo.cosine_similarity
        case _:
            raise ValueError(f"Unsupported algorithm: {algorithm}")

    results = []
    for idx, (orig_word, comp_word) in enumerate(zip(original_words, compare_words, strict=False)):
        if algo_name == "hamming" and len(comp_word) != len(compare_word):
            continue  # Hamming requires equal length
        score = score_func(compare_word, comp_word)  # type: ignore[operator]
        if score >= threshold:
            results.append(MatchResult(orig_word, score, idx))

    if not results:
        return None

    results.sort(key=lambda x: (-x.score, x.index))  # Desc score, asc index

    if case_sensitive:  # Keep only top-scoring matches
        max_score = results[0].score
        results = [r for r in results if r.score == max_score]

    if return_most_similar:
        return results[0].word

    return [r.word for r in results]
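
A minimal usage sketch for string_similarity (not part of the wheel), again importing from the private module directly:

# Sketch: ranking candidate strings by similarity.
from kronos.utils.fuzzy._string_similarity import SimilarityAlgo, string_similarity

# All candidates at or above the threshold, best match first:
print(string_similarity("recieve", ["receive", "recipe", "remove"],
                        algorithm=SimilarityAlgo.JARO_WINKLER,
                        threshold=0.8))  # ['receive', 'recipe']

# Only the single best match (default threshold 0.0, case-insensitive):
print(string_similarity("color", ["colour", "collar"],
                        return_most_similar=True))  # colour

# Hamming silently skips candidates whose length differs from the input:
print(string_similarity("cat", ["cot", "cart"],
                        algorithm=SimilarityAlgo.HAMMING,
                        threshold=0.5))  # ['cot']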