lionherd_core-1.0.0a3-py3-none-any.whl
This diff shows the contents of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.
- lionherd_core/__init__.py +84 -0
- lionherd_core/base/__init__.py +30 -0
- lionherd_core/base/_utils.py +295 -0
- lionherd_core/base/broadcaster.py +128 -0
- lionherd_core/base/element.py +300 -0
- lionherd_core/base/event.py +322 -0
- lionherd_core/base/eventbus.py +112 -0
- lionherd_core/base/flow.py +236 -0
- lionherd_core/base/graph.py +616 -0
- lionherd_core/base/node.py +212 -0
- lionherd_core/base/pile.py +811 -0
- lionherd_core/base/progression.py +261 -0
- lionherd_core/errors.py +104 -0
- lionherd_core/libs/__init__.py +2 -0
- lionherd_core/libs/concurrency/__init__.py +60 -0
- lionherd_core/libs/concurrency/_cancel.py +85 -0
- lionherd_core/libs/concurrency/_errors.py +80 -0
- lionherd_core/libs/concurrency/_patterns.py +238 -0
- lionherd_core/libs/concurrency/_primitives.py +253 -0
- lionherd_core/libs/concurrency/_priority_queue.py +135 -0
- lionherd_core/libs/concurrency/_resource_tracker.py +66 -0
- lionherd_core/libs/concurrency/_task.py +58 -0
- lionherd_core/libs/concurrency/_utils.py +61 -0
- lionherd_core/libs/schema_handlers/__init__.py +35 -0
- lionherd_core/libs/schema_handlers/_function_call_parser.py +122 -0
- lionherd_core/libs/schema_handlers/_minimal_yaml.py +88 -0
- lionherd_core/libs/schema_handlers/_schema_to_model.py +251 -0
- lionherd_core/libs/schema_handlers/_typescript.py +153 -0
- lionherd_core/libs/string_handlers/__init__.py +15 -0
- lionherd_core/libs/string_handlers/_extract_json.py +65 -0
- lionherd_core/libs/string_handlers/_fuzzy_json.py +103 -0
- lionherd_core/libs/string_handlers/_string_similarity.py +347 -0
- lionherd_core/libs/string_handlers/_to_num.py +63 -0
- lionherd_core/ln/__init__.py +45 -0
- lionherd_core/ln/_async_call.py +314 -0
- lionherd_core/ln/_fuzzy_match.py +166 -0
- lionherd_core/ln/_fuzzy_validate.py +151 -0
- lionherd_core/ln/_hash.py +141 -0
- lionherd_core/ln/_json_dump.py +347 -0
- lionherd_core/ln/_list_call.py +110 -0
- lionherd_core/ln/_to_dict.py +373 -0
- lionherd_core/ln/_to_list.py +190 -0
- lionherd_core/ln/_utils.py +156 -0
- lionherd_core/lndl/__init__.py +62 -0
- lionherd_core/lndl/errors.py +30 -0
- lionherd_core/lndl/fuzzy.py +321 -0
- lionherd_core/lndl/parser.py +427 -0
- lionherd_core/lndl/prompt.py +137 -0
- lionherd_core/lndl/resolver.py +323 -0
- lionherd_core/lndl/types.py +287 -0
- lionherd_core/protocols.py +181 -0
- lionherd_core/py.typed +0 -0
- lionherd_core/types/__init__.py +46 -0
- lionherd_core/types/_sentinel.py +131 -0
- lionherd_core/types/base.py +341 -0
- lionherd_core/types/operable.py +133 -0
- lionherd_core/types/spec.py +313 -0
- lionherd_core/types/spec_adapters/__init__.py +10 -0
- lionherd_core/types/spec_adapters/_protocol.py +125 -0
- lionherd_core/types/spec_adapters/pydantic_field.py +177 -0
- lionherd_core-1.0.0a3.dist-info/METADATA +502 -0
- lionherd_core-1.0.0a3.dist-info/RECORD +64 -0
- lionherd_core-1.0.0a3.dist-info/WHEEL +4 -0
- lionherd_core-1.0.0a3.dist-info/licenses/LICENSE +201 -0
--- /dev/null
+++ b/lionherd_core/libs/string_handlers/_to_num.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+from decimal import Decimal
+from typing import Any
+
+
+def to_num(
+    input_: Any,
+    /,
+    *,
+    upper_bound: int | float | None = None,
+    lower_bound: int | float | None = None,
+    num_type: type[int] | type[float] = float,
+    precision: int | None = None,
+) -> int | float:
+    """Convert input to numeric type with validation and bounds checking.
+
+    Args:
+        input_: Value to convert to number
+        upper_bound: Maximum allowed value (inclusive)
+        lower_bound: Minimum allowed value (inclusive)
+        num_type: Target numeric type (int or float)
+        precision: Number of decimal places for rounding (float only)
+
+    Returns:
+        Converted number
+
+    Raises:
+        ValueError: For invalid input or out of bounds values
+        TypeError: For invalid input types
+    """
+    # Validate num_type
+    if num_type not in (int, float):
+        raise ValueError(f"Invalid number type: {num_type}")
+
+    # Handle boolean (special case - must check before int)
+    if isinstance(input_, (bool, int, float, Decimal)):
+        value = float(input_)
+    # Handle string input
+    elif isinstance(input_, str):
+        input_ = input_.strip()
+        if not input_:
+            raise ValueError("Empty string cannot be converted to number")
+        try:
+            value = float(input_)
+        except ValueError as e:
+            raise ValueError(f"Cannot convert '{input_}' to number") from e
+    else:
+        raise TypeError(f"Cannot convert {type(input_).__name__} to number")
+
+    # Apply bounds checking
+    if upper_bound is not None and value > upper_bound:
+        raise ValueError(f"Value {value} exceeds upper bound {upper_bound}")
+    if lower_bound is not None and value < lower_bound:
+        raise ValueError(f"Value {value} below lower bound {lower_bound}")
+
+    # Apply precision for float
+    if precision is not None and num_type is float:
+        value = round(value, precision)
+
+    # Convert to target type
+    return num_type(value)
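
A minimal usage sketch of `to_num` from the hunk above (not part of the diff). The direct submodule import path is taken from the file listing; the package may also re-export it elsewhere:

from lionherd_core.libs.string_handlers._to_num import to_num

# Strings, Decimals, ints, and bools are all normalized through float() first.
assert to_num("42") == 42.0                   # default num_type is float
assert to_num("3.14159", precision=2) == 3.14
assert to_num(7.9, num_type=int) == 7         # int() truncation after bounds checks

# Bounds are inclusive and checked before precision/type conversion.
try:
    to_num("101", upper_bound=100)
except ValueError as e:
    print(e)  # Value 101.0 exceeds upper bound 100
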
--- /dev/null
+++ b/lionherd_core/ln/__init__.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+from ._async_call import AlcallParams, BcallParams, alcall, bcall
+from ._fuzzy_match import FuzzyMatchKeysParams, fuzzy_match_keys
+from ._fuzzy_validate import fuzzy_validate_mapping, fuzzy_validate_pydantic
+from ._hash import hash_dict
+from ._json_dump import (
+    get_orjson_default,
+    json_dict,
+    json_dumpb,
+    json_dumps,
+    json_lines_iter,
+    make_options,
+)
+from ._list_call import lcall
+from ._to_dict import to_dict
+from ._to_list import to_list
+from ._utils import acreate_path, get_bins, import_module, is_import_installed, now_utc
+
+__all__ = (
+    "AlcallParams",
+    "BcallParams",
+    "FuzzyMatchKeysParams",
+    "acreate_path",
+    "alcall",
+    "bcall",
+    "fuzzy_match_keys",
+    "fuzzy_validate_mapping",
+    "fuzzy_validate_pydantic",
+    "get_bins",
+    "get_orjson_default",
+    "hash_dict",
+    "import_module",
+    "is_import_installed",
+    "json_dict",
+    "json_dumpb",
+    "json_dumps",
+    "json_lines_iter",
+    "lcall",
+    "make_options",
+    "now_utc",
+    "to_dict",
+    "to_list",
+)
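
This `__init__.py` flattens the private `_`-prefixed submodules into a single public `ln` namespace. A sketch of the intended import style (not part of the diff; the `to_list` keywords mirror the signature used inside `_async_call.py`, and the exact output shape is presumed):

from lionherd_core.ln import to_dict, to_list

flat = to_list([1, [2, None, 2]], flatten=True, dropna=True, unique=True)
print(flat)  # presumably [1, 2]
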
--- /dev/null
+++ b/lionherd_core/ln/_async_call.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+import threading
+from collections.abc import AsyncGenerator, Callable
+from dataclasses import dataclass
+from typing import Any, ClassVar, ParamSpec, TypeVar
+
+from lionherd_core.libs.concurrency import (
+    Semaphore,
+    create_task_group,
+    get_cancelled_exc_class,
+    is_coro_func,
+    move_on_after,
+    non_cancel_subgroup,
+    run_sync,
+    sleep,
+)
+from lionherd_core.types import ModelConfig, Params, Unset, not_sentinel
+
+from ._to_list import to_list
+
+T = TypeVar("T")
+P = ParamSpec("P")
+
+_INITIALIZED = False
+_MODEL_LIKE = None
+_INIT_LOCK = threading.RLock()
+
+
+__all__ = (
+    "AlcallParams",
+    "BcallParams",
+    "alcall",
+    "bcall",
+)
+
+
+async def alcall(
+    input_: list[Any],
+    func: Callable[..., T],
+    /,
+    *,
+    input_flatten: bool = False,
+    input_dropna: bool = False,
+    input_unique: bool = False,
+    input_flatten_tuple_set: bool = False,
+    output_flatten: bool = False,
+    output_dropna: bool = False,
+    output_unique: bool = False,
+    output_flatten_tuple_set: bool = False,
+    delay_before_start: float = 0,
+    retry_initial_delay: float = 0,
+    retry_backoff: float = 1,
+    retry_default: Any = Unset,
+    retry_timeout: float | None = None,
+    retry_attempts: int = 0,
+    max_concurrent: int | None = None,
+    throttle_period: float | None = None,
+    return_exceptions: bool = False,
+    **kwargs: Any,
+) -> list[T | BaseException]:
+    """Apply function to each list element asynchronously with retry and concurrency control.
+
+    Args:
+        input_: List of items to process (or iterable that will be converted)
+        func: Callable to apply (sync or async)
+        input_flatten: Flatten nested input structures
+        input_dropna: Remove None/undefined from input
+        input_unique: Remove duplicate inputs (requires flatten)
+        input_flatten_tuple_set: Include tuples/sets in flattening
+        output_flatten: Flatten nested output structures
+        output_dropna: Remove None/undefined from output
+        output_unique: Remove duplicate outputs (requires flatten)
+        output_flatten_tuple_set: Include tuples/sets in output flattening
+        delay_before_start: Initial delay before processing (seconds)
+        retry_initial_delay: Initial retry delay (seconds)
+        retry_backoff: Backoff multiplier for retry delays
+        retry_default: Default value on retry exhaustion (Unset = raise)
+        retry_timeout: Timeout per function call (seconds)
+        retry_attempts: Maximum retry attempts (0 = no retry)
+        max_concurrent: Max concurrent executions (None = unlimited)
+        throttle_period: Delay between starting tasks (seconds)
+        return_exceptions: Return exceptions instead of raising
+        **kwargs: Additional arguments passed to func
+
+    Returns:
+        List of results (preserves input order, may include exceptions if return_exceptions=True)
+
+    Raises:
+        ValueError: If func is not callable
+        TimeoutError: If retry_timeout exceeded
+        ExceptionGroup: If return_exceptions=False and tasks raise
+    """
+
+    global _INITIALIZED, _MODEL_LIKE
+    if _INITIALIZED is False:
+        with _INIT_LOCK:
+            # Double-checked locking pattern
+            if _INITIALIZED is False:
+                from pydantic import BaseModel
+
+                _MODEL_LIKE = (BaseModel,)
+                _INITIALIZED = True
+
+    # Validate func is a single callable
+    if not callable(func):
+        # If func is not callable, maybe it's an iterable. Extract one callable if possible.
+        try:
+            func_list = list(func)  # Convert iterable to list
+        except TypeError:
+            raise ValueError("func must be callable or an iterable containing one callable.")
+
+        # Ensure exactly one callable is present
+        if len(func_list) != 1 or not callable(func_list[0]):
+            raise ValueError("Only one callable function is allowed.")
+
+        func = func_list[0]
+
+    # Process input if requested
+    if any((input_flatten, input_dropna)):
+        input_ = to_list(
+            input_,
+            flatten=input_flatten,
+            dropna=input_dropna,
+            unique=input_unique,
+            flatten_tuple_set=input_flatten_tuple_set,
+        )
+    else:
+        if not isinstance(input_, list):
+            # Attempt to iterate
+            if isinstance(input_, _MODEL_LIKE):
+                # Pydantic model, convert to list
+                input_ = [input_]
+            else:
+                try:
+                    iter(input_)
+                    # It's iterable (tuple), convert to list of its contents
+                    input_ = list(input_)
+                except TypeError:
+                    # Not iterable, just wrap in a list
+                    input_ = [input_]
+
+    # Optional initial delay before processing
+    if delay_before_start:
+        await sleep(delay_before_start)
+
+    semaphore = Semaphore(max_concurrent) if max_concurrent else None
+    throttle_delay = throttle_period or 0
+    coro_func = is_coro_func(func)
+
+    async def call_func(item: Any) -> T:
+        if coro_func:
+            # Async function
+            if retry_timeout is not None:
+                with move_on_after(retry_timeout) as cancel_scope:
+                    result = await func(item, **kwargs)
+                if cancel_scope.cancelled_caught:
+                    raise TimeoutError(f"Function call timed out after {retry_timeout}s")
+                return result
+            else:
+                return await func(item, **kwargs)
+        else:
+            # Sync function
+            if retry_timeout is not None:
+                with move_on_after(retry_timeout) as cancel_scope:
+                    result = await run_sync(func, item, **kwargs)
+                if cancel_scope.cancelled_caught:
+                    raise TimeoutError(f"Function call timed out after {retry_timeout}s")
+                return result
+            else:
+                return await run_sync(func, item, **kwargs)
+
+    async def execute_task(i: Any, index: int) -> Any:
+        attempts = 0
+        current_delay = retry_initial_delay
+        while True:
+            try:
+                result = await call_func(i)
+                return index, result
+
+            # if cancelled, re-raise
+            except get_cancelled_exc_class():
+                raise
+
+            # handle other exceptions
+            except Exception:
+                attempts += 1
+                if attempts <= retry_attempts:
+                    if current_delay:
+                        await sleep(current_delay)
+                        current_delay *= retry_backoff
+                    # Retry loop continues
+                else:
+                    # Exhausted retries
+                    if not_sentinel(retry_default):
+                        return index, retry_default
+                    # No default, re-raise
+                    raise
+
+    # Preallocate result list and fill by index — preserves order with no lock/sort
+    n_items = len(input_)
+    out: list[Any] = [None] * n_items
+
+    async def task_wrapper(item: Any, idx: int) -> None:
+        try:
+            if semaphore:
+                async with semaphore:
+                    _, result = await execute_task(item, idx)
+            else:
+                _, result = await execute_task(item, idx)
+            out[idx] = result
+        except BaseException as exc:
+            out[idx] = exc
+            if not return_exceptions:
+                raise  # Propagate to TaskGroup
+
+    # Execute all tasks using task group
+    try:
+        async with create_task_group() as tg:
+            for idx, item in enumerate(input_):
+                tg.start_soon(task_wrapper, item, idx)
+                # Apply throttle delay between starting tasks
+                if throttle_delay and idx < n_items - 1:
+                    await sleep(throttle_delay)
+    except ExceptionGroup as eg:
+        if not return_exceptions:
+            # Surface only the non-cancellation subgroup to preserve structure & tracebacks
+            rest = non_cancel_subgroup(eg)
+            if rest is not None:
+                raise rest
+            raise
+
+    output_list = out  # already in original order
+    return to_list(
+        output_list,
+        flatten=output_flatten,
+        dropna=output_dropna,
+        unique=output_unique,
+        flatten_tuple_set=output_flatten_tuple_set,
+    )
+
+
+async def bcall(
+    input_: list[Any],
+    func: Callable[..., T],
+    /,
+    batch_size: int,
+    **kwargs: Any,
+) -> AsyncGenerator[list[T | BaseException], None]:
+    """Process input in batches using alcall. Yields results batch by batch.
+
+    Args:
+        input_: Items to process
+        func: Callable to apply
+        batch_size: Number of items per batch
+        **kwargs: Arguments passed to alcall (see alcall for details)
+
+    Yields:
+        List of results for each batch
+    """
+    input_ = to_list(input_, flatten=True, dropna=True)
+
+    for i in range(0, len(input_), batch_size):
+        batch = input_[i : i + batch_size]
+        yield await alcall(batch, func, **kwargs)
+
+
+@dataclass(slots=True, init=False, frozen=True)
+class AlcallParams(Params):
+    # ClassVar attributes
+    _config: ClassVar[ModelConfig] = ModelConfig(none_as_sentinel=True)
+    _func: ClassVar[Any] = alcall
+
+    # input processing
+    input_flatten: bool
+    input_dropna: bool
+    input_unique: bool
+    input_flatten_tuple_set: bool
+
+    # output processing
+    output_flatten: bool
+    output_dropna: bool
+    output_unique: bool
+    output_flatten_tuple_set: bool
+
+    # retry and timeout
+    delay_before_start: float
+    retry_initial_delay: float
+    retry_backoff: float
+    retry_default: Any
+    retry_timeout: float
+    retry_attempts: int
+
+    # concurrency and throttling
+    max_concurrent: int
+    throttle_period: float
+
+    kw: dict[str, Any] = Unset
+
+    async def __call__(self, input_: list[Any], func: Callable[..., T], **kw: Any) -> list[T]:
+        kwargs = {**self.default_kw(), **kw}
+        return await alcall(input_, func, **kwargs)
+
+
+@dataclass(slots=True, init=False, frozen=True)
+class BcallParams(AlcallParams):
+    _func: ClassVar[Any] = bcall
+
+    batch_size: int
+
+    async def __call__(self, input_: list[Any], func: Callable[..., T], **kw: Any) -> list[T]:
+        kwargs = {**self.default_kw(), **kw}
+        return await bcall(input_, func, self.batch_size, **kwargs)
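
A hedged end-to-end sketch of the two entry points this hunk adds (not part of the diff). It assumes `lionherd_core.ln` re-exports `alcall` and `bcall` (per the `__init__.py` hunk above) and that the package's concurrency wrappers run on an anyio-compatible event loop, as their `create_task_group`/`move_on_after` imports suggest; `flaky_double` is a hypothetical worker:

import anyio

from lionherd_core.ln import alcall, bcall

async def flaky_double(x: int) -> int:
    # Hypothetical worker: item 3 always fails, exercising the retry path.
    if x == 3:
        raise RuntimeError("transient failure")
    return x * 2

async def main() -> None:
    # Two retries with a 0.1s initial delay doubling per attempt; at most
    # 4 tasks in flight; -1 substituted once retries are exhausted.
    results = await alcall(
        [1, 2, 3, 4],
        flaky_double,
        retry_attempts=2,
        retry_initial_delay=0.1,
        retry_backoff=2,
        retry_default=-1,
        max_concurrent=4,
    )
    print(results)  # [2, 4, -1, 8]; order follows the input, not completion

    # bcall is an async generator: each iteration yields one batch's results.
    async for batch in bcall([1, 2, 4, 5], flaky_double, 2):
        print(batch)  # [2, 4] then [8, 10]

anyio.run(main)

AlcallParams and BcallParams freeze the same keyword set into a reusable frozen dataclass; calling an instance merges its stored defaults with per-call overrides before delegating to alcall or bcall. One caveat visible in the hunk: bcall is an async generator, yet BcallParams.__call__ awaits its return value directly, so consuming results through the params object would still require iteration in practice.
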
--- /dev/null
+++ b/lionherd_core/ln/_fuzzy_match.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2025, HaiyangLi <quantocean.li at gmail dot com>
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+from typing import Any, ClassVar, Literal
+
+from ..libs.string_handlers._string_similarity import (
+    SIMILARITY_ALGO_MAP,
+    SIMILARITY_TYPE,
+    SimilarityAlgo,
+    SimilarityFunc,
+    string_similarity,
+)
+from ..types import KeysLike, ModelConfig, Params, Unset
+
+__all__ = (
+    "FuzzyMatchKeysParams",
+    "fuzzy_match_keys",
+)
+
+
+HandleUnmatched = Literal["ignore", "raise", "remove", "fill", "force"]
+
+
+def fuzzy_match_keys(
+    d_: dict[str, Any],
+    keys: KeysLike,
+    /,
+    *,
+    similarity_algo: SIMILARITY_TYPE | SimilarityAlgo | SimilarityFunc = "jaro_winkler",
+    similarity_threshold: float = 0.85,
+    fuzzy_match: bool = True,
+    handle_unmatched: HandleUnmatched = "ignore",
+    fill_value: Any = Unset,
+    fill_mapping: dict[str, Any] | None = None,
+    strict: bool = False,
+) -> dict[str, Any]:
+    """Validate and correct dict keys using fuzzy string matching.
+
+    Args:
+        d_: Input dictionary to validate
+        keys: Expected keys (list or dict-like with .keys())
+        similarity_algo: Algorithm for string similarity
+        similarity_threshold: Minimum similarity score (0.0-1.0)
+        fuzzy_match: Enable fuzzy matching for unmatched keys
+        handle_unmatched: How to handle unmatched keys ("ignore", "raise", "remove", "fill", "force")
+        fill_value: Default value for missing keys when filling
+        fill_mapping: Custom values for specific missing keys
+        strict: Raise if expected keys are missing
+
+    Returns:
+        Dictionary with corrected keys
+
+    Raises:
+        TypeError: If d_ is not a dict or keys is None
+        ValueError: If similarity_threshold out of range or unmatched keys found in raise mode
+    """
+    # Input validation
+    if not isinstance(d_, dict):
+        raise TypeError("First argument must be a dictionary")
+    if keys is None:
+        raise TypeError("Keys argument cannot be None")
+    if not 0.0 <= similarity_threshold <= 1.0:
+        raise ValueError("similarity_threshold must be between 0.0 and 1.0")
+
+    # Extract expected keys
+    fields_set = set(keys) if isinstance(keys, list) else set(keys.keys())
+    if not fields_set:
+        return d_.copy()  # Return copy of original if no expected keys
+
+    # Initialize output dictionary and tracking sets
+    corrected_out = {}
+    matched_expected = set()
+    matched_input = set()
+
+    # Get similarity function
+    if isinstance(similarity_algo, SimilarityAlgo):
+        similarity_func = SIMILARITY_ALGO_MAP[similarity_algo.value]
+    elif isinstance(similarity_algo, str):
+        if similarity_algo not in SIMILARITY_ALGO_MAP:
+            raise ValueError(f"Unknown similarity algorithm: {similarity_algo}")
+        similarity_func = SIMILARITY_ALGO_MAP[similarity_algo]
+    else:
+        similarity_func = similarity_algo
+
+    # First pass: exact matches
+    for key in d_:
+        if key in fields_set:
+            corrected_out[key] = d_[key]
+            matched_expected.add(key)
+            matched_input.add(key)
+
+    # Second pass: fuzzy matching if enabled
+    if fuzzy_match:
+        remaining_input = set(d_.keys()) - matched_input
+        remaining_expected = fields_set - matched_expected
+
+        for key in remaining_input:
+            if not remaining_expected:
+                break
+
+            matches = string_similarity(
+                key,
+                list(remaining_expected),
+                algorithm=similarity_func,
+                threshold=similarity_threshold,
+                return_most_similar=True,
+            )
+
+            if matches:
+                match = matches
+                corrected_out[match] = d_[key]
+                matched_expected.add(match)
+                matched_input.add(key)
+                remaining_expected.remove(match)
+            elif handle_unmatched == "ignore":
+                corrected_out[key] = d_[key]
+
+    # Handle unmatched keys based on handle_unmatched parameter
+    unmatched_input = set(d_.keys()) - matched_input
+    unmatched_expected = fields_set - matched_expected
+
+    if handle_unmatched == "raise" and unmatched_input:
+        raise ValueError(f"Unmatched keys found: {unmatched_input}")
+
+    elif handle_unmatched == "ignore":
+        for key in unmatched_input:
+            corrected_out[key] = d_[key]
+
+    elif handle_unmatched in ("fill", "force"):
+        # Fill missing expected keys
+        for key in unmatched_expected:
+            if fill_mapping and key in fill_mapping:
+                corrected_out[key] = fill_mapping[key]
+            else:
+                corrected_out[key] = fill_value
+
+        # For "fill" mode, also keep unmatched original keys
+        if handle_unmatched == "fill":
+            for key in unmatched_input:
+                corrected_out[key] = d_[key]
+
+    # Check strict mode
+    if strict and unmatched_expected:
+        raise ValueError(f"Missing required keys: {unmatched_expected}")
+
+    return corrected_out
+
+
+@dataclass(slots=True, init=False, frozen=True)
+class FuzzyMatchKeysParams(Params):
+    _config: ClassVar[ModelConfig] = ModelConfig(none_as_sentinel=False)
+    _func: ClassVar[Any] = fuzzy_match_keys
+
+    similarity_algo: SIMILARITY_TYPE | SimilarityAlgo | SimilarityFunc = "jaro_winkler"
+    similarity_threshold: float = 0.85
+
+    fuzzy_match: bool = True
+    handle_unmatched: HandleUnmatched = "ignore"
+
+    fill_value: Any = Unset
+    fill_mapping: dict[str, Any] | Any = Unset
+    strict: bool = False
+
+    def __call__(self, d_: dict[str, Any], keys: KeysLike) -> dict[str, Any]:
+        return fuzzy_match_keys(d_, keys, **self.default_kw())
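
A usage sketch of `fuzzy_match_keys` (not part of the diff), importing from the public `ln` namespace shown earlier; the similarity scores quoted are approximate Jaro-Winkler values:

from lionherd_core.ln import fuzzy_match_keys

raw = {"user_name": "ada", "emall": "ada@example.com", "note": "vip"}
expected = ["username", "email"]

# At the default 0.85 threshold, "user_name" -> "username" (~0.98) and
# "emall" -> "email" (~0.91) are fuzzy-corrected; handle_unmatched="remove"
# drops the leftover "note" key instead of copying it through.
cleaned = fuzzy_match_keys(raw, expected, handle_unmatched="remove")
print(cleaned)  # {'username': 'ada', 'email': 'ada@example.com'} (key order may vary)

With handle_unmatched="fill", missing expected keys are populated from fill_mapping or fill_value and unmatched input keys are kept; "force" fills the same way but discards unmatched input keys.
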