speedy-utils 1.1.9__py3-none-any.whl → 1.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +2 -0
- llm_utils/lm/async_lm/async_llm_task.py +5 -1
- llm_utils/lm/async_lm/async_lm.py +34 -55
- llm_utils/lm/async_lm/async_lm_base.py +5 -173
- llm_utils/lm/openai_memoize.py +72 -0
- llm_utils/scripts/vllm_serve.py +2 -1
- speedy_utils/__init__.py +1 -3
- speedy_utils/common/utils_cache.py +464 -294
- speedy_utils/common/utils_io.py +14 -2
- {speedy_utils-1.1.9.dist-info → speedy_utils-1.1.11.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.9.dist-info → speedy_utils-1.1.11.dist-info}/RECORD +13 -12
- {speedy_utils-1.1.9.dist-info → speedy_utils-1.1.11.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.9.dist-info → speedy_utils-1.1.11.dist-info}/entry_points.txt +0 -0
|
@@ -6,8 +6,15 @@ import os
|
|
|
6
6
|
import os.path as osp
|
|
7
7
|
import pickle
|
|
8
8
|
import uuid
|
|
9
|
+
import weakref
|
|
9
10
|
from threading import Lock
|
|
10
|
-
from typing import Any, Awaitable, Callable, Literal, TypeVar
|
|
11
|
+
from typing import Any, Awaitable, Callable, Literal, Optional, TypeVar, overload
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
# Python 3.10+
|
|
15
|
+
from typing import ParamSpec
|
|
16
|
+
except ImportError: # pragma: no cover
|
|
17
|
+
from typing_extensions import ParamSpec # type: ignore
|
|
11
18
|
|
|
12
19
|
import cachetools
|
|
13
20
|
import pandas as pd
|
|
@@ -18,155 +25,366 @@ from pydantic import BaseModel
|
|
|
18
25
|
from speedy_utils.common.utils_io import dump_json_or_pickle, load_json_or_pickle
|
|
19
26
|
from speedy_utils.common.utils_misc import mkdir_or_exist
|
|
20
27
|
|
|
21
|
-
|
|
22
|
-
|
|
28
|
+
# --------------------------------------------------------------------------------------
|
|
29
|
+
# Defaults / Globals
|
|
30
|
+
# --------------------------------------------------------------------------------------
|
|
23
31
|
|
|
24
|
-
|
|
32
|
+
SPEED_CACHE_DIR = osp.join(osp.expanduser("~"), ".cache/speedy_cache")
|
|
25
33
|
|
|
26
|
-
#
|
|
34
|
+
# Thread locks for safety
|
|
27
35
|
disk_lock = Lock()
|
|
28
36
|
mem_lock = Lock()
|
|
29
37
|
|
|
30
|
-
#
|
|
31
|
-
|
|
32
|
-
|
|
38
|
+
# Quick identifier cache for big objects that support weakref
|
|
39
|
+
# (prevents recomputing expensive keys for the same object instance)
|
|
40
|
+
_QUICK_ID_MAP: "weakref.WeakKeyDictionary[Any, str]" = weakref.WeakKeyDictionary()
|
|
33
41
|
|
|
42
|
+
# Per-function memory caches (so different functions can have different LRU sizes)
|
|
43
|
+
_MEM_CACHES: "weakref.WeakKeyDictionary[Callable[..., Any], cachetools.LRUCache]" = (
|
|
44
|
+
weakref.WeakKeyDictionary()
|
|
45
|
+
)
|
|
34
46
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
if keys:
|
|
38
|
-
arg_spec = inspect.getfullargspec(func).args
|
|
39
|
-
used_args = {arg_spec[i]: arg for i, arg in enumerate(args)}
|
|
40
|
-
used_args.update(kwargs)
|
|
41
|
-
values = [used_args[k] for k in keys if k in used_args]
|
|
42
|
-
if not values:
|
|
43
|
-
raise ValueError(f"Keys {keys} not found in function arguments")
|
|
44
|
-
param_hash = identify(values)
|
|
45
|
-
dir_path = f"{func.__name__}_{identify(func_source)}"
|
|
46
|
-
key_id = f"{'_'.join(keys)}_{param_hash}.pkl"
|
|
47
|
-
return func_source, dir_path, key_id
|
|
47
|
+
# Backward-compat global symbol (internal only; not exported)
|
|
48
|
+
LRU_MEM_CACHE = cachetools.LRUCache(maxsize=256)
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
return func_source, "funcs", f"{identify(fid)}.pkl"
|
|
50
|
+
# Typing helpers
|
|
51
|
+
P = ParamSpec("P")
|
|
52
|
+
R = TypeVar("R")
|
|
53
|
+
AsyncFunc = Callable[P, Awaitable[R]]
|
|
54
|
+
|
|
55
|
+
# --------------------------------------------------------------------------------------
|
|
56
|
+
# Utilities
|
|
57
|
+
# --------------------------------------------------------------------------------------
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def _stable_json_default(obj: Any) -> str:
    """Return ``str(obj)`` for JSON encoding, but only when it is content-based.

    Raises:
        TypeError: if the type keeps both the default ``object.__str__`` and
            ``object.__repr__``; ``json.dumps`` then fails and the caller can
            fall back to pickle.
    """
    cls = type(obj)
    if cls.__str__ is object.__str__ and cls.__repr__ is object.__repr__:
        # The default form is "<Foo object at 0x...>": it encodes the memory
        # address, so it differs between runs and can be shared by two
        # different objects that happen to reuse the same address.
        raise TypeError(f"{cls.__name__} has no content-based string form")
    return str(obj)


def fast_serialize(x: Any) -> bytes:
    """Serialize x quickly; JSON if possible (stable), else pickle.

    Order of attempts:
      1. JSON with sorted keys; non-JSON values are stringified only when their
         type defines its own ``__str__``/``__repr__``.
      2. ``pickle.dumps`` with the highest protocol.
      3. JSON with ``default=str`` as a last, best-effort resort for objects
         that are neither stably stringifiable nor picklable.

    Raises:
        Exception: whatever ``pickle.dumps`` raised, if step 3 fails as well.
    """
    try:
        return json.dumps(x, sort_keys=True, default=_stable_json_default).encode(
            "utf-8"
        )
    except (TypeError, ValueError):
        pass

    try:
        return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception:
        # Keep the previous behaviour for unpicklable objects rather than
        # failing: an unstable key only costs a cache miss.
        try:
            return json.dumps(x, sort_keys=True, default=str).encode("utf-8")
        except (TypeError, ValueError):
            pass
        raise
|
|
65
66
|
|
|
66
67
|
|
|
67
|
-
def
|
|
68
|
+
def identify_uuid(x: Any) -> str:
    """Return a UUID-formatted string built from the xxh128 digest of ``fast_serialize(x)``."""
    digest = xxhash.xxh128(fast_serialize(x), seed=0).digest()
    return str(uuid.UUID(bytes=digest))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_source(func: Callable[..., Any]) -> str:
    """Return the source of *func* with spaces, newlines, tabs and CRs removed.

    When the source cannot be retrieved, ``"<module>.<qualname>"`` is used
    instead (each part defaulting to ``"unknown"``), stripped the same way.

    Args:
        func: the callable whose source should identify it.

    Returns:
        The whitespace-stripped source text or fallback name.
    """
    try:
        code = inspect.getsource(func)
    except (OSError, TypeError):
        # OSError: the source file is not available.
        # TypeError: inspect does not support the object at all (builtins,
        # C-implemented callables, callable instances such as partials).
        mod = getattr(func, "__module__", "unknown")
        qn = getattr(func, "__qualname__", getattr(func, "__name__", "unknown"))
        code = f"{mod}.{qn}"
    # Remove exactly the characters " ", "\n", "\t", "\r" in a single pass.
    return code.translate(str.maketrans("", "", " \n\t\r"))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _try_get_quick_id(obj: Any) -> Optional[str]:
    """Look up *obj* in ``_QUICK_ID_MAP``; return ``None`` on a miss or a ``TypeError``."""
    cached: Optional[str]
    try:
        cached = _QUICK_ID_MAP.get(obj)  # type: ignore[arg-type]
    except TypeError:
        # The object cannot be used as a WeakKeyDictionary key
        # (e.g. list/dict are not weakref-able).
        cached = None
    return cached
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _try_store_quick_id(obj: Any, ident: str) -> None:
    """Record ``ident`` for *obj* in ``_QUICK_ID_MAP``; silently skip on ``TypeError``."""
    try:
        _QUICK_ID_MAP[obj] = ident  # type: ignore[index]
    except TypeError:
        # The object cannot be used as a WeakKeyDictionary key; nothing is stored.
        return None
    return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def identify(obj: Any, depth: int = 0, max_depth: int = 2) -> str:
    """
    Produce a content-based identifier string for an arbitrary Python object.

    Dispatch order: list/tuple, pandas DataFrame/Series, objects with a
    ``__code__`` attribute (hashed via ``get_source``), pydantic ``BaseModel``
    (via ``model_dump()``), dict (sorted keys plus identified values), ``None``
    (hashed as the string "None"), and finally everything else, which is
    hashed with xxh64 over ``fast_serialize(obj)``.

    Only the top-level call (``depth == 0``) consults and fills the quick-id
    cache. A cached identifier is returned without re-hashing, so an object
    stored there keeps its first identifier even if it is mutated afterwards.
    ``max_depth`` is only forwarded to the recursive calls.
    """
    top_level = depth == 0
    if top_level:
        cached = _try_get_quick_id(obj)
        if cached is not None:
            return cached

    child_depth = depth + 1

    if isinstance(obj, (list, tuple)):
        joined = "\n".join(identify(item, child_depth, max_depth) for item in obj)
        digest = identify(joined, child_depth, max_depth)
    elif isinstance(obj, (pd.DataFrame, pd.Series)):
        digest = identify(str(obj.to_dict()), child_depth, max_depth)
    elif hasattr(obj, "__code__"):
        digest = identify(get_source(obj), child_depth, max_depth)
    elif isinstance(obj, BaseModel):
        digest = identify(obj.model_dump(), child_depth, max_depth)
    elif isinstance(obj, dict):
        ordered_keys = sorted(obj.keys())
        value_ids = [identify(obj[k], child_depth, max_depth) for k in ordered_keys]
        digest = identify([ordered_keys, value_ids], child_depth, max_depth)
    elif obj is None:
        digest = identify("None", child_depth, max_depth)
    else:
        # primitives / everything else
        digest = xxhash.xxh64_hexdigest(fast_serialize(obj), seed=0)

    if top_level:
        _try_store_quick_id(obj, digest)
    return digest
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _build_named_keys(
|
|
162
|
+
func: Callable[..., Any],
|
|
163
|
+
args: tuple[Any, ...],
|
|
164
|
+
kwargs: dict[str, Any],
|
|
165
|
+
keys: list[str],
|
|
166
|
+
) -> list[Any]:
|
|
167
|
+
"""Extract named parameters in order from args/kwargs for keying."""
|
|
168
|
+
arg_spec = inspect.getfullargspec(func).args
|
|
169
|
+
used_args = {arg_spec[i]: arg for i, arg in enumerate(args[: len(arg_spec)])}
|
|
170
|
+
used_args.update(kwargs)
|
|
171
|
+
values = [used_args[k] for k in keys if k in used_args]
|
|
172
|
+
if not values:
|
|
173
|
+
raise ValueError(f"Keys {keys} not found in function arguments")
|
|
174
|
+
return values
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _compute_cache_components(
    func: Callable[..., Any],
    args: tuple[Any, ...],
    kwargs: dict[str, Any],
    ignore_self: bool,
    keys: Optional[list[str]],
    key_fn: Optional[Callable[..., Any]],
):
    """
    Return (func_source, sub_dir, key_id) for disk paths and memory keying.

    - If key_fn is provided, its return value determines the key content.
    - Else if a keys list is provided, those argument names are used.
    - Else the full (args, kwargs) is used; the first positional argument is
      dropped when ignore_self is true and the first parameter is named "self".

    Callers build the disk path as ``cache_dir/sub_dir/key_id`` (with the
    function name inserted when sub_dir is "funcs"), so sub_dir must separate
    different functions. The key_fn branch therefore uses a per-function
    directory: a shared "custom" directory let two functions whose key
    functions produced the same value read each other's cached results.

    Raises:
        ValueError: if key_fn raises, or if none of keys was supplied.
    """
    func_source = get_source(func)

    # Custom key function (most explicit & fastest when user knows what's important)
    if key_fn is not None:
        try:
            custom_val = key_fn(*args, **kwargs)
        except Exception as e:
            raise ValueError(f"key function for {func.__name__} raised: {e}") from e
        # Single-level directory name that embeds the function identity.
        sub_dir = f"custom_{func.__name__}_{identify(func_source)}"
        key_id = f"{identify(custom_val)}.pkl"
        return func_source, sub_dir, key_id

    # Named keys (back-compat)
    if keys:
        values = _build_named_keys(func, args, kwargs, keys)
        param_hash = identify(values)
        dir_path = f"{func.__name__}_{identify(func_source)}"
        key_id = f"{'_'.join(keys)}_{param_hash}.pkl"
        return func_source, dir_path, key_id

    # Default: full argument identity (optionally ignoring 'self')
    positional_names = inspect.getfullargspec(func).args
    if ignore_self and positional_names and positional_names[0] == "self":
        fid = (func_source, args[1:], kwargs)
    else:
        fid = (func_source, args, kwargs)

    return func_source, "funcs", f"{identify(fid)}.pkl"
|
|
105
222
|
|
|
106
223
|
|
|
107
|
-
def
|
|
224
|
+
def _mem_cache_for(func: Callable[..., Any], size: int) -> cachetools.LRUCache:
    """Return the LRU cache registered for *func*, creating it when needed.

    An existing cache is reused only if its ``maxsize`` equals *size*;
    otherwise a new, empty cache replaces it. Whenever a cache is created the
    module-level ``LRU_MEM_CACHE`` symbol is rebound to it as well.
    """
    global LRU_MEM_CACHE
    with mem_lock:
        existing = _MEM_CACHES.get(func)
        if existing is not None and existing.maxsize == size:
            return existing

        fresh = cachetools.LRUCache(maxsize=size)
        _MEM_CACHES[func] = fresh
        # Keep global symbol backwards-compatible internally
        LRU_MEM_CACHE = fresh
        return fresh
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
# --------------------------------------------------------------------------------------
|
|
239
|
+
# Memory-only memoize (sync / async)
|
|
240
|
+
# --------------------------------------------------------------------------------------
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _memory_memoize(
    func: Callable[P, R],
    size: int,
    keys: Optional[list[str]],
    ignore_self: bool,
    key_fn: Optional[Callable[..., Any]],
) -> Callable[P, R]:
    """Wrap a sync *func* so results are kept in its per-function LRU cache.

    The cache is looked up and written under ``mem_lock``; *func* itself runs
    outside the lock. An entry already present after the call is not
    overwritten.
    """
    mem_cache = _mem_cache_for(func, size)

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        components = _compute_cache_components(
            func, args, kwargs, ignore_self, keys, key_fn
        )
        func_source, sub_dir, key_id = components
        name = identify((func_source, sub_dir, key_id))

        with mem_lock:
            hit = name in mem_cache
            if hit:
                return mem_cache[name]  # type: ignore[return-value]

        computed = func(*args, **kwargs)

        with mem_lock:
            already_there = name in mem_cache
            if not already_there:
                mem_cache[name] = computed  # type: ignore[index]
        return computed

    return wrapper
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _async_memory_memoize(
    func: AsyncFunc[P, R],
    size: int,
    keys: Optional[list[str]],
    ignore_self: bool,
    key_fn: Optional[Callable[..., Any]],
) -> AsyncFunc[P, R]:
    """Wrap an async *func* so results are kept in its per-function LRU cache.

    Concurrent calls with the same key share one in-flight task instead of
    each running *func*.

    No ``asyncio.Lock`` is used. The lookup/registration section and the
    cleanup section contain no ``await``, so other coroutines on the same
    event loop cannot interleave with them. Not creating an asyncio primitive
    at decoration time also avoids binding it to whichever loop exists then
    (see the asyncio.Lock notes for Python < 3.10). The LRU itself is accessed
    under ``mem_lock`` for both reads and writes.
    """
    mem_cache = _mem_cache_for(func, size)

    # Avoid duplicate in-flight computations for the same key
    inflight: dict[str, asyncio.Task[R]] = {}

    @functools.wraps(func)
    async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        func_source, sub_dir, key_id = _compute_cache_components(
            func, args, kwargs, ignore_self, keys, key_fn
        )
        name = identify((func_source, sub_dir, key_id))

        with mem_lock:
            if name in mem_cache:
                return mem_cache[name]  # type: ignore[return-value]

        task = inflight.get(name)
        if task is None:
            task = asyncio.create_task(func(*args, **kwargs))  # type: ignore[arg-type]
            inflight[name] = task

        try:
            result = await task
        finally:
            # Remove only the task this call awaited; a newer task registered
            # under the same key by another caller must stay in place.
            if inflight.get(name) is task:
                del inflight[name]

        with mem_lock:
            mem_cache[name] = result  # type: ignore[index]
        return result

    return wrapper
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# --------------------------------------------------------------------------------------
|
|
315
|
+
# Disk-only memoize (sync / async)
|
|
316
|
+
# --------------------------------------------------------------------------------------
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _disk_memoize(
    func: Callable[P, R],
    keys: Optional[list[str]],
    cache_dir: str,
    ignore_self: bool,
    verbose: bool,
    key_fn: Optional[Callable[..., Any]],
) -> Callable[P, R]:
    """Wrap a sync *func* so results are stored on disk under *cache_dir*.

    Caching is best-effort: a failure to compute the key, read the cache or
    write the cache is logged as a warning (only when *verbose*) and the call
    proceeds without it. *func* is executed at most once per call, and an
    exception it raises propagates to the caller unchanged. Previously the
    call sat inside the cache error handler, so a failing function was run a
    second time.
    """

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        # 1) Work out where the entry lives; on failure run uncached.
        try:
            func_source, sub_dir, key_id = _compute_cache_components(
                func, args, kwargs, ignore_self, keys, key_fn
            )
            if sub_dir == "funcs":
                cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
            else:
                cache_path = osp.join(cache_dir, sub_dir, key_id)
            mkdir_or_exist(osp.dirname(cache_path))
        except Exception as e:
            if verbose:
                logger.opt(depth=1).warning(
                    f"Failed to cache {func.__name__}: {e}, executing without cache"
                )
            return func(*args, **kwargs)

        # 2) Try to serve from disk; an unreadable entry is deleted.
        try:
            with disk_lock:
                if osp.exists(cache_path):
                    try:
                        return load_json_or_pickle(cache_path)
                    except Exception as e:
                        if osp.exists(cache_path):
                            os.remove(cache_path)
                        if verbose:
                            logger.opt(depth=1).warning(
                                f"Error loading cache: {str(e)[:100]}, recomputing"
                            )
        except Exception as e:
            # e.g. the broken entry could not be removed
            if verbose:
                logger.opt(depth=1).warning(
                    f"Failed to cache {func.__name__}: {e}, executing without cache"
                )

        # 3) Compute outside any handler so the function's own errors surface once.
        result = func(*args, **kwargs)

        # 4) Persist; a write failure must not discard or recompute the result.
        try:
            with disk_lock:
                if not osp.exists(cache_path):
                    dump_json_or_pickle(result, cache_path)
        except Exception as e:
            if verbose:
                logger.opt(depth=1).warning(
                    f"Failed to cache {func.__name__}: {e}, returning uncached result"
                )
        return result

    return wrapper
|
|
150
365
|
|
|
151
366
|
|
|
152
|
-
def _async_disk_memoize(
|
|
367
|
+
def _async_disk_memoize(
|
|
368
|
+
func: AsyncFunc[P, R],
|
|
369
|
+
keys: Optional[list[str]],
|
|
370
|
+
cache_dir: str,
|
|
371
|
+
ignore_self: bool,
|
|
372
|
+
verbose: bool,
|
|
373
|
+
key_fn: Optional[Callable[..., Any]],
|
|
374
|
+
) -> AsyncFunc[P, R]:
|
|
153
375
|
@functools.wraps(func)
|
|
154
|
-
async def wrapper(*args, **kwargs):
|
|
376
|
+
async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
|
155
377
|
try:
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
func, args, kwargs, ignore_self, keys
|
|
378
|
+
func_source, sub_dir, key_id = _compute_cache_components(
|
|
379
|
+
func, args, kwargs, ignore_self, keys, key_fn
|
|
159
380
|
)
|
|
160
|
-
if func_source is None:
|
|
161
|
-
return await func(*args, **kwargs)
|
|
162
381
|
if sub_dir == "funcs":
|
|
163
382
|
cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
|
|
164
383
|
else:
|
|
165
384
|
cache_path = osp.join(cache_dir, sub_dir, key_id)
|
|
166
385
|
mkdir_or_exist(osp.dirname(cache_path))
|
|
167
386
|
|
|
168
|
-
|
|
169
|
-
def check_cache():
|
|
387
|
+
def check_cache() -> Optional[R]:
|
|
170
388
|
with disk_lock:
|
|
171
389
|
if osp.exists(cache_path):
|
|
172
390
|
try:
|
|
@@ -174,12 +392,12 @@ def _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose):
|
|
|
174
392
|
except Exception as e:
|
|
175
393
|
if osp.exists(cache_path):
|
|
176
394
|
os.remove(cache_path)
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
395
|
+
if verbose:
|
|
396
|
+
logger.opt(depth=1).warning(
|
|
397
|
+
f"Error loading cache: {str(e)[:100]}, recomputing"
|
|
398
|
+
)
|
|
180
399
|
return None
|
|
181
|
-
|
|
182
|
-
# Run cache check in thread pool to avoid blocking
|
|
400
|
+
|
|
183
401
|
loop = asyncio.get_event_loop()
|
|
184
402
|
cached_result = await loop.run_in_executor(None, check_cache)
|
|
185
403
|
if cached_result is not None:
|
|
@@ -187,157 +405,117 @@ def _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose):
|
|
|
187
405
|
|
|
188
406
|
result = await func(*args, **kwargs)
|
|
189
407
|
|
|
190
|
-
|
|
191
|
-
def write_cache():
|
|
408
|
+
def write_cache() -> None:
|
|
192
409
|
with disk_lock:
|
|
193
410
|
if not osp.exists(cache_path):
|
|
194
411
|
dump_json_or_pickle(result, cache_path)
|
|
195
|
-
|
|
412
|
+
|
|
196
413
|
await loop.run_in_executor(None, write_cache)
|
|
197
414
|
return result
|
|
198
415
|
except Exception as e:
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
416
|
+
if verbose:
|
|
417
|
+
logger.opt(depth=1).warning(
|
|
418
|
+
f"Failed to cache {func.__name__}: {e}, executing without cache"
|
|
419
|
+
)
|
|
202
420
|
return await func(*args, **kwargs)
|
|
203
421
|
|
|
204
422
|
return wrapper
|
|
205
423
|
|
|
206
424
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
LRU_MEM_CACHE = cachetools.LRUCache(maxsize=size)
|
|
211
|
-
|
|
212
|
-
@functools.wraps(func)
|
|
213
|
-
def wrapper(*args, **kwargs):
|
|
214
|
-
func_source, sub_dir, key_id = compute_func_id(
|
|
215
|
-
func, args, kwargs, ignore_self, keys
|
|
216
|
-
)
|
|
217
|
-
if func_source is None:
|
|
218
|
-
return func(*args, **kwargs)
|
|
219
|
-
name = identify((func_source, sub_dir, key_id))
|
|
220
|
-
|
|
221
|
-
if not hasattr(func, "_mem_cache"):
|
|
222
|
-
func._mem_cache = LRU_MEM_CACHE
|
|
223
|
-
|
|
224
|
-
with mem_lock:
|
|
225
|
-
if name in func._mem_cache:
|
|
226
|
-
# logger.debug(f"Cache HIT (memory) for {func.__name__}, key={name}")
|
|
227
|
-
return func._mem_cache[name]
|
|
425
|
+
# --------------------------------------------------------------------------------------
|
|
426
|
+
# Memory+Disk (sync / async)
|
|
427
|
+
# --------------------------------------------------------------------------------------
|
|
228
428
|
|
|
229
|
-
result = func(*args, **kwargs)
|
|
230
429
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
global LRU_MEM_CACHE
|
|
241
|
-
if LRU_MEM_CACHE.maxsize != size:
|
|
242
|
-
LRU_MEM_CACHE = cachetools.LRUCache(maxsize=size)
|
|
430
|
+
def both_memoize(
    func: Callable[P, R],
    keys: Optional[list[str]],
    cache_dir: str,
    ignore_self: bool,
    size: int,
    key_fn: Optional[Callable[..., Any]],
) -> Callable[P, R]:
    """Wrap a sync *func* with a memory LRU cache backed by a disk cache.

    Lookup order is memory, then disk, then the function itself. A value read
    from disk is copied into memory. A disk value of ``None``, or a file that
    fails to load (it is deleted), counts as a miss. After computing, the
    result is written to disk if no file exists yet and always stored in memory.
    """
    mem_cache = _mem_cache_for(func, size)

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        func_source, sub_dir, key_id = _compute_cache_components(
            func, args, kwargs, ignore_self, keys, key_fn
        )
        mem_key = identify((func_source, sub_dir, key_id))

        # Memory first
        with mem_lock:
            if mem_key in mem_cache:
                return mem_cache[mem_key]  # type: ignore[return-value]

        # Disk next
        path_parts = [cache_dir, sub_dir]
        if sub_dir == "funcs":
            path_parts.append(func.__name__)
        path_parts.append(key_id)
        cache_path = osp.join(*path_parts)
        mkdir_or_exist(osp.dirname(cache_path))

        from_disk: Optional[R] = None
        with disk_lock:
            if osp.exists(cache_path):
                try:
                    from_disk = load_json_or_pickle(cache_path)
                except Exception:
                    if osp.exists(cache_path):
                        os.remove(cache_path)
                    from_disk = None

        if from_disk is not None:
            with mem_lock:
                mem_cache[mem_key] = from_disk  # type: ignore[index]
            return from_disk

        # Miss: compute, then write both
        computed = func(*args, **kwargs)
        with disk_lock:
            if not osp.exists(cache_path):
                dump_json_or_pickle(computed, cache_path)
        with mem_lock:
            mem_cache[mem_key] = computed  # type: ignore[index]
        return computed

    return wrapper
|
|
313
484
|
|
|
314
485
|
|
|
315
|
-
def _async_both_memoize(
|
|
486
|
+
def _async_both_memoize(
|
|
487
|
+
func: AsyncFunc[P, R],
|
|
488
|
+
keys: Optional[list[str]],
|
|
489
|
+
cache_dir: str,
|
|
490
|
+
ignore_self: bool,
|
|
491
|
+
size: int,
|
|
492
|
+
key_fn: Optional[Callable[..., Any]],
|
|
493
|
+
) -> AsyncFunc[P, R]:
|
|
494
|
+
mem_cache = _mem_cache_for(func, size)
|
|
495
|
+
|
|
496
|
+
inflight: dict[str, asyncio.Task[R]] = {}
|
|
497
|
+
alock = asyncio.Lock()
|
|
498
|
+
|
|
316
499
|
@functools.wraps(func)
|
|
317
|
-
async def wrapper(*args, **kwargs):
|
|
318
|
-
func_source, sub_dir, key_id =
|
|
319
|
-
func, args, kwargs, ignore_self, keys
|
|
500
|
+
async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
|
501
|
+
func_source, sub_dir, key_id = _compute_cache_components(
|
|
502
|
+
func, args, kwargs, ignore_self, keys, key_fn
|
|
320
503
|
)
|
|
321
|
-
if func_source is None:
|
|
322
|
-
return await func(*args, **kwargs)
|
|
323
|
-
|
|
324
504
|
mem_key = identify((func_source, sub_dir, key_id))
|
|
325
|
-
if not hasattr(func, "_mem_cache"):
|
|
326
|
-
func._mem_cache = LRU_MEM_CACHE
|
|
327
505
|
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
return
|
|
506
|
+
# Memory
|
|
507
|
+
async with alock:
|
|
508
|
+
if mem_key in mem_cache:
|
|
509
|
+
return mem_cache[mem_key] # type: ignore[return-value]
|
|
332
510
|
|
|
511
|
+
# Disk
|
|
333
512
|
if sub_dir == "funcs":
|
|
334
513
|
cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
|
|
335
514
|
else:
|
|
336
515
|
cache_path = osp.join(cache_dir, sub_dir, key_id)
|
|
337
516
|
mkdir_or_exist(osp.dirname(cache_path))
|
|
338
517
|
|
|
339
|
-
|
|
340
|
-
def check_disk_cache():
|
|
518
|
+
def check_disk_cache() -> Optional[R]:
|
|
341
519
|
with disk_lock:
|
|
342
520
|
if osp.exists(cache_path):
|
|
343
521
|
return load_json_or_pickle(cache_path)
|
|
@@ -345,150 +523,142 @@ def _async_both_memoize(func, keys, cache_dir, ignore_self):
|
|
|
345
523
|
|
|
346
524
|
loop = asyncio.get_event_loop()
|
|
347
525
|
disk_result = await loop.run_in_executor(None, check_disk_cache)
|
|
348
|
-
|
|
526
|
+
|
|
349
527
|
if disk_result is not None:
|
|
350
528
|
with mem_lock:
|
|
351
|
-
|
|
529
|
+
mem_cache[mem_key] = disk_result # type: ignore[index]
|
|
352
530
|
return disk_result
|
|
353
531
|
|
|
354
|
-
#
|
|
355
|
-
|
|
532
|
+
# Avoid duplicate async work for same key
|
|
533
|
+
async with alock:
|
|
534
|
+
task = inflight.get(mem_key)
|
|
535
|
+
if task is None:
|
|
536
|
+
task = asyncio.create_task(func(*args, **kwargs)) # type: ignore[arg-type]
|
|
537
|
+
inflight[mem_key] = task
|
|
538
|
+
|
|
539
|
+
try:
|
|
540
|
+
result = await task
|
|
541
|
+
finally:
|
|
542
|
+
async with alock:
|
|
543
|
+
inflight.pop(mem_key, None)
|
|
356
544
|
|
|
357
|
-
|
|
358
|
-
def write_disk_cache():
|
|
545
|
+
def write_disk_cache() -> None:
|
|
359
546
|
with disk_lock:
|
|
360
547
|
if not osp.exists(cache_path):
|
|
361
548
|
dump_json_or_pickle(result, cache_path)
|
|
362
549
|
|
|
363
550
|
await loop.run_in_executor(None, write_disk_cache)
|
|
364
|
-
|
|
551
|
+
|
|
365
552
|
with mem_lock:
|
|
366
|
-
|
|
553
|
+
mem_cache[mem_key] = result # type: ignore[index]
|
|
367
554
|
return result
|
|
368
555
|
|
|
369
556
|
return wrapper
|
|
370
557
|
|
|
371
558
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
keys=None,
|
|
376
|
-
cache_dir=SPEED_CACHE_DIR,
|
|
377
|
-
cache_type: Literal["memory", "disk", "both"] = "disk",
|
|
378
|
-
size=10240,
|
|
379
|
-
ignore_self=True,
|
|
380
|
-
verbose=False,
|
|
381
|
-
):
|
|
382
|
-
if "~/" in cache_dir:
|
|
383
|
-
cache_dir = osp.expanduser(cache_dir)
|
|
559
|
+
# --------------------------------------------------------------------------------------
|
|
560
|
+
# Public decorator (only export memoize)
|
|
561
|
+
# --------------------------------------------------------------------------------------
|
|
384
562
|
|
|
385
|
-
def decorator(func):
|
|
386
|
-
# Check if function is async
|
|
387
|
-
is_async = inspect.iscoroutinefunction(func)
|
|
388
|
-
|
|
389
|
-
if cache_type == "memory":
|
|
390
|
-
if is_async:
|
|
391
|
-
return _async_memory_memoize(
|
|
392
|
-
func,
|
|
393
|
-
size,
|
|
394
|
-
keys,
|
|
395
|
-
ignore_self,
|
|
396
|
-
)
|
|
397
|
-
return _memory_memoize(
|
|
398
|
-
func,
|
|
399
|
-
size,
|
|
400
|
-
keys,
|
|
401
|
-
ignore_self,
|
|
402
|
-
)
|
|
403
|
-
elif cache_type == "disk":
|
|
404
|
-
if is_async:
|
|
405
|
-
return _async_disk_memoize(
|
|
406
|
-
func,
|
|
407
|
-
keys,
|
|
408
|
-
cache_dir,
|
|
409
|
-
ignore_self,
|
|
410
|
-
verbose,
|
|
411
|
-
)
|
|
412
|
-
return _disk_memoize(
|
|
413
|
-
func,
|
|
414
|
-
keys,
|
|
415
|
-
cache_dir,
|
|
416
|
-
ignore_self,
|
|
417
|
-
verbose,
|
|
418
|
-
)
|
|
419
|
-
|
|
420
|
-
# cache_type == "both"
|
|
421
|
-
if is_async:
|
|
422
|
-
return _async_both_memoize(
|
|
423
|
-
func,
|
|
424
|
-
keys,
|
|
425
|
-
cache_dir,
|
|
426
|
-
ignore_self,
|
|
427
|
-
)
|
|
428
|
-
return both_memoize(
|
|
429
|
-
func,
|
|
430
|
-
keys,
|
|
431
|
-
cache_dir,
|
|
432
|
-
verbose,
|
|
433
|
-
)
|
|
434
563
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
564
|
+
@overload
def memoize(
    _func: Callable[P, R],
    *,
    keys: Optional[list[str]] = ...,
    key: Optional[Callable[..., Any]] = ...,
    cache_dir: str = ...,
    cache_type: Literal["memory", "disk", "both"] = ...,
    size: int = ...,
    ignore_self: bool = ...,
    verbose: bool = ...,
) -> Callable[P, R]: ...
@overload
def memoize(
    _func: Callable[P, Awaitable[R]],
    *,
    keys: Optional[list[str]] = ...,
    key: Optional[Callable[..., Any]] = ...,
    cache_dir: str = ...,
    cache_type: Literal["memory", "disk", "both"] = ...,
    size: int = ...,
    ignore_self: bool = ...,
    verbose: bool = ...,
) -> Callable[P, Awaitable[R]]: ...
@overload
def memoize(
    _func: None = ...,
    *,
    keys: Optional[list[str]] = ...,
    key: Optional[Callable[..., Any]] = ...,
    cache_dir: str = ...,
    cache_type: Literal["memory", "disk", "both"] = ...,
    size: int = ...,
    ignore_self: bool = ...,
    verbose: bool = ...,
) -> Callable[[Callable[P, R]], Callable[P, R]]: ...


def memoize(
    _func: Optional[Callable[P, Any]] = None,
    *,
    keys: Optional[list[str]] = None,
    key: Optional[Callable[..., Any]] = None,
    cache_dir: str = SPEED_CACHE_DIR,
    cache_type: Literal["memory", "disk", "both"] = "both",
    size: int = 256,
    ignore_self: bool = True,
    verbose: bool = False,
):
    """
    Universal memoizer that supports sync and async functions, preserves annotations
    for Pylance via ParamSpec/TypeVar, and caches in memory + disk by default.

    Usable both as ``@memoize`` and as ``@memoize(...)``.

    - keys: list of argument names to include in key (back-compat).
    - key: custom callable (*args, **kwargs) -> hashable/serializable object for keying.
      Prefer this for performance on big inputs (e.g., key=lambda x: x.id).
    - cache_dir: disk cache base directory; a leading ``~`` or ``~user`` is expanded.
    - cache_type: "memory" | "disk" | "both" (default "both"). Any other value is
      handled like "both".
    - size: memory LRU size per-function (default 256 items).
    - ignore_self: ignore 'self' when building the default key for bound methods.
    - verbose: wrap the function with ``timef`` and, for the "disk" cache type,
      emit warnings on cache load/write errors.
    """
    # expanduser only rewrites a leading "~", so calling it unconditionally is
    # safe and also covers "~" and "~user" forms.
    cache_dir = osp.expanduser(cache_dir)

    def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
        # Detect async on the original function, before any wrapping.
        is_async = inspect.iscoroutinefunction(func)

        if verbose:
            # Imported lazily: it is only needed for verbose mode, and this
            # keeps non-verbose use independent of the package-level import.
            from speedy_utils import timef

            target_func = timef(func)
        else:
            target_func = func

        if cache_type == "memory":
            if is_async:
                return _async_memory_memoize(target_func, size, keys, ignore_self, key)  # type: ignore[return-value]
            return _memory_memoize(target_func, size, keys, ignore_self, key)  # type: ignore[return-value]

        if cache_type == "disk":
            if is_async:
                return _async_disk_memoize(
                    target_func, keys, cache_dir, ignore_self, verbose, key
                )  # type: ignore[return-value]
            return _disk_memoize(
                target_func, keys, cache_dir, ignore_self, verbose, key
            )  # type: ignore[return-value]

        # cache_type == "both"
        if is_async:
            return _async_both_memoize(
                target_func, keys, cache_dir, ignore_self, size, key
            )  # type: ignore[return-value]
        return both_memoize(target_func, keys, cache_dir, ignore_self, size, key)  # type: ignore[return-value]

    # Support both @memoize and @memoize(...)
    if _func is None:
        return decorator
    return decorator(_func)
|
|
492
662
|
|
|
493
663
|
|
|
494
|
-
__all__ = ["memoize", "identify"
|
|
664
|
+
__all__ = ["memoize", "identify"]
|