speedy-utils 1.1.9__py3-none-any.whl → 1.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -389,7 +389,7 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
389
389
  input_data: InputModelType,
390
390
  expected_response: Optional[OutputModelType] = None,
391
391
  label: Optional[str] = None,
392
- cache_dir: pathlib.Path = DEFAULT_CACHE_DIR,
392
+ cache_dir: Optional[pathlib.Path] = None,
393
393
  ) -> OutputModelType:
394
394
  """
395
395
  Generate training data for both thinking and non-thinking modes.
@@ -415,6 +415,10 @@ class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
415
415
  # Create non-thinking mode equivalent
416
416
  no_think_messages = self._create_no_think_messages(think_messages)
417
417
 
418
+ # Use default cache directory if none provided
419
+ if cache_dir is None:
420
+ cache_dir = self.DEFAULT_CACHE_DIR or pathlib.Path("./cache")
421
+
418
422
  # Save training data
419
423
  self._save_training_data(
420
424
  input_data=input_data,
@@ -96,12 +96,16 @@ class AsyncLM(AsyncLMBase):
96
96
 
97
97
  async def _unified_client_call(
98
98
  self,
99
- messages: list[dict],
99
+ messages: RawMsgs,
100
100
  extra_body: Optional[dict] = None,
101
101
  cache_suffix: str = "",
102
102
  ) -> dict:
103
103
  """Unified method for all client interactions with caching and error handling."""
104
- converted_messages = self._convert_messages(messages)
104
+ converted_messages: Messages = (
105
+ self._convert_messages(cast(LegacyMsgs, messages))
106
+ if messages and isinstance(messages[0], dict)
107
+ else cast(Messages, messages)
108
+ )
105
109
  cache_key = None
106
110
  completion = None
107
111
 
@@ -385,3 +389,13 @@ class AsyncLM(AsyncLMBase):
385
389
  raise ValueError(
386
390
  f"Failed to validate against response model {response_model.__name__}: {exc}\nRaw content: {content}"
387
391
  ) from exc
392
+
393
+ async def __aenter__(self):
394
+ return self
395
+
396
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
397
+ if hasattr(self, "_last_client"):
398
+ last_client = self._last_client # type: ignore
399
+ await last_client._client.aclose()
400
+ else:
401
+ logger.warning("No last client to close")
speedy_utils/__init__.py CHANGED
@@ -108,7 +108,7 @@ from .common.notebook_utils import (
108
108
  )
109
109
 
110
110
  # Cache utilities
111
- from .common.utils_cache import amemoize, identify, identify_uuid, memoize
111
+ from .common.utils_cache import identify, identify_uuid, memoize
112
112
 
113
113
  # IO utilities
114
114
  from .common.utils_io import (
@@ -197,7 +197,6 @@ __all__ = [
197
197
  # Function decorators
198
198
  "retry_runtime",
199
199
  # Cache utilities
200
- "amemoize",
201
200
  "memoize",
202
201
  "identify",
203
202
  "identify_uuid",
@@ -227,5 +226,4 @@ __all__ = [
227
226
  "multi_thread",
228
227
  # Notebook utilities
229
228
  "change_dir",
230
- "amemoize",
231
229
  ]
@@ -6,8 +6,15 @@ import os
6
6
  import os.path as osp
7
7
  import pickle
8
8
  import uuid
9
+ import weakref
9
10
  from threading import Lock
10
- from typing import Any, Awaitable, Callable, Literal, TypeVar
11
+ from typing import Any, Awaitable, Callable, Literal, Optional, TypeVar, overload
12
+
13
+ try:
14
+ # Python 3.10+
15
+ from typing import ParamSpec
16
+ except ImportError: # pragma: no cover
17
+ from typing_extensions import ParamSpec # type: ignore
11
18
 
12
19
  import cachetools
13
20
  import pandas as pd
@@ -18,155 +25,366 @@ from pydantic import BaseModel
18
25
  from speedy_utils.common.utils_io import dump_json_or_pickle, load_json_or_pickle
19
26
  from speedy_utils.common.utils_misc import mkdir_or_exist
20
27
 
21
- SPEED_CACHE_DIR = osp.join(osp.expanduser("~"), ".cache/speedy_cache")
22
- LRU_MEM_CACHE = cachetools.LRUCache(maxsize=128_000)
28
+ # --------------------------------------------------------------------------------------
29
+ # Defaults / Globals
30
+ # --------------------------------------------------------------------------------------
23
31
 
24
- thread_locker = Lock()
32
+ SPEED_CACHE_DIR = osp.join(osp.expanduser("~"), ".cache/speedy_cache")
25
33
 
26
- # Add two locks for thread-safe cache access
34
+ # Thread locks for safety
27
35
  disk_lock = Lock()
28
36
  mem_lock = Lock()
29
37
 
30
- # Add async-specific types
31
- T = TypeVar('T')
32
- AsyncFunc = Callable[..., Awaitable[T]]
38
+ # Quick identifier cache for big objects that support weakref
39
+ # (prevents recomputing expensive keys for the same object instance)
40
+ _QUICK_ID_MAP: "weakref.WeakKeyDictionary[Any, str]" = weakref.WeakKeyDictionary()
33
41
 
42
+ # Per-function memory caches (so different functions can have different LRU sizes)
43
+ _MEM_CACHES: "weakref.WeakKeyDictionary[Callable[..., Any], cachetools.LRUCache]" = (
44
+ weakref.WeakKeyDictionary()
45
+ )
34
46
 
35
- def compute_func_id(func, args, kwargs, ignore_self, keys):
36
- func_source = get_source(func)
37
- if keys:
38
- arg_spec = inspect.getfullargspec(func).args
39
- used_args = {arg_spec[i]: arg for i, arg in enumerate(args)}
40
- used_args.update(kwargs)
41
- values = [used_args[k] for k in keys if k in used_args]
42
- if not values:
43
- raise ValueError(f"Keys {keys} not found in function arguments")
44
- param_hash = identify(values)
45
- dir_path = f"{func.__name__}_{identify(func_source)}"
46
- key_id = f"{'_'.join(keys)}_{param_hash}.pkl"
47
- return func_source, dir_path, key_id
47
+ # Backward-compat global symbol (internal only; not exported)
48
+ LRU_MEM_CACHE = cachetools.LRUCache(maxsize=256)
48
49
 
49
- if (
50
- inspect.getfullargspec(func).args
51
- and inspect.getfullargspec(func).args[0] == "self"
52
- and ignore_self
53
- ):
54
- fid = (func_source, args[1:], kwargs)
55
- else:
56
- fid = (func_source, args, kwargs)
57
- return func_source, "funcs", f"{identify(fid)}.pkl"
50
+ # Typing helpers
51
+ P = ParamSpec("P")
52
+ R = TypeVar("R")
53
+ AsyncFunc = Callable[P, Awaitable[R]]
54
+
55
+ # --------------------------------------------------------------------------------------
56
+ # Utilities
57
+ # --------------------------------------------------------------------------------------
58
58
 
59
59
 
60
60
  def fast_serialize(x: Any) -> bytes:
61
+ """Serialize x quickly; JSON if possible (stable), else pickle."""
61
62
  try:
62
- return json.dumps(x, sort_keys=True).encode("utf-8")
63
+ return json.dumps(x, sort_keys=True, default=str).encode("utf-8")
63
64
  except (TypeError, ValueError):
64
65
  return pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
65
66
 
66
67
 
67
- def identify(obj: Any, depth=0, max_depth=2) -> str:
68
+ def identify_uuid(x: Any) -> str:
69
+ data = fast_serialize(x)
70
+ hash_obj = xxhash.xxh128(data, seed=0)
71
+ return str(uuid.UUID(bytes=hash_obj.digest()))
72
+
73
+
74
+ def get_source(func: Callable[..., Any]) -> str:
75
+ """Minified function source; falls back to module + qualname for builtins/lambdas."""
76
+ try:
77
+ code = inspect.getsource(func)
78
+ except OSError:
79
+ # source not available (e.g., builtins, some C extensions)
80
+ mod = getattr(func, "__module__", "unknown")
81
+ qn = getattr(func, "__qualname__", getattr(func, "__name__", "unknown"))
82
+ code = f"{mod}.{qn}"
83
+ # normalize whitespace to make it stable
84
+ for r in (" ", "\n", "\t", "\r"):
85
+ code = code.replace(r, "")
86
+ return code
87
+
88
+
89
+ def _try_get_quick_id(obj: Any) -> Optional[str]:
90
+ """Return a quick identifier if obj is weakref-able and cached."""
91
+ try:
92
+ return _QUICK_ID_MAP.get(obj) # type: ignore[arg-type]
93
+ except TypeError:
94
+ # not weakref-able (e.g., list/dict); cannot use WeakKeyDictionary
95
+ return None
96
+
97
+
98
+ def _try_store_quick_id(obj: Any, ident: str) -> None:
99
+ """Store quick identifier if obj is weakref-able."""
100
+ try:
101
+ _QUICK_ID_MAP[obj] = ident # type: ignore[index]
102
+ except TypeError:
103
+ # not weakref-able
104
+ pass
105
+
106
+
107
+ def identify(obj: Any, depth: int = 0, max_depth: int = 2) -> str:
108
+ """
109
+ Produce a stable, content-based identifier string for arbitrary Python objects.
110
+ Includes a quick path using a weakref cache for large, user-defined objects.
111
+ """
112
+ # Quick-path for user-defined objects (weakref-able)
113
+ if depth == 0:
114
+ quick = _try_get_quick_id(obj)
115
+ if quick is not None:
116
+ return quick
117
+
68
118
  if isinstance(obj, (list, tuple)):
69
119
  x = [identify(x, depth + 1, max_depth) for x in obj]
70
120
  x = "\n".join(x)
71
- return identify(x, depth + 1, max_depth)
72
- # is pandas row or dict
121
+ out = identify(x, depth + 1, max_depth)
122
+ if depth == 0:
123
+ _try_store_quick_id(obj, out)
124
+ return out
73
125
  elif isinstance(obj, (pd.DataFrame, pd.Series)):
74
126
  x = str(obj.to_dict())
75
- return identify(x, depth + 1, max_depth)
127
+ out = identify(x, depth + 1, max_depth)
128
+ if depth == 0:
129
+ _try_store_quick_id(obj, out)
130
+ return out
76
131
  elif hasattr(obj, "__code__"):
77
- return identify(get_source(obj), depth + 1, max_depth)
132
+ out = identify(get_source(obj), depth + 1, max_depth)
133
+ if depth == 0:
134
+ _try_store_quick_id(obj, out)
135
+ return out
78
136
  elif isinstance(obj, BaseModel):
79
- obj = obj.model_dump()
80
- return identify(obj, depth + 1, max_depth)
137
+ out = identify(obj.model_dump(), depth + 1, max_depth)
138
+ if depth == 0:
139
+ _try_store_quick_id(obj, out)
140
+ return out
81
141
  elif isinstance(obj, dict):
82
142
  ks = sorted(obj.keys())
83
143
  vs = [identify(obj[k], depth + 1, max_depth) for k in ks]
84
- return identify([ks, vs], depth + 1, max_depth)
144
+ out = identify([ks, vs], depth + 1, max_depth)
145
+ if depth == 0:
146
+ _try_store_quick_id(obj, out)
147
+ return out
85
148
  elif obj is None:
86
- return identify("None", depth + 1, max_depth)
149
+ out = identify("None", depth + 1, max_depth)
150
+ if depth == 0:
151
+ _try_store_quick_id(obj, out)
152
+ return out
87
153
  else:
88
- # primitive_types = [int, float, str, bool]
89
- # if not type(obj) in primitive_types:
90
- # logger.warning(f"Unknown type: {type(obj)}")
91
- return xxhash.xxh64_hexdigest(fast_serialize(obj), seed=0)
154
+ # primitives / everything else
155
+ out = xxhash.xxh64_hexdigest(fast_serialize(obj), seed=0)
156
+ if depth == 0:
157
+ _try_store_quick_id(obj, out)
158
+ return out
159
+
160
+
161
+ def _build_named_keys(
162
+ func: Callable[..., Any],
163
+ args: tuple[Any, ...],
164
+ kwargs: dict[str, Any],
165
+ keys: list[str],
166
+ ) -> list[Any]:
167
+ """Extract named parameters in order from args/kwargs for keying."""
168
+ arg_spec = inspect.getfullargspec(func).args
169
+ used_args = {arg_spec[i]: arg for i, arg in enumerate(args[: len(arg_spec)])}
170
+ used_args.update(kwargs)
171
+ values = [used_args[k] for k in keys if k in used_args]
172
+ if not values:
173
+ raise ValueError(f"Keys {keys} not found in function arguments")
174
+ return values
175
+
176
+
177
+ def _compute_cache_components(
178
+ func: Callable[..., Any],
179
+ args: tuple[Any, ...],
180
+ kwargs: dict[str, Any],
181
+ ignore_self: bool,
182
+ keys: Optional[list[str]],
183
+ key_fn: Optional[Callable[..., Any]],
184
+ ):
185
+ """
186
+ Return (func_source, sub_dir, key_id) for disk paths and memory keying.
187
+ - If key_fn provided, it determines the cache key content.
188
+ - Else if keys list provided, use those argument names.
189
+ - Else use full (args, kwargs), optionally ignoring 'self' for methods.
190
+ """
191
+ func_source = get_source(func)
92
192
 
193
+ # Custom key function (most explicit & fastest when user knows what's important)
194
+ if key_fn is not None:
195
+ try:
196
+ custom_val = key_fn(*args, **kwargs)
197
+ except Exception as e:
198
+ raise ValueError(f"key function for {func.__name__} raised: {e}") from e
199
+ sub_dir = "custom"
200
+ key_id = f"{identify(custom_val)}.pkl"
201
+ return func_source, sub_dir, key_id
93
202
 
94
- def identify_uuid(x: Any) -> str:
95
- data = fast_serialize(x)
96
- hash_obj = xxhash.xxh128(data, seed=0)
97
- return str(uuid.UUID(bytes=hash_obj.digest()))
203
+ # Named keys (back-compat)
204
+ if keys:
205
+ values = _build_named_keys(func, args, kwargs, keys)
206
+ param_hash = identify(values)
207
+ dir_path = f"{func.__name__}_{identify(func_source)}"
208
+ key_id = f"{'_'.join(keys)}_{param_hash}.pkl"
209
+ return func_source, dir_path, key_id
98
210
 
211
+ # Default: full argument identity (optionally ignoring 'self')
212
+ if (
213
+ inspect.getfullargspec(func).args
214
+ and inspect.getfullargspec(func).args[0] == "self"
215
+ and ignore_self
216
+ ):
217
+ fid = (func_source, args[1:], kwargs)
218
+ else:
219
+ fid = (func_source, args, kwargs)
99
220
 
100
- def get_source(func):
101
- code = inspect.getsource(func)
102
- for r in [" ", "\n", "\t", "\r"]:
103
- code = code.replace(r, "")
104
- return code
221
+ return func_source, "funcs", f"{identify(fid)}.pkl"
222
+
223
+
224
+ def _mem_cache_for(func: Callable[..., Any], size: int) -> cachetools.LRUCache:
225
+ """Get or create a per-function LRU cache with the given size."""
226
+ # Keep a per-function cache to avoid cross-talk of maxsize across functions
227
+ with mem_lock:
228
+ cache = _MEM_CACHES.get(func)
229
+ if cache is None or cache.maxsize != size:
230
+ cache = cachetools.LRUCache(maxsize=size)
231
+ _MEM_CACHES[func] = cache
232
+ # Keep global symbol backwards-compatible internally
233
+ global LRU_MEM_CACHE
234
+ LRU_MEM_CACHE = cache
235
+ return cache
236
+
237
+
238
+ # --------------------------------------------------------------------------------------
239
+ # Memory-only memoize (sync / async)
240
+ # --------------------------------------------------------------------------------------
241
+
242
+
243
+ def _memory_memoize(
244
+ func: Callable[P, R],
245
+ size: int,
246
+ keys: Optional[list[str]],
247
+ ignore_self: bool,
248
+ key_fn: Optional[Callable[..., Any]],
249
+ ) -> Callable[P, R]:
250
+ mem_cache = _mem_cache_for(func, size)
251
+
252
+ @functools.wraps(func)
253
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
254
+ func_source, sub_dir, key_id = _compute_cache_components(
255
+ func, args, kwargs, ignore_self, keys, key_fn
256
+ )
257
+ name = identify((func_source, sub_dir, key_id))
258
+
259
+ with mem_lock:
260
+ if name in mem_cache:
261
+ return mem_cache[name] # type: ignore[return-value]
262
+
263
+ result = func(*args, **kwargs)
264
+
265
+ with mem_lock:
266
+ if name not in mem_cache:
267
+ mem_cache[name] = result # type: ignore[index]
268
+ return result
269
+
270
+ return wrapper
271
+
272
+
273
+ def _async_memory_memoize(
274
+ func: AsyncFunc[P, R],
275
+ size: int,
276
+ keys: Optional[list[str]],
277
+ ignore_self: bool,
278
+ key_fn: Optional[Callable[..., Any]],
279
+ ) -> AsyncFunc[P, R]:
280
+ mem_cache = _mem_cache_for(func, size)
281
+
282
+ # Avoid duplicate in-flight computations for the same key
283
+ inflight: dict[str, asyncio.Task[R]] = {}
284
+ alock = asyncio.Lock()
285
+
286
+ @functools.wraps(func)
287
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
288
+ func_source, sub_dir, key_id = _compute_cache_components(
289
+ func, args, kwargs, ignore_self, keys, key_fn
290
+ )
291
+ name = identify((func_source, sub_dir, key_id))
292
+
293
+ async with alock:
294
+ if name in mem_cache:
295
+ return mem_cache[name] # type: ignore[return-value]
296
+ task = inflight.get(name)
297
+ if task is None:
298
+ task = asyncio.create_task(func(*args, **kwargs)) # type: ignore[arg-type]
299
+ inflight[name] = task
300
+
301
+ try:
302
+ result = await task
303
+ finally:
304
+ async with alock:
305
+ inflight.pop(name, None)
306
+
307
+ with mem_lock:
308
+ mem_cache[name] = result # type: ignore[index]
309
+ return result
310
+
311
+ return wrapper
312
+
313
+
314
+ # --------------------------------------------------------------------------------------
315
+ # Disk-only memoize (sync / async)
316
+ # --------------------------------------------------------------------------------------
105
317
 
106
318
 
107
- def _disk_memoize(func, keys, cache_dir, ignore_self, verbose):
319
+ def _disk_memoize(
320
+ func: Callable[P, R],
321
+ keys: Optional[list[str]],
322
+ cache_dir: str,
323
+ ignore_self: bool,
324
+ verbose: bool,
325
+ key_fn: Optional[Callable[..., Any]],
326
+ ) -> Callable[P, R]:
108
327
  @functools.wraps(func)
109
- def wrapper(*args, **kwargs):
328
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
110
329
  try:
111
- # Compute cache path as before
112
- func_source, sub_dir, key_id = compute_func_id(
113
- func, args, kwargs, ignore_self, keys
330
+ func_source, sub_dir, key_id = _compute_cache_components(
331
+ func, args, kwargs, ignore_self, keys, key_fn
114
332
  )
115
- if func_source is None:
116
- return func(*args, **kwargs)
117
333
  if sub_dir == "funcs":
118
334
  cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
119
335
  else:
120
336
  cache_path = osp.join(cache_dir, sub_dir, key_id)
121
337
  mkdir_or_exist(osp.dirname(cache_path))
122
338
 
123
- # First check with disk lock
124
339
  with disk_lock:
125
340
  if osp.exists(cache_path):
126
- # logger.debug(f"Cache HIT for {func.__name__}, key={cache_path}")
127
341
  try:
128
342
  return load_json_or_pickle(cache_path)
129
343
  except Exception as e:
130
344
  if osp.exists(cache_path):
131
345
  os.remove(cache_path)
132
- logger.opt(depth=1).warning(
133
- f"Error loading cache: {str(e)[:100]}, continue to recompute"
134
- )
346
+ if verbose:
347
+ logger.opt(depth=1).warning(
348
+ f"Error loading cache: {str(e)[:100]}, recomputing"
349
+ )
135
350
 
136
351
  result = func(*args, **kwargs)
137
352
 
138
- # Write result under disk lock to avoid race conditions
139
353
  with disk_lock:
140
354
  if not osp.exists(cache_path):
141
355
  dump_json_or_pickle(result, cache_path)
142
356
  return result
143
357
  except Exception as e:
144
- logger.opt(depth=1).warning(
145
- f"Failed to cache {func.__name__}: {e}, continue to recompute without cache"
146
- )
358
+ if verbose:
359
+ logger.opt(depth=1).warning(
360
+ f"Failed to cache {func.__name__}: {e}, executing without cache"
361
+ )
147
362
  return func(*args, **kwargs)
148
363
 
149
364
  return wrapper
150
365
 
151
366
 
152
- def _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose):
367
+ def _async_disk_memoize(
368
+ func: AsyncFunc[P, R],
369
+ keys: Optional[list[str]],
370
+ cache_dir: str,
371
+ ignore_self: bool,
372
+ verbose: bool,
373
+ key_fn: Optional[Callable[..., Any]],
374
+ ) -> AsyncFunc[P, R]:
153
375
  @functools.wraps(func)
154
- async def wrapper(*args, **kwargs):
376
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
155
377
  try:
156
- # Compute cache path as before
157
- func_source, sub_dir, key_id = compute_func_id(
158
- func, args, kwargs, ignore_self, keys
378
+ func_source, sub_dir, key_id = _compute_cache_components(
379
+ func, args, kwargs, ignore_self, keys, key_fn
159
380
  )
160
- if func_source is None:
161
- return await func(*args, **kwargs)
162
381
  if sub_dir == "funcs":
163
382
  cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
164
383
  else:
165
384
  cache_path = osp.join(cache_dir, sub_dir, key_id)
166
385
  mkdir_or_exist(osp.dirname(cache_path))
167
386
 
168
- # First check with disk lock (run in thread to avoid blocking)
169
- def check_cache():
387
+ def check_cache() -> Optional[R]:
170
388
  with disk_lock:
171
389
  if osp.exists(cache_path):
172
390
  try:
@@ -174,12 +392,12 @@ def _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose):
174
392
  except Exception as e:
175
393
  if osp.exists(cache_path):
176
394
  os.remove(cache_path)
177
- logger.opt(depth=1).warning(
178
- f"Error loading cache: {str(e)[:100]}, continue to recompute"
179
- )
395
+ if verbose:
396
+ logger.opt(depth=1).warning(
397
+ f"Error loading cache: {str(e)[:100]}, recomputing"
398
+ )
180
399
  return None
181
-
182
- # Run cache check in thread pool to avoid blocking
400
+
183
401
  loop = asyncio.get_event_loop()
184
402
  cached_result = await loop.run_in_executor(None, check_cache)
185
403
  if cached_result is not None:
@@ -187,157 +405,112 @@ def _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose):
187
405
 
188
406
  result = await func(*args, **kwargs)
189
407
 
190
- # Write result under disk lock (run in thread to avoid blocking)
191
- def write_cache():
408
+ def write_cache() -> None:
192
409
  with disk_lock:
193
410
  if not osp.exists(cache_path):
194
411
  dump_json_or_pickle(result, cache_path)
195
-
412
+
196
413
  await loop.run_in_executor(None, write_cache)
197
414
  return result
198
415
  except Exception as e:
199
- logger.opt(depth=1).warning(
200
- f"Failed to cache {func.__name__}: {e}, continue to recompute without cache"
201
- )
416
+ if verbose:
417
+ logger.opt(depth=1).warning(
418
+ f"Failed to cache {func.__name__}: {e}, executing without cache"
419
+ )
202
420
  return await func(*args, **kwargs)
203
421
 
204
422
  return wrapper
205
423
 
206
424
 
207
- def _memory_memoize(func, size, keys, ignore_self):
208
- global LRU_MEM_CACHE
209
- if LRU_MEM_CACHE.maxsize != size:
210
- LRU_MEM_CACHE = cachetools.LRUCache(maxsize=size)
211
-
212
- @functools.wraps(func)
213
- def wrapper(*args, **kwargs):
214
- func_source, sub_dir, key_id = compute_func_id(
215
- func, args, kwargs, ignore_self, keys
216
- )
217
- if func_source is None:
218
- return func(*args, **kwargs)
219
- name = identify((func_source, sub_dir, key_id))
220
-
221
- if not hasattr(func, "_mem_cache"):
222
- func._mem_cache = LRU_MEM_CACHE
425
+ # --------------------------------------------------------------------------------------
426
+ # Memory+Disk (sync / async)
427
+ # --------------------------------------------------------------------------------------
223
428
 
224
- with mem_lock:
225
- if name in func._mem_cache:
226
- # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={name}")
227
- return func._mem_cache[name]
228
429
 
229
- result = func(*args, **kwargs)
430
+ def both_memoize(
431
+ func: Callable[P, R],
432
+ keys: Optional[list[str]],
433
+ cache_dir: str,
434
+ ignore_self: bool,
435
+ size: int,
436
+ key_fn: Optional[Callable[..., Any]],
437
+ ) -> Callable[P, R]:
438
+ mem_cache = _mem_cache_for(func, size)
230
439
 
231
- with mem_lock:
232
- if name not in func._mem_cache:
233
- func._mem_cache[name] = result
234
- return result
235
-
236
- return wrapper
237
-
238
-
239
- def _async_memory_memoize(func, size, keys, ignore_self):
240
- global LRU_MEM_CACHE
241
- if LRU_MEM_CACHE.maxsize != size:
242
- LRU_MEM_CACHE = cachetools.LRUCache(maxsize=size)
243
-
244
- @functools.wraps(func)
245
- async def wrapper(*args, **kwargs):
246
- func_source, sub_dir, key_id = compute_func_id(
247
- func, args, kwargs, ignore_self, keys
248
- )
249
- if func_source is None:
250
- return await func(*args, **kwargs)
251
- name = identify((func_source, sub_dir, key_id))
252
-
253
- if not hasattr(func, "_mem_cache"):
254
- func._mem_cache = LRU_MEM_CACHE
255
-
256
- with mem_lock:
257
- if name in func._mem_cache:
258
- # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={name}")
259
- return func._mem_cache[name]
260
-
261
- result = await func(*args, **kwargs)
262
-
263
- with mem_lock:
264
- if name not in func._mem_cache:
265
- func._mem_cache[name] = result
266
- return result
267
-
268
- return wrapper
269
-
270
-
271
- def both_memoize(func, keys, cache_dir, ignore_self):
272
440
  @functools.wraps(func)
273
- def wrapper(*args, **kwargs):
274
- func_source, sub_dir, key_id = compute_func_id(
275
- func, args, kwargs, ignore_self, keys
441
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
442
+ func_source, sub_dir, key_id = _compute_cache_components(
443
+ func, args, kwargs, ignore_self, keys, key_fn
276
444
  )
277
- if func_source is None:
278
- return func(*args, **kwargs)
279
-
280
445
  mem_key = identify((func_source, sub_dir, key_id))
281
- if not hasattr(func, "_mem_cache"):
282
- func._mem_cache = LRU_MEM_CACHE
283
446
 
447
+ # Memory first
284
448
  with mem_lock:
285
- if mem_key in func._mem_cache:
286
- # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={mem_key}")
287
- return func._mem_cache[mem_key]
449
+ if mem_key in mem_cache:
450
+ return mem_cache[mem_key] # type: ignore[return-value]
288
451
 
452
+ # Disk next
289
453
  if sub_dir == "funcs":
290
454
  cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
291
455
  else:
292
456
  cache_path = osp.join(cache_dir, sub_dir, key_id)
293
457
  mkdir_or_exist(osp.dirname(cache_path))
294
458
 
459
+ disk_result: Optional[R] = None
295
460
  with disk_lock:
296
461
  if osp.exists(cache_path):
297
- # logger.debug(f"Cache HIT (disk) for {func.__name__}, key={cache_path}")
298
- result = load_json_or_pickle(cache_path)
299
- with mem_lock:
300
- func._mem_cache[mem_key] = result
301
- return result
302
- # logger.debug(f"Cache MISS for {func.__name__}, key={cache_path}")
303
- result = func(*args, **kwargs)
462
+ disk_result = load_json_or_pickle(cache_path)
304
463
 
464
+ if disk_result is not None:
465
+ with mem_lock:
466
+ mem_cache[mem_key] = disk_result # type: ignore[index]
467
+ return disk_result
468
+
469
+ # Miss: compute, then write both
470
+ result = func(*args, **kwargs)
305
471
  with disk_lock:
306
472
  if not osp.exists(cache_path):
307
473
  dump_json_or_pickle(result, cache_path)
308
474
  with mem_lock:
309
- func._mem_cache[mem_key] = result
475
+ mem_cache[mem_key] = result # type: ignore[index]
310
476
  return result
311
477
 
312
478
  return wrapper
313
479
 
314
480
 
315
- def _async_both_memoize(func, keys, cache_dir, ignore_self):
481
+ def _async_both_memoize(
482
+ func: AsyncFunc[P, R],
483
+ keys: Optional[list[str]],
484
+ cache_dir: str,
485
+ ignore_self: bool,
486
+ size: int,
487
+ key_fn: Optional[Callable[..., Any]],
488
+ ) -> AsyncFunc[P, R]:
489
+ mem_cache = _mem_cache_for(func, size)
490
+
491
+ inflight: dict[str, asyncio.Task[R]] = {}
492
+ alock = asyncio.Lock()
493
+
316
494
  @functools.wraps(func)
317
- async def wrapper(*args, **kwargs):
318
- func_source, sub_dir, key_id = compute_func_id(
319
- func, args, kwargs, ignore_self, keys
495
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
496
+ func_source, sub_dir, key_id = _compute_cache_components(
497
+ func, args, kwargs, ignore_self, keys, key_fn
320
498
  )
321
- if func_source is None:
322
- return await func(*args, **kwargs)
323
-
324
499
  mem_key = identify((func_source, sub_dir, key_id))
325
- if not hasattr(func, "_mem_cache"):
326
- func._mem_cache = LRU_MEM_CACHE
327
500
 
328
- with mem_lock:
329
- if mem_key in func._mem_cache:
330
- # logger.debug(f"Cache HIT (memory) for {func.__name__}, key={mem_key}")
331
- return func._mem_cache[mem_key]
501
+ # Memory
502
+ async with alock:
503
+ if mem_key in mem_cache:
504
+ return mem_cache[mem_key] # type: ignore[return-value]
332
505
 
506
+ # Disk
333
507
  if sub_dir == "funcs":
334
508
  cache_path = osp.join(cache_dir, sub_dir, func.__name__, key_id)
335
509
  else:
336
510
  cache_path = osp.join(cache_dir, sub_dir, key_id)
337
511
  mkdir_or_exist(osp.dirname(cache_path))
338
512
 
339
- # Check disk cache in thread pool to avoid blocking
340
- def check_disk_cache():
513
+ def check_disk_cache() -> Optional[R]:
341
514
  with disk_lock:
342
515
  if osp.exists(cache_path):
343
516
  return load_json_or_pickle(cache_path)
@@ -345,150 +518,131 @@ def _async_both_memoize(func, keys, cache_dir, ignore_self):
345
518
 
346
519
  loop = asyncio.get_event_loop()
347
520
  disk_result = await loop.run_in_executor(None, check_disk_cache)
348
-
521
+
349
522
  if disk_result is not None:
350
523
  with mem_lock:
351
- func._mem_cache[mem_key] = disk_result
524
+ mem_cache[mem_key] = disk_result # type: ignore[index]
352
525
  return disk_result
353
526
 
354
- # logger.debug(f"Cache MISS for {func.__name__}, key={cache_path}")
355
- result = await func(*args, **kwargs)
527
+ # Avoid duplicate async work for same key
528
+ async with alock:
529
+ task = inflight.get(mem_key)
530
+ if task is None:
531
+ task = asyncio.create_task(func(*args, **kwargs)) # type: ignore[arg-type]
532
+ inflight[mem_key] = task
533
+
534
+ try:
535
+ result = await task
536
+ finally:
537
+ async with alock:
538
+ inflight.pop(mem_key, None)
356
539
 
357
- # Write to disk in thread pool to avoid blocking
358
- def write_disk_cache():
540
+ def write_disk_cache() -> None:
359
541
  with disk_lock:
360
542
  if not osp.exists(cache_path):
361
543
  dump_json_or_pickle(result, cache_path)
362
544
 
363
545
  await loop.run_in_executor(None, write_disk_cache)
364
-
546
+
365
547
  with mem_lock:
366
- func._mem_cache[mem_key] = result
548
+ mem_cache[mem_key] = result # type: ignore[index]
367
549
  return result
368
550
 
369
551
  return wrapper
370
552
 
371
553
 
554
+ # --------------------------------------------------------------------------------------
555
+ # Public decorator (only export memoize)
556
+ # --------------------------------------------------------------------------------------
557
+
558
+ @overload
372
559
  def memoize(
373
- _func=None,
560
+ _func: Callable[P, R],
374
561
  *,
375
- keys=None,
376
- cache_dir=SPEED_CACHE_DIR,
377
- cache_type: Literal["memory", "disk", "both"] = "disk",
378
- size=10240,
379
- ignore_self=True,
380
- verbose=False,
381
- ):
382
- if "~/" in cache_dir:
383
- cache_dir = osp.expanduser(cache_dir)
384
-
385
- def decorator(func):
386
- # Check if function is async
387
- is_async = inspect.iscoroutinefunction(func)
388
-
389
- if cache_type == "memory":
390
- if is_async:
391
- return _async_memory_memoize(
392
- func,
393
- size,
394
- keys,
395
- ignore_self,
396
- )
397
- return _memory_memoize(
398
- func,
399
- size,
400
- keys,
401
- ignore_self,
402
- )
403
- elif cache_type == "disk":
404
- if is_async:
405
- return _async_disk_memoize(
406
- func,
407
- keys,
408
- cache_dir,
409
- ignore_self,
410
- verbose,
411
- )
412
- return _disk_memoize(
413
- func,
414
- keys,
415
- cache_dir,
416
- ignore_self,
417
- verbose,
418
- )
419
-
420
- # cache_type == "both"
421
- if is_async:
422
- return _async_both_memoize(
423
- func,
424
- keys,
425
- cache_dir,
426
- ignore_self,
427
- )
428
- return both_memoize(
429
- func,
430
- keys,
431
- cache_dir,
432
- verbose,
433
- )
434
-
435
- # Handle both @memoize and @memoize() usage patterns
436
- if _func is None:
437
- return decorator
438
- else:
439
- return decorator(_func)
562
+ keys: Optional[list[str]] = ...,
563
+ key: Optional[Callable[..., Any]] = ...,
564
+ cache_dir: str = ...,
565
+ cache_type: Literal["memory", "disk", "both"] = ...,
566
+ size: int = ...,
567
+ ignore_self: bool = ...,
568
+ verbose: bool = ...,
569
+ ) -> Callable[P, R]: ...
570
+ @overload
571
+ def memoize(
572
+ _func: Callable[P, Awaitable[R]],
573
+ *,
574
+ keys: Optional[list[str]] = ...,
575
+ key: Optional[Callable[..., Any]] = ...,
576
+ cache_dir: str = ...,
577
+ cache_type: Literal["memory", "disk", "both"] = ...,
578
+ size: int = ...,
579
+ ignore_self: bool = ...,
580
+ verbose: bool = ...,
581
+ ) -> Callable[P, Awaitable[R]]: ...
582
+ @overload
583
+ def memoize(
584
+ _func: None = ...,
585
+ *,
586
+ keys: Optional[list[str]] = ...,
587
+ key: Optional[Callable[..., Any]] = ...,
588
+ cache_dir: str = ...,
589
+ cache_type: Literal["memory", "disk", "both"] = ...,
590
+ size: int = ...,
591
+ ignore_self: bool = ...,
592
+ verbose: bool = ...,
593
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
440
594
 
441
595
 
442
- def amemoize(
443
- _func=None,
596
+ def memoize(
597
+ _func: Optional[Callable[P, Any]] = None,
444
598
  *,
445
- keys: list[str] | None = None,
599
+ keys: Optional[list[str]] = None,
600
+ key: Optional[Callable[..., Any]] = None,
446
601
  cache_dir: str = SPEED_CACHE_DIR,
447
- cache_type: Literal["memory", "disk", "both"] = "disk",
448
- size: int = 10240,
602
+ cache_type: Literal["memory", "disk", "both"] = "both",
603
+ size: int = 256,
449
604
  ignore_self: bool = True,
450
605
  verbose: bool = False,
451
606
  ):
452
607
  """
453
- Async-specific memoization decorator for coroutine functions.
454
-
455
- Args:
456
- _func: The async function to memoize (when used without parentheses)
457
- keys: Specific argument keys to use for cache key generation
458
- cache_dir: Directory for disk cache storage
459
- cache_type: Type of caching - "memory", "disk", or "both"
460
- size: Size of memory cache (for memory/both types)
461
- ignore_self: Whether to ignore 'self' parameter in cache key
462
- verbose: Enable verbose logging
463
-
464
- Returns:
465
- Decorated async function with memoization
466
-
467
- Example:
468
- @amemoize(cache_type="both")
469
- async def my_async_func(x: int) -> str:
470
- return str(x)
608
+ Universal memoizer that supports sync and async functions, preserves annotations
609
+ for Pylance via ParamSpec/TypeVar, and caches in memory + disk by default.
610
+
611
+ - keys: list of argument names to include in key (back-compat).
612
+ - key: custom callable (*args, **kwargs) -> hashable/serializable object for keying.
613
+ Prefer this for performance on big inputs (e.g., key=lambda x: x.id).
614
+ - cache_dir: disk cache base directory (unlimited size).
615
+ - cache_type: "memory" | "disk" | "both" (default "both").
616
+ - size: memory LRU size per-function (default 256 items).
617
+ - ignore_self: ignore 'self' when building the default key for bound methods.
618
+ - verbose: enable warnings on cache load/write errors.
471
619
  """
472
620
  if "~/" in cache_dir:
473
621
  cache_dir = osp.expanduser(cache_dir)
474
622
 
475
- def decorator(func):
476
- # Ensure the function is actually async
477
- if not inspect.iscoroutinefunction(func):
478
- raise ValueError(f"amemoize can only be used with async functions. {func.__name__} is not async.")
479
-
623
+ def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
624
+ is_async = inspect.iscoroutinefunction(func)
625
+
480
626
  if cache_type == "memory":
481
- return _async_memory_memoize(func, size, keys, ignore_self)
482
- elif cache_type == "disk":
483
- return _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose)
484
- else: # cache_type == "both"
485
- return _async_both_memoize(func, keys, cache_dir, ignore_self)
627
+ if is_async:
628
+ return _async_memory_memoize(func, size, keys, ignore_self, key) # type: ignore[return-value]
629
+ return _memory_memoize(func, size, keys, ignore_self, key) # type: ignore[return-value]
630
+
631
+ if cache_type == "disk":
632
+ if is_async:
633
+ return _async_disk_memoize(func, keys, cache_dir, ignore_self, verbose, key) # type: ignore[return-value]
634
+ return _disk_memoize(func, keys, cache_dir, ignore_self, verbose, key) # type: ignore[return-value]
635
+
636
+ # cache_type == "both"
637
+ if is_async:
638
+ return _async_both_memoize(func, keys, cache_dir, ignore_self, size, key) # type: ignore[return-value]
639
+ return both_memoize(func, keys, cache_dir, ignore_self, size, key) # type: ignore[return-value]
486
640
 
487
- # Handle both @amemoize and @amemoize() usage patterns
641
+ # Support both @memoize and @memoize(...)
488
642
  if _func is None:
489
643
  return decorator
490
644
  else:
491
645
  return decorator(_func)
492
646
 
493
647
 
494
- __all__ = ["memoize", "identify", "identify_uuid", "amemoize"]
648
+ __all__ = ["memoize", "identify"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: speedy-utils
3
- Version: 1.1.9
3
+ Version: 1.1.10
4
4
  Summary: Fast and easy-to-use package for data science
5
5
  Author: AnhVTH
6
6
  Author-email: anhvth.226@gmail.com
@@ -7,15 +7,15 @@ llm_utils/group_messages.py,sha256=Oe2tlhg-zRodG1-hodYebddrR77j9UdE05LzJw0EvYI,3
7
7
  llm_utils/lm/__init__.py,sha256=rX36_MsnekM5GHwWS56XELbm4W5x2TDwnPERDTfo0eU,194
8
8
  llm_utils/lm/async_lm/__init__.py,sha256=PUBbCuf5u6-0GBUu-2PI6YAguzsyXj-LPkU6vccqT6E,121
9
9
  llm_utils/lm/async_lm/_utils.py,sha256=P1-pUDf_0pDmo8WTIi43t5ARlyGA1RIJfpAhz-gfA5g,6105
10
- llm_utils/lm/async_lm/async_llm_task.py,sha256=QdyxiMLAy7xENdb0SaGAFRTaBO1n0TehPPcTdrLizsg,19010
11
- llm_utils/lm/async_lm/async_lm.py,sha256=J1KC7qCpG_CyJMWca4q71la7JHoANiLLSNQrQH44-z0,14045
10
+ llm_utils/lm/async_lm/async_llm_task.py,sha256=iXSTbf-KekXncVVnic-v4dTq5HBDjbyLwhgo0Y-wp7Q,19167
11
+ llm_utils/lm/async_lm/async_lm.py,sha256=xMoBIX1B-KcAJMWHIl88vf1sZ5iUL4jYh_3OLOZ9TzI,14523
12
12
  llm_utils/lm/async_lm/async_lm_base.py,sha256=Ope5LPOgMm_waoK_nDP7TwOVJRdstP8XCjFS4rBksGY,14874
13
13
  llm_utils/lm/async_lm/lm_specific.py,sha256=KmqdCm3SJ5MqN-dRJd6S5tq5-ve1X2eNWf2CMFtc_3s,3926
14
14
  llm_utils/lm/utils.py,sha256=a0KJj8vjT2fHKb7GKGNJjJHhKLThwpxIL7vnV9Fr3ZY,4584
15
15
  llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
16
16
  llm_utils/scripts/vllm_load_balancer.py,sha256=TT5Ypq7gUcl52gRFp--ORFFjzhfGlcaX2rkRv8NxlxU,37259
17
17
  llm_utils/scripts/vllm_serve.py,sha256=4NaqpVs7LBvxtvTCMPsNCAOfqiWkKRttxWMmWY7SitA,14729
18
- speedy_utils/__init__.py,sha256=ZtnitBT13OS3xjmsVoVHjmL5RIWaH12PMcp6UDHQjaE,5776
18
+ speedy_utils/__init__.py,sha256=YCpiReW22zG4KkQXQe6V9BQ8bn7PtiXolOaW_iL8T4M,5734
19
19
  speedy_utils/all.py,sha256=t-HKzDmhF1MTFnmq7xRnPs5nFG_aZaLH9Ua0RM6nQ9Y,4855
20
20
  speedy_utils/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  speedy_utils/common/clock.py,sha256=3n4FkCW0dz46O8By09V5Pve1DSMgpLDRbWEVRryryeQ,7423
@@ -23,7 +23,7 @@ speedy_utils/common/function_decorator.py,sha256=BspJ0YuGL6elS7lWBAgELZ-sCfED_1N
23
23
  speedy_utils/common/logger.py,sha256=JqW9gG4ujfq4RldNeYP2p52BYgCwjkYeGGYyzLn6mfY,6422
24
24
  speedy_utils/common/notebook_utils.py,sha256=-97kehJ_Gg3TzDLubsLIYJcykqX1NXhbvBO6nniZSYM,2063
25
25
  speedy_utils/common/report_manager.py,sha256=eBiw5KY6bWUhwki3B4lK5o8bFsp7L5x28X9GCI-Sd1w,3899
26
- speedy_utils/common/utils_cache.py,sha256=G0M_iv3T8QqbBNNiS1LDz6MrRycQjiYLMzmHYpDUCjU,16348
26
+ speedy_utils/common/utils_cache.py,sha256=jMhyni0DXIxX8i5q0WRohVom_BAkN11jtPFWvDcT01I,21904
27
27
  speedy_utils/common/utils_io.py,sha256=tfptex3pbmhXOftr__V-3DbhuDVSm01j4vg39R5jbwI,4792
28
28
  speedy_utils/common/utils_misc.py,sha256=cdEuBBpiB1xpuzj0UBDHDuTIerqsMIw37ENq6EXliOw,1795
29
29
  speedy_utils/common/utils_print.py,sha256=iQqnOYw2EFC8TqeSDbrcnIQAUKT7FbB8Mec8b2aGAzw,4833
@@ -33,7 +33,7 @@ speedy_utils/multi_worker/thread.py,sha256=u_hTwXh7_FciMa5EukdEA1fDCY_vUC4moDceB
33
33
  speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
34
  speedy_utils/scripts/mpython.py,sha256=IvywP7Y0_V6tWfMP-4MjPvN5_KfxWF21xaLJsCIayCk,3821
35
35
  speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
36
- speedy_utils-1.1.9.dist-info/METADATA,sha256=puilG60Uz-Z4Een5-ycrkd5cApDrpIPJofwQmqubniU,7441
37
- speedy_utils-1.1.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
38
- speedy_utils-1.1.9.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
39
- speedy_utils-1.1.9.dist-info/RECORD,,
36
+ speedy_utils-1.1.10.dist-info/METADATA,sha256=XbR7NOZOpvVX6NfE6r40T0tQ6dbsBBOp4fpSWvtigKU,7442
37
+ speedy_utils-1.1.10.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
38
+ speedy_utils-1.1.10.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
39
+ speedy_utils-1.1.10.dist-info/RECORD,,