speedy-utils 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +3 -2
- llm_utils/lm/async_lm/async_llm_task.py +1 -0
- llm_utils/lm/llm_task.py +303 -10
- llm_utils/lm/openai_memoize.py +10 -2
- llm_utils/vector_cache/core.py +250 -234
- speedy_utils/__init__.py +2 -1
- speedy_utils/common/utils_cache.py +38 -19
- speedy_utils/common/utils_io.py +9 -5
- speedy_utils/multi_worker/process.py +91 -10
- speedy_utils/multi_worker/thread.py +94 -2
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/METADATA +34 -13
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/RECORD +19 -19
- {speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/WHEEL +1 -1
- speedy_utils-1.1.20.dist-info/entry_points.txt +5 -0
- speedy_utils-1.1.18.dist-info/entry_points.txt +0 -6
speedy_utils/__init__.py
CHANGED
@@ -138,7 +138,7 @@ from .common.utils_print import (
 
 # Multi-worker processing
 from .multi_worker.process import multi_process
-from .multi_worker.thread import multi_thread
+from .multi_worker.thread import kill_all_thread, multi_thread
 
 # Define __all__ explicitly
 __all__ = [
@@ -224,6 +224,7 @@ __all__ = [
     # Multi-worker processing
     "multi_process",
     "multi_thread",
+    "kill_all_thread",
    # Notebook utilities
     "change_dir",
 ]
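
With this change, `kill_all_thread` is exported from the package root alongside `multi_thread`. A minimal usage sketch, assuming `multi_thread`'s usual `(func, items)` call form (not shown in full in this diff); `slow_task` is a hypothetical placeholder, not part of the package:

```python
# Sketch only: `slow_task` is a hypothetical example function.
import time

from speedy_utils import kill_all_thread, multi_thread


def slow_task(x: int) -> int:
    time.sleep(1)  # stand-in for a long-running job
    return x * 2


results = multi_thread(slow_task, range(4))  # assumed (func, items) call form
print(kill_all_thread())  # signals any still-tracked worker threads, returns the count
```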

speedy_utils/common/utils_cache.py
CHANGED

@@ -258,13 +258,13 @@ def _memory_memoize(
 
         with mem_lock:
             if name in mem_cache:
-                return mem_cache[name]
+                return mem_cache[name]
 
         result = func(*args, **kwargs)
 
         with mem_lock:
             if name not in mem_cache:
-                mem_cache[name] = result
+                mem_cache[name] = result
         return result
 
     return wrapper
@@ -292,7 +292,7 @@ def _async_memory_memoize(
 
         async with alock:
             if name in mem_cache:
-                return mem_cache[name]
+                return mem_cache[name]
         task = inflight.get(name)
         if task is None:
             task = asyncio.create_task(func(*args, **kwargs))  # type: ignore[arg-type]
@@ -305,7 +305,7 @@ def _async_memory_memoize(
             inflight.pop(name, None)
 
         with mem_lock:
-            mem_cache[name] = result
+            mem_cache[name] = result
         return result
 
     return wrapper
@@ -447,7 +447,7 @@ def both_memoize(
         # Memory first
         with mem_lock:
             if mem_key in mem_cache:
-                return mem_cache[mem_key]
+                return mem_cache[mem_key]
 
         # Disk next
         if sub_dir == "funcs":
@@ -468,7 +468,7 @@ def both_memoize(
 
         if disk_result is not None:
             with mem_lock:
-                mem_cache[mem_key] = disk_result
+                mem_cache[mem_key] = disk_result
             return disk_result
 
         # Miss: compute, then write both
@@ -477,7 +477,7 @@ def both_memoize(
         if not osp.exists(cache_path):
             dump_json_or_pickle(result, cache_path)
         with mem_lock:
-            mem_cache[mem_key] = result
+            mem_cache[mem_key] = result
         return result
 
     return wrapper
@@ -506,7 +506,7 @@ def _async_both_memoize(
         # Memory
         async with alock:
             if mem_key in mem_cache:
-                return mem_cache[mem_key]
+                return mem_cache[mem_key]
 
         # Disk
         if sub_dir == "funcs":
@@ -526,7 +526,7 @@ def _async_both_memoize(
 
         if disk_result is not None:
             with mem_lock:
-                mem_cache[mem_key] = disk_result
+                mem_cache[mem_key] = disk_result
             return disk_result
 
         # Avoid duplicate async work for same key
@@ -550,7 +550,7 @@ def _async_both_memoize(
         await loop.run_in_executor(None, write_disk_cache)
 
         with mem_lock:
-            mem_cache[mem_key] = result
+            mem_cache[mem_key] = result
         return result
 
     return wrapper
@@ -561,9 +561,10 @@ def _async_both_memoize(
 # --------------------------------------------------------------------------------------
 
 
+# Define overloads to preserve exact type information
 @overload
 def memoize(
-    _func: Callable[P, R
+    _func: Callable[P, R],
     *,
     keys: Optional[list[str]] = ...,
     key: Optional[Callable[..., Any]] = ...,
@@ -572,7 +573,23 @@ def memoize(
     size: int = ...,
     ignore_self: bool = ...,
     verbose: bool = ...,
-) -> Callable[P, R
+) -> Callable[P, R]: ...
+
+
+@overload
+def memoize(
+    _func: Callable[P, Awaitable[R]],
+    *,
+    keys: Optional[list[str]] = ...,
+    key: Optional[Callable[..., Any]] = ...,
+    cache_dir: str = ...,
+    cache_type: Literal["memory", "disk", "both"] = ...,
+    size: int = ...,
+    ignore_self: bool = ...,
+    verbose: bool = ...,
+) -> Callable[P, Awaitable[R]]: ...
+
+
 @overload
 def memoize(
     _func: None = ...,
@@ -585,8 +602,10 @@ def memoize(
     ignore_self: bool = ...,
     verbose: bool = ...,
 ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
+
+
 @overload
-def memoize(
+def memoize(  # type: ignore
     _func: None = ...,
     *,
     keys: Optional[list[str]] = ...,
@@ -635,24 +654,24 @@ def memoize(
 
    if cache_type == "memory":
        if is_async:
-            return _async_memory_memoize(target_func, size, keys, ignore_self, key)
-        return _memory_memoize(target_func, size, keys, ignore_self, key)
+            return _async_memory_memoize(target_func, size, keys, ignore_self, key)
+        return _memory_memoize(target_func, size, keys, ignore_self, key)
 
    if cache_type == "disk":
        if is_async:
            return _async_disk_memoize(
                target_func, keys, cache_dir, ignore_self, verbose, key
-            )
+            )
        return _disk_memoize(
            target_func, keys, cache_dir, ignore_self, verbose, key
-        )
+        )
 
    # cache_type == "both"
    if is_async:
        return _async_both_memoize(
            target_func, keys, cache_dir, ignore_self, size, key
-        )
-    return both_memoize(target_func, keys, cache_dir, ignore_self, size, key)
+        )
+    return both_memoize(target_func, keys, cache_dir, ignore_self, size, key)
 
    # Support both @memoize and @memoize(...)
    if _func is None:
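
The new overload means `@memoize` now preserves `Awaitable[R]` return types for async callables instead of collapsing them to the sync signature. A minimal sketch of both forms, assuming `memoize` is importable from the package root (it also lives in `speedy_utils.common.utils_cache`); `square` and `fetch_square` are hypothetical example functions:

```python
# Sketch only: `square` and `fetch_square` are hypothetical example functions.
import asyncio

from speedy_utils import memoize


@memoize  # sync overload: Callable[P, R] -> Callable[P, R]
def square(x: int) -> int:
    return x * x


@memoize(cache_type="memory")  # async overload keeps the Awaitable[R] return type
async def fetch_square(x: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for an expensive async call
    return x * x


print(square(3))                     # first call computes, later calls hit the cache
print(asyncio.run(fetch_square(3)))  # repeated awaits with the same args reuse the cache
```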
speedy_utils/common/utils_io.py
CHANGED
@@ -1,13 +1,18 @@
 # utils/utils_io.py
 
+import bz2
+import gzip
+import io
 import json
+import lzma
 import os
 import os.path as osp
 import pickle
 import time
+import warnings
 from glob import glob
 from pathlib import Path
-from typing import Any, Union
+from typing import IO, Any, Iterable, Optional, Union, cast
 
 from json_repair import loads as jloads
 from pydantic import BaseModel
@@ -53,7 +58,7 @@ def dump_json_or_pickle(
     except Exception as e:
         if isinstance(obj, BaseModel):
             data = obj.model_dump()
-            from fastcore.all import
+            from fastcore.all import dict2obj, obj2dict
             obj2 = dict2obj(data)
             with open(fname, "wb") as f:
                 pickle.dump(obj2, f)
@@ -87,8 +92,7 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
         raise ValueError(f"Error {e} while loading {fname}") from e
 
 
-
-from typing import Iterable, Union, IO, Any, Optional, cast
+
 
 try:
     import orjson  # type: ignore[import-not-found]  # fastest JSON parser when available
@@ -212,7 +216,7 @@ def fast_load_jsonl(
     if line_count > multiworker_threshold:
         # Use multi-worker processing
         from ..multi_worker.thread import multi_thread
-
+
         # Read all lines into chunks
         f = _open_auto(path_or_file)
         all_lines = list(f)
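
The added `bz2`/`gzip`/`lzma`/`io` imports back transparent reading of compressed files around the `_open_auto` helper used by `fast_load_jsonl`; `_open_auto`'s body is not part of this diff. Purely as an illustration of that kind of extension-based dispatch, under stated assumptions and not the library's actual implementation:

```python
# Illustrative sketch only; not the real _open_auto from speedy_utils.
import bz2
import gzip
import lzma
from typing import IO


def open_auto_sketch(path: str) -> IO[bytes]:
    """Open a possibly-compressed file based on its extension (illustrative only)."""
    if path.endswith(".gz"):
        return gzip.open(path, "rb")
    if path.endswith(".bz2"):
        return bz2.open(path, "rb")
    if path.endswith((".xz", ".lzma")):
        return lzma.open(path, "rb")
    return open(path, "rb")
```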

speedy_utils/multi_worker/process.py
CHANGED

@@ -1,13 +1,32 @@
 # ray_multi_process.py
-import time, os, pickle, uuid, datetime
+import time, os, pickle, uuid, datetime, multiprocessing
+import datetime
+import os
+import pickle
+import time
+import uuid
 from pathlib import Path
 from typing import Any, Callable
 from tqdm import tqdm
+import psutil
+import threading
+ray: Any
+try:
+    import ray as ray  # type: ignore
+    _HAS_RAY = True
+except Exception:  # pragma: no cover
+    ray = None  # type: ignore
+    _HAS_RAY = False
+from typing import Any, Callable, Iterable
+
 import ray
 from fastcore.parallel import parallel
+from tqdm import tqdm
+
 
 # ─── cache helpers ──────────────────────────────────────────
 
+
 def _build_cache_dir(func: Callable, items: list[Any]) -> Path:
     """Build cache dir with function name + timestamp."""
     func_name = getattr(func, "__name__", "func")
@@ -18,6 +37,7 @@ def _build_cache_dir(func: Callable, items: list[Any]) -> Path:
     path.mkdir(parents=True, exist_ok=True)
     return path
 
+
 def wrap_dump(func: Callable, cache_dir: Path | None):
     """Wrap a function so results are dumped to .pkl when cache_dir is set."""
     if cache_dir is None:
@@ -29,12 +49,15 @@ def wrap_dump(func: Callable, cache_dir: Path | None):
         with open(p, "wb") as fh:
             pickle.dump(res, fh)
         return str(p)
+
     return wrapped
 
+
 # ─── ray management ─────────────────────────────────────────
 
 RAY_WORKER = None
 
+
 def ensure_ray(workers: int, pbar: tqdm | None = None):
     """Initialize or reinitialize Ray with a given worker count, log to bar postfix."""
     global RAY_WORKER
@@ -49,19 +72,22 @@ def ensure_ray(workers: int, pbar: tqdm | None = None):
         pbar.set_postfix_str(f"ray.init {workers} took {took:.2f}s")
     RAY_WORKER = workers
 
+
 # ─── main API ───────────────────────────────────────────────
 from typing import Literal
 
+
 def multi_process(
     func: Callable[[Any], Any],
-    items:
+    items: Iterable[Any] | None = None,
     *,
-    inputs:
+    inputs: Iterable[Any] | None = None,
     workers: int | None = None,
     lazy_output: bool = False,
     progress: bool = True,
     # backend: str = "ray", # "seq", "ray", or "fastcore"
-    backend: Literal["seq", "ray", "mp", "threadpool"] =
+    backend: Literal["seq", "ray", "mp", "threadpool", "safe"] | None = None,
+    backend: Literal["seq", "ray", "mp", "threadpool"] = "mp",
     # Additional optional knobs (accepted for compatibility)
     batch: int | None = None,
     ordered: bool | None = None,
@@ -75,15 +101,25 @@ def multi_process(
     backend:
       - "seq": run sequentially
      - "ray": run in parallel with Ray
-      - "
+      - "mp": run in parallel with multiprocessing (uses threadpool to avoid fork warnings)
+      - "threadpool": run in parallel with thread pool
+      - "safe": run in parallel with thread pool (explicitly safe for tests)
 
     If lazy_output=True, every result is saved to .pkl and
     the returned list contains file paths.
     """
 
+    # default backend selection
+    if backend is None:
+        backend = "ray" if _HAS_RAY else "mp"
+
     # unify items
+    # unify items and coerce to concrete list so we can use len() and
+    # iterate multiple times. This accepts ranges and other iterables.
     if items is None and inputs is not None:
         items = list(inputs)
+    if items is not None and not isinstance(items, list):
+        items = list(items)
     if items is None:
         raise ValueError("'items' or 'inputs' must be provided")
 
@@ -95,8 +131,9 @@ def multi_process(
     f_wrapped = wrap_dump(func, cache_dir)
 
     total = len(items)
-    with tqdm(
-
+    with tqdm(
+        total=total, desc=f"multi_process [{backend}]", disable=not progress
+    ) as pbar:
         # ---- sequential backend ----
         if backend == "seq":
             pbar.set_postfix_str("backend=seq")
@@ -108,6 +145,13 @@ def multi_process(
 
         # ---- ray backend ----
         if backend == "ray":
+            if not _HAS_RAY:
+                msg = (
+                    "Ray backend requested but 'ray' is not installed. "
+                    "Install extra: pip install 'speedy-utils[ray]' or "
+                    "poetry install -E ray."
+                )
+                raise RuntimeError(msg)
             pbar.set_postfix_str("backend=ray")
             ensure_ray(workers, pbar)
 
@@ -125,10 +169,47 @@ def multi_process(
 
         # ---- fastcore backend ----
         if backend == "mp":
-            results = parallel(
+            results = parallel(
+                f_wrapped, items, n_workers=workers, progress=progress, threadpool=False
+            )
             return list(results)
         if backend == "threadpool":
-            results = parallel(
+            results = parallel(
+                f_wrapped, items, n_workers=workers, progress=progress, threadpool=True
+            )
             return list(results)
-
+        if backend == "safe":
+            # Completely safe backend for tests - no multiprocessing, no external progress bars
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
+                results = list(executor.map(f_wrapped, items))
         raise ValueError(f"Unsupported backend: {backend!r}")
+
+
+
+def cleanup_phantom_workers():
+    """
+    Kill all child processes (phantom workers) without killing the Jupyter kernel itself.
+    Also lists non-daemon threads that remain.
+    """
+    parent = psutil.Process(os.getpid())
+
+    # Kill only children, never the current process
+    for child in parent.children(recursive=True):
+        try:
+            print(f"🔪 Killing child process {child.pid} ({child.name()})")
+            child.kill()
+        except psutil.NoSuchProcess:
+            pass
+
+    # Report stray threads (can't hard-kill them in Python)
+    for t in threading.enumerate():
+        if t is threading.current_thread():
+            continue
+        if not t.daemon:
+            print(f"⚠️ Thread {t.name} is still running (cannot be force-killed).")
+
+    print("✅ Cleaned up child processes (kernel untouched).")
+
+    # Usage: run this anytime after cancelling a cell
+
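
Based on the signature above, `multi_process` now accepts any iterable for `items`, chooses `ray` by default only when it is importable, and otherwise falls back to the multiprocessing backend. A minimal sketch; `double` is a hypothetical example function and the sequential-backend output comment is an expectation, not taken from the diff:

```python
# Sketch only: `double` is a hypothetical example function.
from speedy_utils import multi_process


def double(x: int) -> int:
    return x * 2


# Ranges and other iterables are coerced to a concrete list internally.
results = multi_process(double, range(8), backend="seq", progress=False)
print(results)  # expected: [0, 2, 4, 6, 8, 10, 12, 14]

# Leaving backend unset picks "ray" when ray is importable, otherwise "mp";
# forcing backend="ray" without ray installed raises RuntimeError pointing
# at `pip install 'speedy-utils[ray]'`.
```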

speedy_utils/multi_worker/thread.py
CHANGED

@@ -77,7 +77,9 @@
 # ============================================================================= #
 """
 
+import ctypes
 import os
+import threading
 import time
 import traceback
 from collections.abc import Callable, Iterable
@@ -98,6 +100,42 @@ DEFAULT_WORKERS = (os.cpu_count() or 4) * 2
 T = TypeVar("T")
 R = TypeVar("R")
 
+SPEEDY_RUNNING_THREADS: list[threading.Thread] = []
+_SPEEDY_THREADS_LOCK = threading.Lock()
+
+_PY_SET_ASYNC_EXC = ctypes.pythonapi.PyThreadState_SetAsyncExc
+try:
+    _PY_SET_ASYNC_EXC.argtypes = (ctypes.c_ulong, ctypes.py_object)  # type: ignore[attr-defined]
+    _PY_SET_ASYNC_EXC.restype = ctypes.c_int  # type: ignore[attr-defined]
+except AttributeError:  # pragma: no cover - platform specific
+    pass
+
+
+def _prune_dead_threads() -> None:
+    with _SPEEDY_THREADS_LOCK:
+        SPEEDY_RUNNING_THREADS[:] = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+
+
+def _track_threads(threads: Iterable[threading.Thread]) -> None:
+    if not threads:
+        return
+    with _SPEEDY_THREADS_LOCK:
+        living = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+        for candidate in threads:
+            if not candidate.is_alive():
+                continue
+            if any(existing is candidate for existing in living):
+                continue
+            living.append(candidate)
+        SPEEDY_RUNNING_THREADS[:] = living
+
+
+def _track_executor_threads(pool: ThreadPoolExecutor) -> None:
+    thread_set = getattr(pool, "_threads", None)
+    if not thread_set:
+        return
+    _track_threads(tuple(thread_set))
+
 
 def _group_iter(src: Iterable[T], size: int) -> Iterable[list[T]]:
     """Yield successive chunks from iterable of specified size."""
@@ -273,11 +311,13 @@ def multi_thread(
                 fut.idx = next_logical_idx  # type: ignore[attr-defined]
                 inflight.add(fut)
                 next_logical_idx += len(arg)
+                _track_executor_threads(pool)
             else:
                 fut = pool.submit(_worker, arg, func, fixed_kwargs)
                 fut.idx = next_logical_idx  # type: ignore[attr-defined]
                 inflight.add(fut)
                 next_logical_idx += 1
+                _track_executor_threads(pool)
 
         try:
             # Process futures as they complete and add new ones to keep the pool busy
@@ -347,11 +387,13 @@ def multi_thread(
                     fut2.idx = next_logical_idx  # type: ignore[attr-defined]
                     inflight.add(fut2)
                     next_logical_idx += len(arg)
+                    _track_executor_threads(pool)
                 else:
                     fut2 = pool.submit(_worker, arg, func, fixed_kwargs)
                     fut2.idx = next_logical_idx  # type: ignore[attr-defined]
                     inflight.add(fut2)
                     next_logical_idx += 1
+                    _track_executor_threads(pool)
             except StopIteration:
                 pass
 
@@ -370,6 +412,7 @@ def multi_thread(
     bar.close()
     if store_output_pkl_file:
         dump_json_or_pickle(results, store_output_pkl_file)
+    _prune_dead_threads()
     return results
 
 
@@ -396,9 +439,58 @@ def multi_thread_standard(
         Results in same order as input items.
     """
     with ThreadPoolExecutor(max_workers=workers) as executor:
-        futures = [
+        futures = []
+        for item in items:
+            futures.append(executor.submit(fn, item))
+            _track_executor_threads(executor)
        results = [fut.result() for fut in futures]
+    _prune_dead_threads()
     return results
 
 
-
+def _async_raise(thread_id: int, exc_type: type[BaseException]) -> bool:
+    if thread_id <= 0:
+        return False
+    if not issubclass(exc_type, BaseException):
+        raise TypeError("exc_type must derive from BaseException")
+    res = _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), ctypes.py_object(exc_type))
+    if res == 0:
+        return False
+    if res > 1:  # pragma: no cover - defensive branch
+        _PY_SET_ASYNC_EXC(ctypes.c_ulong(thread_id), None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+    return True
+
+
+def kill_all_thread(exc_type: type[BaseException] = SystemExit, join_timeout: float = 0.1) -> int:
+    """Forcefully stop tracked worker threads. Returns number of threads signalled."""
+    _prune_dead_threads()
+    current = threading.current_thread()
+    with _SPEEDY_THREADS_LOCK:
+        targets = [t for t in SPEEDY_RUNNING_THREADS if t.is_alive()]
+
+    terminated = 0
+    for thread in targets:
+        if thread is current:
+            continue
+        ident = thread.ident
+        if ident is None:
+            continue
+        try:
+            if _async_raise(ident, exc_type):
+                terminated += 1
+                thread.join(timeout=join_timeout)
+            else:
+                logger.warning("Unable to signal thread %s", thread.name)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.error("Failed to stop thread %s: %s", thread.name, exc)
+    _prune_dead_threads()
+    return terminated
+
+
+__all__ = [
+    "SPEEDY_RUNNING_THREADS",
+    "multi_thread",
+    "multi_thread_standard",
+    "kill_all_thread",
+]
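
Taken together, these additions register `ThreadPoolExecutor` workers in `SPEEDY_RUNNING_THREADS` and let `kill_all_thread` inject `SystemExit` into each tracked thread via `PyThreadState_SetAsyncExc`. A rough sketch of how it might be exercised; `stuck_worker` is hypothetical, and the `workers` keyword of `multi_thread_standard` is assumed from the hunk above rather than shown in full:

```python
# Sketch only: `stuck_worker` is a hypothetical example function.
import time

from speedy_utils.multi_worker.thread import (
    SPEEDY_RUNNING_THREADS,
    kill_all_thread,
    multi_thread_standard,
)


def stuck_worker(x: int) -> int:
    time.sleep(0.2)  # stand-in for a call that might hang
    return x


results = multi_thread_standard(stuck_worker, list(range(4)), workers=2)

# Dead workers are pruned after each run; anything left here is still alive.
print(len(SPEEDY_RUNNING_THREADS))

# SystemExit is raised asynchronously inside each tracked thread; threads
# blocked in C extension calls only see it at the next bytecode boundary.
print(kill_all_thread())  # number of threads signalled
```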

{speedy_utils-1.1.18.dist-info → speedy_utils-1.1.20.dist-info}/METADATA
CHANGED

@@ -1,10 +1,14 @@
 Metadata-Version: 2.4
 Name: speedy-utils
-Version: 1.1.18
+Version: 1.1.20
 Summary: Fast and easy-to-use package for data science
-
-
-
+Project-URL: Homepage, https://github.com/anhvth/speedy
+Project-URL: Repository, https://github.com/anhvth/speedy
+Author-email: AnhVTH <anhvth.226@gmail.com>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
@@ -13,29 +17,34 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
+Requires-Python: >=3.8
+Requires-Dist: aiohttp
 Requires-Dist: bump2version
 Requires-Dist: cachetools
 Requires-Dist: debugpy
 Requires-Dist: fastcore
 Requires-Dist: fastprogress
-Requires-Dist: freezegun
+Requires-Dist: freezegun
 Requires-Dist: ipdb
 Requires-Dist: ipywidgets
-Requires-Dist: json-repair
+Requires-Dist: json-repair
 Requires-Dist: jupyterlab
 Requires-Dist: loguru
 Requires-Dist: matplotlib
 Requires-Dist: numpy
-Requires-Dist: openai
-Requires-Dist: packaging
+Requires-Dist: openai
+Requires-Dist: packaging
 Requires-Dist: pandas
 Requires-Dist: pydantic
+Requires-Dist: pytest
+Requires-Dist: ray
 Requires-Dist: requests
 Requires-Dist: scikit-learn
 Requires-Dist: tabulate
 Requires-Dist: tqdm
 Requires-Dist: xxhash
-
+Provides-Extra: ray
+Requires-Dist: ray>=2.49.1; (python_version >= '3.9') and extra == 'ray'
 Description-Content-Type: text/markdown
 
 # Speedy Utils
@@ -84,6 +93,19 @@ cd speedy-utils
 pip install .
 ```
 
+### Extras
+
+Optional dependencies can be installed via extras. For the `ray` backend
+support (requires Python >= 3.9):
+
+```bash
+# pip
+pip install 'speedy-utils[ray]'
+
+# Poetry (for developing this repo)
+poetry install -E ray
+```
+
 ## Updating from previous versions
 
 To update from previous versions or switch to v1.x, first uninstall any old
@@ -282,9 +304,8 @@ python speedy_utils/common/dataclass_parser.py
 
 Example output:
 
-| Field
-
-| from_peft
+| Field     | Value                                 |
+| --------- | ------------------------------------- |
+| from_peft | ./outputs/llm_hn_qw32b/hn_results_r3/ |
 
 Please ensure your code adheres to the project's coding standards and includes appropriate tests.
-