speedy-utils 1.1.16__py3-none-any.whl → 1.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/__init__.py +8 -1
- llm_utils/chat_format/display.py +109 -14
- llm_utils/lm/__init__.py +12 -11
- llm_utils/lm/async_lm/async_llm_task.py +0 -12
- llm_utils/lm/async_lm/async_lm.py +13 -4
- llm_utils/lm/async_lm/async_lm_base.py +24 -14
- llm_utils/lm/base_prompt_builder.py +288 -0
- llm_utils/lm/llm_task.py +400 -0
- llm_utils/lm/lm.py +207 -0
- llm_utils/lm/lm_base.py +285 -0
- llm_utils/vector_cache/core.py +297 -87
- speedy_utils/common/patcher.py +68 -0
- speedy_utils/common/utils_cache.py +5 -5
- speedy_utils/common/utils_io.py +232 -6
- speedy_utils/multi_worker/process.py +124 -193
- {speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/METADATA +3 -2
- {speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/RECORD +19 -14
- {speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/WHEEL +1 -1
- {speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/entry_points.txt +0 -0
speedy_utils/common/utils_io.py
CHANGED

@@ -87,9 +87,236 @@ def load_json_or_pickle(fname: str, counter=0) -> Any:
         raise ValueError(f"Error {e} while loading {fname}") from e


-
-
-
+import os, io, json, gzip, bz2, lzma, warnings
+from typing import Iterable, Union, IO, Any, Optional, cast
+
+try:
+    import orjson  # type: ignore[import-not-found]  # fastest JSON parser when available
+except Exception:
+    orjson = None
+
+try:
+    import zstandard as zstd  # type: ignore[import-not-found]  # optional .zst support
+except Exception:
+    zstd = None
+
+
+def fast_load_jsonl(
+    path_or_file: Union[str, os.PathLike, IO],
+    *,
+    progress: bool = False,
+    desc: str = "Reading JSONL",
+    use_orjson: bool = True,
+    encoding: str = "utf-8",
+    errors: str = "strict",
+    on_error: str = "raise",  # 'raise' | 'warn' | 'skip'
+    skip_empty: bool = True,
+    max_lines: Optional[int] = None,
+    use_multiworker: bool = True,
+    multiworker_threshold: int = 50000,
+    workers: Optional[int] = None,
+) -> Iterable[Any]:
+    """
+    Lazily iterate objects from a JSON Lines file.
+
+    - Streams line-by-line (constant memory).
+    - Optional tqdm progress over bytes (compressed size if gz/bz2/xz/zst).
+    - Auto-detects compression by extension: .gz, .bz2, .xz/.lzma, .zst/.zstd.
+    - Uses orjson if available (use_orjson=True), falls back to json.
+    - Automatically uses multi-worker processing for large files (>50k lines).
+
+    Args:
+        path_or_file: Path-like or file-like object. File-like can be binary or text.
+        progress: Show a tqdm progress bar (bytes). Requires `tqdm` if True.
+        desc: tqdm description if progress=True.
+        use_orjson: Prefer orjson for speed if installed.
+        encoding, errors: Used when decoding text or when falling back to `json`.
+        on_error: What to do on a malformed line: 'raise', 'warn', or 'skip'.
+        skip_empty: Skip blank/whitespace-only lines.
+        max_lines: Stop after reading this many lines (useful for sampling).
+        use_multiworker: Enable multi-worker processing for large files.
+        multiworker_threshold: Line count threshold to trigger multi-worker processing.
+        workers: Number of worker threads (defaults to CPU count).
+
+    Yields:
+        Parsed Python objects per line.
+    """
+    def _open_auto(pth_or_f) -> IO[Any]:
+        if hasattr(pth_or_f, "read"):
+            # ensure binary buffer for consistent byte-length progress
+            fobj = pth_or_f
+            # If it's text, wrap it to binary via encoding; else just return
+            if isinstance(fobj, io.TextIOBase):
+                # TextIO -> re-encode to bytes on the fly
+                return io.BufferedReader(io.BytesIO(fobj.read().encode(encoding, errors)))
+            return pth_or_f  # assume binary
+        s = str(pth_or_f).lower()
+        if s.endswith(".gz"):
+            return gzip.open(pth_or_f, "rb")  # type: ignore
+        if s.endswith(".bz2"):
+            return bz2.open(pth_or_f, "rb")  # type: ignore
+        if s.endswith((".xz", ".lzma")):
+            return lzma.open(pth_or_f, "rb")  # type: ignore
+        if s.endswith((".zst", ".zstd")) and zstd is not None:
+            fh = open(pth_or_f, "rb")
+            dctx = zstd.ZstdDecompressor()
+            stream = dctx.stream_reader(fh)
+            return io.BufferedReader(stream)  # type: ignore
+        # plain
+        return open(pth_or_f, "rb", buffering=1024 * 1024)
+
+    def _count_lines_fast(file_path: Union[str, os.PathLike]) -> int:
+        """Quickly count lines in a file, handling compression."""
+        try:
+            f = _open_auto(file_path)
+            count = 0
+            for _ in f:
+                count += 1
+            f.close()
+            return count
+        except Exception:
+            # If we can't count lines, assume it's small
+            return 0
+
+    def _process_chunk(chunk_lines: list[bytes]) -> list[Any]:
+        """Process a chunk of lines and return parsed objects."""
+        results = []
+        for line_bytes in chunk_lines:
+            if skip_empty and not line_bytes.strip():
+                continue
+            line_bytes = line_bytes.rstrip(b"\r\n")
+            try:
+                if use_orjson and orjson is not None:
+                    obj = orjson.loads(line_bytes)
+                else:
+                    obj = json.loads(line_bytes.decode(encoding, errors))
+                results.append(obj)
+            except Exception as e:
+                if on_error == "raise":
+                    raise
+                if on_error == "warn":
+                    warnings.warn(f"Skipping malformed line: {e}")
+                # 'skip' and 'warn' both skip the line
+                continue
+        return results
+
+    # Check if we should use multi-worker processing
+    should_use_multiworker = (
+        use_multiworker
+        and not hasattr(path_or_file, "read")  # Only for file paths, not file objects
+        and max_lines is None  # Don't use multiworker if we're limiting lines
+    )
+
+    if should_use_multiworker:
+        line_count = _count_lines_fast(cast(Union[str, os.PathLike], path_or_file))
+        if line_count > multiworker_threshold:
+            # Use multi-worker processing
+            from ..multi_worker.thread import multi_thread
+
+            # Read all lines into chunks
+            f = _open_auto(path_or_file)
+            all_lines = list(f)
+            f.close()
+
+            # Split into chunks for workers
+            num_workers = workers or os.cpu_count() or 4
+            chunk_size = max(len(all_lines) // num_workers, 1000)
+            chunks = []
+            for i in range(0, len(all_lines), chunk_size):
+                chunks.append(all_lines[i:i + chunk_size])
+
+            # Process chunks in parallel
+            if progress:
+                print(f"Processing {line_count} lines with {num_workers} workers...")
+
+            chunk_results = multi_thread(_process_chunk, chunks, workers=num_workers, progress=progress)
+
+            # Flatten results and yield
+            for chunk_result in chunk_results:
+                for obj in chunk_result:
+                    yield obj
+            return
+
+    # Single-threaded processing (original logic)
+
+    f = _open_auto(path_or_file)
+
+    pbar = None
+    if progress:
+        try:
+            from tqdm import tqdm  # type: ignore
+        except Exception as e:
+            raise ImportError("tqdm is required when progress=True") from e
+        total = None
+        if not hasattr(path_or_file, "read"):
+            try:
+                path_for_size = cast(Union[str, os.PathLike], path_or_file)
+                total = os.path.getsize(path_for_size)  # compressed size if any
+            except Exception:
+                total = None
+        pbar = tqdm(total=total, unit="B", unit_scale=True, desc=desc)
+
+    line_no = 0
+    try:
+        for raw_line in f:
+            line_no += 1
+            if pbar is not None:
+                # raw_line is bytes here; if not, compute byte length
+                nbytes = len(raw_line) if isinstance(raw_line, (bytes, bytearray)) else len(str(raw_line).encode(encoding, errors))
+                pbar.update(nbytes)
+
+            # Normalize to bytes -> str only if needed
+            if isinstance(raw_line, (bytes, bytearray)):
+                if skip_empty and not raw_line.strip():
+                    if max_lines and line_no >= max_lines:
+                        break
+                    continue
+                line_bytes = raw_line.rstrip(b"\r\n")
+                # Parse
+                try:
+                    if use_orjson and orjson is not None:
+                        obj = orjson.loads(line_bytes)
+                    else:
+                        obj = json.loads(line_bytes.decode(encoding, errors))
+                except Exception as e:
+                    if on_error == "raise":
+                        raise
+                    if on_error == "warn":
+                        warnings.warn(f"Skipping malformed line {line_no}: {e}")
+                    # 'skip' and 'warn' both skip the line
+                    if max_lines and line_no >= max_lines:
+                        break
+                    continue
+            else:
+                # Text line path (unlikely)
+                if skip_empty and not raw_line.strip():
+                    if max_lines and line_no >= max_lines:
+                        break
+                    continue
+                try:
+                    obj = json.loads(raw_line)
+                except Exception as e:
+                    if on_error == "raise":
+                        raise
+                    if on_error == "warn":
+                        warnings.warn(f"Skipping malformed line {line_no}: {e}")
+                    if max_lines and line_no >= max_lines:
+                        break
+                    continue
+
+            yield obj
+            if max_lines and line_no >= max_lines:
+                break
+    finally:
+        if pbar is not None:
+            pbar.close()
+        # Close only if we opened it (i.e., not an external stream)
+        if not hasattr(path_or_file, "read"):
+            try:
+                f.close()
+            except Exception:
+                pass
+


 def load_by_ext(fname: Union[str, list[str]], do_memoize: bool = False) -> Any:
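For orientation, a minimal usage sketch of the loader added above, based on its signature and docstring (the file names are hypothetical):

```python
from speedy_utils.common.utils_io import fast_load_jsonl

# Stream a gzipped JSONL file with a byte-based progress bar; malformed lines
# are reported via warnings.warn and skipped instead of raising.
for record in fast_load_jsonl("events.jsonl.gz", progress=True, on_error="warn"):
    print(record)

# Sample only the first 100 records; passing max_lines also disables the
# multi-worker fast path, so the file is streamed lazily.
head = list(fast_load_jsonl("events.jsonl", max_lines=100))
```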
@@ -124,7 +351,7 @@ def load_by_ext(fname: Union[str, list[str]], do_memoize: bool = False) -> Any:

     def load_default(path: str) -> Any:
         if path.endswith(".jsonl"):
-            return
+            return list(fast_load_jsonl(path, progress=True))
         elif path.endswith(".json"):
             try:
                 return load_json_or_pickle(path)
@@ -159,14 +386,13 @@ def jdumps(obj, ensure_ascii=False, indent=2, **kwargs):
     return json.dumps(obj, ensure_ascii=ensure_ascii, indent=indent, **kwargs)


-
+load_jsonl = lambda path: list(fast_load_jsonl(path))

 __all__ = [
     "dump_json_or_pickle",
     "dump_jsonl",
     "load_by_ext",
     "load_json_or_pickle",
-    "load_jsonl",
     "jdumps",
     "jloads",
 ]
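Taken together, these hunks route all `.jsonl` loading through `fast_load_jsonl`. A short sketch of the resulting call sites (the path is hypothetical):

```python
from speedy_utils.common.utils_io import load_by_ext, load_jsonl

# load_by_ext dispatches on file extension; per the diff above, .jsonl files
# now materialize through list(fast_load_jsonl(path, progress=True)).
records = load_by_ext("data.jsonl")

# load_jsonl is the new module-level shorthand (note it was dropped from
# __all__, so `from ... import *` no longer re-exports it; a direct import
# still works).
same_records = load_jsonl("data.jsonl")
```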
speedy_utils/multi_worker/process.py
CHANGED

@@ -1,203 +1,134 @@
-
-import os
-import
-from
-from
-
-from
+# ray_multi_process.py
+import time, os, pickle, uuid, datetime
+from pathlib import Path
+from typing import Any, Callable
+from tqdm import tqdm
+import ray
+from fastcore.parallel import parallel
+
+# ─── cache helpers ──────────────────────────────────────────
+
+def _build_cache_dir(func: Callable, items: list[Any]) -> Path:
+    """Build cache dir with function name + timestamp."""
+    func_name = getattr(func, "__name__", "func")
+    now = datetime.datetime.now()
+    stamp = now.strftime("%m%d_%Hh%Mm%Ss")
+    run_id = f"{func_name}_{stamp}_{uuid.uuid4().hex[:6]}"
+    path = Path(".cache") / run_id
+    path.mkdir(parents=True, exist_ok=True)
+    return path
+
+def wrap_dump(func: Callable, cache_dir: Path | None):
+    """Wrap a function so results are dumped to .pkl when cache_dir is set."""
+    if cache_dir is None:
+        return func
+
+    def wrapped(x, *args, **kwargs):
+        res = func(x, *args, **kwargs)
+        p = cache_dir / f"{uuid.uuid4().hex}.pkl"
+        with open(p, "wb") as fh:
+            pickle.dump(res, fh)
+        return str(p)
+    return wrapped
+
+# ─── ray management ─────────────────────────────────────────
+
+RAY_WORKER = None
+
+def ensure_ray(workers: int, pbar: tqdm | None = None):
+    """Initialize or reinitialize Ray with a given worker count, log to bar postfix."""
+    global RAY_WORKER
+    if not ray.is_initialized() or RAY_WORKER != workers:
+        if ray.is_initialized() and pbar:
+            pbar.set_postfix_str(f"Restarting Ray {workers} workers")
+        ray.shutdown()
+        t0 = time.time()
+        ray.init(num_cpus=workers, ignore_reinit_error=True)
+        took = time.time() - t0
+        if pbar:
+            pbar.set_postfix_str(f"ray.init {workers} took {took:.2f}s")
+        RAY_WORKER = workers
+
+# ─── main API ───────────────────────────────────────────────
+from typing import Literal

-T = TypeVar("T")
-
-if hasattr(multiprocessing, "set_start_method"):
-    try:
-        multiprocessing.set_start_method("spawn", force=True)
-    except RuntimeError:
-        pass
-
-try:
-    from tqdm import tqdm
-except ImportError:  # pragma: no cover
-    tqdm = None  # type: ignore[assignment]
-
-
-# ──── internal helpers ────────────────────────────────────────────────────
-
-
-def _group_iter(src: Iterable[Any], size: int) -> Iterable[list[Any]]:
-    "Yield *size*-sized chunks from *src*."
-    it = iter(src)
-    while chunk := list(islice(it, size)):
-        yield chunk
-
-
-def _short_tb() -> str:
-    tb = "".join(traceback.format_exc())
-    return "\n".join(ln for ln in tb.splitlines() if "multi_process" not in ln)
-
-
-def _safe_call(func: Callable, obj, fixed):
-    try:
-        return func(obj, **fixed)
-    except Exception as exc:
-        func_name = getattr(func, "__name__", str(func))
-        raise RuntimeError(
-            f"{func_name}({obj!r}) failed: {exc}\n{_short_tb()}"
-        ) from exc
-
-
-def _worker_process(
-    func: Callable, item_batch: Any, fixed_kwargs: dict, batch_size: int
-):
-    """Worker function executed in each process."""
-    if batch_size > 1:
-        results = []
-        for itm in item_batch:
-            try:
-                results.append(_safe_call(func, itm, fixed_kwargs))
-            except Exception:
-                results.append(None)
-        return results
-    return _safe_call(func, item_batch, fixed_kwargs)
-
-
-# ──── public API ──────────────────────────────────────────────────────────
 def multi_process(
     func: Callable[[Any], Any],
-
+    items: list[Any] | None = None,
     *,
-
-
-
-    progress: bool =
-
-
-
-
-
+    inputs: list[Any] | None = None,
+    workers: int | None = None,
+    lazy_output: bool = False,
+    progress: bool = True,
+    # backend: str = "ray",  # "seq", "ray", or "fastcore"
+    backend: Literal["seq", "ray", "mp", "threadpool"] = "ray",
+    # Additional optional knobs (accepted for compatibility)
+    batch: int | None = None,
+    ordered: bool | None = None,
+    process_update_interval: int | None = None,
+    stop_on_error: bool | None = None,
+    **func_kwargs: Any,
 ) -> list[Any]:
     """
-
-
-
-
-
-
-
-
-
-    progress – show a tqdm bar (requires *tqdm*).
-    inflight – max logical items concurrently submitted
-               *(default: ``workers × 4``)*.
-    timeout – overall timeout for the mapping (seconds).
-    stop_on_error – raise immediately on first exception (default) or
-                    substitute failing result with ``None``.
-    **fixed_kwargs – static keyword args forwarded to every ``func()`` call.
+    Multi-process map with selectable backend.
+
+    backend:
+      - "seq": run sequentially
+      - "ray": run in parallel with Ray
+      - "fastcore": run in parallel with fastcore.parallel
+
+    If lazy_output=True, every result is saved to .pkl and
+    the returned list contains file paths.
     """
-
-
-
-
-
+
+    # unify items
+    if items is None and inputs is not None:
+        items = list(inputs)
+    if items is None:
+        raise ValueError("'items' or 'inputs' must be provided")

     if workers is None:
         workers = os.cpu_count() or 1
-
-
-    if
-    … (40 removed lines not recoverable from the diff viewer) …
-            fut.idx = next_idx  # type: ignore[attr-defined]
-            futures.add(fut)
-            next_idx += len(arg) if batch > 1 else 1
-
-        while futures:
-            for fut in as_completed(futures, timeout=timeout):
-                futures.remove(fut)
-                idx = fut.idx  # type: ignore[attr-defined]
-                try:
-                    res = fut.result()
-                except Exception:
-                    if stop_on_error:
-                        raise
-                    num_items = batch if batch > 1 else 1
-                    res = [None] * num_items if batch > 1 else None
-
-                out_items = res if batch > 1 else [res]
-                if out_items is None:
-                    out_items = []
-
-                if ordered and logical_total is not None:
-                    if isinstance(out_items, list) and len(out_items) > 0:
-                        for i, item in enumerate(out_items):
-                            if idx + i < len(results):
-                                results[idx + i] = item
-                else:
-                    if isinstance(out_items, list):
-                        results.extend(out_items)
-
-                completed += len(out_items)
-
-                if bar and completed - last_bar >= process_update_interval:
-                    bar.update(completed - last_bar)
-                    last_bar = completed
-
-                try:
-                    while next_idx - completed < inflight:
-                        arg = next(src_iter)
-                        fut2 = pool.submit(
-                            _worker_process, func, arg, fixed_kwargs, batch
-                        )
-                        fut2.idx = next_idx  # type: ignore[attr-defined]
-                        futures.add(fut2)
-                        next_idx += len(arg) if batch > 1 else 1
-                except StopIteration:
-                    pass
-                break
-
-        if bar:
-            bar.update(completed - last_bar)
-            bar.close()
-
-        return results
-
-
-__all__ = ["multi_process"]
+
+    # build cache dir + wrap func
+    cache_dir = _build_cache_dir(func, items) if lazy_output else None
+    f_wrapped = wrap_dump(func, cache_dir)
+
+    total = len(items)
+    with tqdm(total=total, desc=f"multi_process [{backend}]", disable=not progress) as pbar:
+
+        # ---- sequential backend ----
+        if backend == "seq":
+            pbar.set_postfix_str("backend=seq")
+            results = []
+            for x in items:
+                results.append(f_wrapped(x, **func_kwargs))
+                pbar.update(1)
+            return results
+
+        # ---- ray backend ----
+        if backend == "ray":
+            pbar.set_postfix_str("backend=ray")
+            ensure_ray(workers, pbar)
+
+            @ray.remote
+            def _task(x):
+                return f_wrapped(x, **func_kwargs)
+
+            refs = [_task.remote(x) for x in items]
+
+            results = []
+            for r in refs:
+                results.append(ray.get(r))
+                pbar.update(1)
+            return results
+
+        # ---- fastcore backend ----
+        if backend == "mp":
+            results = parallel(f_wrapped, items, n_workers=workers, progress=progress, threadpool=False)
+            return list(results)
+        if backend == "threadpool":
+            results = parallel(f_wrapped, items, n_workers=workers, progress=progress, threadpool=True)
+            return list(results)
+
+        raise ValueError(f"Unsupported backend: {backend!r}")
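For context, a usage sketch of the rewritten `multi_process` based on the new signature above (the `square` worker is hypothetical):

```python
import pickle
from speedy_utils.multi_worker.process import multi_process

def square(x: int) -> int:
    # hypothetical worker function
    return x * x

# Parallel map over Ray (the default backend); "seq", "mp", and "threadpool"
# select the sequential and fastcore.parallel code paths shown in the diff.
results = multi_process(square, items=list(range(1000)), workers=8, backend="ray")

# With lazy_output=True each result is pickled under .cache/<func>_<stamp>_<id>/
# and the returned list holds the .pkl paths rather than the values.
paths = multi_process(square, items=list(range(1000)), lazy_output=True)
values = []
for p in paths:
    with open(p, "rb") as fh:
        values.append(pickle.load(fh))
```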
{speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: speedy-utils
-Version: 1.1.16
+Version: 1.1.18
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Requires-Dist: bump2version
 Requires-Dist: cachetools
 Requires-Dist: debugpy
{speedy_utils-1.1.16.dist-info → speedy_utils-1.1.18.dist-info}/RECORD
CHANGED

@@ -1,16 +1,20 @@
-llm_utils/__init__.py,sha256=
+llm_utils/__init__.py,sha256=KjgorCpl2YbAGaqaOvFDDlE7V2GUzxFx_Xyz5ROnWZc,916
 llm_utils/chat_format/__init__.py,sha256=8dBIUqFJvkgQYedxBtcyxt-4tt8JxAKVap2JlTXmgaM,737
-llm_utils/chat_format/display.py,sha256=
+llm_utils/chat_format/display.py,sha256=3jKDm4OTrvytK1qBhSOjRLltUIObHsYFdBLgm8SVDE8,14159
 llm_utils/chat_format/transform.py,sha256=eU0c3PdAHCNLuGP1UqPwln0B34Lv3bt_uV9v9BrlCN4,5402
 llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
 llm_utils/group_messages.py,sha256=Oe2tlhg-zRodG1-hodYebddrR77j9UdE05LzJw0EvYI,3622
-llm_utils/lm/__init__.py,sha256=
+llm_utils/lm/__init__.py,sha256=totIZnq1P8eNlfVco0OfdGdTNt1-wSXDSRReRRzYYxw,319
 llm_utils/lm/async_lm/__init__.py,sha256=PUBbCuf5u6-0GBUu-2PI6YAguzsyXj-LPkU6vccqT6E,121
 llm_utils/lm/async_lm/_utils.py,sha256=P1-pUDf_0pDmo8WTIi43t5ARlyGA1RIJfpAhz-gfA5g,6105
-llm_utils/lm/async_lm/async_llm_task.py,sha256=
-llm_utils/lm/async_lm/async_lm.py,sha256=
-llm_utils/lm/async_lm/async_lm_base.py,sha256=
+llm_utils/lm/async_lm/async_llm_task.py,sha256=A5WLIN3v-zpl-sJGiykyo8wOCYEpA8ja70MJcn5t7O4,18668
+llm_utils/lm/async_lm/async_lm.py,sha256=e3o9cyMbkVz_jQDTjJv2ybET_5mY012zdZGjNwi4Qk4,13719
+llm_utils/lm/async_lm/async_lm_base.py,sha256=iJgtzI6pVJzWtlXGqVLwgCIb-FzZAa3E5xW8yhyHUmM,8426
 llm_utils/lm/async_lm/lm_specific.py,sha256=KmqdCm3SJ5MqN-dRJd6S5tq5-ve1X2eNWf2CMFtc_3s,3926
+llm_utils/lm/base_prompt_builder.py,sha256=OLqyxbA8QeYIVFzB9EqxUiE_P2p4_MD_Lq4WSwxFtKU,12136
+llm_utils/lm/llm_task.py,sha256=K5c27iYM9etAbdDM1WiO3-GjTvl1dkzt2sIaW3N1YA0,15483
+llm_utils/lm/lm.py,sha256=8TaLuU7naPQbOFmiS2NQyWVLG0jUUzRRBQsR0In7GVo,7249
+llm_utils/lm/lm_base.py,sha256=pqbHZOdR7yUMpvwt8uBG1dZnt76SY_Wk8BkXQQ-mpWs,9557
 llm_utils/lm/openai_memoize.py,sha256=DdMl31cV9AqLlkARajZrqAKCyhvH8JQk2SAHMSzO3mk,3024
 llm_utils/lm/utils.py,sha256=a0KJj8vjT2fHKb7GKGNJjJHhKLThwpxIL7vnV9Fr3ZY,4584
 llm_utils/scripts/README.md,sha256=yuOLnLa2od2jp4wVy3rV0rESeiV3o8zol5MNMsZx0DY,999
@@ -18,7 +22,7 @@ llm_utils/scripts/vllm_load_balancer.py,sha256=TT5Ypq7gUcl52gRFp--ORFFjzhfGlcaX2
 llm_utils/scripts/vllm_serve.py,sha256=gJ0-y4kybMfSt8qzye1pJqGMY3x9JLRi6Tu7RjJMnss,14771
 llm_utils/vector_cache/__init__.py,sha256=i1KQuC4OhPewYpFl9X6HlWFBuASCTx2qgGizhpZhmn0,862
 llm_utils/vector_cache/cli.py,sha256=DMXTj8nZ2_LRjprbYPb4uzq04qZtOfBbmblmaqDcCuM,6251
-llm_utils/vector_cache/core.py,sha256=
+llm_utils/vector_cache/core.py,sha256=P0VopzMmfnGaYTTEiccXprsyjruje3QT0_AFXF1lZC0,33582
 llm_utils/vector_cache/types.py,sha256=ru8qmUZ8_lNd3_oYpjCMtpXTsqmwsSBe56Z4hTWm3xI,435
 llm_utils/vector_cache/utils.py,sha256=dwbbXlRrARrpmS4YqSlYQqrTURg0UWe8XvaAWcX05MM,1458
 speedy_utils/__init__.py,sha256=nJpUb5Oa3STDbqPSiWXoI-IvKntyRYzYxkYW4GM2i_Q,5740
@@ -28,18 +32,19 @@ speedy_utils/common/clock.py,sha256=3n4FkCW0dz46O8By09V5Pve1DSMgpLDRbWEVRryryeQ,
 speedy_utils/common/function_decorator.py,sha256=BspJ0YuGL6elS7lWBAgELZ-sCfED_1N2P5fgH-fCRUQ,2132
 speedy_utils/common/logger.py,sha256=a2iZx0eWyfi2-2X_H2QmfuA3tfR7_XSM7Nd0GdUnUOs,6435
 speedy_utils/common/notebook_utils.py,sha256=-97kehJ_Gg3TzDLubsLIYJcykqX1NXhbvBO6nniZSYM,2063
+speedy_utils/common/patcher.py,sha256=VCmdxyTF87qroggQkQklRPhAOPJbeBqhcJoTsLcDxNw,2303
 speedy_utils/common/report_manager.py,sha256=eBiw5KY6bWUhwki3B4lK5o8bFsp7L5x28X9GCI-Sd1w,3899
-speedy_utils/common/utils_cache.py,sha256=
-speedy_utils/common/utils_io.py,sha256=
+speedy_utils/common/utils_cache.py,sha256=BCYbtu8lWkLjrPRQnOWHr41IBOLrpOvXIOI4Sg389nc,22430
+speedy_utils/common/utils_io.py,sha256=gv5YN4tYvxsUcCXwaY_hv2g6a9HNltxBC0kNE0iZLak,14284
 speedy_utils/common/utils_misc.py,sha256=cdEuBBpiB1xpuzj0UBDHDuTIerqsMIw37ENq6EXliOw,1795
 speedy_utils/common/utils_print.py,sha256=syRrnSFtguxrV-elx6DDVcSGu4Qy7D_xVNZhPwbUY4A,4864
 speedy_utils/multi_worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-speedy_utils/multi_worker/process.py,sha256=
+speedy_utils/multi_worker/process.py,sha256=LmNfV8tfdsf6PFTNzu12C_QWNfEUhgi1MeAJGeMTs1k,4738
 speedy_utils/multi_worker/thread.py,sha256=f02VjJV8nudg0eA_AcfPEX7tHY4-czesuzthKZs_Hdc,16351
 speedy_utils/scripts/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 speedy_utils/scripts/mpython.py,sha256=IvywP7Y0_V6tWfMP-4MjPvN5_KfxWF21xaLJsCIayCk,3821
 speedy_utils/scripts/openapi_client_codegen.py,sha256=f2125S_q0PILgH5dyzoKRz7pIvNEjCkzpi4Q4pPFRZE,9683
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
-speedy_utils-1.1.
+speedy_utils-1.1.18.dist-info/METADATA,sha256=dqAnyKYkHVF3HHvzhopXo6huQE16OhFMuGcQUwH6xE0,7534
+speedy_utils-1.1.18.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
+speedy_utils-1.1.18.dist-info/entry_points.txt,sha256=T1t85jwx8fK6m5msdkBGIXH5R5Kd0zSL0S6erXERPzg,237
+speedy_utils-1.1.18.dist-info/RECORD,,