speedy-utils 1.0.15__tar.gz → 1.0.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/PKG-INFO +4 -3
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/pyproject.toml +18 -3
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/__init__.py +3 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/chat_format/display.py +1 -1
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/chat_format/transform.py +1 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/group_messages.py +1 -1
- speedy_utils-1.0.20/src/llm_utils/lm/__init__.py +12 -0
- speedy_utils-1.0.15/src/llm_utils/lm/alm.py → speedy_utils-1.0.20/src/llm_utils/lm/async_lm.py +402 -11
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/chat_html.py +4 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/lm_json.py +2 -6
- speedy_utils-1.0.15/src/llm_utils/lm/lm.py → speedy_utils-1.0.20/src/llm_utils/lm/sync_lm.py +4 -12
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/utils.py +1 -1
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/scripts/vllm_load_balancer.py +0 -1
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/function_decorator.py +1 -4
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/logger.py +1 -1
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/report_manager.py +2 -3
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/utils_cache.py +4 -4
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/utils_misc.py +1 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/multi_worker/process.py +2 -4
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/scripts/mpython.py +1 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/scripts/openapi_client_codegen.py +1 -5
- speedy_utils-1.0.15/src/llm_utils/lm/__init__.py +0 -9
- speedy_utils-1.0.15/src/llm_utils/scripts/example_vllm_client.py +0 -269
- speedy_utils-1.0.15/src/llm_utils/scripts/requirements_example.txt +0 -3
- speedy_utils-1.0.15/src/llm_utils/scripts/serve_script.sh +0 -2
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/README.md +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/chat_format/__init__.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/__init__.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/all.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/utils_io.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/utils_print.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/multi_worker/thread.py +0 -0
- {speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/scripts/__init__.py +0 -0
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/PKG-INFO
@@ -1,11 +1,12 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.0.15
+Version: 1.0.20
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
-Requires-Python: >=3.
+Requires-Python: >=3.8
 Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -19,7 +20,7 @@ Requires-Dist: fastprogress
 Requires-Dist: freezegun (>=1.5.1,<2.0.0)
 Requires-Dist: ipdb
 Requires-Dist: ipywidgets
-Requires-Dist: json-repair (>=0.
+Requires-Dist: json-repair (>=0.25.0,<0.31.0)
 Requires-Dist: jupyterlab
 Requires-Dist: loguru
 Requires-Dist: matplotlib
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "speedy-utils"
-version = "1.0.15"
+version = "1.0.20"
 description = "Fast and easy-to-use package for data science"
 authors = ["AnhVTH <anhvth.226@gmail.com>"]
 readme = "README.md"
@@ -36,7 +36,7 @@ no_implicit_optional = true
 strict_optional = true
 
 [tool.poetry.dependencies]
-python = ">=3.
+python = ">=3.8"
 numpy = "*"
 requests = "*"
 xxhash = "*"
@@ -54,7 +54,7 @@ pydantic = "*"
 tqdm = "*"
 cachetools = "*"
 bump2version = "*"
-json-repair = ">=0.
+json-repair = ">=0.25.0,<0.31.0"
 fastprogress = "*"
 freezegun = "^1.5.1"
 packaging = ">=23.2,<25"
@@ -64,3 +64,18 @@ mpython = "speedy_utils.scripts.mpython:main"
 svllm = "llm_utils.scripts.vllm_serve:main"
 svllm-lb = "llm_utils.scripts.vllm_load_balancer:run_load_balancer"
 openapi_client_codegen = "speedy_utils.scripts.openapi_client_codegen:main"
+
+
+[tool.ruff.format]
+quote-style = "double"
+line-ending = "lf"
+docstring-code-format = true
+[tool.ruff]
+exclude = ["**/*.ipynb", "poly_frontend_controler/*", "poly_client/", "legacy"]
+ignore = [
+    "E501", # Line too long
+    "F401", # Unused import
+    "F403", # Wildcard import
+    "F841", # Local variable is assigned to but never used
+    "T201", # Use of `print` statement
+]
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/__init__.py
@@ -9,8 +9,8 @@ from .chat_format import (
     transform_messages,
     transform_messages_to_chatml,
 )
-from .lm.
-from .lm.
+from .lm.async_lm import AsyncLLMTask, AsyncLM
+from .lm.sync_lm import LM, LLMTask
 
 __all__ = [
     "transform_messages",
@@ -26,4 +26,5 @@ __all__ = [
     "AsyncLM",
     "display_chat_messages_as_html",
     "LLMTask",
+    "AsyncLLMTask",
 ]
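Note: with the re-exports above, 1.0.20 keeps a single import path at the package root for both the sync and async classes. A minimal sketch (only names that appear in this `__init__.py` are used; each symbol still requires the optional LLM dependencies to be installed):

```python
# Hypothetical usage of the 1.0.20 public surface re-exported above.
from llm_utils import LM, LLMTask, AsyncLM, AsyncLLMTask, transform_messages
```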
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/chat_format/transform.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 from copy import deepcopy
-from typing import Callable, Dict, List, Sequence
 
 
 def identify_format(item):
@@ -114,7 +113,7 @@ def transform_messages(
 
 def transform_messages_to_chatml(input_data, input_format="auto"):
     if input_format == "auto":
-        input_data =
+        input_data = deepcopy(input_data)
         if isinstance(input_data, list):
             input_format = "chatlm"
             assert (
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/group_messages.py
@@ -76,7 +76,7 @@ def group_messages_by_len(
     """
     if messages is None:
         raise ValueError("messages parameter cannot be None")
-    from transformers.models.auto.tokenization_auto import AutoTokenizer
+    from transformers.models.auto.tokenization_auto import AutoTokenizer  # type: ignore
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
speedy_utils-1.0.15/src/llm_utils/lm/alm.py → speedy_utils-1.0.20/src/llm_utils/lm/async_lm.py
RENAMED
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 """An **asynchronous** drop‑in replacement for the original `LM` class.
 
 Usage example (Python ≥3.8):
@@ -15,26 +13,30 @@ Usage example (Python ≥3.8):
     asyncio.run(main())
 """
 
-import asyncio
 import base64
 import hashlib
 import json
 import os
+from abc import ABC
+from functools import lru_cache
 from typing import (
     Any,
     Dict,
     List,
+    Literal,
     Optional,
     Sequence,
     Type,
     TypeVar,
     Union,
-    overload,
     cast,
+    overload,
 )
 
 from httpx import URL
+from loguru import logger
 from openai import AsyncOpenAI, AuthenticationError, BadRequestError, RateLimitError
+from openai.pagination import AsyncPage as AsyncSyncPage
 
 # from openai.pagination import AsyncSyncPage
 from openai.types.chat import (
@@ -44,11 +46,10 @@ from openai.types.chat import (
     ChatCompletionToolMessageParam,
     ChatCompletionUserMessageParam,
 )
-from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
 from openai.types.model import Model
 from pydantic import BaseModel
-
-from 
+
+from llm_utils.chat_format.display import get_conversation_one_turn
 
 # --------------------------------------------------------------------------- #
 # type helpers
@@ -67,10 +68,20 @@ def _color(code: int, text: str) -> str:
     return f"\x1b[{code}m{text}\x1b[0m"
 
 
-_red
-
-
-
+def _red(t):
+    return _color(31, t)
+
+
+def _green(t):
+    return _color(32, t)
+
+
+def _blue(t):
+    return _color(34, t)
+
+
+def _yellow(t):
+    return _color(33, t)
 
 
 class AsyncLM:
@@ -100,6 +111,7 @@ class AsyncLM:
         self.openai_kwargs = openai_kwargs
         self.do_cache = cache
         self.ports = ports
+        self._init_port = port  # <-- store the port provided at init
 
         # Async client
 
@@ -375,10 +387,182 @@ class AsyncLM:
         except Exception:
             return None
 
+    # ------------------------------------------------------------------ #
+    # Missing methods from LM class
+    # ------------------------------------------------------------------ #
+    async def parse(
+        self,
+        response_model: Type[BaseModel],
+        instruction: Optional[str] = None,
+        prompt: Optional[str] = None,
+        messages: Optional[RawMsgs] = None,
+        think: Literal[True, False, None] = None,
+        add_json_schema_to_instruction: bool = False,
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        return_openai_response: bool = False,
+        cache: Optional[bool] = True,
+        **kwargs,
+    ):
+        """Parse response using guided JSON generation."""
+        if messages is None:
+            assert instruction is not None, "Instruction must be provided."
+            assert prompt is not None, "Prompt must be provided."
+            messages = [
+                {
+                    "role": "system",
+                    "content": instruction,
+                },
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ]  # type: ignore
+
+        post_fix = ""
+        json_schema = response_model.model_json_schema()
+        if add_json_schema_to_instruction and response_model:
+            _schema = f"\n\n<output_json_schema>\n{json.dumps(json_schema, indent=2)}\n</output_json_schema>"
+            post_fix += _schema
+
+        if think:
+            post_fix += "\n\n/think"
+        elif not think:
+            post_fix += "\n\n/no_think"
+
+        assert isinstance(messages, list), "Messages must be a list."
+        assert len(messages) > 0, "Messages cannot be empty."
+        assert messages[0]["role"] == "system", (
+            "First message must be a system message with instruction."
+        )
+        messages[0]["content"] += post_fix  # type: ignore
+
+        model_kwargs = {}
+        if temperature is not None:
+            model_kwargs["temperature"] = temperature
+        if max_tokens is not None:
+            model_kwargs["max_tokens"] = max_tokens
+        model_kwargs.update(kwargs)
+
+        use_cache = self.do_cache if cache is None else cache
+        cache_key = None
+        if use_cache:
+            cache_data = {
+                "messages": messages,
+                "model_kwargs": model_kwargs,
+                "guided_json": json_schema,
+                "response_format": response_model.__name__,
+            }
+            cache_key = self._cache_key(cache_data, {}, response_model)
+            cached_response = self._load_cache(cache_key)
+            self.last_log = [prompt, messages, cached_response]
+            if cached_response is not None:
+                if return_openai_response:
+                    return cached_response
+                return self._parse_complete_output(cached_response, response_model)
+
+        completion = await self.client.chat.completions.create(
+            model=self.model,  # type: ignore
+            messages=messages,  # type: ignore
+            extra_body={"guided_json": json_schema},
+            **model_kwargs,
+        )
+
+        if cache_key:
+            self._dump_cache(cache_key, completion)
+
+        self.last_log = [prompt, messages, completion]
+        if return_openai_response:
+            return completion
+        return self._parse_complete_output(completion, response_model)
+
+    def _parse_complete_output(
+        self, completion: Any, response_model: Type[BaseModel]
+    ) -> BaseModel:
+        """Parse completion output to response model."""
+        if hasattr(completion, "model_dump"):
+            completion = completion.model_dump()
+
+        if "choices" not in completion or not completion["choices"]:
+            raise ValueError("No choices in OpenAI response")
+
+        content = completion["choices"][0]["message"]["content"]
+        if not content:
+            raise ValueError("Empty content in response")
+
+        try:
+            data = json.loads(content)
+            return response_model.model_validate(data)
+        except Exception as exc:
+            raise ValueError(
+                f"Failed to parse response as {response_model.__name__}: {content}"
+            ) from exc
+
+    async def inspect_word_probs(
+        self,
+        messages: Optional[List[Dict[str, Any]]] = None,
+        tokenizer: Optional[Any] = None,
+        do_print=True,
+        add_think: bool = True,
+    ) -> tuple[List[Dict[str, Any]], Any, str]:
+        """
+        Inspect word probabilities in a language model response.
+
+        Args:
+            tokenizer: Tokenizer instance to encode words.
+            messages: List of messages to analyze.
+
+        Returns:
+            A tuple containing:
+            - List of word probabilities with their log probabilities.
+            - Token log probability dictionaries.
+            - Rendered string with colored word probabilities.
+        """
+        if messages is None:
+            messages = await self.last_messages(add_think=add_think)
+            if messages is None:
+                raise ValueError("No messages provided and no last messages available.")
+
+        if tokenizer is None:
+            tokenizer = get_tokenizer(self.model)
+
+        ret = await inspect_word_probs_async(self, tokenizer, messages)
+        if do_print:
+            print(ret[-1])
+        return ret
+
+    async def last_messages(
+        self, add_think: bool = True
+    ) -> Optional[List[Dict[str, str]]]:
+        """Get the last conversation messages including assistant response."""
+        if not hasattr(self, "last_log"):
+            return None
+
+        last_conv = self.last_log
+        messages = last_conv[1] if len(last_conv) > 1 else None
+        last_msg = last_conv[2]
+        if not isinstance(last_msg, dict):
+            last_conv[2] = last_conv[2].model_dump()  # type: ignore
+        msg = last_conv[2]
+        # Ensure msg is a dict
+        if hasattr(msg, "model_dump"):
+            msg = msg.model_dump()
+        message = msg["choices"][0]["message"]
+        reasoning = message.get("reasoning_content")
+        answer = message.get("content")
+        if reasoning and add_think:
+            final_answer = f"<think>{reasoning}</think>\n{answer}"
+        else:
+            final_answer = f"<think>\n\n</think>\n{answer}"
+        assistant = {"role": "assistant", "content": final_answer}
+        messages = messages + [assistant]  # type: ignore
+        return messages if messages else None
+
     # ------------------------------------------------------------------ #
     # Utility helpers
     # ------------------------------------------------------------------ #
     async def inspect_history(self) -> None:
+        """Inspect the conversation history with proper formatting."""
         if not hasattr(self, "last_log"):
             raise ValueError("No history available. Please call the model first.")
 
@@ -466,3 +650,210 @@ class AsyncLM:
         except Exception as exc:
             logger.error(f"Failed to list models: {exc}")
             return []
+
+
+# --------------------------------------------------------------------------- #
+# Module-level utility functions (async versions)
+# --------------------------------------------------------------------------- #
+
+
+@lru_cache(maxsize=10)
+def get_tokenizer(model_name: str) -> Any:
+    """Get tokenizer for the given model."""
+    from transformers import AutoTokenizer  # type: ignore
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    return tokenizer
+
+
+async def inspect_word_probs_async(lm, tokenizer, messages):
+    """Async version of inspect_word_probs."""
+
+    import numpy as np
+
+    async def compute_word_log_probs(
+        tokenizer: Any,
+        lm_client: Any,
+    ) -> tuple[List[Dict[str, Any]], Any]:
+        # Build a prompt that preserves literal newlines
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,  # Don't tokenize yet, we need raw text
+            add_generation_prompt=False,  # No generation prompt needed
+        )
+
+        # Request token logprobs
+        response = await lm_client.client.completions.create(
+            model=lm_client.model,  # type: ignore
+            prompt=prompt,
+            max_tokens=1,
+            logprobs=1,
+            extra_body={"prompt_logprobs": 0},
+        )
+        token_logprob_dicts = response.choices[0].prompt_logprobs  # type: ignore
+
+        # Override first token to known start marker
+        start_id = tokenizer.encode("<|im_start|>")[0]
+        token_logprob_dicts[0] = {
+            str(start_id): {
+                "logprob": -1,
+                "rank": 1,
+                "decoded_token": "<|im_start|>",
+            }
+        }
+
+        # Flatten tokens
+        tokens: List[Dict[str, Any]] = [
+            {"id": int(tid), **tdata}
+            for td in token_logprob_dicts
+            for tid, tdata in td.items()
+        ]
+
+        # Validate tokenization
+        tokenized = tokenizer.tokenize(prompt)
+        if len(tokenized) != len(tokens):
+            raise ValueError(f"Token count mismatch: {len(tokenized)} vs {len(tokens)}")
+        for idx, tok in enumerate(tokens):
+            if tokenized[idx] != tok["decoded_token"]:
+                raise AssertionError(
+                    f"Token mismatch at {idx}: "
+                    f"{tokenized[idx]} != {tok['decoded_token']}"
+                )
+
+        # Split on newline sentinel
+        split_prompt = prompt.replace("\n", " <NL> ")
+        words = split_prompt.split()
+
+        word_log_probs: List[Dict[str, Any]] = []
+        token_idx = 0
+
+        for word in words:
+            # Map sentinel back to actual newline for encoding
+            target = "\n" if word == "<NL>" else word
+            sub_ids = tokenizer.encode(target, add_special_tokens=False)
+            count = len(sub_ids)
+            if count == 0:
+                continue
+
+            subs = tokens[token_idx : token_idx + count]
+            avg_logprob = sum(s["logprob"] for s in subs) / count
+            prob = float(np.exp(avg_logprob))
+            word_log_probs.append({"word": target, "probability": prob})
+            token_idx += count
+
+        return word_log_probs, token_logprob_dicts  # type: ignore
+
+    def render_by_logprob(word_log_probs: List[Dict[str, Any]]) -> str:
+        """
+        Return an ANSI-colored string for word probabilities (red → green).
+        """
+        if not word_log_probs:
+            return ""
+
+        probs = [entry["probability"] for entry in word_log_probs]
+        min_p, max_p = min(probs), max(probs)
+        parts: List[str] = []
+
+        for entry in word_log_probs:
+            word = entry["word"]
+            # Preserve actual line breaks
+            if word == "\n":
+                parts.append("\n")
+                continue
+
+            p = entry["probability"]
+            norm = (p - min_p) / (max_p - min_p or 1.0)
+            r = int(255 * (1 - norm))  # red component (high when prob is low)
+            g = int(255 * norm)  # green component (high when prob is high)
+            b = 0  # no blue for red-green gradient
+            colored = f"\x1b[38;2;{r};{g};{b}m{word}\x1b[0m"
+            parts.append(colored + " ")
+
+        return "".join(parts).rstrip()
+
+    word_probs, token_logprob_dicts = await compute_word_log_probs(tokenizer, lm)
+    return word_probs, token_logprob_dicts, render_by_logprob(word_probs)
+
+
+# --------------------------------------------------------------------------- #
+# Async LLMTask class
+# --------------------------------------------------------------------------- #
+
+
+class AsyncLLMTask(ABC):
+    """
+    Async callable wrapper around an AsyncLM endpoint.
+
+    Sub-classes must set:
+        • lm          – the async language-model instance
+        • InputModel  – a Pydantic input class
+        • OutputModel – a Pydantic output class
+
+    Optional flags:
+        • temperature     – float (default 0.6)
+        • think           – bool (if the backend supports "chain-of-thought")
+        • add_json_schema – bool (include schema in the instruction)
+
+    The **docstring** of each sub-class is sent as the LM instruction.
+    Example
+    ```python
+    class DemoTask(AsyncLLMTask):
+        "TODO: SYSTEM_PROMPT_INSTURCTION HERE"
+
+        lm = AsyncLM(port=8130, cache=False, model="gpt-3.5-turbo")
+
+        class InputModel(BaseModel):
+            text_to_translate:str
+
+        class OutputModel(BaseModel):
+            translation:str
+            glossary_use:str
+
+        temperature = 0.6
+        think=False
+
+    demo_task = DemoTask()
+    result = await demo_task({'text_to_translate': 'Translate from english to vietnamese: Hello how are you'})
+    ```
+    """
+
+    lm: "AsyncLM"
+    InputModel: Type[BaseModel]
+    OutputModel: Type[BaseModel]
+
+    temperature: float = 0.6
+    think: bool = False
+    add_json_schema: bool = False
+
+    async def __call__(self, data: BaseModel | dict) -> BaseModel:
+        if (
+            not hasattr(self, "InputModel")
+            or not hasattr(self, "OutputModel")
+            or not hasattr(self, "lm")
+        ):
+            raise NotImplementedError(
+                f"{self.__class__.__name__} must define lm, InputModel, and OutputModel as class attributes."
+            )
+
+        item = data if isinstance(data, BaseModel) else self.InputModel(**data)
+
+        return await self.lm.parse(
+            prompt=item.model_dump_json(),
+            instruction=self.__doc__ or "",
+            response_model=self.OutputModel,
+            temperature=self.temperature,
+            think=self.think,
+            add_json_schema_to_instruction=self.add_json_schema,
+        )
+
+    def generate_training_data(
+        self, input_dict: Dict[str, Any], output: Dict[str, Any]
+    ):
+        """Return share gpt like format"""
+        system_prompt = self.__doc__ or ""
+        user_msg = self.InputModel(**input_dict).model_dump_json()  # type: ignore[attr-defined]
+        assistant_msg = self.OutputModel(**output).model_dump_json()  # type: ignore[attr-defined]
+        messages = get_conversation_one_turn(
+            system_msg=system_prompt, user_msg=user_msg, assistant_msg=assistant_msg
+        )
+        return {"messages": messages}
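The new `AsyncLM.parse` method above drives guided-JSON decoding into a Pydantic model, and `AsyncLLMTask.__call__` is a thin wrapper around it. A hedged usage sketch built only from the signatures shown in this hunk (the port and model name are placeholders taken from the `AsyncLLMTask` docstring, and the `guided_json` extra body assumes a vLLM-compatible endpoint):

```python
import asyncio

from pydantic import BaseModel

from llm_utils import AsyncLM  # re-exported at the package root in 1.0.20


class Translation(BaseModel):
    translation: str
    glossary_use: str


async def main() -> None:
    # Constructor arguments mirror the AsyncLLMTask docstring above;
    # adjust port/model for your own deployment.
    lm = AsyncLM(port=8130, cache=False, model="gpt-3.5-turbo")
    result = await lm.parse(
        response_model=Translation,
        instruction="Translate English to Vietnamese and note any glossary terms used.",
        prompt="Hello, how are you?",
        think=False,  # appends "/no_think" to the system message
        add_json_schema_to_instruction=True,
        temperature=0.6,
    )
    print(result.translation)


asyncio.run(main())
```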
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/chat_html.py
@@ -1,4 +1,5 @@
-from 
+from typing import Any, Optional, cast
+from .sync_lm import LM, Messages, LegacyMsgs, RawMsgs
 import sys
 
 # Configuration
@@ -231,7 +232,8 @@ class LMChatHtml(LM):
                     padding=display_padding,
                     inner_padding=display_inner_padding,
                 )
-                display_handle
+                if display_handle is not None:
+                    display_handle.update(HTML(html_content))
         else:
             # Console streaming mode (original behavior)
             for chunk in stream:
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/lm_json.py
@@ -1,11 +1,7 @@
-import 
-import re
-from functools import cache
-from typing import *
+from typing import Any, Optional
 
-from pydantic import BaseModel
 
-from llm_utils.lm.
+from llm_utils.lm.sync_lm import LM, RawMsgs
 
 
 class LMJson(LM):
speedy_utils-1.0.15/src/llm_utils/lm/lm.py → speedy_utils-1.0.20/src/llm_utils/lm/sync_lm.py
RENAMED
@@ -4,8 +4,8 @@ import base64
 import hashlib
 import json
 import os
-import warnings
 from abc import ABC
+from functools import lru_cache
 from typing import (
     Any,
     Dict,
@@ -20,10 +20,7 @@ from typing import (
     overload,
 )
 
-from httpx import URL
-from huggingface_hub import repo_info
 from loguru import logger
-from numpy import isin
 from openai import AuthenticationError, OpenAI, RateLimitError
 from openai.pagination import SyncPage
 from openai.types.chat import (
@@ -33,10 +30,10 @@ from openai.types.chat import (
     ChatCompletionToolMessageParam,
     ChatCompletionUserMessageParam,
 )
-from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
 from openai.types.model import Model
 from pydantic import BaseModel
 
+from llm_utils.chat_format.display import get_conversation_one_turn
 from speedy_utils.common.utils_io import jdumps
 
 # --------------------------------------------------------------------------- #
@@ -549,7 +546,7 @@ class LM:
 
         if think:
             post_fix += "\n\n/think"
-        elif think
+        elif not think:
             post_fix += "\n\n/no_think"
 
         assert isinstance(messages, list), "Messages must be a list."
@@ -675,20 +672,15 @@ class LM:
         return messages if messages else None
 
 
-from functools import lru_cache
-
-
 @lru_cache(maxsize=10)
 def get_tokenizer(model_name: str) -> Any:
-    from transformers import AutoTokenizer
+    from transformers import AutoTokenizer  # type: ignore
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer
 
 
 def inspect_word_probs(lm, tokenizer, messages):
-    import re
-    from typing import Any, Dict, List
 
     import numpy as np
 
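The corrected `elif not think:` branch above (present in both the sync and async classes) means the reasoning switch now falls back to `/no_think` whenever `think` is not truthy. A small standalone sketch of just that suffix logic, separate from the rest of `LM.parse`:

```python
from typing import Optional


def think_suffix(think: Optional[bool]) -> str:
    """Mirror the fixed branch: '/think' only when think is truthy."""
    post_fix = ""
    if think:
        post_fix += "\n\n/think"
    elif not think:  # False and None both end up here
        post_fix += "\n\n/no_think"
    return post_fix


assert think_suffix(True).endswith("/think")
assert think_suffix(False).endswith("/no_think")
assert think_suffix(None).endswith("/no_think")
```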
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/llm_utils/lm/utils.py
@@ -82,7 +82,7 @@ def retry_on_exception(max_retries=10, exceptions=(Exception,), sleep_time=3):
             try:
                 return func(self, *args, **kwargs)
             except exceptions as e:
-                import litellm
+                import litellm  # type: ignore
 
                 if isinstance(
                     e, (litellm.exceptions.APIError, litellm.exceptions.Timeout)
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/function_decorator.py
@@ -2,7 +2,7 @@ import functools
 import time
 import traceback
 from collections.abc import Callable
-from typing import Any
+from typing import Any
 
 from loguru import logger
 
@@ -26,8 +26,6 @@ def retry_runtime(
     def decorator(func: Callable) -> Callable:
         @functools.wraps(func)
         def wrapper(*args: Any, **kwargs: Any) -> Any:
-            last_exception = None
-
             for attempt in range(1, max_retry + 1):
                 try:
                     return func(*args, **kwargs)
@@ -40,7 +38,6 @@
                     raise
 
                 except exceptions as e:
-                    last_exception = e
                     if attempt == max_retry:
                         logger.opt(depth=1).error(
                             f"Function {func.__name__} failed after {max_retry} retries: {str(e)}"
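The `retry_runtime` decorator above now relies solely on the loop counter instead of tracking `last_exception`. A hedged usage sketch (the decorator's full parameter list is not visible in this hunk, so only `max_retry` is assumed):

```python
from speedy_utils.common.function_decorator import retry_runtime


@retry_runtime(max_retry=3)
def flaky_step() -> str:
    # Any exception matched by the decorator's `exceptions` filter is retried
    # up to max_retry times; the final failure is logged and re-raised.
    raise RuntimeError("transient failure")
```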
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/report_manager.py
@@ -2,7 +2,6 @@ import os
 from collections import defaultdict
 from datetime import datetime
 
-from fastcore.all import threaded
 
 
 class ReportManager:
@@ -40,7 +39,7 @@ class ReportManager:
 
         md_content.extend(
             [
-
+                "\n### Results Overview",
                 f"- Total items processed: {len(results)}",
                 f"- Success rate: {(len(results) - len(errors))/len(results)*100:.1f}%",
                 f"- Total errors: {len(errors)}",
@@ -61,7 +60,7 @@ class ReportManager:
         for error_type, errs in error_groups.items():
             md_content.extend(
                 [
-
+                    "\n<details>",
                     f"<summary><b>{error_type}</b> ({len(errs)} occurrences)</summary>\n",
                     "| Index | Input | Error Message |",
                     "|-------|-------|---------------|",
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/common/utils_cache.py
@@ -5,7 +5,8 @@ import os
 import os.path as osp
 import pickle
 import uuid
-from 
+from threading import Lock
+from typing import Any, Literal
 
 import cachetools
 import pandas as pd
@@ -13,12 +14,11 @@ import xxhash
 from loguru import logger
 from pydantic import BaseModel
 
-from .utils_io import dump_json_or_pickle, load_json_or_pickle
-from .utils_misc import mkdir_or_exist
+from src.speedy_utils.common.utils_io import dump_json_or_pickle, load_json_or_pickle
+from src.speedy_utils.common.utils_misc import mkdir_or_exist
 
 SPEED_CACHE_DIR = osp.join(osp.expanduser("~"), ".cache/speedy_cache")
 LRU_MEM_CACHE = cachetools.LRUCache(maxsize=128_000)
-from threading import Lock
 
 thread_locker = Lock()
 
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/multi_worker/process.py
@@ -1,12 +1,10 @@
-import inspect
 import multiprocessing
 import os
-import time
 import traceback
-from collections.abc import Callable, Iterable, Iterator
+from collections.abc import Callable, Iterable, Iterator
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from itertools import islice
-from typing import Any,
+from typing import Any, TypeVar, cast
 
 T = TypeVar("T")
 
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/scripts/mpython.py
@@ -5,7 +5,6 @@ import multiprocessing  # Import multiprocessing module
 import os
 import shlex  # To properly escape command line arguments
 import shutil
-import subprocess
 
 taskset_path = shutil.which("taskset")
 
@@ -80,7 +79,7 @@ def main():
     cmd_str = shlex.join(args.cmd)
 
     gpus = args.gpus.split(",")
-    gpus = [gpu for gpu in gpus if not
+    gpus = [gpu for gpu in gpus if gpu not in args.ignore_gpus.split(",")]
     num_gpus = len(gpus)
 
     cpu_per_process = max(args.total_cpu // args.total_fold, 1)
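The rewritten comprehension above excludes every GPU id passed via the ignore list. A standalone illustration with hypothetical values standing in for `args.gpus` and `args.ignore_gpus`:

```python
# Stand-ins for args.gpus and args.ignore_gpus from mpython's argument parser.
gpus = "0,1,2,3".split(",")
ignore_gpus = "1,3".split(",")

gpus = [gpu for gpu in gpus if gpu not in ignore_gpus]
assert gpus == ["0", "2"]
```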
{speedy_utils-1.0.15 → speedy_utils-1.0.20}/src/speedy_utils/scripts/openapi_client_codegen.py
@@ -10,12 +10,8 @@ Usage:
 
 import argparse
 import json
-import os
-import signal
-import subprocess
 import sys
-import 
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
 
 
 def pascal_case(s: str) -> str:
speedy_utils-1.0.15/src/llm_utils/scripts/example_vllm_client.py
@@ -1,269 +0,0 @@
-"""
-Beautiful example script for interacting with VLLM server.
-
-This script demonstrates various ways to use the VLLM API server
-for text generation tasks.
-"""
-
-import asyncio
-import json
-from typing import Dict, List, Optional, Any
-
-import aiohttp
-from loguru import logger
-from pydantic import BaseModel, Field
-
-
-class VLLMRequest(BaseModel):
-    """Request model for VLLM API."""
-    prompt: str
-    max_tokens: int = Field(default=512, ge=1, le=8192)
-    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
-    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
-    stream: bool = False
-    stop: Optional[List[str]] = None
-
-
-class VLLMResponse(BaseModel):
-    """Response model from VLLM API."""
-    text: str
-    finish_reason: str
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-
-
-class VLLMClient:
-    """Client for interacting with VLLM server."""
-
-    def __init__(self, base_url: str = 'http://localhost:8140'):
-        self.base_url = base_url
-        self.model_name = 'selfeval_8b'
-
-    async def generate_text(
-        self,
-        request: VLLMRequest
-    ) -> VLLMResponse:
-        """Generate text using VLLM API."""
-        url = f'{self.base_url}/v1/completions'
-
-        payload = {
-            'model': self.model_name,
-            'prompt': request.prompt,
-            'max_tokens': request.max_tokens,
-            'temperature': request.temperature,
-            'top_p': request.top_p,
-            'stream': request.stream,
-        }
-
-        if request.stop:
-            payload['stop'] = request.stop
-
-        async with aiohttp.ClientSession() as session:
-            try:
-                async with session.post(
-                    url,
-                    json=payload,
-                    timeout=aiohttp.ClientTimeout(total=60)
-                ) as response:
-                    response.raise_for_status()
-                    data = await response.json()
-
-                    choice = data['choices'][0]
-                    usage = data['usage']
-
-                    return VLLMResponse(
-                        text=choice['text'],
-                        finish_reason=choice['finish_reason'],
-                        prompt_tokens=usage['prompt_tokens'],
-                        completion_tokens=usage['completion_tokens'],
-                        total_tokens=usage['total_tokens']
-                    )
-
-            except aiohttp.ClientError as e:
-                logger.error(f'HTTP error: {e}')
-                raise
-            except Exception as e:
-                logger.error(f'Unexpected error: {e}')
-                raise
-
-    async def generate_batch(
-        self,
-        requests: List[VLLMRequest]
-    ) -> List[VLLMResponse]:
-        """Generate text for multiple requests concurrently."""
-        tasks = [self.generate_text(req) for req in requests]
-        return await asyncio.gather(*tasks, return_exceptions=True)
-
-    async def health_check(self) -> bool:
-        """Check if the VLLM server is healthy."""
-        url = f'{self.base_url}/health'
-
-        try:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(
-                    url,
-                    timeout=aiohttp.ClientTimeout(total=10)
-                ) as response:
-                    return response.status == 200
-        except Exception as e:
-            logger.warning(f'Health check failed: {e}')
-            return False
-
-
-async def example_basic_generation():
-    """Example: Basic text generation."""
-    logger.info('🚀 Running basic generation example')
-
-    client = VLLMClient()
-
-    # Check server health
-    if not await client.health_check():
-        logger.error('❌ Server is not healthy')
-        return
-
-    request = VLLMRequest(
-        prompt='Explain the concept of machine learning in simple terms:',
-        max_tokens=256,
-        temperature=0.7,
-        stop=['\n\n']
-    )
-
-    try:
-        response = await client.generate_text(request)
-
-        logger.success('✅ Generation completed')
-        logger.info(f'📝 Generated text:\n{response.text}')
-        logger.info(f'📊 Tokens: {response.total_tokens} total '
-                    f'({response.prompt_tokens} prompt + '
-                    f'{response.completion_tokens} completion)')
-
-    except Exception as e:
-        logger.error(f'❌ Generation failed: {e}')
-
-
-async def example_batch_generation():
-    """Example: Batch text generation."""
-    logger.info('🚀 Running batch generation example')
-
-    client = VLLMClient()
-
-    prompts = [
-        'What is artificial intelligence?',
-        'Explain quantum computing briefly:',
-        'What are the benefits of renewable energy?'
-    ]
-
-    requests = [
-        VLLMRequest(
-            prompt=prompt,
-            max_tokens=128,
-            temperature=0.8
-        ) for prompt in prompts
-    ]
-
-    try:
-        responses = await client.generate_batch(requests)
-
-        for i, response in enumerate(responses):
-            if isinstance(response, Exception):
-                logger.error(f'❌ Request {i+1} failed: {response}')
-            else:
-                logger.success(f'✅ Request {i+1} completed')
-                logger.info(f'📝 Response {i+1}:\n{response.text}\n')
-
-    except Exception as e:
-        logger.error(f'❌ Batch generation failed: {e}')
-
-
-async def example_creative_writing():
-    """Example: Creative writing with specific parameters."""
-    logger.info('🚀 Running creative writing example')
-
-    client = VLLMClient()
-
-    request = VLLMRequest(
-        prompt=(
-            'Write a short story about a robot discovering emotions. '
-            'The story should be exactly 3 paragraphs:\n\n'
-        ),
-        max_tokens=400,
-        temperature=1.2,  # Higher temperature for creativity
-        top_p=0.95,
-        stop=['THE END', '\n\n\n']
-    )
-
-    try:
-        response = await client.generate_text(request)
-
-        logger.success('✅ Creative writing completed')
-        logger.info(f'📚 Story:\n{response.text}')
-        logger.info(f'🎯 Finish reason: {response.finish_reason}')
-
-    except Exception as e:
-        logger.error(f'❌ Creative writing failed: {e}')
-
-
-async def example_code_generation():
-    """Example: Code generation."""
-    logger.info('🚀 Running code generation example')
-
-    client = VLLMClient()
-
-    request = VLLMRequest(
-        prompt=(
-            'Write a Python function that calculates the fibonacci '
-            'sequence up to n terms:\n\n```python\n'
-        ),
-        max_tokens=300,
-        temperature=0.2,  # Lower temperature for code
-        stop=['```', '\n\n\n']
-    )
-
-    try:
-        response = await client.generate_text(request)
-
-        logger.success('✅ Code generation completed')
-        logger.info(f'💻 Generated code:\n```python\n{response.text}\n```')
-
-    except Exception as e:
-        logger.error(f'❌ Code generation failed: {e}')
-
-
-async def main():
-    """Run all examples."""
-    logger.info('🎯 Starting VLLM Client Examples')
-    logger.info('=' * 50)
-
-    examples = [
-        example_basic_generation,
-        example_batch_generation,
-        example_creative_writing,
-        example_code_generation
-    ]
-
-    for example in examples:
-        await example()
-        logger.info('-' * 50)
-        await asyncio.sleep(1)  # Brief pause between examples
-
-    logger.info('🎉 All examples completed!')
-
-
-if __name__ == '__main__':
-    # Configure logger
-    logger.remove()
-    logger.add(
-        lambda msg: print(msg, end=''),
-        format='<green>{time:HH:mm:ss}</green> | '
-               '<level>{level: <8}</level> | '
-               '<cyan>{message}</cyan>',
-        level='INFO'
-    )
-
-    try:
-        asyncio.run(main())
-    except KeyboardInterrupt:
-        logger.info('\n👋 Goodbye!')
-    except Exception as e:
-        logger.error(f'❌ Script failed: {e}')
speedy_utils-1.0.15/src/llm_utils/scripts/serve_script.sh
@@ -1,2 +0,0 @@
-HF_HOME=/home/anhvth5/.cache/huggingface CUDA_VISIBLE_DEVICES=0 /home/anhvth5/miniconda3/envs/unsloth_env/bin/vllm serve ./outputs/8B_selfeval_retranslate/Qwen3-8B_2025_05_30/ls_response_only_r8_a8_sq8192_lr5e_06_bz64_ep1_4/ --port 8140 --tensor-parallel 1 --gpu-memory-utilization 0.9 --dtype auto --max-model-len 8192 --enable-prefix-caching --disable-log-requests --served-model-name selfeval_8b
-Logging to /tmp/vllm_8140.txt
The remaining 15 files, listed above with +0 -0, are unchanged between 1.0.15 and 1.0.20.