speedy-utils 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_utils/chat_format/display.py +17 -4
- llm_utils/lm/async_lm/__init__.py +2 -0
- llm_utils/lm/async_lm/_utils.py +198 -0
- llm_utils/lm/async_lm/async_llm_task.py +154 -0
- llm_utils/lm/{async_lm.py → async_lm/async_lm.py} +191 -354
- llm_utils/scripts/vllm_load_balancer.py +220 -135
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/METADATA +1 -1
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/RECORD +10 -7
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/WHEEL +0 -0
- {speedy_utils-1.1.4.dist-info → speedy_utils-1.1.6.dist-info}/entry_points.txt +0 -0
llm_utils/chat_format/display.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations
+
+from difflib import SequenceMatcher
 from typing import Any, Optional
+
 from IPython.display import HTML, display
-from difflib import SequenceMatcher
 
 
 def show_chat(
@@ -19,6 +21,17 @@ def show_chat(
         isinstance(msg, dict) and "role" in msg and "content" in msg for msg in msgs
     ), "The input format is not recognized. Please specify the input format."
 
+    if isinstance(msgs[-1], dict) and "choices" in msgs[-1]:
+        message = msgs[-1]["choices"][0]["message"]
+        reasoning_content = message.get("reasoning_content")
+        content = message.get("content", "")
+        if reasoning_content:
+            content = reasoning_content + "\n" + content
+        msgs[-1] = {
+            "role": message["role"],
+            "content": content,
+        }
+
     themes: dict[str, dict[str, dict[str, str]]] = {
         "default": {
             "system": {"background": "#ffaaaa", "text": "#222222"},  # More red
@@ -156,9 +169,9 @@ def get_conversation_one_turn(
     if assistant_msg is not None:
         messages.append({"role": "assistant", "content": assistant_msg})
     if assistant_prefix is not None:
-        assert (
-            return_format
-        )
+        assert return_format != "chatml", (
+            'Change return_format to "text" if you want to use assistant_prefix'
+        )
     assert messages[-1]["role"] == "user"
     from .transform import transform_messages
 
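With the tightened assertion, `assistant_prefix` is only meant to be combined with a text-style `return_format`. A hedged sketch of a call that satisfies the new check; only the keyword arguments visible in this diff are used, and the exact return value is an assumption:

```python
# Sketch only: exercises the new assistant_prefix/return_format check.
# Keyword names follow the parameters visible in this diff; the return value
# shape is not shown here and is left unspecified.
from llm_utils.chat_format.display import get_conversation_one_turn

conv = get_conversation_one_turn(
    system_msg="You are a terse assistant.",
    user_msg="Name one prime number.",
    assistant_prefix="Answer:",   # requires a non-chatml return_format
    return_format="text",         # "chatml" would now trip the assertion
)
print(conv)
```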
llm_utils/lm/async_lm/_utils.py
ADDED
@@ -0,0 +1,198 @@
from functools import lru_cache
from typing import (
    Any,
    Dict,
    Generic,
    List,
    TypeVar,
    Union,
)

# from openai.pagination import AsyncSyncPage
from openai.types.chat import (
    ChatCompletionMessageParam,
)
from pydantic import BaseModel
from typing_extensions import TypedDict

# --------------------------------------------------------------------------- #
# type helpers
# --------------------------------------------------------------------------- #
TModel = TypeVar("TModel", bound=BaseModel)
Messages = List[ChatCompletionMessageParam]
LegacyMsgs = List[Dict[str, str]]
RawMsgs = Union[Messages, LegacyMsgs]

# --------------------------------------------------------------------------- #
# color helpers (unchanged)
# --------------------------------------------------------------------------- #


def _color(code: int, text: str) -> str:
    return f"\x1b[{code}m{text}\x1b[0m"


def _red(t):
    return _color(31, t)


def _green(t):
    return _color(32, t)


def _blue(t):
    return _color(34, t)


def _yellow(t):
    return _color(33, t)


TParsed = TypeVar("TParsed", bound=BaseModel)


class ParsedOutput(TypedDict, Generic[TParsed]):
    messages: List
    completion: Any
    parsed: TParsed


# --------------------------------------------------------------------------- #
# Module-level utility functions (async versions)
# --------------------------------------------------------------------------- #


@lru_cache(maxsize=10)
def get_tokenizer(model_name: str) -> Any:
    """Get tokenizer for the given model."""
    from transformers import AutoTokenizer  # type: ignore

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer


async def inspect_word_probs_async(lm, tokenizer, messages):
    """Async version of inspect_word_probs."""

    import numpy as np

    async def compute_word_log_probs(
        tokenizer: Any,
        lm_client: Any,
    ) -> tuple[List[Dict[str, Any]], Any]:
        # Build a prompt that preserves literal newlines
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # Don't tokenize yet, we need raw text
            add_generation_prompt=False,  # No generation prompt needed
        )

        # Request token logprobs
        response = await lm_client.client.completions.create(
            model=lm_client.model,  # type: ignore
            prompt=prompt,
            max_tokens=1,
            logprobs=1,
            extra_body={"prompt_logprobs": 0},
        )
        token_logprob_dicts = response.choices[0].prompt_logprobs  # type: ignore

        # Override first token to known start marker
        start_id = tokenizer.encode("<|im_start|>")[0]
        token_logprob_dicts[0] = {
            str(start_id): {
                "logprob": -1,
                "rank": 1,
                "decoded_token": "<|im_start|>",
            }
        }

        # Flatten tokens
        tokens: List[Dict[str, Any]] = [
            {"id": int(tid), **tdata}
            for td in token_logprob_dicts
            for tid, tdata in td.items()
        ]

        # Validate tokenization
        tokenized = tokenizer.tokenize(prompt)
        if len(tokenized) != len(tokens):
            raise ValueError(f"Token count mismatch: {len(tokenized)} vs {len(tokens)}")
        for idx, tok in enumerate(tokens):
            if tokenized[idx] != tok["decoded_token"]:
                raise AssertionError(
                    f"Token mismatch at {idx}: "
                    f"{tokenized[idx]} != {tok['decoded_token']}"
                )

        # Split on newline sentinel
        split_prompt = prompt.replace("\n", " <NL> ")
        words = split_prompt.split()

        word_log_probs: List[Dict[str, Any]] = []
        token_idx = 0

        for word in words:
            # Map sentinel back to actual newline for encoding
            target = "\n" if word == "<NL>" else word
            sub_ids = tokenizer.encode(target, add_special_tokens=False)
            count = len(sub_ids)
            if count == 0:
                continue

            subs = tokens[token_idx : token_idx + count]
            avg_logprob = sum(s["logprob"] for s in subs) / count
            prob = float(np.exp(avg_logprob))
            word_log_probs.append({"word": target, "probability": prob})
            token_idx += count

        return word_log_probs, token_logprob_dicts  # type: ignore

    def render_by_logprob(word_log_probs: List[Dict[str, Any]]) -> str:
        """
        Return an ANSI-colored string for word probabilities (red → green).
        """
        if not word_log_probs:
            return ""

        probs = [entry["probability"] for entry in word_log_probs]
        min_p, max_p = min(probs), max(probs)
        parts: List[str] = []

        for entry in word_log_probs:
            word = entry["word"]
            # Preserve actual line breaks
            if word == "\n":
                parts.append("\n")
                continue

            p = entry["probability"]
            norm = (p - min_p) / (max_p - min_p or 1.0)
            r = int(255 * (1 - norm))  # red component (high when prob is low)
            g = int(255 * norm)  # green component (high when prob is high)
            b = 0  # no blue for red-green gradient
            colored = f"\x1b[38;2;{r};{g};{b}m{word}\x1b[0m"
            parts.append(colored + " ")

        return "".join(parts).rstrip()

    word_probs, token_logprob_dicts = await compute_word_log_probs(tokenizer, lm)
    return word_probs, token_logprob_dicts, render_by_logprob(word_probs)


__all__ = [
    "TModel",
    "Messages",
    "LegacyMsgs",
    "RawMsgs",
    "TParsed",
    "ParsedOutput",
    "get_tokenizer",
    "inspect_word_probs_async",
    "_color",
    "_red",
    "_green",
    "_blue",
    "_yellow",
]
# --------------------------------------------------------------------------- #
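`inspect_word_probs_async` requests prompt logprobs from the completions endpoint (via `extra_body={"prompt_logprobs": 0}`, i.e. a vLLM-style server), averages sub-token logprobs per whitespace-delimited word, and returns the per-word probabilities alongside an ANSI red-to-green rendering. A sketch of how it might be wired up; the import paths follow the file layout in this diff, the port and model name are placeholders, and it assumes a ChatML-style tokenizer (the code hard-codes `<|im_start|>`) plus an `AsyncLM` exposing `.client` and `.model` as used above:

```python
# Sketch only: wiring for inspect_word_probs_async under assumed arguments.
# The AsyncLM(port=..., model=...) keywords mirror the DemoTask docstring in
# async_llm_task.py; the values here are placeholders, not defaults.
import asyncio

from llm_utils.lm.async_lm.async_lm import AsyncLM
from llm_utils.lm.async_lm._utils import get_tokenizer, inspect_word_probs_async


async def main() -> None:
    model_name = "Qwen/Qwen2.5-7B-Instruct"          # placeholder ChatML model
    lm = AsyncLM(port=8130, model=model_name)         # placeholder local endpoint
    tokenizer = get_tokenizer(model_name)             # cached via lru_cache

    messages = [{"role": "user", "content": "Hello, how are you?"}]
    word_probs, token_logprob_dicts, colored = await inspect_word_probs_async(
        lm, tokenizer, messages
    )

    print(colored)        # words tinted red (low prob) to green (high prob)
    print(word_probs[:5]) # [{'word': ..., 'probability': ...}, ...]


asyncio.run(main())
```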
llm_utils/lm/async_lm/async_llm_task.py
ADDED
@@ -0,0 +1,154 @@
from abc import ABC
from typing import (
    Any,
    Dict,
    Generic,
    List,
    Optional,
    TypeVar,
    Union,
    cast,
)

# from openai.pagination import AsyncSyncPage
from openai.types.chat import (
    ChatCompletionMessageParam,
)
from pydantic import BaseModel

from llm_utils.chat_format.display import get_conversation_one_turn

from .async_lm import AsyncLM

# --------------------------------------------------------------------------- #
# type helpers
# --------------------------------------------------------------------------- #
TModel = TypeVar("TModel", bound=BaseModel)
Messages = List[ChatCompletionMessageParam]
LegacyMsgs = List[Dict[str, str]]
RawMsgs = Union[Messages, LegacyMsgs]


# --------------------------------------------------------------------------- #
# Async LLMTask class
# --------------------------------------------------------------------------- #

InputModelType = TypeVar("InputModelType", bound=BaseModel)
OutputModelType = TypeVar("OutputModelType", bound=BaseModel)


class AsyncLLMTask(ABC, Generic[InputModelType, OutputModelType]):
    """
    Async callable wrapper around an AsyncLM endpoint.

    Sub-classes must set:
        • lm          – the async language-model instance
        • InputModel  – a Pydantic input class
        • OutputModel – a Pydantic output class

    Optional flags:
        • temperature     – float (default 0.6)
        • think           – bool (if the backend supports "chain-of-thought")
        • add_json_schema – bool (include schema in the instruction)

    The **docstring** of each sub-class is sent as the LM instruction.
    Example
    ```python
    class DemoTask(AsyncLLMTask):
        "TODO: SYSTEM_PROMPT_INSTURCTION HERE"

        lm = AsyncLM(port=8130, cache=False, model="gpt-3.5-turbo")

        class InputModel(BaseModel):
            text_to_translate: str

        class OutputModel(BaseModel):
            translation: str
            glossary_use: str

        temperature = 0.6
        think = False

    demo_task = DemoTask()
    result = await demo_task({'text_to_translate': 'Translate from english to vietnamese: Hello how are you'})
    ```
    """

    lm: "AsyncLM"
    InputModel: InputModelType
    OutputModel: OutputModelType

    temperature: float = 0.6
    think: bool = False
    add_json_schema: bool = False
    cache: bool = False

    async def __call__(
        self,
        data: BaseModel | dict,
        temperature: float = 0.1,
        cache: bool = False,
        think: Optional[bool] = None,  # if not None, overrides self.think
    ) -> tuple[OutputModelType, List[Dict[str, Any]]]:
        # Get the input and output model types from the generic parameters
        type_args = getattr(self.__class__, "__orig_bases__", None)
        if (
            type_args
            and hasattr(type_args[0], "__args__")
            and len(type_args[0].__args__) >= 2
        ):
            input_model = type_args[0].__args__[0]
            output_model = type_args[0].__args__[1]
        else:
            # Fallback to the old way if type introspection fails
            if (
                not hasattr(self, "InputModel")
                or not hasattr(self, "OutputModel")
                or not hasattr(self, "lm")
            ):
                raise NotImplementedError(
                    f"{self.__class__.__name__} must define lm, InputModel, and OutputModel as class attributes or use proper generic typing."
                )
            input_model = self.InputModel
            output_model = self.OutputModel

        # Ensure input_model is a class before calling
        if isinstance(data, BaseModel):
            item = data
        elif isinstance(input_model, type) and issubclass(input_model, BaseModel):
            item = input_model(**data)
        else:
            raise TypeError("InputModel must be a subclass of BaseModel")

        assert isinstance(output_model, type) and issubclass(output_model, BaseModel), (
            "OutputModel must be a subclass of BaseModel"
        )

        result = await self.lm.parse(
            prompt=item.model_dump_json(),
            instruction=self.__doc__ or "",
            response_model=output_model,
            temperature=temperature or self.temperature,
            think=think if think is not None else self.think,
            add_json_schema_to_instruction=self.add_json_schema,
            cache=self.cache or cache,
        )

        return (
            cast(OutputModelType, result["parsed"]),  # type: ignore
            cast(List[dict], result["messages"]),  # type: ignore
        )

    def generate_training_data(
        self, input_dict: Dict[str, Any], output: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return share gpt like format"""
        system_prompt = self.__doc__ or ""
        user_msg = self.InputModel(**input_dict).model_dump_json()  # type: ignore[attr-defined]
        assistant_msg = self.OutputModel(**output).model_dump_json()  # type: ignore[attr-defined]
        messages = get_conversation_one_turn(
            system_msg=system_prompt, user_msg=user_msg, assistant_msg=assistant_msg
        )
        return {"messages": messages}

    arun = __call__  # alias for compatibility with other LLMTask implementations