@modular-prompt/driver 0.11.15 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/anthropic/anthropic-driver.d.ts +38 -8
- package/dist/anthropic/anthropic-driver.d.ts.map +1 -1
- package/dist/anthropic/anthropic-driver.js +180 -164
- package/dist/anthropic/anthropic-driver.js.map +1 -1
- package/dist/cache-controller.d.ts +28 -0
- package/dist/cache-controller.d.ts.map +1 -0
- package/dist/cache-controller.js +2 -0
- package/dist/cache-controller.js.map +1 -0
- package/dist/cache-utils.d.ts +20 -0
- package/dist/cache-utils.d.ts.map +1 -0
- package/dist/cache-utils.js +71 -0
- package/dist/cache-utils.js.map +1 -0
- package/dist/content-utils.d.ts +9 -0
- package/dist/content-utils.d.ts.map +1 -1
- package/dist/content-utils.js +47 -0
- package/dist/content-utils.js.map +1 -1
- package/dist/driver-registry/config-based-factory.d.ts.map +1 -1
- package/dist/driver-registry/config-based-factory.js +7 -0
- package/dist/driver-registry/config-based-factory.js.map +1 -1
- package/dist/driver-registry/factory-helper.d.ts.map +1 -1
- package/dist/driver-registry/factory-helper.js +7 -4
- package/dist/driver-registry/factory-helper.js.map +1 -1
- package/dist/driver-registry/types.d.ts +6 -0
- package/dist/driver-registry/types.d.ts.map +1 -1
- package/dist/formatter/converter.js +1 -1
- package/dist/formatter/converter.js.map +1 -1
- package/dist/google-genai/element-converter.d.ts +11 -0
- package/dist/google-genai/element-converter.d.ts.map +1 -0
- package/dist/google-genai/element-converter.js +126 -0
- package/dist/google-genai/element-converter.js.map +1 -0
- package/dist/google-genai/google-genai-cache-controller.d.ts +24 -0
- package/dist/google-genai/google-genai-cache-controller.d.ts.map +1 -0
- package/dist/google-genai/google-genai-cache-controller.js +127 -0
- package/dist/google-genai/google-genai-cache-controller.js.map +1 -0
- package/dist/google-genai/google-genai-driver.d.ts +5 -29
- package/dist/google-genai/google-genai-driver.d.ts.map +1 -1
- package/dist/google-genai/google-genai-driver.js +92 -255
- package/dist/google-genai/google-genai-driver.js.map +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/mlx-ml/mlx-cache-controller.d.ts +66 -0
- package/dist/mlx-ml/mlx-cache-controller.d.ts.map +1 -0
- package/dist/mlx-ml/mlx-cache-controller.js +600 -0
- package/dist/mlx-ml/mlx-cache-controller.js.map +1 -0
- package/dist/mlx-ml/mlx-driver.d.ts +13 -8
- package/dist/mlx-ml/mlx-driver.d.ts.map +1 -1
- package/dist/mlx-ml/mlx-driver.js +202 -143
- package/dist/mlx-ml/mlx-driver.js.map +1 -1
- package/dist/mlx-ml/mlx-message-utils.d.ts +9 -0
- package/dist/mlx-ml/mlx-message-utils.d.ts.map +1 -0
- package/dist/mlx-ml/mlx-message-utils.js +71 -0
- package/dist/mlx-ml/mlx-message-utils.js.map +1 -0
- package/dist/mlx-ml/process/harmony-parser.d.ts +3 -0
- package/dist/mlx-ml/process/harmony-parser.d.ts.map +1 -0
- package/dist/mlx-ml/process/harmony-parser.js +175 -0
- package/dist/mlx-ml/process/harmony-parser.js.map +1 -0
- package/dist/mlx-ml/process/index.d.ts +7 -3
- package/dist/mlx-ml/process/index.d.ts.map +1 -1
- package/dist/mlx-ml/process/index.js +22 -7
- package/dist/mlx-ml/process/index.js.map +1 -1
- package/dist/mlx-ml/process/model-handlers.d.ts +11 -58
- package/dist/mlx-ml/process/model-handlers.d.ts.map +1 -1
- package/dist/mlx-ml/process/model-handlers.js +29 -11
- package/dist/mlx-ml/process/model-handlers.js.map +1 -1
- package/dist/mlx-ml/process/model-specific.d.ts +7 -0
- package/dist/mlx-ml/process/model-specific.d.ts.map +1 -1
- package/dist/mlx-ml/process/model-specific.js +3 -0
- package/dist/mlx-ml/process/model-specific.js.map +1 -1
- package/dist/mlx-ml/process/parameter-validator.d.ts.map +1 -1
- package/dist/mlx-ml/process/parameter-validator.js +10 -3
- package/dist/mlx-ml/process/parameter-validator.js.map +1 -1
- package/dist/mlx-ml/process/process-communication.d.ts +3 -0
- package/dist/mlx-ml/process/process-communication.d.ts.map +1 -1
- package/dist/mlx-ml/process/process-communication.js +13 -0
- package/dist/mlx-ml/process/process-communication.js.map +1 -1
- package/dist/mlx-ml/process/queue.d.ts +5 -2
- package/dist/mlx-ml/process/queue.d.ts.map +1 -1
- package/dist/mlx-ml/process/queue.js +103 -15
- package/dist/mlx-ml/process/queue.js.map +1 -1
- package/dist/mlx-ml/process/response-processor.d.ts +18 -0
- package/dist/mlx-ml/process/response-processor.d.ts.map +1 -0
- package/dist/mlx-ml/process/response-processor.js +24 -0
- package/dist/mlx-ml/process/response-processor.js.map +1 -0
- package/dist/mlx-ml/process/types.d.ts +51 -4
- package/dist/mlx-ml/process/types.d.ts.map +1 -1
- package/dist/mlx-ml/tool-call-parser.d.ts.map +1 -1
- package/dist/mlx-ml/tool-call-parser.js +44 -68
- package/dist/mlx-ml/tool-call-parser.js.map +1 -1
- package/dist/mlx-ml/types.d.ts +1 -0
- package/dist/mlx-ml/types.d.ts.map +1 -1
- package/dist/openai/openai-driver.d.ts +0 -2
- package/dist/openai/openai-driver.d.ts.map +1 -1
- package/dist/openai/openai-driver.js.map +1 -1
- package/dist/types.d.ts +9 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +7 -4
- package/src/mlx-ml/python/__main__.py +41 -425
- package/src/mlx-ml/python/backends/__init__.py +3 -0
- package/src/mlx-ml/python/backends/base.py +84 -0
- package/src/mlx-ml/python/backends/mlx_lm.py +202 -0
- package/src/mlx-ml/python/backends/mlx_vlm.py +99 -0
- package/src/mlx-ml/python/examples/example_basic.py +93 -0
- package/src/mlx-ml/python/examples/example_tool_call.py +165 -0
- package/src/mlx-ml/python/handlers/__init__.py +6 -0
- package/src/mlx-ml/python/handlers/cache.py +81 -0
- package/src/mlx-ml/python/handlers/capabilities.py +6 -0
- package/src/mlx-ml/python/handlers/chat.py +221 -0
- package/src/mlx-ml/python/handlers/completion.py +36 -0
- package/src/mlx-ml/python/handlers/format_test.py +70 -0
- package/src/mlx-ml/python/handlers/tokenize.py +63 -0
- package/src/mlx-ml/python/pyproject.toml +15 -5
- package/src/mlx-ml/python/server.py +126 -0
- package/src/mlx-ml/python/tests/__init__.py +0 -0
- package/src/mlx-ml/python/utils/__init__.py +0 -0
- package/src/mlx-ml/python/utils/prompt_builder.py +54 -0
- package/src/mlx-ml/python/{token_utils.py → utils/token_utils.py} +13 -5
- package/src/mlx-ml/python/uv.lock +299 -57
- /package/src/mlx-ml/python/{chat_template_constraints.py → utils/chat_template_constraints.py} +0 -0
- /package/src/mlx-ml/python/{vlm_utils.py → utils/vlm_utils.py} +0 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from backends.base import ModelBackend
|
|
8
|
+
from mlx_lm.models.cache import trim_prompt_cache
|
|
9
|
+
from utils.prompt_builder import generate_merged_prompt, supports_chat_template
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _read_cache_token_count(cache_path: str) -> int | None:
|
|
13
|
+
"""Read token count from the sidecar .meta.json file."""
|
|
14
|
+
meta_path = cache_path + '.meta.json'
|
|
15
|
+
try:
|
|
16
|
+
with open(meta_path) as f:
|
|
17
|
+
meta = json.load(f)
|
|
18
|
+
count = meta.get('token_count')
|
|
19
|
+
return int(count) if count is not None else None
|
|
20
|
+
except (FileNotFoundError, json.JSONDecodeError, ValueError, TypeError):
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _stream_to_stdout(
|
|
25
|
+
backend: ModelBackend,
|
|
26
|
+
prompt: str | list[int],
|
|
27
|
+
options: dict,
|
|
28
|
+
images: list | None = None,
|
|
29
|
+
primer: str | None = None,
|
|
30
|
+
prompt_cache: list | None = None,
|
|
31
|
+
) -> None:
|
|
32
|
+
if primer is not None:
|
|
33
|
+
print(primer, end="", flush=True)
|
|
34
|
+
|
|
35
|
+
last_response = None
|
|
36
|
+
for response in backend.stream_generate(prompt, options, images, prompt_cache=prompt_cache):
|
|
37
|
+
print(response.text.replace("\0", "").replace("\x1e", ""), end="", flush=True)
|
|
38
|
+
last_response = response
|
|
39
|
+
|
|
40
|
+
meta: dict = {}
|
|
41
|
+
if last_response is not None:
|
|
42
|
+
if hasattr(last_response, "prompt_tokens"):
|
|
43
|
+
meta["prompt_tokens"] = last_response.prompt_tokens
|
|
44
|
+
if hasattr(last_response, "generation_tokens"):
|
|
45
|
+
meta["generation_tokens"] = last_response.generation_tokens
|
|
46
|
+
|
|
47
|
+
if meta:
|
|
48
|
+
print(f"\x1e__META__:{json.dumps(meta)}", end="\0", flush=True)
|
|
49
|
+
else:
|
|
50
|
+
print("", end="\0", flush=True)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _primed_messages(messages: list, primer: str | None) -> tuple[list, bool]:
    """Copy *messages*, appending the primer as a trailing assistant turn.

    Returns the message list to format and the matching
    ``add_generation_prompt`` flag (``False`` once a primer turn is present).
    """
    fmt_messages = list(messages)
    if primer is None:
        return fmt_messages, True
    fmt_messages.append({"role": "assistant", "content": primer})
    return fmt_messages, False


def _cut_after_primer(prompt, primer: str | None):
    """Truncate *prompt* right after the last occurrence of *primer*.

    This removes whatever the chat template appended after the primed
    assistant turn so generation continues from the primer. The prompt is
    returned unchanged when there is nothing to cut: no/empty primer, a
    non-string prompt, or a primer the template did not reproduce verbatim.
    """
    if not primer or not isinstance(prompt, str):
        return prompt
    pieces = prompt.split(primer)
    if len(pieces) < 2:
        # Primer text not found: cutting here would discard the entire prompt.
        return prompt
    return primer.join(pieces[:-1]) + primer


def handle_chat(
    backend: ModelBackend,
    capabilities: dict,
    messages: list,
    primer: str | None = None,
    options: dict | None = None,
    tools: list | None = None,
    images: list | None = None,
    max_image_size: int = 768,
    reasoning_effort: str | None = None,
    cache_path: str | None = None,
    cache_trim_tokens: int | None = None,
) -> None:
    """Handle a ``chat`` request and stream the generation to stdout.

    Args:
        backend: Model backend used for tokenizing, cache access and generation.
        capabilities: Capability dict; used to build a prompt when the
            tokenizer has no chat template.
        messages: Chat messages (dicts with ``role`` / ``content``).
        primer: Optional text the assistant reply must start with. It is
            echoed to stdout and placed at the end of the prompt.
        options: Generation options forwarded to the backend.
        tools: Optional tool definitions passed to the chat template.
        images: Optional images (vision backends only).
        max_image_size: Forwarded to vision backends as ``max_image_size``.
        reasoning_effort: Optional value passed to the chat template.
        cache_path: Optional path of a saved KV cache to reuse.
        cache_trim_tokens: When given, the loaded cache is trimmed down to
            this many tokens if it currently holds more.
    """
    if options is None:
        options = {}

    tokenizer = backend.get_tokenizer()

    # ---- Vision backends: no KV-cache reuse, template always applied -------
    if backend.supports_vision():
        fmt_messages, add_generation_prompt = _primed_messages(messages, primer)

        try:
            prompt = tokenizer.apply_chat_template(
                fmt_messages,
                tools=tools,
                add_generation_prompt=add_generation_prompt,
                tokenize=False,
            )
        except TypeError:
            # Template/processor does not accept ``tools``.
            prompt = tokenizer.apply_chat_template(
                fmt_messages,
                add_generation_prompt=add_generation_prompt,
                tokenize=False,
            )

        prompt = _cut_after_primer(prompt, primer)

        # Collapse runs of image padding tokens so the log stays readable.
        display_prompt = re.sub(r'(<\|image_pad\|>)+', '<|image_pad|>...', prompt)
        sys.stderr.write(f"--- vlm prompt (images: {len(images) if images else 0}, max_size: {max_image_size})\n{display_prompt}\n")

        final_options = dict(options)
        final_options["max_image_size"] = max_image_size
        _stream_to_stdout(
            backend,
            prompt,
            final_options,
            images=images,
            primer=primer,
        )
        return

    # ---- Load (and optionally trim) the KV cache ----------------------------
    prompt_cache = backend.load_cache_from_file(cache_path) if cache_path else None
    cache_tokens = 0
    if prompt_cache is not None:
        if cache_trim_tokens is not None:
            current_offset = backend.get_cache_offset(prompt_cache)
            if current_offset > cache_trim_tokens:
                trim_prompt_cache(prompt_cache, current_offset - cache_trim_tokens)
                sys.stderr.write(
                    f"KV cache trimmed: {current_offset} → {cache_trim_tokens} tokens\n"
                )
                cache_tokens = cache_trim_tokens
            else:
                cache_tokens = current_offset
        else:
            meta_count = _read_cache_token_count(cache_path) if cache_path else None
            if meta_count is not None:
                cache_tokens = meta_count
            else:
                # Legacy cache without meta file - skip it for safety
                sys.stderr.write(
                    f"WARNING: Cache file exists but no .meta.json found at {cache_path}. "
                    "Ignoring cache for safety (may be from old implementation).\n"
                )
                prompt_cache = None
                cache_tokens = 0
    if prompt_cache is not None:
        sys.stderr.write(
            f"KV cache loaded: {len(prompt_cache)} layers, {cache_tokens} cached tokens\n"
        )
    elif cache_path:
        sys.stderr.write(f"KV cache load FAILED: {cache_path}\n")

    # ---- Tokenizer without a chat template: hand-built prompt, no cache -----
    if not supports_chat_template(tokenizer):
        prompt = generate_merged_prompt(messages, capabilities)
        if primer is not None:
            # The primer is echoed to stdout as the start of the reply, so the
            # model has to see it too (same as handle_format_test does).
            prompt += primer
        if prompt_cache is not None:
            sys.stderr.write("KV cache ignored: model does not support chat template\n")
        _stream_to_stdout(backend, prompt, options, primer=primer)
        return

    # ---- Apply the chat template with progressively fewer extra kwargs ------
    fmt_messages, add_generation_prompt = _primed_messages(messages, primer)

    extra_kwargs = {}
    if tools is not None:
        extra_kwargs["tools"] = tools
    if reasoning_effort is not None:
        extra_kwargs["reasoning_effort"] = reasoning_effort

    trust_remote_code = options.get("trust_remote_code")
    if trust_remote_code is not None:
        extra_kwargs["trust_remote_code"] = trust_remote_code

    tools_only = {"tools": tools} if tools is not None else {}
    for kwargs in (extra_kwargs, tools_only):
        try:
            prompt = tokenizer.apply_chat_template(
                fmt_messages,
                add_generation_prompt=add_generation_prompt,
                tokenize=False,
                **kwargs,
            )
            break
        except TypeError:
            continue
    else:
        # Last resort: no optional kwargs at all; a TypeError here propagates.
        prompt = tokenizer.apply_chat_template(
            fmt_messages,
            add_generation_prompt=add_generation_prompt,
            tokenize=False,
        )

    prompt = _cut_after_primer(prompt, primer)

    if isinstance(prompt, list):
        sys.stderr.write(f"--- prompt: len={len(prompt)}\n")
    else:
        sys.stderr.write(f"--- prompt\n{prompt}\n")

    final_options = dict(options)
    # Template-only option; must not reach the generation backend.
    final_options.pop("trust_remote_code", None)

    # ---- Skip the tokens that are already covered by the KV cache -----------
    effective_prompt = prompt
    if prompt_cache is not None and cache_tokens > 0 and isinstance(prompt, str):
        # Let the tokenizer add BOS only if the rendered prompt lacks it.
        add_special = tokenizer.bos_token is None or not prompt.startswith(
            tokenizer.bos_token
        )
        full_tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

        if cache_tokens < len(full_tokens):
            effective_prompt = full_tokens[cache_tokens:]
            sys.stderr.write(
                f"Prefilled {cache_tokens}/{len(full_tokens)} tokens, "
                f"generating from {len(effective_prompt)} remaining\n"
            )
        else:
            # The cache covers at least the whole prompt, leaving nothing to
            # feed the model; fall back to a cold run.
            sys.stderr.write(
                f"Prefill offset {cache_tokens} >= prompt {len(full_tokens)}, "
                f"ignoring prefill state\n"
            )
            prompt_cache = None

    _stream_to_stdout(backend, effective_prompt, final_options, primer=primer, prompt_cache=prompt_cache)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from backends.base import ModelBackend
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def handle_completion(
    backend: ModelBackend,
    prompt: str | list[int],
    options: dict | None = None,
    images: list | None = None,
    max_image_size: int = 768,
) -> None:
    """Handle a ``completion`` request and stream the generation to stdout.

    Args:
        backend: Model backend whose ``stream_generate`` produces the output.
        prompt: Raw prompt text, or an already tokenized prompt.
        options: Generation options; copied before being modified.
        images: Optional images. When present, ``max_image_size`` is added to
            the options passed to the backend.
        max_image_size: Value forwarded as ``max_image_size`` for image input.

    Each generated chunk is printed with NUL characters removed; the stream is
    terminated by a newline followed by a NUL. Prompt logging to stderr only
    happens when the ``MLX_DEBUG`` environment variable is set.
    """
    final_options = dict(options) if options is not None else {}
    debug = bool(os.getenv('MLX_DEBUG'))

    if images:
        final_options["max_image_size"] = max_image_size
        if debug:
            if isinstance(prompt, list):
                # Token-id prompts cannot go through re.sub; log their size.
                display_prompt = f"prompt: len={len(prompt)}"
            else:
                # Collapse runs of image padding tokens to keep the log short.
                display_prompt = re.sub(r'(<\|image_pad\|>)+', '<|image_pad|>...', prompt)
            sys.stderr.write(f"--- vlm completion (images: {len(images)}, max_size: {max_image_size})\n{display_prompt}\n")
    elif debug:
        if isinstance(prompt, list):
            sys.stderr.write(f"--- prompt: len={len(prompt)}\n")
        else:
            sys.stderr.write(f"--- prompt\n{prompt}\n")

    for response in backend.stream_generate(prompt, final_options, images):
        print(response.text.replace("\0", ""), end="", flush=True)

    print("\n", end="\0", flush=True)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from backends.base import ModelBackend
|
|
4
|
+
from utils.prompt_builder import generate_merged_prompt, supports_chat_template
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def handle_format_test(
    backend: ModelBackend,
    capabilities: dict,
    messages: list,
    options: dict | None = None,
    tools: list | None = None,
) -> None:
    """Handle a ``format_test`` request: build the prompt without generating.

    Writes one NUL-terminated JSON object to stdout with the keys
    ``formatted_prompt``, ``template_applied``, ``model_specific_processing``
    and ``error``. Any exception raised while formatting is reported through
    ``error`` instead of propagating.

    Args:
        backend: Model backend providing the tokenizer.
        capabilities: Capability dict used when no chat template is available.
        messages: Chat messages to format.
        options: Request options; only ``primer`` is read here.
        tools: Optional tool definitions passed to the chat template.
    """
    if options is None:
        options = {}

    tokenizer = backend.get_tokenizer()
    result = {
        "formatted_prompt": None,
        "template_applied": False,
        "model_specific_processing": None,
        "error": None,
    }

    try:
        primer = options.get("primer")

        if supports_chat_template(tokenizer):
            result["model_specific_processing"] = messages

            add_generation_prompt = True
            fmt_messages = list(messages)

            if primer is not None:
                # Prime the reply by ending the conversation with a partial
                # assistant turn instead of a bare generation prompt.
                fmt_messages.append({"role": "assistant", "content": primer})
                add_generation_prompt = False

            try:
                formatted_prompt = tokenizer.apply_chat_template(
                    fmt_messages,
                    tools=tools,
                    add_generation_prompt=add_generation_prompt,
                    tokenize=False,
                )
            except TypeError:
                # Template does not accept ``tools``.
                formatted_prompt = tokenizer.apply_chat_template(
                    fmt_messages,
                    add_generation_prompt=add_generation_prompt,
                    tokenize=False,
                )

            if primer:
                # Cut everything the template appended after the primer. If the
                # template did not reproduce the primer verbatim, leave the
                # prompt untouched rather than reducing it to the primer alone.
                pieces = formatted_prompt.split(primer)
                if len(pieces) > 1:
                    formatted_prompt = primer.join(pieces[:-1]) + primer

            result["formatted_prompt"] = formatted_prompt
            result["template_applied"] = True
        else:
            formatted_prompt = generate_merged_prompt(messages, capabilities)
            if primer is not None:
                formatted_prompt += primer

            result["formatted_prompt"] = formatted_prompt
            result["template_applied"] = False
    except Exception as e:
        result["error"] = str(e)

    print(json.dumps(result), end="\0", flush=True)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from backends.base import ModelBackend
|
|
4
|
+
from utils.prompt_builder import generate_merged_prompt, supports_chat_template
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def handle_tokenize(
    backend: ModelBackend,
    capabilities: dict,
    messages: list,
    tools: list | None = None,
    reasoning_effort: str | None = None,
) -> None:
    """Render *messages* into a prompt, tokenize it and report the token ids.

    Writes one NUL-terminated JSON object to stdout with the keys
    ``token_ids``, ``token_count`` and ``error``. Exceptions raised while
    rendering or encoding are reported through ``error``.
    """
    tokenizer = backend.get_tokenizer()

    result = {
        "token_ids": None,
        "token_count": 0,
        "error": None,
    }

    try:
        # Optional template arguments, richest set first.
        full_kwargs = {}
        if tools is not None:
            full_kwargs["tools"] = tools
        if reasoning_effort is not None:
            full_kwargs["reasoning_effort"] = reasoning_effort
        tools_only = {"tools": full_kwargs["tools"]} if "tools" in full_kwargs else {}

        if not supports_chat_template(tokenizer):
            prompt = generate_merged_prompt(messages, capabilities)
        else:
            # Try the template with fewer and fewer optional kwargs until one
            # call is accepted. add_generation_prompt=False: the assistant
            # start tokens are intentionally not part of the result.
            prompt = None
            for attempt in (full_kwargs, tools_only, {}):
                try:
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        add_generation_prompt=False,
                        tokenize=False,
                        **attempt,
                    )
                except TypeError:
                    continue
                break

            if prompt is None:
                prompt = str(messages)

        # Tokenize; let the tokenizer add special tokens only when the prompt
        # does not already start with BOS.
        if tokenizer.bos_token is None:
            add_special = True
        else:
            add_special = not prompt.startswith(tokenizer.bos_token or "")
        token_ids = tokenizer.encode(prompt, add_special_tokens=add_special)

        result["token_ids"] = token_ids
        result["token_count"] = len(token_ids)
    except Exception as e:
        result["error"] = str(e)

    print(json.dumps(result), end="\0", flush=True)
|
|
@@ -7,18 +7,28 @@ dependencies = [
|
|
|
7
7
|
"flex==6.14.1",
|
|
8
8
|
"hf-xet==1.2.0",
|
|
9
9
|
"jinja2==3.1.6",
|
|
10
|
-
"mlx
|
|
11
|
-
"mlx-lm==0.31.
|
|
12
|
-
"mlx-vlm==0.
|
|
10
|
+
"mlx>=0.31.2; sys_platform == 'darwin'",
|
|
11
|
+
"mlx-lm==0.31.3; sys_platform == 'darwin'",
|
|
12
|
+
"mlx-vlm==0.5.0",
|
|
13
13
|
"tokenizers==0.22.2",
|
|
14
14
|
"torch==2.9.1",
|
|
15
15
|
"torchvision==0.24.1",
|
|
16
|
-
"transformers
|
|
16
|
+
"transformers>=5.5.0",
|
|
17
17
|
]
|
|
18
18
|
|
|
19
|
+
[dependency-groups]
|
|
20
|
+
dev = ["pytest>=9.0"]
|
|
21
|
+
|
|
19
22
|
[build-system]
|
|
20
23
|
requires = ["setuptools>=61.0"]
|
|
21
24
|
build-backend = "setuptools.build_meta"
|
|
22
25
|
|
|
26
|
+
[tool.pytest.ini_options]
|
|
27
|
+
testpaths = ["tests"]
|
|
28
|
+
|
|
23
29
|
[tool.setuptools]
|
|
24
|
-
py-modules = ["__main__", "
|
|
30
|
+
py-modules = ["__main__", "server"]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["."]
|
|
34
|
+
include = ["backends*", "handlers*", "utils*"]
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""JSON-RPC風サーバー: stdin/stdoutベースのリクエストディスパッチ"""
|
|
2
|
+
import json
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from backends.base import ModelBackend
|
|
6
|
+
from handlers import handle_cache_prefill, handle_capabilities, handle_chat, handle_completion, handle_format_test, handle_tokenize
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Upper bound on buffered stdin lines while waiting for a complete JSON value.
MAX_READ_LINES = 10000


def read():
    """Read the next JSON request from stdin.

    Lines are accumulated until their concatenation parses as JSON, so a
    request may span several lines. If more than ``MAX_READ_LINES`` lines pile
    up without forming valid JSON, the buffer is discarded and reading starts
    over.

    Returns:
        The parsed JSON value, or ``None`` when stdin reaches end-of-file.
        ``None`` is reserved for EOF: a literal JSON ``null`` request is
        discarded instead of being returned.
    """
    lines = []
    while True:
        line = sys.stdin.readline()
        if not line:
            return None
        lines.append(line)
        if len(lines) > MAX_READ_LINES:
            sys.stderr.write(f"Error: read buffer exceeded {MAX_READ_LINES} lines, discarding\n")
            lines.clear()
            continue
        try:
            request = json.loads(''.join(lines))
        except json.JSONDecodeError:
            # Incomplete so far; keep reading lines.
            continue
        if request is None:
            # JSON ``null`` would look exactly like EOF to the caller and stop
            # the server loop; drop it and wait for the next request.
            sys.stderr.write("Error: ignoring JSON null request\n")
            lines.clear()
            continue
        return request
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Server:
    """Dispatch requests read from stdin to the matching handler.

    Every response, including errors, is written to stdout and terminated by
    a NUL character.
    """

    def __init__(self, backend: ModelBackend, capabilities: dict):
        self.backend = backend
        self.capabilities = capabilities

    def run(self):
        """Serve requests until ``read()`` signals end of input with ``None``."""
        while True:
            req = read()
            if req is None:
                break
            self._dispatch(req)

    def _error_response(self, message: str) -> None:
        """Log *message* to stderr and send it as a ``{"error": ...}`` reply."""
        sys.stderr.write(f"Error: {message}\n")
        print(json.dumps({"error": message}), end='\0', flush=True)

    def _dispatch(self, req: dict):
        """Validate one request and call the handler selected by ``method``.

        Malformed requests and exceptions raised by handlers are answered with
        an error response; they never propagate out of the server loop.
        """
        # Any valid JSON value can arrive here (list, number, string, ...).
        # Only objects carry a 'method'; reject the rest instead of crashing.
        if not isinstance(req, dict):
            self._error_response("Request must be a JSON object")
            return

        method = req.get('method')
        if not method:
            self._error_response("'method' field is required")
            return

        try:
            if method == 'capabilities':
                handle_capabilities(self.capabilities)

            elif method == 'format_test':
                messages = req.get('messages')
                if not messages:
                    self._error_response("'messages' field is required for format_test method")
                    return
                handle_format_test(self.backend, self.capabilities, messages, req.get('options', {}), req.get('tools'))

            elif method == 'tokenize':
                messages = req.get('messages')
                # Only a missing field is rejected; an empty list is passed on.
                if messages is None:
                    self._error_response("'messages' field is required for tokenize method")
                    return
                handle_tokenize(
                    self.backend, self.capabilities, messages,
                    tools=req.get('tools'),
                    reasoning_effort=req.get('reasoning_effort'),
                )

            elif method == 'cache_prefill':
                cache_path = req.get('cache_path')
                messages = req.get('messages')
                if not cache_path or not messages:
                    self._error_response("'cache_path' and 'messages' fields are required for cache_prefill")
                    return
                handle_cache_prefill(
                    self.backend, self.capabilities, cache_path, messages,
                    base_cache_path=req.get('base_cache_path'),
                    trim_to_tokens=req.get('trim_to_tokens'),
                    prefix_offsets=req.get('prefix_offsets'),
                    prefix_hashes=req.get('prefix_hashes'),
                    tools=req.get('tools'),
                    reasoning_effort=req.get('reasoning_effort'),
                )

            elif method == 'chat':
                messages = req.get('messages')
                if not messages:
                    self._error_response("'messages' field is required for chat method")
                    return
                handle_chat(
                    self.backend,
                    self.capabilities,
                    messages,
                    primer=req.get('primer'),
                    options=req.get('options', {}),
                    tools=req.get('tools'),
                    images=req.get('images', []),
                    max_image_size=req.get('maxImageSize', 768),
                    reasoning_effort=req.get('reasoning_effort'),
                    cache_path=req.get('cache_path'),
                    cache_trim_tokens=req.get('cache_trim_tokens'),
                )

            elif method == 'completion':
                prompt = req.get('prompt')
                if not prompt:
                    self._error_response("'prompt' field is required for completion method")
                    return
                images = req.get('images', [])
                handle_completion(
                    self.backend,
                    prompt,
                    options=req.get('options', {}),
                    images=images if images else None,
                    max_image_size=req.get('maxImageSize', 768),
                )

            else:
                self._error_response(f"Unknown method '{method}'")

        except Exception as e:
            # Top-level boundary: report the failure and keep serving.
            self._error_response(f"Error processing request: {e}")
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""プロンプト生成ユーティリティ"""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def supports_chat_template(tokenizer) -> bool:
    """Return True when *tokenizer* has ``apply_chat_template`` and a non-None ``chat_template``."""
    if not hasattr(tokenizer, 'apply_chat_template'):
        return False
    if not hasattr(tokenizer, 'chat_template'):
        return False
    return tokenizer.chat_template is not None
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def generate_merged_prompt(messages, capabilities):
    """Build a single prompt string for tokenizers without a chat template.

    Each message is wrapped using, in order of preference:

    1. the start/end tokens registered for its own role in
       ``capabilities['special_tokens']``;
    2. the first available generic wrapper among ``block``, ``context``,
       ``quote`` and ``section``, with the upper-cased role as a label;
    3. ``<!-- begin of ROLE -->`` / ``<!-- end of ROLE -->`` comment markers.

    Wrapped messages are separated by an empty line.
    """
    special_tokens = capabilities.get('special_tokens', {})

    def _usable(entry):
        # A token entry counts only if it is a non-empty dict with a 'start'.
        return bool(entry) and isinstance(entry, dict) and 'start' in entry

    lines = []
    for msg in messages:
        role = msg['role']
        label = role.upper()

        own = special_tokens.get(role)
        if _usable(own):
            opener = own['start']['text']
            closer = own['end']['text']
            lines += [opener, msg['content'].strip(), closer, '']
            continue

        wrapper = None
        for name in ('block', 'context', 'quote', 'section'):
            candidate = special_tokens.get(name)
            if _usable(candidate):
                wrapper = candidate
                break

        if wrapper is None:
            lines += [
                f'<!-- begin of {label} -->',
                msg['content'].strip(),
                f'<!-- end of {label} -->',
                '',
            ]
        else:
            opener = wrapper['start']['text']
            closer = wrapper['end']['text']
            lines += [f'{opener}{label}:\n{msg["content"].strip()}', closer, '']

    # Every message contributes a trailing '' separator; drop the final one.
    return '\n'.join(lines[:-1])
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
"""
|
|
2
2
|
トークン関連のユーティリティ関数
|
|
3
3
|
"""
|
|
4
|
-
import
|
|
5
|
-
from chat_template_constraints import detect_chat_restrictions
|
|
4
|
+
from utils.chat_template_constraints import detect_chat_restrictions
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
def is_eod_token(response, tokenizer):
|
|
@@ -152,7 +151,8 @@ def get_special_tokens(tokenizer):
|
|
|
152
151
|
|
|
153
152
|
# ツール関連の単体トークン(追加)
|
|
154
153
|
"tool_calls_marker": "[TOOL_CALLS]",
|
|
155
|
-
|
|
154
|
+
# Harmony形式のcallトークン(tool_call_endとは異なる用途)
|
|
155
|
+
"harmony_call": "<|call|>",
|
|
156
156
|
}
|
|
157
157
|
|
|
158
158
|
# VLM processorではconvert_tokens_to_idsがない場合がある
|
|
@@ -254,8 +254,6 @@ def detect_tool_call_format(tokenizer):
|
|
|
254
254
|
(r'<longcat_tool_call>', r'</longcat_tool_call>'),
|
|
255
255
|
# <minimax:tool_call>...</minimax:tool_call>
|
|
256
256
|
(r'<minimax:tool_call>', r'</minimax:tool_call>'),
|
|
257
|
-
# context-1形式: to=functions.{name}...<|call|>
|
|
258
|
-
(r'to=functions\.', r'<\|call\|>'),
|
|
259
257
|
]
|
|
260
258
|
|
|
261
259
|
for start_pattern, end_pattern in tool_call_patterns:
|
|
@@ -266,6 +264,16 @@ def detect_tool_call_format(tokenizer):
|
|
|
266
264
|
result["call_end"] = end_match.group(0)
|
|
267
265
|
break
|
|
268
266
|
|
|
267
|
+
# Harmony形式の専用検出
|
|
268
|
+
# テンプレート内で "functions." と <|call|> が共存する場合
|
|
269
|
+
if "call_start" not in result:
|
|
270
|
+
has_functions = re.search(r'"functions\."', template)
|
|
271
|
+
has_call = re.search(r'<\|call\|>', template)
|
|
272
|
+
if has_functions and has_call:
|
|
273
|
+
result["tool_parser_type"] = "harmony"
|
|
274
|
+
result["call_start"] = "to=functions."
|
|
275
|
+
result["call_end"] = "<|call|>"
|
|
276
|
+
|
|
269
277
|
# Mistral特殊ケース
|
|
270
278
|
if "call_start" not in result:
|
|
271
279
|
mistral_match = re.search(r'\[TOOL_CALLS\]', template)
|