llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/convert_hf_to_gguf.py +8751 -0
- bin/ggml-base.dll +0 -0
- bin/ggml-cpu.dll +0 -0
- bin/ggml.dll +0 -0
- bin/llama-mtmd-cli.exe +0 -0
- bin/llama.dll +0 -0
- bin/mtmd.dll +0 -0
- include/ggml-alloc.h +76 -0
- include/ggml-backend.h +354 -0
- include/ggml-blas.h +25 -0
- include/ggml-cann.h +123 -0
- include/ggml-cpp.h +39 -0
- include/ggml-cpu.h +145 -0
- include/ggml-cuda.h +47 -0
- include/ggml-metal.h +66 -0
- include/ggml-opt.h +256 -0
- include/ggml-rpc.h +33 -0
- include/ggml-sycl.h +49 -0
- include/ggml-vulkan.h +29 -0
- include/ggml-webgpu.h +19 -0
- include/ggml.h +2467 -0
- include/gguf.h +202 -0
- include/llama-cpp.h +30 -0
- include/llama.h +1482 -0
- include/mtmd-helper.h +91 -0
- include/mtmd.h +298 -0
- lib/cmake/ggml/ggml-config.cmake +328 -0
- lib/cmake/ggml/ggml-version.cmake +65 -0
- lib/cmake/llama/llama-config.cmake +54 -0
- lib/cmake/llama/llama-version.cmake +65 -0
- lib/ggml-base.lib +0 -0
- lib/ggml-cpu.lib +0 -0
- lib/ggml.lib +0 -0
- lib/llama.lib +0 -0
- lib/mtmd.lib +0 -0
- lib/pkgconfig/llama.pc +10 -0
- llama_cpp/__init__.py +4 -0
- llama_cpp/_ctypes_extensions.py +131 -0
- llama_cpp/_ggml.py +12 -0
- llama_cpp/_internals.py +856 -0
- llama_cpp/_logger.py +47 -0
- llama_cpp/_utils.py +78 -0
- llama_cpp/lib/ggml-base.dll +0 -0
- llama_cpp/lib/ggml-base.lib +0 -0
- llama_cpp/lib/ggml-cpu.dll +0 -0
- llama_cpp/lib/ggml-cpu.lib +0 -0
- llama_cpp/lib/ggml.dll +0 -0
- llama_cpp/lib/ggml.lib +0 -0
- llama_cpp/lib/llama.dll +0 -0
- llama_cpp/lib/llama.lib +0 -0
- llama_cpp/lib/mtmd.dll +0 -0
- llama_cpp/lib/mtmd.lib +0 -0
- llama_cpp/llama.py +2422 -0
- llama_cpp/llama_cache.py +155 -0
- llama_cpp/llama_chat_format.py +3962 -0
- llama_cpp/llama_cpp.py +4374 -0
- llama_cpp/llama_grammar.py +953 -0
- llama_cpp/llama_speculative.py +64 -0
- llama_cpp/llama_tokenizer.py +120 -0
- llama_cpp/llama_types.py +316 -0
- llama_cpp/llava_cpp.py +158 -0
- llama_cpp/mtmd_cpp.py +280 -0
- llama_cpp/py.typed +0 -0
- llama_cpp/server/__init__.py +0 -0
- llama_cpp/server/__main__.py +100 -0
- llama_cpp/server/app.py +597 -0
- llama_cpp/server/cli.py +97 -0
- llama_cpp/server/errors.py +212 -0
- llama_cpp/server/model.py +312 -0
- llama_cpp/server/settings.py +240 -0
- llama_cpp/server/types.py +316 -0
- llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
- llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
- llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
- llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import traceback
|
|
5
|
+
import time
|
|
6
|
+
from re import compile, Match, Pattern
|
|
7
|
+
from typing import Callable, Coroutine, Optional, Tuple, Union, Dict
|
|
8
|
+
from typing_extensions import TypedDict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from fastapi import (
|
|
12
|
+
Request,
|
|
13
|
+
Response,
|
|
14
|
+
HTTPException,
|
|
15
|
+
)
|
|
16
|
+
from fastapi.responses import JSONResponse
|
|
17
|
+
from fastapi.routing import APIRoute
|
|
18
|
+
|
|
19
|
+
from llama_cpp.server.types import (
|
|
20
|
+
CreateCompletionRequest,
|
|
21
|
+
CreateEmbeddingRequest,
|
|
22
|
+
CreateChatCompletionRequest,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ErrorResponse(TypedDict):
    """OpenAI style error response payload."""

    message: str
    type: str
    param: Optional[str]
    code: Optional[str]


class ErrorResponseFormatters:
    """Formatters that translate known llama_cpp error messages into
    OpenAI-style responses.

    Each formatter receives the parsed request body and the regex match
    against the original error text, and returns the HTTP status code
    together with the ``ErrorResponse`` payload.
    """

    @staticmethod
    def context_length_exceeded(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Build the 400 response for a context-window overflow error."""
        prompt_tokens = int(match.group(1))
        context_window = int(match.group(2))
        completion_tokens = request.max_tokens
        # Chat requests carry a `messages` field; plain completions do not.
        template = (
            # Chat completion wording
            "This model's maximum context length is {} tokens. "
            "However, you requested {} tokens "
            "({} in the messages, {} in the completion). "
            "Please reduce the length of the messages or completion."
            if hasattr(request, "messages")
            else
            # Text completion wording
            "This model's maximum context length is {} tokens, "
            "however you requested {} tokens "
            "({} in your prompt; {} for the completion). "
            "Please reduce your prompt; or completion length."
        )
        return 400, ErrorResponse(
            message=template.format(
                context_window,
                prompt_tokens + (completion_tokens or 0),
                prompt_tokens,
                completion_tokens,
            ),  # type: ignore
            type="invalid_request_error",
            param="messages",
            code="context_length_exceeded",
        )

    @staticmethod
    def model_not_found(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Build the 400 response for a missing model path."""
        model_path = str(match.group(1))
        return 400, ErrorResponse(
            message=f"The model `{model_path}` does not exist",
            type="invalid_request_error",
            param=None,
            code="model_not_found",
        )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class RouteErrorHandler(APIRoute):
    """Custom APIRoute that handles application errors and exceptions.

    Known llama_cpp error messages are matched against regex patterns and
    converted into OpenAI-style error responses; anything else becomes a
    generic 500 internal server error.
    """

    # key: regex pattern for original error message from llama_cpp
    # value: formatter function
    pattern_and_formatters: Dict[
        "Pattern[str]",
        Callable[
            [
                Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
                "Match[str]",
            ],
            Tuple[int, ErrorResponse],
        ],
    ] = {
        compile(
            r"Requested tokens \((\d+)\) exceed context window of (\d+)"
        ): ErrorResponseFormatters.context_length_exceeded,
        compile(
            r"Model path does not exist: (.+)"
        ): ErrorResponseFormatters.model_not_found,
    }

    def error_message_wrapper(
        self,
        error: Exception,
        body: Optional[
            Union[
                "CreateChatCompletionRequest",
                "CreateCompletionRequest",
                "CreateEmbeddingRequest",
            ]
        ] = None,
    ) -> Tuple[int, ErrorResponse]:
        """Wraps error message in OpenAI style error response.

        Args:
            error: Exception raised while handling the request.
            body: Parsed request body, when it could be parsed.

        Returns:
            Tuple[int, ErrorResponse]: HTTP status code and error payload.
        """
        if body is not None and isinstance(
            body,
            (
                CreateCompletionRequest,
                CreateChatCompletionRequest,
            ),
        ):
            # When text completion or chat completion: try the known
            # llama_cpp error patterns first.
            for pattern, callback in self.pattern_and_formatters.items():
                match = pattern.search(str(error))
                if match is not None:
                    return callback(body, match)

        # Only print the trace on unexpected exceptions
        print(f"Exception: {str(error)}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)

        # Wrap other errors as internal server error
        return 500, ErrorResponse(
            message=str(error),
            type="internal_server_error",
            param=None,
            code=None,
        )

    def get_route_handler(
        self,
    ) -> Callable[[Request], Coroutine[None, None, Response]]:
        """Defines custom route handler that catches exceptions and formats
        in OpenAI style error response"""

        original_route_handler = super().get_route_handler()

        async def custom_route_handler(request: Request) -> Response:
            try:
                start_sec = time.perf_counter()
                response = await original_route_handler(request)
                elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000)
                # OpenAI-compatible server timing header.
                response.headers["openai-processing-ms"] = f"{elapsed_time_ms}"
                return response
            except HTTPException as unauthorized:
                # api key check failed
                raise unauthorized
            except Exception as exc:
                body: Optional[
                    Union[
                        CreateChatCompletionRequest,
                        CreateCompletionRequest,
                        CreateEmbeddingRequest,
                    ]
                ]
                try:
                    # BUG FIX: request.json() itself can raise (e.g. a
                    # non-JSON body), so it must run inside this try —
                    # previously it ran before it, letting the parse error
                    # escape the error handler entirely.
                    json_body = await request.json()
                    if "messages" in json_body:
                        # Chat completion
                        body = CreateChatCompletionRequest(**json_body)
                    elif "prompt" in json_body:
                        # Text completion
                        body = CreateCompletionRequest(**json_body)
                    else:
                        # Embedding
                        body = CreateEmbeddingRequest(**json_body)
                except Exception:
                    # Invalid request body
                    body = None

                # Get proper error message from the exception
                (
                    status_code,
                    error_message,
                ) = self.error_message_wrapper(error=exc, body=body)
                return JSONResponse(
                    {"error": error_message},
                    status_code=status_code,
                )

        return custom_route_handler
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Optional, Union, List
|
|
6
|
+
|
|
7
|
+
import llama_cpp
|
|
8
|
+
import llama_cpp.llama_speculative as llama_speculative
|
|
9
|
+
import llama_cpp.llama_tokenizer as llama_tokenizer
|
|
10
|
+
|
|
11
|
+
from llama_cpp.server.settings import ModelSettings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LlamaProxy:
    """Manages the configured models and lazily loads the requested one.

    Only a single ``llama_cpp.Llama`` instance is kept resident at a time;
    asking for a different model closes the current instance and loads the
    new one from its settings.
    """

    # chat_format -> handler class name in llama_cpp.llama_chat_format.
    # All of these follow the same construction pattern and require a
    # clip model (see _create_chat_handler).
    _CLIP_CHAT_HANDLERS: Dict[str, str] = {
        "llava-1-5": "Llava15ChatHandler",
        "obsidian": "ObsidianChatHandler",
        "llava-1-6": "Llava16ChatHandler",
        "moondream": "MoondreamChatHandler",
        "nanollava": "NanoLlavaChatHandler",
        "llama-3-vision-alpha": "Llama3VisionAlpha",
        "minicpm-v-2.6": "MiniCPMv26ChatHandler",
        "qwen2.5-vl": "Qwen25VLChatHandler",
    }

    def __init__(self, models: List[ModelSettings]) -> None:
        assert len(models) > 0, "No models provided!"

        # Alias -> settings lookup; models without an explicit alias are
        # keyed by their model path.
        self._model_settings_dict: dict[str, ModelSettings] = {}
        for model in models:
            if not model.model_alias:
                model.model_alias = model.model
            self._model_settings_dict[model.model_alias] = model

        self._current_model: Optional[llama_cpp.Llama] = None
        self._current_model_alias: Optional[str] = None

        self._default_model_settings: ModelSettings = models[0]
        self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore

        # Load default model
        self._current_model = self.load_llama_from_model_settings(
            self._default_model_settings
        )
        self._current_model_alias = self._default_model_alias

    def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
        """Return the Llama instance for *model*, loading it if necessary.

        A missing or unknown alias falls back to the default model.
        """
        if model is None:
            model = self._default_model_alias

        if model not in self._model_settings_dict:
            model = self._default_model_alias

        if model == self._current_model_alias:
            if self._current_model is not None:
                return self._current_model

        # Switching models: release the old instance first so two models
        # are never resident at the same time.
        if self._current_model:
            self._current_model.close()
            self._current_model = None

        settings = self._model_settings_dict[model]
        self._current_model = self.load_llama_from_model_settings(settings)
        self._current_model_alias = model
        return self._current_model

    def __getitem__(self, model: str):
        """Return the stored settings for *model* as a plain dict."""
        return self._model_settings_dict[model].model_dump()

    def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]):
        """Register settings for *model*; str/bytes are parsed as JSON."""
        if isinstance(settings, (bytes, str)):
            settings = ModelSettings.model_validate_json(settings)
        self._model_settings_dict[model] = settings

    def __iter__(self):
        """Iterate over the registered model aliases."""
        for model in self._model_settings_dict:
            yield model

    def free(self):
        """Close and release the currently loaded model, if any."""
        if self._current_model:
            self._current_model.close()
            # BUG FIX: this used to ``del self._current_model``, which left
            # the proxy broken — any later __call__()/free() raised
            # AttributeError. Reset the attribute to None instead.
            self._current_model = None

    @staticmethod
    def _create_chat_handler(settings: ModelSettings):
        """Build the chat handler implied by ``settings.chat_format``.

        Returns None when the chat format needs no dedicated handler.
        """
        handler_name = LlamaProxy._CLIP_CHAT_HANDLERS.get(settings.chat_format)
        if handler_name is not None:
            # Multimodal formats: all share the same construction pattern,
            # either from the HF hub or from a local clip model path.
            assert settings.clip_model_path is not None, "clip model not found"
            handler_cls = getattr(llama_cpp.llama_chat_format, handler_name)
            if settings.hf_model_repo_id is not None:
                return handler_cls.from_pretrained(
                    repo_id=settings.hf_model_repo_id,
                    filename=settings.clip_model_path,
                    verbose=settings.verbose,
                )
            return handler_cls(
                clip_model_path=settings.clip_model_path, verbose=settings.verbose
            )
        if settings.chat_format == "hf-autotokenizer":
            assert (
                settings.hf_pretrained_model_name_or_path is not None
            ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer"
            return (
                llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler(
                    settings.hf_pretrained_model_name_or_path
                )
            )
        if settings.chat_format == "hf-tokenizer-config":
            assert (
                settings.hf_tokenizer_config_path is not None
            ), "hf_tokenizer_config_path must be set for hf-tokenizer-config"
            # BUG FIX: open the config via a context manager so the file
            # handle is closed (it was previously leaked).
            with open(settings.hf_tokenizer_config_path) as config_file:
                tokenizer_config = json.load(config_file)
            return llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
                tokenizer_config
            )
        return None

    @staticmethod
    def _parse_kv_overrides(
        settings: ModelSettings,
    ) -> Optional[Dict[str, Union[bool, int, float, str]]]:
        """Parse ``key=type:value`` override strings into a typed dict.

        Returns None when no overrides are configured. Entries without a
        ``type:`` prefix are silently skipped (original behavior).
        """
        if settings.kv_overrides is None:
            return None
        assert isinstance(settings.kv_overrides, list)
        kv_overrides: Dict[str, Union[bool, int, float, str]] = {}
        for kv in settings.kv_overrides:
            key, value = kv.split("=")
            if ":" in value:
                value_type, value = value.split(":")
                if value_type == "bool":
                    kv_overrides[key] = value.lower() in ["true", "1"]
                elif value_type == "int":
                    kv_overrides[key] = int(value)
                elif value_type == "float":
                    kv_overrides[key] = float(value)
                elif value_type == "str":
                    kv_overrides[key] = value
                else:
                    raise ValueError(f"Unknown value type {value_type}")
        return kv_overrides

    @staticmethod
    def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
        """Create a fully configured ``llama_cpp.Llama`` from *settings*.

        Resolves the chat handler, tokenizer, draft model and kv overrides,
        loads the model either from a local path or from the Hugging Face
        hub, and attaches the configured cache, if any.
        """
        chat_handler = LlamaProxy._create_chat_handler(settings)

        tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
        if settings.hf_pretrained_model_name_or_path is not None:
            tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(
                settings.hf_pretrained_model_name_or_path
            )

        draft_model = None
        if settings.draft_model is not None:
            draft_model = llama_speculative.LlamaPromptLookupDecoding(
                num_pred_tokens=settings.draft_model_num_pred_tokens
            )

        kv_overrides = LlamaProxy._parse_kv_overrides(settings)

        import functools

        kwargs = {}

        if settings.hf_model_repo_id is not None:
            # Download the weights from the HF hub; `model` is the filename.
            create_fn = functools.partial(
                llama_cpp.Llama.from_pretrained,
                repo_id=settings.hf_model_repo_id,
                filename=settings.model,
            )
        else:
            create_fn = llama_cpp.Llama
            kwargs["model_path"] = settings.model

        _model = create_fn(
            **kwargs,
            # Model Params
            n_gpu_layers=settings.n_gpu_layers,
            split_mode=settings.split_mode,
            main_gpu=settings.main_gpu,
            tensor_split=settings.tensor_split,
            vocab_only=settings.vocab_only,
            use_mmap=settings.use_mmap,
            use_mlock=settings.use_mlock,
            kv_overrides=kv_overrides,
            rpc_servers=settings.rpc_servers,
            # Context Params
            seed=settings.seed,
            n_ctx=settings.n_ctx,
            n_batch=settings.n_batch,
            n_ubatch=settings.n_ubatch,
            n_threads=settings.n_threads,
            n_threads_batch=settings.n_threads_batch,
            rope_scaling_type=settings.rope_scaling_type,
            rope_freq_base=settings.rope_freq_base,
            rope_freq_scale=settings.rope_freq_scale,
            yarn_ext_factor=settings.yarn_ext_factor,
            yarn_attn_factor=settings.yarn_attn_factor,
            yarn_beta_fast=settings.yarn_beta_fast,
            yarn_beta_slow=settings.yarn_beta_slow,
            yarn_orig_ctx=settings.yarn_orig_ctx,
            mul_mat_q=settings.mul_mat_q,
            logits_all=settings.logits_all,
            embedding=settings.embedding,
            offload_kqv=settings.offload_kqv,
            flash_attn=settings.flash_attn,
            # Sampling Params
            last_n_tokens_size=settings.last_n_tokens_size,
            # LoRA Params
            lora_base=settings.lora_base,
            lora_path=settings.lora_path,
            # Backend Params
            numa=settings.numa,
            # Chat Format Params
            chat_format=settings.chat_format,
            chat_handler=chat_handler,
            # Speculative Decoding
            draft_model=draft_model,
            # KV Cache Quantization
            type_k=settings.type_k,
            type_v=settings.type_v,
            # Tokenizer
            tokenizer=tokenizer,
            # Misc
            verbose=settings.verbose,
        )
        if settings.cache:
            if settings.cache_type == "disk":
                if settings.verbose:
                    print(f"Using disk cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
            else:
                if settings.verbose:
                    print(f"Using ram cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
            _model.set_cache(cache)
        return _model
|