llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. bin/convert_hf_to_gguf.py +8751 -0
  2. bin/ggml-base.dll +0 -0
  3. bin/ggml-cpu.dll +0 -0
  4. bin/ggml.dll +0 -0
  5. bin/llama-mtmd-cli.exe +0 -0
  6. bin/llama.dll +0 -0
  7. bin/mtmd.dll +0 -0
  8. include/ggml-alloc.h +76 -0
  9. include/ggml-backend.h +354 -0
  10. include/ggml-blas.h +25 -0
  11. include/ggml-cann.h +123 -0
  12. include/ggml-cpp.h +39 -0
  13. include/ggml-cpu.h +145 -0
  14. include/ggml-cuda.h +47 -0
  15. include/ggml-metal.h +66 -0
  16. include/ggml-opt.h +256 -0
  17. include/ggml-rpc.h +33 -0
  18. include/ggml-sycl.h +49 -0
  19. include/ggml-vulkan.h +29 -0
  20. include/ggml-webgpu.h +19 -0
  21. include/ggml.h +2467 -0
  22. include/gguf.h +202 -0
  23. include/llama-cpp.h +30 -0
  24. include/llama.h +1482 -0
  25. include/mtmd-helper.h +91 -0
  26. include/mtmd.h +298 -0
  27. lib/cmake/ggml/ggml-config.cmake +328 -0
  28. lib/cmake/ggml/ggml-version.cmake +65 -0
  29. lib/cmake/llama/llama-config.cmake +54 -0
  30. lib/cmake/llama/llama-version.cmake +65 -0
  31. lib/ggml-base.lib +0 -0
  32. lib/ggml-cpu.lib +0 -0
  33. lib/ggml.lib +0 -0
  34. lib/llama.lib +0 -0
  35. lib/mtmd.lib +0 -0
  36. lib/pkgconfig/llama.pc +10 -0
  37. llama_cpp/__init__.py +4 -0
  38. llama_cpp/_ctypes_extensions.py +131 -0
  39. llama_cpp/_ggml.py +12 -0
  40. llama_cpp/_internals.py +856 -0
  41. llama_cpp/_logger.py +47 -0
  42. llama_cpp/_utils.py +78 -0
  43. llama_cpp/lib/ggml-base.dll +0 -0
  44. llama_cpp/lib/ggml-base.lib +0 -0
  45. llama_cpp/lib/ggml-cpu.dll +0 -0
  46. llama_cpp/lib/ggml-cpu.lib +0 -0
  47. llama_cpp/lib/ggml.dll +0 -0
  48. llama_cpp/lib/ggml.lib +0 -0
  49. llama_cpp/lib/llama.dll +0 -0
  50. llama_cpp/lib/llama.lib +0 -0
  51. llama_cpp/lib/mtmd.dll +0 -0
  52. llama_cpp/lib/mtmd.lib +0 -0
  53. llama_cpp/llama.py +2422 -0
  54. llama_cpp/llama_cache.py +155 -0
  55. llama_cpp/llama_chat_format.py +3962 -0
  56. llama_cpp/llama_cpp.py +4374 -0
  57. llama_cpp/llama_grammar.py +953 -0
  58. llama_cpp/llama_speculative.py +64 -0
  59. llama_cpp/llama_tokenizer.py +120 -0
  60. llama_cpp/llama_types.py +316 -0
  61. llama_cpp/llava_cpp.py +158 -0
  62. llama_cpp/mtmd_cpp.py +280 -0
  63. llama_cpp/py.typed +0 -0
  64. llama_cpp/server/__init__.py +0 -0
  65. llama_cpp/server/__main__.py +100 -0
  66. llama_cpp/server/app.py +597 -0
  67. llama_cpp/server/cli.py +97 -0
  68. llama_cpp/server/errors.py +212 -0
  69. llama_cpp/server/model.py +312 -0
  70. llama_cpp/server/settings.py +240 -0
  71. llama_cpp/server/types.py +316 -0
  72. llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
  73. llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
  74. llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
  75. llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ import traceback
5
+ import time
6
+ from re import compile, Match, Pattern
7
+ from typing import Callable, Coroutine, Optional, Tuple, Union, Dict
8
+ from typing_extensions import TypedDict
9
+
10
+
11
+ from fastapi import (
12
+ Request,
13
+ Response,
14
+ HTTPException,
15
+ )
16
+ from fastapi.responses import JSONResponse
17
+ from fastapi.routing import APIRoute
18
+
19
+ from llama_cpp.server.types import (
20
+ CreateCompletionRequest,
21
+ CreateEmbeddingRequest,
22
+ CreateChatCompletionRequest,
23
+ )
24
+
25
+
26
class ErrorResponse(TypedDict):
    """OpenAI style error response.

    Mirrors the ``error`` object shape of the OpenAI REST API so that
    clients written against OpenAI can parse failures from this server.
    """

    # Human-readable description of the error.
    message: str
    # Error category, e.g. "invalid_request_error" or "internal_server_error".
    type: str
    # Name of the request parameter that caused the error, if applicable.
    param: Optional[str]
    # Machine-readable error code, e.g. "context_length_exceeded".
    code: Optional[str]
33
+
34
+
35
class ErrorResponseFormatters:
    """Collection of formatters for error responses.

    Each formatter turns a regex match on a llama_cpp error message into
    an OpenAI-style (status code, error response) pair.

    Args:
        request (Union[CreateCompletionRequest, CreateChatCompletionRequest]):
            Request body
        match (Match[str]): Match object from regex pattern

    Returns:
        Tuple[int, ErrorResponse]: Status code and error response
    """

    @staticmethod
    def context_length_exceeded(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for context length exceeded error.

        Expects the pattern's group(1) to be the requested (prompt) token
        count and group(2) to be the model's context window.
        """

        context_window = int(match.group(2))
        prompt_tokens = int(match.group(1))
        completion_tokens = request.max_tokens
        if hasattr(request, "messages"):
            # Chat completion
            message = (
                "This model's maximum context length is {} tokens. "
                "However, you requested {} tokens "
                "({} in the messages, {} in the completion). "
                "Please reduce the length of the messages or completion."
            )
            # The offending field for a chat request is "messages".
            param = "messages"
        else:
            # Text completion
            message = (
                "This model's maximum context length is {} tokens, "
                "however you requested {} tokens "
                "({} in your prompt; {} for the completion). "
                "Please reduce your prompt; or completion length."
            )
            # Fix: previously hard-coded to "messages" even for text
            # completions; report the actual offending field.
            param = "prompt"
        return 400, ErrorResponse(
            message=message.format(
                context_window,
                (completion_tokens or 0) + prompt_tokens,
                prompt_tokens,
                completion_tokens,
            ),  # type: ignore
            type="invalid_request_error",
            param=param,
            code="context_length_exceeded",
        )

    @staticmethod
    def model_not_found(
        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
        match,  # type: Match[str] # type: ignore
    ) -> Tuple[int, ErrorResponse]:
        """Formatter for model_not_found error."""

        model_path = str(match.group(1))
        message = f"The model `{model_path}` does not exist"
        return 400, ErrorResponse(
            message=message,
            type="invalid_request_error",
            param=None,
            code="model_not_found",
        )
100
+
101
+
102
class RouteErrorHandler(APIRoute):
    """Custom APIRoute that handles application errors and exceptions."""

    # key: regex pattern for original error message from llama_cpp
    # value: formatter function producing (status_code, ErrorResponse)
    pattern_and_formatters: Dict[
        "Pattern[str]",
        Callable[
            [
                Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
                "Match[str]",
            ],
            Tuple[int, ErrorResponse],
        ],
    ] = {
        compile(
            r"Requested tokens \((\d+)\) exceed context window of (\d+)"
        ): ErrorResponseFormatters.context_length_exceeded,
        compile(
            r"Model path does not exist: (.+)"
        ): ErrorResponseFormatters.model_not_found,
    }

    def error_message_wrapper(
        self,
        error: Exception,
        body: Optional[
            Union[
                "CreateChatCompletionRequest",
                "CreateCompletionRequest",
                "CreateEmbeddingRequest",
            ]
        ] = None,
    ) -> Tuple[int, ErrorResponse]:
        """Wraps error message in OpenAI style error response.

        Known llama_cpp error messages (matched via pattern_and_formatters)
        become 400-level responses; everything else becomes a 500.
        """
        if body is not None and isinstance(
            body,
            (
                CreateCompletionRequest,
                CreateChatCompletionRequest,
            ),
        ):
            # When text completion or chat completion
            for pattern, callback in self.pattern_and_formatters.items():
                match = pattern.search(str(error))
                if match is not None:
                    return callback(body, match)

        # Only print the trace on unexpected exceptions
        print(f"Exception: {str(error)}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)

        # Wrap other errors as internal server error
        return 500, ErrorResponse(
            message=str(error),
            type="internal_server_error",
            param=None,
            code=None,
        )

    def get_route_handler(
        self,
    ) -> Callable[[Request], Coroutine[None, None, Response]]:
        """Defines custom route handler that catches exceptions and formats
        in OpenAI style error response"""

        original_route_handler = super().get_route_handler()

        async def custom_route_handler(request: Request) -> Response:
            try:
                start_sec = time.perf_counter()
                response = await original_route_handler(request)
                elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000)
                response.headers["openai-processing-ms"] = f"{elapsed_time_ms}"
                return response
            except HTTPException as unauthorized:
                # api key check failed
                raise unauthorized
            except Exception as exc:
                body: Optional[
                    Union[
                        CreateChatCompletionRequest,
                        CreateCompletionRequest,
                        CreateEmbeddingRequest,
                    ]
                ]
                try:
                    # Fix: request.json() itself raises on an empty or
                    # malformed body; it must live inside this try so a bad
                    # body degrades to body=None instead of escaping the
                    # error handler and producing an unformatted 500.
                    json_body = await request.json()
                    if "messages" in json_body:
                        # Chat completion
                        body = CreateChatCompletionRequest(**json_body)
                    elif "prompt" in json_body:
                        # Text completion
                        body = CreateCompletionRequest(**json_body)
                    else:
                        # Embedding
                        body = CreateEmbeddingRequest(**json_body)
                except Exception:
                    # Invalid request body
                    body = None

                # Get proper error message from the exception
                (
                    status_code,
                    error_message,
                ) = self.error_message_wrapper(error=exc, body=body)
                return JSONResponse(
                    {"error": error_message},
                    status_code=status_code,
                )

        return custom_route_handler
@@ -0,0 +1,312 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from typing import Dict, Optional, Union, List
6
+
7
+ import llama_cpp
8
+ import llama_cpp.llama_speculative as llama_speculative
9
+ import llama_cpp.llama_tokenizer as llama_tokenizer
10
+
11
+ from llama_cpp.server.settings import ModelSettings
12
+
13
+
14
class LlamaProxy:
    """Manages the configured llama.cpp models for the server.

    Keeps at most one model loaded at a time: requesting a different
    model alias closes the currently loaded model and loads the new one.
    Also exposes dict-like access to the underlying ModelSettings.
    """

    def __init__(self, models: List[ModelSettings]) -> None:
        assert len(models) > 0, "No models provided!"

        # alias -> settings; a missing alias defaults to the model path.
        self._model_settings_dict: dict[str, ModelSettings] = {}
        for model in models:
            if not model.model_alias:
                model.model_alias = model.model
            self._model_settings_dict[model.model_alias] = model

        self._current_model: Optional[llama_cpp.Llama] = None
        self._current_model_alias: Optional[str] = None

        self._default_model_settings: ModelSettings = models[0]
        self._default_model_alias: str = self._default_model_settings.model_alias  # type: ignore

        # Eagerly load the default model so the first request is served fast.
        self._current_model = self.load_llama_from_model_settings(
            self._default_model_settings
        )
        self._current_model_alias = self._default_model_alias

    def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama:
        """Return the model for *model*, loading (and swapping) if needed.

        Unknown or missing aliases fall back to the default model.
        """
        if model is None:
            model = self._default_model_alias

        if model not in self._model_settings_dict:
            model = self._default_model_alias

        if model == self._current_model_alias:
            if self._current_model is not None:
                return self._current_model

        # Swap: free the previously loaded model before loading the new one.
        if self._current_model:
            self._current_model.close()
        self._current_model = None

        settings = self._model_settings_dict[model]
        self._current_model = self.load_llama_from_model_settings(settings)
        self._current_model_alias = model
        return self._current_model

    def __getitem__(self, model: str):
        """Return the (serialized) settings for *model* alias."""
        return self._model_settings_dict[model].model_dump()

    def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]):
        """Register settings for *model*; accepts JSON text/bytes as well."""
        if isinstance(settings, (bytes, str)):
            settings = ModelSettings.model_validate_json(settings)
        self._model_settings_dict[model] = settings

    def __iter__(self):
        """Iterate over the configured model aliases."""
        yield from self._model_settings_dict

    def free(self):
        """Release the currently loaded model, if any."""
        if self._current_model:
            self._current_model.close()
        # Fix: was `del self._current_model`, which removed the attribute
        # entirely and made any later access raise AttributeError.
        self._current_model = None
        self._current_model_alias = None

    @staticmethod
    def _create_chat_handler(settings: ModelSettings):
        """Build the chat handler implied by ``settings.chat_format``.

        Returns None for chat formats that need no explicit handler.
        """
        # Multimodal chat formats: each pairs the LLM with a CLIP model and
        # differs only in which handler class is instantiated.
        multimodal_handlers = {
            "llava-1-5": llama_cpp.llama_chat_format.Llava15ChatHandler,
            "obsidian": llama_cpp.llama_chat_format.ObsidianChatHandler,
            "llava-1-6": llama_cpp.llama_chat_format.Llava16ChatHandler,
            "moondream": llama_cpp.llama_chat_format.MoondreamChatHandler,
            "nanollava": llama_cpp.llama_chat_format.NanoLlavaChatHandler,
            "llama-3-vision-alpha": llama_cpp.llama_chat_format.Llama3VisionAlpha,
            "minicpm-v-2.6": llama_cpp.llama_chat_format.MiniCPMv26ChatHandler,
            "qwen2.5-vl": llama_cpp.llama_chat_format.Qwen25VLChatHandler,
        }
        handler_cls = multimodal_handlers.get(settings.chat_format)
        if handler_cls is not None:
            assert settings.clip_model_path is not None, "clip model not found"
            if settings.hf_model_repo_id is not None:
                return handler_cls.from_pretrained(
                    repo_id=settings.hf_model_repo_id,
                    filename=settings.clip_model_path,
                    verbose=settings.verbose,
                )
            return handler_cls(
                clip_model_path=settings.clip_model_path,
                verbose=settings.verbose,
            )
        if settings.chat_format == "hf-autotokenizer":
            assert (
                settings.hf_pretrained_model_name_or_path is not None
            ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer"
            return llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler(
                settings.hf_pretrained_model_name_or_path
            )
        if settings.chat_format == "hf-tokenizer-config":
            assert (
                settings.hf_tokenizer_config_path is not None
            ), "hf_tokenizer_config_path must be set for hf-tokenizer-config"
            # Fix: use a context manager; json.load(open(...)) leaked the
            # file handle.
            with open(settings.hf_tokenizer_config_path) as config_file:
                tokenizer_config = json.load(config_file)
            return llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
                tokenizer_config
            )
        return None

    @staticmethod
    def _parse_kv_overrides(
        overrides,
    ) -> Optional[Dict[str, Union[bool, int, float, str]]]:
        """Parse ``key=type:value`` override strings into a typed dict.

        Entries without a ``type:`` prefix are silently skipped (preserving
        the original behavior). Raises ValueError on an unknown type tag.
        """
        if overrides is None:
            return None
        assert isinstance(overrides, list)
        kv_overrides: Dict[str, Union[bool, int, float, str]] = {}
        for kv in overrides:
            # Fix: split at the first separator only so values containing
            # further '=' or ':' characters (e.g. Windows paths) survive.
            key, value = kv.split("=", 1)
            if ":" in value:
                value_type, value = value.split(":", 1)
                if value_type == "bool":
                    kv_overrides[key] = value.lower() in ["true", "1"]
                elif value_type == "int":
                    kv_overrides[key] = int(value)
                elif value_type == "float":
                    kv_overrides[key] = float(value)
                elif value_type == "str":
                    kv_overrides[key] = value
                else:
                    raise ValueError(f"Unknown value type {value_type}")
        return kv_overrides

    @staticmethod
    def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
        """Instantiate a llama_cpp.Llama (with handlers/cache) from settings."""
        import functools

        chat_handler = LlamaProxy._create_chat_handler(settings)

        tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
        if settings.hf_pretrained_model_name_or_path is not None:
            tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(
                settings.hf_pretrained_model_name_or_path
            )

        draft_model = None
        if settings.draft_model is not None:
            # NOTE(review): only prompt-lookup decoding is wired up here;
            # the draft model path itself is not loaded.
            draft_model = llama_speculative.LlamaPromptLookupDecoding(
                num_pred_tokens=settings.draft_model_num_pred_tokens
            )

        kv_overrides = LlamaProxy._parse_kv_overrides(settings.kv_overrides)

        kwargs = {}
        if settings.hf_model_repo_id is not None:
            # Download from the Hugging Face Hub instead of a local path.
            create_fn = functools.partial(
                llama_cpp.Llama.from_pretrained,
                repo_id=settings.hf_model_repo_id,
                filename=settings.model,
            )
        else:
            create_fn = llama_cpp.Llama
            kwargs["model_path"] = settings.model

        _model = create_fn(
            **kwargs,
            # Model Params
            n_gpu_layers=settings.n_gpu_layers,
            split_mode=settings.split_mode,
            main_gpu=settings.main_gpu,
            tensor_split=settings.tensor_split,
            vocab_only=settings.vocab_only,
            use_mmap=settings.use_mmap,
            use_mlock=settings.use_mlock,
            kv_overrides=kv_overrides,
            rpc_servers=settings.rpc_servers,
            # Context Params
            seed=settings.seed,
            n_ctx=settings.n_ctx,
            n_batch=settings.n_batch,
            n_ubatch=settings.n_ubatch,
            n_threads=settings.n_threads,
            n_threads_batch=settings.n_threads_batch,
            rope_scaling_type=settings.rope_scaling_type,
            rope_freq_base=settings.rope_freq_base,
            rope_freq_scale=settings.rope_freq_scale,
            yarn_ext_factor=settings.yarn_ext_factor,
            yarn_attn_factor=settings.yarn_attn_factor,
            yarn_beta_fast=settings.yarn_beta_fast,
            yarn_beta_slow=settings.yarn_beta_slow,
            yarn_orig_ctx=settings.yarn_orig_ctx,
            mul_mat_q=settings.mul_mat_q,
            logits_all=settings.logits_all,
            embedding=settings.embedding,
            offload_kqv=settings.offload_kqv,
            flash_attn=settings.flash_attn,
            # Sampling Params
            last_n_tokens_size=settings.last_n_tokens_size,
            # LoRA Params
            lora_base=settings.lora_base,
            lora_path=settings.lora_path,
            # Backend Params
            numa=settings.numa,
            # Chat Format Params
            chat_format=settings.chat_format,
            chat_handler=chat_handler,
            # Speculative Decoding
            draft_model=draft_model,
            # KV Cache Quantization
            type_k=settings.type_k,
            type_v=settings.type_v,
            # Tokenizer
            tokenizer=tokenizer,
            # Misc
            verbose=settings.verbose,
        )
        if settings.cache:
            if settings.cache_type == "disk":
                if settings.verbose:
                    print(f"Using disk cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size)
            else:
                if settings.verbose:
                    print(f"Using ram cache with size {settings.cache_size}")
                cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size)
            _model.set_cache(cache)
        return _model