llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. bin/convert_hf_to_gguf.py +8751 -0
  2. bin/ggml-base.dll +0 -0
  3. bin/ggml-cpu.dll +0 -0
  4. bin/ggml.dll +0 -0
  5. bin/llama-mtmd-cli.exe +0 -0
  6. bin/llama.dll +0 -0
  7. bin/mtmd.dll +0 -0
  8. include/ggml-alloc.h +76 -0
  9. include/ggml-backend.h +354 -0
  10. include/ggml-blas.h +25 -0
  11. include/ggml-cann.h +123 -0
  12. include/ggml-cpp.h +39 -0
  13. include/ggml-cpu.h +145 -0
  14. include/ggml-cuda.h +47 -0
  15. include/ggml-metal.h +66 -0
  16. include/ggml-opt.h +256 -0
  17. include/ggml-rpc.h +33 -0
  18. include/ggml-sycl.h +49 -0
  19. include/ggml-vulkan.h +29 -0
  20. include/ggml-webgpu.h +19 -0
  21. include/ggml.h +2467 -0
  22. include/gguf.h +202 -0
  23. include/llama-cpp.h +30 -0
  24. include/llama.h +1482 -0
  25. include/mtmd-helper.h +91 -0
  26. include/mtmd.h +298 -0
  27. lib/cmake/ggml/ggml-config.cmake +328 -0
  28. lib/cmake/ggml/ggml-version.cmake +65 -0
  29. lib/cmake/llama/llama-config.cmake +54 -0
  30. lib/cmake/llama/llama-version.cmake +65 -0
  31. lib/ggml-base.lib +0 -0
  32. lib/ggml-cpu.lib +0 -0
  33. lib/ggml.lib +0 -0
  34. lib/llama.lib +0 -0
  35. lib/mtmd.lib +0 -0
  36. lib/pkgconfig/llama.pc +10 -0
  37. llama_cpp/__init__.py +4 -0
  38. llama_cpp/_ctypes_extensions.py +131 -0
  39. llama_cpp/_ggml.py +12 -0
  40. llama_cpp/_internals.py +856 -0
  41. llama_cpp/_logger.py +47 -0
  42. llama_cpp/_utils.py +78 -0
  43. llama_cpp/lib/ggml-base.dll +0 -0
  44. llama_cpp/lib/ggml-base.lib +0 -0
  45. llama_cpp/lib/ggml-cpu.dll +0 -0
  46. llama_cpp/lib/ggml-cpu.lib +0 -0
  47. llama_cpp/lib/ggml.dll +0 -0
  48. llama_cpp/lib/ggml.lib +0 -0
  49. llama_cpp/lib/llama.dll +0 -0
  50. llama_cpp/lib/llama.lib +0 -0
  51. llama_cpp/lib/mtmd.dll +0 -0
  52. llama_cpp/lib/mtmd.lib +0 -0
  53. llama_cpp/llama.py +2422 -0
  54. llama_cpp/llama_cache.py +155 -0
  55. llama_cpp/llama_chat_format.py +3962 -0
  56. llama_cpp/llama_cpp.py +4374 -0
  57. llama_cpp/llama_grammar.py +953 -0
  58. llama_cpp/llama_speculative.py +64 -0
  59. llama_cpp/llama_tokenizer.py +120 -0
  60. llama_cpp/llama_types.py +316 -0
  61. llama_cpp/llava_cpp.py +158 -0
  62. llama_cpp/mtmd_cpp.py +280 -0
  63. llama_cpp/py.typed +0 -0
  64. llama_cpp/server/__init__.py +0 -0
  65. llama_cpp/server/__main__.py +100 -0
  66. llama_cpp/server/app.py +597 -0
  67. llama_cpp/server/cli.py +97 -0
  68. llama_cpp/server/errors.py +212 -0
  69. llama_cpp/server/model.py +312 -0
  70. llama_cpp/server/settings.py +240 -0
  71. llama_cpp/server/types.py +316 -0
  72. llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
  73. llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
  74. llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
  75. llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
@@ -0,0 +1,240 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+
5
+ from typing import Optional, List, Literal, Union, Dict, cast
6
+ from typing_extensions import Self
7
+
8
+ from pydantic import Field, model_validator
9
+ from pydantic_settings import BaseSettings
10
+
11
+ import llama_cpp
12
+
13
# Disable warning for model and model_alias settings.
# Pydantic v2 treats "model_" as a protected namespace and would emit a
# warning for fields named model / model_alias; clearing the namespace
# list silences that for every BaseSettings subclass below.
BaseSettings.model_config["protected_namespaces"] = ()
15
+
16
+
17
class ModelSettings(BaseSettings):
    """Model settings used to load a Llama model.

    Each field maps onto a keyword argument of ``llama_cpp.Llama`` (or the
    server's model-loading helpers); descriptions surface in the generated
    OpenAPI schema and ``--help`` output.
    """

    model: str = Field(
        description="The path to the model to use for generating completions."
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )
    # Model Params
    n_gpu_layers: int = Field(
        default=0,
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    split_mode: int = Field(
        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        description="The split mode to use.",
    )
    main_gpu: int = Field(
        default=0,
        ge=0,
        description="Main GPU to use.",
    )
    tensor_split: Optional[List[float]] = Field(
        default=None,
        description="Split layers across multiple GPUs in proportion.",
    )
    vocab_only: bool = Field(
        default=False, description="Whether to only return the vocabulary."
    )
    # NOTE: these two defaults call into the loaded llama.cpp library once at
    # import time, so they reflect the capabilities of the local build.
    use_mmap: bool = Field(
        default=llama_cpp.llama_supports_mmap(),
        description="Use mmap.",
    )
    use_mlock: bool = Field(
        default=llama_cpp.llama_supports_mlock(),
        description="Use mlock.",
    )
    kv_overrides: Optional[List[str]] = Field(
        default=None,
        description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
    )
    rpc_servers: Optional[str] = Field(
        default=None,
        description="comma separated list of rpc servers for offloading",
    )
    # Context Params
    seed: int = Field(
        default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
    )
    n_ctx: int = Field(default=2048, ge=0, description="The context size.")
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
    n_ubatch: int = Field(
        default=512, ge=1, description="The physical batch size used by llama.cpp"
    )
    # -1 is rewritten to the real CPU count by set_dynamic_defaults (below)
    # before the ge=1 constraint is checked.
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use. Use -1 for max cpu threads",
    )
    n_threads_batch: int = Field(
        default=max(multiprocessing.cpu_count(), 1),
        ge=0,
        description="The number of threads to use when batch processing. Use -1 for max cpu threads",
    )
    rope_scaling_type: int = Field(
        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
    )
    rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
    rope_freq_scale: float = Field(
        default=0.0, description="RoPE frequency scaling factor"
    )
    yarn_ext_factor: float = Field(default=-1.0)
    yarn_attn_factor: float = Field(default=1.0)
    yarn_beta_fast: float = Field(default=32.0)
    yarn_beta_slow: float = Field(default=1.0)
    yarn_orig_ctx: int = Field(default=0)
    mul_mat_q: bool = Field(
        default=True, description="if true, use experimental mul_mat_q kernels"
    )
    logits_all: bool = Field(default=True, description="Whether to return logits.")
    embedding: bool = Field(default=False, description="Whether to use embeddings.")
    offload_kqv: bool = Field(
        default=True, description="Whether to offload kqv to the GPU."
    )
    flash_attn: bool = Field(
        default=False, description="Whether to use flash attention."
    )
    # Sampling Params
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )
    # LoRA Params
    lora_base: Optional[str] = Field(
        default=None,
        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
    )
    lora_path: Optional[str] = Field(
        default=None,
        description="Path to a LoRA file to apply to the model.",
    )
    # Backend Params
    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
    # Chat Format Params
    chat_format: Optional[str] = Field(
        default=None,
        description="Chat format to use.",
    )
    clip_model_path: Optional[str] = Field(
        default=None,
        description="Path to a CLIP model to use for multi-modal chat completion.",
    )
    # Cache Params
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    cache_size: int = Field(
        default=2 << 30,
        description="The size of the cache in bytes. Only used if cache is True.",
    )
    # Tokenizer Options
    hf_tokenizer_config_path: Optional[str] = Field(
        default=None,
        description="The path to a HuggingFace tokenizer_config.json file.",
    )
    hf_pretrained_model_name_or_path: Optional[str] = Field(
        default=None,
        description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
    )
    # Loading from HuggingFace Model Hub
    hf_model_repo_id: Optional[str] = Field(
        default=None,
        description="The model repo id to use for the HuggingFace tokenizer model.",
    )
    # Speculative Decoding
    draft_model: Optional[str] = Field(
        default=None,
        description="Method to use for speculative decoding. One of (prompt-lookup-decoding).",
    )
    draft_model_num_pred_tokens: int = Field(
        default=10,
        description="Number of tokens to predict using the draft model.",
    )
    # KV Cache Quantization
    type_k: Optional[int] = Field(
        default=None,
        description="Type of the key cache quantization.",
    )
    type_v: Optional[int] = Field(
        default=None,
        description="Type of the value cache quantization.",
    )
    # Misc
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
    )

    # mode="before" runs on the raw input mapping before field validation,
    # so the -1 sentinel can be replaced before ge= constraints apply.
    @model_validator(mode="before")
    def set_dynamic_defaults(self) -> Self:
        # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
        # NOTE: despite the `self` annotation, pydantic passes the raw input
        # dict here (hence the cast); mutating it in place is intentional.
        cpu_count = multiprocessing.cpu_count()
        values = cast(Dict[str, int], self)
        if values.get("n_threads", 0) == -1:
            values["n_threads"] = cpu_count
        if values.get("n_threads_batch", 0) == -1:
            values["n_threads_batch"] = cpu_count
        return self
200
+
201
+
202
class ServerSettings(BaseSettings):
    """Server settings used to configure the FastAPI and Uvicorn server."""

    # Uvicorn Settings
    host: str = Field(default="localhost", description="Listen address")
    port: int = Field(default=8000, description="Listen port")
    # Both SSL settings must be provided together to enable HTTPS.
    ssl_keyfile: Optional[str] = Field(
        default=None, description="SSL key file for HTTPS"
    )
    ssl_certfile: Optional[str] = Field(
        default=None, description="SSL certificate file for HTTPS"
    )
    # FastAPI Settings
    api_key: Optional[str] = Field(
        default=None,
        description="API key for authentication. If set all requests need to be authenticated.",
    )
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )
    disable_ping_events: bool = Field(
        default=False,
        description="Disable EventSource pings (may be needed for some clients).",
    )
    root_path: str = Field(
        default="",
        description="The root path for the server. Useful when running behind a reverse proxy.",
    )
231
+
232
+
233
class Settings(ServerSettings, ModelSettings):
    """Combined server and (single) model settings for the CLI entry point."""

    pass
235
+
236
+
237
class ConfigFileSettings(ServerSettings):
    """Configuration file format settings.

    Extends the server settings with a list of per-model configurations,
    allowing one config file to describe multiple loadable models.
    """

    # default_factory avoids the mutable-default-argument smell of
    # `default=[]` while producing the same empty-list default per instance.
    models: List[ModelSettings] = Field(
        default_factory=list, description="Model configs"
    )
@@ -0,0 +1,316 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional, Union, Dict
4
+ from typing_extensions import TypedDict, Literal
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ import llama_cpp
9
+
10
+
11
# Shared Field definitions reused across the request models below so the
# same defaults, constraints, and OpenAPI descriptions apply everywhere.

model_field = Field(
    description="The model to use for generating completions.", default=None
)

max_tokens_field = Field(
    default=16, ge=1, description="The maximum number of tokens to generate."
)

min_tokens_field = Field(
    default=0,
    ge=0,
    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
)

temperature_field = Field(
    default=0.8,
    description="Adjust the randomness of the generated text.\n\n"
    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)

top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
)

min_p_field = Field(
    default=0.05,
    ge=0.0,
    le=1.0,
    description="Sets a minimum base probability threshold for token selection.\n\n"
    + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.",
)

stop_field = Field(
    default=None,
    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
)

stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)

top_k_field = Field(
    default=40,
    ge=0,
    description="Limit the next token selection to the K most probable tokens.\n\n"
    + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
)

repeat_penalty_field = Field(
    default=1.1,
    ge=0.0,
    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
    + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
)

presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
)

frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
)

mirostat_mode_field = Field(
    default=0,
    ge=0,
    le=2,
    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
)

mirostat_tau_field = Field(
    default=5.0,
    ge=0.0,
    le=10.0,
    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
)

mirostat_eta_field = Field(
    default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
)

# NOTE(review): llama.cpp's grammar format is usually called GBNF; "CBNF"
# in this user-facing description may be a typo — confirm before changing.
# Also note this module-level name is shadowed by the per-class
# `grammar: Optional[str] = None` attributes below.
grammar = Field(
    default=None,
    description="A CBNF grammar (as string) to be used for formatting the model's output.",
)
107
+
108
+
109
class CreateCompletionRequest(BaseModel):
    """Request body for the OpenAI-style completion endpoint."""

    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate completions for."
    )
    suffix: Optional[str] = Field(
        default=None,
        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
    )
    # NOTE(review): deliberately not max_tokens_field — this allows 0 (and
    # None), where the shared field enforces ge=1; presumably 0/None means
    # "no limit". Confirm against the handler before unifying.
    max_tokens: Optional[int] = Field(
        default=16, ge=0, description="The maximum number of tokens to generate."
    )
    min_tokens: int = min_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
    echo: bool = Field(
        default=False,
        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
    )
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated.",
    )
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    seed: Optional[int] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    best_of: Optional[int] = 1
    user: Optional[str] = Field(default=None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    grammar: Optional[str] = None

    # Example payload surfaced in the generated OpenAPI docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                    "stop": ["\n", "###"],
                }
            ]
        }
    }
165
+
166
+
167
class CreateEmbeddingRequest(BaseModel):
    """Request body for the embedding endpoint."""

    model: Optional[str] = model_field
    # Required field: a single string or a batch of strings to embed.
    input: Union[str, List[str]] = Field(description="The input to embed.")
    user: Optional[str] = Field(default=None)

    # Example payload surfaced in the generated OpenAPI docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "input": "The food was delicious and the waiter...",
                }
            ]
        }
    }
181
+
182
+
183
class ChatCompletionRequestMessage(BaseModel):
    """A single chat message (role + content).

    Used below to build example payloads; request validation itself uses
    llama_cpp.ChatCompletionRequestMessage.
    """

    role: Literal["system", "user", "assistant", "function"] = Field(
        default="user", description="The role of the message."
    )
    content: Optional[str] = Field(
        default="", description="The content of the message."
    )
190
+
191
+
192
class CreateChatCompletionRequest(BaseModel):
    """Request body for the OpenAI-style chat completion endpoint."""

    # default_factory avoids sharing one mutable list as the default.
    messages: List[llama_cpp.ChatCompletionRequestMessage] = Field(
        default_factory=list,
        description="A list of messages to generate completions for.",
    )
    functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
        default=None,
        description="A list of functions to apply to the generated completions.",
    )
    function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field(
        default=None,
        description="A function to apply to the generated completions.",
    )
    tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field(
        default=None,
        description="A list of tools to apply to the generated completions.",
    )
    tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
        default=None,
        description="A tool to apply to the generated completions.",
    )  # TODO: verify
    max_tokens: Optional[int] = Field(
        default=None,
        description="The maximum number of tokens to generate. Defaults to inf",
    )
    min_tokens: int = min_tokens_field
    # Description fixed: it previously claimed "Default is True" while the
    # actual default is False.
    logprobs: Optional[bool] = Field(
        default=False,
        description="Whether to output the logprobs or not. Default is False",
    )
    top_logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
    )
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    seed: Optional[int] = Field(None)
    response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field(
        default=None,
    )

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    grammar: Optional[str] = None

    # Example payload surfaced in the generated OpenAPI docs; built from the
    # local ChatCompletionRequestMessage model defined above.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "messages": [
                        ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ).model_dump(),
                        ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ).model_dump(),
                    ]
                }
            ]
        }
    }
269
+
270
+
271
class ModelData(TypedDict):
    """Metadata for one model in the /v1/models listing."""

    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]
276
+
277
+
278
class ModelList(TypedDict):
    """Response shape for the /v1/models listing."""

    object: Literal["list"]
    data: List[ModelData]
281
+
282
+
283
class TokenizeInputRequest(BaseModel):
    """Request body for the tokenize endpoint."""

    model: Optional[str] = model_field
    input: str = Field(description="The input to tokenize.")

    # Example payload surfaced in the generated OpenAPI docs.
    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
    }
290
+
291
+
292
class TokenizeInputResponse(BaseModel):
    """Response body for the tokenize endpoint."""

    tokens: List[int] = Field(description="A list of tokens.")

    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}
296
+
297
+
298
class TokenizeInputCountResponse(BaseModel):
    """Response body for the tokenize count endpoint."""

    count: int = Field(description="The number of tokens in the input.")

    model_config = {"json_schema_extra": {"example": {"count": 5}}}
302
+
303
+
304
class DetokenizeInputRequest(BaseModel):
    """Request body for the detokenize endpoint."""

    model: Optional[str] = model_field
    # Description typo fixed: "toekns" -> "tokens".
    tokens: List[int] = Field(description="A list of tokens to detokenize.")

    # Example payload surfaced in the generated OpenAPI docs.
    model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}}
309
+
310
+
311
class DetokenizeInputResponse(BaseModel):
    """Response body for the detokenize endpoint."""

    text: str = Field(description="The detokenized text.")

    model_config = {
        "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
    }