llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/convert_hf_to_gguf.py +8751 -0
- bin/ggml-base.dll +0 -0
- bin/ggml-cpu.dll +0 -0
- bin/ggml.dll +0 -0
- bin/llama-mtmd-cli.exe +0 -0
- bin/llama.dll +0 -0
- bin/mtmd.dll +0 -0
- include/ggml-alloc.h +76 -0
- include/ggml-backend.h +354 -0
- include/ggml-blas.h +25 -0
- include/ggml-cann.h +123 -0
- include/ggml-cpp.h +39 -0
- include/ggml-cpu.h +145 -0
- include/ggml-cuda.h +47 -0
- include/ggml-metal.h +66 -0
- include/ggml-opt.h +256 -0
- include/ggml-rpc.h +33 -0
- include/ggml-sycl.h +49 -0
- include/ggml-vulkan.h +29 -0
- include/ggml-webgpu.h +19 -0
- include/ggml.h +2467 -0
- include/gguf.h +202 -0
- include/llama-cpp.h +30 -0
- include/llama.h +1482 -0
- include/mtmd-helper.h +91 -0
- include/mtmd.h +298 -0
- lib/cmake/ggml/ggml-config.cmake +328 -0
- lib/cmake/ggml/ggml-version.cmake +65 -0
- lib/cmake/llama/llama-config.cmake +54 -0
- lib/cmake/llama/llama-version.cmake +65 -0
- lib/ggml-base.lib +0 -0
- lib/ggml-cpu.lib +0 -0
- lib/ggml.lib +0 -0
- lib/llama.lib +0 -0
- lib/mtmd.lib +0 -0
- lib/pkgconfig/llama.pc +10 -0
- llama_cpp/__init__.py +4 -0
- llama_cpp/_ctypes_extensions.py +131 -0
- llama_cpp/_ggml.py +12 -0
- llama_cpp/_internals.py +856 -0
- llama_cpp/_logger.py +47 -0
- llama_cpp/_utils.py +78 -0
- llama_cpp/lib/ggml-base.dll +0 -0
- llama_cpp/lib/ggml-base.lib +0 -0
- llama_cpp/lib/ggml-cpu.dll +0 -0
- llama_cpp/lib/ggml-cpu.lib +0 -0
- llama_cpp/lib/ggml.dll +0 -0
- llama_cpp/lib/ggml.lib +0 -0
- llama_cpp/lib/llama.dll +0 -0
- llama_cpp/lib/llama.lib +0 -0
- llama_cpp/lib/mtmd.dll +0 -0
- llama_cpp/lib/mtmd.lib +0 -0
- llama_cpp/llama.py +2422 -0
- llama_cpp/llama_cache.py +155 -0
- llama_cpp/llama_chat_format.py +3962 -0
- llama_cpp/llama_cpp.py +4374 -0
- llama_cpp/llama_grammar.py +953 -0
- llama_cpp/llama_speculative.py +64 -0
- llama_cpp/llama_tokenizer.py +120 -0
- llama_cpp/llama_types.py +316 -0
- llama_cpp/llava_cpp.py +158 -0
- llama_cpp/mtmd_cpp.py +280 -0
- llama_cpp/py.typed +0 -0
- llama_cpp/server/__init__.py +0 -0
- llama_cpp/server/__main__.py +100 -0
- llama_cpp/server/app.py +597 -0
- llama_cpp/server/cli.py +97 -0
- llama_cpp/server/errors.py +212 -0
- llama_cpp/server/model.py +312 -0
- llama_cpp/server/settings.py +240 -0
- llama_cpp/server/types.py +316 -0
- llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
- llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
- llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
- llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import numpy.typing as npt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LlamaDraftModel(abc.ABC):
    """Abstract interface for draft models used in speculative decoding.

    Implementations receive the tokens generated so far and return an array
    of candidate draft tokens to be verified by the main model.
    """

    @abc.abstractmethod
    def __call__(
        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
    ) -> npt.NDArray[np.intc]:
        """Return candidate continuation tokens for ``input_ids``."""
        raise NotImplementedError()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LlamaPromptLookupDecoding(LlamaDraftModel):
    """Based on https://github.com/apoorvumang/prompt-lookup-decoding"""

    def __init__(self, max_ngram_size: int = 2, num_pred_tokens: int = 10):
        # Longest trailing n-gram to search for, and how many tokens to draft
        # after a match.
        self.max_ngram_size = max_ngram_size
        self.num_pred_tokens = num_pred_tokens

    @staticmethod
    def find_candidate_pred_tokens(
        input_ids: npt.NDArray[np.intc],
        max_ngram_size: int,
        num_pred_tokens: int,
    ):
        """Look for an earlier occurrence of the prompt's trailing n-gram and
        return up to ``num_pred_tokens`` tokens that followed it.

        Longer n-grams are tried first; an empty ``np.intc`` array is
        returned when no usable repetition exists.
        """
        total = input_ids.shape[0]

        # Try the longest suffix n-gram first, shrinking down to size 1.
        for size in range(min(max_ngram_size, total - 1), 0, -1):
            # The n-gram we are looking for: the last `size` tokens.
            suffix = input_ids[-size:]

            # Every length-`size` window over the prompt.
            views = np.lib.stride_tricks.sliding_window_view(input_ids, (size,))

            # Indices of windows equal to the suffix n-gram.
            hits = np.nonzero(np.all(views == suffix, axis=1))[0]

            # Earliest match with at least one following token wins.
            for hit in hits:
                begin = hit + size
                stop = min(begin + num_pred_tokens, total)
                if begin < stop:
                    return input_ids[begin:stop]

        # No repeated n-gram found anywhere in the prompt.
        return np.array([], dtype=np.intc)

    def __call__(
        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
    ) -> npt.NDArray[np.intc]:
        return self.find_candidate_pred_tokens(
            input_ids=input_ids,
            max_ngram_size=self.max_ngram_size,
            num_pred_tokens=self.num_pred_tokens,
        )
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
from typing import (
|
|
5
|
+
List,
|
|
6
|
+
Optional,
|
|
7
|
+
Any,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
import llama_cpp
|
|
11
|
+
from llama_cpp.llama_types import List
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BaseLlamaTokenizer(abc.ABC):
    """Abstract tokenizer interface implemented by both the native llama.cpp
    tokenizer and the Hugging Face adapter."""

    @abc.abstractmethod
    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize the text into tokens.

        Args:
            text: The utf-8 encoded string to tokenize.
            add_bos: Whether to add a beginning of sequence token.
            special: Whether to tokenize special tokens.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Detokenize the tokens into text.

        Args:
            tokens: The list of tokens to detokenize.
            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
            special: Whether to detokenize special tokens.
        """
        raise NotImplementedError
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class LlamaTokenizer(BaseLlamaTokenizer):
    """Tokenizer backed by a llama.cpp model's built-in vocabulary."""

    def __init__(self, llama: llama_cpp.Llama):
        # Delegate all work to the model's internal tokenizer object.
        self._model = llama._model  # type: ignore

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize utf-8 encoded bytes with the model vocabulary."""
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Convert tokens back into utf-8 bytes.

        NOTE(review): ``prev_tokens`` is accepted for interface compatibility
        but is not used here — no offset mapping is performed, unlike
        ``LlamaHFTokenizer.detokenize``. Confirm this is intentional.
        """
        return self._model.detokenize(tokens, special=special)

    def encode(
        self, text: str, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Convenience wrapper: tokenize a ``str`` instead of bytes."""
        return self.tokenize(
            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )

    def decode(self, tokens: List[int]) -> str:
        """Convenience wrapper: detokenize straight to a ``str``."""
        return self.detokenize(tokens).decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        # vocab_only avoids loading the full model weights just to tokenize.
        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class LlamaHFTokenizer(BaseLlamaTokenizer):
    """Tokenizer adapter backed by a Hugging Face ``transformers`` tokenizer."""

    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        decoded = text.decode("utf-8", errors="ignore")
        return self.hf_tokenizer.encode(decoded, add_special_tokens=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        skip = not special
        if prev_tokens is None:
            return self.hf_tokenizer.decode(
                tokens, skip_special_tokens=skip
            ).encode("utf-8", errors="ignore")
        # Decode with and without the new tokens and return only the suffix:
        # this keeps multi-token characters intact across streaming calls.
        full = self.hf_tokenizer.decode(
            prev_tokens + tokens, skip_special_tokens=skip
        ).encode("utf-8", errors="ignore")
        prefix = self.hf_tokenizer.decode(
            prev_tokens, skip_special_tokens=skip
        ).encode("utf-8", errors="ignore")
        return full[len(prefix):]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        """Build a tokenizer from a Hugging Face model name or local path."""
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library is required to use the `HFTokenizer`."
                "You can install it with `pip install transformers`."
            )
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)
|
llama_cpp/llama_types.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""Types and request signatures for OpenAI compatibility
|
|
2
|
+
|
|
3
|
+
NOTE: These types may change to match the OpenAI OpenAPI specification.
|
|
4
|
+
|
|
5
|
+
Based on the OpenAI OpenAPI specification:
|
|
6
|
+
https://github.com/openai/openai-openapi/blob/master/openapi.yaml
|
|
7
|
+
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import Any, List, Optional, Dict, Union
|
|
11
|
+
from typing_extensions import TypedDict, NotRequired, Literal
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# NOTE: Defining this correctly using annotations seems to break pydantic validation.
|
|
15
|
+
# This is a workaround until we can figure out how to do this correctly
|
|
16
|
+
# JsonType = Union[None, int, str, bool, List["JsonType"], Dict[str, "JsonType"]]
|
|
17
|
+
JsonType = Union[None, int, str, bool, List[Any], Dict[str, Any]]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EmbeddingUsage(TypedDict):
|
|
21
|
+
prompt_tokens: int
|
|
22
|
+
total_tokens: int
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Embedding(TypedDict):
|
|
26
|
+
index: int
|
|
27
|
+
object: str
|
|
28
|
+
embedding: Union[List[float], List[List[float]]]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CreateEmbeddingResponse(TypedDict):
|
|
32
|
+
object: Literal["list"]
|
|
33
|
+
model: str
|
|
34
|
+
data: List[Embedding]
|
|
35
|
+
usage: EmbeddingUsage
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CompletionLogprobs(TypedDict):
|
|
39
|
+
text_offset: List[int]
|
|
40
|
+
token_logprobs: List[Optional[float]]
|
|
41
|
+
tokens: List[str]
|
|
42
|
+
top_logprobs: List[Optional[Dict[str, float]]]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class CompletionChoice(TypedDict):
|
|
46
|
+
text: str
|
|
47
|
+
index: int
|
|
48
|
+
logprobs: Optional[CompletionLogprobs]
|
|
49
|
+
finish_reason: Optional[Literal["stop", "length"]]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class CompletionUsage(TypedDict):
|
|
53
|
+
prompt_tokens: int
|
|
54
|
+
completion_tokens: int
|
|
55
|
+
total_tokens: int
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class CreateCompletionResponse(TypedDict):
|
|
59
|
+
id: str
|
|
60
|
+
object: Literal["text_completion"]
|
|
61
|
+
created: int
|
|
62
|
+
model: str
|
|
63
|
+
choices: List[CompletionChoice]
|
|
64
|
+
usage: NotRequired[CompletionUsage]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ChatCompletionResponseFunctionCall(TypedDict):
|
|
68
|
+
name: str
|
|
69
|
+
arguments: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ChatCompletionResponseMessage(TypedDict):
|
|
73
|
+
content: Optional[str]
|
|
74
|
+
tool_calls: NotRequired["ChatCompletionMessageToolCalls"]
|
|
75
|
+
role: Literal["assistant", "function"] # NOTE: "function" may be incorrect here
|
|
76
|
+
function_call: NotRequired[ChatCompletionResponseFunctionCall] # DEPRECATED
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ChatCompletionFunction(TypedDict):
|
|
80
|
+
name: str
|
|
81
|
+
description: NotRequired[str]
|
|
82
|
+
parameters: Dict[str, JsonType] # TODO: make this more specific
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ChatCompletionTopLogprobToken(TypedDict):
|
|
86
|
+
token: str
|
|
87
|
+
logprob: float
|
|
88
|
+
bytes: Optional[List[int]]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class ChatCompletionLogprobToken(ChatCompletionTopLogprobToken):
|
|
92
|
+
token: str
|
|
93
|
+
logprob: float
|
|
94
|
+
bytes: Optional[List[int]]
|
|
95
|
+
top_logprobs: List[ChatCompletionTopLogprobToken]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class ChatCompletionLogprobs(TypedDict):
|
|
99
|
+
content: Optional[List[ChatCompletionLogprobToken]]
|
|
100
|
+
refusal: Optional[List[ChatCompletionLogprobToken]]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class ChatCompletionResponseChoice(TypedDict):
|
|
104
|
+
index: int
|
|
105
|
+
message: "ChatCompletionResponseMessage"
|
|
106
|
+
logprobs: Optional[ChatCompletionLogprobs]
|
|
107
|
+
finish_reason: Optional[str]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class CreateChatCompletionResponse(TypedDict):
|
|
111
|
+
id: str
|
|
112
|
+
object: Literal["chat.completion"]
|
|
113
|
+
created: int
|
|
114
|
+
model: str
|
|
115
|
+
choices: List["ChatCompletionResponseChoice"]
|
|
116
|
+
usage: CompletionUsage
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class ChatCompletionMessageToolCallChunkFunction(TypedDict):
|
|
120
|
+
name: Optional[str]
|
|
121
|
+
arguments: str
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ChatCompletionMessageToolCallChunk(TypedDict):
|
|
125
|
+
index: int
|
|
126
|
+
id: NotRequired[str]
|
|
127
|
+
type: Literal["function"]
|
|
128
|
+
function: ChatCompletionMessageToolCallChunkFunction
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class ChatCompletionStreamResponseDeltaEmpty(TypedDict):
|
|
132
|
+
pass
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class ChatCompletionStreamResponseDeltaFunctionCall(TypedDict):
|
|
136
|
+
name: str
|
|
137
|
+
arguments: str
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class ChatCompletionStreamResponseDelta(TypedDict):
|
|
141
|
+
content: NotRequired[Optional[str]]
|
|
142
|
+
function_call: NotRequired[
|
|
143
|
+
Optional[ChatCompletionStreamResponseDeltaFunctionCall]
|
|
144
|
+
] # DEPRECATED
|
|
145
|
+
tool_calls: NotRequired[Optional[List[ChatCompletionMessageToolCallChunk]]]
|
|
146
|
+
role: NotRequired[Optional[Literal["system", "user", "assistant", "tool"]]]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class ChatCompletionStreamResponseChoice(TypedDict):
|
|
150
|
+
index: int
|
|
151
|
+
delta: Union[
|
|
152
|
+
ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
|
|
153
|
+
]
|
|
154
|
+
finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
|
|
155
|
+
logprobs: NotRequired[Optional[ChatCompletionLogprobs]]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class CreateChatCompletionStreamResponse(TypedDict):
|
|
159
|
+
id: str
|
|
160
|
+
model: str
|
|
161
|
+
object: Literal["chat.completion.chunk"]
|
|
162
|
+
created: int
|
|
163
|
+
choices: List[ChatCompletionStreamResponseChoice]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ChatCompletionFunctions(TypedDict):
|
|
167
|
+
name: str
|
|
168
|
+
description: NotRequired[str]
|
|
169
|
+
parameters: Dict[str, JsonType] # TODO: make this more specific
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class ChatCompletionFunctionCallOption(TypedDict):
|
|
173
|
+
name: str
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class ChatCompletionRequestResponseFormat(TypedDict):
|
|
177
|
+
type: Literal["text", "json_object"]
|
|
178
|
+
schema: NotRequired[
|
|
179
|
+
JsonType
|
|
180
|
+
] # https://docs.endpoints.anyscale.com/guides/json_mode/
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class ChatCompletionRequestMessageContentPartText(TypedDict):
|
|
184
|
+
type: Literal["text"]
|
|
185
|
+
text: str
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class ChatCompletionRequestMessageContentPartImageImageUrl(TypedDict):
|
|
189
|
+
url: str
|
|
190
|
+
detail: NotRequired[Literal["auto", "low", "high"]]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class ChatCompletionRequestMessageContentPartImage(TypedDict):
|
|
194
|
+
type: Literal["image_url"]
|
|
195
|
+
image_url: Union[str, ChatCompletionRequestMessageContentPartImageImageUrl]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
ChatCompletionRequestMessageContentPart = Union[
|
|
199
|
+
ChatCompletionRequestMessageContentPartText,
|
|
200
|
+
ChatCompletionRequestMessageContentPartImage,
|
|
201
|
+
]
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class ChatCompletionRequestSystemMessage(TypedDict):
|
|
205
|
+
role: Literal["system"]
|
|
206
|
+
content: Optional[str]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class ChatCompletionRequestUserMessage(TypedDict):
|
|
210
|
+
role: Literal["user"]
|
|
211
|
+
content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]]
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
class ChatCompletionMessageToolCallFunction(TypedDict):
|
|
215
|
+
name: str
|
|
216
|
+
arguments: str
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class ChatCompletionMessageToolCall(TypedDict):
|
|
220
|
+
id: str
|
|
221
|
+
type: Literal["function"]
|
|
222
|
+
function: ChatCompletionMessageToolCallFunction
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
ChatCompletionMessageToolCalls = List[ChatCompletionMessageToolCall]
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict):
|
|
229
|
+
name: str
|
|
230
|
+
arguments: str
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class ChatCompletionRequestAssistantMessage(TypedDict):
|
|
234
|
+
role: Literal["assistant"]
|
|
235
|
+
content: NotRequired[str]
|
|
236
|
+
tool_calls: NotRequired[ChatCompletionMessageToolCalls]
|
|
237
|
+
function_call: NotRequired[
|
|
238
|
+
ChatCompletionRequestAssistantMessageFunctionCall
|
|
239
|
+
] # DEPRECATED
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class ChatCompletionRequestToolMessage(TypedDict):
|
|
243
|
+
role: Literal["tool"]
|
|
244
|
+
content: Optional[str]
|
|
245
|
+
tool_call_id: str
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class ChatCompletionRequestFunctionMessage(TypedDict):
|
|
249
|
+
role: Literal["function"]
|
|
250
|
+
content: Optional[str]
|
|
251
|
+
name: str
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
ChatCompletionRequestMessage = Union[
|
|
255
|
+
ChatCompletionRequestSystemMessage,
|
|
256
|
+
ChatCompletionRequestUserMessage,
|
|
257
|
+
ChatCompletionRequestAssistantMessage,
|
|
258
|
+
ChatCompletionRequestUserMessage,
|
|
259
|
+
ChatCompletionRequestToolMessage,
|
|
260
|
+
ChatCompletionRequestFunctionMessage,
|
|
261
|
+
]
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ChatCompletionRequestFunctionCallOption(TypedDict):
|
|
265
|
+
name: str
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
ChatCompletionRequestFunctionCall = Union[
|
|
269
|
+
Literal["none", "auto"], ChatCompletionRequestFunctionCallOption
|
|
270
|
+
]
|
|
271
|
+
|
|
272
|
+
ChatCompletionFunctionParameters = Dict[str, JsonType] # TODO: make this more specific
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
class ChatCompletionToolFunction(TypedDict):
|
|
276
|
+
name: str
|
|
277
|
+
description: NotRequired[str]
|
|
278
|
+
parameters: ChatCompletionFunctionParameters
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
class ChatCompletionTool(TypedDict):
|
|
282
|
+
type: Literal["function"]
|
|
283
|
+
function: ChatCompletionToolFunction
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class ChatCompletionNamedToolChoiceFunction(TypedDict):
|
|
287
|
+
name: str
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class ChatCompletionNamedToolChoice(TypedDict):
|
|
291
|
+
type: Literal["function"]
|
|
292
|
+
function: ChatCompletionNamedToolChoiceFunction
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
ChatCompletionToolChoiceOption = Union[
|
|
296
|
+
Literal["none", "auto", "required"], ChatCompletionNamedToolChoice
|
|
297
|
+
]
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# NOTE: The following type names are not part of the OpenAI OpenAPI specification
|
|
301
|
+
# and will be removed in a future major release.
|
|
302
|
+
|
|
303
|
+
EmbeddingData = Embedding
|
|
304
|
+
CompletionChunk = CreateCompletionResponse
|
|
305
|
+
Completion = CreateCompletionResponse
|
|
306
|
+
CreateCompletionStreamResponse = CreateCompletionResponse
|
|
307
|
+
ChatCompletionMessage = ChatCompletionResponseMessage
|
|
308
|
+
ChatCompletionChoice = ChatCompletionResponseChoice
|
|
309
|
+
ChatCompletion = CreateChatCompletionResponse
|
|
310
|
+
ChatCompletionChunkDeltaEmpty = ChatCompletionStreamResponseDeltaEmpty
|
|
311
|
+
ChatCompletionChunkChoice = ChatCompletionStreamResponseChoice
|
|
312
|
+
ChatCompletionChunkDelta = ChatCompletionStreamResponseDelta
|
|
313
|
+
ChatCompletionChunk = CreateChatCompletionStreamResponse
|
|
314
|
+
ChatCompletionStreamResponse = CreateChatCompletionStreamResponse
|
|
315
|
+
ChatCompletionResponseFunction = ChatCompletionFunction
|
|
316
|
+
ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall
|
llama_cpp/llava_cpp.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from __future__ import annotations

import os
from ctypes import (
    c_bool,
    c_char_p,
    c_int,
    c_uint8,
    c_float,
    c_void_p,
    POINTER,
    _Pointer,  # type: ignore
    Structure,
)
import pathlib
from typing import (
    Union,
    NewType,
    Optional,
    TYPE_CHECKING,
)

import llama_cpp.llama_cpp as llama_cpp

from llama_cpp._ctypes_extensions import (
    load_shared_library,
    ctypes_function_for_shared_library,
)

if TYPE_CHECKING:
    from llama_cpp._ctypes_extensions import (
        CtypesArray,
    )


# Specify the base name of the shared library to load
_libllava_base_name = "llava"
# LLAVA_CPP_LIB lets users point at a custom build of the llava library.
_libllava_override_path = os.environ.get("LLAVA_CPP_LIB")
# With no override, search the bundled `lib/` directory next to this file;
# otherwise pass an empty Path (override resolution presumably happens inside
# load_shared_library — TODO confirm).
_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path()

# Load the library
_libllava = load_shared_library(_libllava_base_name, _libllava_base_path)

# Decorator factory binding annotated Python stubs to symbols in _libllava.
ctypes_function = ctypes_function_for_shared_library(_libllava)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
################################################
|
|
48
|
+
# llava.h
|
|
49
|
+
################################################
|
|
50
|
+
|
|
51
|
+
# struct clip_ctx;
# Opaque handle to a CLIP context; held as an int on the Python side, passed
# to C as a void pointer.
clip_ctx_p = NewType("clip_ctx_p", int)
clip_ctx_p_ctypes = c_void_p


# struct llava_image_embed {
#     float * embed;
#     int n_image_pos;
# };
class llava_image_embed(Structure):
    # Mirrors the C struct above: `embed` points at the image embedding
    # floats, `n_image_pos` is the number of positions the image occupies.
    _fields_ = [
        ("embed", POINTER(c_float)),
        ("n_image_pos", c_int),
    ]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# /** sanity check for clip <-> llava embed size match */
# LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
@ctypes_function(
    "llava_validate_embed_size",
    [llama_cpp.llama_context_p_ctypes, clip_ctx_p_ctypes],
    c_bool,
)
def llava_validate_embed_size(
    ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, /
) -> bool:
    """Check that the CLIP and llama embedding sizes match.

    Binding stub only — the body is supplied by the ctypes_function decorator.
    """
    ...


# /** build an image embed from image file bytes */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
@ctypes_function(
    "llava_image_embed_make_with_bytes",
    [clip_ctx_p_ctypes, c_int, POINTER(c_uint8), c_int],
    POINTER(llava_image_embed),
)
def llava_image_embed_make_with_bytes(
    ctx_clip: clip_ctx_p,
    n_threads: Union[c_int, int],
    image_bytes: CtypesArray[c_uint8],
    image_bytes_length: Union[c_int, int],
    /,
) -> "_Pointer[llava_image_embed]":
    """Build an image embed from raw image bytes (caller frees with
    llava_image_embed_free). Binding stub."""
    ...


# /** build an image embed from a path to an image filename */
# LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
@ctypes_function(
    "llava_image_embed_make_with_filename",
    [clip_ctx_p_ctypes, c_int, c_char_p],
    POINTER(llava_image_embed),
)
def llava_image_embed_make_with_filename(
    ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, /
) -> "_Pointer[llava_image_embed]":
    """Build an image embed from an image file path (bytes-encoded).
    Binding stub."""
    ...


# LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
# /** free an embedding made with llava_image_embed_make_* */
@ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None)
def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /):
    """Free an embedding made with llava_image_embed_make_*. Binding stub."""
    ...


# /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
# LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
@ctypes_function(
    "llava_eval_image_embed",
    [
        llama_cpp.llama_context_p_ctypes,
        POINTER(llava_image_embed),
        c_int,
        POINTER(c_int),
    ],
    c_bool,
)
def llava_eval_image_embed(
    ctx_llama: llama_cpp.llama_context_p,
    embed: "_Pointer[llava_image_embed]",
    n_batch: Union[c_int, int],
    n_past: "_Pointer[c_int]",
    /,
) -> bool:
    """Write the image embed into the llama context starting at *n_past;
    on success the C side advances *n_past past the image. Binding stub."""
    ...
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
################################################
|
|
140
|
+
# clip.h
|
|
141
|
+
################################################
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)
def clip_model_load(
    fname: bytes, verbosity: Union[c_int, int], /
) -> Optional[clip_ctx_p]:
    """Load an mmproj (CLIP) model; returns None on failure (NULL pointer).
    Binding stub — body supplied by the ctypes_function decorator."""
    ...


# /** free mmproj model */
# CLIP_API void clip_free(struct clip_ctx * ctx);
@ctypes_function("clip_free", [clip_ctx_p_ctypes], None)
def clip_free(ctx: clip_ctx_p, /):
    """Free a CLIP context returned by clip_model_load. Binding stub."""
    ...
|
158
|
+
|