llama-cpp-haystack 0.4.4__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/CHANGELOG.md +7 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/PKG-INFO +1 -1
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/pyproject.toml +1 -0
- llama_cpp_haystack-1.0.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +251 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/tests/test_chat_generator.py +168 -112
- llama_cpp_haystack-0.4.4/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +0 -144
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/.gitignore +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/LICENSE.txt +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/README.md +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/examples/llama_cpp_generator_example.py +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/examples/rag_pipeline_example.py +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/pydoc/config.yml +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/src/haystack_integrations/components/generators/llama_cpp/__init__.py +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/src/haystack_integrations/components/generators/llama_cpp/generator.py +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/tests/__init__.py +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/tests/models/.gitignore +0 -0
- {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/tests/test_generator.py +0 -0
{llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-cpp-haystack
-Version: 0.4.4
+Version: 1.0.0
 Summary: An integration between the llama.cpp LLM framework and Haystack
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
```
llama_cpp_haystack-1.0.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py (new file)

````diff
@@ -0,0 +1,251 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional
+
+from haystack import component, default_from_dict, default_to_dict
+from haystack.dataclasses import ChatMessage, ToolCall
+from haystack.tools import Tool, _check_duplicate_tool_names, deserialize_tools_inplace
+from llama_cpp import ChatCompletionResponseChoice, CreateChatCompletionResponse, Llama
+from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+
+logger = logging.getLogger(__name__)
+
+
+def _convert_message_to_llamacpp_format(message: ChatMessage) -> Dict[str, Any]:
+    """
+    Convert a ChatMessage to the format expected by the llama.cpp chat completion API.
+    """
+    text_contents = message.texts
+    tool_calls = message.tool_calls
+    tool_call_results = message.tool_call_results
+
+    if not text_contents and not tool_calls and not tool_call_results:
+        msg = "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`."
+        raise ValueError(msg)
+    elif len(text_contents) + len(tool_call_results) > 1:
+        msg = "A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`."
+        raise ValueError(msg)
+
+    role = message._role.value
+    if role == "tool":
+        role = "function"
+
+    llamacpp_msg: Dict[str, Any] = {"role": role}
+
+    if tool_call_results:
+        if tool_call_results[0].origin.id is None:
+            msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+            raise ValueError(msg)
+        llamacpp_msg["content"] = tool_call_results[0].result
+        llamacpp_msg["tool_call_id"] = tool_call_results[0].origin.id
+        # Llama.cpp does not provide a way to communicate errors in tool invocations, so we ignore the error field
+        return llamacpp_msg
+
+    if text_contents:
+        llamacpp_msg["content"] = text_contents[0]
+    if tool_calls:
+        llamacpp_tool_calls = []
+        for tc in tool_calls:
+            if tc.id is None:
+                msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+                raise ValueError(msg)
+            llamacpp_tool_calls.append(
+                {
+                    "id": tc.id,
+                    "type": "function",
+                    # We disable ensure_ascii so special chars like emojis are not converted
+                    "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
+                }
+            )
+        llamacpp_msg["tool_calls"] = llamacpp_tool_calls
+    return llamacpp_msg
+
+
+@component
+class LlamaCppChatGenerator:
+    """
+    Provides an interface to generate text using an LLM via llama.cpp.
+
+    [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+    It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
+    user_message = [ChatMessage.from_user("Who is the best American actor?")]
+    generator = LlamaCppChatGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
+
+    print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
+    # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
+    ```
+    """
+
+    def __init__(
+        self,
+        model: str,
+        n_ctx: Optional[int] = 0,
+        n_batch: Optional[int] = 512,
+        model_kwargs: Optional[Dict[str, Any]] = None,
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        tools: Optional[List[Tool]] = None,
+    ):
+        """
+        :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
+            If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
+        :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
+        :param n_batch: Prompt processing maximum batch size.
+        :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
+            These keyword arguments provide fine-grained control over the model loading.
+            In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
+        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+        :param tools:
+            A list of tools for which the model can prepare calls.
+        """
+
+        model_kwargs = model_kwargs or {}
+        generation_kwargs = generation_kwargs or {}
+
+        # check if the model_kwargs contain the essential parameters
+        # otherwise, populate them with values from init parameters
+        model_kwargs.setdefault("model_path", model)
+        model_kwargs.setdefault("n_ctx", n_ctx)
+        model_kwargs.setdefault("n_batch", n_batch)
+
+        _check_duplicate_tool_names(tools)
+
+        self.model_path = model
+        self.n_ctx = n_ctx
+        self.n_batch = n_batch
+        self.model_kwargs = model_kwargs
+        self.generation_kwargs = generation_kwargs
+        self._model = None
+        self.tools = tools
+
+    def warm_up(self):
+        if "hf_tokenizer_path" in self.model_kwargs and "tokenizer" not in self.model_kwargs:
+            tokenizer = LlamaHFTokenizer.from_pretrained(self.model_kwargs["hf_tokenizer_path"])
+            self.model_kwargs["tokenizer"] = tokenizer
+
+        if self._model is None:
+            self._model = Llama(**self.model_kwargs)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        serialized_tools = [tool.to_dict() for tool in self.tools] if self.tools else None
+        return default_to_dict(
+            self,
+            model=self.model_path,
+            n_ctx=self.n_ctx,
+            n_batch=self.n_batch,
+            model_kwargs=self.model_kwargs,
+            generation_kwargs=self.generation_kwargs,
+            tools=serialized_tools,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LlamaCppChatGenerator":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
+        deserialize_tools_inplace(data["init_parameters"], key="tools")
+        return default_from_dict(cls, data)
+
+    @component.output_types(replies=List[ChatMessage])
+    def run(
+        self,
+        messages: List[ChatMessage],
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        tools: Optional[List[Tool]] = None,
+    ):
+        """
+        Run the text generation model on the given list of ChatMessages.
+
+        :param messages:
+            A list of ChatMessage instances representing the input messages.
+        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+        :param tools:
+            A list of tools for which the model can prepare calls. If set, it will override the `tools` parameter set
+            during component initialization.
+        :returns: A dictionary with the following keys:
+            - `replies`: The responses from the model
+        """
+        if self._model is None:
+            error_msg = "The model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(error_msg)
+
+        if not messages:
+            return {"replies": []}
+
+        updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+        formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
+
+        tools = tools or self.tools
+        llamacpp_tools = {}
+        if tools:
+            tool_definitions = [{"type": "function", "function": {**t.tool_spec}} for t in tools]
+            llamacpp_tools = {"tools": tool_definitions}
+
+        response = self._model.create_chat_completion(
+            messages=formatted_messages, **updated_generation_kwargs, **llamacpp_tools
+        )
+
+        replies = []
+        for choice in response["choices"]:
+            chat_message = self._convert_chat_completion_choice_to_chat_message(choice, response)
+            replies.append(chat_message)
+
+        return {"replies": replies}
+
+    @staticmethod
+    def _convert_chat_completion_choice_to_chat_message(
+        choice: ChatCompletionResponseChoice, response: CreateChatCompletionResponse
+    ) -> ChatMessage:
+        llamacpp_message = choice["message"]
+        text_content = llamacpp_message.get("content", "") or None
+        tool_calls = []
+
+        if llamacpp_tool_calls := llamacpp_message.get("tool_calls", []):
+            for llamacpp_tc in llamacpp_tool_calls:
+                arguments_str = llamacpp_tc["function"]["arguments"]
+                try:
+                    arguments = json.loads(arguments_str)
+                    tool_calls.append(
+                        ToolCall(id=llamacpp_tc["id"], tool_name=llamacpp_tc["function"]["name"], arguments=arguments)
+                    )
+                except json.JSONDecodeError:
+                    logger.warning(
+                        "Llama.cpp returned a malformed JSON string for tool call arguments. This tool call "
+                        "will be skipped. Tool call ID: %s, Tool name: %s, Arguments: %s",
+                        llamacpp_tc["id"],
+                        llamacpp_tc["function"]["name"],
+                        arguments_str,
+                    )
+
+        meta = {
+            "response_id": response["id"],
+            "model": response["model"],
+            "created": response["created"],
+            "index": choice["index"],
+            "finish_reason": choice["finish_reason"],
+            "usage": response["usage"],
+        }
+
+        return ChatMessage.from_assistant(text=text_content, tool_calls=tool_calls, meta=meta)
````
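The new module above is the heart of the 1.0.0 release: tool definitions become first-class `Tool` objects and tool calls come back as `ToolCall` dataclasses on the reply, instead of raw dicts in `meta`. A minimal usage sketch of the new API, pieced together from the code above and the updated tests below (the GGUF path and the tool body are illustrative placeholders; note that `run()` raises a `RuntimeError` unless `warm_up()` was called first):

```python
from typing import Annotated

from haystack.dataclasses import ChatMessage
from haystack.tools import create_tool_from_function

from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator


def get_current_temperature(location: Annotated[str, "The city and state, e.g. San Francisco, CA"]):
    """Get the current temperature in a given location"""
    return {"location": location, "temperature": "72", "unit": "fahrenheit"}


# Illustrative model path: any chat-capable GGUF model should work here.
generator = LlamaCppChatGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
generator.warm_up()  # loads the Llama model; run() refuses to work without it

tool = create_tool_from_function(get_current_temperature)
result = generator.run(
    messages=[ChatMessage.from_user("What's the weather like in San Francisco?")],
    tools=[tool],  # converted internally to llama.cpp {"type": "function", ...} specs
)
reply = result["replies"][0]
if reply.tool_calls:  # the model chose to call the tool
    print(reply.tool_calls[0].tool_name, reply.tool_calls[0].arguments)
else:
    print(reply.text)
```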
{llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.0.0}/tests/test_chat_generator.py

```diff
@@ -2,14 +2,16 @@ import json
 import os
 import urllib.request
 from pathlib import Path
+from typing import Annotated
 from unittest.mock import MagicMock
 
 import pytest
 from haystack import Document, Pipeline
 from haystack.components.builders import ChatPromptBuilder
 from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
-from haystack.dataclasses import ChatMessage, ChatRole
+from haystack.dataclasses import ChatMessage, ChatRole, TextContent, ToolCall
 from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.tools import create_tool_from_function
 
 from haystack_integrations.components.generators.llama_cpp.chat.chat_generator import (
     LlamaCppChatGenerator,
```
```diff
@@ -35,17 +37,72 @@ def download_file(file_link, filename, capsys):
 
 def test_convert_message_to_llamacpp_format():
     message = ChatMessage.from_system("You are good assistant")
-    assert _convert_message_to_llamacpp_format(message) == {
+    assert _convert_message_to_llamacpp_format(message) == {
+        "role": "system",
+        "content": "You are good assistant",
+    }
 
     message = ChatMessage.from_user("I have a question")
-    assert _convert_message_to_llamacpp_format(message) == {
-    …
+    assert _convert_message_to_llamacpp_format(message) == {
+        "role": "user",
+        "content": "I have a question",
+    }
+
+    message = ChatMessage.from_assistant(text="I have an answer", meta={"finish_reason": "stop"})
+    assert _convert_message_to_llamacpp_format(message) == {
+        "role": "assistant",
+        "content": "I have an answer",
+    }
+
+    message = ChatMessage.from_assistant(
+        tool_calls=[ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"})]
+    )
+    assert _convert_message_to_llamacpp_format(message) == {
+        "role": "assistant",
+        "tool_calls": [
+            {
+                "type": "function",
+                "function": {"name": "weather", "arguments": '{"city": "Paris"}'},
+                "id": "123",
+            }
+        ],
+    }
+
+    tool_result = json.dumps({"weather": "sunny", "temperature": "25"})
+    message = ChatMessage.from_tool(
+        tool_result=tool_result,
+        origin=ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"}),
+    )
+    assert _convert_message_to_llamacpp_format(message) == {
+        "role": "function",
+        "content": tool_result,
+        "tool_call_id": "123",
+    }
+
+
+def test_convert_message_to_llamacpp_invalid():
+    message = ChatMessage(_role=ChatRole.ASSISTANT, _content=[])
+    with pytest.raises(ValueError):
+        _convert_message_to_llamacpp_format(message)
+
+    message = ChatMessage(
+        _role=ChatRole.ASSISTANT,
+        _content=[
+            TextContent(text="I have an answer"),
+            TextContent(text="I have another answer"),
+        ],
+    )
+    with pytest.raises(ValueError):
+        _convert_message_to_llamacpp_format(message)
+
+    tool_call_null_id = ToolCall(id=None, tool_name="weather", arguments={"city": "Paris"})
+    message = ChatMessage.from_assistant(tool_calls=[tool_call_null_id])
+    with pytest.raises(ValueError):
+        _convert_message_to_llamacpp_format(message)
+
+    message = ChatMessage.from_tool(tool_result="result", origin=tool_call_null_id)
+    with pytest.raises(ValueError):
+        _convert_message_to_llamacpp_format(message)
 
 
 class TestLlamaCppChatGenerator:
```
```diff
@@ -68,7 +125,7 @@ class TestLlamaCppChatGenerator:
     def generator_mock(self):
         mock_model = MagicMock()
         generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=2048, n_batch=512)
-        generator.model = mock_model
+        generator._model = mock_model
         return generator, mock_model
 
     def test_default_init(self):
```
```diff
@@ -99,6 +156,39 @@ class TestLlamaCppChatGenerator:
         assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512}
         assert generator.generation_kwargs == {}
 
+    def test_to_dict(self):
+        generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=8192, n_batch=512)
+        assert generator.to_dict() == {
+            "type": "haystack_integrations.components.generators.llama_cpp.chat.chat_generator.LlamaCppChatGenerator",
+            "init_parameters": {
+                "model": "test_model.gguf",
+                "n_ctx": 8192,
+                "n_batch": 512,
+                "model_kwargs": {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512},
+                "generation_kwargs": {},
+                "tools": None,
+            },
+        }
+
+    def test_from_dict(self):
+        serialized = {
+            "type": "haystack_integrations.components.generators.llama_cpp.chat.chat_generator.LlamaCppChatGenerator",
+            "init_parameters": {
+                "model": "test_model.gguf",
+                "n_ctx": 8192,
+                "n_batch": 512,
+                "model_kwargs": {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512},
+                "generation_kwargs": {},
+                "tools": None,
+            },
+        }
+        deserialized = LlamaCppChatGenerator.from_dict(serialized)
+        assert deserialized.model_path == "test_model.gguf"
+        assert deserialized.n_ctx == 8192
+        assert deserialized.n_batch == 512
+        assert deserialized.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512}
+        assert deserialized.generation_kwargs == {}
+
     def test_ignores_model_path_if_specified_in_model_kwargs(self):
         """
         Test that model_path is ignored if already specified in model_kwargs.
```
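The two serialization tests above pin down the new round-trip behavior: `to_dict()` emits the serialized tools (or `None`) and `from_dict()` restores them via `deserialize_tools_inplace`. A short sketch of what that enables, e.g. persisting a configured generator (names are illustrative; `tool` is any `haystack.tools.Tool`):

```python
# Round-trip sketch: the tools list survives serialization.
generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=8192, n_batch=512, tools=[tool])
data = generator.to_dict()                        # tools serialized via Tool.to_dict()
restored = LlamaCppChatGenerator.from_dict(data)  # tools rebuilt in place
assert restored.tools[0].name == tool.name
```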
```diff
@@ -320,16 +410,17 @@ class TestLlamaCppChatGenerator:
 
 
 class TestLlamaCppChatGeneratorFunctionary:
-    def get_current_temperature(self, location):
+    def get_current_temperature(self, location: Annotated[str, "The city and state, e.g. San Francisco, CA"]):
         """Get the current temperature in a given location"""
+
         if "tokyo" in location.lower():
-            return …
-            return …
-            return …
-            …
+            return {"location": "Tokyo", "temperature": "10", "unit": "celsius"}
+        if "san francisco" in location.lower():
+            return {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
+        if "paris" in location.lower():
+            return {"location": "Paris", "temperature": "22", "unit": "celsius"}
+
+        return {"location": location, "temperature": "unknown"}
 
     @pytest.fixture
     def generator(self, model_path, capsys):
```
```diff
@@ -354,86 +445,60 @@ class TestLlamaCppChatGeneratorFunctionary:
 
     @pytest.mark.integration
     def test_function_call(self, generator):
-        …
-                    "properties": {
-                        "username": {"type": "string", "description": "The username to retrieve information for."}
-                    },
-                    "required": ["username"],
-                },
-                "description": "Retrieves detailed information about a user.",
-            },
-            }
-        ]
+
+        def get_user_info(username: Annotated[str, "The username to retrieve information for."]):
+            """Retrieves detailed information about a user."""
+            return {"username": username, "age": 25, "location": "San Francisco"}
+
+        tool = create_tool_from_function(get_user_info)
+
         tool_choice = {"type": "function", "function": {"name": "get_user_info"}}
 
         messages = [
             ChatMessage.from_user("Get information for user john_doe"),
         ]
-        response = generator.run(messages=messages, generation_kwargs={"…
+        response = generator.run(messages=messages, tools=[tool], generation_kwargs={"tool_choice": tool_choice})
+
+        reply = response["replies"][0]
 
-        assert …
-        …
+        assert reply.role == ChatRole.ASSISTANT
+        assert reply.tool_calls
+        tool_calls = reply.tool_calls
         assert len(tool_calls) > 0
-        assert tool_calls[0]…
-        assert …
-        assert response["replies"][0].role == ChatRole.ASSISTANT
+        assert tool_calls[0].tool_name == "get_user_info"
+        assert tool_calls[0].arguments == {"username": "john_doe"}
 
     def test_function_call_and_execute(self, generator):
-
-        tools = [
-            {
-                "type": "function",
-                "function": {
-                    "name": "get_current_temperature",
-                    "description": "Get the current temperature in a given location",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {
-                            "location": {
-                                "type": "string",
-                                "description": "The city and state, e.g. San Francisco, CA",
-                            },
-                        },
-                        "required": ["location"],
-                    },
-                },
-            }
-        ]
+        temperature_tool = create_tool_from_function(self.get_current_temperature)
 
-        …
-        response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice})
+        user_message = ChatMessage.from_user("What's the weather like in San Francisco?")
 
-        …
+        tool_choice = {"type": "function", "function": {"name": "get_current_temperature"}}
+        response = generator.run(
+            messages=[user_message], tools=[temperature_tool], generation_kwargs={"tool_choice": tool_choice}
+        )
 
         assert "replies" in response
         assert len(response["replies"]) > 0
-
         first_reply = response["replies"][0]
-        assert …
-        tool_calls = first_reply.…
+        assert first_reply.tool_calls
+        tool_calls = first_reply.tool_calls
 
-        …
-        assert function_name in available_functions
-        function_response = available_functions[function_name](**function_args)
-        function_message = ChatMessage.from_function(function_response, function_name)
-        messages.append(function_message)
+        # tool invocation
+        tool_call = tool_calls[0]
+        function_args = tool_call.arguments
+        tool_response = str(temperature_tool.invoke(**function_args))
 
-        …
+        tool_message = ChatMessage.from_tool(tool_result=tool_response, origin=tool_call)
+
+        all_messages = [user_message, first_reply, tool_message]
+        print(all_messages)
+
+        second_response = generator.run(messages=all_messages)
+        assert "replies" in second_response
+        assert len(second_response["replies"]) > 0
+        assert any("San Francisco" in reply.text for reply in second_response["replies"])
+        assert any("72" in reply.text for reply in second_response["replies"])
 
 
 class TestLlamaCppChatGeneratorChatML:
```
```diff
@@ -459,42 +524,33 @@ class TestLlamaCppChatGeneratorChatML:
 
     @pytest.mark.integration
     def test_function_call_chatml(self, generator):
+
+        def get_user_detail(name: Annotated[str, "The name of the user"], age: Annotated[int, "The age of the user"]):
+            """Retrieves detailed information about a user."""
+            pass
+
+        tool = create_tool_from_function(get_user_detail)
+
         messages = [
             ChatMessage.from_system(
                 """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful,
                 detailed, and polite answers to the user's questions. The assistant calls functions with appropriate
                 input when necessary"""
             ),
-            ChatMessage.from_user("…
+            ChatMessage.from_user("Get details for user: Jason who is 25 years old"),
         ]
 
-        …
-                "function": {
-                    "name": "UserDetail",
-                    "parameters": {
-                        "type": "object",
-                        "title": "UserDetail",
-                        "properties": {
-                            "name": {"title": "Name", "type": "string"},
-                            "age": {"title": "Age", "type": "integer"},
-                        },
-                        "required": ["name", "age"],
-                    },
-                },
-            }
-        ]
+        tool_choice = {"type": "function", "function": {"name": "get_user_detail"}}
+
+        response = generator.run(messages=messages, tools=[tool], generation_kwargs={"tool_choice": tool_choice})
 
-        …
-        assert "Jason" in json.loads(tool_calls[0]["function"]["arguments"])["name"]
-        assert 25 == json.loads(tool_calls[0]["function"]["arguments"])["age"]
+        reply = response["replies"][0]
+        assert reply.tool_calls
+        tool_calls = reply.tool_calls
+        assert len(tool_calls) > 0
+        assert tool_calls[0].tool_name == "get_user_detail"
+        arguments = tool_calls[0].arguments
+        assert "name" in arguments
+        assert "age" in arguments
+        assert arguments["name"] == "Jason"
+        assert arguments["age"] == 25
```
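One more behavioral change hiding in these hunks: in 0.4.4 the Hugging Face tokenizer was built eagerly in `__init__`; in 1.0.0 it is built lazily in `warm_up()`, and only if no `tokenizer` is already present in `model_kwargs`. A sketch of opting in (both paths are illustrative assumptions; `chat_format` is a llama-cpp-python option, not something this integration requires):

```python
# Illustrative: warm_up() calls LlamaHFTokenizer.from_pretrained() on
# `hf_tokenizer_path` and injects the result into model_kwargs as
# `tokenizer` before constructing Llama(**model_kwargs).
generator = LlamaCppChatGenerator(
    model="functionary-small-v2.4.Q4_0.gguf",  # assumed local GGUF file
    model_kwargs={
        "hf_tokenizer_path": "meetkai/functionary-small-v2.4-GGUF",  # assumed HF repo id
        "chat_format": "functionary-v2",
    },
)
generator.warm_up()  # tokenizer created here, then the model is loaded
```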
llama_cpp_haystack-0.4.4/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py (deleted)

````diff
@@ -1,144 +0,0 @@
-import logging
-from typing import Any, Dict, List, Optional
-
-from haystack import component
-from haystack.dataclasses import ChatMessage
-from llama_cpp import Llama
-from llama_cpp.llama_tokenizer import LlamaHFTokenizer
-
-logger = logging.getLogger(__name__)
-
-
-def _convert_message_to_llamacpp_format(message: ChatMessage) -> Dict[str, str]:
-    """
-    Convert a message to the format expected by Llama.cpp.
-    :returns: A dictionary with the following keys:
-        - `role`
-        - `content`
-        - `name` (optional)
-    """
-    formatted_msg = {"role": message.role.value, "content": message.text}
-    if message.name:
-        formatted_msg["name"] = message.name
-
-    if formatted_msg["role"] == "tool":
-        formatted_msg["name"] = message.tool_call_result.origin.tool_name
-        formatted_msg["content"] = message.tool_call_result.result
-
-    return formatted_msg
-
-
-@component
-class LlamaCppChatGenerator:
-    """
-    Provides an interface to generate text using LLM via llama.cpp.
-
-    [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
-    It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
-
-    Usage example:
-    ```python
-    from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
-    user_message = [ChatMessage.from_user("Who is the best American actor?")]
-    generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
-
-    print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
-    # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
-    ```
-    """
-
-    def __init__(
-        self,
-        model: str,
-        n_ctx: Optional[int] = 0,
-        n_batch: Optional[int] = 512,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-        generation_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """
-        :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
-            If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
-        :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
-        :param n_batch: Prompt processing maximum batch size.
-        :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
-            These keyword arguments provide fine-grained control over the model loading.
-            In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
-            For more information on the available kwargs, see
-            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
-        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
-            For more information on the available kwargs, see
-            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
-        """
-
-        model_kwargs = model_kwargs or {}
-        generation_kwargs = generation_kwargs or {}
-
-        if "hf_tokenizer_path" in model_kwargs:
-            tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs["hf_tokenizer_path"])
-            model_kwargs["tokenizer"] = tokenizer
-
-        # check if the model_kwargs contain the essential parameters
-        # otherwise, populate them with values from init parameters
-        model_kwargs.setdefault("model_path", model)
-        model_kwargs.setdefault("n_ctx", n_ctx)
-        model_kwargs.setdefault("n_batch", n_batch)
-
-        self.model_path = model
-        self.n_ctx = n_ctx
-        self.n_batch = n_batch
-        self.model_kwargs = model_kwargs
-        self.generation_kwargs = generation_kwargs
-        self.model = None
-
-    def warm_up(self):
-        if self.model is None:
-            self.model = Llama(**self.model_kwargs)
-
-    @component.output_types(replies=List[ChatMessage])
-    def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None):
-        """
-        Run the text generation model on the given list of ChatMessages.
-
-        :param messages:
-            A list of ChatMessage instances representing the input messages.
-        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
-            For more information on the available kwargs, see
-            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
-        :returns: A dictionary with the following keys:
-            - `replies`: The responses from the model
-        """
-        if self.model is None:
-            error_msg = "The model has not been loaded. Please call warm_up() before running."
-            raise RuntimeError(error_msg)
-
-        if not messages:
-            return {"replies": []}
-
-        updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
-        formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
-
-        response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs)
-
-        replies = []
-
-        for choice in response["choices"]:
-            meta = {
-                "response_id": response["id"],
-                "model": response["model"],
-                "created": response["created"],
-                "index": choice["index"],
-                "finish_reason": choice["finish_reason"],
-                "usage": response["usage"],
-            }
-
-            name = None
-            tool_calls = choice.get("message", {}).get("tool_calls", [])
-            if tool_calls:
-                meta["tool_calls"] = tool_calls
-                name = tool_calls[0]["function"]["name"]
-
-            reply = ChatMessage.from_assistant(choice["message"]["content"], meta=meta)
-            reply._name = name or None
-            replies.append(reply)
-
-        return {"replies": replies}
````
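The deleted module above, read against the new one, summarizes the breaking change: in 0.4.4 tools were raw llama.cpp dicts passed through `generation_kwargs` and tool calls came back as plain dicts in `reply.meta["tool_calls"]`; in 1.0.0 tools are `Tool` objects and tool calls are `ToolCall` dataclasses on the reply. A rough migration sketch (the tool function is illustrative; `generator` is a warmed-up `LlamaCppChatGenerator`):

```python
from typing import Annotated

from haystack.dataclasses import ChatMessage
from haystack.tools import create_tool_from_function


def get_user_info(username: Annotated[str, "The username to retrieve information for."]):
    """Retrieves detailed information about a user."""
    return {"username": username, "age": 25}


# 0.4.4 (removed): raw specs in, dicts out.
#   response = generator.run(messages, generation_kwargs={"tools": [spec_dict]})
#   calls = response["replies"][0].meta.get("tool_calls", [])

# 1.0.0: Tool objects in, ToolCall dataclasses out.
tool = create_tool_from_function(get_user_info)
messages = [ChatMessage.from_user("Get information for user john_doe")]
response = generator.run(messages=messages, tools=[tool])
for tc in response["replies"][0].tool_calls:
    result = tool.invoke(**tc.arguments)  # execute the requested call
    messages.append(ChatMessage.from_tool(tool_result=str(result), origin=tc))
second_response = generator.run(messages=messages)  # model now sees the tool result
```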