llama-cpp-haystack 0.4.4__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/CHANGELOG.md +14 -0
  2. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/PKG-INFO +4 -4
  3. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/pyproject.toml +26 -30
  4. llama_cpp_haystack-1.1.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +294 -0
  5. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/src/haystack_integrations/components/generators/llama_cpp/generator.py +11 -6
  6. llama_cpp_haystack-1.1.0/src/haystack_integrations/components/generators/py.typed +0 -0
  7. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/tests/test_chat_generator.py +198 -117
  8. llama_cpp_haystack-0.4.4/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +0 -144
  9. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/.gitignore +0 -0
  10. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/LICENSE.txt +0 -0
  11. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/README.md +0 -0
  12. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/examples/llama_cpp_generator_example.py +0 -0
  13. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/examples/rag_pipeline_example.py +0 -0
  14. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/pydoc/config.yml +0 -0
  15. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/src/haystack_integrations/components/generators/llama_cpp/__init__.py +0 -0
  16. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/tests/__init__.py +0 -0
  17. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/tests/models/.gitignore +0 -0
  18. {llama_cpp_haystack-0.4.4 → llama_cpp_haystack-1.1.0}/tests/test_generator.py +0 -0
@@ -1,5 +1,19 @@
  # Changelog
 
+ ## [integrations/llama_cpp-v1.0.0] - 2025-02-07
+
+ ### 🚀 Features
+
+ - [**breaking**] Llama.cpp - unified support for tools + refactoring (#1357)
+
+
+ ## [integrations/llama_cpp-v0.4.4] - 2025-01-16
+
+ ### 🧹 Chores
+
+ - Llama.cpp - gently handle the removal of ChatMessage.from_function (#1298)
+
+
  ## [integrations/llama_cpp-v0.4.3] - 2024-12-19
 
  ### 🐛 Bug Fixes
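Note on the breaking change above: tool support moves from raw JSON-schema dicts passed through `generation_kwargs["tools"]` (with results surfaced in `reply.meta["tool_calls"]`) to Haystack `Tool` objects passed via the new `tools` parameter, with results surfaced as `ToolCall` objects on `reply.tool_calls`. A minimal migration sketch, assuming a local GGUF model (the file name below is illustrative, not part of this diff):

```python
from haystack.dataclasses import ChatMessage
from haystack.tools import Tool

from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

# 1.x: tools are haystack Tool objects passed via `tools`,
# not JSON-schema dicts inside generation_kwargs["tools"].
weather_tool = Tool(
    name="get_current_temperature",
    description="Get the current temperature in a given location",
    parameters={
        "type": "object",
        "properties": {"location": {"type": "string", "description": "City and state, e.g. San Francisco, CA"}},
        "required": ["location"],
    },
    function=lambda location: {"location": location, "temperature": "22", "unit": "celsius"},
)

generator = LlamaCppChatGenerator(model="model.gguf", tools=[weather_tool])  # illustrative model path
generator.warm_up()
reply = generator.run([ChatMessage.from_user("What's the weather in Paris?")])["replies"][0]

# 0.4.x exposed raw llama.cpp dicts in reply.meta["tool_calls"]; 1.x returns ToolCall objects.
for tool_call in reply.tool_calls or []:
    print(tool_call.tool_name, tool_call.arguments)
```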
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llama-cpp-haystack
- Version: 0.4.4
+ Version: 1.1.0
  Summary: An integration between the llama.cpp LLM framework and Haystack
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -12,15 +12,15 @@ License-File: LICENSE.txt
  Classifier: Development Status :: 4 - Beta
  Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Programming Language :: Python
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Programming Language :: Python :: Implementation :: PyPy
- Requires-Python: >=3.8
- Requires-Dist: haystack-ai>=2.9.0
+ Requires-Python: >=3.9
+ Requires-Dist: haystack-ai>=2.13.0
  Requires-Dist: llama-cpp-python>=0.2.87
  Description-Content-Type: text/markdown
 
@@ -7,7 +7,7 @@ name = "llama-cpp-haystack"
  dynamic = ["version"]
  description = 'An integration between the llama.cpp LLM framework and Haystack'
  readme = "README.md"
- requires-python = ">=3.8"
+ requires-python = ">=3.9"
  license = "Apache-2.0"
  keywords = []
  authors = [
@@ -18,15 +18,15 @@ classifiers = [
  "License :: OSI Approved :: Apache Software License",
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
- "Programming Language :: Python :: 3.8",
  "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
  ]
- dependencies = ["haystack-ai>=2.9.0", "llama-cpp-python>=0.2.87"]
+ dependencies = ["haystack-ai>=2.13.0", "llama-cpp-python>=0.2.87"]
 
  [project.urls]
  Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme"
@@ -46,35 +46,35 @@ git_describe_command = 'git describe --tags --match="integrations/llama_cpp-v[0-
 
  [tool.hatch.envs.default]
  installer = "uv"
+ dependencies = ["haystack-pydoc-tools", "ruff"]
+ [tool.hatch.envs.default.scripts]
+ docs = ["pydoc-markdown pydoc/config.yml"]
+ fmt = "ruff check --fix {args} && ruff format {args}"
+ fmt-check = "ruff check {args} && ruff format --check {args}"
+
+ [tool.hatch.envs.test]
  dependencies = [
- "coverage[toml]>=6.5",
  "pytest",
+ "pytest-asyncio",
+ "pytest-cov",
  "pytest-rerunfailures",
- "haystack-pydoc-tools",
- "transformers[sentencepiece]",
+ "mypy",
+ "pip",
+ "transformers[sentencepiece]"
  ]
- [tool.hatch.envs.default.scripts]
- test = "pytest {args:tests}"
- test-cov = "coverage run -m pytest {args:tests}"
- test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
- cov-report = ["- coverage combine", "coverage report"]
- cov = ["test-cov", "cov-report"]
- cov-retry = ["test-cov-retry", "cov-report"]
- docs = ["pydoc-markdown pydoc/config.yml"]
- [[tool.hatch.envs.all.matrix]]
- python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
-
 
- [tool.hatch.envs.lint]
- installer = "uv"
- detached = true
- dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"]
+ [tool.hatch.envs.test.scripts]
+ unit = 'pytest -m "not integration" {args:tests}'
+ integration = 'pytest -m "integration" {args:tests}'
+ all = 'pytest {args:tests}'
+ cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
+ types = "mypy -p haystack_integrations.components.generators.llama_cpp {args}"
 
- [tool.hatch.envs.lint.scripts]
- typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
- style = ["ruff check {args:.}", "black --check --diff {args:.}"]
- fmt = ["black {args:.}", "ruff check --fix {args:.}", "style"]
- all = ["style", "typing"]
+ [tool.mypy]
+ install_types = true
+ non_interactive = true
+ check_untyped_defs = true
+ disallow_incomplete_defs = true
 
  [tool.hatch.metadata]
  allow-direct-references = true
@@ -164,7 +164,3 @@ markers = [
  "integration: marks tests as slow (deselect with '-m \"not integration\"')",
  ]
  addopts = ["--import-mode=importlib"]
-
- [[tool.mypy.overrides]]
- module = ["haystack.*", "haystack_integrations.*", "pytest.*", "llama_cpp.*"]
- ignore_missing_imports = true
@@ -0,0 +1,294 @@
+ import json
+ from typing import Any, Dict, List, Optional, Union
+
+ from haystack import component, default_from_dict, default_to_dict, logging
+ from haystack.dataclasses import ChatMessage, ToolCall
+ from haystack.tools import (
+ Tool,
+ Toolset,
+ _check_duplicate_tool_names,
+ deserialize_tools_or_toolset_inplace,
+ serialize_tools_or_toolset,
+ )
+ from llama_cpp import (
+ ChatCompletionMessageToolCall,
+ ChatCompletionRequestAssistantMessage,
+ ChatCompletionRequestMessage,
+ ChatCompletionResponseChoice,
+ ChatCompletionTool,
+ CreateChatCompletionResponse,
+ Llama,
+ )
+ from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+
+ logger = logging.getLogger(__name__)
+
+
+ def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionRequestMessage:
+ """
+ Convert a ChatMessage to the format expected by llama.cpp Chat API.
+ """
+ text_contents = message.texts
+ tool_calls = message.tool_calls
+ tool_call_results = message.tool_call_results
+
+ if not text_contents and not tool_calls and not tool_call_results:
+ msg = "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`."
+ raise ValueError(msg)
+ elif len(text_contents) + len(tool_call_results) > 1:
+ msg = "A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`."
+ raise ValueError(msg)
+
+ role = message._role.value
+
+ if role == "tool" and tool_call_results:
+ if tool_call_results[0].origin.id is None:
+ msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+ raise ValueError(msg)
+ return {
+ "role": "function",
+ "content": tool_call_results[0].result,
+ "name": tool_call_results[0].origin.tool_name,
+ }
+
+ if role == "system":
+ content = text_contents[0] if text_contents else None
+ return {"role": "system", "content": content}
+
+ if role == "user":
+ content = text_contents[0] if text_contents else None
+ return {"role": "user", "content": content}
+
+ if role == "assistant":
+ result: ChatCompletionRequestAssistantMessage = {"role": "assistant"}
+
+ if text_contents:
+ result["content"] = text_contents[0]
+
+ if tool_calls:
+ llamacpp_tool_calls: List[ChatCompletionMessageToolCall] = []
+ for tc in tool_calls:
+ if tc.id is None:
+ msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+ raise ValueError(msg)
+ llamacpp_tool_calls.append(
+ {
+ "id": tc.id,
+ "type": "function",
+ # We disable ensure_ascii so special chars like emojis are not converted
+ "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
+ }
+ )
+ result["tool_calls"] = llamacpp_tool_calls
+
+ return result
+
+ error_msg = f"Unknown role: {role}"
+ raise ValueError(error_msg)
+
+
+ @component
+ class LlamaCppChatGenerator:
+ """
+ Provides an interface to generate text using LLM via llama.cpp.
+
+ [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+ It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
+
+ Usage example:
+ ```python
+ from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
+ user_message = [ChatMessage.from_user("Who is the best American actor?")]
+ generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
+
+ print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
+ # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
+ ```
+ """
+
+ def __init__(
+ self,
+ model: str,
+ n_ctx: Optional[int] = 0,
+ n_batch: Optional[int] = 512,
+ model_kwargs: Optional[Dict[str, Any]] = None,
+ generation_kwargs: Optional[Dict[str, Any]] = None,
+ *,
+ tools: Optional[Union[List[Tool], Toolset]] = None,
+ ):
+ """
+ :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
+ If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
+ :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
+ :param n_batch: Prompt processing maximum batch size.
+ :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
+ These keyword arguments provide fine-grained control over the model loading.
+ In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
+ For more information on the available kwargs, see
+ [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
+ :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+ For more information on the available kwargs, see
+ [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+ :param tools:
+ A list of tools or a Toolset for which the model can prepare calls.
+ This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
+ """
+
+ model_kwargs = model_kwargs or {}
+ generation_kwargs = generation_kwargs or {}
+
+ # check if the model_kwargs contain the essential parameters
+ # otherwise, populate them with values from init parameters
+ model_kwargs.setdefault("model_path", model)
+ model_kwargs.setdefault("n_ctx", n_ctx)
+ model_kwargs.setdefault("n_batch", n_batch)
+
+ _check_duplicate_tool_names(list(tools or []))
+
+ self.model_path = model
+ self.n_ctx = n_ctx
+ self.n_batch = n_batch
+ self.model_kwargs = model_kwargs
+ self.generation_kwargs = generation_kwargs
+ self._model: Optional[Llama] = None
+ self.tools = tools
+
+ def warm_up(self):
+ if "hf_tokenizer_path" in self.model_kwargs and "tokenizer" not in self.model_kwargs:
+ tokenizer = LlamaHFTokenizer.from_pretrained(self.model_kwargs["hf_tokenizer_path"])
+ self.model_kwargs["tokenizer"] = tokenizer
+
+ if self._model is None:
+ self._model = Llama(**self.model_kwargs)
+
+ def to_dict(self) -> Dict[str, Any]:
+ """
+ Serializes the component to a dictionary.
+
+ :returns:
+ Dictionary with serialized data.
+ """
+ return default_to_dict(
+ self,
+ model=self.model_path,
+ n_ctx=self.n_ctx,
+ n_batch=self.n_batch,
+ model_kwargs=self.model_kwargs,
+ generation_kwargs=self.generation_kwargs,
+ tools=serialize_tools_or_toolset(self.tools),
+ )
+
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "LlamaCppChatGenerator":
+ """
+ Deserializes the component from a dictionary.
+
+ :param data:
+ Dictionary to deserialize from.
+ :returns:
+ Deserialized component.
+ """
+ deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
+ return default_from_dict(cls, data)
+
+ @component.output_types(replies=List[ChatMessage])
+ def run(
+ self,
+ messages: List[ChatMessage],
+ generation_kwargs: Optional[Dict[str, Any]] = None,
+ *,
+ tools: Optional[Union[List[Tool], Toolset]] = None,
+ ) -> Dict[str, List[ChatMessage]]:
+ """
+ Run the text generation model on the given list of ChatMessages.
+
+ :param messages:
+ A list of ChatMessage instances representing the input messages.
+ :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+ For more information on the available kwargs, see
+ [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+ :param tools:
+ A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
+ parameter set during component initialization.
+ :returns: A dictionary with the following keys:
+ - `replies`: The responses from the model
+ """
+ if self._model is None:
+ error_msg = "The model has not been loaded. Please call warm_up() before running."
+ raise RuntimeError(error_msg)
+
+ if not messages:
+ return {"replies": []}
+
+ updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+ formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
+
+ tools = tools or self.tools
+ if isinstance(tools, Toolset):
+ tools = list(tools)
+ _check_duplicate_tool_names(tools)
+
+ llamacpp_tools: List[ChatCompletionTool] = []
+ if tools:
+ for t in tools:
+ llamacpp_tools.append(
+ {
+ "type": "function",
+ "function": {
+ "name": t.tool_spec["name"],
+ "description": t.tool_spec.get("description", ""),
+ "parameters": t.tool_spec.get("parameters", {}),
+ },
+ }
+ )
+
+ response = self._model.create_chat_completion(
+ messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs
+ )
+
+ replies = []
+ if not isinstance(response, dict):
+ msg = f"Expected a dictionary response, got a different object: {response}"
+ raise ValueError(msg)
+
+ for choice in response["choices"]:
+ chat_message = self._convert_chat_completion_choice_to_chat_message(choice, response)
+ replies.append(chat_message)
+
+ return {"replies": replies}
+
+ @staticmethod
+ def _convert_chat_completion_choice_to_chat_message(
+ choice: ChatCompletionResponseChoice, response: CreateChatCompletionResponse
+ ) -> ChatMessage:
+ llamacpp_message = choice["message"]
+ text_content = llamacpp_message.get("content", "") or None
+ tool_calls = []
+
+ if llamacpp_tool_calls := llamacpp_message.get("tool_calls", []):
+ for llamacpp_tc in llamacpp_tool_calls:
+ arguments_str = llamacpp_tc["function"]["arguments"]
+ try:
+ arguments = json.loads(arguments_str)
+ tool_calls.append(
+ ToolCall(id=llamacpp_tc["id"], tool_name=llamacpp_tc["function"]["name"], arguments=arguments)
+ )
+ except json.JSONDecodeError:
+ logger.warning(
+ "Llama.cpp returned a malformed JSON string for tool call arguments. This tool call "
+ "will be skipped. Tool call ID: {tc_id}, Tool name: {tc_name}, Arguments: {tc_args}",
+ tc_id=llamacpp_tc["id"],
+ tc_name=llamacpp_tc["function"]["name"],
+ tc_args=arguments_str,
+ )
+
+ meta = {
+ "response_id": response["id"],
+ "model": response["model"],
+ "created": response["created"],
+ "index": choice["index"],
+ "finish_reason": choice["finish_reason"],
+ "usage": response["usage"],
+ }
+
+ return ChatMessage.from_assistant(text=text_content, tool_calls=tool_calls, meta=meta)
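For orientation, here is a hedged end-to-end sketch of how the new chat generator above is meant to be used with a tool: build a `Tool` from a function, pass it via `tools`, read the `ToolCall` objects from the reply, and feed the tool result back with `ChatMessage.from_tool`. The model path and the prompt are illustrative, not taken from this diff, and the actual output depends on the GGUF model you load:

```python
from typing import Annotated

from haystack.dataclasses import ChatMessage
from haystack.tools import create_tool_from_function

from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator


def get_current_temperature(location: Annotated[str, "The city and state, e.g. San Francisco, CA"]):
    """Get the current temperature in a given location"""
    return {"location": location, "temperature": "22", "unit": "celsius"}


tool = create_tool_from_function(get_current_temperature)

generator = LlamaCppChatGenerator(
    model="path/to/a-tool-capable-model.gguf",  # illustrative; use a GGUF model whose chat format supports tools
    n_ctx=2048,
    tools=[tool],
)
generator.warm_up()

messages = [ChatMessage.from_user("What's the weather like in Paris?")]
reply = generator.run(messages)["replies"][0]

if reply.tool_calls:
    tool_call = reply.tool_calls[0]
    tool_result = str(tool.invoke(**tool_call.arguments))
    # Feed the result back as a tool message so the model can produce the final answer.
    tool_message = ChatMessage.from_tool(tool_result=tool_result, origin=tool_call)
    final = generator.run([*messages, reply, tool_message])
    print(final["replies"][0].text)
```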
@@ -1,7 +1,6 @@
- import logging
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Union
 
- from haystack import component
+ from haystack import component, logging
 
  from llama_cpp import Llama
 
@@ -63,14 +62,16 @@ class LlamaCppGenerator:
  self.n_batch = n_batch
  self.model_kwargs = model_kwargs
  self.generation_kwargs = generation_kwargs
- self.model = None
+ self.model: Optional[Llama] = None
 
  def warm_up(self):
  if self.model is None:
  self.model = Llama(**self.model_kwargs)
 
  @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
- def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
+ def run(
+ self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None
+ ) -> Dict[str, Union[List[str], List[Dict[str, Any]]]]:
  """
  Run the text generation model on the given prompt.
 
@@ -93,6 +94,10 @@ class LlamaCppGenerator:
  updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
 
  output = self.model.create_completion(prompt=prompt, **updated_generation_kwargs)
+ if not isinstance(output, dict):
+ msg = f"Expected a dictionary response, got a different object: {output}"
+ raise ValueError(msg)
+
  replies = [output["choices"][0]["text"]]
 
- return {"replies": replies, "meta": [output]}
+ return {"replies": replies, "meta": [dict(output.items())]}
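The changes to the plain generator are typing-related: an explicit return annotation and a guard against non-dict output (presumably because `create_completion` can also return an iterator when streaming is requested). A hedged usage sketch for context, with an illustrative model file taken from the docstrings in this package:

```python
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
generator.warm_up()

result = generator.run("Briefly explain the GGUF format.", generation_kwargs={"max_tokens": 128})
print(result["replies"][0])  # generated text
print(result["meta"][0])     # raw completion metadata, now copied into a plain dict
```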
@@ -2,14 +2,16 @@ import json
  import os
  import urllib.request
  from pathlib import Path
+ from typing import Annotated
  from unittest.mock import MagicMock
 
  import pytest
  from haystack import Document, Pipeline
  from haystack.components.builders import ChatPromptBuilder
  from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
- from haystack.dataclasses import ChatMessage, ChatRole
+ from haystack.dataclasses import ChatMessage, ChatRole, TextContent, ToolCall
  from haystack.document_stores.in_memory import InMemoryDocumentStore
+ from haystack.tools import Tool, Toolset, create_tool_from_function
 
  from haystack_integrations.components.generators.llama_cpp.chat.chat_generator import (
  LlamaCppChatGenerator,
@@ -22,6 +24,24 @@ def model_path():
  return Path(__file__).parent / "models"
 
 
+ def get_current_temperature(location: Annotated[str, "The city and state, e.g. San Francisco, CA"]):
+ """Get the current temperature in a given location"""
+
+ if "tokyo" in location.lower():
+ return {"location": "Tokyo", "temperature": "10", "unit": "celsius"}
+ if "san francisco" in location.lower():
+ return {"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}
+ if "paris" in location.lower():
+ return {"location": "Paris", "temperature": "22", "unit": "celsius"}
+
+ return {"location": location, "temperature": "unknown"}
+
+
+ @pytest.fixture
+ def temperature_tool():
+ return create_tool_from_function(get_current_temperature)
+
+
  def download_file(file_link, filename, capsys):
  # Checks if the file already exists before downloading
  if not os.path.isfile(filename):
@@ -35,17 +55,72 @@ def download_file(file_link, filename, capsys):
 
  def test_convert_message_to_llamacpp_format():
  message = ChatMessage.from_system("You are good assistant")
- assert _convert_message_to_llamacpp_format(message) == {"role": "system", "content": "You are good assistant"}
+ assert _convert_message_to_llamacpp_format(message) == {
+ "role": "system",
+ "content": "You are good assistant",
+ }
 
  message = ChatMessage.from_user("I have a question")
- assert _convert_message_to_llamacpp_format(message) == {"role": "user", "content": "I have a question"}
-
- if hasattr(ChatMessage, "from_function"):
- message = ChatMessage.from_function("Function call", "function_name")
- converted_message = _convert_message_to_llamacpp_format(message)
- assert converted_message["role"] in ("function", "tool")
- assert converted_message["name"] == "function_name"
- assert converted_message["content"] == "Function call"
+ assert _convert_message_to_llamacpp_format(message) == {
+ "role": "user",
+ "content": "I have a question",
+ }
+
+ message = ChatMessage.from_assistant(text="I have an answer", meta={"finish_reason": "stop"})
+ assert _convert_message_to_llamacpp_format(message) == {
+ "role": "assistant",
+ "content": "I have an answer",
+ }
+
+ message = ChatMessage.from_assistant(
+ tool_calls=[ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"})]
+ )
+ assert _convert_message_to_llamacpp_format(message) == {
+ "role": "assistant",
+ "tool_calls": [
+ {
+ "type": "function",
+ "function": {"name": "weather", "arguments": '{"city": "Paris"}'},
+ "id": "123",
+ }
+ ],
+ }
+
+ tool_result = json.dumps({"weather": "sunny", "temperature": "25"})
+ message = ChatMessage.from_tool(
+ tool_result=tool_result,
+ origin=ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"}),
+ )
+ assert _convert_message_to_llamacpp_format(message) == {
+ "role": "function",
+ "content": tool_result,
+ "name": "weather",
+ }
+
+
+ def test_convert_message_to_llamacpp_invalid():
+ message = ChatMessage(_role=ChatRole.ASSISTANT, _content=[])
+ with pytest.raises(ValueError):
+ _convert_message_to_llamacpp_format(message)
+
+ message = ChatMessage(
+ _role=ChatRole.ASSISTANT,
+ _content=[
+ TextContent(text="I have an answer"),
+ TextContent(text="I have another answer"),
+ ],
+ )
+ with pytest.raises(ValueError):
+ _convert_message_to_llamacpp_format(message)
+
+ tool_call_null_id = ToolCall(id=None, tool_name="weather", arguments={"city": "Paris"})
+ message = ChatMessage.from_assistant(tool_calls=[tool_call_null_id])
+ with pytest.raises(ValueError):
+ _convert_message_to_llamacpp_format(message)
+
+ message = ChatMessage.from_tool(tool_result="result", origin=tool_call_null_id)
+ with pytest.raises(ValueError):
+ _convert_message_to_llamacpp_format(message)
 
 
  class TestLlamaCppChatGenerator:
@@ -68,7 +143,7 @@ class TestLlamaCppChatGenerator:
  def generator_mock(self):
  mock_model = MagicMock()
  generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=2048, n_batch=512)
- generator.model = mock_model
+ generator._model = mock_model
  return generator, mock_model
 
  def test_default_init(self):
@@ -99,6 +174,63 @@ class TestLlamaCppChatGenerator:
  assert generator.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512}
  assert generator.generation_kwargs == {}
 
+ def test_init_with_toolset(self, temperature_tool):
+ toolset = Toolset([temperature_tool])
+ generator = LlamaCppChatGenerator(model="test_model.gguf", tools=toolset)
+ assert generator.tools == toolset
+
+ def test_to_dict(self):
+ generator = LlamaCppChatGenerator(model="test_model.gguf", n_ctx=8192, n_batch=512)
+ assert generator.to_dict() == {
+ "type": "haystack_integrations.components.generators.llama_cpp.chat.chat_generator.LlamaCppChatGenerator",
+ "init_parameters": {
+ "model": "test_model.gguf",
+ "n_ctx": 8192,
+ "n_batch": 512,
+ "model_kwargs": {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512},
+ "generation_kwargs": {},
+ "tools": None,
+ },
+ }
+
+ def test_to_dict_with_toolset(self, temperature_tool):
+ toolset = Toolset([temperature_tool])
+ generator = LlamaCppChatGenerator(model="test_model.gguf", tools=toolset)
+
+ data = generator.to_dict()
+
+ assert "tools" in data["init_parameters"]["tools"]["data"]
+ assert data["init_parameters"]["tools"]["type"] == "haystack.tools.toolset.Toolset"
+
+ def test_from_dict_with_toolset(self, temperature_tool):
+ toolset = Toolset([temperature_tool])
+ generator = LlamaCppChatGenerator(model="test_model.gguf", tools=toolset)
+ data = generator.to_dict()
+
+ deserialized_component = LlamaCppChatGenerator.from_dict(data)
+
+ assert isinstance(deserialized_component.tools, Toolset)
+ assert all(isinstance(tool, Tool) for tool in deserialized_component.tools)
+
+ def test_from_dict(self):
+ serialized = {
+ "type": "haystack_integrations.components.generators.llama_cpp.chat.chat_generator.LlamaCppChatGenerator",
+ "init_parameters": {
+ "model": "test_model.gguf",
+ "n_ctx": 8192,
+ "n_batch": 512,
+ "model_kwargs": {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512},
+ "generation_kwargs": {},
+ "tools": None,
+ },
+ }
+ deserialized = LlamaCppChatGenerator.from_dict(serialized)
+ assert deserialized.model_path == "test_model.gguf"
+ assert deserialized.n_ctx == 8192
+ assert deserialized.n_batch == 512
+ assert deserialized.model_kwargs == {"model_path": "test_model.gguf", "n_ctx": 8192, "n_batch": 512}
+ assert deserialized.generation_kwargs == {}
+
  def test_ignores_model_path_if_specified_in_model_kwargs(self):
  """
  Test that model_path is ignored if already specified in model_kwargs.
@@ -320,17 +452,6 @@ class TestLlamaCppChatGenerator:
 
 
  class TestLlamaCppChatGeneratorFunctionary:
- def get_current_temperature(self, location):
- """Get the current temperature in a given location"""
- if "tokyo" in location.lower():
- return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"})
- elif "san francisco" in location.lower():
- return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"})
- elif "paris" in location.lower():
- return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"})
- else:
- return json.dumps({"location": location, "temperature": "unknown"})
-
  @pytest.fixture
  def generator(self, model_path, capsys):
  gguf_model_path = (
@@ -354,90 +475,60 @@ class TestLlamaCppChatGeneratorFunctionary:
 
  @pytest.mark.integration
  def test_function_call(self, generator):
- tools = [
- {
- "type": "function",
- "function": {
- "name": "get_user_info",
- "parameters": {
- "type": "object",
- "properties": {
- "username": {"type": "string", "description": "The username to retrieve information for."}
- },
- "required": ["username"],
- },
- "description": "Retrieves detailed information about a user.",
- },
- }
- ]
+ def get_user_info(username: Annotated[str, "The username to retrieve information for."]):
+ """Retrieves detailed information about a user."""
+ return {"username": username, "age": 25, "location": "San Francisco"}
+
+ tool = create_tool_from_function(get_user_info)
+
  tool_choice = {"type": "function", "function": {"name": "get_user_info"}}
 
  messages = [
  ChatMessage.from_user("Get information for user john_doe"),
  ]
- response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice})
+ response = generator.run(messages=messages, tools=[tool], generation_kwargs={"tool_choice": tool_choice})
+
+ reply = response["replies"][0]
 
- assert "tool_calls" in response["replies"][0].meta
- tool_calls = response["replies"][0].meta["tool_calls"]
+ assert reply.role == ChatRole.ASSISTANT
+ assert reply.tool_calls
+ tool_calls = reply.tool_calls
  assert len(tool_calls) > 0
- assert tool_calls[0]["function"]["name"] == "get_user_info"
- assert "username" in json.loads(tool_calls[0]["function"]["arguments"])
- assert response["replies"][0].role == ChatRole.ASSISTANT
+ assert tool_calls[0].tool_name == "get_user_info"
+ assert tool_calls[0].arguments == {"username": "john_doe"}
 
- def test_function_call_and_execute(self, generator):
- messages = [ChatMessage.from_user("What's the weather like in San Francisco?")]
- tools = [
- {
- "type": "function",
- "function": {
- "name": "get_current_temperature",
- "description": "Get the current temperature in a given location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {
- "type": "string",
- "description": "The city and state, e.g. San Francisco, CA",
- },
- },
- "required": ["location"],
- },
- },
- }
- ]
+ @pytest.mark.integration
+ def test_function_call_and_execute(self, generator, temperature_tool):
+ user_message = ChatMessage.from_user("What's the weather like in San Francisco?")
 
  tool_choice = {"type": "function", "function": {"name": "get_current_temperature"}}
- response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice})
-
- available_functions = {
- "get_current_temperature": self.get_current_temperature,
- }
+ response = generator.run(
+ messages=[user_message], tools=[temperature_tool], generation_kwargs={"tool_choice": tool_choice}
+ )
 
  assert "replies" in response
  assert len(response["replies"]) > 0
-
  first_reply = response["replies"][0]
- assert "tool_calls" in first_reply.meta
- tool_calls = first_reply.meta["tool_calls"]
+ assert first_reply.tool_calls
+ tool_calls = first_reply.tool_calls
 
- if hasattr(ChatMessage, "from_function"):
- for tool_call in tool_calls:
- function_name = tool_call["function"]["name"]
- function_args = json.loads(tool_call["function"]["arguments"])
- assert function_name in available_functions
- function_response = available_functions[function_name](**function_args)
- function_message = ChatMessage.from_function(function_response, function_name)
- messages.append(function_message)
+ # tool invocation
+ tool_call = tool_calls[0]
+ function_args = tool_call.arguments
+ tool_response = str(temperature_tool.invoke(**function_args))
 
- second_response = generator.run(messages=messages)
- assert "replies" in second_response
- assert len(second_response["replies"]) > 0
- assert any("San Francisco" in reply.text for reply in second_response["replies"])
- assert any("72" in reply.text for reply in second_response["replies"])
+ tool_message = ChatMessage.from_tool(tool_result=tool_response, origin=tool_call)
 
+ all_messages = [user_message, first_reply, tool_message]
+
+ second_response = generator.run(messages=all_messages)
+ assert "replies" in second_response
+ assert len(second_response["replies"]) > 0
+ assert any("San Francisco" in reply.text for reply in second_response["replies"])
+ assert any("72" in reply.text for reply in second_response["replies"])
 
- class TestLlamaCppChatGeneratorChatML:
 
+ class TestLlamaCppChatGeneratorChatML:
  @pytest.fixture
  def generator(self, model_path, capsys):
  gguf_model_path = (
@@ -459,42 +550,32 @@ class TestLlamaCppChatGeneratorChatML:
 
  @pytest.mark.integration
  def test_function_call_chatml(self, generator):
+ def get_user_detail(name: Annotated[str, "The name of the user"], age: Annotated[int, "The age of the user"]):
+ """Retrieves detailed information about a user."""
+ pass
+
+ tool = create_tool_from_function(get_user_detail)
+
  messages = [
  ChatMessage.from_system(
  """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful,
  detailed, and polite answers to the user's questions. The assistant calls functions with appropriate
  input when necessary"""
  ),
- ChatMessage.from_user("Extract Jason is 25 years old"),
+ ChatMessage.from_user("Get details for user: Jason who is 25 years old"),
  ]
 
- tools = [
- {
- "type": "function",
- "function": {
- "name": "UserDetail",
- "parameters": {
- "type": "object",
- "title": "UserDetail",
- "properties": {
- "name": {"title": "Name", "type": "string"},
- "age": {"title": "Age", "type": "integer"},
- },
- "required": ["name", "age"],
- },
- },
- }
- ]
+ tool_choice = {"type": "function", "function": {"name": "get_user_detail"}}
 
- tool_choice = {"type": "function", "function": {"name": "UserDetail"}}
-
- response = generator.run(messages=messages, generation_kwargs={"tools": tools, "tool_choice": tool_choice})
- for reply in response["replies"]:
- assert "tool_calls" in reply.meta
- tool_calls = reply.meta["tool_calls"]
- assert len(tool_calls) > 0
- assert tool_calls[0]["function"]["name"] == "UserDetail"
- assert "name" in json.loads(tool_calls[0]["function"]["arguments"])
- assert "age" in json.loads(tool_calls[0]["function"]["arguments"])
- assert "Jason" in json.loads(tool_calls[0]["function"]["arguments"])["name"]
- assert 25 == json.loads(tool_calls[0]["function"]["arguments"])["age"]
+ response = generator.run(messages=messages, tools=[tool], generation_kwargs={"tool_choice": tool_choice})
+
+ reply = response["replies"][0]
+ assert reply.tool_calls
+ tool_calls = reply.tool_calls
+ assert len(tool_calls) > 0
+ assert tool_calls[0].tool_name == "get_user_detail"
+ arguments = tool_calls[0].arguments
+ assert "name" in arguments
+ assert "age" in arguments
+ assert arguments["name"] == "Jason"
+ assert arguments["age"] == 25
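The updated tests above replace hand-written JSON-schema tool dicts with `create_tool_from_function` and `Annotated` parameter hints. A small, hedged sketch of that pattern, runnable without a model; the comments describe the expected shape of the output rather than an exact transcript:

```python
from typing import Annotated

from haystack.tools import create_tool_from_function


def get_current_temperature(location: Annotated[str, "The city and state, e.g. San Francisco, CA"]):
    """Get the current temperature in a given location"""
    return {"location": location, "temperature": "22", "unit": "celsius"}


tool = create_tool_from_function(get_current_temperature)
print(tool.name)         # derived from the function name
print(tool.description)  # taken from the docstring
print(tool.parameters)   # JSON schema derived from the Annotated type hints
print(tool.invoke(location="Paris, France"))
```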
@@ -1,144 +0,0 @@
- import logging
- from typing import Any, Dict, List, Optional
-
- from haystack import component
- from haystack.dataclasses import ChatMessage
- from llama_cpp import Llama
- from llama_cpp.llama_tokenizer import LlamaHFTokenizer
-
- logger = logging.getLogger(__name__)
-
-
- def _convert_message_to_llamacpp_format(message: ChatMessage) -> Dict[str, str]:
- """
- Convert a message to the format expected by Llama.cpp.
- :returns: A dictionary with the following keys:
- - `role`
- - `content`
- - `name` (optional)
- """
- formatted_msg = {"role": message.role.value, "content": message.text}
- if message.name:
- formatted_msg["name"] = message.name
-
- if formatted_msg["role"] == "tool":
- formatted_msg["name"] = message.tool_call_result.origin.tool_name
- formatted_msg["content"] = message.tool_call_result.result
-
- return formatted_msg
-
-
- @component
- class LlamaCppChatGenerator:
- """
- Provides an interface to generate text using LLM via llama.cpp.
-
- [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
- It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
-
- Usage example:
- ```python
- from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
- user_message = [ChatMessage.from_user("Who is the best American actor?")]
- generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
-
- print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
- # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
- ```
- """
-
- def __init__(
- self,
- model: str,
- n_ctx: Optional[int] = 0,
- n_batch: Optional[int] = 512,
- model_kwargs: Optional[Dict[str, Any]] = None,
- generation_kwargs: Optional[Dict[str, Any]] = None,
- ):
- """
- :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
- If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
- :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
- :param n_batch: Prompt processing maximum batch size.
- :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
- These keyword arguments provide fine-grained control over the model loading.
- In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
- For more information on the available kwargs, see
- [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
- :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
- For more information on the available kwargs, see
- [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
- """
-
- model_kwargs = model_kwargs or {}
- generation_kwargs = generation_kwargs or {}
-
- if "hf_tokenizer_path" in model_kwargs:
- tokenizer = LlamaHFTokenizer.from_pretrained(model_kwargs["hf_tokenizer_path"])
- model_kwargs["tokenizer"] = tokenizer
-
- # check if the model_kwargs contain the essential parameters
- # otherwise, populate them with values from init parameters
- model_kwargs.setdefault("model_path", model)
- model_kwargs.setdefault("n_ctx", n_ctx)
- model_kwargs.setdefault("n_batch", n_batch)
-
- self.model_path = model
- self.n_ctx = n_ctx
- self.n_batch = n_batch
- self.model_kwargs = model_kwargs
- self.generation_kwargs = generation_kwargs
- self.model = None
-
- def warm_up(self):
- if self.model is None:
- self.model = Llama(**self.model_kwargs)
-
- @component.output_types(replies=List[ChatMessage])
- def run(self, messages: List[ChatMessage], generation_kwargs: Optional[Dict[str, Any]] = None):
- """
- Run the text generation model on the given list of ChatMessages.
-
- :param messages:
- A list of ChatMessage instances representing the input messages.
- :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
- For more information on the available kwargs, see
- [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
- :returns: A dictionary with the following keys:
- - `replies`: The responses from the model
- """
- if self.model is None:
- error_msg = "The model has not been loaded. Please call warm_up() before running."
- raise RuntimeError(error_msg)
-
- if not messages:
- return {"replies": []}
-
- updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
- formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
-
- response = self.model.create_chat_completion(messages=formatted_messages, **updated_generation_kwargs)
-
- replies = []
-
- for choice in response["choices"]:
- meta = {
- "response_id": response["id"],
- "model": response["model"],
- "created": response["created"],
- "index": choice["index"],
- "finish_reason": choice["finish_reason"],
- "usage": response["usage"],
- }
-
- name = None
- tool_calls = choice.get("message", {}).get("tool_calls", [])
- if tool_calls:
- meta["tool_calls"] = tool_calls
- name = tool_calls[0]["function"]["name"]
-
- reply = ChatMessage.from_assistant(choice["message"]["content"], meta=meta)
- reply._name = name or None
- replies.append(reply)
-
- return {"replies": replies}