llama-cpp-haystack 1.1.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/CHANGELOG.md +36 -0
  2. llama_cpp_haystack-1.3.0/PKG-INFO +42 -0
  3. llama_cpp_haystack-1.3.0/README.md +16 -0
  4. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/pyproject.toml +2 -6
  5. llama_cpp_haystack-1.3.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +531 -0
  6. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/src/haystack_integrations/components/generators/llama_cpp/generator.py +1 -1
  7. llama_cpp_haystack-1.3.0/tests/test_chat_generator.py +1287 -0
  8. llama_cpp_haystack-1.3.0/tests/test_files/apple.jpg +0 -0
  9. llama_cpp_haystack-1.1.0/PKG-INFO +0 -259
  10. llama_cpp_haystack-1.1.0/README.md +0 -233
  11. llama_cpp_haystack-1.1.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +0 -294
  12. llama_cpp_haystack-1.1.0/tests/test_chat_generator.py +0 -581
  13. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/.gitignore +0 -0
  14. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/LICENSE.txt +0 -0
  15. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/examples/llama_cpp_generator_example.py +0 -0
  16. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/examples/rag_pipeline_example.py +0 -0
  17. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/pydoc/config.yml +0 -0
  18. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/src/haystack_integrations/components/generators/llama_cpp/__init__.py +0 -0
  19. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/src/haystack_integrations/components/generators/py.typed +0 -0
  20. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/tests/__init__.py +0 -0
  21. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/tests/models/.gitignore +0 -0
  22. {llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/tests/test_generator.py +0 -0

{llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/CHANGELOG.md
@@ -1,5 +1,41 @@
  # Changelog

+ ## [integrations/llama_cpp-v1.2.0] - 2025-07-28
+
+ ### 🚀 Features
+
+ - `LlamaCppChatGenerator` streaming support (#2108)
+
+ ### 🧹 Chores
+
+ - Remove black (#1985)
+
+
+ ## [integrations/llama_cpp-v1.1.0] - 2025-06-19
+
+ ### 🐛 Bug Fixes
+
+ - Fix llama.cpp types; add py.typed; Toolset support (#1973)
+
+ ### 🧪 Testing
+
+ - Test llama.cpp with python 3.12 (#1601)
+
+ ### ⚙️ CI
+
+ - Review testing workflows (#1541)
+
+ ### 🧹 Chores
+
+ - Remove Python 3.8 support (#1421)
+ - Use Haystack logging across integrations (#1484)
+ - Update ChatGenerators with `deserialize_tools_or_toolset_inplace` (#1623)
+ - Align core-integrations Hatch scripts (#1898)
+
+ ### 🌀 Miscellaneous
+
+ - Chore: remove `jsonschema` dependency from `default` environment (#1368)
+
  ## [integrations/llama_cpp-v1.0.0] - 2025-02-07

  ### 🚀 Features

llama_cpp_haystack-1.3.0/PKG-INFO
@@ -0,0 +1,42 @@
+ Metadata-Version: 2.4
+ Name: llama-cpp-haystack
+ Version: 1.3.0
+ Summary: An integration between the llama.cpp LLM framework and Haystack
+ Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme
+ Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
+ Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp
+ Author: Ashwin Mathur
+ Author-email: deepset GmbH <info@deepset.ai>
+ License-Expression: Apache-2.0
+ License-File: LICENSE.txt
+ Classifier: Development Status :: 4 - Beta
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: Implementation :: CPython
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
+ Requires-Python: >=3.9
+ Requires-Dist: haystack-ai>=2.16.1
+ Requires-Dist: llama-cpp-python>=0.2.87
+ Description-Content-Type: text/markdown
+
+ # llama-cpp-haystack
+
+ [![PyPI - Version](https://img.shields.io/pypi/v/llama-cpp-haystack.svg)](https://pypi.org/project/llama-cpp-haystack)
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-haystack.svg)](https://pypi.org/project/llama-cpp-haystack)
+
+ - [Integration page](https://haystack.deepset.ai/integrations/llama_-_cpp)
+ - [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/llama_cpp/CHANGELOG.md)
+
+ ---
+
+ ## Contributing
+
+ Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
+
+
+

llama_cpp_haystack-1.3.0/README.md
@@ -0,0 +1,16 @@
+ # llama-cpp-haystack
+
+ [![PyPI - Version](https://img.shields.io/pypi/v/llama-cpp-haystack.svg)](https://pypi.org/project/llama-cpp-haystack)
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-haystack.svg)](https://pypi.org/project/llama-cpp-haystack)
+
+ - [Integration page](https://haystack.deepset.ai/integrations/llama_-_cpp)
+ - [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/llama_cpp/CHANGELOG.md)
+
+ ---
+
+ ## Contributing
+
+ Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
+
+
+

{llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
  ]
- dependencies = ["haystack-ai>=2.13.0", "llama-cpp-python>=0.2.87"]
+ dependencies = ["haystack-ai>=2.16.1", "llama-cpp-python>=0.2.87"]

  [project.urls]
  Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme"
@@ -60,7 +60,7 @@ dependencies = [
  "pytest-rerunfailures",
  "mypy",
  "pip",
- "transformers[sentencepiece]"
+ "transformers[sentencepiece]",
  ]

  [tool.hatch.envs.test.scripts]
@@ -82,10 +82,6 @@ allow-direct-references = true
  [tool.ruff.lint.isort]
  known-first-party = ["haystack_integrations"]

- [tool.black]
- target-version = ["py38"]
- line-length = 120
- skip-string-normalization = true

  [tool.ruff]
  target-version = "py38"

llama_cpp_haystack-1.3.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py
@@ -0,0 +1,531 @@
+ import json
+ from datetime import datetime, timezone
+ from typing import Any, Dict, Iterator, List, Optional, Union
+
+ from haystack import component, default_from_dict, default_to_dict, logging
+ from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
+ from haystack.dataclasses import (
+     ChatMessage,
+     ComponentInfo,
+     ImageContent,
+     StreamingCallbackT,
+     TextContent,
+     ToolCall,
+     ToolCallDelta,
+     select_streaming_callback,
+ )
+ from haystack.dataclasses.streaming_chunk import FinishReason, StreamingChunk, SyncStreamingCallbackT
+ from haystack.tools import (
+     Tool,
+     Toolset,
+     _check_duplicate_tool_names,
+     deserialize_tools_or_toolset_inplace,
+     serialize_tools_or_toolset,
+ )
+ from haystack.utils import deserialize_callable, serialize_callable
+ from llama_cpp import (
+     ChatCompletionMessageToolCall,
+     ChatCompletionRequestAssistantMessage,
+     ChatCompletionRequestMessage,
+     ChatCompletionRequestMessageContentPart,
+     ChatCompletionResponseChoice,
+     ChatCompletionTool,
+     CreateChatCompletionResponse,
+     CreateChatCompletionStreamResponse,
+     Llama,
+     llama_chat_format,
+ )
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
+ from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+
+ logger = logging.getLogger(__name__)
+
+ FINISH_REASON_MAPPING: Dict[str, FinishReason] = {
+     "stop": "stop",
+     "length": "length",
+     "tool_calls": "tool_calls",
+     "function_call": "tool_calls",
+ }
+
+ SUPPORTED_IMAGE_FORMATS = ["image/jpeg", "image/jpg", "image/png", "image/gif", "image/webp"]
+
+
+ def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionRequestMessage:
+     """
+     Convert a ChatMessage to the format expected by llama.cpp Chat API.
+     """
+     text_contents = message.texts
+     tool_calls = message.tool_calls
+     tool_call_results = message.tool_call_results
+     images = message.images
+
+     if not text_contents and not tool_calls and not tool_call_results and not images:
+         msg = (
+             "A `ChatMessage` must contain at least one `TextContent`, `ImageContent`, `ToolCall`, or `ToolCallResult`."
+         )
+         raise ValueError(msg)
+     elif len(text_contents) + len(tool_call_results) > 1:
+         msg = "For llama.cpp compatibility, a `ChatMessage` can contain at most one `TextContent` or `ToolCallResult`."
+         raise ValueError(msg)
+
+     role = message._role.value
+
+     # Check that images are only in user messages
+     if images and role != "user":
+         msg = "Image content is only supported for user messages"
+         raise ValueError(msg)
+
+     if role == "tool" and tool_call_results:
+         if tool_call_results[0].origin.id is None:
+             msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+             raise ValueError(msg)
+         return {
+             "role": "function",
+             "content": tool_call_results[0].result,
+             "name": tool_call_results[0].origin.tool_name,
+         }
+
+     if role == "system":
+         return {"role": "system", "content": text_contents[0]}
+
+     if role == "user":
+         # Handle multimodal content (text + images) preserving order
+         if images:
+             # Check image constraints for LlamaCpp
+             for image in images:
+                 if image.mime_type not in SUPPORTED_IMAGE_FORMATS:
+                     supported_formats = ", ".join(SUPPORTED_IMAGE_FORMATS)
+                     msg = (
+                         f"Unsupported image format: {image.mime_type}. "
+                         f"LlamaCpp supports the following formats: {supported_formats}"
+                     )
+                     raise ValueError(msg)
+
+             content_parts: list[ChatCompletionRequestMessageContentPart] = []
+             for part in message._content:
+                 if isinstance(part, TextContent) and part.text:
+                     content_parts.append({"type": "text", "text": part.text})
+                 elif isinstance(part, ImageContent):
+                     # LlamaCpp expects base64 data URI format
+                     image_url = f"data:{part.mime_type};base64,{part.base64_image}"
+                     content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
+
+             return {"role": "user", "content": content_parts}
+
+         # Simple text-only message
+         return {"role": "user", "content": text_contents[0]}
+
+     if role == "assistant":
+         result: ChatCompletionRequestAssistantMessage = {"role": "assistant"}
+
+         if text_contents:
+             result["content"] = text_contents[0]
+
+         if tool_calls:
+             llamacpp_tool_calls: List[ChatCompletionMessageToolCall] = []
+             for tc in tool_calls:
+                 if tc.id is None:
+                     msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+                     raise ValueError(msg)
+                 llamacpp_tool_calls.append(
+                     {
+                         "id": tc.id,
+                         "type": "function",
+                         # We disable ensure_ascii so special chars like emojis are not converted
+                         "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
+                     }
+                 )
+             result["tool_calls"] = llamacpp_tool_calls
+
+         return result
+
+     error_msg = f"Unknown role: {role}"
+     raise ValueError(error_msg)
+
+
+ @component
+ class LlamaCppChatGenerator:
+     """
+     Provides an interface to generate text using LLM via llama.cpp.
+
+     [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+     It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
+     Supports both text-only and multimodal (text + image) models like LLaVA.
+
+     Usage example:
+     ```python
+     from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
+     user_message = [ChatMessage.from_user("Who is the best American actor?")]
+     generator = LlamaCppChatGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
+
+     print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
+     # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...})]}
+     ```
+
+     Usage example with multimodal (image + text):
+     ```python
+     from haystack.dataclasses import ChatMessage, ImageContent
+
+     # Create an image from file path or base64
+     image_content = ImageContent.from_file_path("path/to/your/image.jpg")
+
+     # Create a multimodal message with both text and image
+     messages = [ChatMessage.from_user(content_parts=["What's in this image?", image_content])]
+
+     # Initialize with multimodal support
+     generator = LlamaCppChatGenerator(
+         model="llava-v1.5-7b-q4_0.gguf",
+         chat_handler_name="Llava15ChatHandler", # Use llava-1-5 handler
+         model_clip_path="mmproj-model-f16.gguf", # CLIP model
+         n_ctx=4096 # Larger context for image processing
+     )
+     generator.warm_up()
+
+     result = generator.run(messages)
+     print(result)
+     ```
+     """
+
+     def __init__(
+         self,
+         model: str,
+         n_ctx: Optional[int] = 0,
+         n_batch: Optional[int] = 512,
+         model_kwargs: Optional[Dict[str, Any]] = None,
+         generation_kwargs: Optional[Dict[str, Any]] = None,
+         *,
+         tools: Optional[Union[List[Tool], Toolset]] = None,
+         streaming_callback: Optional[StreamingCallbackT] = None,
+         chat_handler_name: Optional[str] = None,
+         model_clip_path: Optional[str] = None,
+     ):
+         """
+         :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
+             If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
+         :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
+         :param n_batch: Prompt processing maximum batch size.
+         :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
+             These keyword arguments provide fine-grained control over the model loading.
+             In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
+             For more information on the available kwargs, see
+             [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
+         :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+             For more information on the available kwargs, see
+             [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+         :param tools:
+             A list of tools or a Toolset for which the model can prepare calls.
+             This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
+         :param streaming_callback: A callback function that is called when a new token is received from the stream.
+         :param chat_handler_name: Name of the chat handler for multimodal models.
+             Common options include: "Llava16ChatHandler", "MoondreamChatHandler", "Qwen25VLChatHandler".
+             For other handlers, check
+             [llama-cpp-python documentation](https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models).
+         :param model_clip_path: Path to the CLIP model for vision processing (e.g., "mmproj.bin").
+             Required when chat_handler_name is provided for multimodal models.
+         """
+
+         model_kwargs = model_kwargs or {}
+         generation_kwargs = generation_kwargs or {}
+
+         # check if the model_kwargs contain the essential parameters
+         # otherwise, populate them with values from init parameters
+         model_kwargs.setdefault("model_path", model)
+         model_kwargs.setdefault("n_ctx", n_ctx)
+         model_kwargs.setdefault("n_batch", n_batch)
+
+         _check_duplicate_tool_names(list(tools or []))
+
+         handler: Optional[Llava15ChatHandler] = None
+         # Validate multimodal requirements
+         if chat_handler_name is not None:
+             if model_clip_path is None:
+                 msg = "model_clip_path must be provided when chat_handler_name is specified for multimodal models"
+                 raise ValueError(msg)
+             # Validate chat handler by attempting to import it
+             try:
+                 handler = getattr(llama_chat_format, chat_handler_name)
+             except AttributeError as e:
+                 msg = f"Failed to import chat handler '{chat_handler_name}'."
+                 raise ValueError(msg) from e
+
+         self.model_path = model
+         self.n_ctx = n_ctx
+         self.n_batch = n_batch
+         self.model_kwargs = model_kwargs
+         self.generation_kwargs = generation_kwargs
+         self._model: Optional[Llama] = None
+         self.tools = tools
+         self.streaming_callback = streaming_callback
+         self.chat_handler_name = chat_handler_name
+         self.model_clip_path = model_clip_path
+         self._handler = handler
+
+     def warm_up(self):
+         if self._model is not None:
+             return
+
+         kwargs = self.model_kwargs.copy()
+         if "hf_tokenizer_path" in kwargs and "tokenizer" not in kwargs:
+             tokenizer = LlamaHFTokenizer.from_pretrained(kwargs["hf_tokenizer_path"])
+             kwargs["tokenizer"] = tokenizer
+
+         # Handle multimodal initialization
+         if self._handler is not None and self.model_clip_path is not None:
+             # the following command is correct, but mypy complains because handlers also have a __call__ method
+             kwargs["chat_handler"] = self._handler(clip_model_path=self.model_clip_path) # type: ignore[call-arg]
+
+         self._model = Llama(**kwargs)
+
+     def to_dict(self) -> Dict[str, Any]:
+         """
+         Serializes the component to a dictionary.
+
+         :returns:
+             Dictionary with serialized data.
+         """
+         callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
+         return default_to_dict(
+             self,
+             model=self.model_path,
+             n_ctx=self.n_ctx,
+             n_batch=self.n_batch,
+             model_kwargs=self.model_kwargs,
+             generation_kwargs=self.generation_kwargs,
+             tools=serialize_tools_or_toolset(self.tools),
+             streaming_callback=callback_name,
+             chat_handler_name=self.chat_handler_name,
+             model_clip_path=self.model_clip_path,
+         )
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> "LlamaCppChatGenerator":
+         """
+         Deserializes the component from a dictionary.
+
+         :param data:
+             Dictionary to deserialize from.
+         :returns:
+             Deserialized component.
+         """
+         deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
+         if (
+             "streaming_callback" in data["init_parameters"]
+             and data["init_parameters"]["streaming_callback"] is not None
+         ):
+             data["init_parameters"]["streaming_callback"] = deserialize_callable(
+                 data["init_parameters"]["streaming_callback"]
+             )
+         return default_from_dict(cls, data)
+
+     @component.output_types(replies=List[ChatMessage])
+     def run(
+         self,
+         messages: List[ChatMessage],
+         generation_kwargs: Optional[Dict[str, Any]] = None,
+         *,
+         tools: Optional[Union[List[Tool], Toolset]] = None,
+         streaming_callback: Optional[StreamingCallbackT] = None,
+     ) -> Dict[str, List[ChatMessage]]:
+         """
+         Run the text generation model on the given list of ChatMessages.
+
+         :param messages:
+             A list of ChatMessage instances representing the input messages.
+         :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+             For more information on the available kwargs, see
+             [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+         :param tools:
+             A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
+             parameter set during component initialization.
+         :param streaming_callback: A callback function that is called when a new token is received from the stream.
+             If set, it will override the `streaming_callback` parameter set during component initialization.
+         :returns: A dictionary with the following keys:
+             - `replies`: The responses from the model
+         """
+         if self._model is None:
+             error_msg = "The model has not been loaded. Please call warm_up() before running."
+             raise RuntimeError(error_msg)
+
+         if not messages:
+             return {"replies": []}
+
+         updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+         formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
+
+         tools = tools or self.tools
+         if isinstance(tools, Toolset):
+             tools = list(tools)
+         _check_duplicate_tool_names(tools)
+
+         llamacpp_tools: List[ChatCompletionTool] = []
+         if tools:
+             for t in tools:
+                 llamacpp_tools.append(
+                     {
+                         "type": "function",
+                         "function": {
+                             "name": t.tool_spec["name"],
+                             "description": t.tool_spec.get("description", ""),
+                             "parameters": t.tool_spec.get("parameters", {}),
+                         },
+                     }
+                 )
+
+         streaming_callback = select_streaming_callback(
+             init_callback=self.streaming_callback,
+             runtime_callback=streaming_callback,
+             requires_async=False,
+         )
+
+         if streaming_callback:
+             response_stream = self._model.create_chat_completion(
+                 messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs, stream=True
+             )
+             return self._handle_streaming_response(
+                 response_stream=response_stream, # type: ignore[arg-type]
+                 streaming_callback=streaming_callback,
+                 component_info=ComponentInfo.from_component(self),
+             ) # we know that response_stream is Iterator[CreateChatCompletionStreamResponse]
+             # because create_chat_completion was called with stream=True, but mypy doesn't know that
+
+         response = self._model.create_chat_completion(
+             messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs
+         )
+         replies = []
+         if not isinstance(response, dict):
+             msg = f"Expected a dictionary response, got a different object: {response}"
+             raise ValueError(msg)
+
+         for choice in response["choices"]:
+             chat_message = self._convert_chat_completion_choice_to_chat_message(choice, response)
+             replies.append(chat_message)
+         return {"replies": replies}
+
+     @staticmethod
+     def _handle_streaming_response(
+         response_stream: Iterator[CreateChatCompletionStreamResponse],
+         streaming_callback: SyncStreamingCallbackT,
+         component_info: ComponentInfo,
+     ) -> Dict[str, List[ChatMessage]]:
+         """
+         Take streaming responses from llama.cpp, convert to Haystack StreamingChunk objects, stream them,
+         and finally convert them to a ChatMessage.
+
+         :param response_stream: The streaming responses from llama.cpp.
+         :param streaming_callback: The callback function for streaming chunks.
+         :param component_info: The component info.
+         :returns: A dictionary with the replies.
+         """
+         streaming_chunks = []
+
+         seen_tool_call_ids = set() # Track tool call IDs we've seen
+
+         for i, chunk in enumerate(response_stream):
+             content = ""
+             tool_calls = []
+             mapped_finish_reason = None
+
+             # Track new tool call IDs in this chunk.
+             # Considering tool call ID is the only reliable way to recognize tool calls in llama.cpp streaming.
+             # They are often spread across multiple chunks.
+             new_tool_call_ids = set()
+
+             if chunk.get("choices") and len(chunk["choices"]) > 0:
+                 choice = chunk["choices"][0]
+                 delta = choice.get("delta", {})
+
+                 finish_reason = choice.get("finish_reason")
+                 mapped_finish_reason = FINISH_REASON_MAPPING.get(finish_reason or "")
+
+                 if content_raw := delta.get("content"):
+                     content = str(content_raw)
+
+                 tool_calls_data = delta.get("tool_calls")
+                 if tool_calls_data is not None and isinstance(tool_calls_data, list):
+                     for tool_call_chunk in tool_calls_data:
+                         tool_call_id = tool_call_chunk.get("id")
+                         is_new_tool_call = tool_call_id and tool_call_id not in seen_tool_call_ids
+
+                         if is_new_tool_call:
+                             new_tool_call_ids.add(tool_call_id)
+                             seen_tool_call_ids.add(tool_call_id)
+
+                         function_data = tool_call_chunk.get("function", {})
+
+                         # Only include tool_name if this is a new tool call
+                         tool_name = function_data.get("name", "") if is_new_tool_call else ""
+
+                         tool_calls.append(
+                             ToolCallDelta(
+                                 index=tool_call_chunk.get("index"),
+                                 id=tool_call_id,
+                                 tool_name=tool_name,
+                                 arguments=function_data.get("arguments"),
+                             )
+                         )
+
+             # start is True if it's the first chunk or if we have new tool call IDs
+             start = i == 0 or len(new_tool_call_ids) > 0
+
+             streaming_chunk = StreamingChunk(
+                 content="" if tool_calls else content, # prioritize tool calls over content when both are present
+                 tool_calls=tool_calls,
+                 component_info=component_info,
+                 index=i,
+                 start=start,
+                 finish_reason=mapped_finish_reason,
+                 meta={
+                     "model": chunk["model"],
+                     "received_at": datetime.fromtimestamp(chunk["created"], tz=timezone.utc).isoformat(),
+                 }, # llama.cpp does not provide usage metadata during streaming
+             )
+
+             streaming_chunks.append(streaming_chunk)
+
+             # Stream the chunk
+             try:
+                 streaming_callback(streaming_chunk)
+             except Exception as e:
+                 logger.error(f"Error in streaming callback invocation: {e}")
+                 continue
+
+         message = _convert_streaming_chunks_to_chat_message(streaming_chunks)
+         return {"replies": [message]}
+
+     @staticmethod
+     def _convert_chat_completion_choice_to_chat_message(
+         choice: ChatCompletionResponseChoice, response: CreateChatCompletionResponse
+     ) -> ChatMessage:
+         llamacpp_message = choice["message"]
+         text_content = llamacpp_message.get("content", "") or None
+         tool_calls = []
+
+         if llamacpp_tool_calls := llamacpp_message.get("tool_calls", []):
+             for llamacpp_tc in llamacpp_tool_calls:
+                 arguments_str = llamacpp_tc["function"]["arguments"]
+                 try:
+                     arguments = json.loads(arguments_str)
+                     tool_calls.append(
+                         ToolCall(id=llamacpp_tc["id"], tool_name=llamacpp_tc["function"]["name"], arguments=arguments)
+                     )
+                 except json.JSONDecodeError:
+                     logger.warning(
+                         "Llama.cpp returned a malformed JSON string for tool call arguments. This tool call "
+                         "will be skipped. Tool call ID: {tc_id}, Tool name: {tc_name}, Arguments: {tc_args}",
+                         tc_id=llamacpp_tc["id"],
+                         tc_name=llamacpp_tc["function"]["name"],
+                         tc_args=arguments_str,
+                     )
+
+         finish_reason = choice.get("finish_reason")
+
+         meta = {
+             "response_id": response["id"],
+             "model": response["model"],
+             "created": response["created"],
+             "index": choice["index"],
+             "finish_reason": FINISH_REASON_MAPPING.get(finish_reason or ""),
+             "usage": response["usage"],
+         }
+
+         return ChatMessage.from_assistant(text=text_content, tool_calls=tool_calls, meta=meta)
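
The file added above is the new `LlamaCppChatGenerator`, whose streaming support is the headline change of this release (see the v1.2.0 changelog entry). As an illustrative aside rather than part of the diff, here is a minimal sketch of how that streaming path might be used; the model filename is an assumption, and `print_streaming_chunk` is Haystack's built-in console callback.

```python
# Hedged usage sketch (not part of the packaged code): stream tokens from the
# new LlamaCppChatGenerator. Assumes a locally downloaded GGUF chat model.
from haystack.components.generators.utils import print_streaming_chunk
from haystack.dataclasses import ChatMessage

from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator

generator = LlamaCppChatGenerator(
    model="zephyr-7b-beta.Q4_0.gguf",           # assumed local model path
    n_ctx=2048,
    streaming_callback=print_streaming_chunk,   # prints each StreamingChunk as it arrives
)
generator.warm_up()  # loads the model; run() raises a RuntimeError if this is skipped

result = generator.run([ChatMessage.from_user("Summarize llama.cpp in one sentence.")])
print(result["replies"][0].text)
```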

{llama_cpp_haystack-1.1.0 → llama_cpp_haystack-1.3.0}/src/haystack_integrations/components/generators/llama_cpp/generator.py
@@ -12,7 +12,7 @@ class LlamaCppGenerator:
      """
      Provides an interface to generate text using LLM via llama.cpp.

-     [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+     [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
      It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).

      Usage example: