llama-cpp-haystack 1.0.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/CHANGELOG.md +32 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/PKG-INFO +4 -4
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/pyproject.toml +26 -35
- llama_cpp_haystack-1.2.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +436 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/src/haystack_integrations/components/generators/llama_cpp/generator.py +12 -7
- llama_cpp_haystack-1.2.0/src/haystack_integrations/components/generators/py.typed +0 -0
- llama_cpp_haystack-1.2.0/tests/test_chat_generator.py +1098 -0
- llama_cpp_haystack-1.0.0/src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py +0 -251
- llama_cpp_haystack-1.0.0/tests/test_chat_generator.py +0 -556
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/.gitignore +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/LICENSE.txt +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/README.md +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/examples/llama_cpp_generator_example.py +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/examples/rag_pipeline_example.py +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/pydoc/config.yml +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/src/haystack_integrations/components/generators/llama_cpp/__init__.py +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/tests/__init__.py +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/tests/models/.gitignore +0 -0
- {llama_cpp_haystack-1.0.0 → llama_cpp_haystack-1.2.0}/tests/test_generator.py +0 -0
CHANGELOG.md

```diff
@@ -1,5 +1,37 @@
 # Changelog
 
+## [integrations/llama_cpp-v1.1.0] - 2025-06-19
+
+### 🐛 Bug Fixes
+
+- Fix llama.cpp types; add py.typed; Toolset support (#1973)
+
+### 🧪 Testing
+
+- Test llama.cpp with python 3.12 (#1601)
+
+### ⚙️ CI
+
+- Review testing workflows (#1541)
+
+### 🧹 Chores
+
+- Remove Python 3.8 support (#1421)
+- Use Haystack logging across integrations (#1484)
+- Update ChatGenerators with `deserialize_tools_or_toolset_inplace` (#1623)
+- Align core-integrations Hatch scripts (#1898)
+
+### 🌀 Miscellaneous
+
+- Chore: remove `jsonschema` dependency from `default` environment (#1368)
+
+## [integrations/llama_cpp-v1.0.0] - 2025-02-07
+
+### 🚀 Features
+
+- [**breaking**] Llama.cpp - unified support for tools + refactoring (#1357)
+
+
 ## [integrations/llama_cpp-v0.4.4] - 2025-01-16
 
 ### 🧹 Chores
```
PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-cpp-haystack
-Version: 1.0.0
+Version: 1.2.0
 Summary: An integration between the llama.cpp LLM framework and Haystack
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -12,15 +12,15 @@ License-File: LICENSE.txt
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
-Requires-Python: >=3.8
-Requires-Dist: haystack-ai>=2.
+Requires-Python: >=3.9
+Requires-Dist: haystack-ai>=2.13.0
 Requires-Dist: llama-cpp-python>=0.2.87
 Description-Content-Type: text/markdown
 
```
pyproject.toml

```diff
@@ -7,7 +7,7 @@ name = "llama-cpp-haystack"
 dynamic = ["version"]
 description = 'An integration between the llama.cpp LLM framework and Haystack'
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = "Apache-2.0"
 keywords = []
 authors = [
@@ -18,15 +18,15 @@ classifiers = [
   "License :: OSI Approved :: Apache Software License",
   "Development Status :: 4 - Beta",
   "Programming Language :: Python",
-  "Programming Language :: Python :: 3.8",
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.
+dependencies = ["haystack-ai>=2.13.0", "llama-cpp-python>=0.2.87"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/llama_cpp#readme"
@@ -46,36 +46,35 @@ git_describe_command = 'git describe --tags --match="integrations/llama_cpp-v[0-
 
 [tool.hatch.envs.default]
 installer = "uv"
+dependencies = ["haystack-pydoc-tools", "ruff"]
+[tool.hatch.envs.default.scripts]
+docs = ["pydoc-markdown pydoc/config.yml"]
+fmt = "ruff check --fix {args} && ruff format {args}"
+fmt-check = "ruff check {args} && ruff format --check {args}"
+
+[tool.hatch.envs.test]
 dependencies = [
-  "coverage[toml]>=6.5",
   "pytest",
+  "pytest-asyncio",
+  "pytest-cov",
   "pytest-rerunfailures",
-  "
-  "
-  "
+  "mypy",
+  "pip",
+  "transformers[sentencepiece]"
 ]
-[tool.hatch.envs.default.scripts]
-test = "pytest {args:tests}"
-test-cov = "coverage run -m pytest {args:tests}"
-test-cov-retry = "test-cov --reruns 3 --reruns-delay 30 -x"
-cov-report = ["- coverage combine", "coverage report"]
-cov = ["test-cov", "cov-report"]
-cov-retry = ["test-cov-retry", "cov-report"]
-docs = ["pydoc-markdown pydoc/config.yml"]
-[[tool.hatch.envs.all.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
+[tool.hatch.envs.test.scripts]
+unit = 'pytest -m "not integration" {args:tests}'
+integration = 'pytest -m "integration" {args:tests}'
+all = 'pytest {args:tests}'
+cov-retry = 'all --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x'
+types = "mypy -p haystack_integrations.components.generators.llama_cpp {args}"
 
-[tool.hatch.envs.lint]
-
-
-
-
-[tool.hatch.envs.lint.scripts]
-typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
-style = ["ruff check {args:.}", "black --check --diff {args:.}"]
-fmt = ["black {args:.}", "ruff check --fix {args:.}", "style"]
-all = ["style", "typing"]
+[tool.mypy]
+install_types = true
+non_interactive = true
+check_untyped_defs = true
+disallow_incomplete_defs = true
 
 [tool.hatch.metadata]
 allow-direct-references = true
@@ -83,10 +82,6 @@ allow-direct-references = true
 [tool.ruff.lint.isort]
 known-first-party = ["haystack_integrations"]
 
-[tool.black]
-target-version = ["py38"]
-line-length = 120
-skip-string-normalization = true
 
 [tool.ruff]
 target-version = "py38"
@@ -165,7 +160,3 @@ markers = [
   "integration: marks tests as slow (deselect with '-m \"not integration\"')",
 ]
 addopts = ["--import-mode=importlib"]
-
-[[tool.mypy.overrides]]
-module = ["haystack.*", "haystack_integrations.*", "pytest.*", "llama_cpp.*"]
-ignore_missing_imports = true
```
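With the reorganized Hatch environments, the old `test`/`cov` scripts in the default environment give way to a dedicated `test` environment; under Hatch's standard `env:script` invocation these presumably run as `hatch run test:unit`, `hatch run test:integration`, `hatch run test:all`, and `hatch run test:types`, while formatting and docs stay in the default environment (`hatch run fmt`, `hatch run docs`).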
src/haystack_integrations/components/generators/llama_cpp/chat/chat_generator.py (new file)

````diff
@@ -0,0 +1,436 @@
+import json
+from datetime import datetime, timezone
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+from haystack import component, default_from_dict, default_to_dict, logging
+from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
+from haystack.dataclasses import (
+    ChatMessage,
+    ComponentInfo,
+    StreamingCallbackT,
+    ToolCall,
+    ToolCallDelta,
+    select_streaming_callback,
+)
+from haystack.dataclasses.streaming_chunk import FinishReason, StreamingChunk, SyncStreamingCallbackT
+from haystack.tools import (
+    Tool,
+    Toolset,
+    _check_duplicate_tool_names,
+    deserialize_tools_or_toolset_inplace,
+    serialize_tools_or_toolset,
+)
+from haystack.utils import deserialize_callable, serialize_callable
+from llama_cpp import (
+    ChatCompletionMessageToolCall,
+    ChatCompletionRequestAssistantMessage,
+    ChatCompletionRequestMessage,
+    ChatCompletionResponseChoice,
+    ChatCompletionTool,
+    CreateChatCompletionResponse,
+    CreateChatCompletionStreamResponse,
+    Llama,
+)
+from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+
+logger = logging.getLogger(__name__)
+
+FINISH_REASON_MAPPING: Dict[str, FinishReason] = {
+    "stop": "stop",
+    "length": "length",
+    "tool_calls": "tool_calls",
+    "function_call": "tool_calls",
+}
+
+
+def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionRequestMessage:
+    """
+    Convert a ChatMessage to the format expected by llama.cpp Chat API.
+    """
+    text_contents = message.texts
+    tool_calls = message.tool_calls
+    tool_call_results = message.tool_call_results
+
+    if not text_contents and not tool_calls and not tool_call_results:
+        msg = "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`."
+        raise ValueError(msg)
+    elif len(text_contents) + len(tool_call_results) > 1:
+        msg = "A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`."
+        raise ValueError(msg)
+
+    role = message._role.value
+
+    if role == "tool" and tool_call_results:
+        if tool_call_results[0].origin.id is None:
+            msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+            raise ValueError(msg)
+        return {
+            "role": "function",
+            "content": tool_call_results[0].result,
+            "name": tool_call_results[0].origin.tool_name,
+        }
+
+    if role == "system":
+        content = text_contents[0] if text_contents else None
+        return {"role": "system", "content": content}
+
+    if role == "user":
+        content = text_contents[0] if text_contents else None
+        return {"role": "user", "content": content}
+
+    if role == "assistant":
+        result: ChatCompletionRequestAssistantMessage = {"role": "assistant"}
+
+        if text_contents:
+            result["content"] = text_contents[0]
+
+        if tool_calls:
+            llamacpp_tool_calls: List[ChatCompletionMessageToolCall] = []
+            for tc in tool_calls:
+                if tc.id is None:
+                    msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
+                    raise ValueError(msg)
+                llamacpp_tool_calls.append(
+                    {
+                        "id": tc.id,
+                        "type": "function",
+                        # We disable ensure_ascii so special chars like emojis are not converted
+                        "function": {"name": tc.tool_name, "arguments": json.dumps(tc.arguments, ensure_ascii=False)},
+                    }
+                )
+            result["tool_calls"] = llamacpp_tool_calls
+
+        return result
+
+    error_msg = f"Unknown role: {role}"
+    raise ValueError(error_msg)
+
+
+@component
+class LlamaCppChatGenerator:
+    """
+    Provides an interface to generate text using LLM via llama.cpp.
+
+    [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+    It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
+
+    Usage example:
+    ```python
+    from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator
+    user_message = [ChatMessage.from_user("Who is the best American actor?")]
+    generator = LlamaCppChatGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
+
+    print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
+    # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
+    ```
+    """
+
+    def __init__(
+        self,
+        model: str,
+        n_ctx: Optional[int] = 0,
+        n_batch: Optional[int] = 512,
+        model_kwargs: Optional[Dict[str, Any]] = None,
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        tools: Optional[Union[List[Tool], Toolset]] = None,
+        streaming_callback: Optional[StreamingCallbackT] = None,
+    ):
+        """
+        :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
+            If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
+        :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
+        :param n_batch: Prompt processing maximum batch size.
+        :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
+            These keyword arguments provide fine-grained control over the model loading.
+            In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
+        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+        :param tools:
+            A list of tools or a Toolset for which the model can prepare calls.
+            This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
+        :param streaming_callback: A callback function that is called when a new token is received from the stream.
+        """
+
+        model_kwargs = model_kwargs or {}
+        generation_kwargs = generation_kwargs or {}
+
+        # check if the model_kwargs contain the essential parameters
+        # otherwise, populate them with values from init parameters
+        model_kwargs.setdefault("model_path", model)
+        model_kwargs.setdefault("n_ctx", n_ctx)
+        model_kwargs.setdefault("n_batch", n_batch)
+
+        _check_duplicate_tool_names(list(tools or []))
+
+        self.model_path = model
+        self.n_ctx = n_ctx
+        self.n_batch = n_batch
+        self.model_kwargs = model_kwargs
+        self.generation_kwargs = generation_kwargs
+        self._model: Optional[Llama] = None
+        self.tools = tools
+        self.streaming_callback = streaming_callback
+
+    def warm_up(self):
+        if "hf_tokenizer_path" in self.model_kwargs and "tokenizer" not in self.model_kwargs:
+            tokenizer = LlamaHFTokenizer.from_pretrained(self.model_kwargs["hf_tokenizer_path"])
+            self.model_kwargs["tokenizer"] = tokenizer
+
+        if self._model is None:
+            self._model = Llama(**self.model_kwargs)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
+        return default_to_dict(
+            self,
+            model=self.model_path,
+            n_ctx=self.n_ctx,
+            n_batch=self.n_batch,
+            model_kwargs=self.model_kwargs,
+            generation_kwargs=self.generation_kwargs,
+            tools=serialize_tools_or_toolset(self.tools),
+            streaming_callback=callback_name,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LlamaCppChatGenerator":
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
+        """
+        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
+        if (
+            "streaming_callback" in data["init_parameters"]
+            and data["init_parameters"]["streaming_callback"] is not None
+        ):
+            data["init_parameters"]["streaming_callback"] = deserialize_callable(
+                data["init_parameters"]["streaming_callback"]
+            )
+        return default_from_dict(cls, data)
+
+    @component.output_types(replies=List[ChatMessage])
+    def run(
+        self,
+        messages: List[ChatMessage],
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        *,
+        tools: Optional[Union[List[Tool], Toolset]] = None,
+        streaming_callback: Optional[StreamingCallbackT] = None,
+    ) -> Dict[str, List[ChatMessage]]:
+        """
+        Run the text generation model on the given list of ChatMessages.
+
+        :param messages:
+            A list of ChatMessage instances representing the input messages.
+        :param generation_kwargs: A dictionary containing keyword arguments to customize text generation.
+            For more information on the available kwargs, see
+            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
+        :param tools:
+            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
+            parameter set during component initialization.
+        :param streaming_callback: A callback function that is called when a new token is received from the stream.
+            If set, it will override the `streaming_callback` parameter set during component initialization.
+        :returns: A dictionary with the following keys:
+            - `replies`: The responses from the model
+        """
+        if self._model is None:
+            error_msg = "The model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(error_msg)
+
+        if not messages:
+            return {"replies": []}
+
+        updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+        formatted_messages = [_convert_message_to_llamacpp_format(msg) for msg in messages]
+
+        tools = tools or self.tools
+        if isinstance(tools, Toolset):
+            tools = list(tools)
+        _check_duplicate_tool_names(tools)
+
+        llamacpp_tools: List[ChatCompletionTool] = []
+        if tools:
+            for t in tools:
+                llamacpp_tools.append(
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": t.tool_spec["name"],
+                            "description": t.tool_spec.get("description", ""),
+                            "parameters": t.tool_spec.get("parameters", {}),
+                        },
+                    }
+                )
+
+        streaming_callback = select_streaming_callback(
+            init_callback=self.streaming_callback,
+            runtime_callback=streaming_callback,
+            requires_async=False,
+        )
+
+        if streaming_callback:
+            response_stream = self._model.create_chat_completion(
+                messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs, stream=True
+            )
+            return self._handle_streaming_response(
+                response_stream=response_stream,  # type: ignore[arg-type]
+                streaming_callback=streaming_callback,
+                component_info=ComponentInfo.from_component(self),
+            )  # we know that response_stream is Iterator[CreateChatCompletionStreamResponse]
+            # because create_chat_completion was called with stream=True, but mypy doesn't know that
+
+        response = self._model.create_chat_completion(
+            messages=formatted_messages, tools=llamacpp_tools, **updated_generation_kwargs
+        )
+        replies = []
+        if not isinstance(response, dict):
+            msg = f"Expected a dictionary response, got a different object: {response}"
+            raise ValueError(msg)
+
+        for choice in response["choices"]:
+            chat_message = self._convert_chat_completion_choice_to_chat_message(choice, response)
+            replies.append(chat_message)
+        return {"replies": replies}
+
+    @staticmethod
+    def _handle_streaming_response(
+        response_stream: Iterator[CreateChatCompletionStreamResponse],
+        streaming_callback: SyncStreamingCallbackT,
+        component_info: ComponentInfo,
+    ) -> Dict[str, List[ChatMessage]]:
+        """
+        Take streaming responses from llama.cpp, convert to Haystack StreamingChunk objects, stream them,
+        and finally convert them to a ChatMessage.
+
+        :param response_stream: The streaming responses from llama.cpp.
+        :param streaming_callback: The callback function for streaming chunks.
+        :param component_info: The component info.
+        :returns: A dictionary with the replies.
+        """
+        streaming_chunks = []
+
+        seen_tool_call_ids = set()  # Track tool call IDs we've seen
+
+        for i, chunk in enumerate(response_stream):
+            content = ""
+            tool_calls = []
+            mapped_finish_reason = None
+
+            # Track new tool call IDs in this chunk.
+            # Considering tool call ID is the only reliable way to recognize tool calls in llama.cpp streaming.
+            # They are often spread across multiple chunks.
+            new_tool_call_ids = set()
+
+            if chunk.get("choices") and len(chunk["choices"]) > 0:
+                choice = chunk["choices"][0]
+                delta = choice.get("delta", {})
+
+                finish_reason = choice.get("finish_reason")
+                mapped_finish_reason = FINISH_REASON_MAPPING.get(finish_reason or "")
+
+                if content_raw := delta.get("content"):
+                    content = str(content_raw)
+
+                tool_calls_data = delta.get("tool_calls")
+                if tool_calls_data is not None and isinstance(tool_calls_data, list):
+                    for tool_call_chunk in tool_calls_data:
+                        tool_call_id = tool_call_chunk.get("id")
+                        is_new_tool_call = tool_call_id and tool_call_id not in seen_tool_call_ids
+
+                        if is_new_tool_call:
+                            new_tool_call_ids.add(tool_call_id)
+                            seen_tool_call_ids.add(tool_call_id)
+
+                        function_data = tool_call_chunk.get("function", {})
+
+                        # Only include tool_name if this is a new tool call
+                        tool_name = function_data.get("name", "") if is_new_tool_call else ""
+
+                        tool_calls.append(
+                            ToolCallDelta(
+                                index=tool_call_chunk.get("index"),
+                                id=tool_call_id,
+                                tool_name=tool_name,
+                                arguments=function_data.get("arguments"),
+                            )
+                        )
+
+            # start is True if it's the first chunk or if we have new tool call IDs
+            start = i == 0 or len(new_tool_call_ids) > 0
+
+            streaming_chunk = StreamingChunk(
+                content="" if tool_calls else content,  # prioritize tool calls over content when both are present
+                tool_calls=tool_calls,
+                component_info=component_info,
+                index=i,
+                start=start,
+                finish_reason=mapped_finish_reason,
+                meta={
+                    "model": chunk["model"],
+                    "received_at": datetime.fromtimestamp(chunk["created"], tz=timezone.utc).isoformat(),
+                },  # llama.cpp does not provide usage metadata during streaming
+            )
+
+            streaming_chunks.append(streaming_chunk)
+
+            # Stream the chunk
+            try:
+                streaming_callback(streaming_chunk)
+            except Exception as e:
+                logger.error(f"Error in streaming callback invocation: {e}")
+                continue
+
+        message = _convert_streaming_chunks_to_chat_message(streaming_chunks)
+        return {"replies": [message]}
+
+    @staticmethod
+    def _convert_chat_completion_choice_to_chat_message(
+        choice: ChatCompletionResponseChoice, response: CreateChatCompletionResponse
+    ) -> ChatMessage:
+        llamacpp_message = choice["message"]
+        text_content = llamacpp_message.get("content", "") or None
+        tool_calls = []
+
+        if llamacpp_tool_calls := llamacpp_message.get("tool_calls", []):
+            for llamacpp_tc in llamacpp_tool_calls:
+                arguments_str = llamacpp_tc["function"]["arguments"]
+                try:
+                    arguments = json.loads(arguments_str)
+                    tool_calls.append(
+                        ToolCall(id=llamacpp_tc["id"], tool_name=llamacpp_tc["function"]["name"], arguments=arguments)
+                    )
+                except json.JSONDecodeError:
+                    logger.warning(
+                        "Llama.cpp returned a malformed JSON string for tool call arguments. This tool call "
+                        "will be skipped. Tool call ID: {tc_id}, Tool name: {tc_name}, Arguments: {tc_args}",
+                        tc_id=llamacpp_tc["id"],
+                        tc_name=llamacpp_tc["function"]["name"],
+                        tc_args=arguments_str,
+                    )
+
+        finish_reason = choice.get("finish_reason")
+
+        meta = {
+            "response_id": response["id"],
+            "model": response["model"],
+            "created": response["created"],
+            "index": choice["index"],
+            "finish_reason": FINISH_REASON_MAPPING.get(finish_reason or ""),
+            "usage": response["usage"],
+        }
+
+        return ChatMessage.from_assistant(text=text_content, tool_calls=tool_calls, meta=meta)
````
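Taken together, the new chat generator follows the OpenAI-style chat API of `llama-cpp-python`, with Haystack `Tool`/`Toolset` support and streaming. A minimal usage sketch based on the code above; the model path and the `get_weather` tool are illustrative placeholders, and whether the model actually emits tool calls depends on the GGUF's chat format:

```python
from haystack.dataclasses import ChatMessage
from haystack.tools import Tool

from haystack_integrations.components.generators.llama_cpp import LlamaCppChatGenerator


def get_weather(city: str) -> str:
    # Hypothetical tool function, for illustration only.
    return f"Sunny in {city}"


weather_tool = Tool(
    name="get_weather",
    description="Get the current weather for a city.",
    parameters={
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
    function=get_weather,
)

generator = LlamaCppChatGenerator(
    model="path/to/model.gguf",  # placeholder: any chat-capable GGUF model
    n_ctx=2048,
    n_batch=512,
    tools=[weather_tool],
    # StreamingChunk.content is emptied for tool-call chunks, so this prints text tokens only.
    streaming_callback=lambda chunk: print(chunk.content, end="", flush=True),
)
generator.warm_up()  # loads the model; run() raises RuntimeError otherwise

result = generator.run([ChatMessage.from_user("What is the weather in Paris?")])
reply = result["replies"][0]
print(reply.text, reply.tool_calls)
```

Serialization round-trips through `to_dict()`/`from_dict()`, with tools handled by `serialize_tools_or_toolset` and `deserialize_tools_or_toolset_inplace`, so the component can be used in saved pipelines like any other generator.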
src/haystack_integrations/components/generators/llama_cpp/generator.py

```diff
@@ -1,7 +1,6 @@
-import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
-from haystack import component
+from haystack import component, logging
 
 from llama_cpp import Llama
 
@@ -13,7 +12,7 @@ class LlamaCppGenerator:
     """
     Provides an interface to generate text using LLM via llama.cpp.
 
-    [llama.cpp](https://github.com/ggerganov/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
+    [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
     It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
 
     Usage example:
@@ -63,14 +62,16 @@ class LlamaCppGenerator:
         self.n_batch = n_batch
         self.model_kwargs = model_kwargs
         self.generation_kwargs = generation_kwargs
-        self.model = None
+        self.model: Optional[Llama] = None
 
     def warm_up(self):
         if self.model is None:
             self.model = Llama(**self.model_kwargs)
 
     @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
-    def run(self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None):
+    def run(
+        self, prompt: str, generation_kwargs: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, Union[List[str], List[Dict[str, Any]]]]:
         """
         Run the text generation model on the given prompt.
 
@@ -93,6 +94,10 @@ class LlamaCppGenerator:
         updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
 
         output = self.model.create_completion(prompt=prompt, **updated_generation_kwargs)
+        if not isinstance(output, dict):
+            msg = f"Expected a dictionary response, got a different object: {output}"
+            raise ValueError(msg)
+
         replies = [output["choices"][0]["text"]]
 
-        return {"replies": replies, "meta": [output]}
+        return {"replies": replies, "meta": [dict(output.items())]}
```
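For the plain `LlamaCppGenerator`, the observable changes are the typed `model` attribute, the stricter non-dict response check, and `meta` now carrying a plain-dict copy of the llama.cpp response. Usage is unchanged; a short sketch with a placeholder model path:

```python
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator

generator = LlamaCppGenerator(model="path/to/model.gguf", n_ctx=2048, n_batch=512)
generator.warm_up()  # loads the GGUF model

result = generator.run("Briefly explain the GGUF format.", generation_kwargs={"max_tokens": 128})
print(result["replies"][0])  # generated text
print(result["meta"][0])     # full create_completion response (id, choices, usage, ...)
```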