ragbits-core 0.16.0__py3-none-any.whl → 1.4.0.dev202512021005__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- ragbits/core/__init__.py +21 -2
- ragbits/core/audit/__init__.py +15 -157
- ragbits/core/audit/metrics/__init__.py +83 -0
- ragbits/core/audit/metrics/base.py +198 -0
- ragbits/core/audit/metrics/logfire.py +19 -0
- ragbits/core/audit/metrics/otel.py +65 -0
- ragbits/core/audit/traces/__init__.py +171 -0
- ragbits/core/audit/{base.py → traces/base.py} +9 -5
- ragbits/core/audit/{cli.py → traces/cli.py} +8 -4
- ragbits/core/audit/traces/logfire.py +18 -0
- ragbits/core/audit/{otel.py → traces/otel.py} +5 -8
- ragbits/core/config.py +15 -0
- ragbits/core/embeddings/__init__.py +2 -1
- ragbits/core/embeddings/base.py +19 -0
- ragbits/core/embeddings/dense/base.py +10 -1
- ragbits/core/embeddings/dense/fastembed.py +22 -1
- ragbits/core/embeddings/dense/litellm.py +37 -10
- ragbits/core/embeddings/dense/local.py +15 -1
- ragbits/core/embeddings/dense/noop.py +11 -1
- ragbits/core/embeddings/dense/vertex_multimodal.py +14 -1
- ragbits/core/embeddings/sparse/bag_of_tokens.py +47 -17
- ragbits/core/embeddings/sparse/base.py +10 -1
- ragbits/core/embeddings/sparse/fastembed.py +25 -2
- ragbits/core/llms/__init__.py +3 -3
- ragbits/core/llms/base.py +612 -88
- ragbits/core/llms/exceptions.py +27 -0
- ragbits/core/llms/litellm.py +408 -83
- ragbits/core/llms/local.py +180 -41
- ragbits/core/llms/mock.py +88 -23
- ragbits/core/prompt/__init__.py +2 -2
- ragbits/core/prompt/_cli.py +32 -19
- ragbits/core/prompt/base.py +105 -19
- ragbits/core/prompt/{discovery/prompt_discovery.py → discovery.py} +1 -1
- ragbits/core/prompt/exceptions.py +22 -6
- ragbits/core/prompt/prompt.py +180 -98
- ragbits/core/sources/__init__.py +2 -0
- ragbits/core/sources/azure.py +1 -1
- ragbits/core/sources/base.py +8 -1
- ragbits/core/sources/gcs.py +1 -1
- ragbits/core/sources/git.py +1 -1
- ragbits/core/sources/google_drive.py +595 -0
- ragbits/core/sources/hf.py +71 -31
- ragbits/core/sources/local.py +1 -1
- ragbits/core/sources/s3.py +1 -1
- ragbits/core/utils/config_handling.py +13 -2
- ragbits/core/utils/function_schema.py +220 -0
- ragbits/core/utils/helpers.py +22 -0
- ragbits/core/utils/lazy_litellm.py +44 -0
- ragbits/core/vector_stores/base.py +18 -1
- ragbits/core/vector_stores/chroma.py +28 -11
- ragbits/core/vector_stores/hybrid.py +1 -1
- ragbits/core/vector_stores/hybrid_strategies.py +21 -8
- ragbits/core/vector_stores/in_memory.py +13 -4
- ragbits/core/vector_stores/pgvector.py +123 -47
- ragbits/core/vector_stores/qdrant.py +15 -7
- ragbits/core/vector_stores/weaviate.py +440 -0
- {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/METADATA +22 -6
- ragbits_core-1.4.0.dev202512021005.dist-info/RECORD +79 -0
- {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/WHEEL +1 -1
- ragbits/core/prompt/discovery/__init__.py +0 -3
- ragbits/core/prompt/lab/__init__.py +0 -0
- ragbits/core/prompt/lab/app.py +0 -262
- ragbits_core-0.16.0.dist-info/RECORD +0 -72
ragbits/core/llms/exceptions.py
CHANGED
@@ -52,3 +52,30 @@ class LLMNotSupportingImagesError(LLMError):
 
     def __init__(self, message: str = "There are images in the prompt, but given LLM doesn't support them.") -> None:
         super().__init__(message)
+
+
+class LLMNotSupportingPdfsError(LLMError):
+    """
+    Raised when there are PDFs in the prompt, but LLM doesn't support them.
+    """
+
+    def __init__(self, message: str = "There are PDFs in the prompt, but given LLM doesn't support them.") -> None:
+        super().__init__(message)
+
+
+class LLMNotSupportingToolUseError(LLMError):
+    """
+    Raised when there are tools provided, but LLM doesn't support tool use.
+    """
+
+    def __init__(self, message: str = "There are tools provided, but given LLM doesn't support tool use.") -> None:
+        super().__init__(message)
+
+
+class LLMNotSupportingReasoningEffortError(LLMError):
+    """
+    Raised when there is reasoning effort provided, but LLM doesn't support it.
+    """
+
+    def __init__(self, model_name: str) -> None:
+        super().__init__(f"Model {model_name} does not support reasoning effort.")
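The added classes extend the existing LLMError hierarchy, so callers can handle the new capability checks the same way image support is already handled. A minimal sketch of catching them around a generate() call; the llm, prompt and options objects are placeholders and the generate() signature is assumed from the package's public API, not shown in this diff:

    from ragbits.core.llms.exceptions import (
        LLMNotSupportingPdfsError,
        LLMNotSupportingReasoningEffortError,
        LLMNotSupportingToolUseError,
    )

    async def safe_generate(llm, prompt, options):
        # Capability errors are raised before any request is sent, so they can be
        # handled separately from connection or status errors.
        try:
            return await llm.generate(prompt, options=options)
        except (
            LLMNotSupportingPdfsError,
            LLMNotSupportingToolUseError,
            LLMNotSupportingReasoningEffortError,
        ) as exc:
            print(f"Feature not supported by this model: {exc}")
            return None
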
ragbits/core/llms/litellm.py
CHANGED
@@ -1,29 +1,38 @@
-
-
+import asyncio
+import time
+from collections.abc import AsyncGenerator, Callable, Iterable
+from typing import TYPE_CHECKING, Any, Literal
 
-import
-from litellm.utils import CustomStreamWrapper, ModelResponse
+import tiktoken
 from pydantic import BaseModel
 from typing_extensions import Self
 
-from ragbits.core.audit import
-from ragbits.core.
+from ragbits.core.audit.metrics import record_metric
+from ragbits.core.audit.metrics.base import LLMMetric, MetricType
+from ragbits.core.llms.base import LLM, LLMOptions, ToolChoice
 from ragbits.core.llms.exceptions import (
     LLMConnectionError,
     LLMEmptyResponseError,
     LLMNotSupportingImagesError,
+    LLMNotSupportingPdfsError,
+    LLMNotSupportingReasoningEffortError,
+    LLMNotSupportingToolUseError,
     LLMResponseError,
     LLMStatusError,
 )
-from ragbits.core.options import Options
 from ragbits.core.prompt.base import BasePrompt, ChatFormat
 from ragbits.core.types import NOT_GIVEN, NotGiven
+from ragbits.core.utils.lazy_litellm import LazyLiteLLM
 
+if TYPE_CHECKING:
+    from litellm import CustomStreamWrapper, ModelResponse, Router
 
-
+
+class LiteLLMOptions(LLMOptions):
     """
     Dataclass that represents all available LLM call options for the LiteLLM client.
     Each of them is described in the [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input).
+    Reasoning effort and thinking are described in [LiteLLM Reasoning documentation](https://docs.litellm.ai/docs/reasoning_content)
     """
 
     frequency_penalty: float | None | NotGiven = NOT_GIVEN
@@ -38,9 +47,13 @@ class LiteLLMOptions(Options):
     top_logprobs: int | None | NotGiven = NOT_GIVEN
     logit_bias: dict | None | NotGiven = NOT_GIVEN
     mock_response: str | None | NotGiven = NOT_GIVEN
+    tpm: int | None | NotGiven = NOT_GIVEN
+    rpm: int | None | NotGiven = NOT_GIVEN
+    reasoning_effort: Literal["low", "medium", "high"] | None | NotGiven = NOT_GIVEN
+    thinking: dict | None | NotGiven = NOT_GIVEN
 
 
-class LiteLLM(LLM[LiteLLMOptions]):
+class LiteLLM(LLM[LiteLLMOptions], LazyLiteLLM):
     """
     Class for interaction with any LLM supported by LiteLLM API.
     """
@@ -57,7 +70,7 @@ class LiteLLM(LLM[LiteLLMOptions]):
         api_key: str | None = None,
         api_version: str | None = None,
         use_structured_output: bool = False,
-        router:
+        router: "Router | None" = None,
         custom_model_cost_config: dict | None = None,
     ) -> None:
         """
@@ -90,7 +103,29 @@ class LiteLLM(LLM[LiteLLMOptions]):
         self.router = router
         self.custom_model_cost_config = custom_model_cost_config
         if custom_model_cost_config:
-
+            self._litellm.register_model(custom_model_cost_config)
+
+    def get_model_id(self) -> str:
+        """
+        Returns the model id.
+        """
+        return "litellm:" + self.model_name
+
+    def get_estimated_cost(self, prompt_tokens: int, completion_tokens: int) -> float:
+        """
+        Returns the estimated cost of the LLM call.
+
+        Args:
+            prompt_tokens: The number of tokens in the prompt.
+            completion_tokens: The number of tokens in the completion.
+
+        Returns:
+            The estimated cost of the LLM call.
+        """
+        response_cost = self._litellm.get_model_info(self.model_name)
+        response_cost_input = prompt_tokens * response_cost["input_cost_per_token"]
+        response_cost_output = completion_tokens * response_cost["output_cost_per_token"]
+        return response_cost_input + response_cost_output
 
     def count_tokens(self, prompt: BasePrompt) -> int:
         """
@@ -102,75 +137,145 @@ class LiteLLM(LLM[LiteLLMOptions]):
         Returns:
             Number of tokens in the prompt.
         """
-        return sum(
+        return sum(
+            self._litellm.token_counter(model=self.model_name, text=message.get("content") or "")
+            for message in prompt.chat
+        )
+
+    def get_token_id(self, token: str) -> int:
+        """
+        Gets token id.
+
+        Args:
+            token: The token to encode.
+
+        Returns:
+            The id for the given token.
+        """
+        try:
+            tokenizer = tiktoken.encoding_for_model(self.model_name)
+            return tokenizer.encode_single_token(token)
+        except KeyError:
+            return self._litellm.encode(model=self.model_name, text=token)[0]
 
     async def _call(
         self,
-        prompt: BasePrompt,
+        prompt: Iterable[BasePrompt],
         options: LiteLLMOptions,
-
-
-    ) -> dict:
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
+    ) -> list[dict]:
         """
         Calls the appropriate LLM endpoint with the given prompt and options.
 
         Args:
-            prompt: BasePrompt
+            prompt: Iterable of BasePrompt objects containing conversations
             options: Additional settings used by the LLM.
-
-
-
+            tools: Functions to be used as tools by the LLM.
+            tool_choice: Parameter that allows to control what tool is used. Can be one of:
+                - "auto": let model decide if tool call is needed
+                - "none": do not call tool
+                - "required: enforce tool usage (model decides which one)
+                - dict: tool dict corresponding to one of provided tools
 
         Returns:
-
+            list of dictionaries with responses from the LLM and metadata.
 
         Raises:
             LLMConnectionError: If there is a connection error with the LLM API.
             LLMStatusError: If the LLM API returns an error status code.
             LLMResponseError: If the LLM API response is invalid.
             LLMNotSupportingImagesError: If the model does not support images.
+            LLMNotSupportingPdfsError: If the model does not support PDFs.
+            LLMNotSupportingToolUseError: If the model does not support tool use.
         """
-        if
+        if any(p.list_images() for p in prompt) and not self._litellm.supports_vision(self.model_name):
             raise LLMNotSupportingImagesError()
 
-
-
-
-
-
-
+        if any(p.list_pdfs() for p in prompt) and not self._litellm.supports_pdf_input(self.model_name):
+            raise LLMNotSupportingPdfsError()
+
+        if tools and not self._litellm.supports_function_calling(self.model_name):
+            raise LLMNotSupportingToolUseError()
+
+        if options.reasoning_effort and not self._litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
+
+        start_time = time.perf_counter()
+        raw_responses = await asyncio.gather(
+            *(
+                self._get_litellm_response(
+                    conversation=single_prompt.chat,
+                    options=options,
+                    response_format=self._get_response_format(
+                        output_schema=single_prompt.output_schema(), json_mode=single_prompt.json_mode
+                    ),
+                    tools=tools,
+                    tool_choice=tool_choice,
+                )
+                for single_prompt in prompt
+            )
         )
-        if not response.choices:  # type: ignore
-            raise LLMEmptyResponseError()
-        results = {}
-        results["response"] = response.choices[0].message.content  # type: ignore
 
-
-
-
-
+        results: list[dict] = []
+        throughput_batch = time.perf_counter() - start_time
+
+        for response in raw_responses:
+            if not response.choices:  # type: ignore
+                raise LLMEmptyResponseError()
+
+            result = {}
+            result["response"] = response.choices[0].message.content  # type: ignore
+            result["reasoning"] = getattr(response.choices[0].message, "reasoning_content", None)  # type: ignore
+            result["throughput"] = throughput_batch / float(len(raw_responses))
+
+            result["tool_calls"] = (
+                [
+                    {
+                        "name": tool_call.function.name,
+                        "arguments": tool_call.function.arguments,
+                        "type": tool_call.type,
+                        "id": tool_call.id,
+                    }
+                    for tool_call in tool_calls
+                ]
+                if tools and (tool_calls := response.choices[0].message.tool_calls)  # type: ignore
+                else None
+            )
 
-
-
+            if options.logprobs:
+                result["logprobs"] = response.choices[0].logprobs["content"]  # type: ignore
 
-
+            if response.usage:  # type: ignore
+                result["usage"] = {
+                    "completion_tokens": response.usage.completion_tokens,  # type: ignore
+                    "prompt_tokens": response.usage.prompt_tokens,  # type: ignore
+                    "total_tokens": response.usage.total_tokens,  # type: ignore
+                }
+
+            results.append(result)
+
+        return results
 
     async def _call_streaming(
         self,
         prompt: BasePrompt,
         options: LiteLLMOptions,
-
-
-    ) -> AsyncGenerator[
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
+    ) -> AsyncGenerator[dict, None]:
         """
         Calls the appropriate LLM endpoint with the given prompt and options.
 
         Args:
             prompt: BasePrompt object containing the conversation
             options: Additional settings used by the LLM.
-
-
-
+            tools: Functions to be used as tools by the LLM.
+            tool_choice: Parameter that allows to control what tool is used. Can be one of:
+                - "auto": let model decide if tool call is needed
+                - "none": do not call tool
+                - "required: enforce tool usage (model decides which one)
+                - dict: tool dict corresponding to one of provided tools
 
         Returns:
             Response string from LLM.
@@ -180,69 +285,289 @@ class LiteLLM(LLM[LiteLLMOptions]):
             LLMStatusError: If the LLM API returns an error status code.
             LLMResponseError: If the LLM API response is invalid.
             LLMNotSupportingImagesError: If the model does not support images.
+            LLMNotSupportingPdfsError: If the model does not support PDFs.
+            LLMNotSupportingToolUseError: If the model does not support tool use.
         """
-        if prompt.list_images() and not
+        if prompt.list_images() and not self._litellm.supports_vision(self.model_name):
             raise LLMNotSupportingImagesError()
 
-
+        if prompt.list_pdfs() and not self._litellm.supports_pdf_input(self.model_name):
+            raise LLMNotSupportingPdfsError()
+
+        if tools and not self._litellm.supports_function_calling(self.model_name):
+            raise LLMNotSupportingToolUseError()
+
+        if options.reasoning_effort and not self._litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
 
-
-
-
-
-
+        response_format = self._get_response_format(output_schema=prompt.output_schema(), json_mode=prompt.json_mode)
+        input_tokens = self.count_tokens(prompt)
+
+        provider_calculated_usage = None
+
+        start_time = time.perf_counter()
+        response = await self._get_litellm_response(
+            conversation=prompt.chat,
+            options=options,
             response_format=response_format,
-
-
-
-
-
-
-
-        )
-        if not response.completion_stream:  # type: ignore
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+
+        try:
+            if (not response.completion_stream and not response.choices) and not response.reasoning:  # type: ignore
                 raise LLMEmptyResponseError()
+        except AttributeError:
+            # some providers might not include some parameters (i.e. Gemini -> choices)
+            pass
+
+        async def response_to_async_generator(response: "CustomStreamWrapper") -> AsyncGenerator[dict, None]:
+            nonlocal input_tokens, provider_calculated_usage
+            output_tokens = 0
+            tool_calls: list[dict] = []
+
+            async for item in response:
+                reasoning_content = getattr(item.choices[0].delta, "reasoning_content", None)
+                if content := item.choices[0].delta.content or reasoning_content:
+                    output_tokens += 1
+                    if output_tokens == 1:
+                        record_metric(
+                            metric=LLMMetric.TIME_TO_FIRST_TOKEN,
+                            value=time.perf_counter() - start_time,
+                            metric_type=MetricType.HISTOGRAM,
+                            model=self.model_name,
+                            prompt=prompt.__class__.__name__,
+                        )
+
+                    yield {"response": content, "reasoning": bool(reasoning_content)}
+
+                if tool_calls_delta := item.choices[0].delta.tool_calls:
+                    for tool_call_chunk in tool_calls_delta:
+                        while len(tool_calls) <= tool_call_chunk.index:
+                            tool_calls.append({"id": "", "type": "", "name": "", "arguments": ""})
+
+                        tool_calls[tool_call_chunk.index]["id"] += tool_call_chunk.id or ""
+                        tool_calls[tool_call_chunk.index]["type"] += (
+                            tool_call_chunk.type
+                            if tool_call_chunk.type
+                            and tool_call_chunk.type != tool_calls[tool_call_chunk.index]["type"]
+                            else ""
+                        )
+                        tool_calls[tool_call_chunk.index]["name"] += tool_call_chunk.function.name or ""
+                        tool_calls[tool_call_chunk.index]["arguments"] += tool_call_chunk.function.arguments or ""
+
+                if usage := getattr(item, "usage", None):
+                    provider_calculated_usage = usage
+
+            total_tokens = input_tokens + output_tokens
+
+            if provider_calculated_usage:
+                input_tokens = provider_calculated_usage.prompt_tokens
+                output_tokens = provider_calculated_usage.completion_tokens
+                total_tokens = provider_calculated_usage.total_tokens
+
+            if tool_calls:
+                yield {"tool_calls": tool_calls}
+
+            total_time = time.perf_counter() - start_time
+
+            yield {
+                "usage": {
+                    "prompt_tokens": input_tokens,
+                    "completion_tokens": output_tokens,
+                    "total_tokens": total_tokens,
+                }
+            }
+
+            record_metric(
+                metric=LLMMetric.INPUT_TOKENS,
+                value=input_tokens,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+            record_metric(
+                metric=LLMMetric.TOKEN_THROUGHPUT,
+                value=output_tokens / total_time,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+            record_metric(
+                metric=LLMMetric.PROMPT_THROUGHPUT,
+                value=total_time,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+
+        return response_to_async_generator(response)  # type: ignore
 
-
-
-
+    def _create_router_from_self_and_options(self, options: LiteLLMOptions) -> "Router":
+        params: dict[str, Any] = {
+            "model": self.model_name,
+            "api_key": self.api_key,
+            "api_version": self.api_version,
+            "base_url": self.api_base,
+        }
 
-
+        if options.tpm:
+            params["tpm"] = options.tpm
+        if options.rpm:
+            params["rpm"] = options.rpm
 
-        return
+        return self._litellm.Router(
+            model_list=[{"model_name": self.model_name, "litellm_params": params}],
+            routing_strategy="usage-based-routing-v2",
+            enable_pre_call_checks=True,
+        )
 
     async def _get_litellm_response(
         self,
         conversation: ChatFormat,
         options: LiteLLMOptions,
         response_format: type[BaseModel] | dict | None,
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
         stream: bool = False,
-
-
+        stream_options: dict | None = None,
+    ) -> "ModelResponse | CustomStreamWrapper":
+        entrypoint = self.router or self._create_router_from_self_and_options(options)
+
+        # Preprocess messages for Claude with reasoning enabled
+        processed_conversation = self._preprocess_messages_for_claude(conversation, options)
+
+        # Prepare kwargs for the completion call
+        completion_kwargs = {
+            "messages": processed_conversation,
+            "model": self.model_name,
+            "response_format": response_format,
+            "tools": tools,
+            "tool_choice": tool_choice,
+            "stream": stream,
+            **options.dict(),
+        }
+
+        supported_openai_params = self._litellm.get_supported_openai_params(model=self.model_name) or []
+        if "reasoning_effort" not in supported_openai_params:
+            completion_kwargs.pop("reasoning_effort")
+        if "thinking" not in supported_openai_params:
+            completion_kwargs.pop("thinking")
+
+        if stream_options is not None:
+            completion_kwargs["stream_options"] = stream_options
 
         try:
-            response = await entrypoint.acompletion(
-
-                model=self.model_name,
-                base_url=self.api_base,
-                api_key=self.api_key,
-                api_version=self.api_version,
-                response_format=response_format,
-                stream=stream,
-                **options.dict(),
-            )
-        except litellm.openai.APIConnectionError as exc:
+            response = await entrypoint.acompletion(**completion_kwargs)
+        except self._litellm.openai.APIConnectionError as exc:
             raise LLMConnectionError() from exc
-        except
+        except self._litellm.openai.APIStatusError as exc:
             raise LLMStatusError(exc.message, exc.status_code) from exc
-        except
+        except self._litellm.openai.APIResponseValidationError as exc:
             raise LLMResponseError() from exc
         return response
 
+    def _preprocess_messages_for_claude(self, conversation: ChatFormat, options: LiteLLMOptions) -> ChatFormat:
+        """
+        Preprocess messages for Claude when reasoning is enabled.
+
+        Claude + reasoning_effort + tool calls creates a conflict:
+        - LiteLLM validates messages against OpenAI format (rejects Claude native format)
+        - Claude requires thinking blocks when reasoning_effort is set (rejects OpenAI format)
+
+        Subject to removal after the following are resolved on LiteLLM's side:
+        Issue: https://github.com/BerriAI/litellm/issues/14194
+        Linked PR(s): https://github.com/BerriAI/litellm/pull/15220
+
+        Solution: Summarize tool call history and append to last user message.
+        This provides context to Claude without triggering validation errors.
+
+        Args:
+            conversation: The conversation in OpenAI format
+            options: LLM options including reasoning_effort
+
+        Returns:
+            Processed conversation with tool context included
+        """
+
+        def create_enhanced_user_message(
+            tool_summary_parts: list[str], original_user_msg: str | None
+        ) -> dict[str, Any]:
+            if tool_summary_parts and original_user_msg:
+                enhanced_message = original_user_msg
+                enhanced_message += "\n\n[Previous tool calls in this conversation:"
+
+                for summary in tool_summary_parts:
+                    enhanced_message += f"\n- {summary}"
+                enhanced_message += "\nUse this information to provide your final answer.]"
+                return {"role": "user", "content": enhanced_message}
+
+            return {"role": "user", "content": original_user_msg}
+
+        # Only process for Claude models with reasoning enabled
+        is_claude = "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower()
+        has_reasoning = options.reasoning_effort is not NOT_GIVEN and options.reasoning_effort is not None
+
+        if not (is_claude and has_reasoning):
+            return conversation
+
+        # Check if conversation has tool calls
+        has_tool_calls = any(msg.get("role") == "assistant" and msg.get("tool_calls") for msg in conversation)
+
+        if not has_tool_calls:
+            # No tool calls, conversation is fine as-is
+            return conversation
+
+        # Build tool call summary from conversation history
+        tool_summary_parts = []
+        i = 0
+        while i < len(conversation):
+            msg = conversation[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                # Found assistant message with tool calls
+                for tool_call in msg["tool_calls"]:
+                    tool_name = tool_call["function"]["name"]
+                    tool_args = tool_call["function"]["arguments"]
+                    tool_id = tool_call["id"]
+
+                    # Find corresponding tool result
+                    tool_result = None
+                    for j in range(i + 1, len(conversation)):
+                        if conversation[j].get("role") == "tool" and conversation[j].get("tool_call_id") == tool_id:
+                            tool_result = conversation[j].get("content")
+                            break
+
+                    if tool_result:
+                        tool_summary_parts.append(f"{tool_name}({tool_args}) returned: {tool_result}")
+            i += 1
+
+        # Build processed conversation
+        processed = []
+
+        # Keep system message if present
+        for msg in conversation:
+            if msg.get("role") == "system":
+                processed.append(msg)
+                break
+
+        # Get the original user message (first non-system)
+        original_user_msg = None
+        for msg in conversation:
+            if msg.get("role") == "user":
+                original_user_msg = msg.get("content", "")
+                break
+
+        # Create enhanced user message with tool context
+        processed.append(create_enhanced_user_message(tool_summary_parts, original_user_msg))
+
+        return processed
+
     def _get_response_format(
         self, output_schema: type[BaseModel] | dict | None, json_mode: bool
     ) -> type[BaseModel] | dict | None:
-        supported_params =
+        supported_params = self._litellm.get_supported_openai_params(model=self.model_name)
 
         response_format = None
         if supported_params is not None and "response_format" in supported_params:
@@ -271,7 +596,7 @@ class LiteLLM(LLM[LiteLLMOptions]):
             LiteLLM: An initialized LiteLLM instance.
         """
         if "router" in config:
-            router =
+            router = cls._get_litellm_module().Router(model_list=config["router"])
             config["router"] = router
 
         # Map base_url to api_base if present