ragbits-core 0.16.0__py3-none-any.whl → 1.4.0.dev202512021005__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. ragbits/core/__init__.py +21 -2
  2. ragbits/core/audit/__init__.py +15 -157
  3. ragbits/core/audit/metrics/__init__.py +83 -0
  4. ragbits/core/audit/metrics/base.py +198 -0
  5. ragbits/core/audit/metrics/logfire.py +19 -0
  6. ragbits/core/audit/metrics/otel.py +65 -0
  7. ragbits/core/audit/traces/__init__.py +171 -0
  8. ragbits/core/audit/{base.py → traces/base.py} +9 -5
  9. ragbits/core/audit/{cli.py → traces/cli.py} +8 -4
  10. ragbits/core/audit/traces/logfire.py +18 -0
  11. ragbits/core/audit/{otel.py → traces/otel.py} +5 -8
  12. ragbits/core/config.py +15 -0
  13. ragbits/core/embeddings/__init__.py +2 -1
  14. ragbits/core/embeddings/base.py +19 -0
  15. ragbits/core/embeddings/dense/base.py +10 -1
  16. ragbits/core/embeddings/dense/fastembed.py +22 -1
  17. ragbits/core/embeddings/dense/litellm.py +37 -10
  18. ragbits/core/embeddings/dense/local.py +15 -1
  19. ragbits/core/embeddings/dense/noop.py +11 -1
  20. ragbits/core/embeddings/dense/vertex_multimodal.py +14 -1
  21. ragbits/core/embeddings/sparse/bag_of_tokens.py +47 -17
  22. ragbits/core/embeddings/sparse/base.py +10 -1
  23. ragbits/core/embeddings/sparse/fastembed.py +25 -2
  24. ragbits/core/llms/__init__.py +3 -3
  25. ragbits/core/llms/base.py +612 -88
  26. ragbits/core/llms/exceptions.py +27 -0
  27. ragbits/core/llms/litellm.py +408 -83
  28. ragbits/core/llms/local.py +180 -41
  29. ragbits/core/llms/mock.py +88 -23
  30. ragbits/core/prompt/__init__.py +2 -2
  31. ragbits/core/prompt/_cli.py +32 -19
  32. ragbits/core/prompt/base.py +105 -19
  33. ragbits/core/prompt/{discovery/prompt_discovery.py → discovery.py} +1 -1
  34. ragbits/core/prompt/exceptions.py +22 -6
  35. ragbits/core/prompt/prompt.py +180 -98
  36. ragbits/core/sources/__init__.py +2 -0
  37. ragbits/core/sources/azure.py +1 -1
  38. ragbits/core/sources/base.py +8 -1
  39. ragbits/core/sources/gcs.py +1 -1
  40. ragbits/core/sources/git.py +1 -1
  41. ragbits/core/sources/google_drive.py +595 -0
  42. ragbits/core/sources/hf.py +71 -31
  43. ragbits/core/sources/local.py +1 -1
  44. ragbits/core/sources/s3.py +1 -1
  45. ragbits/core/utils/config_handling.py +13 -2
  46. ragbits/core/utils/function_schema.py +220 -0
  47. ragbits/core/utils/helpers.py +22 -0
  48. ragbits/core/utils/lazy_litellm.py +44 -0
  49. ragbits/core/vector_stores/base.py +18 -1
  50. ragbits/core/vector_stores/chroma.py +28 -11
  51. ragbits/core/vector_stores/hybrid.py +1 -1
  52. ragbits/core/vector_stores/hybrid_strategies.py +21 -8
  53. ragbits/core/vector_stores/in_memory.py +13 -4
  54. ragbits/core/vector_stores/pgvector.py +123 -47
  55. ragbits/core/vector_stores/qdrant.py +15 -7
  56. ragbits/core/vector_stores/weaviate.py +440 -0
  57. {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/METADATA +22 -6
  58. ragbits_core-1.4.0.dev202512021005.dist-info/RECORD +79 -0
  59. {ragbits_core-0.16.0.dist-info → ragbits_core-1.4.0.dev202512021005.dist-info}/WHEEL +1 -1
  60. ragbits/core/prompt/discovery/__init__.py +0 -3
  61. ragbits/core/prompt/lab/__init__.py +0 -0
  62. ragbits/core/prompt/lab/app.py +0 -262
  63. ragbits_core-0.16.0.dist-info/RECORD +0 -72
@@ -52,3 +52,30 @@ class LLMNotSupportingImagesError(LLMError):
 
     def __init__(self, message: str = "There are images in the prompt, but given LLM doesn't support them.") -> None:
         super().__init__(message)
+
+
+class LLMNotSupportingPdfsError(LLMError):
+    """
+    Raised when there are PDFs in the prompt, but LLM doesn't support them.
+    """
+
+    def __init__(self, message: str = "There are PDFs in the prompt, but given LLM doesn't support them.") -> None:
+        super().__init__(message)
+
+
+class LLMNotSupportingToolUseError(LLMError):
+    """
+    Raised when there are tools provided, but LLM doesn't support tool use.
+    """
+
+    def __init__(self, message: str = "There are tools provided, but given LLM doesn't support tool use.") -> None:
+        super().__init__(message)
+
+
+class LLMNotSupportingReasoningEffortError(LLMError):
+    """
+    Raised when there is reasoning effort provided, but LLM doesn't support it.
+    """
+
+    def __init__(self, model_name: str) -> None:
+        super().__init__(f"Model {model_name} does not support reasoning effort.")
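The exception classes added above let callers react to a specific missing capability instead of a generic LLMError. Below is a minimal usage sketch, not taken from the package: it assumes the import paths shown in this diff (ragbits.core.llms.litellm, ragbits.core.llms.exceptions) and a generate() call that accepts an options keyword; the model name is a placeholder and the exact generate() signature may differ.

```python
# Hypothetical handling of the capability errors introduced above.
import asyncio

from ragbits.core.llms.exceptions import (
    LLMNotSupportingPdfsError,
    LLMNotSupportingReasoningEffortError,
    LLMNotSupportingToolUseError,
)
from ragbits.core.llms.litellm import LiteLLM, LiteLLMOptions


async def main() -> None:
    llm = LiteLLM(model_name="gpt-4o-mini")  # placeholder model
    try:
        answer = await llm.generate(
            "Summarize the quarterly report in two sentences.",
            options=LiteLLMOptions(reasoning_effort="high"),
        )
    except LLMNotSupportingReasoningEffortError:
        # Retry without reasoning effort when the model cannot honour it.
        answer = await llm.generate("Summarize the quarterly report in two sentences.")
    except (LLMNotSupportingPdfsError, LLMNotSupportingToolUseError) as exc:
        raise SystemExit(f"Model capability missing: {exc}")
    print(answer)


asyncio.run(main())
```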
@@ -1,29 +1,38 @@
-from collections.abc import AsyncGenerator, Callable
-from typing import Any
+import asyncio
+import time
+from collections.abc import AsyncGenerator, Callable, Iterable
+from typing import TYPE_CHECKING, Any, Literal
 
-import litellm
-from litellm.utils import CustomStreamWrapper, ModelResponse
+import tiktoken
 from pydantic import BaseModel
 from typing_extensions import Self
 
-from ragbits.core.audit import trace
-from ragbits.core.llms.base import LLM
+from ragbits.core.audit.metrics import record_metric
+from ragbits.core.audit.metrics.base import LLMMetric, MetricType
+from ragbits.core.llms.base import LLM, LLMOptions, ToolChoice
 from ragbits.core.llms.exceptions import (
     LLMConnectionError,
     LLMEmptyResponseError,
     LLMNotSupportingImagesError,
+    LLMNotSupportingPdfsError,
+    LLMNotSupportingReasoningEffortError,
+    LLMNotSupportingToolUseError,
     LLMResponseError,
     LLMStatusError,
 )
-from ragbits.core.options import Options
 from ragbits.core.prompt.base import BasePrompt, ChatFormat
 from ragbits.core.types import NOT_GIVEN, NotGiven
+from ragbits.core.utils.lazy_litellm import LazyLiteLLM
 
+if TYPE_CHECKING:
+    from litellm import CustomStreamWrapper, ModelResponse, Router
 
-class LiteLLMOptions(Options):
+
+class LiteLLMOptions(LLMOptions):
     """
     Dataclass that represents all available LLM call options for the LiteLLM client.
     Each of them is described in the [LiteLLM documentation](https://docs.litellm.ai/docs/completion/input).
+    Reasoning effort and thinking are described in [LiteLLM Reasoning documentation](https://docs.litellm.ai/docs/reasoning_content)
     """
 
     frequency_penalty: float | None | NotGiven = NOT_GIVEN
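This hunk removes the module-level `import litellm` in favour of the LazyLiteLLM mixin (added in ragbits/core/utils/lazy_litellm.py) plus TYPE_CHECKING-only imports, so the heavy dependency is loaded only when a client actually uses it. Below is a generic sketch of that deferred-import technique; it is a simplified stand-in, not the actual LazyLiteLLM code.

```python
# Simplified illustration of the lazy-import pattern; LazyLiteLLM itself
# lives in ragbits/core/utils/lazy_litellm.py and may differ.
import importlib
from typing import Any


class LazyModule:
    """Defers importing a module until one of its attributes is accessed."""

    def __init__(self, module_name: str) -> None:
        self._module_name = module_name
        self._module: Any = None

    def __getattr__(self, name: str) -> Any:
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)


litellm = LazyModule("litellm")
# Nothing is imported until a call such as litellm.token_counter(...) happens.
```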
@@ -38,9 +47,13 @@ class LiteLLMOptions(Options):
     top_logprobs: int | None | NotGiven = NOT_GIVEN
     logit_bias: dict | None | NotGiven = NOT_GIVEN
     mock_response: str | None | NotGiven = NOT_GIVEN
+    tpm: int | None | NotGiven = NOT_GIVEN
+    rpm: int | None | NotGiven = NOT_GIVEN
+    reasoning_effort: Literal["low", "medium", "high"] | None | NotGiven = NOT_GIVEN
+    thinking: dict | None | NotGiven = NOT_GIVEN
 
 
-class LiteLLM(LLM[LiteLLMOptions]):
+class LiteLLM(LLM[LiteLLMOptions], LazyLiteLLM):
     """
     Class for interaction with any LLM supported by LiteLLM API.
     """
@@ -57,7 +70,7 @@ class LiteLLM(LLM[LiteLLMOptions]):
         api_key: str | None = None,
         api_version: str | None = None,
         use_structured_output: bool = False,
-        router: litellm.Router | None = None,
+        router: "Router | None" = None,
         custom_model_cost_config: dict | None = None,
     ) -> None:
         """
@@ -90,7 +103,29 @@ class LiteLLM(LLM[LiteLLMOptions]):
         self.router = router
         self.custom_model_cost_config = custom_model_cost_config
         if custom_model_cost_config:
-            litellm.register_model(custom_model_cost_config)
+            self._litellm.register_model(custom_model_cost_config)
+
+    def get_model_id(self) -> str:
+        """
+        Returns the model id.
+        """
+        return "litellm:" + self.model_name
+
+    def get_estimated_cost(self, prompt_tokens: int, completion_tokens: int) -> float:
+        """
+        Returns the estimated cost of the LLM call.
+
+        Args:
+            prompt_tokens: The number of tokens in the prompt.
+            completion_tokens: The number of tokens in the completion.
+
+        Returns:
+            The estimated cost of the LLM call.
+        """
+        response_cost = self._litellm.get_model_info(self.model_name)
+        response_cost_input = prompt_tokens * response_cost["input_cost_per_token"]
+        response_cost_output = completion_tokens * response_cost["output_cost_per_token"]
+        return response_cost_input + response_cost_output
 
     def count_tokens(self, prompt: BasePrompt) -> int:
         """
@@ -102,75 +137,145 @@ class LiteLLM(LLM[LiteLLMOptions]):
         Returns:
             Number of tokens in the prompt.
         """
-        return sum(litellm.token_counter(model=self.model_name, text=message["content"]) for message in prompt.chat)
+        return sum(
+            self._litellm.token_counter(model=self.model_name, text=message.get("content") or "")
+            for message in prompt.chat
+        )
+
+    def get_token_id(self, token: str) -> int:
+        """
+        Gets token id.
+
+        Args:
+            token: The token to encode.
+
+        Returns:
+            The id for the given token.
+        """
+        try:
+            tokenizer = tiktoken.encoding_for_model(self.model_name)
+            return tokenizer.encode_single_token(token)
+        except KeyError:
+            return self._litellm.encode(model=self.model_name, text=token)[0]
 
     async def _call(
         self,
-        prompt: BasePrompt,
+        prompt: Iterable[BasePrompt],
         options: LiteLLMOptions,
-        json_mode: bool = False,
-        output_schema: type[BaseModel] | dict | None = None,
-    ) -> dict:
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
+    ) -> list[dict]:
         """
         Calls the appropriate LLM endpoint with the given prompt and options.
 
         Args:
-            prompt: BasePrompt object containing the conversation
+            prompt: Iterable of BasePrompt objects containing conversations
             options: Additional settings used by the LLM.
-            json_mode: Force the response to be in JSON format.
-            output_schema: Output schema for requesting a specific response format.
-                Only used if the client has been initialized with `use_structured_output=True`.
+            tools: Functions to be used as tools by the LLM.
+            tool_choice: Parameter that allows to control what tool is used. Can be one of:
+                - "auto": let model decide if tool call is needed
+                - "none": do not call tool
+                - "required: enforce tool usage (model decides which one)
+                - dict: tool dict corresponding to one of provided tools
 
         Returns:
-            Response string from LLM.
+            list of dictionaries with responses from the LLM and metadata.
 
         Raises:
             LLMConnectionError: If there is a connection error with the LLM API.
             LLMStatusError: If the LLM API returns an error status code.
             LLMResponseError: If the LLM API response is invalid.
             LLMNotSupportingImagesError: If the model does not support images.
+            LLMNotSupportingPdfsError: If the model does not support PDFs.
+            LLMNotSupportingToolUseError: If the model does not support tool use.
         """
-        if prompt.list_images() and not litellm.supports_vision(self.model_name):
+        if any(p.list_images() for p in prompt) and not self._litellm.supports_vision(self.model_name):
            raise LLMNotSupportingImagesError()
 
-        response_format = self._get_response_format(output_schema=output_schema, json_mode=json_mode)
-
-        response = await self._get_litellm_response(
-            conversation=prompt.chat,
-            options=options,
-            response_format=response_format,
+        if any(p.list_pdfs() for p in prompt) and not self._litellm.supports_pdf_input(self.model_name):
+            raise LLMNotSupportingPdfsError()
+
+        if tools and not self._litellm.supports_function_calling(self.model_name):
+            raise LLMNotSupportingToolUseError()
+
+        if options.reasoning_effort and not self._litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
+
+        start_time = time.perf_counter()
+        raw_responses = await asyncio.gather(
+            *(
+                self._get_litellm_response(
+                    conversation=single_prompt.chat,
+                    options=options,
+                    response_format=self._get_response_format(
+                        output_schema=single_prompt.output_schema(), json_mode=single_prompt.json_mode
+                    ),
+                    tools=tools,
+                    tool_choice=tool_choice,
+                )
+                for single_prompt in prompt
+            )
         )
-        if not response.choices:  # type: ignore
-            raise LLMEmptyResponseError()
-        results = {}
-        results["response"] = response.choices[0].message.content  # type: ignore
 
-        if response.usage:  # type: ignore
-            results["completion_tokens"] = response.usage.completion_tokens  # type: ignore
-            results["prompt_tokens"] = response.usage.prompt_tokens  # type: ignore
-            results["total_tokens"] = response.usage.total_tokens  # type: ignore
+        results: list[dict] = []
+        throughput_batch = time.perf_counter() - start_time
+
+        for response in raw_responses:
+            if not response.choices:  # type: ignore
+                raise LLMEmptyResponseError()
+
+            result = {}
+            result["response"] = response.choices[0].message.content  # type: ignore
+            result["reasoning"] = getattr(response.choices[0].message, "reasoning_content", None)  # type: ignore
+            result["throughput"] = throughput_batch / float(len(raw_responses))
+
+            result["tool_calls"] = (
+                [
+                    {
+                        "name": tool_call.function.name,
+                        "arguments": tool_call.function.arguments,
+                        "type": tool_call.type,
+                        "id": tool_call.id,
+                    }
+                    for tool_call in tool_calls
+                ]
+                if tools and (tool_calls := response.choices[0].message.tool_calls)  # type: ignore
+                else None
+            )
 
-        if options.logprobs:
-            results["logprobs"] = response.choices[0].logprobs["content"]  # type: ignore
+            if options.logprobs:
+                result["logprobs"] = response.choices[0].logprobs["content"]  # type: ignore
 
-        return results  # type: ignore
+            if response.usage:  # type: ignore
+                result["usage"] = {
+                    "completion_tokens": response.usage.completion_tokens,  # type: ignore
+                    "prompt_tokens": response.usage.prompt_tokens,  # type: ignore
+                    "total_tokens": response.usage.total_tokens,  # type: ignore
+                }
+
+            results.append(result)
+
+        return results
 
     async def _call_streaming(
         self,
         prompt: BasePrompt,
         options: LiteLLMOptions,
-        json_mode: bool = False,
-        output_schema: type[BaseModel] | dict | None = None,
-    ) -> AsyncGenerator[str, None]:
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
+    ) -> AsyncGenerator[dict, None]:
         """
         Calls the appropriate LLM endpoint with the given prompt and options.
 
         Args:
             prompt: BasePrompt object containing the conversation
             options: Additional settings used by the LLM.
-            json_mode: Force the response to be in JSON format.
-            output_schema: Output schema for requesting a specific response format.
-                Only used if the client has been initialized with `use_structured_output=True`.
+            tools: Functions to be used as tools by the LLM.
+            tool_choice: Parameter that allows to control what tool is used. Can be one of:
+                - "auto": let model decide if tool call is needed
+                - "none": do not call tool
+                - "required: enforce tool usage (model decides which one)
+                - dict: tool dict corresponding to one of provided tools
 
         Returns:
             Response string from LLM.
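In the hunk above, _call now accepts an iterable of prompts and fans them out concurrently with asyncio.gather, then divides the measured wall time evenly across results as a per-prompt "throughput" value. The standalone sketch below mirrors that pattern with a dummy coroutine in place of _get_litellm_response; nothing in it comes from ragbits itself.

```python
# Standalone sketch of the fan-out used by _call above; fake_completion is a
# stand-in for _get_litellm_response and is not part of ragbits.
import asyncio
import time


async def fake_completion(prompt: str) -> str:
    await asyncio.sleep(0.1)  # simulate request latency
    return f"echo: {prompt}"


async def batch_call(prompts: list[str]) -> list[dict]:
    start = time.perf_counter()
    responses = await asyncio.gather(*(fake_completion(p) for p in prompts))
    elapsed = time.perf_counter() - start
    # Wall time is split evenly across the batch, as in the hunk above.
    return [{"response": r, "throughput": elapsed / len(responses)} for r in responses]


print(asyncio.run(batch_call(["a", "b", "c"])))
```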
@@ -180,69 +285,289 @@ class LiteLLM(LLM[LiteLLMOptions]):
             LLMStatusError: If the LLM API returns an error status code.
             LLMResponseError: If the LLM API response is invalid.
             LLMNotSupportingImagesError: If the model does not support images.
+            LLMNotSupportingPdfsError: If the model does not support PDFs.
+            LLMNotSupportingToolUseError: If the model does not support tool use.
         """
-        if prompt.list_images() and not litellm.supports_vision(self.model_name):
+        if prompt.list_images() and not self._litellm.supports_vision(self.model_name):
             raise LLMNotSupportingImagesError()
 
-        response_format = self._get_response_format(output_schema=output_schema, json_mode=json_mode)
+        if prompt.list_pdfs() and not self._litellm.supports_pdf_input(self.model_name):
+            raise LLMNotSupportingPdfsError()
+
+        if tools and not self._litellm.supports_function_calling(self.model_name):
+            raise LLMNotSupportingToolUseError()
+
+        if options.reasoning_effort and not self._litellm.supports_reasoning(self.model_name):
+            raise LLMNotSupportingReasoningEffortError(self.model_name)
 
-        with trace(
-            messages=prompt.chat,
-            model=self.model_name,
-            base_url=self.api_base,
-            api_version=self.api_version,
+        response_format = self._get_response_format(output_schema=prompt.output_schema(), json_mode=prompt.json_mode)
+        input_tokens = self.count_tokens(prompt)
+
+        provider_calculated_usage = None
+
+        start_time = time.perf_counter()
+        response = await self._get_litellm_response(
+            conversation=prompt.chat,
+            options=options,
             response_format=response_format,
-            options=options.dict(),
-        ) as outputs:
-            response = await self._get_litellm_response(
-                conversation=prompt.chat,
-                options=options,
-                response_format=response_format,
-                stream=True,
-            )
-            if not response.completion_stream:  # type: ignore
+            tools=tools,
+            tool_choice=tool_choice,
+            stream=True,
+            stream_options={"include_usage": True},
+        )
+
+        try:
+            if (not response.completion_stream and not response.choices) and not response.reasoning:  # type: ignore
                 raise LLMEmptyResponseError()
+        except AttributeError:
+            # some providers might not include some parameters (i.e. Gemini -> choices)
+            pass
+
+        async def response_to_async_generator(response: "CustomStreamWrapper") -> AsyncGenerator[dict, None]:
+            nonlocal input_tokens, provider_calculated_usage
+            output_tokens = 0
+            tool_calls: list[dict] = []
+
+            async for item in response:
+                reasoning_content = getattr(item.choices[0].delta, "reasoning_content", None)
+                if content := item.choices[0].delta.content or reasoning_content:
+                    output_tokens += 1
+                    if output_tokens == 1:
+                        record_metric(
+                            metric=LLMMetric.TIME_TO_FIRST_TOKEN,
+                            value=time.perf_counter() - start_time,
+                            metric_type=MetricType.HISTOGRAM,
+                            model=self.model_name,
+                            prompt=prompt.__class__.__name__,
+                        )
+
+                    yield {"response": content, "reasoning": bool(reasoning_content)}
+
+                if tool_calls_delta := item.choices[0].delta.tool_calls:
+                    for tool_call_chunk in tool_calls_delta:
+                        while len(tool_calls) <= tool_call_chunk.index:
+                            tool_calls.append({"id": "", "type": "", "name": "", "arguments": ""})
+
+                        tool_calls[tool_call_chunk.index]["id"] += tool_call_chunk.id or ""
+                        tool_calls[tool_call_chunk.index]["type"] += (
+                            tool_call_chunk.type
+                            if tool_call_chunk.type
+                            and tool_call_chunk.type != tool_calls[tool_call_chunk.index]["type"]
+                            else ""
+                        )
+                        tool_calls[tool_call_chunk.index]["name"] += tool_call_chunk.function.name or ""
+                        tool_calls[tool_call_chunk.index]["arguments"] += tool_call_chunk.function.arguments or ""
+
+                if usage := getattr(item, "usage", None):
+                    provider_calculated_usage = usage
+
+            total_tokens = input_tokens + output_tokens
+
+            if provider_calculated_usage:
+                input_tokens = provider_calculated_usage.prompt_tokens
+                output_tokens = provider_calculated_usage.completion_tokens
+                total_tokens = provider_calculated_usage.total_tokens
+
+            if tool_calls:
+                yield {"tool_calls": tool_calls}
+
+            total_time = time.perf_counter() - start_time
+
+            yield {
+                "usage": {
+                    "prompt_tokens": input_tokens,
+                    "completion_tokens": output_tokens,
+                    "total_tokens": total_tokens,
+                }
+            }
+
+            record_metric(
+                metric=LLMMetric.INPUT_TOKENS,
+                value=input_tokens,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+            record_metric(
+                metric=LLMMetric.TOKEN_THROUGHPUT,
+                value=output_tokens / total_time,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+            record_metric(
+                metric=LLMMetric.PROMPT_THROUGHPUT,
+                value=total_time,
+                metric_type=MetricType.HISTOGRAM,
+                model=self.model_name,
+                prompt=prompt.__class__.__name__,
+            )
+
+        return response_to_async_generator(response)  # type: ignore
 
-        async def response_to_async_generator(response: CustomStreamWrapper) -> AsyncGenerator[str, None]:
-            async for item in response:
-                yield item.choices[0].delta.content or ""
+    def _create_router_from_self_and_options(self, options: LiteLLMOptions) -> "Router":
+        params: dict[str, Any] = {
+            "model": self.model_name,
+            "api_key": self.api_key,
+            "api_version": self.api_version,
+            "base_url": self.api_base,
+        }
 
-        outputs.response = response_to_async_generator(response)  # type: ignore
+        if options.tpm:
+            params["tpm"] = options.tpm
+        if options.rpm:
+            params["rpm"] = options.rpm
 
-        return outputs.response  # type: ignore
+        return self._litellm.Router(
+            model_list=[{"model_name": self.model_name, "litellm_params": params}],
+            routing_strategy="usage-based-routing-v2",
+            enable_pre_call_checks=True,
+        )
 
     async def _get_litellm_response(
         self,
         conversation: ChatFormat,
         options: LiteLLMOptions,
         response_format: type[BaseModel] | dict | None,
+        tools: list[dict] | None = None,
+        tool_choice: ToolChoice | None = None,
         stream: bool = False,
-    ) -> ModelResponse | CustomStreamWrapper:
-        entrypoint = self.router or litellm
+        stream_options: dict | None = None,
+    ) -> "ModelResponse | CustomStreamWrapper":
+        entrypoint = self.router or self._create_router_from_self_and_options(options)
+
+        # Preprocess messages for Claude with reasoning enabled
+        processed_conversation = self._preprocess_messages_for_claude(conversation, options)
+
+        # Prepare kwargs for the completion call
+        completion_kwargs = {
+            "messages": processed_conversation,
+            "model": self.model_name,
+            "response_format": response_format,
+            "tools": tools,
+            "tool_choice": tool_choice,
+            "stream": stream,
+            **options.dict(),
+        }
+
+        supported_openai_params = self._litellm.get_supported_openai_params(model=self.model_name) or []
+        if "reasoning_effort" not in supported_openai_params:
+            completion_kwargs.pop("reasoning_effort")
+        if "thinking" not in supported_openai_params:
+            completion_kwargs.pop("thinking")
+
+        if stream_options is not None:
+            completion_kwargs["stream_options"] = stream_options
 
         try:
-            response = await entrypoint.acompletion(
-                messages=conversation,
-                model=self.model_name,
-                base_url=self.api_base,
-                api_key=self.api_key,
-                api_version=self.api_version,
-                response_format=response_format,
-                stream=stream,
-                **options.dict(),
-            )
-        except litellm.openai.APIConnectionError as exc:
+            response = await entrypoint.acompletion(**completion_kwargs)
+        except self._litellm.openai.APIConnectionError as exc:
             raise LLMConnectionError() from exc
-        except litellm.openai.APIStatusError as exc:
+        except self._litellm.openai.APIStatusError as exc:
             raise LLMStatusError(exc.message, exc.status_code) from exc
-        except litellm.openai.APIResponseValidationError as exc:
+        except self._litellm.openai.APIResponseValidationError as exc:
             raise LLMResponseError() from exc
         return response
 
+    def _preprocess_messages_for_claude(self, conversation: ChatFormat, options: LiteLLMOptions) -> ChatFormat:
+        """
+        Preprocess messages for Claude when reasoning is enabled.
+
+        Claude + reasoning_effort + tool calls creates a conflict:
+        - LiteLLM validates messages against OpenAI format (rejects Claude native format)
+        - Claude requires thinking blocks when reasoning_effort is set (rejects OpenAI format)
+
+        Subject to removal after the following are resolved on LiteLLM's side:
+        Issue: https://github.com/BerriAI/litellm/issues/14194
+        Linked PR(s): https://github.com/BerriAI/litellm/pull/15220
+
+        Solution: Summarize tool call history and append to last user message.
+        This provides context to Claude without triggering validation errors.
+
+        Args:
+            conversation: The conversation in OpenAI format
+            options: LLM options including reasoning_effort
+
+        Returns:
+            Processed conversation with tool context included
+        """
+
+        def create_enhanced_user_message(
+            tool_summary_parts: list[str], original_user_msg: str | None
+        ) -> dict[str, Any]:
+            if tool_summary_parts and original_user_msg:
+                enhanced_message = original_user_msg
+                enhanced_message += "\n\n[Previous tool calls in this conversation:"
+
+                for summary in tool_summary_parts:
+                    enhanced_message += f"\n- {summary}"
+                enhanced_message += "\nUse this information to provide your final answer.]"
+                return {"role": "user", "content": enhanced_message}
+
+            return {"role": "user", "content": original_user_msg}
+
+        # Only process for Claude models with reasoning enabled
+        is_claude = "anthropic" in self.model_name.lower() or "claude" in self.model_name.lower()
+        has_reasoning = options.reasoning_effort is not NOT_GIVEN and options.reasoning_effort is not None
+
+        if not (is_claude and has_reasoning):
+            return conversation
+
+        # Check if conversation has tool calls
+        has_tool_calls = any(msg.get("role") == "assistant" and msg.get("tool_calls") for msg in conversation)
+
+        if not has_tool_calls:
+            # No tool calls, conversation is fine as-is
+            return conversation
+
+        # Build tool call summary from conversation history
+        tool_summary_parts = []
+        i = 0
+        while i < len(conversation):
+            msg = conversation[i]
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                # Found assistant message with tool calls
+                for tool_call in msg["tool_calls"]:
+                    tool_name = tool_call["function"]["name"]
+                    tool_args = tool_call["function"]["arguments"]
+                    tool_id = tool_call["id"]
+
+                    # Find corresponding tool result
+                    tool_result = None
+                    for j in range(i + 1, len(conversation)):
+                        if conversation[j].get("role") == "tool" and conversation[j].get("tool_call_id") == tool_id:
+                            tool_result = conversation[j].get("content")
+                            break
+
+                    if tool_result:
+                        tool_summary_parts.append(f"{tool_name}({tool_args}) returned: {tool_result}")
+            i += 1
+
+        # Build processed conversation
+        processed = []
+
+        # Keep system message if present
+        for msg in conversation:
+            if msg.get("role") == "system":
+                processed.append(msg)
+                break
+
+        # Get the original user message (first non-system)
+        original_user_msg = None
+        for msg in conversation:
+            if msg.get("role") == "user":
+                original_user_msg = msg.get("content", "")
+                break
+
+        # Create enhanced user message with tool context
+        processed.append(create_enhanced_user_message(tool_summary_parts, original_user_msg))
+
+        return processed
+
     def _get_response_format(
         self, output_schema: type[BaseModel] | dict | None, json_mode: bool
     ) -> type[BaseModel] | dict | None:
-        supported_params = litellm.get_supported_openai_params(model=self.model_name)
+        supported_params = self._litellm.get_supported_openai_params(model=self.model_name)
 
         response_format = None
         if supported_params is not None and "response_format" in supported_params:
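Inside response_to_async_generator above, streamed tool calls arrive as fragments keyed by index, and the loop concatenates id, type, name, and arguments until each call is complete. The sketch below replays that accumulation with plain dicts standing in for litellm's delta objects; the chunk contents are invented for illustration.

```python
# Plain-dict stand-ins for litellm streaming deltas; illustrates the
# accumulation logic from response_to_async_generator above.
chunks = [
    {"index": 0, "id": "call_1", "type": "function", "name": "get_w", "arguments": '{"ci'},
    {"index": 0, "id": "", "type": None, "name": "eather", "arguments": 'ty": "Paris"}'},
]

tool_calls: list[dict] = []
for chunk in chunks:
    # Grow the list until the chunk's index exists, then append each fragment.
    while len(tool_calls) <= chunk["index"]:
        tool_calls.append({"id": "", "type": "", "name": "", "arguments": ""})
    entry = tool_calls[chunk["index"]]
    entry["id"] += chunk["id"] or ""
    entry["type"] += chunk["type"] if chunk["type"] and chunk["type"] != entry["type"] else ""
    entry["name"] += chunk["name"] or ""
    entry["arguments"] += chunk["arguments"] or ""

print(tool_calls)
# [{'id': 'call_1', 'type': 'function', 'name': 'get_weather', 'arguments': '{"city": "Paris"}'}]
```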
@@ -271,7 +596,7 @@ class LiteLLM(LLM[LiteLLMOptions]):
             LiteLLM: An initialized LiteLLM instance.
         """
         if "router" in config:
-            router = litellm.router.Router(model_list=config["router"])
+            router = cls._get_litellm_module().Router(model_list=config["router"])
             config["router"] = router
 
         # Map base_url to api_base if present
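In from_config, the "router" entry of the config is turned into a litellm Router via the lazily imported module. A hypothetical config is sketched below; the keys inside "litellm_params" follow LiteLLM's Router model_list format rather than anything defined by this diff, and all values are placeholders.

```python
# Hypothetical configuration; values are placeholders.
from ragbits.core.llms.litellm import LiteLLM

config = {
    "model_name": "gpt-4o-mini",
    "router": [
        {"model_name": "gpt-4o-mini", "litellm_params": {"model": "gpt-4o-mini", "rpm": 60}},
        {
            "model_name": "gpt-4o-mini",
            "litellm_params": {"model": "azure/gpt-4o-mini", "api_base": "https://example-endpoint", "rpm": 60},
        },
    ],
}

# The hunk above converts config["router"] into a litellm Router before construction.
llm = LiteLLM.from_config(config)
```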