lm-deluge 0.0.78__tar.gz → 0.0.80__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lm_deluge-0.0.78/src/lm_deluge.egg-info → lm_deluge-0.0.80}/PKG-INFO +8 -8
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/README.md +7 -7
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/pyproject.toml +1 -1
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/anthropic.py +43 -16
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/gemini.py +95 -15
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/client.py +6 -5
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/config.py +10 -1
- lm_deluge-0.0.80/src/lm_deluge/llm_tools/sandbox.py +523 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/anthropic.py +15 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/google.py +15 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/openrouter.py +10 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/prompt.py +62 -24
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/warnings.py +4 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80/src/lm_deluge.egg-info}/PKG-INFO +8 -8
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge.egg-info/SOURCES.txt +1 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_filesystem_live.py +1 -1
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/LICENSE +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/setup.cfg +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/__init__.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/__init__.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/base.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/bedrock.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/chat_reasoning.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/common.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/deprecated/bedrock.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/deprecated/cohere.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/deprecated/deepseek.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/deprecated/mistral.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/deprecated/vertex.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/mistral.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/openai.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/api_requests/response.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/batches.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/anthropic/__init__.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/anthropic/bash.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/anthropic/computer_use.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/anthropic/editor.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/base.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/built_in_tools/openai.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/cache.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/cli.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/embed.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/errors.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/file.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/image.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/__init__.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/classify.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/extract.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/filesystem.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/locate.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/ocr.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/score.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/subagents.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/todos.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/llm_tools/translate.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/mock_openai.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/__init__.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/bedrock.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/cerebras.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/cohere.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/deepseek.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/fireworks.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/grok.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/groq.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/kimi.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/meta.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/minimax.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/mistral.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/openai.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/models/together.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/presets/cerebras.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/presets/meta.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/request_context.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/rerank.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/tool.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/tracker.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/usage.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/harmony.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/json.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/logprobs.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/schema.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/spatial.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/validation.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge/util/xml.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge.egg-info/dependency_links.txt +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge.egg-info/requires.txt +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/src/lm_deluge.egg-info/top_level.txt +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_builtin_tools.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_file_upload.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_filesystem.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_mock_openai.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_native_mcp_server.py +0 -0
- {lm_deluge-0.0.78 → lm_deluge-0.0.80}/tests/test_openrouter_generic.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lm_deluge
-Version: 0.0.78
+Version: 0.0.80
 Summary: Python utility for using LLM API models.
 Author-email: Benjamin Anderson <ben@trytaylor.ai>
 Requires-Python: >=3.10
@@ -52,7 +52,7 @@ Dynamic: license-file
 pip install lm-deluge
 ```

-The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `
+The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GEMINI_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

 ## Quickstart

@@ -61,9 +61,9 @@ The package relies on environment variables for API keys. Typical variables incl
 ```python
 from lm_deluge import LLMClient

-client = LLMClient("gpt-
+client = LLMClient("gpt-4.1-mini")
 resps = client.process_prompts_sync(["Hello, world!"])
-print(
+print(resps[0].completion)
 ```

 ## Spraying Across Models
@@ -74,13 +74,13 @@ To distribute your requests across models, just provide a list of more than one
 from lm_deluge import LLMClient

 client = LLMClient(
-    ["gpt-
+    ["gpt-4.1-mini", "claude-4.5-haiku"],
     max_requests_per_minute=10_000
 )
 resps = client.process_prompts_sync(
     ["Hello, ChatGPT!", "Hello, Claude!"]
 )
-print(
+print(resps[0].completion)
 ```

 ## Configuration
@@ -181,7 +181,7 @@ def get_weather(city: str) -> str:
     return f"The weather in {city} is sunny and 72°F"

 tool = Tool.from_function(get_weather)
-client = LLMClient("claude-
+client = LLMClient("claude-4.5-haiku")
 resps = client.process_prompts_sync(
     ["What's the weather in Paris?"],
     tools=[tool]
@@ -255,7 +255,7 @@ conv = (
 )

 # Use prompt caching to cache system message and tools
-client = LLMClient("claude-
+client = LLMClient("claude-4.5-sonnet")
 resps = client.process_prompts_sync(
     [conv],
     cache="system_and_tools" # Cache system message and any tools
README.md

@@ -23,7 +23,7 @@
 pip install lm-deluge
 ```

-The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `
+The package relies on environment variables for API keys. Typical variables include `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `COHERE_API_KEY`, `META_API_KEY`, and `GEMINI_API_KEY`. `LLMClient` will automatically load the `.env` file when imported; we recommend using that to set the environment variables. For Bedrock, you'll need to set `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.

 ## Quickstart

@@ -32,9 +32,9 @@ The package relies on environment variables for API keys. Typical variables incl
 ```python
 from lm_deluge import LLMClient

-client = LLMClient("gpt-
+client = LLMClient("gpt-4.1-mini")
 resps = client.process_prompts_sync(["Hello, world!"])
-print(
+print(resps[0].completion)
 ```

 ## Spraying Across Models
@@ -45,13 +45,13 @@ To distribute your requests across models, just provide a list of more than one
 from lm_deluge import LLMClient

 client = LLMClient(
-    ["gpt-
+    ["gpt-4.1-mini", "claude-4.5-haiku"],
     max_requests_per_minute=10_000
 )
 resps = client.process_prompts_sync(
     ["Hello, ChatGPT!", "Hello, Claude!"]
 )
-print(
+print(resps[0].completion)
 ```

 ## Configuration
@@ -152,7 +152,7 @@ def get_weather(city: str) -> str:
     return f"The weather in {city} is sunny and 72°F"

 tool = Tool.from_function(get_weather)
-client = LLMClient("claude-
+client = LLMClient("claude-4.5-haiku")
 resps = client.process_prompts_sync(
     ["What's the weather in Paris?"],
     tools=[tool]
@@ -226,7 +226,7 @@ conv = (
 )

 # Use prompt caching to cache system message and tools
-client = LLMClient("claude-
+client = LLMClient("claude-4.5-sonnet")
 resps = client.process_prompts_sync(
     [conv],
     cache="system_and_tools" # Cache system message and any tools
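The README paragraph changed above now lists `GEMINI_API_KEY` and notes that `LLMClient` loads `.env` automatically on import. As a minimal, illustrative pre-flight check (not part of lm-deluge), you could verify the variables it names before constructing a client:

```python
import os

# Hypothetical helper: the variable names come from the README paragraph above.
REQUIRED_KEYS = ["OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"]

missing = [key for key in REQUIRED_KEYS if not os.environ.get(key)]
if missing:
    print(f"Missing API keys (set them in .env or the shell): {missing}")
```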
src/lm_deluge/api_requests/anthropic.py

@@ -16,6 +16,7 @@ from lm_deluge.util.schema import (
     prepare_output_schema,
     transform_schema_for_anthropic,
 )
+from lm_deluge.warnings import maybe_warn

 from ..models import APIModel
 from .base import APIRequestBase, APIResponse
@@ -62,20 +63,45 @@ def _build_anthropic_request(
         "max_tokens": sampling_params.max_new_tokens,
     }

+    if model.id == "claude-4.5-opus" and sampling_params.global_effort:
+        request_json["effort"] = sampling_params.global_effort
+        _add_beta(base_headers, "effort-2025-11-24")
+
     # handle thinking
-    if model.reasoning_model
-    …
-        sampling_params.reasoning_effort
-    )
-    …
+    if model.reasoning_model:
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.reasoning_effort is not None
+        ):
+            maybe_warn("WARN_THINKING_BUDGET_AND_REASONING_EFFORT")
+
+        if sampling_params.thinking_budget is not None:
+            budget = sampling_params.thinking_budget
+        elif sampling_params.reasoning_effort is not None:
+            # translate reasoning effort of low, medium, high to budget tokens
+            budget = {
+                "none": 0,
+                "minimal": 256,
+                "low": 1024,
+                "medium": 4096,
+                "high": 16384,
+            }.get(sampling_params.reasoning_effort)
+            assert isinstance(budget, int)
+        else:
+            budget = 0
+
+        if budget > 0:
+            request_json["thinking"] = {
+                "type": "enabled",
+                "budget_tokens": budget,
+            }
+            if "top_p" in request_json:
+                request_json["top_p"] = max(request_json["top_p"], 0.95)
+            request_json["temperature"] = 1.0
+            request_json["max_tokens"] += budget
+        else:
+            request_json["thinking"] = {"type": "disabled"}
+
     else:
         request_json["thinking"] = {"type": "disabled"}
         if sampling_params.reasoning_effort:
@@ -83,10 +109,11 @@ def _build_anthropic_request(
     if system_message is not None:
         request_json["system"] = system_message

-    # handle temp + top_p for opus 4.1/sonnet 4.5
+    # handle temp + top_p for opus 4.1/sonnet 4.5.
+    # TODO: make clearer / more user-friendly so there can be NotGiven
+    # and user can control which one they want to use
     if "4-1" in model.name or "4-5" in model.name:
-    …
-        request_json.pop("top_p")
+        request_json.pop("top_p")

     # Handle structured outputs (output_format)
     if context.output_schema:
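The Anthropic builder above now accepts either an explicit `thinking_budget` or a `reasoning_effort` level and turns both into a token budget (warning when both are set). A standalone sketch of that translation, using an illustrative helper name rather than anything exported by lm-deluge:

```python
# Mirrors the effort-to-budget table in the hunk above; `effort_to_budget` is
# an illustrative helper, not an lm-deluge API.
EFFORT_TO_BUDGET = {"none": 0, "minimal": 256, "low": 1024, "medium": 4096, "high": 16384}

def effort_to_budget(reasoning_effort: str | None, thinking_budget: int | None) -> int:
    if thinking_budget is not None:
        return thinking_budget  # an explicit budget wins
    if reasoning_effort is not None:
        return EFFORT_TO_BUDGET[reasoning_effort]
    return 0  # no thinking requested

assert effort_to_budget("medium", None) == 4096
assert effort_to_budget("high", 2048) == 2048
assert effort_to_budget(None, None) == 0
```

When the resulting budget is positive, the builder enables thinking, forces `temperature` to 1.0, clamps `top_p` to at least 0.95, and adds the budget on top of `max_tokens`.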
src/lm_deluge/api_requests/gemini.py

@@ -1,6 +1,5 @@
 import json
 import os
-from typing import Any

 from aiohttp import ClientResponse

@@ -23,6 +22,21 @@ async def _build_gemini_request(
 ) -> dict:
     system_message, messages = prompt.to_gemini()

+    # For Gemini 3, inject dummy signatures when missing for function calls
+    is_gemini_3 = "gemini-3" in model.name.lower()
+    if is_gemini_3:
+        dummy_sig = "context_engineering_is_the_way_to_go"
+        for msg in messages:
+            if "parts" in msg:
+                for part in msg["parts"]:
+                    # For function calls, inject dummy signature if missing
+                    if "functionCall" in part and "thoughtSignature" not in part:
+                        part["thoughtSignature"] = dummy_sig
+                        maybe_warn(
+                            "WARN_GEMINI3_MISSING_SIGNATURE",
+                            part_type="function call",
+                        )
+
     request_json = {
         "contents": messages,
         "generationConfig": {
@@ -37,20 +51,61 @@ async def _build_gemini_request(
         request_json["systemInstruction"] = {"parts": [{"text": system_message}]}

     # Handle reasoning models (thinking)
-    …
-    if
-    …
-        thinking_config = {"includeThoughts": False, "thinkingBudget": budget}
+    is_gemini_3 = "gemini-3" in model.name.lower()
+    if is_gemini_3:
+        # gemini3 MUST think
+        if not sampling_params.reasoning_effort:
+            maybe_warn("WARN_GEMINI3_NO_REASONING")
+            effort = "low"
         else:
-    …
+            level_map = {
+                "none": "low",
+                "minimal": "low",
+                "low": "low",
+                "medium": "high", # change when supported
+                "high": "high",
+            }
+            effort = level_map[sampling_params.reasoning_effort]
+        thinking_config = {"thinkingLevel": effort}
+        request_json["generationConfig"]["thinkingConfig"] = thinking_config
+
+    elif model.reasoning_model:
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.reasoning_effort is not None
+        ):
+            maybe_warn("WARN_THINKING_BUDGET_AND_REASONING_EFFORT")
+
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.thinking_budget > 0
+        ):
+            thinking_config = {
+                "includeThoughts": True,
+                "thinkingBudget": sampling_params.thinking_budget,
+            }
+        elif sampling_params.thinking_budget == -1:
+            # dynamic thinking
+            thinking_config = {"includeThoughts": True, "thinkingBudget": -1}
+        elif sampling_params.reasoning_effort not in [None, "none"]:
+            level_map = {
+                "minimal": 256,
+                "low": 1024,
+                "medium": 4096,
+                "high": 16384,
+            }
+            assert sampling_params.reasoning_effort in level_map
+            budget = level_map[sampling_params.reasoning_effort]
+            if "flash-lite" in model.id:
+                budget = max(budget, 512)
+            thinking_config = {"includeThoughts": True, "thinkingBudget": budget}
+        elif "2.5-pro" in model.id:
+            # 2.5 pro must think.
+            thinking_config = {"includeThoughts": True, "thinkingBudget": 128}
+        else:
+            # no thoughts head empty
+            thinking_config = {"includeThoughts": False, "thinkingBudget": 0}
+
         request_json["generationConfig"]["thinkingConfig"] = thinking_config

     else:
@@ -66,6 +121,21 @@ async def _build_gemini_request(
     if sampling_params.json_mode and model.supports_json:
         request_json["generationConfig"]["responseMimeType"] = "application/json"

+    # Handle media_resolution for Gemini 3 (requires v1alpha)
+    if sampling_params.media_resolution is not None:
+        is_gemini_3 = "gemini-3" in model.name.lower()
+        if is_gemini_3:
+            # Add global media resolution to generationConfig
+            request_json["generationConfig"]["mediaResolution"] = {
+                "level": sampling_params.media_resolution
+            }
+        else:
+            # Warn if trying to use media_resolution on non-Gemini-3 models
+            maybe_warn(
+                "WARN_MEDIA_RESOLUTION_UNSUPPORTED",
+                model_name=model.name,
+            )
+
     return request_json


@@ -137,10 +207,19 @@ class GeminiRequest(APIRequestBase):
             candidate = data["candidates"][0]
             if "content" in candidate and "parts" in candidate["content"]:
                 for part in candidate["content"]["parts"]:
+                    # Extract thought signature if present
+                    thought_sig = part.get("thoughtSignature")
+
                     if "text" in part:
                         parts.append(Text(part["text"]))
                     elif "thought" in part:
-                        …
+                        # Thought with optional signature
+                        parts.append(
+                            Thinking(
+                                content=part["thought"],
+                                thought_signature=thought_sig,
+                            )
+                        )
                     elif "functionCall" in part:
                         func_call = part["functionCall"]
                         # Generate a unique ID since Gemini doesn't provide one
@@ -152,6 +231,7 @@ class GeminiRequest(APIRequestBase):
                                 id=tool_id,
                                 name=func_call["name"],
                                 arguments=func_call.get("args", {}),
+                                thought_signature=thought_sig,
                             )
                         )

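For Gemini 3 models the builder above switches from token budgets to a `thinkingLevel`, injects dummy `thoughtSignature` values on function calls that lack one, and attaches `mediaResolution` when requested. A hand-assembled illustration of the resulting request fragment (payload shape inferred from the hunks above, not captured from a live request):

```python
# Illustrative request fragment for a Gemini 3 model with reasoning_effort="high"
# and media_resolution="media_resolution_high"; values follow the mappings above.
request_json = {
    "contents": [],  # conversation turns from prompt.to_gemini()
    "generationConfig": {
        "thinkingConfig": {"thinkingLevel": "high"},  # Gemini 3 uses levels, not budgets
        "mediaResolution": {"level": "media_resolution_high"},  # Gemini 3 only
    },
}
```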
src/lm_deluge/client.py

@@ -79,7 +79,7 @@ class _LLMClient(BaseModel):
     background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
-    temperature: float = 0
+    temperature: float = 1.0
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
@@ -262,6 +262,7 @@ class _LLMClient(BaseModel):
             self.max_tokens_per_minute = max_tokens_per_minute
         if max_concurrent_requests:
             self.max_concurrent_requests = max_concurrent_requests
+        return self

     def _get_tracker(self) -> StatusTracker:
         if self._tracker is None:
@@ -336,7 +337,7 @@ class _LLMClient(BaseModel):
         if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
             data["sampling_params"] = [
                 SamplingParams(
-                    temperature=data.get("temperature", 0
+                    temperature=data.get("temperature", 1.0),
                     top_p=data.get("top_p", 1.0),
                     json_mode=data.get("json_mode", False),
                     max_new_tokens=data.get("max_new_tokens", 512),
@@ -1066,7 +1067,7 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
@@ -1095,7 +1096,7 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
@@ -1123,7 +1124,7 @@ def LLMClient(
     extra_headers: dict[str, str] | None = None,
     use_responses_api: bool = False,
     background: bool = False,
-    temperature: float = 0
+    temperature: float = 1.0,
     top_p: float = 1.0,
     json_mode: bool = False,
     max_new_tokens: int = 512,
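Note that the default `temperature` moves from 0 to 1.0 throughout the client. A minimal sketch, based on the README examples above, of pinning the old behavior explicitly:

```python
from lm_deluge import LLMClient

# The default temperature is now 1.0; pass 0.0 explicitly to keep the old default.
client = LLMClient("gpt-4.1-mini", temperature=0.0)
resps = client.process_prompts_sync(["Hello, world!"])
print(resps[0].completion)
```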
src/lm_deluge/config.py

@@ -4,14 +4,23 @@ from pydantic import BaseModel


 class SamplingParams(BaseModel):
-    temperature: float =
+    temperature: float = 1.0 # more typical for new models
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 2_048
+    global_effort: Literal["low", "medium", "high"] = "high" # for opus-4.5
     reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
     strict_tools: bool = True
+    # Gemini 3 only - controls multimodal vision processing fidelity
+    media_resolution: (
+        Literal[
+            "media_resolution_low", "media_resolution_medium", "media_resolution_high"
+        ]
+        | None
+    ) = None

     def to_vllm(self):
         try: