lm-deluge 0.0.79__py3-none-any.whl → 0.0.81__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. lm_deluge/__init__.py +1 -2
  2. lm_deluge/api_requests/anthropic.py +44 -16
  3. lm_deluge/api_requests/base.py +13 -0
  4. lm_deluge/api_requests/gemini.py +54 -41
  5. lm_deluge/api_requests/openai.py +3 -2
  6. lm_deluge/client.py +16 -16
  7. lm_deluge/config.py +3 -1
  8. lm_deluge/llm_tools/__init__.py +12 -5
  9. lm_deluge/models/anthropic.py +15 -0
  10. lm_deluge/pipelines/__init__.py +11 -0
  11. lm_deluge/{llm_tools → pipelines}/score.py +2 -2
  12. lm_deluge/{llm_tools → pipelines}/translate.py +5 -3
  13. lm_deluge/prompt.py +105 -0
  14. lm_deluge/request_context.py +2 -2
  15. lm_deluge/{tool.py → tool/__init__.py} +531 -314
  16. lm_deluge/tool/prefab/__init__.py +29 -0
  17. lm_deluge/tool/prefab/batch_tool.py +156 -0
  18. lm_deluge/{llm_tools → tool/prefab}/filesystem.py +1 -1
  19. lm_deluge/tool/prefab/memory.py +190 -0
  20. lm_deluge/tool/prefab/otc/__init__.py +165 -0
  21. lm_deluge/tool/prefab/otc/executor.py +281 -0
  22. lm_deluge/tool/prefab/otc/parse.py +188 -0
  23. lm_deluge/{llm_tools → tool/prefab}/sandbox.py +251 -61
  24. lm_deluge/{llm_tools → tool/prefab}/todos.py +1 -1
  25. lm_deluge/tool/prefab/tool_search.py +169 -0
  26. lm_deluge/warnings.py +2 -0
  27. {lm_deluge-0.0.79.dist-info → lm_deluge-0.0.81.dist-info}/METADATA +2 -3
  28. {lm_deluge-0.0.79.dist-info → lm_deluge-0.0.81.dist-info}/RECORD +36 -30
  29. lm_deluge/presets/cerebras.py +0 -17
  30. lm_deluge/presets/meta.py +0 -13
  31. /lm_deluge/{llm_tools → pipelines}/classify.py +0 -0
  32. /lm_deluge/{llm_tools → pipelines}/extract.py +0 -0
  33. /lm_deluge/{llm_tools → pipelines}/locate.py +0 -0
  34. /lm_deluge/{llm_tools → pipelines}/ocr.py +0 -0
  35. /lm_deluge/{llm_tools → tool/prefab}/subagents.py +0 -0
  36. {lm_deluge-0.0.79.dist-info → lm_deluge-0.0.81.dist-info}/WHEEL +0 -0
  37. {lm_deluge-0.0.79.dist-info → lm_deluge-0.0.81.dist-info}/licenses/LICENSE +0 -0
  38. {lm_deluge-0.0.79.dist-info → lm_deluge-0.0.81.dist-info}/top_level.txt +0 -0
lm_deluge/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from .client import APIResponse, LLMClient, SamplingParams
 from .file import File
 from .prompt import Conversation, Message
-from .tool import Tool, ToolParams
+from .tool import Tool
 
 try:
     from .mock_openai import (  # noqa
@@ -25,7 +25,6 @@ __all__ = [
     "Conversation",
     "Message",
     "Tool",
-    "ToolParams",
     "File",
 ]
 
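The only user-visible change here is that `ToolParams` is no longer re-exported from the package root. A minimal before/after sketch, using nothing beyond the exports listed in this hunk:

```python
# 0.0.81: ToolParams is gone from the top-level namespace; Tool remains.
from lm_deluge import (
    APIResponse,
    Conversation,
    File,
    LLMClient,
    Message,
    SamplingParams,
    Tool,
)

# from lm_deluge import ToolParams  # worked in 0.0.79, raises ImportError in 0.0.81
```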
lm_deluge/api_requests/anthropic.py CHANGED
@@ -16,6 +16,7 @@ from lm_deluge.util.schema import (
     prepare_output_schema,
     transform_schema_for_anthropic,
 )
+from lm_deluge.warnings import maybe_warn
 
 from ..models import APIModel
 from .base import APIRequestBase, APIResponse
@@ -62,20 +63,45 @@ def _build_anthropic_request(
         "max_tokens": sampling_params.max_new_tokens,
     }
 
+    if model.id == "claude-4.5-opus" and sampling_params.global_effort:
+        request_json["output_config"] = {"effort": sampling_params.global_effort}
+        _add_beta(base_headers, "effort-2025-11-24")
+
     # handle thinking
-    if model.reasoning_model and sampling_params.reasoning_effort:
-        # translate reasoning effort of low, medium, high to budget tokens
-        budget = {"minimal": 256, "low": 1024, "medium": 4096, "high": 16384}.get(
-            sampling_params.reasoning_effort
-        )
-        request_json["thinking"] = {
-            "type": "enabled",
-            "budget_tokens": budget,
-        }
-        if "top_p" in request_json:
-            request_json["top_p"] = max(request_json["top_p"], 0.95)
-        request_json["temperature"] = 1.0
-        request_json["max_tokens"] += budget
+    if model.reasoning_model:
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.reasoning_effort is not None
+        ):
+            maybe_warn("WARN_THINKING_BUDGET_AND_REASONING_EFFORT")
+
+        if sampling_params.thinking_budget is not None:
+            budget = sampling_params.thinking_budget
+        elif sampling_params.reasoning_effort is not None:
+            # translate reasoning effort of low, medium, high to budget tokens
+            budget = {
+                "none": 0,
+                "minimal": 256,
+                "low": 1024,
+                "medium": 4096,
+                "high": 16384,
+            }.get(sampling_params.reasoning_effort)
+            assert isinstance(budget, int)
+        else:
+            budget = 0
+
+        if budget > 0:
+            request_json["thinking"] = {
+                "type": "enabled",
+                "budget_tokens": budget,
+            }
+            if "top_p" in request_json:
+                request_json["top_p"] = max(request_json["top_p"], 0.95)
+            request_json["temperature"] = 1.0
+            request_json["max_tokens"] += budget
+        else:
+            request_json["thinking"] = {"type": "disabled"}
+
     else:
         request_json["thinking"] = {"type": "disabled"}
         if sampling_params.reasoning_effort:
@@ -83,11 +109,13 @@ def _build_anthropic_request(
     if system_message is not None:
         request_json["system"] = system_message
 
-    # handle temp + top_p for opus 4.1/sonnet 4.5
+    # handle temp + top_p for opus 4.1/sonnet 4.5.
+    # TODO: make clearer / more user-friendly so there can be NotGiven
+    # and user can control which one they want to use
     if "4-1" in model.name or "4-5" in model.name:
-        if "temperature" in request_json and "top_p" in request_json:
-            request_json.pop("top_p")
+        request_json.pop("top_p")
 
+    # print(request_json)
     # Handle structured outputs (output_format)
     if context.output_schema:
         if model.supports_json:
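The net effect of the anthropic.py changes: `claude-4.5-opus` gets an `output_config` effort knob behind the `effort-2025-11-24` beta, `reasoning_effort` is translated to an explicit `budget_tokens` value, an explicit `thinking_budget` wins over `reasoning_effort` (with a warning), and a zero budget disables thinking. A hedged sketch of the corresponding `SamplingParams`, using only fields shown in this diff; the comments note what the builder above turns them into:

```python
from lm_deluge import SamplingParams

# reasoning_effort -> budget_tokens: none=0, minimal=256, low=1024, medium=4096, high=16384
high_effort = SamplingParams(reasoning_effort="high")  # thinking enabled, budget_tokens=16384

# An explicit budget takes precedence (and also triggers
# WARN_THINKING_BUDGET_AND_REASONING_EFFORT if reasoning_effort is set too).
explicit = SamplingParams(thinking_budget=2_000)

# Budget 0 (or reasoning_effort="none") sends thinking {"type": "disabled"}.
disabled = SamplingParams(reasoning_effort="none")
```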
lm_deluge/api_requests/base.py CHANGED
@@ -222,6 +222,19 @@ class APIRequestBase(ABC):
                 usage=None,
             )
 
+        except aiohttp.ServerDisconnectedError:
+            return APIResponse(
+                id=self.context.task_id,
+                model_internal=self.context.model_name,
+                prompt=self.context.prompt,
+                sampling_params=self.context.sampling_params,
+                status_code=None,
+                is_error=True,
+                error_message="Server disconnected.",
+                content=None,
+                usage=None,
+            )
+
         except Exception as e:
             raise_if_modal_exception(e)
             tb = traceback.format_exc()
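With this hunk, a dropped connection no longer escapes as a raw `aiohttp.ServerDisconnectedError`; it is converted into an error `APIResponse` like other handled failures. A small sketch of how a caller might filter such responses (only the `APIResponse` fields used in the hunk are assumed):

```python
# Sketch: split successful responses from disconnect/other errors.
def split_responses(responses):
    """`responses` is an iterable of lm_deluge APIResponse objects."""
    good, bad = [], []
    for r in responses:
        (bad if r.is_error else good).append(r)
        if r.is_error and r.error_message == "Server disconnected.":
            # status_code is None for disconnects, so key off error_message instead.
            print(f"task {r.id}: server disconnected, consider retrying")
    return good, bad
```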
lm_deluge/api_requests/gemini.py CHANGED
@@ -1,6 +1,5 @@
 import json
 import os
-from typing import Any
 
 from aiohttp import ClientResponse
 
@@ -52,47 +51,61 @@ async def _build_gemini_request(
         request_json["systemInstruction"] = {"parts": [{"text": system_message}]}
 
     # Handle reasoning models (thinking)
-    if model.reasoning_model:
-        thinking_config: dict[str, Any] | None = None
-        effort = sampling_params.reasoning_effort
-        is_gemini_3 = "gemini-3" in model.name.lower()
+    is_gemini_3 = "gemini-3" in model.name.lower()
+    if is_gemini_3:
+        # gemini3 MUST think
+        if not sampling_params.reasoning_effort:
+            maybe_warn("WARN_GEMINI3_NO_REASONING")
+            effort = "low"
+        else:
+            level_map = {
+                "none": "low",
+                "minimal": "low",
+                "low": "low",
+                "medium": "high",  # change when supported
+                "high": "high",
+            }
+            effort = level_map[sampling_params.reasoning_effort]
+        thinking_config = {"thinkingLevel": effort}
+        request_json["generationConfig"]["thinkingConfig"] = thinking_config
 
-        if is_gemini_3:
-            # Gemini 3 uses thinkingLevel instead of thinkingBudget
-            if effort in {"none", "minimal"}:
-                thinking_config = {"thinkingLevel": "low"}
-            elif effort is None:
-                # Default to high when reasoning is enabled but no preference was provided
-                thinking_config = {"thinkingLevel": "high"}
-            else:
-                # Map reasoning_effort to thinkingLevel
-                level_map = {
-                    "minimal": "low",
-                    "low": "low",
-                    "medium": "medium",  # Will work when supported
-                    "high": "high",
-                }
-                thinking_level = level_map.get(effort, "high")
-                thinking_config = {"thinkingLevel": thinking_level}
+    elif model.reasoning_model:
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.reasoning_effort is not None
+        ):
+            maybe_warn("WARN_THINKING_BUDGET_AND_REASONING_EFFORT")
+
+        if (
+            sampling_params.thinking_budget is not None
+            and sampling_params.thinking_budget > 0
+        ):
+            thinking_config = {
+                "includeThoughts": True,
+                "thinkingBudget": sampling_params.thinking_budget,
+            }
+        elif sampling_params.thinking_budget == -1:
+            # dynamic thinking
+            thinking_config = {"includeThoughts": True, "thinkingBudget": -1}
+        elif sampling_params.reasoning_effort not in [None, "none"]:
+            level_map = {
+                "minimal": 256,
+                "low": 1024,
+                "medium": 4096,
+                "high": 16384,
+            }
+            assert sampling_params.reasoning_effort in level_map
+            budget = level_map[sampling_params.reasoning_effort]
+            if "flash-lite" in model.id:
+                budget = max(budget, 512)
+            thinking_config = {"includeThoughts": True, "thinkingBudget": budget}
+        elif "2.5-pro" in model.id:
+            # 2.5 pro must think.
+            thinking_config = {"includeThoughts": True, "thinkingBudget": 128}
         else:
-            # Gemini 2.5 uses thinkingBudget (legacy)
-            if effort is None or effort == "none":
-                budget = 128 if "2.5-pro" in model.id else 0
-                # Explicitly disable thoughts when no effort is requested
-                thinking_config = {"includeThoughts": False, "thinkingBudget": budget}
-            else:
-                thinking_config = {"includeThoughts": True}
-                if (
-                    effort in {"minimal", "low", "medium", "high"}
-                    and "flash" in model.id
-                ):
-                    budget = {
-                        "minimal": 256,
-                        "low": 1024,
-                        "medium": 4096,
-                        "high": 16384,
-                    }[effort]
-                    thinking_config["thinkingBudget"] = budget
+            # no thoughts head empty
+            thinking_config = {"includeThoughts": False, "thinkingBudget": 0}
+
         request_json["generationConfig"]["thinkingConfig"] = thinking_config
 
     else:
@@ -160,7 +173,7 @@ class GeminiRequest(APIRequestBase):
         self.request_json = await _build_gemini_request(
             self.model,
             self.context.prompt,
-            self.context.tools,
+            self.context.tools,  # type: ignore
             self.context.sampling_params,
         )
 
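Summarizing the new Gemini branching: Gemini 3 always thinks and maps `reasoning_effort` onto `thinkingLevel` ("low" or "high"), while other reasoning models use `thinkingBudget`, with `-1` meaning dynamic thinking and `2.5-pro` still getting a 128-token budget even when thinking would otherwise be off. A short sketch of the sampling-params side, restricted to fields appearing in this diff:

```python
from lm_deluge import SamplingParams

# Gemini 2.5-style models: explicit or dynamic thinking budgets.
fixed = SamplingParams(thinking_budget=1_024)  # -> {"includeThoughts": True, "thinkingBudget": 1024}
dynamic = SamplingParams(thinking_budget=-1)   # -> {"includeThoughts": True, "thinkingBudget": -1}

# Gemini 3: effort maps onto thinkingLevel; leaving it unset emits
# WARN_GEMINI3_NO_REASONING and falls back to "low".
gemini3 = SamplingParams(reasoning_effort="medium")  # currently sent as thinkingLevel="high"
```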
lm_deluge/api_requests/openai.py CHANGED
@@ -2,17 +2,18 @@ import json
 import os
 import traceback as tb
 from types import SimpleNamespace
+from typing import Sequence
 
 import aiohttp
 from aiohttp import ClientResponse
 
 from lm_deluge.request_context import RequestContext
 from lm_deluge.tool import MCPServer, Tool
-from lm_deluge.warnings import maybe_warn
 from lm_deluge.util.schema import (
     prepare_output_schema,
     transform_schema_for_openai,
 )
+from lm_deluge.warnings import maybe_warn
 
 from ..config import SamplingParams
 from ..models import APIModel
@@ -610,7 +611,7 @@ async def stream_chat(
     model_name: str,  # must correspond to registry
     prompt: Conversation,
     sampling_params: SamplingParams = SamplingParams(),
-    tools: list | None = None,
+    tools: Sequence[Tool | dict | MCPServer] | None = None,
     cache: CachePattern | None = None,
     extra_headers: dict[str, str] | None = None,
 ):
lm_deluge/client.py CHANGED
@@ -79,7 +79,7 @@ class _LLMClient(BaseModel):
     background: bool = False
     # sampling params - if provided, and sampling_params is not,
     # these override the defaults
-    temperature: float = 0.75
+    temperature: float = 1.0
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 512
@@ -337,7 +337,7 @@ class _LLMClient(BaseModel):
        if "sampling_params" not in data or len(data.get("sampling_params", [])) == 0:
            data["sampling_params"] = [
                SamplingParams(
-                    temperature=data.get("temperature", 0.75),
+                    temperature=data.get("temperature", 1.0),
                    top_p=data.get("top_p", 1.0),
                    json_mode=data.get("json_mode", False),
                    max_new_tokens=data.get("max_new_tokens", 512),
@@ -572,7 +572,7 @@ class _LLMClient(BaseModel):
        *,
        return_completions_only: Literal[True],
        show_progress: bool = ...,
-        tools: list[Tool | dict | MCPServer] | None = ...,
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
        output_schema: type[BaseModel] | dict | None = ...,
        cache: CachePattern | None = ...,
        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
@@ -585,7 +585,7 @@ class _LLMClient(BaseModel):
        *,
        return_completions_only: Literal[False] = ...,
        show_progress: bool = ...,
-        tools: list[Tool | dict | MCPServer] | None = ...,
+        tools: Sequence[Tool | dict | MCPServer] | None = ...,
        output_schema: type[BaseModel] | dict | None = ...,
        cache: CachePattern | None = ...,
        service_tier: Literal["auto", "default", "flex", "priority"] | None = ...,
@@ -597,7 +597,7 @@ class _LLMClient(BaseModel):
        *,
        return_completions_only: bool = False,
        show_progress: bool = True,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        output_schema: type[BaseModel] | dict | None = None,
        cache: CachePattern | None = None,
        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
@@ -672,7 +672,7 @@ class _LLMClient(BaseModel):
        *,
        return_completions_only: bool = False,
        show_progress=True,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        output_schema: type[BaseModel] | dict | None = None,
        cache: CachePattern | None = None,
    ):
@@ -705,7 +705,7 @@ class _LLMClient(BaseModel):
        self,
        prompt: Prompt,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        output_schema: type[BaseModel] | dict | None = None,
        cache: CachePattern | None = None,
        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
@@ -742,7 +742,7 @@ class _LLMClient(BaseModel):
        self,
        prompt: Prompt,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        output_schema: type[BaseModel] | dict | None = None,
        cache: CachePattern | None = None,
        service_tier: Literal["auto", "default", "flex", "priority"] | None = None,
@@ -835,7 +835,7 @@ class _LLMClient(BaseModel):
    async def stream(
        self,
        prompt: Prompt,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
    ):
        model, sampling_params = self._select_model()
        prompt = prompts_to_conversations([prompt])[0]
@@ -856,7 +856,7 @@ class _LLMClient(BaseModel):
        task_id: int,
        conversation: Conversation,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        max_rounds: int = 5,
    ) -> AgentLoopResponse:
        """Internal method to run agent loop and return wrapped result."""
@@ -920,7 +920,7 @@ class _LLMClient(BaseModel):
        self,
        conversation: Prompt,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        max_rounds: int = 5,
    ) -> int:
        """Start an agent loop without waiting for it to complete.
@@ -967,7 +967,7 @@ class _LLMClient(BaseModel):
        self,
        conversation: Prompt,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        max_rounds: int = 5,
        show_progress: bool = False,
    ) -> tuple[Conversation, APIResponse]:
@@ -986,7 +986,7 @@ class _LLMClient(BaseModel):
        self,
        conversation: Prompt,
        *,
-        tools: list[Tool | dict | MCPServer] | None = None,
+        tools: Sequence[Tool | dict | MCPServer] | None = None,
        max_rounds: int = 5,
        show_progress: bool = False,
    ) -> tuple[Conversation, APIResponse]:
@@ -1067,7 +1067,7 @@ def LLMClient(
    extra_headers: dict[str, str] | None = None,
    use_responses_api: bool = False,
    background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
    top_p: float = 1.0,
    json_mode: bool = False,
    max_new_tokens: int = 512,
@@ -1096,7 +1096,7 @@ def LLMClient(
    extra_headers: dict[str, str] | None = None,
    use_responses_api: bool = False,
    background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
    top_p: float = 1.0,
    json_mode: bool = False,
    max_new_tokens: int = 512,
@@ -1124,7 +1124,7 @@ def LLMClient(
    extra_headers: dict[str, str] | None = None,
    use_responses_api: bool = False,
    background: bool = False,
-    temperature: float = 0.75,
+    temperature: float = 1.0,
    top_p: float = 1.0,
    json_mode: bool = False,
    max_new_tokens: int = 512,
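Two caller-visible changes run through client.py: the declared default `temperature` moves from 0.75 to 1.0, and every `tools` parameter widens from `list[...]` to `Sequence[...]`, so tuples now satisfy the annotation. A hedged sketch (the positional model id and the dict tool shape are illustrative assumptions, not taken from this diff):

```python
from lm_deluge import LLMClient

# Unless temperature is passed explicitly, 0.0.81 defaults to 1.0 instead of 0.75.
client = LLMClient("claude-4.5-sonnet")  # assumed call shape for the factory

# tools parameters are now typed as Sequence[Tool | dict | MCPServer],
# so an immutable tuple type-checks; the dict below is only a placeholder spec.
tools = ({"name": "lookup_weather", "description": "illustrative only"},)
```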
lm_deluge/config.py CHANGED
@@ -4,11 +4,13 @@ from pydantic import BaseModel
 
 
 class SamplingParams(BaseModel):
-    temperature: float = 0.0
+    temperature: float = 1.0  # more typical for new models
     top_p: float = 1.0
     json_mode: bool = False
     max_new_tokens: int = 2_048
+    global_effort: Literal["low", "medium", "high"] = "high"  # for opus-4.5
     reasoning_effort: Literal["low", "medium", "high", "minimal", "none", None] = None
+    thinking_budget: int | None = None
     logprobs: bool = False
     top_logprobs: int | None = None
     strict_tools: bool = True
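`SamplingParams` picks up two new fields — `global_effort` (consulted only for claude-4.5-opus, where it becomes `output_config["effort"]`) and `thinking_budget` — and its default temperature also moves to 1.0. Constructing it with only fields present in this diff:

```python
from lm_deluge import SamplingParams

params = SamplingParams(
    max_new_tokens=2_048,
    global_effort="medium",   # only claude-4.5-opus reads this (see anthropic.py above)
    reasoning_effort="low",
    thinking_budget=None,     # set an int to override reasoning_effort; setting both warns
)
assert params.temperature == 1.0  # new default; was 0.0 here and 0.75 in the client
```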
lm_deluge/llm_tools/__init__.py CHANGED
@@ -1,8 +1,15 @@
-from .extract import extract, extract_async
-from .score import score_llm
-from .subagents import SubAgentManager
-from .todos import TodoItem, TodoManager, TodoPriority, TodoStatus
-from .translate import translate, translate_async
+# Backward compatibility - re-export from new locations
+# Pipelines (workflow functions)
+from ..pipelines import extract, extract_async, score_llm, translate, translate_async
+
+# Prefab tools (Tool managers)
+from ..tool.prefab import (
+    SubAgentManager,
+    TodoItem,
+    TodoManager,
+    TodoPriority,
+    TodoStatus,
+)
 
 __all__ = [
     "extract",
lm_deluge/models/anthropic.py CHANGED
@@ -10,6 +10,19 @@ ANTHROPIC_MODELS = {
    # ░███
    # █████
    #
+    "claude-4.5-opus": {
+        "id": "claude-4.5-opus",
+        "name": "claude-opus-4-5-20251101",
+        "api_base": "https://api.anthropic.com/v1",
+        "api_key_env_var": "ANTHROPIC_API_KEY",
+        "supports_json": False,
+        "api_spec": "anthropic",
+        "input_cost": 5.0,
+        "cached_input_cost": 0.50,
+        "cache_write_cost": 6.25,
+        "output_cost": 25.0,
+        "reasoning_model": True,
+    },
    "claude-4.5-haiku": {
        "id": "claude-4.5-haiku",
        "name": "claude-haiku-4-5-20251001",
@@ -21,6 +34,7 @@ ANTHROPIC_MODELS = {
        "cached_input_cost": 0.10,
        "cache_write_cost": 1.25,
        "output_cost": 3.0,
+        "reasoning_model": True,
    },
    "claude-4.5-sonnet": {
        "id": "claude-4.5-sonnet",
@@ -33,6 +47,7 @@ ANTHROPIC_MODELS = {
        "cached_input_cost": 0.30,
        "cache_write_cost": 3.75,
        "output_cost": 15.0,
+        "reasoning_model": True,
    },
    "claude-4.1-opus": {
        "id": "claude-4.1-opus",
lm_deluge/pipelines/__init__.py ADDED
@@ -0,0 +1,11 @@
+from .extract import extract, extract_async
+from .score import score_llm
+from .translate import translate, translate_async
+
+__all__ = [
+    "extract",
+    "extract_async",
+    "translate",
+    "translate_async",
+    "score_llm",
+]
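Together with the llm_tools shim above, both import paths resolve to the same objects, so existing code keeps working while new code can target the new packages directly:

```python
# Legacy path (compatibility shim in lm_deluge/llm_tools/__init__.py).
from lm_deluge.llm_tools import TodoManager, translate

# New canonical locations introduced in 0.0.81.
from lm_deluge.pipelines import translate as pipelines_translate
from lm_deluge.tool.prefab import TodoManager as PrefabTodoManager

assert translate is pipelines_translate
assert TodoManager is PrefabTodoManager
```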
lm_deluge/{llm_tools → pipelines}/score.py CHANGED
@@ -1,4 +1,4 @@
-from ..client import LLMClient, APIResponse
+from ..client import _LLMClient, APIResponse
 from ..util.logprobs import extract_prob
 
 # def extract_prob_yes(logprobs: list[dict]):
@@ -24,7 +24,7 @@ from ..util.logprobs import extract_prob
 def score_llm(
     scoring_prompt_template: str,
     inputs: list[tuple | list | dict],  # to format the template
-    scoring_model: LLMClient,
+    scoring_model: _LLMClient,
     return_probabilities: bool,
     yes_token: str = "yes",
 ) -> list[bool | None] | list[float | None]:
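`score_llm` itself is unchanged apart from the annotation; the signature above is the whole contract. A hedged usage sketch (the template, inputs, and client construction are illustrative assumptions):

```python
from lm_deluge import LLMClient
from lm_deluge.pipelines import score_llm

client = LLMClient("claude-4.5-haiku")  # assumed call shape; returns an _LLMClient instance
# One dict per row; keys are assumed to match the template's format fields.
rows = [{"review": "Great battery life"}, {"review": "Broke after a week"}]

scores = score_llm(
    scoring_prompt_template="Is this review positive? Answer yes or no.\n\n{review}",
    inputs=rows,
    scoring_model=client,       # now annotated as _LLMClient
    return_probabilities=True,  # list[float | None]; False gives list[bool | None]
)
```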
lm_deluge/{llm_tools → pipelines}/translate.py CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from ..client import LLMClient
+from ..client import _LLMClient
 
 translation_prompt = (
     "Translate the following text (enclosed in ```) into English. "
@@ -20,7 +20,9 @@ def is_english(text: str, low_memory: bool = True):
     return True
 
 
-async def translate_async(texts: list[str], client: LLMClient, low_memory: bool = True):
+async def translate_async(
+    texts: list[str], client: _LLMClient, low_memory: bool = True
+):
     to_translate_idxs = [
         i for i, text in enumerate(texts) if not is_english(text, low_memory=low_memory)
     ]
@@ -40,5 +42,5 @@ async def translate_async(texts: list[str], client: LLMClient, low_memory: bool
     return texts
 
 
-def translate(texts: list[str], client: LLMClient, low_memory: bool = True):
+def translate(texts: list[str], client: _LLMClient, low_memory: bool = True):
     return asyncio.run(translate_async(texts, client, low_memory))
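At call sites nothing changes from the `_LLMClient` annotation, since the `LLMClient` factory presumably still returns an `_LLMClient` instance. A hedged sketch of the translate pipeline (model id and call shape are assumptions):

```python
import asyncio

from lm_deluge import LLMClient
from lm_deluge.pipelines import translate_async

async def main() -> None:
    client = LLMClient("claude-4.5-haiku")  # assumed factory call shape
    texts = ["Bonjour le monde", "Hello world"]
    # Non-English entries are translated; entries detected as English pass through unchanged.
    print(await translate_async(texts, client))

# asyncio.run(main())  # requires provider API keys to actually run
```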
lm_deluge/prompt.py CHANGED
@@ -1598,6 +1598,111 @@ class Conversation:
 
         return {"messages": serialized}
 
+    def print(self, max_text_length: int = 500, indent: int = 2) -> None:
+        """Pretty-print the conversation to stdout.
+
+        Args:
+            max_text_length: Truncate text content longer than this (default 500 chars)
+            indent: JSON indentation for tool calls/results (default 2)
+        """
+        ROLE_COLORS = {
+            "system": "\033[95m",  # magenta
+            "user": "\033[94m",  # blue
+            "assistant": "\033[92m",  # green
+            "tool": "\033[93m",  # yellow
+        }
+        RESET = "\033[0m"
+        DIM = "\033[2m"
+        BOLD = "\033[1m"
+
+        def truncate(text: str, max_len: int) -> str:
+            if len(text) <= max_len:
+                return text
+            return (
+                text[:max_len] + f"{DIM}... [{len(text) - max_len} more chars]{RESET}"
+            )
+
+        def format_json(obj: dict | list, ind: int) -> str:
+            return json.dumps(obj, indent=ind, ensure_ascii=False)
+
+        print(f"\n{BOLD}{'=' * 60}{RESET}")
+        print(f"{BOLD}Conversation ({len(self.messages)} messages){RESET}")
+        print(f"{BOLD}{'=' * 60}{RESET}\n")
+
+        for i, msg in enumerate(self.messages):
+            role_color = ROLE_COLORS.get(msg.role, "")
+            print(f"{role_color}{BOLD}[{msg.role.upper()}]{RESET}")
+
+            for part in msg.parts:
+                if isinstance(part, Text):
+                    text = truncate(part.text, max_text_length)
+                    # Indent multiline text
+                    lines = text.split("\n")
+                    if len(lines) > 1:
+                        print(" " + "\n ".join(lines))
+                    else:
+                        print(f" {text}")
+
+                elif isinstance(part, Image):
+                    w, h = part.size
+                    print(f" {DIM}<Image ({w}x{h})>{RESET}")
+
+                elif isinstance(part, File):
+                    size = part.size
+                    filename = getattr(part, "filename", None)
+                    if filename:
+                        print(f" {DIM}<File: {filename} ({size} bytes)>{RESET}")
+                    else:
+                        print(f" {DIM}<File ({size} bytes)>{RESET}")
+
+                elif isinstance(part, ToolCall):
+                    print(
+                        f" {DIM}Tool Call:{RESET} {BOLD}{part.name}{RESET} (id: {part.id})"
+                    )
+                    if part.arguments:
+                        args_json = format_json(part.arguments, indent)
+                        # Indent the JSON
+                        indented = "\n".join(
+                            " " + line for line in args_json.split("\n")
+                        )
+                        print(indented)
+
+                elif isinstance(part, ToolResult):
+                    print(f" {DIM}Tool Result:{RESET} (call_id: {part.tool_call_id})")
+                    if isinstance(part.result, str):
+                        result_text = truncate(part.result, max_text_length)
+                        lines = result_text.split("\n")
+                        for line in lines:
+                            print(f" {line}")
+                    elif isinstance(part.result, dict):
+                        result_json = format_json(part.result, indent)
+                        indented = "\n".join(
+                            " " + line for line in result_json.split("\n")
+                        )
+                        print(indented)
+                    elif isinstance(part.result, list):
+                        print(f" {DIM}<{len(part.result)} content blocks>{RESET}")
+                        for block in part.result:
+                            if isinstance(block, Text):
+                                block_text = truncate(block.text, max_text_length // 2)
+                                print(f" [text] {block_text}")
+                            elif isinstance(block, Image):
+                                bw, bh = block.size
+                                print(f" {DIM}<Image ({bw}x{bh})>{RESET}")
+
+                elif isinstance(part, Thinking):
+                    print(f" {DIM}Thinking:{RESET}")
+                    thought = truncate(part.content, max_text_length)
+                    lines = thought.split("\n")
+                    for line in lines:
+                        print(f" {DIM}{line}{RESET}")
+
+            # Separator between messages
+            if i < len(self.messages) - 1:
+                print(f"\n{'-' * 40}\n")
+
+        print(f"\n{BOLD}{'=' * 60}{RESET}\n")
+
     @classmethod
     def from_log(cls, payload: dict) -> "Conversation":
         """Re-hydrate a Conversation previously produced by `to_log()`."""
lm_deluge/request_context.py CHANGED
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from functools import cached_property
-from typing import Any, Callable, TYPE_CHECKING
+from typing import Any, Callable, Sequence, TYPE_CHECKING
 
 from .config import SamplingParams
 from .prompt import CachePattern, Conversation
@@ -34,7 +34,7 @@ class RequestContext:
    callback: Callable | None = None
 
    # Optional features
-    tools: list | None = None
+    tools: Sequence[Any] | None = None
    output_schema: "type[BaseModel] | dict | None" = None
    cache: CachePattern | None = None
    use_responses_api: bool = False
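Tying the prompt.py and client.py hunks together: the agent-loop helpers return a `(Conversation, APIResponse)` pair, and the new `Conversation.print()` added above gives a quick colorized dump of that transcript. A minimal sketch that relies only on the `print()` signature introduced in this release:

```python
def dump_transcript(conversation) -> None:
    """`conversation` is assumed to be an lm_deluge Conversation, e.g. the first
    element of the (Conversation, APIResponse) tuple the agent-loop helpers return."""
    # Defaults: truncate long text at 500 chars, pretty-print tool JSON with indent=2.
    conversation.print()
    # Tighter view for long, tool-heavy transcripts.
    conversation.print(max_text_length=120, indent=4)
```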