klaude-code 1.2.10__py3-none-any.whl → 1.2.12__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (57)
  1. klaude_code/cli/main.py +2 -7
  2. klaude_code/cli/runtime.py +23 -19
  3. klaude_code/command/__init__.py +29 -26
  4. klaude_code/command/clear_cmd.py +0 -2
  5. klaude_code/command/diff_cmd.py +0 -2
  6. klaude_code/command/export_cmd.py +0 -2
  7. klaude_code/command/help_cmd.py +0 -2
  8. klaude_code/command/model_cmd.py +0 -2
  9. klaude_code/command/refresh_cmd.py +0 -2
  10. klaude_code/command/registry.py +4 -8
  11. klaude_code/command/release_notes_cmd.py +0 -2
  12. klaude_code/command/status_cmd.py +2 -4
  13. klaude_code/command/terminal_setup_cmd.py +0 -2
  14. klaude_code/command/thinking_cmd.py +227 -0
  15. klaude_code/config/select_model.py +5 -15
  16. klaude_code/const/__init__.py +1 -1
  17. klaude_code/core/agent.py +1 -1
  18. klaude_code/core/executor.py +1 -4
  19. klaude_code/core/manager/agent_manager.py +15 -9
  20. klaude_code/core/manager/llm_clients_builder.py +4 -7
  21. klaude_code/core/prompt.py +5 -5
  22. klaude_code/core/prompts/prompt-claude-code.md +1 -12
  23. klaude_code/core/prompts/prompt-minimal.md +12 -0
  24. klaude_code/core/task.py +5 -2
  25. klaude_code/core/tool/memory/memory_tool.md +4 -0
  26. klaude_code/core/tool/memory/skill_loader.py +1 -1
  27. klaude_code/core/tool/todo/todo_write_tool.md +0 -157
  28. klaude_code/core/tool/todo/todo_write_tool_raw.md +182 -0
  29. klaude_code/core/tool/tool_registry.py +3 -4
  30. klaude_code/core/turn.py +0 -1
  31. klaude_code/llm/anthropic/client.py +56 -47
  32. klaude_code/llm/client.py +1 -19
  33. klaude_code/llm/codex/client.py +49 -30
  34. klaude_code/llm/openai_compatible/client.py +52 -34
  35. klaude_code/llm/openrouter/client.py +63 -41
  36. klaude_code/llm/responses/client.py +56 -39
  37. klaude_code/llm/usage.py +1 -49
  38. klaude_code/protocol/commands.py +1 -0
  39. klaude_code/protocol/llm_param.py +1 -9
  40. klaude_code/protocol/model.py +4 -3
  41. klaude_code/protocol/op.py +5 -2
  42. klaude_code/protocol/sub_agent.py +1 -0
  43. klaude_code/session/export.py +3 -0
  44. klaude_code/session/selector.py +12 -7
  45. klaude_code/session/session.py +1 -5
  46. klaude_code/session/templates/export_session.html +155 -0
  47. klaude_code/ui/modes/repl/completers.py +3 -3
  48. klaude_code/ui/modes/repl/event_handler.py +1 -5
  49. klaude_code/ui/modes/repl/input_prompt_toolkit.py +3 -34
  50. klaude_code/ui/renderers/metadata.py +11 -1
  51. klaude_code/ui/renderers/tools.py +13 -2
  52. klaude_code/ui/rich/markdown.py +4 -1
  53. klaude_code/ui/terminal/__init__.py +55 -0
  54. {klaude_code-1.2.10.dist-info → klaude_code-1.2.12.dist-info}/METADATA +1 -4
  55. {klaude_code-1.2.10.dist-info → klaude_code-1.2.12.dist-info}/RECORD +57 -54
  56. {klaude_code-1.2.10.dist-info → klaude_code-1.2.12.dist-info}/WHEEL +0 -0
  57. {klaude_code-1.2.10.dist-info → klaude_code-1.2.12.dist-info}/entry_points.txt +0 -0
klaude_code/core/turn.py CHANGED
@@ -158,7 +158,6 @@ class TurnExecutor:
                 input=session_ctx.get_conversation_history(),
                 system=ctx.system_prompt,
                 tools=ctx.tools,
-                store=False,
                 session_id=session_ctx.session_id,
             )
         ):
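The `store=False` removed here does not disappear from the codebase: as the Codex client diff below shows, the flag is now pinned inside that client's own payload builder, since only the Codex API requires it. A minimal sketch of the relocation, using a hypothetical stand-in for `llm_param.LLMCallParameter` rather than the real class:

from dataclasses import dataclass


@dataclass
class CallParam:
    """Hypothetical stand-in for llm_param.LLMCallParameter."""

    model: str
    session_id: str | None = None


def build_codex_payload(param: CallParam) -> dict[str, object]:
    # The provider-specific constraint lives with the provider:
    # the Codex API requires store=False, so the payload builder
    # pins it instead of the generic turn executor mutating param.
    return {
        "model": param.model,
        "store": False,
        "prompt_cache_key": param.session_id or "",
    }


print(build_codex_payload(CallParam(model="example-model", session_id="abc123")))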
klaude_code/llm/anthropic/client.py CHANGED
@@ -15,17 +15,48 @@ from anthropic.types.beta.beta_signature_delta import BetaSignatureDelta
 from anthropic.types.beta.beta_text_delta import BetaTextDelta
 from anthropic.types.beta.beta_thinking_delta import BetaThinkingDelta
 from anthropic.types.beta.beta_tool_use_block import BetaToolUseBlock
+from anthropic.types.beta.message_create_params import MessageCreateParamsStreaming
 
 from klaude_code import const
 from klaude_code.llm.anthropic.input import convert_history_to_input, convert_system_to_input, convert_tool_schema
-from klaude_code.llm.client import LLMClientABC, call_with_logged_payload
+from klaude_code.llm.client import LLMClientABC
 from klaude_code.llm.input_common import apply_config_defaults
 from klaude_code.llm.registry import register
-from klaude_code.llm.usage import MetadataTracker, convert_anthropic_usage
+from klaude_code.llm.usage import MetadataTracker
 from klaude_code.protocol import llm_param, model
 from klaude_code.trace import DebugType, log_debug
 
 
+def build_payload(param: llm_param.LLMCallParameter) -> MessageCreateParamsStreaming:
+    """Build Anthropic API request parameters."""
+    messages = convert_history_to_input(param.input, param.model)
+    tools = convert_tool_schema(param.tools)
+    system = convert_system_to_input(param.system)
+
+    payload: MessageCreateParamsStreaming = {
+        "model": str(param.model),
+        "tool_choice": {
+            "type": "auto",
+            "disable_parallel_tool_use": False,
+        },
+        "stream": True,
+        "max_tokens": param.max_tokens or const.DEFAULT_MAX_TOKENS,
+        "temperature": param.temperature or const.DEFAULT_TEMPERATURE,
+        "messages": messages,
+        "system": system,
+        "tools": tools,
+        "betas": ["interleaved-thinking-2025-05-14", "context-1m-2025-08-07"],
+    }
+
+    if param.thinking and param.thinking.type == "enabled":
+        payload["thinking"] = anthropic.types.ThinkingConfigEnabledParam(
+            type="enabled",
+            budget_tokens=param.thinking.budget_tokens or const.DEFAULT_ANTHROPIC_THINKING_BUDGET_TOKENS,
+        )
+
+    return payload
+
+
 @register(llm_param.LLMClientProtocol.ANTHROPIC)
 class AnthropicClient(LLMClientABC):
     def __init__(self, config: llm_param.LLMConfigParameter):
@@ -48,32 +79,16 @@ class AnthropicClient(LLMClientABC):
 
         metadata_tracker = MetadataTracker(cost_config=self.get_llm_config().cost)
 
-        messages = convert_history_to_input(param.input, param.model)
-        tools = convert_tool_schema(param.tools)
-        system = convert_system_to_input(param.system)
-
-        stream = call_with_logged_payload(
-            self.client.beta.messages.create,
-            model=str(param.model),
-            tool_choice={
-                "type": "auto",
-                "disable_parallel_tool_use": False,
-            },
-            stream=True,
-            max_tokens=param.max_tokens or const.DEFAULT_MAX_TOKENS,
-            temperature=param.temperature or const.DEFAULT_TEMPERATURE,
-            messages=messages,
-            system=system,
-            tools=tools,
-            betas=["interleaved-thinking-2025-05-14", "context-1m-2025-08-07"],
-            thinking=anthropic.types.ThinkingConfigEnabledParam(
-                type=param.thinking.type,
-                budget_tokens=param.thinking.budget_tokens or const.DEFAULT_ANTHROPIC_THINKING_BUDGET_TOKENS,
-            )
-            if param.thinking and param.thinking.type == "enabled"
-            else anthropic.types.ThinkingConfigDisabledParam(
-                type="disabled",
-            ),
+        payload = build_payload(param)
+
+        log_debug(
+            json.dumps(payload, ensure_ascii=False, default=str),
+            style="yellow",
+            debug_type=DebugType.LLM_PAYLOAD,
+        )
+
+        stream = self.client.beta.messages.create(
+            **payload,
             extra_headers={"extra": json.dumps({"session_id": param.session_id}, sort_keys=True)},
         )
 
@@ -85,9 +100,8 @@ class AnthropicClient(LLMClientABC):
         current_tool_call_id: str | None = None
         current_tool_inputs: list[str] | None = None
 
-        input_tokens = 0
-        cached_tokens = 0
-        output_tokens = 0
+        input_token = 0
+        cached_token = 0
 
         try:
             async for event in await stream:
@@ -100,11 +114,8 @@ class AnthropicClient(LLMClientABC):
                 match event:
                     case BetaRawMessageStartEvent() as event:
                         response_id = event.message.id
-                        cached_tokens = event.message.usage.cache_read_input_tokens or 0
-                        input_tokens = (event.message.usage.input_tokens or 0) + (
-                            event.message.usage.cache_creation_input_tokens or 0
-                        )
-                        output_tokens = event.message.usage.output_tokens or 0
+                        cached_token = event.message.usage.cache_read_input_tokens or 0
+                        input_token = event.message.usage.input_tokens
                         yield model.StartItem(response_id=response_id)
                     case BetaRawContentBlockDeltaEvent() as event:
                         match event.delta:
@@ -170,18 +181,16 @@ class AnthropicClient(LLMClientABC):
                         current_tool_call_id = None
                         current_tool_inputs = None
                     case BetaRawMessageDeltaEvent() as event:
-                        input_tokens += (event.usage.input_tokens or 0) + (event.usage.cache_creation_input_tokens or 0)
-                        output_tokens += event.usage.output_tokens or 0
-                        cached_tokens += event.usage.cache_read_input_tokens or 0
-
-                        usage = convert_anthropic_usage(
-                            input_tokens=input_tokens,
-                            output_tokens=output_tokens,
-                            cached_tokens=cached_tokens,
-                            context_limit=param.context_limit,
-                            max_tokens=param.max_tokens,
+                        metadata_tracker.set_usage(
+                            model.Usage(
+                                input_tokens=input_token + cached_token,
+                                output_tokens=event.usage.output_tokens,
+                                cached_tokens=cached_token,
+                                context_size=input_token + cached_token + event.usage.output_tokens,
+                                context_limit=param.context_limit,
+                                max_tokens=param.max_tokens,
+                            )
                         )
-                        metadata_tracker.set_usage(usage)
         metadata_tracker.set_model_name(str(param.model))
         metadata_tracker.set_response_id(response_id)
         yield metadata_tracker.finalize()
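Pulling request construction out into a module-level `build_payload` makes the exact wire payload loggable and unit-testable without any network I/O. A rough sketch of that shape, with simplified stand-ins for the real parameter classes and constants (`Param`, `Thinking`, and the defaults below are illustrative, not the klaude_code definitions):

import json
from dataclasses import dataclass


@dataclass
class Thinking:
    """Stand-in for the thinking config on LLMCallParameter."""

    type: str = "disabled"
    budget_tokens: int | None = None


@dataclass
class Param:
    """Stand-in for llm_param.LLMCallParameter."""

    model: str
    max_tokens: int | None = None
    temperature: float | None = None
    thinking: Thinking | None = None


DEFAULT_MAX_TOKENS = 8192
DEFAULT_TEMPERATURE = 1.0
DEFAULT_THINKING_BUDGET = 2048


def build_payload(param: Param) -> dict[str, object]:
    # Pure function: no I/O, so it is trivially testable.
    payload: dict[str, object] = {
        "model": param.model,
        "stream": True,
        "max_tokens": param.max_tokens or DEFAULT_MAX_TOKENS,
        "temperature": param.temperature or DEFAULT_TEMPERATURE,
    }
    # Thinking is attached only when enabled; simply omitting the key
    # replaces the old explicit "disabled" branch.
    if param.thinking and param.thinking.type == "enabled":
        payload["thinking"] = {
            "type": "enabled",
            "budget_tokens": param.thinking.budget_tokens or DEFAULT_THINKING_BUDGET,
        }
    return payload


p = build_payload(Param(model="example-model", thinking=Thinking("enabled")))
print(json.dumps(p, indent=2))  # log exactly what would be sent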
klaude_code/llm/client.py CHANGED
@@ -1,10 +1,8 @@
-import json
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator
-from typing import Callable, ParamSpec, TypeVar, cast
+from typing import ParamSpec, TypeVar, cast
 
 from klaude_code.protocol import llm_param, model
-from klaude_code.trace import DebugType, log_debug
 
 
 class LLMClientABC(ABC):
@@ -31,19 +29,3 @@ class LLMClientABC(ABC):
 
 P = ParamSpec("P")
 R = TypeVar("R")
-
-
-def call_with_logged_payload(func: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R:
-    """Call an SDK function while logging the JSON payload.
-
-    The function reuses the original callable's type signature via ParamSpec
-    so static type checkers can validate arguments at the call site.
-    """
-
-    payload = {k: v for k, v in kwargs.items() if v is not None}
-    log_debug(
-        json.dumps(payload, ensure_ascii=False, default=str, sort_keys=True),
-        style="yellow",
-        debug_type=DebugType.LLM_PAYLOAD,
-    )
-    return func(*args, **kwargs)
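With every client now building its request as a plain TypedDict, the generic ParamSpec wrapper above became unnecessary: clients log the payload dict directly and splat it into the SDK call. A self-contained sketch contrasting the two patterns, with a dummy function standing in for an SDK method:

import json
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
R = TypeVar("R")


def fake_create(*, model: str, stream: bool) -> str:
    """Dummy stand-in for an SDK method such as client.beta.messages.create."""
    return f"stream<{model}, stream={stream}>"


def call_with_logged_payload(func: Callable[P, R], *args: P.args, **kwargs: P.kwargs) -> R:
    # Old pattern: a generic wrapper logs the kwargs, typed via ParamSpec.
    print(json.dumps({k: v for k, v in kwargs.items() if v is not None}, default=str))
    return func(*args, **kwargs)


# Old call site: argument passing and logging are entangled in one call.
call_with_logged_payload(fake_create, model="example-model", stream=True)

# New pattern: build the payload first, log it, then splat it into the call.
payload = {"model": "example-model", "stream": True}
print(json.dumps(payload, default=str))
fake_create(**payload)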
klaude_code/llm/codex/client.py CHANGED
@@ -1,22 +1,61 @@
 """Codex LLM client using ChatGPT subscription via OAuth."""
 
+import json
 from collections.abc import AsyncGenerator
 from typing import override
 
 import httpx
 import openai
 from openai import AsyncOpenAI
+from openai.types.responses.response_create_params import ResponseCreateParamsStreaming
 
 from klaude_code.auth.codex.exceptions import CodexNotLoggedInError
 from klaude_code.auth.codex.oauth import CodexOAuth
 from klaude_code.auth.codex.token_manager import CodexTokenManager
-from klaude_code.llm.client import LLMClientABC, call_with_logged_payload
+from klaude_code.llm.client import LLMClientABC
 from klaude_code.llm.input_common import apply_config_defaults
 from klaude_code.llm.registry import register
 from klaude_code.llm.responses.client import parse_responses_stream
 from klaude_code.llm.responses.input import convert_history_to_input, convert_tool_schema
 from klaude_code.llm.usage import MetadataTracker
 from klaude_code.protocol import llm_param, model
+from klaude_code.trace import DebugType, log_debug
+
+
+def build_payload(param: llm_param.LLMCallParameter) -> ResponseCreateParamsStreaming:
+    """Build Codex API request parameters."""
+    inputs = convert_history_to_input(param.input, param.model)
+    tools = convert_tool_schema(param.tools)
+
+    session_id = param.session_id or ""
+
+    payload: ResponseCreateParamsStreaming = {
+        "model": str(param.model),
+        "tool_choice": "auto",
+        "parallel_tool_calls": True,
+        "include": [
+            "reasoning.encrypted_content",
+        ],
+        "store": False,
+        "stream": True,
+        "input": inputs,
+        "instructions": param.system,
+        "tools": tools,
+        "prompt_cache_key": session_id,
+        # max_output_tokens and temperature are not supported by the Codex API
+    }
+
+    if param.thinking and param.thinking.reasoning_effort:
+        payload["reasoning"] = {
+            "effort": param.thinking.reasoning_effort,
+            "summary": param.thinking.reasoning_summary,
+        }
+
+    if param.verbosity:
+        payload["text"] = {"verbosity": param.verbosity}
+
+    return payload
+
 
 # Codex API configuration
 CODEX_BASE_URL = "https://chatgpt.com/backend-api/codex"
@@ -81,45 +120,25 @@ class CodexClient(LLMClientABC):
 
         param = apply_config_defaults(param, self.get_llm_config())
 
-        # Codex API requires store=False
-        param.store = False
-
         metadata_tracker = MetadataTracker(cost_config=self.get_llm_config().cost)
 
-        inputs = convert_history_to_input(param.input, param.model)
-        tools = convert_tool_schema(param.tools)
+        payload = build_payload(param)
 
         session_id = param.session_id or ""
-        # Must send conversation_id/session_id headers to improve ChatGPT backend prompt cache hit rate.
         extra_headers: dict[str, str] = {}
         if session_id:
+            # Must send conversation_id/session_id headers to improve ChatGPT backend prompt cache hit rate.
             extra_headers["conversation_id"] = session_id
             extra_headers["session_id"] = session_id
 
+        log_debug(
+            json.dumps(payload, ensure_ascii=False, default=str),
+            style="yellow",
+            debug_type=DebugType.LLM_PAYLOAD,
+        )
         try:
-            stream = await call_with_logged_payload(
-                self.client.responses.create,
-                model=str(param.model),
-                tool_choice="auto",
-                parallel_tool_calls=True,
-                include=[
-                    "reasoning.encrypted_content",
-                ],
-                store=False,  # Always False for Codex
-                stream=True,
-                input=inputs,
-                instructions=param.system,
-                tools=tools,
-                text={
-                    "verbosity": param.verbosity,
-                },
-                prompt_cache_key=session_id,
-                reasoning={
-                    "effort": param.thinking.reasoning_effort,
-                    "summary": param.thinking.reasoning_summary,
-                }
-                if param.thinking and param.thinking.reasoning_effort
-                else None,
+            stream = await self.client.responses.create(
+                **payload,
                 extra_headers=extra_headers,
            )
        except (openai.OpenAIError, httpx.HTTPError) as e:
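Note the shape change as well: instead of always passing `reasoning=...` (possibly None) and a `text` block, the builder now adds those keys only when they carry a value, so the request never contains explicit nulls for optional features. A small self-contained sketch of that conditional-key pattern, with a stand-in `Thinking` type:

from dataclasses import dataclass


@dataclass
class Thinking:
    """Stand-in for the thinking config on LLMCallParameter."""

    reasoning_effort: str | None = None
    reasoning_summary: str | None = None


def build_payload(thinking: Thinking | None, verbosity: str | None) -> dict[str, object]:
    payload: dict[str, object] = {
        "store": False,  # the Codex API requires store=False
        "stream": True,
    }
    # Keys are added only when they carry a value, so the request
    # never contains explicit nulls for optional features.
    if thinking and thinking.reasoning_effort:
        payload["reasoning"] = {
            "effort": thinking.reasoning_effort,
            "summary": thinking.reasoning_summary,
        }
    if verbosity:
        payload["text"] = {"verbosity": verbosity}
    return payload


print(build_payload(Thinking(reasoning_effort="high", reasoning_summary="auto"), None))
print(build_payload(None, "low"))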
klaude_code/llm/openai_compatible/client.py CHANGED
@@ -4,8 +4,9 @@ from typing import override
 
 import httpx
 import openai
+from openai.types.chat.completion_create_params import CompletionCreateParamsStreaming
 
-from klaude_code.llm.client import LLMClientABC, call_with_logged_payload
+from klaude_code.llm.client import LLMClientABC
 from klaude_code.llm.input_common import apply_config_defaults
 from klaude_code.llm.openai_compatible.input import convert_history_to_input, convert_tool_schema
 from klaude_code.llm.openai_compatible.stream_processor import StreamStateManager
@@ -15,6 +16,35 @@ from klaude_code.protocol import llm_param, model
 from klaude_code.trace import DebugType, log_debug
 
 
+def build_payload(param: llm_param.LLMCallParameter) -> tuple[CompletionCreateParamsStreaming, dict[str, object]]:
+    """Build OpenAI API request parameters."""
+    messages = convert_history_to_input(param.input, param.system, param.model)
+    tools = convert_tool_schema(param.tools)
+
+    extra_body: dict[str, object] = {}
+
+    if param.thinking:
+        extra_body["thinking"] = {
+            "type": param.thinking.type,
+            "budget": param.thinking.budget_tokens,
+        }
+
+    payload: CompletionCreateParamsStreaming = {
+        "model": str(param.model),
+        "tool_choice": "auto",
+        "parallel_tool_calls": True,
+        "stream": True,
+        "messages": messages,
+        "temperature": param.temperature,
+        "max_tokens": param.max_tokens,
+        "tools": tools,
+        "reasoning_effort": param.thinking.reasoning_effort if param.thinking else None,
+        "verbosity": param.verbosity,
+    }
+
+    return payload, extra_body
+
+
 @register(llm_param.LLMClientProtocol.OPENAI)
 class OpenAICompatibleClient(LLMClientABC):
     def __init__(self, config: llm_param.LLMConfigParameter):
@@ -44,32 +74,21 @@ class OpenAICompatibleClient(LLMClientABC):
     @override
     async def call(self, param: llm_param.LLMCallParameter) -> AsyncGenerator[model.ConversationItem, None]:
         param = apply_config_defaults(param, self.get_llm_config())
-        messages = convert_history_to_input(param.input, param.system, param.model)
-        tools = convert_tool_schema(param.tools)
 
         metadata_tracker = MetadataTracker(cost_config=self.get_llm_config().cost)
 
-        extra_body = {}
-        extra_headers = {"extra": json.dumps({"session_id": param.session_id}, sort_keys=True)}
-
-        if param.thinking:
-            extra_body["thinking"] = {
-                "type": param.thinking.type,
-                "budget": param.thinking.budget_tokens,
-            }
-        stream = call_with_logged_payload(
-            self.client.chat.completions.create,
-            model=str(param.model),
-            tool_choice="auto",
-            parallel_tool_calls=True,
-            stream=True,
-            messages=messages,
-            temperature=param.temperature,
-            max_tokens=param.max_tokens,
-            tools=tools,
-            reasoning_effort=param.thinking.reasoning_effort if param.thinking else None,
-            verbosity=param.verbosity,
-            extra_body=extra_body,  # pyright: ignore[reportUnknownArgumentType]
+        payload, extra_body = build_payload(param)
+        extra_headers: dict[str, str] = {"extra": json.dumps({"session_id": param.session_id}, sort_keys=True)}
+
+        log_debug(
+            json.dumps({**payload, **extra_body}, ensure_ascii=False, default=str),
+            style="yellow",
+            debug_type=DebugType.LLM_PAYLOAD,
+        )
+
+        stream = self.client.chat.completions.create(
+            **payload,
+            extra_body=extra_body,
             extra_headers=extra_headers,
         )
 
@@ -85,9 +104,7 @@ class OpenAICompatibleClient(LLMClientABC):
                 if not state.response_id and event.id:
                     state.set_response_id(event.id)
                     yield model.StartItem(response_id=event.id)
-                if (
-                    event.usage is not None and event.usage.completion_tokens is not None  # pyright: ignore[reportUnnecessaryComparison] gcp gemini will return None usage field
-                ):
+                if event.usage is not None:
                     metadata_tracker.set_usage(convert_usage(event.usage, param.context_limit, param.max_tokens))
                 if event.model:
                     metadata_tracker.set_model_name(event.model)
@@ -96,9 +113,8 @@ class OpenAICompatibleClient(LLMClientABC):
 
                 if len(event.choices) == 0:
                     continue
-                delta = event.choices[0].delta
 
-                # Support Kimi K2's usage field in choice
+                # Support Moonshot Kimi K2's usage field in choice
                 if hasattr(event.choices[0], "usage") and getattr(event.choices[0], "usage"):
                     metadata_tracker.set_usage(
                         convert_usage(
@@ -108,12 +124,14 @@ class OpenAICompatibleClient(LLMClientABC):
                         )
                     )
 
+                delta = event.choices[0].delta
+
                 # Reasoning
-                reasoning_content = ""
-                if hasattr(delta, "reasoning") and getattr(delta, "reasoning"):
-                    reasoning_content = getattr(delta, "reasoning")
-                if hasattr(delta, "reasoning_content") and getattr(delta, "reasoning_content"):
-                    reasoning_content = getattr(delta, "reasoning_content")
+                reasoning_content = (
+                    getattr(delta, "reasoning_content", None)
+                    or getattr(delta, "reasoning", None)
+                    or ""
+                )
                 if reasoning_content:
                     metadata_tracker.record_token()
                     state.stage = "reasoning"
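The rewritten reasoning extraction collapses the two hasattr/getattr checks into a single `or` chain; `reasoning_content` still wins over `reasoning` when both are present, matching the old overwrite order. A quick illustration against simple stand-in delta objects:

from types import SimpleNamespace


def extract_reasoning(delta: object) -> str:
    # getattr with a default handles deltas that lack the attribute,
    # and `or` skips None and empty strings in one pass.
    return (
        getattr(delta, "reasoning_content", None)
        or getattr(delta, "reasoning", None)
        or ""
    )


print(extract_reasoning(SimpleNamespace(reasoning_content="step 1")))  # "step 1"
print(extract_reasoning(SimpleNamespace(reasoning="fallback")))        # "fallback"
print(extract_reasoning(SimpleNamespace(content="no reasoning")))      # ""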
klaude_code/llm/openrouter/client.py CHANGED
@@ -1,10 +1,12 @@
+import json
 from collections.abc import AsyncGenerator
 from typing import override
 
 import httpx
 import openai
+from openai.types.chat.completion_create_params import CompletionCreateParamsStreaming
 
-from klaude_code.llm.client import LLMClientABC, call_with_logged_payload
+from klaude_code.llm.client import LLMClientABC
 from klaude_code.llm.input_common import apply_config_defaults
 from klaude_code.llm.openai_compatible.input import convert_tool_schema
 from klaude_code.llm.openai_compatible.stream_processor import StreamStateManager
@@ -16,6 +18,52 @@ from klaude_code.protocol import llm_param, model
 from klaude_code.trace import DebugType, log, log_debug
 
 
+def build_payload(
+    param: llm_param.LLMCallParameter,
+) -> tuple[CompletionCreateParamsStreaming, dict[str, object], dict[str, str]]:
+    """Build OpenRouter API request parameters."""
+    messages = convert_history_to_input(param.input, param.system, param.model)
+    tools = convert_tool_schema(param.tools)
+
+    extra_body: dict[str, object] = {
+        "usage": {"include": True}  # To get the cache tokens at the end of the response
+    }
+    extra_headers: dict[str, str] = {}
+
+    if param.thinking:
+        if param.thinking.budget_tokens is not None:
+            extra_body["reasoning"] = {
+                "max_tokens": param.thinking.budget_tokens,
+                "enable": True,
+            }  # OpenRouter: https://openrouter.ai/docs/use-cases/reasoning-tokens#anthropic-models-with-reasoning-tokens
+        elif param.thinking.reasoning_effort is not None:
+            extra_body["reasoning"] = {
+                "effort": param.thinking.reasoning_effort,
+            }
+
+    if param.provider_routing:
+        extra_body["provider"] = param.provider_routing.model_dump(exclude_none=True)
+
+    if is_claude_model(param.model):
+        extra_headers["anthropic-beta"] = (
+            "interleaved-thinking-2025-05-14"  # Not working yet, maybe OpenRouter's issue, or Anthropic: Interleaved thinking is only supported for tools used via the Messages API.
+        )
+
+    payload: CompletionCreateParamsStreaming = {
+        "model": str(param.model),
+        "tool_choice": "auto",
+        "parallel_tool_calls": True,
+        "stream": True,
+        "messages": messages,
+        "temperature": param.temperature,
+        "max_tokens": param.max_tokens,
+        "tools": tools,
+        "verbosity": param.verbosity,
+    }
+
+    return payload, extra_body, extra_headers
+
+
 @register(llm_param.LLMClientProtocol.OPENROUTER)
 class OpenRouterClient(LLMClientABC):
     def __init__(self, config: llm_param.LLMConfigParameter):
@@ -35,52 +83,28 @@ class OpenRouterClient(LLMClientABC):
     @override
     async def call(self, param: llm_param.LLMCallParameter) -> AsyncGenerator[model.ConversationItem, None]:
         param = apply_config_defaults(param, self.get_llm_config())
-        messages = convert_history_to_input(param.input, param.system, param.model)
-        tools = convert_tool_schema(param.tools)
 
         metadata_tracker = MetadataTracker(cost_config=self.get_llm_config().cost)
 
-        extra_body: dict[str, object] = {
-            "usage": {"include": True}  # To get the cache tokens at the end of the response
-        }
-        extra_headers = {}
-
-        if param.thinking:
-            if param.thinking.budget_tokens is not None:
-                extra_body["reasoning"] = {
-                    "max_tokens": param.thinking.budget_tokens,
-                    "enable": True,
-                }  # OpenRouter: https://openrouter.ai/docs/use-cases/reasoning-tokens#anthropic-models-with-reasoning-tokens
-            elif param.thinking.reasoning_effort is not None:
-                extra_body["reasoning"] = {
-                    "effort": param.thinking.reasoning_effort,
-                }
-        if param.provider_routing:
-            extra_body["provider"] = param.provider_routing.model_dump(exclude_none=True)
-        if is_claude_model(param.model):
-            extra_headers["anthropic-beta"] = (
-                "interleaved-thinking-2025-05-14"  # Not working yet, maybe OpenRouter's issue, or Anthropic: Interleaved thinking is only supported for tools used via the Messages API.
-            )
-
-        stream = call_with_logged_payload(
-            self.client.chat.completions.create,
-            model=str(param.model),
-            tool_choice="auto",
-            parallel_tool_calls=True,
-            stream=True,
-            messages=messages,
-            temperature=param.temperature,
-            max_tokens=param.max_tokens,
-            tools=tools,
-            verbosity=param.verbosity,
+        payload, extra_body, extra_headers = build_payload(param)
+
+        log_debug(
+            json.dumps({**payload, **extra_body}, ensure_ascii=False, default=str),
+            style="yellow",
+            debug_type=DebugType.LLM_PAYLOAD,
+        )
+
+        stream = self.client.chat.completions.create(
+            **payload,
             extra_body=extra_body,
-            extra_headers=extra_headers,  # pyright: ignore[reportUnknownArgumentType]
+            extra_headers=extra_headers,
        )
 
        reasoning_handler = ReasoningStreamHandler(
            param_model=str(param.model),
            response_id=None,
        )
+
        state = StreamStateManager(
            param_model=str(param.model),
            reasoning_flusher=reasoning_handler.flush,
@@ -93,19 +117,17 @@ class OpenRouterClient(LLMClientABC):
                    style="blue",
                    debug_type=DebugType.LLM_STREAM,
                )
+
                if not state.response_id and event.id:
                    state.set_response_id(event.id)
                    reasoning_handler.set_response_id(event.id)
                    yield model.StartItem(response_id=event.id)
-                if (
-                    event.usage is not None and event.usage.completion_tokens is not None  # pyright: ignore[reportUnnecessaryComparison]
-                ):  # gcp gemini will return None usage field
+                if event.usage is not None:
                    metadata_tracker.set_usage(convert_usage(event.usage, param.context_limit, param.max_tokens))
                if event.model:
                    metadata_tracker.set_model_name(event.model)
                if provider := getattr(event, "provider", None):
                    metadata_tracker.set_provider(str(provider))
-
                if len(event.choices) == 0:
                    continue
                delta = event.choices[0].delta
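The OpenRouter builder sends reasoning configuration either as a token budget or as an effort level, preferring the budget when both are set (see the OpenRouter reasoning-tokens link in the code). A condensed sketch of that branching, with plain values in place of the real parameter types:

def reasoning_config(budget_tokens: int | None, effort: str | None) -> dict[str, object] | None:
    # Budget takes precedence: Anthropic-style models on OpenRouter are
    # driven by a max_tokens budget, effort-style models by "effort".
    if budget_tokens is not None:
        return {"max_tokens": budget_tokens, "enable": True}
    if effort is not None:
        return {"effort": effort}
    return None


extra_body: dict[str, object] = {"usage": {"include": True}}
if (cfg := reasoning_config(budget_tokens=2048, effort=None)) is not None:
    extra_body["reasoning"] = cfg
print(extra_body)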