dulus 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. agent.py +363 -0
  2. backend/__init__.py +63 -0
  3. backend/compressor.py +261 -0
  4. backend/context.py +329 -0
  5. backend/githook.py +166 -0
  6. backend/marketplace.py +141 -0
  7. backend/mempalace_bridge.py +182 -0
  8. backend/personas.py +297 -0
  9. backend/plugins.py +222 -0
  10. backend/server.py +411 -0
  11. backend/tasks.py +213 -0
  12. batch_api.py +307 -0
  13. checkpoint/__init__.py +27 -0
  14. checkpoint/hooks.py +90 -0
  15. checkpoint/store.py +314 -0
  16. checkpoint/types.py +80 -0
  17. claude_code_watcher.py +214 -0
  18. clipboard_utils.py +246 -0
  19. cloudsave.py +159 -0
  20. common.py +177 -0
  21. compaction.py +378 -0
  22. config.py +180 -0
  23. context.py +241 -0
  24. dulus-0.2.0.dist-info/METADATA +600 -0
  25. dulus-0.2.0.dist-info/RECORD +101 -0
  26. dulus-0.2.0.dist-info/WHEEL +5 -0
  27. dulus-0.2.0.dist-info/entry_points.txt +2 -0
  28. dulus-0.2.0.dist-info/licenses/LICENSE +674 -0
  29. dulus-0.2.0.dist-info/licenses/license_manager.py +187 -0
  30. dulus-0.2.0.dist-info/top_level.txt +36 -0
  31. dulus.py +8455 -0
  32. dulus_gui.py +331 -0
  33. dulus_mcp/__init__.py +43 -0
  34. dulus_mcp/client.py +546 -0
  35. dulus_mcp/config.py +133 -0
  36. dulus_mcp/tools.py +131 -0
  37. dulus_mcp/types.py +124 -0
  38. gui/__init__.py +18 -0
  39. gui/agent_bridge.py +283 -0
  40. gui/chat_widget.py +448 -0
  41. gui/main_window.py +485 -0
  42. gui/personas.py +230 -0
  43. gui/session_utils.py +189 -0
  44. gui/settings_dialog.py +146 -0
  45. gui/sidebar.py +515 -0
  46. gui/tasks_view.py +499 -0
  47. gui/themes.py +256 -0
  48. gui/tool_panel.py +94 -0
  49. input.py +1030 -0
  50. license_manager.py +187 -0
  51. memory/__init__.py +93 -0
  52. memory/audit.py +51 -0
  53. memory/consolidator.py +312 -0
  54. memory/context.py +270 -0
  55. memory/offload.py +148 -0
  56. memory/palace.py +127 -0
  57. memory/scan.py +146 -0
  58. memory/sessions.py +100 -0
  59. memory/store.py +395 -0
  60. memory/tools.py +408 -0
  61. memory/types.py +114 -0
  62. memory/vector_search.py +92 -0
  63. multi_agent/__init__.py +23 -0
  64. multi_agent/subagent.py +501 -0
  65. multi_agent/tools.py +393 -0
  66. offload_helper.py +183 -0
  67. plugin/__init__.py +22 -0
  68. plugin/autoadapter.py +1641 -0
  69. plugin/loader.py +156 -0
  70. plugin/recommend.py +211 -0
  71. plugin/store.py +387 -0
  72. plugin/types.py +147 -0
  73. providers.py +3750 -0
  74. skill/__init__.py +14 -0
  75. skill/builtin.py +100 -0
  76. skill/clawhub.py +270 -0
  77. skill/executor.py +66 -0
  78. skill/loader.py +199 -0
  79. skill/tools.py +110 -0
  80. skills.py +14 -0
  81. spinner.py +42 -0
  82. string_utils.py +42 -0
  83. subagent.py +11 -0
  84. task/__init__.py +12 -0
  85. task/store.py +199 -0
  86. task/tools.py +265 -0
  87. task/types.py +92 -0
  88. tmux_offloader.py +177 -0
  89. tmux_tools.py +410 -0
  90. tool_registry.py +214 -0
  91. tools.py +2694 -0
  92. ui/__init__.py +1 -0
  93. ui/input.py +464 -0
  94. ui/render.py +272 -0
  95. voice/__init__.py +56 -0
  96. voice/keyterms.py +179 -0
  97. voice/recorder.py +263 -0
  98. voice/stt.py +408 -0
  99. voice/tts.py +570 -0
  100. webchat.py +432 -0
  101. webchat_server.py +1761 -0
providers.py ADDED
@@ -0,0 +1,3750 @@
1
+ """
2
+ Multi-provider support for Dulus.
3
+
4
+ Supported providers:
5
+ anthropic — Claude (claude-opus-4-6, claude-sonnet-4-6, ...)
6
+ openai — GPT (gpt-4o, o3-mini, ...)
7
+ gemini — Google Gemini (gemini-2.0-flash, gemini-1.5-pro, ...)
8
+ kimi — Moonshot AI (kimi-k2.5, moonshot-v1-8k/32k/128k)
9
+ kimi-code — Kimi Code (kimi-for-coding, membership API from kimi.com/code)
10
+ qwen — Alibaba DashScope (qwen-max, qwen-plus, ...)
11
+ zhipu — Zhipu GLM (glm-4, glm-4-plus, ...)
12
+ deepseek — DeepSeek (deepseek-chat, deepseek-reasoner, ...)
13
+ minimax — MiniMax (MiniMax-Text-01, abab6.5s-chat, ...)
14
+ ollama — Local Ollama (llama3.3, qwen2.5-coder, ...)
15
+ lmstudio — Local LM Studio (any loaded model)
16
+ custom — Any OpenAI-compatible endpoint
17
+
18
+ Model string formats:
19
+ "claude-opus-4-6" auto-detected → anthropic
20
+ "gpt-4o" auto-detected → openai
21
+ "ollama/qwen2.5-coder" explicit provider prefix
22
+ "custom/my-model" uses CUSTOM_BASE_URL from config
23
+ """
24
+ from __future__ import annotations
25
+ import json
26
+ import urllib.request
27
+ import urllib.parse
28
+ import requests
29
+ import re
30
+ import time
31
+ import random
32
+ import functools
33
+ import subprocess
34
+ import platform
35
+ from typing import Generator, Any, Callable
36
+
37
+
38
+ # ── Provider resilience: retry with exponential backoff + jitter ─────────
39
+
40
class _ProviderRetry:
    """Lightweight retry wrapper for provider streaming calls.

    Retry decisions are made by inspecting the exception's message text:
    HTTP 429/500/502/503/504 status codes, rate-limit phrases, and
    timeout / connection / chunked-encoding / broken-pipe phrases are
    considered transient. Everything else is re-raised immediately.
    """
    MAX_RETRIES: int = 3
    BASE_DELAY: float = 1.0
    MAX_DELAY: float = 30.0

    # Status codes must not be embedded in a longer number, otherwise
    # messages such as "max_tokens 15000" would be mistaken for HTTP 500.
    _STATUS_RE = re.compile(r"(?<!\d)(?:429|500|502|503|504)(?!\d)")
    _RETRYABLE_PHRASES = (
        "rate limit",
        "too many requests",
        "timeout",
        "timed out",
        "connection",
        "chunked encoding",
        "broken pipe",
    )

    @classmethod
    def is_retryable(cls, exc: Exception) -> bool:
        """Return True if the exception message looks like a transient failure."""
        msg = str(exc).lower()
        if cls._STATUS_RE.search(msg):
            return True
        return any(phrase in msg for phrase in cls._RETRYABLE_PHRASES)

    @classmethod
    def sleep_for_attempt(cls, attempt: int) -> float:
        """Return the delay (seconds) to wait before retry number *attempt*.

        Exponential backoff with full jitter: a uniform random value in
        ``[0, BASE_DELAY * 2**attempt)``, capped at ``MAX_DELAY``.
        Despite the name, this method does not sleep; it only computes
        the delay.
        """
        exp = cls.BASE_DELAY * (2 ** attempt)
        jitter = random.random() * exp
        return min(jitter, cls.MAX_DELAY)

    @classmethod
    def wrap_generator(cls, fn: Callable, *args, **kwargs) -> Generator:
        """Run generator function *fn* with retry logic, yielding its items.

        If *fn* raises a retryable exception, waits and calls *fn* again
        from scratch, up to ``MAX_RETRIES`` retries. Non-retryable
        exceptions, and the exception from the final attempt, propagate
        unchanged.

        NOTE: a retry restarts the generator, so items that were already
        yielded before a mid-stream failure are yielded again on the next
        attempt.
        """
        for attempt in range(cls.MAX_RETRIES + 1):
            try:
                yield from fn(*args, **kwargs)
                return
            except Exception as exc:
                if attempt >= cls.MAX_RETRIES or not cls.is_retryable(exc):
                    raise
                time.sleep(cls.sleep_for_attempt(attempt))
95
+
96
+
97
class WebToolParser:
    """Incremental parser for prompt-based tool calls embedded in streamed text.

    Tool calls are expected as ``<tool_call>{json}</tool_call>``. Text outside
    the tags is returned for display; each well-formed call is appended to
    ``self.tool_calls`` as ``{"id", "name", "input"}``. Both the opening and
    the closing tag may be split across chunk boundaries.

    With ``auto_wrap_json=True``, bare JSON objects in the display text that
    look like tool calls are also extracted. That fallback only inspects the
    text produced by a single ``parse_chunk`` call, so a bare JSON object that
    is split across chunks is not detected.
    """

    _OPEN_TAG = "<tool_call>"
    _CLOSE_TAG = "</tool_call>"

    def __init__(self, auto_wrap_json: bool = False):
        self._in_call = False          # currently between open and close tag
        self._call_buf = ""            # JSON text collected for the open call
        self._raw_buf = ""             # unprocessed input (may hold a partial tag)
        self._auto_wrap_json = auto_wrap_json
        self.tool_calls = []

    @staticmethod
    def _held_back_index(buf: str, tag: str) -> int:
        """Return the index where a trailing partial *tag* starts in *buf*.

        If the text from the last ``<`` to the end of *buf* is a prefix of
        *tag*, that tail must be kept until more input arrives. Returns
        ``len(buf)`` when nothing needs to be held back.
        """
        last_lt = buf.rfind("<")
        if last_lt != -1 and tag.startswith(buf[last_lt:]):
            return last_lt
        return len(buf)

    def _record_call(self, data: Any) -> bool:
        """Append a tool call built from decoded JSON *data*; return success.

        Accepts ``{"name", "input"}``, ``{"name", "arguments"}`` and the
        nested ``{"function": {"name", "arguments"}}`` shapes. Returns False
        (recording nothing) when *data* is not an object or has no name.
        """
        if not isinstance(data, dict):
            return False
        fn = data.get("function")
        if not isinstance(fn, dict):
            fn = {}
        name = data.get("name") or fn.get("name")
        if not name:
            return False
        self.tool_calls.append({
            "id": f"call_pt_{len(self.tool_calls)}",
            "name": name,
            "input": data.get("input") or data.get("arguments") or fn.get("arguments") or {},
        })
        return True

    def _extract_raw_json_calls(self, display: str) -> str:
        """Strip bare JSON tool calls out of *display*, recording each one."""
        decoder = json.JSONDecoder()
        search_pos = 0
        while True:
            start = display.find("{", search_pos)
            if start == -1:
                break

            # Cheap pre-filter on the first 500 chars before attempting a decode.
            snippet = display[start:start + 500]
            if '"name"' in snippet and ('"input"' in snippet or '"arguments"' in snippet):
                try:
                    # raw_decode stops at the end of the first JSON value and
                    # correctly ignores braces that appear inside strings.
                    data, length = decoder.raw_decode(display[start:])
                except ValueError:
                    data, length = None, 0
                if data is not None and self._record_call(data):
                    display = display[:start] + display[start + length:]
                    search_pos = start
                    continue
            search_pos = start + 1
        return display

    def parse_chunk(self, chunk: str) -> str:
        """Consume *chunk*; return displayable text and accumulate tool calls."""
        if not chunk:
            return ""
        self._raw_buf += chunk
        display = ""

        while True:
            if not self._in_call:
                pos = self._raw_buf.find(self._OPEN_TAG)
                if pos == -1:
                    # No complete start tag: emit everything except a
                    # possible partial start tag at the very end.
                    cut = self._held_back_index(self._raw_buf, self._OPEN_TAG)
                    display += self._raw_buf[:cut]
                    self._raw_buf = self._raw_buf[cut:]
                    break
                # Start tag found: everything before it is plain text.
                display += self._raw_buf[:pos]
                self._raw_buf = self._raw_buf[pos + len(self._OPEN_TAG):]
                self._in_call = True
            else:
                pos = self._raw_buf.find(self._CLOSE_TAG)
                if pos == -1:
                    # End tag not complete yet. Keep a trailing partial end
                    # tag in the raw buffer so it can be completed by the
                    # next chunk instead of being swallowed into the JSON.
                    cut = self._held_back_index(self._raw_buf, self._CLOSE_TAG)
                    self._call_buf += self._raw_buf[:cut]
                    self._raw_buf = self._raw_buf[cut:]
                    break
                self._call_buf += self._raw_buf[:pos]
                self._raw_buf = self._raw_buf[pos + len(self._CLOSE_TAG):]
                try:
                    self._record_call(json.loads(self._call_buf.strip()))
                except ValueError:
                    # Malformed JSON inside the tag: the call is dropped
                    # (best effort), and parsing continues after the tag.
                    pass
                self._call_buf = ""
                self._in_call = False

        # Raw JSON fallback (only if enabled and NOT inside a tag).
        if self._auto_wrap_json and not self._in_call and "{" in display:
            display = self._extract_raw_json_calls(display)

        return display

    def flush(self) -> str:
        """Return any remaining buffered text and reset the parser state.

        If a ``<tool_call>`` was opened but never closed, its raw text is
        returned verbatim (including the opening tag) rather than discarded.
        """
        res = self._raw_buf
        self._raw_buf = ""
        if self._in_call:
            res = self._OPEN_TAG + self._call_buf + res
            self._call_buf = ""
            self._in_call = False
        return res
209
+
210
+
211
def _format_web_tool_manifest(tool_schemas: list, config: dict, messages: list) -> str:
    """Render the available tools as a prompt block for prompt-driven web models.

    Returns "" when *tool_schemas* is empty or ``config["no_tools"]`` is
    truthy. The manifest is produced on the first turn (at most one message
    with role "user" in *messages*). On later turns "" is returned unless
    ``config["always_inject_tools"]`` is truthy, in which case the same full
    manifest is produced again.
    """
    if not tool_schemas or config.get("no_tools"):
        return ""

    user_turns = sum(1 for m in messages if m.get("role") == "user")

    # Web providers (claude.ai, qwen.ai, etc.) keep the conversation
    # server-side, so the turn-1 manifest is still in the model's context on
    # later turns; re-sending it is opt-in only.
    if user_turns > 1 and not config.get("always_inject_tools"):
        return ""

    preamble = (
        "\n\n[TOOL USE — READ CAREFULLY]",
        "You are running inside an agent harness that can EXECUTE tools for you.",
        "When you need information, file contents, or to run an action — DO NOT describe what you would do; CALL the tool.",
        "",
        "EXACT format (any deviation = the call is ignored):",
        ' <tool_call>{"name": "ToolName", "input": {"key": "value"}}</tool_call>',
        "",
        "Rules:",
        "1. The <tool_call> tag MUST be on its own line, with valid JSON inside.",
        "2. Use ONLY tool names from the list below. Do NOT invent tools (no `SleepTimer`, no `WaitFor`, no fake names).",
        "3. To call multiple tools, emit multiple <tool_call> blocks in the SAME response — do not wait for results between them.",
        "4. After tool results come back, you may call more tools or give a final answer.",
        "5. If no tool is needed, just answer normally — no tool_call tag.",
        "",
        "Example (correct):",
        ' <tool_call>{"name": "Read", "input": {"file_path": "/tmp/foo.txt"}}</tool_call>',
        "",
        "Available Tools:",
    )

    lines = list(preamble)
    for schema in tool_schemas:
        # One summary line, then the compact JSON of the declared properties.
        lines.append(f"- {schema['name']}: {schema.get('description', '')}")
        properties = schema.get("parameters", {}).get("properties", {})
        lines.append(f" Inputs: {json.dumps(properties, separators=(',', ':'))}")

    return "\n".join(lines)
253
+
254
+
255
def _consolidate_web_history(messages: list, manifest: str = "") -> str:
    """Flatten everything after the last assistant message into one prompt.

    Each kept message becomes a ``--- [ROLE] ---`` section (tool messages get
    a ``--- [Tool Result: <name>] ---`` header). Sections are joined by blank
    lines, and *manifest*, when non-empty, is placed in front. With no
    messages at all, *manifest* is returned unchanged.
    """
    if not messages:
        return manifest

    # Index of the most recent assistant message, or -1 when there is none;
    # slicing from ``-1 + 1`` then naturally covers the whole history.
    last_assistant = next(
        (idx for idx in range(len(messages) - 1, -1, -1)
         if messages[idx].get("role") == "assistant"),
        -1,
    )

    sections = []
    for msg in messages[last_assistant + 1:]:
        role = msg.get("role", "user")
        body = msg.get("content", "")
        is_tool = role == "tool"

        # Empty non-tool messages are dropped; tool results are always sent
        # (with a placeholder) so the model knows the tool actually ran.
        if not is_tool and not body:
            continue

        if is_tool:
            title = f"--- [Tool Result: {msg.get('name', 'Unknown')}] ---"
            body = body or "(No output / Empty result)"
        else:
            title = f"--- [{role.upper()}] ---"

        sections.append(f"{title}\n{body}")

    prompt = "\n\n".join(sections).strip()
    if manifest:
        prompt = manifest + "\n\n" + prompt

    return prompt.strip()
295
+
296
+ # ── Provider registry ──────────────────────────────────────────────────────
297
+
298
# Registry of known providers, keyed by provider name.
#
# Keys used by the entries below:
#   "type"                  - backend kind string ("anthropic", "openai",
#                             "ollama", "gcloud", or one of the *_web /
#                             "claude_code" / "gemini-web" kinds)
#   "api_key_env"           - name of the environment variable for the key,
#                             or None; absent on "gemini-web"
#   "api_key"               - literal key value (only on the local providers)
#   "base_url"              - endpoint URL (absent on most non-"openai" types)
#   "context_limit"         - integer limit (presumably context tokens - TODO confirm)
#   "max_completion_tokens" - optional output cap
#   "models"                - list of known model names (may be empty)
#
# "kimi" and "moonshot" are identical entries; "kimi-code", "kimi-code2" and
# "kimi-code3" differ only in "api_key_env".
PROVIDERS: dict[str, dict] = {
    "anthropic": {
        "type": "anthropic",
        "api_key_env": "ANTHROPIC_API_KEY",
        "context_limit": 200000,
        "models": [
            "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5-20251001",
            "claude-opus-4-5", "claude-sonnet-4-5",
            "claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022",
        ],
    },
    "openai": {
        "type": "openai",
        "api_key_env": "OPENAI_API_KEY",
        "base_url": "https://api.openai.com/v1",
        "context_limit": 128000,
        "max_completion_tokens": 16384,  # safe cap across gpt-4o/gpt-4.1 family
        "models": [
            "gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4.1", "gpt-4.1-mini",
            "o3-mini", "o1", "o1-mini",
        ],
    },
    "gemini": {
        "type": "openai",
        "api_key_env": "GEMINI_API_KEY",
        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
        "context_limit": 1000000,
        "max_completion_tokens": 65536,  # Gemini 2.x supports up to 65k output tokens
        "models": [
            "gemini-2.5-pro-preview-03-25",
            "gemini-2.0-flash", "gemini-2.0-flash-lite",
            "gemini-1.5-pro", "gemini-1.5-flash",
        ],
    },
    "gemini-web": {
        "type": "gemini-web",
        "context_limit": 1000000,
        "models": [
            "gemini-latest", "gemini-flash", "gemini-pro",
        ],
    },
    "kimi": {
        "type": "openai",
        "api_key_env": "MOONSHOT_API_KEY",
        "base_url": "https://api.moonshot.ai/v1",
        "context_limit": 250000,
        "models": [
            "kimi-k2.5", "kimi-latest",
            "moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k",
        ],
    },
    "kimi-code": {
        "type": "openai",
        "api_key_env": "KIMI_CODE_API_KEY",
        "base_url": "https://api.kimi.com/coding/v1",
        "context_limit": 256000,
        "models": [
            "kimi-for-coding", "kimi-k2.6", "kimi-k2.5", "kimi-latest",
        ],
    },
    "kimi-code2": {
        "type": "openai",
        "api_key_env": "KIMI_CODE2_API_KEY",
        "base_url": "https://api.kimi.com/coding/v1",
        "context_limit": 256000,
        "models": [
            "kimi-for-coding", "kimi-k2.6", "kimi-k2.5", "kimi-latest",
        ],
    },
    "kimi-code3": {
        "type": "openai",
        "api_key_env": "KIMI_CODE3_API_KEY",
        "base_url": "https://api.kimi.com/coding/v1",
        "context_limit": 256000,
        "models": [
            "kimi-for-coding", "kimi-k2.6", "kimi-k2.5", "kimi-latest",
        ],
    },
    "moonshot": {
        "type": "openai",
        "api_key_env": "MOONSHOT_API_KEY",
        "base_url": "https://api.moonshot.ai/v1",
        "context_limit": 250000,
        "models": [
            "kimi-k2.5", "kimi-latest",
            "moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k",
        ],
    },
    "qwen": {
        "type": "openai",
        "api_key_env": "DASHSCOPE_API_KEY",
        "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
        "context_limit": 1000000,
        "models": [
            "qwen-max", "qwen-plus", "qwen-turbo", "qwen-long",
            "qwen2.5-72b-instruct", "qwen2.5-coder-32b-instruct",
            "qwq-32b",
        ],
    },
    "zhipu": {
        "type": "openai",
        "api_key_env": "ZHIPU_API_KEY",
        "base_url": "https://api.z.ai/api/coding/paas/v4",
        "context_limit": 128000,
        "models": [
            "glm-4-plus", "glm-4", "glm-4-flash", "glm-4-air",
            "glm-z1-flash", "GLM-4.7", "GLM-4.5-AIR",
        ],
    },
    "deepseek": {
        "type": "openai",
        "api_key_env": "DEEPSEEK_API_KEY",
        "base_url": "https://api.deepseek.com/v1",
        "context_limit": 64000,
        "models": [
            "deepseek-chat", "deepseek-coder", "deepseek-reasoner",
            "deepseek-v3", "deepseek-r1",
        ],
    },
    "minimax": {
        "type": "openai",
        "api_key_env": "MINIMAX_API_KEY",
        "base_url": "https://api.minimaxi.chat/v1",
        "context_limit": 1000000,
        "models": [
            "MiniMax-Text-01", "MiniMax-VL-01",
            "abab6.5s-chat", "abab6.5-chat",
            "abab5.5s-chat", "abab5.5-chat",
        ],
    },
    "ollama": {
        "type": "ollama",
        "api_key_env": None,
        "base_url": "http://localhost:11434",
        "api_key": "ollama",
        "context_limit": 250000,
        "models": [
            "llama3.3", "llama3.2", "phi4", "mistral", "mixtral",
            "qwen2.5-coder", "deepseek-r1", "gemma3",
        ],
    },
    "lmstudio": {
        "type": "openai",
        "api_key_env": None,
        "base_url": "http://localhost:1234/v1",
        "api_key": "lm-studio",
        "context_limit": 128000,
        "models": [],  # dynamic, depends on loaded model
    },
    # NOTE(review): the module docstring advertises a "custom" provider, but
    # the key here is "custom22" and the URL is hard-coded rather than read
    # from config["custom_base_url"] - confirm this is intentional.
    "custom22": {
        "type": "openai",
        "api_key_env": "MIMO_API_KEY",
        "base_url": "https://api.xiaomimimo.com/v1",
        "context_limit": 128000,
        "models": ["MiMo-V2-Pro"],
    },
    "claude-web": {
        "type": "claude_web",
        "api_key_env": None,
        "context_limit": 200000,
        "models": [
            "claude-sonnet-4-6", "claude-haiku-4-5",
            "claude-opus-4-6", "claude-opus-4-5",
        ],
    },
    "claude-code": {
        "type": "claude_code",
        "api_key_env": None,
        "context_limit": 200000,
        "models": [
            "claude-opus-4-7", "claude-opus-4-6", "claude-sonnet-4-6",
        ],
    },
    "kimi-web": {
        "type": "kimi_web",
        "api_key_env": None,
        "context_limit": 128000,
        "models": [
            "kimi-latest", "kimi-v1",
        ],
    },
    "deepseek-web": {
        "type": "deepseek_web",
        "api_key_env": None,
        "context_limit": 64000,
        "models": [
            "deepseek-v3", "deepseek-r1", "deepseek-latest",
        ],
    },
    "qwen-web": {
        "type": "qwen_web",
        "api_key_env": None,
        "context_limit": 1_000_000,
        "models": [
            "qwen3.6-plus", "qwen-max", "qwen-turbo", "qwen-plus",
        ],
    },
    # Despite the "-web" suffix this entry is an API-key based "openai" type.
    "nvidia-web": {
        "type": "openai",
        "api_key_env": "NVIDIA_API_KEY",
        "base_url": "https://integrate.api.nvidia.com/v1",
        "context_limit": 128000,
        "max_completion_tokens": 16384,
        "models": [
            "deepseek-ai/deepseek-v4-flash",
            "deepseek-ai/deepseek-r1",
            "meta/llama-3.3-70b-instruct",
            "nvidia/llama-3.1-nemotron-70b-instruct",
            "mistralai/mixtral-8x22b-instruct-v0.1",
            "microsoft/phi-3-medium-128k-instruct",
            "stepfun-ai/step-3.5-flash",
            "qwen/qwen2.5-72b-instruct",
            "google/gemma-2-27b-it",
        ],
    },
    "gcloud": {
        "type": "gcloud",
        "api_key_env": None,
        "context_limit": 1000000,
        "max_completion_tokens": 65536,
        "models": [
            "gemini-2.5-pro",
            "gemini-2.0-flash",
            "gemini-1.5-pro",
        ],
    },
}
525
+
526
# Cost per million tokens (approximate, fallback to 0 for unknown).
# Keyed by model name; the gcloud entries are keyed with their "gcloud/" prefix.
# Each value is a 2-tuple - presumably (input, output) price; TODO confirm
# order and currency against the code that consumes this table.
COSTS = {
    "claude-opus-4-6": (15.0, 75.0),
    "claude-sonnet-4-6": (3.0, 15.0),
    "claude-haiku-4-5-20251001": (0.8, 4.0),
    "gpt-4o": (2.5, 10.0),
    "gpt-4o-mini": (0.15, 0.6),
    "o3-mini": (1.1, 4.4),
    "gemini-2.0-flash": (0.075, 0.3),
    "gemini-1.5-pro": (1.25, 5.0),
    "gemini-2.5-pro-preview-03-25": (1.25, 10.0),
    "moonshot-v1-8k": (1.0, 3.0),
    "moonshot-v1-32k": (2.4, 7.0),
    "moonshot-v1-128k": (8.0, 24.0),
    "qwen-max": (2.4, 9.6),
    "qwen-plus": (0.4, 1.2),
    "deepseek-chat": (0.27, 1.1),
    "deepseek-reasoner": (0.55, 2.19),
    "glm-4-plus": (0.7, 0.7),
    "GLM-4.7": (0.7, 0.7),
    "GLM-4.5-AIR": (0.5, 0.5),
    "MiniMax-Text-01": (0.7, 2.1),
    "abab6.5s-chat": (0.1, 0.1),
    "abab6.5-chat": (0.5, 0.5),
    "gcloud/gemini-2.5-pro": (1.25, 10.0),
    "gcloud/gemini-2.0-flash": (0.075, 0.3),
    "gcloud/gemini-1.5-pro": (1.25, 5.0),
}
554
+
555
# Auto-detection: prefix → provider name.
# Order matters: detect_provider() returns the provider of the FIRST prefix
# that matches, so more specific prefixes (e.g. "kimi-for-coding") must come
# before broader ones (e.g. "kimi").
# detect_provider() compares against model.lower(), so the mixed-case entries
# ("GLM-", "MiniMax-") can never match themselves; those models are caught by
# their lowercase twins ("glm-", "minimax-").
_PREFIXES = [
    ("claude-", "anthropic"),
    ("gpt-", "openai"),
    ("o1", "openai"),
    ("o3", "openai"),
    ("gemini-", "gemini"),
    ("kimi-code/", "kimi-code"),
    ("kimi-code2/", "kimi-code2"),
    ("kimi-code3/", "kimi-code3"),
    ("kimi-for-coding", "kimi-code"),
    ("kimi", "kimi"),  # matches 'kimi-' and 'kimi'
    ("moonshot-", "kimi"),
    ("moonshot", "kimi"),
    ("qwen", "qwen"),  # qwen-max, qwen2.5-...
    ("qwq-", "qwen"),
    ("glm-", "zhipu"),
    ("GLM-", "zhipu"),
    ("deepseek-", "deepseek"),
    ("minimax-", "minimax"),
    ("MiniMax-", "minimax"),
    ("abab", "minimax"),
    ("llama", "ollama"),
    ("mistral", "ollama"),
    ("phi", "ollama"),
    ("gemma", "ollama"),
    ("gcloud/", "gcloud"),
    ("gcloud-", "gcloud"),
]
584
+
585
# Models available under claude-web/ prefix.
# NOTE(review): this set also contains the two claude-3-5 models, which are
# not listed in PROVIDERS["claude-web"]["models"] - confirm which is right.
_CLAUDE_WEB_MODELS = {
    "claude-sonnet-4-6", "claude-haiku-4-5",
    "claude-opus-4-6", "claude-opus-4-5",
    "claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022",
}
591
+
592
+
593
def detect_provider(model: str) -> str:
    """Map a model string to a provider name.

    An explicit ``provider/model`` form wins when the part before the first
    ``/`` is a key of ``PROVIDERS``. Otherwise the lowercased model is matched
    against ``_PREFIXES`` in order and the first hit is returned. Falls back
    to ``"openai"`` when nothing matches.
    """
    explicit, slash, _rest = model.partition("/")
    if slash and explicit in PROVIDERS:
        return explicit

    lowered = model.lower()
    matched = next(
        (provider for prefix, provider in _PREFIXES if lowered.startswith(prefix)),
        None,
    )
    if matched is not None:
        return matched
    return "openai"  # fallback
604
+
605
+
606
def _claude_web_cookies_path(config: dict) -> str:
    """Return the location of the claude.ai cookies JSON file.

    A truthy ``config["claude_web_cookies"]`` is returned as-is; otherwise
    the default ``~/.dulus/claude_cookies.json`` is returned as a string.
    """
    import pathlib

    override = config.get("claude_web_cookies")
    if override:
        return override
    return str(pathlib.Path.home() / ".dulus" / "claude_cookies.json")
613
+
614
+
615
def _kimi_web_auth_path(config: dict) -> str:
    """Return the location of the kimi.com consumer auth JSON file.

    A truthy ``config["kimi_web_auth_path"]`` is returned as-is; otherwise
    the default ``~/.dulus/kimi_consumer.json`` is returned as a string.
    """
    import pathlib

    override = config.get("kimi_web_auth_path")
    if override:
        return override
    return str(pathlib.Path.home() / ".dulus" / "kimi_consumer.json")
622
+
623
+
624
def _kimi_web_list_chats(auth_data: dict, page_size: int = 50,
                         page_token: str = "", query: str = "") -> dict:
    """List recent chats from kimi.com using harvested cookies/headers.

    Posts to the ``kimi.chat.v1.ChatService/ListChats`` endpoint.

    Args:
        auth_data: Auth blob; ``auth_data["cookies"]`` is read as a list of
            dicts with ``name``/``value`` (optional ``domain``/``path``) and
            ``auth_data["headers"]`` as a header mapping. Presumably this is
            the blob saved by /harvest - confirm against the caller.
        page_size: Sent as ``page_size`` in the request body.
        page_token: Sent as ``page_token`` in the request body.
        query: Sent as ``query`` in the request body.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: On a non-success HTTP status (``raise_for_status``).
    """
    import requests as _req

    # Reuse harvested headers, but override content-type for plain JSON
    # (the harvested one is connect+json for the streaming /Chat endpoint).
    base = auth_data.get("headers", {})
    headers = {k: v for k, v in base.items() if k.lower() != "content-type"}
    headers["Content-Type"] = "application/json"
    headers["Accept"] = "*/*"
    headers["Origin"] = "https://www.kimi.com"
    headers.setdefault("Referer", "https://www.kimi.com/chat/history")

    body = {
        "project_id": "",
        "page_size": page_size,
        "page_token": page_token,
        "query": query,
    }
    url = "https://www.kimi.com/apiv2/kimi.chat.v1.ChatService/ListChats"

    # The context manager closes the session's pooled connections on exit,
    # including when an exception is raised.
    with _req.Session() as s:
        for c in auth_data.get("cookies", []):
            s.cookies.set(c["name"], c["value"],
                          domain=c.get("domain", ".kimi.com"),
                          path=c.get("path", "/"))
        resp = s.post(url, headers=headers, json=body, timeout=20)
        resp.raise_for_status()
        return resp.json()
659
+
660
+
661
def _gemini_web_auth_path(config: dict) -> str:
    """Return the location of the gemini.google.com consumer auth JSON file.

    A truthy ``config["gemini_web_auth_path"]`` is returned as-is; otherwise
    the default ``~/.dulus/gemini_web.json`` is returned as a string.
    """
    import pathlib

    override = config.get("gemini_web_auth_path")
    if override:
        return override
    return str(pathlib.Path.home() / ".dulus" / "gemini_web.json")
668
+
669
+
670
def _deepseek_web_auth_path(config: dict) -> str:
    """Return the location of the chat.deepseek.com consumer auth JSON file.

    A truthy ``config["deepseek_web_auth_path"]`` is returned as-is; otherwise
    the default ``~/.dulus/deepseek_web.json`` is returned as a string.
    """
    import pathlib

    override = config.get("deepseek_web_auth_path")
    if override:
        return override
    return str(pathlib.Path.home() / ".dulus" / "deepseek_web.json")
677
+
678
+
679
def _qwen_web_auth_path(config: dict) -> str:
    """Return the location of the chat.qwen.ai consumer auth JSON file.

    A truthy ``config["qwen_web_auth_path"]`` is returned as-is; otherwise
    the default ``~/.dulus/qwen_web.json`` is returned as a string.
    """
    import pathlib

    override = config.get("qwen_web_auth_path")
    if override:
        return override
    return str(pathlib.Path.home() / ".dulus" / "qwen_web.json")
686
+
687
+
688
def _claude_web_org_id(cookies_data: dict, config: dict) -> str:
    """Resolve the claude.ai organization ID, caching it in *config*.

    Lookup order:
      1. a truthy ``config["claude_web_org_id"]``;
      2. the value of a ``lastActiveOrg`` cookie in ``cookies_data["cookies"]``;
      3. the result of ``_claude_web_fetch_org_id(cookies_data)``;
      4. ``config["claude_web_org_id"]`` if the key exists, else a
         hard-coded UUID.

    Results from steps 2 and 3 are written back to
    ``config["claude_web_org_id"]``.
    """
    cache_key = "claude_web_org_id"

    # 1. Cached in config
    cached = config.get(cache_key)
    if cached:
        return cached

    # 2. Scan cookies for a non-empty lastActiveOrg
    for cookie in cookies_data.get("cookies", []):
        if cookie.get("name", "") != "lastActiveOrg":
            continue
        value = cookie.get("value", "")
        if value:
            config[cache_key] = value
            return value

    # 3. Try /api/organizations with harvested cookies
    fetched = _claude_web_fetch_org_id(cookies_data)
    if fetched:
        config[cache_key] = fetched
        return fetched

    # 4. Fallback from config or hardcoded.
    # NOTE(review): if the key exists with a falsy value, that falsy value is
    # returned instead of the UUID; and the hard-coded UUID looks like one
    # specific account's organization - confirm both are intended.
    return config.get(cache_key, "022b6d58-7355-4e97-bfab-c4fc047674bb")
710
+
711
+
712
def _claude_web_headers(cookies_data: dict, referer: str = "https://claude.ai/new") -> dict:
    """Assemble the HTTP headers for a claude.ai request.

    The ``Cookie`` header is built from the entries of
    ``cookies_data["cookies"]`` whose domain contains ``claude.ai`` or
    ``anthropic.com``. Headers in ``cookies_data["headers"]`` are merged on
    top of the defaults, except Cookie, Host, Content-Length and
    Content-Type (compared case-insensitively), which are never overridden.
    """
    wanted_domains = ("claude.ai", "anthropic.com")
    cookie_pairs = []
    for cookie in cookies_data.get("cookies", []):
        domain = cookie.get("domain", "")
        if any(fragment in domain for fragment in wanted_domains):
            cookie_pairs.append(f"{cookie['name']}={cookie['value']}")

    default_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    headers = {
        "Content-Type": "application/json",
        "Accept": "text/event-stream",
        "Accept-Language": "en-US,en;q=0.9",
        "anthropic-client-platform": "web_claude_ai",
        "Origin": "https://claude.ai",
        "Referer": referer,
        "User-Agent": cookies_data.get("user_agent", default_ua),
        "Cookie": "; ".join(cookie_pairs),
    }

    # Harvested request headers win over the defaults, minus the protected ones.
    protected = ("cookie", "host", "content-length", "content-type")
    harvested = cookies_data.get("headers", {})
    headers.update(
        {key: value for key, value in harvested.items() if key.lower() not in protected}
    )
    return headers
739
+
740
+
741
def _claude_web_fetch_org_id(cookies_data: dict) -> str | None:
    """Fetch the organization ID from ``https://claude.ai/api/organizations``.

    Uses a ``requests.Session`` loaded with the harvested cookies. From a
    200 response, returns the ``uuid`` (or ``id``) of the first list element,
    or of the object itself when the body is a dict.

    Best effort: any exception (import failure, network error, malformed
    cookie entry, unexpected JSON) is swallowed and ``None`` is returned.
    """
    try:
        import requests as _req

        # The context manager releases the session's pooled connections.
        with _req.Session() as s:
            for c in cookies_data.get("cookies", []):
                s.cookies.set(c["name"], c["value"],
                              domain=c.get("domain", "claude.ai"),
                              path=c.get("path", "/"))
            ua = cookies_data.get("user_agent", "Mozilla/5.0")
            s.headers.update({
                "User-Agent": ua,
                "Accept": "application/json",
                "anthropic-client-platform": "web_claude_ai",
                "Origin": "https://claude.ai",
                "Referer": "https://claude.ai/new",
            })
            resp = s.get("https://claude.ai/api/organizations", timeout=10)
            if resp.status_code == 200:
                orgs = resp.json()
                if isinstance(orgs, list) and orgs:
                    return orgs[0].get("uuid") or orgs[0].get("id")
                if isinstance(orgs, dict):
                    return orgs.get("uuid") or orgs.get("id")
    except Exception:
        # Deliberate best effort: the caller falls back to other sources.
        pass
    return None
768
+
769
+
770
def _claude_web_create_conversation(cookies_data: dict, org_id: str) -> str | None:
    """Create a new claude.ai chat conversation and return its UUID.

    Posts to ``/api/organizations/{org_id}/chat_conversations`` with a name
    of the form ``Dulus — <local timestamp>``, using a ``requests.Session``
    loaded with the harvested cookies.

    Returns the ``uuid`` field of the response when the status code is
    exactly 200. Best effort: any exception is swallowed and ``None`` is
    returned, as it is for any other status code.
    """
    from datetime import datetime as _dt
    try:
        import requests as _req

        # The context manager releases the session's pooled connections.
        with _req.Session() as s:
            for c in cookies_data.get("cookies", []):
                s.cookies.set(c["name"], c["value"],
                              domain=c.get("domain", "claude.ai"),
                              path=c.get("path", "/"))
            ua = cookies_data.get("user_agent", "Mozilla/5.0")
            s.headers.update({
                "User-Agent": ua,
                "Accept": "application/json",
                "anthropic-client-platform": "web_claude_ai",
                "Origin": "https://claude.ai",
                "Referer": "https://claude.ai/new",
            })
            url = f"https://claude.ai/api/organizations/{org_id}/chat_conversations"
            resp = s.post(url, json={"name": f"Dulus — {_dt.now().strftime('%Y-%m-%d %H:%M:%S')}"}, timeout=15)
            if resp.status_code == 200:
                return resp.json().get("uuid")
    except Exception:
        # Deliberate best effort: the caller reports the failure to the user.
        pass
    return None
795
+
796
+
797
def stream_claude_web(
    cookies_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream a reply from claude.ai web using harvested browser cookies.

    Tool calling is prompt-based: the tool manifest is injected into the user
    message and <tool_call>...</tool_call> tags are parsed out of the reply.
    Conversation context is kept server-side via the conversation id, which is
    cached in ``config["claude_web_conv_id"]``.

    Args:
        cookies_file: Path to the harvested cookie JSON. Keys read here:
            ``cookies``, ``user_agent``, ``headers``, ``conversation_ids``.
        model: Model name sent verbatim in the completion payload.
        system: Accepted for signature parity with the other ``stream_*``
            backends; this function does not read it.
        messages: Chat history, folded into a single prompt string.
        tool_schemas: Tool definitions used to build the prompt manifest.
        config: Mutable settings dict. Reads ``claude_web_conv_id`` and
            ``timezone``; writes ``claude_web_conv_id``.

    Yields:
        ``TextChunk`` for each displayable piece of text, followed by exactly
        one ``AssistantTurn`` (``error=True`` on every failure path).
    """
    import pathlib

    def _fail(message: str):
        # Every error path reports the same way: show the text, then close
        # the turn flagged as an error.
        yield TextChunk(message)
        yield AssistantTurn(message, [], 0, 0, error=True)

    # ── Load cookies ─────────────────────────────────────────────────────────
    cpath = pathlib.Path(cookies_file)
    if not cpath.exists():
        yield from _fail(f"[claude-web] Cookie file not found: {cookies_file} → run /harvest")
        return

    try:
        with open(cpath, encoding="utf-8") as f:
            cookies_data = json.load(f)
    except (OSError, ValueError) as e:
        # A truncated or corrupt harvest file must not crash the generator.
        yield from _fail(f"[claude-web] Could not read cookie file {cookies_file}: {e} → run /harvest")
        return
    if not isinstance(cookies_data, dict):
        yield from _fail(f"[claude-web] Cookie file {cookies_file} has an unexpected format → run /harvest")
        return

    # ── Org ID ───────────────────────────────────────────────────────────────
    org_id = _claude_web_org_id(cookies_data, config)
    if not org_id:
        yield from _fail("[claude-web] Could not get org ID — cookies may be expired. Run /harvest.")
        return

    # ── Conversation ID (persists for the Dulus session) ─────────────────────
    conv_id = config.get("claude_web_conv_id")
    if not conv_id:
        # Prefer a conversation id captured by the harvester; otherwise ask
        # the server for a new conversation.
        harvested_ids = cookies_data.get("conversation_ids", [])
        if harvested_ids:
            conv_id = harvested_ids[0]
        else:
            conv_id = _claude_web_create_conversation(cookies_data, org_id)
        if not conv_id:
            yield from _fail("[claude-web] Could not get conversation ID. Run /harvest.")
            return
        config["claude_web_conv_id"] = conv_id

    # ── Build prompt from history ────────────────────────────────────────────
    manifest = _format_web_tool_manifest(tool_schemas, config, messages)
    prompt = _consolidate_web_history(messages, manifest)

    # ── HTTP request ─────────────────────────────────────────────────────────
    url = (
        f"https://claude.ai/api/organizations/{org_id}"
        f"/chat_conversations/{conv_id}/completion"
    )
    payload = {
        "prompt": prompt,
        "timezone": config.get("timezone", "America/Santo_Domingo"),
        "model": model,
        "attachments": [],
        "files": [],
        "rendering_mode": "messages",
    }

    import requests as _req

    parser = WebToolParser()  # extracts <tool_call> tags from the text stream
    pieces = []
    session = _req.Session()
    resp = None
    try:
        for c in cookies_data.get("cookies", []):
            session.cookies.set(
                c["name"], c["value"],
                domain=c.get("domain", "claude.ai"),
                path=c.get("path", "/"),
            )
        session.headers.update({
            "User-Agent": cookies_data.get("user_agent", "Mozilla/5.0"),
            "Accept": "text/event-stream",
            "Accept-Language": "en-US,en;q=0.9",
            "anthropic-client-platform": "web_claude_ai",
            "Origin": "https://claude.ai",
            "Referer": f"https://claude.ai/chat/{conv_id}",
        })
        # Merge harvested headers, except the ones requests must compute itself.
        for k, v in cookies_data.get("headers", {}).items():
            if k.lower() not in ("cookie", "host", "content-length", "content-type"):
                session.headers[k] = v

        try:
            resp = session.post(url, json=payload, stream=True, timeout=120)
        except Exception as e:
            yield from _fail(f"[claude-web] Connection error: {e}")
            return

        status = resp.status_code
        if status != 200:
            if status in (401, 403):
                msg = f"[claude-web] Auth error {status} — cookies expired. Run /harvest."
            elif status == 404:
                # Just dropping the cached id is not enough: the next call
                # would pick the same harvested conversation id again and
                # hit 404 forever. Create the replacement right away.
                replacement = _claude_web_create_conversation(cookies_data, org_id)
                if replacement:
                    config["claude_web_conv_id"] = replacement
                else:
                    config.pop("claude_web_conv_id", None)
                msg = "[claude-web] Conversation not found (404). New one will be created next message."
            else:
                msg = f"[claude-web] HTTP {status}: {resp.text[:300]}"
            yield from _fail(msg)
            return

        # ── Stream ───────────────────────────────────────────────────────────
        try:
            for raw_line in resp.iter_lines():
                if not raw_line:
                    continue
                if isinstance(raw_line, bytes):
                    line = raw_line.decode("utf-8", errors="replace")
                else:
                    line = raw_line
                line = line.strip()
                if not line.startswith("data: "):
                    continue
                data_str = line[6:]
                if data_str == "[DONE]":
                    break
                try:
                    data = json.loads(data_str)
                except json.JSONDecodeError:
                    continue
                if not isinstance(data, dict):
                    continue

                # OLD format: {"completion": "delta", "stop_reason": null}
                # NEW format: {"type": "content_block_delta",
                #              "delta": {"type": "text_delta", "text": "..."}}
                completion = data.get("completion", "")
                if not completion and data.get("type", "") == "content_block_delta":
                    delta = data.get("delta", {})
                    if isinstance(delta, dict) and delta.get("type") == "text_delta":
                        completion = delta.get("text", "")

                if completion:
                    display = parser.parse_chunk(completion)
                    if display:
                        pieces.append(display)
                        yield TextChunk(display)

                # Stop only when stop_reason is explicitly set.
                stop_reason = data.get("stop_reason")
                if stop_reason and stop_reason != "null":
                    break
        except _req.RequestException as e:
            # The connection dropped mid-stream; report it like any other
            # connection failure instead of raising out of the generator.
            yield from _fail(f"[claude-web] Connection error: {e}")
            return
    finally:
        # Streamed responses hold their connection until explicitly closed.
        if resp is not None:
            resp.close()
        session.close()

    remaining = parser.flush()
    if remaining:
        pieces.append(remaining)
        yield TextChunk(remaining)

    yield AssistantTurn("".join(pieces), parser.tool_calls, 0, 0)
954
+
955
+
956
def stream_claude_code(
    cookies_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Send a prompt to a claude.ai/code remote-control session and read the reply.

    Endpoint: POST https://claude.ai/v1/sessions/{session_id}/events
    Payload:  {"events": [{"type":"user","uuid":"...","session_id":"...","parent_tool_use_id":null,"message":{"role":"user","content":"..."}}]}
    Auth:     same claude_cookies.json as claude-web + anthropic-beta: ccr-byoc-2025-07-29

    The HTTP response body is not used. After the POST succeeds, the newest
    ``*.jsonl`` file in the local transcript directory is polled for new
    assistant entries (up to 90 s, flushing after 2.5 s without new text).

    Args:
        cookies_file: Path to the harvested cookie JSON (``cookies``,
            ``user_agent`` and ``headers`` are read).
        model, system, tool_schemas: Accepted for signature parity with the
            other ``stream_*`` backends; this function does not read them.
        messages: Chat history, folded into a single prompt string.
        config: Settings dict. Reads ``claude_code_session_id`` (bare id or a
            URL ending in the id) and the optional ``claude_code_project_dir``
            (directory containing the session ``*.jsonl`` files; when unset
            the original hard-coded project directory is used).

    Yields:
        One ``TextChunk`` with the collected text, then one ``AssistantTurn``
        (``error=True`` on failure or timeout).
    """
    import pathlib
    import time as _time
    import uuid as _uuid

    import requests as _req

    def _fail(message: str):
        yield TextChunk(message)
        yield AssistantTurn(message, [], 0, 0, error=True)

    # ── Load cookies ─────────────────────────────────────────────────────────
    cpath = pathlib.Path(cookies_file)
    if not cpath.exists():
        yield from _fail(f"[claude-code] Cookie file not found: {cookies_file} → run /harvest")
        return

    try:
        with open(cpath, encoding="utf-8") as f:
            cookies_data = json.load(f)
    except (OSError, ValueError) as e:
        # A corrupt harvest file must produce an error turn, not a traceback.
        yield from _fail(f"[claude-code] Could not read cookie file {cookies_file}: {e} → run /harvest")
        return
    if not isinstance(cookies_data, dict):
        yield from _fail(f"[claude-code] Cookie file {cookies_file} has an unexpected format → run /harvest")
        return

    # ── Session ID ───────────────────────────────────────────────────────────
    session_id = config.get("claude_code_session_id", "")
    if not session_id:
        yield from _fail(
            "[claude-code] No session ID set.\n"
            "Run `claude remote-control` in a terminal, then:\n"
            "  /config claude_code_session_id=session_01VP9K..."
        )
        return

    # Accept a full URL or a bare session id.
    if "/" in session_id:
        session_id = session_id.rstrip("/").split("/")[-1]

    # ── Org ID + activity session from cookies data ──────────────────────────
    org_id = _claude_web_org_id(cookies_data, config)
    activity_session_id = ""
    for c in cookies_data.get("cookies", []):
        if c.get("name") == "activitySessionId":
            activity_session_id = c.get("value", "")
            break

    # ── Build prompt — same as claude-web (handles list content blocks) ──────
    prompt = _consolidate_web_history(messages)

    # ── Payload ──────────────────────────────────────────────────────────────
    url = f"https://claude.ai/v1/sessions/{session_id}/events"
    payload = {
        "events": [
            {
                "type": "user",
                "uuid": str(_uuid.uuid4()),
                "session_id": session_id,
                "parent_tool_use_id": None,
                "message": {
                    "role": "user",
                    "content": prompt,
                },
            }
        ]
    }

    # ── Locate the transcript file ───────────────────────────────────────────
    # NOTE(review): the default directory name is specific to one machine's
    # project path; set config["claude_code_project_dir"] to point elsewhere.
    custom_dir = config.get("claude_code_project_dir")
    if custom_dir:
        session_dir = pathlib.Path(custom_dir).expanduser()
        session_dir_label = str(session_dir)
    else:
        session_dir = pathlib.Path.home() / ".claude" / "projects" / "C--Users-Admin-Desktop-DULUSV2"
        session_dir_label = "~/.claude/projects/C--Users-Admin-Desktop-DULUSV2"

    try:
        jsonl_path = max(
            session_dir.glob("*.jsonl"),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )
    except OSError:
        jsonl_path = None

    def _entry_key(entry: dict, raw_line: str):
        # Entries without uuid/id are identified by their raw text so that
        # they are de-duplicated like every other entry.
        return entry.get("uuid") or entry.get("id") or raw_line

    def _iter_entries():
        """Yield (key, entry) for every JSON-object line of the transcript."""
        with open(jsonl_path, "r", encoding="utf-8", errors="ignore") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    entry = json.loads(raw)
                except ValueError:
                    continue
                if isinstance(entry, dict):
                    yield _entry_key(entry, raw), entry

    def _extract_text(entry: dict) -> str:
        """Return the text of an assistant entry, or "" for anything else."""
        message = entry.get("message", {})
        if not isinstance(message, dict) or message.get("role") != "assistant":
            return ""
        content = message.get("content", "")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            parts = [
                str(block.get("text") or "").strip()
                for block in content
                if isinstance(block, dict) and block.get("type") == "text"
            ]
            return "\n".join(parts).strip()
        return ""

    # ── Seed existing entries BEFORE sending (to detect new ones after) ──────
    seen_keys = set()
    if jsonl_path and jsonl_path.exists():
        try:
            seen_keys.update(key for key, _ in _iter_entries())
        except OSError:
            pass

    # ── Send the event ───────────────────────────────────────────────────────
    error_msg = None
    req_session = _req.Session()
    try:
        for c in cookies_data.get("cookies", []):
            req_session.cookies.set(
                c["name"], c["value"],
                domain=c.get("domain", "claude.ai"),
                path=c.get("path", "/"),
            )
        req_session.headers.update({
            "User-Agent": cookies_data.get("user_agent", "Mozilla/5.0"),
            "Accept": "*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "anthropic-beta": "ccr-byoc-2025-07-29",
            "anthropic-client-feature": "ccr",
            "anthropic-client-platform": "web_claude_ai",
            "anthropic-client-version": "1.0.0",
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
            "Origin": "https://claude.ai",
            "Referer": f"https://claude.ai/code/{session_id}",
        })
        if org_id:
            req_session.headers["x-organization-uuid"] = org_id
        if activity_session_id:
            req_session.headers["x-activity-session-id"] = activity_session_id
        # Merge harvested headers (device id etc.) without overriding the
        # protocol headers set above.
        for k, v in cookies_data.get("headers", {}).items():
            if k.lower() not in ("cookie", "host", "content-length", "content-type",
                                 "anthropic-beta", "anthropic-version"):
                req_session.headers[k] = v

        try:
            resp = req_session.post(url, json=payload, stream=True, timeout=120)
        except Exception as e:
            error_msg = f"[claude-code] Connection error: {e}"
        else:
            try:
                status = resp.status_code
                if status == 404:
                    error_msg = (
                        "[claude-code] Session not found (404). May have expired.\n"
                        "Run `claude remote-control` and update:\n"
                        "  /config claude_code_session_id=<new_id>"
                    )
                elif status in (401, 403):
                    error_msg = f"[claude-code] Auth error {status} — run /harvest."
                elif status != 200:
                    error_msg = f"[claude-code] HTTP {status}: {resp.text[:400]}"
            except Exception as e:
                error_msg = f"[claude-code] Connection error: {e}"
            finally:
                # Fire-and-forget: the reply arrives through the JSONL file.
                resp.close()
    finally:
        req_session.close()

    if error_msg:
        yield from _fail(error_msg)
        return

    # ── Poll JSONL for new assistant entries ─────────────────────────────────
    if not jsonl_path:
        yield from _fail(f"[claude-code] No JSONL session file found in {session_dir_label}")
        return

    poll_interval = 0.3
    silence_window = 2.5  # wait this long after the last new text before flushing
    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
    deadline = _time.monotonic() + 90
    collected = []
    last_new_text_at = 0.0

    while _time.monotonic() < deadline:
        try:
            for key, entry in _iter_entries():
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                new_text = _extract_text(entry)
                if new_text:
                    collected.append(new_text)
                    last_new_text_at = _time.monotonic()
        except OSError:
            # The file may be mid-rotation or briefly locked; retry next poll.
            pass

        if collected and (_time.monotonic() - last_new_text_at) >= silence_window:
            break

        _time.sleep(poll_interval)

    if collected:
        text = "\n\n".join(collected)
        yield TextChunk(text)
        yield AssistantTurn(text, [], 0, 0)
        return

    yield from _fail("[claude-code] Timeout waiting for assistant response (90s).")
1197
+
1198
+
1199
def stream_kimi_web(
    auth_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from kimi.com consumer web using harvested gRPC-Web tokens.

    The request is the harvested ``last_payload`` with ``chat_id`` and
    ``message`` replaced, sent as ``application/connect+json``: each frame is
    a 5-byte prefix (one flag byte + big-endian uint32 length) followed by a
    JSON body. The full reply is accumulated and parsed for tool calls once
    at the end. If the first attempt yields no text, the request is retried
    once as a fresh thread (no chat id / parent id).

    Args:
        auth_file: Path to the harvested auth JSON (``cookies``, ``headers``,
            ``last_payload`` and ``url`` are read).
        model, system: Accepted for signature parity with the other
            ``stream_*`` backends; this function does not read them.
        messages: Chat history, folded into a single prompt string.
        tool_schemas: Tool definitions used to build the prompt manifest.
        config: Mutable settings dict. Reads and writes ``kimi_web_chat_id``
            and ``kimi_web_parent_id``.

    Yields:
        ``TextChunk`` items and finally exactly one ``AssistantTurn``.
    """
    import json
    import os
    import struct
    import urllib.request

    def _fail(message: str):
        yield TextChunk(message)
        yield AssistantTurn(message, [], 0, 0, error=True)

    def _as_dict(value) -> dict:
        # Server frames are untrusted; treat any non-object as empty.
        return value if isinstance(value, dict) else {}

    # 1. Load harvested auth
    if not os.path.exists(auth_file):
        yield from _fail(f"[kimi-web] Auth file not found: {auth_file}. Run harvester first.")
        return

    try:
        with open(auth_file, "r", encoding="utf-8") as f:
            auth_data = json.load(f)
    except (OSError, ValueError) as e:
        yield from _fail(f"[kimi-web] Could not read auth file {auth_file}: {e}. Run harvester first.")
        return
    if not isinstance(auth_data, dict):
        yield from _fail(f"[kimi-web] Auth file {auth_file} has an unexpected format. Run harvester first.")
        return

    url = auth_data.get("url")
    if not url:
        # Without this guard urllib would raise outside any error handler.
        yield from _fail("[kimi-web] Auth file has no request URL. Run harvester first.")
        return

    headers = auth_data.get("headers", {}).copy()
    headers["Cookie"] = "; ".join(
        f"{c['name']}={c['value']}" for c in auth_data.get("cookies", [])
    )
    # Ensure Connect protocol
    headers["Content-Type"] = "application/connect+json"

    # 2. Maintain state (chat_id, parent_id)
    last_payload = _as_dict(auth_data.get("last_payload", {}))
    harvested_message = _as_dict(last_payload.get("message", {}))
    harvested_chat_id = last_payload.get("chat_id")
    harvested_parent_id = harvested_message.get("parent_id")
    config_chat_id = config.get("kimi_web_chat_id")
    config_parent_id = config.get("kimi_web_parent_id")

    chat_id = config_chat_id or harvested_chat_id

    # The parent id must belong to the chat that is actually sent. The config
    # pair is written together while streaming, so it is used whenever the
    # config chat id is the one in effect; the harvested parent is used only
    # when the harvested chat is. Anything else starts without a parent.
    # NOTE(review): assumes the server rejects or ignores a parent id from a
    # different chat — confirm against live behaviour.
    if config_chat_id and config_parent_id:
        parent_id = config_parent_id
    elif harvested_parent_id and chat_id == harvested_chat_id:
        parent_id = harvested_parent_id
    else:
        parent_id = None

    # ── Build prompt from history ────────────────────────────────────────────
    manifest = _format_web_tool_manifest(tool_schemas, config, messages)
    last_user_msg = _consolidate_web_history(messages, manifest)

    payload = last_payload.copy()
    payload["chat_id"] = chat_id
    payload["message"] = {
        "parent_id": parent_id,
        "role": "user",
        "blocks": [{"message_id": "", "text": {"content": last_user_msg}}],
        "scenario": harvested_message.get("scenario", "SCENARIO_K2D5"),
    }

    def _build_request(body: dict):
        """Frame *body* (flag byte 0 + uint32 length + JSON) into a POST."""
        encoded = json.dumps(body, separators=(",", ":")).encode("utf-8")
        framed = struct.pack(">BI", 0, len(encoded)) + encoded
        return urllib.request.Request(url, data=framed, headers=headers, method="POST")

    req = _build_request(payload)

    # ── Streaming with Retries ───────────────────────────────────────────────
    parser = WebToolParser(auto_wrap_json=True)
    chunks = []

    for attempt in range(2):
        # attempt 0: original try
        # attempt 1: retry as a fresh thread if attempt 0 produced nothing
        chunks = []  # never carry partial text from a failed attempt over

        if attempt == 1:
            config.pop("kimi_web_chat_id", None)
            config.pop("kimi_web_parent_id", None)
            yield TextChunk("[kimi-web] Empty response — retrying with fresh thread...\n")
            payload["chat_id"] = None
            payload["message"]["parent_id"] = None
            req = _build_request(payload)

        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                while True:
                    prefix = resp.read(5)
                    if len(prefix) < 5:
                        break
                    _flags, length = struct.unpack(">BI", prefix)
                    body = resp.read(length)
                    if not body:
                        break

                    try:
                        data = json.loads(body.decode("utf-8", errors="ignore"))
                    except ValueError:
                        continue
                    if not isinstance(data, dict):
                        continue

                    op, mask = data.get("op"), data.get("mask")

                    # Capture thread state for the next turn.
                    if op == "set" and mask == "chat":
                        new_chat_id = _as_dict(data.get("chat")).get("id")
                        if new_chat_id:
                            config["kimi_web_chat_id"] = new_chat_id
                    elif op == "set" and mask == "message":
                        msg_info = _as_dict(data.get("message"))
                        role, msg_id = msg_info.get("role"), msg_info.get("id")
                        if role == "user":
                            # Only a placeholder; the assistant id wins.
                            if msg_id and not config.get("kimi_web_parent_id"):
                                config["kimi_web_parent_id"] = msg_id
                        elif role == "assistant" and msg_id:
                            config["kimi_web_parent_id"] = msg_id

                    if (op, mask) in (("set", "block.text"),
                                      ("append", "block.text.content")):
                        block = _as_dict(data.get("block"))
                        content = _as_dict(block.get("text")).get("content", "")
                        if content and isinstance(content, str):
                            chunks.append(content)

            # If we got output, we are done.
            if chunks:
                break

        except Exception as e:
            if attempt == 0:
                continue
            yield from _fail(f"[kimi-web] Error: {e}")
            return

    # Parse the full response once — avoids tool_call tags split across chunks.
    text = ""
    raw_content = "".join(chunks)
    if raw_content:
        text = parser.parse_chunk(raw_content)
        text += parser.flush()
        if text:
            yield TextChunk(text)

    yield AssistantTurn(text, parser.tool_calls, 0, 0)
1349
+
1350
+
1351
+
1352
def stream_gemini_web(
    auth_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from gemini.google.com using the fast REST API with user-provided headers.

    Replays the last intercepted browser request with the exact cookies and
    headers captured by the harvester. The harvester requires the user to
    type 'DULUS' as the message so that it can be located and replaced inside
    the ``f.req`` payload.

    Up to three attempts are made: the original request, a same-thread retry,
    and finally a fresh-thread retry with the stored thread ids cleared. The
    full reply is accumulated and parsed for tool calls once at the end.

    Args:
        auth_file: Path to the harvested auth JSON (``intercepted_requests``,
            ``cookies`` and ``snlm0e`` are read).
        model, system: Accepted for signature parity with the other
            ``stream_*`` backends; this function does not read them.
        messages: Chat history, folded into a single prompt string.
        tool_schemas: Tool definitions used to build the prompt manifest.
        config: Mutable settings dict. Reads and writes ``gemini_web_c_id``
            and ``gemini_web_r_id``; writes ``gemini_web_rc_id``.

    Yields:
        ``TextChunk`` items and finally exactly one ``AssistantTurn``.
    """
    import os
    import urllib.parse

    import requests

    def _fail(message: str):
        yield TextChunk(message)
        yield AssistantTurn(message, [], 0, 0, error=True)

    if not os.path.exists(auth_file):
        yield from _fail(f"[gemini-web] Error: Auth file {auth_file} not found. Run /harvest-gemini.")
        return

    try:
        with open(auth_file, "r", encoding="utf-8") as f:
            auth_data = json.load(f)
    except (OSError, ValueError) as e:
        yield from _fail(f"[gemini-web] Error: Could not read auth file {auth_file}: {e}. Run /harvest-gemini.")
        return
    if not isinstance(auth_data, dict):
        yield from _fail(f"[gemini-web] Error: Auth file {auth_file} has an unexpected format. Run /harvest-gemini.")
        return

    # ── State / Prompt Extraction ────────────────────────────────────────────
    manifest = _format_web_tool_manifest(tool_schemas, config, messages)
    last_user_msg = _consolidate_web_history(messages, manifest)

    # ── Intercepted request ──────────────────────────────────────────────────
    # "or [{}]" also covers an explicitly empty list, which would otherwise
    # raise IndexError on [-1].
    intercepted = auth_data.get("intercepted_requests") or [{}]
    last_req = intercepted[-1]
    url = last_req.get("url")
    if not url:
        yield from _fail("[gemini-web] Error: Intercepted URL not found. Re-harvest.")
        return

    pd_parsed = urllib.parse.parse_qs(last_req.get("post_data", "") or "")
    query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
    base_params = {k: v[0] for k, v in query.items()}

    def find_and_replace(obj, target1, replacement):
        """Replace strings equal to *target1* in place, descending into
        nested containers and into JSON documents embedded in list items."""
        if isinstance(obj, list):
            for i, v in enumerate(obj):
                if isinstance(v, str) and target1 in v:
                    if v == target1:
                        obj[i] = replacement
                    else:
                        # The marker sits inside a JSON-encoded string.
                        try:
                            inner = json.loads(v)
                            find_and_replace(inner, target1, replacement)
                            obj[i] = json.dumps(inner, separators=(',', ':'))
                        except Exception:
                            pass
                elif isinstance(v, (list, dict)):
                    find_and_replace(v, target1, replacement)
        elif isinstance(obj, dict):
            for k, v in obj.items():
                if isinstance(v, str) and target1 in v:
                    if v == target1:
                        obj[k] = replacement
                elif isinstance(v, (list, dict)):
                    find_and_replace(v, target1, replacement)

    def _inject_thread_ids(f_req) -> None:
        """Fill the empty thread slot of the inner request with stored ids.

        Typically f_req looks like
        [null, "[[\"message\",...],[\"es\"],[\"c_id\",\"r_id\"]...]"] and the
        inner JSON string is what gets modified: inner[2] is usually
        [conv_id, reply_to_id].
        """
        c_id = config.get("gemini_web_c_id")
        r_id = config.get("gemini_web_r_id")
        if not (c_id and r_id) or not isinstance(f_req, list):
            return
        for i, val in enumerate(f_req):
            if not (isinstance(val, str) and val.startswith("[")):
                continue
            try:
                inner_req = json.loads(val)
            except ValueError:
                continue
            if not isinstance(inner_req, list) or len(inner_req) <= 2:
                continue
            slot = inner_req[2]
            if not slot or (isinstance(slot, list) and not slot[0]):
                inner_req[2] = [c_id, r_id]
                f_req[i] = json.dumps(inner_req, separators=(',', ':'))

    def _build_attempt(include_thread_ids: bool):
        """Return (query_params, form_fields) for one request attempt.

        Everything is rebuilt from the pristine harvested data each time, so
        ids injected for one attempt can never leak into a later one.
        """
        f_req = None
        source = None
        if "f.req" in pd_parsed:
            f_req, source = json.loads(pd_parsed["f.req"][0]), "post_data"
        elif "f.req" in base_params:
            f_req, source = json.loads(base_params["f.req"]), "params"

        encoded = None
        if source:
            find_and_replace(f_req, "DULUS", last_user_msg)
            if include_thread_ids:
                _inject_thread_ids(f_req)
            encoded = json.dumps(f_req, separators=(',', ':'))

        form = {}
        for k, v in pd_parsed.items():
            if k == "f.req" and source == "post_data":
                form[k] = encoded
            else:
                form[k] = v[0] if isinstance(v, list) else v

        params = dict(base_params)
        if source == "params":
            params["f.req"] = encoded

        # Ensure the 'at' token is present.
        if "at" not in form and auth_data.get("snlm0e"):
            form["at"] = auth_data["snlm0e"]
        return params, form

    def _candidate_text(inner):
        """Return the reply text found in a decoded frame, or None."""
        if not isinstance(inner, list):
            return None
        if (len(inner) > 4
                and isinstance(inner[4], list) and inner[4]
                and isinstance(inner[4][0], list) and len(inner[4][0]) > 1
                and isinstance(inner[4][0][1], list) and inner[4][0][1]
                and inner[4][0][1][0]):
            return inner[4][0][1][0]
        if (len(inner) > 0
                and isinstance(inner[0], list) and len(inner[0]) > 0
                and isinstance(inner[0][0], str) and inner[0][0]):
            return inner[0][0]
        return None

    # ── Headers / Cookies ────────────────────────────────────────────────────
    cookies = {c['name']: c['value'] for c in auth_data.get('cookies', [])}

    # requests recomputes these; drop the harvested values in any casing.
    dropped = {"content-length", "accept-encoding", "content-type"}
    headers = {
        k: v for k, v in last_req.get("headers", {}).items()
        if k.lower() not in dropped
    }
    headers["Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8"

    # ── Streaming with Retries ───────────────────────────────────────────────
    # Accumulate the FULL raw response per attempt and parse <tool_call> tags
    # ONCE at the very end. Per-chunk parsing is fragile here: tags can arrive
    # split across frames or come in a single blob.
    raw_content = ""
    text = ""
    parser = WebToolParser(auto_wrap_json=True)
    frame_errors = (ValueError, TypeError, IndexError, KeyError, AttributeError)

    for attempt in range(3):
        # attempt 0: original try
        # attempt 1: same-thread retry (if attempt 0 was empty)
        # attempt 2: fresh-thread retry (ids cleared)
        raw_content = ""  # the previous attempt may have been incomplete
        seen_len = 0      # length of the longest candidate text seen so far

        if attempt == 1:
            yield TextChunk("[gemini-web] Empty response — retrying same thread...\n")
        elif attempt == 2:
            config.pop("gemini_web_c_id", None)
            config.pop("gemini_web_r_id", None)
            config.pop("gemini_web_rc_id", None)
            yield TextChunk("[gemini-web] Empty response — IDs cleared, retrying with new thread...\n")

        try:
            attempt_params, attempt_form = _build_attempt(include_thread_ids=attempt < 2)
        except ValueError:
            yield from _fail("[gemini-web] Error: Could not parse intercepted f.req. Re-harvest.")
            return

        response = None
        try:
            response = requests.post(
                url.split('?')[0],
                params=attempt_params,
                cookies=cookies,
                headers=headers,
                data=attempt_form,
                stream=True,
                timeout=120,
            )

            if response.status_code != 200:
                if attempt < 2:
                    continue
                yield from _fail(f"[gemini-web] HTTP {response.status_code}: {response.text[:200]}")
                return

            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                try:
                    line = raw_line.decode('utf-8').strip()
                except UnicodeDecodeError:
                    continue
                if not line.startswith('[["wrb.fr"'):
                    continue
                try:
                    envelope = json.loads(line)
                except ValueError:
                    continue
                if not isinstance(envelope, list):
                    continue

                for item in envelope:
                    if not (isinstance(item, list) and len(item) > 2
                            and item[0] == "wrb.fr"
                            and isinstance(item[2], str)
                            and item[2].startswith("[")):
                        continue
                    try:
                        inner = json.loads(item[2])

                        # Capture thread ids for the next turn.
                        if isinstance(inner, list) and len(inner) > 1:
                            ids = inner[1]
                            if isinstance(ids, list) and len(ids) >= 2:
                                if ids[0]:
                                    config["gemini_web_c_id"] = ids[0]
                                if ids[1]:
                                    config["gemini_web_r_id"] = ids[1]

                        # Each frame carries the whole text so far; keep
                        # only the part beyond what was already collected.
                        candidate = _candidate_text(inner)
                        if isinstance(candidate, str) and len(candidate) > seen_len:
                            raw_content += candidate[seen_len:]
                            seen_len = len(candidate)
                            try:
                                if len(inner) > 4 and inner[4][0][0]:
                                    config["gemini_web_rc_id"] = inner[4][0][0]
                            except frame_errors:
                                pass
                    except frame_errors:
                        # Malformed frame: skip it, keep reading the stream.
                        pass
        except Exception as e:
            if attempt < 2:
                continue
            yield from _fail(f"[gemini-web] Protocol Error: {e}")
            return
        finally:
            # Streamed responses hold their connection until closed; this
            # also runs on the retry "continue" paths above.
            if response is not None:
                response.close()

        if raw_content:
            break

    # Parse the full response once — avoids tool_call tags split across chunks.
    if raw_content:
        text = parser.parse_chunk(raw_content)
        text += parser.flush()
        if text:
            yield TextChunk(text)

    if not text and not parser.tool_calls:
        yield AssistantTurn("[gemini-web: no response after retries]", [], 0, 0)
    else:
        yield AssistantTurn(text, parser.tool_calls, 0, 0)
1636
+
1637
+
1638
+
1639
class _DeepSeekPoWSolver:
    """Lazy-initialized WASM PoW solver for DeepSeek web (sha3_wasm_bg).

    Use ``_DeepSeekPoWSolver.get()`` to obtain the shared solver; the WASM
    module is loaded on first use only.
    """

    # Process-wide shared solver, created by get(). The wasmtime Instance is
    # stored separately as ``_wasm_instance`` so the two are never confused.
    _instance = None

    @classmethod
    def get(cls):
        """Return the shared solver, creating it on first call."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Load the WASM module that sits next to this file.

        Raises:
            FileNotFoundError: if the ``.wasm`` file is missing.
        """
        import os
        import wasmtime

        wasm_path = os.path.join(os.path.dirname(__file__), "sha3_wasm_bg.7b9ca65ddd.wasm")
        if not os.path.exists(wasm_path):
            raise FileNotFoundError(f"WASM not found: {wasm_path}")
        self._engine = wasmtime.Engine()
        self._store = wasmtime.Store(self._engine)
        self._module = wasmtime.Module.from_file(self._engine, wasm_path)
        self._wasm_instance = wasmtime.Instance(self._store, self._module, [])
        exports = self._wasm_instance.exports(self._store)
        self._mem = exports["memory"]
        self._sp = exports["__wbindgen_add_to_stack_pointer"]
        self._malloc = exports["__wbindgen_export_0"]
        self._solve = exports["wasm_solve"]

    def _get_mem_array(self):
        """Return a ctypes byte array aliasing the current WASM memory.

        Re-fetched on every use because the pointer and length are read from
        the memory object at call time.
        """
        import ctypes

        ptr = self._mem.data_ptr(self._store)
        size = self._mem.data_len(self._store)
        return ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte * size)).contents

    def _alloc_string(self, s: str):
        """Copy *s* (UTF-8) into WASM memory; return ``(pointer, byte_length)``."""
        import ctypes

        data = s.encode("utf-8")
        ptr = self._malloc(self._store, len(data), 1)
        arr = self._get_mem_array()
        # One bulk copy instead of a per-byte Python loop.
        ctypes.memmove(ctypes.addressof(arr) + ptr, data, len(data))
        return ptr, len(data)

    def solve(self, challenge: str, salt: str, expire_at: int, difficulty: int):
        """Run ``wasm_solve`` for the challenge.

        The call passes *challenge* and the prefix ``"{salt}_{expire_at}_"``
        plus *difficulty* as a float.

        Returns:
            The result as ``int``, or ``None`` when the status word written
            by the solver is 0.
        """
        import struct

        prefix = f"{salt}_{expire_at}_"
        # Reserve 16 bytes of stack for the result; released in ``finally``.
        retptr = self._sp(self._store, -16)
        try:
            ch_ptr, ch_len = self._alloc_string(challenge)
            prefix_ptr, prefix_len = self._alloc_string(prefix)
            self._solve(self._store, retptr, ch_ptr, ch_len, prefix_ptr, prefix_len, float(difficulty))
            arr = self._get_mem_array()
            # Result layout: int32 status at +0, 4 unused bytes, float64 at +8.
            raw = bytes(arr[retptr:retptr + 16])
            status, value = struct.unpack("<i4xd", raw)
            if status == 0:
                return None
            return int(value)
        finally:
            self._sp(self._store, 16)
1694
+
1695
+
1696
def stream_deepseek_web(
    auth_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream a reply from chat.deepseek.com using a harvested browser session.

    Request sent by this function:
        POST https://chat.deepseek.com/api/v0/chat/completion
        Headers: Authorization: Bearer <token>, plus x-ds-pow-response when
                 the proof-of-work challenge could be solved
        Body: {model, prompt, ref_file_ids, thinking_enabled, search_enabled,
               stream, chat_session_id?, parent_message_id?}

    The auth file is JSON of the form:
        {
            "token": "...",
            "cookies": [...],
            "headers": {...},
            "chat_session_id": "...",   // optional, for session continuity
            "model": "deepseek_v3"      // internal model name used by the web UI
        }

    Session continuity (chat_session_id / parent_message_id) is read from
    ``~/.dulus/deepseek_chat_state.json`` first, then from ``config``, then
    from the auth file, and is written back to both the state file and
    ``config`` as message ids arrive in the stream.

    Args:
        auth_file: Path to the harvested auth JSON.
        model: Requested model name; "r1" / "v3" / "chat" substrings select
            the internal model name.
        system: System prompt (may be empty).
        messages: Neutral-format conversation history; the last entry is the
            new user turn.
        tool_schemas: Tool schemas forwarded to the web tool manifest builder.
        config: Mutable config dict; session ids are stored in it.

    Yields:
        TextChunk objects (reasoning text as it arrives, then the parsed
        reply) followed by one AssistantTurn. Auth-file, HTTP and stream
        failures are reported as a TextChunk plus an AssistantTurn with
        ``error=True``.
    """
    import os
    import pathlib as _pl

    import requests

    if not os.path.exists(auth_file):
        msg = f"[deepseek-web] Auth file not found: {auth_file}. Run /harvest-deepseek."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    try:
        with open(auth_file, "r", encoding="utf-8") as f:
            auth_data = json.load(f)
    except (OSError, ValueError) as e:
        # Unreadable or malformed auth file: report it like every other failure
        # of this adapter instead of raising out of the generator.
        msg = f"[deepseek-web] Could not read auth file {auth_file}: {e}. Run /harvest-deepseek."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    # ── Load persisted chat state (session + parent message) ─────────────────
    _ds_state_path = _pl.Path.home() / ".dulus" / "deepseek_chat_state.json"
    _ds_state = {}
    if _ds_state_path.exists():
        try:
            with open(_ds_state_path, "r", encoding="utf-8") as _f:
                _ds_state = json.load(_f)
        except Exception:
            _ds_state = {}
        if not isinstance(_ds_state, dict):
            _ds_state = {}

    def _save_ds_state(st: dict):
        # Best effort: losing the state file only costs session continuity.
        try:
            _ds_state_path.parent.mkdir(parents=True, exist_ok=True)
            with open(_ds_state_path, "w", encoding="utf-8") as _f:
                json.dump(st, _f, indent=2)
        except Exception:
            pass

    token = auth_data.get("token") or auth_data.get("authorization", "")
    if token and not token.startswith("Bearer "):
        token = f"Bearer {token}"

    cookies = {c["name"]: c["value"] for c in auth_data.get("cookies", [])}

    # Build the new user message (tool manifest + consolidated history).
    manifest = _format_web_tool_manifest(tool_schemas, config, messages)
    last_user_msg = _consolidate_web_history(messages, manifest)

    # Prior user/assistant turns, text only (last N to stay within limits).
    history = []
    for m in messages[:-1][-20:]:
        role = m.get("role", "user")
        content = m.get("content", "")
        if isinstance(content, list):
            content = " ".join(
                b.get("text", "") for b in content
                if isinstance(b, dict) and b.get("type") == "text"
            )
        if role in ("user", "assistant") and content:
            history.append((role, content))

    # Internal model name (DeepSeek web uses "deepseek_v3" / "deepseek_r1" not "deepseek-v3")
    internal_model = auth_data.get("model", "deepseek_v3")
    model_lower = model.lower()
    if "r1" in model_lower:
        internal_model = "deepseek_r1"
    elif "v3" in model_lower or "chat" in model_lower:
        internal_model = "deepseek_v3"

    # Session continuity — state file has highest priority, then config, then auth_data
    chat_session_id = (
        _ds_state.get("chat_session_id")
        or config.get("deepseek_web_session_id")
        or auth_data.get("chat_session_id")
    )
    parent_message_id = (
        _ds_state.get("parent_message_id")
        or config.get("deepseek_web_parent_id")
    )

    # ── Headers ──────────────────────────────────────────────────────────
    # Header names are case-insensitive and harvested names may be lowercase,
    # so the headers we must own are dropped regardless of casing.
    dropped_headers = {"content-length", "accept-encoding", "content-type"}
    headers = {
        k: v for k, v in (auth_data.get("headers") or {}).items()
        if k.lower() not in dropped_headers
    }
    headers["Content-Type"] = "application/json"
    headers["Accept"] = "text/event-stream"
    if token:
        headers["Authorization"] = token

    url = auth_data.get("url") or "https://chat.deepseek.com/api/v0/chat/completion"

    # DeepSeek web API uses `prompt` (string) not `messages` (array).
    # Conversation history is maintained server-side via chat_session_id.
    if chat_session_id:
        # Server has history — just send the new user message as prompt
        prompt_text = last_user_msg
    else:
        # No session — flatten everything into a single prompt string.
        # The system prompt is added exactly once, ahead of the history.
        parts = []
        if system:
            parts.append(f"[System]: {system}")
        parts.extend(f"[{role.capitalize()}]: {content}" for role, content in history)
        parts.append(last_user_msg)
        prompt_text = "\n\n".join(parts)

    payload = {
        "model": internal_model,
        "prompt": prompt_text,
        "ref_file_ids": [],
        "thinking_enabled": internal_model == "deepseek_r1",
        "search_enabled": False,
        "stream": True,
    }
    if chat_session_id:
        payload["chat_session_id"] = chat_session_id
    if parent_message_id is not None:
        payload["parent_message_id"] = parent_message_id

    text = ""
    raw_content = ""  # accumulate full response before parsing
    parser = WebToolParser(auto_wrap_json=True)
    response = None

    try:
        # Fetch and solve the PoW challenge. Deliberately best effort: if any
        # step fails the completion request is still attempted without the
        # x-ds-pow-response header.
        try:
            pow_resp = requests.post(
                "https://chat.deepseek.com/api/v0/chat/create_pow_challenge",
                cookies=cookies,
                headers={k: v for k, v in headers.items() if k.lower() != "x-ds-pow-response"},
                json={"target_path": "/api/v0/chat/completion"},
                timeout=10,
            )
            if pow_resp.status_code == 200:
                ch = pow_resp.json()["data"]["biz_data"]["challenge"]
                solver = _DeepSeekPoWSolver.get()
                ans = solver.solve(ch["challenge"], ch["salt"], ch["expire_at"], ch["difficulty"])
                if ans is not None:
                    import base64 as _b64
                    pow_obj = {
                        "algorithm": ch["algorithm"],
                        "challenge": ch["challenge"],
                        "salt": ch["salt"],
                        "answer": ans,
                        "signature": ch["signature"],
                        "target_path": ch["target_path"],
                    }
                    headers["x-ds-pow-response"] = _b64.b64encode(
                        json.dumps(pow_obj, separators=(",", ":")).encode()
                    ).decode()
        except Exception:
            pass

        response = requests.post(
            url,
            json=payload,
            headers=headers,
            cookies=cookies,
            stream=True,
            timeout=120,
        )

        if response.status_code == 401:
            msg = "[deepseek-web] Auth error (401) — token expired. Run /harvest-deepseek."
            yield TextChunk(msg)
            yield AssistantTurn(msg, [], 0, 0, error=True)
            return

        if response.status_code != 200:
            msg = f"[deepseek-web] HTTP {response.status_code}: {response.text[:200]}"
            yield TextChunk(msg)
            yield AssistantTurn(msg, [], 0, 0, error=True)
            return

        for raw_line in response.iter_lines():
            if not raw_line:
                continue
            try:
                line = raw_line.decode("utf-8").strip()
            except Exception:
                continue

            if not line.startswith("data:"):
                continue
            data_str = line[5:].strip()
            if data_str == "[DONE]":
                break

            try:
                data = json.loads(data_str)
            except Exception:
                continue
            if not isinstance(data, dict):
                continue

            content_chunk = ""
            thinking_chunk = ""

            # Capture message IDs so the next turn continues this thread.
            if "response_message_id" in data:
                config["deepseek_web_parent_id"] = data["response_message_id"]
                _save_ds_state({
                    "chat_session_id": chat_session_id or data.get("id"),
                    "parent_message_id": data["response_message_id"],
                })
            if data.get("id"):
                config["deepseek_web_session_id"] = data["id"]

            # Native protocol
            p, o, v = data.get("p", ""), data.get("o", ""), data.get("v")
            if p == "response/fragments/-1/content" and o == "APPEND" and isinstance(v, str):
                content_chunk = v
            elif "p" not in data and "o" not in data and isinstance(v, str):
                content_chunk = v
            elif isinstance(v, dict):
                response_obj = v.get("response", {})
                if isinstance(response_obj, dict):
                    for frag in response_obj.get("fragments") or []:
                        if isinstance(frag, dict) and frag.get("type") == "RESPONSE":
                            content_chunk += frag.get("content") or ""

            # Fallback: OpenAI-style SSE deltas
            if not content_chunk and not thinking_chunk:
                for choice in data.get("choices") or []:
                    delta = choice.get("delta", {})
                    thinking_chunk = (
                        delta.get("reasoning_content")
                        or delta.get("thinking_content", "")
                    )
                    content_chunk = delta.get("content", "")

            if thinking_chunk:
                yield TextChunk(thinking_chunk)

            if content_chunk:
                raw_content += content_chunk

    except Exception as e:
        msg = f"[deepseek-web] Error: {e}"
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return
    finally:
        # A streamed response keeps its connection checked out until closed;
        # release it on every exit path (including early break / return).
        if response is not None:
            response.close()

    # Parse the full response once — avoids tool_call tags split across chunks
    if raw_content:
        text = parser.parse_chunk(raw_content)
        text += parser.flush()
    if text:
        yield TextChunk(text)

    yield AssistantTurn(text or "[deepseek-web: no response]", parser.tool_calls, 0, 0)
1974
+
1975
+
1976
def _qwen_extract_message_id(data: dict, chat_id):
    """Return the message id carried by one Qwen stream event, or a falsy value.

    Qwen response shapes vary, so several likely keys are scanned in order.
    A top-level ``id`` is only accepted when it differs from ``chat_id``.
    """
    message = data.get("message")
    captured = (
        data.get("response.message_id")
        or data.get("response_message_id")
        or data.get("message_id")
        or (message.get("id") if isinstance(message, dict) else None)
        or data.get("response_id")
    )
    if not captured:
        for choice in data.get("choices", []) or []:
            msg_obj = choice.get("message") if isinstance(choice, dict) else None
            if isinstance(msg_obj, dict) and msg_obj.get("id"):
                captured = msg_obj["id"]
                break
    if not captured:
        resp_obj = data.get("response", {})
        if isinstance(resp_obj, dict):
            captured = resp_obj.get("id") or resp_obj.get("message_id")
    if not captured and data.get("id") and data.get("id") != chat_id:
        captured = data["id"]
    return captured


def stream_qwen_web(
    auth_file: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream a reply from chat.qwen.ai using a harvested browser session.

    Request sent by this function:
        POST https://chat.qwen.ai/api/v2/chat/completions?chat_id=<uuid>
        Cookies: token=<JWT>, plus whatever cookies the harvester captured
        Body: {stream:true, version:"2.1", incremental_output:true, chat_id,
               chat_mode:"normal", model, parent_id, messages:[...]}

    The auth file is JSON of the form:
        {
            "token": "<JWT>",
            "cookies": [...],
            "headers": {...},
            "chat_id": "...",
            "parent_id": "...",
            "model": "qwen3.6-plus"
        }

    Thread continuity (chat_id / parent_id) is read from
    ``~/.dulus/qwen_chat_state.json`` first, then ``config``, then the auth
    file; a new chat_id is generated when none is found. If the first request
    returns 400/404 or an empty stream, the call is retried once as a fresh
    thread, and the retry carries the full first-turn message (system prompt
    and tool manifest) because the new thread has no server-side history.

    Args:
        auth_file: Path to the harvested auth JSON.
        model: Requested model name; a ``provider/`` prefix is stripped.
        system: System prompt (may be empty).
        messages: Neutral-format conversation history.
        tool_schemas: Tool schemas forwarded to the web tool manifest builder.
        config: Mutable config dict; chat/parent ids are stored in it.

    Yields:
        TextChunk objects (reasoning text as it arrives, then the parsed
        reply) followed by one AssistantTurn. Auth-file, HTTP and stream
        failures are reported as a TextChunk plus an AssistantTurn with
        ``error=True``.
    """
    import os
    import pathlib as _pl
    import time
    import uuid

    import requests

    if not os.path.exists(auth_file):
        msg = f"[qwen-web] Auth file not found: {auth_file}. Run /harvest-qwen."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    try:
        with open(auth_file, "r", encoding="utf-8") as f:
            auth_data = json.load(f)
    except (OSError, ValueError) as e:
        # Unreadable or malformed auth file: report it like every other failure
        # of this adapter instead of raising out of the generator.
        msg = f"[qwen-web] Could not read auth file {auth_file}: {e}. Run /harvest-qwen."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    # ── Load persisted chat state (chat_id + parent_id across restarts) ──
    _qw_state_path = _pl.Path.home() / ".dulus" / "qwen_chat_state.json"
    _qw_state = {}
    if _qw_state_path.exists():
        try:
            with open(_qw_state_path, "r", encoding="utf-8") as _f:
                _qw_state = json.load(_f)
        except Exception:
            _qw_state = {}
        if not isinstance(_qw_state, dict):
            _qw_state = {}

    def _save_qw_state(st: dict):
        # Best effort: losing the state file only costs thread continuity.
        try:
            _qw_state_path.parent.mkdir(parents=True, exist_ok=True)
            with open(_qw_state_path, "w", encoding="utf-8") as _f:
                json.dump(st, _f, indent=2)
        except Exception:
            pass

    cookies = {c["name"]: c["value"] for c in auth_data.get("cookies", [])}
    if auth_data.get("token") and "token" not in cookies:
        cookies["token"] = auth_data["token"]

    # Session continuity — state file (most fresh) > config > auth_data > new
    chat_id = (
        _qw_state.get("chat_id")
        or config.get("qwen_web_chat_id")
        or auth_data.get("chat_id")
        or str(uuid.uuid4())
    )
    parent_id = (
        _qw_state.get("parent_id")
        or config.get("qwen_web_parent_id")
        or auth_data.get("parent_id")
    )

    def _first_turn_content() -> str:
        # Full opening message: system prompt + tool manifest + history.
        manifest = _format_web_tool_manifest(tool_schemas, config, messages)
        content = _consolidate_web_history(messages, manifest)
        if system:
            content = f"[System]: {system}\n\n{content}"
        return content

    # Build conversation history. Qwen's server keeps the thread (chat_id +
    # parent_id), so on continuation turns we send ONLY the new user content
    # + tool results — re-sending the system prompt and tool manifest every
    # turn wastes 1-2K tokens per call.
    is_first_turn = not parent_id
    if is_first_turn:
        last_user_msg = _first_turn_content()
    else:
        last_user_msg = _consolidate_web_history(messages, "")

    fid = str(uuid.uuid4())
    next_child_id = str(uuid.uuid4())
    ts = int(time.time())

    # Internal model name — strip provider prefix if any
    internal_model = bare_model(model)
    if not internal_model or internal_model == "qwen-latest":
        internal_model = auth_data.get("model") or "qwen3.6-plus"

    # ── Headers ──────────────────────────────────────────────────────────
    # Header names are case-insensitive and harvested names may be lowercase,
    # so the headers we must own are dropped regardless of casing.
    dropped_headers = {"content-length", "accept-encoding", "content-type", "cookie"}
    headers = {
        k: v for k, v in (auth_data.get("headers") or {}).items()
        if k.lower() not in dropped_headers
    }
    headers["Content-Type"] = "application/json"
    headers["Accept"] = "application/json"
    headers.setdefault("Origin", "https://chat.qwen.ai")
    headers.setdefault("Referer", f"https://chat.qwen.ai/c/{chat_id}")
    headers.setdefault("source", "web")
    headers.setdefault("Version", "0.2.45")
    headers["X-Request-Id"] = str(uuid.uuid4())

    user_message = {
        "fid": fid,
        "parentId": parent_id,
        "childrenIds": [next_child_id],
        "role": "user",
        "content": last_user_msg,
        "user_action": "chat",
        "files": [],
        "timestamp": ts,
        "models": [internal_model],
        "chat_type": "t2t",
        "feature_config": {
            "thinking_enabled": False,
            "output_schema": "phase",
            "research_mode": "normal",
            "auto_thinking": False,
            "thinking_mode": "Auto",
            "thinking_format": "summary",
            "auto_search": False,
        },
        "extra": {"meta": {"subChatType": "t2t"}},
        "sub_chat_type": "t2t",
        "parent_id": parent_id,
    }

    payload = {
        "stream": True,
        "version": "2.1",
        "incremental_output": True,
        "chat_id": chat_id,
        "chat_mode": "normal",
        "model": internal_model,
        "parent_id": parent_id,
        "messages": [user_message],
        "timestamp": ts,
    }

    url = "https://chat.qwen.ai/api/v2/chat/completions"
    params = {"chat_id": chat_id}

    raw_content = ""
    text = ""
    parser = WebToolParser(auto_wrap_json=True)
    last_saved_state = None  # avoids rewriting the state file on every event

    # ── 2-attempt loop: if chat was deleted server-side (404 / 400 / empty
    # stream) regenerate chat_id+parent_id once and retry as a fresh thread.
    for attempt in range(2):
        if attempt == 1:
            config.pop("qwen_web_chat_id", None)
            config.pop("qwen_web_parent_id", None)
            _save_qw_state({})
            last_saved_state = None
            chat_id = str(uuid.uuid4())
            parent_id = None
            params["chat_id"] = chat_id
            payload["chat_id"] = chat_id
            payload["parent_id"] = None
            user_message["parentId"] = None
            user_message["parent_id"] = None
            if not is_first_turn:
                # The fresh thread has no server-side history, so the
                # continuation-only message is replaced by a full first turn.
                user_message["content"] = _first_turn_content()
            payload["messages"] = [user_message]
            yield TextChunk("[qwen-web] Chat unavailable — retrying with fresh thread...\n")
            raw_content = ""

        try:
            response = requests.post(
                url, params=params, json=payload,
                headers=headers, cookies=cookies,
                stream=True, timeout=120,
            )
        except Exception as e:
            msg = f"[qwen-web] Error: {e}"
            yield TextChunk(msg)
            yield AssistantTurn(msg, [], 0, 0, error=True)
            return

        try:
            if response.status_code == 401:
                msg = "[qwen-web] Auth error (401) — token expired. Run /harvest-qwen."
                yield TextChunk(msg)
                yield AssistantTurn(msg, [], 0, 0, error=True)
                return

            if response.status_code in (400, 404) and attempt == 0:
                continue  # likely chat deleted — retry with fresh thread

            if response.status_code != 200:
                msg = f"[qwen-web] HTTP {response.status_code}: {response.text[:300]}"
                yield TextChunk(msg)
                yield AssistantTurn(msg, [], 0, 0, error=True)
                return

            try:
                for raw_line in response.iter_lines():
                    if not raw_line:
                        continue
                    try:
                        line = raw_line.decode("utf-8").strip()
                    except Exception:
                        continue

                    # Qwen uses SSE-style "data: {...}" lines
                    data_str = line[5:].strip() if line.startswith("data:") else line
                    if data_str == "[DONE]":
                        break
                    if not data_str:
                        continue

                    try:
                        data = json.loads(data_str)
                    except Exception:
                        continue

                    if not isinstance(data, dict):
                        continue

                    content_chunk = ""

                    # ── Capture assistant message ID for thread continuity ──
                    # Whatever ID we land on becomes the next turn's parent_id;
                    # without it every turn looks like a fresh chat to Qwen.
                    captured_id = _qwen_extract_message_id(data, chat_id)
                    if captured_id:
                        config["qwen_web_parent_id"] = captured_id
                        snapshot = {
                            "chat_id": config.get("qwen_web_chat_id") or chat_id,
                            "parent_id": captured_id,
                        }
                        if snapshot != last_saved_state:
                            _save_qw_state(snapshot)
                            last_saved_state = snapshot

                    if data.get("chat_id") and not config.get("qwen_web_chat_id"):
                        config["qwen_web_chat_id"] = data["chat_id"]

                    # Try multiple shapes the Qwen API has been seen using:
                    #   1) {"choices":[{"delta":{"content":"..."}}]}
                    for choice in data.get("choices", []) or []:
                        delta = choice.get("delta", {}) if isinstance(choice, dict) else {}
                        if isinstance(delta, dict):
                            c = delta.get("content")
                            if isinstance(c, str):
                                content_chunk += c
                            rc = delta.get("reasoning_content") or delta.get("thinking_content")
                            if isinstance(rc, str) and rc:
                                yield TextChunk(rc)

                    #   2) {"output":{"text":"...", "finish_reason":...}}
                    if not content_chunk:
                        output = data.get("output", {})
                        if isinstance(output, dict):
                            t = output.get("text") or output.get("content")
                            if isinstance(t, str):
                                content_chunk = t

                    #   3) {"content":"..."} (rare flat form)
                    if not content_chunk and isinstance(data.get("content"), str):
                        content_chunk = data["content"]

                    if content_chunk:
                        raw_content += content_chunk
            except Exception as e:
                msg = f"[qwen-web] Error: {e}"
                yield TextChunk(msg)
                yield AssistantTurn(msg, [], 0, 0, error=True)
                return
        finally:
            # A streamed response keeps its connection checked out until
            # closed; release it on every exit path, including the retry.
            response.close()

        # If first attempt produced nothing, retry with a fresh thread once
        if not raw_content and attempt == 0:
            continue

        break  # success — exit retry loop

    # Parse the full response once — avoids tool_call tags split across chunks
    if raw_content:
        text = parser.parse_chunk(raw_content)
        text += parser.flush()
    if text:
        yield TextChunk(text)

    # Persist next-turn state in config + disk (covers the case where the
    # chat_id was generated client-side and never echoed back in the stream).
    if not config.get("qwen_web_chat_id"):
        config["qwen_web_chat_id"] = chat_id
    _save_qw_state({
        "chat_id": config.get("qwen_web_chat_id") or chat_id,
        "parent_id": config.get("qwen_web_parent_id"),
    })

    yield AssistantTurn(text or "[qwen-web: no response]", parser.tool_calls, 0, 0)
2296
+
2297
+
2298
def bare_model(model: str) -> str:
    """Return ``model`` without its leading ``provider/`` segment, if it has one."""
    _provider, slash, remainder = model.partition("/")
    if not slash:
        return model
    return remainder
2301
+
2302
+
2303
def get_api_key(provider_name: str, config: dict) -> str:
    """Resolve the API key for ``provider_name``.

    Lookup order (first non-empty value wins):
        1. ``config["<provider_name>_api_key"]``, then the same key with
           hyphens in the provider name replaced by underscores (e.g.
           ``kimi-code`` -> ``kimi_code_api_key``), then the moonshot <-> kimi
           alias key.
        2. The environment variable named by the provider's ``api_key_env``.
        3. The provider's hardcoded ``api_key`` (for local providers).

    Returns:
        The key, or ``""`` when nothing is configured.
    """
    import os

    prov = PROVIDERS.get(provider_name, {})

    # 1. Check config dict (e.g. config["kimi_api_key"])
    config_keys = [f"{provider_name}_api_key"]
    underscored = provider_name.replace("-", "_")
    if underscored != provider_name:
        config_keys.append(f"{underscored}_api_key")
    alias = {"moonshot": "kimi", "kimi": "moonshot"}.get(provider_name)
    if alias:
        config_keys.append(f"{alias}_api_key")
    for key_name in config_keys:
        cfg_key = config.get(key_name, "")
        if cfg_key:
            return cfg_key

    # 2. Check env var. An unset/empty variable falls through to step 3
    #    instead of short-circuiting with "".
    env_var = prov.get("api_key_env")
    if env_var:
        env_key = os.environ.get(env_var, "")
        if env_key:
            return env_key

    # 3. Hardcoded (for local providers)
    return prov.get("api_key", "")
2334
+
2335
+
2336
def calc_cost(model: str, in_tok: int, out_tok: int) -> float:
    """Return the cost of a call from the ``COSTS`` rate table.

    Rates are looked up by the model name without its provider prefix and are
    applied per million tokens; unknown models cost 0.
    """
    in_rate, out_rate = COSTS.get(bare_model(model), (0.0, 0.0))
    weighted_tokens = in_tok * in_rate + out_tok * out_rate
    return weighted_tokens / 1_000_000
2339
+
2340
+
2341
+ def estimate_tokens_kimi(api_key: str, model: str, messages: list) -> int | None:
2342
+ """Estimate token count using Kimi's native API endpoint.
2343
+
2344
+ Args:
2345
+ api_key: Moonshot API key
2346
+ model: Model name (e.g., "kimi-k2.5")
2347
+ messages: List of message dicts with "role" and "content"
2348
+ Returns:
2349
+ Estimated token count, or None if the request fails
2350
+ """
2351
+ if not api_key:
2352
+ return None
2353
+
2354
+ url = "https://api.moonshot.ai/v1/tokenizers/estimate-token-count"
2355
+
2356
+ # Convert messages to Kimi format (similar to OpenAI format)
2357
+ kimi_messages = []
2358
+ for m in messages:
2359
+ role = m.get("role", "user")
2360
+ content = m.get("content", "")
2361
+ if isinstance(content, str):
2362
+ kimi_messages.append({"role": role, "content": content})
2363
+ elif isinstance(content, list):
2364
+ # Multimodal content - extract text parts
2365
+ text_parts = []
2366
+ for part in content:
2367
+ if isinstance(part, dict) and part.get("type") == "text":
2368
+ text_parts.append(part.get("text", ""))
2369
+ if text_parts:
2370
+ kimi_messages.append({"role": role, "content": " ".join(text_parts)})
2371
+
2372
+ payload = {
2373
+ "model": model,
2374
+ "messages": kimi_messages
2375
+ }
2376
+
2377
+ try:
2378
+ req = urllib.request.Request(
2379
+ url,
2380
+ data=json.dumps(payload).encode("utf-8"),
2381
+ headers={
2382
+ "Content-Type": "application/json",
2383
+ "Authorization": f"Bearer {api_key}"
2384
+ }
2385
+ )
2386
+ with urllib.request.urlopen(req, timeout=10) as resp:
2387
+ data = json.loads(resp.read().decode("utf-8"))
2388
+ # Response: {"data": {"total_tokens": 123}}
2389
+ if "data" in data and "total_tokens" in data["data"]:
2390
+ return data["data"]["total_tokens"]
2391
+ return None
2392
+ except Exception:
2393
+ # Silently fail - caller will fall back to character-based estimation
2394
+ return None
2395
+
2396
+
2397
+ # ── Tool schema conversion ─────────────────────────────────────────────────
2398
+
2399
def scrub_any_type(obj: Any) -> Any:
    """Return a copy of ``obj`` with every ``"type": "any"`` entry removed.

    Dicts and lists are rebuilt recursively; any other value is returned
    unchanged. ``"any"`` is not a valid JSON Schema type.
    """
    if isinstance(obj, dict):
        return {
            key: scrub_any_type(value)
            for key, value in obj.items()
            if not (key == "type" and value == "any")
        }
    if isinstance(obj, list):
        return [scrub_any_type(element) for element in obj]
    return obj
2411
+
2412
+
2413
def tools_to_openai(tool_schemas: list) -> list:
    """Convert Anthropic-style tool schemas to OpenAI function-calling format.

    Entries that are not dicts or have no "name" are skipped. The parameter
    schema is taken from "input_schema" (Anthropic) or "parameters" (OpenAI);
    when neither is present an empty object schema is used.
    """
    converted = []
    for tool in tool_schemas:
        if not (isinstance(tool, dict) and "name" in tool):
            continue

        schema = tool.get("input_schema") or tool.get("parameters")
        if schema is None:
            # Fallback to an empty object if missing, better than crashing
            schema = {"type": "object", "properties": {}}

        function_spec = {
            "name": tool["name"],
            "description": tool.get("description", ""),
            # Drop invalid 'any' types before handing the schema over
            "parameters": scrub_any_type(schema),
        }
        converted.append({"type": "function", "function": function_spec})
    return converted
2438
+
2439
+
2440
+ # ── Message format conversion ──────────────────────────────────────────────
2441
+ #
2442
+ # Internal "neutral" message format:
2443
+ # {"role": "user", "content": "text"}
2444
+ # {"role": "assistant", "content": "text", "tool_calls": [
2445
+ # {"id": "...", "name": "...", "input": {...}}
2446
+ # ]}
2447
+ # {"role": "tool", "tool_call_id": "...", "name": "...", "content": "..."}
2448
+
2449
def messages_to_anthropic(messages: list) -> list:
    """Convert neutral messages → Anthropic API format.

    User messages pass through. Assistant messages become a list of content
    blocks (thinking, text, tool_use, in that order). Each run of consecutive
    tool messages is folded into a single user message made of tool_result
    blocks. Messages with any other role are dropped.
    """
    converted = []
    pending_results = []  # tool_result blocks of the current run of tool messages

    for message in messages:
        role = message["role"]

        if role == "tool":
            pending_results.append({
                "type": "tool_result",
                "tool_use_id": message["tool_call_id"],
                "content": message["content"],
            })
            continue

        # Any non-tool message ends the current run of tool results.
        if pending_results:
            converted.append({"role": "user", "content": pending_results})
            pending_results = []

        if role == "user":
            converted.append({"role": "user", "content": message["content"]})
        elif role == "assistant":
            blocks = []
            thinking = message.get("thinking", "")
            if thinking:
                blocks.append({"type": "thinking", "thinking": thinking})
            text = message.get("content", "")
            if text:
                blocks.append({"type": "text", "text": text})
            blocks.extend(
                {
                    "type": "tool_use",
                    "id": call["id"],
                    "name": call["name"],
                    "input": call["input"],
                }
                for call in message.get("tool_calls", [])
            )
            converted.append({"role": "assistant", "content": blocks})

    if pending_results:
        converted.append({"role": "user", "content": pending_results})

    return converted
2497
+
2498
+
2499
def messages_to_openai(messages: list, ollama_native_images: bool = False) -> list:
    """Convert neutral messages → OpenAI API format.

    Before conversion, assistant tool_calls that have no matching tool
    response (e.g. the user interrupted mid-call) are removed so the API does
    not reject the history. An assistant message left with no tool_calls at
    all is reduced to its text, or "(interrupted)" when it has none.

    Args:
        messages: Neutral-format messages.
        ollama_native_images: When true, user images are passed as a bare
            base64 list on the message (Ollama /api/chat); otherwise they are
            emitted as multipart ``image_url`` parts.
    """
    # ── Sanitize orphan tool_calls ────────────────────────────────────────
    answered_ids = {
        m.get("tool_call_id") for m in messages if m.get("role") == "tool"
    }

    def _without_orphans(message):
        if message.get("role") != "assistant" or not message.get("tool_calls"):
            return message
        answered = [tc for tc in message["tool_calls"] if tc.get("id") in answered_ids]
        if answered:
            return {**message, "tool_calls": answered}
        # All tool_calls are orphans — strip them, keep text content only
        return {"role": "assistant", "content": message.get("content") or "(interrupted)"}

    def _user(message):
        content = message["content"]
        images = message.get("images")
        if not images:
            return {"role": "user", "content": content}
        if ollama_native_images:
            # Ollama /api/chat native: bare base64 list on the message
            return {"role": "user", "content": content, "images": message["images"]}
        # OpenAI / Gemini multipart vision format
        parts = [{"type": "text", "text": content}]
        parts.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}}
            for img_b64 in message["images"]
        )
        return {"role": "user", "content": parts}

    def _tool_call(call):
        entry = {
            "id": call["id"],
            "type": "function",
            "function": {
                "name": call["name"],
                "arguments": json.dumps(call["input"], ensure_ascii=False),
            },
        }
        # Pass through provider-specific fields (e.g. Gemini thought_signature)
        if call.get("extra_content"):
            entry["extra_content"] = call["extra_content"]
        return entry

    def _assistant(message):
        out: dict = {"role": "assistant", "content": message.get("content") or None}
        if message.get("thinking"):
            out["reasoning_content"] = message["thinking"]
        calls = message.get("tool_calls", [])
        if calls:
            out["tool_calls"] = [_tool_call(call) for call in calls]
        return out

    result = []
    for message in map(_without_orphans, messages):
        role = message["role"]
        if role == "user":
            result.append(_user(message))
        elif role == "assistant":
            result.append(_assistant(message))
        elif role == "tool":
            result.append({
                "role": "tool",
                "tool_call_id": message["tool_call_id"],
                "content": message["content"],
            })
        # any other role is dropped
    return result
2576
+
2577
+
2578
+ # ── Streaming adapters ─────────────────────────────────────────────────────
2579
+
2580
class TextChunk:
    """One piece of text emitted by a streaming adapter."""

    def __init__(self, text):
        self.text = text
2582
+
2583
class ThinkingChunk:
    """One piece of thinking text emitted by a streaming adapter."""

    def __init__(self, text):
        self.text = text
2585
+
2586
class AssistantTurn:
    """Completed assistant turn: final text, tool calls, thinking and token usage."""

    def __init__(
        self,
        text,
        tool_calls,
        in_tokens,
        out_tokens,
        thinking="",
        error=False,
        cache_creation_tokens=0,
        cache_read_tokens=0,
    ):
        self.text = text
        # list of {id, name, input}
        self.tool_calls = tool_calls
        self.in_tokens = in_tokens
        self.out_tokens = out_tokens
        self.thinking = thinking
        self.error = error
        # Anthropic explicit caching + OpenAI prompt-cached tokens;
        # both stay 0 when the provider doesn't report them.
        self.cache_creation_tokens = cache_creation_tokens
        self.cache_read_tokens = cache_read_tokens
2600
+
2601
+
2602
def friendly_api_error(exc: Exception) -> str:
    """Map common API exceptions to short, actionable hints for the user.

    Classification looks at the lower-cased exception message and at the
    exception class name. The checks run in a fixed order and the first match
    wins.

    Args:
        exc: The exception raised by a provider SDK or HTTP call.

    Returns:
        A single-line string suitable for streaming back to the REPL. Falls
        back to ``"API error: <exc>"`` when no pattern matches.
    """
    import re  # local import keeps this block self-contained

    s = str(exc).lower()
    etype = type(exc).__name__

    def has_status(*codes: str) -> bool:
        # Match an HTTP status only as a standalone number. A plain substring
        # test would treat "requested 4401 tokens" as a 401 and misreport a
        # context-length error as a bad API key.
        return any(re.search(rf"(?<!\d){code}(?!\d)", s) for code in codes)

    def mentions(*needles: str) -> bool:
        return any(needle in s for needle in needles)

    # Auth / key problems
    if mentions("authentication", "invalid_api_key") or has_status("401") or etype == "AuthenticationError":
        return "API key is missing or invalid. Run /config <provider>_api_key=... or set the env var."
    # Rate limit
    if mentions("rate limit", "rate_limit") or has_status("429") or etype == "RateLimitError":
        return "Rate limit hit. Wait a bit and retry, or switch model with /model."
    # Overload / capacity
    if mentions("overloaded", "capacity") or has_status("503", "502"):
        return "Provider is overloaded right now. Retry in a few seconds or switch model."
    # Context / token limit
    if mentions("context_length", "maximum context", "too many tokens", "context_window"):
        return "Context window exceeded. Try /compact to shrink history or /clear to reset."
    # Bad request / tool schema
    if mentions("invalid_request") or has_status("400") or etype == "BadRequestError":
        return f"API rejected the request: {exc}. Check tool schemas, message format, or model name."
    # Network / DNS. "timed out" covers messages such as "Request timed out."
    # that contain neither "timeout" nor "connection".
    if mentions("connection", "timeout", "timed out", "dns") or etype in (
        "APIConnectionError", "APITimeoutError", "ConnectTimeout", "ReadTimeout",
    ):
        return "Network problem reaching the API. Check connection, VPN, or provider status."
    # Permission / model access
    if mentions("permission", "model_not_found") or has_status("404"):
        return "Model not found or not enabled for your account. Check model name or billing."
    return f"API error: {exc}"
2633
+
2634
+
2635
+ def _thinking_level_from(value) -> int:
2636
+ """Coerce legacy bool/int thinking config into an int 0-4."""
2637
+ if value is True: return 3
2638
+ if value is False or value is None: return 0
2639
+ try:
2640
+ lvl = int(value)
2641
+ except (TypeError, ValueError):
2642
+ return 0
2643
+ return max(0, min(4, lvl))
2644
+
2645
+
2646
def stream_anthropic(
    api_key: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from Anthropic API. Yields TextChunk/ThinkingChunk, then AssistantTurn.

    Prompt caching: marks up to 3 cache breakpoints — system prompt, tools
    block, and the latest user message. Anthropic caches everything BEFORE
    each breakpoint, so the conversation history up to the latest user turn
    rides the same cache as long as it's appended (not edited). 4-breakpoint
    cap is the API limit; 3 is the practical sweet spot for an agent loop.

    Args:
        api_key: Anthropic API key.
        model: Model name passed straight to the API.
        system: System prompt; a non-empty string is wrapped in a cached text
            block, anything else is forwarded unchanged.
        messages: Conversation history, converted via ``messages_to_anthropic``.
        tool_schemas: Tool definitions; omitted from the request when empty.
        config: Reads ``max_tokens`` (default 8192), ``thinking`` and
            ``thinking_budget``.

    Yields:
        ``TextChunk`` / ``ThinkingChunk`` for each delta, then one
        ``AssistantTurn``. Any exception is turned into an error-flagged
        ``AssistantTurn`` whose text comes from ``friendly_api_error``.
    """
    import anthropic as _ant
    client = _ant.Anthropic(api_key=api_key)

    # 1) System prompt as a single text block with cache_control.
    if isinstance(system, str) and system:
        system_blocks = [{
            "type": "text",
            "text": system,
            "cache_control": {"type": "ephemeral"},
        }]
    else:
        system_blocks = system  # already structured, leave as-is

    # 2) Tools: cache the last tool's schema. Caches the whole tools array.
    #    Work on copies so the caller's schema list and dicts are not mutated.
    cached_tools = list(tool_schemas) if tool_schemas else tool_schemas
    if cached_tools:
        last_tool = dict(cached_tools[-1])
        last_tool["cache_control"] = {"type": "ephemeral"}
        cached_tools[-1] = last_tool

    # 3) Latest user message: marker on the last content block. Caches the
    #    full prior conversation so multi-turn sessions hit the cache.
    ant_messages = messages_to_anthropic(messages)
    for m in reversed(ant_messages):
        if m.get("role") != "user":
            continue
        c = m.get("content")
        if isinstance(c, str):
            m["content"] = [{
                "type": "text",
                "text": c,
                "cache_control": {"type": "ephemeral"},
            }]
        elif isinstance(c, list) and c:
            last = c[-1]
            if isinstance(last, dict):
                # Don't double-mark if caller already set it.
                last.setdefault("cache_control", {"type": "ephemeral"})
        break  # only the most recent user message gets a breakpoint

    kwargs = {
        "model": model,
        "max_tokens": config.get("max_tokens") or 8192,
        "system": system_blocks,
        "messages": ant_messages,
    }
    # Only send `tools` when there is something to send; a None value would
    # otherwise be forwarded to the SDK instead of being left out.
    if cached_tools:
        kwargs["tools"] = cached_tools

    _thk_raw = config.get("thinking", 0)
    _thk_level = _thinking_level_from(_thk_raw)
    if _thk_level > 0:
        # Budget scales with level: 1=low, 2=medium, 3=high, 4=normal (mid). Explicit
        # thinking_budget in config still wins when provided.
        _level_budgets = {1: 2048, 2: 6000, 3: 16000, 4: 8192}
        budget = config.get("thinking_budget") or _level_budgets[_thk_level]
        # The API requires budget_tokens to be strictly less than max_tokens.
        # With the 8192 default, levels 3 and 4 would be rejected, so grow
        # max_tokens to keep the original answer allowance on top of the budget.
        if kwargs["max_tokens"] <= budget:
            kwargs["max_tokens"] = budget + kwargs["max_tokens"]
        kwargs["thinking"] = {
            "type": "enabled",
            "budget_tokens": budget,
        }

    tool_calls = []
    text = ""
    thinking = ""

    try:
        with client.messages.stream(**kwargs) as stream:
            for event in stream:
                etype = getattr(event, "type", None)
                if etype == "content_block_delta":
                    delta = event.delta
                    dtype = getattr(delta, "type", None)
                    if dtype == "text_delta":
                        text += delta.text
                        yield TextChunk(delta.text)
                    elif dtype == "thinking_delta":
                        thinking += delta.thinking
                        yield ThinkingChunk(delta.thinking)

            # Tool calls are read from the assembled final message rather than
            # from the incremental deltas.
            final = stream.get_final_message()
            for block in final.content:
                if block.type == "tool_use":
                    tool_calls.append({
                        "id": block.id,
                        "name": block.name,
                        "input": block.input,
                    })

            _cc = getattr(final.usage, "cache_creation_input_tokens", 0) or 0
            _cr = getattr(final.usage, "cache_read_input_tokens", 0) or 0
            yield AssistantTurn(
                text, tool_calls,
                final.usage.input_tokens,
                final.usage.output_tokens,
                thinking=thinking,
                cache_creation_tokens=_cc,
                cache_read_tokens=_cr,
            )
    except Exception as _e:
        msg = friendly_api_error(_e)
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return
2764
+
2765
+
2766
def stream_kimi(
    api_key: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from Kimi API using native HTTP requests. Yields TextChunk, then AssistantTurn.

    This is a native implementation using urllib.request instead of the OpenAI SDK,
    allowing direct comparison with the OpenAI-compatible version.

    Token accounting:
      1. Input tokens: estimated up front as characters / 4, then replaced by
         the ``usage.prompt_tokens`` value if the stream reports one.
      2. Output tokens: taken from the ``usage`` field of the streamed response.

    Args:
        api_key: Bearer token for the Moonshot endpoint.
        model: Model name sent in the payload.
        system: System prompt, sent as the first message.
        messages: Conversation history, converted via ``messages_to_openai``.
        tool_schemas: Tool definitions, converted via ``tools_to_openai``.
        config: Reads ``thinking``, ``no_tools``, ``disable_tool_choice``,
            ``max_tokens``, ``temperature`` and ``top_p``.

    Yields:
        ``TextChunk`` / ``ThinkingChunk`` per delta, then one ``AssistantTurn``.
        Connection and HTTP errors raised while opening the request become an
        error-flagged ``AssistantTurn``.
    """
    url = "https://api.moonshot.ai/v1/chat/completions"

    # Build messages
    kimi_messages = [{"role": "system", "content": system}] + messages_to_openai(messages)

    # Kimi rejects assistant messages with null/empty content and no tool_calls
    # (happens when a prior turn was thinking-only or interrupted).
    # Replace empty content with a placeholder so the conversation chain stays valid.
    for _m in kimi_messages:
        if _m.get("role") == "assistant" and not _m.get("tool_calls"):
            if not _m.get("content"):
                _m["content"] = "..."

    # Build request payload
    payload: dict = {
        "model": model,
        "messages": kimi_messages,
        "stream": True,
        "stream_options": {"include_usage": True},  # ensure token usage in stream
    }

    # Kimi thinking control
    thinking_mode = "enabled" if config.get("thinking", False) else "disabled"
    payload["thinking"] = {"type": thinking_mode}

    # Tools
    if tool_schemas and not config.get("no_tools"):
        payload["tools"] = tools_to_openai(tool_schemas)
        if not config.get("disable_tool_choice"):
            payload["tool_choice"] = "auto"

    # Max tokens (Kimi prefers max_completion_tokens like OpenAI new API)
    if config.get("max_tokens"):
        prov_cap = PROVIDERS.get("kimi", {}).get("max_completion_tokens")
        mt = config["max_tokens"]
        payload["max_completion_tokens"] = min(mt, prov_cap) if prov_cap else mt

    # Extra options
    if config.get("temperature") is not None:
        payload["temperature"] = config["temperature"]
    if config.get("top_p") is not None:
        payload["top_p"] = config["top_p"]

    # Make request
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        },
        method="POST"
    )

    text = ""
    thinking = ""
    tool_buf: dict = {}
    out_tok = 0
    cached_tok = 0

    # Simple input-token estimate (characters / 4). Approximate; it is only
    # used when the stream does not report prompt_tokens.
    total_chars = len(system or "") + sum(len(str(m.get("content", ""))) for m in messages)
    in_tok = max(1, total_chars // 4)

    try:
        resp = urllib.request.urlopen(req, timeout=300)
    except urllib.error.HTTPError as e:
        try:
            err_body = e.read().decode("utf-8")
            err_data = json.loads(err_body)
            err_msg = err_data.get("error", {}).get("message", str(e))
        except Exception:
            # Body missing, not JSON, or not shaped as expected: fall back to
            # the exception text rather than masking the HTTP failure.
            err_msg = str(e)
        msg = f"Error: Kimi API error: {err_msg}"
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return
    except Exception as e:
        msg = f"Error: Failed to connect to Kimi API: {e}"
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    # Parse SSE stream. The `with` closes the HTTP response on every exit path
    # (normal end, [DONE], an exception, or the consumer dropping the generator).
    with resp:
        for line in resp:
            line = line.decode("utf-8").strip()
            if not line or not line.startswith("data:"):
                continue

            data_str = line[5:].strip()  # Remove "data:" prefix (space optional)
            if data_str == "[DONE]":
                break

            try:
                data = json.loads(data_str)
            except json.JSONDecodeError:
                continue

            # Extract usage if present
            if "usage" in data and data["usage"]:
                u = data["usage"]
                in_tok = u.get("prompt_tokens", 0) or in_tok
                out_tok = u.get("completion_tokens", 0) or out_tok
                # Kimi exposes cached prompt tokens at top-level usage.cached_tokens
                # (some accounts also report prompt_tokens_details.cached_tokens).
                cached_tok = (
                    u.get("cached_tokens", 0)
                    or (u.get("prompt_tokens_details") or {}).get("cached_tokens", 0)
                    or cached_tok
                )

            # Extract choices
            choices = data.get("choices", [])
            if not choices:
                continue

            delta = choices[0].get("delta") or {}

            # Content
            content = delta.get("content")
            if content:
                text += content
                yield TextChunk(content)

            # Reasoning content
            reasoning = delta.get("reasoning_content") or delta.get("reasoning")
            if reasoning:
                thinking += reasoning
                yield ThinkingChunk(reasoning)

            # Tool calls. `or []` / `or {}` tolerate explicit JSON nulls, which
            # dict.get's default does not cover.
            for tc in delta.get("tool_calls") or []:
                idx = tc.get("index", 0)
                if idx not in tool_buf:
                    tool_buf[idx] = {"id": "", "name": "", "args": ""}
                if tc.get("id"):
                    tool_buf[idx]["id"] = tc["id"]
                fn = tc.get("function") or {}
                if fn.get("name"):
                    tool_buf[idx]["name"] += fn["name"]
                if fn.get("arguments"):
                    tool_buf[idx]["args"] += fn["arguments"]

    # Build final tool calls
    final_tool_calls = []
    for idx in sorted(tool_buf):
        v = tool_buf[idx]
        try:
            inp = json.loads(v["args"]) if v["args"] else {}
        except json.JSONDecodeError:
            # Keep the unparsable argument text so it is not silently lost.
            inp = {"_raw": v["args"]}
        final_tool_calls.append({
            "id": v["id"] or f"call_{idx}",
            "name": v["name"],
            "input": inp
        })

    yield AssistantTurn(text, final_tool_calls, in_tok, out_tok, thinking=thinking,
                        cache_read_tokens=cached_tok)
2949
+
2950
+
2951
def stream_openai_compat(
    api_key: str,
    base_url: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from any OpenAI-compatible API. Yields TextChunk, then AssistantTurn.

    Args:
        api_key: API key; ``"dummy"`` is substituted when empty.
        base_url: Endpoint base URL (may be None).
        model: Bare model name sent to the API and used for provider detection.
        system: System prompt, sent as the first message.
        messages: Conversation history, converted via ``messages_to_openai``.
        tool_schemas: Tool definitions, converted via ``tools_to_openai``.
        config: Reads ``thinking``, ``no_tools``, ``disable_tool_choice`` and
            ``max_tokens``.

    Yields:
        ``TextChunk`` / ``ThinkingChunk`` per delta, then one ``AssistantTurn``.
        A failure while creating the stream becomes an error-flagged
        ``AssistantTurn`` built from ``friendly_api_error``.
    """
    from openai import OpenAI

    _base = base_url or ""              # base_url may be None; keep `in` tests safe
    _provider = detect_provider(model)  # computed once, reused below

    # Detect kimi-code by base_url, NOT by model name. Reason: when invoked as
    # `kimi-code/kimi-k2.5` (or k2.6, kimi-latest, etc.), `model` arrives here
    # already stripped to the bare name, and detect_provider("kimi-k2.5") falls
    # through to the generic "kimi" prefix → header omitted → 403.
    # The /coding/v1 endpoint is unique to kimi-code regardless of model.
    _is_kimi_code = (
        "api.kimi.com/coding" in _base
        or _provider in ("kimi-code", "kimi-code2", "kimi-code3")
    )
    client_kwargs: dict = {"api_key": api_key or "dummy", "base_url": base_url}
    if _is_kimi_code:
        # Kimi Code API whitelists only known Coding Agents by User-Agent.
        # Without this header the API returns 403.
        client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.30.0"}
    client = OpenAI(**client_kwargs)

    oai_messages = [{"role": "system", "content": system}] + messages_to_openai(messages)

    _is_nvidia = _provider == "nvidia-web"

    kwargs: dict = {
        "model": model,
        "messages": oai_messages,
        "stream": True,
        "stream_options": {"include_usage": True},
    }

    # Pass num_ctx for known Ollama/LM Studio ports only — avoids matching other local servers (e.g. vLLM on :8000)
    _is_local_ollama = "11434" in _base
    _is_lmstudio = "1234" in _base and ("lmstudio" in _base or "localhost" in _base or "127.0.0.1" in _base)
    if _is_local_ollama or _is_lmstudio:
        ctx_limit = PROVIDERS.get(
            _provider if _provider in ("ollama", "lmstudio") else "ollama", {}
        ).get("context_limit", 128000)
        kwargs["extra_body"] = {"options": {"num_ctx": ctx_limit}}

    # Kimi thinking control (v1.0.1.20+)
    if _provider in ("kimi", "moonshot", "kimi-code", "kimi-code2", "kimi-code3"):
        if not kwargs.get("extra_body"):
            kwargs["extra_body"] = {}
        # Kimi expects an object: {"type": "enabled" | "disabled"}
        mode = "enabled" if config.get("thinking", False) else "disabled"
        kwargs["extra_body"]["thinking"] = {"type": mode}

    # DeepSeek reasoning control (reasoning_effort for thinking models)
    if _provider == "deepseek":
        if config.get("thinking", False):
            # Map thinking mode to reasoning_effort
            kwargs["reasoning_effort"] = "medium"  # default
        else:
            kwargs["reasoning_effort"] = "none"

    # NVIDIA NIM thinking control (chat_template_kwargs)
    if _is_nvidia and config.get("thinking", False):
        if not kwargs.get("extra_body"):
            kwargs["extra_body"] = {}
        kwargs["extra_body"]["chat_template_kwargs"] = {
            "thinking": True,
            "reasoning_effort": "high",
        }

    if tool_schemas and not config.get("no_tools"):
        kwargs["tools"] = tools_to_openai(tool_schemas)
        # "auto" requires vLLM --enable-auto-tool-choice; omit if server doesn't support it
        if not config.get("disable_tool_choice"):
            kwargs["tool_choice"] = "auto"
    if config.get("max_tokens"):
        prov_cap = PROVIDERS.get(_provider, {}).get("max_completion_tokens")
        mt = config["max_tokens"]
        kwargs["max_tokens"] = min(mt, prov_cap) if prov_cap else mt

    text = ""
    thinking = ""
    tool_buf: dict = {}   # index → {id, name, args_str}
    in_tok = out_tok = 0
    cached_tok = 0        # OpenAI-compat prefix-cached prompt tokens (when reported)

    try:
        stream = client.chat.completions.create(**kwargs)
    except Exception as e:
        # SDK-specific errors and unexpected ones are reported the same way.
        if _is_nvidia:
            import sys
            print(f"[nvidia-web RAW ERROR] {type(e).__name__}: {e}", file=sys.stderr, flush=True)
        msg = friendly_api_error(e)
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    in_thought = False

    def _extract_cached(u) -> int:
        # Cached prompt tokens come in different shapes depending on provider:
        #   OpenAI:     usage.prompt_tokens_details.cached_tokens
        #   Kimi/code:  usage.cached_tokens (top-level) or same as OpenAI
        #   DeepSeek:   usage.prompt_cache_hit_tokens
        #   Anthropic-style proxy: usage.cache_read_input_tokens
        c = 0
        details = getattr(u, "prompt_tokens_details", None)
        if details:
            c = (
                getattr(details, "cached_tokens", 0)
                or (details.get("cached_tokens", 0) if isinstance(details, dict) else 0)
                or 0
            )
        return (
            c
            or getattr(u, "cached_tokens", 0)
            or getattr(u, "prompt_cache_hit_tokens", 0)
            or getattr(u, "cache_read_input_tokens", 0)
            or 0
        )

    for chunk in stream:
        if not chunk.choices:
            # usage-only chunk (some providers send this last)
            if hasattr(chunk, "usage") and chunk.usage:
                u = chunk.usage
                in_tok = getattr(u, "prompt_tokens", 0) or in_tok
                out_tok = getattr(u, "completion_tokens", 0) or out_tok
                cached_tok = _extract_cached(u) or cached_tok
            continue

        choice = chunk.choices[0]
        delta = choice.delta

        content = delta.content
        if content:
            # Heuristic: detect reasoning tags in the content stream
            lower_c = content.lower()
            if "<thought" in lower_c or "<reasoning" in lower_c:
                in_thought = True

            if in_thought:
                # NOTE(review): tag-delimited reasoning is streamed as a
                # ThinkingChunk but is added to neither `thinking` nor `text`,
                # so it does not appear in the final AssistantTurn. Confirm
                # this is intended.
                yield ThinkingChunk(content)
                # A closing tag ends the thought block after this chunk.
                if "</thought" in lower_c or "</reasoning" in lower_c:
                    in_thought = False
            else:
                text += content
                yield TextChunk(content)

        # Capture native reasoning content (DeepSeek/Gemini/OpenAI/Custom)
        reasoning = (
            getattr(delta, "reasoning_content", None)
            or getattr(delta, "reasoning", None)
            or getattr(delta, "thought", None)
        )
        if reasoning:
            thinking += reasoning
            yield ThinkingChunk(reasoning)

        if delta.tool_calls:
            for tc in delta.tool_calls:
                idx = tc.index
                if idx not in tool_buf:
                    tool_buf[idx] = {"id": "", "name": "", "args": "", "extra_content": None}
                if tc.id:
                    tool_buf[idx]["id"] = tc.id
                if tc.function:
                    if tc.function.name:
                        tool_buf[idx]["name"] += tc.function.name
                    if tc.function.arguments:
                        tool_buf[idx]["args"] += tc.function.arguments
                # Capture extra_content (e.g. Gemini thought_signature)
                extra = getattr(tc, "extra_content", None)
                if extra:
                    tool_buf[idx]["extra_content"] = extra

        # Some providers include usage in the last chunk
        if hasattr(chunk, "usage") and chunk.usage:
            u = chunk.usage
            in_tok = (getattr(u, "prompt_tokens", 0) or getattr(u, "prompt_token_count", 0) or in_tok)
            out_tok = (getattr(u, "completion_tokens", 0) or getattr(u, "candidate_token_count", 0) or out_tok)
            cached_tok = _extract_cached(u) or cached_tok
        elif hasattr(chunk, "x_groq") and chunk.x_groq and "usage" in chunk.x_groq:
            # Groq-specific usage
            u = chunk.x_groq["usage"]
            in_tok = u.get("prompt_tokens", 0) or in_tok
            out_tok = u.get("completion_tokens", 0) or out_tok
        elif hasattr(chunk, "model_extra") and chunk.model_extra and "usage" in chunk.model_extra:
            # Pydantic v2 / Gemini proxy fallback
            u = chunk.model_extra["usage"]
            if u:
                if isinstance(u, dict):
                    in_tok = u.get("prompt_tokens", 0) or in_tok
                    out_tok = u.get("completion_tokens", 0) or out_tok
                else:
                    in_tok = getattr(u, "prompt_tokens", 0) or in_tok
                    out_tok = getattr(u, "completion_tokens", 0) or out_tok

    tool_calls = []
    for idx in sorted(tool_buf):
        v = tool_buf[idx]
        try:
            inp = json.loads(v["args"]) if v["args"] else {}
        except json.JSONDecodeError:
            # Keep the unparsable argument text so it is not silently lost.
            inp = {"_raw": v["args"]}
        tc_entry = {"id": v["id"] or f"call_{idx}", "name": v["name"], "input": inp}
        if v.get("extra_content"):
            tc_entry["extra_content"] = v["extra_content"]
        tool_calls.append(tc_entry)

    yield AssistantTurn(
        text, tool_calls, in_tok, out_tok,
        thinking=thinking,
        cache_read_tokens=cached_tok,
    )
3176
+
3177
+
3178
+ def _flatten_tool_messages(messages: list) -> list:
3179
+ """Convert tool-call history to plain text for models without native tool support.
3180
+
3181
+ Transforms:
3182
+ - assistant messages with tool_calls → text + inline <tool_call> representation
3183
+ - role:tool messages → role:user with [Tool Result] prefix
3184
+ This lets the model see the full conversation without needing the tools API.
3185
+ """
3186
+ out = []
3187
+ for m in messages:
3188
+ role = m.get("role", "")
3189
+
3190
+ if role == "assistant":
3191
+ text = m.get("content") or ""
3192
+ tcs = m.get("tool_calls", [])
3193
+ if tcs:
3194
+ # Append inline <tool_call> tags so the model sees what it called
3195
+ parts = [text] if text else []
3196
+ for tc in tcs:
3197
+ fn = tc.get("function", {})
3198
+ name = fn.get("name", tc.get("name", ""))
3199
+ args = fn.get("arguments", tc.get("input", {}))
3200
+ if isinstance(args, str):
3201
+ try:
3202
+ args = json.loads(args)
3203
+ except Exception:
3204
+ pass
3205
+ parts.append(
3206
+ f'<tool_call>{json.dumps({"name": name, "input": args}, ensure_ascii=False)}</tool_call>'
3207
+ )
3208
+ out.append({"role": "assistant", "content": "\n".join(parts)})
3209
+ else:
3210
+ out.append({"role": "assistant", "content": text})
3211
+
3212
+ elif role == "tool":
3213
+ # Convert tool result to a user message the model can read
3214
+ name = m.get("name", m.get("tool_call_id", "unknown"))
3215
+ content = m.get("content", "")
3216
+ # Make format more explicit for DeepSeek-R1
3217
+ tool_result_msg = f"[Tool Result: {name}]\n{content}\n\n[INSTRUCTION: Use this data to respond. Do not ask what to do next.]"
3218
+ out.append({
3219
+ "role": "user",
3220
+ "content": tool_result_msg,
3221
+ })
3222
+
3223
+ else:
3224
+ # system / user — pass through as-is
3225
+ out.append(m)
3226
+
3227
+ return out
3228
+
3229
+
3230
def _build_prompt_tool_manifest(tool_schemas: list) -> str:
    """Render the [TOOL USE] instructions appended to the system prompt.

    The schemas are first normalised with ``tools_to_openai``; each tool then
    contributes one entry (name, description, JSON parameters) to the
    "Available tools" list that follows the fixed calling instructions.
    """
    def _entry(tool: dict) -> str:
        # Accept both {"function": {...}} wrappers and bare definitions.
        spec = tool.get("function", tool)
        name = spec.get("name", "")
        desc = spec.get("description", "")
        params = json.dumps(spec.get("parameters", {}))
        return f" - {name}: {desc}\n Parameters: {params}"

    listing = "\n".join([_entry(tool) for tool in tools_to_openai(tool_schemas)])

    preamble = "".join([
        "\n\n[TOOL USE]\nYou have access to these tools. ",
        "When you need to use a tool, you MUST output EXACTLY this format (no extra text):\n",
        '<tool_call>{"name": "tool_name", "input": {"param": "value"}}</tool_call>\n\n',
        "EXAMPLE - If you need to search, output ONLY this exact line:\n",
        '<tool_call>{"name": "web_search", "input": {"query": "search term"}}</tool_call>\n\n',
        "CRITICAL RULES:\n",
        "1. ALWAYS wrap the JSON in <tool_call>...</tool_call> tags\n",
        "2. The <tool_call> tag must be on its own line with NO extra text before or after\n",
        "3. Use ONLY the exact JSON format: {\"name\": \"tool_name\", \"input\": {...}}\n",
        "4. Output the tool call IMMEDIATELY - do not explain first\n",
        "5. Wait for the tool result before continuing\n",
        "6. After your thinking is done, output ONLY the <tool_call> line\n",
        "7. AFTER calling a tool, WAIT for [Tool Result] and READ it before responding\n",
        "8. DO NOT call multiple tools at once - wait for each result\n\n",
        "WORKFLOW EXAMPLE:\n",
        "User: 'List files'\n",
        "Assistant: <tool_call>{\"name\": \"Glob\", \"input\": {\"pattern\": \"*\"}}</tool_call>\n",
        "[Tool Result: Glob]\n",
        "file1.txt\nfile2.txt\n",
        "Assistant: 'Found files: file1.txt, file2.txt'\n\n",
        "Available tools:\n",
    ])
    return preamble + listing
3264
+
3265
+
3266
def _get_gcloud_token() -> str:
    """Obtain OAuth2 access token from gcloud CLI.

    Returns:
        The stripped stdout of ``gcloud auth print-access-token``.

    Raises:
        subprocess.CalledProcessError: gcloud exited with a non-zero status.
        FileNotFoundError: the gcloud executable could not be found
            (when run without a shell).
    """
    # The shell is used on Windows only, as in the original code
    # (presumably because gcloud is installed there as a .cmd wrapper).
    use_shell = platform.system() == "Windows"
    result = subprocess.run(
        # Argument list, not a single string: without a shell, POSIX treats a
        # string as the program *name*, so "gcloud auth print-access-token"
        # was looked up as one executable and never found.
        ["gcloud", "auth", "print-access-token"],
        capture_output=True,
        text=True,
        check=True,
        shell=use_shell,
    )
    return result.stdout.strip()
3277
+
3278
+
3279
+ def _openai_messages_to_vertex_contents(messages: list) -> list:
3280
+ """Convert OpenAI-format messages to Vertex AI generateContent 'contents'."""
3281
+ contents = []
3282
+ for m in messages:
3283
+ role = m.get("role", "user")
3284
+ content = m.get("content", "")
3285
+
3286
+ if role == "system":
3287
+ continue # handled separately as systemInstruction
3288
+
3289
+ if role == "user":
3290
+ parts = []
3291
+ if content:
3292
+ parts.append({"text": content})
3293
+ contents.append({"role": "user", "parts": parts})
3294
+
3295
+ elif role == "assistant":
3296
+ parts = []
3297
+ if content:
3298
+ parts.append({"text": content})
3299
+ # Native tool calls from OpenAI format
3300
+ for tc in m.get("tool_calls", []):
3301
+ fn = tc.get("function", {})
3302
+ name = fn.get("name", tc.get("name", ""))
3303
+ args = fn.get("arguments", tc.get("input", {}))
3304
+ if isinstance(args, str):
3305
+ try:
3306
+ args = json.loads(args)
3307
+ except Exception:
3308
+ args = {}
3309
+ if name:
3310
+ parts.append({
3311
+ "functionCall": {"name": name, "args": args}
3312
+ })
3313
+ contents.append({"role": "model", "parts": parts})
3314
+
3315
+ elif role == "tool":
3316
+ name = m.get("name", m.get("tool_call_id", "unknown"))
3317
+ parts = [{
3318
+ "functionResponse": {
3319
+ "name": name,
3320
+ "response": {"result": content},
3321
+ }
3322
+ }]
3323
+ contents.append({"role": "user", "parts": parts})
3324
+
3325
+ return contents
3326
+
3327
+
3328
+ def _openai_tools_to_vertex_tools(tool_schemas: list) -> list:
3329
+ """Convert OpenAI-format tools to Vertex AI functionDeclarations."""
3330
+ declarations = []
3331
+ for t in tool_schemas:
3332
+ if not isinstance(t, dict):
3333
+ continue
3334
+ fn = t.get("function", t)
3335
+ name = fn.get("name", t.get("name", ""))
3336
+ if not name:
3337
+ continue
3338
+ declarations.append({
3339
+ "name": name,
3340
+ "description": fn.get("description", t.get("description", "")),
3341
+ "parameters": fn.get("parameters", t.get("input_schema", {"type": "object", "properties": {}})),
3342
+ })
3343
+ return [{"functionDeclarations": declarations}] if declarations else []
3344
+
3345
+
3346
def stream_gcloud(
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream from Google Cloud Vertex AI using gcloud OAuth2 authentication.

    Uses the generateContent REST API directly with Bearer tokens from
    `gcloud auth print-access-token`. Supports native function calling.
    The request is a single blocking POST: each text part of the reply is
    yielded as one ``TextChunk`` after the full response has arrived.

    Args:
        model: Model name; only the segment after the last ``/`` is used.
        system: System prompt, sent as ``systemInstruction`` when non-empty.
        messages: Conversation history, converted via ``messages_to_openai``.
        tool_schemas: Tool definitions, converted to function declarations.
        config: Reads ``gcloud_project_id``, ``gcloud_location``,
            ``max_tokens`` (default 2048), ``temperature`` (default 0.7)
            and ``no_tools``.

    Yields:
        ``TextChunk`` per text part, then one ``AssistantTurn``. Auth, HTTP
        and request failures become an error-flagged ``AssistantTurn``.
    """
    # ── Auth ────────────────────────────────────────────────────────────────
    try:
        token = _get_gcloud_token()
    except Exception as e:
        msg = f"[gcloud] Failed to get gcloud token: {e}. Run `gcloud auth login`."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    # ── Configurable project/location (fallback to hardcoded) ─────────────
    # NOTE(review): the fallback project id is a specific hard-coded project;
    # confirm that users who have not set gcloud_project_id should hit it.
    project_id = config.get("gcloud_project_id", "gen-lang-client-0108363942")
    location = config.get("gcloud_location", "us-west1")
    bare = model.split("/")[-1]  # "provider/name" → "name"; unchanged when no slash

    url = (
        f"https://{location}-aiplatform.googleapis.com/v1/"
        f"projects/{project_id}/locations/{location}/"
        f"publishers/google/models/{bare}:generateContent"
    )

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    # ── Convert messages ────────────────────────────────────────────────────
    oai_messages = messages_to_openai(messages)
    contents = _openai_messages_to_vertex_contents(oai_messages)

    # Cap maxOutputTokens to Vertex AI limit (65536)
    prov_cap = PROVIDERS.get("gcloud", {}).get("max_completion_tokens", 65536)
    # `or` also covers an explicit None/0 in config, which would otherwise
    # reach min() and raise TypeError.
    req_max = config.get("max_tokens") or 2048
    safe_max = min(req_max, prov_cap) if prov_cap else req_max

    # An explicit None temperature falls back to the default instead of being
    # sent as JSON null.
    temperature = config.get("temperature")
    if temperature is None:
        temperature = 0.7

    payload: dict = {
        "contents": contents,
        "generationConfig": {
            "temperature": temperature,
            "maxOutputTokens": safe_max,
        },
    }

    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}

    # ── Tools ───────────────────────────────────────────────────────────────
    if tool_schemas and not config.get("no_tools"):
        vertex_tools = _openai_tools_to_vertex_tools(tools_to_openai(tool_schemas))
        if vertex_tools:
            payload["tools"] = vertex_tools
            payload["toolConfig"] = {
                "functionCallingConfig": {"mode": "AUTO"}
            }

    # ── Request ─────────────────────────────────────────────────────────────
    text = ""
    thinking = ""  # never filled here; passed through to AssistantTurn as ""
    tool_calls: list = []
    in_tok = out_tok = 0

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=120)
        if resp.status_code != 200:
            msg = f"[gcloud] HTTP {resp.status_code}: {resp.text[:400]}"
            yield TextChunk(msg)
            yield AssistantTurn(msg, [], 0, 0, error=True)
            return
        data = resp.json()
    except Exception as e:
        msg = f"[gcloud] Request error: {e}"
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    # ── Parse response ──────────────────────────────────────────────────────
    candidates = data.get("candidates", [])
    if not candidates:
        msg = "[gcloud] No candidates in response."
        yield TextChunk(msg)
        yield AssistantTurn(msg, [], 0, 0, error=True)
        return

    candidate = candidates[0]
    parts = candidate.get("content", {}).get("parts", [])

    for part in parts:
        if "text" in part:
            chunk_text = part["text"]
            text += chunk_text
            yield TextChunk(chunk_text)

        if "functionCall" in part:
            fc = part["functionCall"]
            # Ids are generated locally from the call's position in the reply.
            tool_calls.append({
                "id": f"call_gc_{len(tool_calls)}",
                "name": fc.get("name", ""),
                "input": fc.get("args", {}),
            })

    # Token usage (Vertex AI sometimes includes usageMetadata)
    usage = data.get("usageMetadata", {})
    in_tok = usage.get("promptTokenCount", 0)
    out_tok = usage.get("candidatesTokenCount", 0)

    yield AssistantTurn(text, tool_calls, in_tok, out_tok, thinking=thinking)
3463
+
3464
+
3465
def stream_ollama(
    base_url: str,
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """Stream one assistant turn from an Ollama server's ``/api/chat`` endpoint.

    The conversation is converted with ``messages_to_openai`` and posted as a
    streaming request; every non-empty response line is parsed as one JSON
    object.

    Args:
        base_url: Root URL of the Ollama server (a trailing ``/`` is ignored).
        model: Model tag. Tags containing ``deepseek-r1`` get an extra rules
            paragraph appended to the system message; tags containing
            ``:cloud`` get a short pre-request delay and one retry when the
            first response is empty.
        system: System prompt, sent as the first message.
        messages: Conversation history in the project's internal format.
        tool_schemas: Tool definitions. Sent natively via ``tools`` unless
            prompt-tool mode is active or ``config["no_tools"]`` is set.
        config: Runtime settings. Keys read here: ``no_tools``,
            ``_prompt_tool_mode``, ``_no_native_tools_<model>``,
            ``context_limit`` (default 32768) and ``deep_tools``.

    Yields:
        ``ThinkingChunk`` / ``TextChunk`` objects while streaming, followed by
        exactly one ``AssistantTurn``.

    Raises:
        Any exception raised while opening or reading the first request
        (for example ``urllib.error.HTTPError``) propagates to the caller.
        Failures of the cloud retry are reported as an error ``AssistantTurn``
        instead.
    """
    import time as _time

    reasoning_keys = ("thinking", "reasoning", "thought", "reasoning_content")

    def _make_request(p):
        # Build the POST request for /api/chat with a JSON body.
        return urllib.request.Request(
            f"{base_url.rstrip('/')}/api/chat",
            data=json.dumps(p).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )

    def _decode_chunk(raw):
        # Return the JSON object on one streamed line, or None when the line
        # is blank, unparseable, or not a JSON object.
        if not raw.strip():
            return None
        try:
            parsed = json.loads(raw)
        except Exception:
            return None
        return parsed if isinstance(parsed, dict) else None

    def _message_of(chunk: dict) -> dict:
        # A missing, null or non-object "message" is treated as empty.
        message = chunk.get("message")
        return message if isinstance(message, dict) else {}

    def _first_reasoning(message: dict):
        # First non-empty reasoning field, checked in priority order.
        for key in reasoning_keys:
            if message.get(key):
                return message[key]
        return None

    def _token_counts(chunk: dict) -> tuple:
        # (prompt, completion) token counts as reported on the final chunk;
        # anything that is not a non-negative int is reported as 0.
        def _count(value):
            if isinstance(value, int) and not isinstance(value, bool) and value >= 0:
                return value
            return 0

        return _count(chunk.get("prompt_eval_count")), _count(chunk.get("eval_count"))

    def _append_to_system(msgs: list, extra: str) -> None:
        # Append text to the first system message only.
        for entry in msgs:
            if entry.get("role") == "system":
                entry["content"] += extra
                break

    # ollama_native_images=True: Ollama /api/chat accepts base64 images
    # natively in the message.
    oai_messages = [{"role": "system", "content": system}] + messages_to_openai(
        messages, ollama_native_images=True
    )

    # Ollama requires tool arguments as dict objects, not strings. OpenAI uses strings.
    for m in oai_messages:
        if m.get("content") is None:
            m["content"] = ""
        for tc in m.get("tool_calls") or []:
            fn = tc.get("function", {})
            if isinstance(fn.get("arguments"), str):
                try:
                    fn["arguments"] = json.loads(fn["arguments"])
                except Exception:
                    # Best effort: leave unparseable arguments as the raw string.
                    pass

    # ── DeepSeek-R1 Specific Fix ─────────────────────────────────────────
    # Simplified instructions for smaller models
    is_deepseek_r1 = "deepseek-r1" in model.lower()
    if is_deepseek_r1:
        deepseek_fix = (
            '\n\nRules: Reply directly. Use tools ONLY when needed. Format: <tool_call>{"name": "...", "input": {}}</tool_call>\n'
        )
        _append_to_system(oai_messages, deepseek_fix)

    # ── Check if a previous turn already detected no native tool support ──
    # Both the old generic flag and the model-specific flag are honoured.
    _no_native_tools_key = f"_no_native_tools_{model}"
    tools_enabled = bool(tool_schemas) and not config.get("no_tools")
    _prompt_tool_mode = bool(
        (config.get("_prompt_tool_mode") or config.get(_no_native_tools_key))
        and tools_enabled
    )
    if _prompt_tool_mode:
        # Flatten tool messages in history so the model can read them as plain text
        oai_messages = _flatten_tool_messages(oai_messages)
        # Inject tool manifest into system prompt
        _append_to_system(oai_messages, _build_prompt_tool_manifest(tool_schemas))

    payload = {
        "model": model,
        "messages": oai_messages,
        "stream": True,
        "options": {
            "num_ctx": config.get("context_limit", 32768)
        },
    }
    if tools_enabled and not _prompt_tool_mode:
        payload["tools"] = tools_to_openai(tool_schemas)

    req = _make_request(payload)

    text = ""
    thinking = ""
    native_calls: list = []  # Ollama sends complete tool calls, not deltas
    in_tok = out_tok = 0

    # State for prompt-based tool call parsing across streamed chunks
    use_deep_tools = config.get("deep_tools", False)
    _auto_wrap_json = is_deepseek_r1 and use_deep_tools
    parser = WebToolParser(auto_wrap_json=_auto_wrap_json)

    # Cloud-routed Ollama models (e.g. minimax-m2.7:cloud) need a moment before
    # the proxy starts streaming real content — without this, the first response
    # can come back empty.
    is_cloud = ":cloud" in model.lower()
    if is_cloud:
        _time.sleep(1)

    # Buffer for accumulating thinking content to reduce word-by-word chunks
    _thinking_buffer = ""

    # Errors from this first request are intentionally not caught here.
    with urllib.request.urlopen(req) as resp:
        for line in resp:
            data = _decode_chunk(line)
            if data is None:
                continue
            msg = _message_of(data)

            reasoning = _first_reasoning(msg)
            if reasoning:
                thinking += reasoning
                _thinking_buffer += reasoning
                # Emit once the buffer is reasonably long, or when the new
                # fragment starts with whitespace/punctuation.
                if len(_thinking_buffer) >= 20 or reasoning[0] in " \n\t.,;:!?":
                    yield ThinkingChunk(_thinking_buffer)
                    _thinking_buffer = ""

            content = msg.get("content") or ""
            if content:
                # Flush thinking buffer before content
                if _thinking_buffer:
                    yield ThinkingChunk(_thinking_buffer)
                    _thinking_buffer = ""

                if _prompt_tool_mode:
                    display = parser.parse_chunk(content)
                    if display:
                        text += display
                        yield TextChunk(display)
                else:
                    text += content
                    yield TextChunk(content)

            # Handle native ollama tools format
            for tc in msg.get("tool_calls") or []:
                fn = tc.get("function") or {}
                native_calls.append({
                    "id": "call_ollama" + str(len(native_calls)),
                    "name": fn.get("name", ""),
                    "input": fn.get("arguments", {}),
                })

            # The final chunk ("done": true) carries the token usage.
            if data.get("done"):
                in_tok, out_tok = _token_counts(data)

    # Flush any remaining thinking buffer at end of stream
    if _thinking_buffer:
        yield ThinkingChunk(_thinking_buffer)

    if _prompt_tool_mode:
        remaining = parser.flush()
        if remaining:
            text += remaining
            yield TextChunk(remaining)

    # Native Ollama tool calls first, then prompt-based ones from the parser.
    tool_calls = list(native_calls)
    if _prompt_tool_mode:
        tool_calls.extend(parser.tool_calls)

    # NOTE: Sanitizer temporarily disabled due to space issues
    # if is_deepseek_r1:
    #     text = _sanitize_deepseek_output(text)
    #     thinking = _sanitize_deepseek_output(thinking)

    # For cloud-routed models: if text is empty (timing issue), retry once with longer wait
    if not text and not tool_calls and is_cloud:
        # NOTE(review): the retry streams raw content only; it does not run the
        # prompt-tool parser, collect native tool calls, or emit ThinkingChunks.
        _time.sleep(2)
        text2 = ""
        thinking2 = ""
        in_tok2 = out_tok2 = 0
        try:
            with urllib.request.urlopen(_make_request(payload)) as resp2:
                for line in resp2:
                    data2 = _decode_chunk(line)
                    if data2 is None:
                        continue
                    msg2 = _message_of(data2)
                    c2 = msg2.get("content") or ""
                    if c2:
                        text2 += c2
                        yield TextChunk(c2)
                    r2 = _first_reasoning(msg2)
                    if r2:
                        thinking2 += r2
                    if data2.get("done"):
                        in_tok2, out_tok2 = _token_counts(data2)
        except Exception as _e:
            msg_err = f"[ollama-cloud] Retry failed: {_e}"
            yield TextChunk(msg_err)
            yield AssistantTurn(msg_err, [], 0, 0, error=True)
            return
        yield AssistantTurn(
            text2 or "[ollama-cloud: no response]", [], in_tok2, out_tok2, thinking=thinking2
        )
        return

    yield AssistantTurn(text, tool_calls, in_tok, out_tok, thinking=thinking)
3670
+
3671
+
3672
def stream(
    model: str,
    system: str,
    messages: list,
    tool_schemas: list,
    config: dict,
) -> Generator:
    """
    Single streaming front door for every provider.

    The provider is resolved from the model string with ``detect_provider``
    and the provider prefix is stripped with ``bare_model``. Unknown
    providers fall back to the ``openai`` entry of ``PROVIDERS``.
    Yields: TextChunk | ThinkingChunk | AssistantTurn

    The dispatch runs inside a zero-argument generator factory handed to
    ``_ProviderRetry.wrap_generator``, which (per the original documentation)
    retries on transient failures: timeouts, 429 rate-limit, 5xx server
    errors.

    Raises:
        ValueError: for the ``custom`` provider when neither
            ``config["custom_base_url"]`` nor ``CUSTOM_BASE_URL`` is set.
    """
    provider_name = detect_provider(model)
    model_name = bare_model(model)
    prov = PROVIDERS.get(provider_name, PROVIDERS["openai"])
    api_key = get_api_key(provider_name, config)

    def _inner_stream() -> Generator:
        kind = prov["type"]
        # Positional tail shared by every provider stream function.
        tail = (model_name, system, messages, tool_schemas, config)

        # ── Browser-session providers: first argument is a cookie/auth file ──
        if kind == "claude_web":
            yield from stream_claude_web(_claude_web_cookies_path(config), *tail)
            return
        if kind == "claude_code":
            yield from stream_claude_code(_claude_web_cookies_path(config), *tail)
            return
        if kind == "kimi_web":
            yield from stream_kimi_web(_kimi_web_auth_path(config), *tail)
            return
        if kind == "gemini-web":
            yield from stream_gemini_web(_gemini_web_auth_path(config), *tail)
            return
        if kind == "deepseek_web":
            yield from stream_deepseek_web(_deepseek_web_auth_path(config), *tail)
            return
        if kind == "qwen_web":
            yield from stream_qwen_web(_qwen_web_auth_path(config), *tail)
            return

        # ── Providers with dedicated implementations ─────────────────────────
        if kind == "gcloud":
            yield from stream_gcloud(*tail)
            return
        if kind == "anthropic":
            yield from stream_anthropic(api_key, *tail)
            return
        if kind == "ollama":
            ollama_url = prov.get("base_url", "http://localhost:11434")
            yield from stream_ollama(ollama_url, *tail)
            return
        if provider_name in ("kimi", "moonshot"):
            # Use native Kimi HTTP implementation for testing/comparison
            yield from stream_kimi(api_key, *tail)
            return

        # ── Everything else goes through the OpenAI-compatible client ────────
        import os as _os
        if provider_name != "custom":
            endpoint = prov.get("base_url", "https://api.openai.com/v1")
        else:
            endpoint = (config.get("custom_base_url")
                        or _os.environ.get("CUSTOM_BASE_URL", ""))
            if not endpoint:
                raise ValueError(
                    "custom provider requires a base_url. "
                    "Set CUSTOM_BASE_URL env var or run: /config custom_base_url=http://..."
                )
        yield from stream_openai_compat(api_key, endpoint, *tail)

    # Wrap with retry on transient failures
    yield from _ProviderRetry.wrap_generator(_inner_stream)
3739
+
3740
+
3741
def list_ollama_models(base_url: str) -> list[str]:
    """Fetch locally available model tags from an Ollama server.

    Requests ``{base_url}/api/tags`` (a trailing ``/`` on ``base_url`` is
    ignored) with a 3-second timeout and returns the ``name`` of every model
    entry in the response.

    This is a best-effort lookup: any failure to reach the server or to
    decode the response yields an empty list instead of an exception.
    Individual entries that are malformed (not an object, or without a string
    ``name``) are skipped rather than discarding the whole listing.

    Args:
        base_url: Root URL of the Ollama server.

    Returns:
        Model names in the order the server reported them; ``[]`` on failure.
    """
    try:
        url = f"{base_url.rstrip('/')}/api/tags"
        with urllib.request.urlopen(url, timeout=3) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except Exception:
        # Deliberate catch-all: callers only need "what is available, if anything".
        return []

    # Ollama returns {"models": [{"name": "llama3:latest", ...}, ...]}
    if not isinstance(data, dict):
        return []
    models = data.get("models")
    if not isinstance(models, list):
        return []
    return [
        entry["name"]
        for entry in models
        if isinstance(entry, dict) and isinstance(entry.get("name"), str)
    ]