@oneciel-ai/claude-any 0.1.34 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -1
- package/claude_any.py +470 -125
- package/docs/README.ja.md +18 -1
- package/docs/README.ko.md +18 -1
- package/docs/README.zh.md +16 -1
- package/docs/manual.md +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -48,7 +48,7 @@ arguments through unchanged.
|
|
|
48
48
|
|
|
49
49
|
Credits: One Ciel LLC
|
|
50
50
|
|
|
51
|
-
Current version: `0.1.34`
|
|
51
|
+
Current version: `0.1.36`
|
|
52
52
|
|
|
53
53
|
## Why This Exists
|
|
54
54
|
|
|
@@ -381,6 +381,23 @@ steps under that larger model's supervision.
|
|
|
381
381
|
|
|
382
382
|
## Changelog
|
|
383
383
|
|
|
384
|
+
### 0.1.36
|
|
385
|
+
|
|
386
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router calls now use upstream
|
|
387
|
+
`stream=true`, so long responses can flow as chunks instead of waiting for a
|
|
388
|
+
full non-streaming completion.
|
|
389
|
+
- **Stream retry diagnostics**: streamed NVIDIA calls keep the same retry and
|
|
390
|
+
request-size activity status used by the statusline.
|
|
391
|
+
|
|
392
|
+
### 0.1.35
|
|
393
|
+
|
|
394
|
+
- **NVIDIA router context guard**: NVIDIA hosted now defaults to a 32K router
|
|
395
|
+
context window and LLM presets may tune that cap, reducing timeout-prone
|
|
396
|
+
payload growth in long Claude Code sessions.
|
|
397
|
+
- **Upstream activity status**: the router records current request, retry,
|
|
398
|
+
success, and error state with estimated token/byte size so the statusline can
|
|
399
|
+
distinguish active upstream waits from idle sessions.
|
|
400
|
+
|
|
384
401
|
### 0.1.34
|
|
385
402
|
|
|
386
403
|
- **Complete headless configuration path**: add `--ca-env-file`,
|
package/claude_any.py
CHANGED
|
@@ -39,8 +39,9 @@ LOG_LEVEL_PATH = CONFIG_DIR / "log-level"
|
|
|
39
39
|
REQUEST_DUMP_PATH = CONFIG_DIR / "requests.jsonl"
|
|
40
40
|
RESPONSE_DUMP_PATH = CONFIG_DIR / "responses.jsonl"
|
|
41
41
|
TOOL_CALL_LOG_PATH = CONFIG_DIR / "tool-calls.jsonl"
|
|
42
|
-
RATE_LIMIT_STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
|
|
43
|
-
|
|
42
|
+
RATE_LIMIT_STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
|
|
43
|
+
ROUTER_ACTIVITY_PATH = CONFIG_DIR / "router-activity.json"
|
|
44
|
+
CHAT_MESSAGES_PATH = CONFIG_DIR / "chat-messages.jsonl"
|
|
44
45
|
CHAT_FILES_DIR = CONFIG_DIR / "chat-files"
|
|
45
46
|
PLAN_ARTIFACTS_DIR = CONFIG_DIR / "plan-artifacts"
|
|
46
47
|
PID_PATH = CONFIG_DIR / "router.pid"
|
|
@@ -84,7 +85,7 @@ PROVIDER_LABELS = {
|
|
|
84
85
|
"self-hosted-nim": "Self Hosted NIM",
|
|
85
86
|
}
|
|
86
87
|
APP_NAME = "Claude Any"
|
|
87
|
-
VERSION = "0.1.34"
|
|
88
|
+
VERSION = "0.1.36"
|
|
88
89
|
CREDITS = "Credits: One Ciel LLC"
|
|
89
90
|
|
|
90
91
|
LOG_LEVELS = {"SILENT": 0, "ERROR": 1, "WARN": 2, "INFO": 3, "DEBUG": 4, "TRACE": 5}
|
|
@@ -712,17 +713,18 @@ DEFAULT_CONFIG: dict[str, Any] = {
|
|
|
712
713
|
"stream_enabled": True,
|
|
713
714
|
"stream_word_chunking": False,
|
|
714
715
|
},
|
|
715
|
-
"nvidia-hosted": {
|
|
716
|
-
"base_url": "https://integrate.api.nvidia.com/v1",
|
|
717
|
-
"api_key": "not-used",
|
|
718
|
-
"current_model": "qwen/qwen3-coder-480b-a35b-instruct",
|
|
716
|
+
"nvidia-hosted": {
|
|
717
|
+
"base_url": "https://integrate.api.nvidia.com/v1",
|
|
718
|
+
"api_key": "not-used",
|
|
719
|
+
"current_model": "qwen/qwen3-coder-480b-a35b-instruct",
|
|
719
720
|
"advisor_model": "",
|
|
720
721
|
"custom_models": [],
|
|
721
722
|
"native_compat": False,
|
|
722
|
-
"rate_limit_rpm": 40,
|
|
723
|
-
"rate_limit_status": True,
|
|
724
|
-
"
|
|
725
|
-
"
|
|
723
|
+
"rate_limit_rpm": 40,
|
|
724
|
+
"rate_limit_status": True,
|
|
725
|
+
"context_window": 32768,
|
|
726
|
+
"max_output_tokens": 4096,
|
|
727
|
+
"temperature": 0.7,
|
|
726
728
|
"top_p": 0.8,
|
|
727
729
|
"request_timeout_ms": 300000,
|
|
728
730
|
"stream_enabled": True,
|
|
@@ -773,14 +775,21 @@ def apply_config_migrations(cfg: dict[str, Any]) -> None:
|
|
|
773
775
|
pcfg["native_compat"] = False
|
|
774
776
|
migrations[marker] = True
|
|
775
777
|
|
|
776
|
-
marker = "default_timeout_5m_20260513"
|
|
777
|
-
if not migrations.get(marker):
|
|
778
|
-
for pcfg in (cfg.get("providers") or {}).values():
|
|
779
|
-
if not isinstance(pcfg, dict):
|
|
780
|
-
continue
|
|
781
|
-
if positive_int(pcfg.get("request_timeout_ms")) in (600000, 1800000):
|
|
782
|
-
pcfg["request_timeout_ms"] = 300000
|
|
783
|
-
migrations[marker] = True
|
|
778
|
+
marker = "default_timeout_5m_20260513"
|
|
779
|
+
if not migrations.get(marker):
|
|
780
|
+
for pcfg in (cfg.get("providers") or {}).values():
|
|
781
|
+
if not isinstance(pcfg, dict):
|
|
782
|
+
continue
|
|
783
|
+
if positive_int(pcfg.get("request_timeout_ms")) in (600000, 1800000):
|
|
784
|
+
pcfg["request_timeout_ms"] = 300000
|
|
785
|
+
migrations[marker] = True
|
|
786
|
+
|
|
787
|
+
marker = "nvidia_context_window_32k_20260513"
|
|
788
|
+
if not migrations.get(marker):
|
|
789
|
+
pcfg = cfg.get("providers", {}).get("nvidia-hosted", {})
|
|
790
|
+
if isinstance(pcfg, dict) and not positive_int(pcfg.get("context_window")):
|
|
791
|
+
pcfg["context_window"] = 32768
|
|
792
|
+
migrations[marker] = True
|
|
784
793
|
|
|
785
794
|
|
|
786
795
|
_config_cache: dict[str, Any] | None = None
|
|
@@ -1174,8 +1183,9 @@ from pathlib import Path
|
|
|
1174
1183
|
HOME = Path.home()
|
|
1175
1184
|
CONFIG_DIR = Path(os.environ.get("CLAUDE_ANY_CONFIG_DIR") or (HOME / ".config" / "claude-any"))
|
|
1176
1185
|
CONFIG_PATH = CONFIG_DIR / "config.json"
|
|
1177
|
-
STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
|
|
1178
|
-
|
|
1186
|
+
STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
|
|
1187
|
+
ACTIVITY_PATH = CONFIG_DIR / "router-activity.json"
|
|
1188
|
+
PALETTE = (203, 209, 215, 221, 229, 187, 151, 116, 111, 147, 183, 219)
|
|
1179
1189
|
|
|
1180
1190
|
|
|
1181
1191
|
def load_json(path, default):
|
|
@@ -1225,8 +1235,9 @@ def main():
|
|
|
1225
1235
|
rpm = int(raw_rpm)
|
|
1226
1236
|
except Exception:
|
|
1227
1237
|
rpm = 40
|
|
1228
|
-
state = load_json(STATE_PATH, {})
|
|
1229
|
-
|
|
1238
|
+
state = load_json(STATE_PATH, {})
|
|
1239
|
+
activity = load_json(ACTIVITY_PATH, {})
|
|
1240
|
+
now = time.time()
|
|
1230
1241
|
key = f"{provider}:__global__" if provider else ""
|
|
1231
1242
|
entry = state.get(key) if key else None
|
|
1232
1243
|
if not isinstance(entry, dict):
|
|
@@ -1288,9 +1299,25 @@ def main():
|
|
|
1288
1299
|
rpm_text += " | server " + ", ".join(parts)
|
|
1289
1300
|
if penalty_until > now:
|
|
1290
1301
|
rpm_text += f" | wait {max(0.0, penalty_until - now):.0f}s"
|
|
1291
|
-
elif last_wait >= 0.5 and 0.0 <= now - updated_at < 60.0:
|
|
1292
|
-
rpm_text += f" | wait {last_wait:.1f}s"
|
|
1293
|
-
|
|
1302
|
+
elif last_wait >= 0.5 and 0.0 <= now - updated_at < 60.0:
|
|
1303
|
+
rpm_text += f" | wait {last_wait:.1f}s"
|
|
1304
|
+
if isinstance(activity, dict):
|
|
1305
|
+
try:
|
|
1306
|
+
age = now - float(activity.get("updated_at") or 0)
|
|
1307
|
+
except Exception:
|
|
1308
|
+
age = 999999
|
|
1309
|
+
if 0 <= age < 180:
|
|
1310
|
+
event = str(activity.get("event") or "")
|
|
1311
|
+
if event == "retry":
|
|
1312
|
+
rpm_text += f" | retry {activity.get('attempt')}/{activity.get('total')}"
|
|
1313
|
+
elif event == "request":
|
|
1314
|
+
tokens = activity.get("tokens")
|
|
1315
|
+
rpm_text += f" | upstream {age:.0f}s"
|
|
1316
|
+
if tokens:
|
|
1317
|
+
rpm_text += f" {tokens}tok"
|
|
1318
|
+
elif event in ("success", "error"):
|
|
1319
|
+
rpm_text += f" | {event} {age:.0f}s"
|
|
1320
|
+
print(f"{left} | {color(rpm_text)}")
|
|
1294
1321
|
|
|
1295
1322
|
|
|
1296
1323
|
if __name__ == "__main__":
|
|
@@ -2834,16 +2861,34 @@ def native_anthropic_base_url(provider: str, pcfg: dict[str, Any]) -> str:
|
|
|
2834
2861
|
return base
|
|
2835
2862
|
|
|
2836
2863
|
|
|
2837
|
-
def write_json(handler: BaseHTTPRequestHandler, obj: Any, status: int = 200) -> None:
|
|
2838
|
-
body = json.dumps(obj).encode("utf-8")
|
|
2839
|
-
handler.send_response(status)
|
|
2840
|
-
handler.send_header("content-type", "application/json")
|
|
2841
|
-
handler.send_header("content-length", str(len(body)))
|
|
2842
|
-
handler.end_headers()
|
|
2843
|
-
handler.wfile.write(body)
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
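def write_text_response(handler: BaseHTTPRequestHandler, text: str, status: int = 200, content_type: str = "text/plain; charset=utf-8") -> None: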
|
|
2864
|
+
def write_json(handler: BaseHTTPRequestHandler, obj: Any, status: int = 200) -> None:
|
|
2865
|
+
body = json.dumps(obj).encode("utf-8")
|
|
2866
|
+
handler.send_response(status)
|
|
2867
|
+
handler.send_header("content-type", "application/json")
|
|
2868
|
+
handler.send_header("content-length", str(len(body)))
|
|
2869
|
+
handler.end_headers()
|
|
2870
|
+
handler.wfile.write(body)
|
|
2871
|
+
|
|
2872
|
+
|
|
2873
|
+
def write_router_activity(event: str, provider: str, model: str | None = None, **fields: Any) -> None:
|
|
2874
|
+
try:
|
|
2875
|
+
CONFIG_DIR.mkdir(parents=True, exist_ok=True)
|
|
2876
|
+
data = {
|
|
2877
|
+
"updated_at": time.time(),
|
|
2878
|
+
"time": time.strftime("%Y-%m-%dT%H:%M:%S"),
|
|
2879
|
+
"event": event,
|
|
2880
|
+
"provider": provider,
|
|
2881
|
+
"model": model or "",
|
|
2882
|
+
}
|
|
2883
|
+
data.update(fields)
|
|
2884
|
+
tmp = ROUTER_ACTIVITY_PATH.with_name(f"{ROUTER_ACTIVITY_PATH.name}.{os.getpid()}.{time.time_ns()}.tmp")
|
|
2885
|
+
tmp.write_text(json.dumps(data, ensure_ascii=False, separators=(",", ":")), encoding="utf-8")
|
|
2886
|
+
tmp.replace(ROUTER_ACTIVITY_PATH)
|
|
2887
|
+
except Exception:
|
|
2888
|
+
pass
|
|
2889
|
+
|
|
2890
|
+
|
|
2891
|
+
def write_text_response(handler: BaseHTTPRequestHandler, text: str, status: int = 200, content_type: str = "text/plain; charset=utf-8") -> None:
|
|
2847
2892
|
body = text.encode("utf-8")
|
|
2848
2893
|
handler.send_response(status)
|
|
2849
2894
|
handler.send_header("content-type", content_type)
|
|
@@ -3556,11 +3601,20 @@ def cap_output_tokens_for_context(
|
|
|
3556
3601
|
return max(1, min(configured, available))
|
|
3557
3602
|
|
|
3558
3603
|
|
|
3559
|
-
def ollama_context_limit_for_budget(pcfg: dict[str, Any]) -> int:
|
|
3560
|
-
raw = pcfg.get("num_ctx", "auto")
|
|
3561
|
-
if isinstance(raw, str) and raw.strip().lower() == "auto":
|
|
3562
|
-
return positive_int(pcfg.get("num_ctx_max")) or 65536
|
|
3563
|
-
return positive_int(raw) or positive_int(pcfg.get("num_ctx_max")) or 65536
|
|
3604
|
+
def ollama_context_limit_for_budget(pcfg: dict[str, Any]) -> int:
|
|
3605
|
+
raw = pcfg.get("num_ctx", "auto")
|
|
3606
|
+
if isinstance(raw, str) and raw.strip().lower() == "auto":
|
|
3607
|
+
return positive_int(pcfg.get("num_ctx_max")) or 65536
|
|
3608
|
+
return positive_int(raw) or positive_int(pcfg.get("num_ctx_max")) or 65536
|
|
3609
|
+
|
|
3610
|
+
|
|
3611
|
+
def openai_context_limit_for_budget(provider: str, pcfg: dict[str, Any]) -> int:
|
|
3612
|
+
configured = positive_int(pcfg.get("context_window")) or positive_int(pcfg.get("max_model_len"))
|
|
3613
|
+
if configured:
|
|
3614
|
+
return configured
|
|
3615
|
+
if provider == "nvidia-hosted":
|
|
3616
|
+
return 32768
|
|
3617
|
+
return 65536
|
|
3564
3618
|
|
|
3565
3619
|
|
|
3566
3620
|
def compact_ollama_messages_for_budget(
|
|
@@ -3695,10 +3749,10 @@ def ollama_chat_request(model: str, body: dict[str, Any], pcfg: dict[str, Any],
|
|
|
3695
3749
|
return req
|
|
3696
3750
|
|
|
3697
3751
|
|
|
3698
|
-
def openai_compatible_chat_request(model: str, body: dict[str, Any], pcfg: dict[str, Any], stream: bool = False) -> dict[str, Any]:
|
|
3699
|
-
messages = anthropic_messages_to_openai(body)
|
|
3700
|
-
tools = anthropic_tools_to_ollama(body.get("tools"))
|
|
3701
|
-
context_limit =
|
|
3752
|
+
def openai_compatible_chat_request(provider: str, model: str, body: dict[str, Any], pcfg: dict[str, Any], stream: bool = False) -> dict[str, Any]:
|
|
3753
|
+
messages = anthropic_messages_to_openai(body)
|
|
3754
|
+
tools = anthropic_tools_to_ollama(body.get("tools"))
|
|
3755
|
+
context_limit = openai_context_limit_for_budget(provider, pcfg)
|
|
3702
3756
|
configured = configured_output_tokens(pcfg, body)
|
|
3703
3757
|
reserve = positive_int(pcfg.get("context_reserve_tokens")) or 1024
|
|
3704
3758
|
output_reserve = configured or positive_int(body.get("max_tokens")) or 4096
|
|
@@ -4525,7 +4579,7 @@ def forward_ollama_api_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg
|
|
|
4525
4579
|
write_json(handler, message)
|
|
4526
4580
|
|
|
4527
4581
|
|
|
4528
|
-
def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
4582
|
+
def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
4529
4583
|
choice = {}
|
|
4530
4584
|
choices = data.get("choices")
|
|
4531
4585
|
if isinstance(choices, list) and choices:
|
|
@@ -4538,10 +4592,202 @@ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict
|
|
|
4538
4592
|
},
|
|
4539
4593
|
"done_reason": "length" if choice.get("finish_reason") == "length" else "stop",
|
|
4540
4594
|
}
|
|
4541
|
-
return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
|
|
4542
|
-
|
|
4543
|
-
|
|
4544
|
-
def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
|
|
4595
|
+
return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
|
|
4596
|
+
|
|
4597
|
+
|
|
4598
|
+
def stream_openai_chat_to_anthropic_sse(
|
|
4599
|
+
handler: BaseHTTPRequestHandler,
|
|
4600
|
+
resp: Any,
|
|
4601
|
+
model: str,
|
|
4602
|
+
provider: str,
|
|
4603
|
+
source_body: dict[str, Any] | None = None,
|
|
4604
|
+
start_index: int = 0,
|
|
4605
|
+
word_chunking: bool = False,
|
|
4606
|
+
) -> None:
|
|
4607
|
+
next_content_index = start_index
|
|
4608
|
+
text_started = False
|
|
4609
|
+
text_suppressed_for_plan = False
|
|
4610
|
+
text_index: int | None = None
|
|
4611
|
+
text_so_far = ""
|
|
4612
|
+
text_buffer = ""
|
|
4613
|
+
tool_fragments: dict[int, dict[str, Any]] = {}
|
|
4614
|
+
output_tokens = 0
|
|
4615
|
+
finish_reason = "stop"
|
|
4616
|
+
|
|
4617
|
+
def emit(event_name: str, payload: dict[str, Any]) -> None:
|
|
4618
|
+
handler.wfile.write(f"event: {event_name}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n".encode())
|
|
4619
|
+
handler.wfile.flush()
|
|
4620
|
+
|
|
4621
|
+
def ensure_text_started() -> int:
|
|
4622
|
+
nonlocal text_started, text_index, next_content_index
|
|
4623
|
+
if text_started and text_index is not None:
|
|
4624
|
+
return text_index
|
|
4625
|
+
text_started = True
|
|
4626
|
+
text_index = next_content_index
|
|
4627
|
+
next_content_index += 1
|
|
4628
|
+
emit(
|
|
4629
|
+
"content_block_start",
|
|
4630
|
+
{"type": "content_block_start", "index": text_index, "content_block": {"type": "text", "text": ""}},
|
|
4631
|
+
)
|
|
4632
|
+
return text_index
|
|
4633
|
+
|
|
4634
|
+
def emit_text_delta(text: str) -> None:
|
|
4635
|
+
if not text:
|
|
4636
|
+
return
|
|
4637
|
+
idx = ensure_text_started()
|
|
4638
|
+
emit(
|
|
4639
|
+
"content_block_delta",
|
|
4640
|
+
{"type": "content_block_delta", "index": idx, "delta": {"type": "text_delta", "text": text}},
|
|
4641
|
+
)
|
|
4642
|
+
|
|
4643
|
+
try:
|
|
4644
|
+
for raw_line in resp:
|
|
4645
|
+
line = raw_line.decode("utf-8", errors="ignore").strip()
|
|
4646
|
+
if not line or line.startswith(":"):
|
|
4647
|
+
continue
|
|
4648
|
+
if line.startswith("data:"):
|
|
4649
|
+
line = line[5:].strip()
|
|
4650
|
+
if not line or line == "[DONE]":
|
|
4651
|
+
break
|
|
4652
|
+
try:
|
|
4653
|
+
event = json.loads(line)
|
|
4654
|
+
except Exception:
|
|
4655
|
+
continue
|
|
4656
|
+
if not isinstance(event, dict):
|
|
4657
|
+
continue
|
|
4658
|
+
usage = event.get("usage")
|
|
4659
|
+
if isinstance(usage, dict):
|
|
4660
|
+
output_tokens = max(output_tokens, positive_int(usage.get("completion_tokens")) or 0)
|
|
4661
|
+
choices = event.get("choices")
|
|
4662
|
+
if not isinstance(choices, list) or not choices:
|
|
4663
|
+
continue
|
|
4664
|
+
choice = choices[0] if isinstance(choices[0], dict) else {}
|
|
4665
|
+
if choice.get("finish_reason"):
|
|
4666
|
+
finish_reason = str(choice.get("finish_reason"))
|
|
4667
|
+
delta = choice.get("delta") if isinstance(choice.get("delta"), dict) else {}
|
|
4668
|
+
text_chunk = delta.get("content") or ""
|
|
4669
|
+
if text_chunk:
|
|
4670
|
+
if source_body is not None and not text_started and not tool_fragments and should_auto_enter_plan_mode(source_body, text_so_far + text_chunk, []):
|
|
4671
|
+
text_so_far += text_chunk
|
|
4672
|
+
text_suppressed_for_plan = True
|
|
4673
|
+
continue
|
|
4674
|
+
text_so_far += text_chunk
|
|
4675
|
+
if word_chunking:
|
|
4676
|
+
text_buffer += text_chunk
|
|
4677
|
+
to_flush, text_buffer = _split_word_buffer(text_buffer, force=False)
|
|
4678
|
+
emit_text_delta(to_flush)
|
|
4679
|
+
else:
|
|
4680
|
+
emit_text_delta(text_chunk)
|
|
4681
|
+
for call in delta.get("tool_calls") or []:
|
|
4682
|
+
if not isinstance(call, dict):
|
|
4683
|
+
continue
|
|
4684
|
+
try:
|
|
4685
|
+
call_index = int(call.get("index"))
|
|
4686
|
+
except Exception:
|
|
4687
|
+
call_index = len(tool_fragments)
|
|
4688
|
+
slot = tool_fragments.setdefault(call_index, {"id": "", "name": "", "arguments": ""})
|
|
4689
|
+
if call.get("id"):
|
|
4690
|
+
slot["id"] = str(call.get("id"))
|
|
4691
|
+
fn = call.get("function") if isinstance(call.get("function"), dict) else {}
|
|
4692
|
+
if fn.get("name"):
|
|
4693
|
+
slot["name"] += str(fn.get("name"))
|
|
4694
|
+
if fn.get("arguments"):
|
|
4695
|
+
slot["arguments"] += str(fn.get("arguments"))
|
|
4696
|
+
if word_chunking and text_buffer:
|
|
4697
|
+
to_flush, text_buffer = _split_word_buffer(text_buffer, force=True)
|
|
4698
|
+
emit_text_delta(to_flush)
|
|
4699
|
+
|
|
4700
|
+
tool_calls: list[dict[str, Any]] = []
|
|
4701
|
+
for _, fragment in sorted(tool_fragments.items()):
|
|
4702
|
+
raw_name = str(fragment.get("name") or "")
|
|
4703
|
+
if not raw_name:
|
|
4704
|
+
continue
|
|
4705
|
+
matched_name = _fuzzy_match_tool_name(raw_name) or raw_name
|
|
4706
|
+
normalized_args = normalize_tool_arguments(matched_name, fragment.get("arguments") or {})
|
|
4707
|
+
fixed_input = _validate_and_fix_tool_input(matched_name, normalized_args)
|
|
4708
|
+
if source_body is not None:
|
|
4709
|
+
matched_name, fixed_input = plan_mode_tool_name_for_emit(source_body, matched_name, fixed_input)
|
|
4710
|
+
if matched_name is None:
|
|
4711
|
+
continue
|
|
4712
|
+
tool_calls.append({"function": {"name": matched_name, "arguments": fixed_input}})
|
|
4713
|
+
tool_index = next_content_index
|
|
4714
|
+
next_content_index += 1
|
|
4715
|
+
tool_id = str(fragment.get("id") or f"toolu_openai_{int(time.time() * 1000)}_{tool_index}")
|
|
4716
|
+
append_tool_call_log(
|
|
4717
|
+
"openai_stream_tool_call",
|
|
4718
|
+
{
|
|
4719
|
+
"model": model,
|
|
4720
|
+
"raw_name": raw_name,
|
|
4721
|
+
"matched_name": matched_name,
|
|
4722
|
+
"raw_arguments": fragment.get("arguments"),
|
|
4723
|
+
"emitted_input": fixed_input,
|
|
4724
|
+
"sse_index": tool_index,
|
|
4725
|
+
},
|
|
4726
|
+
)
|
|
4727
|
+
emit(
|
|
4728
|
+
"content_block_start",
|
|
4729
|
+
{
|
|
4730
|
+
"type": "content_block_start",
|
|
4731
|
+
"index": tool_index,
|
|
4732
|
+
"content_block": {"type": "tool_use", "id": tool_id, "name": matched_name, "input": {}},
|
|
4733
|
+
},
|
|
4734
|
+
)
|
|
4735
|
+
emit(
|
|
4736
|
+
"content_block_delta",
|
|
4737
|
+
{
|
|
4738
|
+
"type": "content_block_delta",
|
|
4739
|
+
"index": tool_index,
|
|
4740
|
+
"delta": {"type": "input_json_delta", "partial_json": json.dumps(fixed_input, ensure_ascii=False)},
|
|
4741
|
+
},
|
|
4742
|
+
)
|
|
4743
|
+
emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
|
|
4744
|
+
|
|
4745
|
+
if source_body is not None and should_auto_enter_plan_mode(source_body, text_so_far, tool_calls):
|
|
4746
|
+
router_log("WARN", "auto-synthesized EnterPlanMode from short/empty upstream OpenAI stream")
|
|
4747
|
+
tool_index = next_content_index
|
|
4748
|
+
next_content_index += 1
|
|
4749
|
+
tool_calls.append({"function": {"name": "EnterPlanMode", "arguments": {}}})
|
|
4750
|
+
emit(
|
|
4751
|
+
"content_block_start",
|
|
4752
|
+
{
|
|
4753
|
+
"type": "content_block_start",
|
|
4754
|
+
"index": tool_index,
|
|
4755
|
+
"content_block": {"type": "tool_use", "id": f"toolu_openai_plan_{int(time.time() * 1000)}", "name": "EnterPlanMode", "input": {}},
|
|
4756
|
+
},
|
|
4757
|
+
)
|
|
4758
|
+
emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
|
|
4759
|
+
emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
|
|
4760
|
+
elif text_suppressed_for_plan and text_so_far:
|
|
4761
|
+
emit_text_delta(text_so_far)
|
|
4762
|
+
|
|
4763
|
+
if source_body is not None and should_keep_work_alive_with_tasklist(source_body, text_so_far, tool_calls):
|
|
4764
|
+
router_log("WARN", "auto-synthesized TaskList to keep work moving after OpenAI stream")
|
|
4765
|
+
tool_index = next_content_index
|
|
4766
|
+
next_content_index += 1
|
|
4767
|
+
tool_calls.append({"function": {"name": "TaskList", "arguments": {}}})
|
|
4768
|
+
emit(
|
|
4769
|
+
"content_block_start",
|
|
4770
|
+
{
|
|
4771
|
+
"type": "content_block_start",
|
|
4772
|
+
"index": tool_index,
|
|
4773
|
+
"content_block": {"type": "tool_use", "id": f"toolu_openai_keepalive_{int(time.time() * 1000)}", "name": "TaskList", "input": {}},
|
|
4774
|
+
},
|
|
4775
|
+
)
|
|
4776
|
+
emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
|
|
4777
|
+
emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
|
|
4778
|
+
|
|
4779
|
+
if text_started and text_index is not None:
|
|
4780
|
+
emit("content_block_stop", {"type": "content_block_stop", "index": text_index})
|
|
4781
|
+
stop_reason = "tool_use" if tool_calls else ("max_tokens" if finish_reason == "length" else "end_turn")
|
|
4782
|
+
write_anthropic_open_stream_stop(handler, {"stop_reason": stop_reason, "usage": {"output_tokens": output_tokens or max(1, len(text_so_far) // 4)}})
|
|
4783
|
+
finally:
|
|
4784
|
+
try:
|
|
4785
|
+
resp.close()
|
|
4786
|
+
except Exception:
|
|
4787
|
+
pass
|
|
4788
|
+
|
|
4789
|
+
|
|
4790
|
+
def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
|
|
4545
4791
|
if raw is None:
|
|
4546
4792
|
raw = exc.read().decode("utf-8", errors="ignore")
|
|
4547
4793
|
msg = raw.strip() or str(exc)
|
|
@@ -4582,7 +4828,7 @@ def retryable_timeout_exception(exc: BaseException) -> bool:
|
|
|
4582
4828
|
return "timed out" in text or "timeout" in text
|
|
4583
4829
|
|
|
4584
4830
|
|
|
4585
|
-
def post_json_with_rate_retry(
|
|
4831
|
+
def post_json_with_rate_retry(
|
|
4586
4832
|
url: str,
|
|
4587
4833
|
req_body: Any,
|
|
4588
4834
|
headers: dict[str, str],
|
|
@@ -4592,81 +4838,180 @@ def post_json_with_rate_retry(
|
|
|
4592
4838
|
model: str,
|
|
4593
4839
|
retry_notice: Callable[[str], None] | None = None,
|
|
4594
4840
|
) -> Any:
|
|
4595
|
-
gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
|
|
4596
|
-
max_attempts = max(1, gateway_retries + 1)
|
|
4597
|
-
|
|
4598
|
-
|
|
4599
|
-
|
|
4600
|
-
|
|
4601
|
-
|
|
4602
|
-
|
|
4603
|
-
|
|
4604
|
-
|
|
4605
|
-
|
|
4606
|
-
|
|
4841
|
+
gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
|
|
4842
|
+
max_attempts = max(1, gateway_retries + 1)
|
|
4843
|
+
token_estimate = estimate_tokens(req_body)
|
|
4844
|
+
byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
|
|
4845
|
+
for attempt in range(max_attempts):
|
|
4846
|
+
try:
|
|
4847
|
+
write_router_activity(
|
|
4848
|
+
"request",
|
|
4849
|
+
provider,
|
|
4850
|
+
model,
|
|
4851
|
+
attempt=attempt + 1,
|
|
4852
|
+
total=max_attempts,
|
|
4853
|
+
tokens=token_estimate,
|
|
4854
|
+
bytes=byte_estimate,
|
|
4855
|
+
timeout=timeout,
|
|
4856
|
+
)
|
|
4857
|
+
router_log("INFO", f"upstream_request provider={provider} model={model} attempt={attempt + 1}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}")
|
|
4858
|
+
data_bytes = json.dumps(req_body).encode("utf-8")
|
|
4859
|
+
req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
|
|
4860
|
+
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
4861
|
+
learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
|
|
4862
|
+
data = json.loads(resp.read().decode("utf-8"))
|
|
4863
|
+
write_router_activity("success", provider, model, attempt=attempt + 1, tokens=token_estimate, bytes=byte_estimate)
|
|
4864
|
+
return data
|
|
4865
|
+
except urllib.error.HTTPError as exc:
|
|
4866
|
+
raw = exc.read().decode("utf-8", errors="ignore")
|
|
4867
|
+
learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
|
|
4607
4868
|
if exc.code == 429 and attempt == 0:
|
|
4608
4869
|
wait = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
|
|
4609
4870
|
time.sleep(wait)
|
|
4610
4871
|
continue
|
|
4611
|
-
if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
|
|
4612
|
-
retry_no = attempt + 1
|
|
4613
|
-
|
|
4614
|
-
|
|
4615
|
-
|
|
4616
|
-
|
|
4617
|
-
|
|
4618
|
-
|
|
4619
|
-
|
|
4620
|
-
|
|
4621
|
-
|
|
4622
|
-
|
|
4623
|
-
|
|
4624
|
-
|
|
4625
|
-
|
|
4626
|
-
|
|
4627
|
-
|
|
4628
|
-
|
|
4629
|
-
|
|
4630
|
-
|
|
4631
|
-
|
|
4632
|
-
|
|
4633
|
-
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
4638
|
-
|
|
4639
|
-
|
|
4640
|
-
|
|
4641
|
-
|
|
4642
|
-
|
|
4643
|
-
|
|
4644
|
-
|
|
4872
|
+
if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
|
|
4873
|
+
retry_no = attempt + 1
|
|
4874
|
+
write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, code=exc.code, tokens=token_estimate, bytes=byte_estimate)
|
|
4875
|
+
router_log("WARN", f"upstream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} code={exc.code} tokens={token_estimate} bytes={byte_estimate}")
|
|
4876
|
+
if retry_notice:
|
|
4877
|
+
retry_notice(upstream_retry_message(retry_no, gateway_retries))
|
|
4878
|
+
time.sleep(upstream_retry_wait_seconds(retry_no))
|
|
4879
|
+
continue
|
|
4880
|
+
write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate)
|
|
4881
|
+
raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
|
|
4882
|
+
except (TimeoutError, urllib.error.URLError) as exc:
|
|
4883
|
+
if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
|
|
4884
|
+
retry_no = attempt + 1
|
|
4885
|
+
write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
|
|
4886
|
+
router_log("WARN", f"upstream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} error={type(exc).__name__} tokens={token_estimate} bytes={byte_estimate}")
|
|
4887
|
+
if retry_notice:
|
|
4888
|
+
retry_notice(upstream_retry_message(retry_no, gateway_retries))
|
|
4889
|
+
time.sleep(upstream_retry_wait_seconds(retry_no))
|
|
4890
|
+
continue
|
|
4891
|
+
write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
|
|
4892
|
+
raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
|
|
4893
|
+
raise RuntimeError("upstream request failed")
|
|
4894
|
+
|
|
4895
|
+
|
|
4896
|
+
def open_openai_stream_with_rate_retry(
|
|
4897
|
+
url: str,
|
|
4898
|
+
req_body: Any,
|
|
4899
|
+
headers: dict[str, str],
|
|
4900
|
+
timeout: float,
|
|
4901
|
+
provider: str,
|
|
4902
|
+
pcfg: dict[str, Any],
|
|
4903
|
+
model: str,
|
|
4904
|
+
retry_notice: Callable[[str], None] | None = None,
|
|
4905
|
+
) -> Any:
|
|
4906
|
+
gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
|
|
4907
|
+
max_attempts = max(1, gateway_retries + 1)
|
|
4908
|
+
token_estimate = estimate_tokens(req_body)
|
|
4909
|
+
byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
|
|
4910
|
+
data_bytes = json.dumps(req_body).encode("utf-8")
|
|
4911
|
+
for attempt in range(max_attempts):
|
|
4912
|
+
try:
|
|
4913
|
+
write_router_activity(
|
|
4914
|
+
"request",
|
|
4915
|
+
provider,
|
|
4916
|
+
model,
|
|
4917
|
+
attempt=attempt + 1,
|
|
4918
|
+
total=max_attempts,
|
|
4919
|
+
tokens=token_estimate,
|
|
4920
|
+
bytes=byte_estimate,
|
|
4921
|
+
timeout=timeout,
|
|
4922
|
+
stream=True,
|
|
4923
|
+
)
|
|
4924
|
+
router_log("INFO", f"upstream_stream_request provider={provider} model={model} attempt={attempt + 1}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}")
|
|
4925
|
+
req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
|
|
4926
|
+
resp = urllib.request.urlopen(req, timeout=timeout)
|
|
4927
|
+
learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
|
|
4928
|
+
return resp
|
|
4929
|
+
except urllib.error.HTTPError as exc:
|
|
4930
|
+
raw = exc.read().decode("utf-8", errors="ignore")
|
|
4931
|
+
learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
|
|
4932
|
+
if exc.code == 429 and attempt == 0:
|
|
4933
|
+
wait = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
|
|
4934
|
+
time.sleep(wait)
|
|
4935
|
+
continue
|
|
4936
|
+
if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
|
|
4937
|
+
retry_no = attempt + 1
|
|
4938
|
+
write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
|
|
4939
|
+
router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} code={exc.code} tokens={token_estimate} bytes={byte_estimate}")
|
|
4940
|
+
if retry_notice:
|
|
4941
|
+
retry_notice(upstream_retry_message(retry_no, gateway_retries))
|
|
4942
|
+
time.sleep(upstream_retry_wait_seconds(retry_no))
|
|
4943
|
+
continue
|
|
4944
|
+
write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
|
|
4945
|
+
raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
|
|
4946
|
+
except (TimeoutError, urllib.error.URLError) as exc:
|
|
4947
|
+
if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
|
|
4948
|
+
retry_no = attempt + 1
|
|
4949
|
+
write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
|
|
4950
|
+
router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} error={type(exc).__name__} tokens={token_estimate} bytes={byte_estimate}")
|
|
4951
|
+
if retry_notice:
|
|
4952
|
+
retry_notice(upstream_retry_message(retry_no, gateway_retries))
|
|
4953
|
+
time.sleep(upstream_retry_wait_seconds(retry_no))
|
|
4954
|
+
continue
|
|
4955
|
+
write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
|
|
4956
|
+
raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
|
|
4957
|
+
raise RuntimeError("upstream stream request failed")
|
|
4958
|
+
|
|
4959
|
+
|
|
4960
|
+
def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
|
|
4961
|
+
_update_tool_schema_registry(body.get("tools"))
|
|
4962
|
+
model = resolve_requested_model(provider, pcfg, body.get("model"))
|
|
4963
|
+
if provider == "nvidia-hosted":
|
|
4964
|
+
model = ncp_model_id_for_nvidia_hosted(model)
|
|
4965
|
+
url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
|
|
4966
|
+
waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
|
|
4967
|
+
stream = bool(body.get("stream", True))
|
|
4968
|
+
notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
|
|
4969
|
+
if stream:
|
|
4970
|
+
req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=True)
|
|
4971
|
+
write_anthropic_open_stream_start(handler, model)
|
|
4972
|
+
index = 0
|
|
4973
|
+
if notice:
|
|
4974
|
+
index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
|
|
4975
|
+
try:
|
|
4645
4976
|
def emit_retry_notice(text: str) -> None:
|
|
4646
4977
|
nonlocal index
|
|
4647
4978
|
index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": text + "\n"}], index)
|
|
4648
4979
|
|
|
4649
|
-
|
|
4650
|
-
url,
|
|
4651
|
-
req_body,
|
|
4652
|
-
provider_headers(provider, pcfg),
|
|
4653
|
-
provider_request_timeout_seconds(pcfg),
|
|
4654
|
-
provider,
|
|
4980
|
+
resp = open_openai_stream_with_rate_retry(
|
|
4981
|
+
url,
|
|
4982
|
+
req_body,
|
|
4983
|
+
provider_headers(provider, pcfg),
|
|
4984
|
+
provider_request_timeout_seconds(pcfg),
|
|
4985
|
+
provider,
|
|
4655
4986
|
pcfg,
|
|
4656
|
-
model,
|
|
4657
|
-
emit_retry_notice,
|
|
4658
|
-
)
|
|
4659
|
-
|
|
4660
|
-
|
|
4661
|
-
|
|
4662
|
-
|
|
4663
|
-
|
|
4664
|
-
|
|
4665
|
-
|
|
4666
|
-
|
|
4667
|
-
|
|
4668
|
-
|
|
4669
|
-
|
|
4987
|
+
model,
|
|
4988
|
+
emit_retry_notice,
|
|
4989
|
+
)
|
|
4990
|
+
stream_openai_chat_to_anthropic_sse(
|
|
4991
|
+
handler,
|
|
4992
|
+
resp,
|
|
4993
|
+
model,
|
|
4994
|
+
provider,
|
|
4995
|
+
source_body=body,
|
|
4996
|
+
start_index=index,
|
|
4997
|
+
word_chunking=bool(pcfg.get("stream_word_chunking", False)),
|
|
4998
|
+
)
|
|
4999
|
+
write_router_activity("success", provider, model, tokens=estimate_tokens(req_body), bytes=len(json.dumps(req_body, ensure_ascii=False).encode("utf-8")), stream=True)
|
|
5000
|
+
except RuntimeError as exc:
|
|
5001
|
+
msg = str(exc)
|
|
5002
|
+
write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
|
|
5003
|
+
write_anthropic_open_stream_stop(handler)
|
|
5004
|
+
return
|
|
5005
|
+
except Exception as exc:
|
|
5006
|
+
msg = f"{type(exc).__name__}: {exc}"
|
|
5007
|
+
write_router_activity("error", provider, model, error=type(exc).__name__, stream=True)
|
|
5008
|
+
write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
|
|
5009
|
+
write_anthropic_open_stream_stop(handler)
|
|
5010
|
+
return
|
|
5011
|
+
return
|
|
5012
|
+
req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=False)
|
|
5013
|
+
try:
|
|
5014
|
+
data = post_json_with_rate_retry(
|
|
4670
5015
|
url,
|
|
4671
5016
|
req_body,
|
|
4672
5017
|
provider_headers(provider, pcfg),
|
|
@@ -5112,7 +5457,7 @@ def status_lines() -> list[str]:
|
|
|
5112
5457
|
*([f"keep_alive: {pcfg.get('keep_alive', 'default')}"] if provider in ("ollama", "ollama-cloud") else []),
|
|
5113
5458
|
*([f"think: {bool(pcfg.get('think', False))}"] if provider in ("ollama", "ollama-cloud") else []),
|
|
5114
5459
|
*([f"request_timeout_ms: {pcfg.get('request_timeout_ms', 'default')}"] if provider in ("ollama", "ollama-cloud") else []),
|
|
5115
|
-
*([f"context_window: {pcfg.get('context_window', 'default')}"] if provider in ("vllm", "self-hosted-nim") else []),
|
|
5460
|
+
*([f"context_window: {pcfg.get('context_window', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
|
|
5116
5461
|
*([f"context_reserve_tokens: {pcfg.get('context_reserve_tokens', 'default')}"] if provider in ("vllm", "self-hosted-nim") else []),
|
|
5117
5462
|
*([f"max_output_tokens: {pcfg.get('max_output_tokens', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
|
|
5118
5463
|
*([f"request_timeout_ms: {pcfg.get('request_timeout_ms', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
|
|
@@ -5421,9 +5766,9 @@ def provider_options_status(provider: str, pcfg: dict[str, Any]) -> str:
|
|
|
5421
5766
|
if limit is not None:
|
|
5422
5767
|
suffix = f"{used}/{limit}" if limit > 0 else f"{used}/min(unlimited)"
|
|
5423
5768
|
parts.append(f"rpm_used={suffix}")
|
|
5424
|
-
if provider in ("vllm", "self-hosted-nim"):
|
|
5425
|
-
parts.insert(0, f"context_window={pcfg.get('context_window', 'default')}")
|
|
5426
|
-
parts.insert(1, f"reserve={pcfg.get('context_reserve_tokens', 'default')}")
|
|
5769
|
+
if provider in ("vllm", "nvidia-hosted", "self-hosted-nim"):
|
|
5770
|
+
parts.insert(0, f"context_window={pcfg.get('context_window', 'default')}")
|
|
5771
|
+
parts.insert(1, f"reserve={pcfg.get('context_reserve_tokens', 'default')}")
|
|
5427
5772
|
if provider in ("vllm", "self-hosted-nim"):
|
|
5428
5773
|
native_default = False if provider == "nvidia-hosted" else True
|
|
5429
5774
|
parts.append(f"native={bool(pcfg.get('native_compat', native_default))}")
|
|
@@ -5741,10 +6086,10 @@ def apply_llm_preset_to_provider(provider: str, pcfg: dict[str, Any], preset_id:
|
|
|
5741
6086
|
f"native={native_default}",
|
|
5742
6087
|
],
|
|
5743
6088
|
}
|
|
5744
|
-
for token in tokens_by_preset[preset_id]:
|
|
5745
|
-
if provider == "nvidia-hosted" and token.startswith(
|
|
5746
|
-
continue
|
|
5747
|
-
apply_provider_option(provider, pcfg, token)
|
|
6089
|
+
for token in tokens_by_preset[preset_id]:
|
|
6090
|
+
if provider == "nvidia-hosted" and token.startswith("native="):
|
|
6091
|
+
continue
|
|
6092
|
+
apply_provider_option(provider, pcfg, token)
|
|
5748
6093
|
if server_limit:
|
|
5749
6094
|
requested_context = positive_int(pcfg.get("context_window"))
|
|
5750
6095
|
if requested_context and requested_context > server_limit:
|
package/docs/README.ja.md
CHANGED
|
@@ -47,7 +47,7 @@ vLLM、NVIDIA hosted、self-hosted NIM を選択し、通常の Claude Code 引
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
現在のバージョン: `0.1.34`
|
|
50
|
+
現在のバージョン: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 作られた理由
|
|
53
53
|
|
|
@@ -351,6 +351,23 @@ Windows/Linux 管理、クリーンアップスクリプト、定期的なセキ
|
|
|
351
351
|
|
|
352
352
|
## 変更履歴
|
|
353
353
|
|
|
354
|
+
### 0.1.36
|
|
355
|
+
|
|
356
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router 呼び出しは upstream にも
|
|
357
|
+
`stream=true` を使用します。長い応答を完全な non-streaming completion まで
|
|
358
|
+
待たず、chunk として流せます。
|
|
359
|
+
- **Stream retry diagnostics**: streaming NVIDIA 呼び出しでも statusline 用の
|
|
360
|
+
retry/request size activity 状態を維持します。
|
|
361
|
+
|
|
362
|
+
### 0.1.35
|
|
363
|
+
|
|
364
|
+
- **NVIDIA router context guard**: NVIDIA hosted の router context 既定値を 32K
|
|
365
|
+
に下げ、LLM preset がこの cap を調整できるようにしました。長い Claude Code
|
|
366
|
+
セッションで payload が肥大して timeout する状況を減らします。
|
|
367
|
+
- **Upstream activity status**: router が現在の request/retry/success/error
|
|
368
|
+
状態と推定 token/byte サイズを記録し、statusline で upstream 待機と idle を
|
|
369
|
+
判別できるようにしました。
|
|
370
|
+
|
|
354
371
|
### 0.1.34
|
|
355
372
|
|
|
356
373
|
- **完全な headless 設定経路**: `--ca-env-file`、環境変数マッピング、Advisor
|
package/docs/README.ko.md
CHANGED
|
@@ -47,7 +47,7 @@ NVIDIA hosted, self-hosted NIM을 선택하고, Claude Code의 일반 인자는
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
현재 버전: `0.1.34`
|
|
50
|
+
현재 버전: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 왜 만들었나
|
|
53
53
|
|
|
@@ -351,6 +351,23 @@ Windows 이벤트 로그 리뷰, 바이러스/랜섬웨어 침입 시도 정리,
|
|
|
351
351
|
|
|
352
352
|
## 변경 이력
|
|
353
353
|
|
|
354
|
+
### 0.1.36
|
|
355
|
+
|
|
356
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router 호출은 이제 upstream에도
|
|
357
|
+
`stream=true`를 사용합니다. 긴 응답을 전체 완료까지 기다리지 않고 chunk로
|
|
358
|
+
흘려보내 timeout 가능성을 낮춥니다.
|
|
359
|
+
- **Stream retry diagnostics**: streaming NVIDIA 호출도 statusline에서 쓰는
|
|
360
|
+
retry/request size activity 상태를 유지합니다.
|
|
361
|
+
|
|
362
|
+
### 0.1.35
|
|
363
|
+
|
|
364
|
+
- **NVIDIA router context guard**: NVIDIA hosted의 router context 기본값을 32K로
|
|
365
|
+
낮추고 LLM preset이 이 cap을 조정할 수 있게 하여, 긴 Claude Code 세션에서
|
|
366
|
+
payload가 커져 timeout이 나는 상황을 줄였습니다.
|
|
367
|
+
- **Upstream activity status**: router가 현재 request/retry/success/error 상태와
|
|
368
|
+
추정 token/byte 크기를 기록하여, statusline에서 upstream 대기와 idle 상태를
|
|
369
|
+
구분할 수 있습니다.
|
|
370
|
+
|
|
354
371
|
### 0.1.34
|
|
355
372
|
|
|
356
373
|
- **완전한 headless 설정 경로**: `--ca-env-file`, 환경변수 매핑, Advisor model,
|
package/docs/README.zh.md
CHANGED
|
@@ -47,7 +47,7 @@ NIM,并把普通 Claude Code 参数原样传递。
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
当前版本: `0.1.34`
|
|
50
|
+
当前版本: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 为什么存在
|
|
53
53
|
|
|
@@ -337,6 +337,21 @@ Hermes 格式模型或部分较旧的 Qwen tool template。
|
|
|
337
337
|
|
|
338
338
|
## 更新日志
|
|
339
339
|
|
|
340
|
+
### 0.1.36
|
|
341
|
+
|
|
342
|
+
- **NVIDIA upstream streaming**:NVIDIA hosted router 调用现在也会向 upstream
|
|
343
|
+
使用 `stream=true`,长响应可以按 chunk 流出,不再等待完整的非流式 completion。
|
|
344
|
+
- **Stream retry diagnostics**:streaming NVIDIA 调用也保留 statusline 使用的
|
|
345
|
+
retry/request size activity 状态。
|
|
346
|
+
|
|
347
|
+
### 0.1.35
|
|
348
|
+
|
|
349
|
+
- **NVIDIA router context guard**:NVIDIA hosted 的 router context 默认值改为
|
|
350
|
+
32K,并允许 LLM preset 调整该 cap,减少长 Claude Code 会话中 payload 变大后
|
|
351
|
+
触发 timeout 的情况。
|
|
352
|
+
- **Upstream activity status**:router 会记录当前 request/retry/success/error
|
|
353
|
+
状态和估算 token/byte 大小,statusline 可以区分正在等待 upstream 还是已 idle。
|
|
354
|
+
|
|
340
355
|
### 0.1.34
|
|
341
356
|
|
|
342
357
|
- **完整 headless 配置路径**:新增 `--ca-env-file`、环境变量映射、Advisor
|
package/docs/manual.md
CHANGED
package/package.json
CHANGED