@oneciel-ai/claude-any 0.1.34 → 0.1.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,7 @@ arguments through unchanged.
48
48
 
49
49
  Credits: One Ciel LLC
50
50
 
51
- Current version: `0.1.34`
51
+ Current version: `0.1.36`
52
52
 
53
53
  ## Why This Exists
54
54
 
@@ -381,6 +381,23 @@ steps under that larger model's supervision.
381
381
 
382
382
  ## Changelog
383
383
 
384
+ ### 0.1.36
385
+
386
+ - **NVIDIA upstream streaming**: NVIDIA hosted router calls now use upstream
387
+ `stream=true`, so long responses can flow as chunks instead of waiting for a
388
+ full non-streaming completion.
389
+ - **Stream retry diagnostics**: streamed NVIDIA calls keep the same retry and
390
+ request-size activity status used by the statusline.
391
+
392
+ ### 0.1.35
393
+
394
+ - **NVIDIA router context guard**: NVIDIA hosted now defaults to a 32K router
395
+ context window and LLM presets may tune that cap, reducing timeout-prone
396
+ payload growth in long Claude Code sessions.
397
+ - **Upstream activity status**: the router records current request, retry,
398
+ success, and error state with estimated token/byte size so the statusline can
399
+ distinguish active upstream waits from idle sessions.
400
+
384
401
  ### 0.1.34
385
402
 
386
403
  - **Complete headless configuration path**: add `--ca-env-file`,
package/claude_any.py CHANGED
@@ -39,8 +39,9 @@ LOG_LEVEL_PATH = CONFIG_DIR / "log-level"
39
39
  REQUEST_DUMP_PATH = CONFIG_DIR / "requests.jsonl"
40
40
  RESPONSE_DUMP_PATH = CONFIG_DIR / "responses.jsonl"
41
41
  TOOL_CALL_LOG_PATH = CONFIG_DIR / "tool-calls.jsonl"
42
- RATE_LIMIT_STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
43
- CHAT_MESSAGES_PATH = CONFIG_DIR / "chat-messages.jsonl"
42
+ RATE_LIMIT_STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
43
+ ROUTER_ACTIVITY_PATH = CONFIG_DIR / "router-activity.json"
44
+ CHAT_MESSAGES_PATH = CONFIG_DIR / "chat-messages.jsonl"
44
45
  CHAT_FILES_DIR = CONFIG_DIR / "chat-files"
45
46
  PLAN_ARTIFACTS_DIR = CONFIG_DIR / "plan-artifacts"
46
47
  PID_PATH = CONFIG_DIR / "router.pid"
@@ -84,7 +85,7 @@ PROVIDER_LABELS = {
84
85
  "self-hosted-nim": "Self Hosted NIM",
85
86
  }
86
87
  APP_NAME = "Claude Any"
87
- VERSION = "0.1.34"
88
+ VERSION = "0.1.36"
88
89
  CREDITS = "Credits: One Ciel LLC"
89
90
 
90
91
  LOG_LEVELS = {"SILENT": 0, "ERROR": 1, "WARN": 2, "INFO": 3, "DEBUG": 4, "TRACE": 5}
@@ -712,17 +713,18 @@ DEFAULT_CONFIG: dict[str, Any] = {
712
713
  "stream_enabled": True,
713
714
  "stream_word_chunking": False,
714
715
  },
715
- "nvidia-hosted": {
716
- "base_url": "https://integrate.api.nvidia.com/v1",
717
- "api_key": "not-used",
718
- "current_model": "qwen/qwen3-coder-480b-a35b-instruct",
716
+ "nvidia-hosted": {
717
+ "base_url": "https://integrate.api.nvidia.com/v1",
718
+ "api_key": "not-used",
719
+ "current_model": "qwen/qwen3-coder-480b-a35b-instruct",
719
720
  "advisor_model": "",
720
721
  "custom_models": [],
721
722
  "native_compat": False,
722
- "rate_limit_rpm": 40,
723
- "rate_limit_status": True,
724
- "max_output_tokens": 4096,
725
- "temperature": 0.7,
723
+ "rate_limit_rpm": 40,
724
+ "rate_limit_status": True,
725
+ "context_window": 32768,
726
+ "max_output_tokens": 4096,
727
+ "temperature": 0.7,
726
728
  "top_p": 0.8,
727
729
  "request_timeout_ms": 300000,
728
730
  "stream_enabled": True,
@@ -773,14 +775,21 @@ def apply_config_migrations(cfg: dict[str, Any]) -> None:
773
775
  pcfg["native_compat"] = False
774
776
  migrations[marker] = True
775
777
 
776
- marker = "default_timeout_5m_20260513"
777
- if not migrations.get(marker):
778
- for pcfg in (cfg.get("providers") or {}).values():
779
- if not isinstance(pcfg, dict):
780
- continue
781
- if positive_int(pcfg.get("request_timeout_ms")) in (600000, 1800000):
782
- pcfg["request_timeout_ms"] = 300000
783
- migrations[marker] = True
778
+ marker = "default_timeout_5m_20260513"
779
+ if not migrations.get(marker):
780
+ for pcfg in (cfg.get("providers") or {}).values():
781
+ if not isinstance(pcfg, dict):
782
+ continue
783
+ if positive_int(pcfg.get("request_timeout_ms")) in (600000, 1800000):
784
+ pcfg["request_timeout_ms"] = 300000
785
+ migrations[marker] = True
786
+
787
+ marker = "nvidia_context_window_32k_20260513"
788
+ if not migrations.get(marker):
789
+ pcfg = cfg.get("providers", {}).get("nvidia-hosted", {})
790
+ if isinstance(pcfg, dict) and not positive_int(pcfg.get("context_window")):
791
+ pcfg["context_window"] = 32768
792
+ migrations[marker] = True
784
793
 
785
794
 
786
795
  _config_cache: dict[str, Any] | None = None
@@ -1174,8 +1183,9 @@ from pathlib import Path
1174
1183
  HOME = Path.home()
1175
1184
  CONFIG_DIR = Path(os.environ.get("CLAUDE_ANY_CONFIG_DIR") or (HOME / ".config" / "claude-any"))
1176
1185
  CONFIG_PATH = CONFIG_DIR / "config.json"
1177
- STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
1178
- PALETTE = (203, 209, 215, 221, 229, 187, 151, 116, 111, 147, 183, 219)
1186
+ STATE_PATH = CONFIG_DIR / "rate-limit-state.json"
1187
+ ACTIVITY_PATH = CONFIG_DIR / "router-activity.json"
1188
+ PALETTE = (203, 209, 215, 221, 229, 187, 151, 116, 111, 147, 183, 219)
1179
1189
 
1180
1190
 
1181
1191
  def load_json(path, default):
@@ -1225,8 +1235,9 @@ def main():
1225
1235
  rpm = int(raw_rpm)
1226
1236
  except Exception:
1227
1237
  rpm = 40
1228
- state = load_json(STATE_PATH, {})
1229
- now = time.time()
1238
+ state = load_json(STATE_PATH, {})
1239
+ activity = load_json(ACTIVITY_PATH, {})
1240
+ now = time.time()
1230
1241
  key = f"{provider}:__global__" if provider else ""
1231
1242
  entry = state.get(key) if key else None
1232
1243
  if not isinstance(entry, dict):
@@ -1288,9 +1299,25 @@ def main():
1288
1299
  rpm_text += " | server " + ", ".join(parts)
1289
1300
  if penalty_until > now:
1290
1301
  rpm_text += f" | wait {max(0.0, penalty_until - now):.0f}s"
1291
- elif last_wait >= 0.5 and 0.0 <= now - updated_at < 60.0:
1292
- rpm_text += f" | wait {last_wait:.1f}s"
1293
- print(f"{left} | {color(rpm_text)}")
1302
+ elif last_wait >= 0.5 and 0.0 <= now - updated_at < 60.0:
1303
+ rpm_text += f" | wait {last_wait:.1f}s"
1304
+ if isinstance(activity, dict):
1305
+ try:
1306
+ age = now - float(activity.get("updated_at") or 0)
1307
+ except Exception:
1308
+ age = 999999
1309
+ if 0 <= age < 180:
1310
+ event = str(activity.get("event") or "")
1311
+ if event == "retry":
1312
+ rpm_text += f" | retry {activity.get('attempt')}/{activity.get('total')}"
1313
+ elif event == "request":
1314
+ tokens = activity.get("tokens")
1315
+ rpm_text += f" | upstream {age:.0f}s"
1316
+ if tokens:
1317
+ rpm_text += f" {tokens}tok"
1318
+ elif event in ("success", "error"):
1319
+ rpm_text += f" | {event} {age:.0f}s"
1320
+ print(f"{left} | {color(rpm_text)}")
1294
1321
 
1295
1322
 
1296
1323
  if __name__ == "__main__":
@@ -2834,16 +2861,34 @@ def native_anthropic_base_url(provider: str, pcfg: dict[str, Any]) -> str:
2834
2861
  return base
2835
2862
 
2836
2863
 
2837
- def write_json(handler: BaseHTTPRequestHandler, obj: Any, status: int = 200) -> None:
2838
- body = json.dumps(obj).encode("utf-8")
2839
- handler.send_response(status)
2840
- handler.send_header("content-type", "application/json")
2841
- handler.send_header("content-length", str(len(body)))
2842
- handler.end_headers()
2843
- handler.wfile.write(body)
2844
-
2845
-
2846
- def write_text_response(handler: BaseHTTPRequestHandler, text: str, status: int = 200, content_type: str = "text/plain; charset=utf-8") -> None:
2864
+ def write_json(handler: BaseHTTPRequestHandler, obj: Any, status: int = 200) -> None:
2865
+ body = json.dumps(obj).encode("utf-8")
2866
+ handler.send_response(status)
2867
+ handler.send_header("content-type", "application/json")
2868
+ handler.send_header("content-length", str(len(body)))
2869
+ handler.end_headers()
2870
+ handler.wfile.write(body)
2871
+
2872
+
2873
+ def write_router_activity(event: str, provider: str, model: str | None = None, **fields: Any) -> None:
2874
+ try:
2875
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
2876
+ data = {
2877
+ "updated_at": time.time(),
2878
+ "time": time.strftime("%Y-%m-%dT%H:%M:%S"),
2879
+ "event": event,
2880
+ "provider": provider,
2881
+ "model": model or "",
2882
+ }
2883
+ data.update(fields)
2884
+ tmp = ROUTER_ACTIVITY_PATH.with_name(f"{ROUTER_ACTIVITY_PATH.name}.{os.getpid()}.{time.time_ns()}.tmp")
2885
+ tmp.write_text(json.dumps(data, ensure_ascii=False, separators=(",", ":")), encoding="utf-8")
2886
+ tmp.replace(ROUTER_ACTIVITY_PATH)
2887
+ except Exception:
2888
+ pass
2889
+
2890
+
2891
+ def write_text_response(handler: BaseHTTPRequestHandler, text: str, status: int = 200, content_type: str = "text/plain; charset=utf-8") -> None:
2847
2892
  body = text.encode("utf-8")
2848
2893
  handler.send_response(status)
2849
2894
  handler.send_header("content-type", content_type)
@@ -3556,11 +3601,20 @@ def cap_output_tokens_for_context(
3556
3601
  return max(1, min(configured, available))
3557
3602
 
3558
3603
 
3559
- def ollama_context_limit_for_budget(pcfg: dict[str, Any]) -> int:
3560
- raw = pcfg.get("num_ctx", "auto")
3561
- if isinstance(raw, str) and raw.strip().lower() == "auto":
3562
- return positive_int(pcfg.get("num_ctx_max")) or 65536
3563
- return positive_int(raw) or positive_int(pcfg.get("num_ctx_max")) or 65536
3604
+ def ollama_context_limit_for_budget(pcfg: dict[str, Any]) -> int:
3605
+ raw = pcfg.get("num_ctx", "auto")
3606
+ if isinstance(raw, str) and raw.strip().lower() == "auto":
3607
+ return positive_int(pcfg.get("num_ctx_max")) or 65536
3608
+ return positive_int(raw) or positive_int(pcfg.get("num_ctx_max")) or 65536
3609
+
3610
+
3611
+ def openai_context_limit_for_budget(provider: str, pcfg: dict[str, Any]) -> int:
3612
+ configured = positive_int(pcfg.get("context_window")) or positive_int(pcfg.get("max_model_len"))
3613
+ if configured:
3614
+ return configured
3615
+ if provider == "nvidia-hosted":
3616
+ return 32768
3617
+ return 65536
3564
3618
 
3565
3619
 
3566
3620
  def compact_ollama_messages_for_budget(
@@ -3695,10 +3749,10 @@ def ollama_chat_request(model: str, body: dict[str, Any], pcfg: dict[str, Any],
3695
3749
  return req
3696
3750
 
3697
3751
 
3698
- def openai_compatible_chat_request(model: str, body: dict[str, Any], pcfg: dict[str, Any], stream: bool = False) -> dict[str, Any]:
3699
- messages = anthropic_messages_to_openai(body)
3700
- tools = anthropic_tools_to_ollama(body.get("tools"))
3701
- context_limit = positive_int(pcfg.get("context_window")) or positive_int(pcfg.get("max_model_len")) or 65536
3752
+ def openai_compatible_chat_request(provider: str, model: str, body: dict[str, Any], pcfg: dict[str, Any], stream: bool = False) -> dict[str, Any]:
3753
+ messages = anthropic_messages_to_openai(body)
3754
+ tools = anthropic_tools_to_ollama(body.get("tools"))
3755
+ context_limit = openai_context_limit_for_budget(provider, pcfg)
3702
3756
  configured = configured_output_tokens(pcfg, body)
3703
3757
  reserve = positive_int(pcfg.get("context_reserve_tokens")) or 1024
3704
3758
  output_reserve = configured or positive_int(body.get("max_tokens")) or 4096
@@ -4525,7 +4579,7 @@ def forward_ollama_api_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg
4525
4579
  write_json(handler, message)
4526
4580
 
4527
4581
 
4528
- def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4582
+ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4529
4583
  choice = {}
4530
4584
  choices = data.get("choices")
4531
4585
  if isinstance(choices, list) and choices:
@@ -4538,10 +4592,202 @@ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict
4538
4592
  },
4539
4593
  "done_reason": "length" if choice.get("finish_reason") == "length" else "stop",
4540
4594
  }
4541
- return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
4542
-
4543
-
4544
- def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
4595
+ return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
4596
+
4597
+
4598
+ def stream_openai_chat_to_anthropic_sse(
4599
+ handler: BaseHTTPRequestHandler,
4600
+ resp: Any,
4601
+ model: str,
4602
+ provider: str,
4603
+ source_body: dict[str, Any] | None = None,
4604
+ start_index: int = 0,
4605
+ word_chunking: bool = False,
4606
+ ) -> None:
4607
+ next_content_index = start_index
4608
+ text_started = False
4609
+ text_suppressed_for_plan = False
4610
+ text_index: int | None = None
4611
+ text_so_far = ""
4612
+ text_buffer = ""
4613
+ tool_fragments: dict[int, dict[str, Any]] = {}
4614
+ output_tokens = 0
4615
+ finish_reason = "stop"
4616
+
4617
+ def emit(event_name: str, payload: dict[str, Any]) -> None:
4618
+ handler.wfile.write(f"event: {event_name}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n".encode())
4619
+ handler.wfile.flush()
4620
+
4621
+ def ensure_text_started() -> int:
4622
+ nonlocal text_started, text_index, next_content_index
4623
+ if text_started and text_index is not None:
4624
+ return text_index
4625
+ text_started = True
4626
+ text_index = next_content_index
4627
+ next_content_index += 1
4628
+ emit(
4629
+ "content_block_start",
4630
+ {"type": "content_block_start", "index": text_index, "content_block": {"type": "text", "text": ""}},
4631
+ )
4632
+ return text_index
4633
+
4634
+ def emit_text_delta(text: str) -> None:
4635
+ if not text:
4636
+ return
4637
+ idx = ensure_text_started()
4638
+ emit(
4639
+ "content_block_delta",
4640
+ {"type": "content_block_delta", "index": idx, "delta": {"type": "text_delta", "text": text}},
4641
+ )
4642
+
4643
+ try:
4644
+ for raw_line in resp:
4645
+ line = raw_line.decode("utf-8", errors="ignore").strip()
4646
+ if not line or line.startswith(":"):
4647
+ continue
4648
+ if line.startswith("data:"):
4649
+ line = line[5:].strip()
4650
+ if not line or line == "[DONE]":
4651
+ break
4652
+ try:
4653
+ event = json.loads(line)
4654
+ except Exception:
4655
+ continue
4656
+ if not isinstance(event, dict):
4657
+ continue
4658
+ usage = event.get("usage")
4659
+ if isinstance(usage, dict):
4660
+ output_tokens = max(output_tokens, positive_int(usage.get("completion_tokens")) or 0)
4661
+ choices = event.get("choices")
4662
+ if not isinstance(choices, list) or not choices:
4663
+ continue
4664
+ choice = choices[0] if isinstance(choices[0], dict) else {}
4665
+ if choice.get("finish_reason"):
4666
+ finish_reason = str(choice.get("finish_reason"))
4667
+ delta = choice.get("delta") if isinstance(choice.get("delta"), dict) else {}
4668
+ text_chunk = delta.get("content") or ""
4669
+ if text_chunk:
4670
+ if source_body is not None and not text_started and not tool_fragments and should_auto_enter_plan_mode(source_body, text_so_far + text_chunk, []):
4671
+ text_so_far += text_chunk
4672
+ text_suppressed_for_plan = True
4673
+ continue
4674
+ text_so_far += text_chunk
4675
+ if word_chunking:
4676
+ text_buffer += text_chunk
4677
+ to_flush, text_buffer = _split_word_buffer(text_buffer, force=False)
4678
+ emit_text_delta(to_flush)
4679
+ else:
4680
+ emit_text_delta(text_chunk)
4681
+ for call in delta.get("tool_calls") or []:
4682
+ if not isinstance(call, dict):
4683
+ continue
4684
+ try:
4685
+ call_index = int(call.get("index"))
4686
+ except Exception:
4687
+ call_index = len(tool_fragments)
4688
+ slot = tool_fragments.setdefault(call_index, {"id": "", "name": "", "arguments": ""})
4689
+ if call.get("id"):
4690
+ slot["id"] = str(call.get("id"))
4691
+ fn = call.get("function") if isinstance(call.get("function"), dict) else {}
4692
+ if fn.get("name"):
4693
+ slot["name"] += str(fn.get("name"))
4694
+ if fn.get("arguments"):
4695
+ slot["arguments"] += str(fn.get("arguments"))
4696
+ if word_chunking and text_buffer:
4697
+ to_flush, text_buffer = _split_word_buffer(text_buffer, force=True)
4698
+ emit_text_delta(to_flush)
4699
+
4700
+ tool_calls: list[dict[str, Any]] = []
4701
+ for _, fragment in sorted(tool_fragments.items()):
4702
+ raw_name = str(fragment.get("name") or "")
4703
+ if not raw_name:
4704
+ continue
4705
+ matched_name = _fuzzy_match_tool_name(raw_name) or raw_name
4706
+ normalized_args = normalize_tool_arguments(matched_name, fragment.get("arguments") or {})
4707
+ fixed_input = _validate_and_fix_tool_input(matched_name, normalized_args)
4708
+ if source_body is not None:
4709
+ matched_name, fixed_input = plan_mode_tool_name_for_emit(source_body, matched_name, fixed_input)
4710
+ if matched_name is None:
4711
+ continue
4712
+ tool_calls.append({"function": {"name": matched_name, "arguments": fixed_input}})
4713
+ tool_index = next_content_index
4714
+ next_content_index += 1
4715
+ tool_id = str(fragment.get("id") or f"toolu_openai_{int(time.time() * 1000)}_{tool_index}")
4716
+ append_tool_call_log(
4717
+ "openai_stream_tool_call",
4718
+ {
4719
+ "model": model,
4720
+ "raw_name": raw_name,
4721
+ "matched_name": matched_name,
4722
+ "raw_arguments": fragment.get("arguments"),
4723
+ "emitted_input": fixed_input,
4724
+ "sse_index": tool_index,
4725
+ },
4726
+ )
4727
+ emit(
4728
+ "content_block_start",
4729
+ {
4730
+ "type": "content_block_start",
4731
+ "index": tool_index,
4732
+ "content_block": {"type": "tool_use", "id": tool_id, "name": matched_name, "input": {}},
4733
+ },
4734
+ )
4735
+ emit(
4736
+ "content_block_delta",
4737
+ {
4738
+ "type": "content_block_delta",
4739
+ "index": tool_index,
4740
+ "delta": {"type": "input_json_delta", "partial_json": json.dumps(fixed_input, ensure_ascii=False)},
4741
+ },
4742
+ )
4743
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4744
+
4745
+ if source_body is not None and should_auto_enter_plan_mode(source_body, text_so_far, tool_calls):
4746
+ router_log("WARN", "auto-synthesized EnterPlanMode from short/empty upstream OpenAI stream")
4747
+ tool_index = next_content_index
4748
+ next_content_index += 1
4749
+ tool_calls.append({"function": {"name": "EnterPlanMode", "arguments": {}}})
4750
+ emit(
4751
+ "content_block_start",
4752
+ {
4753
+ "type": "content_block_start",
4754
+ "index": tool_index,
4755
+ "content_block": {"type": "tool_use", "id": f"toolu_openai_plan_{int(time.time() * 1000)}", "name": "EnterPlanMode", "input": {}},
4756
+ },
4757
+ )
4758
+ emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
4759
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4760
+ elif text_suppressed_for_plan and text_so_far:
4761
+ emit_text_delta(text_so_far)
4762
+
4763
+ if source_body is not None and should_keep_work_alive_with_tasklist(source_body, text_so_far, tool_calls):
4764
+ router_log("WARN", "auto-synthesized TaskList to keep work moving after OpenAI stream")
4765
+ tool_index = next_content_index
4766
+ next_content_index += 1
4767
+ tool_calls.append({"function": {"name": "TaskList", "arguments": {}}})
4768
+ emit(
4769
+ "content_block_start",
4770
+ {
4771
+ "type": "content_block_start",
4772
+ "index": tool_index,
4773
+ "content_block": {"type": "tool_use", "id": f"toolu_openai_keepalive_{int(time.time() * 1000)}", "name": "TaskList", "input": {}},
4774
+ },
4775
+ )
4776
+ emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
4777
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4778
+
4779
+ if text_started and text_index is not None:
4780
+ emit("content_block_stop", {"type": "content_block_stop", "index": text_index})
4781
+ stop_reason = "tool_use" if tool_calls else ("max_tokens" if finish_reason == "length" else "end_turn")
4782
+ write_anthropic_open_stream_stop(handler, {"stop_reason": stop_reason, "usage": {"output_tokens": output_tokens or max(1, len(text_so_far) // 4)}})
4783
+ finally:
4784
+ try:
4785
+ resp.close()
4786
+ except Exception:
4787
+ pass
4788
+
4789
+
4790
+ def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
4545
4791
  if raw is None:
4546
4792
  raw = exc.read().decode("utf-8", errors="ignore")
4547
4793
  msg = raw.strip() or str(exc)
@@ -4582,7 +4828,7 @@ def retryable_timeout_exception(exc: BaseException) -> bool:
4582
4828
  return "timed out" in text or "timeout" in text
4583
4829
 
4584
4830
 
4585
- def post_json_with_rate_retry(
4831
+ def post_json_with_rate_retry(
4586
4832
  url: str,
4587
4833
  req_body: Any,
4588
4834
  headers: dict[str, str],
@@ -4592,81 +4838,180 @@ def post_json_with_rate_retry(
4592
4838
  model: str,
4593
4839
  retry_notice: Callable[[str], None] | None = None,
4594
4840
  ) -> Any:
4595
- gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
4596
- max_attempts = max(1, gateway_retries + 1)
4597
- for attempt in range(max_attempts):
4598
- try:
4599
- data_bytes = json.dumps(req_body).encode("utf-8")
4600
- req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
4601
- with urllib.request.urlopen(req, timeout=timeout) as resp:
4602
- learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
4603
- return json.loads(resp.read().decode("utf-8"))
4604
- except urllib.error.HTTPError as exc:
4605
- raw = exc.read().decode("utf-8", errors="ignore")
4606
- learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
4841
+ gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
4842
+ max_attempts = max(1, gateway_retries + 1)
4843
+ token_estimate = estimate_tokens(req_body)
4844
+ byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
4845
+ for attempt in range(max_attempts):
4846
+ try:
4847
+ write_router_activity(
4848
+ "request",
4849
+ provider,
4850
+ model,
4851
+ attempt=attempt + 1,
4852
+ total=max_attempts,
4853
+ tokens=token_estimate,
4854
+ bytes=byte_estimate,
4855
+ timeout=timeout,
4856
+ )
4857
+ router_log("INFO", f"upstream_request provider={provider} model={model} attempt={attempt + 1}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}")
4858
+ data_bytes = json.dumps(req_body).encode("utf-8")
4859
+ req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
4860
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
4861
+ learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
4862
+ data = json.loads(resp.read().decode("utf-8"))
4863
+ write_router_activity("success", provider, model, attempt=attempt + 1, tokens=token_estimate, bytes=byte_estimate)
4864
+ return data
4865
+ except urllib.error.HTTPError as exc:
4866
+ raw = exc.read().decode("utf-8", errors="ignore")
4867
+ learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
4607
4868
  if exc.code == 429 and attempt == 0:
4608
4869
  wait = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
4609
4870
  time.sleep(wait)
4610
4871
  continue
4611
- if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
4612
- retry_no = attempt + 1
4613
- if retry_notice:
4614
- retry_notice(upstream_retry_message(retry_no, gateway_retries))
4615
- time.sleep(upstream_retry_wait_seconds(retry_no))
4616
- continue
4617
- raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
4618
- except (TimeoutError, urllib.error.URLError) as exc:
4619
- if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
4620
- retry_no = attempt + 1
4621
- if retry_notice:
4622
- retry_notice(upstream_retry_message(retry_no, gateway_retries))
4623
- time.sleep(upstream_retry_wait_seconds(retry_no))
4624
- continue
4625
- raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
4626
- raise RuntimeError("upstream request failed")
4627
-
4628
-
4629
- def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
4630
- _update_tool_schema_registry(body.get("tools"))
4631
- model = resolve_requested_model(provider, pcfg, body.get("model"))
4632
- if provider == "nvidia-hosted":
4633
- model = ncp_model_id_for_nvidia_hosted(model)
4634
- req_body = openai_compatible_chat_request(model, body, pcfg, stream=False)
4635
- url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
4636
- waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
4637
- stream = bool(body.get("stream", True))
4638
- notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
4639
- if stream:
4640
- write_anthropic_open_stream_start(handler, model)
4641
- index = 0
4642
- if notice:
4643
- index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
4644
- try:
4872
+ if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
4873
+ retry_no = attempt + 1
4874
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, code=exc.code, tokens=token_estimate, bytes=byte_estimate)
4875
+ router_log("WARN", f"upstream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} code={exc.code} tokens={token_estimate} bytes={byte_estimate}")
4876
+ if retry_notice:
4877
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
4878
+ time.sleep(upstream_retry_wait_seconds(retry_no))
4879
+ continue
4880
+ write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate)
4881
+ raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
4882
+ except (TimeoutError, urllib.error.URLError) as exc:
4883
+ if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
4884
+ retry_no = attempt + 1
4885
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
4886
+ router_log("WARN", f"upstream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} error={type(exc).__name__} tokens={token_estimate} bytes={byte_estimate}")
4887
+ if retry_notice:
4888
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
4889
+ time.sleep(upstream_retry_wait_seconds(retry_no))
4890
+ continue
4891
+ write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
4892
+ raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
4893
+ raise RuntimeError("upstream request failed")
4894
+
4895
+
4896
+ def open_openai_stream_with_rate_retry(
4897
+ url: str,
4898
+ req_body: Any,
4899
+ headers: dict[str, str],
4900
+ timeout: float,
4901
+ provider: str,
4902
+ pcfg: dict[str, Any],
4903
+ model: str,
4904
+ retry_notice: Callable[[str], None] | None = None,
4905
+ ) -> Any:
4906
+ gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
4907
+ max_attempts = max(1, gateway_retries + 1)
4908
+ token_estimate = estimate_tokens(req_body)
4909
+ byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
4910
+ data_bytes = json.dumps(req_body).encode("utf-8")
4911
+ for attempt in range(max_attempts):
4912
+ try:
4913
+ write_router_activity(
4914
+ "request",
4915
+ provider,
4916
+ model,
4917
+ attempt=attempt + 1,
4918
+ total=max_attempts,
4919
+ tokens=token_estimate,
4920
+ bytes=byte_estimate,
4921
+ timeout=timeout,
4922
+ stream=True,
4923
+ )
4924
+ router_log("INFO", f"upstream_stream_request provider={provider} model={model} attempt={attempt + 1}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}")
4925
+ req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
4926
+ resp = urllib.request.urlopen(req, timeout=timeout)
4927
+ learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
4928
+ return resp
4929
+ except urllib.error.HTTPError as exc:
4930
+ raw = exc.read().decode("utf-8", errors="ignore")
4931
+ learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
4932
+ if exc.code == 429 and attempt == 0:
4933
+ wait = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
4934
+ time.sleep(wait)
4935
+ continue
4936
+ if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
4937
+ retry_no = attempt + 1
4938
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
4939
+ router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} code={exc.code} tokens={token_estimate} bytes={byte_estimate}")
4940
+ if retry_notice:
4941
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
4942
+ time.sleep(upstream_retry_wait_seconds(retry_no))
4943
+ continue
4944
+ write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
4945
+ raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
4946
+ except (TimeoutError, urllib.error.URLError) as exc:
4947
+ if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
4948
+ retry_no = attempt + 1
4949
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
4950
+ router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} error={type(exc).__name__} tokens={token_estimate} bytes={byte_estimate}")
4951
+ if retry_notice:
4952
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
4953
+ time.sleep(upstream_retry_wait_seconds(retry_no))
4954
+ continue
4955
+ write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
4956
+ raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
4957
+ raise RuntimeError("upstream stream request failed")
4958
+
4959
+
4960
+ def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
4961
+ _update_tool_schema_registry(body.get("tools"))
4962
+ model = resolve_requested_model(provider, pcfg, body.get("model"))
4963
+ if provider == "nvidia-hosted":
4964
+ model = ncp_model_id_for_nvidia_hosted(model)
4965
+ url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
4966
+ waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
4967
+ stream = bool(body.get("stream", True))
4968
+ notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
4969
+ if stream:
4970
+ req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=True)
4971
+ write_anthropic_open_stream_start(handler, model)
4972
+ index = 0
4973
+ if notice:
4974
+ index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
4975
+ try:
4645
4976
  def emit_retry_notice(text: str) -> None:
4646
4977
  nonlocal index
4647
4978
  index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": text + "\n"}], index)
4648
4979
 
4649
- data = post_json_with_rate_retry(
4650
- url,
4651
- req_body,
4652
- provider_headers(provider, pcfg),
4653
- provider_request_timeout_seconds(pcfg),
4654
- provider,
4980
+ resp = open_openai_stream_with_rate_retry(
4981
+ url,
4982
+ req_body,
4983
+ provider_headers(provider, pcfg),
4984
+ provider_request_timeout_seconds(pcfg),
4985
+ provider,
4655
4986
  pcfg,
4656
- model,
4657
- emit_retry_notice,
4658
- )
4659
- except RuntimeError as exc:
4660
- msg = str(exc)
4661
- write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
4662
- write_anthropic_open_stream_stop(handler)
4663
- return
4664
- message = openai_chat_to_anthropic(data, model, source_body=body)
4665
- write_anthropic_stream_blocks(handler, list(message.get("content") or []), index)
4666
- write_anthropic_open_stream_stop(handler, message)
4667
- return
4668
- try:
4669
- data = post_json_with_rate_retry(
4987
+ model,
4988
+ emit_retry_notice,
4989
+ )
4990
+ stream_openai_chat_to_anthropic_sse(
4991
+ handler,
4992
+ resp,
4993
+ model,
4994
+ provider,
4995
+ source_body=body,
4996
+ start_index=index,
4997
+ word_chunking=bool(pcfg.get("stream_word_chunking", False)),
4998
+ )
4999
+ write_router_activity("success", provider, model, tokens=estimate_tokens(req_body), bytes=len(json.dumps(req_body, ensure_ascii=False).encode("utf-8")), stream=True)
5000
+ except RuntimeError as exc:
5001
+ msg = str(exc)
5002
+ write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
5003
+ write_anthropic_open_stream_stop(handler)
5004
+ return
5005
+ except Exception as exc:
5006
+ msg = f"{type(exc).__name__}: {exc}"
5007
+ write_router_activity("error", provider, model, error=type(exc).__name__, stream=True)
5008
+ write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
5009
+ write_anthropic_open_stream_stop(handler)
5010
+ return
5011
+ return
5012
+ req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=False)
5013
+ try:
5014
+ data = post_json_with_rate_retry(
4670
5015
  url,
4671
5016
  req_body,
4672
5017
  provider_headers(provider, pcfg),
@@ -5112,7 +5457,7 @@ def status_lines() -> list[str]:
5112
5457
  *([f"keep_alive: {pcfg.get('keep_alive', 'default')}"] if provider in ("ollama", "ollama-cloud") else []),
5113
5458
  *([f"think: {bool(pcfg.get('think', False))}"] if provider in ("ollama", "ollama-cloud") else []),
5114
5459
  *([f"request_timeout_ms: {pcfg.get('request_timeout_ms', 'default')}"] if provider in ("ollama", "ollama-cloud") else []),
5115
- *([f"context_window: {pcfg.get('context_window', 'default')}"] if provider in ("vllm", "self-hosted-nim") else []),
5460
+ *([f"context_window: {pcfg.get('context_window', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
5116
5461
  *([f"context_reserve_tokens: {pcfg.get('context_reserve_tokens', 'default')}"] if provider in ("vllm", "self-hosted-nim") else []),
5117
5462
  *([f"max_output_tokens: {pcfg.get('max_output_tokens', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
5118
5463
  *([f"request_timeout_ms: {pcfg.get('request_timeout_ms', 'default')}"] if provider in ("vllm", "nvidia-hosted", "self-hosted-nim") else []),
@@ -5421,9 +5766,9 @@ def provider_options_status(provider: str, pcfg: dict[str, Any]) -> str:
5421
5766
  if limit is not None:
5422
5767
  suffix = f"{used}/{limit}" if limit > 0 else f"{used}/min(unlimited)"
5423
5768
  parts.append(f"rpm_used={suffix}")
5424
- if provider in ("vllm", "self-hosted-nim"):
5425
- parts.insert(0, f"context_window={pcfg.get('context_window', 'default')}")
5426
- parts.insert(1, f"reserve={pcfg.get('context_reserve_tokens', 'default')}")
5769
+ if provider in ("vllm", "nvidia-hosted", "self-hosted-nim"):
5770
+ parts.insert(0, f"context_window={pcfg.get('context_window', 'default')}")
5771
+ parts.insert(1, f"reserve={pcfg.get('context_reserve_tokens', 'default')}")
5427
5772
  if provider in ("vllm", "self-hosted-nim"):
5428
5773
  native_default = False if provider == "nvidia-hosted" else True
5429
5774
  parts.append(f"native={bool(pcfg.get('native_compat', native_default))}")
@@ -5741,10 +6086,10 @@ def apply_llm_preset_to_provider(provider: str, pcfg: dict[str, Any], preset_id:
5741
6086
  f"native={native_default}",
5742
6087
  ],
5743
6088
  }
5744
- for token in tokens_by_preset[preset_id]:
5745
- if provider == "nvidia-hosted" and token.startswith(("context_window=", "reserve=", "native=")):
5746
- continue
5747
- apply_provider_option(provider, pcfg, token)
6089
+ for token in tokens_by_preset[preset_id]:
6090
+ if provider == "nvidia-hosted" and token.startswith("native="):
6091
+ continue
6092
+ apply_provider_option(provider, pcfg, token)
5748
6093
  if server_limit:
5749
6094
  requested_context = positive_int(pcfg.get("context_window"))
5750
6095
  if requested_context and requested_context > server_limit:
package/docs/README.ja.md CHANGED
@@ -47,7 +47,7 @@ vLLM、NVIDIA hosted、self-hosted NIM を選択し、通常の Claude Code 引
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 現在のバージョン: `0.1.34`
50
+ 現在のバージョン: `0.1.36`
51
51
 
52
52
  ## 作られた理由
53
53
 
@@ -351,6 +351,23 @@ Windows/Linux 管理、クリーンアップスクリプト、定期的なセキ
351
351
 
352
352
  ## 変更履歴
353
353
 
354
+ ### 0.1.36
355
+
356
+ - **NVIDIA upstream streaming**: NVIDIA hosted router 呼び出しは upstream にも
357
+ `stream=true` を使用します。長い応答を完全な non-streaming completion まで
358
+ 待たず、chunk として流せます。
359
+ - **Stream retry diagnostics**: streaming NVIDIA 呼び出しでも statusline 用の
360
+ retry/request size activity 状態を維持します。
361
+
362
+ ### 0.1.35
363
+
364
+ - **NVIDIA router context guard**: NVIDIA hosted の router context 既定値を 32K
365
+ に下げ、LLM preset がこの cap を調整できるようにしました。長い Claude Code
366
+ セッションで payload が肥大して timeout する状況を減らします。
367
+ - **Upstream activity status**: router が現在の request/retry/success/error
368
+ 状態と推定 token/byte サイズを記録し、statusline で upstream 待機と idle を
369
+ 判別できるようにしました。
370
+
354
371
  ### 0.1.34
355
372
 
356
373
  - **完全な headless 設定経路**: `--ca-env-file`、環境変数マッピング、Advisor
package/docs/README.ko.md CHANGED
@@ -47,7 +47,7 @@ NVIDIA hosted, self-hosted NIM을 선택하고, Claude Code의 일반 인자는
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 현재 버전: `0.1.34`
50
+ 현재 버전: `0.1.36`
51
51
 
52
52
  ## 왜 만들었나
53
53
 
@@ -351,6 +351,23 @@ Windows 이벤트 로그 리뷰, 바이러스/랜섬웨어 침입 시도 정리,
351
351
 
352
352
  ## 변경 이력
353
353
 
354
+ ### 0.1.36
355
+
356
+ - **NVIDIA upstream streaming**: NVIDIA hosted router 호출은 이제 upstream에도
357
+ `stream=true`를 사용합니다. 긴 응답을 전체 완료까지 기다리지 않고 chunk로
358
+ 흘려보내 timeout 가능성을 낮춥니다.
359
+ - **Stream retry diagnostics**: streaming NVIDIA 호출도 statusline에서 쓰는
360
+ retry/request size activity 상태를 유지합니다.
361
+
362
+ ### 0.1.35
363
+
364
+ - **NVIDIA router context guard**: NVIDIA hosted의 router context 기본값을 32K로
365
+ 낮추고 LLM preset이 이 cap을 조정할 수 있게 하여, 긴 Claude Code 세션에서
366
+ payload가 커져 timeout이 나는 상황을 줄였습니다.
367
+ - **Upstream activity status**: router가 현재 request/retry/success/error 상태와
368
+ 추정 token/byte 크기를 기록하여, statusline에서 upstream 대기와 idle 상태를
369
+ 구분할 수 있습니다.
370
+
354
371
  ### 0.1.34
355
372
 
356
373
  - **완전한 headless 설정 경로**: `--ca-env-file`, 환경변수 매핑, Advisor model,
package/docs/README.zh.md CHANGED
@@ -47,7 +47,7 @@ NIM,并把普通 Claude Code 参数原样传递。
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 当前版本: `0.1.34`
50
+ 当前版本: `0.1.36`
51
51
 
52
52
  ## 为什么存在
53
53
 
@@ -337,6 +337,21 @@ Hermes 格式模型或部分较旧的 Qwen tool template。
337
337
 
338
338
  ## 更新日志
339
339
 
340
+ ### 0.1.36
341
+
342
+ - **NVIDIA upstream streaming**:NVIDIA hosted router 调用现在也会向 upstream
343
+ 使用 `stream=true`,长响应可以按 chunk 流出,不再等待完整的非流式 completion。
344
+ - **Stream retry diagnostics**:streaming NVIDIA 调用也保留 statusline 使用的
345
+ retry/request size activity 状态。
346
+
347
+ ### 0.1.35
348
+
349
+ - **NVIDIA router context guard**:NVIDIA hosted 的 router context 默认值改为
350
+ 32K,并允许 LLM preset 调整该 cap,减少长 Claude Code 会话中 payload 变大后
351
+ 触发 timeout 的情况。
352
+ - **Upstream activity status**:router 会记录当前 request/retry/success/error
353
+ 状态和估算 token/byte 大小,statusline 可以区分正在等待 upstream 还是已 idle。
354
+
340
355
  ### 0.1.34
341
356
 
342
357
  - **完整 headless 配置路径**:新增 `--ca-env-file`、环境变量映射、Advisor
package/docs/manual.md CHANGED
@@ -10,7 +10,7 @@ Code starts, while passing normal Claude Code arguments through unchanged.
10
10
 
11
11
  Credits: One Ciel LLC
12
12
 
13
- Current version: `0.1.34`
13
+ Current version: `0.1.36`
14
14
 
15
15
  ## Install
16
16
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@oneciel-ai/claude-any",
3
- "version": "0.1.34",
3
+ "version": "0.1.36",
4
4
  "description": "Claude Code provider selector for Anthropic, Ollama, Ollama Cloud, vLLM, NVIDIA hosted, and self-hosted NIM.",
5
5
  "license": "MIT",
6
6
  "author": "One Ciel LLC",