@oneciel-ai/claude-any 0.1.35 → 0.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,7 @@ arguments through unchanged.
48
48
 
49
49
  Credits: One Ciel LLC
50
50
 
51
- Current version: `0.1.35`
51
+ Current version: `0.1.37`
52
52
 
53
53
  ## Why This Exists
54
54
 
@@ -381,6 +381,22 @@ steps under that larger model's supervision.
381
381
 
382
382
  ## Changelog
383
383
 
384
+ ### 0.1.37
385
+
386
+ - **Pseudo tool-call recovery**: the NVIDIA/OpenAI-compatible stream path now
387
+ suppresses `<|tool_calls_section_begin|>...` pseudo tool-call text and
388
+ converts it back into Claude `tool_use` blocks when possible.
389
+ - **Streaming defaults**: provider streaming defaults to on; NVIDIA hosted
390
+ remains forced to the streaming upstream path for stability.
391
+
392
+ ### 0.1.36
393
+
394
+ - **NVIDIA upstream streaming**: NVIDIA hosted router calls now use upstream
395
+ `stream=true`, so long responses can flow as chunks instead of waiting for a
396
+ full non-streaming completion.
397
+ - **Stream retry diagnostics**: streamed NVIDIA calls keep the same retry and
398
+ request-size activity status used by the statusline.
399
+
384
400
  ### 0.1.35
385
401
 
386
402
  - **NVIDIA router context guard**: NVIDIA hosted now defaults to a 32K router
package/claude_any.py CHANGED
@@ -85,7 +85,7 @@ PROVIDER_LABELS = {
85
85
  "self-hosted-nim": "Self Hosted NIM",
86
86
  }
87
87
  APP_NAME = "Claude Any"
88
- VERSION = "0.1.35"
88
+ VERSION = "0.1.37"
89
89
  CREDITS = "Credits: One Ciel LLC"
90
90
 
91
91
  LOG_LEVELS = {"SILENT": 0, "ERROR": 1, "WARN": 2, "INFO": 3, "DEBUG": 4, "TRACE": 5}
@@ -790,6 +790,13 @@ def apply_config_migrations(cfg: dict[str, Any]) -> None:
790
790
  if isinstance(pcfg, dict) and not positive_int(pcfg.get("context_window")):
791
791
  pcfg["context_window"] = 32768
792
792
  migrations[marker] = True
793
+
794
+ marker = "stream_enabled_default_true_20260513"
795
+ if not migrations.get(marker):
796
+ for pcfg in (cfg.get("providers") or {}).values():
797
+ if isinstance(pcfg, dict) and "stream_enabled" not in pcfg:
798
+ pcfg["stream_enabled"] = True
799
+ migrations[marker] = True
793
800
 
794
801
 
795
802
  _config_cache: dict[str, Any] | None = None
@@ -3988,7 +3995,7 @@ def maybe_handle_advisor_request(handler: BaseHTTPRequestHandler, provider: str,
3988
3995
  return True
3989
3996
 
3990
3997
 
3991
- def normalize_tool_arguments(tool_name: str, args: Any) -> dict[str, Any]:
3998
+ def normalize_tool_arguments(tool_name: str, args: Any) -> dict[str, Any]:
3992
3999
  if isinstance(args, dict):
3993
4000
  return args
3994
4001
  if isinstance(args, str):
@@ -4003,17 +4010,86 @@ def normalize_tool_arguments(tool_name: str, args: Any) -> dict[str, Any]:
4003
4010
  pass
4004
4011
  if tool_name == "Bash":
4005
4012
  return {"command": text}
4006
- return {}
4007
-
4008
-
4009
- def ollama_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4010
- message = data.get("message") if isinstance(data.get("message"), dict) else {}
4011
- content: list[dict[str, Any]] = []
4012
- text = message.get("content") or ""
4013
- if text:
4014
- content.append({"type": "text", "text": text})
4015
- tool_id_prefix = f"toolu_ollama_{int(time.time() * 1000)}_{os.getpid()}"
4016
- for i, call in enumerate(message.get("tool_calls") or []):
4013
+ return {}
4014
+
4015
+
4016
+ PSEUDO_TOOL_START = "<|tool_calls_section_begin|>"
4017
+ PSEUDO_TOOL_END = "<|tool_calls_section_end|>"
4018
+ PSEUDO_CALL_BEGIN = "<|tool_call_begin|>"
4019
+ PSEUDO_ARG_BEGIN = "<|tool_call_argument_begin|>"
4020
+ PSEUDO_CALL_END = "<|tool_call_end|>"
4021
+
4022
+
4023
+ def infer_tool_name_from_args(args: dict[str, Any]) -> str:
4024
+ keys = set(args)
4025
+ if "command" in keys:
4026
+ return "Bash"
4027
+ if {"file_path", "content"}.issubset(keys):
4028
+ return "Write"
4029
+ if {"file_path", "old_string", "new_string"}.issubset(keys):
4030
+ return "Edit"
4031
+ if "file_path" in keys:
4032
+ return "Read"
4033
+ if "taskId" in keys and "status" in keys:
4034
+ return "TaskUpdate"
4035
+ return "TaskList" if not args else "Write"
4036
+
4037
+
4038
+ def parse_pseudo_tool_calls(text: str) -> tuple[str, list[dict[str, Any]]]:
4039
+ if PSEUDO_TOOL_START not in text:
4040
+ return text, []
4041
+ visible_parts: list[str] = []
4042
+ calls: list[dict[str, Any]] = []
4043
+ pos = 0
4044
+ while True:
4045
+ start = text.find(PSEUDO_TOOL_START, pos)
4046
+ if start < 0:
4047
+ visible_parts.append(text[pos:])
4048
+ break
4049
+ visible_parts.append(text[pos:start])
4050
+ end = text.find(PSEUDO_TOOL_END, start)
4051
+ if end < 0:
4052
+ section = text[start + len(PSEUDO_TOOL_START):]
4053
+ pos = len(text)
4054
+ else:
4055
+ section = text[start + len(PSEUDO_TOOL_START):end]
4056
+ pos = end + len(PSEUDO_TOOL_END)
4057
+ for match in re.finditer(
4058
+ re.escape(PSEUDO_CALL_BEGIN) + r"(.*?)" + re.escape(PSEUDO_ARG_BEGIN) + r"(.*?)" + re.escape(PSEUDO_CALL_END),
4059
+ section,
4060
+ flags=re.DOTALL,
4061
+ ):
4062
+ raw_header = match.group(1).strip()
4063
+ raw_args = match.group(2).strip()
4064
+ try:
4065
+ args = json.loads(raw_args)
4066
+ except Exception:
4067
+ continue
4068
+ if not isinstance(args, dict):
4069
+ continue
4070
+ name = ""
4071
+ for part in re.split(r"[\s:|,]+", raw_header):
4072
+ candidate = _fuzzy_match_tool_name(part)
4073
+ if candidate:
4074
+ name = candidate
4075
+ break
4076
+ if not name:
4077
+ name = infer_tool_name_from_args(args)
4078
+ calls.append({"function": {"name": name, "arguments": args}, "id": raw_header})
4079
+ if end < 0:
4080
+ break
4081
+ return "".join(visible_parts), calls
4082
+
4083
+
4084
+ def ollama_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4085
+ message = data.get("message") if isinstance(data.get("message"), dict) else {}
4086
+ content: list[dict[str, Any]] = []
4087
+ text = message.get("content") or ""
4088
+ text, pseudo_tool_calls = parse_pseudo_tool_calls(text)
4089
+ if text:
4090
+ content.append({"type": "text", "text": text})
4091
+ tool_id_prefix = f"toolu_ollama_{int(time.time() * 1000)}_{os.getpid()}"
4092
+ for i, call in enumerate(list(message.get("tool_calls") or []) + pseudo_tool_calls):
4017
4093
  fn = call.get("function") if isinstance(call, dict) else {}
4018
4094
  if not isinstance(fn, dict) or not fn.get("name"):
4019
4095
  continue
@@ -4579,7 +4655,7 @@ def forward_ollama_api_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg
4579
4655
  write_json(handler, message)
4580
4656
 
4581
4657
 
4582
- def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4658
+ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
4583
4659
  choice = {}
4584
4660
  choices = data.get("choices")
4585
4661
  if isinstance(choices, list) and choices:
@@ -4592,10 +4668,228 @@ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict
4592
4668
  },
4593
4669
  "done_reason": "length" if choice.get("finish_reason") == "length" else "stop",
4594
4670
  }
4595
- return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
4596
-
4597
-
4598
- def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
4671
+ return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
4672
+
4673
+
4674
+ def stream_openai_chat_to_anthropic_sse(
4675
+ handler: BaseHTTPRequestHandler,
4676
+ resp: Any,
4677
+ model: str,
4678
+ provider: str,
4679
+ source_body: dict[str, Any] | None = None,
4680
+ start_index: int = 0,
4681
+ word_chunking: bool = False,
4682
+ ) -> None:
4683
+ next_content_index = start_index
4684
+ text_started = False
4685
+ text_suppressed_for_plan = False
4686
+ text_index: int | None = None
4687
+ text_so_far = ""
4688
+ pseudo_text = ""
4689
+ pseudo_mode = False
4690
+ text_buffer = ""
4691
+ tool_fragments: dict[int, dict[str, Any]] = {}
4692
+ output_tokens = 0
4693
+ finish_reason = "stop"
4694
+
4695
+ def emit(event_name: str, payload: dict[str, Any]) -> None:
4696
+ handler.wfile.write(f"event: {event_name}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n".encode())
4697
+ handler.wfile.flush()
4698
+
4699
+ def ensure_text_started() -> int:
4700
+ nonlocal text_started, text_index, next_content_index
4701
+ if text_started and text_index is not None:
4702
+ return text_index
4703
+ text_started = True
4704
+ text_index = next_content_index
4705
+ next_content_index += 1
4706
+ emit(
4707
+ "content_block_start",
4708
+ {"type": "content_block_start", "index": text_index, "content_block": {"type": "text", "text": ""}},
4709
+ )
4710
+ return text_index
4711
+
4712
+ def emit_text_delta(text: str) -> None:
4713
+ if not text:
4714
+ return
4715
+ idx = ensure_text_started()
4716
+ emit(
4717
+ "content_block_delta",
4718
+ {"type": "content_block_delta", "index": idx, "delta": {"type": "text_delta", "text": text}},
4719
+ )
4720
+
4721
+ try:
4722
+ for raw_line in resp:
4723
+ line = raw_line.decode("utf-8", errors="ignore").strip()
4724
+ if not line or line.startswith(":"):
4725
+ continue
4726
+ if line.startswith("data:"):
4727
+ line = line[5:].strip()
4728
+ if not line or line == "[DONE]":
4729
+ break
4730
+ try:
4731
+ event = json.loads(line)
4732
+ except Exception:
4733
+ continue
4734
+ if not isinstance(event, dict):
4735
+ continue
4736
+ usage = event.get("usage")
4737
+ if isinstance(usage, dict):
4738
+ output_tokens = max(output_tokens, positive_int(usage.get("completion_tokens")) or 0)
4739
+ choices = event.get("choices")
4740
+ if not isinstance(choices, list) or not choices:
4741
+ continue
4742
+ choice = choices[0] if isinstance(choices[0], dict) else {}
4743
+ if choice.get("finish_reason"):
4744
+ finish_reason = str(choice.get("finish_reason"))
4745
+ delta = choice.get("delta") if isinstance(choice.get("delta"), dict) else {}
4746
+ text_chunk = delta.get("content") or ""
4747
+ if text_chunk:
4748
+ if pseudo_mode or PSEUDO_TOOL_START in text_chunk:
4749
+ before, sep, after = text_chunk.partition(PSEUDO_TOOL_START)
4750
+ if before and not pseudo_mode:
4751
+ text_so_far += before
4752
+ if word_chunking:
4753
+ text_buffer += before
4754
+ to_flush, text_buffer = _split_word_buffer(text_buffer, force=False)
4755
+ emit_text_delta(to_flush)
4756
+ else:
4757
+ emit_text_delta(before)
4758
+ pseudo_mode = True
4759
+ pseudo_text += (sep + after) if sep else text_chunk
4760
+ if PSEUDO_TOOL_END in pseudo_text:
4761
+ pseudo_mode = False
4762
+ continue
4763
+ if source_body is not None and not text_started and not tool_fragments and should_auto_enter_plan_mode(source_body, text_so_far + text_chunk, []):
4764
+ text_so_far += text_chunk
4765
+ text_suppressed_for_plan = True
4766
+ continue
4767
+ text_so_far += text_chunk
4768
+ if word_chunking:
4769
+ text_buffer += text_chunk
4770
+ to_flush, text_buffer = _split_word_buffer(text_buffer, force=False)
4771
+ emit_text_delta(to_flush)
4772
+ else:
4773
+ emit_text_delta(text_chunk)
4774
+ for call in delta.get("tool_calls") or []:
4775
+ if not isinstance(call, dict):
4776
+ continue
4777
+ try:
4778
+ call_index = int(call.get("index"))
4779
+ except Exception:
4780
+ call_index = len(tool_fragments)
4781
+ slot = tool_fragments.setdefault(call_index, {"id": "", "name": "", "arguments": ""})
4782
+ if call.get("id"):
4783
+ slot["id"] = str(call.get("id"))
4784
+ fn = call.get("function") if isinstance(call.get("function"), dict) else {}
4785
+ if fn.get("name"):
4786
+ slot["name"] += str(fn.get("name"))
4787
+ if fn.get("arguments"):
4788
+ slot["arguments"] += str(fn.get("arguments"))
4789
+ if word_chunking and text_buffer:
4790
+ to_flush, text_buffer = _split_word_buffer(text_buffer, force=True)
4791
+ emit_text_delta(to_flush)
4792
+
4793
+ tool_calls: list[dict[str, Any]] = []
4794
+ _, pseudo_tool_calls = parse_pseudo_tool_calls(pseudo_text)
4795
+ for i, pseudo in enumerate(pseudo_tool_calls):
4796
+ fn = pseudo.get("function") if isinstance(pseudo, dict) else {}
4797
+ if isinstance(fn, dict):
4798
+ tool_fragments.setdefault(100000 + i, {
4799
+ "id": str(pseudo.get("id") or ""),
4800
+ "name": str(fn.get("name") or ""),
4801
+ "arguments": json.dumps(fn.get("arguments") or {}, ensure_ascii=False),
4802
+ })
4803
+ for _, fragment in sorted(tool_fragments.items()):
4804
+ raw_name = str(fragment.get("name") or "")
4805
+ if not raw_name:
4806
+ continue
4807
+ matched_name = _fuzzy_match_tool_name(raw_name) or raw_name
4808
+ normalized_args = normalize_tool_arguments(matched_name, fragment.get("arguments") or {})
4809
+ fixed_input = _validate_and_fix_tool_input(matched_name, normalized_args)
4810
+ if source_body is not None:
4811
+ matched_name, fixed_input = plan_mode_tool_name_for_emit(source_body, matched_name, fixed_input)
4812
+ if matched_name is None:
4813
+ continue
4814
+ tool_calls.append({"function": {"name": matched_name, "arguments": fixed_input}})
4815
+ tool_index = next_content_index
4816
+ next_content_index += 1
4817
+ tool_id = str(fragment.get("id") or f"toolu_openai_{int(time.time() * 1000)}_{tool_index}")
4818
+ append_tool_call_log(
4819
+ "openai_stream_tool_call",
4820
+ {
4821
+ "model": model,
4822
+ "raw_name": raw_name,
4823
+ "matched_name": matched_name,
4824
+ "raw_arguments": fragment.get("arguments"),
4825
+ "emitted_input": fixed_input,
4826
+ "sse_index": tool_index,
4827
+ },
4828
+ )
4829
+ emit(
4830
+ "content_block_start",
4831
+ {
4832
+ "type": "content_block_start",
4833
+ "index": tool_index,
4834
+ "content_block": {"type": "tool_use", "id": tool_id, "name": matched_name, "input": {}},
4835
+ },
4836
+ )
4837
+ emit(
4838
+ "content_block_delta",
4839
+ {
4840
+ "type": "content_block_delta",
4841
+ "index": tool_index,
4842
+ "delta": {"type": "input_json_delta", "partial_json": json.dumps(fixed_input, ensure_ascii=False)},
4843
+ },
4844
+ )
4845
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4846
+
4847
+ if source_body is not None and should_auto_enter_plan_mode(source_body, text_so_far, tool_calls):
4848
+ router_log("WARN", "auto-synthesized EnterPlanMode from short/empty upstream OpenAI stream")
4849
+ tool_index = next_content_index
4850
+ next_content_index += 1
4851
+ tool_calls.append({"function": {"name": "EnterPlanMode", "arguments": {}}})
4852
+ emit(
4853
+ "content_block_start",
4854
+ {
4855
+ "type": "content_block_start",
4856
+ "index": tool_index,
4857
+ "content_block": {"type": "tool_use", "id": f"toolu_openai_plan_{int(time.time() * 1000)}", "name": "EnterPlanMode", "input": {}},
4858
+ },
4859
+ )
4860
+ emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
4861
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4862
+ elif text_suppressed_for_plan and text_so_far:
4863
+ emit_text_delta(text_so_far)
4864
+
4865
+ if source_body is not None and should_keep_work_alive_with_tasklist(source_body, text_so_far, tool_calls):
4866
+ router_log("WARN", "auto-synthesized TaskList to keep work moving after OpenAI stream")
4867
+ tool_index = next_content_index
4868
+ next_content_index += 1
4869
+ tool_calls.append({"function": {"name": "TaskList", "arguments": {}}})
4870
+ emit(
4871
+ "content_block_start",
4872
+ {
4873
+ "type": "content_block_start",
4874
+ "index": tool_index,
4875
+ "content_block": {"type": "tool_use", "id": f"toolu_openai_keepalive_{int(time.time() * 1000)}", "name": "TaskList", "input": {}},
4876
+ },
4877
+ )
4878
+ emit("content_block_delta", {"type": "content_block_delta", "index": tool_index, "delta": {"type": "input_json_delta", "partial_json": "{}"}})
4879
+ emit("content_block_stop", {"type": "content_block_stop", "index": tool_index})
4880
+
4881
+ if text_started and text_index is not None:
4882
+ emit("content_block_stop", {"type": "content_block_stop", "index": text_index})
4883
+ stop_reason = "tool_use" if tool_calls else ("max_tokens" if finish_reason == "length" else "end_turn")
4884
+ write_anthropic_open_stream_stop(handler, {"stop_reason": stop_reason, "usage": {"output_tokens": output_tokens or max(1, len(text_so_far) // 4)}})
4885
+ finally:
4886
+ try:
4887
+ resp.close()
4888
+ except Exception:
4889
+ pass
4890
+
4891
+
4892
+ def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
4599
4893
  if raw is None:
4600
4894
  raw = exc.read().decode("utf-8", errors="ignore")
4601
4895
  msg = raw.strip() or str(exc)
@@ -4636,7 +4930,7 @@ def retryable_timeout_exception(exc: BaseException) -> bool:
4636
4930
  return "timed out" in text or "timeout" in text
4637
4931
 
4638
4932
 
4639
- def post_json_with_rate_retry(
4933
+ def post_json_with_rate_retry(
4640
4934
  url: str,
4641
4935
  req_body: Any,
4642
4936
  headers: dict[str, str],
@@ -4698,50 +4992,129 @@ def post_json_with_rate_retry(
4698
4992
  continue
4699
4993
  write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
4700
4994
  raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
4701
- raise RuntimeError("upstream request failed")
4702
-
4703
-
4704
- def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
4705
- _update_tool_schema_registry(body.get("tools"))
4706
- model = resolve_requested_model(provider, pcfg, body.get("model"))
4707
- if provider == "nvidia-hosted":
4708
- model = ncp_model_id_for_nvidia_hosted(model)
4709
- req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=False)
4710
- url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
4711
- waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
4712
- stream = bool(body.get("stream", True))
4713
- notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
4714
- if stream:
4715
- write_anthropic_open_stream_start(handler, model)
4716
- index = 0
4717
- if notice:
4718
- index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
4719
- try:
4995
+ raise RuntimeError("upstream request failed")
4996
+
4997
+
4998
+ def open_openai_stream_with_rate_retry(
4999
+ url: str,
5000
+ req_body: Any,
5001
+ headers: dict[str, str],
5002
+ timeout: float,
5003
+ provider: str,
5004
+ pcfg: dict[str, Any],
5005
+ model: str,
5006
+ retry_notice: Callable[[str], None] | None = None,
5007
+ ) -> Any:
5008
+ gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
5009
+ max_attempts = max(1, gateway_retries + 1)
5010
+ token_estimate = estimate_tokens(req_body)
5011
+ byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
5012
+ data_bytes = json.dumps(req_body).encode("utf-8")
5013
+ for attempt in range(max_attempts):
5014
+ try:
5015
+ write_router_activity(
5016
+ "request",
5017
+ provider,
5018
+ model,
5019
+ attempt=attempt + 1,
5020
+ total=max_attempts,
5021
+ tokens=token_estimate,
5022
+ bytes=byte_estimate,
5023
+ timeout=timeout,
5024
+ stream=True,
5025
+ )
5026
+ router_log("INFO", f"upstream_stream_request provider={provider} model={model} attempt={attempt + 1}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}")
5027
+ req = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
5028
+ resp = urllib.request.urlopen(req, timeout=timeout)
5029
+ learn_router_rate_limit_headers(provider, pcfg, model, resp.headers)
5030
+ return resp
5031
+ except urllib.error.HTTPError as exc:
5032
+ raw = exc.read().decode("utf-8", errors="ignore")
5033
+ learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
5034
+ if exc.code == 429 and attempt == 0:
5035
+ wait = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
5036
+ time.sleep(wait)
5037
+ continue
5038
+ if exc.code in UPSTREAM_RETRY_HTTP_CODES and attempt + 1 < max_attempts:
5039
+ retry_no = attempt + 1
5040
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
5041
+ router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} code={exc.code} tokens={token_estimate} bytes={byte_estimate}")
5042
+ if retry_notice:
5043
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
5044
+ time.sleep(upstream_retry_wait_seconds(retry_no))
5045
+ continue
5046
+ write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
5047
+ raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
5048
+ except (TimeoutError, urllib.error.URLError) as exc:
5049
+ if retryable_timeout_exception(exc) and attempt + 1 < max_attempts:
5050
+ retry_no = attempt + 1
5051
+ write_router_activity("retry", provider, model, attempt=retry_no, total=gateway_retries, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
5052
+ router_log("WARN", f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} error={type(exc).__name__} tokens={token_estimate} bytes={byte_estimate}")
5053
+ if retry_notice:
5054
+ retry_notice(upstream_retry_message(retry_no, gateway_retries))
5055
+ time.sleep(upstream_retry_wait_seconds(retry_no))
5056
+ continue
5057
+ write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate, stream=True)
5058
+ raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
5059
+ raise RuntimeError("upstream stream request failed")
5060
+
5061
+
5062
+ def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
5063
+ _update_tool_schema_registry(body.get("tools"))
5064
+ model = resolve_requested_model(provider, pcfg, body.get("model"))
5065
+ if provider == "nvidia-hosted":
5066
+ model = ncp_model_id_for_nvidia_hosted(model)
5067
+ url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
5068
+ waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
5069
+ stream_enabled = bool(pcfg.get("stream_enabled", True))
5070
+ stream = True if provider == "nvidia-hosted" else bool(body.get("stream", stream_enabled)) and stream_enabled
5071
+ notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
5072
+ if stream:
5073
+ req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=True)
5074
+ write_anthropic_open_stream_start(handler, model)
5075
+ index = 0
5076
+ if notice:
5077
+ index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
5078
+ try:
4720
5079
  def emit_retry_notice(text: str) -> None:
4721
5080
  nonlocal index
4722
5081
  index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": text + "\n"}], index)
4723
5082
 
4724
- data = post_json_with_rate_retry(
4725
- url,
4726
- req_body,
4727
- provider_headers(provider, pcfg),
4728
- provider_request_timeout_seconds(pcfg),
4729
- provider,
5083
+ resp = open_openai_stream_with_rate_retry(
5084
+ url,
5085
+ req_body,
5086
+ provider_headers(provider, pcfg),
5087
+ provider_request_timeout_seconds(pcfg),
5088
+ provider,
4730
5089
  pcfg,
4731
- model,
4732
- emit_retry_notice,
4733
- )
4734
- except RuntimeError as exc:
4735
- msg = str(exc)
4736
- write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
4737
- write_anthropic_open_stream_stop(handler)
4738
- return
4739
- message = openai_chat_to_anthropic(data, model, source_body=body)
4740
- write_anthropic_stream_blocks(handler, list(message.get("content") or []), index)
4741
- write_anthropic_open_stream_stop(handler, message)
4742
- return
4743
- try:
4744
- data = post_json_with_rate_retry(
5090
+ model,
5091
+ emit_retry_notice,
5092
+ )
5093
+ stream_openai_chat_to_anthropic_sse(
5094
+ handler,
5095
+ resp,
5096
+ model,
5097
+ provider,
5098
+ source_body=body,
5099
+ start_index=index,
5100
+ word_chunking=bool(pcfg.get("stream_word_chunking", False)),
5101
+ )
5102
+ write_router_activity("success", provider, model, tokens=estimate_tokens(req_body), bytes=len(json.dumps(req_body, ensure_ascii=False).encode("utf-8")), stream=True)
5103
+ except RuntimeError as exc:
5104
+ msg = str(exc)
5105
+ write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
5106
+ write_anthropic_open_stream_stop(handler)
5107
+ return
5108
+ except Exception as exc:
5109
+ msg = f"{type(exc).__name__}: {exc}"
5110
+ write_router_activity("error", provider, model, error=type(exc).__name__, stream=True)
5111
+ write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
5112
+ write_anthropic_open_stream_stop(handler)
5113
+ return
5114
+ return
5115
+ req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=False)
5116
+ try:
5117
+ data = post_json_with_rate_retry(
4745
5118
  url,
4746
5119
  req_body,
4747
5120
  provider_headers(provider, pcfg),
package/docs/README.ja.md CHANGED
@@ -47,7 +47,7 @@ vLLM、NVIDIA hosted、self-hosted NIM を選択し、通常の Claude Code 引
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 現在のバージョン: `0.1.35`
50
+ 現在のバージョン: `0.1.37`
51
51
 
52
52
  ## 作られた理由
53
53
 
@@ -351,6 +351,22 @@ Windows/Linux 管理、クリーンアップスクリプト、定期的なセキ
351
351
 
352
352
  ## 変更履歴
353
353
 
354
+ ### 0.1.37
355
+
356
+ - **Pseudo tool-call recovery**: NVIDIA/OpenAI-compatible stream 経路で
357
+ `<|tool_calls_section_begin|>...` pseudo tool-call テキストを画面に出さず、
358
+ 可能な場合は Claude `tool_use` ブロックへ復元します。
359
+ - **Streaming defaults**: provider streaming の既定値は on です。NVIDIA hosted
360
+ は安定性のため upstream streaming 経路に固定されます。
361
+
362
+ ### 0.1.36
363
+
364
+ - **NVIDIA upstream streaming**: NVIDIA hosted router 呼び出しは upstream にも
365
+ `stream=true` を使用します。長い応答を完全な non-streaming completion まで
366
+ 待たず、chunk として流せます。
367
+ - **Stream retry diagnostics**: streaming NVIDIA 呼び出しでも statusline 用の
368
+ retry/request size activity 状態を維持します。
369
+
354
370
  ### 0.1.35
355
371
 
356
372
  - **NVIDIA router context guard**: NVIDIA hosted の router context 既定値を 32K
package/docs/README.ko.md CHANGED
@@ -47,7 +47,7 @@ NVIDIA hosted, self-hosted NIM을 선택하고, Claude Code의 일반 인자는
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 현재 버전: `0.1.35`
50
+ 현재 버전: `0.1.37`
51
51
 
52
52
  ## 왜 만들었나
53
53
 
@@ -351,6 +351,22 @@ Windows 이벤트 로그 리뷰, 바이러스/랜섬웨어 침입 시도 정리,
351
351
 
352
352
  ## 변경 이력
353
353
 
354
+ ### 0.1.37
355
+
356
+ - **Pseudo tool-call recovery**: NVIDIA/OpenAI-compatible stream 경로에서
357
+ `<|tool_calls_section_begin|>...` pseudo tool-call 텍스트를 화면에 출력하지
358
+ 않고 가능한 경우 Claude `tool_use` 블록으로 복구합니다.
359
+ - **Streaming defaults**: provider streaming 기본값은 on이며, NVIDIA hosted는
360
+ 안정성을 위해 upstream streaming 경로로 고정됩니다.
361
+
362
+ ### 0.1.36
363
+
364
+ - **NVIDIA upstream streaming**: NVIDIA hosted router 호출은 이제 upstream에도
365
+ `stream=true`를 사용합니다. 긴 응답을 전체 완료까지 기다리지 않고 chunk로
366
+ 흘려보내 timeout 가능성을 낮춥니다.
367
+ - **Stream retry diagnostics**: streaming NVIDIA 호출도 statusline에서 쓰는
368
+ retry/request size activity 상태를 유지합니다.
369
+
354
370
  ### 0.1.35
355
371
 
356
372
  - **NVIDIA router context guard**: NVIDIA hosted의 router context 기본값을 32K로
package/docs/README.zh.md CHANGED
@@ -47,7 +47,7 @@ NIM,并把普通 Claude Code 参数原样传递。
47
47
 
48
48
  Credits: One Ciel LLC
49
49
 
50
- 当前版本: `0.1.35`
50
+ 当前版本: `0.1.37`
51
51
 
52
52
  ## 为什么存在
53
53
 
@@ -337,6 +337,21 @@ Hermes 格式模型或部分较旧的 Qwen tool template。
337
337
 
338
338
  ## 更新日志
339
339
 
340
+ ### 0.1.37
341
+
342
+ - **Pseudo tool-call recovery**:NVIDIA/OpenAI-compatible stream 路径现在会
343
+ 隐藏 `<|tool_calls_section_begin|>...` pseudo tool-call 文本,并尽可能恢复为
344
+ Claude `tool_use` block。
345
+ - **Streaming defaults**:provider streaming 默认开启;NVIDIA hosted 为了稳定性
346
+ 固定使用 upstream streaming 路径。
347
+
348
+ ### 0.1.36
349
+
350
+ - **NVIDIA upstream streaming**:NVIDIA hosted router 调用现在也会向 upstream
351
+ 使用 `stream=true`,长响应可以按 chunk 流出,不再等待完整的非流式 completion。
352
+ - **Stream retry diagnostics**:streaming NVIDIA 调用也保留 statusline 使用的
353
+ retry/request size activity 状态。
354
+
340
355
  ### 0.1.35
341
356
 
342
357
  - **NVIDIA router context guard**:NVIDIA hosted 的 router context 默认值改为
package/docs/manual.md CHANGED
@@ -10,7 +10,7 @@ Code starts, while passing normal Claude Code arguments through unchanged.
10
10
 
11
11
  Credits: One Ciel LLC
12
12
 
13
- Current version: `0.1.35`
13
+ Current version: `0.1.37`
14
14
 
15
15
  ## Install
16
16
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@oneciel-ai/claude-any",
3
- "version": "0.1.35",
3
+ "version": "0.1.37",
4
4
  "description": "Claude Code provider selector for Anthropic, Ollama, Ollama Cloud, vLLM, NVIDIA hosted, and self-hosted NIM.",
5
5
  "license": "MIT",
6
6
  "author": "One Ciel LLC",