@oneciel-ai/claude-any 0.1.35 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -1
- package/claude_any.py +316 -46
- package/docs/README.ja.md +9 -1
- package/docs/README.ko.md +9 -1
- package/docs/README.zh.md +8 -1
- package/docs/manual.md +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -48,7 +48,7 @@ arguments through unchanged.
|
|
|
48
48
|
|
|
49
49
|
Credits: One Ciel LLC
|
|
50
50
|
|
|
51
|
-
Current version: `0.1.35`
|
|
51
|
+
Current version: `0.1.36`
|
|
52
52
|
|
|
53
53
|
## Why This Exists
|
|
54
54
|
|
|
@@ -381,6 +381,14 @@ steps under that larger model's supervision.
|
|
|
381
381
|
|
|
382
382
|
## Changelog
|
|
383
383
|
|
|
384
|
+
### 0.1.36
|
|
385
|
+
|
|
386
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router calls now use upstream
|
|
387
|
+
`stream=true`, so long responses can flow as chunks instead of waiting for a
|
|
388
|
+
full non-streaming completion.
|
|
389
|
+
- **Stream retry diagnostics**: streamed NVIDIA calls keep the same retry and
|
|
390
|
+
request-size activity status used by the statusline.
|
|
391
|
+
|
|
384
392
|
### 0.1.35
|
|
385
393
|
|
|
386
394
|
- **NVIDIA router context guard**: NVIDIA hosted now defaults to a 32K router
|
package/claude_any.py
CHANGED
|
@@ -85,7 +85,7 @@ PROVIDER_LABELS = {
|
|
|
85
85
|
"self-hosted-nim": "Self Hosted NIM",
|
|
86
86
|
}
|
|
87
87
|
APP_NAME = "Claude Any"
|
|
88
|
-
VERSION = "0.1.35"
|
|
88
|
+
VERSION = "0.1.36"
|
|
89
89
|
CREDITS = "Credits: One Ciel LLC"
|
|
90
90
|
|
|
91
91
|
LOG_LEVELS = {"SILENT": 0, "ERROR": 1, "WARN": 2, "INFO": 3, "DEBUG": 4, "TRACE": 5}
|
|
@@ -4579,7 +4579,7 @@ def forward_ollama_api_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg
|
|
|
4579
4579
|
write_json(handler, message)
|
|
4580
4580
|
|
|
4581
4581
|
|
|
4582
|
-
def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
4582
|
+
def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
4583
4583
|
choice = {}
|
|
4584
4584
|
choices = data.get("choices")
|
|
4585
4585
|
if isinstance(choices, list) and choices:
|
|
@@ -4592,10 +4592,202 @@ def openai_chat_to_anthropic(data: dict[str, Any], model: str, source_body: dict
|
|
|
4592
4592
|
},
|
|
4593
4593
|
"done_reason": "length" if choice.get("finish_reason") == "length" else "stop",
|
|
4594
4594
|
}
|
|
4595
|
-
return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
|
|
4596
|
-
|
|
4597
|
-
|
|
4598
|
-
def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
|
|
4595
|
+
return ollama_chat_to_anthropic(wrapped, model, source_body=source_body)
|
|
4596
|
+
|
|
4597
|
+
|
|
4598
|
+
def stream_openai_chat_to_anthropic_sse(
    handler: BaseHTTPRequestHandler,
    resp: Any,
    model: str,
    provider: str,
    source_body: dict[str, Any] | None = None,
    start_index: int = 0,
    word_chunking: bool = False,
) -> None:
    """Relay an upstream OpenAI-style chat stream as Anthropic-style SSE events.

    Reads ``resp`` line by line, forwards ``delta.content`` as ``text_delta``
    events, accumulates ``delta.tool_calls`` fragments and emits each finished
    call as one ``tool_use`` block after the upstream stream ends, then writes
    the closing events through ``write_anthropic_open_stream_stop``.

    Args:
        handler: Request handler whose ``wfile`` receives the SSE bytes.
        resp: Iterable of raw ``bytes`` lines with a ``close()`` method; it is
            always closed before returning, even on error.
        model: Model id, recorded in the tool-call log entries.
        provider: Provider key. Accepted for signature parity; not read here.
        source_body: Original client request. When given, it enables the
            plan-mode text hold-back, the plan-mode tool renaming and the
            synthesized ``EnterPlanMode`` / ``TaskList`` tool calls.
        start_index: First content-block index to use (blocks already written
            by the caller occupy the lower indices).
        word_chunking: When true, text is passed through ``_split_word_buffer``
            before being forwarded instead of being forwarded chunk by chunk.

    Raises:
        Whatever ``resp`` iteration, ``handler.wfile`` or the sibling helpers
        raise; nothing is swallowed except malformed JSON lines and errors
        from ``resp.close()``.
    """
    next_content_index = start_index
    text_started = False
    text_index: int | None = None
    text_so_far = ""  # every text chunk received, forwarded or not
    withheld_text = ""  # received but NOT yet forwarded (plan-mode hold-back)
    text_buffer = ""  # word-chunking remainder awaiting a flush
    tool_fragments: dict[int, dict[str, Any]] = {}
    output_tokens = 0
    finish_reason = "stop"

    def emit(event_name: str, payload: dict[str, Any]) -> None:
        """Write one SSE event and flush so the client sees it immediately."""
        handler.wfile.write(f"event: {event_name}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n".encode())
        handler.wfile.flush()

    def allocate_index() -> int:
        """Reserve and return the next content-block index."""
        nonlocal next_content_index
        allocated = next_content_index
        next_content_index += 1
        return allocated

    def ensure_text_started() -> int:
        """Open the text block on first use and return its index."""
        nonlocal text_started, text_index
        if text_started and text_index is not None:
            return text_index
        text_started = True
        text_index = allocate_index()
        emit(
            "content_block_start",
            {"type": "content_block_start", "index": text_index, "content_block": {"type": "text", "text": ""}},
        )
        return text_index

    def emit_text_delta(text: str) -> None:
        """Forward non-empty text as a ``text_delta`` on the text block."""
        if not text:
            return
        idx = ensure_text_started()
        emit(
            "content_block_delta",
            {"type": "content_block_delta", "index": idx, "delta": {"type": "text_delta", "text": text}},
        )

    def emit_tool_use(index: int, tool_id: str, name: str, input_json: str) -> None:
        """Emit a complete tool_use block (start, single JSON delta, stop)."""
        emit(
            "content_block_start",
            {
                "type": "content_block_start",
                "index": index,
                "content_block": {"type": "tool_use", "id": tool_id, "name": name, "input": {}},
            },
        )
        emit(
            "content_block_delta",
            {
                "type": "content_block_delta",
                "index": index,
                "delta": {"type": "input_json_delta", "partial_json": input_json},
            },
        )
        emit("content_block_stop", {"type": "content_block_stop", "index": index})

    try:
        for raw_line in resp:
            line = raw_line.decode("utf-8", errors="ignore").strip()
            if not line or line.startswith(":"):
                continue  # blank separator or SSE comment / keep-alive
            if line.startswith("data:"):
                line = line[5:].strip()
            if line == "[DONE]":
                break
            if not line:
                # An empty data payload is not a terminator; only [DONE] is.
                continue
            try:
                event = json.loads(line)
            except (ValueError, RecursionError):
                continue  # best effort: skip lines that are not valid JSON
            if not isinstance(event, dict):
                continue
            usage = event.get("usage")
            if isinstance(usage, dict):
                output_tokens = max(output_tokens, positive_int(usage.get("completion_tokens")) or 0)
            choices = event.get("choices")
            if not isinstance(choices, list) or not choices:
                continue
            choice = choices[0] if isinstance(choices[0], dict) else {}
            if choice.get("finish_reason"):
                finish_reason = str(choice.get("finish_reason"))
            delta = choice.get("delta") if isinstance(choice.get("delta"), dict) else {}

            text_chunk = delta.get("content") or ""
            if text_chunk:
                text_so_far += text_chunk
                hold_back = (
                    source_body is not None
                    and not text_started
                    and not tool_fragments
                    and should_auto_enter_plan_mode(source_body, text_so_far, [])
                )
                if hold_back:
                    # Keep the text back until we know whether the response
                    # will be replaced by a synthesized EnterPlanMode call.
                    withheld_text += text_chunk
                else:
                    # Release anything held back first so the client receives
                    # the text exactly once and in upstream order.
                    outgoing = withheld_text + text_chunk
                    withheld_text = ""
                    if word_chunking:
                        text_buffer += outgoing
                        to_flush, text_buffer = _split_word_buffer(text_buffer, force=False)
                        emit_text_delta(to_flush)
                    else:
                        emit_text_delta(outgoing)

            # Tool-call fragments are collected even when the text in the same
            # delta was held back, so no fragment is lost.
            for call in delta.get("tool_calls") or []:
                if not isinstance(call, dict):
                    continue
                try:
                    call_index = int(call.get("index"))
                except Exception:
                    call_index = len(tool_fragments)
                slot = tool_fragments.setdefault(call_index, {"id": "", "name": "", "arguments": ""})
                if call.get("id"):
                    slot["id"] = str(call.get("id"))
                fn = call.get("function") if isinstance(call.get("function"), dict) else {}
                if fn.get("name"):
                    slot["name"] += str(fn.get("name"))
                if fn.get("arguments"):
                    slot["arguments"] += str(fn.get("arguments"))

        if word_chunking and text_buffer:
            to_flush, text_buffer = _split_word_buffer(text_buffer, force=True)
            emit_text_delta(to_flush)

        tool_calls: list[dict[str, Any]] = []
        for _, fragment in sorted(tool_fragments.items()):
            raw_name = str(fragment.get("name") or "")
            if not raw_name:
                continue
            matched_name = _fuzzy_match_tool_name(raw_name) or raw_name
            normalized_args = normalize_tool_arguments(matched_name, fragment.get("arguments") or {})
            fixed_input = _validate_and_fix_tool_input(matched_name, normalized_args)
            if source_body is not None:
                matched_name, fixed_input = plan_mode_tool_name_for_emit(source_body, matched_name, fixed_input)
                if matched_name is None:
                    continue
            tool_calls.append({"function": {"name": matched_name, "arguments": fixed_input}})
            tool_index = allocate_index()
            tool_id = str(fragment.get("id") or f"toolu_openai_{int(time.time() * 1000)}_{tool_index}")
            append_tool_call_log(
                "openai_stream_tool_call",
                {
                    "model": model,
                    "raw_name": raw_name,
                    "matched_name": matched_name,
                    "raw_arguments": fragment.get("arguments"),
                    "emitted_input": fixed_input,
                    "sse_index": tool_index,
                },
            )
            emit_tool_use(tool_index, tool_id, matched_name, json.dumps(fixed_input, ensure_ascii=False))

        if source_body is not None and should_auto_enter_plan_mode(source_body, text_so_far, tool_calls):
            router_log("WARN", "auto-synthesized EnterPlanMode from short/empty upstream OpenAI stream")
            tool_index = allocate_index()
            tool_calls.append({"function": {"name": "EnterPlanMode", "arguments": {}}})
            emit_tool_use(tool_index, f"toolu_openai_plan_{int(time.time() * 1000)}", "EnterPlanMode", "{}")
        elif withheld_text:
            # No plan-mode substitution: deliver only the text that was never
            # forwarded (not text_so_far, which includes forwarded chunks).
            emit_text_delta(withheld_text)
            withheld_text = ""

        if source_body is not None and should_keep_work_alive_with_tasklist(source_body, text_so_far, tool_calls):
            router_log("WARN", "auto-synthesized TaskList to keep work moving after OpenAI stream")
            tool_index = allocate_index()
            tool_calls.append({"function": {"name": "TaskList", "arguments": {}}})
            emit_tool_use(tool_index, f"toolu_openai_keepalive_{int(time.time() * 1000)}", "TaskList", "{}")

        # NOTE(review): the text block is closed only here, i.e. after any
        # tool_use blocks were opened and closed. Kept as in the original
        # because late text may still be appended to it above; confirm that
        # clients accept this interleaving.
        if text_started and text_index is not None:
            emit("content_block_stop", {"type": "content_block_stop", "index": text_index})
        stop_reason = "tool_use" if tool_calls else ("max_tokens" if finish_reason == "length" else "end_turn")
        # Fall back to a rough length/4 estimate when upstream sent no usage.
        write_anthropic_open_stream_stop(handler, {"stop_reason": stop_reason, "usage": {"output_tokens": output_tokens or max(1, len(text_so_far) // 4)}})
    finally:
        try:
            resp.close()
        except Exception:
            pass  # closing is best effort; the stream has already been handled
|
|
4788
|
+
|
|
4789
|
+
|
|
4790
|
+
def upstream_http_error_message(exc: urllib.error.HTTPError, raw: str | None = None) -> str:
|
|
4599
4791
|
if raw is None:
|
|
4600
4792
|
raw = exc.read().decode("utf-8", errors="ignore")
|
|
4601
4793
|
msg = raw.strip() or str(exc)
|
|
@@ -4636,7 +4828,7 @@ def retryable_timeout_exception(exc: BaseException) -> bool:
|
|
|
4636
4828
|
return "timed out" in text or "timeout" in text
|
|
4637
4829
|
|
|
4638
4830
|
|
|
4639
|
-
def post_json_with_rate_retry(
|
|
4831
|
+
def post_json_with_rate_retry(
|
|
4640
4832
|
url: str,
|
|
4641
4833
|
req_body: Any,
|
|
4642
4834
|
headers: dict[str, str],
|
|
@@ -4698,50 +4890,128 @@ def post_json_with_rate_retry(
|
|
|
4698
4890
|
continue
|
|
4699
4891
|
write_router_activity("error", provider, model, error=type(exc).__name__, tokens=token_estimate, bytes=byte_estimate)
|
|
4700
4892
|
raise RuntimeError(f"{type(exc).__name__}: {exc}") from exc
|
|
4701
|
-
raise RuntimeError("upstream request failed")
|
|
4702
|
-
|
|
4703
|
-
|
|
4704
|
-
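def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None: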
|
|
4705
|
-
|
|
4706
|
-
|
|
4707
|
-
|
|
4708
|
-
|
|
4709
|
-
|
|
4710
|
-
|
|
4711
|
-
|
|
4712
|
-
|
|
4713
|
-
|
|
4714
|
-
|
|
4715
|
-
|
|
4716
|
-
|
|
4717
|
-
|
|
4718
|
-
|
|
4719
|
-
|
|
4893
|
+
raise RuntimeError("upstream request failed")
|
|
4894
|
+
|
|
4895
|
+
|
|
4896
|
+
def open_openai_stream_with_rate_retry(
    url: str,
    req_body: Any,
    headers: dict[str, str],
    timeout: float,
    provider: str,
    pcfg: dict[str, Any],
    model: str,
    retry_notice: Callable[[str], None] | None = None,
) -> Any:
    """POST ``req_body`` as JSON and return the still-open upstream response.

    The request is attempted up to ``gateway_retries + 1`` times, where
    ``gateway_retries`` comes from ``pcfg`` (defaulting to 2). A 429 on the
    first attempt waits for the registered rate-limit backoff; HTTP codes in
    ``UPSTREAM_RETRY_HTTP_CODES`` and retryable timeouts wait
    ``upstream_retry_wait_seconds`` and, when ``retry_notice`` is given, pass
    it a human-readable retry message. Each attempt, retry and failure is
    recorded through ``write_router_activity`` with ``stream=True``.

    Returns:
        The object returned by ``urllib.request.urlopen``; the caller is
        responsible for reading and closing it.

    Raises:
        RuntimeError: when the request fails with a non-retryable error or
            the attempts are exhausted.
    """
    gateway_retries = positive_int(pcfg.get("gateway_retries")) or 2
    max_attempts = max(1, gateway_retries + 1)
    token_estimate = estimate_tokens(req_body)
    byte_estimate = len(json.dumps(req_body, ensure_ascii=False).encode("utf-8"))
    data_bytes = json.dumps(req_body).encode("utf-8")

    def schedule_retry(retry_no: int, field: str, value: Any) -> None:
        """Record the retry, notify the client if requested, then back off."""
        write_router_activity(
            "retry",
            provider,
            model,
            attempt=retry_no,
            total=gateway_retries,
            **{field: value},
            tokens=token_estimate,
            bytes=byte_estimate,
            stream=True,
        )
        router_log(
            "WARN",
            f"upstream_stream_retry provider={provider} model={model} attempt={retry_no}/{gateway_retries} {field}={value} tokens={token_estimate} bytes={byte_estimate}",
        )
        if retry_notice:
            retry_notice(upstream_retry_message(retry_no, gateway_retries))
        time.sleep(upstream_retry_wait_seconds(retry_no))

    # attempt_no is 1-based; it doubles as the retry number after a failure.
    for attempt_no in range(1, max_attempts + 1):
        may_retry = attempt_no < max_attempts
        try:
            write_router_activity(
                "request",
                provider,
                model,
                attempt=attempt_no,
                total=max_attempts,
                tokens=token_estimate,
                bytes=byte_estimate,
                timeout=timeout,
                stream=True,
            )
            router_log(
                "INFO",
                f"upstream_stream_request provider={provider} model={model} attempt={attempt_no}/{max_attempts} tokens={token_estimate} bytes={byte_estimate} timeout={timeout}",
            )
            request = urllib.request.Request(url, data=data_bytes, headers=headers, method="POST")
            response = urllib.request.urlopen(request, timeout=timeout)
            learn_router_rate_limit_headers(provider, pcfg, model, response.headers)
            return response
        except urllib.error.HTTPError as exc:
            # Read the body up front so it is available for the error message.
            raw = exc.read().decode("utf-8", errors="ignore")
            learn_router_rate_limit_headers(provider, pcfg, model, exc.headers)
            if exc.code == 429 and attempt_no == 1:
                backoff = register_router_rate_limit_backoff(provider, pcfg, model, exc.headers.get("Retry-After"))
                time.sleep(backoff)
                continue
            if exc.code in UPSTREAM_RETRY_HTTP_CODES and may_retry:
                schedule_retry(attempt_no, "code", exc.code)
                continue
            write_router_activity("error", provider, model, code=exc.code, tokens=token_estimate, bytes=byte_estimate, stream=True)
            raise RuntimeError(upstream_http_error_message(exc, raw)) from exc
        except (TimeoutError, urllib.error.URLError) as exc:
            failure_name = type(exc).__name__
            if retryable_timeout_exception(exc) and may_retry:
                schedule_retry(attempt_no, "error", failure_name)
                continue
            write_router_activity("error", provider, model, error=failure_name, tokens=token_estimate, bytes=byte_estimate, stream=True)
            raise RuntimeError(f"{failure_name}: {exc}") from exc
    raise RuntimeError("upstream stream request failed")
|
|
4958
|
+
|
|
4959
|
+
|
|
4960
|
+
def forward_openai_compatible_chat(handler: BaseHTTPRequestHandler, provider: str, pcfg: dict[str, Any], body: dict[str, Any]) -> None:
|
|
4961
|
+
_update_tool_schema_registry(body.get("tools"))
|
|
4962
|
+
model = resolve_requested_model(provider, pcfg, body.get("model"))
|
|
4963
|
+
if provider == "nvidia-hosted":
|
|
4964
|
+
model = ncp_model_id_for_nvidia_hosted(model)
|
|
4965
|
+
url = join_url(provider_upstream_request_base(provider, pcfg), "/chat/completions")
|
|
4966
|
+
waited, rpm_used, rpm_limit = apply_router_rate_limit(provider, pcfg, model)
|
|
4967
|
+
stream = bool(body.get("stream", True))
|
|
4968
|
+
notice = rate_limit_notice(waited, rpm_used, rpm_limit, bool(pcfg.get("rate_limit_status", True)))
|
|
4969
|
+
if stream:
|
|
4970
|
+
req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=True)
|
|
4971
|
+
write_anthropic_open_stream_start(handler, model)
|
|
4972
|
+
index = 0
|
|
4973
|
+
if notice:
|
|
4974
|
+
index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": notice}], index)
|
|
4975
|
+
try:
|
|
4720
4976
|
def emit_retry_notice(text: str) -> None:
|
|
4721
4977
|
nonlocal index
|
|
4722
4978
|
index = write_anthropic_stream_blocks(handler, [{"type": "text", "text": text + "\n"}], index)
|
|
4723
4979
|
|
|
4724
|
-
|
|
4725
|
-
url,
|
|
4726
|
-
req_body,
|
|
4727
|
-
provider_headers(provider, pcfg),
|
|
4728
|
-
provider_request_timeout_seconds(pcfg),
|
|
4729
|
-
provider,
|
|
4980
|
+
resp = open_openai_stream_with_rate_retry(
|
|
4981
|
+
url,
|
|
4982
|
+
req_body,
|
|
4983
|
+
provider_headers(provider, pcfg),
|
|
4984
|
+
provider_request_timeout_seconds(pcfg),
|
|
4985
|
+
provider,
|
|
4730
4986
|
pcfg,
|
|
4731
|
-
model,
|
|
4732
|
-
emit_retry_notice,
|
|
4733
|
-
)
|
|
4734
|
-
|
|
4735
|
-
|
|
4736
|
-
|
|
4737
|
-
|
|
4738
|
-
|
|
4739
|
-
|
|
4740
|
-
|
|
4741
|
-
|
|
4742
|
-
|
|
4743
|
-
|
|
4744
|
-
|
|
4987
|
+
model,
|
|
4988
|
+
emit_retry_notice,
|
|
4989
|
+
)
|
|
4990
|
+
stream_openai_chat_to_anthropic_sse(
|
|
4991
|
+
handler,
|
|
4992
|
+
resp,
|
|
4993
|
+
model,
|
|
4994
|
+
provider,
|
|
4995
|
+
source_body=body,
|
|
4996
|
+
start_index=index,
|
|
4997
|
+
word_chunking=bool(pcfg.get("stream_word_chunking", False)),
|
|
4998
|
+
)
|
|
4999
|
+
write_router_activity("success", provider, model, tokens=estimate_tokens(req_body), bytes=len(json.dumps(req_body, ensure_ascii=False).encode("utf-8")), stream=True)
|
|
5000
|
+
except RuntimeError as exc:
|
|
5001
|
+
msg = str(exc)
|
|
5002
|
+
write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
|
|
5003
|
+
write_anthropic_open_stream_stop(handler)
|
|
5004
|
+
return
|
|
5005
|
+
except Exception as exc:
|
|
5006
|
+
msg = f"{type(exc).__name__}: {exc}"
|
|
5007
|
+
write_router_activity("error", provider, model, error=type(exc).__name__, stream=True)
|
|
5008
|
+
write_anthropic_stream_blocks(handler, [{"type": "text", "text": f"Upstream error: {msg}"}], index)
|
|
5009
|
+
write_anthropic_open_stream_stop(handler)
|
|
5010
|
+
return
|
|
5011
|
+
return
|
|
5012
|
+
req_body = openai_compatible_chat_request(provider, model, body, pcfg, stream=False)
|
|
5013
|
+
try:
|
|
5014
|
+
data = post_json_with_rate_retry(
|
|
4745
5015
|
url,
|
|
4746
5016
|
req_body,
|
|
4747
5017
|
provider_headers(provider, pcfg),
|
package/docs/README.ja.md
CHANGED
|
@@ -47,7 +47,7 @@ vLLM、NVIDIA hosted、self-hosted NIM を選択し、通常の Claude Code 引
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
現在のバージョン: `0.1.35`
|
|
50
|
+
現在のバージョン: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 作られた理由
|
|
53
53
|
|
|
@@ -351,6 +351,14 @@ Windows/Linux 管理、クリーンアップスクリプト、定期的なセキ
|
|
|
351
351
|
|
|
352
352
|
## 変更履歴
|
|
353
353
|
|
|
354
|
+
### 0.1.36
|
|
355
|
+
|
|
356
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router 呼び出しは upstream にも
|
|
357
|
+
`stream=true` を使用します。長い応答を完全な non-streaming completion まで
|
|
358
|
+
待たず、chunk として流せます。
|
|
359
|
+
- **Stream retry diagnostics**: streaming NVIDIA 呼び出しでも statusline 用の
|
|
360
|
+
retry/request size activity 状態を維持します。
|
|
361
|
+
|
|
354
362
|
### 0.1.35
|
|
355
363
|
|
|
356
364
|
- **NVIDIA router context guard**: NVIDIA hosted の router context 既定値を 32K
|
package/docs/README.ko.md
CHANGED
|
@@ -47,7 +47,7 @@ NVIDIA hosted, self-hosted NIM을 선택하고, Claude Code의 일반 인자는
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
현재 버전: `0.1.35`
|
|
50
|
+
현재 버전: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 왜 만들었나
|
|
53
53
|
|
|
@@ -351,6 +351,14 @@ Windows 이벤트 로그 리뷰, 바이러스/랜섬웨어 침입 시도 정리,
|
|
|
351
351
|
|
|
352
352
|
## 변경 이력
|
|
353
353
|
|
|
354
|
+
### 0.1.36
|
|
355
|
+
|
|
356
|
+
- **NVIDIA upstream streaming**: NVIDIA hosted router 호출은 이제 upstream에도
|
|
357
|
+
`stream=true`를 사용합니다. 긴 응답을 전체 완료까지 기다리지 않고 chunk로
|
|
358
|
+
흘려보내 timeout 가능성을 낮춥니다.
|
|
359
|
+
- **Stream retry diagnostics**: streaming NVIDIA 호출도 statusline에서 쓰는
|
|
360
|
+
retry/request size activity 상태를 유지합니다.
|
|
361
|
+
|
|
354
362
|
### 0.1.35
|
|
355
363
|
|
|
356
364
|
- **NVIDIA router context guard**: NVIDIA hosted의 router context 기본값을 32K로
|
package/docs/README.zh.md
CHANGED
|
@@ -47,7 +47,7 @@ NIM,并把普通 Claude Code 参数原样传递。
|
|
|
47
47
|
|
|
48
48
|
Credits: One Ciel LLC
|
|
49
49
|
|
|
50
|
-
当前版本: `0.1.35`
|
|
50
|
+
当前版本: `0.1.36`
|
|
51
51
|
|
|
52
52
|
## 为什么存在
|
|
53
53
|
|
|
@@ -337,6 +337,13 @@ Hermes 格式模型或部分较旧的 Qwen tool template。
|
|
|
337
337
|
|
|
338
338
|
## 更新日志
|
|
339
339
|
|
|
340
|
+
### 0.1.36
|
|
341
|
+
|
|
342
|
+
- **NVIDIA upstream streaming**:NVIDIA hosted router 调用现在也会向 upstream
|
|
343
|
+
使用 `stream=true`,长响应可以按 chunk 流出,不再等待完整的非流式 completion。
|
|
344
|
+
- **Stream retry diagnostics**:streaming NVIDIA 调用也保留 statusline 使用的
|
|
345
|
+
retry/request size activity 状态。
|
|
346
|
+
|
|
340
347
|
### 0.1.35
|
|
341
348
|
|
|
342
349
|
- **NVIDIA router context guard**:NVIDIA hosted 的 router context 默认值改为
|
package/docs/manual.md
CHANGED
package/package.json
CHANGED