anveesa 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +1 -1
- package/Cargo.toml +1 -1
- package/package.json +1 -1
- package/src/config.rs +10 -0
- package/src/provider/openai_compatible.rs +72 -0
package/Cargo.lock
CHANGED
package/Cargo.toml
CHANGED
package/package.json
CHANGED
package/src/config.rs
CHANGED
|
@@ -22,6 +22,9 @@ kind = "openai-compatible"
|
|
|
22
22
|
base_url = "https://openrouter.ai/api/v1"
|
|
23
23
|
api_key_env = "OPENROUTER_API_KEY"
|
|
24
24
|
# default_model = "openai/gpt-4.1-mini"
|
|
25
|
+
# Raise the per-response output cap to reduce truncation on long answers.
|
|
26
|
+
# Anveesa continues truncated answers automatically either way.
|
|
27
|
+
# max_tokens = 8192
|
|
25
28
|
|
|
26
29
|
[providers.sumopod]
|
|
27
30
|
kind = "openai-compatible"
|
|
@@ -436,6 +439,12 @@ pub struct OpenAiCompatibleProviderConfig {
|
|
|
436
439
|
/// For Anthropic models this also sends the `anthropic-beta: prompt-caching-2024-07-31` header.
|
|
437
440
|
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
438
441
|
pub prompt_cache: Option<bool>,
|
|
442
|
+
|
|
443
|
+
/// Upper bound on tokens the model may generate per response. When unset the
|
|
444
|
+
/// provider default applies. Raising this reduces how often long answers are
|
|
445
|
+
/// truncated by the output limit (Anveesa continues truncated answers either way).
|
|
446
|
+
#[serde(default, skip_serializing_if = "Option::is_none")]
|
|
447
|
+
pub max_tokens: Option<u32>,
|
|
439
448
|
}
|
|
440
449
|
|
|
441
450
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
@@ -472,6 +481,7 @@ fn insert_openai_provider(
|
|
|
472
481
|
default_model: None,
|
|
473
482
|
headers: BTreeMap::new(),
|
|
474
483
|
prompt_cache: None,
|
|
484
|
+
max_tokens: None,
|
|
475
485
|
}),
|
|
476
486
|
);
|
|
477
487
|
}
|
|
package/src/provider/openai_compatible.rs
CHANGED
|
@@ -22,6 +22,9 @@ const CONNECT_TIMEOUT: Duration = Duration::from_secs(15);
|
|
|
22
22
|
/// How many times the model may call the exact same (tool, arguments) pair before we refuse.
|
|
23
23
|
const MAX_IDENTICAL_CALLS: usize = 3;
|
|
24
24
|
const MAX_TOOL_INTENT_REPROMPTS: usize = 2;
|
|
25
|
+
/// How many times we ask the model to continue after its output was cut off by the
|
|
26
|
+
/// provider's token limit (`finish_reason == "length"`) before giving up.
|
|
27
|
+
const MAX_LENGTH_CONTINUATIONS: usize = 8;
|
|
25
28
|
|
|
26
29
|
pub async fn ask(
|
|
27
30
|
provider_name: &str,
|
|
@@ -60,6 +63,7 @@ pub async fn ask(
|
|
|
60
63
|
let mut full_text = String::new();
|
|
61
64
|
let mut last_usage: Option<Usage> = None;
|
|
62
65
|
let mut tool_intent_reprompts = 0usize;
|
|
66
|
+
let mut length_continuations = 0usize;
|
|
63
67
|
|
|
64
68
|
loop {
|
|
65
69
|
let _ = events.send(StreamEvent::Status {
|
|
@@ -78,6 +82,9 @@ pub async fn ask(
|
|
|
78
82
|
if usage_requested {
|
|
79
83
|
body["stream_options"] = json!({ "include_usage": true });
|
|
80
84
|
}
|
|
85
|
+
if let Some(max_tokens) = config.max_tokens {
|
|
86
|
+
body["max_tokens"] = json!(max_tokens);
|
|
87
|
+
}
|
|
81
88
|
if tools_enabled {
|
|
82
89
|
body["tools"] = json!(tools::definitions(policy.allows_write_tools()));
|
|
83
90
|
body["tool_choice"] = json!("auto");
|
|
@@ -108,6 +115,31 @@ pub async fn ask(
|
|
|
108
115
|
last_usage = Some(usage);
|
|
109
116
|
}
|
|
110
117
|
|
|
118
|
+
// The provider cut the response off at its output-token limit. Treating the
|
|
119
|
+
// partial text (or partial tool call) as final is what makes Anveesa appear to
|
|
120
|
+
// "stop suddenly" mid-task — instead, keep what we have and ask it to continue.
|
|
121
|
+
if state.finish_reason.as_deref() == Some("length")
|
|
122
|
+
&& length_continuations < MAX_LENGTH_CONTINUATIONS
|
|
123
|
+
{
|
|
124
|
+
length_continuations += 1;
|
|
125
|
+
full_text.push_str(&state.content);
|
|
126
|
+
let _ = events.send(StreamEvent::Status {
|
|
127
|
+
message: "Response hit the output token limit; asking the model to continue"
|
|
128
|
+
.to_string(),
|
|
129
|
+
});
|
|
130
|
+
// Drop any partial tool call: a length-truncated call has incomplete
|
|
131
|
+
// arguments and can't be dispatched. The continuation nudge tells the
|
|
132
|
+
// model to re-issue it.
|
|
133
|
+
if !state.content.is_empty() {
|
|
134
|
+
messages.push(json!({
|
|
135
|
+
"role": "assistant",
|
|
136
|
+
"content": state.content,
|
|
137
|
+
}));
|
|
138
|
+
}
|
|
139
|
+
messages.push(length_continuation_message());
|
|
140
|
+
continue;
|
|
141
|
+
}
|
|
142
|
+
|
|
111
143
|
if state.tool_calls.is_empty() {
|
|
112
144
|
if tools_enabled
|
|
113
145
|
&& tool_intent_reprompts < MAX_TOOL_INTENT_REPROMPTS
|
|
@@ -457,6 +489,13 @@ fn tool_limit_message(max_tool_rounds: usize) -> Value {
|
|
|
457
489
|
})
|
|
458
490
|
}
|
|
459
491
|
|
|
492
|
+
fn length_continuation_message() -> Value {
|
|
493
|
+
json!({
|
|
494
|
+
"role": "system",
|
|
495
|
+
"content": "Your previous response was cut off because it reached the output token limit. Continue from exactly where you left off. Do not repeat text you already produced and do not restart the answer. If you were in the middle of a tool call, re-issue that complete tool call now."
|
|
496
|
+
})
|
|
497
|
+
}
|
|
498
|
+
|
|
460
499
|
fn tool_intent_reprompt_message() -> Value {
|
|
461
500
|
json!({
|
|
462
501
|
"role": "system",
|
|
@@ -819,6 +858,7 @@ struct StreamState {
|
|
|
819
858
|
content: String,
|
|
820
859
|
tool_calls: Vec<PartialToolCall>,
|
|
821
860
|
usage: Option<Usage>,
|
|
861
|
+
finish_reason: Option<String>,
|
|
822
862
|
done: bool,
|
|
823
863
|
}
|
|
824
864
|
|
|
@@ -861,6 +901,13 @@ impl StreamState {
|
|
|
861
901
|
let Some(first_choice) = choices.get(0) else {
|
|
862
902
|
return None;
|
|
863
903
|
};
|
|
904
|
+
|
|
905
|
+
// `finish_reason` is a sibling of `delta` and only carries a string on the
|
|
906
|
+
// final chunk for the choice (it's null on every intermediate chunk).
|
|
907
|
+
if let Some(reason) = first_choice.get("finish_reason").and_then(Value::as_str) {
|
|
908
|
+
self.finish_reason = Some(reason.to_string());
|
|
909
|
+
}
|
|
910
|
+
|
|
864
911
|
let Some(delta) = first_choice.get("delta") else {
|
|
865
912
|
return None;
|
|
866
913
|
};
|
|
@@ -1016,6 +1063,31 @@ mod tests {
|
|
|
1016
1063
|
assert_eq!(state.tool_calls[0].arguments, "{\"path\":\"x\"}");
|
|
1017
1064
|
}
|
|
1018
1065
|
|
|
1066
|
+
#[test]
|
|
1067
|
+
fn captures_finish_reason_from_final_chunk() {
|
|
1068
|
+
let mut state = StreamState::default();
|
|
1069
|
+
// Intermediate chunk: finish_reason is null and must not be recorded.
|
|
1070
|
+
state.apply_chunk(&json!({
|
|
1071
|
+
"choices": [{ "delta": { "content": "partial" }, "finish_reason": null }]
|
|
1072
|
+
}));
|
|
1073
|
+
assert_eq!(state.finish_reason, None);
|
|
1074
|
+
// Final chunk reports truncation.
|
|
1075
|
+
state.apply_chunk(&json!({
|
|
1076
|
+
"choices": [{ "delta": {}, "finish_reason": "length" }]
|
|
1077
|
+
}));
|
|
1078
|
+
assert_eq!(state.finish_reason.as_deref(), Some("length"));
|
|
1079
|
+
assert_eq!(state.content, "partial");
|
|
1080
|
+
}
|
|
1081
|
+
|
|
1082
|
+
#[test]
|
|
1083
|
+
fn length_continuation_message_asks_to_resume_without_repeating() {
|
|
1084
|
+
let message = length_continuation_message();
|
|
1085
|
+
assert_eq!(message["role"], json!("system"));
|
|
1086
|
+
let content = message["content"].as_str().unwrap();
|
|
1087
|
+
assert!(content.contains("cut off"));
|
|
1088
|
+
assert!(content.contains("Do not repeat"));
|
|
1089
|
+
}
|
|
1090
|
+
|
|
1019
1091
|
#[test]
|
|
1020
1092
|
fn parses_usage_chunk() {
|
|
1021
1093
|
let mut state = StreamState::default();
|