@pedrofariasx/qwenproxy 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +13 -0
  2. package/README.md +292 -0
  3. package/bin/qwenproxy.mjs +11 -0
  4. package/package.json +56 -0
  5. package/src/api/models.ts +183 -0
  6. package/src/api/server.ts +126 -0
  7. package/src/cache/memory-cache.ts +186 -0
  8. package/src/core/account-manager.ts +132 -0
  9. package/src/core/accounts.ts +78 -0
  10. package/src/core/config.ts +91 -0
  11. package/src/core/database.ts +92 -0
  12. package/src/core/logger.ts +96 -0
  13. package/src/core/metrics.ts +169 -0
  14. package/src/core/model-registry.ts +30 -0
  15. package/src/core/stream-registry.ts +40 -0
  16. package/src/core/watchdog.ts +130 -0
  17. package/src/index.ts +7 -0
  18. package/src/linter/extraction-engine.ts +165 -0
  19. package/src/linter/index.ts +258 -0
  20. package/src/linter/repair-normalize.ts +245 -0
  21. package/src/linter/safety-gate.ts +219 -0
  22. package/src/linter/streaming-state-machine.ts +252 -0
  23. package/src/linter/structural-parser.ts +352 -0
  24. package/src/linter/types.ts +74 -0
  25. package/src/login.ts +228 -0
  26. package/src/routes/chat.ts +801 -0
  27. package/src/routes/upload.ts +700 -0
  28. package/src/services/playwright.ts +778 -0
  29. package/src/services/qwen.ts +500 -0
  30. package/src/tests/advanced.test.ts +227 -0
  31. package/src/tests/agenticStress.test.ts +360 -0
  32. package/src/tests/concurrency.test.ts +103 -0
  33. package/src/tests/concurrentChat.test.ts +71 -0
  34. package/src/tests/delta.test.ts +63 -0
  35. package/src/tests/index.test.ts +356 -0
  36. package/src/tests/jsonFix.test.ts +98 -0
  37. package/src/tests/linter.test.ts +151 -0
  38. package/src/tests/parallel.test.ts +42 -0
  39. package/src/tests/parser.test.ts +89 -0
  40. package/src/tests/rotation.test.ts +45 -0
  41. package/src/tests/streamingOptimizations.test.ts +328 -0
  42. package/src/tests/structureVerification.test.ts +176 -0
  43. package/src/tools/ast.ts +15 -0
  44. package/src/tools/coercion.ts +67 -0
  45. package/src/tools/confidence.ts +48 -0
  46. package/src/tools/detector.ts +40 -0
  47. package/src/tools/executor.ts +236 -0
  48. package/src/tools/parser.ts +446 -0
  49. package/src/tools/pipeline.ts +122 -0
  50. package/src/tools/registry-runtime.ts +34 -0
  51. package/src/tools/registry.ts +142 -0
  52. package/src/tools/repair.ts +42 -0
  53. package/src/tools/schema.ts +285 -0
  54. package/src/tools/types.ts +104 -0
  55. package/src/tools/validator.ts +33 -0
  56. package/src/utils/context-truncation.ts +61 -0
  57. package/src/utils/json.ts +114 -0
  58. package/src/utils/qwen-stream-parser.ts +286 -0
  59. package/src/utils/types.ts +101 -0
@@ -0,0 +1,801 @@
1
+ /*
2
+ * File: chat.ts
3
+ * Project: qwenproxy
4
+ * Author: Pedro Farias
5
+ * Created: 2026-05-09
6
+ *
7
+ * Last Modified: Sat May 09 2026
8
+ * Modified By: Pedro Farias
9
+ */
10
+
11
+ import { Context } from 'hono';
12
+ import { stream as honoStream } from 'hono/streaming';
13
+ import { v4 as uuidv4 } from 'uuid';
14
+ import { createQwenStream, updateSessionParent } from '../services/qwen.ts';
15
+ import { OpenAIRequest, ChoiceDelta, Message } from '../utils/types.ts';
16
+ import { registry } from '../tools/registry.ts';
17
+ import type { FunctionToolDefinition } from '../tools/types.ts';
18
+ import { robustParseJSON } from '../utils/json.ts';
19
+ import { StreamingToolParser } from '../tools/parser.ts';
20
+ import { QwenStreamParser, ParsedChunkResult } from '../utils/qwen-stream-parser.ts';
21
+ import { RetryableQwenStreamError } from '../services/qwen.ts';
22
+ import { getModelContextWindow } from '../core/model-registry.js'
23
+ import { truncateMessages, estimateTokenCount } from '../utils/context-truncation.ts';
24
+ import { getNextAccount, getNextAvailableAccount, markAccountRateLimited, getAccountCooldownInfo } from '../core/account-manager.ts';
25
+ import { registerStream, removeStream, getStream } from '../core/stream-registry.ts';
26
+ import { metrics } from '../core/metrics.js'
27
+
28
/**
 * Legacy cleanup hook.
 *
 * Intentionally does nothing; the export is kept only so existing callers
 * that still invoke it keep working (backward compatibility).
 */
export function cleanupAllAccountMutexes(): void {
  return;
}
31
+
32
/** Result of comparing two successive content payloads of a stream. */
export interface DeltaResult {
  /** Text that still has to be emitted to the client for this payload. */
  delta: string;
  /** Accumulated content the caller should pass as `oldStr` on the next call. */
  matchedContent: string;
}

/**
 * Computes the not-yet-emitted part of `newStr` given the content already
 * emitted (`oldStr`).
 *
 * Two payload styles are handled:
 *  - cumulative snapshots (`newStr` repeats `oldStr` and appends to it): only
 *    the part after the shared prefix is returned;
 *  - incremental fragments (`newStr` is just the new text): the fragment is
 *    returned as-is and appended to the accumulated content.
 *
 * A payload counts as a snapshot when it shares at least
 * `min(oldStr.length, 4)` leading characters with `oldStr`.
 *
 * @param oldStr Content accumulated so far (may be empty).
 * @param newStr Content carried by the latest upstream payload.
 * @returns The text to emit and the new accumulated content.
 */
export function getIncrementalDelta(oldStr: string, newStr: string): DeltaResult {
  if (!oldStr) {
    return { delta: newStr, matchedContent: newStr };
  }
  if (newStr === oldStr) {
    return { delta: '', matchedContent: oldStr };
  }

  // Fast path: a cumulative snapshot that simply appends to what we have.
  // The appended suffix is always the delta, however short it is and however
  // long the accumulated content already is.
  if (newStr.startsWith(oldStr)) {
    return { delta: newStr.slice(oldStr.length), matchedContent: newStr };
  }

  // Fallback: measure the full common prefix. Compare 64-char segments first
  // so long identical prefixes are skipped quickly...
  const maxLen = Math.min(oldStr.length, newStr.length);
  const segmentLen = 64;
  let commonPrefixLen = 0;
  while (
    commonPrefixLen + segmentLen <= maxLen &&
    oldStr.slice(commonPrefixLen, commonPrefixLen + segmentLen) ===
      newStr.slice(commonPrefixLen, commonPrefixLen + segmentLen)
  ) {
    commonPrefixLen += segmentLen;
  }
  // ...then finish character by character inside the first differing segment.
  while (commonPrefixLen < maxLen && oldStr[commonPrefixLen] === newStr[commonPrefixLen]) {
    commonPrefixLen++;
  }

  // Enough shared prefix: treat newStr as a (rewritten) snapshot and emit only
  // what follows the point where it diverges from the emitted content.
  const threshold = Math.min(oldStr.length, 4);
  if (commonPrefixLen >= threshold) {
    return { delta: newStr.substring(commonPrefixLen), matchedContent: newStr };
  }

  // Otherwise newStr is an independent fragment: emit it whole and append it.
  return { delta: newStr, matchedContent: oldStr + newStr };
}
80
+
81
/**
 * Interprets a raw upstream body that is not an SSE `data: ` line as a
 * potential error payload.
 *
 * @param raw Raw text received from upstream.
 * @returns `null` when the text is empty, is an SSE data line, or is JSON
 *   that carries no error; otherwise a message plus the HTTP status to
 *   report (429 for `RateLimited`, 404 for `Not_Found`, 502 for anything
 *   else, including bodies that are not JSON at all).
 */
function parseQwenErrorPayload(raw: string): { message: string; status: number } | null {
  const text = raw.trim();
  if (text === '' || text.startsWith('data: ')) {
    return null;
  }

  let payload: any;
  try {
    payload = JSON.parse(text);
  } catch {
    // Neither SSE nor JSON: report an explicit bad gateway rather than
    // letting the caller return an empty assistant message.
    return { message: `Qwen upstream returned non-SSE response: ${text.slice(0, 300)}`, status: 502 };
  }

  if (!payload) {
    return null;
  }

  if (payload.success === false) {
    const code = payload.data?.code || payload.code || 'UpstreamError';
    const details = payload.data?.details || payload.message || 'Qwen returned an error';
    let wait = '';
    if (payload.data?.num !== undefined) {
      wait = ` Wait about ${payload.data.num} hour(s) before trying again.`;
    }

    let status = 502;
    if (code === 'RateLimited') {
      status = 429;
    } else if (code === 'Not_Found') {
      status = 404;
    }
    return { message: `Qwen upstream error: ${code}: ${details}.${wait}`, status };
  }

  if (payload.error) {
    const err = payload.error;
    const msg = typeof err === 'string' ? err : (err.message || JSON.stringify(err));
    return { message: `Qwen upstream error: ${msg}`, status: 502 };
  }

  return null;
}
106
+
107
/**
 * OpenAI-compatible chat-completions handler.
 *
 * Flow:
 *  1. Flattens the request messages into one text prompt (system text, then
 *     `User:` / `Assistant:` / `Tool Response (...)` turns) and collects
 *     multimodal parts for upload.
 *  2. When tools are supplied, appends tool definitions and the
 *     `<tool_call>` calling convention to the system prompt.
 *  3. Truncates the conversation when the estimated token count is within
 *     1000 tokens of the model's context window.
 *  4. Picks an account, retrying retryable failures with back-off and falling
 *     back to other accounts on rate limits / failures.
 *  5. Returns either a single `chat.completion` JSON body or an SSE stream of
 *     `chat.completion.chunk` events, depending on `body.stream`.
 *
 * @param c Hono request context.
 * @returns A JSON response, or a streaming SSE response.
 */
export async function chatCompletions(c: Context) {
  try {
    const body: OpenAIRequest = await c.req.json();

    // Without a model name nothing below can work; report it as a client
    // error instead of letting a TypeError surface as a 500.
    if (typeof body?.model !== 'string') {
      return c.json({ error: { message: 'The "model" field is required and must be a string' } }, 400);
    }

    const isStream = body.stream ?? false;

    // ---- 1. Build the prompt -------------------------------------------
    let prompt = '';
    const messages = body.messages || [];
    let systemPrompt = '';
    const pendingMultimodal: Array<Array<{ type: string; text?: string; image_url?: { url: string }; video_url?: { url: string }; audio_url?: { url: string }; file_url?: { url: string } }>> = [];

    for (let i = 0; i < messages.length; i++) {
      const msg = messages[i];
      let contentStr = '';
      if (Array.isArray(msg.content)) {
        // Multimodal content (text + images + videos + audio + files).
        const multimodalParts = msg.content.filter(
          (p: any) =>
            (p.type === "image_url" && p.image_url?.url) ||
            (p.type === "video_url" && p.video_url?.url) ||
            (p.type === "audio_url" && p.audio_url?.url) ||
            (p.type === "file_url" && p.file_url?.url),
        );

        if (multimodalParts.length > 0) {
          // Processing is deferred until after account selection.
          pendingMultimodal.push(multimodalParts);
        }
        // Only the text parts take part in the prompt.
        contentStr = msg.content
          .filter((p: any) => p.type === "text")
          .map((p: any) => p.text)
          .join("\n");
      } else if (typeof msg.content === 'object' && msg.content !== null) {
        contentStr = JSON.stringify(msg.content);
      } else {
        contentStr = msg.content || '';
      }

      if (msg.role === 'system') {
        systemPrompt += (contentStr || '') + '\n\n';
      } else if (msg.role === 'user') {
        prompt += `User: ${contentStr || ''}\n\n`;
      } else if (msg.role === 'assistant') {
        let assistantContent = contentStr || '';
        const reasoning = (msg as any).reasoning_content;
        if (reasoning) {
          assistantContent = `<think>\n${reasoning}\n</think>\n${assistantContent}`;
        }
        if (msg.tool_calls && Array.isArray(msg.tool_calls)) {
          // Replay earlier tool calls in the same tagged format the model is
          // instructed to produce.
          for (const tc of msg.tool_calls) {
            const args = tc.function?.arguments;
            let parsedArgs: any = {};
            if (typeof args === 'string') {
              try { parsedArgs = JSON.parse(args); } catch { parsedArgs = {}; }
            } else if (args && typeof args === 'object') {
              parsedArgs = args;
            }
            const payload = { name: tc.function?.name, arguments: parsedArgs };
            const toolCallStr = `\n<tool_call>\n${JSON.stringify(payload)}\n</tool_call>`;
            assistantContent = assistantContent ? assistantContent + toolCallStr : toolCallStr.trim();
          }
        }
        prompt += `Assistant: ${assistantContent.trim()}\n\n`;
      } else if (msg.role === 'tool' || msg.role === 'function') {
        let toolName = msg.name;
        if (!toolName && msg.tool_call_id) {
          // Recover the tool name from the assistant turn that issued the call.
          for (let j = i - 1; j >= 0; j--) {
            const prevMsg = messages[j];
            if (prevMsg.role === 'assistant' && prevMsg.tool_calls) {
              const call = prevMsg.tool_calls.find(tc => tc.id === msg.tool_call_id);
              if (call) {
                toolName = call.function?.name;
                break;
              }
            }
          }
        }
        prompt += `Tool Response (${toolName || 'tool'}): ${contentStr || ''}\n\n`;
      }
    }

    // ---- 2. Inject tool instructions -----------------------------------
    const bodyAny = body as any;
    const hasTools = Array.isArray(bodyAny.tools) && bodyAny.tools.length > 0;
    if (hasTools) {
      const formattedTools = bodyAny.tools.map((t: any) => {
        if (t.type === 'function') {
          return {
            name: t.function.name,
            description: t.function.description || '',
            parameters: t.function.parameters
          };
        }
        return t;
      });
      const toolsJson = JSON.stringify(formattedTools, null, 2);

      systemPrompt += `\n\n# TOOLS AVAILABLE\nYou have access to the following tools:\n${toolsJson}\n\n# TOOL CALLING FORMAT (MANDATORY)\nTo use a tool, you MUST output a JSON object wrapped EXACTLY in these tags:\n<tool_call>\n{"name": "tool_name", "arguments": {"param_name": "value"}}\n</tool_call>\n\nEXAMPLE OF MULTIPLE TOOL CALLS:\n<tool_call>\n{"name": "read_file", "arguments": {"path": "file1.txt"}}\n</tool_call>\n<tool_call>\n{"name": "read_file", "arguments": {"path": "file2.txt"}}\n</tool_call>\n\nCRITICAL RULES:\n1. ONLY use the tags above for tool calling. NEVER output raw JSON without tags.\n2. You can call multiple tools by outputting multiple <tool_call> blocks consecutively.\n3. Do NOT output any other text (explanations, chat, etc.) after your <tool_call> blocks. Wait for the user to provide the tool response.\n4. The JSON inside the tags MUST be valid and include ALL required braces and the "arguments" field.\n5. If you need to use a tool, do it IMMEDIATELY without preamble.\n\n`;

      if (bodyAny.tool_choice && typeof bodyAny.tool_choice === 'object' && bodyAny.tool_choice.function) {
        const forcedTool = bodyAny.tool_choice.function.name;
        systemPrompt += `CRITICAL: You MUST call the tool "${forcedTool}" in this response.\n\n`;
      }
    }

    // ---- 3. Fit the prompt into the context window ---------------------
    const modelId = body.model.replace('-no-thinking', '');
    const modelContextWindow = getModelContextWindow(modelId);
    const estimatedTokens = estimateTokenCount(systemPrompt + prompt);

    let finalPrompt: string;
    if (estimatedTokens > modelContextWindow - 1000) {
      const truncated = truncateMessages(messages, modelContextWindow, systemPrompt);
      finalPrompt = truncated.map(m => `${m.role === 'user' ? 'User' : m.role === 'assistant' ? 'Assistant' : m.role}: ${m.content}`).join('\n\n');
    } else {
      finalPrompt = systemPrompt ? `${systemPrompt}\n${prompt}` : prompt;
    }

    const isThinkingModel = !body.model.includes('no-thinking');

    // ---- 4. Account selection with fallback ----------------------------
    let account = getNextAccount();
    const triedAccountIds = new Set<string>();
    let lastError: any = null;
    // Counts back-to-back selections of accounts that were already tried.
    let consecutiveSkips = 0;

    let stream: ReadableStream | undefined;
    let uiSessionId = '';
    const completionId = 'chatcmpl-' + uuidv4();

    while (account) {
      const accountId = account.id;
      const accountEmail = account.email;

      if (triedAccountIds.has(accountId)) {
        // If the selector hands back more already-tried accounts in a row
        // than we have tried in total, it is only cycling through exhausted
        // accounts. Stop rather than spin forever in this synchronous loop.
        consecutiveSkips++;
        if (consecutiveSkips > triedAccountIds.size) {
          break;
        }
        account = getNextAvailableAccount(accountId);
        continue;
      }
      consecutiveSkips = 0;
      triedAccountIds.add(accountId);

      const cooldownInfo = getAccountCooldownInfo(accountId);
      if (cooldownInfo && accountId !== 'global') {
        console.log(`[Chat] Skipping account ${accountEmail} (${accountId}) — on cooldown for ${Math.round(cooldownInfo.remainingMs / 1000)}s (${cooldownInfo.reason})`);
        account = getNextAvailableAccount(accountId);
        continue;
      }

      console.log(`[Chat] Routing request to account: ${accountEmail} (${accountId})`);

      let retries = 3;
      let retryDelay = 500;
      let success = false;

      while (retries > 0) {
        try {
          const result = await createQwenStream(
            finalPrompt,
            isThinkingModel,
            body.model,
            null, // Always force new chat for concurrency isolation
            accountId === 'global' ? undefined : accountId,
            undefined,
            pendingMultimodal.length > 0 ? pendingMultimodal : undefined
          );
          stream = result.stream;
          uiSessionId = result.uiSessionId;
          registerStream(completionId, {
            abortController: result.controller,
            accountId: result.accountId,
            uiSessionId: result.uiSessionId,
            targetResponseId: '',
            headers: result.headers,
          });
          success = true;
          break;
        } catch (err: any) {
          retries--;

          // Rate limit: put the account on cooldown and move to the next one.
          if (err.upstreamCode === 'RateLimited' || err.upstreamStatus === 429) {
            const hourHint = err.message?.match(/Wait about (\d+) hour/);
            const cooldownMs = hourHint ? parseInt(hourHint[1], 10) * 60 * 60 * 1000 : undefined;
            markAccountRateLimited(accountId, cooldownMs, 'RateLimited');
            console.warn(`[Chat] Account ${accountEmail} (${accountId}) rate-limited. Marked for cooldown.`);
            lastError = err;
            break;
          }

          // Out of retries: cool the account down on upstream 5xx, then move on.
          if (retries === 0) {
            if (err.upstreamStatus && err.upstreamStatus >= 500) {
              markAccountRateLimited(accountId, undefined, 'ServerError');
              console.warn(`[Chat] Account ${accountEmail} (${accountId}) returned server error. Marked for cooldown.`);
            }
            lastError = err;
            break;
          }

          let useDelay = retryDelay;
          if (err instanceof RetryableQwenStreamError && err.retryAfterMs !== undefined) {
            useDelay = err.retryAfterMs;
          }
          const isRetryable = err instanceof RetryableQwenStreamError || err.message?.includes('in progress') || err.message?.includes('Bad_Request');
          if (!isRetryable) {
            lastError = err;
            break;
          }
          console.warn(`[Chat] Qwen request failed for ${accountEmail}, retrying in ${useDelay}ms... (${retries} left)`);
          await new Promise(r => setTimeout(r, useDelay));
          retryDelay = Math.min(retryDelay * 2, 5000);
        }
      }

      if (success) {
        break;
      }

      account = getNextAvailableAccount(accountId);
    }

    if (!stream) {
      removeStream(completionId);
      throw lastError || new Error('All accounts failed');
    }
    // Bound to a const so the non-undefined type survives inside closures.
    const upstream: ReadableStream = stream;

    // Converts a parsed tool call into the OpenAI wire shape.
    const toOpenAIToolCall = (tc: any) => ({
      id: tc.id,
      type: 'function',
      function: {
        name: tc.name,
        arguments: JSON.stringify(tc.arguments)
      }
    });

    // ---- 5a. Non-streaming response ------------------------------------
    if (!isStream) {
      try {
        const reader = upstream.getReader();
        const decoder = new TextDecoder();

        const toolCallsOut: any[] = [];
        let buffer = '';

        const qwenParser = new QwenStreamParser(uiSessionId, {
          tools: hasTools ? bodyAny.tools : [],
          onThinking: (content: string) => {
            // Accumulate reasoning content (handled via parser state)
          },
          onToolCall: (tc) => {
            toolCallsOut.push(toOpenAIToolCall(tc));
          },
        });

        while (true) {
          const { done, value } = await reader.read();
          if (done) break;

          buffer += decoder.decode(value, { stream: true });
          const lines = buffer.split('\n');
          // The last element is an incomplete line; keep it for the next read.
          buffer = lines.pop() || '';

          for (const line of lines) {
            const trimmed = line.trim();
            if (!trimmed || !trimmed.startsWith('data: ')) continue;

            const dataStr = trimmed.slice(6);
            if (dataStr === '[DONE]') continue;

            qwenParser.parseLine(dataStr);
          }
        }

        // Whatever is left unterminated may be a plain (non-SSE) error body.
        const upstreamError = parseQwenErrorPayload(buffer);
        if (upstreamError) {
          return c.json({ error: { message: upstreamError.message } }, upstreamError.status as any);
        }

        const { text: remainingText, toolCalls: remainingToolCalls } = qwenParser.flush();
        const parserState = qwenParser.state;
        let finalContent = parserState.lastFullContent;
        if (remainingText) {
          finalContent += remainingText;
        }
        for (const tc of remainingToolCalls) {
          toolCallsOut.push(toOpenAIToolCall(tc));
        }

        const usage = {
          prompt_tokens: parserState.promptTokens,
          completion_tokens: parserState.completionTokens,
          total_tokens: parserState.promptTokens + parserState.completionTokens,
          prompt_tokens_details: { cached_tokens: 0 }
        };
        const message: any = { role: 'assistant', content: toolCallsOut.length ? null : finalContent };
        if (parserState.reasoningBuffer) message.reasoning_content = parserState.reasoningBuffer;
        if (toolCallsOut.length) {
          toolCallsOut.forEach((tc, idx) => tc.index = idx);
          message.tool_calls = toolCallsOut;
        }

        return c.json({
          id: completionId,
          object: 'chat.completion',
          created: Math.floor(Date.now() / 1000),
          model: body.model,
          choices: [{
            index: 0,
            message,
            logprobs: null,
            finish_reason: toolCallsOut.length ? 'tool_calls' : 'stop'
          }],
          usage
        });
      } finally {
        // Release the registry entry on every exit, including read/parse
        // failures that propagate to the outer catch.
        removeStream(completionId);
      }
    }

    // ---- 5b. Streaming (SSE) response ----------------------------------
    // Disable Nagle's algorithm to transmit small chunks immediately without buffering delay
    const socket = (c.env as any)?.incoming?.socket || (c.req.raw as any).socket;
    if (socket && typeof socket.setNoDelay === 'function') {
      socket.setNoDelay(true);
    }

    c.header('Content-Type', 'text/event-stream');
    c.header('Cache-Control', 'no-cache, no-transform');
    c.header('Connection', 'keep-alive');
    c.header('X-Accel-Buffering', 'no');

    return honoStream(c, async (streamWriter: any) => {
      let heartbeatInterval: any;
      try {
        // Send heartbeat to prevent Cloudflare 524 timeout
        await streamWriter.write(': heartbeat\n\n');

        // Periodic heartbeat keeps the connection alive during long thinking phases.
        heartbeatInterval = setInterval(async () => {
          try {
            await streamWriter.write(': keep-alive\n\n');
          } catch (e) {
            clearInterval(heartbeatInterval);
          }
        }, 15000); // Every 15 seconds

        // Fire-and-forget write; the result of write() is intentionally not awaited.
        const writeEvent = (data: any) => {
          streamWriter.write(`data: ${JSON.stringify(data)}\n\n`);
        };

        const makeChoice = (delta: any, finishReason: string | null = null) => ({
          index: 0,
          delta,
          logprobs: null,
          finish_reason: finishReason
        });

        // One timestamp for every chunk of this completion.
        const createdTimestamp = Math.floor(Date.now() / 1000);

        // Emits one chat.completion.chunk event with the given choices.
        const writeChunk = (choices: any[], extra: Record<string, unknown> = {}) => {
          writeEvent({
            id: completionId,
            object: 'chat.completion.chunk',
            created: createdTimestamp,
            model: body.model,
            choices,
            ...extra
          });
        };

        // Emits one chunk per tool call. `emittedCount` is the parser's running
        // total that already includes `toolCalls`, so indices continue from
        // the calls sent earlier in this stream.
        const writeToolCalls = (toolCalls: any[], emittedCount: number) => {
          const baseIndex = emittedCount - toolCalls.length;
          toolCalls.forEach((tc, offset) => {
            writeChunk([makeChoice({
              tool_calls: [{ index: baseIndex + offset, ...toOpenAIToolCall(tc) }]
            })]);
          });
        };

        // Send initial chunk
        writeChunk([makeChoice({ role: 'assistant', content: '' })]);

        const reader = upstream.getReader();
        const decoder = new TextDecoder();

        let lastFullContent = '';
        let targetResponseId: string | null = null;
        let targetResponseIdSet = false;
        let currentThoughtIndex = 0;
        const toolParser = hasTools ? new StreamingToolParser(bodyAny.tools) : null;

        let buffer = '';
        let completionTokens = 0;
        // Rough estimate, replaced by upstream usage figures when they arrive.
        let promptTokens = Math.ceil(finalPrompt.length / 3.5);

        let chunkCount = 0;
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;

          buffer += decoder.decode(value, { stream: true });

          let startIdx = 0;
          let newlineIdx: number;
          while ((newlineIdx = buffer.indexOf('\n', startIdx)) !== -1) {
            const line = buffer.slice(startIdx, newlineIdx);
            startIdx = newlineIdx + 1;

            const trimmed = line.trim();
            if (!trimmed || !trimmed.startsWith('data: ')) continue;

            const dataStr = trimmed.slice(6);
            if (dataStr === '[DONE]') {
              // Do not forward the upstream terminator: flushed tool calls,
              // the finish chunk and usage still have to be sent, and the
              // single closing [DONE] is written after them below.
              continue;
            }

            try {
              const chunk = JSON.parse(dataStr);

              // Extract response_id for session tracking and target filtering
              if (chunk['response.created'] && chunk['response.created'].response_id) {
                if (!targetResponseId) {
                  targetResponseId = chunk['response.created'].response_id;
                  targetResponseIdSet = true;
                }
                updateSessionParent(uiSessionId, chunk['response.created'].response_id);
              } else if (chunk.response_id && !targetResponseIdSet) {
                targetResponseId = chunk.response_id;
                targetResponseIdSet = true;
                updateSessionParent(uiSessionId, chunk.response_id);
              }

              if (chunk.usage) {
                if (chunk.usage.output_tokens) completionTokens = chunk.usage.output_tokens;
                if (chunk.usage.input_tokens) promptTokens = chunk.usage.input_tokens;
              }

              let vStr = '';
              let foundStr = false;
              let isThinkingChunk = false;

              // Only consume deltas that belong to the tracked response.
              if (chunk.choices && chunk.choices[0] && chunk.choices[0].delta &&
                  (!targetResponseIdSet || chunk.response_id === targetResponseId)) {
                const delta = chunk.choices[0].delta;

                if (delta.phase === 'thinking_summary') {
                  isThinkingChunk = true;
                  if (delta.extra && delta.extra.summary_thought && delta.extra.summary_thought.content) {
                    // The thought list is cumulative; emit only the new entries.
                    const thoughts = delta.extra.summary_thought.content;
                    if (thoughts.length > currentThoughtIndex) {
                      vStr = thoughts.slice(currentThoughtIndex).join('\n');
                      currentThoughtIndex = thoughts.length;
                      foundStr = true;
                    }
                  }
                } else if (delta.phase === 'answer') {
                  isThinkingChunk = false;
                  if (delta.content !== undefined) {
                    const newContent = delta.content || '';
                    const result = getIncrementalDelta(lastFullContent, newContent);
                    vStr = result.delta;
                    if (vStr) {
                      lastFullContent = result.matchedContent;
                      foundStr = true;
                    }
                  }
                }
              }

              if (foundStr && vStr !== '') {
                if (vStr === 'FINISHED') continue;

                if (isThinkingChunk) {
                  writeChunk([makeChoice({ reasoning_content: vStr })]);
                } else if (toolParser) {
                  const { text, toolCalls } = toolParser.feed(vStr);
                  if (text) {
                    writeChunk([makeChoice({ content: text })]);
                  }
                  if (toolCalls.length > 0) {
                    writeToolCalls(toolCalls, toolParser.getEmittedToolCallCount());
                  }
                } else {
                  writeChunk([makeChoice({ content: vStr })]);
                }
              }
            } catch (e) {
              // parse error, ignore partial chunk
            }
          }

          // Trim processed portion from buffer
          if (startIdx > 0) {
            buffer = buffer.slice(startIdx);
          }

          // Periodic yielding to prevent event loop starvation
          chunkCount++;
          if (chunkCount % 100 === 0) {
            await new Promise(r => setImmediate(r));
          }
        }

        // Whatever is left unterminated may be a plain (non-SSE) error body;
        // surface it to the client as assistant text and close the stream.
        const upstreamError = parseQwenErrorPayload(buffer);
        if (upstreamError) {
          writeChunk([makeChoice({ content: upstreamError.message })]);
          writeChunk([makeChoice({}, 'stop')]);
          streamWriter.write('data: [DONE]\n\n');
          return;
        }

        if (toolParser) {
          const flushResult = toolParser.flush();

          if (flushResult.text) {
            writeChunk([makeChoice({ content: flushResult.text })]);
          }
          if (flushResult.toolCalls.length > 0) {
            writeToolCalls(flushResult.toolCalls, toolParser.getEmittedToolCallCount());
          }
        }

        const usage = {
          prompt_tokens: promptTokens,
          completion_tokens: completionTokens,
          total_tokens: promptTokens + completionTokens,
          prompt_tokens_details: { cached_tokens: 0 }
        };

        const finalFinishReason = toolParser && toolParser.getEmittedToolCallCount() > 0 ? 'tool_calls' : 'stop';

        // With include_usage the usage goes in its own trailing chunk with
        // empty choices; otherwise it rides on the finish chunk.
        const includeUsage = body.stream_options?.include_usage;
        writeChunk([makeChoice({}, finalFinishReason)], includeUsage ? {} : { usage });
        if (includeUsage) {
          writeChunk([], { usage });
        }
        streamWriter.write('data: [DONE]\n\n');

      } finally {
        clearInterval(heartbeatInterval);
        removeStream(completionId);
      }
    });
  } catch (err: any) {
    console.error('Error in chatCompletions:', err)
    const status = err.upstreamStatus || 500
    if (status >= 500) {
      metrics.increment('requests.errors')
    }
    return c.json({ error: { message: err.message } }, status)
  }
}
746
+
747
/**
 * Stops an in-flight generation.
 *
 * Expects a JSON body with `chat_id` (the key the stream was registered
 * under) and `response_id`. Asks the upstream Qwen API to stop the
 * generation, then aborts the local upstream request and drops the stream
 * from the registry.
 *
 * Responses:
 *  - 400 when `chat_id` / `response_id` is missing, or `response_id` does not
 *    match the one recorded for the stream;
 *  - 404 when no stream is registered under `chat_id`;
 *  - the upstream status when the upstream stop call fails;
 *  - 500 on unexpected errors;
 *  - `{ success: true }` otherwise.
 *
 * @param c Hono request context.
 */
export async function chatCompletionsStop(c: Context) {
  try {
    const body = await c.req.json();
    // `?? {}` so a literal `null` JSON body yields the 400 below, not a throw.
    const { chat_id, response_id } = body ?? {};

    if (!chat_id || !response_id) {
      return c.json({ error: 'chat_id and response_id are required' }, 400);
    }

    const stream = getStream(chat_id);
    if (!stream) {
      return c.json({ error: 'Stream not found' }, 404);
    }

    if (stream.targetResponseId && stream.targetResponseId !== response_id) {
      return c.json({ error: 'response_id mismatch' }, 400);
    }

    // The registry entry records the upstream session id (`uiSessionId`);
    // prefer it over the lookup key when talking to Qwen, falling back to the
    // caller-supplied id when it is not set.
    // NOTE(review): assumes `uiSessionId` is the Qwen chat id — confirm
    // against services/qwen.ts.
    const upstreamChatId = stream.uiSessionId || chat_id;
    // Encode before placing the id into a URL.
    const encodedChatId = encodeURIComponent(String(upstreamChatId));

    const stopResponse = await fetch(`https://chat.qwen.ai/api/v2/chat/completions/stop?chat_id=${encodedChatId}`, {
      method: 'POST',
      headers: {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'pt-BR,pt;q=0.9',
        'Content-Type': 'application/json',
        'Cookie': stream.headers.cookie,
        'Origin': 'https://chat.qwen.ai',
        'Referer': `https://chat.qwen.ai/c/${encodedChatId}`,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': stream.headers['user-agent'],
        'X-Request-Id': uuidv4(),
        'bx-ua': stream.headers['bx-ua'],
        'bx-umidtoken': stream.headers['bx-umidtoken'],
        'bx-v': stream.headers['bx-v'],
      },
      body: JSON.stringify({ chat_id: upstreamChatId, response_id }),
    });

    if (!stopResponse.ok) {
      const errorText = await stopResponse.text();
      console.error(`[Stop] Failed to stop generation for chat_id=${chat_id}: ${stopResponse.status} ${errorText}`);
      return c.json({ error: 'Failed to stop generation' }, stopResponse.status as any);
    }

    // Upstream acknowledged the stop: cancel the local request and forget it.
    stream.abortController.abort();
    removeStream(chat_id);

    console.log(`[Stop] Generation stopped for chat_id=${chat_id}`);
    return c.json({ success: true });
  } catch (err: any) {
    console.error('Error in chatCompletionsStop:', err);
    return c.json({ error: err.message }, 500);
  }
}