webpeel 0.21.35 → 0.21.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/llm-provider.js +45 -21
- package/package.json +1 -1
|
@@ -391,13 +391,52 @@ async function callOllama(config, options) {
|
|
|
391
391
|
const endpoint = (config.endpoint || process.env.OLLAMA_URL || 'http://localhost:11434').replace(/\/$/, '');
|
|
392
392
|
const model = config.model || process.env.OLLAMA_MODEL || defaultModel('ollama');
|
|
393
393
|
const { messages, stream, onChunk, signal, maxTokens = 4096, temperature = 0.2 } = options;
|
|
394
|
-
const url = `${endpoint}/v1/chat/completions`;
|
|
395
394
|
// Support bearer token auth (for nginx reverse proxy on Hetzner)
|
|
396
395
|
const headers = { 'Content-Type': 'application/json' };
|
|
397
396
|
const secret = config.apiKey || process.env.OLLAMA_SECRET;
|
|
398
397
|
if (secret)
|
|
399
398
|
headers['Authorization'] = `Bearer ${secret}`;
|
|
400
|
-
|
|
399
|
+
// ── Non-streaming: use /api/generate with think:false for speed ──────
|
|
400
|
+
// Qwen3 thinking mode wastes 300-400 tokens on CoT and takes 25s+.
|
|
401
|
+
// With think:false via /api/generate, response comes in ~8s.
|
|
402
|
+
if (!stream) {
|
|
403
|
+
// Build a single prompt from messages (system + user)
|
|
404
|
+
const systemMsg = messages.find((m) => m.role === 'system')?.content || '';
|
|
405
|
+
const userMsg = messages.filter((m) => m.role === 'user').map((m) => m.content).join('\n\n');
|
|
406
|
+
const prompt = systemMsg ? `${systemMsg}\n\n${userMsg}` : userMsg;
|
|
407
|
+
const resp = await fetch(`${endpoint}/api/generate`, {
|
|
408
|
+
method: 'POST',
|
|
409
|
+
headers,
|
|
410
|
+
body: JSON.stringify({
|
|
411
|
+
model,
|
|
412
|
+
prompt,
|
|
413
|
+
stream: false,
|
|
414
|
+
think: false, // Critical: disables Qwen3 CoT thinking (8s vs 25s+)
|
|
415
|
+
options: {
|
|
416
|
+
temperature,
|
|
417
|
+
num_predict: maxTokens,
|
|
418
|
+
},
|
|
419
|
+
}),
|
|
420
|
+
signal,
|
|
421
|
+
});
|
|
422
|
+
if (!resp.ok) {
|
|
423
|
+
const text = await resp.text().catch(() => '');
|
|
424
|
+
throw new Error(`Ollama API error: HTTP ${resp.status}${text ? ` - ${text}` : ''}`);
|
|
425
|
+
}
|
|
426
|
+
const json = await resp.json();
|
|
427
|
+
let text = String(json?.response || '').trim();
|
|
428
|
+
// Strip any residual <think> tags
|
|
429
|
+
text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
|
430
|
+
return {
|
|
431
|
+
text,
|
|
432
|
+
usage: {
|
|
433
|
+
input: Number(json?.prompt_eval_count || 0),
|
|
434
|
+
output: Number(json?.eval_count || 0),
|
|
435
|
+
},
|
|
436
|
+
};
|
|
437
|
+
}
|
|
438
|
+
// ── Streaming: use OpenAI-compatible /v1/chat/completions ────────────
|
|
439
|
+
const resp = await fetch(`${endpoint}/v1/chat/completions`, {
|
|
401
440
|
method: 'POST',
|
|
402
441
|
headers,
|
|
403
442
|
body: JSON.stringify({
|
|
@@ -405,7 +444,7 @@ async function callOllama(config, options) {
|
|
|
405
444
|
messages,
|
|
406
445
|
temperature,
|
|
407
446
|
max_tokens: maxTokens,
|
|
408
|
-
stream:
|
|
447
|
+
stream: true,
|
|
409
448
|
}),
|
|
410
449
|
signal,
|
|
411
450
|
});
|
|
@@ -413,23 +452,6 @@ async function callOllama(config, options) {
|
|
|
413
452
|
const text = await resp.text().catch(() => '');
|
|
414
453
|
throw new Error(`Ollama API error: HTTP ${resp.status}${text ? ` - ${text}` : ''}`);
|
|
415
454
|
}
|
|
416
|
-
if (!stream) {
|
|
417
|
-
const json = await resp.json();
|
|
418
|
-
const msg = json?.choices?.[0]?.message;
|
|
419
|
-
// Ollama Qwen3 thinking: content may be empty, CoT goes to `reasoning` field
|
|
420
|
-
let text = String(msg?.content || '').trim();
|
|
421
|
-
if (!text && msg?.reasoning)
|
|
422
|
-
text = String(msg.reasoning).trim();
|
|
423
|
-
// Strip <think> tags from Qwen3 models
|
|
424
|
-
text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
|
425
|
-
return {
|
|
426
|
-
text,
|
|
427
|
-
usage: {
|
|
428
|
-
input: Number(json?.usage?.prompt_tokens || 0),
|
|
429
|
-
output: Number(json?.usage?.completion_tokens || 0),
|
|
430
|
-
},
|
|
431
|
-
};
|
|
432
|
-
}
|
|
433
455
|
if (!resp.body)
|
|
434
456
|
throw new Error('Ollama stream: missing body');
|
|
435
457
|
let out = '';
|
|
@@ -449,7 +471,9 @@ async function callOllama(config, options) {
|
|
|
449
471
|
onChunk?.(delta);
|
|
450
472
|
}
|
|
451
473
|
}, signal);
|
|
452
|
-
|
|
474
|
+
// Strip thinking from streamed output
|
|
475
|
+
out = out.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
|
476
|
+
return { text: out, usage: { input: 0, output: 0 } };
|
|
453
477
|
}
|
|
454
478
|
// ---------------------------------------------------------------------------
|
|
455
479
|
// Cerebras (OpenAI-compatible)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.35",
|
|
3
|
+
"version": "0.21.36",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|