webpeel 0.21.35 → 0.21.36

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -391,13 +391,52 @@ async function callOllama(config, options) {
391
391
  const endpoint = (config.endpoint || process.env.OLLAMA_URL || 'http://localhost:11434').replace(/\/$/, '');
392
392
  const model = config.model || process.env.OLLAMA_MODEL || defaultModel('ollama');
393
393
  const { messages, stream, onChunk, signal, maxTokens = 4096, temperature = 0.2 } = options;
394
- const url = `${endpoint}/v1/chat/completions`;
395
394
  // Support bearer token auth (for nginx reverse proxy on Hetzner)
396
395
  const headers = { 'Content-Type': 'application/json' };
397
396
  const secret = config.apiKey || process.env.OLLAMA_SECRET;
398
397
  if (secret)
399
398
  headers['Authorization'] = `Bearer ${secret}`;
400
- const resp = await fetch(url, {
399
+ // ── Non-streaming: use /api/generate with think:false for speed ──────
400
+ // Qwen3 thinking mode wastes 300-400 tokens on CoT and takes 25s+.
401
+ // With think:false via /api/generate, response comes in ~8s.
402
+ if (!stream) {
403
+ // Build a single prompt from messages (system + user)
404
+ const systemMsg = messages.find((m) => m.role === 'system')?.content || '';
405
+ const userMsg = messages.filter((m) => m.role === 'user').map((m) => m.content).join('\n\n');
406
+ const prompt = systemMsg ? `${systemMsg}\n\n${userMsg}` : userMsg;
407
+ const resp = await fetch(`${endpoint}/api/generate`, {
408
+ method: 'POST',
409
+ headers,
410
+ body: JSON.stringify({
411
+ model,
412
+ prompt,
413
+ stream: false,
414
+ think: false, // Critical: disables Qwen3 CoT thinking (8s vs 25s+)
415
+ options: {
416
+ temperature,
417
+ num_predict: maxTokens,
418
+ },
419
+ }),
420
+ signal,
421
+ });
422
+ if (!resp.ok) {
423
+ const text = await resp.text().catch(() => '');
424
+ throw new Error(`Ollama API error: HTTP ${resp.status}${text ? ` - ${text}` : ''}`);
425
+ }
426
+ const json = await resp.json();
427
+ let text = String(json?.response || '').trim();
428
+ // Strip any residual <think> tags
429
+ text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
430
+ return {
431
+ text,
432
+ usage: {
433
+ input: Number(json?.prompt_eval_count || 0),
434
+ output: Number(json?.eval_count || 0),
435
+ },
436
+ };
437
+ }
438
+ // ── Streaming: use OpenAI-compatible /v1/chat/completions ────────────
439
+ const resp = await fetch(`${endpoint}/v1/chat/completions`, {
401
440
  method: 'POST',
402
441
  headers,
403
442
  body: JSON.stringify({
@@ -405,7 +444,7 @@ async function callOllama(config, options) {
405
444
  messages,
406
445
  temperature,
407
446
  max_tokens: maxTokens,
408
- stream: stream ?? false,
447
+ stream: true,
409
448
  }),
410
449
  signal,
411
450
  });
@@ -413,23 +452,6 @@ async function callOllama(config, options) {
413
452
  const text = await resp.text().catch(() => '');
414
453
  throw new Error(`Ollama API error: HTTP ${resp.status}${text ? ` - ${text}` : ''}`);
415
454
  }
416
- if (!stream) {
417
- const json = await resp.json();
418
- const msg = json?.choices?.[0]?.message;
419
- // Ollama Qwen3 thinking: content may be empty, CoT goes to `reasoning` field
420
- let text = String(msg?.content || '').trim();
421
- if (!text && msg?.reasoning)
422
- text = String(msg.reasoning).trim();
423
- // Strip <think> tags from Qwen3 models
424
- text = text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
425
- return {
426
- text,
427
- usage: {
428
- input: Number(json?.usage?.prompt_tokens || 0),
429
- output: Number(json?.usage?.completion_tokens || 0),
430
- },
431
- };
432
- }
433
455
  if (!resp.body)
434
456
  throw new Error('Ollama stream: missing body');
435
457
  let out = '';
@@ -449,7 +471,9 @@ async function callOllama(config, options) {
449
471
  onChunk?.(delta);
450
472
  }
451
473
  }, signal);
452
- return { text: out.trim(), usage: { input: 0, output: 0 } };
474
+ // Strip thinking from streamed output
475
+ out = out.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
476
+ return { text: out, usage: { input: 0, output: 0 } };
453
477
  }
454
478
  // ---------------------------------------------------------------------------
455
479
  // Cerebras (OpenAI-compatible)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.35",
3
+ "version": "0.21.36",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",