npm - webpeel - Versions diffs - 0.20.0 → 0.20.2 - Mend

webpeel 0.20.0 → 0.20.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/core/llm-extract.d.ts +17 -1
package/dist/core/llm-extract.js +255 -11
package/dist/mcp/handlers/extract.d.ts +1 -0
package/dist/mcp/handlers/extract.js +31 -0
package/package.json +1 -1

package/dist/core/llm-extract.d.ts CHANGED Viewed

@@ -2,8 +2,14 @@
  * LLM-based extraction: sends markdown/text content to an LLM
  * with instructions to extract structured data.
  *
- * Supports OpenAI-compatible APIs (OpenAI, Anthropic via proxy, local models).
+ * Supports:
+ *   - OpenAI-compatible APIs (OpenAI, custom models via baseUrl)
+ *   - Anthropic (Claude Haiku, Sonnet, Opus)
+ *   - Google (Gemini Flash, Pro)
  */
+/** Identifier of a supported LLM backend; selects which provider API the extraction call targets. */
+export type LLMProvider = 'openai' | 'anthropic' | 'google';
+/** Default models per provider (cheapest/fastest) */
+export declare const DEFAULT_PROVIDER_MODELS: Record<LLMProvider, string>;
 export interface LLMExtractionOptions {
     content: string;
     instruction?: string;
@@ -12,6 +18,11 @@ export interface LLMExtractionOptions {
     baseUrl?: string;
     model?: string;
     maxTokens?: number;
+    url?: string;
+    prompt?: string;
+    llmProvider?: LLMProvider;
+    llmApiKey?: string;
+    llmModel?: string;
 }
 export interface LLMExtractionResult {
     items: Array<Record<string, any>>;
@@ -21,6 +32,7 @@ export interface LLMExtractionResult {
     };
     model: string;
     cost?: number;
+    provider?: LLMProvider;
 }
 /**
  * Detect if schema is a "full" JSON Schema (has type:"object" and properties).
@@ -51,5 +63,9 @@ export declare function estimateCost(model: string, inputTokens: number, outputT
 export declare function parseItems(text: string, _schema?: object): Array<Record<string, any>>;
 /**
  * Extract structured data from content using an LLM.
+ *
+ * Supports OpenAI (default), Anthropic, and Google providers.
+ * Pass `llmProvider` + `llmApiKey` to select a provider.
+ * Falls back to OpenAI-compatible path when no provider is specified.
  */
 export declare function extractWithLLM(options: LLMExtractionOptions): Promise<LLMExtractionResult>;

package/dist/core/llm-extract.js CHANGED Viewed

@@ -2,8 +2,17 @@
  * LLM-based extraction: sends markdown/text content to an LLM
  * with instructions to extract structured data.
  *
- * Supports OpenAI-compatible APIs (OpenAI, Anthropic via proxy, local models).
+ * Supports:
+ *   - OpenAI-compatible APIs (OpenAI, custom models via baseUrl)
+ *   - Anthropic (Claude Haiku, Sonnet, Opus)
+ *   - Google (Gemini Flash, Pro)
  */
+/**
+ * Default models per provider (cheapest/fastest).
+ * Each entry is the model id used as the fallback when the caller does not
+ * supply a model for that provider.
+ */
+export const DEFAULT_PROVIDER_MODELS = {
+    openai: 'gpt-4o-mini',
+    anthropic: 'claude-haiku-4-5',
+    google: 'gemini-2.0-flash',
+};
 // Cost per 1M tokens (input, output) for known models
 const MODEL_COSTS = {
     'gpt-4o-mini': [0.15, 0.60],
@@ -197,16 +206,250 @@ function buildResponseFormat(schema) {
     // For simple example schemas, fall back to json_object
     return { type: 'json_object' };
 }
+// ─── Multi-provider helpers ────────────────────────────────────────────────
+/**
+ * Unwrap a Markdown code fence from LLM output.
+ *
+ * The first fence that opens at the start of a line (``` or ```json) and
+ * closes at the end of a line is replaced by its inner text; anything outside
+ * the fence is kept. The result is trimmed. If unwrapping leaves nothing, the
+ * trimmed original text is returned instead.
+ */
+function stripMarkdownCodeBlocks(text) {
+    const fence = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/m;
+    const hit = fence.exec(text);
+    if (hit === null) {
+        return text.trim();
+    }
+    // Splice the captured body in place of the whole fence (what replace(..., '$1') does).
+    const before = text.slice(0, hit.index);
+    const after = text.slice(hit.index + hit[0].length);
+    const unwrapped = (before + hit[1] + after).trim();
+    return unwrapped === '' ? text.trim() : unwrapped;
+}
+/**
+ * Repair common non-standard JSON emitted by LLMs: `//` line comments,
+ * block comments, and trailing commas before `}` or `]`.
+ *
+ * Double-quoted string literals are matched first and copied through
+ * untouched, so values such as "https://example.com" or "a,}" are not
+ * mistaken for comments or trailing commas.
+ *
+ * @param text Raw JSON-like text.
+ * @returns The repaired text, trimmed. It is not guaranteed to be valid JSON.
+ */
+function fixJsonString(text) {
+    // Pass 1: remove comments. The leading string alternative shields quoted content.
+    const withoutComments = text.replace(/"(?:[^"\\]|\\[\s\S])*"|\/\/[^\n]*|\/\*[\s\S]*?\*\//g, (match) => (match[0] === '"' ? match : ''));
+    // Pass 2: drop a comma followed only by whitespace and a closing bracket.
+    // Runs after comment removal so `1, // note\n}` is also repaired.
+    const withoutTrailingCommas = withoutComments.replace(/"(?:[^"\\]|\\[\s\S])*"|,(\s*[}\]])/g, (match, closer) => (closer === undefined ? match : closer));
+    return withoutTrailingCommas.trim();
+}
+/**
+ * Parse raw LLM output into a JSON value (object, array, or primitive).
+ *
+ * Candidates are tried in this order, each first verbatim and then after
+ * `fixJsonString`:
+ *   1. the whole text with any Markdown code fence removed,
+ *   2. the span from the first `{` to the last `}`,
+ *   3. the span from the first `[` to the last `]`.
+ * The first candidate that `JSON.parse` accepts wins.
+ *
+ * @throws Error with the unmodified input attached as `rawOutput` when no
+ *   candidate parses; the message carries the first 200 characters of it.
+ */
+function parseJsonSafe(text) {
+    const cleaned = stripMarkdownCodeBlocks(text);
+    // Evaluated lazily so the bracket scans only run when the direct parse fails.
+    const pickers = [
+        () => cleaned,
+        () => cleaned.match(/\{[\s\S]*\}/)?.[0],
+        () => cleaned.match(/\[[\s\S]*\]/)?.[0],
+    ];
+    const variants = [(raw) => raw, (raw) => fixJsonString(raw)];
+    for (const pick of pickers) {
+        const candidate = pick();
+        if (candidate === undefined) {
+            continue;
+        }
+        for (const prepare of variants) {
+            try {
+                return JSON.parse(prepare(candidate));
+            }
+            catch { /* try the next variant */ }
+        }
+    }
+    const failure = new Error(`Failed to parse LLM response as JSON: ${text.slice(0, 200)}`);
+    failure.rawOutput = text;
+    throw failure;
+}
+/**
+ * Coerce a parsed JSON value into an array of items.
+ *
+ * - An array is returned as-is.
+ * - An object yields the first of its `items`, `data`, `results` properties
+ *   that is an array; otherwise the object itself wrapped in an array.
+ * - Anything else (null, string, number, boolean, undefined) yields `[]`.
+ */
+function normalizeToItems(parsed) {
+    if (Array.isArray(parsed)) {
+        return parsed;
+    }
+    if (!parsed || typeof parsed !== 'object') {
+        return [];
+    }
+    // Well-known wrapper keys, checked in priority order.
+    for (const key of ['items', 'data', 'results']) {
+        const wrapped = parsed[key];
+        if (Array.isArray(wrapped)) {
+            return wrapped;
+        }
+    }
+    return [parsed];
+}
+/**
+ * Run one extraction request against the Anthropic Messages API.
+ *
+ * @param params Object with `content` (page text, capped at 30,000 characters
+ *   before sending), `schema` (serialized into the prompt), optional `prompt`
+ *   (extra instructions), `llmApiKey`, and optional `llmModel` (falls back to
+ *   `DEFAULT_PROVIDER_MODELS.anthropic`).
+ * @returns `{ items, tokens: { input, output }, model }` where token counts
+ *   default to 0 when the response carries no usage data.
+ * @throws Error on a non-2xx response (dedicated messages for 401 and 429),
+ *   or Error('llm_parse_error') with `rawOutput` set when the reply is not
+ *   parseable JSON.
+ */
+async function callAnthropicExtract(params) {
+    const { content, schema, prompt, llmApiKey, llmModel } = params;
+    const model = llmModel || DEFAULT_PROVIDER_MODELS.anthropic;
+    // Assemble the single user message piece by piece; the instructions line is optional.
+    const sections = [
+        `Extract data from this webpage content according to the JSON schema.\n\n`,
+        `Schema: ${JSON.stringify(schema)}\n`,
+    ];
+    if (prompt) {
+        sections.push(`Instructions: ${prompt}\n`);
+    }
+    sections.push(`\nWebpage content:\n${content.slice(0, 30_000)}\n\n`, `Return ONLY valid JSON matching the schema. No explanation.`);
+    const requestBody = {
+        model,
+        max_tokens: 4096,
+        messages: [{ role: 'user', content: sections.join('') }],
+    };
+    const response = await fetch('https://api.anthropic.com/v1/messages', {
+        method: 'POST',
+        headers: {
+            'x-api-key': llmApiKey,
+            'anthropic-version': '2023-06-01',
+            'content-type': 'application/json',
+        },
+        body: JSON.stringify(requestBody),
+    });
+    if (!response.ok) {
+        // Best-effort read of the error body; an unreadable body becomes ''.
+        const body = await response.text().catch(() => '');
+        switch (response.status) {
+            case 401:
+                throw new Error('LLM API authentication failed (401). Check your Anthropic API key.');
+            case 429:
+                throw new Error('LLM API rate limit exceeded (429). Please wait and retry.');
+            default:
+                throw new Error(`Anthropic API error: HTTP ${response.status}${body ? ` — ${body.slice(0, 200)}` : ''}`);
+        }
+    }
+    const data = await response.json();
+    // Concatenate only the text blocks of the reply.
+    const text = (data.content ?? [])
+        .flatMap((block) => (block.type === 'text' ? [block.text] : []))
+        .join('');
+    let parsed;
+    try {
+        parsed = parseJsonSafe(text);
+    }
+    catch {
+        throw Object.assign(new Error('llm_parse_error'), { rawOutput: text });
+    }
+    const usage = data.usage;
+    return {
+        items: normalizeToItems(parsed),
+        tokens: {
+            input: usage?.input_tokens ?? 0,
+            output: usage?.output_tokens ?? 0,
+        },
+        model: data.model || model,
+    };
+}
+/**
+ * Run one extraction request against the Google Gemini `generateContent` API.
+ *
+ * The API key is sent in the `x-goog-api-key` header rather than the URL query
+ * string, so it does not end up in request URLs that proxies or logs may
+ * record. The model id is URL-encoded because it may come from caller input.
+ *
+ * @param params Object with `content` (page text, capped at 30,000 characters
+ *   before sending), `schema` (serialized into the prompt), optional `prompt`
+ *   (extra instructions), `llmApiKey`, and optional `llmModel` (falls back to
+ *   `DEFAULT_PROVIDER_MODELS.google`).
+ * @returns `{ items, tokens: { input, output }, model }` where token counts
+ *   default to 0 when the response carries no usage metadata.
+ * @throws Error on a non-2xx response (dedicated messages for 401/403 and
+ *   429), or Error('llm_parse_error') with `rawOutput` set when the reply is
+ *   not parseable JSON.
+ */
+async function callGoogleExtract(params) {
+    const { content, schema, prompt, llmApiKey, llmModel } = params;
+    const model = llmModel || DEFAULT_PROVIDER_MODELS.google;
+    const truncated = content.slice(0, 30_000);
+    const userText = `Extract data from this webpage content according to the JSON schema.\n\n` +
+        `Schema: ${JSON.stringify(schema)}\n` +
+        (prompt ? `Instructions: ${prompt}\n` : '') +
+        `\nWebpage content:\n${truncated}\n\n` +
+        `Return ONLY valid JSON matching the schema. No explanation.`;
+    // Encode the model so characters such as `/`, `?` or `#` cannot alter the request path.
+    const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(model)}:generateContent`;
+    const response = await fetch(endpoint, {
+        method: 'POST',
+        headers: {
+            'content-type': 'application/json',
+            'x-goog-api-key': llmApiKey,
+        },
+        body: JSON.stringify({
+            contents: [{ parts: [{ text: userText }] }],
+            generationConfig: { responseMimeType: 'application/json' },
+        }),
+    });
+    if (!response.ok) {
+        // Best-effort read of the error body; an unreadable body becomes ''.
+        const body = await response.text().catch(() => '');
+        if (response.status === 401 || response.status === 403)
+            throw new Error('LLM API authentication failed. Check your Google API key.');
+        if (response.status === 429)
+            throw new Error('LLM API rate limit exceeded (429). Please wait and retry.');
+        throw new Error(`Google API error: HTTP ${response.status}${body ? ` — ${body.slice(0, 200)}` : ''}`);
+    }
+    const data = await response.json();
+    // Only the first candidate is read; its parts are concatenated in order.
+    const text = (data.candidates?.[0]?.content?.parts ?? []).map(p => p.text).join('');
+    let parsed;
+    try {
+        parsed = parseJsonSafe(text);
+    }
+    catch {
+        const e = new Error('llm_parse_error');
+        e.rawOutput = text;
+        throw e;
+    }
+    return {
+        items: normalizeToItems(parsed),
+        tokens: {
+            input: data.usageMetadata?.promptTokenCount ?? 0,
+            output: data.usageMetadata?.candidatesTokenCount ?? 0,
+        },
+        model: data.modelVersion || model,
+    };
+}
+// ─── Main export ───────────────────────────────────────────────────────────
 /**
  * Extract structured data from content using an LLM.
+ *
+ * Supports OpenAI (default), Anthropic, and Google providers.
+ * Pass `llmProvider` + `llmApiKey` to select a provider.
+ * Falls back to OpenAI-compatible path when no provider is specified.
  */
 export async function extractWithLLM(options) {
-    const { content, instruction, baseUrl = 'https://api.openai.com/v1', model = 'gpt-4o-mini', maxTokens = 4000, } = options;
-    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
-    if (!apiKey) {
+    // Resolve aliases: new-style params take precedence over old-style
+    const resolvedProvider = (options.llmProvider || 'openai');
+    const resolvedApiKey = options.llmApiKey || options.apiKey || process.env.OPENAI_API_KEY;
+    const resolvedModel = options.llmModel || options.model;
+    const resolvedInstruction = options.prompt || options.instruction;
+    const { content, baseUrl = 'https://api.openai.com/v1', maxTokens = 4000, } = options;
+    if (!resolvedApiKey) {
         throw new Error('LLM extraction requires an API key.\n' +
-            'Set OPENAI_API_KEY environment variable or use --llm-key <key>');
+            'Set OPENAI_API_KEY environment variable or provide llmApiKey in the request.');
+    }
+    // ── Anthropic path ────────────────────────────────────────────────────────
+    if (resolvedProvider === 'anthropic') {
+        const schema = options.schema || {};
+        const result = await callAnthropicExtract({
+            content,
+            schema,
+            prompt: resolvedInstruction,
+            llmApiKey: resolvedApiKey,
+            llmModel: resolvedModel || DEFAULT_PROVIDER_MODELS.anthropic,
+        });
+        if (options.schema) {
+            validateSchemaShape(result.items, options.schema);
+        }
+        return {
+            items: result.items,
+            tokensUsed: result.tokens,
+            model: result.model,
+            provider: 'anthropic',
+        };
     }
+    // ── Google path ───────────────────────────────────────────────────────────
+    if (resolvedProvider === 'google') {
+        const schema = options.schema || {};
+        const result = await callGoogleExtract({
+            content,
+            schema,
+            prompt: resolvedInstruction,
+            llmApiKey: resolvedApiKey,
+            llmModel: resolvedModel || DEFAULT_PROVIDER_MODELS.google,
+        });
+        if (options.schema) {
+            validateSchemaShape(result.items, options.schema);
+        }
+        return {
+            items: result.items,
+            tokensUsed: result.tokens,
+            model: result.model,
+            provider: 'google',
+        };
+    }
+    // ── OpenAI path (default, backward-compatible) ────────────────────────────
+    const finalModel = resolvedModel || DEFAULT_PROVIDER_MODELS.openai;
     // Resolve schema: convert simple schemas to full JSON Schema if needed
     let resolvedSchema = options.schema;
     if (resolvedSchema && !isFullJsonSchema(resolvedSchema)) {
@@ -214,16 +457,16 @@ export async function extractWithLLM(options) {
     }
     // Choose system prompt based on whether a schema is provided
     const systemPrompt = resolvedSchema ? SCHEMA_SYSTEM_PROMPT : GENERIC_SYSTEM_PROMPT;
-    const userMessage = buildUserMessage(content, instruction, resolvedSchema ?? options.schema);
+    const userMessage = buildUserMessage(content, resolvedInstruction, resolvedSchema ?? options.schema);
     const responseFormat = buildResponseFormat(resolvedSchema);
     const response = await fetch(`${baseUrl}/chat/completions`, {
         method: 'POST',
         headers: {
             'Content-Type': 'application/json',
-            'Authorization': `Bearer ${apiKey}`,
+            'Authorization': `Bearer ${resolvedApiKey}`,
         },
         body: JSON.stringify({
-            model,
+            model: finalModel,
             messages: [
                 { role: 'system', content: systemPrompt },
                 { role: 'user', content: userMessage },
@@ -252,12 +495,13 @@ export async function extractWithLLM(options) {
     }
     const inputTokens = data.usage?.prompt_tokens ?? 0;
     const outputTokens = data.usage?.completion_tokens ?? 0;
-    const resolvedModel = data.model ?? model;
-    const cost = estimateCost(resolvedModel, inputTokens, outputTokens);
+    const resolvedFinalModel = data.model ?? finalModel;
+    const cost = estimateCost(resolvedFinalModel, inputTokens, outputTokens);
     return {
         items,
         tokensUsed: { input: inputTokens, output: outputTokens },
-        model: resolvedModel,
+        model: resolvedFinalModel,
         cost,
+        provider: 'openai',
     };
 }

package/dist/mcp/handlers/extract.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 /**
  * handleExtract — extract structured data from a URL.
  * Supports auto-detection, field lists, schema, and brand presets.
+ * Supports LLM-based extraction via llmProvider + llmApiKey.
  */
 import { type McpHandler } from './types.js';
 export declare const handleExtract: McpHandler;

package/dist/mcp/handlers/extract.js CHANGED Viewed

@@ -1,9 +1,11 @@
 /**
  * handleExtract — extract structured data from a URL.
  * Supports auto-detection, field lists, schema, and brand presets.
+ * Supports LLM-based extraction via llmProvider + llmApiKey.
  */
 import { peel } from '../../index.js';
 import { textResult, safeStringify, timeout } from './types.js';
+import { extractWithLLM } from '../../core/llm-extract.js';
 function extractColorsFromContent(content) {
     const hexRegex = /#[0-9A-Fa-f]{6}|#[0-9A-Fa-f]{3}/g;
     const matches = content.match(hexRegex);
@@ -27,6 +29,35 @@ export const handleExtract = async (args, _ctx) => {
     const schema = args['schema'];
     const fields = args['fields'];
     const render = args['render'] || false;
+    const llmApiKey = args['llmApiKey'];
+    const llmProvider = args['llmProvider'];
+    const llmModel = args['llmModel'];
+    const prompt = args['prompt'];
+    // LLM-based extraction: when llmApiKey (and optionally llmProvider) are provided
+    if (llmApiKey && (schema || prompt)) {
+        const peelResult = await Promise.race([
+            peel(url, { format: 'markdown', render }),
+            timeout(60000, 'LLM extract fetch'),
+        ]);
+        const extractResult = await extractWithLLM({
+            content: peelResult.content,
+            schema: schema,
+            prompt,
+            llmApiKey,
+            llmProvider: llmProvider || 'openai',
+            llmModel,
+        });
+        return textResult(safeStringify({
+            success: true,
+            url: peelResult.url,
+            data: extractResult.items.length === 1 ? extractResult.items[0] : extractResult.items,
+            llm: {
+                provider: extractResult.provider || llmProvider || 'openai',
+                model: extractResult.model,
+                tokens: extractResult.tokensUsed,
+            },
+        }));
+    }
     // Brand preset: fields=['name','logo','colors','fonts','socials'] or _brand flag
     const isBrandPreset = args['_brand'] ||
         (Array.isArray(fields) &&

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.20.0",
+  "version": "0.20.2",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",