npm - webpeel - Versions diffs - 0.20.7 → 0.20.9 - Mend

webpeel 0.20.7 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/cli/commands/auth.js +30 -5
package/dist/cli/commands/interact.js +1 -0
package/dist/core/profiles.d.ts +15 -1
package/dist/core/profiles.js +137 -2
package/dist/server/routes/agent.d.ts +1 -0
package/dist/server/routes/agent.js +98 -5
package/dist/server/routes/crawl.js +74 -1
package/package.json +1 -1

package/dist/cli/commands/auth.js CHANGED Viewed

@@ -3,6 +3,7 @@
  */
 import { handleLogin, handleLogout, handleUsage, loadConfig, saveConfig } from '../../cli-auth.js';
 import { clearCache, cacheStats } from '../../cache.js';
+import { loginToProfile } from '../../core/profiles.js';
 import { cliVersion } from '../utils.js';
 export function registerAuthCommands(program) {
     // ── auth command ──────────────────────────────────────────────────────────
@@ -254,13 +255,37 @@ export function registerAuthCommands(program) {
         console.log('   Try: webpeel "https://news.ycombinator.com" --json');
     });
     // ── login command ─────────────────────────────────────────────────────────
+    // Two modes:
+    //   webpeel login             — interactive API key authentication (existing)
+    //   webpeel login <domain>    — browser login: open site, log in, save cookies as profile
     program
-        .command('login')
-        .description('Authenticate the CLI with your API key')
-        .action(async () => {
+        .command('login [domain]')
+        .description('Authenticate: no args = API key auth; with domain = browser login (saves cookies as a named profile)')
+        .option('--profile <name>', 'Profile name to save under (defaults to the domain)')
+        .action(async (domain, opts) => {
         try {
-            await handleLogin();
-            process.exit(0);
+            if (domain) {
+                // ── Browser login mode ──────────────────────────────────────────
+                const url = domain.startsWith('http') ? domain : `https://${domain}`;
+                // Extract hostname for profile name default (e.g. "instagram.com" from "https://www.instagram.com/")
+                let defaultProfileName;
+                try {
+                    const hostname = new URL(url).hostname;
+                    // Strip "www." prefix for cleaner profile names
+                    defaultProfileName = hostname.replace(/^www\./, '');
+                }
+                catch {
+                    defaultProfileName = domain;
+                }
+                const profileName = opts.profile || defaultProfileName;
+                await loginToProfile(url, profileName);
+                process.exit(0);
+            }
+            else {
+                // ── API key auth mode (original behavior) ───────────────────────
+                await handleLogin();
+                process.exit(0);
+            }
         }
         catch (error) {
             console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);

package/dist/cli/commands/interact.js CHANGED Viewed

@@ -310,6 +310,7 @@ export function registerInteractCommands(program) {
         .option('--schema <json>', 'Schema template name (e.g. product, article) or JSON schema for structured output')
         .option('-s, --silent', 'Silent mode (no spinner)')
         .option('--json', 'Output as JSON')
+        .option('--stream', 'Stream progress via SSE (calls API endpoint, requires API key)')
         .action(async (prompt, options) => {
         const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
         const urls = options.urls ? options.urls.split(',').map((u) => u.trim()) : undefined;

package/dist/core/profiles.d.ts CHANGED Viewed

@@ -14,7 +14,8 @@ export interface ProfileMetadata {
     description?: string;
 }
 /**
- * Valid profile names: letters, digits, hyphens only. No spaces or special chars.
+ * Valid profile names: letters, digits, hyphens, and dots. No spaces or special chars.
+ * Dots are allowed so domain names like "instagram.com" work as profile names.
  */
 export declare function isValidProfileName(name: string): boolean;
 /**
@@ -45,3 +46,16 @@ export declare function deleteProfile(name: string): boolean;
  * 3. On browser close or Ctrl+C, captures storage state and saves the profile
  */
 export declare function createProfile(name: string, description?: string): Promise<void>;
+/**
+ * Open a headed browser, navigate to `url`, and wait for the user to log in.
+ * Pressing Enter (or closing the browser) saves the session as a named profile.
+ *
+ * Unlike `createProfile()` (which opens to about:blank and waits for browser close),
+ * this function:
+ *   1. Navigates directly to the given URL on launch
+ *   2. Waits for the user to press Enter (or close the browser) to save
+ *   3. Saves storage state AND creates metadata under ~/.webpeel/profiles/<name>/
+ *
+ * Profile names may contain letters, digits, hyphens, and dots (e.g. "instagram.com").
+ *
+ * @param url Page to open in the browser. If the initial navigation fails the
+ *   browser stays open so the user can navigate manually.
+ * @param profileName Name to save the session under; must pass `isValidProfileName()`.
+ * @param description Optional text stored in the profile's metadata.
+ * @returns Resolves after the save attempt has finished and the browser has been
+ *   shut down. A failed save is reported as a console warning, not a rejection.
+ * @throws Error if `profileName` is not a valid profile name.
+ */
+export declare function loginToProfile(url: string, profileName: string, description?: string): Promise<void>;

package/dist/core/profiles.js CHANGED Viewed

@@ -19,10 +19,11 @@ function ensureProfilesDir() {
 }
 // ─── Name validation ─────────────────────────────────────────────────────────
 /**
  * Check whether `name` can be used as a profile name.
  *
  * Valid names are 1–64 characters long, consist only of letters, digits,
  * hyphens, and dots, and must not begin with a dot. Dots are allowed so domain
  * names like "instagram.com" work as profile names. The leading-dot rule
  * rejects "." and ".." (and hidden-directory style names): a profile name is
  * used as a single path component, so those two would resolve to the profiles
  * directory itself or to its parent.
  *
  * @param name Candidate profile name.
  * @returns `true` if the name is valid, `false` otherwise (including for
  *   non-string input).
  */
 export function isValidProfileName(name) {
     // First character: letter, digit, or hyphen. Up to 63 more characters may
     // also be dots, which gives the 1–64 length bound without a separate check.
     return typeof name === 'string' && /^[a-zA-Z0-9-][a-zA-Z0-9.-]{0,63}$/.test(name);
 }
 // ─── Core helpers ─────────────────────────────────────────────────────────────
 /**
@@ -213,3 +214,137 @@ export async function createProfile(name, description) {
         });
     });
 }
+// ─── Browser-based login helper ───────────────────────────────────────────────
+/**
+ * Open a headed browser, navigate to `url`, and wait for the user to log in.
+ * Pressing Enter (or closing the browser) saves the session as a named profile.
+ *
+ * Unlike `createProfile()` (which opens to about:blank and waits for browser close),
+ * this function:
+ *   1. Navigates directly to the given URL on launch
+ *   2. Waits for the user to press Enter (or close the browser) to save
+ *   3. Saves storage state AND creates metadata under ~/.webpeel/profiles/<name>/
+ *
+ * Profile names may contain letters, digits, hyphens, and dots (e.g. "instagram.com").
+ */
+export async function loginToProfile(url, profileName, description) {
+    if (!isValidProfileName(profileName)) {
+        throw new Error(`Invalid profile name "${profileName}". Use only letters, numbers, hyphens, and dots (no spaces).`);
+    }
+    ensureProfilesDir();
+    const profileDir = path.join(PROFILES_DIR, profileName);
+    const isUpdate = existsSync(profileDir) && existsSync(path.join(profileDir, 'metadata.json'));
+    mkdirSync(profileDir, { recursive: true });
+    const browser = await chromium.launch({ headless: false });
+    const context = await browser.newContext();
+    const page = await context.newPage();
+    try {
+        await page.goto(url);
+    }
+    catch (e) {
+        // Non-fatal — browser is open, user can navigate manually
+        if (process.env.DEBUG)
+            console.debug('[webpeel]', 'initial navigation error:', e instanceof Error ? e.message : e);
+    }
+    console.log('');
+    console.log('╔══════════════════════════════════════════════════════╗');
+    console.log(`║  WebPeel Browser Login`);
+    console.log(`║  URL:     ${url}`);
+    console.log(`║  Profile: ${profileName}`);
+    console.log('║                                                      ║');
+    console.log('║  Log in, then press Enter here to save your session. ║');
+    console.log('║  (Or close the browser window — same effect.)        ║');
+    console.log('╚══════════════════════════════════════════════════════╝');
+    console.log('');
+    let saved = false;
+    const saveAndClose = async () => {
+        if (saved)
+            return;
+        saved = true;
+        console.log('\nCapturing browser session...');
+        try {
+            const storageState = await context.storageState();
+            writeFileSync(path.join(profileDir, 'storage-state.json'), JSON.stringify(storageState, null, 2));
+            // Extract unique domains from cookies (strip leading dot)
+            const domains = [
+                ...new Set((storageState.cookies ?? [])
+                    .map((c) => (c.domain ?? '').replace(/^\./, ''))
+                    .filter(Boolean)),
+            ];
+            const now = new Date().toISOString();
+            const meta = isUpdate
+                ? {
+                    // Preserve original creation date on update
+                    ...((() => {
+                        try {
+                            return JSON.parse(readFileSync(path.join(profileDir, 'metadata.json'), 'utf-8'));
+                        }
+                        catch {
+                            return {};
+                        }
+                    })()),
+                    name: profileName,
+                    lastUsed: now,
+                    domains,
+                    ...(description ? { description } : {}),
+                }
+                : {
+                    name: profileName,
+                    created: now,
+                    lastUsed: now,
+                    domains,
+                    ...(description ? { description } : {}),
+                };
+            writeFileSync(path.join(profileDir, 'metadata.json'), JSON.stringify(meta, null, 2));
+            console.log(`✅ Profile "${profileName}" ${isUpdate ? 'updated' : 'saved'}!`);
+            if (domains.length > 0) {
+                console.log(`   Domains: ${domains.join(', ')}`);
+            }
+            else {
+                console.log('   No login sessions detected (no cookies captured).');
+                console.log('   Make sure you completed the login before pressing Enter.');
+            }
+            console.log('');
+            console.log(`   Use with: webpeel "${url}" --profile ${profileName}`);
+        }
+        catch (e) {
+            console.error('Warning: Failed to save storage state:', e instanceof Error ? e.message : String(e));
+            // Clean up partial directory if this was a new profile
+            if (!isUpdate) {
+                try {
+                    rmSync(profileDir, { recursive: true, force: true });
+                }
+                catch {
+                    // ignore cleanup errors
+                }
+            }
+        }
+        try {
+            await browser.close();
+        }
+        catch {
+            // ignore close errors
+        }
+    };
+    // Three ways to save: Enter key, browser close, or Ctrl+C
+    await new Promise((resolve) => {
+        let resolved = false;
+        const done = async () => {
+            if (resolved)
+                return;
+            resolved = true;
+            await saveAndClose();
+            resolve();
+        };
+        // Wait for Enter key on stdin
+        if (process.stdin.isTTY) {
+            process.stdin.setRawMode(false);
+        }
+        process.stdin.resume();
+        process.stdin.once('data', () => done());
+        // Browser closed by user
+        browser.on('disconnected', () => done());
+        // Ctrl+C
+        process.once('SIGINT', () => done());
+    });
+}

package/dist/server/routes/agent.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@
  * Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
  *
  * Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
+ * Streaming support: pass `stream: true` to get SSE events instead of polling.
  *
  * 5-minute in-memory cache. Max 10 sources per request.
  */

package/dist/server/routes/agent.js CHANGED Viewed

@@ -14,6 +14,7 @@
  * Returns: { success, data|answer, sources, method, elapsed, tokensUsed }
  *
  * Webhook support: pass `webhook` URL to get async delivery with HMAC-SHA256 signing.
+ * Streaming support: pass `stream: true` to get SSE events instead of polling.
  *
  * 5-minute in-memory cache. Max 10 sources per request.
  */
@@ -81,8 +82,14 @@ function setCache(key, result) {
     }
     cache.set(key, { result, expiresAt: Date.now() + CACHE_TTL });
 }
+// ---------------------------------------------------------------------------
+// SSE helpers
+// ---------------------------------------------------------------------------
+function sseWrite(res, event, data) {
+    res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
+}
 async function runAgentQuery(params) {
-    const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources } = params;
+    const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, onSearching, onFetching, onExtracting } = params;
     const startMs = Date.now();
     const numSources = Math.min(maxSources || 5, 10);
     // Cache check
@@ -97,6 +104,8 @@ async function runAgentQuery(params) {
     }
     else {
         log.info(`Searching web for: "${prompt}"`);
+        if (onSearching)
+            onSearching();
         const { provider, apiKey: searchApiKey } = getBestSearchProvider();
         try {
             const searchResults = await provider.searchWeb(prompt.trim(), { count: numSources, apiKey: searchApiKey });
@@ -111,6 +120,8 @@ async function runAgentQuery(params) {
     }
     // Step 2: Fetch pages in parallel
     log.info(`Fetching ${sourceUrls.length} sources in parallel`);
+    if (onFetching)
+        onFetching(sourceUrls.length);
     const PER_SOURCE_TIMEOUT_MS = 5000;
     const fetchPromises = sourceUrls.map(async (source) => {
         try {
@@ -136,6 +147,8 @@ async function runAgentQuery(params) {
     let result;
     if (schema && llmApiKey) {
         log.info('Using LLM extraction');
+        if (onExtracting)
+            onExtracting('llm');
         const extracted = await extractWithLLM({
             content: combinedContent.slice(0, 30000), schema, llmApiKey, llmProvider: (llmProvider || 'openai'), llmModel,
             prompt: `Based on these web pages, ${prompt}`, url: fetchResults[0].url,
@@ -146,6 +159,8 @@ async function runAgentQuery(params) {
     }
     else {
         log.info('Using BM25 text extraction');
+        if (onExtracting)
+            onExtracting('bm25');
         const qa = quickAnswer({ question: prompt, content: combinedContent, maxPassages: 3, maxChars: 2000 });
         result = { success: true, answer: qa.answer || combinedContent.slice(0, 2000), confidence: qa.confidence ?? 0,
             sources: fetchResults.map((r) => ({ url: r.url, title: r.title })), method: 'agent-bm25', tokensUsed: totalTokens, elapsed: Date.now() - startMs };
@@ -158,9 +173,9 @@ async function runAgentQuery(params) {
 // ---------------------------------------------------------------------------
 export function createAgentRouter() {
     const router = Router();
-    // ── POST /v1/agent — single query (with optional webhook) ──────────────
+    // ── POST /v1/agent — single query (with optional webhook or stream) ──────
     router.post('/', async (req, res) => {
-        const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook } = req.body || {};
+        const { prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources, webhook, stream } = req.body || {};
         const requestId = req.requestId || crypto.randomUUID();
         if (!prompt?.trim()) {
             return res.status(400).json({
@@ -170,6 +185,34 @@ export function createAgentRouter() {
                 requestId,
             });
         }
+        // ── Streaming mode (SSE) ─────────────────────────────────────────────
+        if (stream === true) {
+            res.setHeader('Content-Type', 'text/event-stream');
+            res.setHeader('Cache-Control', 'no-cache');
+            res.setHeader('Connection', 'keep-alive');
+            res.setHeader('X-Accel-Buffering', 'no');
+            res.flushHeaders();
+            try {
+                const result = await runAgentQuery({
+                    prompt, schema, llmApiKey, llmProvider, llmModel, urls, sources: maxSources,
+                    onSearching: () => {
+                        sseWrite(res, 'searching', { message: 'Searching the web...' });
+                    },
+                    onFetching: (count) => {
+                        sseWrite(res, 'fetching', { message: `Fetching ${count} sources...`, count });
+                    },
+                    onExtracting: (method) => {
+                        sseWrite(res, 'extracting', { message: method === 'llm' ? 'Extracting with LLM...' : 'Analyzing with BM25...', method });
+                    },
+                });
+                sseWrite(res, 'done', { ...result, requestId });
+            }
+            catch (err) {
+                sseWrite(res, 'error', { message: err.message || 'An unexpected error occurred', requestId });
+            }
+            res.end();
+            return;
+        }
         // Async mode: webhook provided → return immediately, deliver result later
         if (webhook) {
             const jobId = crypto.randomUUID();
@@ -198,7 +241,7 @@ export function createAgentRouter() {
     });
     // ── POST /v1/agent/batch — parallel batch queries ─────────────────────
     router.post('/batch', async (req, res) => {
-        const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook } = req.body || {};
+        const { prompts, schema, llmApiKey, llmProvider, llmModel, sources, webhook, stream } = req.body || {};
         const requestId = req.requestId || crypto.randomUUID();
         if (!Array.isArray(prompts) || prompts.length === 0) {
             return res.status(400).json({
@@ -214,7 +257,57 @@ export function createAgentRouter() {
         const jobId = crypto.randomUUID();
         const job = { id: jobId, status: 'processing', total: prompts.length, completed: 0, results: [], webhook, createdAt: Date.now() };
         batchJobs.set(jobId, job);
-        // Return immediately, then process in background
+        // ── Streaming mode (SSE) — keep connection open ──────────────────────
+        if (stream === true) {
+            res.setHeader('Content-Type', 'text/event-stream');
+            res.setHeader('Cache-Control', 'no-cache');
+            res.setHeader('Connection', 'keep-alive');
+            res.setHeader('X-Accel-Buffering', 'no');
+            res.flushHeaders();
+            // Send start event
+            sseWrite(res, 'start', { id: jobId, total: prompts.length, requestId });
+            const sem = new Semaphore(5);
+            const tasks = prompts.map(async (prompt) => {
+                await sem.acquire();
+                try {
+                    const result = await runAgentQuery({ prompt, schema, llmApiKey, llmProvider, llmModel, sources });
+                    const entry = {
+                        prompt,
+                        success: !!result.success,
+                        answer: result.answer,
+                        data: result.data,
+                        sources: result.sources,
+                        method: result.method,
+                        elapsed: result.elapsed,
+                    };
+                    job.results.push(entry);
+                    job.completed++;
+                    // Send per-prompt progress event
+                    sseWrite(res, 'progress', { completed: job.completed, total: job.total, result: entry });
+                }
+                catch (err) {
+                    const entry = { prompt, success: false, error: err.message };
+                    job.results.push(entry);
+                    job.completed++;
+                    sseWrite(res, 'progress', { completed: job.completed, total: job.total, result: entry });
+                }
+                finally {
+                    sem.release();
+                }
+            });
+            await Promise.allSettled(tasks);
+            job.status = 'completed';
+            // Send done event
+            sseWrite(res, 'done', { id: jobId, total: job.total, completed: job.completed, requestId });
+            res.end();
+            // Fire webhook if configured
+            if (webhook) {
+                sendWebhook(webhook, 'agent.batch.completed', { id: jobId, total: job.total, completed: job.completed, results: job.results })
+                    .catch((err) => log.error('Batch webhook failed:', err.message));
+            }
+            return;
+        }
+        // Non-streaming mode: Return immediately, then process in background
         res.json({ success: true, id: jobId, status: 'processing', total: prompts.length, requestId });
         // Process in background with concurrency limit of 5
         // eslint-disable-next-line @typescript-eslint/no-floating-promises

package/dist/server/routes/crawl.js CHANGED Viewed

@@ -11,12 +11,14 @@ import { Router } from 'express';
 import '../types.js'; // Augments Express.Request with requestId
 import { crawl } from '../../core/crawler.js';
 import { validateUrlForSSRF, SSRFError } from '../middleware/url-validator.js';
+import crypto from 'crypto';
 export function createCrawlRouter(jobQueue) {
     const router = Router();
     /**
      * POST /v1/crawl
      *
      * Start an async crawl job. Returns a job ID immediately; poll GET /v1/crawl/:id for status.
+     * With stream:true, keeps the connection open and sends SSE events per page.
      *
      * Body:
      *   url            {string}   Required. Starting URL.
@@ -26,10 +28,11 @@ export function createCrawlRouter(jobQueue) {
      *   excludePatterns {string[]} Regex patterns — skip matching URLs.
      *   formats        {string[]} Content formats: 'markdown' | 'text' (default: ['markdown']).
      *   webhook        {object}   Optional webhook to POST results to when done.
+     *   stream         {boolean}  If true, respond with SSE events (start → progress → done).
      */
     router.post('/', async (req, res) => {
         try {
-            const { url, maxPages = 10, maxDepth = 2, includePatterns = [], excludePatterns = [], webhook, } = req.body ?? {};
+            const { url, maxPages = 10, maxDepth = 2, includePatterns = [], excludePatterns = [], webhook, stream, } = req.body ?? {};
             // Validate URL
             if (!url || typeof url !== 'string') {
                 res.status(400).json({
@@ -78,6 +81,76 @@ export function createCrawlRouter(jobQueue) {
                 throw error;
             }
             const ownerId = req.auth?.keyInfo?.accountId;
+            // ── Streaming mode (SSE) — keep connection open ──────────────────────
+            if (stream === true) {
+                res.setHeader('Content-Type', 'text/event-stream');
+                res.setHeader('Cache-Control', 'no-cache');
+                res.setHeader('Connection', 'keep-alive');
+                res.setHeader('X-Accel-Buffering', 'no');
+                res.flushHeaders();
+                const jobId = crypto.randomUUID();
+                // Send start event (total unknown until crawl runs)
+                res.write(`event: start\ndata: ${JSON.stringify({ id: jobId, url, maxPages, requestId: req.requestId })}\n\n`);
+                const crawlOptions = {
+                    maxPages,
+                    maxDepth,
+                    tier: req.auth?.tier,
+                    onProgress: (progress) => {
+                        const total = progress.crawled + progress.queued;
+                        res.write(`event: progress\ndata: ${JSON.stringify({
+                            id: jobId,
+                            completed: progress.crawled,
+                            total,
+                            queued: progress.queued,
+                            currentUrl: progress.currentUrl,
+                        })}\n\n`);
+                    },
+                };
+                if (Array.isArray(includePatterns) && includePatterns.length > 0) {
+                    crawlOptions.includePatterns = includePatterns;
+                }
+                if (Array.isArray(excludePatterns) && excludePatterns.length > 0) {
+                    crawlOptions.excludePatterns = excludePatterns;
+                }
+                try {
+                    const results = await crawl(url, crawlOptions);
+                    const data = results.map(r => ({
+                        url: r.url,
+                        title: r.title,
+                        content: r.markdown,
+                        links: r.links,
+                        elapsed: r.elapsed,
+                    }));
+                    res.write(`event: done\ndata: ${JSON.stringify({
+                        id: jobId,
+                        total: results.length,
+                        completed: results.length,
+                        results: data,
+                        requestId: req.requestId,
+                    })}\n\n`);
+                    // Fire webhook if configured
+                    if (webhook) {
+                        Promise.resolve(jobQueue.createJob('crawl', webhook, ownerId)).then((job) => {
+                            jobQueue.updateJob(job.id, {
+                                status: 'completed',
+                                data,
+                                total: results.length,
+                                completed: results.length,
+                                creditsUsed: results.length,
+                            });
+                        }).catch(() => { });
+                    }
+                }
+                catch (error) {
+                    res.write(`event: error\ndata: ${JSON.stringify({
+                        id: jobId,
+                        message: error.message || 'Crawl failed',
+                        requestId: req.requestId,
+                    })}\n\n`);
+                }
+                res.end();
+                return;
+            }
             const job = await jobQueue.createJob('crawl', webhook, ownerId);
             // Start crawl in background
             setImmediate(async () => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.20.7",
+  "version": "0.20.9",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",