npm - webpeel - Versions diffs - 0.21.84 → 0.21.86 - Mend

webpeel 0.21.84 → 0.21.86

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (28) hide show

package/dist/core/pipeline.d.ts +1 -1
package/dist/core/pipeline.js +7 -38
package/dist/core/strategy-hooks.d.ts +1 -1
package/dist/ee/challenge-re-export.d.ts +1 -0
package/dist/ee/challenge-re-export.js +1 -0
package/dist/ee/challenge-solver.d.ts +72 -0
package/dist/ee/challenge-solver.js +720 -0
package/dist/ee/domain-extractors.d.ts +48 -0
package/dist/ee/domain-extractors.js +6342 -0
package/dist/ee/domain-intel.d.ts +16 -0
package/dist/ee/domain-intel.js +133 -0
package/dist/ee/extractors-re-export.d.ts +1 -0
package/dist/ee/extractors-re-export.js +1 -0
package/dist/ee/premium-hooks.d.ts +20 -0
package/dist/ee/premium-hooks.js +50 -0
package/dist/ee/spa-detection.d.ts +2 -0
package/dist/ee/spa-detection.js +2 -0
package/dist/ee/stability.d.ts +4 -0
package/dist/ee/stability.js +29 -0
package/dist/ee/swr-cache.d.ts +14 -0
package/dist/ee/swr-cache.js +34 -0
package/dist/index.d.ts +1 -2
package/dist/index.js +3 -1
package/dist/server/app.js +4 -5
package/dist/server/routes/smart-search.d.ts +11 -0
package/dist/server/routes/smart-search.js +70 -10
package/dist/types.d.ts +1 -1
package/package.json +2 -1

package/dist/core/pipeline.d.ts CHANGED Viewed

@@ -5,7 +5,7 @@
  * mutable PipelineContext.  The stages are called in order by peel().
  */
 import { type AutoScrollOptions } from './actions.js';
-import { type DomainExtractResult } from './domain-extractors-basic.js';
+import { type DomainExtractResult } from '../ee/domain-extractors.js';
 import { type ReadabilityResult } from './readability.js';
 import { type QuickAnswerResult } from './quick-answer.js';
 import { Timer } from './timing.js';

package/dist/core/pipeline.js CHANGED Viewed

@@ -14,34 +14,8 @@ import { autoScroll as runAutoScroll } from './actions.js';
 import { extractStructured } from './extract.js';
 import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
 import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
-import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
+import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
 import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
-// ---------------------------------------------------------------------------
-// Domain extraction — lazy-load full extractors from compiled JS
-// ---------------------------------------------------------------------------
-// The compiled domain-extractors.js (312KB) ships in the npm package.
-// TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
-// If compiled JS is missing (bare repo clone without proprietary files),
-// falls back to basic stub (no domain extraction, just standard markdown).
-// Server premium hooks can override for additional caching/intelligence.
-let _extractorsLoaded = false;
-let _extractDomainData = null;
-let _getDomainExtractor = null;
-async function loadExtractors() {
-    if (_extractorsLoaded)
-        return;
-    _extractorsLoaded = true;
-    try {
-        const mod = await import('./domain-extractors.js');
-        _extractDomainData = mod.extractDomainData;
-        _getDomainExtractor = mod.getDomainExtractor;
-    }
-    catch {
-        // Compiled JS not available (bare repo clone) — basic stub will be used
-    }
-}
-// Start loading immediately (non-blocking)
-loadExtractors();
 import { extractReadableContent } from './readability.js';
 import { quickAnswer as runQuickAnswer } from './quick-answer.js';
 import { Timer } from './timing.js';
@@ -56,28 +30,23 @@ const log = createLogger('pipeline');
 // ---------------------------------------------------------------------------
 /**
  * Check if a URL has a domain extractor.
- * Priority: premium hook → full extractors (repo/server) → basic stub.
+ * Priority: premium hook → ee/domain-extractors.
  */
 function hasDomainExtractor(url) {
     const hookFn = getDomainExtractorHook();
     if (hookFn)
         return hookFn(url) !== null;
-    if (_getDomainExtractor)
-        return _getDomainExtractor(url) !== null;
-    return getDomainExtractorBasic(url) !== null;
+    return getDomainExtractor(url) !== null;
 }
 /**
  * Run domain extraction on HTML/URL.
- * Priority: premium hook → compiled extractors → basic stub.
+ * Priority: premium hook → ee/domain-extractors.
  */
 async function runDomainExtract(html, url) {
     const hookFn = getDomainExtractHook();
     if (hookFn)
         return hookFn(html, url);
-    await loadExtractors();
-    if (_extractDomainData)
-        return _extractDomainData(html, url);
-    return extractDomainDataBasic(html, url);
+    return extractDomainData(html, url);
 }
 /** Create the initial PipelineContext with defaults */
 export function createContext(url, options) {
@@ -609,7 +578,7 @@ export async function fetchContent(ctx) {
         const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
         if (canSolve) {
             try {
-                const { solveChallenge } = await import('./challenge-solver.js');
+                const { solveChallenge } = await import('../ee/challenge-solver.js');
                 const { detectChallenge } = await import('./challenge-detection.js');
                 const rawHtml = fetchResult.html || '';
                 const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
@@ -1179,7 +1148,7 @@ export async function postProcess(ctx) {
             const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
             if (canSolve && ctx.fetchResult?.html) {
                 try {
-                    const { solveChallenge } = await import('./challenge-solver.js');
+                    const { solveChallenge } = await import('../ee/challenge-solver.js');
                     const { detectChallenge } = await import('./challenge-detection.js');
                     const rawHtml = ctx.fetchResult.html;
                     const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);

package/dist/core/strategy-hooks.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  * All hook methods are optional — unset hooks are simply skipped.
  */
 import type { FetchResult } from './fetcher.js';
-import type { DomainExtractResult } from './domain-extractors-basic.js';
+import type { DomainExtractResult } from '../ee/domain-extractors.js';
 export interface StrategyResult extends FetchResult {
     method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
     /**

package/dist/ee/challenge-re-export.d.ts ADDED Viewed

@@ -0,0 +1 @@
+export { solveChallenge } from './challenge-solver.js';

package/dist/ee/challenge-re-export.js ADDED Viewed

@@ -0,0 +1 @@
+export { solveChallenge } from './challenge-solver.js';

package/dist/ee/challenge-solver.d.ts ADDED Viewed

@@ -0,0 +1,72 @@
+/**
+ * Challenge / bot-protection solver.
+ *
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
+ *  1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
+ *  2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
+ *
+ * Architecture note:
+ *  Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
+ *  is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
+ *  instead of running locally. This keeps the main Render container (512 MB) lean.
+ *
+ * Usage:
+ *  const result = await solveChallenge(url, 'cloudflare', html);
+ *  if (result.solved) {
+ *    // result.html = real page content
+ *    // result.cookies = ["cf_clearance=...", ...]
+ *  }
+ */
+import type { ChallengeType } from '../core/challenge-detection.js';
+export interface ImageCaptchaResult {
+    solved: boolean;
+    rounds: number;
+    error?: string;
+}
+/**
+ * Ask the moondream vision model which grid cells contain the target object.
+ * Returns an array of 1-indexed grid positions (1–9), or null if the call fails.
+ */
+export declare function askVisionModel(base64Image: string, targetObject: string): Promise<number[] | null>;
+/**
+ * Detect if the page has an image grid CAPTCHA and extract the target object.
+ * Returns the object name (e.g. "traffic lights") or null if not detected.
+ */
+export declare function detectImageCaptchaTarget(page: import('playwright').Page): Promise<string | null>;
+/**
+ * Solve an image grid CAPTCHA using the moondream vision model.
+ *
+ * Flow per round:
+ *  1. Screenshot the CAPTCHA grid element
+ *  2. Send to moondream → get grid positions
+ *  3. Click identified cells
+ *  4. Click Verify button
+ *  5. Check if solved; if a new round appears, repeat (max 3 rounds)
+ */
+export declare function solveImageCaptcha(page: import('playwright').Page, targetObject: string): Promise<ImageCaptchaResult>;
+export interface SolveOptions {
+    /** Hard timeout in ms (default: 15 000) */
+    timeout?: number;
+    /** Optional proxy URL (http://user:pass@host:port) */
+    proxy?: string;
+}
+export interface SolveResult {
+    solved: boolean;
+    html: string;
+    /** Raw Set-Cookie header values extracted after solve */
+    cookies?: string[];
+    /** How the solve was performed */
+    method?: 'local-browser' | 'remote-worker' | 'accessibility';
+    /** Error details if solve failed */
+    error?: string;
+}
+/**
+ * Attempt to solve a bot-protection challenge.
+ *
+ * @param url            The page URL (used for proxy routing and cookie caching)
+ * @param challengeType  The type of challenge as detected by challenge-detection
+ * @param html           The raw challenge HTML (used for context / fallback)
+ * @param options        Optional timeout and proxy settings
+ * @returns              Solve result with real HTML content and cookies if successful
+ */
+export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;