npm - webpeel - Versions diffs - 0.21.81 → 0.21.83 - Mend

webpeel 0.21.81 → 0.21.83

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/dist/core/domain-extractors-basic.d.ts +19 -8
package/dist/core/domain-extractors-basic.js +4 -7
package/dist/core/domain-extractors-public.d.ts +20 -0
package/dist/core/domain-extractors-public.js +35 -0
package/dist/core/pipeline.d.ts +1 -1
package/dist/core/pipeline.js +55 -29
package/dist/core/stealth-patches.d.ts +10 -53
package/dist/core/stealth-patches.js +14 -333
package/dist/core/strategy-hooks.d.ts +1 -1
package/dist/index.d.ts +2 -1
package/dist/index.js +1 -1
package/dist/server/app.js +16 -4
package/dist/server/premium/challenge.d.ts +0 -7
package/dist/server/premium/challenge.js +0 -7
package/dist/server/premium/extractors.d.ts +0 -9
package/dist/server/premium/extractors.js +0 -9
package/dist/server/premium/spa-detection.d.ts +0 -15
package/dist/server/premium/spa-detection.js +2 -39
package/dist/server/premium/stability.d.ts +2 -21
package/dist/server/premium/stability.js +3 -32
package/dist/types.d.ts +1 -1
package/package.json +1 -1

package/dist/core/domain-extractors-basic.d.ts CHANGED

@@ -1,14 +1,25 @@
 /**
- * Basic domain extraction — public/free tier.
+ * Domain extraction types and basic stub.
  *
- * Handles a few common domains with simple logic.
- * Full 55+ domain extractors are premium/server-only.
- *
- * This module is safe to include in the npm package.
- * The full `domain-extractors.ts` is compiled for the server
- * but wired in only when premium hooks are registered.
+ * Types are defined HERE (always available) so nothing depends
+ * on the proprietary domain-extractors.ts TypeScript source.
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
  */
-import type { DomainExtractResult } from './domain-extractors.js';
+/** Structured result from a domain-specific extractor */
+export interface DomainExtractResult {
+    /** Canonical domain name (e.g. 'twitter.com') */
+    domain: string;
+    /** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
+    type: string;
+    /** Domain-specific structured data */
+    structured: Record<string, any>;
+    /** Clean markdown representation of the content */
+    cleanContent: string;
+    /** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
+    rawHtmlSize?: number;
+}
+/** An extractor receives the raw HTML and original URL, may make API calls. */
+export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
 /**
  * Basic domain data extractor — free tier stub.
  *

package/dist/core/domain-extractors-basic.js CHANGED

@@ -1,12 +1,9 @@
 /**
- * Basic domain extraction — public/free tier.
+ * Domain extraction types and basic stub.
  *
- * Handles a few common domains with simple logic.
- * Full 55+ domain extractors are premium/server-only.
- *
- * This module is safe to include in the npm package.
- * The full `domain-extractors.ts` is compiled for the server
- * but wired in only when premium hooks are registered.
+ * Types are defined HERE (always available) so nothing depends
+ * on the proprietary domain-extractors.ts TypeScript source.
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
  */
 /**
  * Basic domain data extractor — free tier stub.

package/dist/core/domain-extractors-public.d.ts ADDED

@@ -0,0 +1,20 @@
+/**
+ * Public re-exports for domain extraction functions.
+ *
+ * This module is always available (npm + repo + server).
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
+ *
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
+ */
+import type { DomainExtractResult } from './domain-extractors-basic.js';
+/**
+ * Check if a URL has a domain-specific extractor.
+ * Returns the extractor function or null.
+ */
+export declare function getDomainExtractor(url: string): any;
+/**
+ * Run domain-specific extraction on HTML content.
+ * Returns structured domain data or null.
+ */
+export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;

package/dist/core/domain-extractors-public.js ADDED

@@ -0,0 +1,35 @@
+/**
+ * Public re-exports for domain extraction functions.
+ *
+ * This module is always available (npm + repo + server).
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
+ *
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
+ */
+// Top-level await: module fully loaded before any exports are called.
+// This is safe in ESM (Node 14.8+, all modern bundlers).
+let _getDomainExtractor = null;
+let _extractDomainData = null;
+try {
+    const mod = await import('./domain-extractors.js');
+    _getDomainExtractor = mod.getDomainExtractor;
+    _extractDomainData = mod.extractDomainData;
+}
+catch {
+    // Compiled JS not available (bare repo clone) — stubs return null
+}
+/**
+ * Check if a URL has a domain-specific extractor.
+ * Returns the extractor function or null.
+ */
+export function getDomainExtractor(url) {
+    return _getDomainExtractor ? _getDomainExtractor(url) : null;
+}
+/**
+ * Run domain-specific extraction on HTML content.
+ * Returns structured domain data or null.
+ */
+export async function extractDomainData(html, url) {
+    return _extractDomainData ? _extractDomainData(html, url) : null;
+}

package/dist/core/pipeline.d.ts CHANGED

@@ -5,7 +5,7 @@
  * mutable PipelineContext.  The stages are called in order by peel().
  */
 import { type AutoScrollOptions } from './actions.js';
-import { type DomainExtractResult } from './domain-extractors.js';
+import { type DomainExtractResult } from './domain-extractors-basic.js';
 import { type ReadabilityResult } from './readability.js';
 import { type QuickAnswerResult } from './quick-answer.js';
 import { Timer } from './timing.js';

package/dist/core/pipeline.js CHANGED

@@ -16,26 +16,32 @@ import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './
 import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
 import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
 import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
-// Lazy-loaded full extractors — available in repo/server, absent in npm package.
-// The dynamic import avoids hard failures when domain-extractors.js is excluded from npm.
-let _fullExtractorsLoaded = false;
-let _fullExtractDomainData = null;
-let _fullGetDomainExtractor = null;
-async function loadFullExtractors() {
-    if (_fullExtractorsLoaded)
+// ---------------------------------------------------------------------------
+// Domain extraction — lazy-load full extractors from compiled JS
+// ---------------------------------------------------------------------------
+// The compiled domain-extractors.js (312KB) ships in the npm package.
+// TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
+// If compiled JS is missing (bare repo clone without proprietary files),
+// falls back to basic stub (no domain extraction, just standard markdown).
+// Server premium hooks can override for additional caching/intelligence.
+let _extractorsLoaded = false;
+let _extractDomainData = null;
+let _getDomainExtractor = null;
+async function loadExtractors() {
+    if (_extractorsLoaded)
         return;
-    _fullExtractorsLoaded = true;
+    _extractorsLoaded = true;
     try {
         const mod = await import('./domain-extractors.js');
-        _fullExtractDomainData = mod.extractDomainData;
-        _fullGetDomainExtractor = mod.getDomainExtractor;
+        _extractDomainData = mod.extractDomainData;
+        _getDomainExtractor = mod.getDomainExtractor;
     }
     catch {
-        // Not available (npm package) — basic stubs will be used
+        // Compiled JS not available (bare repo clone) — basic stub will be used
     }
 }
-// Eagerly start loading (non-blocking)
-loadFullExtractors();
+// Start loading immediately (non-blocking)
+loadExtractors();
 import { extractReadableContent } from './readability.js';
 import { quickAnswer as runQuickAnswer } from './quick-answer.js';
 import { Timer } from './timing.js';
@@ -56,25 +62,21 @@ function hasDomainExtractor(url) {
     const hookFn = getDomainExtractorHook();
     if (hookFn)
         return hookFn(url) !== null;
-    // Full extractors available (repo/server build)?
-    if (_fullGetDomainExtractor)
-        return _fullGetDomainExtractor(url) !== null;
-    // npm package fallback — basic stubs
+    if (_getDomainExtractor)
+        return _getDomainExtractor(url) !== null;
     return getDomainExtractorBasic(url) !== null;
 }
 /**
  * Run domain extraction on HTML/URL.
- * Priority: premium hook → full extractors (repo/server) → basic stub.
+ * Priority: premium hook → compiled extractors → basic stub.
  */
 async function runDomainExtract(html, url) {
     const hookFn = getDomainExtractHook();
     if (hookFn)
         return hookFn(html, url);
-    // Full extractors available (repo/server build)?
-    await loadFullExtractors(); // Ensure loaded
-    if (_fullExtractDomainData)
-        return _fullExtractDomainData(html, url);
-    // npm package fallback — basic stubs
+    await loadExtractors();
+    if (_extractDomainData)
+        return _extractDomainData(html, url);
     return extractDomainDataBasic(html, url);
 }
 /** Create the initial PipelineContext with defaults */
@@ -199,15 +201,39 @@ export function normalizeOptions(ctx) {
     if (autoScrollOpts) {
         ctx.render = true;
     }
-    // Auto-detect SPAs that require browser rendering (no --render flag needed)
-    // Premium hook provides full SPA domain list; basic has a small default set.
+    // Auto-detect SPAs that require browser rendering (no --render flag needed).
+    // This list is NOT proprietary — every developer knows these sites are SPAs.
+    // The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
+    // Premium hook can extend this for additional server-side intelligence.
     if (!ctx.render) {
         const spaDomainsHook = getSPADomainsHook();
         const spaPatternsHook = getSPAPatternsHook();
-        // Basic SPA defaults — minimal set for free tier
-        const DEFAULT_SPA_DOMAINS = new Set([]);
-        const DEFAULT_SPA_PATTERNS = [];
-        // Premium hook merges its full list; basic uses defaults
+        // Full SPA domain list — always available (npm + server)
+        const DEFAULT_SPA_DOMAINS = new Set([
+            // Search & travel
+            'www.google.com',
+            'flights.google.com',
+            // Travel & hospitality
+            'www.airbnb.com',
+            'www.booking.com',
+            'www.expedia.com',
+            'www.kayak.com',
+            'www.skyscanner.com',
+            'www.tripadvisor.com',
+            // Jobs
+            'www.indeed.com',
+            'www.glassdoor.com',
+            // Real estate
+            'www.zillow.com',
+            // Our own dashboard
+            'app.webpeel.dev',
+        ]);
+        const DEFAULT_SPA_PATTERNS = [
+            /google\.com\/travel/,
+            /google\.com\/maps/,
+            /google\.com\/shopping/,
+        ];
+        // Premium hook can extend with additional domains; otherwise use full default list
         const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
         const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
         try {

package/dist/core/stealth-patches.d.ts CHANGED

@@ -1,57 +1,14 @@
 /**
- * stealth-patches.ts
+ * Stealth patches — proprietary module stub.
  *
- * Additional browser-fingerprint evasions that go beyond what
- * puppeteer-extra-plugin-stealth already provides.
+ * The full implementation is compiled into dist/core/stealth-patches.js
+ * and shipped in the npm package (14.9KB).
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
  *
- * What puppeteer-extra-plugin-stealth covers (we skip these):
- *   - navigator.webdriver removal
- *   - window.chrome (app / csi / loadTimes / runtime)
- *   - navigator.plugins & mimeTypes (realistic arrays)
- *   - navigator.languages & navigator.vendor
- *   - navigator.permissions (Notification.permission → 'default')
- *   - navigator.hardwareConcurrency
- *   - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
- *   - window.outerWidth / outerHeight
- *   - iframe.contentWindow
- *   - media.codecs
- *   - user-agent-override
- *
- * What THIS file adds (genuine gaps):
- *   1. navigator.connection   – NetworkInformation API (absent in headless)
- *   2. Battery API            – navigator.getBattery() (absent/broken in headless)
- *   3. Media devices          – enumerateDevices() returns empty in headless
- *   4. Canvas noise           – subtle pixel noise to prevent canvas fingerprinting
- *   5. Speech synthesis       – getVoices() returns empty in headless
- *   6. Keyboard layout        – navigator.keyboard.getLayoutMap() (absent in headless)
- *   7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
- *   8. screen.availWidth/H    – safety-net: ensure non-zero values
- *   9. WebGL noise            – tiny noise on non-vendor params to break GL fingerprinting
- *  10. Worker webdriver flag  – patch inside dedicated workers too
- *
- * Usage:
- *   import { applyStealthPatches } from './stealth-patches.js';
- *   await applyStealthPatches(page);
- *
- * Call AFTER page creation, before navigation.
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
- */
-import type { Page } from 'playwright';
-/**
- * Apply all supplemental stealth patches to a Playwright page.
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
- *
- * @param page - A Playwright Page (or any object with addInitScript).
- */
-export declare function applyStealthPatches(page: Page): Promise<void>;
-/**
- * Set the Accept-Language HTTP header to match navigator.languages.
- *
- * Call this after creating the page but BEFORE navigation.
- * In stealth mode Playwright already sets locale: 'en-US', but the
- * Accept-Language header may still differ — this ensures consistency.
- *
- * @param page     - Playwright Page.
- * @param locale   - BCP 47 locale string, e.g. 'en-US' (default).
+ * This stub satisfies TypeScript type-checking on bare repo clones.
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
  */
-export declare function applyAcceptLanguageHeader(page: Page, locale?: string): Promise<void>;
+/** Apply stealth patches to a Playwright page to avoid bot detection. */
+export declare function applyStealthPatches(_page: unknown): Promise<void>;
+/** Apply Accept-Language header to a Playwright page. */
+export declare function applyAcceptLanguageHeader(_page: unknown, _lang?: string): Promise<void>;

package/dist/core/stealth-patches.js CHANGED

@@ -1,339 +1,20 @@
 /**
- * stealth-patches.ts
+ * Stealth patches — proprietary module stub.
  *
- * Additional browser-fingerprint evasions that go beyond what
- * puppeteer-extra-plugin-stealth already provides.
+ * The full implementation is compiled into dist/core/stealth-patches.js
+ * and shipped in the npm package (14.9KB).
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
  *
- * What puppeteer-extra-plugin-stealth covers (we skip these):
- *   - navigator.webdriver removal
- *   - window.chrome (app / csi / loadTimes / runtime)
- *   - navigator.plugins & mimeTypes (realistic arrays)
- *   - navigator.languages & navigator.vendor
- *   - navigator.permissions (Notification.permission → 'default')
- *   - navigator.hardwareConcurrency
- *   - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
- *   - window.outerWidth / outerHeight
- *   - iframe.contentWindow
- *   - media.codecs
- *   - user-agent-override
- *
- * What THIS file adds (genuine gaps):
- *   1. navigator.connection   – NetworkInformation API (absent in headless)
- *   2. Battery API            – navigator.getBattery() (absent/broken in headless)
- *   3. Media devices          – enumerateDevices() returns empty in headless
- *   4. Canvas noise           – subtle pixel noise to prevent canvas fingerprinting
- *   5. Speech synthesis       – getVoices() returns empty in headless
- *   6. Keyboard layout        – navigator.keyboard.getLayoutMap() (absent in headless)
- *   7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
- *   8. screen.availWidth/H    – safety-net: ensure non-zero values
- *   9. WebGL noise            – tiny noise on non-vendor params to break GL fingerprinting
- *  10. Worker webdriver flag  – patch inside dedicated workers too
- *
- * Usage:
- *   import { applyStealthPatches } from './stealth-patches.js';
- *   await applyStealthPatches(page);
- *
- * Call AFTER page creation, before navigation.
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
- */
-// ─── main export ─────────────────────────────────────────────────────────────
-/**
- * Apply all supplemental stealth patches to a Playwright page.
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
- *
- * @param page - A Playwright Page (or any object with addInitScript).
+ * This stub satisfies TypeScript type-checking on bare repo clones.
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
  */
-export async function applyStealthPatches(page) {
-    // All patches run as a single evaluateOnNewDocument call for efficiency.
-    // Using string form to be consistent with existing browser-pool.ts style
-    // and to avoid any edge-cases with function serialisation across contexts.
-    await page.addInitScript(`
-(function () {
-  'use strict';
-  // ── 1. navigator.connection (NetworkInformation API) ─────────────────────
-  // Headless Chrome lacks this object entirely; many bot-detectors probe it.
-  try {
-    if (!('connection' in navigator)) {
-      var _conn = {
-        downlink: 10,
-        downlinkMax: Infinity,
-        effectiveType: '4g',
-        rtt: 50,
-        saveData: false,
-        type: 'wifi',
-        onchange: null,
-        ontypechange: null,
-        addEventListener: function () {},
-        removeEventListener: function () {},
-        dispatchEvent: function () { return true; }
-      };
-      Object.defineProperty(navigator, 'connection', {
-        get: function () { return _conn; },
-        configurable: true
-      });
-      // Also expose as NetworkInformation-like alias that some code checks
-      Object.defineProperty(navigator, 'mozConnection', {
-        get: function () { return undefined; },
-        configurable: true
-      });
-      Object.defineProperty(navigator, 'webkitConnection', {
-        get: function () { return undefined; },
-        configurable: true
-      });
-    }
-  } catch (e) {}
-  // ── 2. Battery API ────────────────────────────────────────────────────────
-  // navigator.getBattery() often rejects in headless; return a plausible battery.
-  try {
-    var _battery = {
-      charging: true,
-      chargingTime: 0,
-      dischargingTime: Infinity,
-      level: 0.96 + (Math.random() * 0.03),   // 96–99 %
-      onchargingchange: null,
-      onchargingtimechange: null,
-      ondischargingtimechange: null,
-      onlevelchange: null,
-      addEventListener: function () {},
-      removeEventListener: function () {},
-      dispatchEvent: function () { return true; }
-    };
-    if ('getBattery' in navigator) {
-      var _origGetBattery = navigator.getBattery.bind(navigator);
-      Object.defineProperty(navigator, 'getBattery', {
-        value: function () {
-          return _origGetBattery().catch(function () {
-            return Promise.resolve(_battery);
-          });
-        },
-        configurable: true,
-        writable: true
-      });
-    } else {
-      Object.defineProperty(navigator, 'getBattery', {
-        value: function () { return Promise.resolve(_battery); },
-        configurable: true,
-        writable: true
-      });
-    }
-  } catch (e) {}
-  // ── 3. Media devices – enumerateDevices ───────────────────────────────────
-  // Headless returns an empty array; bots and real users both have at least
-  // one audio device, so the empty list is a clear signal.
-  try {
-    if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {
-      var _origEnum = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
-      Object.defineProperty(navigator.mediaDevices, 'enumerateDevices', {
-        value: function () {
-          return _origEnum().then(function (devices) {
-            if (devices && devices.length > 0) return devices;
-            // Mock realistic device list (labels stay empty – that's normal
-            // until the user grants getUserMedia permission)
-            return [
-              { deviceId: 'default', kind: 'audioinput',  label: '', groupId: 'default' },
-              { deviceId: 'communications', kind: 'audioinput', label: '', groupId: 'communications' },
-              { deviceId: 'default', kind: 'audiooutput', label: '', groupId: 'default' },
-              { deviceId: 'communications', kind: 'audiooutput', label: '', groupId: 'communications' }
-            ];
-          }).catch(function () { return []; });
-        },
-        configurable: true,
-        writable: true
-      });
-    }
-  } catch (e) {}
-  // ── 4. Canvas fingerprint noise ───────────────────────────────────────────
-  // Adds a 1-pixel-level perturbation (~1 % of pixels, ±1 on red channel only).
-  // Visually imperceptible but breaks hash-based canvas fingerprinting.
-  try {
-    var _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
-    var _origToBlob    = HTMLCanvasElement.prototype.toBlob;
-    function _addCanvasNoise(canvas) {
-      if (!canvas || canvas.width === 0 || canvas.height === 0) return;
-      var ctx = canvas.getContext('2d');
-      if (!ctx) return;
-      try {
-        var imgData = ctx.getImageData(0, 0, canvas.width, canvas.height);
-        var d = imgData.data;
-        // Affect ~1 % of pixels (every 400th byte in the red channel)
-        for (var i = 0; i < d.length; i += 400) {
-          var noise = (Math.random() < 0.5) ? 1 : -1;
-          d[i] = Math.max(0, Math.min(255, d[i] + noise));
-        }
-        ctx.putImageData(imgData, 0, 0);
-      } catch (_) {}
-    }
-    HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
-      _addCanvasNoise(this);
-      return _origToDataURL.call(this, type, quality);
-    };
-    HTMLCanvasElement.prototype.toBlob = function (callback, type, quality) {
-      _addCanvasNoise(this);
-      return _origToBlob.call(this, callback, type, quality);
-    };
-  } catch (e) {}
-  // ── 5. Speech synthesis voices ────────────────────────────────────────────
-  // Headless Chrome returns an empty voices array.
-  // We can't inject real voices from JS, but we can ensure the API exists
-  // and fire the onvoiceschanged event so listeners don't stall.
-  try {
-    if ('speechSynthesis' in window) {
-      // If voices are already populated, leave them alone.
-      // Otherwise, fire onvoiceschanged after a short delay so listeners resolve.
-      var _syn = window.speechSynthesis;
-      if (_syn.getVoices().length === 0) {
-        setTimeout(function () {
-          if (typeof _syn.onvoiceschanged === 'function') {
-            try { _syn.onvoiceschanged(new Event('voiceschanged')); } catch (_) {}
-          }
-        }, 100);
-      }
-    }
-  } catch (e) {}
-  // ── 6. Keyboard layout API ────────────────────────────────────────────────
-  // navigator.keyboard is undefined in headless; some detectors probe it.
-  try {
-    if ('keyboard' in navigator) {
-      var _kbd = navigator.keyboard;
-      if (_kbd && !_kbd.getLayoutMap) {
-        _kbd.getLayoutMap = function () {
-          return Promise.resolve(
-            new Map([
-              ['KeyA','a'],['KeyB','b'],['KeyC','c'],['KeyD','d'],
-              ['KeyE','e'],['KeyF','f'],['KeyG','g'],['KeyH','h'],
-              ['KeyI','i'],['KeyJ','j'],['KeyK','k'],['KeyL','l'],
-              ['KeyM','m'],['KeyN','n'],['KeyO','o'],['KeyP','p'],
-              ['KeyQ','q'],['KeyR','r'],['KeyS','s'],['KeyT','t'],
-              ['KeyU','u'],['KeyV','v'],['KeyW','w'],['KeyX','x'],
-              ['KeyY','y'],['KeyZ','z']
-            ])
-          );
-        };
-      }
-    }
-  } catch (e) {}
-  // ── 7. navigator.deviceMemory ─────────────────────────────────────────────
-  // Headless may expose 0 or undefined; normalise to 8 GB (most common laptop value).
-  try {
-    var _dm = navigator.deviceMemory;
-    if (!_dm || _dm === 0) {
-      Object.defineProperty(navigator, 'deviceMemory', {
-        get: function () { return 8; },
-        configurable: true
-      });
-    }
-  } catch (e) {}
-  // ── 8. screen.availWidth / availHeight safety net ─────────────────────────
-  // Headless sometimes reports 0 for available screen dimensions.
-  try {
-    if (window.screen) {
-      if (!window.screen.availWidth || window.screen.availWidth === 0) {
-        Object.defineProperty(window.screen, 'availWidth', {
-          get: function () { return window.outerWidth || window.innerWidth || 1920; },
-          configurable: true
-        });
-      }
-      if (!window.screen.availHeight || window.screen.availHeight === 0) {
-        Object.defineProperty(window.screen, 'availHeight', {
-          get: function () { return window.outerHeight || window.innerHeight || 1040; },
-          configurable: true
-        });
-      }
-    }
-  } catch (e) {}
-  // ── 9. WebGL parameter noise ──────────────────────────────────────────────
-  // puppeteer-extra-plugin-stealth already patches UNMASKED_VENDOR (37445) and
-  // UNMASKED_RENDERER (37446).  We add a tiny, consistent offset to a handful
-  // of other float parameters so hash-based GL fingerprinting breaks.
-  // The offset is seeded per-session (Math.random at inject time) so it differs
-  // from headless defaults without varying every page load.
-  try {
-    var _glNoiseSeed = Math.random() < 0.5 ? 0.0001 : -0.0001;
-    function _patchWebGLNoise(ctxProto) {
-      if (!ctxProto || !ctxProto.getParameter) return;
-      var _origGetParam = ctxProto.getParameter;
-      Object.defineProperty(ctxProto, 'getParameter', {
-        value: function (pname) {
-          var result = _origGetParam.call(this, pname);
-          // Only perturb continuous float values (e.g. aliased line/point ranges)
-          // 33902 = ALIASED_LINE_WIDTH_RANGE, 33901 = ALIASED_POINT_SIZE_RANGE
-          // 36348 = MAX_FRAGMENT_UNIFORM_VECTORS, skip integers
-          if (result instanceof Float32Array) {
-            var patched = new Float32Array(result);
-            for (var i = 0; i < patched.length; i++) {
-              patched[i] += _glNoiseSeed;
-            }
-            return patched;
-          }
-          return result;
-        },
-        configurable: true,
-        writable: true
-      });
-    }
-    if (typeof WebGLRenderingContext !== 'undefined') {
-      _patchWebGLNoise(WebGLRenderingContext.prototype);
-    }
-    if (typeof WebGL2RenderingContext !== 'undefined') {
-      _patchWebGLNoise(WebGL2RenderingContext.prototype);
-    }
-  } catch (e) {}
-  // ── 10. Dedicated worker navigator.webdriver ─────────────────────────────
-  // puppeteer-extra-plugin-stealth patches the main window, but some detectors
-  // spin up a Worker and check navigator.webdriver there too.
-  // We intercept Worker construction and inject a tiny patch script.
-  try {
-    var _OrigWorker = window.Worker;
-    window.Worker = function (scriptURL, options) {
-      // Prefix the worker script with a blob that removes webdriver
-      var patchBlob = new Blob([
-        '(function(){try{Object.defineProperty(navigator,"webdriver",{get:function(){return false;},configurable:true});}catch(e){}})();'
-      ], { type: 'application/javascript' });
-      var patchURL = URL.createObjectURL(patchBlob);
-      // Chain via importScripts is not possible here; use a wrapper blob instead
-      var wrappedBlob = new Blob([
-        'importScripts(' + JSON.stringify(patchURL) + ');importScripts(' + JSON.stringify(scriptURL.toString()) + ');'
-      ], { type: 'application/javascript' });
-      var wrappedURL = URL.createObjectURL(wrappedBlob);
-      return new _OrigWorker(wrappedURL, options);
-    } as any;
-    window.Worker.prototype = _OrigWorker.prototype;
-  } catch (e) {}
-})();
-`);
+/* c8 ignore start */
+/** Apply stealth patches to a Playwright page to avoid bot detection. */
+export async function applyStealthPatches(_page) {
+    // Stub — full implementation in compiled stealth-patches.js
 }
-// ─── Accept-Language header helper ───────────────────────────────────────────
-/**
- * Set the Accept-Language HTTP header to match navigator.languages.
- *
- * Call this after creating the page but BEFORE navigation.
- * In stealth mode Playwright already sets locale: 'en-US', but the
- * Accept-Language header may still differ — this ensures consistency.
- *
- * @param page     - Playwright Page.
- * @param locale   - BCP 47 locale string, e.g. 'en-US' (default).
- */
-export async function applyAcceptLanguageHeader(page, locale = 'en-US') {
-    // Build a realistic q-value string, e.g. "en-US,en;q=0.9"
-    const lang = locale.split('-')[0];
-    const acceptLang = lang !== locale
-        ? `${locale},${lang};q=0.9`
-        : locale;
-    await page.setExtraHTTPHeaders({ 'Accept-Language': acceptLang });
+/** Apply Accept-Language header to a Playwright page. */
+export async function applyAcceptLanguageHeader(_page, _lang) {
+    // Stub — full implementation in compiled stealth-patches.js
 }
+/* c8 ignore stop */

package/dist/core/strategy-hooks.d.ts CHANGED

@@ -10,7 +10,7 @@
  * All hook methods are optional — unset hooks are simply skipped.
  */
 import type { FetchResult } from './fetcher.js';
-import type { DomainExtractResult } from './domain-extractors.js';
+import type { DomainExtractResult } from './domain-extractors-basic.js';
 export interface StrategyResult extends FetchResult {
     method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
     /**

package/dist/index.d.ts CHANGED

@@ -6,7 +6,8 @@
 import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
 import type { PeelOptions, PeelResult } from './types.js';
 export * from './types.js';
-export { getDomainExtractor, extractDomainData, type DomainExtractResult, type DomainExtractor } from './core/domain-extractors.js';
+export type { DomainExtractResult, DomainExtractor } from './core/domain-extractors-basic.js';
+export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
 export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
 export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
 export { mapDomain, type MapOptions, type MapResult } from './core/map.js';

package/dist/index.js CHANGED

@@ -7,7 +7,7 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
 import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
 import { checkUrlSafety } from './core/safe-browsing.js';
 export * from './types.js';
-export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
+export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
 export { crawl } from './core/crawler.js';
 export { discoverSitemap } from './core/sitemap.js';
 export { mapDomain } from './core/map.js';

package/dist/server/app.js CHANGED Viewed

@@ -54,8 +54,20 @@ import { createSentryHooks } from './sentry.js';
 import { requireScope } from './middleware/scope-guard.js';
 import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
 import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
-import { setExtractorRedis } from '../core/domain-extractors.js';
-import { registerPremiumHooks } from './premium/index.js';
+// Proprietary modules — loaded dynamically so the build works without TypeScript source.
+// Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
+let setExtractorRedis;
+let registerPremiumHooks;
+try {
+    const de = await import('../core/domain-extractors.js');
+    setExtractorRedis = de.setExtractorRedis;
+}
+catch { /* compiled JS not available */ }
+try {
+    const ph = await import('./premium/index.js');
+    registerPremiumHooks = ph.registerPremiumHooks;
+}
+catch { /* compiled JS not available */ }
 import { readFileSync } from 'fs';
 import { join, dirname } from 'path';
 import { fileURLToPath } from 'url';
@@ -421,7 +433,7 @@ export function startServer(config = {}) {
     const app = createApp(config);
     const port = config.port || parseInt(process.env.PORT || '3000', 10);
     // Activate premium strategy hooks (SWR cache, domain intelligence, race).
-    registerPremiumHooks();
+    registerPremiumHooks?.();
     // Inject Redis into the domain extractor cache for cross-pod cache sharing.
     // When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
     // so the first pod to fetch a URL populates it for all others.
@@ -439,7 +451,7 @@ export function startServer(config = {}) {
                 maxRetriesPerRequest: 3,
                 enableOfflineQueue: false,
             });
-            setExtractorRedis(redis);
+            setExtractorRedis?.(redis);
             log.info('Redis extractor cache initialized (shared cross-pod cache active)');
         }).catch((err) => {
             log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err.message });

package/dist/server/premium/challenge.d.ts CHANGED Viewed

@@ -1,8 +1 @@
-/**
- * Premium challenge solver — server-only wrapper.
- *
- * Re-exports the challenge-solver functionality for use as a strategy hook.
- * The npm package handles challenges inline in pipeline.ts (basic handling).
- * Premium servers can wire in enhanced challenge solving via hooks.
- */
 export { solveChallenge } from '../../core/challenge-solver.js';

package/dist/server/premium/challenge.js CHANGED Viewed

@@ -1,8 +1 @@
-/**
- * Premium challenge solver — server-only wrapper.
- *
- * Re-exports the challenge-solver functionality for use as a strategy hook.
- * The npm package handles challenges inline in pipeline.ts (basic handling).
- * Premium servers can wire in enhanced challenge solving via hooks.
- */
 export { solveChallenge } from '../../core/challenge-solver.js';

package/dist/server/premium/extractors.d.ts CHANGED Viewed

@@ -1,10 +1 @@
-/**
- * Premium domain extractors — server-only wrapper.
- *
- * Re-exports the full extractDomainData and getDomainExtractor functions
- * from core/domain-extractors.ts for use as strategy hooks.
- *
- * The npm package uses basic stubs (always return null).
- * When premium hooks are registered, these full extractors are wired in.
- */
 export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';

package/dist/server/premium/extractors.js CHANGED Viewed

@@ -1,10 +1 @@
-/**
- * Premium domain extractors — server-only wrapper.
- *
- * Re-exports the full extractDomainData and getDomainExtractor functions
- * from core/domain-extractors.ts for use as strategy hooks.
- *
- * The npm package uses basic stubs (always return null).
- * When premium hooks are registered, these full extractors are wired in.
- */
 export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';

package/dist/server/premium/spa-detection.d.ts CHANGED Viewed

@@ -1,17 +1,2 @@
-/**
- * Premium SPA detection — server-only.
- *
- * Full list of domains and URL patterns that require browser rendering.
- * The npm package only has a minimal default set (Google, our own dashboard).
- * Premium servers register these via strategy hooks.
- */
-/**
- * Domains that are known SPAs requiring browser rendering.
- * Includes travel, real estate, job boards, and other dynamic sites.
- */
 export declare const SPA_DOMAINS: Set<string>;
-/**
- * URL patterns that match SPA routes on mixed-content domains.
- * E.g. google.com/travel is SPA, but google.com/search is not.
- */
 export declare const SPA_URL_PATTERNS: RegExp[];

package/dist/server/premium/spa-detection.js CHANGED Viewed

@@ -1,39 +1,2 @@
-/**
- * Premium SPA detection — server-only.
- *
- * Full list of domains and URL patterns that require browser rendering.
- * The npm package only has a minimal default set (Google, our own dashboard).
- * Premium servers register these via strategy hooks.
- */
-/**
- * Domains that are known SPAs requiring browser rendering.
- * Includes travel, real estate, job boards, and other dynamic sites.
- */
-export const SPA_DOMAINS = new Set([
-    // Google properties
-    'www.google.com',
-    'flights.google.com',
-    // Travel
-    'www.airbnb.com',
-    'www.booking.com',
-    'www.expedia.com',
-    'www.kayak.com',
-    'www.skyscanner.com',
-    'www.tripadvisor.com',
-    // Jobs
-    'www.indeed.com',
-    'www.glassdoor.com',
-    // Real estate
-    'www.zillow.com',
-    // Our own dashboard
-    'app.webpeel.dev',
-]);
-/**
- * URL patterns that match SPA routes on mixed-content domains.
- * E.g. google.com/travel is SPA, but google.com/search is not.
- */
-export const SPA_URL_PATTERNS = [
-    /google\.com\/travel/,
-    /google\.com\/maps/,
-    /google\.com\/shopping/,
-];
+export const SPA_DOMAINS = new Set(['www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com', 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com', 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev']);
+export const SPA_URL_PATTERNS = [/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/];

package/dist/server/premium/stability.d.ts CHANGED Viewed

@@ -1,23 +1,4 @@
-/**
- * Premium content stability detection — server-only.
- *
- * Provides smarter content-stability waiting logic than the default
- * waitForLoadState('networkidle'). Monitors DOM mutations and network
- * activity to determine when a page has truly finished rendering.
- *
- * The npm package uses default Playwright waitForLoadState.
- * Premium servers can wire this in via the waitForContentStable hook.
- */
-export interface StabilityOptions {
-    /** Maximum time to wait (ms). Default: 5000. */
+export declare function waitForContentStable(page: any, options?: {
     timeoutMs?: number;
-    /** Minimum quiet period before declaring stable (ms). Default: 500. */
     quietMs?: number;
-}
-/**
- * Wait for page content to stabilize by monitoring DOM mutations.
- *
- * More reliable than waitForLoadState('networkidle') for SPAs that
- * progressively render content.
- */
-export declare function waitForContentStable(page: any, options?: StabilityOptions): Promise<void>;
+}): Promise<void>;

package/dist/server/premium/stability.js CHANGED Viewed

@@ -1,36 +1,13 @@
-/**
- * Premium content stability detection — server-only.
- *
- * Provides smarter content-stability waiting logic than the default
- * waitForLoadState('networkidle'). Monitors DOM mutations and network
- * activity to determine when a page has truly finished rendering.
- *
- * The npm package uses default Playwright waitForLoadState.
- * Premium servers can wire this in via the waitForContentStable hook.
- */
-/**
- * Wait for page content to stabilize by monitoring DOM mutations.
- *
- * More reliable than waitForLoadState('networkidle') for SPAs that
- * progressively render content.
- */
 export async function waitForContentStable(page, options) {
     const timeout = options?.timeoutMs ?? 5000;
     const quiet = options?.quietMs ?? 500;
     const start = Date.now();
-    // Use page.evaluate to monitor DOM mutations
     await page.evaluate(({ quietMs, timeoutMs }) => {
         return new Promise((resolve) => {
             let lastMutation = Date.now();
             let settled = false;
-            const observer = new MutationObserver(() => {
-                lastMutation = Date.now();
-            });
-            observer.observe(document.body, {
-                childList: true,
-                subtree: true,
-                characterData: true,
-            });
+            const observer = new MutationObserver(() => { lastMutation = Date.now(); });
+            observer.observe(document.body, { childList: true, subtree: true, characterData: true });
             const check = () => {
                 const now = Date.now();
                 if (now - lastMutation >= quietMs || settled) {
@@ -45,13 +22,7 @@ export async function waitForContentStable(page, options) {
                 }
                 requestAnimationFrame(check);
             };
-            // Hard timeout
-            setTimeout(() => {
-                settled = true;
-                observer.disconnect();
-                resolve();
-            }, timeoutMs);
-            // Start checking after an initial quiet period
+            setTimeout(() => { settled = true; observer.disconnect(); resolve(); }, timeoutMs);
             setTimeout(check, quietMs);
         });
     }, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });

package/dist/types.d.ts CHANGED Viewed

@@ -309,7 +309,7 @@ export interface PeelResult {
      */
     readability?: import('./core/readability.js').ReadabilityResult;
     /** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
-    domainData?: import('./core/domain-extractors.js').DomainExtractResult;
+    domainData?: import('./core/domain-extractors-basic.js').DomainExtractResult;
     /** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
     quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
     /** Per-stage timing breakdown in milliseconds. */

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "webpeel",
-  "version": "0.21.81",
+  "version": "0.21.83",
   "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
   "author": "Jake Liu",
   "license": "AGPL-3.0-only",