webpeel 0.21.80 → 0.21.82

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Domain extraction types and basic stub.
3
+ *
4
+ * Types are defined HERE (always available) so nothing depends
5
+ * on the proprietary domain-extractors.ts TypeScript source.
6
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
7
+ */
8
+ /** Structured result from a domain-specific extractor */
9
+ export interface DomainExtractResult {
10
+ /** Canonical domain name (e.g. 'twitter.com') */
11
+ domain: string;
12
+ /** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
13
+ type: string;
14
+ /** Domain-specific structured data */
15
+ structured: Record<string, any>;
16
+ /** Clean markdown representation of the content */
17
+ cleanContent: string;
18
+ /** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
19
+ rawHtmlSize?: number;
20
+ }
21
+ /** An extractor receives the raw HTML and original URL, may make API calls. */
22
+ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
23
+ /**
24
+ * Basic domain data extractor — free tier stub.
25
+ *
26
+ * Always returns null (delegates all extraction to the normal pipeline).
27
+ * Premium servers override this via the `extractDomainData` strategy hook.
28
+ */
29
+ export declare function extractDomainDataBasic(_html: string, _url: string): Promise<DomainExtractResult | null>;
30
+ /**
31
+ * Basic domain extractor lookup — free tier stub.
32
+ *
33
+ * Always returns null (no domain is recognized in basic mode).
34
+ * Premium servers override this via the `getDomainExtractor` strategy hook.
35
+ */
36
+ export declare function getDomainExtractorBasic(_url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Domain extraction types and basic stub.
3
+ *
4
+ * Types are defined HERE (always available) so nothing depends
5
+ * on the proprietary domain-extractors.ts TypeScript source.
6
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
7
+ */
8
+ /**
9
+ * Basic domain data extractor — free tier stub.
10
+ *
11
+ * Always returns null (delegates all extraction to the normal pipeline).
12
+ * Premium servers override this via the `extractDomainData` strategy hook.
13
+ */
14
+ export async function extractDomainDataBasic(_html, _url) {
15
+ // Basic (free) tier: no domain-specific extraction.
16
+ // The normal fetch + markdown pipeline handles everything.
17
+ // Premium hook provides 55+ domain extractors (Twitter, Reddit, GitHub, HN, etc.)
18
+ return null;
19
+ }
20
+ /**
21
+ * Basic domain extractor lookup — free tier stub.
22
+ *
23
+ * Always returns null (no domain is recognized in basic mode).
24
+ * Premium servers override this via the `getDomainExtractor` strategy hook.
25
+ */
26
+ export function getDomainExtractorBasic(_url) {
27
+ return null;
28
+ }
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Public re-exports for domain extraction functions.
3
+ *
4
+ * This module is always available (npm + repo + server).
5
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
6
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
7
+ *
8
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
9
+ */
10
+ import type { DomainExtractResult } from './domain-extractors-basic.js';
11
+ /**
12
+ * Check if a URL has a domain-specific extractor.
13
+ * Returns the extractor function or null.
14
+ */
15
+ export declare function getDomainExtractor(url: string): any;
16
+ /**
17
+ * Run domain-specific extraction on HTML content.
18
+ * Returns structured domain data or null.
19
+ */
20
+ export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Public re-exports for domain extraction functions.
3
+ *
4
+ * This module is always available (npm + repo + server).
5
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
6
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
7
+ *
8
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
9
+ */
10
+ // Top-level await: module fully loaded before any exports are called.
11
+ // This is safe in ESM (Node 14.8+, all modern bundlers).
12
+ let _getDomainExtractor = null;
13
+ let _extractDomainData = null;
14
+ try {
15
+ const mod = await import('./domain-extractors.js');
16
+ _getDomainExtractor = mod.getDomainExtractor;
17
+ _extractDomainData = mod.extractDomainData;
18
+ }
19
+ catch {
20
+ // Compiled JS not available (bare repo clone) — stubs return null
21
+ }
22
+ /**
23
+ * Check if a URL has a domain-specific extractor.
24
+ * Returns the extractor function or null.
25
+ */
26
+ export function getDomainExtractor(url) {
27
+ return _getDomainExtractor ? _getDomainExtractor(url) : null;
28
+ }
29
+ /**
30
+ * Run domain-specific extraction on HTML content.
31
+ * Returns structured domain data or null.
32
+ */
33
+ export async function extractDomainData(html, url) {
34
+ return _extractDomainData ? _extractDomainData(html, url) : null;
35
+ }
@@ -5,7 +5,7 @@
5
5
  * mutable PipelineContext. The stages are called in order by peel().
6
6
  */
7
7
  import { type AutoScrollOptions } from './actions.js';
8
- import { type DomainExtractResult } from './domain-extractors.js';
8
+ import { type DomainExtractResult } from './domain-extractors-basic.js';
9
9
  import { type ReadabilityResult } from './readability.js';
10
10
  import { type QuickAnswerResult } from './quick-answer.js';
11
11
  import { Timer } from './timing.js';
@@ -14,7 +14,34 @@ import { autoScroll as runAutoScroll } from './actions.js';
14
14
  import { extractStructured } from './extract.js';
15
15
  import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
16
16
  import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
- import { extractDomainData, getDomainExtractor } from './domain-extractors.js';
17
+ import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
18
+ import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
+ // ---------------------------------------------------------------------------
20
+ // Domain extraction — lazy-load full extractors from compiled JS
21
+ // ---------------------------------------------------------------------------
22
+ // The compiled domain-extractors.js (312KB) ships in the npm package.
23
+ // TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
24
+ // If compiled JS is missing (bare repo clone without proprietary files),
25
+ // falls back to basic stub (no domain extraction, just standard markdown).
26
+ // Server premium hooks can override for additional caching/intelligence.
27
+ let _extractorsLoaded = false;
28
+ let _extractDomainData = null;
29
+ let _getDomainExtractor = null;
30
+ async function loadExtractors() {
31
+ if (_extractorsLoaded)
32
+ return;
33
+ _extractorsLoaded = true;
34
+ try {
35
+ const mod = await import('./domain-extractors.js');
36
+ _extractDomainData = mod.extractDomainData;
37
+ _getDomainExtractor = mod.getDomainExtractor;
38
+ }
39
+ catch {
40
+ // Compiled JS not available (bare repo clone) — basic stub will be used
41
+ }
42
+ }
43
+ // Start loading immediately (non-blocking)
44
+ loadExtractors();
18
45
  import { extractReadableContent } from './readability.js';
19
46
  import { quickAnswer as runQuickAnswer } from './quick-answer.js';
20
47
  import { Timer } from './timing.js';
@@ -24,6 +51,34 @@ import { sanitizeForLLM } from './prompt-guard.js';
24
51
  import { getSourceCredibility } from './source-credibility.js';
25
52
  import { createLogger } from './logger.js';
26
53
  const log = createLogger('pipeline');
54
+ // ---------------------------------------------------------------------------
55
+ // Hook-aware wrappers — route through premium hooks, fall back to basic stubs
56
+ // ---------------------------------------------------------------------------
57
+ /**
58
+ * Check if a URL has a domain extractor.
59
+ * Priority: premium hook → compiled extractors (only if the lazy import has already finished; this check is synchronous and does not await it) → basic stub.
60
+ */
61
+ function hasDomainExtractor(url) {
62
+ const hookFn = getDomainExtractorHook();
63
+ if (hookFn)
64
+ return hookFn(url) !== null;
65
+ if (_getDomainExtractor)
66
+ return _getDomainExtractor(url) !== null;
67
+ return getDomainExtractorBasic(url) !== null;
68
+ }
69
+ /**
70
+ * Run domain extraction on HTML/URL.
71
+ * Priority: premium hook → compiled extractors → basic stub.
72
+ */
73
+ async function runDomainExtract(html, url) {
74
+ const hookFn = getDomainExtractHook();
75
+ if (hookFn)
76
+ return hookFn(html, url);
77
+ await loadExtractors();
78
+ if (_extractDomainData)
79
+ return _extractDomainData(html, url);
80
+ return extractDomainDataBasic(html, url);
81
+ }
27
82
  /** Create the initial PipelineContext with defaults */
28
83
  export function createContext(url, options) {
29
84
  return {
@@ -146,28 +201,41 @@ export function normalizeOptions(ctx) {
146
201
  if (autoScrollOpts) {
147
202
  ctx.render = true;
148
203
  }
149
- // Auto-detect SPAs that require browser rendering (no --render flag needed)
204
+ // Auto-detect SPAs that require browser rendering (no --render flag needed).
205
+ // This list is NOT proprietary — every developer knows these sites are SPAs.
206
+ // The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
207
+ // A registered premium hook supplies its own list, which is used instead of (not merged with) the defaults below.
150
208
  if (!ctx.render) {
151
- const SPA_DOMAINS = new Set([
152
- 'www.google.com', // Google Flights, Maps, Shopping etc.
209
+ const spaDomainsHook = getSPADomainsHook();
210
+ const spaPatternsHook = getSPAPatternsHook();
211
+ // Full SPA domain list — always available (npm + server)
212
+ const DEFAULT_SPA_DOMAINS = new Set([
213
+ // Search & travel
214
+ 'www.google.com',
153
215
  'flights.google.com',
216
+ // Travel & hospitality
154
217
  'www.airbnb.com',
155
218
  'www.booking.com',
156
219
  'www.expedia.com',
157
220
  'www.kayak.com',
158
221
  'www.skyscanner.com',
159
222
  'www.tripadvisor.com',
223
+ // Jobs
160
224
  'www.indeed.com',
161
225
  'www.glassdoor.com',
162
- 'www.zillow.com', // already handled but backup
163
- 'app.webpeel.dev', // our own dashboard is a SPA
226
+ // Real estate
227
+ 'www.zillow.com',
228
+ // Our own dashboard
229
+ 'app.webpeel.dev',
164
230
  ]);
165
- // More specific: some google.com paths need render, not all
166
- const SPA_URL_PATTERNS = [
231
+ const DEFAULT_SPA_PATTERNS = [
167
232
  /google\.com\/travel/,
168
233
  /google\.com\/maps/,
169
234
  /google\.com\/shopping/,
170
235
  ];
236
+ // A registered premium hook supplies the whole list (its result replaces the defaults); otherwise use the full default list
237
+ const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
238
+ const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
171
239
  try {
172
240
  const hostname = new URL(ctx.url).hostname;
173
241
  if (SPA_DOMAINS.has(hostname)) {
@@ -304,10 +372,10 @@ export async function fetchContent(ctx) {
304
372
  const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
305
373
  // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
306
374
  // This avoids expensive browser fetches that often get blocked
307
- if (getDomainExtractor(ctx.url)) {
375
+ if (hasDomainExtractor(ctx.url)) {
308
376
  try {
309
377
  ctx.timer.mark('domainApiFirst');
310
- const ddResult = await extractDomainData('', ctx.url);
378
+ const ddResult = await runDomainExtract('', ctx.url);
311
379
  ctx.timer.end('domainApiFirst');
312
380
  if (ddResult && ddResult.cleanContent.length > 50) {
313
381
  ctx.domainData = ddResult;
@@ -385,9 +453,9 @@ export async function fetchContent(ctx) {
385
453
  }
386
454
  catch (fetchError) {
387
455
  // If fetch failed but we have a domain extractor, try it as fallback
388
- if (getDomainExtractor(ctx.url)) {
456
+ if (hasDomainExtractor(ctx.url)) {
389
457
  try {
390
- const ddResult = await extractDomainData('', ctx.url);
458
+ const ddResult = await runDomainExtract('', ctx.url);
391
459
  if (ddResult && ddResult.cleanContent.length > 50) {
392
460
  ctx.timer.end('fetch');
393
461
  ctx.domainData = ddResult;
@@ -1041,14 +1109,14 @@ export async function postProcess(ctx) {
1041
1109
  }
1042
1110
  // Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
1043
1111
  // Fires when URL matches a known domain. Replaces content with clean markdown.
1044
- if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
1112
+ if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
1045
1113
  try {
1046
1114
  ctx.timer.mark('domainExtract');
1047
1115
  // Try raw HTML first, then fall back to readability-processed content
1048
1116
  // (some SPAs like Google Flights have data only after readability processing)
1049
- let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
1117
+ let ddResult = await runDomainExtract(fetchResult.html, fetchResult.url);
1050
1118
  if (!ddResult && ctx.content) {
1051
- ddResult = await extractDomainData(ctx.content, fetchResult.url);
1119
+ ddResult = await runDomainExtract(ctx.content, fetchResult.url);
1052
1120
  }
1053
1121
  ctx.timer.end('domainExtract');
1054
1122
  if (ddResult) {
@@ -1,57 +1,14 @@
1
1
  /**
2
- * stealth-patches.ts
2
+ * Stealth patches — proprietary module stub.
3
3
  *
4
- * Additional browser-fingerprint evasions that go beyond what
5
- * puppeteer-extra-plugin-stealth already provides.
4
+ * The full implementation is compiled into dist/core/stealth-patches.js
5
+ * and shipped in the npm package (14.9KB).
6
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
6
7
  *
7
- * What puppeteer-extra-plugin-stealth covers (we skip these):
8
- * - navigator.webdriver removal
9
- * - window.chrome (app / csi / loadTimes / runtime)
10
- * - navigator.plugins & mimeTypes (realistic arrays)
11
- * - navigator.languages & navigator.vendor
12
- * - navigator.permissions (Notification.permission → 'default')
13
- * - navigator.hardwareConcurrency
14
- * - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
15
- * - window.outerWidth / outerHeight
16
- * - iframe.contentWindow
17
- * - media.codecs
18
- * - user-agent-override
19
- *
20
- * What THIS file adds (genuine gaps):
21
- * 1. navigator.connection – NetworkInformation API (absent in headless)
22
- * 2. Battery API – navigator.getBattery() (absent/broken in headless)
23
- * 3. Media devices – enumerateDevices() returns empty in headless
24
- * 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
25
- * 5. Speech synthesis – getVoices() returns empty in headless
26
- * 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
27
- * 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
28
- * 8. screen.availWidth/H – safety-net: ensure non-zero values
29
- * 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
30
- * 10. Worker webdriver flag – patch inside dedicated workers too
31
- *
32
- * Usage:
33
- * import { applyStealthPatches } from './stealth-patches.js';
34
- * await applyStealthPatches(page);
35
- *
36
- * Call AFTER page creation, before navigation.
37
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
38
- */
39
- import type { Page } from 'playwright';
40
- /**
41
- * Apply all supplemental stealth patches to a Playwright page.
42
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
43
- *
44
- * @param page - A Playwright Page (or any object with addInitScript).
45
- */
46
- export declare function applyStealthPatches(page: Page): Promise<void>;
47
- /**
48
- * Set the Accept-Language HTTP header to match navigator.languages.
49
- *
50
- * Call this after creating the page but BEFORE navigation.
51
- * In stealth mode Playwright already sets locale: 'en-US', but the
52
- * Accept-Language header may still differ — this ensures consistency.
53
- *
54
- * @param page - Playwright Page.
55
- * @param locale - BCP 47 locale string, e.g. 'en-US' (default).
8
+ * This stub satisfies TypeScript type-checking on bare repo clones.
9
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
56
10
  */
57
- export declare function applyAcceptLanguageHeader(page: Page, locale?: string): Promise<void>;
11
+ /** Apply stealth patches to a Playwright page to avoid bot detection. */
12
+ export declare function applyStealthPatches(_page: unknown): Promise<void>;
13
+ /** Apply Accept-Language header to a Playwright page. */
14
+ export declare function applyAcceptLanguageHeader(_page: unknown, _lang?: string): Promise<void>;
@@ -1,339 +1,20 @@
1
1
  /**
2
- * stealth-patches.ts
2
+ * Stealth patches — proprietary module stub.
3
3
  *
4
- * Additional browser-fingerprint evasions that go beyond what
5
- * puppeteer-extra-plugin-stealth already provides.
4
+ * The full implementation is compiled into dist/core/stealth-patches.js
5
+ * and shipped in the npm package (14.9KB).
6
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
6
7
  *
7
- * What puppeteer-extra-plugin-stealth covers (we skip these):
8
- * - navigator.webdriver removal
9
- * - window.chrome (app / csi / loadTimes / runtime)
10
- * - navigator.plugins & mimeTypes (realistic arrays)
11
- * - navigator.languages & navigator.vendor
12
- * - navigator.permissions (Notification.permission → 'default')
13
- * - navigator.hardwareConcurrency
14
- * - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
15
- * - window.outerWidth / outerHeight
16
- * - iframe.contentWindow
17
- * - media.codecs
18
- * - user-agent-override
19
- *
20
- * What THIS file adds (genuine gaps):
21
- * 1. navigator.connection – NetworkInformation API (absent in headless)
22
- * 2. Battery API – navigator.getBattery() (absent/broken in headless)
23
- * 3. Media devices – enumerateDevices() returns empty in headless
24
- * 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
25
- * 5. Speech synthesis – getVoices() returns empty in headless
26
- * 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
27
- * 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
28
- * 8. screen.availWidth/H – safety-net: ensure non-zero values
29
- * 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
30
- * 10. Worker webdriver flag – patch inside dedicated workers too
31
- *
32
- * Usage:
33
- * import { applyStealthPatches } from './stealth-patches.js';
34
- * await applyStealthPatches(page);
35
- *
36
- * Call AFTER page creation, before navigation.
37
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
38
- */
39
- // ─── main export ─────────────────────────────────────────────────────────────
40
- /**
41
- * Apply all supplemental stealth patches to a Playwright page.
42
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
43
- *
44
- * @param page - A Playwright Page (or any object with addInitScript).
8
+ * This stub satisfies TypeScript type-checking on bare repo clones.
9
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
45
10
  */
46
- export async function applyStealthPatches(page) {
47
- // All patches run as a single evaluateOnNewDocument call for efficiency.
48
- // Using string form to be consistent with existing browser-pool.ts style
49
- // and to avoid any edge-cases with function serialisation across contexts.
50
- await page.addInitScript(`
51
- (function () {
52
- 'use strict';
53
-
54
- // ── 1. navigator.connection (NetworkInformation API) ─────────────────────
55
- // Headless Chrome lacks this object entirely; many bot-detectors probe it.
56
- try {
57
- if (!('connection' in navigator)) {
58
- var _conn = {
59
- downlink: 10,
60
- downlinkMax: Infinity,
61
- effectiveType: '4g',
62
- rtt: 50,
63
- saveData: false,
64
- type: 'wifi',
65
- onchange: null,
66
- ontypechange: null,
67
- addEventListener: function () {},
68
- removeEventListener: function () {},
69
- dispatchEvent: function () { return true; }
70
- };
71
- Object.defineProperty(navigator, 'connection', {
72
- get: function () { return _conn; },
73
- configurable: true
74
- });
75
- // Also expose as NetworkInformation-like alias that some code checks
76
- Object.defineProperty(navigator, 'mozConnection', {
77
- get: function () { return undefined; },
78
- configurable: true
79
- });
80
- Object.defineProperty(navigator, 'webkitConnection', {
81
- get: function () { return undefined; },
82
- configurable: true
83
- });
84
- }
85
- } catch (e) {}
86
-
87
- // ── 2. Battery API ────────────────────────────────────────────────────────
88
- // navigator.getBattery() often rejects in headless; return a plausible battery.
89
- try {
90
- var _battery = {
91
- charging: true,
92
- chargingTime: 0,
93
- dischargingTime: Infinity,
94
- level: 0.96 + (Math.random() * 0.03), // 96–99 %
95
- onchargingchange: null,
96
- onchargingtimechange: null,
97
- ondischargingtimechange: null,
98
- onlevelchange: null,
99
- addEventListener: function () {},
100
- removeEventListener: function () {},
101
- dispatchEvent: function () { return true; }
102
- };
103
- if ('getBattery' in navigator) {
104
- var _origGetBattery = navigator.getBattery.bind(navigator);
105
- Object.defineProperty(navigator, 'getBattery', {
106
- value: function () {
107
- return _origGetBattery().catch(function () {
108
- return Promise.resolve(_battery);
109
- });
110
- },
111
- configurable: true,
112
- writable: true
113
- });
114
- } else {
115
- Object.defineProperty(navigator, 'getBattery', {
116
- value: function () { return Promise.resolve(_battery); },
117
- configurable: true,
118
- writable: true
119
- });
120
- }
121
- } catch (e) {}
122
-
123
- // ── 3. Media devices – enumerateDevices ───────────────────────────────────
124
- // Headless returns an empty array; bots and real users both have at least
125
- // one audio device, so the empty list is a clear signal.
126
- try {
127
- if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {
128
- var _origEnum = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
129
- Object.defineProperty(navigator.mediaDevices, 'enumerateDevices', {
130
- value: function () {
131
- return _origEnum().then(function (devices) {
132
- if (devices && devices.length > 0) return devices;
133
- // Mock realistic device list (labels stay empty – that's normal
134
- // until the user grants getUserMedia permission)
135
- return [
136
- { deviceId: 'default', kind: 'audioinput', label: '', groupId: 'default' },
137
- { deviceId: 'communications', kind: 'audioinput', label: '', groupId: 'communications' },
138
- { deviceId: 'default', kind: 'audiooutput', label: '', groupId: 'default' },
139
- { deviceId: 'communications', kind: 'audiooutput', label: '', groupId: 'communications' }
140
- ];
141
- }).catch(function () { return []; });
142
- },
143
- configurable: true,
144
- writable: true
145
- });
146
- }
147
- } catch (e) {}
148
-
149
- // ── 4. Canvas fingerprint noise ───────────────────────────────────────────
150
- // Adds a 1-pixel-level perturbation (~1 % of pixels, ±1 on red channel only).
151
- // Visually imperceptible but breaks hash-based canvas fingerprinting.
152
- try {
153
- var _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
154
- var _origToBlob = HTMLCanvasElement.prototype.toBlob;
155
-
156
- function _addCanvasNoise(canvas) {
157
- if (!canvas || canvas.width === 0 || canvas.height === 0) return;
158
- var ctx = canvas.getContext('2d');
159
- if (!ctx) return;
160
- try {
161
- var imgData = ctx.getImageData(0, 0, canvas.width, canvas.height);
162
- var d = imgData.data;
163
- // Affect ~1 % of pixels (every 400th byte in the red channel)
164
- for (var i = 0; i < d.length; i += 400) {
165
- var noise = (Math.random() < 0.5) ? 1 : -1;
166
- d[i] = Math.max(0, Math.min(255, d[i] + noise));
167
- }
168
- ctx.putImageData(imgData, 0, 0);
169
- } catch (_) {}
170
- }
171
-
172
- HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
173
- _addCanvasNoise(this);
174
- return _origToDataURL.call(this, type, quality);
175
- };
176
-
177
- HTMLCanvasElement.prototype.toBlob = function (callback, type, quality) {
178
- _addCanvasNoise(this);
179
- return _origToBlob.call(this, callback, type, quality);
180
- };
181
- } catch (e) {}
182
-
183
- // ── 5. Speech synthesis voices ────────────────────────────────────────────
184
- // Headless Chrome returns an empty voices array.
185
- // We can't inject real voices from JS, but we can ensure the API exists
186
- // and fire the onvoiceschanged event so listeners don't stall.
187
- try {
188
- if ('speechSynthesis' in window) {
189
- // If voices are already populated, leave them alone.
190
- // Otherwise, fire onvoiceschanged after a short delay so listeners resolve.
191
- var _syn = window.speechSynthesis;
192
- if (_syn.getVoices().length === 0) {
193
- setTimeout(function () {
194
- if (typeof _syn.onvoiceschanged === 'function') {
195
- try { _syn.onvoiceschanged(new Event('voiceschanged')); } catch (_) {}
196
- }
197
- }, 100);
198
- }
199
- }
200
- } catch (e) {}
201
-
202
- // ── 6. Keyboard layout API ────────────────────────────────────────────────
203
- // navigator.keyboard is undefined in headless; some detectors probe it.
204
- try {
205
- if ('keyboard' in navigator) {
206
- var _kbd = navigator.keyboard;
207
- if (_kbd && !_kbd.getLayoutMap) {
208
- _kbd.getLayoutMap = function () {
209
- return Promise.resolve(
210
- new Map([
211
- ['KeyA','a'],['KeyB','b'],['KeyC','c'],['KeyD','d'],
212
- ['KeyE','e'],['KeyF','f'],['KeyG','g'],['KeyH','h'],
213
- ['KeyI','i'],['KeyJ','j'],['KeyK','k'],['KeyL','l'],
214
- ['KeyM','m'],['KeyN','n'],['KeyO','o'],['KeyP','p'],
215
- ['KeyQ','q'],['KeyR','r'],['KeyS','s'],['KeyT','t'],
216
- ['KeyU','u'],['KeyV','v'],['KeyW','w'],['KeyX','x'],
217
- ['KeyY','y'],['KeyZ','z']
218
- ])
219
- );
220
- };
221
- }
222
- }
223
- } catch (e) {}
224
-
225
- // ── 7. navigator.deviceMemory ─────────────────────────────────────────────
226
- // Headless may expose 0 or undefined; normalise to 8 GB (most common laptop value).
227
- try {
228
- var _dm = navigator.deviceMemory;
229
- if (!_dm || _dm === 0) {
230
- Object.defineProperty(navigator, 'deviceMemory', {
231
- get: function () { return 8; },
232
- configurable: true
233
- });
234
- }
235
- } catch (e) {}
236
-
237
- // ── 8. screen.availWidth / availHeight safety net ─────────────────────────
238
- // Headless sometimes reports 0 for available screen dimensions.
239
- try {
240
- if (window.screen) {
241
- if (!window.screen.availWidth || window.screen.availWidth === 0) {
242
- Object.defineProperty(window.screen, 'availWidth', {
243
- get: function () { return window.outerWidth || window.innerWidth || 1920; },
244
- configurable: true
245
- });
246
- }
247
- if (!window.screen.availHeight || window.screen.availHeight === 0) {
248
- Object.defineProperty(window.screen, 'availHeight', {
249
- get: function () { return window.outerHeight || window.innerHeight || 1040; },
250
- configurable: true
251
- });
252
- }
253
- }
254
- } catch (e) {}
255
-
256
- // ── 9. WebGL parameter noise ──────────────────────────────────────────────
257
- // puppeteer-extra-plugin-stealth already patches UNMASKED_VENDOR (37445) and
258
- // UNMASKED_RENDERER (37446). We add a tiny, consistent offset to a handful
259
- // of other float parameters so hash-based GL fingerprinting breaks.
260
- // The offset is seeded per-session (Math.random at inject time) so it differs
261
- // from headless defaults without varying every page load.
262
- try {
263
- var _glNoiseSeed = Math.random() < 0.5 ? 0.0001 : -0.0001;
264
-
265
- function _patchWebGLNoise(ctxProto) {
266
- if (!ctxProto || !ctxProto.getParameter) return;
267
- var _origGetParam = ctxProto.getParameter;
268
- Object.defineProperty(ctxProto, 'getParameter', {
269
- value: function (pname) {
270
- var result = _origGetParam.call(this, pname);
271
- // Only perturb continuous float values (e.g. aliased line/point ranges)
272
- // 33902 = ALIASED_LINE_WIDTH_RANGE, 33901 = ALIASED_POINT_SIZE_RANGE
273
- // 36348 = MAX_FRAGMENT_UNIFORM_VECTORS, skip integers
274
- if (result instanceof Float32Array) {
275
- var patched = new Float32Array(result);
276
- for (var i = 0; i < patched.length; i++) {
277
- patched[i] += _glNoiseSeed;
278
- }
279
- return patched;
280
- }
281
- return result;
282
- },
283
- configurable: true,
284
- writable: true
285
- });
286
- }
287
-
288
- if (typeof WebGLRenderingContext !== 'undefined') {
289
- _patchWebGLNoise(WebGLRenderingContext.prototype);
290
- }
291
- if (typeof WebGL2RenderingContext !== 'undefined') {
292
- _patchWebGLNoise(WebGL2RenderingContext.prototype);
293
- }
294
- } catch (e) {}
295
-
296
- // ── 10. Dedicated worker navigator.webdriver ─────────────────────────────
297
- // puppeteer-extra-plugin-stealth patches the main window, but some detectors
298
- // spin up a Worker and check navigator.webdriver there too.
299
- // We intercept Worker construction and inject a tiny patch script.
300
- try {
301
- var _OrigWorker = window.Worker;
302
- window.Worker = function (scriptURL, options) {
303
- // Prefix the worker script with a blob that removes webdriver
304
- var patchBlob = new Blob([
305
- '(function(){try{Object.defineProperty(navigator,"webdriver",{get:function(){return false;},configurable:true});}catch(e){}})();'
306
- ], { type: 'application/javascript' });
307
- var patchURL = URL.createObjectURL(patchBlob);
308
- // Chain via importScripts is not possible here; use a wrapper blob instead
309
- var wrappedBlob = new Blob([
310
- 'importScripts(' + JSON.stringify(patchURL) + ');importScripts(' + JSON.stringify(scriptURL.toString()) + ');'
311
- ], { type: 'application/javascript' });
312
- var wrappedURL = URL.createObjectURL(wrappedBlob);
313
- return new _OrigWorker(wrappedURL, options);
314
- } as any;
315
- window.Worker.prototype = _OrigWorker.prototype;
316
- } catch (e) {}
317
-
318
- })();
319
- `);
11
+ /* c8 ignore start */
12
+ /** Apply stealth patches to a Playwright page to avoid bot detection. */
13
+ export async function applyStealthPatches(_page) {
14
+ // Stub: the full implementation is in the compiled stealth-patches.js
320
15
  }
321
- // ─── Accept-Language header helper ───────────────────────────────────────────
322
- /**
323
- * Set the Accept-Language HTTP header to match navigator.languages.
324
- *
325
- * Call this after creating the page but BEFORE navigation.
326
- * In stealth mode Playwright already sets locale: 'en-US', but the
327
- * Accept-Language header may still differ — this ensures consistency.
328
- *
329
- * @param page - Playwright Page.
330
- * @param locale - BCP 47 locale string, e.g. 'en-US' (default).
331
- */
332
- export async function applyAcceptLanguageHeader(page, locale = 'en-US') {
333
- // Build a realistic q-value string, e.g. "en-US,en;q=0.9"
334
- const lang = locale.split('-')[0];
335
- const acceptLang = lang !== locale
336
- ? `${locale},${lang};q=0.9`
337
- : locale;
338
- await page.setExtraHTTPHeaders({ 'Accept-Language': acceptLang });
16
+ /** Apply Accept-Language header to a Playwright page. */
17
+ export async function applyAcceptLanguageHeader(_page, _lang) {
18
+ // Stub: the full implementation is in the compiled stealth-patches.js
339
19
  }
20
+ /* c8 ignore stop */
@@ -10,6 +10,7 @@
10
10
  * All hook methods are optional — unset hooks are simply skipped.
11
11
  */
12
12
  import type { FetchResult } from './fetcher.js';
13
+ import type { DomainExtractResult } from './domain-extractors-basic.js';
13
14
  export interface StrategyResult extends FetchResult {
14
15
  method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
15
16
  /**
@@ -65,6 +66,39 @@ export interface StrategyHooks {
65
66
  * Only called when `shouldRace()` returns true. Default: 2000.
66
67
  */
67
68
  getRaceTimeoutMs?(): number;
69
+ /**
70
+ * Premium domain extraction hook — 55+ domain extractors.
71
+ * Return null to fall back to basic/no extraction.
72
+ */
73
+ extractDomainData?(html: string, url: string): Promise<DomainExtractResult | null>;
74
+ /**
75
+ * Returns the domain extractor function for a URL, or null when no extractor is known for it.
76
+ * Premium knows which domains have extractors; basic returns null for all.
77
+ */
78
+ getDomainExtractor?(url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
79
+ /**
80
+ * Premium SPA domain list — knows which sites require browser rendering.
81
+ * Basic: returns empty set (no SPA auto-detection).
82
+ */
83
+ getSPADomains?(): Set<string>;
84
+ /**
85
+ * Premium SPA URL patterns — matches specific paths needing render.
86
+ * Basic: returns empty array.
87
+ */
88
+ getSPAPatterns?(): RegExp[];
89
+ /**
90
+ * Premium CAPTCHA/challenge solving hook.
91
+ * Return null to fall back to default challenge handling.
92
+ */
93
+ solveChallenge?(page: any, url: string): Promise<{
94
+ solved: boolean;
95
+ html?: string;
96
+ } | null>;
97
+ /**
98
+ * Premium wait-for-stable content logic — smarter than waitForLoadState.
99
+ * Resolves with no value; leave this hook unset to use the default wait logic.
100
+ */
101
+ waitForContentStable?(page: any, options?: any): Promise<void>;
68
102
  }
69
103
  /**
70
104
  * Register premium strategy hooks. Should be called once at server startup.
@@ -79,3 +113,33 @@ export declare function clearStrategyHooks(): void;
79
113
  * Retrieve the current hooks (internal — used by strategies.ts).
80
114
  */
81
115
  export declare function getStrategyHooks(): Readonly<StrategyHooks>;
116
+ /**
117
+ * Get the premium domain extraction hook, if registered.
118
+ * Returns undefined when no premium hooks are active (basic/npm mode).
119
+ */
120
+ export declare function getDomainExtractHook(): StrategyHooks['extractDomainData'];
121
+ /**
122
+ * Get the premium domain extractor lookup hook, if registered.
123
+ * Returns undefined when no premium hooks are active (basic/npm mode).
124
+ */
125
+ export declare function getDomainExtractorHook(): StrategyHooks['getDomainExtractor'];
126
+ /**
127
+ * Get the premium SPA domains hook, if registered.
128
+ * Returns undefined when no premium hooks are active (basic/npm mode).
129
+ */
130
+ export declare function getSPADomainsHook(): StrategyHooks['getSPADomains'];
131
+ /**
132
+ * Get the premium SPA patterns hook, if registered.
133
+ * Returns undefined when no premium hooks are active (basic/npm mode).
134
+ */
135
+ export declare function getSPAPatternsHook(): StrategyHooks['getSPAPatterns'];
136
+ /**
137
+ * Get the premium challenge solver hook, if registered.
138
+ * Returns undefined when no premium hooks are active (basic/npm mode).
139
+ */
140
+ export declare function getChallengeHook(): StrategyHooks['solveChallenge'];
141
+ /**
142
+ * Get the premium content stability hook, if registered.
143
+ * Returns undefined when no premium hooks are active (basic/npm mode).
144
+ */
145
+ export declare function getStabilityHook(): StrategyHooks['waitForContentStable'];
@@ -30,3 +30,45 @@ export function clearStrategyHooks() {
30
30
  export function getStrategyHooks() {
31
31
  return registeredHooks;
32
32
  }
33
+ /**
34
+ * Get the premium domain extraction hook, if registered.
35
+ * Returns undefined when no premium hooks are active (basic/npm mode).
36
+ */
37
+ export function getDomainExtractHook() {
38
+ return registeredHooks.extractDomainData;
39
+ }
40
+ /**
41
+ * Get the premium domain extractor lookup hook, if registered.
42
+ * Returns undefined when no premium hooks are active (basic/npm mode).
43
+ */
44
+ export function getDomainExtractorHook() {
45
+ return registeredHooks.getDomainExtractor;
46
+ }
47
+ /**
48
+ * Get the premium SPA domains hook, if registered.
49
+ * Returns undefined when no premium hooks are active (basic/npm mode).
50
+ */
51
+ export function getSPADomainsHook() {
52
+ return registeredHooks.getSPADomains;
53
+ }
54
+ /**
55
+ * Get the premium SPA patterns hook, if registered.
56
+ * Returns undefined when no premium hooks are active (basic/npm mode).
57
+ */
58
+ export function getSPAPatternsHook() {
59
+ return registeredHooks.getSPAPatterns;
60
+ }
61
+ /**
62
+ * Get the premium challenge solver hook, if registered.
63
+ * Returns undefined when no premium hooks are active (basic/npm mode).
64
+ */
65
+ export function getChallengeHook() {
66
+ return registeredHooks.solveChallenge;
67
+ }
68
+ /**
69
+ * Get the premium content stability hook, if registered.
70
+ * Returns undefined when no premium hooks are active (basic/npm mode).
71
+ */
72
+ export function getStabilityHook() {
73
+ return registeredHooks.waitForContentStable;
74
+ }
package/dist/index.d.ts CHANGED
@@ -6,7 +6,8 @@
6
6
  import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
7
7
  import type { PeelOptions, PeelResult } from './types.js';
8
8
  export * from './types.js';
9
- export { getDomainExtractor, extractDomainData, type DomainExtractResult, type DomainExtractor } from './core/domain-extractors.js';
9
+ export type { DomainExtractResult, DomainExtractor } from './core/domain-extractors-basic.js';
10
+ export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
10
11
  export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
11
12
  export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
12
13
  export { mapDomain, type MapOptions, type MapResult } from './core/map.js';
package/dist/index.js CHANGED
@@ -7,7 +7,7 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
7
7
  import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
8
8
  import { checkUrlSafety } from './core/safe-browsing.js';
9
9
  export * from './types.js';
10
- export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
10
+ export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
11
11
  export { crawl } from './core/crawler.js';
12
12
  export { discoverSitemap } from './core/sitemap.js';
13
13
  export { mapDomain } from './core/map.js';
@@ -54,8 +54,20 @@ import { createSentryHooks } from './sentry.js';
54
54
  import { requireScope } from './middleware/scope-guard.js';
55
55
  import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
56
56
  import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
57
- import { setExtractorRedis } from '../core/domain-extractors.js';
58
- import { registerPremiumHooks } from './premium/index.js';
57
+ // Proprietary modules loaded dynamically so the build works without TypeScript source.
58
+ // Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
59
+ let setExtractorRedis;
60
+ let registerPremiumHooks;
61
+ try {
62
+ const de = await import('../core/domain-extractors.js');
63
+ setExtractorRedis = de.setExtractorRedis;
64
+ }
65
+ catch { /* compiled JS not available */ }
66
+ try {
67
+ const ph = await import('./premium/index.js');
68
+ registerPremiumHooks = ph.registerPremiumHooks;
69
+ }
70
+ catch { /* compiled JS not available */ }
59
71
  import { readFileSync } from 'fs';
60
72
  import { join, dirname } from 'path';
61
73
  import { fileURLToPath } from 'url';
@@ -421,7 +433,7 @@ export function startServer(config = {}) {
421
433
  const app = createApp(config);
422
434
  const port = config.port || parseInt(process.env.PORT || '3000', 10);
423
435
  // Activate premium strategy hooks when the premium module loaded (SWR cache, domain intelligence, race, domain extractors, SPA detection, content stability).
424
- registerPremiumHooks();
436
+ registerPremiumHooks?.();
425
437
  // Inject Redis into the domain extractor cache for cross-pod cache sharing.
426
438
  // When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
427
439
  // so the first pod to fetch a URL populates it for all others.
@@ -439,7 +451,7 @@ export function startServer(config = {}) {
439
451
  maxRetriesPerRequest: 3,
440
452
  enableOfflineQueue: false,
441
453
  });
442
- setExtractorRedis(redis);
454
+ setExtractorRedis?.(redis);
443
455
  log.info('Redis extractor cache initialized (shared cross-pod cache active)');
444
456
  }).catch((err) => {
445
457
  log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err.message });
@@ -0,0 +1 @@
1
+ export { solveChallenge } from '../../core/challenge-solver.js';
@@ -0,0 +1 @@
1
+ export { solveChallenge } from '../../core/challenge-solver.js';
@@ -0,0 +1 @@
1
+ export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -0,0 +1 @@
1
+ export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -5,6 +5,9 @@
5
5
  * • SWR (stale-while-revalidate) response cache
6
6
  * • Domain intelligence (learns which sites need browser/stealth)
7
7
  * • Parallel race strategy (starts browser if simple fetch is slow)
8
+ * • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
9
+ * • SPA auto-detection (travel, jobs, real estate sites)
10
+ * • Content stability detection (smart DOM mutation monitoring)
8
11
  *
9
12
  * These modules are NOT shipped in the npm package.
10
13
  */
@@ -5,12 +5,18 @@
5
5
  * • SWR (stale-while-revalidate) response cache
6
6
  * • Domain intelligence (learns which sites need browser/stealth)
7
7
  * • Parallel race strategy (starts browser if simple fetch is slow)
8
+ * • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
9
+ * • SPA auto-detection (travel, jobs, real estate sites)
10
+ * • Content stability detection (smart DOM mutation monitoring)
8
11
  *
9
12
  * These modules are NOT shipped in the npm package.
10
13
  */
11
14
  import { registerStrategyHooks } from '../../core/strategy-hooks.js';
12
15
  import { createSWRCacheHooks } from './swr-cache.js';
13
16
  import { createDomainIntelHooks } from './domain-intel.js';
17
+ import { extractDomainData, getDomainExtractor } from './extractors.js';
18
+ import { SPA_DOMAINS, SPA_URL_PATTERNS } from './spa-detection.js';
19
+ import { waitForContentStable } from './stability.js';
14
20
  export { clearDomainIntel } from './domain-intel.js';
15
21
  /**
16
22
  * Wire all premium hooks into the core strategy layer.
@@ -31,5 +37,14 @@ export function registerPremiumHooks() {
31
37
  // Parallel race strategy
32
38
  shouldRace: () => true,
33
39
  getRaceTimeoutMs: () => 2000,
40
+ // Premium domain extraction (55+ extractors)
41
+ extractDomainData,
42
+ // Premium domain extractor lookup
43
+ getDomainExtractor: (url) => getDomainExtractor(url),
44
+ // Premium SPA detection
45
+ getSPADomains: () => SPA_DOMAINS,
46
+ getSPAPatterns: () => SPA_URL_PATTERNS,
47
+ // Premium content stability (DOM mutation monitoring)
48
+ waitForContentStable,
34
49
  });
35
50
  }
@@ -0,0 +1,2 @@
1
+ export declare const SPA_DOMAINS: Set<string>;
2
+ export declare const SPA_URL_PATTERNS: RegExp[];
@@ -0,0 +1,2 @@
1
+ export const SPA_DOMAINS = new Set(['www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com', 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com', 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev']);
2
+ export const SPA_URL_PATTERNS = [/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/];
@@ -0,0 +1,4 @@
1
+ export declare function waitForContentStable(page: any, options?: {
2
+ timeoutMs?: number;
3
+ quietMs?: number;
4
+ }): Promise<void>;
@@ -0,0 +1,29 @@
1
+ export async function waitForContentStable(page, options) {
2
+ const timeout = options?.timeoutMs ?? 5000;
3
+ const quiet = options?.quietMs ?? 500;
4
+ const start = Date.now();
5
+ await page.evaluate(({ quietMs, timeoutMs }) => {
6
+ return new Promise((resolve) => {
7
+ let lastMutation = Date.now();
8
+ let settled = false;
9
+ const observer = new MutationObserver(() => { lastMutation = Date.now(); });
10
+ observer.observe(document.body, { childList: true, subtree: true, characterData: true });
11
+ const check = () => {
12
+ const now = Date.now();
13
+ if (now - lastMutation >= quietMs || settled) {
14
+ observer.disconnect();
15
+ resolve();
16
+ return;
17
+ }
18
+ if (now - lastMutation > timeoutMs) {
19
+ observer.disconnect();
20
+ resolve();
21
+ return;
22
+ }
23
+ requestAnimationFrame(check);
24
+ };
25
+ setTimeout(() => { settled = true; observer.disconnect(); resolve(); }, timeoutMs);
26
+ setTimeout(check, quietMs);
27
+ });
28
+ }, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });
29
+ }
package/dist/types.d.ts CHANGED
@@ -309,7 +309,7 @@ export interface PeelResult {
309
309
  */
310
310
  readability?: import('./core/readability.js').ReadabilityResult;
311
311
  /** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
312
- domainData?: import('./core/domain-extractors.js').DomainExtractResult;
312
+ domainData?: import('./core/domain-extractors-basic.js').DomainExtractResult;
313
313
  /** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
314
314
  quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
315
315
  /** Per-stage timing breakdown in milliseconds. */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.80",
3
+ "version": "0.21.82",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",