webpeel 0.21.81 → 0.21.83

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,25 @@
1
1
  /**
2
- * Basic domain extraction — public/free tier.
2
+ * Domain extraction types and basic stub.
3
3
  *
4
- * Handles a few common domains with simple logic.
5
- * Full 55+ domain extractors are premium/server-only.
6
- *
7
- * This module is safe to include in the npm package.
8
- * The full `domain-extractors.ts` is compiled for the server
9
- * but wired in only when premium hooks are registered.
4
+ * Types are defined HERE (always available) so nothing depends
5
+ * on the proprietary domain-extractors.ts TypeScript source.
6
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
10
7
  */
11
- import type { DomainExtractResult } from './domain-extractors.js';
8
+ /** Structured result from a domain-specific extractor */
9
+ export interface DomainExtractResult {
10
+ /** Canonical domain name (e.g. 'twitter.com') */
11
+ domain: string;
12
+ /** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
13
+ type: string;
14
+ /** Domain-specific structured data */
15
+ structured: Record<string, any>;
16
+ /** Clean markdown representation of the content */
17
+ cleanContent: string;
18
+ /** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
19
+ rawHtmlSize?: number;
20
+ }
21
+ /** An extractor receives the raw HTML and original URL, may make API calls. */
22
+ export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
12
23
  /**
13
24
  * Basic domain data extractor — free tier stub.
14
25
  *
@@ -1,12 +1,9 @@
1
1
  /**
2
- * Basic domain extraction — public/free tier.
2
+ * Domain extraction types and basic stub.
3
3
  *
4
- * Handles a few common domains with simple logic.
5
- * Full 55+ domain extractors are premium/server-only.
6
- *
7
- * This module is safe to include in the npm package.
8
- * The full `domain-extractors.ts` is compiled for the server
9
- * but wired in only when premium hooks are registered.
4
+ * Types are defined HERE (always available) so nothing depends
5
+ * on the proprietary domain-extractors.ts TypeScript source.
6
+ * The compiled domain-extractors.js ships in npm and is loaded at runtime.
10
7
  */
11
8
  /**
12
9
  * Basic domain data extractor — free tier stub.
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Public re-exports for domain extraction functions.
3
+ *
4
+ * This module is always available (npm + repo + server).
5
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
6
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
7
+ *
8
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
9
+ */
10
+ import type { DomainExtractResult } from './domain-extractors-basic.js';
11
+ /**
12
+ * Check if a URL has a domain-specific extractor.
13
+ * Returns the extractor function or null.
14
+ */
15
+ export declare function getDomainExtractor(url: string): any;
16
+ /**
17
+ * Run domain-specific extraction on HTML content.
18
+ * Returns structured domain data or null.
19
+ */
20
+ export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Public re-exports for domain extraction functions.
3
+ *
4
+ * This module is always available (npm + repo + server).
5
+ * It lazy-loads the full domain-extractors.js (compiled, ships in npm).
6
+ * If compiled JS is missing (bare repo clone), returns null gracefully.
7
+ *
8
+ * TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
9
+ */
10
+ // Top-level await: module fully loaded before any exports are called.
11
+ // This is safe in ESM (Node 14.8+, all modern bundlers).
12
+ let _getDomainExtractor = null;
13
+ let _extractDomainData = null;
14
+ try {
15
+ const mod = await import('./domain-extractors.js');
16
+ _getDomainExtractor = mod.getDomainExtractor;
17
+ _extractDomainData = mod.extractDomainData;
18
+ }
19
+ catch {
20
+ // Compiled JS not available (bare repo clone) — stubs return null
21
+ }
22
+ /**
23
+ * Check if a URL has a domain-specific extractor.
24
+ * Returns the extractor function or null.
25
+ */
26
+ export function getDomainExtractor(url) {
27
+ return _getDomainExtractor ? _getDomainExtractor(url) : null;
28
+ }
29
+ /**
30
+ * Run domain-specific extraction on HTML content.
31
+ * Returns structured domain data or null.
32
+ */
33
+ export async function extractDomainData(html, url) {
34
+ return _extractDomainData ? _extractDomainData(html, url) : null;
35
+ }
@@ -5,7 +5,7 @@
5
5
  * mutable PipelineContext. The stages are called in order by peel().
6
6
  */
7
7
  import { type AutoScrollOptions } from './actions.js';
8
- import { type DomainExtractResult } from './domain-extractors.js';
8
+ import { type DomainExtractResult } from './domain-extractors-basic.js';
9
9
  import { type ReadabilityResult } from './readability.js';
10
10
  import { type QuickAnswerResult } from './quick-answer.js';
11
11
  import { Timer } from './timing.js';
@@ -16,26 +16,32 @@ import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './
16
16
  import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
17
  import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
18
18
  import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
- // Lazy-loaded full extractors — available in repo/server, absent in npm package.
20
- // The dynamic import avoids hard failures when domain-extractors.js is excluded from npm.
21
- let _fullExtractorsLoaded = false;
22
- let _fullExtractDomainData = null;
23
- let _fullGetDomainExtractor = null;
24
- async function loadFullExtractors() {
25
- if (_fullExtractorsLoaded)
19
+ // ---------------------------------------------------------------------------
20
+ // Domain extraction — lazy-load full extractors from compiled JS
21
+ // ---------------------------------------------------------------------------
22
+ // The compiled domain-extractors.js (312KB) ships in the npm package.
23
+ // TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
24
+ // If compiled JS is missing (bare repo clone without proprietary files),
25
+ // falls back to basic stub (no domain extraction, just standard markdown).
26
+ // Server premium hooks can override for additional caching/intelligence.
27
+ let _extractorsLoaded = false;
28
+ let _extractDomainData = null;
29
+ let _getDomainExtractor = null;
30
+ async function loadExtractors() {
31
+ if (_extractorsLoaded)
26
32
  return;
27
- _fullExtractorsLoaded = true;
33
+ _extractorsLoaded = true;
28
34
  try {
29
35
  const mod = await import('./domain-extractors.js');
30
- _fullExtractDomainData = mod.extractDomainData;
31
- _fullGetDomainExtractor = mod.getDomainExtractor;
36
+ _extractDomainData = mod.extractDomainData;
37
+ _getDomainExtractor = mod.getDomainExtractor;
32
38
  }
33
39
  catch {
34
- // Not available (npm package) — basic stubs will be used
40
+ // Compiled JS not available (bare repo clone) — basic stub will be used
35
41
  }
36
42
  }
37
- // Eagerly start loading (non-blocking)
38
- loadFullExtractors();
43
+ // Start loading immediately (non-blocking)
44
+ loadExtractors();
39
45
  import { extractReadableContent } from './readability.js';
40
46
  import { quickAnswer as runQuickAnswer } from './quick-answer.js';
41
47
  import { Timer } from './timing.js';
@@ -56,25 +62,21 @@ function hasDomainExtractor(url) {
56
62
  const hookFn = getDomainExtractorHook();
57
63
  if (hookFn)
58
64
  return hookFn(url) !== null;
59
- // Full extractors available (repo/server build)?
60
- if (_fullGetDomainExtractor)
61
- return _fullGetDomainExtractor(url) !== null;
62
- // npm package fallback — basic stubs
65
+ if (_getDomainExtractor)
66
+ return _getDomainExtractor(url) !== null;
63
67
  return getDomainExtractorBasic(url) !== null;
64
68
  }
65
69
  /**
66
70
  * Run domain extraction on HTML/URL.
67
- * Priority: premium hook → full extractors (repo/server) → basic stub.
71
+ * Priority: premium hook → compiled extractors → basic stub.
68
72
  */
69
73
  async function runDomainExtract(html, url) {
70
74
  const hookFn = getDomainExtractHook();
71
75
  if (hookFn)
72
76
  return hookFn(html, url);
73
- // Full extractors available (repo/server build)?
74
- await loadFullExtractors(); // Ensure loaded
75
- if (_fullExtractDomainData)
76
- return _fullExtractDomainData(html, url);
77
- // npm package fallback — basic stubs
77
+ await loadExtractors();
78
+ if (_extractDomainData)
79
+ return _extractDomainData(html, url);
78
80
  return extractDomainDataBasic(html, url);
79
81
  }
80
82
  /** Create the initial PipelineContext with defaults */
@@ -199,15 +201,39 @@ export function normalizeOptions(ctx) {
199
201
  if (autoScrollOpts) {
200
202
  ctx.render = true;
201
203
  }
202
- // Auto-detect SPAs that require browser rendering (no --render flag needed)
203
- // Premium hook provides full SPA domain list; basic has a small default set.
204
+ // Auto-detect SPAs that require browser rendering (no --render flag needed).
205
+ // This list is NOT proprietary — every developer knows these sites are SPAs.
206
+ // The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
207
+ // Premium hook can extend this for additional server-side intelligence.
204
208
  if (!ctx.render) {
205
209
  const spaDomainsHook = getSPADomainsHook();
206
210
  const spaPatternsHook = getSPAPatternsHook();
207
- // Basic SPA defaults — minimal set for free tier
208
- const DEFAULT_SPA_DOMAINS = new Set([]);
209
- const DEFAULT_SPA_PATTERNS = [];
210
- // Premium hook merges its full list; basic uses defaults
211
+ // Full SPA domain list always available (npm + server)
212
+ const DEFAULT_SPA_DOMAINS = new Set([
213
+ // Search & travel
214
+ 'www.google.com',
215
+ 'flights.google.com',
216
+ // Travel & hospitality
217
+ 'www.airbnb.com',
218
+ 'www.booking.com',
219
+ 'www.expedia.com',
220
+ 'www.kayak.com',
221
+ 'www.skyscanner.com',
222
+ 'www.tripadvisor.com',
223
+ // Jobs
224
+ 'www.indeed.com',
225
+ 'www.glassdoor.com',
226
+ // Real estate
227
+ 'www.zillow.com',
228
+ // Our own dashboard
229
+ 'app.webpeel.dev',
230
+ ]);
231
+ const DEFAULT_SPA_PATTERNS = [
232
+ /google\.com\/travel/,
233
+ /google\.com\/maps/,
234
+ /google\.com\/shopping/,
235
+ ];
236
+ // Premium hook can extend with additional domains; otherwise use full default list
211
237
  const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
212
238
  const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
213
239
  try {
@@ -1,57 +1,14 @@
1
1
  /**
2
- * stealth-patches.ts
2
+ * Stealth patches — proprietary module stub.
3
3
  *
4
- * Additional browser-fingerprint evasions that go beyond what
5
- * puppeteer-extra-plugin-stealth already provides.
4
+ * The full implementation is compiled into dist/core/stealth-patches.js
5
+ * and shipped in the npm package (14.9KB).
6
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
6
7
  *
7
- * What puppeteer-extra-plugin-stealth covers (we skip these):
8
- * - navigator.webdriver removal
9
- * - window.chrome (app / csi / loadTimes / runtime)
10
- * - navigator.plugins & mimeTypes (realistic arrays)
11
- * - navigator.languages & navigator.vendor
12
- * - navigator.permissions (Notification.permission → 'default')
13
- * - navigator.hardwareConcurrency
14
- * - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
15
- * - window.outerWidth / outerHeight
16
- * - iframe.contentWindow
17
- * - media.codecs
18
- * - user-agent-override
19
- *
20
- * What THIS file adds (genuine gaps):
21
- * 1. navigator.connection – NetworkInformation API (absent in headless)
22
- * 2. Battery API – navigator.getBattery() (absent/broken in headless)
23
- * 3. Media devices – enumerateDevices() returns empty in headless
24
- * 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
25
- * 5. Speech synthesis – getVoices() returns empty in headless
26
- * 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
27
- * 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
28
- * 8. screen.availWidth/H – safety-net: ensure non-zero values
29
- * 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
30
- * 10. Worker webdriver flag – patch inside dedicated workers too
31
- *
32
- * Usage:
33
- * import { applyStealthPatches } from './stealth-patches.js';
34
- * await applyStealthPatches(page);
35
- *
36
- * Call AFTER page creation, before navigation.
37
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
38
- */
39
- import type { Page } from 'playwright';
40
- /**
41
- * Apply all supplemental stealth patches to a Playwright page.
42
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
43
- *
44
- * @param page - A Playwright Page (or any object with addInitScript).
45
- */
46
- export declare function applyStealthPatches(page: Page): Promise<void>;
47
- /**
48
- * Set the Accept-Language HTTP header to match navigator.languages.
49
- *
50
- * Call this after creating the page but BEFORE navigation.
51
- * In stealth mode Playwright already sets locale: 'en-US', but the
52
- * Accept-Language header may still differ — this ensures consistency.
53
- *
54
- * @param page - Playwright Page.
55
- * @param locale - BCP 47 locale string, e.g. 'en-US' (default).
8
+ * This stub satisfies TypeScript type-checking on bare repo clones.
9
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
56
10
  */
57
- export declare function applyAcceptLanguageHeader(page: Page, locale?: string): Promise<void>;
11
+ /** Apply stealth patches to a Playwright page to avoid bot detection. */
12
+ export declare function applyStealthPatches(_page: unknown): Promise<void>;
13
+ /** Apply Accept-Language header to a Playwright page. */
14
+ export declare function applyAcceptLanguageHeader(_page: unknown, _lang?: string): Promise<void>;
@@ -1,339 +1,20 @@
1
1
  /**
2
- * stealth-patches.ts
2
+ * Stealth patches — proprietary module stub.
3
3
  *
4
- * Additional browser-fingerprint evasions that go beyond what
5
- * puppeteer-extra-plugin-stealth already provides.
4
+ * The full implementation is compiled into dist/core/stealth-patches.js
5
+ * and shipped in the npm package (14.9KB).
6
+ * TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
6
7
  *
7
- * What puppeteer-extra-plugin-stealth covers (we skip these):
8
- * - navigator.webdriver removal
9
- * - window.chrome (app / csi / loadTimes / runtime)
10
- * - navigator.plugins & mimeTypes (realistic arrays)
11
- * - navigator.languages & navigator.vendor
12
- * - navigator.permissions (Notification.permission → 'default')
13
- * - navigator.hardwareConcurrency
14
- * - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
15
- * - window.outerWidth / outerHeight
16
- * - iframe.contentWindow
17
- * - media.codecs
18
- * - user-agent-override
19
- *
20
- * What THIS file adds (genuine gaps):
21
- * 1. navigator.connection – NetworkInformation API (absent in headless)
22
- * 2. Battery API – navigator.getBattery() (absent/broken in headless)
23
- * 3. Media devices – enumerateDevices() returns empty in headless
24
- * 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
25
- * 5. Speech synthesis – getVoices() returns empty in headless
26
- * 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
27
- * 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
28
- * 8. screen.availWidth/H – safety-net: ensure non-zero values
29
- * 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
30
- * 10. Worker webdriver flag – patch inside dedicated workers too
31
- *
32
- * Usage:
33
- * import { applyStealthPatches } from './stealth-patches.js';
34
- * await applyStealthPatches(page);
35
- *
36
- * Call AFTER page creation, before navigation.
37
- * Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
38
- */
39
- // ─── main export ─────────────────────────────────────────────────────────────
40
- /**
41
- * Apply all supplemental stealth patches to a Playwright page.
42
- * Each patch is wrapped in its own try/catch so one failure never blocks others.
43
- *
44
- * @param page - A Playwright Page (or any object with addInitScript).
8
+ * This stub satisfies TypeScript type-checking on bare repo clones.
9
+ * At runtime the compiled JS is imported dynamically in browser-fetch.ts.
45
10
  */
46
- export async function applyStealthPatches(page) {
47
- // All patches run as a single evaluateOnNewDocument call for efficiency.
48
- // Using string form to be consistent with existing browser-pool.ts style
49
- // and to avoid any edge-cases with function serialisation across contexts.
50
- await page.addInitScript(`
51
- (function () {
52
- 'use strict';
53
-
54
- // ── 1. navigator.connection (NetworkInformation API) ─────────────────────
55
- // Headless Chrome lacks this object entirely; many bot-detectors probe it.
56
- try {
57
- if (!('connection' in navigator)) {
58
- var _conn = {
59
- downlink: 10,
60
- downlinkMax: Infinity,
61
- effectiveType: '4g',
62
- rtt: 50,
63
- saveData: false,
64
- type: 'wifi',
65
- onchange: null,
66
- ontypechange: null,
67
- addEventListener: function () {},
68
- removeEventListener: function () {},
69
- dispatchEvent: function () { return true; }
70
- };
71
- Object.defineProperty(navigator, 'connection', {
72
- get: function () { return _conn; },
73
- configurable: true
74
- });
75
- // Also expose as NetworkInformation-like alias that some code checks
76
- Object.defineProperty(navigator, 'mozConnection', {
77
- get: function () { return undefined; },
78
- configurable: true
79
- });
80
- Object.defineProperty(navigator, 'webkitConnection', {
81
- get: function () { return undefined; },
82
- configurable: true
83
- });
84
- }
85
- } catch (e) {}
86
-
87
- // ── 2. Battery API ────────────────────────────────────────────────────────
88
- // navigator.getBattery() often rejects in headless; return a plausible battery.
89
- try {
90
- var _battery = {
91
- charging: true,
92
- chargingTime: 0,
93
- dischargingTime: Infinity,
94
- level: 0.96 + (Math.random() * 0.03), // 96–99 %
95
- onchargingchange: null,
96
- onchargingtimechange: null,
97
- ondischargingtimechange: null,
98
- onlevelchange: null,
99
- addEventListener: function () {},
100
- removeEventListener: function () {},
101
- dispatchEvent: function () { return true; }
102
- };
103
- if ('getBattery' in navigator) {
104
- var _origGetBattery = navigator.getBattery.bind(navigator);
105
- Object.defineProperty(navigator, 'getBattery', {
106
- value: function () {
107
- return _origGetBattery().catch(function () {
108
- return Promise.resolve(_battery);
109
- });
110
- },
111
- configurable: true,
112
- writable: true
113
- });
114
- } else {
115
- Object.defineProperty(navigator, 'getBattery', {
116
- value: function () { return Promise.resolve(_battery); },
117
- configurable: true,
118
- writable: true
119
- });
120
- }
121
- } catch (e) {}
122
-
123
- // ── 3. Media devices – enumerateDevices ───────────────────────────────────
124
- // Headless returns an empty array; bots and real users both have at least
125
- // one audio device, so the empty list is a clear signal.
126
- try {
127
- if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {
128
- var _origEnum = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
129
- Object.defineProperty(navigator.mediaDevices, 'enumerateDevices', {
130
- value: function () {
131
- return _origEnum().then(function (devices) {
132
- if (devices && devices.length > 0) return devices;
133
- // Mock realistic device list (labels stay empty – that's normal
134
- // until the user grants getUserMedia permission)
135
- return [
136
- { deviceId: 'default', kind: 'audioinput', label: '', groupId: 'default' },
137
- { deviceId: 'communications', kind: 'audioinput', label: '', groupId: 'communications' },
138
- { deviceId: 'default', kind: 'audiooutput', label: '', groupId: 'default' },
139
- { deviceId: 'communications', kind: 'audiooutput', label: '', groupId: 'communications' }
140
- ];
141
- }).catch(function () { return []; });
142
- },
143
- configurable: true,
144
- writable: true
145
- });
146
- }
147
- } catch (e) {}
148
-
149
- // ── 4. Canvas fingerprint noise ───────────────────────────────────────────
150
- // Adds a 1-pixel-level perturbation (~1 % of pixels, ±1 on red channel only).
151
- // Visually imperceptible but breaks hash-based canvas fingerprinting.
152
- try {
153
- var _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
154
- var _origToBlob = HTMLCanvasElement.prototype.toBlob;
155
-
156
- function _addCanvasNoise(canvas) {
157
- if (!canvas || canvas.width === 0 || canvas.height === 0) return;
158
- var ctx = canvas.getContext('2d');
159
- if (!ctx) return;
160
- try {
161
- var imgData = ctx.getImageData(0, 0, canvas.width, canvas.height);
162
- var d = imgData.data;
163
- // Affect ~1 % of pixels (every 400th byte in the red channel)
164
- for (var i = 0; i < d.length; i += 400) {
165
- var noise = (Math.random() < 0.5) ? 1 : -1;
166
- d[i] = Math.max(0, Math.min(255, d[i] + noise));
167
- }
168
- ctx.putImageData(imgData, 0, 0);
169
- } catch (_) {}
170
- }
171
-
172
- HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
173
- _addCanvasNoise(this);
174
- return _origToDataURL.call(this, type, quality);
175
- };
176
-
177
- HTMLCanvasElement.prototype.toBlob = function (callback, type, quality) {
178
- _addCanvasNoise(this);
179
- return _origToBlob.call(this, callback, type, quality);
180
- };
181
- } catch (e) {}
182
-
183
- // ── 5. Speech synthesis voices ────────────────────────────────────────────
184
- // Headless Chrome returns an empty voices array.
185
- // We can't inject real voices from JS, but we can ensure the API exists
186
- // and fire the onvoiceschanged event so listeners don't stall.
187
- try {
188
- if ('speechSynthesis' in window) {
189
- // If voices are already populated, leave them alone.
190
- // Otherwise, fire onvoiceschanged after a short delay so listeners resolve.
191
- var _syn = window.speechSynthesis;
192
- if (_syn.getVoices().length === 0) {
193
- setTimeout(function () {
194
- if (typeof _syn.onvoiceschanged === 'function') {
195
- try { _syn.onvoiceschanged(new Event('voiceschanged')); } catch (_) {}
196
- }
197
- }, 100);
198
- }
199
- }
200
- } catch (e) {}
201
-
202
- // ── 6. Keyboard layout API ────────────────────────────────────────────────
203
- // navigator.keyboard is undefined in headless; some detectors probe it.
204
- try {
205
- if ('keyboard' in navigator) {
206
- var _kbd = navigator.keyboard;
207
- if (_kbd && !_kbd.getLayoutMap) {
208
- _kbd.getLayoutMap = function () {
209
- return Promise.resolve(
210
- new Map([
211
- ['KeyA','a'],['KeyB','b'],['KeyC','c'],['KeyD','d'],
212
- ['KeyE','e'],['KeyF','f'],['KeyG','g'],['KeyH','h'],
213
- ['KeyI','i'],['KeyJ','j'],['KeyK','k'],['KeyL','l'],
214
- ['KeyM','m'],['KeyN','n'],['KeyO','o'],['KeyP','p'],
215
- ['KeyQ','q'],['KeyR','r'],['KeyS','s'],['KeyT','t'],
216
- ['KeyU','u'],['KeyV','v'],['KeyW','w'],['KeyX','x'],
217
- ['KeyY','y'],['KeyZ','z']
218
- ])
219
- );
220
- };
221
- }
222
- }
223
- } catch (e) {}
224
-
225
- // ── 7. navigator.deviceMemory ─────────────────────────────────────────────
226
- // Headless may expose 0 or undefined; normalise to 8 GB (most common laptop value).
227
- try {
228
- var _dm = navigator.deviceMemory;
229
- if (!_dm || _dm === 0) {
230
- Object.defineProperty(navigator, 'deviceMemory', {
231
- get: function () { return 8; },
232
- configurable: true
233
- });
234
- }
235
- } catch (e) {}
236
-
237
- // ── 8. screen.availWidth / availHeight safety net ─────────────────────────
238
- // Headless sometimes reports 0 for available screen dimensions.
239
- try {
240
- if (window.screen) {
241
- if (!window.screen.availWidth || window.screen.availWidth === 0) {
242
- Object.defineProperty(window.screen, 'availWidth', {
243
- get: function () { return window.outerWidth || window.innerWidth || 1920; },
244
- configurable: true
245
- });
246
- }
247
- if (!window.screen.availHeight || window.screen.availHeight === 0) {
248
- Object.defineProperty(window.screen, 'availHeight', {
249
- get: function () { return window.outerHeight || window.innerHeight || 1040; },
250
- configurable: true
251
- });
252
- }
253
- }
254
- } catch (e) {}
255
-
256
- // ── 9. WebGL parameter noise ──────────────────────────────────────────────
257
- // puppeteer-extra-plugin-stealth already patches UNMASKED_VENDOR (37445) and
258
- // UNMASKED_RENDERER (37446). We add a tiny, consistent offset to a handful
259
- // of other float parameters so hash-based GL fingerprinting breaks.
260
- // The offset is seeded per-session (Math.random at inject time) so it differs
261
- // from headless defaults without varying every page load.
262
- try {
263
- var _glNoiseSeed = Math.random() < 0.5 ? 0.0001 : -0.0001;
264
-
265
- function _patchWebGLNoise(ctxProto) {
266
- if (!ctxProto || !ctxProto.getParameter) return;
267
- var _origGetParam = ctxProto.getParameter;
268
- Object.defineProperty(ctxProto, 'getParameter', {
269
- value: function (pname) {
270
- var result = _origGetParam.call(this, pname);
271
- // Only perturb continuous float values (e.g. aliased line/point ranges)
272
- // 33902 = ALIASED_LINE_WIDTH_RANGE, 33901 = ALIASED_POINT_SIZE_RANGE
273
- // 36348 = MAX_FRAGMENT_UNIFORM_VECTORS, skip integers
274
- if (result instanceof Float32Array) {
275
- var patched = new Float32Array(result);
276
- for (var i = 0; i < patched.length; i++) {
277
- patched[i] += _glNoiseSeed;
278
- }
279
- return patched;
280
- }
281
- return result;
282
- },
283
- configurable: true,
284
- writable: true
285
- });
286
- }
287
-
288
- if (typeof WebGLRenderingContext !== 'undefined') {
289
- _patchWebGLNoise(WebGLRenderingContext.prototype);
290
- }
291
- if (typeof WebGL2RenderingContext !== 'undefined') {
292
- _patchWebGLNoise(WebGL2RenderingContext.prototype);
293
- }
294
- } catch (e) {}
295
-
296
- // ── 10. Dedicated worker navigator.webdriver ─────────────────────────────
297
- // puppeteer-extra-plugin-stealth patches the main window, but some detectors
298
- // spin up a Worker and check navigator.webdriver there too.
299
- // We intercept Worker construction and inject a tiny patch script.
300
- try {
301
- var _OrigWorker = window.Worker;
302
- window.Worker = function (scriptURL, options) {
303
- // Prefix the worker script with a blob that removes webdriver
304
- var patchBlob = new Blob([
305
- '(function(){try{Object.defineProperty(navigator,"webdriver",{get:function(){return false;},configurable:true});}catch(e){}})();'
306
- ], { type: 'application/javascript' });
307
- var patchURL = URL.createObjectURL(patchBlob);
308
- // Chain via importScripts is not possible here; use a wrapper blob instead
309
- var wrappedBlob = new Blob([
310
- 'importScripts(' + JSON.stringify(patchURL) + ');importScripts(' + JSON.stringify(scriptURL.toString()) + ');'
311
- ], { type: 'application/javascript' });
312
- var wrappedURL = URL.createObjectURL(wrappedBlob);
313
- return new _OrigWorker(wrappedURL, options);
314
- } as any;
315
- window.Worker.prototype = _OrigWorker.prototype;
316
- } catch (e) {}
317
-
318
- })();
319
- `);
11
+ /* c8 ignore start */
12
+ /** Apply stealth patches to a Playwright page to avoid bot detection. */
13
+ export async function applyStealthPatches(_page) {
14
+ // Stub — full implementation in compiled stealth-patches.js
320
15
  }
321
- // ─── Accept-Language header helper ───────────────────────────────────────────
322
- /**
323
- * Set the Accept-Language HTTP header to match navigator.languages.
324
- *
325
- * Call this after creating the page but BEFORE navigation.
326
- * In stealth mode Playwright already sets locale: 'en-US', but the
327
- * Accept-Language header may still differ — this ensures consistency.
328
- *
329
- * @param page - Playwright Page.
330
- * @param locale - BCP 47 locale string, e.g. 'en-US' (default).
331
- */
332
- export async function applyAcceptLanguageHeader(page, locale = 'en-US') {
333
- // Build a realistic q-value string, e.g. "en-US,en;q=0.9"
334
- const lang = locale.split('-')[0];
335
- const acceptLang = lang !== locale
336
- ? `${locale},${lang};q=0.9`
337
- : locale;
338
- await page.setExtraHTTPHeaders({ 'Accept-Language': acceptLang });
16
+ /** Apply Accept-Language header to a Playwright page. */
17
+ export async function applyAcceptLanguageHeader(_page, _lang) {
18
+ // Stub — full implementation in compiled stealth-patches.js
339
19
  }
20
+ /* c8 ignore stop */
@@ -10,7 +10,7 @@
10
10
  * All hook methods are optional — unset hooks are simply skipped.
11
11
  */
12
12
  import type { FetchResult } from './fetcher.js';
13
- import type { DomainExtractResult } from './domain-extractors.js';
13
+ import type { DomainExtractResult } from './domain-extractors-basic.js';
14
14
  export interface StrategyResult extends FetchResult {
15
15
  method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
16
16
  /**
package/dist/index.d.ts CHANGED
@@ -6,7 +6,8 @@
6
6
  import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
7
7
  import type { PeelOptions, PeelResult } from './types.js';
8
8
  export * from './types.js';
9
- export { getDomainExtractor, extractDomainData, type DomainExtractResult, type DomainExtractor } from './core/domain-extractors.js';
9
+ export type { DomainExtractResult, DomainExtractor } from './core/domain-extractors-basic.js';
10
+ export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
10
11
  export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
11
12
  export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
12
13
  export { mapDomain, type MapOptions, type MapResult } from './core/map.js';
package/dist/index.js CHANGED
@@ -7,7 +7,7 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
7
7
  import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
8
8
  import { checkUrlSafety } from './core/safe-browsing.js';
9
9
  export * from './types.js';
10
- export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
10
+ export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
11
11
  export { crawl } from './core/crawler.js';
12
12
  export { discoverSitemap } from './core/sitemap.js';
13
13
  export { mapDomain } from './core/map.js';
@@ -54,8 +54,20 @@ import { createSentryHooks } from './sentry.js';
54
54
  import { requireScope } from './middleware/scope-guard.js';
55
55
  import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
56
56
  import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
57
- import { setExtractorRedis } from '../core/domain-extractors.js';
58
- import { registerPremiumHooks } from './premium/index.js';
57
+ // Proprietary modules loaded dynamically so the build works without TypeScript source.
58
+ // Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
59
+ let setExtractorRedis;
60
+ let registerPremiumHooks;
61
+ try {
62
+ const de = await import('../core/domain-extractors.js');
63
+ setExtractorRedis = de.setExtractorRedis;
64
+ }
65
+ catch { /* compiled JS not available */ }
66
+ try {
67
+ const ph = await import('./premium/index.js');
68
+ registerPremiumHooks = ph.registerPremiumHooks;
69
+ }
70
+ catch { /* compiled JS not available */ }
59
71
  import { readFileSync } from 'fs';
60
72
  import { join, dirname } from 'path';
61
73
  import { fileURLToPath } from 'url';
@@ -421,7 +433,7 @@ export function startServer(config = {}) {
421
433
  const app = createApp(config);
422
434
  const port = config.port || parseInt(process.env.PORT || '3000', 10);
423
435
  // Activate premium strategy hooks (SWR cache, domain intelligence, race).
424
- registerPremiumHooks();
436
+ registerPremiumHooks?.();
425
437
  // Inject Redis into the domain extractor cache for cross-pod cache sharing.
426
438
  // When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
427
439
  // so the first pod to fetch a URL populates it for all others.
@@ -439,7 +451,7 @@ export function startServer(config = {}) {
439
451
  maxRetriesPerRequest: 3,
440
452
  enableOfflineQueue: false,
441
453
  });
442
- setExtractorRedis(redis);
454
+ setExtractorRedis?.(redis);
443
455
  log.info('Redis extractor cache initialized (shared cross-pod cache active)');
444
456
  }).catch((err) => {
445
457
  log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err.message });
@@ -1,8 +1 @@
1
- /**
2
- * Premium challenge solver — server-only wrapper.
3
- *
4
- * Re-exports the challenge-solver functionality for use as a strategy hook.
5
- * The npm package handles challenges inline in pipeline.ts (basic handling).
6
- * Premium servers can wire in enhanced challenge solving via hooks.
7
- */
8
1
  export { solveChallenge } from '../../core/challenge-solver.js';
@@ -1,8 +1 @@
1
- /**
2
- * Premium challenge solver — server-only wrapper.
3
- *
4
- * Re-exports the challenge-solver functionality for use as a strategy hook.
5
- * The npm package handles challenges inline in pipeline.ts (basic handling).
6
- * Premium servers can wire in enhanced challenge solving via hooks.
7
- */
8
1
  export { solveChallenge } from '../../core/challenge-solver.js';
@@ -1,10 +1 @@
1
- /**
2
- * Premium domain extractors — server-only wrapper.
3
- *
4
- * Re-exports the full extractDomainData and getDomainExtractor functions
5
- * from core/domain-extractors.ts for use as strategy hooks.
6
- *
7
- * The npm package uses basic stubs (always return null).
8
- * When premium hooks are registered, these full extractors are wired in.
9
- */
10
1
  export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -1,10 +1 @@
1
- /**
2
- * Premium domain extractors — server-only wrapper.
3
- *
4
- * Re-exports the full extractDomainData and getDomainExtractor functions
5
- * from core/domain-extractors.ts for use as strategy hooks.
6
- *
7
- * The npm package uses basic stubs (always return null).
8
- * When premium hooks are registered, these full extractors are wired in.
9
- */
10
1
  export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -1,17 +1,2 @@
1
- /**
2
- * Premium SPA detection — server-only.
3
- *
4
- * Full list of domains and URL patterns that require browser rendering.
5
- * The npm package only has a minimal default set (Google, our own dashboard).
6
- * Premium servers register these via strategy hooks.
7
- */
8
- /**
9
- * Domains that are known SPAs requiring browser rendering.
10
- * Includes travel, real estate, job boards, and other dynamic sites.
11
- */
12
1
  export declare const SPA_DOMAINS: Set<string>;
13
- /**
14
- * URL patterns that match SPA routes on mixed-content domains.
15
- * E.g. google.com/travel is SPA, but google.com/search is not.
16
- */
17
2
  export declare const SPA_URL_PATTERNS: RegExp[];
@@ -1,39 +1,2 @@
1
- /**
2
- * Premium SPA detection — server-only.
3
- *
4
- * Full list of domains and URL patterns that require browser rendering.
5
- * The npm package only has a minimal default set (Google, our own dashboard).
6
- * Premium servers register these via strategy hooks.
7
- */
8
- /**
9
- * Domains that are known SPAs requiring browser rendering.
10
- * Includes travel, real estate, job boards, and other dynamic sites.
11
- */
12
- export const SPA_DOMAINS = new Set([
13
- // Google properties
14
- 'www.google.com',
15
- 'flights.google.com',
16
- // Travel
17
- 'www.airbnb.com',
18
- 'www.booking.com',
19
- 'www.expedia.com',
20
- 'www.kayak.com',
21
- 'www.skyscanner.com',
22
- 'www.tripadvisor.com',
23
- // Jobs
24
- 'www.indeed.com',
25
- 'www.glassdoor.com',
26
- // Real estate
27
- 'www.zillow.com',
28
- // Our own dashboard
29
- 'app.webpeel.dev',
30
- ]);
31
- /**
32
- * URL patterns that match SPA routes on mixed-content domains.
33
- * E.g. google.com/travel is SPA, but google.com/search is not.
34
- */
35
- export const SPA_URL_PATTERNS = [
36
- /google\.com\/travel/,
37
- /google\.com\/maps/,
38
- /google\.com\/shopping/,
39
- ];
1
+ export const SPA_DOMAINS = new Set(['www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com', 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com', 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev']);
2
+ export const SPA_URL_PATTERNS = [/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/];
@@ -1,23 +1,4 @@
1
- /**
2
- * Premium content stability detection — server-only.
3
- *
4
- * Provides smarter content-stability waiting logic than the default
5
- * waitForLoadState('networkidle'). Monitors DOM mutations and network
6
- * activity to determine when a page has truly finished rendering.
7
- *
8
- * The npm package uses default Playwright waitForLoadState.
9
- * Premium servers can wire this in via the waitForContentStable hook.
10
- */
11
- export interface StabilityOptions {
12
- /** Maximum time to wait (ms). Default: 5000. */
1
+ export declare function waitForContentStable(page: any, options?: {
13
2
  timeoutMs?: number;
14
- /** Minimum quiet period before declaring stable (ms). Default: 500. */
15
3
  quietMs?: number;
16
- }
17
- /**
18
- * Wait for page content to stabilize by monitoring DOM mutations.
19
- *
20
- * More reliable than waitForLoadState('networkidle') for SPAs that
21
- * progressively render content.
22
- */
23
- export declare function waitForContentStable(page: any, options?: StabilityOptions): Promise<void>;
4
+ }): Promise<void>;
@@ -1,36 +1,13 @@
1
- /**
2
- * Premium content stability detection — server-only.
3
- *
4
- * Provides smarter content-stability waiting logic than the default
5
- * waitForLoadState('networkidle'). Monitors DOM mutations and network
6
- * activity to determine when a page has truly finished rendering.
7
- *
8
- * The npm package uses default Playwright waitForLoadState.
9
- * Premium servers can wire this in via the waitForContentStable hook.
10
- */
11
- /**
12
- * Wait for page content to stabilize by monitoring DOM mutations.
13
- *
14
- * More reliable than waitForLoadState('networkidle') for SPAs that
15
- * progressively render content.
16
- */
17
1
  export async function waitForContentStable(page, options) {
18
2
  const timeout = options?.timeoutMs ?? 5000;
19
3
  const quiet = options?.quietMs ?? 500;
20
4
  const start = Date.now();
21
- // Use page.evaluate to monitor DOM mutations
22
5
  await page.evaluate(({ quietMs, timeoutMs }) => {
23
6
  return new Promise((resolve) => {
24
7
  let lastMutation = Date.now();
25
8
  let settled = false;
26
- const observer = new MutationObserver(() => {
27
- lastMutation = Date.now();
28
- });
29
- observer.observe(document.body, {
30
- childList: true,
31
- subtree: true,
32
- characterData: true,
33
- });
9
+ const observer = new MutationObserver(() => { lastMutation = Date.now(); });
10
+ observer.observe(document.body, { childList: true, subtree: true, characterData: true });
34
11
  const check = () => {
35
12
  const now = Date.now();
36
13
  if (now - lastMutation >= quietMs || settled) {
@@ -45,13 +22,7 @@ export async function waitForContentStable(page, options) {
45
22
  }
46
23
  requestAnimationFrame(check);
47
24
  };
48
- // Hard timeout
49
- setTimeout(() => {
50
- settled = true;
51
- observer.disconnect();
52
- resolve();
53
- }, timeoutMs);
54
- // Start checking after an initial quiet period
25
+ setTimeout(() => { settled = true; observer.disconnect(); resolve(); }, timeoutMs);
55
26
  setTimeout(check, quietMs);
56
27
  });
57
28
  }, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });
package/dist/types.d.ts CHANGED
@@ -309,7 +309,7 @@ export interface PeelResult {
309
309
  */
310
310
  readability?: import('./core/readability.js').ReadabilityResult;
311
311
  /** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
312
- domainData?: import('./core/domain-extractors.js').DomainExtractResult;
312
+ domainData?: import('./core/domain-extractors-basic.js').DomainExtractResult;
313
313
  /** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
314
314
  quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
315
315
  /** Per-stage timing breakdown in milliseconds. */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.81",
3
+ "version": "0.21.83",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",