webpeel 0.21.81 → 0.21.83
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors-basic.d.ts +19 -8
- package/dist/core/domain-extractors-basic.js +4 -7
- package/dist/core/domain-extractors-public.d.ts +20 -0
- package/dist/core/domain-extractors-public.js +35 -0
- package/dist/core/pipeline.d.ts +1 -1
- package/dist/core/pipeline.js +55 -29
- package/dist/core/stealth-patches.d.ts +10 -53
- package/dist/core/stealth-patches.js +14 -333
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +1 -1
- package/dist/server/app.js +16 -4
- package/dist/server/premium/challenge.d.ts +0 -7
- package/dist/server/premium/challenge.js +0 -7
- package/dist/server/premium/extractors.d.ts +0 -9
- package/dist/server/premium/extractors.js +0 -9
- package/dist/server/premium/spa-detection.d.ts +0 -15
- package/dist/server/premium/spa-detection.js +2 -39
- package/dist/server/premium/stability.d.ts +2 -21
- package/dist/server/premium/stability.js +3 -32
- package/dist/types.d.ts +1 -1
- package/package.json +1 -1
|
@@ -1,14 +1,25 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Domain extraction types and basic stub.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* This module is safe to include in the npm package.
|
|
8
|
-
* The full `domain-extractors.ts` is compiled for the server
|
|
9
|
-
* but wired in only when premium hooks are registered.
|
|
4
|
+
* Types are defined HERE (always available) so nothing depends
|
|
5
|
+
* on the proprietary domain-extractors.ts TypeScript source.
|
|
6
|
+
* The compiled domain-extractors.js ships in npm and is loaded at runtime.
|
|
10
7
|
*/
|
|
11
|
-
|
|
8
|
+
/** Structured result from a domain-specific extractor */
|
|
9
|
+
export interface DomainExtractResult {
|
|
10
|
+
/** Canonical domain name (e.g. 'twitter.com') */
|
|
11
|
+
domain: string;
|
|
12
|
+
/** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
|
|
13
|
+
type: string;
|
|
14
|
+
/** Domain-specific structured data */
|
|
15
|
+
structured: Record<string, any>;
|
|
16
|
+
/** Clean markdown representation of the content */
|
|
17
|
+
cleanContent: string;
|
|
18
|
+
/** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
|
|
19
|
+
rawHtmlSize?: number;
|
|
20
|
+
}
|
|
21
|
+
/** An extractor receives the raw HTML and original URL, may make API calls. */
|
|
22
|
+
export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
|
|
12
23
|
/**
|
|
13
24
|
* Basic domain data extractor — free tier stub.
|
|
14
25
|
*
|
|
@@ -1,12 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Domain extraction types and basic stub.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* This module is safe to include in the npm package.
|
|
8
|
-
* The full `domain-extractors.ts` is compiled for the server
|
|
9
|
-
* but wired in only when premium hooks are registered.
|
|
4
|
+
* Types are defined HERE (always available) so nothing depends
|
|
5
|
+
* on the proprietary domain-extractors.ts TypeScript source.
|
|
6
|
+
* The compiled domain-extractors.js ships in npm and is loaded at runtime.
|
|
10
7
|
*/
|
|
11
8
|
/**
|
|
12
9
|
* Basic domain data extractor — free tier stub.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public re-exports for domain extraction functions.
|
|
3
|
+
*
|
|
4
|
+
* This module is always available (npm + repo + server).
|
|
5
|
+
* It lazy-loads the full domain-extractors.js (compiled, ships in npm).
|
|
6
|
+
* If compiled JS is missing (bare repo clone), returns null gracefully.
|
|
7
|
+
*
|
|
8
|
+
* TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
|
|
9
|
+
*/
|
|
10
|
+
import type { DomainExtractResult } from './domain-extractors-basic.js';
|
|
11
|
+
/**
|
|
12
|
+
* Check if a URL has a domain-specific extractor.
|
|
13
|
+
* Returns the extractor function or null.
|
|
14
|
+
*/
|
|
15
|
+
export declare function getDomainExtractor(url: string): any;
|
|
16
|
+
/**
|
|
17
|
+
* Run domain-specific extraction on HTML content.
|
|
18
|
+
* Returns structured domain data or null.
|
|
19
|
+
*/
|
|
20
|
+
export declare function extractDomainData(html: string, url: string): Promise<DomainExtractResult | null>;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Public re-exports for domain extraction functions.
|
|
3
|
+
*
|
|
4
|
+
* This module is always available (npm + repo + server).
|
|
5
|
+
* It lazy-loads the full domain-extractors.js (compiled, ships in npm).
|
|
6
|
+
* If compiled JS is missing (bare repo clone), returns null gracefully.
|
|
7
|
+
*
|
|
8
|
+
* TypeScript source for domain-extractors is .gitignore'd (not on GitHub).
|
|
9
|
+
*/
|
|
10
|
+
// Top-level await: module fully loaded before any exports are called.
|
|
11
|
+
// This is safe in ESM (Node 14.8+, all modern bundlers).
|
|
12
|
+
let _getDomainExtractor = null;
|
|
13
|
+
let _extractDomainData = null;
|
|
14
|
+
try {
|
|
15
|
+
const mod = await import('./domain-extractors.js');
|
|
16
|
+
_getDomainExtractor = mod.getDomainExtractor;
|
|
17
|
+
_extractDomainData = mod.extractDomainData;
|
|
18
|
+
}
|
|
19
|
+
catch {
|
|
20
|
+
// Compiled JS not available (bare repo clone) — stubs return null
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Check if a URL has a domain-specific extractor.
|
|
24
|
+
* Returns the extractor function or null.
|
|
25
|
+
*/
|
|
26
|
+
export function getDomainExtractor(url) {
|
|
27
|
+
return _getDomainExtractor ? _getDomainExtractor(url) : null;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Run domain-specific extraction on HTML content.
|
|
31
|
+
* Returns structured domain data or null.
|
|
32
|
+
*/
|
|
33
|
+
export async function extractDomainData(html, url) {
|
|
34
|
+
return _extractDomainData ? _extractDomainData(html, url) : null;
|
|
35
|
+
}
|
package/dist/core/pipeline.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* mutable PipelineContext. The stages are called in order by peel().
|
|
6
6
|
*/
|
|
7
7
|
import { type AutoScrollOptions } from './actions.js';
|
|
8
|
-
import { type DomainExtractResult } from './domain-extractors.js';
|
|
8
|
+
import { type DomainExtractResult } from './domain-extractors-basic.js';
|
|
9
9
|
import { type ReadabilityResult } from './readability.js';
|
|
10
10
|
import { type QuickAnswerResult } from './quick-answer.js';
|
|
11
11
|
import { Timer } from './timing.js';
|
package/dist/core/pipeline.js
CHANGED
|
@@ -16,26 +16,32 @@ import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './
|
|
|
16
16
|
import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
|
|
17
17
|
import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
|
|
18
18
|
import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
|
|
19
|
-
//
|
|
20
|
-
//
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Domain extraction — lazy-load full extractors from compiled JS
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// The compiled domain-extractors.js (312KB) ships in the npm package.
|
|
23
|
+
// TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
|
|
24
|
+
// If compiled JS is missing (bare repo clone without proprietary files),
|
|
25
|
+
// falls back to basic stub (no domain extraction, just standard markdown).
|
|
26
|
+
// Server premium hooks can override for additional caching/intelligence.
|
|
27
|
+
let _extractorsLoaded = false;
|
|
28
|
+
let _extractDomainData = null;
|
|
29
|
+
let _getDomainExtractor = null;
|
|
30
|
+
async function loadExtractors() {
|
|
31
|
+
if (_extractorsLoaded)
|
|
26
32
|
return;
|
|
27
|
-
|
|
33
|
+
_extractorsLoaded = true;
|
|
28
34
|
try {
|
|
29
35
|
const mod = await import('./domain-extractors.js');
|
|
30
|
-
|
|
31
|
-
|
|
36
|
+
_extractDomainData = mod.extractDomainData;
|
|
37
|
+
_getDomainExtractor = mod.getDomainExtractor;
|
|
32
38
|
}
|
|
33
39
|
catch {
|
|
34
|
-
//
|
|
40
|
+
// Compiled JS not available (bare repo clone) — basic stub will be used
|
|
35
41
|
}
|
|
36
42
|
}
|
|
37
|
-
//
|
|
38
|
-
|
|
43
|
+
// Start loading immediately (non-blocking)
|
|
44
|
+
loadExtractors();
|
|
39
45
|
import { extractReadableContent } from './readability.js';
|
|
40
46
|
import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
41
47
|
import { Timer } from './timing.js';
|
|
@@ -56,25 +62,21 @@ function hasDomainExtractor(url) {
|
|
|
56
62
|
const hookFn = getDomainExtractorHook();
|
|
57
63
|
if (hookFn)
|
|
58
64
|
return hookFn(url) !== null;
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
return _fullGetDomainExtractor(url) !== null;
|
|
62
|
-
// npm package fallback — basic stubs
|
|
65
|
+
if (_getDomainExtractor)
|
|
66
|
+
return _getDomainExtractor(url) !== null;
|
|
63
67
|
return getDomainExtractorBasic(url) !== null;
|
|
64
68
|
}
|
|
65
69
|
/**
|
|
66
70
|
* Run domain extraction on HTML/URL.
|
|
67
|
-
* Priority: premium hook →
|
|
71
|
+
* Priority: premium hook → compiled extractors → basic stub.
|
|
68
72
|
*/
|
|
69
73
|
async function runDomainExtract(html, url) {
|
|
70
74
|
const hookFn = getDomainExtractHook();
|
|
71
75
|
if (hookFn)
|
|
72
76
|
return hookFn(html, url);
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
return _fullExtractDomainData(html, url);
|
|
77
|
-
// npm package fallback — basic stubs
|
|
77
|
+
await loadExtractors();
|
|
78
|
+
if (_extractDomainData)
|
|
79
|
+
return _extractDomainData(html, url);
|
|
78
80
|
return extractDomainDataBasic(html, url);
|
|
79
81
|
}
|
|
80
82
|
/** Create the initial PipelineContext with defaults */
|
|
@@ -199,15 +201,39 @@ export function normalizeOptions(ctx) {
|
|
|
199
201
|
if (autoScrollOpts) {
|
|
200
202
|
ctx.render = true;
|
|
201
203
|
}
|
|
202
|
-
// Auto-detect SPAs that require browser rendering (no --render flag needed)
|
|
203
|
-
//
|
|
204
|
+
// Auto-detect SPAs that require browser rendering (no --render flag needed).
|
|
205
|
+
// This list is NOT proprietary — every developer knows these sites are SPAs.
|
|
206
|
+
// The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
|
|
207
|
+
// Premium hook can extend this for additional server-side intelligence.
|
|
204
208
|
if (!ctx.render) {
|
|
205
209
|
const spaDomainsHook = getSPADomainsHook();
|
|
206
210
|
const spaPatternsHook = getSPAPatternsHook();
|
|
207
|
-
//
|
|
208
|
-
const DEFAULT_SPA_DOMAINS = new Set([
|
|
209
|
-
|
|
210
|
-
|
|
211
|
+
// Full SPA domain list — always available (npm + server)
|
|
212
|
+
const DEFAULT_SPA_DOMAINS = new Set([
|
|
213
|
+
// Search & travel
|
|
214
|
+
'www.google.com',
|
|
215
|
+
'flights.google.com',
|
|
216
|
+
// Travel & hospitality
|
|
217
|
+
'www.airbnb.com',
|
|
218
|
+
'www.booking.com',
|
|
219
|
+
'www.expedia.com',
|
|
220
|
+
'www.kayak.com',
|
|
221
|
+
'www.skyscanner.com',
|
|
222
|
+
'www.tripadvisor.com',
|
|
223
|
+
// Jobs
|
|
224
|
+
'www.indeed.com',
|
|
225
|
+
'www.glassdoor.com',
|
|
226
|
+
// Real estate
|
|
227
|
+
'www.zillow.com',
|
|
228
|
+
// Our own dashboard
|
|
229
|
+
'app.webpeel.dev',
|
|
230
|
+
]);
|
|
231
|
+
const DEFAULT_SPA_PATTERNS = [
|
|
232
|
+
/google\.com\/travel/,
|
|
233
|
+
/google\.com\/maps/,
|
|
234
|
+
/google\.com\/shopping/,
|
|
235
|
+
];
|
|
236
|
+
// Premium hook can extend with additional domains; otherwise use full default list
|
|
211
237
|
const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
|
|
212
238
|
const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
|
|
213
239
|
try {
|
|
@@ -1,57 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Stealth patches — proprietary module stub.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* The full implementation is compiled into dist/core/stealth-patches.js
|
|
5
|
+
* and shipped in the npm package (14.9KB).
|
|
6
|
+
* TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
|
|
6
7
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
* - window.chrome (app / csi / loadTimes / runtime)
|
|
10
|
-
* - navigator.plugins & mimeTypes (realistic arrays)
|
|
11
|
-
* - navigator.languages & navigator.vendor
|
|
12
|
-
* - navigator.permissions (Notification.permission → 'default')
|
|
13
|
-
* - navigator.hardwareConcurrency
|
|
14
|
-
* - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
|
|
15
|
-
* - window.outerWidth / outerHeight
|
|
16
|
-
* - iframe.contentWindow
|
|
17
|
-
* - media.codecs
|
|
18
|
-
* - user-agent-override
|
|
19
|
-
*
|
|
20
|
-
* What THIS file adds (genuine gaps):
|
|
21
|
-
* 1. navigator.connection – NetworkInformation API (absent in headless)
|
|
22
|
-
* 2. Battery API – navigator.getBattery() (absent/broken in headless)
|
|
23
|
-
* 3. Media devices – enumerateDevices() returns empty in headless
|
|
24
|
-
* 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
|
|
25
|
-
* 5. Speech synthesis – getVoices() returns empty in headless
|
|
26
|
-
* 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
|
|
27
|
-
* 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
|
|
28
|
-
* 8. screen.availWidth/H – safety-net: ensure non-zero values
|
|
29
|
-
* 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
|
|
30
|
-
* 10. Worker webdriver flag – patch inside dedicated workers too
|
|
31
|
-
*
|
|
32
|
-
* Usage:
|
|
33
|
-
* import { applyStealthPatches } from './stealth-patches.js';
|
|
34
|
-
* await applyStealthPatches(page);
|
|
35
|
-
*
|
|
36
|
-
* Call AFTER page creation, before navigation.
|
|
37
|
-
* Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
|
|
38
|
-
*/
|
|
39
|
-
import type { Page } from 'playwright';
|
|
40
|
-
/**
|
|
41
|
-
* Apply all supplemental stealth patches to a Playwright page.
|
|
42
|
-
* Each patch is wrapped in its own try/catch so one failure never blocks others.
|
|
43
|
-
*
|
|
44
|
-
* @param page - A Playwright Page (or any object with addInitScript).
|
|
45
|
-
*/
|
|
46
|
-
export declare function applyStealthPatches(page: Page): Promise<void>;
|
|
47
|
-
/**
|
|
48
|
-
* Set the Accept-Language HTTP header to match navigator.languages.
|
|
49
|
-
*
|
|
50
|
-
* Call this after creating the page but BEFORE navigation.
|
|
51
|
-
* In stealth mode Playwright already sets locale: 'en-US', but the
|
|
52
|
-
* Accept-Language header may still differ — this ensures consistency.
|
|
53
|
-
*
|
|
54
|
-
* @param page - Playwright Page.
|
|
55
|
-
* @param locale - BCP 47 locale string, e.g. 'en-US' (default).
|
|
8
|
+
* This stub satisfies TypeScript type-checking on bare repo clones.
|
|
9
|
+
* At runtime the compiled JS is imported dynamically in browser-fetch.ts.
|
|
56
10
|
*/
|
|
57
|
-
|
|
11
|
+
/** Apply stealth patches to a Playwright page to avoid bot detection. */
|
|
12
|
+
export declare function applyStealthPatches(_page: unknown): Promise<void>;
|
|
13
|
+
/** Apply Accept-Language header to a Playwright page. */
|
|
14
|
+
export declare function applyAcceptLanguageHeader(_page: unknown, _lang?: string): Promise<void>;
|
|
@@ -1,339 +1,20 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Stealth patches — proprietary module stub.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
4
|
+
* The full implementation is compiled into dist/core/stealth-patches.js
|
|
5
|
+
* and shipped in the npm package (14.9KB).
|
|
6
|
+
* TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
|
|
6
7
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
* - window.chrome (app / csi / loadTimes / runtime)
|
|
10
|
-
* - navigator.plugins & mimeTypes (realistic arrays)
|
|
11
|
-
* - navigator.languages & navigator.vendor
|
|
12
|
-
* - navigator.permissions (Notification.permission → 'default')
|
|
13
|
-
* - navigator.hardwareConcurrency
|
|
14
|
-
* - webgl.vendor / webgl.renderer (UNMASKED params → Intel)
|
|
15
|
-
* - window.outerWidth / outerHeight
|
|
16
|
-
* - iframe.contentWindow
|
|
17
|
-
* - media.codecs
|
|
18
|
-
* - user-agent-override
|
|
19
|
-
*
|
|
20
|
-
* What THIS file adds (genuine gaps):
|
|
21
|
-
* 1. navigator.connection – NetworkInformation API (absent in headless)
|
|
22
|
-
* 2. Battery API – navigator.getBattery() (absent/broken in headless)
|
|
23
|
-
* 3. Media devices – enumerateDevices() returns empty in headless
|
|
24
|
-
* 4. Canvas noise – subtle pixel noise to prevent canvas fingerprinting
|
|
25
|
-
* 5. Speech synthesis – getVoices() returns empty in headless
|
|
26
|
-
* 6. Keyboard layout – navigator.keyboard.getLayoutMap() (absent in headless)
|
|
27
|
-
* 7. navigator.deviceMemory – may be 0 in headless; normalise to 8 GB
|
|
28
|
-
* 8. screen.availWidth/H – safety-net: ensure non-zero values
|
|
29
|
-
* 9. WebGL noise – tiny noise on non-vendor params to break GL fingerprinting
|
|
30
|
-
* 10. Worker webdriver flag – patch inside dedicated workers too
|
|
31
|
-
*
|
|
32
|
-
* Usage:
|
|
33
|
-
* import { applyStealthPatches } from './stealth-patches.js';
|
|
34
|
-
* await applyStealthPatches(page);
|
|
35
|
-
*
|
|
36
|
-
* Call AFTER page creation, before navigation.
|
|
37
|
-
* Safe to call alongside puppeteer-extra-plugin-stealth (no conflicts).
|
|
38
|
-
*/
|
|
39
|
-
// ─── main export ─────────────────────────────────────────────────────────────
|
|
40
|
-
/**
|
|
41
|
-
* Apply all supplemental stealth patches to a Playwright page.
|
|
42
|
-
* Each patch is wrapped in its own try/catch so one failure never blocks others.
|
|
43
|
-
*
|
|
44
|
-
* @param page - A Playwright Page (or any object with addInitScript).
|
|
8
|
+
* This stub satisfies TypeScript type-checking on bare repo clones.
|
|
9
|
+
* At runtime the compiled JS is imported dynamically in browser-fetch.ts.
|
|
45
10
|
*/
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
//
|
|
50
|
-
await page.addInitScript(`
|
|
51
|
-
(function () {
|
|
52
|
-
'use strict';
|
|
53
|
-
|
|
54
|
-
// ── 1. navigator.connection (NetworkInformation API) ─────────────────────
|
|
55
|
-
// Headless Chrome lacks this object entirely; many bot-detectors probe it.
|
|
56
|
-
try {
|
|
57
|
-
if (!('connection' in navigator)) {
|
|
58
|
-
var _conn = {
|
|
59
|
-
downlink: 10,
|
|
60
|
-
downlinkMax: Infinity,
|
|
61
|
-
effectiveType: '4g',
|
|
62
|
-
rtt: 50,
|
|
63
|
-
saveData: false,
|
|
64
|
-
type: 'wifi',
|
|
65
|
-
onchange: null,
|
|
66
|
-
ontypechange: null,
|
|
67
|
-
addEventListener: function () {},
|
|
68
|
-
removeEventListener: function () {},
|
|
69
|
-
dispatchEvent: function () { return true; }
|
|
70
|
-
};
|
|
71
|
-
Object.defineProperty(navigator, 'connection', {
|
|
72
|
-
get: function () { return _conn; },
|
|
73
|
-
configurable: true
|
|
74
|
-
});
|
|
75
|
-
// Also expose as NetworkInformation-like alias that some code checks
|
|
76
|
-
Object.defineProperty(navigator, 'mozConnection', {
|
|
77
|
-
get: function () { return undefined; },
|
|
78
|
-
configurable: true
|
|
79
|
-
});
|
|
80
|
-
Object.defineProperty(navigator, 'webkitConnection', {
|
|
81
|
-
get: function () { return undefined; },
|
|
82
|
-
configurable: true
|
|
83
|
-
});
|
|
84
|
-
}
|
|
85
|
-
} catch (e) {}
|
|
86
|
-
|
|
87
|
-
// ── 2. Battery API ────────────────────────────────────────────────────────
|
|
88
|
-
// navigator.getBattery() often rejects in headless; return a plausible battery.
|
|
89
|
-
try {
|
|
90
|
-
var _battery = {
|
|
91
|
-
charging: true,
|
|
92
|
-
chargingTime: 0,
|
|
93
|
-
dischargingTime: Infinity,
|
|
94
|
-
level: 0.96 + (Math.random() * 0.03), // 96–99 %
|
|
95
|
-
onchargingchange: null,
|
|
96
|
-
onchargingtimechange: null,
|
|
97
|
-
ondischargingtimechange: null,
|
|
98
|
-
onlevelchange: null,
|
|
99
|
-
addEventListener: function () {},
|
|
100
|
-
removeEventListener: function () {},
|
|
101
|
-
dispatchEvent: function () { return true; }
|
|
102
|
-
};
|
|
103
|
-
if ('getBattery' in navigator) {
|
|
104
|
-
var _origGetBattery = navigator.getBattery.bind(navigator);
|
|
105
|
-
Object.defineProperty(navigator, 'getBattery', {
|
|
106
|
-
value: function () {
|
|
107
|
-
return _origGetBattery().catch(function () {
|
|
108
|
-
return Promise.resolve(_battery);
|
|
109
|
-
});
|
|
110
|
-
},
|
|
111
|
-
configurable: true,
|
|
112
|
-
writable: true
|
|
113
|
-
});
|
|
114
|
-
} else {
|
|
115
|
-
Object.defineProperty(navigator, 'getBattery', {
|
|
116
|
-
value: function () { return Promise.resolve(_battery); },
|
|
117
|
-
configurable: true,
|
|
118
|
-
writable: true
|
|
119
|
-
});
|
|
120
|
-
}
|
|
121
|
-
} catch (e) {}
|
|
122
|
-
|
|
123
|
-
// ── 3. Media devices – enumerateDevices ───────────────────────────────────
|
|
124
|
-
// Headless returns an empty array; bots and real users both have at least
|
|
125
|
-
// one audio device, so the empty list is a clear signal.
|
|
126
|
-
try {
|
|
127
|
-
if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {
|
|
128
|
-
var _origEnum = navigator.mediaDevices.enumerateDevices.bind(navigator.mediaDevices);
|
|
129
|
-
Object.defineProperty(navigator.mediaDevices, 'enumerateDevices', {
|
|
130
|
-
value: function () {
|
|
131
|
-
return _origEnum().then(function (devices) {
|
|
132
|
-
if (devices && devices.length > 0) return devices;
|
|
133
|
-
// Mock realistic device list (labels stay empty – that's normal
|
|
134
|
-
// until the user grants getUserMedia permission)
|
|
135
|
-
return [
|
|
136
|
-
{ deviceId: 'default', kind: 'audioinput', label: '', groupId: 'default' },
|
|
137
|
-
{ deviceId: 'communications', kind: 'audioinput', label: '', groupId: 'communications' },
|
|
138
|
-
{ deviceId: 'default', kind: 'audiooutput', label: '', groupId: 'default' },
|
|
139
|
-
{ deviceId: 'communications', kind: 'audiooutput', label: '', groupId: 'communications' }
|
|
140
|
-
];
|
|
141
|
-
}).catch(function () { return []; });
|
|
142
|
-
},
|
|
143
|
-
configurable: true,
|
|
144
|
-
writable: true
|
|
145
|
-
});
|
|
146
|
-
}
|
|
147
|
-
} catch (e) {}
|
|
148
|
-
|
|
149
|
-
// ── 4. Canvas fingerprint noise ───────────────────────────────────────────
|
|
150
|
-
// Adds a 1-pixel-level perturbation (~1 % of pixels, ±1 on red channel only).
|
|
151
|
-
// Visually imperceptible but breaks hash-based canvas fingerprinting.
|
|
152
|
-
try {
|
|
153
|
-
var _origToDataURL = HTMLCanvasElement.prototype.toDataURL;
|
|
154
|
-
var _origToBlob = HTMLCanvasElement.prototype.toBlob;
|
|
155
|
-
|
|
156
|
-
function _addCanvasNoise(canvas) {
|
|
157
|
-
if (!canvas || canvas.width === 0 || canvas.height === 0) return;
|
|
158
|
-
var ctx = canvas.getContext('2d');
|
|
159
|
-
if (!ctx) return;
|
|
160
|
-
try {
|
|
161
|
-
var imgData = ctx.getImageData(0, 0, canvas.width, canvas.height);
|
|
162
|
-
var d = imgData.data;
|
|
163
|
-
// Affect ~1 % of pixels (every 400th byte in the red channel)
|
|
164
|
-
for (var i = 0; i < d.length; i += 400) {
|
|
165
|
-
var noise = (Math.random() < 0.5) ? 1 : -1;
|
|
166
|
-
d[i] = Math.max(0, Math.min(255, d[i] + noise));
|
|
167
|
-
}
|
|
168
|
-
ctx.putImageData(imgData, 0, 0);
|
|
169
|
-
} catch (_) {}
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
HTMLCanvasElement.prototype.toDataURL = function (type, quality) {
|
|
173
|
-
_addCanvasNoise(this);
|
|
174
|
-
return _origToDataURL.call(this, type, quality);
|
|
175
|
-
};
|
|
176
|
-
|
|
177
|
-
HTMLCanvasElement.prototype.toBlob = function (callback, type, quality) {
|
|
178
|
-
_addCanvasNoise(this);
|
|
179
|
-
return _origToBlob.call(this, callback, type, quality);
|
|
180
|
-
};
|
|
181
|
-
} catch (e) {}
|
|
182
|
-
|
|
183
|
-
// ── 5. Speech synthesis voices ────────────────────────────────────────────
|
|
184
|
-
// Headless Chrome returns an empty voices array.
|
|
185
|
-
// We can't inject real voices from JS, but we can ensure the API exists
|
|
186
|
-
// and fire the onvoiceschanged event so listeners don't stall.
|
|
187
|
-
try {
|
|
188
|
-
if ('speechSynthesis' in window) {
|
|
189
|
-
// If voices are already populated, leave them alone.
|
|
190
|
-
// Otherwise, fire onvoiceschanged after a short delay so listeners resolve.
|
|
191
|
-
var _syn = window.speechSynthesis;
|
|
192
|
-
if (_syn.getVoices().length === 0) {
|
|
193
|
-
setTimeout(function () {
|
|
194
|
-
if (typeof _syn.onvoiceschanged === 'function') {
|
|
195
|
-
try { _syn.onvoiceschanged(new Event('voiceschanged')); } catch (_) {}
|
|
196
|
-
}
|
|
197
|
-
}, 100);
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
} catch (e) {}
|
|
201
|
-
|
|
202
|
-
// ── 6. Keyboard layout API ────────────────────────────────────────────────
|
|
203
|
-
// navigator.keyboard is undefined in headless; some detectors probe it.
|
|
204
|
-
try {
|
|
205
|
-
if ('keyboard' in navigator) {
|
|
206
|
-
var _kbd = navigator.keyboard;
|
|
207
|
-
if (_kbd && !_kbd.getLayoutMap) {
|
|
208
|
-
_kbd.getLayoutMap = function () {
|
|
209
|
-
return Promise.resolve(
|
|
210
|
-
new Map([
|
|
211
|
-
['KeyA','a'],['KeyB','b'],['KeyC','c'],['KeyD','d'],
|
|
212
|
-
['KeyE','e'],['KeyF','f'],['KeyG','g'],['KeyH','h'],
|
|
213
|
-
['KeyI','i'],['KeyJ','j'],['KeyK','k'],['KeyL','l'],
|
|
214
|
-
['KeyM','m'],['KeyN','n'],['KeyO','o'],['KeyP','p'],
|
|
215
|
-
['KeyQ','q'],['KeyR','r'],['KeyS','s'],['KeyT','t'],
|
|
216
|
-
['KeyU','u'],['KeyV','v'],['KeyW','w'],['KeyX','x'],
|
|
217
|
-
['KeyY','y'],['KeyZ','z']
|
|
218
|
-
])
|
|
219
|
-
);
|
|
220
|
-
};
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
} catch (e) {}
|
|
224
|
-
|
|
225
|
-
// ── 7. navigator.deviceMemory ─────────────────────────────────────────────
|
|
226
|
-
// Headless may expose 0 or undefined; normalise to 8 GB (most common laptop value).
|
|
227
|
-
try {
|
|
228
|
-
var _dm = navigator.deviceMemory;
|
|
229
|
-
if (!_dm || _dm === 0) {
|
|
230
|
-
Object.defineProperty(navigator, 'deviceMemory', {
|
|
231
|
-
get: function () { return 8; },
|
|
232
|
-
configurable: true
|
|
233
|
-
});
|
|
234
|
-
}
|
|
235
|
-
} catch (e) {}
|
|
236
|
-
|
|
237
|
-
// ── 8. screen.availWidth / availHeight safety net ─────────────────────────
|
|
238
|
-
// Headless sometimes reports 0 for available screen dimensions.
|
|
239
|
-
try {
|
|
240
|
-
if (window.screen) {
|
|
241
|
-
if (!window.screen.availWidth || window.screen.availWidth === 0) {
|
|
242
|
-
Object.defineProperty(window.screen, 'availWidth', {
|
|
243
|
-
get: function () { return window.outerWidth || window.innerWidth || 1920; },
|
|
244
|
-
configurable: true
|
|
245
|
-
});
|
|
246
|
-
}
|
|
247
|
-
if (!window.screen.availHeight || window.screen.availHeight === 0) {
|
|
248
|
-
Object.defineProperty(window.screen, 'availHeight', {
|
|
249
|
-
get: function () { return window.outerHeight || window.innerHeight || 1040; },
|
|
250
|
-
configurable: true
|
|
251
|
-
});
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
} catch (e) {}
|
|
255
|
-
|
|
256
|
-
// ── 9. WebGL parameter noise ──────────────────────────────────────────────
|
|
257
|
-
// puppeteer-extra-plugin-stealth already patches UNMASKED_VENDOR (37445) and
|
|
258
|
-
// UNMASKED_RENDERER (37446). We add a tiny, consistent offset to a handful
|
|
259
|
-
// of other float parameters so hash-based GL fingerprinting breaks.
|
|
260
|
-
// The offset is seeded per-session (Math.random at inject time) so it differs
|
|
261
|
-
// from headless defaults without varying every page load.
|
|
262
|
-
try {
|
|
263
|
-
var _glNoiseSeed = Math.random() < 0.5 ? 0.0001 : -0.0001;
|
|
264
|
-
|
|
265
|
-
function _patchWebGLNoise(ctxProto) {
|
|
266
|
-
if (!ctxProto || !ctxProto.getParameter) return;
|
|
267
|
-
var _origGetParam = ctxProto.getParameter;
|
|
268
|
-
Object.defineProperty(ctxProto, 'getParameter', {
|
|
269
|
-
value: function (pname) {
|
|
270
|
-
var result = _origGetParam.call(this, pname);
|
|
271
|
-
// Only perturb continuous float values (e.g. aliased line/point ranges)
|
|
272
|
-
// 33902 = ALIASED_LINE_WIDTH_RANGE, 33901 = ALIASED_POINT_SIZE_RANGE
|
|
273
|
-
// 36348 = MAX_FRAGMENT_UNIFORM_VECTORS, skip integers
|
|
274
|
-
if (result instanceof Float32Array) {
|
|
275
|
-
var patched = new Float32Array(result);
|
|
276
|
-
for (var i = 0; i < patched.length; i++) {
|
|
277
|
-
patched[i] += _glNoiseSeed;
|
|
278
|
-
}
|
|
279
|
-
return patched;
|
|
280
|
-
}
|
|
281
|
-
return result;
|
|
282
|
-
},
|
|
283
|
-
configurable: true,
|
|
284
|
-
writable: true
|
|
285
|
-
});
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
if (typeof WebGLRenderingContext !== 'undefined') {
|
|
289
|
-
_patchWebGLNoise(WebGLRenderingContext.prototype);
|
|
290
|
-
}
|
|
291
|
-
if (typeof WebGL2RenderingContext !== 'undefined') {
|
|
292
|
-
_patchWebGLNoise(WebGL2RenderingContext.prototype);
|
|
293
|
-
}
|
|
294
|
-
} catch (e) {}
|
|
295
|
-
|
|
296
|
-
// ── 10. Dedicated worker navigator.webdriver ─────────────────────────────
|
|
297
|
-
// puppeteer-extra-plugin-stealth patches the main window, but some detectors
|
|
298
|
-
// spin up a Worker and check navigator.webdriver there too.
|
|
299
|
-
// We intercept Worker construction and inject a tiny patch script.
|
|
300
|
-
try {
|
|
301
|
-
var _OrigWorker = window.Worker;
|
|
302
|
-
window.Worker = function (scriptURL, options) {
|
|
303
|
-
// Prefix the worker script with a blob that removes webdriver
|
|
304
|
-
var patchBlob = new Blob([
|
|
305
|
-
'(function(){try{Object.defineProperty(navigator,"webdriver",{get:function(){return false;},configurable:true});}catch(e){}})();'
|
|
306
|
-
], { type: 'application/javascript' });
|
|
307
|
-
var patchURL = URL.createObjectURL(patchBlob);
|
|
308
|
-
// Chain via importScripts is not possible here; use a wrapper blob instead
|
|
309
|
-
var wrappedBlob = new Blob([
|
|
310
|
-
'importScripts(' + JSON.stringify(patchURL) + ');importScripts(' + JSON.stringify(scriptURL.toString()) + ');'
|
|
311
|
-
], { type: 'application/javascript' });
|
|
312
|
-
var wrappedURL = URL.createObjectURL(wrappedBlob);
|
|
313
|
-
return new _OrigWorker(wrappedURL, options);
|
|
314
|
-
} as any;
|
|
315
|
-
window.Worker.prototype = _OrigWorker.prototype;
|
|
316
|
-
} catch (e) {}
|
|
317
|
-
|
|
318
|
-
})();
|
|
319
|
-
`);
|
|
11
|
+
/* c8 ignore start */
|
|
12
|
+
/** Apply stealth patches to a Playwright page to avoid bot detection. */
|
|
13
|
+
export async function applyStealthPatches(_page) {
|
|
14
|
+
// Stub — full implementation in compiled stealth-patches.js
|
|
320
15
|
}
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
*
|
|
325
|
-
* Call this after creating the page but BEFORE navigation.
|
|
326
|
-
* In stealth mode Playwright already sets locale: 'en-US', but the
|
|
327
|
-
* Accept-Language header may still differ — this ensures consistency.
|
|
328
|
-
*
|
|
329
|
-
* @param page - Playwright Page.
|
|
330
|
-
* @param locale - BCP 47 locale string, e.g. 'en-US' (default).
|
|
331
|
-
*/
|
|
332
|
-
export async function applyAcceptLanguageHeader(page, locale = 'en-US') {
|
|
333
|
-
// Build a realistic q-value string, e.g. "en-US,en;q=0.9"
|
|
334
|
-
const lang = locale.split('-')[0];
|
|
335
|
-
const acceptLang = lang !== locale
|
|
336
|
-
? `${locale},${lang};q=0.9`
|
|
337
|
-
: locale;
|
|
338
|
-
await page.setExtraHTTPHeaders({ 'Accept-Language': acceptLang });
|
|
16
|
+
/** Apply Accept-Language header to a Playwright page. */
|
|
17
|
+
export async function applyAcceptLanguageHeader(_page, _lang) {
|
|
18
|
+
// Stub — full implementation in compiled stealth-patches.js
|
|
339
19
|
}
|
|
20
|
+
/* c8 ignore stop */
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* All hook methods are optional — unset hooks are simply skipped.
|
|
11
11
|
*/
|
|
12
12
|
import type { FetchResult } from './fetcher.js';
|
|
13
|
-
import type { DomainExtractResult } from './domain-extractors.js';
|
|
13
|
+
import type { DomainExtractResult } from './domain-extractors-basic.js';
|
|
14
14
|
export interface StrategyResult extends FetchResult {
|
|
15
15
|
method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
|
|
16
16
|
/**
|
package/dist/index.d.ts
CHANGED
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
|
|
7
7
|
import type { PeelOptions, PeelResult } from './types.js';
|
|
8
8
|
export * from './types.js';
|
|
9
|
-
export {
|
|
9
|
+
export type { DomainExtractResult, DomainExtractor } from './core/domain-extractors-basic.js';
|
|
10
|
+
export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
|
|
10
11
|
export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
|
|
11
12
|
export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
|
|
12
13
|
export { mapDomain, type MapOptions, type MapResult } from './core/map.js';
|
package/dist/index.js
CHANGED
|
@@ -7,7 +7,7 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
|
|
|
7
7
|
import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
|
|
8
8
|
import { checkUrlSafety } from './core/safe-browsing.js';
|
|
9
9
|
export * from './types.js';
|
|
10
|
-
export { getDomainExtractor, extractDomainData } from './core/domain-extractors.js';
|
|
10
|
+
export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
|
|
11
11
|
export { crawl } from './core/crawler.js';
|
|
12
12
|
export { discoverSitemap } from './core/sitemap.js';
|
|
13
13
|
export { mapDomain } from './core/map.js';
|
package/dist/server/app.js
CHANGED
|
@@ -54,8 +54,20 @@ import { createSentryHooks } from './sentry.js';
|
|
|
54
54
|
import { requireScope } from './middleware/scope-guard.js';
|
|
55
55
|
import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
|
|
56
56
|
import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
|
|
57
|
-
|
|
58
|
-
|
|
57
|
+
// Proprietary modules — loaded dynamically so the build works without TypeScript source.
|
|
58
|
+
// Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
|
|
59
|
+
let setExtractorRedis;
|
|
60
|
+
let registerPremiumHooks;
|
|
61
|
+
try {
|
|
62
|
+
const de = await import('../core/domain-extractors.js');
|
|
63
|
+
setExtractorRedis = de.setExtractorRedis;
|
|
64
|
+
}
|
|
65
|
+
catch { /* compiled JS not available */ }
|
|
66
|
+
try {
|
|
67
|
+
const ph = await import('./premium/index.js');
|
|
68
|
+
registerPremiumHooks = ph.registerPremiumHooks;
|
|
69
|
+
}
|
|
70
|
+
catch { /* compiled JS not available */ }
|
|
59
71
|
import { readFileSync } from 'fs';
|
|
60
72
|
import { join, dirname } from 'path';
|
|
61
73
|
import { fileURLToPath } from 'url';
|
|
@@ -421,7 +433,7 @@ export function startServer(config = {}) {
|
|
|
421
433
|
const app = createApp(config);
|
|
422
434
|
const port = config.port || parseInt(process.env.PORT || '3000', 10);
|
|
423
435
|
// Activate premium strategy hooks (SWR cache, domain intelligence, race).
|
|
424
|
-
registerPremiumHooks();
|
|
436
|
+
registerPremiumHooks?.();
|
|
425
437
|
// Inject Redis into the domain extractor cache for cross-pod cache sharing.
|
|
426
438
|
// When REDIS_URL is set (multi-pod k8s deployments), all pods share one cache
|
|
427
439
|
// so the first pod to fetch a URL populates it for all others.
|
|
@@ -439,7 +451,7 @@ export function startServer(config = {}) {
|
|
|
439
451
|
maxRetriesPerRequest: 3,
|
|
440
452
|
enableOfflineQueue: false,
|
|
441
453
|
});
|
|
442
|
-
setExtractorRedis(redis);
|
|
454
|
+
setExtractorRedis?.(redis);
|
|
443
455
|
log.info('Redis extractor cache initialized (shared cross-pod cache active)');
|
|
444
456
|
}).catch((err) => {
|
|
445
457
|
log.warn('Failed to init Redis extractor cache (in-memory only)', { error: err.message });
|
|
@@ -1,8 +1 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium challenge solver — server-only wrapper.
|
|
3
|
-
*
|
|
4
|
-
* Re-exports the challenge-solver functionality for use as a strategy hook.
|
|
5
|
-
* The npm package handles challenges inline in pipeline.ts (basic handling).
|
|
6
|
-
* Premium servers can wire in enhanced challenge solving via hooks.
|
|
7
|
-
*/
|
|
8
1
|
export { solveChallenge } from '../../core/challenge-solver.js';
|
|
@@ -1,8 +1 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium challenge solver — server-only wrapper.
|
|
3
|
-
*
|
|
4
|
-
* Re-exports the challenge-solver functionality for use as a strategy hook.
|
|
5
|
-
* The npm package handles challenges inline in pipeline.ts (basic handling).
|
|
6
|
-
* Premium servers can wire in enhanced challenge solving via hooks.
|
|
7
|
-
*/
|
|
8
1
|
export { solveChallenge } from '../../core/challenge-solver.js';
|
|
@@ -1,10 +1 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium domain extractors — server-only wrapper.
|
|
3
|
-
*
|
|
4
|
-
* Re-exports the full extractDomainData and getDomainExtractor functions
|
|
5
|
-
* from core/domain-extractors.ts for use as strategy hooks.
|
|
6
|
-
*
|
|
7
|
-
* The npm package uses basic stubs (always return null).
|
|
8
|
-
* When premium hooks are registered, these full extractors are wired in.
|
|
9
|
-
*/
|
|
10
1
|
export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
|
|
@@ -1,10 +1 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium domain extractors — server-only wrapper.
|
|
3
|
-
*
|
|
4
|
-
* Re-exports the full extractDomainData and getDomainExtractor functions
|
|
5
|
-
* from core/domain-extractors.ts for use as strategy hooks.
|
|
6
|
-
*
|
|
7
|
-
* The npm package uses basic stubs (always return null).
|
|
8
|
-
* When premium hooks are registered, these full extractors are wired in.
|
|
9
|
-
*/
|
|
10
1
|
export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
|
|
@@ -1,17 +1,2 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium SPA detection — server-only.
|
|
3
|
-
*
|
|
4
|
-
* Full list of domains and URL patterns that require browser rendering.
|
|
5
|
-
* The npm package only has a minimal default set (Google, our own dashboard).
|
|
6
|
-
* Premium servers register these via strategy hooks.
|
|
7
|
-
*/
|
|
8
|
-
/**
|
|
9
|
-
* Domains that are known SPAs requiring browser rendering.
|
|
10
|
-
* Includes travel, real estate, job boards, and other dynamic sites.
|
|
11
|
-
*/
|
|
12
1
|
export declare const SPA_DOMAINS: Set<string>;
|
|
13
|
-
/**
|
|
14
|
-
* URL patterns that match SPA routes on mixed-content domains.
|
|
15
|
-
* E.g. google.com/travel is SPA, but google.com/search is not.
|
|
16
|
-
*/
|
|
17
2
|
export declare const SPA_URL_PATTERNS: RegExp[];
|
|
@@ -1,39 +1,2 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium SPA detection — server-only.
|
|
3
|
-
*
|
|
4
|
-
* Full list of domains and URL patterns that require browser rendering.
|
|
5
|
-
* The npm package only has a minimal default set (Google, our own dashboard).
|
|
6
|
-
* Premium servers register these via strategy hooks.
|
|
7
|
-
*/
|
|
8
|
-
/**
|
|
9
|
-
* Domains that are known SPAs requiring browser rendering.
|
|
10
|
-
* Includes travel, real estate, job boards, and other dynamic sites.
|
|
11
|
-
*/
|
|
12
|
-
export const SPA_DOMAINS = new Set([
|
|
13
|
-
// Google properties
|
|
14
|
-
'www.google.com',
|
|
15
|
-
'flights.google.com',
|
|
16
|
-
// Travel
|
|
17
|
-
'www.airbnb.com',
|
|
18
|
-
'www.booking.com',
|
|
19
|
-
'www.expedia.com',
|
|
20
|
-
'www.kayak.com',
|
|
21
|
-
'www.skyscanner.com',
|
|
22
|
-
'www.tripadvisor.com',
|
|
23
|
-
// Jobs
|
|
24
|
-
'www.indeed.com',
|
|
25
|
-
'www.glassdoor.com',
|
|
26
|
-
// Real estate
|
|
27
|
-
'www.zillow.com',
|
|
28
|
-
// Our own dashboard
|
|
29
|
-
'app.webpeel.dev',
|
|
30
|
-
]);
|
|
31
|
-
/**
|
|
32
|
-
* URL patterns that match SPA routes on mixed-content domains.
|
|
33
|
-
* E.g. google.com/travel is SPA, but google.com/search is not.
|
|
34
|
-
*/
|
|
35
|
-
export const SPA_URL_PATTERNS = [
|
|
36
|
-
/google\.com\/travel/,
|
|
37
|
-
/google\.com\/maps/,
|
|
38
|
-
/google\.com\/shopping/,
|
|
39
|
-
];
|
|
1
|
+
export const SPA_DOMAINS = new Set(['www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com', 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com', 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev']);
|
|
2
|
+
export const SPA_URL_PATTERNS = [/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/];
|
|
@@ -1,23 +1,4 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium content stability detection — server-only.
|
|
3
|
-
*
|
|
4
|
-
* Provides smarter content-stability waiting logic than the default
|
|
5
|
-
* waitForLoadState('networkidle'). Monitors DOM mutations and network
|
|
6
|
-
* activity to determine when a page has truly finished rendering.
|
|
7
|
-
*
|
|
8
|
-
* The npm package uses default Playwright waitForLoadState.
|
|
9
|
-
* Premium servers can wire this in via the waitForContentStable hook.
|
|
10
|
-
*/
|
|
11
|
-
export interface StabilityOptions {
|
|
12
|
-
/** Maximum time to wait (ms). Default: 5000. */
|
|
1
|
+
export declare function waitForContentStable(page: any, options?: {
|
|
13
2
|
timeoutMs?: number;
|
|
14
|
-
/** Minimum quiet period before declaring stable (ms). Default: 500. */
|
|
15
3
|
quietMs?: number;
|
|
16
|
-
}
|
|
17
|
-
/**
|
|
18
|
-
* Wait for page content to stabilize by monitoring DOM mutations.
|
|
19
|
-
*
|
|
20
|
-
* More reliable than waitForLoadState('networkidle') for SPAs that
|
|
21
|
-
* progressively render content.
|
|
22
|
-
*/
|
|
23
|
-
export declare function waitForContentStable(page: any, options?: StabilityOptions): Promise<void>;
|
|
4
|
+
}): Promise<void>;
|
|
@@ -1,36 +1,13 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Premium content stability detection — server-only.
|
|
3
|
-
*
|
|
4
|
-
* Provides smarter content-stability waiting logic than the default
|
|
5
|
-
* waitForLoadState('networkidle'). Monitors DOM mutations and network
|
|
6
|
-
* activity to determine when a page has truly finished rendering.
|
|
7
|
-
*
|
|
8
|
-
* The npm package uses default Playwright waitForLoadState.
|
|
9
|
-
* Premium servers can wire this in via the waitForContentStable hook.
|
|
10
|
-
*/
|
|
11
|
-
/**
|
|
12
|
-
* Wait for page content to stabilize by monitoring DOM mutations.
|
|
13
|
-
*
|
|
14
|
-
* More reliable than waitForLoadState('networkidle') for SPAs that
|
|
15
|
-
* progressively render content.
|
|
16
|
-
*/
|
|
17
1
|
export async function waitForContentStable(page, options) {
|
|
18
2
|
const timeout = options?.timeoutMs ?? 5000;
|
|
19
3
|
const quiet = options?.quietMs ?? 500;
|
|
20
4
|
const start = Date.now();
|
|
21
|
-
// Use page.evaluate to monitor DOM mutations
|
|
22
5
|
await page.evaluate(({ quietMs, timeoutMs }) => {
|
|
23
6
|
return new Promise((resolve) => {
|
|
24
7
|
let lastMutation = Date.now();
|
|
25
8
|
let settled = false;
|
|
26
|
-
const observer = new MutationObserver(() => {
|
|
27
|
-
lastMutation = Date.now();
|
|
28
|
-
});
|
|
29
|
-
observer.observe(document.body, {
|
|
30
|
-
childList: true,
|
|
31
|
-
subtree: true,
|
|
32
|
-
characterData: true,
|
|
33
|
-
});
|
|
9
|
+
const observer = new MutationObserver(() => { lastMutation = Date.now(); });
|
|
10
|
+
observer.observe(document.body, { childList: true, subtree: true, characterData: true });
|
|
34
11
|
const check = () => {
|
|
35
12
|
const now = Date.now();
|
|
36
13
|
if (now - lastMutation >= quietMs || settled) {
|
|
@@ -45,13 +22,7 @@ export async function waitForContentStable(page, options) {
|
|
|
45
22
|
}
|
|
46
23
|
requestAnimationFrame(check);
|
|
47
24
|
};
|
|
48
|
-
|
|
49
|
-
setTimeout(() => {
|
|
50
|
-
settled = true;
|
|
51
|
-
observer.disconnect();
|
|
52
|
-
resolve();
|
|
53
|
-
}, timeoutMs);
|
|
54
|
-
// Start checking after an initial quiet period
|
|
25
|
+
setTimeout(() => { settled = true; observer.disconnect(); resolve(); }, timeoutMs);
|
|
55
26
|
setTimeout(check, quietMs);
|
|
56
27
|
});
|
|
57
28
|
}, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });
|
package/dist/types.d.ts
CHANGED
|
@@ -309,7 +309,7 @@ export interface PeelResult {
|
|
|
309
309
|
*/
|
|
310
310
|
readability?: import('./core/readability.js').ReadabilityResult;
|
|
311
311
|
/** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
|
|
312
|
-
domainData?: import('./core/domain-extractors.js').DomainExtractResult;
|
|
312
|
+
domainData?: import('./core/domain-extractors-basic.js').DomainExtractResult;
|
|
313
313
|
/** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
|
|
314
314
|
quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
|
|
315
315
|
/** Per-stage timing breakdown in milliseconds. */
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.81",
|
|
3
|
+
"version": "0.21.83",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|