webpeel 0.21.80 → 0.21.81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors-basic.d.ts +25 -0
- package/dist/core/domain-extractors-basic.js +31 -0
- package/dist/core/pipeline.js +70 -28
- package/dist/core/strategy-hooks.d.ts +64 -0
- package/dist/core/strategy-hooks.js +42 -0
- package/dist/server/premium/challenge.d.ts +8 -0
- package/dist/server/premium/challenge.js +8 -0
- package/dist/server/premium/extractors.d.ts +10 -0
- package/dist/server/premium/extractors.js +10 -0
- package/dist/server/premium/index.d.ts +3 -0
- package/dist/server/premium/index.js +15 -0
- package/dist/server/premium/spa-detection.d.ts +17 -0
- package/dist/server/premium/spa-detection.js +39 -0
- package/dist/server/premium/stability.d.ts +23 -0
- package/dist/server/premium/stability.js +58 -0
- package/package.json +1 -1
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Basic domain extraction — public/free tier.
|
|
3
|
+
*
|
|
4
|
+
* Currently a stub: no domains are handled and every function returns null.
|
|
5
|
+
* Full 55+ domain extractors are premium/server-only.
|
|
6
|
+
*
|
|
7
|
+
* This module is safe to include in the npm package.
|
|
8
|
+
* The full `domain-extractors.ts` is compiled for the server
|
|
9
|
+
* but wired in only when premium hooks are registered.
|
|
10
|
+
*/
|
|
11
|
+
import type { DomainExtractResult } from './domain-extractors.js';
|
|
12
|
+
/**
|
|
13
|
+
* Basic domain data extractor — free tier stub.
|
|
14
|
+
*
|
|
15
|
+
* Always returns null (delegates all extraction to the normal pipeline).
|
|
16
|
+
* Premium servers override this via the `extractDomainData` strategy hook.
|
|
17
|
+
*/
|
|
18
|
+
export declare function extractDomainDataBasic(_html: string, _url: string): Promise<DomainExtractResult | null>;
|
|
19
|
+
/**
|
|
20
|
+
* Basic domain extractor lookup — free tier stub.
|
|
21
|
+
*
|
|
22
|
+
* Always returns null (no domain is recognized in basic mode).
|
|
23
|
+
* Premium servers override this via the `getDomainExtractor` strategy hook.
|
|
24
|
+
*/
|
|
25
|
+
export declare function getDomainExtractorBasic(_url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Basic domain extraction — public/free tier.
|
|
3
|
+
*
|
|
4
|
+
* Currently a stub: no domains are handled and every function returns null.
|
|
5
|
+
* Full 55+ domain extractors are premium/server-only.
|
|
6
|
+
*
|
|
7
|
+
* This module is safe to include in the npm package.
|
|
8
|
+
* The full `domain-extractors.ts` is compiled for the server
|
|
9
|
+
* but wired in only when premium hooks are registered.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Free-tier placeholder for domain-specific extraction.
 *
 * Both arguments are ignored; the returned promise always resolves to null,
 * which tells the caller to rely on the regular fetch + markdown pipeline.
 * Premium servers replace this via the `extractDomainData` strategy hook.
 *
 * @param _html Page HTML (unused).
 * @param _url  Page URL (unused).
 * @returns A promise that resolves to null.
 */
export async function extractDomainDataBasic(_html, _url) {
    // No domain-specific extraction in the basic (free) tier; the premium
    // hook is what provides the 55+ extractors (Twitter, Reddit, GitHub, HN, etc.)
    const noDomainData = null;
    return noDomainData;
}
|
|
23
|
+
/**
 * Free-tier placeholder for the domain extractor lookup.
 *
 * No URL is recognised in basic mode, so the result is always null.
 * Premium servers replace this via the `getDomainExtractor` strategy hook.
 *
 * @param _url URL to look up (unused).
 * @returns Always null.
 */
export function getDomainExtractorBasic(_url) {
    const noExtractor = null;
    return noExtractor;
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -14,7 +14,28 @@ import { autoScroll as runAutoScroll } from './actions.js';
|
|
|
14
14
|
import { extractStructured } from './extract.js';
|
|
15
15
|
import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
|
|
16
16
|
import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
|
|
17
|
-
import {
|
|
17
|
+
import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
|
|
18
|
+
import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
|
|
19
|
+
// Lazy-loaded full extractors — available in repo/server, absent in npm package.
// The dynamic import avoids hard failures when domain-extractors.js is excluded from npm.
let _fullExtractorsLoaded = false; // true once a load attempt has been started
let _fullExtractorsPromise = null; // memoised load attempt shared by all callers
let _fullExtractDomainData = null;
let _fullGetDomainExtractor = null;
/**
 * Load the full domain extractors at most once.
 *
 * Every call returns the same promise, so a caller that awaits it is
 * guaranteed the import has settled before it reads `_fullExtractDomainData`
 * or `_fullGetDomainExtractor`. (Previously a boolean flag was set before the
 * import resolved, so a second caller returned immediately while the import
 * was still pending and saw the extractors as missing.)
 *
 * The promise never rejects: when the module cannot be imported the two
 * extractor references simply stay null and the basic stubs are used.
 *
 * @returns A promise that resolves (to undefined) once the load attempt has finished.
 */
function loadFullExtractors() {
    if (_fullExtractorsPromise)
        return _fullExtractorsPromise;
    _fullExtractorsLoaded = true;
    _fullExtractorsPromise = (async () => {
        try {
            const mod = await import('./domain-extractors.js');
            _fullExtractDomainData = mod.extractDomainData;
            _fullGetDomainExtractor = mod.getDomainExtractor;
        }
        catch {
            // Not available (npm package) — basic stubs will be used
        }
    })();
    return _fullExtractorsPromise;
}
// Eagerly start loading (non-blocking). The promise never rejects, so it is
// safe to leave it un-awaited here.
void loadFullExtractors();
|
|
18
39
|
import { extractReadableContent } from './readability.js';
|
|
19
40
|
import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
20
41
|
import { Timer } from './timing.js';
|
|
@@ -24,6 +45,38 @@ import { sanitizeForLLM } from './prompt-guard.js';
|
|
|
24
45
|
import { getSourceCredibility } from './source-credibility.js';
|
|
25
46
|
import { createLogger } from './logger.js';
|
|
26
47
|
const log = createLogger('pipeline');
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Hook-aware wrappers — route through premium hooks, fall back to basic stubs
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
/**
 * Check if a URL has a domain extractor.
 * Priority: premium hook → full extractors (repo/server) → basic stub.
 *
 * Only the first available lookup is consulted; there is no fall-through to
 * the next tier when it reports no match.
 *
 * This check is synchronous and does not wait for the lazy import of the full
 * extractors: until that import has finished, `_fullGetDomainExtractor` is
 * still null and the basic stub answers instead.
 *
 * A lookup result of `undefined` is treated the same as `null` (no extractor),
 * so a hook that returns nothing for unknown URLs is not mistaken for a match.
 *
 * @param url URL to look up.
 * @returns true when the selected lookup returns an extractor for `url`.
 */
function hasDomainExtractor(url) {
    const lookup = getDomainExtractorHook() || _fullGetDomainExtractor || getDomainExtractorBasic;
    return lookup(url) != null;
}
|
|
65
|
+
/**
 * Run domain extraction for the given HTML and URL.
 *
 * Resolution order: a registered premium hook wins outright; otherwise the
 * full extractors are used once their lazy load has settled; otherwise the
 * basic stub is called.
 *
 * @param html Page HTML (callers may pass an empty string).
 * @param url  Page URL.
 * @returns Whatever the selected extractor returns.
 */
async function runDomainExtract(html, url) {
    const premiumExtract = getDomainExtractHook();
    if (!premiumExtract) {
        // Make sure the lazy import has finished before deciding between the
        // full extractors (repo/server build) and the npm fallback stub.
        await loadFullExtractors();
        const extract = _fullExtractDomainData ? _fullExtractDomainData : extractDomainDataBasic;
        return extract(html, url);
    }
    return premiumExtract(html, url);
}
|
|
27
80
|
/** Create the initial PipelineContext with defaults */
|
|
28
81
|
export function createContext(url, options) {
|
|
29
82
|
return {
|
|
@@ -147,27 +200,16 @@ export function normalizeOptions(ctx) {
|
|
|
147
200
|
ctx.render = true;
|
|
148
201
|
}
|
|
149
202
|
// Auto-detect SPAs that require browser rendering (no --render flag needed)
|
|
203
|
+
// Premium hook provides the full SPA domain list; basic ships with empty defaults (no SPA auto-detection).
|
|
150
204
|
if (!ctx.render) {
|
|
151
|
-
const
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
'www.tripadvisor.com',
|
|
160
|
-
'www.indeed.com',
|
|
161
|
-
'www.glassdoor.com',
|
|
162
|
-
'www.zillow.com', // already handled but backup
|
|
163
|
-
'app.webpeel.dev', // our own dashboard is a SPA
|
|
164
|
-
]);
|
|
165
|
-
// More specific: some google.com paths need render, not all
|
|
166
|
-
const SPA_URL_PATTERNS = [
|
|
167
|
-
/google\.com\/travel/,
|
|
168
|
-
/google\.com\/maps/,
|
|
169
|
-
/google\.com\/shopping/,
|
|
170
|
-
];
|
|
205
|
+
const spaDomainsHook = getSPADomainsHook();
|
|
206
|
+
const spaPatternsHook = getSPAPatternsHook();
|
|
207
|
+
// Basic SPA defaults — intentionally empty for the free tier
|
|
208
|
+
const DEFAULT_SPA_DOMAINS = new Set([]);
|
|
209
|
+
const DEFAULT_SPA_PATTERNS = [];
|
|
210
|
+
// Premium hook supplies its full list (used instead of the defaults); basic uses defaults
|
|
211
|
+
const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
|
|
212
|
+
const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
|
|
171
213
|
try {
|
|
172
214
|
const hostname = new URL(ctx.url).hostname;
|
|
173
215
|
if (SPA_DOMAINS.has(hostname)) {
|
|
@@ -304,10 +346,10 @@ export async function fetchContent(ctx) {
|
|
|
304
346
|
const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
|
|
305
347
|
// Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
|
|
306
348
|
// This avoids expensive browser fetches that often get blocked
|
|
307
|
-
if (
|
|
349
|
+
if (hasDomainExtractor(ctx.url)) {
|
|
308
350
|
try {
|
|
309
351
|
ctx.timer.mark('domainApiFirst');
|
|
310
|
-
const ddResult = await
|
|
352
|
+
const ddResult = await runDomainExtract('', ctx.url);
|
|
311
353
|
ctx.timer.end('domainApiFirst');
|
|
312
354
|
if (ddResult && ddResult.cleanContent.length > 50) {
|
|
313
355
|
ctx.domainData = ddResult;
|
|
@@ -385,9 +427,9 @@ export async function fetchContent(ctx) {
|
|
|
385
427
|
}
|
|
386
428
|
catch (fetchError) {
|
|
387
429
|
// If fetch failed but we have a domain extractor, try it as fallback
|
|
388
|
-
if (
|
|
430
|
+
if (hasDomainExtractor(ctx.url)) {
|
|
389
431
|
try {
|
|
390
|
-
const ddResult = await
|
|
432
|
+
const ddResult = await runDomainExtract('', ctx.url);
|
|
391
433
|
if (ddResult && ddResult.cleanContent.length > 50) {
|
|
392
434
|
ctx.timer.end('fetch');
|
|
393
435
|
ctx.domainData = ddResult;
|
|
@@ -1041,14 +1083,14 @@ export async function postProcess(ctx) {
|
|
|
1041
1083
|
}
|
|
1042
1084
|
// Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
|
|
1043
1085
|
// Fires when URL matches a known domain. Replaces content with clean markdown.
|
|
1044
|
-
if (
|
|
1086
|
+
if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
|
|
1045
1087
|
try {
|
|
1046
1088
|
ctx.timer.mark('domainExtract');
|
|
1047
1089
|
// Try raw HTML first, then fall back to readability-processed content
|
|
1048
1090
|
// (some SPAs like Google Flights have data only after readability processing)
|
|
1049
|
-
let ddResult = await
|
|
1091
|
+
let ddResult = await runDomainExtract(fetchResult.html, fetchResult.url);
|
|
1050
1092
|
if (!ddResult && ctx.content) {
|
|
1051
|
-
ddResult = await
|
|
1093
|
+
ddResult = await runDomainExtract(ctx.content, fetchResult.url);
|
|
1052
1094
|
}
|
|
1053
1095
|
ctx.timer.end('domainExtract');
|
|
1054
1096
|
if (ddResult) {
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
* All hook methods are optional — unset hooks are simply skipped.
|
|
11
11
|
*/
|
|
12
12
|
import type { FetchResult } from './fetcher.js';
|
|
13
|
+
import type { DomainExtractResult } from './domain-extractors.js';
|
|
13
14
|
export interface StrategyResult extends FetchResult {
|
|
14
15
|
method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
|
|
15
16
|
/**
|
|
@@ -65,6 +66,39 @@ export interface StrategyHooks {
|
|
|
65
66
|
* Only called when `shouldRace()` returns true. Default: 2000.
|
|
66
67
|
*/
|
|
67
68
|
getRaceTimeoutMs?(): number;
|
|
69
|
+
/**
|
|
70
|
+
* Premium domain extraction hook — 55+ domain extractors.
|
|
71
|
+
* Return null to fall back to basic/no extraction.
|
|
72
|
+
*/
|
|
73
|
+
extractDomainData?(html: string, url: string): Promise<DomainExtractResult | null>;
|
|
74
|
+
/**
|
|
75
|
+
* Looks up the domain extractor for a URL: returns the extractor function, or null when none matches.
|
|
76
|
+
* Premium knows which domains have extractors; basic returns null for all.
|
|
77
|
+
*/
|
|
78
|
+
getDomainExtractor?(url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
|
|
79
|
+
/**
|
|
80
|
+
* Premium SPA domain list — knows which sites require browser rendering.
|
|
81
|
+
* Basic: returns empty set (no SPA auto-detection).
|
|
82
|
+
*/
|
|
83
|
+
getSPADomains?(): Set<string>;
|
|
84
|
+
/**
|
|
85
|
+
* Premium SPA URL patterns — matches specific paths needing render.
|
|
86
|
+
* Basic: returns empty array.
|
|
87
|
+
*/
|
|
88
|
+
getSPAPatterns?(): RegExp[];
|
|
89
|
+
/**
|
|
90
|
+
* Premium CAPTCHA/challenge solving hook.
|
|
91
|
+
* Return null to fall back to default challenge handling.
|
|
92
|
+
*/
|
|
93
|
+
solveChallenge?(page: any, url: string): Promise<{
|
|
94
|
+
solved: boolean;
|
|
95
|
+
html?: string;
|
|
96
|
+
} | null>;
|
|
97
|
+
/**
|
|
98
|
+
* Premium wait-for-stable content logic — smarter than waitForLoadState.
|
|
99
|
+
* Return null/undefined to fall back to default wait logic.
|
|
100
|
+
*/
|
|
101
|
+
waitForContentStable?(page: any, options?: any): Promise<void>;
|
|
68
102
|
}
|
|
69
103
|
/**
|
|
70
104
|
* Register premium strategy hooks. Should be called once at server startup.
|
|
@@ -79,3 +113,33 @@ export declare function clearStrategyHooks(): void;
|
|
|
79
113
|
* Retrieve the current hooks (internal — used by strategies.ts).
|
|
80
114
|
*/
|
|
81
115
|
export declare function getStrategyHooks(): Readonly<StrategyHooks>;
|
|
116
|
+
/**
|
|
117
|
+
* Get the premium domain extraction hook, if registered.
|
|
118
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
119
|
+
*/
|
|
120
|
+
export declare function getDomainExtractHook(): StrategyHooks['extractDomainData'];
|
|
121
|
+
/**
|
|
122
|
+
* Get the premium domain extractor lookup hook, if registered.
|
|
123
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
124
|
+
*/
|
|
125
|
+
export declare function getDomainExtractorHook(): StrategyHooks['getDomainExtractor'];
|
|
126
|
+
/**
|
|
127
|
+
* Get the premium SPA domains hook, if registered.
|
|
128
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
129
|
+
*/
|
|
130
|
+
export declare function getSPADomainsHook(): StrategyHooks['getSPADomains'];
|
|
131
|
+
/**
|
|
132
|
+
* Get the premium SPA patterns hook, if registered.
|
|
133
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
134
|
+
*/
|
|
135
|
+
export declare function getSPAPatternsHook(): StrategyHooks['getSPAPatterns'];
|
|
136
|
+
/**
|
|
137
|
+
* Get the premium challenge solver hook, if registered.
|
|
138
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
139
|
+
*/
|
|
140
|
+
export declare function getChallengeHook(): StrategyHooks['solveChallenge'];
|
|
141
|
+
/**
|
|
142
|
+
* Get the premium content stability hook, if registered.
|
|
143
|
+
* Returns undefined when no premium hooks are active (basic/npm mode).
|
|
144
|
+
*/
|
|
145
|
+
export declare function getStabilityHook(): StrategyHooks['waitForContentStable'];
|
|
@@ -30,3 +30,45 @@ export function clearStrategyHooks() {
|
|
|
30
30
|
export function getStrategyHooks() {
|
|
31
31
|
return registeredHooks;
|
|
32
32
|
}
|
|
33
|
+
/**
 * Accessor for the registered `extractDomainData` hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getDomainExtractHook() {
    const { extractDomainData } = registeredHooks;
    return extractDomainData;
}
|
|
40
|
+
/**
 * Accessor for the registered `getDomainExtractor` lookup hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getDomainExtractorHook() {
    const { getDomainExtractor } = registeredHooks;
    return getDomainExtractor;
}
|
|
47
|
+
/**
 * Accessor for the registered `getSPADomains` hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getSPADomainsHook() {
    const { getSPADomains } = registeredHooks;
    return getSPADomains;
}
|
|
54
|
+
/**
 * Accessor for the registered `getSPAPatterns` hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getSPAPatternsHook() {
    const { getSPAPatterns } = registeredHooks;
    return getSPAPatterns;
}
|
|
61
|
+
/**
 * Accessor for the registered `solveChallenge` hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getChallengeHook() {
    const { solveChallenge } = registeredHooks;
    return solveChallenge;
}
|
|
68
|
+
/**
 * Accessor for the registered `waitForContentStable` hook.
 *
 * @returns The hook function, or undefined when none has been registered
 *          (basic/npm mode).
 */
export function getStabilityHook() {
    const { waitForContentStable } = registeredHooks;
    return waitForContentStable;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium challenge solver — server-only wrapper.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the challenge-solver functionality for use as a strategy hook.
|
|
5
|
+
* The npm package handles challenges inline in pipeline.ts (basic handling).
|
|
6
|
+
* Premium servers can wire in enhanced challenge solving via hooks.
|
|
7
|
+
*/
|
|
8
|
+
export { solveChallenge } from '../../core/challenge-solver.js';
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium challenge solver — server-only wrapper.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the challenge-solver functionality for use as a strategy hook.
|
|
5
|
+
* The npm package handles challenges inline in pipeline.ts (basic handling).
|
|
6
|
+
* Premium servers can wire in enhanced challenge solving via hooks.
|
|
7
|
+
*/
|
|
8
|
+
export { solveChallenge } from '../../core/challenge-solver.js';
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium domain extractors — server-only wrapper.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the full extractDomainData and getDomainExtractor functions
|
|
5
|
+
* from core/domain-extractors.ts for use as strategy hooks.
|
|
6
|
+
*
|
|
7
|
+
* The npm package uses basic stubs (always return null).
|
|
8
|
+
* When premium hooks are registered, these full extractors are wired in.
|
|
9
|
+
*/
|
|
10
|
+
export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium domain extractors — server-only wrapper.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the full extractDomainData and getDomainExtractor functions
|
|
5
|
+
* from core/domain-extractors.ts for use as strategy hooks.
|
|
6
|
+
*
|
|
7
|
+
* The npm package uses basic stubs (always return null).
|
|
8
|
+
* When premium hooks are registered, these full extractors are wired in.
|
|
9
|
+
*/
|
|
10
|
+
export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
|
|
@@ -5,6 +5,9 @@
|
|
|
5
5
|
* • SWR (stale-while-revalidate) response cache
|
|
6
6
|
* • Domain intelligence (learns which sites need browser/stealth)
|
|
7
7
|
* • Parallel race strategy (starts browser if simple fetch is slow)
|
|
8
|
+
* • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
|
|
9
|
+
* • SPA auto-detection (travel, jobs, real estate sites)
|
|
10
|
+
* • Content stability detection (smart DOM mutation monitoring)
|
|
8
11
|
*
|
|
9
12
|
* These modules are NOT shipped in the npm package.
|
|
10
13
|
*/
|
|
@@ -5,12 +5,18 @@
|
|
|
5
5
|
* • SWR (stale-while-revalidate) response cache
|
|
6
6
|
* • Domain intelligence (learns which sites need browser/stealth)
|
|
7
7
|
* • Parallel race strategy (starts browser if simple fetch is slow)
|
|
8
|
+
* • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
|
|
9
|
+
* • SPA auto-detection (travel, jobs, real estate sites)
|
|
10
|
+
* • Content stability detection (smart DOM mutation monitoring)
|
|
8
11
|
*
|
|
9
12
|
* These modules are NOT shipped in the npm package.
|
|
10
13
|
*/
|
|
11
14
|
import { registerStrategyHooks } from '../../core/strategy-hooks.js';
|
|
12
15
|
import { createSWRCacheHooks } from './swr-cache.js';
|
|
13
16
|
import { createDomainIntelHooks } from './domain-intel.js';
|
|
17
|
+
import { extractDomainData, getDomainExtractor } from './extractors.js';
|
|
18
|
+
import { SPA_DOMAINS, SPA_URL_PATTERNS } from './spa-detection.js';
|
|
19
|
+
import { waitForContentStable } from './stability.js';
|
|
14
20
|
export { clearDomainIntel } from './domain-intel.js';
|
|
15
21
|
/**
|
|
16
22
|
* Wire all premium hooks into the core strategy layer.
|
|
@@ -31,5 +37,14 @@ export function registerPremiumHooks() {
|
|
|
31
37
|
// Parallel race strategy
|
|
32
38
|
shouldRace: () => true,
|
|
33
39
|
getRaceTimeoutMs: () => 2000,
|
|
40
|
+
// Premium domain extraction (55+ extractors)
|
|
41
|
+
extractDomainData,
|
|
42
|
+
// Premium domain extractor lookup
|
|
43
|
+
getDomainExtractor: (url) => getDomainExtractor(url),
|
|
44
|
+
// Premium SPA detection
|
|
45
|
+
getSPADomains: () => SPA_DOMAINS,
|
|
46
|
+
getSPAPatterns: () => SPA_URL_PATTERNS,
|
|
47
|
+
// Premium content stability (DOM mutation monitoring)
|
|
48
|
+
waitForContentStable,
|
|
34
49
|
});
|
|
35
50
|
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium SPA detection — server-only.
|
|
3
|
+
*
|
|
4
|
+
* Full list of domains and URL patterns that require browser rendering.
|
|
5
|
+
* The npm package ships with empty defaults (no SPA auto-detection).
|
|
6
|
+
* Premium servers register these via strategy hooks.
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Domains that are known SPAs requiring browser rendering.
|
|
10
|
+
* Includes travel, real estate, job boards, and other dynamic sites.
|
|
11
|
+
*/
|
|
12
|
+
export declare const SPA_DOMAINS: Set<string>;
|
|
13
|
+
/**
|
|
14
|
+
* URL patterns that match SPA routes on mixed-content domains.
|
|
15
|
+
* E.g. google.com/travel is SPA, but google.com/search is not.
|
|
16
|
+
*/
|
|
17
|
+
export declare const SPA_URL_PATTERNS: RegExp[];
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium SPA detection — server-only.
|
|
3
|
+
*
|
|
4
|
+
* Full list of domains and URL patterns that require browser rendering.
|
|
5
|
+
* The npm package ships with empty defaults (no SPA auto-detection).
|
|
6
|
+
* Premium servers register these via strategy hooks.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Hostnames of known single-page apps that need browser rendering.
 *
 * Entries are full hostnames (including the `www.` / `app.` prefix), grouped
 * below by category; the groups are flattened, in order, into one Set.
 */
export const SPA_DOMAINS = new Set(Object.values({
    google: ['www.google.com', 'flights.google.com'],
    travel: [
        'www.airbnb.com',
        'www.booking.com',
        'www.expedia.com',
        'www.kayak.com',
        'www.skyscanner.com',
        'www.tripadvisor.com',
    ],
    jobs: ['www.indeed.com', 'www.glassdoor.com'],
    realEstate: ['www.zillow.com'],
    ownDashboard: ['app.webpeel.dev'],
}).flat());
|
|
31
|
+
/**
 * Regular expressions for SPA routes on domains that also serve
 * non-SPA pages: google.com/travel, google.com/maps and google.com/shopping.
 * The patterns are unanchored and carry no flags.
 */
export const SPA_URL_PATTERNS = ['travel', 'maps', 'shopping'].map((section) => new RegExp(`google\\.com\\/${section}`));
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium content stability detection — server-only.
|
|
3
|
+
*
|
|
4
|
+
* Provides smarter content-stability waiting logic than the default
|
|
5
|
+
* waitForLoadState('networkidle'). Monitors DOM mutations and network
|
|
6
|
+
* activity to determine when a page has truly finished rendering.
|
|
7
|
+
*
|
|
8
|
+
* The npm package uses default Playwright waitForLoadState.
|
|
9
|
+
* Premium servers can wire this in via the waitForContentStable hook.
|
|
10
|
+
*/
|
|
11
|
+
export interface StabilityOptions {
|
|
12
|
+
/** Maximum time to wait (ms). Default: 5000. */
|
|
13
|
+
timeoutMs?: number;
|
|
14
|
+
/** Minimum quiet period before declaring stable (ms). Default: 500. */
|
|
15
|
+
quietMs?: number;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Wait for page content to stabilize by monitoring DOM mutations.
|
|
19
|
+
*
|
|
20
|
+
* More reliable than waitForLoadState('networkidle') for SPAs that
|
|
21
|
+
* progressively render content.
|
|
22
|
+
*/
|
|
23
|
+
export declare function waitForContentStable(page: any, options?: StabilityOptions): Promise<void>;
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Premium content stability detection — server-only.
|
|
3
|
+
*
|
|
4
|
+
* Provides smarter content-stability waiting logic than the default
|
|
5
|
+
* waitForLoadState('networkidle'). Monitors DOM mutations and network
|
|
6
|
+
* activity to determine when a page has truly finished rendering.
|
|
7
|
+
*
|
|
8
|
+
* The npm package uses default Playwright waitForLoadState.
|
|
9
|
+
* Premium servers can wire this in via the waitForContentStable hook.
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
* Wait for page content to stabilize by monitoring DOM mutations.
|
|
13
|
+
*
|
|
14
|
+
* More reliable than waitForLoadState('networkidle') for SPAs that
|
|
15
|
+
* progressively render content.
|
|
16
|
+
*/
|
|
17
|
+
export async function waitForContentStable(page, options) {
|
|
18
|
+
const timeout = options?.timeoutMs ?? 5000;
|
|
19
|
+
const quiet = options?.quietMs ?? 500;
|
|
20
|
+
const start = Date.now();
|
|
21
|
+
// Use page.evaluate to monitor DOM mutations
|
|
22
|
+
await page.evaluate(({ quietMs, timeoutMs }) => {
|
|
23
|
+
return new Promise((resolve) => {
|
|
24
|
+
let lastMutation = Date.now();
|
|
25
|
+
let settled = false;
|
|
26
|
+
const observer = new MutationObserver(() => {
|
|
27
|
+
lastMutation = Date.now();
|
|
28
|
+
});
|
|
29
|
+
observer.observe(document.body, {
|
|
30
|
+
childList: true,
|
|
31
|
+
subtree: true,
|
|
32
|
+
characterData: true,
|
|
33
|
+
});
|
|
34
|
+
const check = () => {
|
|
35
|
+
const now = Date.now();
|
|
36
|
+
if (now - lastMutation >= quietMs || settled) {
|
|
37
|
+
observer.disconnect();
|
|
38
|
+
resolve();
|
|
39
|
+
return;
|
|
40
|
+
}
|
|
41
|
+
if (now - lastMutation > timeoutMs) {
|
|
42
|
+
observer.disconnect();
|
|
43
|
+
resolve();
|
|
44
|
+
return;
|
|
45
|
+
}
|
|
46
|
+
requestAnimationFrame(check);
|
|
47
|
+
};
|
|
48
|
+
// Hard timeout
|
|
49
|
+
setTimeout(() => {
|
|
50
|
+
settled = true;
|
|
51
|
+
observer.disconnect();
|
|
52
|
+
resolve();
|
|
53
|
+
}, timeoutMs);
|
|
54
|
+
// Start checking after an initial quiet period
|
|
55
|
+
setTimeout(check, quietMs);
|
|
56
|
+
});
|
|
57
|
+
}, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });
|
|
58
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.81",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|