webpeel 0.21.80 → 0.21.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Basic domain extraction — public/free tier.
3
+ *
4
+ * Handles a few common domains with simple logic.
5
+ * Full 55+ domain extractors are premium/server-only.
6
+ *
7
+ * This module is safe to include in the npm package.
8
+ * The full `domain-extractors.ts` is compiled for the server
9
+ * but wired in only when premium hooks are registered.
10
+ */
11
+ import type { DomainExtractResult } from './domain-extractors.js';
12
+ /**
13
+ * Basic domain data extractor — free tier stub.
14
+ *
15
+ * Always returns null (delegates all extraction to the normal pipeline).
16
+ * Premium servers override this via the `extractDomainData` strategy hook.
17
+ */
18
+ export declare function extractDomainDataBasic(_html: string, _url: string): Promise<DomainExtractResult | null>;
19
+ /**
20
+ * Basic domain extractor lookup — free tier stub.
21
+ *
22
+ * Always returns null (no domain is recognized in basic mode).
23
+ * Premium servers override this via the `getDomainExtractor` strategy hook.
24
+ */
25
+ export declare function getDomainExtractorBasic(_url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
@@ -0,0 +1,31 @@
1
+ /**
2
+ * Basic domain extraction — public/free tier.
3
+ *
4
+ * Handles a few common domains with simple logic.
5
+ * Full 55+ domain extractors are premium/server-only.
6
+ *
7
+ * This module is safe to include in the npm package.
8
+ * The full `domain-extractors.ts` is compiled for the server
9
+ * but wired in only when premium hooks are registered.
10
+ */
11
+ /**
12
+ * Basic domain data extractor — free tier stub.
13
+ *
14
+ * Always returns null (delegates all extraction to the normal pipeline).
15
+ * Premium servers override this via the `extractDomainData` strategy hook.
16
+ */
17
+ export async function extractDomainDataBasic(_html, _url) {
18
+ // Basic (free) tier: no domain-specific extraction.
19
+ // The normal fetch + markdown pipeline handles everything.
20
+ // Premium hook provides 55+ domain extractors (Twitter, Reddit, GitHub, HN, etc.)
21
+ return null;
22
+ }
23
+ /**
24
+ * Basic domain extractor lookup — free tier stub.
25
+ *
26
+ * Always returns null (no domain is recognized in basic mode).
27
+ * Premium servers override this via the `getDomainExtractor` strategy hook.
28
+ */
29
+ export function getDomainExtractorBasic(_url) {
30
+ return null;
31
+ }
@@ -14,7 +14,28 @@ import { autoScroll as runAutoScroll } from './actions.js';
14
14
  import { extractStructured } from './extract.js';
15
15
  import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
16
16
  import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
- import { extractDomainData, getDomainExtractor } from './domain-extractors.js';
17
+ import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
18
+ import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
+ // Lazy-loaded full extractors — available in repo/server, absent in npm package.
20
+ // The dynamic import avoids hard failures when domain-extractors.js is excluded from npm.
21
+ let _fullExtractorsLoaded = false;
22
+ let _fullExtractDomainData = null;
23
+ let _fullGetDomainExtractor = null;
24
+ async function loadFullExtractors() {
25
+ if (_fullExtractorsLoaded)
26
+ return;
27
+ _fullExtractorsLoaded = true;
28
+ try {
29
+ const mod = await import('./domain-extractors.js');
30
+ _fullExtractDomainData = mod.extractDomainData;
31
+ _fullGetDomainExtractor = mod.getDomainExtractor;
32
+ }
33
+ catch {
34
+ // Not available (npm package) — basic stubs will be used
35
+ }
36
+ }
37
+ // Eagerly start loading (non-blocking)
38
+ loadFullExtractors();
18
39
  import { extractReadableContent } from './readability.js';
19
40
  import { quickAnswer as runQuickAnswer } from './quick-answer.js';
20
41
  import { Timer } from './timing.js';
@@ -24,6 +45,38 @@ import { sanitizeForLLM } from './prompt-guard.js';
24
45
  import { getSourceCredibility } from './source-credibility.js';
25
46
  import { createLogger } from './logger.js';
26
47
  const log = createLogger('pipeline');
48
+ // ---------------------------------------------------------------------------
49
+ // Hook-aware wrappers — route through premium hooks, fall back to basic stubs
50
+ // ---------------------------------------------------------------------------
51
+ /**
52
+ * Check if a URL has a domain extractor.
53
+ * Priority: premium hook → full extractors (repo/server) → basic stub.
54
+ */
55
+ function hasDomainExtractor(url) {
56
+ const hookFn = getDomainExtractorHook();
57
+ if (hookFn)
58
+ return hookFn(url) !== null;
59
+ // Full extractors available (repo/server build)?
60
+ if (_fullGetDomainExtractor)
61
+ return _fullGetDomainExtractor(url) !== null;
62
+ // npm package fallback — basic stubs
63
+ return getDomainExtractorBasic(url) !== null;
64
+ }
65
+ /**
66
+ * Run domain extraction on HTML/URL.
67
+ * Priority: premium hook → full extractors (repo/server) → basic stub.
68
+ */
69
+ async function runDomainExtract(html, url) {
70
+ const hookFn = getDomainExtractHook();
71
+ if (hookFn)
72
+ return hookFn(html, url);
73
+ // Full extractors available (repo/server build)?
74
+ await loadFullExtractors(); // Ensure loaded
75
+ if (_fullExtractDomainData)
76
+ return _fullExtractDomainData(html, url);
77
+ // npm package fallback — basic stubs
78
+ return extractDomainDataBasic(html, url);
79
+ }
27
80
  /** Create the initial PipelineContext with defaults */
28
81
  export function createContext(url, options) {
29
82
  return {
@@ -147,27 +200,16 @@ export function normalizeOptions(ctx) {
147
200
  ctx.render = true;
148
201
  }
149
202
  // Auto-detect SPAs that require browser rendering (no --render flag needed)
203
+ // Premium hook provides full SPA domain list; basic has an empty default set.
150
204
  if (!ctx.render) {
151
- const SPA_DOMAINS = new Set([
152
- 'www.google.com', // Google Flights, Maps, Shopping etc.
153
- 'flights.google.com',
154
- 'www.airbnb.com',
155
- 'www.booking.com',
156
- 'www.expedia.com',
157
- 'www.kayak.com',
158
- 'www.skyscanner.com',
159
- 'www.tripadvisor.com',
160
- 'www.indeed.com',
161
- 'www.glassdoor.com',
162
- 'www.zillow.com', // already handled but backup
163
- 'app.webpeel.dev', // our own dashboard is a SPA
164
- ]);
165
- // More specific: some google.com paths need render, not all
166
- const SPA_URL_PATTERNS = [
167
- /google\.com\/travel/,
168
- /google\.com\/maps/,
169
- /google\.com\/shopping/,
170
- ];
205
+ const spaDomainsHook = getSPADomainsHook();
206
+ const spaPatternsHook = getSPAPatternsHook();
207
+ // Basic SPA defaults — empty in the free tier (no SPA auto-detection without premium hooks)
208
+ const DEFAULT_SPA_DOMAINS = new Set([]);
209
+ const DEFAULT_SPA_PATTERNS = [];
210
+ // Premium hook merges its full list; basic uses defaults
211
+ const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
212
+ const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
171
213
  try {
172
214
  const hostname = new URL(ctx.url).hostname;
173
215
  if (SPA_DOMAINS.has(hostname)) {
@@ -304,10 +346,10 @@ export async function fetchContent(ctx) {
304
346
  const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
305
347
  // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
306
348
  // This avoids expensive browser fetches that often get blocked
307
- if (getDomainExtractor(ctx.url)) {
349
+ if (hasDomainExtractor(ctx.url)) {
308
350
  try {
309
351
  ctx.timer.mark('domainApiFirst');
310
- const ddResult = await extractDomainData('', ctx.url);
352
+ const ddResult = await runDomainExtract('', ctx.url);
311
353
  ctx.timer.end('domainApiFirst');
312
354
  if (ddResult && ddResult.cleanContent.length > 50) {
313
355
  ctx.domainData = ddResult;
@@ -385,9 +427,9 @@ export async function fetchContent(ctx) {
385
427
  }
386
428
  catch (fetchError) {
387
429
  // If fetch failed but we have a domain extractor, try it as fallback
388
- if (getDomainExtractor(ctx.url)) {
430
+ if (hasDomainExtractor(ctx.url)) {
389
431
  try {
390
- const ddResult = await extractDomainData('', ctx.url);
432
+ const ddResult = await runDomainExtract('', ctx.url);
391
433
  if (ddResult && ddResult.cleanContent.length > 50) {
392
434
  ctx.timer.end('fetch');
393
435
  ctx.domainData = ddResult;
@@ -1041,14 +1083,14 @@ export async function postProcess(ctx) {
1041
1083
  }
1042
1084
  // Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
1043
1085
  // Fires when URL matches a known domain. Replaces content with clean markdown.
1044
- if (getDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
1086
+ if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
1045
1087
  try {
1046
1088
  ctx.timer.mark('domainExtract');
1047
1089
  // Try raw HTML first, then fall back to readability-processed content
1048
1090
  // (some SPAs like Google Flights have data only after readability processing)
1049
- let ddResult = await extractDomainData(fetchResult.html, fetchResult.url);
1091
+ let ddResult = await runDomainExtract(fetchResult.html, fetchResult.url);
1050
1092
  if (!ddResult && ctx.content) {
1051
- ddResult = await extractDomainData(ctx.content, fetchResult.url);
1093
+ ddResult = await runDomainExtract(ctx.content, fetchResult.url);
1052
1094
  }
1053
1095
  ctx.timer.end('domainExtract');
1054
1096
  if (ddResult) {
@@ -10,6 +10,7 @@
10
10
  * All hook methods are optional — unset hooks are simply skipped.
11
11
  */
12
12
  import type { FetchResult } from './fetcher.js';
13
+ import type { DomainExtractResult } from './domain-extractors.js';
13
14
  export interface StrategyResult extends FetchResult {
14
15
  method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
15
16
  /**
@@ -65,6 +66,39 @@ export interface StrategyHooks {
65
66
  * Only called when `shouldRace()` returns true. Default: 2000.
66
67
  */
67
68
  getRaceTimeoutMs?(): number;
69
+ /**
70
+ * Premium domain extraction hook — 55+ domain extractors.
71
+ * Return null to fall back to basic/no extraction.
72
+ */
73
+ extractDomainData?(html: string, url: string): Promise<DomainExtractResult | null>;
74
+ /**
75
+ * Returns a function that checks if a URL has a known domain extractor.
76
+ * Premium knows which domains have extractors; basic returns null for all.
77
+ */
78
+ getDomainExtractor?(url: string): ((html: string, url: string) => Promise<DomainExtractResult | null>) | null;
79
+ /**
80
+ * Premium SPA domain list — knows which sites require browser rendering.
81
+ * Basic: returns empty set (no SPA auto-detection).
82
+ */
83
+ getSPADomains?(): Set<string>;
84
+ /**
85
+ * Premium SPA URL patterns — matches specific paths needing render.
86
+ * Basic: returns empty array.
87
+ */
88
+ getSPAPatterns?(): RegExp[];
89
+ /**
90
+ * Premium CAPTCHA/challenge solving hook.
91
+ * Return null to fall back to default challenge handling.
92
+ */
93
+ solveChallenge?(page: any, url: string): Promise<{
94
+ solved: boolean;
95
+ html?: string;
96
+ } | null>;
97
+ /**
98
+ * Premium wait-for-stable content logic — smarter than waitForLoadState.
99
+ * Return null/undefined to fall back to default wait logic.
100
+ */
101
+ waitForContentStable?(page: any, options?: any): Promise<void>;
68
102
  }
69
103
  /**
70
104
  * Register premium strategy hooks. Should be called once at server startup.
@@ -79,3 +113,33 @@ export declare function clearStrategyHooks(): void;
79
113
  * Retrieve the current hooks (internal — used by strategies.ts).
80
114
  */
81
115
  export declare function getStrategyHooks(): Readonly<StrategyHooks>;
116
+ /**
117
+ * Get the premium domain extraction hook, if registered.
118
+ * Returns undefined when no premium hooks are active (basic/npm mode).
119
+ */
120
+ export declare function getDomainExtractHook(): StrategyHooks['extractDomainData'];
121
+ /**
122
+ * Get the premium domain extractor lookup hook, if registered.
123
+ * Returns undefined when no premium hooks are active (basic/npm mode).
124
+ */
125
+ export declare function getDomainExtractorHook(): StrategyHooks['getDomainExtractor'];
126
+ /**
127
+ * Get the premium SPA domains hook, if registered.
128
+ * Returns undefined when no premium hooks are active (basic/npm mode).
129
+ */
130
+ export declare function getSPADomainsHook(): StrategyHooks['getSPADomains'];
131
+ /**
132
+ * Get the premium SPA patterns hook, if registered.
133
+ * Returns undefined when no premium hooks are active (basic/npm mode).
134
+ */
135
+ export declare function getSPAPatternsHook(): StrategyHooks['getSPAPatterns'];
136
+ /**
137
+ * Get the premium challenge solver hook, if registered.
138
+ * Returns undefined when no premium hooks are active (basic/npm mode).
139
+ */
140
+ export declare function getChallengeHook(): StrategyHooks['solveChallenge'];
141
+ /**
142
+ * Get the premium content stability hook, if registered.
143
+ * Returns undefined when no premium hooks are active (basic/npm mode).
144
+ */
145
+ export declare function getStabilityHook(): StrategyHooks['waitForContentStable'];
@@ -30,3 +30,45 @@ export function clearStrategyHooks() {
30
30
  export function getStrategyHooks() {
31
31
  return registeredHooks;
32
32
  }
33
+ /**
34
+ * Get the premium domain extraction hook, if registered.
35
+ * Returns undefined when no premium hooks are active (basic/npm mode).
36
+ */
37
+ export function getDomainExtractHook() {
38
+ return registeredHooks.extractDomainData;
39
+ }
40
+ /**
41
+ * Get the premium domain extractor lookup hook, if registered.
42
+ * Returns undefined when no premium hooks are active (basic/npm mode).
43
+ */
44
+ export function getDomainExtractorHook() {
45
+ return registeredHooks.getDomainExtractor;
46
+ }
47
+ /**
48
+ * Get the premium SPA domains hook, if registered.
49
+ * Returns undefined when no premium hooks are active (basic/npm mode).
50
+ */
51
+ export function getSPADomainsHook() {
52
+ return registeredHooks.getSPADomains;
53
+ }
54
+ /**
55
+ * Get the premium SPA patterns hook, if registered.
56
+ * Returns undefined when no premium hooks are active (basic/npm mode).
57
+ */
58
+ export function getSPAPatternsHook() {
59
+ return registeredHooks.getSPAPatterns;
60
+ }
61
+ /**
62
+ * Get the premium challenge solver hook, if registered.
63
+ * Returns undefined when no premium hooks are active (basic/npm mode).
64
+ */
65
+ export function getChallengeHook() {
66
+ return registeredHooks.solveChallenge;
67
+ }
68
+ /**
69
+ * Get the premium content stability hook, if registered.
70
+ * Returns undefined when no premium hooks are active (basic/npm mode).
71
+ */
72
+ export function getStabilityHook() {
73
+ return registeredHooks.waitForContentStable;
74
+ }
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Premium challenge solver — server-only wrapper.
3
+ *
4
+ * Re-exports the challenge-solver functionality for use as a strategy hook.
5
+ * The npm package handles challenges inline in pipeline.ts (basic handling).
6
+ * Premium servers can wire in enhanced challenge solving via hooks.
7
+ */
8
+ export { solveChallenge } from '../../core/challenge-solver.js';
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Premium challenge solver — server-only wrapper.
3
+ *
4
+ * Re-exports the challenge-solver functionality for use as a strategy hook.
5
+ * The npm package handles challenges inline in pipeline.ts (basic handling).
6
+ * Premium servers can wire in enhanced challenge solving via hooks.
7
+ */
8
+ export { solveChallenge } from '../../core/challenge-solver.js';
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Premium domain extractors — server-only wrapper.
3
+ *
4
+ * Re-exports the full extractDomainData and getDomainExtractor functions
5
+ * from core/domain-extractors.ts for use as strategy hooks.
6
+ *
7
+ * The npm package uses basic stubs (always return null).
8
+ * When premium hooks are registered, these full extractors are wired in.
9
+ */
10
+ export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Premium domain extractors — server-only wrapper.
3
+ *
4
+ * Re-exports the full extractDomainData and getDomainExtractor functions
5
+ * from core/domain-extractors.ts for use as strategy hooks.
6
+ *
7
+ * The npm package uses basic stubs (always return null).
8
+ * When premium hooks are registered, these full extractors are wired in.
9
+ */
10
+ export { extractDomainData, getDomainExtractor } from '../../core/domain-extractors.js';
@@ -5,6 +5,9 @@
5
5
  * • SWR (stale-while-revalidate) response cache
6
6
  * • Domain intelligence (learns which sites need browser/stealth)
7
7
  * • Parallel race strategy (starts browser if simple fetch is slow)
8
+ * • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
9
+ * • SPA auto-detection (travel, jobs, real estate sites)
10
+ * • Content stability detection (smart DOM mutation monitoring)
8
11
  *
9
12
  * These modules are NOT shipped in the npm package.
10
13
  */
@@ -5,12 +5,18 @@
5
5
  * • SWR (stale-while-revalidate) response cache
6
6
  * • Domain intelligence (learns which sites need browser/stealth)
7
7
  * • Parallel race strategy (starts browser if simple fetch is slow)
8
+ * • 55+ domain extractors (Twitter, Reddit, GitHub, HN, Wikipedia, etc.)
9
+ * • SPA auto-detection (travel, jobs, real estate sites)
10
+ * • Content stability detection (smart DOM mutation monitoring)
8
11
  *
9
12
  * These modules are NOT shipped in the npm package.
10
13
  */
11
14
  import { registerStrategyHooks } from '../../core/strategy-hooks.js';
12
15
  import { createSWRCacheHooks } from './swr-cache.js';
13
16
  import { createDomainIntelHooks } from './domain-intel.js';
17
+ import { extractDomainData, getDomainExtractor } from './extractors.js';
18
+ import { SPA_DOMAINS, SPA_URL_PATTERNS } from './spa-detection.js';
19
+ import { waitForContentStable } from './stability.js';
14
20
  export { clearDomainIntel } from './domain-intel.js';
15
21
  /**
16
22
  * Wire all premium hooks into the core strategy layer.
@@ -31,5 +37,14 @@ export function registerPremiumHooks() {
31
37
  // Parallel race strategy
32
38
  shouldRace: () => true,
33
39
  getRaceTimeoutMs: () => 2000,
40
+ // Premium domain extraction (55+ extractors)
41
+ extractDomainData,
42
+ // Premium domain extractor lookup
43
+ getDomainExtractor: (url) => getDomainExtractor(url),
44
+ // Premium SPA detection
45
+ getSPADomains: () => SPA_DOMAINS,
46
+ getSPAPatterns: () => SPA_URL_PATTERNS,
47
+ // Premium content stability (DOM mutation monitoring)
48
+ waitForContentStable,
34
49
  });
35
50
  }
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Premium SPA detection — server-only.
3
+ *
4
+ * Full list of domains and URL patterns that require browser rendering.
5
+ * The npm package ships with an empty default set (no SPA auto-detection).
6
+ * Premium servers register these via strategy hooks.
7
+ */
8
+ /**
9
+ * Domains that are known SPAs requiring browser rendering.
10
+ * Includes travel, real estate, job boards, and other dynamic sites.
11
+ */
12
+ export declare const SPA_DOMAINS: Set<string>;
13
+ /**
14
+ * URL patterns that match SPA routes on mixed-content domains.
15
+ * E.g. google.com/travel is SPA, but google.com/search is not.
16
+ */
17
+ export declare const SPA_URL_PATTERNS: RegExp[];
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Premium SPA detection — server-only.
3
+ *
4
+ * Full list of domains and URL patterns that require browser rendering.
5
+ * The npm package only has a minimal default set (Google, our own dashboard).
6
+ * Premium servers register these via strategy hooks.
7
+ */
8
+ /**
9
+ * Domains that are known SPAs requiring browser rendering.
10
+ * Includes travel, real estate, job boards, and other dynamic sites.
11
+ */
12
+ export const SPA_DOMAINS = new Set([
13
+ // Google properties
14
+ 'www.google.com',
15
+ 'flights.google.com',
16
+ // Travel
17
+ 'www.airbnb.com',
18
+ 'www.booking.com',
19
+ 'www.expedia.com',
20
+ 'www.kayak.com',
21
+ 'www.skyscanner.com',
22
+ 'www.tripadvisor.com',
23
+ // Jobs
24
+ 'www.indeed.com',
25
+ 'www.glassdoor.com',
26
+ // Real estate
27
+ 'www.zillow.com',
28
+ // Our own dashboard
29
+ 'app.webpeel.dev',
30
+ ]);
31
+ /**
32
+ * URL patterns that match SPA routes on mixed-content domains.
33
+ * E.g. google.com/travel is SPA, but google.com/search is not.
34
+ */
35
+ export const SPA_URL_PATTERNS = [
36
+ /google\.com\/travel/,
37
+ /google\.com\/maps/,
38
+ /google\.com\/shopping/,
39
+ ];
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Premium content stability detection — server-only.
3
+ *
4
+ * Provides smarter content-stability waiting logic than the default
5
+ * waitForLoadState('networkidle'). Monitors DOM mutations and network
6
+ * activity to determine when a page has truly finished rendering.
7
+ *
8
+ * The npm package uses default Playwright waitForLoadState.
9
+ * Premium servers can wire this in via the waitForContentStable hook.
10
+ */
11
+ export interface StabilityOptions {
12
+ /** Maximum time to wait (ms). Default: 5000. */
13
+ timeoutMs?: number;
14
+ /** Minimum quiet period before declaring stable (ms). Default: 500. */
15
+ quietMs?: number;
16
+ }
17
+ /**
18
+ * Wait for page content to stabilize by monitoring DOM mutations.
19
+ *
20
+ * More reliable than waitForLoadState('networkidle') for SPAs that
21
+ * progressively render content.
22
+ */
23
+ export declare function waitForContentStable(page: any, options?: StabilityOptions): Promise<void>;
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Premium content stability detection — server-only.
3
+ *
4
+ * Provides smarter content-stability waiting logic than the default
5
+ * waitForLoadState('networkidle'). Monitors DOM mutations and network
6
+ * activity to determine when a page has truly finished rendering.
7
+ *
8
+ * The npm package uses default Playwright waitForLoadState.
9
+ * Premium servers can wire this in via the waitForContentStable hook.
10
+ */
11
+ /**
12
+ * Wait for page content to stabilize by monitoring DOM mutations.
13
+ *
14
+ * More reliable than waitForLoadState('networkidle') for SPAs that
15
+ * progressively render content.
16
+ */
17
+ export async function waitForContentStable(page, options) {
18
+ const timeout = options?.timeoutMs ?? 5000;
19
+ const quiet = options?.quietMs ?? 500;
20
+ const start = Date.now();
21
+ // Use page.evaluate to monitor DOM mutations
22
+ await page.evaluate(({ quietMs, timeoutMs }) => {
23
+ return new Promise((resolve) => {
24
+ let lastMutation = Date.now();
25
+ let settled = false;
26
+ const observer = new MutationObserver(() => {
27
+ lastMutation = Date.now();
28
+ });
29
+ observer.observe(document.body, {
30
+ childList: true,
31
+ subtree: true,
32
+ characterData: true,
33
+ });
34
+ const check = () => {
35
+ const now = Date.now();
36
+ if (now - lastMutation >= quietMs || settled) {
37
+ observer.disconnect();
38
+ resolve();
39
+ return;
40
+ }
41
+ if (now - lastMutation > timeoutMs) {
42
+ observer.disconnect();
43
+ resolve();
44
+ return;
45
+ }
46
+ requestAnimationFrame(check);
47
+ };
48
+ // Hard timeout
49
+ setTimeout(() => {
50
+ settled = true;
51
+ observer.disconnect();
52
+ resolve();
53
+ }, timeoutMs);
54
+ // Start checking after an initial quiet period
55
+ setTimeout(check, quietMs);
56
+ });
57
+ }, { quietMs: quiet, timeoutMs: Math.max(0, timeout - (Date.now() - start)) });
58
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.80",
3
+ "version": "0.21.81",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",