webpeel 0.21.84 → 0.21.86
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/pipeline.d.ts +1 -1
- package/dist/core/pipeline.js +7 -38
- package/dist/core/strategy-hooks.d.ts +1 -1
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +48 -0
- package/dist/ee/domain-extractors.js +6342 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +1 -2
- package/dist/index.js +3 -1
- package/dist/server/app.js +4 -5
- package/dist/server/routes/smart-search.d.ts +11 -0
- package/dist/server/routes/smart-search.js +70 -10
- package/dist/types.d.ts +1 -1
- package/package.json +2 -1
package/dist/core/pipeline.d.ts
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* mutable PipelineContext. The stages are called in order by peel().
|
|
6
6
|
*/
|
|
7
7
|
import { type AutoScrollOptions } from './actions.js';
|
|
8
|
-
import { type DomainExtractResult } from '
|
|
8
|
+
import { type DomainExtractResult } from '../ee/domain-extractors.js';
|
|
9
9
|
import { type ReadabilityResult } from './readability.js';
|
|
10
10
|
import { type QuickAnswerResult } from './quick-answer.js';
|
|
11
11
|
import { Timer } from './timing.js';
|
package/dist/core/pipeline.js
CHANGED
|
@@ -14,34 +14,8 @@ import { autoScroll as runAutoScroll } from './actions.js';
|
|
|
14
14
|
import { extractStructured } from './extract.js';
|
|
15
15
|
import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
|
|
16
16
|
import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
|
|
17
|
-
import {
|
|
17
|
+
import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
|
|
18
18
|
import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
|
|
19
|
-
// ---------------------------------------------------------------------------
|
|
20
|
-
// Domain extraction — lazy-load full extractors from compiled JS
|
|
21
|
-
// ---------------------------------------------------------------------------
|
|
22
|
-
// The compiled domain-extractors.js (312KB) ships in the npm package.
|
|
23
|
-
// TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
|
|
24
|
-
// If compiled JS is missing (bare repo clone without proprietary files),
|
|
25
|
-
// falls back to basic stub (no domain extraction, just standard markdown).
|
|
26
|
-
// Server premium hooks can override for additional caching/intelligence.
|
|
27
|
-
let _extractorsLoaded = false;
|
|
28
|
-
let _extractDomainData = null;
|
|
29
|
-
let _getDomainExtractor = null;
|
|
30
|
-
async function loadExtractors() {
|
|
31
|
-
if (_extractorsLoaded)
|
|
32
|
-
return;
|
|
33
|
-
_extractorsLoaded = true;
|
|
34
|
-
try {
|
|
35
|
-
const mod = await import('./domain-extractors.js');
|
|
36
|
-
_extractDomainData = mod.extractDomainData;
|
|
37
|
-
_getDomainExtractor = mod.getDomainExtractor;
|
|
38
|
-
}
|
|
39
|
-
catch {
|
|
40
|
-
// Compiled JS not available (bare repo clone) — basic stub will be used
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
// Start loading immediately (non-blocking)
|
|
44
|
-
loadExtractors();
|
|
45
19
|
import { extractReadableContent } from './readability.js';
|
|
46
20
|
import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
47
21
|
import { Timer } from './timing.js';
|
|
@@ -56,28 +30,23 @@ const log = createLogger('pipeline');
|
|
|
56
30
|
// ---------------------------------------------------------------------------
|
|
57
31
|
/**
|
|
58
32
|
* Check if a URL has a domain extractor.
|
|
59
|
-
* Priority: premium hook →
|
|
33
|
+
* Priority: premium hook → ee/domain-extractors.
|
|
60
34
|
*/
|
|
61
35
|
function hasDomainExtractor(url) {
|
|
62
36
|
const hookFn = getDomainExtractorHook();
|
|
63
37
|
if (hookFn)
|
|
64
38
|
return hookFn(url) !== null;
|
|
65
|
-
|
|
66
|
-
return _getDomainExtractor(url) !== null;
|
|
67
|
-
return getDomainExtractorBasic(url) !== null;
|
|
39
|
+
return getDomainExtractor(url) !== null;
|
|
68
40
|
}
|
|
69
41
|
/**
|
|
70
42
|
* Run domain extraction on HTML/URL.
|
|
71
|
-
* Priority: premium hook →
|
|
43
|
+
* Priority: premium hook → ee/domain-extractors.
|
|
72
44
|
*/
|
|
73
45
|
async function runDomainExtract(html, url) {
|
|
74
46
|
const hookFn = getDomainExtractHook();
|
|
75
47
|
if (hookFn)
|
|
76
48
|
return hookFn(html, url);
|
|
77
|
-
|
|
78
|
-
if (_extractDomainData)
|
|
79
|
-
return _extractDomainData(html, url);
|
|
80
|
-
return extractDomainDataBasic(html, url);
|
|
49
|
+
return extractDomainData(html, url);
|
|
81
50
|
}
|
|
82
51
|
/** Create the initial PipelineContext with defaults */
|
|
83
52
|
export function createContext(url, options) {
|
|
@@ -609,7 +578,7 @@ export async function fetchContent(ctx) {
|
|
|
609
578
|
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
610
579
|
if (canSolve) {
|
|
611
580
|
try {
|
|
612
|
-
const { solveChallenge } = await import('
|
|
581
|
+
const { solveChallenge } = await import('../ee/challenge-solver.js');
|
|
613
582
|
const { detectChallenge } = await import('./challenge-detection.js');
|
|
614
583
|
const rawHtml = fetchResult.html || '';
|
|
615
584
|
const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
|
|
@@ -1179,7 +1148,7 @@ export async function postProcess(ctx) {
|
|
|
1179
1148
|
const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
|
|
1180
1149
|
if (canSolve && ctx.fetchResult?.html) {
|
|
1181
1150
|
try {
|
|
1182
|
-
const { solveChallenge } = await import('
|
|
1151
|
+
const { solveChallenge } = await import('../ee/challenge-solver.js');
|
|
1183
1152
|
const { detectChallenge } = await import('./challenge-detection.js');
|
|
1184
1153
|
const rawHtml = ctx.fetchResult.html;
|
|
1185
1154
|
const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
|
|
package/dist/core/strategy-hooks.d.ts
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* All hook methods are optional — unset hooks are simply skipped.
|
|
11
11
|
*/
|
|
12
12
|
import type { FetchResult } from './fetcher.js';
|
|
13
|
-
import type { DomainExtractResult } from '
|
|
13
|
+
import type { DomainExtractResult } from '../ee/domain-extractors.js';
|
|
14
14
|
export interface StrategyResult extends FetchResult {
|
|
15
15
|
method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
|
|
16
16
|
/**
|
|
package/dist/ee/challenge-re-export.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { solveChallenge } from './challenge-solver.js';
|
|
package/dist/ee/challenge-re-export.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { solveChallenge } from './challenge-solver.js';
|
|
package/dist/ee/challenge-solver.d.ts
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Challenge / bot-protection solver.
|
|
3
|
+
*
|
|
4
|
+
* Attempts to bypass bot-protection challenges using free, in-process methods:
|
|
5
|
+
* 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
|
|
6
|
+
* 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
|
|
7
|
+
*
|
|
8
|
+
* Architecture note:
|
|
9
|
+
* Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
|
|
10
|
+
* is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
|
|
11
|
+
* instead of running locally. This keeps the main Render container (512 MB) lean.
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* const result = await solveChallenge(url, 'cloudflare', html);
|
|
15
|
+
* if (result.solved) {
|
|
16
|
+
* // result.html = real page content
|
|
17
|
+
* // result.cookies = ["cf_clearance=...", ...]
|
|
18
|
+
* }
|
|
19
|
+
*/
|
|
20
|
+
import type { ChallengeType } from '../core/challenge-detection.js';
|
|
21
|
+
export interface ImageCaptchaResult {
|
|
22
|
+
solved: boolean;
|
|
23
|
+
rounds: number;
|
|
24
|
+
error?: string;
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Ask the moondream vision model which grid cells contain the target object.
|
|
28
|
+
* Returns an array of 1-indexed grid positions (1–9), or null if the call fails.
|
|
29
|
+
*/
|
|
30
|
+
export declare function askVisionModel(base64Image: string, targetObject: string): Promise<number[] | null>;
|
|
31
|
+
/**
|
|
32
|
+
* Detect if the page has an image grid CAPTCHA and extract the target object.
|
|
33
|
+
* Returns the object name (e.g. "traffic lights") or null if not detected.
|
|
34
|
+
*/
|
|
35
|
+
export declare function detectImageCaptchaTarget(page: import('playwright').Page): Promise<string | null>;
|
|
36
|
+
/**
|
|
37
|
+
* Solve an image grid CAPTCHA using the moondream vision model.
|
|
38
|
+
*
|
|
39
|
+
* Flow per round:
|
|
40
|
+
* 1. Screenshot the CAPTCHA grid element
|
|
41
|
+
* 2. Send to moondream → get grid positions
|
|
42
|
+
* 3. Click identified cells
|
|
43
|
+
* 4. Click Verify button
|
|
44
|
+
* 5. Check if solved; if a new round appears, repeat (max 3 rounds)
|
|
45
|
+
*/
|
|
46
|
+
export declare function solveImageCaptcha(page: import('playwright').Page, targetObject: string): Promise<ImageCaptchaResult>;
|
|
47
|
+
export interface SolveOptions {
|
|
48
|
+
/** Hard timeout in ms (default: 15 000) */
|
|
49
|
+
timeout?: number;
|
|
50
|
+
/** Optional proxy URL (http://user:pass@host:port) */
|
|
51
|
+
proxy?: string;
|
|
52
|
+
}
|
|
53
|
+
export interface SolveResult {
|
|
54
|
+
solved: boolean;
|
|
55
|
+
html: string;
|
|
56
|
+
/** Raw Set-Cookie header values extracted after solve */
|
|
57
|
+
cookies?: string[];
|
|
58
|
+
/** How the solve was performed */
|
|
59
|
+
method?: 'local-browser' | 'remote-worker' | 'accessibility';
|
|
60
|
+
/** Error details if solve failed */
|
|
61
|
+
error?: string;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Attempt to solve a bot-protection challenge.
|
|
65
|
+
*
|
|
66
|
+
* @param url The page URL (used for proxy routing and cookie caching)
|
|
67
|
+
* @param challengeType The type of challenge as detected by challenge-detection
|
|
68
|
+
* @param html The raw challenge HTML (used for context / fallback)
|
|
69
|
+
* @param options Optional timeout and proxy settings
|
|
70
|
+
* @returns Solve result with real HTML content and cookies if successful
|
|
71
|
+
*/
|
|
72
|
+
export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;
|