webpeel 0.21.84 → 0.21.86

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  * mutable PipelineContext. The stages are called in order by peel().
6
6
  */
7
7
  import { type AutoScrollOptions } from './actions.js';
8
- import { type DomainExtractResult } from './domain-extractors-basic.js';
8
+ import { type DomainExtractResult } from '../ee/domain-extractors.js';
9
9
  import { type ReadabilityResult } from './readability.js';
10
10
  import { type QuickAnswerResult } from './quick-answer.js';
11
11
  import { Timer } from './timing.js';
@@ -14,34 +14,8 @@ import { autoScroll as runAutoScroll } from './actions.js';
14
14
  import { extractStructured } from './extract.js';
15
15
  import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
16
16
  import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
- import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
17
+ import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
18
18
  import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
- // ---------------------------------------------------------------------------
20
- // Domain extraction — lazy-load full extractors from compiled JS
21
- // ---------------------------------------------------------------------------
22
- // The compiled domain-extractors.js (312KB) ships in the npm package.
23
- // TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
24
- // If compiled JS is missing (bare repo clone without proprietary files),
25
- // falls back to basic stub (no domain extraction, just standard markdown).
26
- // Server premium hooks can override for additional caching/intelligence.
27
- let _extractorsLoaded = false;
28
- let _extractDomainData = null;
29
- let _getDomainExtractor = null;
30
- async function loadExtractors() {
31
- if (_extractorsLoaded)
32
- return;
33
- _extractorsLoaded = true;
34
- try {
35
- const mod = await import('./domain-extractors.js');
36
- _extractDomainData = mod.extractDomainData;
37
- _getDomainExtractor = mod.getDomainExtractor;
38
- }
39
- catch {
40
- // Compiled JS not available (bare repo clone) — basic stub will be used
41
- }
42
- }
43
- // Start loading immediately (non-blocking)
44
- loadExtractors();
45
19
  import { extractReadableContent } from './readability.js';
46
20
  import { quickAnswer as runQuickAnswer } from './quick-answer.js';
47
21
  import { Timer } from './timing.js';
@@ -56,28 +30,23 @@ const log = createLogger('pipeline');
56
30
  // ---------------------------------------------------------------------------
57
31
  /**
58
32
  * Check if a URL has a domain extractor.
59
- * Priority: premium hook → full extractors (repo/server) → basic stub.
33
+ * Priority: premium hook → ee/domain-extractors.
60
34
  */
61
35
  function hasDomainExtractor(url) {
62
36
  const hookFn = getDomainExtractorHook();
63
37
  if (hookFn)
64
38
  return hookFn(url) !== null;
65
- if (_getDomainExtractor)
66
- return _getDomainExtractor(url) !== null;
67
- return getDomainExtractorBasic(url) !== null;
39
+ return getDomainExtractor(url) !== null;
68
40
  }
69
41
  /**
70
42
  * Run domain extraction on HTML/URL.
71
- * Priority: premium hook → compiled extractors → basic stub.
43
+ * Priority: premium hook → ee/domain-extractors.
72
44
  */
73
45
  async function runDomainExtract(html, url) {
74
46
  const hookFn = getDomainExtractHook();
75
47
  if (hookFn)
76
48
  return hookFn(html, url);
77
- await loadExtractors();
78
- if (_extractDomainData)
79
- return _extractDomainData(html, url);
80
- return extractDomainDataBasic(html, url);
49
+ return extractDomainData(html, url);
81
50
  }
82
51
  /** Create the initial PipelineContext with defaults */
83
52
  export function createContext(url, options) {
@@ -609,7 +578,7 @@ export async function fetchContent(ctx) {
609
578
  const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
610
579
  if (canSolve) {
611
580
  try {
612
- const { solveChallenge } = await import('./challenge-solver.js');
581
+ const { solveChallenge } = await import('../ee/challenge-solver.js');
613
582
  const { detectChallenge } = await import('./challenge-detection.js');
614
583
  const rawHtml = fetchResult.html || '';
615
584
  const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
@@ -1179,7 +1148,7 @@ export async function postProcess(ctx) {
1179
1148
  const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
1180
1149
  if (canSolve && ctx.fetchResult?.html) {
1181
1150
  try {
1182
- const { solveChallenge } = await import('./challenge-solver.js');
1151
+ const { solveChallenge } = await import('../ee/challenge-solver.js');
1183
1152
  const { detectChallenge } = await import('./challenge-detection.js');
1184
1153
  const rawHtml = ctx.fetchResult.html;
1185
1154
  const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
@@ -10,7 +10,7 @@
10
10
  * All hook methods are optional — unset hooks are simply skipped.
11
11
  */
12
12
  import type { FetchResult } from './fetcher.js';
13
- import type { DomainExtractResult } from './domain-extractors-basic.js';
13
+ import type { DomainExtractResult } from '../ee/domain-extractors.js';
14
14
  export interface StrategyResult extends FetchResult {
15
15
  method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
16
16
  /**
@@ -0,0 +1 @@
1
+ export { solveChallenge } from './challenge-solver.js';
@@ -0,0 +1 @@
1
+ export { solveChallenge } from './challenge-solver.js';
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Challenge / bot-protection solver.
3
+ *
4
+ * Attempts to bypass bot-protection challenges using free, in-process methods:
5
+ * 1. Cloudflare JS challenge — render in stealth Playwright, wait for auto-solve
6
+ * 2. hCaptcha — accessibility bypass (TODO: implement if API is confirmed available)
7
+ *
8
+ * Architecture note:
9
+ * Browser-based solving is CPU/RAM intensive. When the env var BROWSER_WORKER_URL
10
+ * is set, the solve request is proxied to an external worker (e.g. Hetzner 4GB VM)
11
+ * instead of running locally. This keeps the main Render container (512 MB) lean.
12
+ *
13
+ * Usage:
14
+ * const result = await solveChallenge(url, 'cloudflare', html);
15
+ * if (result.solved) {
16
+ * // result.html = real page content
17
+ * // result.cookies = ["cf_clearance=...", ...]
18
+ * }
19
+ */
20
+ import type { ChallengeType } from '../core/challenge-detection.js';
21
+ export interface ImageCaptchaResult {
22
+ solved: boolean;
23
+ rounds: number;
24
+ error?: string;
25
+ }
26
+ /**
27
+ * Ask the moondream vision model which grid cells contain the target object.
28
+ * Returns an array of 1-indexed grid positions (1–9), or null if the call fails.
29
+ */
30
+ export declare function askVisionModel(base64Image: string, targetObject: string): Promise<number[] | null>;
31
+ /**
32
+ * Detect if the page has an image grid CAPTCHA and extract the target object.
33
+ * Returns the object name (e.g. "traffic lights") or null if not detected.
34
+ */
35
+ export declare function detectImageCaptchaTarget(page: import('playwright').Page): Promise<string | null>;
36
+ /**
37
+ * Solve an image grid CAPTCHA using the moondream vision model.
38
+ *
39
+ * Flow per round:
40
+ * 1. Screenshot the CAPTCHA grid element
41
+ * 2. Send to moondream → get grid positions
42
+ * 3. Click identified cells
43
+ * 4. Click Verify button
44
+ * 5. Check if solved; if a new round appears, repeat (max 3 rounds)
45
+ */
46
+ export declare function solveImageCaptcha(page: import('playwright').Page, targetObject: string): Promise<ImageCaptchaResult>;
47
+ export interface SolveOptions {
48
+ /** Hard timeout in ms (default: 15 000) */
49
+ timeout?: number;
50
+ /** Optional proxy URL (http://user:pass@host:port) */
51
+ proxy?: string;
52
+ }
53
+ export interface SolveResult {
54
+ solved: boolean;
55
+ html: string;
56
+ /** Raw Set-Cookie header values extracted after solve */
57
+ cookies?: string[];
58
+ /** How the solve was performed */
59
+ method?: 'local-browser' | 'remote-worker' | 'accessibility';
60
+ /** Error details if solve failed */
61
+ error?: string;
62
+ }
63
+ /**
64
+ * Attempt to solve a bot-protection challenge.
65
+ *
66
+ * @param url The page URL (used for proxy routing and cookie caching)
67
+ * @param challengeType The type of challenge as detected by challenge-detection
68
+ * @param html The raw challenge HTML (used for context / fallback)
69
+ * @param options Optional timeout and proxy settings
70
+ * @returns Solve result with real HTML content and cookies if successful
71
+ */
72
+ export declare function solveChallenge(url: string, challengeType: ChallengeType, html: string, options?: SolveOptions): Promise<SolveResult>;