webpeel 0.21.84 → 0.21.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@
5
5
  * mutable PipelineContext. The stages are called in order by peel().
6
6
  */
7
7
  import { type AutoScrollOptions } from './actions.js';
8
- import { type DomainExtractResult } from './domain-extractors-basic.js';
8
+ import { type DomainExtractResult } from '../ee/domain-extractors.js';
9
9
  import { type ReadabilityResult } from './readability.js';
10
10
  import { type QuickAnswerResult } from './quick-answer.js';
11
11
  import { Timer } from './timing.js';
@@ -14,34 +14,8 @@ import { autoScroll as runAutoScroll } from './actions.js';
14
14
  import { extractStructured } from './extract.js';
15
15
  import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
16
16
  import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
- import { extractDomainDataBasic, getDomainExtractorBasic } from './domain-extractors-basic.js';
17
+ import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
18
18
  import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
- // ---------------------------------------------------------------------------
20
- // Domain extraction — lazy-load full extractors from compiled JS
21
- // ---------------------------------------------------------------------------
22
- // The compiled domain-extractors.js (312KB) ships in the npm package.
23
- // TypeScript source is NOT on GitHub (proprietary, .gitignore'd).
24
- // If compiled JS is missing (bare repo clone without proprietary files),
25
- // falls back to basic stub (no domain extraction, just standard markdown).
26
- // Server premium hooks can override for additional caching/intelligence.
27
- let _extractorsLoaded = false;
28
- let _extractDomainData = null;
29
- let _getDomainExtractor = null;
30
- async function loadExtractors() {
31
- if (_extractorsLoaded)
32
- return;
33
- _extractorsLoaded = true;
34
- try {
35
- const mod = await import('./domain-extractors.js');
36
- _extractDomainData = mod.extractDomainData;
37
- _getDomainExtractor = mod.getDomainExtractor;
38
- }
39
- catch {
40
- // Compiled JS not available (bare repo clone) — basic stub will be used
41
- }
42
- }
43
- // Start loading immediately (non-blocking)
44
- loadExtractors();
45
19
  import { extractReadableContent } from './readability.js';
46
20
  import { quickAnswer as runQuickAnswer } from './quick-answer.js';
47
21
  import { Timer } from './timing.js';
@@ -56,28 +30,23 @@ const log = createLogger('pipeline');
56
30
  // ---------------------------------------------------------------------------
57
31
  /**
58
32
  * Check if a URL has a domain extractor.
59
- * Priority: premium hook → full extractors (repo/server) → basic stub.
33
+ * Priority: premium hook → ee/domain-extractors.
60
34
  */
61
35
  function hasDomainExtractor(url) {
62
36
  const hookFn = getDomainExtractorHook();
63
37
  if (hookFn)
64
38
  return hookFn(url) !== null;
65
- if (_getDomainExtractor)
66
- return _getDomainExtractor(url) !== null;
67
- return getDomainExtractorBasic(url) !== null;
39
+ return getDomainExtractor(url) !== null;
68
40
  }
69
41
  /**
70
42
  * Run domain extraction on HTML/URL.
71
- * Priority: premium hook → compiled extractors → basic stub.
43
+ * Priority: premium hook → ee/domain-extractors.
72
44
  */
73
45
  async function runDomainExtract(html, url) {
74
46
  const hookFn = getDomainExtractHook();
75
47
  if (hookFn)
76
48
  return hookFn(html, url);
77
- await loadExtractors();
78
- if (_extractDomainData)
79
- return _extractDomainData(html, url);
80
- return extractDomainDataBasic(html, url);
49
+ return extractDomainData(html, url);
81
50
  }
82
51
  /** Create the initial PipelineContext with defaults */
83
52
  export function createContext(url, options) {
@@ -609,7 +578,7 @@ export async function fetchContent(ctx) {
609
578
  const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
610
579
  if (canSolve) {
611
580
  try {
612
- const { solveChallenge } = await import('./challenge-solver.js');
581
+ const { solveChallenge } = await import('../ee/challenge-solver.js');
613
582
  const { detectChallenge } = await import('./challenge-detection.js');
614
583
  const rawHtml = fetchResult.html || '';
615
584
  const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
@@ -1179,7 +1148,7 @@ export async function postProcess(ctx) {
1179
1148
  const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
1180
1149
  if (canSolve && ctx.fetchResult?.html) {
1181
1150
  try {
1182
- const { solveChallenge } = await import('./challenge-solver.js');
1151
+ const { solveChallenge } = await import('../ee/challenge-solver.js');
1183
1152
  const { detectChallenge } = await import('./challenge-detection.js');
1184
1153
  const rawHtml = ctx.fetchResult.html;
1185
1154
  const detectionResult = detectChallenge(rawHtml, ctx.fetchResult.statusCode);
@@ -10,7 +10,7 @@
10
10
  * All hook methods are optional — unset hooks are simply skipped.
11
11
  */
12
12
  import type { FetchResult } from './fetcher.js';
13
- import type { DomainExtractResult } from './domain-extractors-basic.js';
13
+ import type { DomainExtractResult } from '../ee/domain-extractors.js';
14
14
  export interface StrategyResult extends FetchResult {
15
15
  method: 'simple' | 'browser' | 'stealth' | 'cached' | 'cloaked' | 'cycle' | 'peeltls' | 'cf-worker' | 'google-cache';
16
16
  /**
package/dist/index.d.ts CHANGED
@@ -6,8 +6,7 @@
6
6
  import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from './core/fetcher.js';
7
7
  import type { PeelOptions, PeelResult } from './types.js';
8
8
  export * from './types.js';
9
- export type { DomainExtractResult, DomainExtractor } from './core/domain-extractors-basic.js';
10
- export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
9
+ export { getDomainExtractor, extractDomainData, type DomainExtractResult, type DomainExtractor } from './ee/domain-extractors.js';
11
10
  export { crawl, type CrawlOptions, type CrawlResult, type CrawlProgress } from './core/crawler.js';
12
11
  export { discoverSitemap, type SitemapUrl, type SitemapResult } from './core/sitemap.js';
13
12
  export { mapDomain, type MapOptions, type MapResult } from './core/map.js';
package/dist/index.js CHANGED
@@ -7,7 +7,9 @@ import { cleanup, warmup, closePool, scrollAndWait, closeProfileBrowser } from '
7
7
  import { createContext, normalizeOptions, handleYouTube, fetchContent, detectContentType, parseContent, postProcess, finalize, buildResult, } from './core/pipeline.js';
8
8
  import { checkUrlSafety } from './core/safe-browsing.js';
9
9
  export * from './types.js';
10
- export { getDomainExtractor, extractDomainData } from './core/domain-extractors-public.js';
10
+ // Domain extractors — compiled JS ships in npm, TypeScript source is .gitignore'd.
11
+ // Types and runtime functions are re-exported directly from the ee module (no stub fallback or lazy wrapper).
12
+ export { getDomainExtractor, extractDomainData } from './ee/domain-extractors.js';
11
13
  export { crawl } from './core/crawler.js';
12
14
  export { discoverSitemap } from './core/sitemap.js';
13
15
  export { mapDomain } from './core/map.js';
@@ -56,19 +56,18 @@ import { requireScope } from './middleware/scope-guard.js';
56
56
  import { createCacheWarmRouter, startCacheWarmer } from './routes/cache-warm.js';
57
57
  import { warmup, cleanup as cleanupFetcher } from '../core/fetcher.js';
58
58
  // Proprietary modules — loaded dynamically so the build works without TypeScript source.
59
- // Compiled JS ships in npm/Docker. TypeScript source is .gitignore'd (not on GitHub).
60
59
  let setExtractorRedis;
61
60
  let registerPremiumHooks;
62
61
  try {
63
- const de = await import('../core/domain-extractors.js');
62
+ const de = await import('../ee/domain-extractors.js');
64
63
  setExtractorRedis = de.setExtractorRedis;
65
64
  }
66
- catch { /* compiled JS not available */ }
65
+ catch { /* ee module not available */ }
67
66
  try {
68
- const ph = await import('./premium/index.js');
67
+ const ph = await import('../ee/premium-hooks.js');
69
68
  registerPremiumHooks = ph.registerPremiumHooks;
70
69
  }
71
- catch { /* compiled JS not available */ }
70
+ catch { /* ee module not available */ }
72
71
  import { readFileSync } from 'fs';
73
72
  import { join, dirname } from 'path';
74
73
  import { fileURLToPath } from 'url';
@@ -22,13 +22,16 @@ export function detectSearchIntent(query) {
22
22
  if (/\b(car|cars|vehicle|sedan|suv|truck|honda|toyota|tesla|bmw|ford|chevy|chevrolet|nissan|hyundai|kia|mazda|subaru|lexus|audi|mercedes|volkswagen|jeep|dodge|ram|buick|cadillac|gmc|chrysler|acura|infiniti|volvo|porsche|mini|fiat|mitsubishi)\b/.test(q) &&
23
23
  /\b(buy|cheap|under|budget|price|used|new|for sale|listing|deal)\b/.test(q)) {
24
24
  const priceMatch = q.match(/(?:under|\$|budget|max)\s*\$?(\d[\d,]*)/);
25
- const zipMatch = q.match(/\b(\d{5})\b/);
25
+ const priceValue = priceMatch ? priceMatch[1].replace(/,/g, '') : '';
26
+ // Find all 5-digit numbers, pick the one that isn't the price
27
+ const allZips = [...q.matchAll(/\b(\d{5})\b/g)].map(m => m[1]);
28
+ const finalZip = allZips.find(z => z !== priceValue) || '10001';
26
29
  return {
27
30
  type: 'cars',
28
31
  query: q,
29
32
  params: {
30
- maxPrice: priceMatch ? priceMatch[1].replace(/,/g, '') : '',
31
- zip: zipMatch ? zipMatch[1] : '10001',
33
+ maxPrice: priceValue,
34
+ zip: finalZip,
32
35
  },
33
36
  };
34
37
  }
package/dist/types.d.ts CHANGED
@@ -309,7 +309,7 @@ export interface PeelResult {
309
309
  */
310
310
  readability?: import('./core/readability.js').ReadabilityResult;
311
311
  /** Domain-aware structured data (Twitter, Reddit, GitHub, HN). Present when URL matches a known domain. */
312
- domainData?: import('./core/domain-extractors-basic.js').DomainExtractResult;
312
+ domainData?: import('./ee/domain-extractors.js').DomainExtractResult;
313
313
  /** Quick answer result (when question option is set). BM25-powered, no LLM needed. */
314
314
  quickAnswer?: import('./core/quick-answer.js').QuickAnswerResult;
315
315
  /** Per-stage timing breakdown in milliseconds. */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.84",
3
+ "version": "0.21.85",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",