webpeel 0.21.85 → 0.21.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/dist/cli/commands/fetch.js +13 -0
  2. package/dist/cli/utils.js +10 -1
  3. package/dist/core/http-fetch.js +19 -2
  4. package/dist/core/pipeline.js +3 -2
  5. package/dist/core/schema-templates.js +37 -24
  6. package/dist/core/search-provider.d.ts +2 -0
  7. package/dist/core/search-provider.js +9 -2
  8. package/dist/core/searxng-provider.d.ts +1 -0
  9. package/dist/core/searxng-provider.js +1 -0
  10. package/dist/ee/challenge-re-export.d.ts +1 -0
  11. package/dist/ee/challenge-re-export.js +1 -0
  12. package/dist/{core → ee}/challenge-solver.d.ts +1 -1
  13. package/dist/{core → ee}/challenge-solver.js +5 -5
  14. package/dist/ee/domain-extractors.d.ts +8 -0
  15. package/dist/ee/domain-extractors.js +8 -0
  16. package/dist/{server/premium → ee}/domain-intel.d.ts +1 -1
  17. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  18. package/dist/ee/extractors/allrecipes.js +120 -0
  19. package/dist/ee/extractors/amazon.d.ts +2 -0
  20. package/dist/ee/extractors/amazon.js +78 -0
  21. package/dist/ee/extractors/arxiv.d.ts +2 -0
  22. package/dist/ee/extractors/arxiv.js +137 -0
  23. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  24. package/dist/ee/extractors/bestbuy.js +78 -0
  25. package/dist/ee/extractors/carscom.d.ts +2 -0
  26. package/dist/ee/extractors/carscom.js +121 -0
  27. package/dist/ee/extractors/coingecko.d.ts +2 -0
  28. package/dist/ee/extractors/coingecko.js +134 -0
  29. package/dist/ee/extractors/craigslist.d.ts +2 -0
  30. package/dist/ee/extractors/craigslist.js +92 -0
  31. package/dist/ee/extractors/devto.d.ts +2 -0
  32. package/dist/ee/extractors/devto.js +135 -0
  33. package/dist/ee/extractors/ebay.d.ts +2 -0
  34. package/dist/ee/extractors/ebay.js +90 -0
  35. package/dist/ee/extractors/espn.d.ts +2 -0
  36. package/dist/ee/extractors/espn.js +255 -0
  37. package/dist/ee/extractors/etsy.d.ts +2 -0
  38. package/dist/ee/extractors/etsy.js +52 -0
  39. package/dist/ee/extractors/facebook.d.ts +2 -0
  40. package/dist/ee/extractors/facebook.js +46 -0
  41. package/dist/ee/extractors/github.d.ts +2 -0
  42. package/dist/ee/extractors/github.js +196 -0
  43. package/dist/ee/extractors/google-flights.d.ts +2 -0
  44. package/dist/ee/extractors/google-flights.js +176 -0
  45. package/dist/ee/extractors/hackernews.d.ts +2 -0
  46. package/dist/ee/extractors/hackernews.js +147 -0
  47. package/dist/ee/extractors/imdb.d.ts +2 -0
  48. package/dist/ee/extractors/imdb.js +172 -0
  49. package/dist/ee/extractors/index.d.ts +26 -0
  50. package/dist/ee/extractors/index.js +247 -0
  51. package/dist/ee/extractors/instagram.d.ts +2 -0
  52. package/dist/ee/extractors/instagram.js +102 -0
  53. package/dist/ee/extractors/kalshi.d.ts +2 -0
  54. package/dist/ee/extractors/kalshi.js +115 -0
  55. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  56. package/dist/ee/extractors/kayak-cars.js +270 -0
  57. package/dist/ee/extractors/linkedin.d.ts +2 -0
  58. package/dist/ee/extractors/linkedin.js +113 -0
  59. package/dist/ee/extractors/medium.d.ts +2 -0
  60. package/dist/ee/extractors/medium.js +130 -0
  61. package/dist/ee/extractors/news.d.ts +4 -0
  62. package/dist/ee/extractors/news.js +173 -0
  63. package/dist/ee/extractors/npm.d.ts +2 -0
  64. package/dist/ee/extractors/npm.js +86 -0
  65. package/dist/ee/extractors/pdf.d.ts +2 -0
  66. package/dist/ee/extractors/pdf.js +108 -0
  67. package/dist/ee/extractors/pinterest.d.ts +2 -0
  68. package/dist/ee/extractors/pinterest.js +34 -0
  69. package/dist/ee/extractors/polymarket.d.ts +2 -0
  70. package/dist/ee/extractors/polymarket.js +162 -0
  71. package/dist/ee/extractors/producthunt.d.ts +2 -0
  72. package/dist/ee/extractors/producthunt.js +88 -0
  73. package/dist/ee/extractors/pubmed.d.ts +2 -0
  74. package/dist/ee/extractors/pubmed.js +162 -0
  75. package/dist/ee/extractors/pypi.d.ts +2 -0
  76. package/dist/ee/extractors/pypi.js +80 -0
  77. package/dist/ee/extractors/reddit.d.ts +2 -0
  78. package/dist/ee/extractors/reddit.js +308 -0
  79. package/dist/ee/extractors/redfin.d.ts +2 -0
  80. package/dist/ee/extractors/redfin.js +156 -0
  81. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  82. package/dist/ee/extractors/semanticscholar.js +131 -0
  83. package/dist/ee/extractors/shared.d.ts +12 -0
  84. package/dist/ee/extractors/shared.js +76 -0
  85. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  86. package/dist/ee/extractors/soundcloud.js +34 -0
  87. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  88. package/dist/ee/extractors/sportsbetting.js +37 -0
  89. package/dist/ee/extractors/spotify.d.ts +2 -0
  90. package/dist/ee/extractors/spotify.js +34 -0
  91. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  92. package/dist/ee/extractors/stackoverflow.js +61 -0
  93. package/dist/ee/extractors/substack.d.ts +2 -0
  94. package/dist/ee/extractors/substack.js +115 -0
  95. package/dist/ee/extractors/substackroot.d.ts +2 -0
  96. package/dist/ee/extractors/substackroot.js +46 -0
  97. package/dist/ee/extractors/tiktok.d.ts +2 -0
  98. package/dist/ee/extractors/tiktok.js +29 -0
  99. package/dist/ee/extractors/tradingview.d.ts +2 -0
  100. package/dist/ee/extractors/tradingview.js +176 -0
  101. package/dist/ee/extractors/twitch.d.ts +2 -0
  102. package/dist/ee/extractors/twitch.js +36 -0
  103. package/dist/ee/extractors/twitter.d.ts +2 -0
  104. package/dist/ee/extractors/twitter.js +327 -0
  105. package/dist/ee/extractors/types.d.ts +14 -0
  106. package/dist/ee/extractors/types.js +1 -0
  107. package/dist/ee/extractors/walmart.d.ts +2 -0
  108. package/dist/ee/extractors/walmart.js +50 -0
  109. package/dist/ee/extractors/weather.d.ts +2 -0
  110. package/dist/ee/extractors/weather.js +133 -0
  111. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  112. package/dist/ee/extractors/wikipedia.js +103 -0
  113. package/dist/ee/extractors/yelp.d.ts +2 -0
  114. package/dist/ee/extractors/yelp.js +216 -0
  115. package/dist/ee/extractors/youtube.d.ts +2 -0
  116. package/dist/ee/extractors/youtube.js +189 -0
  117. package/dist/ee/extractors/zillow.d.ts +54 -0
  118. package/dist/ee/extractors/zillow.js +247 -0
  119. package/dist/ee/extractors-re-export.d.ts +1 -0
  120. package/dist/ee/extractors-re-export.js +1 -0
  121. package/dist/{server/premium/index.js → ee/premium-hooks.js} +2 -2
  122. package/dist/{server/premium → ee}/swr-cache.d.ts +1 -1
  123. package/dist/{server/premium → ee}/swr-cache.js +1 -1
  124. package/dist/server/app.js +8 -0
  125. package/dist/server/bull-queues.d.ts +1 -0
  126. package/dist/server/routes/feed.d.ts +15 -0
  127. package/dist/server/routes/feed.js +311 -0
  128. package/dist/server/routes/fetch-queue.js +1 -0
  129. package/dist/server/routes/fetch.js +120 -2
  130. package/dist/server/routes/go.d.ts +14 -0
  131. package/dist/server/routes/go.js +81 -0
  132. package/dist/server/routes/smart-search.d.ts +16 -3
  133. package/dist/server/routes/smart-search.js +1875 -117
  134. package/dist/types.d.ts +4 -0
  135. package/package.json +13 -2
  136. package/dist/core/cloak-fetch.d.ts +0 -42
  137. package/dist/core/cloak-fetch.js +0 -148
  138. package/dist/core/cycle-fetch.d.ts +0 -26
  139. package/dist/core/cycle-fetch.js +0 -98
  140. package/dist/core/domain-extractors-basic.d.ts +0 -36
  141. package/dist/core/domain-extractors-basic.js +0 -28
  142. package/dist/core/domain-extractors-public.d.ts +0 -20
  143. package/dist/core/domain-extractors-public.js +0 -35
  144. package/dist/core/domain-extractors.d.ts +0 -48
  145. package/dist/core/domain-extractors.js +0 -6342
  146. package/dist/core/search-fallback.d.ts +0 -28
  147. package/dist/core/search-fallback.js +0 -209
  148. package/dist/core/stealth-patches.d.ts +0 -14
  149. package/dist/core/stealth-patches.js +0 -20
  150. package/dist/server/premium/challenge.d.ts +0 -1
  151. package/dist/server/premium/challenge.js +0 -1
  152. package/dist/server/premium/extractors.d.ts +0 -1
  153. package/dist/server/premium/extractors.js +0 -1
  154. /package/dist/{server/premium → ee}/domain-intel.js +0 -0
  155. /package/dist/{server/premium/index.d.ts → ee/premium-hooks.d.ts} +0 -0
  156. /package/dist/{server/premium → ee}/spa-detection.d.ts +0 -0
  157. /package/dist/{server/premium → ee}/spa-detection.js +0 -0
  158. /package/dist/{server/premium → ee}/stability.d.ts +0 -0
  159. /package/dist/{server/premium → ee}/stability.js +0 -0
@@ -289,6 +289,7 @@ export async function runFetch(url, options) {
289
289
  format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
290
290
  budget: null, // Budget excluded from cache key — cache stores full content
291
291
  readable: options.readable || false,
292
+ noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
292
293
  };
293
294
  const cachedResult = getCache(url, cacheOptions);
294
295
  if (cachedResult) {
@@ -603,6 +604,7 @@ export async function runFetch(url, options) {
603
604
  headers,
604
605
  cookies: options.cookie,
605
606
  raw: options.raw || false,
607
+ noDomainApi: options.skipDomainApi || false,
606
608
  lite: options.lite || false,
607
609
  actions,
608
610
  maxTokens: options.maxTokens,
@@ -724,6 +726,16 @@ export async function runFetch(url, options) {
724
726
  ? ` [${result.domainData.domain}:${result.domainData.type}]`
725
727
  : '';
726
728
  spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
729
+ // Smart hints — suggest features the user might not know about
730
+ if (!options.silent && !options.json && !options.skipDomainApi) {
731
+ if (result.method === 'domain-api') {
732
+ const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
733
+ console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
734
+ }
735
+ }
736
+ if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
737
+ console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
738
+ }
727
739
  }
728
740
  // Show metadata header
729
741
  const pageTitle = result.metadata?.title || result.title;
@@ -1176,6 +1188,7 @@ export function registerFetchCommands(program) {
1176
1188
  .option('--images', 'Output image URLs from the page')
1177
1189
  .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
1178
1190
  .option('--raw', 'Return full page without smart content extraction')
1191
+ .option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
1179
1192
  .option('--full', 'Alias for --raw — full page content, no budget')
1180
1193
  .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
1181
1194
  .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
package/dist/cli/utils.js CHANGED
@@ -35,7 +35,14 @@ export async function checkForUpdates() {
35
35
  if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
36
36
  // Skip update notice in silent mode
37
37
  if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
38
- console.error(`\n💡 WebPeel v${latest} available (you have v${cliVersion}). Update: npm i -g webpeel@latest\n`);
38
+ const msg = `Update available: ${cliVersion} ${latest}`;
39
+ const cmd = 'npm i -g webpeel@latest';
40
+ const width = Math.max(msg.length, cmd.length) + 4;
41
+ const line = '─'.repeat(width);
42
+ console.error(`\n\x1b[33m╭${line}╮\x1b[0m`);
43
+ console.error(`\x1b[33m│\x1b[0m ${msg.padEnd(width - 2)} \x1b[33m│\x1b[0m`);
44
+ console.error(`\x1b[33m│\x1b[0m Run: \x1b[36m${cmd}\x1b[0m${' '.repeat(width - 6 - cmd.length)} \x1b[33m│\x1b[0m`);
45
+ console.error(`\x1b[33m╰${line}╯\x1b[0m\n`);
39
46
  }
40
47
  }
41
48
  }
@@ -208,6 +215,8 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
208
215
  params.set('budget', String(options.budget));
209
216
  if (options.question)
210
217
  params.set('question', options.question);
218
+ if (options.noDomainApi)
219
+ params.set('noDomainApi', 'true');
211
220
  const res = await fetch(`${apiUrl}/v1/fetch?${params}`, {
212
221
  headers: { Authorization: `Bearer ${apiKey}` },
213
222
  signal: AbortSignal.timeout(60000),
@@ -154,10 +154,12 @@ export function createAbortError() {
154
154
  * proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
155
155
  */
156
156
  export const PROXY_PREFERRED_DOMAINS = [
157
+ // Social / content
157
158
  'reddit.com',
158
159
  'old.reddit.com',
159
160
  'forbes.com',
160
161
  'fortune.com',
162
+ // Auto / cars
161
163
  'cargurus.com',
162
164
  'edmunds.com',
163
165
  'cars.com',
@@ -165,14 +167,29 @@ export const PROXY_PREFERRED_DOMAINS = [
165
167
  'autotrader.com',
166
168
  'carfax.com',
167
169
  'tesla.com',
170
+ 'motortrend.com',
171
+ 'jdpower.com',
172
+ // Finance / home
168
173
  'nerdwallet.com',
169
174
  'bankrate.com',
170
175
  'homeadvisor.com',
171
176
  'angi.com',
177
+ // EV / auto news
172
178
  'insideevs.com',
173
179
  'electrek.co',
174
- 'motortrend.com',
175
- 'jdpower.com',
180
+ // Restaurants / food
181
+ 'yelp.com',
182
+ // Travel
183
+ 'kayak.com',
184
+ 'booking.com',
185
+ 'expedia.com',
186
+ 'tripadvisor.com',
187
+ 'hotels.com',
188
+ // Shopping / products
189
+ 'amazon.com',
190
+ 'bestbuy.com',
191
+ 'walmart.com',
192
+ 'target.com',
176
193
  ];
177
194
  /**
178
195
  * Returns true if the URL's domain is on the proxy-preferred blocklist.
@@ -341,7 +341,8 @@ export async function fetchContent(ctx) {
341
341
  const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
342
342
  // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
343
343
  // This avoids expensive browser fetches that often get blocked
344
- if (hasDomainExtractor(ctx.url)) {
344
+ // Skip if noDomainApi is set — user wants raw page content, not API shortcut
345
+ if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
345
346
  try {
346
347
  ctx.timer.mark('domainApiFirst');
347
348
  const ddResult = await runDomainExtract('', ctx.url);
@@ -1078,7 +1079,7 @@ export async function postProcess(ctx) {
1078
1079
  }
1079
1080
  // Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
1080
1081
  // Fires when URL matches a known domain. Replaces content with clean markdown.
1081
- if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
1082
+ if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
1082
1083
  try {
1083
1084
  ctx.timer.mark('domainExtract');
1084
1085
  // Try raw HTML first, then fall back to readability-processed content
@@ -57,42 +57,55 @@ export const SCHEMA_TEMPLATES = {
57
57
  name: 'Event',
58
58
  description: 'Extract event information',
59
59
  fields: {
60
- name: 'event name or title',
61
- date: 'event date and time',
62
- location: 'venue or location',
63
- description: 'event description',
64
- price: 'ticket price or cost',
65
- organizer: 'event organizer',
66
- url: 'registration or ticket URL',
60
+ name: 'What is the name of this event?',
61
+ date: 'When does this event take place?',
62
+ time: 'What time does this event start?',
63
+ location: 'Where is this event held?',
64
+ price: 'How much does this event cost?',
65
+ description: 'What is this event about?',
66
+ organizer: 'Who is organizing this event?',
67
67
  },
68
68
  },
69
69
  recipe: {
70
70
  name: 'Recipe',
71
71
  description: 'Extract recipe information from cooking sites',
72
72
  fields: {
73
- title: 'recipe name',
74
- ingredients: 'list of ingredients with quantities',
75
- instructions: 'cooking steps or directions',
76
- prepTime: 'preparation time',
77
- cookTime: 'cooking time',
78
- servings: 'number of servings',
79
- calories: 'calories per serving',
80
- author: 'recipe author or source',
73
+ name: 'What is the name of this recipe?',
74
+ ingredients: 'What ingredients are needed? List all.',
75
+ steps: 'What are the cooking steps or instructions?',
76
+ prepTime: 'How long does preparation take?',
77
+ cookTime: 'How long does cooking take?',
78
+ servings: 'How many servings does this recipe make?',
79
+ calories: 'How many calories per serving?',
80
+ rating: 'What is the recipe rating?',
81
81
  },
82
82
  },
83
83
  job: {
84
84
  name: 'Job',
85
85
  description: 'Extract job posting information',
86
86
  fields: {
87
- title: 'job title',
88
- company: 'company name',
89
- location: 'job location',
90
- salary: 'salary range or compensation',
91
- description: 'job description',
92
- requirements: 'required qualifications or skills',
93
- type: 'job type (full-time, part-time, remote)',
94
- posted: 'date posted',
95
- applyUrl: 'application URL or link',
87
+ title: 'What is the job title?',
88
+ company: 'What company is hiring?',
89
+ location: 'Where is the job located?',
90
+ salary: 'What is the salary or compensation range?',
91
+ type: 'Is this full-time, part-time, contract, or remote?',
92
+ requirements: 'What are the key requirements or qualifications?',
93
+ description: 'What is the job description?',
94
+ applyUrl: 'What is the URL or method to apply?',
95
+ },
96
+ },
97
+ business: {
98
+ name: 'Business',
99
+ description: 'Extract business/company information',
100
+ fields: {
101
+ name: 'What is the business name?',
102
+ address: 'What is the full address?',
103
+ phone: 'What is the phone number?',
104
+ hours: 'What are the business hours?',
105
+ rating: 'What is the business rating?',
106
+ reviewCount: 'How many reviews does this business have?',
107
+ website: 'What is the business website URL?',
108
+ categories: 'What type of business is this?',
96
109
  },
97
110
  },
98
111
  review: {
@@ -19,6 +19,8 @@ export interface WebSearchResult {
19
19
  snippet: string;
20
20
  /** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
21
21
  relevanceScore?: number;
22
+ /** Thumbnail/image URL from SearXNG results (img_src or thumbnail field). */
23
+ imageUrl?: string;
22
24
  }
23
25
  export interface WebSearchOptions {
24
26
  /** Number of results (1-10) */
@@ -1066,8 +1066,15 @@ export class DuckDuckGoProvider {
1066
1066
  if (searxResults.length > 0) {
1067
1067
  providerStats.record('searxng', true);
1068
1068
  log.debug(`source=searxng returned ${searxResults.length} results`);
1069
- const filtered = filterRelevantResults(searxResults, query);
1070
- return filtered.length > 0 ? filtered : searxResults;
1069
+ // Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
1070
+ const mapped = searxResults.map(r => ({
1071
+ title: r.title,
1072
+ url: r.url,
1073
+ snippet: r.description ?? '',
1074
+ imageUrl: r.imageUrl,
1075
+ }));
1076
+ const filtered = filterRelevantResults(mapped, query);
1077
+ return filtered.length > 0 ? filtered : mapped;
1071
1078
  }
1072
1079
  providerStats.record('searxng', false);
1073
1080
  log.debug('SearXNG returned 0 results, falling through to DDG');
@@ -16,6 +16,7 @@ export interface SearXNGSearchResult {
16
16
  description?: string;
17
17
  publishedDate?: string;
18
18
  score?: number;
19
+ imageUrl?: string;
19
20
  }
20
21
  /**
21
22
  * Fetches search results from a SearXNG instance.
@@ -69,6 +69,7 @@ export async function searchViaSearXNG(query, options = {}) {
69
69
  description: r.content ?? undefined,
70
70
  publishedDate: r.publishedDate ?? undefined,
71
71
  score: r.score ?? undefined,
72
+ imageUrl: r.img_src ?? r.thumbnail ?? undefined,
72
73
  });
73
74
  if (output.length >= count)
74
75
  break;
@@ -0,0 +1 @@
1
+ export { solveChallenge } from './challenge-solver.js';
@@ -0,0 +1 @@
1
+ export { solveChallenge } from './challenge-solver.js';
@@ -17,7 +17,7 @@
17
17
  * // result.cookies = ["cf_clearance=...", ...]
18
18
  * }
19
19
  */
20
- import type { ChallengeType } from './challenge-detection.js';
20
+ import type { ChallengeType } from '../core/challenge-detection.js';
21
21
  export interface ImageCaptchaResult {
22
22
  solved: boolean;
23
23
  rounds: number;
@@ -17,8 +17,8 @@
17
17
  * // result.cookies = ["cf_clearance=...", ...]
18
18
  * }
19
19
  */
20
- import { cacheCookiesForUrl } from './cookie-cache.js';
21
- import { createLogger } from './logger.js';
20
+ import { cacheCookiesForUrl } from '../core/cookie-cache.js';
21
+ import { createLogger } from '../core/logger.js';
22
22
  const log = createLogger('challenge-solver');
23
23
  // ── Image CAPTCHA solver constants ────────────────────────────────────────────
24
24
  const OLLAMA_VISION_URL = 'http://178.156.229.86:11435/api/generate';
@@ -372,7 +372,7 @@ export async function solveChallenge(url, challengeType, html, options = {}) {
372
372
  async function solveCaptchaWithVision(url, _html, timeoutMs, proxy) {
373
373
  let page = null;
374
374
  try {
375
- const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
375
+ const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
376
376
  const browser = await getStealthBrowser();
377
377
  const vp = getRandomViewport();
378
378
  const ctx = await browser.newContext({
@@ -446,7 +446,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
446
446
  let browser = null;
447
447
  let page = null;
448
448
  try {
449
- const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
449
+ const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
450
450
  browser = await getStealthBrowser();
451
451
  const vp = getRandomViewport();
452
452
  const ctx = await browser.newContext({
@@ -528,7 +528,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
528
528
  async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
529
529
  let page = null;
530
530
  try {
531
- const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('./browser-pool.js');
531
+ const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
532
532
  const browser = await getStealthBrowser();
533
533
  const vp = getRandomViewport();
534
534
  const ctx = await browser.newContext({
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Domain-aware structured extractors for WebPeel.
3
+ *
4
+ * This file re-exports from individual extractor files for backward compatibility.
5
+ * Each extractor now lives in its own file under src/ee/extractors/.
6
+ */
7
+ export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
8
+ export type { DomainExtractResult, DomainExtractor } from './extractors/index.js';
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Domain-aware structured extractors for WebPeel.
3
+ *
4
+ * This file re-exports from individual extractor files for backward compatibility.
5
+ * Each extractor now lives in its own file under src/ee/extractors/.
6
+ */
7
+ // Re-exported from individual extractor files for backward compatibility
8
+ export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
@@ -11,6 +11,6 @@
11
11
  *
12
12
  * This module is NOT shipped in the npm package.
13
13
  */
14
- import type { StrategyHooks } from '../../core/strategy-hooks.js';
14
+ import type { StrategyHooks } from '../core/strategy-hooks.js';
15
15
  export declare function clearDomainIntel(): void;
16
16
  export declare function createDomainIntelHooks(): Pick<StrategyHooks, 'getDomainRecommendation' | 'recordDomainResult'>;
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function allrecipesExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,120 @@
1
+ import { tryParseJson } from './shared.js';
2
+ // ---------------------------------------------------------------------------
3
+ // 15. Allrecipes (Recipe Sites) extractor
4
+ // ---------------------------------------------------------------------------
5
+ export async function allrecipesExtractor(html, url) {
6
+ try {
7
+ const { load } = await import('cheerio');
8
+ const $ = load(html);
9
+ // Try Schema.org Recipe JSON-LD first
10
+ let recipe = null;
11
+ $('script[type="application/ld+json"]').each((_, el) => {
12
+ if (recipe)
13
+ return;
14
+ const raw = $(el).html() || '';
15
+ const parsed = tryParseJson(raw);
16
+ // Can be an array or direct object
17
+ const candidates = Array.isArray(parsed) ? parsed : [parsed];
18
+ for (const item of candidates) {
19
+ if (item?.['@type'] === 'Recipe' || (Array.isArray(item?.['@type']) && item['@type'].includes('Recipe'))) {
20
+ recipe = item;
21
+ break;
22
+ }
23
+ // Sometimes it's nested in @graph
24
+ if (item?.['@graph']) {
25
+ const graphRecipe = item['@graph'].find((g) => g?.['@type'] === 'Recipe');
26
+ if (graphRecipe) {
27
+ recipe = graphRecipe;
28
+ break;
29
+ }
30
+ }
31
+ }
32
+ });
33
+ let title;
34
+ let ingredients = [];
35
+ let instructions = [];
36
+ let prepTime = '';
37
+ let cookTime = '';
38
+ let totalTime = '';
39
+ let servings = '';
40
+ let rating = '';
41
+ let reviewCount = '';
42
+ let description = '';
43
+ if (recipe) {
44
+ title = recipe.name || '';
45
+ description = recipe.description || '';
46
+ ingredients = (recipe.recipeIngredient || []).map((i) => i.trim());
47
+ // Instructions can be strings or HowToStep objects
48
+ const rawInstructions = recipe.recipeInstructions || [];
49
+ for (const step of rawInstructions) {
50
+ if (typeof step === 'string')
51
+ instructions.push(step.trim());
52
+ else if (step.text)
53
+ instructions.push(step.text.trim());
54
+ else if (step['@type'] === 'HowToSection' && step.itemListElement) {
55
+ for (const s of step.itemListElement) {
56
+ if (s.text)
57
+ instructions.push(s.text.trim());
58
+ }
59
+ }
60
+ }
61
+ // Parse ISO 8601 duration (PT30M, PT1H30M)
62
+ const parseDuration = (d) => {
63
+ if (!d)
64
+ return '';
65
+ const h = d.match(/(\d+)H/)?.[1];
66
+ const m = d.match(/(\d+)M/)?.[1];
67
+ return [h ? `${h}h` : '', m ? `${m}m` : ''].filter(Boolean).join(' ');
68
+ };
69
+ prepTime = parseDuration(recipe.prepTime || '');
70
+ cookTime = parseDuration(recipe.cookTime || '');
71
+ totalTime = parseDuration(recipe.totalTime || '');
72
+ servings = String(recipe.recipeYield || '');
73
+ rating = recipe.aggregateRating?.ratingValue ? String(recipe.aggregateRating.ratingValue) : '';
74
+ reviewCount = recipe.aggregateRating?.reviewCount ? String(recipe.aggregateRating.reviewCount) : '';
75
+ }
76
+ else {
77
+ // HTML fallback
78
+ title = $('h1').first().text().trim() ||
79
+ $('meta[property="og:title"]').attr('content') || '';
80
+ description = $('meta[property="og:description"]').attr('content') || '';
81
+ $('[class*="ingredient"]').each((_, el) => {
82
+ const text = $(el).text().trim();
83
+ if (text && text.length < 200)
84
+ ingredients.push(text);
85
+ });
86
+ $('[class*="instruction"] li, [class*="step"] li').each((_, el) => {
87
+ const text = $(el).text().trim();
88
+ if (text)
89
+ instructions.push(text);
90
+ });
91
+ }
92
+ if (!title)
93
+ return null;
94
+ const structured = {
95
+ title, description, ingredients, instructions,
96
+ prepTime, cookTime, totalTime, servings, rating, reviewCount, url,
97
+ };
98
+ const timeParts = [
99
+ prepTime ? `Prep: ${prepTime}` : '',
100
+ cookTime ? `Cook: ${cookTime}` : '',
101
+ totalTime ? `Total: ${totalTime}` : '',
102
+ ].filter(Boolean).join(' | ');
103
+ const metaLine = [
104
+ timeParts,
105
+ servings ? `Servings: ${servings}` : '',
106
+ rating ? `Rating: ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '',
107
+ ].filter(Boolean).join(' | ');
108
+ const ingredientsMd = ingredients.length
109
+ ? `## Ingredients\n\n${ingredients.map(i => `- ${i}`).join('\n')}`
110
+ : '';
111
+ const instructionsMd = instructions.length
112
+ ? `## Instructions\n\n${instructions.map((s, i) => `${i + 1}. ${s}`).join('\n')}`
113
+ : '';
114
+ const cleanContent = `# 🍽️ ${title}\n\n${metaLine ? `*${metaLine}*\n\n` : ''}${description ? description + '\n\n' : ''}${ingredientsMd}\n\n${instructionsMd}`.trim();
115
+ return { domain: 'allrecipes.com', type: 'recipe', structured, cleanContent };
116
+ }
117
+ catch {
118
+ return null;
119
+ }
120
+ }
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function amazonExtractor(html: string, url: string): Promise<DomainExtractResult | null>;
@@ -0,0 +1,78 @@
1
+ import { tryParseJson } from './shared.js';
2
+ // ---------------------------------------------------------------------------
3
+ // 12. Amazon Products extractor
4
+ // ---------------------------------------------------------------------------
5
+ export async function amazonExtractor(html, url) {
6
+ try {
7
+ const { load } = await import('cheerio');
8
+ const $ = load(html);
9
+ // Extract from JSON-LD first
10
+ let jsonLdData = null;
11
+ $('script[type="application/ld+json"]').each((_, el) => {
12
+ if (jsonLdData)
13
+ return;
14
+ const raw = $(el).html() || '';
15
+ const parsed = tryParseJson(raw);
16
+ if (parsed?.['@type'] === 'Product')
17
+ jsonLdData = parsed;
18
+ });
19
+ // Meta tag fallbacks
20
+ const ogTitle = $('meta[property="og:title"]').attr('content') || '';
21
+ const ogDescription = $('meta[property="og:description"]').attr('content') || '';
22
+ const ogImage = $('meta[property="og:image"]').attr('content') || '';
23
+ // HTML selectors
24
+ const title = jsonLdData?.name ||
25
+ $('#productTitle').text().trim() ||
26
+ $('#title').text().trim() ||
27
+ ogTitle;
28
+ if (!title)
29
+ return null;
30
+ const priceWhole = $('#priceblock_ourprice').text().trim() ||
31
+ $('.a-price .a-offscreen').first().text().trim() ||
32
+ $('[data-asin-price]').first().attr('data-asin-price') || '';
33
+ const rating = jsonLdData?.aggregateRating?.ratingValue ||
34
+ $('#acrPopover .a-size-base.a-color-base').first().text().trim() ||
35
+ $('span[data-hook="rating-out-of-text"]').text().trim() || '';
36
+ const reviewCount = jsonLdData?.aggregateRating?.reviewCount ||
37
+ $('#acrCustomerReviewText').text().replace(/[^0-9,]/g, '').trim() || '';
38
+ const availability = jsonLdData?.offers?.availability?.replace('https://schema.org/', '') ||
39
+ $('#availability span').first().text().trim() || '';
40
+ const description = jsonLdData?.description ||
41
+ $('#feature-bullets .a-list-item').map((_, el) => $(el).text().trim()).get().join('\n') ||
42
+ $('#productDescription p').text().trim() ||
43
+ ogDescription;
44
+ const features = [];
45
+ $('#feature-bullets li').each((_, el) => {
46
+ const text = $(el).text().trim();
47
+ if (text && !text.includes('Make sure this fits'))
48
+ features.push(text);
49
+ });
50
+ // ASIN from URL
51
+ const asinMatch = url.match(/\/dp\/([A-Z0-9]{10})/i);
52
+ const asin = asinMatch?.[1] || '';
53
+ const structured = {
54
+ title,
55
+ price: priceWhole,
56
+ rating,
57
+ reviewCount,
58
+ availability,
59
+ description,
60
+ features,
61
+ asin,
62
+ image: ogImage,
63
+ url,
64
+ };
65
+ const ratingLine = rating ? `\n**Rating:** ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '';
66
+ const priceLine = priceWhole ? `\n**Price:** ${priceWhole}` : '';
67
+ const availLine = availability ? `\n**Availability:** ${availability}` : '';
68
+ const featuresSection = features.length
69
+ ? `\n\n## Features\n\n${features.map(f => `- ${f}`).join('\n')}`
70
+ : '';
71
+ const descSection = description ? `\n\n## Description\n\n${description.substring(0, 1000)}` : '';
72
+ const cleanContent = `# 🛒 ${title}${priceLine}${ratingLine}${availLine}${descSection}${featuresSection}`;
73
+ return { domain: 'amazon.com', type: 'product', structured, cleanContent };
74
+ }
75
+ catch {
76
+ return null;
77
+ }
78
+ }
@@ -0,0 +1,2 @@
1
+ import type { DomainExtractResult } from './types.js';
2
+ export declare function arxivExtractor(_html: string, url: string): Promise<DomainExtractResult | null>;