webpeel 0.21.85 → 0.21.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +13 -0
- package/dist/cli/utils.js +10 -1
- package/dist/core/http-fetch.js +19 -2
- package/dist/core/pipeline.js +3 -2
- package/dist/core/schema-templates.js +37 -24
- package/dist/core/search-provider.d.ts +2 -0
- package/dist/core/search-provider.js +9 -2
- package/dist/core/searxng-provider.d.ts +1 -0
- package/dist/core/searxng-provider.js +1 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/{core → ee}/challenge-solver.d.ts +1 -1
- package/dist/{core → ee}/challenge-solver.js +5 -5
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/{server/premium → ee}/domain-intel.d.ts +1 -1
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +255 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +115 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +162 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +308 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +176 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +103 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/{server/premium/index.js → ee/premium-hooks.js} +2 -2
- package/dist/{server/premium → ee}/swr-cache.d.ts +1 -1
- package/dist/{server/premium → ee}/swr-cache.js +1 -1
- package/dist/server/app.js +8 -0
- package/dist/server/bull-queues.d.ts +1 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.js +1 -0
- package/dist/server/routes/fetch.js +120 -2
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/smart-search.d.ts +16 -3
- package/dist/server/routes/smart-search.js +1875 -117
- package/dist/types.d.ts +4 -0
- package/package.json +13 -2
- package/dist/core/cloak-fetch.d.ts +0 -42
- package/dist/core/cloak-fetch.js +0 -148
- package/dist/core/cycle-fetch.d.ts +0 -26
- package/dist/core/cycle-fetch.js +0 -98
- package/dist/core/domain-extractors-basic.d.ts +0 -36
- package/dist/core/domain-extractors-basic.js +0 -28
- package/dist/core/domain-extractors-public.d.ts +0 -20
- package/dist/core/domain-extractors-public.js +0 -35
- package/dist/core/domain-extractors.d.ts +0 -48
- package/dist/core/domain-extractors.js +0 -6342
- package/dist/core/search-fallback.d.ts +0 -28
- package/dist/core/search-fallback.js +0 -209
- package/dist/core/stealth-patches.d.ts +0 -14
- package/dist/core/stealth-patches.js +0 -20
- package/dist/server/premium/challenge.d.ts +0 -1
- package/dist/server/premium/challenge.js +0 -1
- package/dist/server/premium/extractors.d.ts +0 -1
- package/dist/server/premium/extractors.js +0 -1
- /package/dist/{server/premium → ee}/domain-intel.js +0 -0
- /package/dist/{server/premium/index.d.ts → ee/premium-hooks.d.ts} +0 -0
- /package/dist/{server/premium → ee}/spa-detection.d.ts +0 -0
- /package/dist/{server/premium → ee}/spa-detection.js +0 -0
- /package/dist/{server/premium → ee}/stability.d.ts +0 -0
- /package/dist/{server/premium → ee}/stability.js +0 -0
|
@@ -289,6 +289,7 @@ export async function runFetch(url, options) {
|
|
|
289
289
|
format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
|
|
290
290
|
budget: null, // Budget excluded from cache key — cache stores full content
|
|
291
291
|
readable: options.readable || false,
|
|
292
|
+
noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
|
|
292
293
|
};
|
|
293
294
|
const cachedResult = getCache(url, cacheOptions);
|
|
294
295
|
if (cachedResult) {
|
|
@@ -603,6 +604,7 @@ export async function runFetch(url, options) {
|
|
|
603
604
|
headers,
|
|
604
605
|
cookies: options.cookie,
|
|
605
606
|
raw: options.raw || false,
|
|
607
|
+
noDomainApi: options.skipDomainApi || false,
|
|
606
608
|
lite: options.lite || false,
|
|
607
609
|
actions,
|
|
608
610
|
maxTokens: options.maxTokens,
|
|
@@ -724,6 +726,16 @@ export async function runFetch(url, options) {
|
|
|
724
726
|
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
725
727
|
: '';
|
|
726
728
|
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
|
|
729
|
+
// Smart hints — suggest features the user might not know about
|
|
730
|
+
if (!options.silent && !options.json && !options.skipDomainApi) {
|
|
731
|
+
if (result.method === 'domain-api') {
|
|
732
|
+
const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
|
|
733
|
+
console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
|
|
737
|
+
console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
|
|
738
|
+
}
|
|
727
739
|
}
|
|
728
740
|
// Show metadata header
|
|
729
741
|
const pageTitle = result.metadata?.title || result.title;
|
|
@@ -1176,6 +1188,7 @@ export function registerFetchCommands(program) {
|
|
|
1176
1188
|
.option('--images', 'Output image URLs from the page')
|
|
1177
1189
|
.option('--meta', 'Output only the page metadata (title, description, author, etc.)')
|
|
1178
1190
|
.option('--raw', 'Return full page without smart content extraction')
|
|
1191
|
+
.option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
|
|
1179
1192
|
.option('--full', 'Alias for --raw — full page content, no budget')
|
|
1180
1193
|
.option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
|
|
1181
1194
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
package/dist/cli/utils.js
CHANGED
|
@@ -35,7 +35,14 @@ export async function checkForUpdates() {
|
|
|
35
35
|
if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
|
|
36
36
|
// Skip update notice in silent mode
|
|
37
37
|
if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
|
|
38
|
-
|
|
38
|
+
const msg = `Update available: ${cliVersion} → ${latest}`;
|
|
39
|
+
const cmd = 'npm i -g webpeel@latest';
|
|
40
|
+
const width = Math.max(msg.length, cmd.length) + 4;
|
|
41
|
+
const line = '─'.repeat(width);
|
|
42
|
+
console.error(`\n\x1b[33m╭${line}╮\x1b[0m`);
|
|
43
|
+
console.error(`\x1b[33m│\x1b[0m ${msg.padEnd(width - 2)} \x1b[33m│\x1b[0m`);
|
|
44
|
+
console.error(`\x1b[33m│\x1b[0m Run: \x1b[36m${cmd}\x1b[0m${' '.repeat(width - 6 - cmd.length)} \x1b[33m│\x1b[0m`);
|
|
45
|
+
console.error(`\x1b[33m╰${line}╯\x1b[0m\n`);
|
|
39
46
|
}
|
|
40
47
|
}
|
|
41
48
|
}
|
|
@@ -208,6 +215,8 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
208
215
|
params.set('budget', String(options.budget));
|
|
209
216
|
if (options.question)
|
|
210
217
|
params.set('question', options.question);
|
|
218
|
+
if (options.noDomainApi)
|
|
219
|
+
params.set('noDomainApi', 'true');
|
|
211
220
|
const res = await fetch(`${apiUrl}/v1/fetch?${params}`, {
|
|
212
221
|
headers: { Authorization: `Bearer ${apiKey}` },
|
|
213
222
|
signal: AbortSignal.timeout(60000),
|
package/dist/core/http-fetch.js
CHANGED
|
@@ -154,10 +154,12 @@ export function createAbortError() {
|
|
|
154
154
|
* proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
|
|
155
155
|
*/
|
|
156
156
|
export const PROXY_PREFERRED_DOMAINS = [
|
|
157
|
+
// Social / content
|
|
157
158
|
'reddit.com',
|
|
158
159
|
'old.reddit.com',
|
|
159
160
|
'forbes.com',
|
|
160
161
|
'fortune.com',
|
|
162
|
+
// Auto / cars
|
|
161
163
|
'cargurus.com',
|
|
162
164
|
'edmunds.com',
|
|
163
165
|
'cars.com',
|
|
@@ -165,14 +167,29 @@ export const PROXY_PREFERRED_DOMAINS = [
|
|
|
165
167
|
'autotrader.com',
|
|
166
168
|
'carfax.com',
|
|
167
169
|
'tesla.com',
|
|
170
|
+
'motortrend.com',
|
|
171
|
+
'jdpower.com',
|
|
172
|
+
// Finance / home
|
|
168
173
|
'nerdwallet.com',
|
|
169
174
|
'bankrate.com',
|
|
170
175
|
'homeadvisor.com',
|
|
171
176
|
'angi.com',
|
|
177
|
+
// EV / auto news
|
|
172
178
|
'insideevs.com',
|
|
173
179
|
'electrek.co',
|
|
174
|
-
|
|
175
|
-
'
|
|
180
|
+
// Restaurants / food
|
|
181
|
+
'yelp.com',
|
|
182
|
+
// Travel
|
|
183
|
+
'kayak.com',
|
|
184
|
+
'booking.com',
|
|
185
|
+
'expedia.com',
|
|
186
|
+
'tripadvisor.com',
|
|
187
|
+
'hotels.com',
|
|
188
|
+
// Shopping / products
|
|
189
|
+
'amazon.com',
|
|
190
|
+
'bestbuy.com',
|
|
191
|
+
'walmart.com',
|
|
192
|
+
'target.com',
|
|
176
193
|
];
|
|
177
194
|
/**
|
|
178
195
|
* Returns true if the URL's domain is on the proxy-preferred blocklist.
|
package/dist/core/pipeline.js
CHANGED
|
@@ -341,7 +341,8 @@ export async function fetchContent(ctx) {
|
|
|
341
341
|
const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
|
|
342
342
|
// Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
|
|
343
343
|
// This avoids expensive browser fetches that often get blocked
|
|
344
|
-
if
|
|
344
|
+
// Skip if noDomainApi is set — user wants raw page content, not API shortcut
|
|
345
|
+
if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
|
|
345
346
|
try {
|
|
346
347
|
ctx.timer.mark('domainApiFirst');
|
|
347
348
|
const ddResult = await runDomainExtract('', ctx.url);
|
|
@@ -1078,7 +1079,7 @@ export async function postProcess(ctx) {
|
|
|
1078
1079
|
}
|
|
1079
1080
|
// Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
|
|
1080
1081
|
// Fires when URL matches a known domain. Replaces content with clean markdown.
|
|
1081
|
-
if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
|
|
1082
|
+
if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
|
|
1082
1083
|
try {
|
|
1083
1084
|
ctx.timer.mark('domainExtract');
|
|
1084
1085
|
// Try raw HTML first, then fall back to readability-processed content
|
|
@@ -57,42 +57,55 @@ export const SCHEMA_TEMPLATES = {
|
|
|
57
57
|
name: 'Event',
|
|
58
58
|
description: 'Extract event information',
|
|
59
59
|
fields: {
|
|
60
|
-
name: '
|
|
61
|
-
date: 'event
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
price: '
|
|
65
|
-
|
|
66
|
-
|
|
60
|
+
name: 'What is the name of this event?',
|
|
61
|
+
date: 'When does this event take place?',
|
|
62
|
+
time: 'What time does this event start?',
|
|
63
|
+
location: 'Where is this event held?',
|
|
64
|
+
price: 'How much does this event cost?',
|
|
65
|
+
description: 'What is this event about?',
|
|
66
|
+
organizer: 'Who is organizing this event?',
|
|
67
67
|
},
|
|
68
68
|
},
|
|
69
69
|
recipe: {
|
|
70
70
|
name: 'Recipe',
|
|
71
71
|
description: 'Extract recipe information from cooking sites',
|
|
72
72
|
fields: {
|
|
73
|
-
|
|
74
|
-
ingredients: '
|
|
75
|
-
|
|
76
|
-
prepTime: 'preparation
|
|
77
|
-
cookTime: 'cooking
|
|
78
|
-
servings: '
|
|
79
|
-
calories: 'calories per serving',
|
|
80
|
-
|
|
73
|
+
name: 'What is the name of this recipe?',
|
|
74
|
+
ingredients: 'What ingredients are needed? List all.',
|
|
75
|
+
steps: 'What are the cooking steps or instructions?',
|
|
76
|
+
prepTime: 'How long does preparation take?',
|
|
77
|
+
cookTime: 'How long does cooking take?',
|
|
78
|
+
servings: 'How many servings does this recipe make?',
|
|
79
|
+
calories: 'How many calories per serving?',
|
|
80
|
+
rating: 'What is the recipe rating?',
|
|
81
81
|
},
|
|
82
82
|
},
|
|
83
83
|
job: {
|
|
84
84
|
name: 'Job',
|
|
85
85
|
description: 'Extract job posting information',
|
|
86
86
|
fields: {
|
|
87
|
-
title: 'job title',
|
|
88
|
-
company: 'company
|
|
89
|
-
location: 'job
|
|
90
|
-
salary: 'salary
|
|
91
|
-
|
|
92
|
-
requirements: '
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
87
|
+
title: 'What is the job title?',
|
|
88
|
+
company: 'What company is hiring?',
|
|
89
|
+
location: 'Where is the job located?',
|
|
90
|
+
salary: 'What is the salary or compensation range?',
|
|
91
|
+
type: 'Is this full-time, part-time, contract, or remote?',
|
|
92
|
+
requirements: 'What are the key requirements or qualifications?',
|
|
93
|
+
description: 'What is the job description?',
|
|
94
|
+
applyUrl: 'What is the URL or method to apply?',
|
|
95
|
+
},
|
|
96
|
+
},
|
|
97
|
+
business: {
|
|
98
|
+
name: 'Business',
|
|
99
|
+
description: 'Extract business/company information',
|
|
100
|
+
fields: {
|
|
101
|
+
name: 'What is the business name?',
|
|
102
|
+
address: 'What is the full address?',
|
|
103
|
+
phone: 'What is the phone number?',
|
|
104
|
+
hours: 'What are the business hours?',
|
|
105
|
+
rating: 'What is the business rating?',
|
|
106
|
+
reviewCount: 'How many reviews does this business have?',
|
|
107
|
+
website: 'What is the business website URL?',
|
|
108
|
+
categories: 'What type of business is this?',
|
|
96
109
|
},
|
|
97
110
|
},
|
|
98
111
|
review: {
|
|
@@ -19,6 +19,8 @@ export interface WebSearchResult {
|
|
|
19
19
|
snippet: string;
|
|
20
20
|
/** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
|
|
21
21
|
relevanceScore?: number;
|
|
22
|
+
/** Thumbnail/image URL from SearXNG results (img_src or thumbnail field). */
|
|
23
|
+
imageUrl?: string;
|
|
22
24
|
}
|
|
23
25
|
export interface WebSearchOptions {
|
|
24
26
|
/** Number of results (1-10) */
|
|
@@ -1066,8 +1066,15 @@ export class DuckDuckGoProvider {
|
|
|
1066
1066
|
if (searxResults.length > 0) {
|
|
1067
1067
|
providerStats.record('searxng', true);
|
|
1068
1068
|
log.debug(`source=searxng returned ${searxResults.length} results`);
|
|
1069
|
-
|
|
1070
|
-
|
|
1069
|
+
// Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
|
|
1070
|
+
const mapped = searxResults.map(r => ({
|
|
1071
|
+
title: r.title,
|
|
1072
|
+
url: r.url,
|
|
1073
|
+
snippet: r.description ?? '',
|
|
1074
|
+
imageUrl: r.imageUrl,
|
|
1075
|
+
}));
|
|
1076
|
+
const filtered = filterRelevantResults(mapped, query);
|
|
1077
|
+
return filtered.length > 0 ? filtered : mapped;
|
|
1071
1078
|
}
|
|
1072
1079
|
providerStats.record('searxng', false);
|
|
1073
1080
|
log.debug('SearXNG returned 0 results, falling through to DDG');
|
|
@@ -69,6 +69,7 @@ export async function searchViaSearXNG(query, options = {}) {
|
|
|
69
69
|
description: r.content ?? undefined,
|
|
70
70
|
publishedDate: r.publishedDate ?? undefined,
|
|
71
71
|
score: r.score ?? undefined,
|
|
72
|
+
imageUrl: r.img_src ?? r.thumbnail ?? undefined,
|
|
72
73
|
});
|
|
73
74
|
if (output.length >= count)
|
|
74
75
|
break;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { solveChallenge } from './challenge-solver.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { solveChallenge } from './challenge-solver.js';
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
* // result.cookies = ["cf_clearance=...", ...]
|
|
18
18
|
* }
|
|
19
19
|
*/
|
|
20
|
-
import type { ChallengeType } from '
|
|
20
|
+
import type { ChallengeType } from '../core/challenge-detection.js';
|
|
21
21
|
export interface ImageCaptchaResult {
|
|
22
22
|
solved: boolean;
|
|
23
23
|
rounds: number;
|
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
* // result.cookies = ["cf_clearance=...", ...]
|
|
18
18
|
* }
|
|
19
19
|
*/
|
|
20
|
-
import { cacheCookiesForUrl } from '
|
|
21
|
-
import { createLogger } from '
|
|
20
|
+
import { cacheCookiesForUrl } from '../core/cookie-cache.js';
|
|
21
|
+
import { createLogger } from '../core/logger.js';
|
|
22
22
|
const log = createLogger('challenge-solver');
|
|
23
23
|
// ── Image CAPTCHA solver constants ────────────────────────────────────────────
|
|
24
24
|
const OLLAMA_VISION_URL = 'http://178.156.229.86:11435/api/generate';
|
|
@@ -372,7 +372,7 @@ export async function solveChallenge(url, challengeType, html, options = {}) {
|
|
|
372
372
|
async function solveCaptchaWithVision(url, _html, timeoutMs, proxy) {
|
|
373
373
|
let page = null;
|
|
374
374
|
try {
|
|
375
|
-
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('
|
|
375
|
+
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
|
|
376
376
|
const browser = await getStealthBrowser();
|
|
377
377
|
const vp = getRandomViewport();
|
|
378
378
|
const ctx = await browser.newContext({
|
|
@@ -446,7 +446,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
|
|
|
446
446
|
let browser = null;
|
|
447
447
|
let page = null;
|
|
448
448
|
try {
|
|
449
|
-
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('
|
|
449
|
+
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
|
|
450
450
|
browser = await getStealthBrowser();
|
|
451
451
|
const vp = getRandomViewport();
|
|
452
452
|
const ctx = await browser.newContext({
|
|
@@ -528,7 +528,7 @@ async function solveCloudflare(url, _html, timeoutMs, proxy) {
|
|
|
528
528
|
async function solveWithStealthBrowser(url, _html, timeoutMs, proxy, challengeType) {
|
|
529
529
|
let page = null;
|
|
530
530
|
try {
|
|
531
|
-
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('
|
|
531
|
+
const { getStealthBrowser, getRandomUserAgent, getRandomViewport, applyStealthScripts } = await import('../core/browser-pool.js');
|
|
532
532
|
const browser = await getStealthBrowser();
|
|
533
533
|
const vp = getRandomViewport();
|
|
534
534
|
const ctx = await browser.newContext({
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain-aware structured extractors for WebPeel.
|
|
3
|
+
*
|
|
4
|
+
* This file re-exports from individual extractor files for backward compatibility.
|
|
5
|
+
* Each extractor now lives in its own file under src/ee/extractors/.
|
|
6
|
+
*/
|
|
7
|
+
export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
|
|
8
|
+
export type { DomainExtractResult, DomainExtractor } from './extractors/index.js';
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain-aware structured extractors for WebPeel.
|
|
3
|
+
*
|
|
4
|
+
* This file re-exports from individual extractor files for backward compatibility.
|
|
5
|
+
* Each extractor now lives in its own file under src/ee/extractors/.
|
|
6
|
+
*/
|
|
7
|
+
// Re-exported from individual extractor files for backward compatibility
|
|
8
|
+
export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
|
|
@@ -11,6 +11,6 @@
|
|
|
11
11
|
*
|
|
12
12
|
* This module is NOT shipped in the npm package.
|
|
13
13
|
*/
|
|
14
|
-
import type { StrategyHooks } from '
|
|
14
|
+
import type { StrategyHooks } from '../core/strategy-hooks.js';
|
|
15
15
|
export declare function clearDomainIntel(): void;
|
|
16
16
|
export declare function createDomainIntelHooks(): Pick<StrategyHooks, 'getDomainRecommendation' | 'recordDomainResult'>;
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { tryParseJson } from './shared.js';
|
|
2
|
+
// ---------------------------------------------------------------------------
|
|
3
|
+
// 15. Allrecipes (Recipe Sites) extractor
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
/**
 * Extracts a recipe from an Allrecipes-style page.
 *
 * Prefers the Schema.org `Recipe` object embedded as JSON-LD. When no such
 * object is found, falls back to scraping the title, description,
 * ingredients and instructions from the markup.
 *
 * @param {string} html - Raw HTML of the recipe page.
 * @param {string} url - Page URL; echoed back in `structured.url`.
 * @returns {Promise<object | null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when no title can be determined or anything throws while
 *   parsing (the extractor is best-effort by design).
 */
export async function allrecipesExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Use the first JSON-LD script that contains a Recipe node.
        let recipe = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (recipe) {
                return;
            }
            recipe = findRecipeNode(tryParseJson($(el).html() || ''));
        });
        let title;
        let description = '';
        let ingredients = [];
        let instructions = [];
        let prepTime = '';
        let cookTime = '';
        let totalTime = '';
        let servings = '';
        let rating = '';
        let reviewCount = '';
        if (recipe) {
            title = recipe.name || '';
            description = recipe.description || '';
            // Non-string entries are dropped instead of aborting the whole extraction.
            ingredients = asRecipeList(recipe.recipeIngredient)
                .filter((entry) => typeof entry === 'string')
                .map((entry) => entry.trim());
            instructions = collectRecipeInstructions(recipe.recipeInstructions);
            prepTime = parseRecipeDuration(recipe.prepTime);
            cookTime = parseRecipeDuration(recipe.cookTime);
            totalTime = parseRecipeDuration(recipe.totalTime);
            servings = String(recipe.recipeYield || '');
            const aggregate = recipe.aggregateRating;
            rating = aggregate?.ratingValue ? String(aggregate.ratingValue) : '';
            reviewCount = aggregate?.reviewCount ? String(aggregate.reviewCount) : '';
        }
        else {
            // HTML fallback
            title = $('h1').first().text().trim() ||
                $('meta[property="og:title"]').attr('content') || '';
            description = $('meta[property="og:description"]').attr('content') || '';
            $('[class*="ingredient"]').each((_, el) => {
                const text = $(el).text().trim();
                // Long matches are wrapper elements, not individual ingredients.
                if (text && text.length < 200) {
                    ingredients.push(text);
                }
            });
            $('[class*="instruction"] li, [class*="step"] li').each((_, el) => {
                const text = $(el).text().trim();
                if (text) {
                    instructions.push(text);
                }
            });
        }
        if (!title) {
            return null;
        }
        const structured = {
            title, description, ingredients, instructions,
            prepTime, cookTime, totalTime, servings, rating, reviewCount, url,
        };
        const timeParts = [
            prepTime ? `Prep: ${prepTime}` : '',
            cookTime ? `Cook: ${cookTime}` : '',
            totalTime ? `Total: ${totalTime}` : '',
        ].filter(Boolean).join(' | ');
        const metaLine = [
            timeParts,
            servings ? `Servings: ${servings}` : '',
            rating ? `Rating: ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '',
        ].filter(Boolean).join(' | ');
        const ingredientsMd = ingredients.length
            ? `## Ingredients\n\n${ingredients.map((item) => `- ${item}`).join('\n')}`
            : '';
        const instructionsMd = instructions.length
            ? `## Instructions\n\n${instructions.map((step, index) => `${index + 1}. ${step}`).join('\n')}`
            : '';
        // Join only the sections that exist so a missing section does not
        // leave a run of blank lines in the markdown.
        const cleanContent = [
            `# 🍽️ ${title}`,
            metaLine ? `*${metaLine}*` : '',
            description,
            ingredientsMd,
            instructionsMd,
        ].filter(Boolean).join('\n\n').trim();
        return { domain: 'allrecipes.com', type: 'recipe', structured, cleanContent };
    }
    catch {
        // Best-effort: any failure (cheerio unavailable, unexpected data shape)
        // yields null rather than an exception.
        return null;
    }
}

/** Returns true when a JSON-LD node declares `@type` Recipe (string or array form). */
function isRecipeNode(node) {
    const type = node?.['@type'];
    return type === 'Recipe' || (Array.isArray(type) && type.includes('Recipe'));
}

/** Finds the first Recipe node in a parsed JSON-LD document, looking inside `@graph` too. */
function findRecipeNode(parsed) {
    const candidates = Array.isArray(parsed) ? parsed : [parsed];
    for (const item of candidates) {
        if (isRecipeNode(item)) {
            return item;
        }
        const graph = item?.['@graph'];
        if (Array.isArray(graph)) {
            const nested = graph.find((entry) => isRecipeNode(entry));
            if (nested) {
                return nested;
            }
        }
    }
    return null;
}

/** Wraps a scalar in an array; null, undefined and '' become an empty list. */
function asRecipeList(value) {
    if (value == null || value === '') {
        return [];
    }
    return Array.isArray(value) ? value : [value];
}

/**
 * Flattens `recipeInstructions` into a list of step strings. Accepts a single
 * string, a single step object, or an array of strings / HowToStep /
 * HowToSection objects.
 */
function collectRecipeInstructions(raw) {
    const steps = [];
    for (const step of asRecipeList(raw)) {
        if (typeof step === 'string') {
            steps.push(step.trim());
        }
        else if (step?.text) {
            steps.push(String(step.text).trim());
        }
        else if (step?.['@type'] === 'HowToSection' && Array.isArray(step.itemListElement)) {
            for (const inner of step.itemListElement) {
                if (inner?.text) {
                    steps.push(String(inner.text).trim());
                }
            }
        }
    }
    return steps;
}

/**
 * Converts an ISO 8601 duration (PT30M, PT1H30M, P0DT1H30M) into "1h 30m".
 * Only the time part after `T` is read, so the month designator in the date
 * part is never mistaken for minutes; zero-valued components are omitted.
 */
function parseRecipeDuration(value) {
    if (typeof value !== 'string' || !value) {
        return '';
    }
    const separator = value.indexOf('T');
    // Strings without a `T` are read whole, which keeps bare forms like "30M" working.
    const timePart = separator === -1 ? value : value.slice(separator + 1);
    const hours = Number(timePart.match(/(\d+)H/)?.[1] ?? 0);
    const minutes = Number(timePart.match(/(\d+)M/)?.[1] ?? 0);
    return [hours > 0 ? `${hours}h` : '', minutes > 0 ? `${minutes}m` : '']
        .filter(Boolean)
        .join(' ');
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { tryParseJson } from './shared.js';
|
|
2
|
+
// ---------------------------------------------------------------------------
|
|
3
|
+
// 12. Amazon Products extractor
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
/**
 * Extracts product details from an Amazon product page.
 *
 * Each field is taken from the first source that yields a value: the
 * Schema.org `Product` JSON-LD object, then known Amazon element selectors,
 * then Open Graph meta tags.
 *
 * @param {string} html - Raw HTML of the product page.
 * @param {string} url - Page URL; used to read the ASIN from `/dp/<ASIN>` and
 *   echoed back in `structured.url`.
 * @returns {Promise<object | null>} `{ domain, type, structured, cleanContent }`,
 *   or `null` when no title can be determined or anything throws while
 *   parsing (the extractor is best-effort by design).
 */
export async function amazonExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Use the first JSON-LD script that contains a Product node.
        let product = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (product) {
                return;
            }
            product = findProductNode(tryParseJson($(el).html() || ''));
        });
        // Meta tag fallbacks
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        const ogImage = $('meta[property="og:image"]').attr('content') || '';
        const title = product?.name ||
            $('#productTitle').text().trim() ||
            $('#title').text().trim() ||
            ogTitle;
        if (!title) {
            return null;
        }
        const price = $('#priceblock_ourprice').text().trim() ||
            $('.a-price .a-offscreen').first().text().trim() ||
            $('[data-asin-price]').first().attr('data-asin-price') || '';
        const rating = product?.aggregateRating?.ratingValue ||
            $('#acrPopover .a-size-base.a-color-base').first().text().trim() ||
            $('span[data-hook="rating-out-of-text"]').text().trim() || '';
        const reviewCount = product?.aggregateRating?.reviewCount ||
            $('#acrCustomerReviewText').text().replace(/[^0-9,]/g, '').trim() || '';
        const availability = readProductAvailability(product?.offers) ||
            $('#availability span').first().text().trim() || '';
        const description = product?.description ||
            $('#feature-bullets .a-list-item').map((_, el) => $(el).text().trim()).get().join('\n') ||
            $('#productDescription p').text().trim() ||
            ogDescription;
        const features = [];
        $('#feature-bullets li').each((_, el) => {
            const text = $(el).text().trim();
            // Skip the "Make sure this fits" bullet; it is not a product feature.
            if (text && !text.includes('Make sure this fits')) {
                features.push(text);
            }
        });
        // ASIN from URL
        const asin = url.match(/\/dp\/([A-Z0-9]{10})/i)?.[1] || '';
        const structured = {
            title,
            price,
            rating,
            reviewCount,
            availability,
            description,
            features,
            asin,
            image: ogImage,
            url,
        };
        const ratingLine = rating ? `\n**Rating:** ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '';
        const priceLine = price ? `\n**Price:** ${price}` : '';
        const availLine = availability ? `\n**Availability:** ${availability}` : '';
        const featuresSection = features.length
            ? `\n\n## Features\n\n${features.map((feature) => `- ${feature}`).join('\n')}`
            : '';
        // String() guards against a non-string JSON-LD description; the
        // markdown copy is capped at 1000 characters.
        const descSection = description ? `\n\n## Description\n\n${String(description).substring(0, 1000)}` : '';
        const cleanContent = `# 🛒 ${title}${priceLine}${ratingLine}${availLine}${descSection}${featuresSection}`;
        return { domain: 'amazon.com', type: 'product', structured, cleanContent };
    }
    catch {
        // Best-effort: any failure (cheerio unavailable, unexpected data shape)
        // yields null rather than an exception.
        return null;
    }
}

/**
 * Finds the first Product node in a parsed JSON-LD document. Accepts a single
 * object or an array of objects, and `@type` as a string or an array.
 */
function findProductNode(parsed) {
    const candidates = Array.isArray(parsed) ? parsed : [parsed];
    for (const item of candidates) {
        const type = item?.['@type'];
        if (type === 'Product' || (Array.isArray(type) && type.includes('Product'))) {
            return item;
        }
    }
    return null;
}

/**
 * Reads `availability` from a JSON-LD `offers` value (object or array of
 * offers) and strips the schema.org URL prefix in either http or https form.
 * Returns '' when no string availability is present.
 */
function readProductAvailability(offers) {
    const offer = Array.isArray(offers)
        ? offers.find((entry) => entry?.availability)
        : offers;
    const value = offer?.availability;
    if (typeof value !== 'string') {
        return '';
    }
    return value.replace(/^https?:\/\/schema\.org\//, '');
}
|