webpeel 0.21.86 → 0.21.87
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +13 -0
- package/dist/cli/utils.js +10 -1
- package/dist/core/http-fetch.js +19 -2
- package/dist/core/pipeline.js +3 -2
- package/dist/core/schema-templates.js +37 -24
- package/dist/core/search-provider.d.ts +2 -0
- package/dist/core/search-provider.js +9 -2
- package/dist/core/searxng-provider.d.ts +1 -0
- package/dist/core/searxng-provider.js +1 -0
- package/dist/ee/domain-extractors.d.ts +4 -44
- package/dist/ee/domain-extractors.js +4 -6338
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +255 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +115 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +162 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +308 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +176 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +103 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/server/app.js +8 -0
- package/dist/server/bull-queues.d.ts +1 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.js +1 -0
- package/dist/server/routes/fetch.js +120 -2
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/smart-search.d.ts +5 -3
- package/dist/server/routes/smart-search.js +1842 -141
- package/dist/types.d.ts +4 -0
- package/package.json +12 -2
- package/dist/core/challenge-solver.d.ts +0 -72
- package/dist/core/challenge-solver.js +0 -720
- package/dist/core/cloak-fetch.d.ts +0 -42
- package/dist/core/cloak-fetch.js +0 -148
- package/dist/core/cycle-fetch.d.ts +0 -26
- package/dist/core/cycle-fetch.js +0 -98
- package/dist/core/domain-extractors-basic.d.ts +0 -36
- package/dist/core/domain-extractors-basic.js +0 -28
- package/dist/core/domain-extractors-public.d.ts +0 -20
- package/dist/core/domain-extractors-public.js +0 -35
- package/dist/core/domain-extractors.d.ts +0 -48
- package/dist/core/domain-extractors.js +0 -6342
- package/dist/core/search-fallback.d.ts +0 -28
- package/dist/core/search-fallback.js +0 -209
- package/dist/core/stealth-patches.d.ts +0 -14
- package/dist/core/stealth-patches.js +0 -20
- package/dist/server/premium/challenge.d.ts +0 -1
- package/dist/server/premium/challenge.js +0 -1
- package/dist/server/premium/domain-intel.d.ts +0 -16
- package/dist/server/premium/domain-intel.js +0 -133
- package/dist/server/premium/extractors.d.ts +0 -1
- package/dist/server/premium/extractors.js +0 -1
- package/dist/server/premium/index.d.ts +0 -20
- package/dist/server/premium/index.js +0 -50
- package/dist/server/premium/spa-detection.d.ts +0 -2
- package/dist/server/premium/spa-detection.js +0 -2
- package/dist/server/premium/stability.d.ts +0 -4
- package/dist/server/premium/stability.js +0 -29
- package/dist/server/premium/swr-cache.d.ts +0 -14
- package/dist/server/premium/swr-cache.js +0 -34
|
@@ -289,6 +289,7 @@ export async function runFetch(url, options) {
|
|
|
289
289
|
format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
|
|
290
290
|
budget: null, // Budget excluded from cache key — cache stores full content
|
|
291
291
|
readable: options.readable || false,
|
|
292
|
+
noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
|
|
292
293
|
};
|
|
293
294
|
const cachedResult = getCache(url, cacheOptions);
|
|
294
295
|
if (cachedResult) {
|
|
@@ -603,6 +604,7 @@ export async function runFetch(url, options) {
|
|
|
603
604
|
headers,
|
|
604
605
|
cookies: options.cookie,
|
|
605
606
|
raw: options.raw || false,
|
|
607
|
+
noDomainApi: options.skipDomainApi || false,
|
|
606
608
|
lite: options.lite || false,
|
|
607
609
|
actions,
|
|
608
610
|
maxTokens: options.maxTokens,
|
|
@@ -724,6 +726,16 @@ export async function runFetch(url, options) {
|
|
|
724
726
|
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
725
727
|
: '';
|
|
726
728
|
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
|
|
729
|
+
// Smart hints — suggest features the user might not know about
|
|
730
|
+
if (!options.silent && !options.json && !options.skipDomainApi) {
|
|
731
|
+
if (result.method === 'domain-api') {
|
|
732
|
+
const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
|
|
733
|
+
console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
|
|
737
|
+
console.error(`\x1b[33m💡 Tip: Page returned very little content. Try --render for JavaScript-heavy sites or --stealth if blocked.\x1b[0m`);
|
|
738
|
+
}
|
|
727
739
|
}
|
|
728
740
|
// Show metadata header
|
|
729
741
|
const pageTitle = result.metadata?.title || result.title;
|
|
@@ -1176,6 +1188,7 @@ export function registerFetchCommands(program) {
|
|
|
1176
1188
|
.option('--images', 'Output image URLs from the page')
|
|
1177
1189
|
.option('--meta', 'Output only the page metadata (title, description, author, etc.)')
|
|
1178
1190
|
.option('--raw', 'Return full page without smart content extraction')
|
|
1191
|
+
.option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
|
|
1179
1192
|
.option('--full', 'Alias for --raw — full page content, no budget')
|
|
1180
1193
|
.option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
|
|
1181
1194
|
.option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
|
package/dist/cli/utils.js
CHANGED
|
@@ -35,7 +35,14 @@ export async function checkForUpdates() {
|
|
|
35
35
|
if (latest && latest !== cliVersion && cliVersion !== '0.0.0') {
|
|
36
36
|
// Skip update notice in silent mode
|
|
37
37
|
if (process.env.WEBPEEL_LOG_LEVEL !== 'silent') {
|
|
38
|
-
|
|
38
|
+
const msg = `Update available: ${cliVersion} → ${latest}`;
|
|
39
|
+
const cmd = 'npm i -g webpeel@latest';
|
|
40
|
+
const width = Math.max(msg.length, cmd.length) + 4;
|
|
41
|
+
const line = '─'.repeat(width);
|
|
42
|
+
console.error(`\n\x1b[33m╭${line}╮\x1b[0m`);
|
|
43
|
+
console.error(`\x1b[33m│\x1b[0m ${msg.padEnd(width - 2)} \x1b[33m│\x1b[0m`);
|
|
44
|
+
console.error(`\x1b[33m│\x1b[0m Run: \x1b[36m${cmd}\x1b[0m${' '.repeat(width - 6 - cmd.length)} \x1b[33m│\x1b[0m`);
|
|
45
|
+
console.error(`\x1b[33m╰${line}╯\x1b[0m\n`);
|
|
39
46
|
}
|
|
40
47
|
}
|
|
41
48
|
}
|
|
@@ -208,6 +215,8 @@ export async function fetchViaApi(url, options, apiKey, apiUrl) {
|
|
|
208
215
|
params.set('budget', String(options.budget));
|
|
209
216
|
if (options.question)
|
|
210
217
|
params.set('question', options.question);
|
|
218
|
+
if (options.noDomainApi)
|
|
219
|
+
params.set('noDomainApi', 'true');
|
|
211
220
|
const res = await fetch(`${apiUrl}/v1/fetch?${params}`, {
|
|
212
221
|
headers: { Authorization: `Bearer ${apiKey}` },
|
|
213
222
|
signal: AbortSignal.timeout(60000),
|
package/dist/core/http-fetch.js
CHANGED
|
@@ -154,10 +154,12 @@ export function createAbortError() {
|
|
|
154
154
|
* proxy when proxy credentials are configured (WEBSHARE_PROXY_* env vars).
|
|
155
155
|
*/
|
|
156
156
|
export const PROXY_PREFERRED_DOMAINS = [
|
|
157
|
+
// Social / content
|
|
157
158
|
'reddit.com',
|
|
158
159
|
'old.reddit.com',
|
|
159
160
|
'forbes.com',
|
|
160
161
|
'fortune.com',
|
|
162
|
+
// Auto / cars
|
|
161
163
|
'cargurus.com',
|
|
162
164
|
'edmunds.com',
|
|
163
165
|
'cars.com',
|
|
@@ -165,14 +167,29 @@ export const PROXY_PREFERRED_DOMAINS = [
|
|
|
165
167
|
'autotrader.com',
|
|
166
168
|
'carfax.com',
|
|
167
169
|
'tesla.com',
|
|
170
|
+
'motortrend.com',
|
|
171
|
+
'jdpower.com',
|
|
172
|
+
// Finance / home
|
|
168
173
|
'nerdwallet.com',
|
|
169
174
|
'bankrate.com',
|
|
170
175
|
'homeadvisor.com',
|
|
171
176
|
'angi.com',
|
|
177
|
+
// EV / auto news
|
|
172
178
|
'insideevs.com',
|
|
173
179
|
'electrek.co',
|
|
174
|
-
|
|
175
|
-
'
|
|
180
|
+
// Restaurants / food
|
|
181
|
+
'yelp.com',
|
|
182
|
+
// Travel
|
|
183
|
+
'kayak.com',
|
|
184
|
+
'booking.com',
|
|
185
|
+
'expedia.com',
|
|
186
|
+
'tripadvisor.com',
|
|
187
|
+
'hotels.com',
|
|
188
|
+
// Shopping / products
|
|
189
|
+
'amazon.com',
|
|
190
|
+
'bestbuy.com',
|
|
191
|
+
'walmart.com',
|
|
192
|
+
'target.com',
|
|
176
193
|
];
|
|
177
194
|
/**
|
|
178
195
|
* Returns true if the URL's domain is on the proxy-preferred blocklist.
|
package/dist/core/pipeline.js
CHANGED
|
@@ -341,7 +341,8 @@ export async function fetchContent(ctx) {
|
|
|
341
341
|
const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
|
|
342
342
|
// Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
|
|
343
343
|
// This avoids expensive browser fetches that often get blocked
|
|
344
|
-
if
|
|
344
|
+
// Skip if noDomainApi is set — user wants raw page content, not API shortcut
|
|
345
|
+
if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
|
|
345
346
|
try {
|
|
346
347
|
ctx.timer.mark('domainApiFirst');
|
|
347
348
|
const ddResult = await runDomainExtract('', ctx.url);
|
|
@@ -1078,7 +1079,7 @@ export async function postProcess(ctx) {
|
|
|
1078
1079
|
}
|
|
1079
1080
|
// Domain-aware structured extraction (Twitter, Reddit, GitHub, HN)
|
|
1080
1081
|
// Fires when URL matches a known domain. Replaces content with clean markdown.
|
|
1081
|
-
if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled) {
|
|
1082
|
+
if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
|
|
1082
1083
|
try {
|
|
1083
1084
|
ctx.timer.mark('domainExtract');
|
|
1084
1085
|
// Try raw HTML first, then fall back to readability-processed content
|
|
@@ -57,42 +57,55 @@ export const SCHEMA_TEMPLATES = {
|
|
|
57
57
|
name: 'Event',
|
|
58
58
|
description: 'Extract event information',
|
|
59
59
|
fields: {
|
|
60
|
-
name: '
|
|
61
|
-
date: 'event
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
price: '
|
|
65
|
-
|
|
66
|
-
|
|
60
|
+
name: 'What is the name of this event?',
|
|
61
|
+
date: 'When does this event take place?',
|
|
62
|
+
time: 'What time does this event start?',
|
|
63
|
+
location: 'Where is this event held?',
|
|
64
|
+
price: 'How much does this event cost?',
|
|
65
|
+
description: 'What is this event about?',
|
|
66
|
+
organizer: 'Who is organizing this event?',
|
|
67
67
|
},
|
|
68
68
|
},
|
|
69
69
|
recipe: {
|
|
70
70
|
name: 'Recipe',
|
|
71
71
|
description: 'Extract recipe information from cooking sites',
|
|
72
72
|
fields: {
|
|
73
|
-
|
|
74
|
-
ingredients: '
|
|
75
|
-
|
|
76
|
-
prepTime: 'preparation
|
|
77
|
-
cookTime: 'cooking
|
|
78
|
-
servings: '
|
|
79
|
-
calories: 'calories per serving',
|
|
80
|
-
|
|
73
|
+
name: 'What is the name of this recipe?',
|
|
74
|
+
ingredients: 'What ingredients are needed? List all.',
|
|
75
|
+
steps: 'What are the cooking steps or instructions?',
|
|
76
|
+
prepTime: 'How long does preparation take?',
|
|
77
|
+
cookTime: 'How long does cooking take?',
|
|
78
|
+
servings: 'How many servings does this recipe make?',
|
|
79
|
+
calories: 'How many calories per serving?',
|
|
80
|
+
rating: 'What is the recipe rating?',
|
|
81
81
|
},
|
|
82
82
|
},
|
|
83
83
|
job: {
|
|
84
84
|
name: 'Job',
|
|
85
85
|
description: 'Extract job posting information',
|
|
86
86
|
fields: {
|
|
87
|
-
title: 'job title',
|
|
88
|
-
company: 'company
|
|
89
|
-
location: 'job
|
|
90
|
-
salary: 'salary
|
|
91
|
-
|
|
92
|
-
requirements: '
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
87
|
+
title: 'What is the job title?',
|
|
88
|
+
company: 'What company is hiring?',
|
|
89
|
+
location: 'Where is the job located?',
|
|
90
|
+
salary: 'What is the salary or compensation range?',
|
|
91
|
+
type: 'Is this full-time, part-time, contract, or remote?',
|
|
92
|
+
requirements: 'What are the key requirements or qualifications?',
|
|
93
|
+
description: 'What is the job description?',
|
|
94
|
+
applyUrl: 'What is the URL or method to apply?',
|
|
95
|
+
},
|
|
96
|
+
},
|
|
97
|
+
business: {
|
|
98
|
+
name: 'Business',
|
|
99
|
+
description: 'Extract business/company information',
|
|
100
|
+
fields: {
|
|
101
|
+
name: 'What is the business name?',
|
|
102
|
+
address: 'What is the full address?',
|
|
103
|
+
phone: 'What is the phone number?',
|
|
104
|
+
hours: 'What are the business hours?',
|
|
105
|
+
rating: 'What is the business rating?',
|
|
106
|
+
reviewCount: 'How many reviews does this business have?',
|
|
107
|
+
website: 'What is the business website URL?',
|
|
108
|
+
categories: 'What type of business is this?',
|
|
96
109
|
},
|
|
97
110
|
},
|
|
98
111
|
review: {
|
|
@@ -19,6 +19,8 @@ export interface WebSearchResult {
|
|
|
19
19
|
snippet: string;
|
|
20
20
|
/** Relevance score (0–1) based on keyword overlap with query. Added by filterRelevantResults. */
|
|
21
21
|
relevanceScore?: number;
|
|
22
|
+
/** Thumbnail/image URL from SearXNG results (img_src or thumbnail field). */
|
|
23
|
+
imageUrl?: string;
|
|
22
24
|
}
|
|
23
25
|
export interface WebSearchOptions {
|
|
24
26
|
/** Number of results (1-10) */
|
|
@@ -1066,8 +1066,15 @@ export class DuckDuckGoProvider {
|
|
|
1066
1066
|
if (searxResults.length > 0) {
|
|
1067
1067
|
providerStats.record('searxng', true);
|
|
1068
1068
|
log.debug(`source=searxng returned ${searxResults.length} results`);
|
|
1069
|
-
|
|
1070
|
-
|
|
1069
|
+
// Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
|
|
1070
|
+
const mapped = searxResults.map(r => ({
|
|
1071
|
+
title: r.title,
|
|
1072
|
+
url: r.url,
|
|
1073
|
+
snippet: r.description ?? '',
|
|
1074
|
+
imageUrl: r.imageUrl,
|
|
1075
|
+
}));
|
|
1076
|
+
const filtered = filterRelevantResults(mapped, query);
|
|
1077
|
+
return filtered.length > 0 ? filtered : mapped;
|
|
1071
1078
|
}
|
|
1072
1079
|
providerStats.record('searxng', false);
|
|
1073
1080
|
log.debug('SearXNG returned 0 results, falling through to DDG');
|
|
@@ -69,6 +69,7 @@ export async function searchViaSearXNG(query, options = {}) {
|
|
|
69
69
|
description: r.content ?? undefined,
|
|
70
70
|
publishedDate: r.publishedDate ?? undefined,
|
|
71
71
|
score: r.score ?? undefined,
|
|
72
|
+
imageUrl: r.img_src ?? r.thumbnail ?? undefined,
|
|
72
73
|
});
|
|
73
74
|
if (output.length >= count)
|
|
74
75
|
break;
|
|
@@ -1,48 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Domain-aware structured extractors for WebPeel.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* extractor
|
|
6
|
-
*
|
|
7
|
-
* Supported domains:
|
|
8
|
-
* - twitter.com / x.com — tweets, threads, profiles
|
|
9
|
-
* - reddit.com — posts with comments (via JSON API)
|
|
10
|
-
* - github.com — repos, issues, PRs, users (via GitHub API)
|
|
11
|
-
* - news.ycombinator.com — stories with comments (via HN Firebase API)
|
|
12
|
-
*/
|
|
13
|
-
export interface DomainExtractResult {
|
|
14
|
-
/** Canonical domain name (e.g. 'twitter.com') */
|
|
15
|
-
domain: string;
|
|
16
|
-
/** Page type within the domain (e.g. 'tweet', 'thread', 'repo', 'issue') */
|
|
17
|
-
type: string;
|
|
18
|
-
/** Domain-specific structured data */
|
|
19
|
-
structured: Record<string, any>;
|
|
20
|
-
/** Clean markdown representation of the content */
|
|
21
|
-
cleanContent: string;
|
|
22
|
-
/** Raw HTML size in characters (from the actual HTML page fetched by the extractor) */
|
|
23
|
-
rawHtmlSize?: number;
|
|
24
|
-
}
|
|
25
|
-
/** An extractor receives the raw HTML and original URL, may make API calls. */
|
|
26
|
-
export type DomainExtractor = (html: string, url: string) => Promise<DomainExtractResult | null>;
|
|
27
|
-
/**
|
|
28
|
-
* Returns the domain extractor for a URL, or null if none matches.
|
|
29
|
-
*/
|
|
30
|
-
export declare function getDomainExtractor(url: string): DomainExtractor | null;
|
|
31
|
-
/** Clear the extractor response cache (used in tests). */
|
|
32
|
-
export declare function clearExtractorCache(): void;
|
|
33
|
-
/**
|
|
34
|
-
* Inject a Redis client for shared cross-pod caching.
|
|
35
|
-
* Called from server startup after Redis is initialized.
|
|
36
|
-
* Safe to call with null to disable Redis caching (e.g., CLI mode).
|
|
37
|
-
*/
|
|
38
|
-
export declare function setExtractorRedis(redis: any): void;
|
|
39
|
-
/**
|
|
40
|
-
* Convenience: run the extractor for the URL (if one exists).
|
|
41
|
-
* Wraps _extractDomainDataImpl with a two-tier cache:
|
|
42
|
-
* 1. In-memory LRU (per-pod, fastest)
|
|
43
|
-
* 2. Redis shared cache (cross-pod, shared across all replicas)
|
|
44
|
-
*
|
|
45
|
-
* With multiple API pods, Redis ensures the first pod to fetch a URL
|
|
46
|
-
* populates cache for all others — eliminating redundant API calls.
|
|
4
|
+
* This file re-exports from individual extractor files for backward compatibility.
|
|
5
|
+
* Each extractor now lives in its own file under src/ee/extractors/.
|
|
47
6
|
*/
|
|
48
|
-
export
|
|
7
|
+
export { getDomainExtractor, hasDomainExtractor, extractDomainData, clearExtractorCache, setExtractorRedis, } from './extractors/index.js';
|
|
8
|
+
export type { DomainExtractResult, DomainExtractor } from './extractors/index.js';
|