webpeel 0.20.3 → 0.20.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/fetch.js +78 -9
- package/dist/cli/commands/search.js +71 -4
- package/dist/cli/utils.js +13 -6
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/pipeline.js +8 -0
- package/dist/core/search-provider.d.ts +2 -2
- package/dist/core/search-provider.js +52 -6
- package/dist/core/strategies.js +64 -2
- package/dist/server/app.js +11 -0
- package/dist/server/routes/agent.d.ts +21 -0
- package/dist/server/routes/agent.js +238 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +214 -0
- package/dist/server/routes/fetch.js +44 -2
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +187 -0
- package/dist/server/routes/search.js +3 -0
- package/dist/types.d.ts +7 -0
- package/package.json +1 -1
|
@@ -13,6 +13,14 @@ import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buil
|
|
|
13
13
|
// ─── runFetch ─────────────────────────────────────────────────────────────────
|
|
14
14
|
// Main fetch handler — shared with the `pipe` and `ask` subcommands
|
|
15
15
|
export async function runFetch(url, options) {
|
|
16
|
+
// --content-only: override all output flags — we just want raw content
|
|
17
|
+
if (options.contentOnly) {
|
|
18
|
+
options.silent = true;
|
|
19
|
+
// Disable json/text/html — we output content directly
|
|
20
|
+
options.json = false;
|
|
21
|
+
options.html = false;
|
|
22
|
+
options.text = false;
|
|
23
|
+
}
|
|
16
24
|
// Handle --format flag: maps to existing boolean flags
|
|
17
25
|
if (options.format) {
|
|
18
26
|
const fmt = options.format.toLowerCase();
|
|
@@ -30,9 +38,10 @@ export async function runFetch(url, options) {
|
|
|
30
38
|
}
|
|
31
39
|
// Smart defaults: when piped (not a TTY), default to silent JSON + budget
|
|
32
40
|
// BUT respect explicit --format flag (user chose the output format)
|
|
41
|
+
// AND respect --content-only (raw content output, no JSON wrapper)
|
|
33
42
|
const isPiped = !process.stdout.isTTY;
|
|
34
43
|
const hasExplicitFormat = options.format && ['text', 'html', 'markdown', 'md'].includes(options.format.toLowerCase());
|
|
35
|
-
if (isPiped && !options.html && !options.text && !hasExplicitFormat) {
|
|
44
|
+
if (isPiped && !options.html && !options.text && !hasExplicitFormat && !options.contentOnly) {
|
|
36
45
|
if (!options.json)
|
|
37
46
|
options.json = true;
|
|
38
47
|
if (!options.silent)
|
|
@@ -284,11 +293,38 @@ export async function runFetch(url, options) {
|
|
|
284
293
|
cachedResult.extracted = extractedCached;
|
|
285
294
|
}
|
|
286
295
|
}
|
|
287
|
-
|
|
296
|
+
if (options.contentOnly) {
|
|
297
|
+
await writeStdout(cachedResult.content + '\n');
|
|
298
|
+
}
|
|
299
|
+
else {
|
|
300
|
+
await outputResult(cachedResult, options, { cached: true });
|
|
301
|
+
}
|
|
288
302
|
process.exit(0);
|
|
289
303
|
}
|
|
290
304
|
}
|
|
291
|
-
|
|
305
|
+
// --progress: show escalation steps on stderr (overrides spinner)
|
|
306
|
+
let progressInterval;
|
|
307
|
+
const progressStart = Date.now();
|
|
308
|
+
if (options.progress) {
|
|
309
|
+
process.stderr.write(`[simple] Fetching ${url}...\n`);
|
|
310
|
+
// Show escalation hints based on elapsed time (best-effort approximations)
|
|
311
|
+
const progressSteps = [
|
|
312
|
+
{ afterMs: 2500, message: '[simple] Waiting for response...' },
|
|
313
|
+
{ afterMs: 6000, message: '[browser] Simple too slow — escalating to browser render...' },
|
|
314
|
+
{ afterMs: 12000, message: '[browser] Rendering with Chromium...' },
|
|
315
|
+
{ afterMs: 20000, message: '[stealth] Escalating to stealth mode...' },
|
|
316
|
+
];
|
|
317
|
+
let stepIdx = 0;
|
|
318
|
+
progressInterval = setInterval(() => {
|
|
319
|
+
const elapsed = Date.now() - progressStart;
|
|
320
|
+
while (stepIdx < progressSteps.length && elapsed >= progressSteps[stepIdx].afterMs) {
|
|
321
|
+
process.stderr.write(`${progressSteps[stepIdx].message}\n`);
|
|
322
|
+
stepIdx++;
|
|
323
|
+
}
|
|
324
|
+
}, 500);
|
|
325
|
+
}
|
|
326
|
+
// Suppress spinner when --progress is active (progress lines replace it)
|
|
327
|
+
const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
|
|
292
328
|
try {
|
|
293
329
|
// Validate options
|
|
294
330
|
if (options.wait && (options.wait < 0 || options.wait > 60000)) {
|
|
@@ -528,7 +564,22 @@ export async function runFetch(url, options) {
|
|
|
528
564
|
if (resolvedProfileName) {
|
|
529
565
|
touchProfile(resolvedProfileName);
|
|
530
566
|
}
|
|
531
|
-
|
|
567
|
+
// Stop progress interval and show final result
|
|
568
|
+
if (progressInterval) {
|
|
569
|
+
clearInterval(progressInterval);
|
|
570
|
+
progressInterval = undefined;
|
|
571
|
+
}
|
|
572
|
+
if (options.progress) {
|
|
573
|
+
const method = result.method || 'simple';
|
|
574
|
+
const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
|
|
575
|
+
const tokenCount = (result.tokens || 0).toLocaleString();
|
|
576
|
+
// Show escalation arrow if browser/stealth was needed
|
|
577
|
+
if (method !== 'simple') {
|
|
578
|
+
process.stderr.write(`[simple] → [${method}] escalated\n`);
|
|
579
|
+
}
|
|
580
|
+
process.stderr.write(`[${method}] Done — ${tokenCount} tokens in ${elapsedSec}s\n`);
|
|
581
|
+
}
|
|
582
|
+
else if (spinner) {
|
|
532
583
|
const domainTag = result.domainData
|
|
533
584
|
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
534
585
|
: '';
|
|
@@ -866,11 +917,27 @@ export async function runFetch(url, options) {
|
|
|
866
917
|
result.extracted = extracted;
|
|
867
918
|
}
|
|
868
919
|
}
|
|
869
|
-
//
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
920
|
+
// --content-only: output raw content only, no wrapper
|
|
921
|
+
if (options.contentOnly) {
|
|
922
|
+
await writeStdout(result.content + '\n');
|
|
923
|
+
}
|
|
924
|
+
else {
|
|
925
|
+
// Output results (default path)
|
|
926
|
+
await outputResult(result, options, {
|
|
927
|
+
cached: false,
|
|
928
|
+
truncated: contentTruncated || undefined,
|
|
929
|
+
});
|
|
930
|
+
// Token savings display (our unique selling point)
|
|
931
|
+
if (!options.json && !options.silent && result.tokenSavingsPercent) {
|
|
932
|
+
const savings = result.tokenSavingsPercent;
|
|
933
|
+
const raw = result.rawTokenEstimate;
|
|
934
|
+
const optimized = result.tokens || 0;
|
|
935
|
+
if (savings > 0) {
|
|
936
|
+
const rawStr = raw ? `${raw.toLocaleString()}→${optimized.toLocaleString()} tokens` : `${optimized.toLocaleString()} tokens`;
|
|
937
|
+
process.stderr.write(`\x1b[32m💰 Token savings: ${savings}% smaller than raw HTML (${rawStr})\x1b[0m\n`);
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
}
|
|
874
941
|
}
|
|
875
942
|
// Clean up and exit
|
|
876
943
|
await cleanup();
|
|
@@ -976,6 +1043,8 @@ export function registerFetchCommands(program) {
|
|
|
976
1043
|
.option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
|
|
977
1044
|
.option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
|
|
978
1045
|
.option('--format <type>', 'Output format: markdown (default), text, html, json')
|
|
1046
|
+
.option('--content-only', 'Output only the raw content field (no metadata, no JSON wrapper) — ideal for piping to LLMs')
|
|
1047
|
+
.option('--progress', 'Show engine escalation steps (simple → browser → stealth) with timing')
|
|
979
1048
|
.action(async (url, options) => {
|
|
980
1049
|
await runFetch(url, options);
|
|
981
1050
|
});
|
|
@@ -23,6 +23,7 @@ export function registerSearchCommands(program) {
|
|
|
23
23
|
.option('--budget <n>', 'Token budget for site-search result content', parseInt)
|
|
24
24
|
.option('-s, --silent', 'Silent mode')
|
|
25
25
|
.option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
|
|
26
|
+
.option('--fetch', 'Also fetch and include content from each result URL')
|
|
26
27
|
.option('--agent', 'Agent mode: sets --json, --silent, and --budget 4000 (override with --budget N)')
|
|
27
28
|
.action(async (query, options) => {
|
|
28
29
|
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
@@ -178,9 +179,61 @@ export function registerSearchCommands(program) {
|
|
|
178
179
|
const searchData = await searchRes.json();
|
|
179
180
|
// API returns { success: true, data: { web: [...] } } or { results: [...] }
|
|
180
181
|
let results = searchData.data?.web || searchData.data?.results || searchData.results || [];
|
|
182
|
+
// Client-side ad filtering: remove DuckDuckGo ads that slip through the server
|
|
183
|
+
results = results.filter(r => {
|
|
184
|
+
// Filter DDG-internal URLs
|
|
185
|
+
try {
|
|
186
|
+
const parsed = new URL(r.url);
|
|
187
|
+
if (parsed.hostname === 'duckduckgo.com')
|
|
188
|
+
return false;
|
|
189
|
+
if (parsed.searchParams.has('ad_domain') ||
|
|
190
|
+
parsed.searchParams.has('ad_provider') ||
|
|
191
|
+
parsed.searchParams.has('ad_type'))
|
|
192
|
+
return false;
|
|
193
|
+
}
|
|
194
|
+
catch {
|
|
195
|
+
return false;
|
|
196
|
+
}
|
|
197
|
+
// Filter ad snippets
|
|
198
|
+
if (r.snippet && (r.snippet.includes('Ad ·') ||
|
|
199
|
+
r.snippet.includes('Ad Viewing ads is privacy protected by DuckDuckGo') ||
|
|
200
|
+
r.snippet.toLowerCase().startsWith('ad ·')))
|
|
201
|
+
return false;
|
|
202
|
+
return true;
|
|
203
|
+
});
|
|
181
204
|
if (spinner) {
|
|
182
205
|
spinner.succeed(`Found ${results.length} results`);
|
|
183
206
|
}
|
|
207
|
+
// --fetch: fetch content from each result
|
|
208
|
+
if (options.fetch && results.length > 0) {
|
|
209
|
+
const fetchCfg = loadConfig();
|
|
210
|
+
const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
|
|
211
|
+
const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
|
|
212
|
+
if (fetchApiKey) {
|
|
213
|
+
const fetchSpinner = isSilent ? null : ora(`Fetching content from ${results.length} results...`).start();
|
|
214
|
+
await Promise.all(results.map(async (result) => {
|
|
215
|
+
try {
|
|
216
|
+
const fetchParams = new URLSearchParams({ url: result.url });
|
|
217
|
+
if (options.budget)
|
|
218
|
+
fetchParams.set('budget', String(options.budget || 2000));
|
|
219
|
+
const fetchRes = await fetch(`${fetchApiUrl}/v1/fetch?${fetchParams}`, {
|
|
220
|
+
headers: { Authorization: `Bearer ${fetchApiKey}` },
|
|
221
|
+
signal: AbortSignal.timeout(20000),
|
|
222
|
+
});
|
|
223
|
+
if (fetchRes.ok) {
|
|
224
|
+
const fetchData = await fetchRes.json();
|
|
225
|
+
result.content = fetchData.content || fetchData.data?.content || '';
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
catch { /* skip on error */ }
|
|
229
|
+
}));
|
|
230
|
+
if (fetchSpinner)
|
|
231
|
+
fetchSpinner.succeed('Content fetched');
|
|
232
|
+
}
|
|
233
|
+
else if (!isSilent) {
|
|
234
|
+
console.error('Warning: --fetch requires API key (run: webpeel auth <key>)');
|
|
235
|
+
}
|
|
236
|
+
}
|
|
184
237
|
// Show usage footer for free/anonymous users
|
|
185
238
|
if (usageCheck.usageInfo && !isSilent) {
|
|
186
239
|
showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, false);
|
|
@@ -196,10 +249,24 @@ export function registerSearchCommands(program) {
|
|
|
196
249
|
await writeStdout(jsonStr + '\n');
|
|
197
250
|
}
|
|
198
251
|
else {
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
252
|
+
// Human-readable numbered results
|
|
253
|
+
if (results.length === 0) {
|
|
254
|
+
await writeStdout('No results found.\n');
|
|
255
|
+
}
|
|
256
|
+
else {
|
|
257
|
+
await writeStdout(`\n`);
|
|
258
|
+
for (const [i, result] of results.entries()) {
|
|
259
|
+
await writeStdout(`${i + 1}. ${result.title}\n`);
|
|
260
|
+
await writeStdout(` ${result.url}\n`);
|
|
261
|
+
if (result.snippet) {
|
|
262
|
+
await writeStdout(` ${result.snippet}\n`);
|
|
263
|
+
}
|
|
264
|
+
if (result.content) {
|
|
265
|
+
const preview = result.content.slice(0, 500);
|
|
266
|
+
await writeStdout(`\n --- Content ---\n${preview}${result.content.length > 500 ? '\n [...]' : ''}\n`);
|
|
267
|
+
}
|
|
268
|
+
await writeStdout('\n');
|
|
269
|
+
}
|
|
203
270
|
}
|
|
204
271
|
}
|
|
205
272
|
process.exit(0);
|
package/dist/cli/utils.js
CHANGED
|
@@ -508,13 +508,11 @@ export async function outputResult(result, options, extra = {}) {
|
|
|
508
508
|
// Default: full output
|
|
509
509
|
if (options.json) {
|
|
510
510
|
// Build clean JSON output with guaranteed top-level fields
|
|
511
|
+
// Note: elapsed/method/tokens are placed at the END so `tail -3` shows perf metrics
|
|
511
512
|
const output = {
|
|
512
513
|
url: result.url,
|
|
513
514
|
title: result.metadata?.title || result.title || null,
|
|
514
|
-
tokens: result.tokens || 0,
|
|
515
515
|
fetchedAt: new Date().toISOString(),
|
|
516
|
-
method: result.method || 'simple',
|
|
517
|
-
elapsed: result.elapsed,
|
|
518
516
|
content: result.content,
|
|
519
517
|
};
|
|
520
518
|
// Add optional fields only if present (filter out undefined/null values from metadata)
|
|
@@ -529,6 +527,10 @@ export async function outputResult(result, options, extra = {}) {
|
|
|
529
527
|
}
|
|
530
528
|
if (result.links?.length)
|
|
531
529
|
output.links = result.links;
|
|
530
|
+
if (result.tokenSavingsPercent !== undefined)
|
|
531
|
+
output.tokenSavingsPercent = result.tokenSavingsPercent;
|
|
532
|
+
if (result.rawTokenEstimate !== undefined)
|
|
533
|
+
output.rawTokenEstimate = result.rawTokenEstimate;
|
|
532
534
|
if (result.images?.length)
|
|
533
535
|
output.images = result.images;
|
|
534
536
|
if (result.structured)
|
|
@@ -562,6 +564,10 @@ export async function outputResult(result, options, extra = {}) {
|
|
|
562
564
|
if (extra.totalAvailable !== undefined)
|
|
563
565
|
output.totalAvailable = extra.totalAvailable;
|
|
564
566
|
output._meta = { version: cliVersion, method: result.method || 'simple', timing: result.timing, serverMarkdown: result.serverMarkdown || false };
|
|
567
|
+
// Perf metrics at the end — `tail -3` shows: elapsed | method | tokens
|
|
568
|
+
output.elapsed = result.elapsed;
|
|
569
|
+
output.method = result.method || 'simple';
|
|
570
|
+
output.tokens = result.tokens || 0;
|
|
565
571
|
await writeStdout(JSON.stringify(output, null, 2) + '\n');
|
|
566
572
|
}
|
|
567
573
|
else {
|
|
@@ -586,10 +592,11 @@ export async function outputResult(result, options, extra = {}) {
|
|
|
586
592
|
}
|
|
587
593
|
// Stream content immediately to stdout — consumer gets it without waiting
|
|
588
594
|
await writeStdout(result.content + '\n');
|
|
589
|
-
// Append timing summary to stderr
|
|
590
|
-
|
|
595
|
+
// Append timing summary to stderr (always — doesn't pollute stdout pipe)
|
|
596
|
+
{
|
|
591
597
|
const totalMs = result.timing?.total ?? result.elapsed;
|
|
592
|
-
|
|
598
|
+
const method = result.method || 'simple';
|
|
599
|
+
process.stderr.write(`\n--- ${totalMs}ms | ${method} | ${result.tokens} tokens ---\n`);
|
|
593
600
|
}
|
|
594
601
|
}
|
|
595
602
|
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Image alt-text enhancement module.
|
|
3
|
+
*
|
|
4
|
+
* Two strategies:
|
|
5
|
+
* 1. Heuristic (no LLM) — generates captions from filename, URL path, and nearby text context
|
|
6
|
+
* 2. LLM vision (with user's API key) — generates accurate descriptions via vision models
|
|
7
|
+
*/
|
|
8
|
+
/**
|
|
9
|
+
* Enhance images that lack alt text with heuristic-based descriptions.
|
|
10
|
+
*
|
|
11
|
+
* Processes <img> tags that have:
|
|
12
|
+
* - No alt attribute at all
|
|
13
|
+
* - An empty alt attribute (alt="")
|
|
14
|
+
*
|
|
15
|
+
* Caption priority:
|
|
16
|
+
* 1. Filename analysis: `/images/team-photo-2024.jpg` → "Team Photo 2024"
|
|
17
|
+
* 2. URL path segments: `/products/widget/hero.png` → "Widget image"
|
|
18
|
+
* 3. Nearby heading/figcaption/paragraph text (from the 300 chars before the tag or the 400 chars starting at it)
|
|
19
|
+
* 4. Generic fallback: "Image"
|
|
20
|
+
*
|
|
21
|
+
* Non-empty alt text is always preserved unchanged.
|
|
22
|
+
*
|
|
23
|
+
* @param html - Raw HTML string to process
|
|
24
|
+
* @returns HTML with alt text added/replaced on qualifying img tags
|
|
25
|
+
*/
|
|
26
|
+
export declare function enhanceImageAltText(html: string): string;
|
|
27
|
+
/**
|
|
28
|
+
* Caption images using LLM vision models.
|
|
29
|
+
*
|
|
30
|
+
* Requires the user to supply their own API key. No key is stored server-side.
|
|
31
|
+
* Processes images sequentially to avoid rate limiting.
|
|
32
|
+
*
|
|
33
|
+
* @param images - Array of {url, context} pairs. `context` is nearby text for better accuracy.
|
|
34
|
+
* @param llmApiKey - API key for the chosen provider
|
|
35
|
+
* @param llmProvider - Vision-capable model to use: 'openai' | 'anthropic' | 'google'
|
|
36
|
+
* @returns Array of {url, caption} — same order as input
|
|
37
|
+
*/
|
|
38
|
+
export declare function captionImagesWithLLM(images: {
|
|
39
|
+
url: string;
|
|
40
|
+
context: string;
|
|
41
|
+
}[], llmApiKey: string, llmProvider: 'openai' | 'anthropic' | 'google'): Promise<{
|
|
42
|
+
url: string;
|
|
43
|
+
caption: string;
|
|
44
|
+
}[]>;
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Image alt-text enhancement module.
|
|
3
|
+
*
|
|
4
|
+
* Two strategies:
|
|
5
|
+
* 1. Heuristic (no LLM) — generates captions from filename, URL path, and nearby text context
|
|
6
|
+
* 2. LLM vision (with user's API key) — generates accurate descriptions via vision models
|
|
7
|
+
*/
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Heuristic helpers
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Base filenames (extension removed, lower-cased) that say nothing about the
// picture itself; captionFromUrl skips these and looks at the parent path.
const GENERIC_FILENAMES = new Set('image img photo picture thumbnail thumb icon logo banner placeholder default hero bg background avatar pic graphic figure shot'.split(' '));
// Directory names that are storage/layout boilerplate rather than a subject;
// captionFromUrl ignores them when walking up the path.
const NOISE_PATH_SEGMENTS = new Set('images img imgs photos assets static media public uploads files resources content cdn dist build src www web site'.split(' '));
|
|
21
|
+
/**
 * Convert a URL slug / camelCase / underscored name into readable title-cased text.
 *
 * Examples:
 *   "team-photo-2024"  → "Team Photo 2024"
 *   "heroImage"        → "Hero Image"
 *   "my_product_shot"  → "My Product Shot"
 *   "HTMLParser"       → "HTML Parser"
 *   "münchen-photo"    → "München Photo"
 *
 * @param {string} slug - Raw name, typically a filename without extension or a path segment.
 * @returns {string} The name with separators turned into single spaces and the
 *   first character of every word upper-cased.
 */
function slugToTitle(slug) {
    const spaced = slug
        .replace(/[-_]+/g, ' ') // hyphens/underscores → spaces
        .replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase split
        .replace(/([A-Z]{2,})([A-Z][a-z])/g, '$1 $2') // HTMLParser → HTML Parser
        .replace(/\s+/g, ' ')
        .trim();
    // Upper-case the first character of each word. The word class is Unicode-aware
    // (letters, marks, digits, underscore): an ASCII-only `\b\w` sees a boundary
    // after every accented letter and would produce "MüNchen". For pure-ASCII
    // input this matches exactly the same positions as `\b\w`.
    return spaced.replace(/(?<![\p{L}\p{M}\p{N}_])[\p{L}\p{M}\p{N}_]/gu, (c) => c.toUpperCase());
}
|
|
37
|
+
/**
 * Derive a caption from the image src URL.
 *
 * Tries, in order:
 *   1. the filename without its extension, unless it is a generic name
 *      listed in GENERIC_FILENAMES;
 *   2. the nearest parent path segment that is longer than two characters
 *      and not listed in NOISE_PATH_SEGMENTS (returned as "<Title> image").
 *
 * Only the URL path is inspected: host, query string and fragment are ignored,
 * and percent-escapes in path segments are decoded before title-casing.
 *
 * @param {string} src - Value of the img `src` attribute (absolute,
 *   protocol-relative or relative).
 * @returns {string|null} A caption, or null if nothing useful can be derived
 *   (including `data:` / `blob:` sources and unparseable URLs).
 */
function captionFromUrl(src) {
    // Best-effort decode: a malformed escape keeps the raw segment instead of failing.
    const decodeSegment = (segment) => {
        try {
            return decodeURIComponent(segment);
        }
        catch {
            return segment;
        }
    };
    try {
        if (typeof src !== 'string')
            return null;
        // Inline/ephemeral payloads have no descriptive path — their "segments"
        // would be MIME types or base64 noise.
        if (/^\s*(?:data|blob|javascript):/i.test(src))
            return null;
        // Resolving against a throwaway base yields a clean pathname for absolute,
        // protocol-relative ("//cdn.host/x.png") and relative srcs alike, and
        // drops any ?query / #fragment.
        const { pathname } = new URL(src, 'http://placeholder.invalid/');
        const parts = pathname.split('/').filter(Boolean).map(decodeSegment);
        // 1. Try the filename (without extension)
        const filename = parts[parts.length - 1] ?? '';
        const nameWithoutExt = filename.replace(/\.[^.]+$/, '');
        if (nameWithoutExt.length > 2 &&
            !GENERIC_FILENAMES.has(nameWithoutExt.toLowerCase())) {
            const title = slugToTitle(nameWithoutExt);
            if (title.length > 2)
                return title;
        }
        // 2. Try meaningful parent path segments (walk up from the file)
        for (let i = parts.length - 2; i >= 0; i--) {
            const seg = parts[i];
            if (seg && seg.length > 2 && !NOISE_PATH_SEGMENTS.has(seg.toLowerCase())) {
                return `${slugToTitle(seg)} image`;
            }
        }
    }
    catch {
        // URL parse error — fall through to "no caption"
    }
    return null;
}
|
|
69
|
+
/**
 * Extract the nearest meaningful text context surrounding an img tag.
 *
 * Looks at a window of up to 300 characters before the tag and up to 400
 * characters starting at the tag itself. Preference order:
 *   1. the last plain-text heading (h1–h6, 3–80 chars) before the image;
 *   2. a plain-text figcaption (3–120 chars) after the image;
 *   3. whichever side has more text once tags are stripped
 *      (first 80 chars after the image, or last 80 chars before it).
 *
 * @param {string} html - Full HTML document being processed.
 * @param {number} imgStart - Index in `html` where the `<img` tag starts.
 * @returns {string} Context text; may be an empty string.
 */
function extractNearbyText(html, imgStart) {
    const windowStart = Math.max(0, imgStart - 300);
    const windowEnd = Math.min(html.length, imgStart + 400);
    let beforeHtml = html.slice(windowStart, imgStart);
    let afterHtml = html.slice(imgStart, windowEnd);
    // The fixed-size windows can cut through the middle of a tag. Drop those
    // fragments so attribute text (e.g. `class="hero">`) never leaks into the
    // caption. Only done when the window was actually truncated.
    if (windowStart > 0) {
        beforeHtml = beforeHtml.replace(/^[^<>]*>/, ''); // tail of a tag opened before the window
    }
    if (windowEnd < html.length) {
        afterHtml = afterHtml.replace(/<[^>]*$/, ''); // tag left unterminated by the window end
    }
    // Prefer the nearest heading before the image
    const headingMatches = beforeHtml.match(/<h[1-6][^>]*>([^<]{3,80})<\/h[1-6]>/gi);
    if (headingMatches) {
        const lastHeading = headingMatches[headingMatches.length - 1];
        const text = lastHeading.replace(/<[^>]+>/g, '').trim();
        if (text.length > 3)
            return text;
    }
    // Prefer figcaption near the image
    const figMatch = afterHtml.match(/<figcaption[^>]*>([^<]{3,120})<\/figcaption>/i);
    if (figMatch) {
        const text = (figMatch[1] ?? '').trim();
        if (text.length > 3)
            return text;
    }
    // Strip tags, return the richer side
    const stripTags = (s) => s.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
    const beforeText = stripTags(beforeHtml);
    const afterText = stripTags(afterHtml);
    return afterText.length > beforeText.length
        ? afterText.slice(0, 80)
        : beforeText.slice(-80);
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Public: heuristic alt-text enhancement
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
/**
 * Enhance images that lack alt text with heuristic-based descriptions.
 *
 * Processes <img> tags that have a quoted `src` and either:
 *   - no alt attribute at all, or
 *   - an empty / whitespace-only alt attribute (alt="")
 *
 * Caption priority:
 *   1. Whatever captionFromUrl(src) derives from the image URL
 *   2. "Image: <nearby text>" using extractNearbyText (first 60 chars)
 *   3. Generic fallback: "Image"
 *
 * Non-empty alt text is always preserved unchanged, and tags without a src
 * are left untouched. The generated caption is HTML-escaped (`"`, `<`, `>`)
 * before being written into the attribute.
 *
 * Limitation: tags are located with a regular expression, so an attribute
 * value containing a literal `>` ends the tag early.
 *
 * @param html - Raw HTML string to process
 * @returns HTML with alt text added/replaced on qualifying img tags
 */
export function enhanceImageAltText(html) {
    // Quote-aware attribute matchers: a double-quoted value may contain `'` and
    // vice versa. The lookbehind keeps `data-alt=` from being mistaken for `alt=`.
    const SRC_ATTR = /\bsrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i;
    const ALT_ATTR = /(?<![\w-])alt\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
    return html.replace(/<img(\s[^>]*)>/gi, (match, attrs, offset) => {
        const srcMatch = SRC_ATTR.exec(attrs);
        if (!srcMatch)
            return match; // No src — leave unchanged
        const src = srcMatch[1] ?? srcMatch[2] ?? '';
        const altMatch = ALT_ATTR.exec(attrs);
        const altValue = altMatch
            ? (altMatch[1] ?? altMatch[2] ?? altMatch[3] ?? '')
            : null;
        // Already has meaningful alt text — preserve as-is
        if (altValue !== null && altValue.trim() !== '')
            return match;
        // Build caption: URL → nearby text → generic fallback
        let caption = captionFromUrl(src);
        if (!caption) {
            const nearbyText = extractNearbyText(html, offset).trim();
            caption = nearbyText.length > 3
                ? `Image: ${nearbyText.slice(0, 60)}`
                : 'Image';
        }
        // Escape characters that would terminate the attribute or the tag.
        const escaped = caption
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
        const altAttr = `alt="${escaped}"`;
        if (altMatch) {
            // Replace the empty alt in place. A replacer function is used so that
            // `$&`, `$1`, … inside the caption are inserted literally.
            return `<img${attrs.replace(ALT_ATTR, () => altAttr)}>`;
        }
        // No alt attribute: add one in front of the existing attributes.
        return `<img ${altAttr}${attrs}>`;
    });
}
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
// Public: LLM vision captioning (BYOK)
|
|
157
|
+
// ---------------------------------------------------------------------------
|
|
158
|
+
// Upper bound for every network call made while captioning (image download or
// LLM request). Images are processed one after another, so a single hung
// connection would otherwise stall the whole batch.
const CAPTION_REQUEST_TIMEOUT_MS = 30000;
/**
 * POST a JSON payload and return the parsed JSON response.
 * Returns null for non-2xx responses so callers fall back to the generic caption.
 */
async function postCaptionRequest(url, headers, payload) {
    const response = await fetch(url, {
        method: 'POST',
        headers: { ...headers, 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(CAPTION_REQUEST_TIMEOUT_MS),
    });
    if (!response.ok)
        return null;
    return response.json();
}
/** Caption via OpenAI chat completions; the image is passed by URL. */
async function captionWithOpenAI(imageUrl, prompt, apiKey) {
    const data = await postCaptionRequest('https://api.openai.com/v1/chat/completions', { Authorization: `Bearer ${apiKey}` }, {
        model: 'gpt-4o-mini',
        max_tokens: 120,
        messages: [
            {
                role: 'user',
                content: [
                    { type: 'image_url', image_url: { url: imageUrl, detail: 'low' } },
                    { type: 'text', text: prompt },
                ],
            },
        ],
    });
    return (data?.choices?.[0]?.message?.content ?? '').trim();
}
/** Caption via the Anthropic messages API; the image is passed as a url-type source. */
async function captionWithAnthropic(imageUrl, prompt, apiKey) {
    const data = await postCaptionRequest('https://api.anthropic.com/v1/messages', { 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' }, {
        model: 'claude-haiku-4-5',
        max_tokens: 120,
        messages: [
            {
                role: 'user',
                content: [
                    { type: 'image', source: { type: 'url', url: imageUrl } },
                    { type: 'text', text: prompt },
                ],
            },
        ],
    });
    return (data?.content?.[0]?.text ?? '').trim();
}
/**
 * Caption via Gemini. The image is downloaded here and sent inline as base64.
 * Returns '' when the image cannot be downloaded.
 */
async function captionWithGoogle(imageUrl, prompt, apiKey) {
    let imageData = null;
    let mimeType = 'image/jpeg';
    try {
        // NOTE(review): this fetches a page-supplied URL from wherever this code
        // runs; if that is a server, consider restricting the allowed hosts.
        const imgResp = await fetch(imageUrl, {
            headers: { Accept: 'image/*,*/*;q=0.8' },
            signal: AbortSignal.timeout(CAPTION_REQUEST_TIMEOUT_MS),
        });
        if (imgResp.ok) {
            const buffer = await imgResp.arrayBuffer();
            imageData = Buffer.from(buffer).toString('base64');
            // `||` (not `??`): a missing or blank header yields '' after split/trim.
            const contentType = imgResp.headers.get('content-type') ?? '';
            mimeType = contentType.split(';')[0].trim() || 'image/jpeg';
        }
    }
    catch {
        // Image download failed — skip this provider for this image
    }
    if (!imageData)
        return '';
    // The API key travels in a header rather than the query string so it does
    // not end up in URLs recorded by proxies, logs or error messages.
    const data = await postCaptionRequest('https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-lite:generateContent', { 'x-goog-api-key': apiKey }, {
        contents: [
            {
                parts: [
                    { inlineData: { mimeType, data: imageData } },
                    { text: prompt },
                ],
            },
        ],
    });
    return (data?.candidates?.[0]?.content?.parts?.[0]?.text ?? '').trim();
}
// Provider name → captioning routine. A Map avoids accidental hits on
// Object.prototype keys for unexpected provider strings.
const CAPTION_PROVIDERS = new Map([
    ['openai', captionWithOpenAI],
    ['anthropic', captionWithAnthropic],
    ['google', captionWithGoogle],
]);
/**
 * Caption images using LLM vision models.
 *
 * Requires the user to supply their own API key.
 * Processes images sequentially to avoid rate limiting. Captioning is
 * best-effort: any failure for an image (network error, timeout, non-2xx
 * response, unexpected payload, unknown provider) yields the caption "Image"
 * for that entry instead of rejecting.
 *
 * @param images - Array of {url, context} pairs. `context` is nearby text for better accuracy.
 * @param llmApiKey - API key for the chosen provider
 * @param llmProvider - Vision-capable model to use: 'openai' | 'anthropic' | 'google'
 * @returns Array of {url, caption} — same order as input
 */
export async function captionImagesWithLLM(images, llmApiKey, llmProvider) {
    const describeImage = CAPTION_PROVIDERS.get(llmProvider);
    const results = [];
    for (const image of images ?? []) {
        let caption = '';
        try {
            if (describeImage) {
                const prompt = `Write a concise, descriptive alt text (1–2 sentences) for this image. Context from the surrounding page: "${image.context || 'none'}". Be specific and informative.`;
                caption = await describeImage(image.url, prompt, llmApiKey);
            }
        }
        catch {
            // Non-fatal — captioning failed for this image
            caption = '';
        }
        results.push({ url: image.url, caption: caption || 'Image' });
    }
    return results;
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export interface ExtractedLink {
|
|
2
|
+
url: string;
|
|
3
|
+
text: string;
|
|
4
|
+
}
|
|
5
|
+
/**
|
|
6
|
+
* Extract all links from an HTML string.
|
|
7
|
+
* Returns a deduplicated list of { url, text } pairs, excluding anchors,
|
|
8
|
+
* javascript: hrefs, mailto:, tel:, and data: links.
|
|
9
|
+
*/
|
|
10
|
+
export declare function extractLinks(html: string, baseUrl?: string): ExtractedLink[];
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Link extraction from HTML
|
|
3
|
+
* Extracts all <a href="..."> tags and returns deduplicated { url, text } pairs
|
|
4
|
+
*/
|
|
5
|
+
import { load } from 'cheerio';
|
|
6
|
+
// Schemes that never point at a fetchable page. Matched case-insensitively
// because URL schemes are case-insensitive ("JavaScript:void(0)" is common).
const NON_NAVIGABLE_SCHEME = /^(?:javascript|mailto|tel|data):/i;
/**
 * Extract all links from an HTML string.
 *
 * Returns a deduplicated list of { url, text } pairs in document order,
 * excluding in-page anchors (#...), and javascript:, mailto:, tel: and
 * data: hrefs. When the same URL appears more than once, the text of the
 * first occurrence is kept.
 *
 * @param html - HTML source to scan; an empty value yields [].
 * @param baseUrl - Optional base used to resolve hrefs that are not already
 *   absolute http(s) URLs. Without it, such hrefs are returned as written
 *   (whitespace-trimmed). Hrefs that cannot be resolved are skipped.
 * @returns Array of { url, text }; `text` is the anchor text with whitespace collapsed.
 */
export function extractLinks(html, baseUrl) {
    if (!html)
        return [];
    const $ = load(html);
    const seen = new Set();
    const links = [];
    $('a[href]').each((_i, el) => {
        // Attribute values are often padded with whitespace; trim before testing the scheme.
        const href = ($(el).attr('href') || '').trim();
        if (!href || href.startsWith('#') || NON_NAVIGABLE_SCHEME.test(href)) {
            return;
        }
        let url = href;
        // Resolve relative URLs when baseUrl is provided
        if (baseUrl && !/^https?:\/\//i.test(href)) {
            try {
                url = new URL(href, baseUrl).href;
            }
            catch {
                return; // Skip unresolvable relative URLs
            }
        }
        if (!seen.has(url)) {
            seen.add(url);
            const text = $(el).text().trim().replace(/\s+/g, ' ');
            links.push({ url, text });
        }
    });
    return links;
}
|
package/dist/core/pipeline.js
CHANGED
|
@@ -441,6 +441,14 @@ export async function parseContent(ctx) {
|
|
|
441
441
|
const fetchResult = ctx.fetchResult;
|
|
442
442
|
const { contentType, format, fullPage, raw, selector, exclude, includeTags, excludeTags } = ctx;
|
|
443
443
|
const hasBuffer = !!fetchResult.buffer;
|
|
444
|
+
// === Image alt-text enhancement (opt-in, heuristic) ===
|
|
445
|
+
// Runs before any conversion so both lite mode and standard mode benefit.
|
|
446
|
+
if (ctx.options.captionImages && contentType === 'html' && fetchResult.html) {
|
|
447
|
+
ctx.timer.mark('captionImages');
|
|
448
|
+
const { enhanceImageAltText } = await import('./image-caption.js');
|
|
449
|
+
fetchResult.html = enhanceImageAltText(fetchResult.html);
|
|
450
|
+
ctx.timer.end('captionImages');
|
|
451
|
+
}
|
|
444
452
|
if (contentType === 'document' && hasBuffer) {
|
|
445
453
|
// Document parsing pipeline (PDF/DOCX)
|
|
446
454
|
// 'clean' maps to 'markdown' for extraction; cleanForAI is applied in buildResult
|