webpeel 0.21.11 → 0.21.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +165 -17
- package/dist/core/pipeline.js +20 -0
- package/dist/core/structured-extract.js +36 -6
- package/package.json +1 -1
package/dist/core/domain-extractors.js
CHANGED
|
@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
|
|
|
1518
1518
|
modified: data.time?.modified || undefined,
|
|
1519
1519
|
};
|
|
1520
1520
|
// Include README if available (some packages have it, some don't)
|
|
1521
|
-
|
|
1521
|
+
let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
|
|
1522
|
+
// If no README in registry, try fetching from unpkg.com
|
|
1523
|
+
if (!readmeText) {
|
|
1524
|
+
try {
|
|
1525
|
+
const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
|
|
1526
|
+
const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
|
|
1527
|
+
if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
|
|
1528
|
+
readmeText = readmeResult.html.slice(0, 5000);
|
|
1529
|
+
}
|
|
1530
|
+
}
|
|
1531
|
+
catch { /* README from unpkg optional */ }
|
|
1532
|
+
}
|
|
1522
1533
|
// Add to structured data
|
|
1523
1534
|
structured.readme = readmeText;
|
|
1524
1535
|
const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
|
|
1536
|
+
// Show ALL dependencies (not capped at 15)
|
|
1525
1537
|
const depsLine = structured.dependencies.length
|
|
1526
|
-
? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.
|
|
1538
|
+
? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
|
|
1539
|
+
: '';
|
|
1540
|
+
const devDepsLine = structured.devDependencies.length
|
|
1541
|
+
? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
|
|
1527
1542
|
: '';
|
|
1528
1543
|
const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
|
|
1529
1544
|
const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
|
|
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
|
|
|
1536
1551
|
${structured.description}
|
|
1537
1552
|
|
|
1538
1553
|
**License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
|
|
1539
|
-
**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
|
|
1554
|
+
**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
|
|
1540
1555
|
return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
|
|
1541
1556
|
}
|
|
1542
1557
|
catch (e) {
|
|
@@ -1832,7 +1847,27 @@ async function mediumExtractor(html, url) {
|
|
|
1832
1847
|
async function substackExtractor(html, url) {
|
|
1833
1848
|
try {
|
|
1834
1849
|
const { load } = await import('cheerio');
|
|
1835
|
-
|
|
1850
|
+
// Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
|
|
1851
|
+
// These are share links that redirect to the actual post. Redirect to the real URL.
|
|
1852
|
+
const urlObj = new URL(url);
|
|
1853
|
+
let workingHtml = html;
|
|
1854
|
+
let workingUrl = url;
|
|
1855
|
+
if (urlObj.hostname === 'open.substack.com') {
|
|
1856
|
+
const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
|
|
1857
|
+
if (openMatch) {
|
|
1858
|
+
const [, publication, slug] = openMatch;
|
|
1859
|
+
const actualUrl = `https://${publication}.substack.com/p/${slug}`;
|
|
1860
|
+
try {
|
|
1861
|
+
const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
|
|
1862
|
+
if (fetchResult?.html && fetchResult.html.length > 500) {
|
|
1863
|
+
workingHtml = fetchResult.html;
|
|
1864
|
+
workingUrl = actualUrl;
|
|
1865
|
+
}
|
|
1866
|
+
}
|
|
1867
|
+
catch { /* fall through with original HTML */ }
|
|
1868
|
+
}
|
|
1869
|
+
}
|
|
1870
|
+
const $ = load(workingHtml);
|
|
1836
1871
|
// JSON-LD
|
|
1837
1872
|
let jsonLdData = null;
|
|
1838
1873
|
$('script[type="application/ld+json"]').each((_, el) => {
|
|
@@ -1857,14 +1892,14 @@ async function substackExtractor(html, url) {
|
|
|
1857
1892
|
$('meta[property="article:published_time"]').attr('content') ||
|
|
1858
1893
|
$('time').first().attr('datetime') || '';
|
|
1859
1894
|
const publication = $('meta[property="og:site_name"]').attr('content') ||
|
|
1860
|
-
$('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
|
|
1895
|
+
$('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
|
|
1861
1896
|
const description = jsonLdData?.description ||
|
|
1862
1897
|
$('meta[property="og:description"]').attr('content') || '';
|
|
1863
|
-
// Article content
|
|
1898
|
+
// Article content — try multiple Substack CSS patterns
|
|
1864
1899
|
let articleBody = '';
|
|
1865
|
-
const postContent = $('.body.markup, .post-content, article').first();
|
|
1900
|
+
const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
|
|
1866
1901
|
if (postContent.length) {
|
|
1867
|
-
postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
|
|
1902
|
+
postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
|
|
1868
1903
|
const parts = [];
|
|
1869
1904
|
postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
|
|
1870
1905
|
const tag = el.name;
|
|
@@ -1884,19 +1919,35 @@ async function substackExtractor(html, url) {
|
|
|
1884
1919
|
});
|
|
1885
1920
|
articleBody = parts.join('\n\n');
|
|
1886
1921
|
}
|
|
1922
|
+
// If no article body found, try broader search
|
|
1923
|
+
if (!articleBody) {
|
|
1924
|
+
const parts = [];
|
|
1925
|
+
$('main p, article p, [class*="content"] p').each((_, el) => {
|
|
1926
|
+
const text = $(el).text().trim();
|
|
1927
|
+
if (text && text.length > 20)
|
|
1928
|
+
parts.push(text);
|
|
1929
|
+
});
|
|
1930
|
+
articleBody = parts.slice(0, 20).join('\n\n');
|
|
1931
|
+
}
|
|
1887
1932
|
const contentBody = articleBody || description;
|
|
1933
|
+
// Detect if the post appears paywalled (short content with no article body)
|
|
1934
|
+
const isPaywalled = !articleBody && description.length > 0;
|
|
1935
|
+
const paywallNote = isPaywalled
|
|
1936
|
+
? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
|
|
1937
|
+
: '';
|
|
1888
1938
|
const structured = {
|
|
1889
1939
|
title,
|
|
1890
1940
|
author,
|
|
1891
1941
|
publication,
|
|
1892
1942
|
publishDate,
|
|
1893
1943
|
description,
|
|
1894
|
-
|
|
1944
|
+
paywalled: isPaywalled,
|
|
1945
|
+
url: workingUrl,
|
|
1895
1946
|
};
|
|
1896
1947
|
const authorLine = author ? `\n**Author:** ${author}` : '';
|
|
1897
1948
|
const pubLine = publication ? `\n**Publication:** ${publication}` : '';
|
|
1898
1949
|
const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
|
|
1899
|
-
const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
|
|
1950
|
+
const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
|
|
1900
1951
|
return { domain: 'substack.com', type: 'post', structured, cleanContent };
|
|
1901
1952
|
}
|
|
1902
1953
|
catch {
|
|
@@ -2071,11 +2122,20 @@ async function imdbExtractor(html, url) {
|
|
|
2071
2122
|
? jsonLd.director.map((d) => d.name || d).join(', ')
|
|
2072
2123
|
: jsonLd.director?.name || String(jsonLd.director))
|
|
2073
2124
|
: $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
|
|
2074
|
-
// Cast
|
|
2075
|
-
const cast = jsonLd?.actor
|
|
2125
|
+
// Cast — JSON-LD has top actors, also parse HTML for broader cast list
|
|
2126
|
+
const castFromLd = jsonLd?.actor
|
|
2076
2127
|
? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
|
|
2077
|
-
.map((a) => a.name || a)
|
|
2128
|
+
.map((a) => a.name || a)
|
|
2078
2129
|
: [];
|
|
2130
|
+
// Parse additional cast from HTML (IMDB cast section)
|
|
2131
|
+
const castFromHtml = [];
|
|
2132
|
+
// Try multiple IMDB cast selectors across page versions
|
|
2133
|
+
$('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
|
|
2134
|
+
const name = $(el).text().trim();
|
|
2135
|
+
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2136
|
+
castFromHtml.push(name);
|
|
2137
|
+
});
|
|
2138
|
+
const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
|
|
2079
2139
|
// Runtime
|
|
2080
2140
|
const runtime = jsonLd?.duration
|
|
2081
2141
|
? (() => {
|
|
@@ -2085,17 +2145,56 @@ async function imdbExtractor(html, url) {
|
|
|
2085
2145
|
return String(jsonLd.duration);
|
|
2086
2146
|
})()
|
|
2087
2147
|
: '';
|
|
2148
|
+
// Full plot/storyline — try to get the longer version from HTML
|
|
2149
|
+
const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
|
|
2150
|
+
// Additional details: Writers, Keywords, Awards
|
|
2151
|
+
const writers = [];
|
|
2152
|
+
$('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
|
|
2153
|
+
const name = $(el).text().trim();
|
|
2154
|
+
if (name && !writers.includes(name))
|
|
2155
|
+
writers.push(name);
|
|
2156
|
+
});
|
|
2157
|
+
// Keywords — try HTML first, fall back to JSON-LD keywords
|
|
2158
|
+
let keywords = [];
|
|
2159
|
+
$('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
|
|
2160
|
+
const kw = $(el).text().trim();
|
|
2161
|
+
if (kw && kw.length < 30 && !keywords.includes(kw))
|
|
2162
|
+
keywords.push(kw);
|
|
2163
|
+
});
|
|
2164
|
+
// Fall back to JSON-LD keywords if HTML didn't yield any
|
|
2165
|
+
if (!keywords.length && jsonLd?.keywords) {
|
|
2166
|
+
keywords = (typeof jsonLd.keywords === 'string'
|
|
2167
|
+
? jsonLd.keywords.split(',')
|
|
2168
|
+
: Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
|
|
2169
|
+
}
|
|
2170
|
+
// Writers — also try JSON-LD creator field
|
|
2171
|
+
if (!writers.length && jsonLd?.creator) {
|
|
2172
|
+
const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
|
|
2173
|
+
for (const c of creators) {
|
|
2174
|
+
const name = c?.name || (typeof c === 'string' ? c : '');
|
|
2175
|
+
if (name && !writers.includes(name))
|
|
2176
|
+
writers.push(name);
|
|
2177
|
+
}
|
|
2178
|
+
}
|
|
2179
|
+
// Content rating & release date from JSON-LD
|
|
2180
|
+
const contentRating = jsonLd?.contentRating || '';
|
|
2181
|
+
const datePublished = jsonLd?.datePublished || '';
|
|
2088
2182
|
const structured = {
|
|
2089
|
-
title, year, contentType, description, ratingValue, ratingCount,
|
|
2090
|
-
genres, director, cast, runtime, url,
|
|
2183
|
+
title, year, contentType, description: fullPlot, ratingValue, ratingCount,
|
|
2184
|
+
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
|
|
2091
2185
|
};
|
|
2092
2186
|
const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
|
|
2093
2187
|
const genreLine = genres.length ? genres.join(', ') : '';
|
|
2094
2188
|
const directorLine = director ? `**Director:** ${director}` : '';
|
|
2189
|
+
const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
|
|
2095
2190
|
const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
|
|
2096
2191
|
const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
|
|
2192
|
+
const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
|
|
2193
|
+
const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
|
|
2194
|
+
const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
|
|
2097
2195
|
const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
|
|
2098
|
-
const
|
|
2196
|
+
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
|
|
2197
|
+
const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
|
|
2099
2198
|
return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
|
|
2100
2199
|
}
|
|
2101
2200
|
catch {
|
|
@@ -2245,17 +2344,34 @@ async function pypiExtractor(_html, url) {
|
|
|
2245
2344
|
requiresDist: (info.requires_dist || []).slice(0, 20),
|
|
2246
2345
|
classifiers: (info.classifiers || []).slice(0, 10),
|
|
2247
2346
|
};
|
|
2347
|
+
// Full description/README from PyPI (info.description is the full README in markdown)
|
|
2348
|
+
const fullDescription = info.description && info.description.length > 100 &&
|
|
2349
|
+
info.description !== 'UNKNOWN' && info.description !== info.summary
|
|
2350
|
+
? info.description.slice(0, 8000)
|
|
2351
|
+
: null;
|
|
2352
|
+
// Store full description in structured
|
|
2353
|
+
structured.fullDescription = fullDescription;
|
|
2248
2354
|
const installCmd = `pip install ${info.name}`;
|
|
2249
2355
|
const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
|
|
2250
2356
|
const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
|
|
2357
|
+
// Show all dependencies
|
|
2251
2358
|
const depsLine = structured.requiresDist.length
|
|
2252
2359
|
? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
|
|
2253
2360
|
: '';
|
|
2361
|
+
// Classifiers — extract useful ones (license, status, Python versions)
|
|
2362
|
+
const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
|
|
2363
|
+
const classifiersSection = usefulClassifiers.length
|
|
2364
|
+
? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
|
|
2365
|
+
: '';
|
|
2254
2366
|
// Find project URLs
|
|
2255
2367
|
const projectUrlLines = [];
|
|
2256
2368
|
for (const [label, u] of Object.entries(structured.projectUrls)) {
|
|
2257
2369
|
projectUrlLines.push(`- **${label}:** ${u}`);
|
|
2258
2370
|
}
|
|
2371
|
+
// Full description section (package README from PyPI)
|
|
2372
|
+
const descSection = fullDescription
|
|
2373
|
+
? `\n\n## Description\n\n${fullDescription}`
|
|
2374
|
+
: '';
|
|
2259
2375
|
const cleanContent = `# 📦 ${info.name} ${info.version}
|
|
2260
2376
|
|
|
2261
2377
|
${info.summary || ''}
|
|
@@ -2266,7 +2382,7 @@ ${installCmd}
|
|
|
2266
2382
|
|
|
2267
2383
|
**Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
|
|
2268
2384
|
|
|
2269
|
-
${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
|
|
2385
|
+
${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
|
|
2270
2386
|
return { domain: 'pypi.org', type: 'package', structured, cleanContent };
|
|
2271
2387
|
}
|
|
2272
2388
|
catch (e) {
|
|
@@ -2289,6 +2405,38 @@ async function devtoExtractor(html, url) {
|
|
|
2289
2405
|
const slug = pathParts.length >= 2
|
|
2290
2406
|
? pathParts.slice(0, 2).join('/').replace(/^@/, '')
|
|
2291
2407
|
: null;
|
|
2408
|
+
// Homepage: no slug → fetch recent top articles from Dev.to API
|
|
2409
|
+
if (!slug) {
|
|
2410
|
+
try {
|
|
2411
|
+
const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
|
|
2412
|
+
if (Array.isArray(topArticles) && topArticles.length > 0) {
|
|
2413
|
+
const articles = topArticles.map((a) => ({
|
|
2414
|
+
title: a.title || '',
|
|
2415
|
+
author: a.user?.name || '',
|
|
2416
|
+
authorUsername: a.user?.username || '',
|
|
2417
|
+
tags: a.tag_list || [],
|
|
2418
|
+
reactions: a.public_reactions_count || 0,
|
|
2419
|
+
comments: a.comments_count || 0,
|
|
2420
|
+
readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
|
|
2421
|
+
url: a.url || '',
|
|
2422
|
+
publishDate: a.published_at ? a.published_at.split('T')[0] : '',
|
|
2423
|
+
}));
|
|
2424
|
+
const listMd = articles.map((a, i) => {
|
|
2425
|
+
const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
|
|
2426
|
+
const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
|
|
2427
|
+
return `${i + 1}. **[${a.title}](${a.url})**\n by @${a.authorUsername}${tags}\n ${stats} · ${a.publishDate}`;
|
|
2428
|
+
}).join('\n\n');
|
|
2429
|
+
const structured = {
|
|
2430
|
+
title: 'DEV Community — Top Articles',
|
|
2431
|
+
articles,
|
|
2432
|
+
fetchedAt: new Date().toISOString(),
|
|
2433
|
+
};
|
|
2434
|
+
const cleanContent = `# 🧑💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
|
|
2435
|
+
return { domain: 'dev.to', type: 'listing', structured, cleanContent };
|
|
2436
|
+
}
|
|
2437
|
+
}
|
|
2438
|
+
catch { /* fall through to HTML */ }
|
|
2439
|
+
}
|
|
2292
2440
|
if (slug) {
|
|
2293
2441
|
try {
|
|
2294
2442
|
const apiUrl = `https://dev.to/api/articles/${slug}`;
|
package/dist/core/pipeline.js
CHANGED
|
@@ -410,6 +410,18 @@ export async function fetchContent(ctx) {
|
|
|
410
410
|
}
|
|
411
411
|
catch { /* Search fallback also failed — rethrow original BlockedError */ }
|
|
412
412
|
}
|
|
413
|
+
// Enhance error messages with actionable advice
|
|
414
|
+
if (fetchError instanceof BlockedError) {
|
|
415
|
+
const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
|
|
416
|
+
const enhancedError = new BlockedError(actionableMsg);
|
|
417
|
+
throw enhancedError;
|
|
418
|
+
}
|
|
419
|
+
const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
|
|
420
|
+
if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
|
|
421
|
+
const ms = ctx.timeout ?? 30000;
|
|
422
|
+
const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
|
|
423
|
+
throw new Error(enhancedMsg);
|
|
424
|
+
}
|
|
413
425
|
throw fetchError;
|
|
414
426
|
}
|
|
415
427
|
const fetchDuration = ctx.timer.end('fetch');
|
|
@@ -1183,6 +1195,14 @@ export function buildResult(ctx) {
|
|
|
1183
1195
|
let warning;
|
|
1184
1196
|
const contentLen = ctx.content.length;
|
|
1185
1197
|
const htmlLen = ctx.fetchResult?.html?.length || 0;
|
|
1198
|
+
// Add contentQuality metadata for thin content (< 100 words)
|
|
1199
|
+
const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1200
|
+
if (wordCount < 100 && wordCount > 0) {
|
|
1201
|
+
ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
|
|
1202
|
+
if (ctx.metadata) {
|
|
1203
|
+
ctx.metadata.contentQuality = 'thin';
|
|
1204
|
+
}
|
|
1205
|
+
}
|
|
1186
1206
|
if (contentLen < 100 && htmlLen > 1000) {
|
|
1187
1207
|
warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
|
|
1188
1208
|
}
|
package/dist/core/structured-extract.js
CHANGED
|
@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
|
|
|
143
143
|
.replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
|
|
144
144
|
.replace(/[*_`[\]]/g, '')
|
|
145
145
|
.replace(/&[a-z]+;/g, '') // HTML entities
|
|
146
|
+
// Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
|
|
147
|
+
.replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
|
|
146
148
|
.replace(/\s+/g, ' ')
|
|
147
149
|
.trim().slice(0, 150);
|
|
148
150
|
}
|
|
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
|
|
|
156
158
|
if (pageUrl)
|
|
157
159
|
return pageUrl;
|
|
158
160
|
}
|
|
161
|
+
// Director (for movies/films)
|
|
162
|
+
if (/director/.test(lf)) {
|
|
163
|
+
const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
|
|
164
|
+
if (m?.[1])
|
|
165
|
+
return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
|
|
166
|
+
}
|
|
159
167
|
// Author/writer/by
|
|
160
168
|
if (/author|writer|by/.test(lf)) {
|
|
161
169
|
const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
|
|
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
|
|
|
339
347
|
fieldsFound++;
|
|
340
348
|
data[field] = value;
|
|
341
349
|
}
|
|
342
|
-
// Confidence
|
|
350
|
+
// Confidence based on fill rate:
|
|
351
|
+
// - ALL fields null → 0.1 (extraction found nothing useful)
|
|
352
|
+
// - Some fields null → 0.3-0.5 based on fill ratio
|
|
353
|
+
// - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
|
|
343
354
|
const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
|
|
344
|
-
|
|
355
|
+
let confidence;
|
|
356
|
+
if (fieldsFound === 0) {
|
|
357
|
+
confidence = 0.1; // All null — heuristic found nothing
|
|
358
|
+
}
|
|
359
|
+
else if (fieldsFound === totalFields) {
|
|
360
|
+
confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
|
|
361
|
+
}
|
|
362
|
+
else {
|
|
363
|
+
confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
|
|
364
|
+
}
|
|
345
365
|
return {
|
|
346
366
|
data,
|
|
347
367
|
confidence: parseFloat(confidence.toFixed(2)),
|
|
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
|
|
|
395
415
|
return heuristic;
|
|
396
416
|
}
|
|
397
417
|
const { data, missingRequired } = validateAndCoerce(parsed, schema);
|
|
398
|
-
// Confidence
|
|
399
|
-
|
|
418
|
+
// Confidence for LLM extraction:
|
|
419
|
+
// - ALL fields null → 0.1 (LLM couldn't extract anything)
|
|
420
|
+
// - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
|
|
421
|
+
// - All populated → 0.90-0.98 based on fill rate
|
|
400
422
|
const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
|
|
401
423
|
const totalCount = Object.keys(schema.properties).length;
|
|
402
|
-
const
|
|
403
|
-
const
|
|
424
|
+
const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
|
|
425
|
+
const penalty = missingRequired.length * 0.05;
|
|
426
|
+
let confidence;
|
|
427
|
+
if (filledCount === 0) {
|
|
428
|
+
confidence = 0.1; // LLM returned all nulls — extraction failed
|
|
429
|
+
}
|
|
430
|
+
else {
|
|
431
|
+
const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
|
|
432
|
+
confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
|
|
433
|
+
}
|
|
404
434
|
return {
|
|
405
435
|
data,
|
|
406
436
|
confidence: parseFloat(confidence.toFixed(2)),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.11",
|
|
3
|
+
"version": "0.21.12",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|