webpeel 0.21.11 → 0.21.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +264 -21
- package/dist/core/pipeline.js +21 -0
- package/dist/core/structured-extract.js +36 -6
- package/package.json +1 -1
|
package/dist/core/domain-extractors.js
CHANGED
|
@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
|
|
|
1518
1518
|
modified: data.time?.modified || undefined,
|
|
1519
1519
|
};
|
|
1520
1520
|
// Include README if available (some packages have it, some don't)
|
|
1521
|
-
|
|
1521
|
+
let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
|
|
1522
|
+
// If no README in registry, try fetching from unpkg.com
|
|
1523
|
+
if (!readmeText) {
|
|
1524
|
+
try {
|
|
1525
|
+
const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
|
|
1526
|
+
const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
|
|
1527
|
+
if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
|
|
1528
|
+
readmeText = readmeResult.html.slice(0, 5000);
|
|
1529
|
+
}
|
|
1530
|
+
}
|
|
1531
|
+
catch { /* README from unpkg optional */ }
|
|
1532
|
+
}
|
|
1522
1533
|
// Add to structured data
|
|
1523
1534
|
structured.readme = readmeText;
|
|
1524
1535
|
const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
|
|
1536
|
+
// Show ALL dependencies (not capped at 15)
|
|
1525
1537
|
const depsLine = structured.dependencies.length
|
|
1526
|
-
? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.
|
|
1538
|
+
? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
|
|
1539
|
+
: '';
|
|
1540
|
+
const devDepsLine = structured.devDependencies.length
|
|
1541
|
+
? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
|
|
1527
1542
|
: '';
|
|
1528
1543
|
const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
|
|
1529
1544
|
const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
|
|
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
|
|
|
1536
1551
|
${structured.description}
|
|
1537
1552
|
|
|
1538
1553
|
**License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
|
|
1539
|
-
**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
|
|
1554
|
+
**Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
|
|
1540
1555
|
return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
|
|
1541
1556
|
}
|
|
1542
1557
|
catch (e) {
|
|
@@ -1780,15 +1795,57 @@ async function mediumExtractor(html, url) {
|
|
|
1780
1795
|
$('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
|
|
1781
1796
|
const description = jsonLdData?.description ||
|
|
1782
1797
|
$('meta[property="og:description"]').attr('content') || '';
|
|
1798
|
+
// Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
|
|
1799
|
+
let publication = '';
|
|
1800
|
+
try {
|
|
1801
|
+
const urlObj2 = new URL(url);
|
|
1802
|
+
const hostname = urlObj2.hostname;
|
|
1803
|
+
if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
|
|
1804
|
+
publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
|
|
1805
|
+
}
|
|
1806
|
+
}
|
|
1807
|
+
catch { /* ignore */ }
|
|
1808
|
+
if (!publication) {
|
|
1809
|
+
publication = $('[data-testid="publicationName"]').text().trim() ||
|
|
1810
|
+
$('a[data-testid="publicationName"]').text().trim() ||
|
|
1811
|
+
$('meta[property="article:section"]').attr('content') ||
|
|
1812
|
+
$('a[href*="/tag/"]').first().text().trim() || '';
|
|
1813
|
+
}
|
|
1814
|
+
// Author bio — usually shown in an author card or bio section
|
|
1815
|
+
const authorBio = $('[data-testid="authorBio"]').text().trim() ||
|
|
1816
|
+
$('p[class*="bio"]').first().text().trim() ||
|
|
1817
|
+
$('[aria-label="authorBio"]').text().trim() || '';
|
|
1818
|
+
// Clap count — Medium shows clap button with count
|
|
1819
|
+
let clapCount = '';
|
|
1820
|
+
$('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
|
|
1821
|
+
const txt = $(el).text().trim();
|
|
1822
|
+
if (txt && /\d/.test(txt)) {
|
|
1823
|
+
clapCount = txt;
|
|
1824
|
+
return false;
|
|
1825
|
+
}
|
|
1826
|
+
});
|
|
1827
|
+
if (!clapCount) {
|
|
1828
|
+
// Fallback: find spans that look like clap counts (e.g., "2.4K")
|
|
1829
|
+
$('span').filter((_, el) => {
|
|
1830
|
+
const label = $(el).closest('[aria-label]').attr('aria-label') || '';
|
|
1831
|
+
return label.toLowerCase().includes('clap');
|
|
1832
|
+
}).each((_, el) => {
|
|
1833
|
+
const txt = $(el).text().trim();
|
|
1834
|
+
if (txt && /\d/.test(txt)) {
|
|
1835
|
+
clapCount = txt;
|
|
1836
|
+
return false;
|
|
1837
|
+
}
|
|
1838
|
+
});
|
|
1839
|
+
}
|
|
1783
1840
|
// Extract article body — Medium puts content in <article> or section
|
|
1784
1841
|
let articleBody = '';
|
|
1785
1842
|
const articleEl = $('article').first();
|
|
1786
1843
|
if (articleEl.length) {
|
|
1787
|
-
// Remove nav, aside, buttons
|
|
1788
|
-
articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
|
|
1844
|
+
// Remove nav, aside, buttons, author-card, footer sections
|
|
1845
|
+
articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
|
|
1789
1846
|
// Get paragraphs and headings
|
|
1790
1847
|
const parts = [];
|
|
1791
|
-
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
|
|
1848
|
+
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
|
|
1792
1849
|
const tag = el.name;
|
|
1793
1850
|
const text = $(el).text().trim();
|
|
1794
1851
|
if (!text || text.length < 5)
|
|
@@ -1801,6 +1858,8 @@ async function mediumExtractor(html, url) {
|
|
|
1801
1858
|
parts.push(`> ${text}`);
|
|
1802
1859
|
else if (tag === 'pre')
|
|
1803
1860
|
parts.push('```\n' + text + '\n```');
|
|
1861
|
+
else if (tag === 'figcaption')
|
|
1862
|
+
parts.push(`*${text}*`);
|
|
1804
1863
|
else
|
|
1805
1864
|
parts.push(text);
|
|
1806
1865
|
});
|
|
@@ -1811,15 +1870,22 @@ async function mediumExtractor(html, url) {
|
|
|
1811
1870
|
const structured = {
|
|
1812
1871
|
title,
|
|
1813
1872
|
author,
|
|
1873
|
+
authorBio,
|
|
1814
1874
|
publishDate,
|
|
1815
1875
|
readingTime,
|
|
1816
1876
|
description,
|
|
1877
|
+
publication,
|
|
1878
|
+
clapCount,
|
|
1817
1879
|
url,
|
|
1818
1880
|
};
|
|
1819
1881
|
const authorLine = author ? `\n**Author:** ${author}` : '';
|
|
1882
|
+
const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
|
|
1820
1883
|
const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
|
|
1821
1884
|
const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
|
|
1822
|
-
const
|
|
1885
|
+
const pubLine = publication ? `\n**Publication:** ${publication}` : '';
|
|
1886
|
+
const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
|
|
1887
|
+
// No hard character cap — let the pipeline's budget/maxTokens handle truncation
|
|
1888
|
+
const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
|
|
1823
1889
|
return { domain: 'medium.com', type: 'article', structured, cleanContent };
|
|
1824
1890
|
}
|
|
1825
1891
|
catch {
|
|
@@ -1832,7 +1898,27 @@ async function mediumExtractor(html, url) {
|
|
|
1832
1898
|
async function substackExtractor(html, url) {
|
|
1833
1899
|
try {
|
|
1834
1900
|
const { load } = await import('cheerio');
|
|
1835
|
-
|
|
1901
|
+
// Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
|
|
1902
|
+
// These are share links that redirect to the actual post. Redirect to the real URL.
|
|
1903
|
+
const urlObj = new URL(url);
|
|
1904
|
+
let workingHtml = html;
|
|
1905
|
+
let workingUrl = url;
|
|
1906
|
+
if (urlObj.hostname === 'open.substack.com') {
|
|
1907
|
+
const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
|
|
1908
|
+
if (openMatch) {
|
|
1909
|
+
const [, publication, slug] = openMatch;
|
|
1910
|
+
const actualUrl = `https://${publication}.substack.com/p/${slug}`;
|
|
1911
|
+
try {
|
|
1912
|
+
const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
|
|
1913
|
+
if (fetchResult?.html && fetchResult.html.length > 500) {
|
|
1914
|
+
workingHtml = fetchResult.html;
|
|
1915
|
+
workingUrl = actualUrl;
|
|
1916
|
+
}
|
|
1917
|
+
}
|
|
1918
|
+
catch { /* fall through with original HTML */ }
|
|
1919
|
+
}
|
|
1920
|
+
}
|
|
1921
|
+
const $ = load(workingHtml);
|
|
1836
1922
|
// JSON-LD
|
|
1837
1923
|
let jsonLdData = null;
|
|
1838
1924
|
$('script[type="application/ld+json"]').each((_, el) => {
|
|
@@ -1857,14 +1943,14 @@ async function substackExtractor(html, url) {
|
|
|
1857
1943
|
$('meta[property="article:published_time"]').attr('content') ||
|
|
1858
1944
|
$('time').first().attr('datetime') || '';
|
|
1859
1945
|
const publication = $('meta[property="og:site_name"]').attr('content') ||
|
|
1860
|
-
$('a.navbar-title-link').text().trim() || new URL(
|
|
1946
|
+
$('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
|
|
1861
1947
|
const description = jsonLdData?.description ||
|
|
1862
1948
|
$('meta[property="og:description"]').attr('content') || '';
|
|
1863
|
-
// Article content
|
|
1949
|
+
// Article content — try multiple Substack CSS patterns
|
|
1864
1950
|
let articleBody = '';
|
|
1865
|
-
const postContent = $('.body.markup, .post-content, article').first();
|
|
1951
|
+
const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
|
|
1866
1952
|
if (postContent.length) {
|
|
1867
|
-
postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
|
|
1953
|
+
postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
|
|
1868
1954
|
const parts = [];
|
|
1869
1955
|
postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
|
|
1870
1956
|
const tag = el.name;
|
|
@@ -1884,19 +1970,35 @@ async function substackExtractor(html, url) {
|
|
|
1884
1970
|
});
|
|
1885
1971
|
articleBody = parts.join('\n\n');
|
|
1886
1972
|
}
|
|
1973
|
+
// If no article body found, try broader search
|
|
1974
|
+
if (!articleBody) {
|
|
1975
|
+
const parts = [];
|
|
1976
|
+
$('main p, article p, [class*="content"] p').each((_, el) => {
|
|
1977
|
+
const text = $(el).text().trim();
|
|
1978
|
+
if (text && text.length > 20)
|
|
1979
|
+
parts.push(text);
|
|
1980
|
+
});
|
|
1981
|
+
articleBody = parts.slice(0, 20).join('\n\n');
|
|
1982
|
+
}
|
|
1887
1983
|
const contentBody = articleBody || description;
|
|
1984
|
+
// Detect if the post appears paywalled (short content with no article body)
|
|
1985
|
+
const isPaywalled = !articleBody && description.length > 0;
|
|
1986
|
+
const paywallNote = isPaywalled
|
|
1987
|
+
? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
|
|
1988
|
+
: '';
|
|
1888
1989
|
const structured = {
|
|
1889
1990
|
title,
|
|
1890
1991
|
author,
|
|
1891
1992
|
publication,
|
|
1892
1993
|
publishDate,
|
|
1893
1994
|
description,
|
|
1894
|
-
|
|
1995
|
+
paywalled: isPaywalled,
|
|
1996
|
+
url: workingUrl,
|
|
1895
1997
|
};
|
|
1896
1998
|
const authorLine = author ? `\n**Author:** ${author}` : '';
|
|
1897
1999
|
const pubLine = publication ? `\n**Publication:** ${publication}` : '';
|
|
1898
2000
|
const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
|
|
1899
|
-
const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
|
|
2001
|
+
const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
|
|
1900
2002
|
return { domain: 'substack.com', type: 'post', structured, cleanContent };
|
|
1901
2003
|
}
|
|
1902
2004
|
catch {
|
|
@@ -2071,11 +2173,37 @@ async function imdbExtractor(html, url) {
|
|
|
2071
2173
|
? jsonLd.director.map((d) => d.name || d).join(', ')
|
|
2072
2174
|
: jsonLd.director?.name || String(jsonLd.director))
|
|
2073
2175
|
: $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
|
|
2074
|
-
// Cast
|
|
2075
|
-
const
|
|
2176
|
+
// Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
|
|
2177
|
+
const castPairs = [];
|
|
2178
|
+
// IMDB new UI: each title-cast-item contains actor link + character link
|
|
2179
|
+
$('[data-testid="title-cast-item"]').each((_, el) => {
|
|
2180
|
+
const actorEl = $(el).find('a[href*="/name/nm"]').first();
|
|
2181
|
+
const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
|
|
2182
|
+
const actor = actorEl.text().trim();
|
|
2183
|
+
// Character name may span multiple elements; clean whitespace
|
|
2184
|
+
const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
|
|
2185
|
+
if (actor && actor.length > 1) {
|
|
2186
|
+
castPairs.push({ actor, character: character || '' });
|
|
2187
|
+
}
|
|
2188
|
+
});
|
|
2189
|
+
// Fall back to classic cast list (older IMDB page versions)
|
|
2190
|
+
const castFromHtml = [];
|
|
2191
|
+
if (!castPairs.length) {
|
|
2192
|
+
$('.cast_list td.itemprop a').each((_, el) => {
|
|
2193
|
+
const name = $(el).text().trim();
|
|
2194
|
+
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2195
|
+
castFromHtml.push(name);
|
|
2196
|
+
});
|
|
2197
|
+
}
|
|
2198
|
+
// JSON-LD actors as final fallback
|
|
2199
|
+
const castFromLd = jsonLd?.actor
|
|
2076
2200
|
? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
|
|
2077
|
-
.map((a) => a.name || a)
|
|
2201
|
+
.map((a) => a.name || a)
|
|
2078
2202
|
: [];
|
|
2203
|
+
// Build final cast list: with characters if available (top 10), otherwise names only
|
|
2204
|
+
const cast = castPairs.length > 0
|
|
2205
|
+
? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
|
|
2206
|
+
: [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
|
|
2079
2207
|
// Runtime
|
|
2080
2208
|
const runtime = jsonLd?.duration
|
|
2081
2209
|
? (() => {
|
|
@@ -2085,17 +2213,82 @@ async function imdbExtractor(html, url) {
|
|
|
2085
2213
|
return String(jsonLd.duration);
|
|
2086
2214
|
})()
|
|
2087
2215
|
: '';
|
|
2216
|
+
// Full plot/storyline — try to get the longer version from HTML
|
|
2217
|
+
const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
|
|
2218
|
+
// Additional details: Writers, Keywords, Awards
|
|
2219
|
+
const writers = [];
|
|
2220
|
+
$('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
|
|
2221
|
+
const name = $(el).text().trim();
|
|
2222
|
+
if (name && !writers.includes(name))
|
|
2223
|
+
writers.push(name);
|
|
2224
|
+
});
|
|
2225
|
+
// Keywords — try HTML first, fall back to JSON-LD keywords
|
|
2226
|
+
let keywords = [];
|
|
2227
|
+
$('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
|
|
2228
|
+
const kw = $(el).text().trim();
|
|
2229
|
+
if (kw && kw.length < 30 && !keywords.includes(kw))
|
|
2230
|
+
keywords.push(kw);
|
|
2231
|
+
});
|
|
2232
|
+
// Fall back to JSON-LD keywords if HTML didn't yield any
|
|
2233
|
+
if (!keywords.length && jsonLd?.keywords) {
|
|
2234
|
+
keywords = (typeof jsonLd.keywords === 'string'
|
|
2235
|
+
? jsonLd.keywords.split(',')
|
|
2236
|
+
: Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
|
|
2237
|
+
}
|
|
2238
|
+
// Writers — also try JSON-LD creator field
|
|
2239
|
+
if (!writers.length && jsonLd?.creator) {
|
|
2240
|
+
const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
|
|
2241
|
+
for (const c of creators) {
|
|
2242
|
+
const name = c?.name || (typeof c === 'string' ? c : '');
|
|
2243
|
+
if (name && !writers.includes(name))
|
|
2244
|
+
writers.push(name);
|
|
2245
|
+
}
|
|
2246
|
+
}
|
|
2247
|
+
// Awards / accolades — try hero accolades chip, then any awards-related link text
|
|
2248
|
+
let awardsSummary = '';
|
|
2249
|
+
// IMDB new UI: awards accolades chip in the hero section
|
|
2250
|
+
const accoladesEl = $('[data-testid="awards-accolades"]');
|
|
2251
|
+
if (accoladesEl.length) {
|
|
2252
|
+
awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
|
|
2253
|
+
}
|
|
2254
|
+
// Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
|
|
2255
|
+
if (!awardsSummary) {
|
|
2256
|
+
const titleMatch = url.match(/\/(tt\d+)/);
|
|
2257
|
+
const titleId = titleMatch ? titleMatch[1] : '';
|
|
2258
|
+
if (titleId) {
|
|
2259
|
+
$(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
|
|
2260
|
+
const text = $(el).text().trim().replace(/\s+/g, ' ');
|
|
2261
|
+
if (text && text.length > 3 && text.length < 200) {
|
|
2262
|
+
awardsSummary = text;
|
|
2263
|
+
return false; // break
|
|
2264
|
+
}
|
|
2265
|
+
});
|
|
2266
|
+
}
|
|
2267
|
+
}
|
|
2268
|
+
// Fallback: JSON-LD award field
|
|
2269
|
+
if (!awardsSummary && jsonLd?.award) {
|
|
2270
|
+
awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
|
|
2271
|
+
}
|
|
2272
|
+
// Content rating & release date from JSON-LD
|
|
2273
|
+
const contentRating = jsonLd?.contentRating || '';
|
|
2274
|
+
const datePublished = jsonLd?.datePublished || '';
|
|
2088
2275
|
const structured = {
|
|
2089
|
-
title, year, contentType, description, ratingValue, ratingCount,
|
|
2090
|
-
genres, director, cast, runtime, url,
|
|
2276
|
+
title, year, contentType, description: fullPlot, ratingValue, ratingCount,
|
|
2277
|
+
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
|
|
2091
2278
|
};
|
|
2092
2279
|
const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
|
|
2093
2280
|
const genreLine = genres.length ? genres.join(', ') : '';
|
|
2094
2281
|
const directorLine = director ? `**Director:** ${director}` : '';
|
|
2282
|
+
const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
|
|
2095
2283
|
const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
|
|
2096
2284
|
const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
|
|
2285
|
+
const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
|
|
2286
|
+
const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
|
|
2287
|
+
const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
|
|
2288
|
+
const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
|
|
2097
2289
|
const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
|
|
2098
|
-
const
|
|
2290
|
+
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
|
|
2291
|
+
const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
|
|
2099
2292
|
return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
|
|
2100
2293
|
}
|
|
2101
2294
|
catch {
|
|
@@ -2232,6 +2425,7 @@ async function pypiExtractor(_html, url) {
|
|
|
2232
2425
|
return null;
|
|
2233
2426
|
const info = data.info;
|
|
2234
2427
|
const structured = {
|
|
2428
|
+
title: `${info.name} ${info.version}`,
|
|
2235
2429
|
name: info.name,
|
|
2236
2430
|
version: info.version,
|
|
2237
2431
|
description: info.summary || '',
|
|
@@ -2245,17 +2439,34 @@ async function pypiExtractor(_html, url) {
|
|
|
2245
2439
|
requiresDist: (info.requires_dist || []).slice(0, 20),
|
|
2246
2440
|
classifiers: (info.classifiers || []).slice(0, 10),
|
|
2247
2441
|
};
|
|
2442
|
+
// Full description/README from PyPI (info.description is the full README in markdown)
|
|
2443
|
+
const fullDescription = info.description && info.description.length > 100 &&
|
|
2444
|
+
info.description !== 'UNKNOWN' && info.description !== info.summary
|
|
2445
|
+
? info.description.slice(0, 8000)
|
|
2446
|
+
: null;
|
|
2447
|
+
// Store full description in structured
|
|
2448
|
+
structured.fullDescription = fullDescription;
|
|
2248
2449
|
const installCmd = `pip install ${info.name}`;
|
|
2249
2450
|
const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
|
|
2250
2451
|
const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
|
|
2452
|
+
// Show all dependencies
|
|
2251
2453
|
const depsLine = structured.requiresDist.length
|
|
2252
2454
|
? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
|
|
2253
2455
|
: '';
|
|
2456
|
+
// Classifiers — extract useful ones (license, status, Python versions)
|
|
2457
|
+
const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
|
|
2458
|
+
const classifiersSection = usefulClassifiers.length
|
|
2459
|
+
? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
|
|
2460
|
+
: '';
|
|
2254
2461
|
// Find project URLs
|
|
2255
2462
|
const projectUrlLines = [];
|
|
2256
2463
|
for (const [label, u] of Object.entries(structured.projectUrls)) {
|
|
2257
2464
|
projectUrlLines.push(`- **${label}:** ${u}`);
|
|
2258
2465
|
}
|
|
2466
|
+
// Full description section (package README from PyPI)
|
|
2467
|
+
const descSection = fullDescription
|
|
2468
|
+
? `\n\n## Description\n\n${fullDescription}`
|
|
2469
|
+
: '';
|
|
2259
2470
|
const cleanContent = `# 📦 ${info.name} ${info.version}
|
|
2260
2471
|
|
|
2261
2472
|
${info.summary || ''}
|
|
@@ -2266,7 +2477,7 @@ ${installCmd}
|
|
|
2266
2477
|
|
|
2267
2478
|
**Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
|
|
2268
2479
|
|
|
2269
|
-
${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
|
|
2480
|
+
${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
|
|
2270
2481
|
return { domain: 'pypi.org', type: 'package', structured, cleanContent };
|
|
2271
2482
|
}
|
|
2272
2483
|
catch (e) {
|
|
@@ -2289,6 +2500,38 @@ async function devtoExtractor(html, url) {
|
|
|
2289
2500
|
const slug = pathParts.length >= 2
|
|
2290
2501
|
? pathParts.slice(0, 2).join('/').replace(/^@/, '')
|
|
2291
2502
|
: null;
|
|
2503
|
+
// Homepage: no slug → fetch recent top articles from Dev.to API
|
|
2504
|
+
if (!slug) {
|
|
2505
|
+
try {
|
|
2506
|
+
const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
|
|
2507
|
+
if (Array.isArray(topArticles) && topArticles.length > 0) {
|
|
2508
|
+
const articles = topArticles.map((a) => ({
|
|
2509
|
+
title: a.title || '',
|
|
2510
|
+
author: a.user?.name || '',
|
|
2511
|
+
authorUsername: a.user?.username || '',
|
|
2512
|
+
tags: a.tag_list || [],
|
|
2513
|
+
reactions: a.public_reactions_count || 0,
|
|
2514
|
+
comments: a.comments_count || 0,
|
|
2515
|
+
readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
|
|
2516
|
+
url: a.url || '',
|
|
2517
|
+
publishDate: a.published_at ? a.published_at.split('T')[0] : '',
|
|
2518
|
+
}));
|
|
2519
|
+
const listMd = articles.map((a, i) => {
|
|
2520
|
+
const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
|
|
2521
|
+
const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
|
|
2522
|
+
return `${i + 1}. **[${a.title}](${a.url})**\n by @${a.authorUsername}${tags}\n ${stats} · ${a.publishDate}`;
|
|
2523
|
+
}).join('\n\n');
|
|
2524
|
+
const structured = {
|
|
2525
|
+
title: 'DEV Community — Top Articles',
|
|
2526
|
+
articles,
|
|
2527
|
+
fetchedAt: new Date().toISOString(),
|
|
2528
|
+
};
|
|
2529
|
+
const cleanContent = `# 🧑💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
|
|
2530
|
+
return { domain: 'dev.to', type: 'listing', structured, cleanContent };
|
|
2531
|
+
}
|
|
2532
|
+
}
|
|
2533
|
+
catch { /* fall through to HTML */ }
|
|
2534
|
+
}
|
|
2292
2535
|
if (slug) {
|
|
2293
2536
|
try {
|
|
2294
2537
|
const apiUrl = `https://dev.to/api/articles/${slug}`;
|
package/dist/core/pipeline.js
CHANGED
|
@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
|
|
|
344
344
|
blockResources: ctx.options.blockResources,
|
|
345
345
|
cloaked: ctx.options.cloaked,
|
|
346
346
|
cycle: ctx.options.cycle,
|
|
347
|
+
tls: ctx.options.tls,
|
|
347
348
|
noEscalate: ctx.options.noEscalate,
|
|
348
349
|
});
|
|
349
350
|
}
|
|
@@ -410,6 +411,18 @@ export async function fetchContent(ctx) {
|
|
|
410
411
|
}
|
|
411
412
|
catch { /* Search fallback also failed — rethrow original BlockedError */ }
|
|
412
413
|
}
|
|
414
|
+
// Enhance error messages with actionable advice
|
|
415
|
+
if (fetchError instanceof BlockedError) {
|
|
416
|
+
const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
|
|
417
|
+
const enhancedError = new BlockedError(actionableMsg);
|
|
418
|
+
throw enhancedError;
|
|
419
|
+
}
|
|
420
|
+
const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
|
|
421
|
+
if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
|
|
422
|
+
const ms = ctx.timeout ?? 30000;
|
|
423
|
+
const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
|
|
424
|
+
throw new Error(enhancedMsg);
|
|
425
|
+
}
|
|
413
426
|
throw fetchError;
|
|
414
427
|
}
|
|
415
428
|
const fetchDuration = ctx.timer.end('fetch');
|
|
@@ -1183,6 +1196,14 @@ export function buildResult(ctx) {
|
|
|
1183
1196
|
let warning;
|
|
1184
1197
|
const contentLen = ctx.content.length;
|
|
1185
1198
|
const htmlLen = ctx.fetchResult?.html?.length || 0;
|
|
1199
|
+
// Add contentQuality metadata for thin content (< 100 words)
|
|
1200
|
+
const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
|
|
1201
|
+
if (wordCount < 100 && wordCount > 0) {
|
|
1202
|
+
ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
|
|
1203
|
+
if (ctx.metadata) {
|
|
1204
|
+
ctx.metadata.contentQuality = 'thin';
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1186
1207
|
if (contentLen < 100 && htmlLen > 1000) {
|
|
1187
1208
|
warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
|
|
1188
1209
|
}
|
|
package/dist/core/structured-extract.js
CHANGED
|
@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
|
|
|
143
143
|
.replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
|
|
144
144
|
.replace(/[*_`[\]]/g, '')
|
|
145
145
|
.replace(/&[a-z]+;/g, '') // HTML entities
|
|
146
|
+
// Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
|
|
147
|
+
.replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
|
|
146
148
|
.replace(/\s+/g, ' ')
|
|
147
149
|
.trim().slice(0, 150);
|
|
148
150
|
}
|
|
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
|
|
|
156
158
|
if (pageUrl)
|
|
157
159
|
return pageUrl;
|
|
158
160
|
}
|
|
161
|
+
// Director (for movies/films)
|
|
162
|
+
if (/director/.test(lf)) {
|
|
163
|
+
const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
|
|
164
|
+
if (m?.[1])
|
|
165
|
+
return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
|
|
166
|
+
}
|
|
159
167
|
// Author/writer/by
|
|
160
168
|
if (/author|writer|by/.test(lf)) {
|
|
161
169
|
const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
|
|
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
|
|
|
339
347
|
fieldsFound++;
|
|
340
348
|
data[field] = value;
|
|
341
349
|
}
|
|
342
|
-
// Confidence
|
|
350
|
+
// Confidence based on fill rate:
|
|
351
|
+
// - ALL fields null → 0.1 (extraction found nothing useful)
|
|
352
|
+
// - Some fields null → 0.3-0.5 based on fill ratio
|
|
353
|
+
// - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
|
|
343
354
|
const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
|
|
344
|
-
|
|
355
|
+
let confidence;
|
|
356
|
+
if (fieldsFound === 0) {
|
|
357
|
+
confidence = 0.1; // All null — heuristic found nothing
|
|
358
|
+
}
|
|
359
|
+
else if (fieldsFound === totalFields) {
|
|
360
|
+
confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
|
|
361
|
+
}
|
|
362
|
+
else {
|
|
363
|
+
confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
|
|
364
|
+
}
|
|
345
365
|
return {
|
|
346
366
|
data,
|
|
347
367
|
confidence: parseFloat(confidence.toFixed(2)),
|
|
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
|
|
|
395
415
|
return heuristic;
|
|
396
416
|
}
|
|
397
417
|
const { data, missingRequired } = validateAndCoerce(parsed, schema);
|
|
398
|
-
// Confidence
|
|
399
|
-
|
|
418
|
+
// Confidence for LLM extraction:
|
|
419
|
+
// - ALL fields null → 0.1 (LLM couldn't extract anything)
|
|
420
|
+
// - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
|
|
421
|
+
// - All populated → 0.90-0.98 based on fill rate
|
|
400
422
|
const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
|
|
401
423
|
const totalCount = Object.keys(schema.properties).length;
|
|
402
|
-
const
|
|
403
|
-
const
|
|
424
|
+
const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
|
|
425
|
+
const penalty = missingRequired.length * 0.05;
|
|
426
|
+
let confidence;
|
|
427
|
+
if (filledCount === 0) {
|
|
428
|
+
confidence = 0.1; // LLM returned all nulls — extraction failed
|
|
429
|
+
}
|
|
430
|
+
else {
|
|
431
|
+
const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
|
|
432
|
+
confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
|
|
433
|
+
}
|
|
404
434
|
return {
|
|
405
435
|
data,
|
|
406
436
|
confidence: parseFloat(confidence.toFixed(2)),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.11",
|
|
3
|
+
"version": "0.21.13",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|