webpeel 0.21.11 → 0.21.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
1518
1518
  modified: data.time?.modified || undefined,
1519
1519
  };
1520
1520
  // Include README if available (some packages have it, some don't)
1521
- const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
1521
+ let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
1522
+ // If no README in registry, try fetching from unpkg.com
1523
+ if (!readmeText) {
1524
+ try {
1525
+ const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
1526
+ const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
1527
+ if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
1528
+ readmeText = readmeResult.html.slice(0, 5000);
1529
+ }
1530
+ }
1531
+ catch { /* README from unpkg optional */ }
1532
+ }
1522
1533
  // Add to structured data
1523
1534
  structured.readme = readmeText;
1524
1535
  const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
1536
+ // Show ALL dependencies (not capped at 15)
1525
1537
  const depsLine = structured.dependencies.length
1526
- ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
1538
+ ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
1539
+ : '';
1540
+ const devDepsLine = structured.devDependencies.length
1541
+ ? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
1527
1542
  : '';
1528
1543
  const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
1529
1544
  const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
1536
1551
  ${structured.description}
1537
1552
 
1538
1553
  **License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
1539
- **Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
1554
+ **Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
1540
1555
  return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
1541
1556
  }
1542
1557
  catch (e) {
@@ -1780,15 +1795,57 @@ async function mediumExtractor(html, url) {
1780
1795
  $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
1781
1796
  const description = jsonLdData?.description ||
1782
1797
  $('meta[property="og:description"]').attr('content') || '';
1798
+ // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
1799
+ let publication = '';
1800
+ try {
1801
+ const urlObj2 = new URL(url);
1802
+ const hostname = urlObj2.hostname;
1803
+ if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
1804
+ publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
1805
+ }
1806
+ }
1807
+ catch { /* ignore */ }
1808
+ if (!publication) {
1809
+ publication = $('[data-testid="publicationName"]').text().trim() ||
1810
+ $('a[data-testid="publicationName"]').text().trim() ||
1811
+ $('meta[property="article:section"]').attr('content') ||
1812
+ $('a[href*="/tag/"]').first().text().trim() || '';
1813
+ }
1814
+ // Author bio — usually shown in an author card or bio section
1815
+ const authorBio = $('[data-testid="authorBio"]').text().trim() ||
1816
+ $('p[class*="bio"]').first().text().trim() ||
1817
+ $('[aria-label="authorBio"]').text().trim() || '';
1818
+ // Clap count — Medium shows clap button with count
1819
+ let clapCount = '';
1820
+ $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
1821
+ const txt = $(el).text().trim();
1822
+ if (txt && /\d/.test(txt)) {
1823
+ clapCount = txt;
1824
+ return false;
1825
+ }
1826
+ });
1827
+ if (!clapCount) {
1828
+ // Fallback: find spans that look like clap counts (e.g., "2.4K")
1829
+ $('span').filter((_, el) => {
1830
+ const label = $(el).closest('[aria-label]').attr('aria-label') || '';
1831
+ return label.toLowerCase().includes('clap');
1832
+ }).each((_, el) => {
1833
+ const txt = $(el).text().trim();
1834
+ if (txt && /\d/.test(txt)) {
1835
+ clapCount = txt;
1836
+ return false;
1837
+ }
1838
+ });
1839
+ }
1783
1840
  // Extract article body — Medium puts content in <article> or section
1784
1841
  let articleBody = '';
1785
1842
  const articleEl = $('article').first();
1786
1843
  if (articleEl.length) {
1787
- // Remove nav, aside, buttons
1788
- articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
1844
+ // Remove nav, aside, buttons, author-card, footer sections
1845
+ articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
1789
1846
  // Get paragraphs and headings
1790
1847
  const parts = [];
1791
- articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
1848
+ articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
1792
1849
  const tag = el.name;
1793
1850
  const text = $(el).text().trim();
1794
1851
  if (!text || text.length < 5)
@@ -1801,6 +1858,8 @@ async function mediumExtractor(html, url) {
1801
1858
  parts.push(`> ${text}`);
1802
1859
  else if (tag === 'pre')
1803
1860
  parts.push('```\n' + text + '\n```');
1861
+ else if (tag === 'figcaption')
1862
+ parts.push(`*${text}*`);
1804
1863
  else
1805
1864
  parts.push(text);
1806
1865
  });
@@ -1811,15 +1870,22 @@ async function mediumExtractor(html, url) {
1811
1870
  const structured = {
1812
1871
  title,
1813
1872
  author,
1873
+ authorBio,
1814
1874
  publishDate,
1815
1875
  readingTime,
1816
1876
  description,
1877
+ publication,
1878
+ clapCount,
1817
1879
  url,
1818
1880
  };
1819
1881
  const authorLine = author ? `\n**Author:** ${author}` : '';
1882
+ const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
1820
1883
  const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
1821
1884
  const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
1822
- const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${contentBody.substring(0, 8000)}`;
1885
+ const pubLine = publication ? `\n**Publication:** ${publication}` : '';
1886
+ const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
1887
+ // No hard character cap — let the pipeline's budget/maxTokens handle truncation
1888
+ const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
1823
1889
  return { domain: 'medium.com', type: 'article', structured, cleanContent };
1824
1890
  }
1825
1891
  catch {
@@ -1832,7 +1898,27 @@ async function mediumExtractor(html, url) {
1832
1898
  async function substackExtractor(html, url) {
1833
1899
  try {
1834
1900
  const { load } = await import('cheerio');
1835
- const $ = load(html);
1901
+ // Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
1902
+ // These are share links that redirect to the actual post. Redirect to the real URL.
1903
+ const urlObj = new URL(url);
1904
+ let workingHtml = html;
1905
+ let workingUrl = url;
1906
+ if (urlObj.hostname === 'open.substack.com') {
1907
+ const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
1908
+ if (openMatch) {
1909
+ const [, publication, slug] = openMatch;
1910
+ const actualUrl = `https://${publication}.substack.com/p/${slug}`;
1911
+ try {
1912
+ const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
1913
+ if (fetchResult?.html && fetchResult.html.length > 500) {
1914
+ workingHtml = fetchResult.html;
1915
+ workingUrl = actualUrl;
1916
+ }
1917
+ }
1918
+ catch { /* fall through with original HTML */ }
1919
+ }
1920
+ }
1921
+ const $ = load(workingHtml);
1836
1922
  // JSON-LD
1837
1923
  let jsonLdData = null;
1838
1924
  $('script[type="application/ld+json"]').each((_, el) => {
@@ -1857,14 +1943,14 @@ async function substackExtractor(html, url) {
1857
1943
  $('meta[property="article:published_time"]').attr('content') ||
1858
1944
  $('time').first().attr('datetime') || '';
1859
1945
  const publication = $('meta[property="og:site_name"]').attr('content') ||
1860
- $('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
1946
+ $('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
1861
1947
  const description = jsonLdData?.description ||
1862
1948
  $('meta[property="og:description"]').attr('content') || '';
1863
- // Article content
1949
+ // Article content — try multiple Substack CSS patterns
1864
1950
  let articleBody = '';
1865
- const postContent = $('.body.markup, .post-content, article').first();
1951
+ const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
1866
1952
  if (postContent.length) {
1867
- postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
1953
+ postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
1868
1954
  const parts = [];
1869
1955
  postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
1870
1956
  const tag = el.name;
@@ -1884,19 +1970,35 @@ async function substackExtractor(html, url) {
1884
1970
  });
1885
1971
  articleBody = parts.join('\n\n');
1886
1972
  }
1973
+ // If no article body found, try broader search
1974
+ if (!articleBody) {
1975
+ const parts = [];
1976
+ $('main p, article p, [class*="content"] p').each((_, el) => {
1977
+ const text = $(el).text().trim();
1978
+ if (text && text.length > 20)
1979
+ parts.push(text);
1980
+ });
1981
+ articleBody = parts.slice(0, 20).join('\n\n');
1982
+ }
1887
1983
  const contentBody = articleBody || description;
1984
+ // Detect if the post appears paywalled (short content with no article body)
1985
+ const isPaywalled = !articleBody && description.length > 0;
1986
+ const paywallNote = isPaywalled
1987
+ ? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
1988
+ : '';
1888
1989
  const structured = {
1889
1990
  title,
1890
1991
  author,
1891
1992
  publication,
1892
1993
  publishDate,
1893
1994
  description,
1894
- url,
1995
+ paywalled: isPaywalled,
1996
+ url: workingUrl,
1895
1997
  };
1896
1998
  const authorLine = author ? `\n**Author:** ${author}` : '';
1897
1999
  const pubLine = publication ? `\n**Publication:** ${publication}` : '';
1898
2000
  const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
1899
- const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
2001
+ const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
1900
2002
  return { domain: 'substack.com', type: 'post', structured, cleanContent };
1901
2003
  }
1902
2004
  catch {
@@ -2071,11 +2173,37 @@ async function imdbExtractor(html, url) {
2071
2173
  ? jsonLd.director.map((d) => d.name || d).join(', ')
2072
2174
  : jsonLd.director?.name || String(jsonLd.director))
2073
2175
  : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
2074
- // Cast (top few from JSON-LD actor field)
2075
- const cast = jsonLd?.actor
2176
+ // Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
2177
+ const castPairs = [];
2178
+ // IMDB new UI: each title-cast-item contains actor link + character link
2179
+ $('[data-testid="title-cast-item"]').each((_, el) => {
2180
+ const actorEl = $(el).find('a[href*="/name/nm"]').first();
2181
+ const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
2182
+ const actor = actorEl.text().trim();
2183
+ // Character name may span multiple elements; clean whitespace
2184
+ const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
2185
+ if (actor && actor.length > 1) {
2186
+ castPairs.push({ actor, character: character || '' });
2187
+ }
2188
+ });
2189
+ // Fall back to classic cast list (older IMDB page versions)
2190
+ const castFromHtml = [];
2191
+ if (!castPairs.length) {
2192
+ $('.cast_list td.itemprop a').each((_, el) => {
2193
+ const name = $(el).text().trim();
2194
+ if (name && name.length > 1 && !castFromHtml.includes(name))
2195
+ castFromHtml.push(name);
2196
+ });
2197
+ }
2198
+ // JSON-LD actors as final fallback
2199
+ const castFromLd = jsonLd?.actor
2076
2200
  ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
2077
- .map((a) => a.name || a).slice(0, 6)
2201
+ .map((a) => a.name || a)
2078
2202
  : [];
2203
+ // Build final cast list: with characters if available (top 10), otherwise names only
2204
+ const cast = castPairs.length > 0
2205
+ ? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
2206
+ : [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
2079
2207
  // Runtime
2080
2208
  const runtime = jsonLd?.duration
2081
2209
  ? (() => {
@@ -2085,17 +2213,82 @@ async function imdbExtractor(html, url) {
2085
2213
  return String(jsonLd.duration);
2086
2214
  })()
2087
2215
  : '';
2216
+ // Full plot/storyline — try to get the longer version from HTML
2217
+ const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
2218
+ // Additional details: Writers, Keywords, Awards
2219
+ const writers = [];
2220
+ $('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
2221
+ const name = $(el).text().trim();
2222
+ if (name && !writers.includes(name))
2223
+ writers.push(name);
2224
+ });
2225
+ // Keywords — try HTML first, fall back to JSON-LD keywords
2226
+ let keywords = [];
2227
+ $('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
2228
+ const kw = $(el).text().trim();
2229
+ if (kw && kw.length < 30 && !keywords.includes(kw))
2230
+ keywords.push(kw);
2231
+ });
2232
+ // Fall back to JSON-LD keywords if HTML didn't yield any
2233
+ if (!keywords.length && jsonLd?.keywords) {
2234
+ keywords = (typeof jsonLd.keywords === 'string'
2235
+ ? jsonLd.keywords.split(',')
2236
+ : Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
2237
+ }
2238
+ // Writers — also try JSON-LD creator field
2239
+ if (!writers.length && jsonLd?.creator) {
2240
+ const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
2241
+ for (const c of creators) {
2242
+ const name = c?.name || (typeof c === 'string' ? c : '');
2243
+ if (name && !writers.includes(name))
2244
+ writers.push(name);
2245
+ }
2246
+ }
2247
+ // Awards / accolades — try hero accolades chip, then any awards-related link text
2248
+ let awardsSummary = '';
2249
+ // IMDB new UI: awards accolades chip in the hero section
2250
+ const accoladesEl = $('[data-testid="awards-accolades"]');
2251
+ if (accoladesEl.length) {
2252
+ awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
2253
+ }
2254
+ // Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
2255
+ if (!awardsSummary) {
2256
+ const titleMatch = url.match(/\/(tt\d+)/);
2257
+ const titleId = titleMatch ? titleMatch[1] : '';
2258
+ if (titleId) {
2259
+ $(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
2260
+ const text = $(el).text().trim().replace(/\s+/g, ' ');
2261
+ if (text && text.length > 3 && text.length < 200) {
2262
+ awardsSummary = text;
2263
+ return false; // break
2264
+ }
2265
+ });
2266
+ }
2267
+ }
2268
+ // Fallback: JSON-LD award field
2269
+ if (!awardsSummary && jsonLd?.award) {
2270
+ awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
2271
+ }
2272
+ // Content rating & release date from JSON-LD
2273
+ const contentRating = jsonLd?.contentRating || '';
2274
+ const datePublished = jsonLd?.datePublished || '';
2088
2275
  const structured = {
2089
- title, year, contentType, description, ratingValue, ratingCount,
2090
- genres, director, cast, runtime, url,
2276
+ title, year, contentType, description: fullPlot, ratingValue, ratingCount,
2277
+ genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
2091
2278
  };
2092
2279
  const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
2093
2280
  const genreLine = genres.length ? genres.join(', ') : '';
2094
2281
  const directorLine = director ? `**Director:** ${director}` : '';
2282
+ const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
2095
2283
  const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
2096
2284
  const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
2285
+ const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
2286
+ const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
2287
+ const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
2288
+ const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
2097
2289
  const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
2098
- const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${directorLine ? directorLine + '\n' : ''}${castLine ? castLine + '\n' : ''}\n## Plot\n\n${description}`;
2290
+ const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
2291
+ const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
2099
2292
  return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
2100
2293
  }
2101
2294
  catch {
@@ -2232,6 +2425,7 @@ async function pypiExtractor(_html, url) {
2232
2425
  return null;
2233
2426
  const info = data.info;
2234
2427
  const structured = {
2428
+ title: `${info.name} ${info.version}`,
2235
2429
  name: info.name,
2236
2430
  version: info.version,
2237
2431
  description: info.summary || '',
@@ -2245,17 +2439,34 @@ async function pypiExtractor(_html, url) {
2245
2439
  requiresDist: (info.requires_dist || []).slice(0, 20),
2246
2440
  classifiers: (info.classifiers || []).slice(0, 10),
2247
2441
  };
2442
+ // Full description/README from PyPI (info.description is the full README in markdown)
2443
+ const fullDescription = info.description && info.description.length > 100 &&
2444
+ info.description !== 'UNKNOWN' && info.description !== info.summary
2445
+ ? info.description.slice(0, 8000)
2446
+ : null;
2447
+ // Store full description in structured
2448
+ structured.fullDescription = fullDescription;
2248
2449
  const installCmd = `pip install ${info.name}`;
2249
2450
  const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
2250
2451
  const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
2452
+ // Show all dependencies
2251
2453
  const depsLine = structured.requiresDist.length
2252
2454
  ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
2253
2455
  : '';
2456
+ // Classifiers — extract useful ones (license, status, Python versions)
2457
+ const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
2458
+ const classifiersSection = usefulClassifiers.length
2459
+ ? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
2460
+ : '';
2254
2461
  // Find project URLs
2255
2462
  const projectUrlLines = [];
2256
2463
  for (const [label, u] of Object.entries(structured.projectUrls)) {
2257
2464
  projectUrlLines.push(`- **${label}:** ${u}`);
2258
2465
  }
2466
+ // Full description section (package README from PyPI)
2467
+ const descSection = fullDescription
2468
+ ? `\n\n## Description\n\n${fullDescription}`
2469
+ : '';
2259
2470
  const cleanContent = `# 📦 ${info.name} ${info.version}
2260
2471
 
2261
2472
  ${info.summary || ''}
@@ -2266,7 +2477,7 @@ ${installCmd}
2266
2477
 
2267
2478
  **Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
2268
2479
 
2269
- ${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
2480
+ ${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
2270
2481
  return { domain: 'pypi.org', type: 'package', structured, cleanContent };
2271
2482
  }
2272
2483
  catch (e) {
@@ -2289,6 +2500,38 @@ async function devtoExtractor(html, url) {
2289
2500
  const slug = pathParts.length >= 2
2290
2501
  ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
2291
2502
  : null;
2503
+ // Homepage: no slug → fetch recent top articles from Dev.to API
2504
+ if (!slug) {
2505
+ try {
2506
+ const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
2507
+ if (Array.isArray(topArticles) && topArticles.length > 0) {
2508
+ const articles = topArticles.map((a) => ({
2509
+ title: a.title || '',
2510
+ author: a.user?.name || '',
2511
+ authorUsername: a.user?.username || '',
2512
+ tags: a.tag_list || [],
2513
+ reactions: a.public_reactions_count || 0,
2514
+ comments: a.comments_count || 0,
2515
+ readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
2516
+ url: a.url || '',
2517
+ publishDate: a.published_at ? a.published_at.split('T')[0] : '',
2518
+ }));
2519
+ const listMd = articles.map((a, i) => {
2520
+ const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
2521
+ const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
2522
+ return `${i + 1}. **[${a.title}](${a.url})**\n by @${a.authorUsername}${tags}\n ${stats} · ${a.publishDate}`;
2523
+ }).join('\n\n');
2524
+ const structured = {
2525
+ title: 'DEV Community — Top Articles',
2526
+ articles,
2527
+ fetchedAt: new Date().toISOString(),
2528
+ };
2529
+ const cleanContent = `# 🧑‍💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
2530
+ return { domain: 'dev.to', type: 'listing', structured, cleanContent };
2531
+ }
2532
+ }
2533
+ catch { /* fall through to HTML */ }
2534
+ }
2292
2535
  if (slug) {
2293
2536
  try {
2294
2537
  const apiUrl = `https://dev.to/api/articles/${slug}`;
@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
344
344
  blockResources: ctx.options.blockResources,
345
345
  cloaked: ctx.options.cloaked,
346
346
  cycle: ctx.options.cycle,
347
+ tls: ctx.options.tls,
347
348
  noEscalate: ctx.options.noEscalate,
348
349
  });
349
350
  }
@@ -410,6 +411,18 @@ export async function fetchContent(ctx) {
410
411
  }
411
412
  catch { /* Search fallback also failed — rethrow original BlockedError */ }
412
413
  }
414
+ // Enhance error messages with actionable advice
415
+ if (fetchError instanceof BlockedError) {
416
+ const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
417
+ const enhancedError = new BlockedError(actionableMsg);
418
+ throw enhancedError;
419
+ }
420
+ const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
421
+ if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
422
+ const ms = ctx.timeout ?? 30000;
423
+ const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
424
+ throw new Error(enhancedMsg);
425
+ }
413
426
  throw fetchError;
414
427
  }
415
428
  const fetchDuration = ctx.timer.end('fetch');
@@ -1183,6 +1196,14 @@ export function buildResult(ctx) {
1183
1196
  let warning;
1184
1197
  const contentLen = ctx.content.length;
1185
1198
  const htmlLen = ctx.fetchResult?.html?.length || 0;
1199
+ // Add contentQuality metadata for thin content (< 100 words)
1200
+ const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
1201
+ if (wordCount < 100 && wordCount > 0) {
1202
+ ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
1203
+ if (ctx.metadata) {
1204
+ ctx.metadata.contentQuality = 'thin';
1205
+ }
1206
+ }
1186
1207
  if (contentLen < 100 && htmlLen > 1000) {
1187
1208
  warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
1188
1209
  }
@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
143
143
  .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
144
144
  .replace(/[*_`[\]]/g, '')
145
145
  .replace(/&[a-z]+;/g, '') // HTML entities
146
+ // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
147
+ .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
146
148
  .replace(/\s+/g, ' ')
147
149
  .trim().slice(0, 150);
148
150
  }
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
156
158
  if (pageUrl)
157
159
  return pageUrl;
158
160
  }
161
+ // Director (for movies/films)
162
+ if (/director/.test(lf)) {
163
+ const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
164
+ if (m?.[1])
165
+ return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
166
+ }
159
167
  // Author/writer/by
160
168
  if (/author|writer|by/.test(lf)) {
161
169
  const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
339
347
  fieldsFound++;
340
348
  data[field] = value;
341
349
  }
342
- // Confidence: 0.3 base, up to 0.5 based on fill rate
350
+ // Confidence based on fill rate:
351
+ // - ALL fields null → 0.1 (extraction found nothing useful)
352
+ // - Some fields null → 0.3-0.5 based on fill ratio
353
+ // - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
343
354
  const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
344
- const confidence = 0.3 + fillRate * 0.2;
355
+ let confidence;
356
+ if (fieldsFound === 0) {
357
+ confidence = 0.1; // All null — heuristic found nothing
358
+ }
359
+ else if (fieldsFound === totalFields) {
360
+ confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
361
+ }
362
+ else {
363
+ confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
364
+ }
345
365
  return {
346
366
  data,
347
367
  confidence: parseFloat(confidence.toFixed(2)),
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
395
415
  return heuristic;
396
416
  }
397
417
  const { data, missingRequired } = validateAndCoerce(parsed, schema);
398
- // Confidence: 0.9 base, penalised for missing required fields
399
- const penalty = missingRequired.length * 0.05;
418
+ // Confidence for LLM extraction:
419
+ // - ALL fields null → 0.1 (LLM couldn't extract anything)
420
+ // - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
421
+ // - All populated → 0.90-0.98 based on fill rate
400
422
  const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
401
423
  const totalCount = Object.keys(schema.properties).length;
402
- const fillBonus = totalCount > 0 ? (filledCount / totalCount) * 0.05 : 0;
403
- const confidence = Math.max(0.5, Math.min(0.98, 0.9 + fillBonus - penalty));
424
+ const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
425
+ const penalty = missingRequired.length * 0.05;
426
+ let confidence;
427
+ if (filledCount === 0) {
428
+ confidence = 0.1; // LLM returned all nulls — extraction failed
429
+ }
430
+ else {
431
+ const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
432
+ confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
433
+ }
404
434
  return {
405
435
  data,
406
436
  confidence: parseFloat(confidence.toFixed(2)),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.11",
3
+ "version": "0.21.13",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",