webpeel 0.21.11 → 0.21.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1518,12 +1518,27 @@ async function npmExtractor(_html, url) {
1518
1518
  modified: data.time?.modified || undefined,
1519
1519
  };
1520
1520
  // Include README if available (some packages have it, some don't)
1521
- const readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
1521
+ let readmeText = data.readme && data.readme.length > 10 ? data.readme.slice(0, 5000) : '';
1522
+ // If no README in registry, try fetching from unpkg.com
1523
+ if (!readmeText) {
1524
+ try {
1525
+ const unpkgUrl = `https://unpkg.com/${encodeURIComponent(packageName)}/README.md`;
1526
+ const readmeResult = await simpleFetch(unpkgUrl, undefined, 10000);
1527
+ if (readmeResult?.html && readmeResult.html.length > 10 && !readmeResult.html.trim().startsWith('<')) {
1528
+ readmeText = readmeResult.html.slice(0, 5000);
1529
+ }
1530
+ }
1531
+ catch { /* README from unpkg optional */ }
1532
+ }
1522
1533
  // Add to structured data
1523
1534
  structured.readme = readmeText;
1524
1535
  const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
1536
+ // Show ALL dependencies (not capped at 15)
1525
1537
  const depsLine = structured.dependencies.length
1526
- ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.slice(0, 15).join(', ')}${structured.dependencies.length > 15 ? '...' : ''}`
1538
+ ? `\n**Dependencies (${structured.dependencies.length}):** ${structured.dependencies.join(', ')}`
1539
+ : '';
1540
+ const devDepsLine = structured.devDependencies.length
1541
+ ? `\n**Dev Dependencies (${structured.devDependencies.length}):** ${structured.devDependencies.slice(0, 10).join(', ')}${structured.devDependencies.length > 10 ? '...' : ''}`
1527
1542
  : '';
1528
1543
  const repoLine = structured.repository ? `\n**Repository:** ${structured.repository.replace('git+', '').replace('.git', '')}` : '';
1529
1544
  const homepageLine = structured.homepage ? `\n**Homepage:** ${structured.homepage}` : '';
@@ -1536,7 +1551,7 @@ async function npmExtractor(_html, url) {
1536
1551
  ${structured.description}
1537
1552
 
1538
1553
  **License:** ${structured.license} | **Weekly Downloads:** ${structured.weeklyDownloads?.toLocaleString() || 'N/A'}
1539
- **Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
1554
+ **Author:** ${structured.author || 'N/A'} | **Maintainers:** ${structured.maintainers.join(', ') || 'N/A'}${keywordsLine}${depsLine}${devDepsLine}${repoLine}${homepageLine}${datesLine}${readmeSection}`;
1540
1555
  return { domain: 'npmjs.com', type: 'package', structured, cleanContent };
1541
1556
  }
1542
1557
  catch (e) {
@@ -1832,7 +1847,27 @@ async function mediumExtractor(html, url) {
1832
1847
  async function substackExtractor(html, url) {
1833
1848
  try {
1834
1849
  const { load } = await import('cheerio');
1835
- const $ = load(html);
1850
+ // Handle open.substack.com/pub/{publication}/p/{slug} redirect URLs
1851
+ // These are share links that redirect to the actual post. Redirect to the real URL.
1852
+ const urlObj = new URL(url);
1853
+ let workingHtml = html;
1854
+ let workingUrl = url;
1855
+ if (urlObj.hostname === 'open.substack.com') {
1856
+ const openMatch = urlObj.pathname.match(/\/pub\/([^/]+)\/p\/([^/]+)/);
1857
+ if (openMatch) {
1858
+ const [, publication, slug] = openMatch;
1859
+ const actualUrl = `https://${publication}.substack.com/p/${slug}`;
1860
+ try {
1861
+ const fetchResult = await simpleFetch(actualUrl, undefined, 15000);
1862
+ if (fetchResult?.html && fetchResult.html.length > 500) {
1863
+ workingHtml = fetchResult.html;
1864
+ workingUrl = actualUrl;
1865
+ }
1866
+ }
1867
+ catch { /* fall through with original HTML */ }
1868
+ }
1869
+ }
1870
+ const $ = load(workingHtml);
1836
1871
  // JSON-LD
1837
1872
  let jsonLdData = null;
1838
1873
  $('script[type="application/ld+json"]').each((_, el) => {
@@ -1857,14 +1892,14 @@ async function substackExtractor(html, url) {
1857
1892
  $('meta[property="article:published_time"]').attr('content') ||
1858
1893
  $('time').first().attr('datetime') || '';
1859
1894
  const publication = $('meta[property="og:site_name"]').attr('content') ||
1860
- $('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
1895
+ $('a.navbar-title-link').text().trim() || new URL(workingUrl).hostname.replace('.substack.com', '');
1861
1896
  const description = jsonLdData?.description ||
1862
1897
  $('meta[property="og:description"]').attr('content') || '';
1863
- // Article content
1898
+ // Article content — try multiple Substack CSS patterns
1864
1899
  let articleBody = '';
1865
- const postContent = $('.body.markup, .post-content, article').first();
1900
+ const postContent = $('.body.markup, .post-content, article, [class*="post-content"], .available-content').first();
1866
1901
  if (postContent.length) {
1867
- postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
1902
+ postContent.find('script, style, nav, .paywall, .subscribe-widget, .subscription-widget').remove();
1868
1903
  const parts = [];
1869
1904
  postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
1870
1905
  const tag = el.name;
@@ -1884,19 +1919,35 @@ async function substackExtractor(html, url) {
1884
1919
  });
1885
1920
  articleBody = parts.join('\n\n');
1886
1921
  }
1922
+ // If no article body found, try broader search
1923
+ if (!articleBody) {
1924
+ const parts = [];
1925
+ $('main p, article p, [class*="content"] p').each((_, el) => {
1926
+ const text = $(el).text().trim();
1927
+ if (text && text.length > 20)
1928
+ parts.push(text);
1929
+ });
1930
+ articleBody = parts.slice(0, 20).join('\n\n');
1931
+ }
1887
1932
  const contentBody = articleBody || description;
1933
+ // Detect if the post appears paywalled (short content with no article body)
1934
+ const isPaywalled = !articleBody && description.length > 0;
1935
+ const paywallNote = isPaywalled
1936
+ ? '\n\n---\n*⚠️ This post appears to be behind a paywall. Only the preview/description is available. Full content requires a subscription.*'
1937
+ : '';
1888
1938
  const structured = {
1889
1939
  title,
1890
1940
  author,
1891
1941
  publication,
1892
1942
  publishDate,
1893
1943
  description,
1894
- url,
1944
+ paywalled: isPaywalled,
1945
+ url: workingUrl,
1895
1946
  };
1896
1947
  const authorLine = author ? `\n**Author:** ${author}` : '';
1897
1948
  const pubLine = publication ? `\n**Publication:** ${publication}` : '';
1898
1949
  const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
1899
- const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}`;
1950
+ const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${contentBody.substring(0, 8000)}${paywallNote}`;
1900
1951
  return { domain: 'substack.com', type: 'post', structured, cleanContent };
1901
1952
  }
1902
1953
  catch {
@@ -2071,11 +2122,20 @@ async function imdbExtractor(html, url) {
2071
2122
  ? jsonLd.director.map((d) => d.name || d).join(', ')
2072
2123
  : jsonLd.director?.name || String(jsonLd.director))
2073
2124
  : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
2074
- // Cast (top few from JSON-LD actor field)
2075
- const cast = jsonLd?.actor
2125
+ // Cast — JSON-LD has top actors; also parse HTML for broader cast list
2126
+ const castFromLd = jsonLd?.actor
2076
2127
  ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
2077
- .map((a) => a.name || a).slice(0, 6)
2128
+ .map((a) => a.name || a)
2078
2129
  : [];
2130
+ // Parse additional cast from HTML (IMDB cast section)
2131
+ const castFromHtml = [];
2132
+ // Try multiple IMDB cast selectors across page versions
2133
+ $('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
2134
+ const name = $(el).text().trim();
2135
+ if (name && name.length > 1 && !castFromHtml.includes(name))
2136
+ castFromHtml.push(name);
2137
+ });
2138
+ const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
2079
2139
  // Runtime
2080
2140
  const runtime = jsonLd?.duration
2081
2141
  ? (() => {
@@ -2085,17 +2145,56 @@ async function imdbExtractor(html, url) {
2085
2145
  return String(jsonLd.duration);
2086
2146
  })()
2087
2147
  : '';
2148
+ // Full plot/storyline — try to get the longer version from HTML
2149
+ const fullPlot = $('[data-testid="storyline-plot-summary"] span, [data-testid="plot-xl"] span, span[data-testid="plot-l"], #titleStoryLine p, .plot_summary .summary_text').first().text().trim() || description;
2150
+ // Additional details: Writers, Keywords, Awards
2151
+ const writers = [];
2152
+ $('[data-testid="title-pc-wide-screen"] li[data-testid="title-pc-principal-credit"]:nth-child(2) a, .credit_summary_item:contains("Writer") a').each((_, el) => {
2153
+ const name = $(el).text().trim();
2154
+ if (name && !writers.includes(name))
2155
+ writers.push(name);
2156
+ });
2157
+ // Keywords — try HTML first, fall back to JSON-LD keywords
2158
+ let keywords = [];
2159
+ $('[data-testid="storyline-plot-keywords"] a, .see-more.inline.canwrap span a, a[href*="keyword"]').each((_, el) => {
2160
+ const kw = $(el).text().trim();
2161
+ if (kw && kw.length < 30 && !keywords.includes(kw))
2162
+ keywords.push(kw);
2163
+ });
2164
+ // Fall back to JSON-LD keywords if HTML didn't yield any
2165
+ if (!keywords.length && jsonLd?.keywords) {
2166
+ keywords = (typeof jsonLd.keywords === 'string'
2167
+ ? jsonLd.keywords.split(',')
2168
+ : Array.isArray(jsonLd.keywords) ? jsonLd.keywords : []).map((k) => k.trim()).filter(Boolean);
2169
+ }
2170
+ // Writers — also try JSON-LD creator field
2171
+ if (!writers.length && jsonLd?.creator) {
2172
+ const creators = Array.isArray(jsonLd.creator) ? jsonLd.creator : [jsonLd.creator];
2173
+ for (const c of creators) {
2174
+ const name = c?.name || (typeof c === 'string' ? c : '');
2175
+ if (name && !writers.includes(name))
2176
+ writers.push(name);
2177
+ }
2178
+ }
2179
+ // Content rating & release date from JSON-LD
2180
+ const contentRating = jsonLd?.contentRating || '';
2181
+ const datePublished = jsonLd?.datePublished || '';
2088
2182
  const structured = {
2089
- title, year, contentType, description, ratingValue, ratingCount,
2090
- genres, director, cast, runtime, url,
2183
+ title, year, contentType, description: fullPlot, ratingValue, ratingCount,
2184
+ genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
2091
2185
  };
2092
2186
  const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
2093
2187
  const genreLine = genres.length ? genres.join(', ') : '';
2094
2188
  const directorLine = director ? `**Director:** ${director}` : '';
2189
+ const writersLine = writers.length ? `**Writers:** ${writers.slice(0, 5).join(', ')}` : '';
2095
2190
  const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
2096
2191
  const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
2192
+ const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
2193
+ const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
2194
+ const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
2097
2195
  const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
2098
- const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${directorLine ? directorLine + '\n' : ''}${castLine ? castLine + '\n' : ''}\n## Plot\n\n${description}`;
2196
+ const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
2197
+ const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
2099
2198
  return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
2100
2199
  }
2101
2200
  catch {
@@ -2245,17 +2344,34 @@ async function pypiExtractor(_html, url) {
2245
2344
  requiresDist: (info.requires_dist || []).slice(0, 20),
2246
2345
  classifiers: (info.classifiers || []).slice(0, 10),
2247
2346
  };
2347
+ // Full description/README from PyPI (info.description is the full README in markdown)
2348
+ const fullDescription = info.description && info.description.length > 100 &&
2349
+ info.description !== 'UNKNOWN' && info.description !== info.summary
2350
+ ? info.description.slice(0, 8000)
2351
+ : null;
2352
+ // Store full description in structured
2353
+ structured.fullDescription = fullDescription;
2248
2354
  const installCmd = `pip install ${info.name}`;
2249
2355
  const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
2250
2356
  const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
2357
+ // Show all dependencies
2251
2358
  const depsLine = structured.requiresDist.length
2252
2359
  ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
2253
2360
  : '';
2361
+ // Classifiers — extract useful ones (license, status, Python versions)
2362
+ const usefulClassifiers = structured.classifiers.filter((c) => c.startsWith('Programming Language') || c.startsWith('License') || c.startsWith('Development Status'));
2363
+ const classifiersSection = usefulClassifiers.length
2364
+ ? `\n\n## Classifiers\n\n${usefulClassifiers.map((c) => `- ${c}`).join('\n')}`
2365
+ : '';
2254
2366
  // Find project URLs
2255
2367
  const projectUrlLines = [];
2256
2368
  for (const [label, u] of Object.entries(structured.projectUrls)) {
2257
2369
  projectUrlLines.push(`- **${label}:** ${u}`);
2258
2370
  }
2371
+ // Full description section (package README from PyPI)
2372
+ const descSection = fullDescription
2373
+ ? `\n\n## Description\n\n${fullDescription}`
2374
+ : '';
2259
2375
  const cleanContent = `# 📦 ${info.name} ${info.version}
2260
2376
 
2261
2377
  ${info.summary || ''}
@@ -2266,7 +2382,7 @@ ${installCmd}
2266
2382
 
2267
2383
  **Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}
2268
2384
 
2269
- ${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}`;
2385
+ ${projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : ''}${depsLine}${classifiersSection}${descSection}`;
2270
2386
  return { domain: 'pypi.org', type: 'package', structured, cleanContent };
2271
2387
  }
2272
2388
  catch (e) {
@@ -2289,6 +2405,38 @@ async function devtoExtractor(html, url) {
2289
2405
  const slug = pathParts.length >= 2
2290
2406
  ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
2291
2407
  : null;
2408
+ // Homepage: no slug → fetch recent top articles from Dev.to API
2409
+ if (!slug) {
2410
+ try {
2411
+ const topArticles = await fetchJson('https://dev.to/api/articles?page=1&per_page=20&top=1');
2412
+ if (Array.isArray(topArticles) && topArticles.length > 0) {
2413
+ const articles = topArticles.map((a) => ({
2414
+ title: a.title || '',
2415
+ author: a.user?.name || '',
2416
+ authorUsername: a.user?.username || '',
2417
+ tags: a.tag_list || [],
2418
+ reactions: a.public_reactions_count || 0,
2419
+ comments: a.comments_count || 0,
2420
+ readingTime: a.reading_time_minutes ? `${a.reading_time_minutes} min` : '',
2421
+ url: a.url || '',
2422
+ publishDate: a.published_at ? a.published_at.split('T')[0] : '',
2423
+ }));
2424
+ const listMd = articles.map((a, i) => {
2425
+ const tags = a.tags.length ? ` · #${a.tags.slice(0, 3).join(' #')}` : '';
2426
+ const stats = `❤️ ${a.reactions} | 💬 ${a.comments}${a.readingTime ? ` | ${a.readingTime}` : ''}`;
2427
+ return `${i + 1}. **[${a.title}](${a.url})**\n by @${a.authorUsername}${tags}\n ${stats} · ${a.publishDate}`;
2428
+ }).join('\n\n');
2429
+ const structured = {
2430
+ title: 'DEV Community — Top Articles',
2431
+ articles,
2432
+ fetchedAt: new Date().toISOString(),
2433
+ };
2434
+ const cleanContent = `# 🧑‍💻 DEV Community — Top Articles\n\n*${articles.length} articles from the community*\n\n${listMd}`;
2435
+ return { domain: 'dev.to', type: 'listing', structured, cleanContent };
2436
+ }
2437
+ }
2438
+ catch { /* fall through to HTML */ }
2439
+ }
2292
2440
  if (slug) {
2293
2441
  try {
2294
2442
  const apiUrl = `https://dev.to/api/articles/${slug}`;
@@ -410,6 +410,18 @@ export async function fetchContent(ctx) {
410
410
  }
411
411
  catch { /* Search fallback also failed — rethrow original BlockedError */ }
412
412
  }
413
+ // Enhance error messages with actionable advice
414
+ if (fetchError instanceof BlockedError) {
415
+ const actionableMsg = `${fetchError.message}\n\nThis site blocks automated access. Try using \`stealth: true\` and a residential proxy.`;
416
+ const enhancedError = new BlockedError(actionableMsg);
417
+ throw enhancedError;
418
+ }
419
+ const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
420
+ if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
421
+ const ms = ctx.timeout ?? 30000;
422
+ const enhancedMsg = `Request timed out after ${Math.round(ms / 1000)}s. This site may require browser rendering — try \`render: true\`.`;
423
+ throw new Error(enhancedMsg);
424
+ }
413
425
  throw fetchError;
414
426
  }
415
427
  const fetchDuration = ctx.timer.end('fetch');
@@ -1183,6 +1195,14 @@ export function buildResult(ctx) {
1183
1195
  let warning;
1184
1196
  const contentLen = ctx.content.length;
1185
1197
  const htmlLen = ctx.fetchResult?.html?.length || 0;
1198
+ // Add contentQuality metadata for thin content (< 100 words)
1199
+ const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
1200
+ if (wordCount < 100 && wordCount > 0) {
1201
+ ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
1202
+ if (ctx.metadata) {
1203
+ ctx.metadata.contentQuality = 'thin';
1204
+ }
1205
+ }
1186
1206
  if (contentLen < 100 && htmlLen > 1000) {
1187
1207
  warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
1188
1208
  }
@@ -143,6 +143,8 @@ function heuristicExtractString(fieldName, content, pageUrl) {
143
143
  .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
144
144
  .replace(/[*_`[\]]/g, '')
145
145
  .replace(/&[a-z]+;/g, '') // HTML entities
146
+ // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
147
+ .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
146
148
  .replace(/\s+/g, ' ')
147
149
  .trim().slice(0, 150);
148
150
  }
@@ -156,6 +158,12 @@ function heuristicExtractString(fieldName, content, pageUrl) {
156
158
  if (pageUrl)
157
159
  return pageUrl;
158
160
  }
161
+ // Director (for movies/films)
162
+ if (/director/.test(lf)) {
163
+ const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
164
+ if (m?.[1])
165
+ return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
166
+ }
159
167
  // Author/writer/by
160
168
  if (/author|writer|by/.test(lf)) {
161
169
  const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
@@ -339,9 +347,21 @@ async function heuristicExtract(content, schema) {
339
347
  fieldsFound++;
340
348
  data[field] = value;
341
349
  }
342
- // Confidence: 0.3 base, up to 0.5 based on fill rate
350
+ // Confidence based on fill rate:
351
+ // - ALL fields null → 0.1 (extraction found nothing useful)
352
+ // - Some fields null → 0.3-0.5 based on fill ratio
353
+ // - ALL fields populated → 0.6-0.7 (heuristic max — values may still be imprecise)
343
354
  const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
344
- const confidence = 0.3 + fillRate * 0.2;
355
+ let confidence;
356
+ if (fieldsFound === 0) {
357
+ confidence = 0.1; // All null — heuristic found nothing
358
+ }
359
+ else if (fieldsFound === totalFields) {
360
+ confidence = 0.65 + fillRate * 0.05; // 0.7 for fully populated heuristic
361
+ }
362
+ else {
363
+ confidence = 0.3 + fillRate * 0.2; // 0.3–0.5 based on fill ratio
364
+ }
345
365
  return {
346
366
  data,
347
367
  confidence: parseFloat(confidence.toFixed(2)),
@@ -395,12 +415,22 @@ export async function extractStructured(content, schema, llmConfig, prompt) {
395
415
  return heuristic;
396
416
  }
397
417
  const { data, missingRequired } = validateAndCoerce(parsed, schema);
398
- // Confidence: 0.9 base, penalised for missing required fields
399
- const penalty = missingRequired.length * 0.05;
418
+ // Confidence for LLM extraction:
419
+ // - ALL fields null → 0.1 (LLM couldn't extract anything)
420
+ // - Partial fill → 0.85+ (LLM is generally reliable when it finds data)
421
+ // - All populated → 0.90-0.98 based on fill rate
400
422
  const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
401
423
  const totalCount = Object.keys(schema.properties).length;
402
- const fillBonus = totalCount > 0 ? (filledCount / totalCount) * 0.05 : 0;
403
- const confidence = Math.max(0.5, Math.min(0.98, 0.9 + fillBonus - penalty));
424
+ const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
425
+ const penalty = missingRequired.length * 0.05;
426
+ let confidence;
427
+ if (filledCount === 0) {
428
+ confidence = 0.1; // LLM returned all nulls — extraction failed
429
+ }
430
+ else {
431
+ const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
432
+ confidence = Math.min(0.98, 0.85 + fillBonus - penalty); // 0.85–0.93+ for LLM
433
+ }
404
434
  return {
405
435
  data,
406
436
  confidence: parseFloat(confidence.toFixed(2)),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.11",
3
+ "version": "0.21.12",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",