webpeel 0.21.12 → 0.21.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -778,6 +778,10 @@ async function githubExtractor(_html, url) {
778
778
  if (pathParts.length === 0)
779
779
  return null;
780
780
  const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
781
+ // Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
782
+ const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
783
+ if (ghToken)
784
+ ghHeaders.Authorization = `token ${ghToken}`;
781
785
  // User profile: /username (single segment)
782
786
  if (pathParts.length === 1) {
783
787
  const username = pathParts[0];
@@ -1795,15 +1799,57 @@ async function mediumExtractor(html, url) {
1795
1799
  $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
1796
1800
  const description = jsonLdData?.description ||
1797
1801
  $('meta[property="og:description"]').attr('content') || '';
1802
+ // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
1803
+ let publication = '';
1804
+ try {
1805
+ const urlObj2 = new URL(url);
1806
+ const hostname = urlObj2.hostname;
1807
+ if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
1808
+ publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
1809
+ }
1810
+ }
1811
+ catch { /* ignore */ }
1812
+ if (!publication) {
1813
+ publication = $('[data-testid="publicationName"]').text().trim() ||
1814
+ $('a[data-testid="publicationName"]').text().trim() ||
1815
+ $('meta[property="article:section"]').attr('content') ||
1816
+ $('a[href*="/tag/"]').first().text().trim() || '';
1817
+ }
1818
+ // Author bio — usually shown in an author card or bio section
1819
+ const authorBio = $('[data-testid="authorBio"]').text().trim() ||
1820
+ $('p[class*="bio"]').first().text().trim() ||
1821
+ $('[aria-label="authorBio"]').text().trim() || '';
1822
+ // Clap count — Medium shows clap button with count
1823
+ let clapCount = '';
1824
+ $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
1825
+ const txt = $(el).text().trim();
1826
+ if (txt && /\d/.test(txt)) {
1827
+ clapCount = txt;
1828
+ return false;
1829
+ }
1830
+ });
1831
+ if (!clapCount) {
1832
+ // Fallback: find spans that look like clap counts (e.g., "2.4K")
1833
+ $('span').filter((_, el) => {
1834
+ const label = $(el).closest('[aria-label]').attr('aria-label') || '';
1835
+ return label.toLowerCase().includes('clap');
1836
+ }).each((_, el) => {
1837
+ const txt = $(el).text().trim();
1838
+ if (txt && /\d/.test(txt)) {
1839
+ clapCount = txt;
1840
+ return false;
1841
+ }
1842
+ });
1843
+ }
1798
1844
  // Extract article body — Medium puts content in <article> or section
1799
1845
  let articleBody = '';
1800
1846
  const articleEl = $('article').first();
1801
1847
  if (articleEl.length) {
1802
- // Remove nav, aside, buttons
1803
- articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
1848
+ // Remove nav, aside, buttons, author-card, footer sections
1849
+ articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
1804
1850
  // Get paragraphs and headings
1805
1851
  const parts = [];
1806
- articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
1852
+ articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
1807
1853
  const tag = el.name;
1808
1854
  const text = $(el).text().trim();
1809
1855
  if (!text || text.length < 5)
@@ -1816,6 +1862,8 @@ async function mediumExtractor(html, url) {
1816
1862
  parts.push(`> ${text}`);
1817
1863
  else if (tag === 'pre')
1818
1864
  parts.push('```\n' + text + '\n```');
1865
+ else if (tag === 'figcaption')
1866
+ parts.push(`*${text}*`);
1819
1867
  else
1820
1868
  parts.push(text);
1821
1869
  });
@@ -1826,15 +1874,22 @@ async function mediumExtractor(html, url) {
1826
1874
  const structured = {
1827
1875
  title,
1828
1876
  author,
1877
+ authorBio,
1829
1878
  publishDate,
1830
1879
  readingTime,
1831
1880
  description,
1881
+ publication,
1882
+ clapCount,
1832
1883
  url,
1833
1884
  };
1834
1885
  const authorLine = author ? `\n**Author:** ${author}` : '';
1886
+ const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
1835
1887
  const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
1836
1888
  const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
1837
- const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${contentBody.substring(0, 8000)}`;
1889
+ const pubLine = publication ? `\n**Publication:** ${publication}` : '';
1890
+ const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
1891
+ // No hard character cap — let the pipeline's budget/maxTokens handle truncation
1892
+ const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
1838
1893
  return { domain: 'medium.com', type: 'article', structured, cleanContent };
1839
1894
  }
1840
1895
  catch {
@@ -2122,20 +2177,37 @@ async function imdbExtractor(html, url) {
2122
2177
  ? jsonLd.director.map((d) => d.name || d).join(', ')
2123
2178
  : jsonLd.director?.name || String(jsonLd.director))
2124
2179
  : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
2125
- // Cast — JSON-LD has top actors, also parse HTML for broader cast list
2180
+ // Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
2181
+ const castPairs = [];
2182
+ // IMDB new UI: each title-cast-item contains actor link + character link
2183
+ $('[data-testid="title-cast-item"]').each((_, el) => {
2184
+ const actorEl = $(el).find('a[href*="/name/nm"]').first();
2185
+ const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
2186
+ const actor = actorEl.text().trim();
2187
+ // Character name may span multiple elements; clean whitespace
2188
+ const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
2189
+ if (actor && actor.length > 1) {
2190
+ castPairs.push({ actor, character: character || '' });
2191
+ }
2192
+ });
2193
+ // Fall back to classic cast list (older IMDB page versions)
2194
+ const castFromHtml = [];
2195
+ if (!castPairs.length) {
2196
+ $('.cast_list td.itemprop a').each((_, el) => {
2197
+ const name = $(el).text().trim();
2198
+ if (name && name.length > 1 && !castFromHtml.includes(name))
2199
+ castFromHtml.push(name);
2200
+ });
2201
+ }
2202
+ // JSON-LD actors as final fallback
2126
2203
  const castFromLd = jsonLd?.actor
2127
2204
  ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
2128
2205
  .map((a) => a.name || a)
2129
2206
  : [];
2130
- // Parse additional cast from HTML (IMDB cast section)
2131
- const castFromHtml = [];
2132
- // Try multiple IMDB cast selectors across page versions
2133
- $('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
2134
- const name = $(el).text().trim();
2135
- if (name && name.length > 1 && !castFromHtml.includes(name))
2136
- castFromHtml.push(name);
2137
- });
2138
- const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
2207
+ // Build final cast list: with characters if available (top 10), otherwise names only
2208
+ const cast = castPairs.length > 0
2209
+ ? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
2210
+ : [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
2139
2211
  // Runtime
2140
2212
  const runtime = jsonLd?.duration
2141
2213
  ? (() => {
@@ -2176,12 +2248,37 @@ async function imdbExtractor(html, url) {
2176
2248
  writers.push(name);
2177
2249
  }
2178
2250
  }
2251
+ // Awards / accolades — try hero accolades chip, then any awards-related link text
2252
+ let awardsSummary = '';
2253
+ // IMDB new UI: awards accolades chip in the hero section
2254
+ const accoladesEl = $('[data-testid="awards-accolades"]');
2255
+ if (accoladesEl.length) {
2256
+ awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
2257
+ }
2258
+ // Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
2259
+ if (!awardsSummary) {
2260
+ const titleMatch = url.match(/\/(tt\d+)/);
2261
+ const titleId = titleMatch ? titleMatch[1] : '';
2262
+ if (titleId) {
2263
+ $(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
2264
+ const text = $(el).text().trim().replace(/\s+/g, ' ');
2265
+ if (text && text.length > 3 && text.length < 200) {
2266
+ awardsSummary = text;
2267
+ return false; // break
2268
+ }
2269
+ });
2270
+ }
2271
+ }
2272
+ // Fallback: JSON-LD award field
2273
+ if (!awardsSummary && jsonLd?.award) {
2274
+ awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
2275
+ }
2179
2276
  // Content rating & release date from JSON-LD
2180
2277
  const contentRating = jsonLd?.contentRating || '';
2181
2278
  const datePublished = jsonLd?.datePublished || '';
2182
2279
  const structured = {
2183
2280
  title, year, contentType, description: fullPlot, ratingValue, ratingCount,
2184
- genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
2281
+ genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
2185
2282
  };
2186
2283
  const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
2187
2284
  const genreLine = genres.length ? genres.join(', ') : '';
@@ -2192,8 +2289,9 @@ async function imdbExtractor(html, url) {
2192
2289
  const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
2193
2290
  const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
2194
2291
  const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
2292
+ const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
2195
2293
  const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
2196
- const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
2294
+ const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
2197
2295
  const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
2198
2296
  return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
2199
2297
  }
@@ -2331,6 +2429,7 @@ async function pypiExtractor(_html, url) {
2331
2429
  return null;
2332
2430
  const info = data.info;
2333
2431
  const structured = {
2432
+ title: `${info.name} ${info.version}`,
2334
2433
  name: info.name,
2335
2434
  version: info.version,
2336
2435
  description: info.summary || '',
@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
344
344
  blockResources: ctx.options.blockResources,
345
345
  cloaked: ctx.options.cloaked,
346
346
  cycle: ctx.options.cycle,
347
+ tls: ctx.options.tls,
347
348
  noEscalate: ctx.options.noEscalate,
348
349
  });
349
350
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.12",
3
+ "version": "0.21.14",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",