webpeel 0.21.12 → 0.21.13

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1795,15 +1795,57 @@ async function mediumExtractor(html, url) {
1795
1795
  $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
1796
1796
  const description = jsonLdData?.description ||
1797
1797
  $('meta[property="og:description"]').attr('content') || '';
1798
+ // Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
1799
+ let publication = '';
1800
+ try {
1801
+ const urlObj2 = new URL(url);
1802
+ const hostname = urlObj2.hostname;
1803
+ if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
1804
+ publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
1805
+ }
1806
+ }
1807
+ catch { /* ignore */ }
1808
+ if (!publication) {
1809
+ publication = $('[data-testid="publicationName"]').text().trim() ||
1810
+ $('a[data-testid="publicationName"]').text().trim() ||
1811
+ $('meta[property="article:section"]').attr('content') ||
1812
+ $('a[href*="/tag/"]').first().text().trim() || '';
1813
+ }
1814
+ // Author bio — usually shown in an author card or bio section
1815
+ const authorBio = $('[data-testid="authorBio"]').text().trim() ||
1816
+ $('p[class*="bio"]').first().text().trim() ||
1817
+ $('[aria-label="authorBio"]').text().trim() || '';
1818
+ // Clap count — Medium shows clap button with count
1819
+ let clapCount = '';
1820
+ $('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
1821
+ const txt = $(el).text().trim();
1822
+ if (txt && /\d/.test(txt)) {
1823
+ clapCount = txt;
1824
+ return false;
1825
+ }
1826
+ });
1827
+ if (!clapCount) {
1828
+ // Fallback: find spans that look like clap counts (e.g., "2.4K")
1829
+ $('span').filter((_, el) => {
1830
+ const label = $(el).closest('[aria-label]').attr('aria-label') || '';
1831
+ return label.toLowerCase().includes('clap');
1832
+ }).each((_, el) => {
1833
+ const txt = $(el).text().trim();
1834
+ if (txt && /\d/.test(txt)) {
1835
+ clapCount = txt;
1836
+ return false;
1837
+ }
1838
+ });
1839
+ }
1798
1840
  // Extract article body — Medium puts content in <article> or section
1799
1841
  let articleBody = '';
1800
1842
  const articleEl = $('article').first();
1801
1843
  if (articleEl.length) {
1802
- // Remove nav, aside, buttons
1803
- articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
1844
+ // Remove nav, aside, buttons, author-card, footer sections
1845
+ articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
1804
1846
  // Get paragraphs and headings
1805
1847
  const parts = [];
1806
- articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
1848
+ articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
1807
1849
  const tag = el.name;
1808
1850
  const text = $(el).text().trim();
1809
1851
  if (!text || text.length < 5)
@@ -1816,6 +1858,8 @@ async function mediumExtractor(html, url) {
1816
1858
  parts.push(`> ${text}`);
1817
1859
  else if (tag === 'pre')
1818
1860
  parts.push('```\n' + text + '\n```');
1861
+ else if (tag === 'figcaption')
1862
+ parts.push(`*${text}*`);
1819
1863
  else
1820
1864
  parts.push(text);
1821
1865
  });
@@ -1826,15 +1870,22 @@ async function mediumExtractor(html, url) {
1826
1870
  const structured = {
1827
1871
  title,
1828
1872
  author,
1873
+ authorBio,
1829
1874
  publishDate,
1830
1875
  readingTime,
1831
1876
  description,
1877
+ publication,
1878
+ clapCount,
1832
1879
  url,
1833
1880
  };
1834
1881
  const authorLine = author ? `\n**Author:** ${author}` : '';
1882
+ const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
1835
1883
  const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
1836
1884
  const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
1837
- const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${contentBody.substring(0, 8000)}`;
1885
+ const pubLine = publication ? `\n**Publication:** ${publication}` : '';
1886
+ const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
1887
+ // No hard character cap — let the pipeline's budget/maxTokens handle truncation
1888
+ const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
1838
1889
  return { domain: 'medium.com', type: 'article', structured, cleanContent };
1839
1890
  }
1840
1891
  catch {
@@ -2122,20 +2173,37 @@ async function imdbExtractor(html, url) {
2122
2173
  ? jsonLd.director.map((d) => d.name || d).join(', ')
2123
2174
  : jsonLd.director?.name || String(jsonLd.director))
2124
2175
  : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
2125
- // Cast — JSON-LD has top actors, also parse HTML for broader cast list
2176
+ // Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
2177
+ const castPairs = [];
2178
+ // IMDB new UI: each title-cast-item contains actor link + character link
2179
+ $('[data-testid="title-cast-item"]').each((_, el) => {
2180
+ const actorEl = $(el).find('a[href*="/name/nm"]').first();
2181
+ const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
2182
+ const actor = actorEl.text().trim();
2183
+ // Character name may span multiple elements; clean whitespace
2184
+ const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
2185
+ if (actor && actor.length > 1) {
2186
+ castPairs.push({ actor, character: character || '' });
2187
+ }
2188
+ });
2189
+ // Fall back to classic cast list (older IMDB page versions)
2190
+ const castFromHtml = [];
2191
+ if (!castPairs.length) {
2192
+ $('.cast_list td.itemprop a').each((_, el) => {
2193
+ const name = $(el).text().trim();
2194
+ if (name && name.length > 1 && !castFromHtml.includes(name))
2195
+ castFromHtml.push(name);
2196
+ });
2197
+ }
2198
+ // JSON-LD actors as final fallback
2126
2199
  const castFromLd = jsonLd?.actor
2127
2200
  ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
2128
2201
  .map((a) => a.name || a)
2129
2202
  : [];
2130
- // Parse additional cast from HTML (IMDB cast section)
2131
- const castFromHtml = [];
2132
- // Try multiple IMDB cast selectors across page versions
2133
- $('[data-testid="title-cast-item"] a[href*="/name/nm"], a[data-testid*="cast"] span[class*="title"], .cast_list td.itemprop a').each((_, el) => {
2134
- const name = $(el).text().trim();
2135
- if (name && name.length > 1 && !castFromHtml.includes(name))
2136
- castFromHtml.push(name);
2137
- });
2138
- const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
2203
+ // Build final cast list: with characters if available (top 10), otherwise names only
2204
+ const cast = castPairs.length > 0
2205
+ ? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
2206
+ : [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
2139
2207
  // Runtime
2140
2208
  const runtime = jsonLd?.duration
2141
2209
  ? (() => {
@@ -2176,12 +2244,37 @@ async function imdbExtractor(html, url) {
2176
2244
  writers.push(name);
2177
2245
  }
2178
2246
  }
2247
+ // Awards / accolades — try hero accolades chip, then any awards-related link text
2248
+ let awardsSummary = '';
2249
+ // IMDB new UI: awards accolades chip in the hero section
2250
+ const accoladesEl = $('[data-testid="awards-accolades"]');
2251
+ if (accoladesEl.length) {
2252
+ awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
2253
+ }
2254
+ // Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
2255
+ if (!awardsSummary) {
2256
+ const titleMatch = url.match(/\/(tt\d+)/);
2257
+ const titleId = titleMatch ? titleMatch[1] : '';
2258
+ if (titleId) {
2259
+ $(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
2260
+ const text = $(el).text().trim().replace(/\s+/g, ' ');
2261
+ if (text && text.length > 3 && text.length < 200) {
2262
+ awardsSummary = text;
2263
+ return false; // break
2264
+ }
2265
+ });
2266
+ }
2267
+ }
2268
+ // Fallback: JSON-LD award field
2269
+ if (!awardsSummary && jsonLd?.award) {
2270
+ awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
2271
+ }
2179
2272
  // Content rating & release date from JSON-LD
2180
2273
  const contentRating = jsonLd?.contentRating || '';
2181
2274
  const datePublished = jsonLd?.datePublished || '';
2182
2275
  const structured = {
2183
2276
  title, year, contentType, description: fullPlot, ratingValue, ratingCount,
2184
- genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
2277
+ genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
2185
2278
  };
2186
2279
  const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
2187
2280
  const genreLine = genres.length ? genres.join(', ') : '';
@@ -2192,8 +2285,9 @@ async function imdbExtractor(html, url) {
2192
2285
  const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
2193
2286
  const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
2194
2287
  const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
2288
+ const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
2195
2289
  const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
2196
- const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
2290
+ const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
2197
2291
  const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
2198
2292
  return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
2199
2293
  }
@@ -2331,6 +2425,7 @@ async function pypiExtractor(_html, url) {
2331
2425
  return null;
2332
2426
  const info = data.info;
2333
2427
  const structured = {
2428
+ title: `${info.name} ${info.version}`,
2334
2429
  name: info.name,
2335
2430
  version: info.version,
2336
2431
  description: info.summary || '',
@@ -344,6 +344,7 @@ export async function fetchContent(ctx) {
344
344
  blockResources: ctx.options.blockResources,
345
345
  cloaked: ctx.options.cloaked,
346
346
  cycle: ctx.options.cycle,
347
+ tls: ctx.options.tls,
347
348
  noEscalate: ctx.options.noEscalate,
348
349
  });
349
350
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "webpeel",
3
- "version": "0.21.12",
3
+ "version": "0.21.13",
4
4
  "description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
5
5
  "author": "Jake Liu",
6
6
  "license": "AGPL-3.0-only",