webpeel 0.21.12 → 0.21.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +111 -16
- package/dist/core/pipeline.js +1 -0
- package/package.json +1 -1
|
@@ -1795,15 +1795,57 @@ async function mediumExtractor(html, url) {
|
|
|
1795
1795
|
$('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
|
|
1796
1796
|
const description = jsonLdData?.description ||
|
|
1797
1797
|
$('meta[property="og:description"]').attr('content') || '';
|
|
1798
|
+
// Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
|
|
1799
|
+
let publication = '';
|
|
1800
|
+
try {
|
|
1801
|
+
const urlObj2 = new URL(url);
|
|
1802
|
+
const hostname = urlObj2.hostname;
|
|
1803
|
+
if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
|
|
1804
|
+
publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
|
|
1805
|
+
}
|
|
1806
|
+
}
|
|
1807
|
+
catch { /* ignore */ }
|
|
1808
|
+
if (!publication) {
|
|
1809
|
+
publication = $('[data-testid="publicationName"]').text().trim() ||
|
|
1810
|
+
$('a[data-testid="publicationName"]').text().trim() ||
|
|
1811
|
+
$('meta[property="article:section"]').attr('content') ||
|
|
1812
|
+
$('a[href*="/tag/"]').first().text().trim() || '';
|
|
1813
|
+
}
|
|
1814
|
+
// Author bio — usually shown in an author card or bio section
|
|
1815
|
+
const authorBio = $('[data-testid="authorBio"]').text().trim() ||
|
|
1816
|
+
$('p[class*="bio"]').first().text().trim() ||
|
|
1817
|
+
$('[aria-label="authorBio"]').text().trim() || '';
|
|
1818
|
+
// Clap count — Medium shows clap button with count
|
|
1819
|
+
let clapCount = '';
|
|
1820
|
+
$('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
|
|
1821
|
+
const txt = $(el).text().trim();
|
|
1822
|
+
if (txt && /\d/.test(txt)) {
|
|
1823
|
+
clapCount = txt;
|
|
1824
|
+
return false;
|
|
1825
|
+
}
|
|
1826
|
+
});
|
|
1827
|
+
if (!clapCount) {
|
|
1828
|
+
// Fallback: find spans that look like clap counts (e.g., "2.4K")
|
|
1829
|
+
$('span').filter((_, el) => {
|
|
1830
|
+
const label = $(el).closest('[aria-label]').attr('aria-label') || '';
|
|
1831
|
+
return label.toLowerCase().includes('clap');
|
|
1832
|
+
}).each((_, el) => {
|
|
1833
|
+
const txt = $(el).text().trim();
|
|
1834
|
+
if (txt && /\d/.test(txt)) {
|
|
1835
|
+
clapCount = txt;
|
|
1836
|
+
return false;
|
|
1837
|
+
}
|
|
1838
|
+
});
|
|
1839
|
+
}
|
|
1798
1840
|
// Extract article body — Medium puts content in <article> or section
|
|
1799
1841
|
let articleBody = '';
|
|
1800
1842
|
const articleEl = $('article').first();
|
|
1801
1843
|
if (articleEl.length) {
|
|
1802
|
-
// Remove nav, aside, buttons
|
|
1803
|
-
articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
|
|
1844
|
+
// Remove nav, aside, buttons, author-card, footer sections
|
|
1845
|
+
articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
|
|
1804
1846
|
// Get paragraphs and headings
|
|
1805
1847
|
const parts = [];
|
|
1806
|
-
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
|
|
1848
|
+
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
|
|
1807
1849
|
const tag = el.name;
|
|
1808
1850
|
const text = $(el).text().trim();
|
|
1809
1851
|
if (!text || text.length < 5)
|
|
@@ -1816,6 +1858,8 @@ async function mediumExtractor(html, url) {
|
|
|
1816
1858
|
parts.push(`> ${text}`);
|
|
1817
1859
|
else if (tag === 'pre')
|
|
1818
1860
|
parts.push('```\n' + text + '\n```');
|
|
1861
|
+
else if (tag === 'figcaption')
|
|
1862
|
+
parts.push(`*${text}*`);
|
|
1819
1863
|
else
|
|
1820
1864
|
parts.push(text);
|
|
1821
1865
|
});
|
|
@@ -1826,15 +1870,22 @@ async function mediumExtractor(html, url) {
|
|
|
1826
1870
|
const structured = {
|
|
1827
1871
|
title,
|
|
1828
1872
|
author,
|
|
1873
|
+
authorBio,
|
|
1829
1874
|
publishDate,
|
|
1830
1875
|
readingTime,
|
|
1831
1876
|
description,
|
|
1877
|
+
publication,
|
|
1878
|
+
clapCount,
|
|
1832
1879
|
url,
|
|
1833
1880
|
};
|
|
1834
1881
|
const authorLine = author ? `\n**Author:** ${author}` : '';
|
|
1882
|
+
const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
|
|
1835
1883
|
const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
|
|
1836
1884
|
const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
|
|
1837
|
-
const
|
|
1885
|
+
const pubLine = publication ? `\n**Publication:** ${publication}` : '';
|
|
1886
|
+
const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
|
|
1887
|
+
// No hard character cap — let the pipeline's budget/maxTokens handle truncation
|
|
1888
|
+
const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
|
|
1838
1889
|
return { domain: 'medium.com', type: 'article', structured, cleanContent };
|
|
1839
1890
|
}
|
|
1840
1891
|
catch {
|
|
@@ -2122,20 +2173,37 @@ async function imdbExtractor(html, url) {
|
|
|
2122
2173
|
? jsonLd.director.map((d) => d.name || d).join(', ')
|
|
2123
2174
|
: jsonLd.director?.name || String(jsonLd.director))
|
|
2124
2175
|
: $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
|
|
2125
|
-
// Cast —
|
|
2176
|
+
// Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
|
|
2177
|
+
const castPairs = [];
|
|
2178
|
+
// IMDB new UI: each title-cast-item contains actor link + character link
|
|
2179
|
+
$('[data-testid="title-cast-item"]').each((_, el) => {
|
|
2180
|
+
const actorEl = $(el).find('a[href*="/name/nm"]').first();
|
|
2181
|
+
const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
|
|
2182
|
+
const actor = actorEl.text().trim();
|
|
2183
|
+
// Character name may span multiple elements; clean whitespace
|
|
2184
|
+
const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
|
|
2185
|
+
if (actor && actor.length > 1) {
|
|
2186
|
+
castPairs.push({ actor, character: character || '' });
|
|
2187
|
+
}
|
|
2188
|
+
});
|
|
2189
|
+
// Fall back to classic cast list (older IMDB page versions)
|
|
2190
|
+
const castFromHtml = [];
|
|
2191
|
+
if (!castPairs.length) {
|
|
2192
|
+
$('.cast_list td.itemprop a').each((_, el) => {
|
|
2193
|
+
const name = $(el).text().trim();
|
|
2194
|
+
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2195
|
+
castFromHtml.push(name);
|
|
2196
|
+
});
|
|
2197
|
+
}
|
|
2198
|
+
// JSON-LD actors as final fallback
|
|
2126
2199
|
const castFromLd = jsonLd?.actor
|
|
2127
2200
|
? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
|
|
2128
2201
|
.map((a) => a.name || a)
|
|
2129
2202
|
: [];
|
|
2130
|
-
//
|
|
2131
|
-
const
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
const name = $(el).text().trim();
|
|
2135
|
-
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2136
|
-
castFromHtml.push(name);
|
|
2137
|
-
});
|
|
2138
|
-
const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
|
|
2203
|
+
// Build final cast list: with characters if available (top 10), otherwise names only
|
|
2204
|
+
const cast = castPairs.length > 0
|
|
2205
|
+
? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
|
|
2206
|
+
: [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
|
|
2139
2207
|
// Runtime
|
|
2140
2208
|
const runtime = jsonLd?.duration
|
|
2141
2209
|
? (() => {
|
|
@@ -2176,12 +2244,37 @@ async function imdbExtractor(html, url) {
|
|
|
2176
2244
|
writers.push(name);
|
|
2177
2245
|
}
|
|
2178
2246
|
}
|
|
2247
|
+
// Awards / accolades — try hero accolades chip, then any awards-related link text
|
|
2248
|
+
let awardsSummary = '';
|
|
2249
|
+
// IMDB new UI: awards accolades chip in the hero section
|
|
2250
|
+
const accoladesEl = $('[data-testid="awards-accolades"]');
|
|
2251
|
+
if (accoladesEl.length) {
|
|
2252
|
+
awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
|
|
2253
|
+
}
|
|
2254
|
+
// Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
|
|
2255
|
+
if (!awardsSummary) {
|
|
2256
|
+
const titleMatch = url.match(/\/(tt\d+)/);
|
|
2257
|
+
const titleId = titleMatch ? titleMatch[1] : '';
|
|
2258
|
+
if (titleId) {
|
|
2259
|
+
$(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
|
|
2260
|
+
const text = $(el).text().trim().replace(/\s+/g, ' ');
|
|
2261
|
+
if (text && text.length > 3 && text.length < 200) {
|
|
2262
|
+
awardsSummary = text;
|
|
2263
|
+
return false; // break
|
|
2264
|
+
}
|
|
2265
|
+
});
|
|
2266
|
+
}
|
|
2267
|
+
}
|
|
2268
|
+
// Fallback: JSON-LD award field
|
|
2269
|
+
if (!awardsSummary && jsonLd?.award) {
|
|
2270
|
+
awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
|
|
2271
|
+
}
|
|
2179
2272
|
// Content rating & release date from JSON-LD
|
|
2180
2273
|
const contentRating = jsonLd?.contentRating || '';
|
|
2181
2274
|
const datePublished = jsonLd?.datePublished || '';
|
|
2182
2275
|
const structured = {
|
|
2183
2276
|
title, year, contentType, description: fullPlot, ratingValue, ratingCount,
|
|
2184
|
-
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
|
|
2277
|
+
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
|
|
2185
2278
|
};
|
|
2186
2279
|
const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
|
|
2187
2280
|
const genreLine = genres.length ? genres.join(', ') : '';
|
|
@@ -2192,8 +2285,9 @@ async function imdbExtractor(html, url) {
|
|
|
2192
2285
|
const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
|
|
2193
2286
|
const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
|
|
2194
2287
|
const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
|
|
2288
|
+
const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
|
|
2195
2289
|
const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
|
|
2196
|
-
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
|
|
2290
|
+
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
|
|
2197
2291
|
const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
|
|
2198
2292
|
return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
|
|
2199
2293
|
}
|
|
@@ -2331,6 +2425,7 @@ async function pypiExtractor(_html, url) {
|
|
|
2331
2425
|
return null;
|
|
2332
2426
|
const info = data.info;
|
|
2333
2427
|
const structured = {
|
|
2428
|
+
title: `${info.name} ${info.version}`,
|
|
2334
2429
|
name: info.name,
|
|
2335
2430
|
version: info.version,
|
|
2336
2431
|
description: info.summary || '',
|
package/dist/core/pipeline.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.12",
|
|
3
|
+
"version": "0.21.13",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|