webpeel 0.21.12 → 0.21.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.js +115 -16
- package/dist/core/pipeline.js +1 -0
- package/package.json +1 -1
|
@@ -778,6 +778,10 @@ async function githubExtractor(_html, url) {
|
|
|
778
778
|
if (pathParts.length === 0)
|
|
779
779
|
return null;
|
|
780
780
|
const ghHeaders = { Accept: 'application/vnd.github.v3+json' };
|
|
781
|
+
// Use GITHUB_TOKEN if available for higher rate limits (5000/hr vs 60/hr)
|
|
782
|
+
const ghToken = process.env.GITHUB_TOKEN || process.env.GH_TOKEN;
|
|
783
|
+
if (ghToken)
|
|
784
|
+
ghHeaders.Authorization = `token ${ghToken}`;
|
|
781
785
|
// User profile: /username (single segment)
|
|
782
786
|
if (pathParts.length === 1) {
|
|
783
787
|
const username = pathParts[0];
|
|
@@ -1795,15 +1799,57 @@ async function mediumExtractor(html, url) {
|
|
|
1795
1799
|
$('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
|
|
1796
1800
|
const description = jsonLdData?.description ||
|
|
1797
1801
|
$('meta[property="og:description"]').attr('content') || '';
|
|
1802
|
+
// Publication name — subdomain (towardsdatascience.medium.com), meta tags, or breadcrumb
|
|
1803
|
+
let publication = '';
|
|
1804
|
+
try {
|
|
1805
|
+
const urlObj2 = new URL(url);
|
|
1806
|
+
const hostname = urlObj2.hostname;
|
|
1807
|
+
if (hostname !== 'medium.com' && hostname !== 'www.medium.com' && hostname.endsWith('.medium.com')) {
|
|
1808
|
+
publication = hostname.replace('.medium.com', '').replace(/-/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
|
|
1809
|
+
}
|
|
1810
|
+
}
|
|
1811
|
+
catch { /* ignore */ }
|
|
1812
|
+
if (!publication) {
|
|
1813
|
+
publication = $('[data-testid="publicationName"]').text().trim() ||
|
|
1814
|
+
$('a[data-testid="publicationName"]').text().trim() ||
|
|
1815
|
+
$('meta[property="article:section"]').attr('content') ||
|
|
1816
|
+
$('a[href*="/tag/"]').first().text().trim() || '';
|
|
1817
|
+
}
|
|
1818
|
+
// Author bio — usually shown in an author card or bio section
|
|
1819
|
+
const authorBio = $('[data-testid="authorBio"]').text().trim() ||
|
|
1820
|
+
$('p[class*="bio"]').first().text().trim() ||
|
|
1821
|
+
$('[aria-label="authorBio"]').text().trim() || '';
|
|
1822
|
+
// Clap count — Medium shows clap button with count
|
|
1823
|
+
let clapCount = '';
|
|
1824
|
+
$('button[data-testid="storyClaps"], button[aria-label*="clap"]').each((_, el) => {
|
|
1825
|
+
const txt = $(el).text().trim();
|
|
1826
|
+
if (txt && /\d/.test(txt)) {
|
|
1827
|
+
clapCount = txt;
|
|
1828
|
+
return false;
|
|
1829
|
+
}
|
|
1830
|
+
});
|
|
1831
|
+
if (!clapCount) {
|
|
1832
|
+
// Fallback: find spans that look like clap counts (e.g., "2.4K")
|
|
1833
|
+
$('span').filter((_, el) => {
|
|
1834
|
+
const label = $(el).closest('[aria-label]').attr('aria-label') || '';
|
|
1835
|
+
return label.toLowerCase().includes('clap');
|
|
1836
|
+
}).each((_, el) => {
|
|
1837
|
+
const txt = $(el).text().trim();
|
|
1838
|
+
if (txt && /\d/.test(txt)) {
|
|
1839
|
+
clapCount = txt;
|
|
1840
|
+
return false;
|
|
1841
|
+
}
|
|
1842
|
+
});
|
|
1843
|
+
}
|
|
1798
1844
|
// Extract article body — Medium puts content in <article> or section
|
|
1799
1845
|
let articleBody = '';
|
|
1800
1846
|
const articleEl = $('article').first();
|
|
1801
1847
|
if (articleEl.length) {
|
|
1802
|
-
// Remove nav, aside, buttons
|
|
1803
|
-
articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
|
|
1848
|
+
// Remove nav, aside, buttons, author-card, footer sections
|
|
1849
|
+
articleEl.find('nav, aside, button, [data-testid="navbar"], footer, [data-testid="authorCard"]').remove();
|
|
1804
1850
|
// Get paragraphs and headings
|
|
1805
1851
|
const parts = [];
|
|
1806
|
-
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
|
|
1852
|
+
articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li, figure figcaption').each((_, el) => {
|
|
1807
1853
|
const tag = el.name;
|
|
1808
1854
|
const text = $(el).text().trim();
|
|
1809
1855
|
if (!text || text.length < 5)
|
|
@@ -1816,6 +1862,8 @@ async function mediumExtractor(html, url) {
|
|
|
1816
1862
|
parts.push(`> ${text}`);
|
|
1817
1863
|
else if (tag === 'pre')
|
|
1818
1864
|
parts.push('```\n' + text + '\n```');
|
|
1865
|
+
else if (tag === 'figcaption')
|
|
1866
|
+
parts.push(`*${text}*`);
|
|
1819
1867
|
else
|
|
1820
1868
|
parts.push(text);
|
|
1821
1869
|
});
|
|
@@ -1826,15 +1874,22 @@ async function mediumExtractor(html, url) {
|
|
|
1826
1874
|
const structured = {
|
|
1827
1875
|
title,
|
|
1828
1876
|
author,
|
|
1877
|
+
authorBio,
|
|
1829
1878
|
publishDate,
|
|
1830
1879
|
readingTime,
|
|
1831
1880
|
description,
|
|
1881
|
+
publication,
|
|
1882
|
+
clapCount,
|
|
1832
1883
|
url,
|
|
1833
1884
|
};
|
|
1834
1885
|
const authorLine = author ? `\n**Author:** ${author}` : '';
|
|
1886
|
+
const bioLine = authorBio ? `\n**Author Bio:** ${authorBio}` : '';
|
|
1835
1887
|
const dateLine = publishDate ? `\n**Published:** ${publishDate.split('T')[0]}` : '';
|
|
1836
1888
|
const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
|
|
1837
|
-
const
|
|
1889
|
+
const pubLine = publication ? `\n**Publication:** ${publication}` : '';
|
|
1890
|
+
const clapsLine = clapCount ? `\n**Claps:** ${clapCount}` : '';
|
|
1891
|
+
// No hard character cap — let the pipeline's budget/maxTokens handle truncation
|
|
1892
|
+
const cleanContent = `# ${title}${authorLine}${bioLine}${dateLine}${timeLine}${pubLine}${clapsLine}\n\n${contentBody}`;
|
|
1838
1893
|
return { domain: 'medium.com', type: 'article', structured, cleanContent };
|
|
1839
1894
|
}
|
|
1840
1895
|
catch {
|
|
@@ -2122,20 +2177,37 @@ async function imdbExtractor(html, url) {
|
|
|
2122
2177
|
? jsonLd.director.map((d) => d.name || d).join(', ')
|
|
2123
2178
|
: jsonLd.director?.name || String(jsonLd.director))
|
|
2124
2179
|
: $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
|
|
2125
|
-
// Cast —
|
|
2180
|
+
// Cast — parse HTML first for actor+character pairs, then fall back to JSON-LD
|
|
2181
|
+
const castPairs = [];
|
|
2182
|
+
// IMDB new UI: each title-cast-item contains actor link + character link
|
|
2183
|
+
$('[data-testid="title-cast-item"]').each((_, el) => {
|
|
2184
|
+
const actorEl = $(el).find('a[href*="/name/nm"]').first();
|
|
2185
|
+
const charEl = $(el).find('[data-testid="title-cast-item__character"]').first();
|
|
2186
|
+
const actor = actorEl.text().trim();
|
|
2187
|
+
// Character name may span multiple elements; clean whitespace
|
|
2188
|
+
const character = charEl.text().trim().replace(/\s+/g, ' ').replace(/^\.\.\.$/, '');
|
|
2189
|
+
if (actor && actor.length > 1) {
|
|
2190
|
+
castPairs.push({ actor, character: character || '' });
|
|
2191
|
+
}
|
|
2192
|
+
});
|
|
2193
|
+
// Fall back to classic cast list (older IMDB page versions)
|
|
2194
|
+
const castFromHtml = [];
|
|
2195
|
+
if (!castPairs.length) {
|
|
2196
|
+
$('.cast_list td.itemprop a').each((_, el) => {
|
|
2197
|
+
const name = $(el).text().trim();
|
|
2198
|
+
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2199
|
+
castFromHtml.push(name);
|
|
2200
|
+
});
|
|
2201
|
+
}
|
|
2202
|
+
// JSON-LD actors as final fallback
|
|
2126
2203
|
const castFromLd = jsonLd?.actor
|
|
2127
2204
|
? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
|
|
2128
2205
|
.map((a) => a.name || a)
|
|
2129
2206
|
: [];
|
|
2130
|
-
//
|
|
2131
|
-
const
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
const name = $(el).text().trim();
|
|
2135
|
-
if (name && name.length > 1 && !castFromHtml.includes(name))
|
|
2136
|
-
castFromHtml.push(name);
|
|
2137
|
-
});
|
|
2138
|
-
const cast = [...new Set([...castFromLd, ...castFromHtml])].slice(0, 15);
|
|
2207
|
+
// Build final cast list: with characters if available (top 10), otherwise names only
|
|
2208
|
+
const cast = castPairs.length > 0
|
|
2209
|
+
? castPairs.slice(0, 10).map(({ actor, character }) => character ? `${actor} as ${character}` : actor)
|
|
2210
|
+
: [...new Set([...castFromLd, ...castFromHtml])].slice(0, 10);
|
|
2139
2211
|
// Runtime
|
|
2140
2212
|
const runtime = jsonLd?.duration
|
|
2141
2213
|
? (() => {
|
|
@@ -2176,12 +2248,37 @@ async function imdbExtractor(html, url) {
|
|
|
2176
2248
|
writers.push(name);
|
|
2177
2249
|
}
|
|
2178
2250
|
}
|
|
2251
|
+
// Awards / accolades — try hero accolades chip, then any awards-related link text
|
|
2252
|
+
let awardsSummary = '';
|
|
2253
|
+
// IMDB new UI: awards accolades chip in the hero section
|
|
2254
|
+
const accoladesEl = $('[data-testid="awards-accolades"]');
|
|
2255
|
+
if (accoladesEl.length) {
|
|
2256
|
+
awardsSummary = accoladesEl.text().trim().replace(/\s+/g, ' ');
|
|
2257
|
+
}
|
|
2258
|
+
// Fallback: look for per-title awards link (href contains the title ID /tt\d+/awards)
|
|
2259
|
+
if (!awardsSummary) {
|
|
2260
|
+
const titleMatch = url.match(/\/(tt\d+)/);
|
|
2261
|
+
const titleId = titleMatch ? titleMatch[1] : '';
|
|
2262
|
+
if (titleId) {
|
|
2263
|
+
$(`a[href*="${titleId}"][href*="awards"]`).each((_, el) => {
|
|
2264
|
+
const text = $(el).text().trim().replace(/\s+/g, ' ');
|
|
2265
|
+
if (text && text.length > 3 && text.length < 200) {
|
|
2266
|
+
awardsSummary = text;
|
|
2267
|
+
return false; // break
|
|
2268
|
+
}
|
|
2269
|
+
});
|
|
2270
|
+
}
|
|
2271
|
+
}
|
|
2272
|
+
// Fallback: JSON-LD award field
|
|
2273
|
+
if (!awardsSummary && jsonLd?.award) {
|
|
2274
|
+
awardsSummary = typeof jsonLd.award === 'string' ? jsonLd.award : '';
|
|
2275
|
+
}
|
|
2179
2276
|
// Content rating & release date from JSON-LD
|
|
2180
2277
|
const contentRating = jsonLd?.contentRating || '';
|
|
2181
2278
|
const datePublished = jsonLd?.datePublished || '';
|
|
2182
2279
|
const structured = {
|
|
2183
2280
|
title, year, contentType, description: fullPlot, ratingValue, ratingCount,
|
|
2184
|
-
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, url,
|
|
2281
|
+
genres, director, writers, cast, runtime, keywords, contentRating, datePublished, awardsSummary, url,
|
|
2185
2282
|
};
|
|
2186
2283
|
const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
|
|
2187
2284
|
const genreLine = genres.length ? genres.join(', ') : '';
|
|
@@ -2192,8 +2289,9 @@ async function imdbExtractor(html, url) {
|
|
|
2192
2289
|
const ratedLine = contentRating ? `**Rated:** ${contentRating}` : '';
|
|
2193
2290
|
const releaseLine = datePublished ? `**Released:** ${datePublished}` : '';
|
|
2194
2291
|
const keywordsLine = keywords.length ? `\n**Keywords:** ${keywords.slice(0, 10).join(', ')}` : '';
|
|
2292
|
+
const awardsLine = awardsSummary ? `**Awards:** ${awardsSummary}` : '';
|
|
2195
2293
|
const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
|
|
2196
|
-
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine].filter(Boolean).join('\n');
|
|
2294
|
+
const detailParts = [directorLine, writersLine, castLine, ratedLine, releaseLine, awardsLine].filter(Boolean).join('\n');
|
|
2197
2295
|
const cleanContent = `# 🎬 ${title}\n\n${metaParts}\n\n${detailParts}${keywordsLine}\n\n## Plot\n\n${fullPlot}`;
|
|
2198
2296
|
return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
|
|
2199
2297
|
}
|
|
@@ -2331,6 +2429,7 @@ async function pypiExtractor(_html, url) {
|
|
|
2331
2429
|
return null;
|
|
2332
2430
|
const info = data.info;
|
|
2333
2431
|
const structured = {
|
|
2432
|
+
title: `${info.name} ${info.version}`,
|
|
2334
2433
|
name: info.name,
|
|
2335
2434
|
version: info.version,
|
|
2336
2435
|
description: info.summary || '',
|
package/dist/core/pipeline.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.12",
|
|
3
|
+
"version": "0.21.14",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|