webpeel 0.17.13 → 0.17.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/domain-extractors.d.ts.map +1 -1
- package/dist/core/domain-extractors.js +793 -11
- package/dist/core/domain-extractors.js.map +1 -1
- package/dist/core/search-provider.d.ts +4 -9
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +81 -135
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/youtube.d.ts +36 -0
- package/dist/core/youtube.d.ts.map +1 -1
- package/dist/core/youtube.js +135 -4
- package/dist/core/youtube.js.map +1 -1
- package/dist/server/openapi.yaml +1 -1
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +20 -1
- package/dist/server/routes/fetch.js.map +1 -1
- package/package.json +2 -1
|
@@ -71,6 +71,15 @@ const REGISTRY = [
|
|
|
71
71
|
{ match: (h) => h === 'www.npmjs.com' || h === 'npmjs.com', extractor: npmExtractor },
|
|
72
72
|
{ match: (h) => h === 'www.bestbuy.com' || h === 'bestbuy.com', extractor: bestBuyExtractor },
|
|
73
73
|
{ match: (h) => h === 'www.walmart.com' || h === 'walmart.com', extractor: walmartExtractor },
|
|
74
|
+
{ match: (h) => h === 'www.amazon.com' || h === 'amazon.com', extractor: amazonExtractor },
|
|
75
|
+
{ match: (h) => h === 'medium.com' || h === 'www.medium.com' || h.endsWith('.medium.com'), extractor: mediumExtractor },
|
|
76
|
+
{ match: (h) => h.endsWith('.substack.com'), extractor: substackExtractor },
|
|
77
|
+
{ match: (h) => h === 'www.allrecipes.com' || h === 'allrecipes.com', extractor: allrecipesExtractor },
|
|
78
|
+
{ match: (h) => h === 'www.imdb.com' || h === 'imdb.com', extractor: imdbExtractor },
|
|
79
|
+
{ match: (h) => h === 'www.linkedin.com' || h === 'linkedin.com', extractor: linkedinExtractor },
|
|
80
|
+
{ match: (h) => h === 'pypi.org' || h === 'www.pypi.org', extractor: pypiExtractor },
|
|
81
|
+
{ match: (h) => h === 'dev.to' || h === 'www.dev.to', extractor: devtoExtractor },
|
|
82
|
+
{ match: (h) => h === 'craigslist.org' || h === 'www.craigslist.org' || h.endsWith('.craigslist.org'), extractor: craigslistExtractor },
|
|
74
83
|
];
|
|
75
84
|
/**
|
|
76
85
|
* Returns the domain extractor for a URL, or null if none matches.
|
|
@@ -1098,32 +1107,69 @@ async function youtubeExtractor(_html, url) {
|
|
|
1098
1107
|
const title = transcript.title || oembedData?.title || '';
|
|
1099
1108
|
const channel = transcript.channel || oembedData?.author_name || '';
|
|
1100
1109
|
const channelUrl = oembedData?.author_url || `https://www.youtube.com/@${channel}`;
|
|
1101
|
-
const description = noembedData?.description || oembedData?.description || '';
|
|
1110
|
+
const description = transcript.description || noembedData?.description || oembedData?.description || '';
|
|
1102
1111
|
const thumbnailUrl = oembedData?.thumbnail_url || '';
|
|
1112
|
+
const publishDate = transcript.publishDate || '';
|
|
1113
|
+
const hasTranscript = transcript.segments.length > 0;
|
|
1103
1114
|
const structured = {
|
|
1104
1115
|
title,
|
|
1105
1116
|
channel,
|
|
1106
1117
|
channelUrl,
|
|
1107
1118
|
duration: transcript.duration,
|
|
1119
|
+
publishDate,
|
|
1108
1120
|
language: transcript.language,
|
|
1109
1121
|
availableLanguages: transcript.availableLanguages,
|
|
1110
1122
|
transcriptSegments: transcript.segments.length,
|
|
1123
|
+
wordCount: transcript.wordCount ?? 0,
|
|
1111
1124
|
description,
|
|
1112
1125
|
thumbnailUrl,
|
|
1126
|
+
chapters: transcript.chapters ?? [],
|
|
1127
|
+
keyPoints: transcript.keyPoints ?? [],
|
|
1113
1128
|
source: 'transcript',
|
|
1114
1129
|
};
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1130
|
+
// Format the publish date nicely if it's an ISO date
|
|
1131
|
+
let publishStr = '';
|
|
1132
|
+
if (publishDate) {
|
|
1133
|
+
try {
|
|
1134
|
+
const d = new Date(publishDate);
|
|
1135
|
+
publishStr = d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
|
|
1136
|
+
}
|
|
1137
|
+
catch {
|
|
1138
|
+
publishStr = publishDate;
|
|
1139
|
+
}
|
|
1122
1140
|
}
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1141
|
+
// Build header line
|
|
1142
|
+
const headerParts = [`**Channel:** ${channel}`];
|
|
1143
|
+
if (transcript.duration && transcript.duration !== '0:00')
|
|
1144
|
+
headerParts.push(`**Duration:** ${transcript.duration}`);
|
|
1145
|
+
if (publishStr)
|
|
1146
|
+
headerParts.push(`**Published:** ${publishStr}`);
|
|
1147
|
+
const headerLine = headerParts.join(' | ');
|
|
1148
|
+
const parts = [];
|
|
1149
|
+
parts.push(`# ${title}`);
|
|
1150
|
+
parts.push(headerLine);
|
|
1151
|
+
// Summary section
|
|
1152
|
+
if (transcript.summary && hasTranscript) {
|
|
1153
|
+
parts.push(`## Summary\n\n${transcript.summary}`);
|
|
1154
|
+
}
|
|
1155
|
+
else if (!hasTranscript && transcript.fullText) {
|
|
1156
|
+
parts.push(`## Description\n\n${transcript.fullText}`);
|
|
1157
|
+
}
|
|
1158
|
+
// Key Points section
|
|
1159
|
+
if (transcript.keyPoints && transcript.keyPoints.length > 0) {
|
|
1160
|
+
const kpLines = transcript.keyPoints.map(kp => `- ${kp}`).join('\n');
|
|
1161
|
+
parts.push(`## Key Points\n\n${kpLines}`);
|
|
1162
|
+
}
|
|
1163
|
+
// Chapters section
|
|
1164
|
+
if (transcript.chapters && transcript.chapters.length > 0) {
|
|
1165
|
+
const chLines = transcript.chapters.map(ch => `- ${ch.time} ā ${ch.title}`).join('\n');
|
|
1166
|
+
parts.push(`## Chapters\n\n${chLines}`);
|
|
1167
|
+
}
|
|
1168
|
+
// Full Transcript section (only if we have real transcript segments)
|
|
1169
|
+
if (hasTranscript) {
|
|
1170
|
+
parts.push(`## Full Transcript\n\n${transcript.fullText}`);
|
|
1126
1171
|
}
|
|
1172
|
+
const cleanContent = parts.join('\n\n');
|
|
1127
1173
|
return { domain: 'youtube.com', type: 'video', structured, cleanContent };
|
|
1128
1174
|
}
|
|
1129
1175
|
// Fall back to oEmbed if transcript failed
|
|
@@ -1462,4 +1508,740 @@ async function walmartExtractor(_html, url) {
|
|
|
1462
1508
|
return null; // API not accessible, fall through to other methods
|
|
1463
1509
|
}
|
|
1464
1510
|
}
|
|
1511
|
+
// ---------------------------------------------------------------------------
|
|
1512
|
+
// 12. Amazon Products extractor
|
|
1513
|
+
// ---------------------------------------------------------------------------
|
|
1514
|
+
/**
 * Extracts product data from an Amazon product page.
 *
 * Sources are consulted in priority order: the first JSON-LD `Product` node,
 * then well-known element ids/classes in the HTML, then Open Graph meta tags.
 *
 * NOTE(review): the glyph after `# ` in the heading looks mis-encoded in this
 * view of the file; it is reproduced as-is — confirm against the published file.
 *
 * @param {string} html - Raw HTML of the product page.
 * @param {string} url - URL the page was fetched from; used for the ASIN and echoed back.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when no title can be found or anything throws.
 */
async function amazonExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Extract from JSON-LD first: keep the first script whose root is a Product.
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (parsed?.['@type'] === 'Product')
                jsonLdData = parsed;
        });
        // Meta tag fallbacks
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        const ogImage = $('meta[property="og:image"]').attr('content') || '';
        // HTML selectors
        const title = jsonLdData?.name ||
            $('#productTitle').text().trim() ||
            $('#title').text().trim() ||
            ogTitle;
        if (!title)
            return null;
        const priceWhole = $('#priceblock_ourprice').text().trim() ||
            $('.a-price .a-offscreen').first().text().trim() ||
            $('[data-asin-price]').first().attr('data-asin-price') || '';
        const rating = jsonLdData?.aggregateRating?.ratingValue ||
            $('#acrPopover .a-size-base.a-color-base').first().text().trim() ||
            $('span[data-hook="rating-out-of-text"]').text().trim() || '';
        const reviewCount = jsonLdData?.aggregateRating?.reviewCount ||
            $('#acrCustomerReviewText').text().replace(/[^0-9,]/g, '').trim() || '';
        // `offers` may be a single Offer or a list; availability may use either
        // the http or https schema.org prefix. A non-string value is ignored
        // instead of throwing and discarding the whole result.
        const firstOffer = Array.isArray(jsonLdData?.offers) ? jsonLdData.offers[0] : jsonLdData?.offers;
        const ldAvailability = typeof firstOffer?.availability === 'string'
            ? firstOffer.availability.replace(/^https?:\/\/schema\.org\//, '')
            : '';
        const availability = ldAvailability ||
            $('#availability span').first().text().trim() || '';
        const description = jsonLdData?.description ||
            $('#feature-bullets .a-list-item').map((_, el) => $(el).text().trim()).get().join('\n') ||
            $('#productDescription p').text().trim() ||
            ogDescription;
        const features = [];
        $('#feature-bullets li').each((_, el) => {
            const text = $(el).text().trim();
            // Drop the "Make sure this fits" bullet rather than listing it as a feature.
            if (text && !text.includes('Make sure this fits'))
                features.push(text);
        });
        // ASIN from URL: both /dp/<ASIN> and /gp/product/<ASIN> forms.
        const asin = url.match(/\/(?:dp|gp\/product)\/([A-Z0-9]{10})/i)?.[1] || '';
        const structured = {
            title,
            price: priceWhole,
            rating,
            reviewCount,
            availability,
            description,
            features,
            asin,
            image: ogImage,
            url,
        };
        const ratingLine = rating ? `\n**Rating:** ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '';
        const priceLine = priceWhole ? `\n**Price:** ${priceWhole}` : '';
        const availLine = availability ? `\n**Availability:** ${availability}` : '';
        const featuresSection = features.length
            ? `\n\n## Features\n\n${features.map(f => `- ${f}`).join('\n')}`
            : '';
        // String() guards against a non-string JSON-LD description; output is capped at 1000 chars.
        const descSection = description ? `\n\n## Description\n\n${String(description).substring(0, 1000)}` : '';
        const cleanContent = `# š ${title}${priceLine}${ratingLine}${availLine}${descSection}${featuresSection}`;
        return { domain: 'amazon.com', type: 'product', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1588
|
+
// ---------------------------------------------------------------------------
|
|
1589
|
+
// 13. Medium Articles extractor
|
|
1590
|
+
// ---------------------------------------------------------------------------
|
|
1591
|
+
/**
 * Extracts a Medium article: title, author, publish date, reading time and a
 * markdown-ish rendering of the article body.
 *
 * Metadata comes from the first article-like JSON-LD node, then meta tags, then
 * visible elements. The body is read from the first `<article>` element; when
 * none yields text, the description is used instead.
 *
 * @param {string} html - Raw HTML of the article page.
 * @param {string} url - URL the page was fetched from (echoed in `structured`).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when no title can be found or anything throws.
 */
async function mediumExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // JSON-LD: BlogPosting and SocialMediaPosting are schema.org subtypes of
        // Article and carry the same headline/author/datePublished fields.
        const articleTypes = ['NewsArticle', 'Article', 'BlogPosting', 'SocialMediaPosting'];
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (articleTypes.includes(parsed?.['@type']))
                jsonLdData = parsed;
        });
        const title = jsonLdData?.headline ||
            $('meta[property="og:title"]').attr('content') ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        // JSON-LD `author` may be a single object or a list; use the first entry.
        const ldAuthor = Array.isArray(jsonLdData?.author) ? jsonLdData.author[0] : jsonLdData?.author;
        const author = ldAuthor?.name ||
            $('meta[name="author"]').attr('content') ||
            $('[data-testid="authorName"]').text().trim() ||
            $('a[rel="author"]').first().text().trim() || '';
        const publishDate = jsonLdData?.datePublished ||
            $('meta[property="article:published_time"]').attr('content') || '';
        const readingTime = $('[data-testid="storyReadTime"]').text().trim() ||
            $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
        const description = jsonLdData?.description ||
            $('meta[property="og:description"]').attr('content') || '';
        // Extract article body — content is read from the first <article> element.
        let articleBody = '';
        const articleEl = $('article').first();
        if (articleEl.length) {
            // Remove nav, aside, buttons
            articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
            // Get paragraphs and headings
            const parts = [];
            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                // An element nested inside a blockquote/li/pre is already covered by
                // that container's text; emitting it again would duplicate content.
                if ($(el).parentsUntil(articleEl, 'blockquote, li, pre').length)
                    return;
                const tag = el.name;
                const text = $(el).text().trim();
                if (!text || text.length < 5)
                    return;
                if (tag === 'h1' || tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        // Fallback to the description if no body
        const contentBody = articleBody || description;
        const structured = {
            title,
            author,
            publishDate,
            readingTime,
            description,
            url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        // Keep only the date part of an ISO timestamp; String() guards non-string values.
        const dateLine = publishDate ? `\n**Published:** ${String(publishDate).split('T')[0]}` : '';
        const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
        // Body is capped at 8000 characters.
        const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${String(contentBody).substring(0, 8000)}`;
        return { domain: 'medium.com', type: 'article', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1666
|
+
// ---------------------------------------------------------------------------
|
|
1667
|
+
// 14. Substack Posts extractor
|
|
1668
|
+
// ---------------------------------------------------------------------------
|
|
1669
|
+
/**
 * Extracts a Substack post: title, author, publication name, publish date and a
 * markdown-ish rendering of the post body.
 *
 * Metadata comes from the first `NewsArticle`/`Article` JSON-LD node, then meta
 * tags, then visible elements. The publication name falls back to the hostname
 * with `.substack.com` removed.
 *
 * @param {string} html - Raw HTML of the post page.
 * @param {string} url - URL the page was fetched from.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when no title can be found or anything throws.
 */
async function substackExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // JSON-LD
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (parsed?.['@type'] === 'NewsArticle' || parsed?.['@type'] === 'Article')
                jsonLdData = parsed;
        });
        const title = jsonLdData?.headline ||
            $('meta[property="og:title"]').attr('content') ||
            $('h1.post-title').first().text().trim() ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        // JSON-LD `author` may be a single object or a list; use the first entry.
        const ldAuthor = Array.isArray(jsonLdData?.author) ? jsonLdData.author[0] : jsonLdData?.author;
        const author = ldAuthor?.name ||
            $('meta[name="author"]').attr('content') ||
            $('a.author-name').first().text().trim() ||
            $('[class*="author"]').first().text().trim() || '';
        const publishDate = jsonLdData?.datePublished ||
            $('meta[property="article:published_time"]').attr('content') ||
            $('time').first().attr('datetime') || '';
        const publication = $('meta[property="og:site_name"]').attr('content') ||
            $('a.navbar-title-link').text().trim() || new URL(url).hostname.replace('.substack.com', '');
        const description = jsonLdData?.description ||
            $('meta[property="og:description"]').attr('content') || '';
        // Article content
        let articleBody = '';
        const postContent = $('.body.markup, .post-content, article').first();
        if (postContent.length) {
            postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
            const parts = [];
            postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                // An element nested inside a blockquote/li/pre is already covered by
                // that container's text; emitting it again would duplicate content.
                if ($(el).parentsUntil(postContent, 'blockquote, li, pre').length)
                    return;
                const tag = el.name;
                const text = $(el).text().trim();
                if (!text || text.length < 3)
                    return;
                if (tag === 'h1' || tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        // Fall back to the description when no body text was collected.
        const contentBody = articleBody || description;
        const structured = {
            title,
            author,
            publication,
            publishDate,
            description,
            url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        const pubLine = publication ? `\n**Publication:** ${publication}` : '';
        // Keep only the date part of an ISO timestamp; String() guards non-string values.
        const dateLine = publishDate ? `\n**Published:** ${String(publishDate).split('T')[0]}` : '';
        // Body is capped at 8000 characters.
        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${String(contentBody).substring(0, 8000)}`;
        return { domain: 'substack.com', type: 'post', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1743
|
+
// ---------------------------------------------------------------------------
|
|
1744
|
+
// 15. Allrecipes (Recipe Sites) extractor
|
|
1745
|
+
// ---------------------------------------------------------------------------
|
|
1746
|
+
/**
 * Extracts a recipe (ingredients, steps, times, servings, rating) from an
 * Allrecipes page.
 *
 * Prefers a Schema.org `Recipe` node found in JSON-LD (at the root, inside a
 * root array, or inside `@graph`); otherwise falls back to class-name based
 * HTML scraping.
 *
 * NOTE(review): the glyph after `# ` in the heading looks mis-encoded in this
 * view of the file; it is reproduced as-is — confirm against the published file.
 *
 * @param {string} html - Raw HTML of the recipe page.
 * @param {string} url - URL the page was fetched from (echoed in `structured`).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when no title can be found or anything throws.
 */
async function allrecipesExtractor(html, url) {
    // True when a JSON-LD node declares @type "Recipe", as a string or inside a list.
    const isRecipeNode = (node) => {
        const type = node?.['@type'];
        return type === 'Recipe' || (Array.isArray(type) && type.includes('Recipe'));
    };
    // Wraps a non-array value so single-valued JSON-LD properties iterate as one item.
    const asList = (value) => {
        if (value === undefined || value === null || value === '')
            return [];
        return Array.isArray(value) ? value : [value];
    };
    // Parse ISO 8601 duration (PT30M, PT1H30M) into "1h 30m"; only H and M are read.
    const parseDuration = (d) => {
        if (!d)
            return '';
        const text = String(d);
        const h = text.match(/(\d+)H/)?.[1];
        const m = text.match(/(\d+)M/)?.[1];
        return [h ? `${h}h` : '', m ? `${m}m` : ''].filter(Boolean).join(' ');
    };
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Try Schema.org Recipe JSON-LD first
        let recipe = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (recipe)
                return;
            const parsed = tryParseJson($(el).html() || '');
            // Can be an array or direct object
            for (const item of Array.isArray(parsed) ? parsed : [parsed]) {
                if (isRecipeNode(item)) {
                    recipe = item;
                    break;
                }
                // Sometimes it's nested in @graph; apply the same @type rule there.
                const graphRecipe = asList(item?.['@graph']).find(isRecipeNode);
                if (graphRecipe) {
                    recipe = graphRecipe;
                    break;
                }
            }
        });
        let title;
        let ingredients = [];
        const instructions = [];
        let prepTime = '';
        let cookTime = '';
        let totalTime = '';
        let servings = '';
        let rating = '';
        let reviewCount = '';
        let description = '';
        if (recipe) {
            title = recipe.name || '';
            description = recipe.description || '';
            ingredients = asList(recipe.recipeIngredient).map((i) => String(i).trim());
            // Instructions can be one string, or a list of strings, HowToStep or
            // HowToSection objects. A bare string must be treated as a single step;
            // iterating it directly would yield one "step" per character.
            for (const step of asList(recipe.recipeInstructions)) {
                if (typeof step === 'string')
                    instructions.push(step.trim());
                else if (step?.text)
                    instructions.push(String(step.text).trim());
                else if (step?.['@type'] === 'HowToSection' && step.itemListElement) {
                    for (const s of asList(step.itemListElement)) {
                        if (s?.text)
                            instructions.push(String(s.text).trim());
                    }
                }
            }
            prepTime = parseDuration(recipe.prepTime || '');
            cookTime = parseDuration(recipe.cookTime || '');
            totalTime = parseDuration(recipe.totalTime || '');
            servings = String(recipe.recipeYield || '');
            rating = recipe.aggregateRating?.ratingValue ? String(recipe.aggregateRating.ratingValue) : '';
            reviewCount = recipe.aggregateRating?.reviewCount ? String(recipe.aggregateRating.reviewCount) : '';
        }
        else {
            // HTML fallback
            title = $('h1').first().text().trim() ||
                $('meta[property="og:title"]').attr('content') || '';
            description = $('meta[property="og:description"]').attr('content') || '';
            $('[class*="ingredient"]').each((_, el) => {
                const text = $(el).text().trim();
                // Skip long matches (200+ chars), which are unlikely to be a single ingredient.
                if (text && text.length < 200)
                    ingredients.push(text);
            });
            $('[class*="instruction"] li, [class*="step"] li').each((_, el) => {
                const text = $(el).text().trim();
                if (text)
                    instructions.push(text);
            });
        }
        if (!title)
            return null;
        const structured = {
            title, description, ingredients, instructions,
            prepTime, cookTime, totalTime, servings, rating, reviewCount, url,
        };
        const timeParts = [
            prepTime ? `Prep: ${prepTime}` : '',
            cookTime ? `Cook: ${cookTime}` : '',
            totalTime ? `Total: ${totalTime}` : '',
        ].filter(Boolean).join(' | ');
        const metaLine = [
            timeParts,
            servings ? `Servings: ${servings}` : '',
            rating ? `Rating: ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '',
        ].filter(Boolean).join(' | ');
        const ingredientsMd = ingredients.length
            ? `## Ingredients\n\n${ingredients.map(i => `- ${i}`).join('\n')}`
            : '';
        const instructionsMd = instructions.length
            ? `## Instructions\n\n${instructions.map((s, i) => `${i + 1}. ${s}`).join('\n')}`
            : '';
        const cleanContent = `# š½ļø ${title}\n\n${metaLine ? `*${metaLine}*\n\n` : ''}${description ? description + '\n\n' : ''}${ingredientsMd}\n\n${instructionsMd}`.trim();
        return { domain: 'allrecipes.com', type: 'recipe', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1862
|
+
// ---------------------------------------------------------------------------
|
|
1863
|
+
// 16. IMDB extractor
|
|
1864
|
+
// ---------------------------------------------------------------------------
|
|
1865
|
+
/**
 * Extracts title metadata (year, rating, genres, director, cast, runtime, plot)
 * from an IMDb title page.
 *
 * Reads the first JSON-LD node typed `Movie`, `TVSeries` or `TVEpisode`, with
 * meta-tag and element fallbacks for the fields that have them.
 *
 * NOTE(review): the glyphs in the heading and in front of the rating look
 * mis-encoded in this view of the file; they are reproduced as-is — confirm
 * against the published file.
 *
 * @param {string} html - Raw HTML of the title page.
 * @param {string} url - URL the page was fetched from (echoed in `structured`).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result (`type` is 'tv_show' for TVSeries, otherwise 'movie'),
 *   or null when no title can be found or anything throws.
 */
async function imdbExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Take the first JSON-LD script whose root describes a title.
        const titleTypes = ['Movie', 'TVSeries', 'TVEpisode'];
        let jsonLd = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLd)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (titleTypes.includes(parsed?.['@type'])) {
                jsonLd = parsed;
            }
        });
        const title = jsonLd?.name ||
            $('meta[property="og:title"]').attr('content')?.replace(/ - IMDb$/, '') ||
            $('h1[data-testid="hero__pageTitle"] span').first().text().trim() || '';
        if (!title)
            return null;
        const description = jsonLd?.description ||
            $('meta[property="og:description"]').attr('content') ||
            $('p[data-testid="plot"]').text().trim() || '';
        const year = jsonLd?.datePublished?.substring(0, 4) ||
            $('a[href*="releaseinfo"]').first().text().trim() || '';
        const ratingValue = jsonLd?.aggregateRating?.ratingValue ||
            $('[data-testid="hero-rating-bar__aggregate-rating__score"] span').first().text().trim() || '';
        const ratingCount = jsonLd?.aggregateRating?.ratingCount || '';
        const contentType = jsonLd?.['@type'] || 'Movie';
        // Genres: copy the JSON-LD list so the HTML fallback below never mutates parsed data.
        const genres = jsonLd?.genre
            ? (Array.isArray(jsonLd.genre) ? [...jsonLd.genre] : [jsonLd.genre])
            : [];
        if (!genres.length) {
            $('[data-testid="genres"] a, a[href*="/search/title?genres"]').each((_, el) => {
                const g = $(el).text().trim();
                if (g && !genres.includes(g))
                    genres.push(g);
            });
        }
        // Director: a single object, a list of objects, or plain strings.
        const director = jsonLd?.director
            ? (Array.isArray(jsonLd.director)
                ? jsonLd.director.map((d) => d?.name || d).join(', ')
                : jsonLd.director?.name || String(jsonLd.director))
            : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
        // Cast (at most six entries from the JSON-LD actor field)
        const cast = jsonLd?.actor
            ? (Array.isArray(jsonLd.actor) ? jsonLd.actor : [jsonLd.actor])
                .map((a) => a?.name || a).slice(0, 6)
            : [];
        // Runtime: ISO 8601 "PT#H#M" rendered as "#h #m"; anything else is passed through as text.
        let runtime = '';
        if (jsonLd?.duration) {
            const raw = String(jsonLd.duration);
            const m = raw.match(/PT(?:(\d+)H)?(?:(\d+)M)?/);
            runtime = m
                ? [m[1] ? `${m[1]}h` : '', m[2] ? `${m[2]}m` : ''].filter(Boolean).join(' ')
                : raw;
        }
        const structured = {
            title, year, contentType, description, ratingValue, ratingCount,
            genres, director, cast, runtime, url,
        };
        // Pin the locale so the vote count renders identically on every host, and
        // omit the count entirely when it is not numeric instead of printing "NaN".
        const votes = Number(ratingCount);
        const votesText = ratingCount && Number.isFinite(votes)
            ? ` (${votes.toLocaleString('en-US')} votes)`
            : '';
        const ratingLine = ratingValue ? `ā ${ratingValue}/10${votesText}` : '';
        const genreLine = genres.length ? genres.join(', ') : '';
        const directorLine = director ? `**Director:** ${director}` : '';
        const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
        const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
        const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
        const cleanContent = `# š¬ ${title}\n\n${metaParts}\n\n${directorLine ? directorLine + '\n' : ''}${castLine ? castLine + '\n' : ''}\n## Plot\n\n${description}`;
        return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1942
|
+
// ---------------------------------------------------------------------------
|
|
1943
|
+
// 17. LinkedIn extractor
|
|
1944
|
+
// ---------------------------------------------------------------------------
|
|
1945
|
+
/**
 * Builds a summary of a LinkedIn page (profile, company, job or other) from its
 * JSON-LD `Person`/`Organization` node and its meta tags.
 *
 * The page type is derived from the first path segment of `url`.
 *
 * @param {string} html - Raw HTML of the LinkedIn page.
 * @param {string} url - URL the page was fetched from.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when no name can be found or anything throws.
 */
async function linkedinExtractor(html, url) {
    try {
        const cheerio = await import('cheerio');
        const $ = cheerio.load(html);
        const readMeta = (selector) => $(selector).attr('content');
        // First JSON-LD script whose root is a Person or an Organization wins.
        let entity = null;
        $('script[type="application/ld+json"]').each((_, node) => {
            if (entity) {
                return;
            }
            const candidate = tryParseJson($(node).html() || '');
            const kind = candidate?.['@type'];
            if (kind === 'Person' || kind === 'Organization') {
                entity = candidate;
            }
        });
        const ogTitle = readMeta('meta[property="og:title"]') || '';
        const ogDescription = readMeta('meta[property="og:description"]') || '';
        const ogImage = readMeta('meta[property="og:image"]') || '';
        // Name: structured data first, otherwise the og:title without its site suffix.
        const name = entity?.name || ogTitle.replace(/ \| LinkedIn$/, '').trim() || '';
        if (!name) {
            return null;
        }
        // Headline: job title, else the part of the meta description before the first "|".
        const metaLead = readMeta('meta[name="description"]')?.split('|')?.[0]?.trim();
        const headline = entity?.jobTitle || metaLead || ogDescription || '';
        const description = entity?.description || ogDescription || '';
        // Page type comes from the first non-empty path segment.
        const [section] = new URL(url).pathname.split('/').filter(Boolean);
        const sectionToType = new Map([
            ['company', 'company'],
            ['in', 'profile'],
            ['jobs', 'job'],
        ]);
        const pageType = sectionToType.get(section) ?? 'page';
        // Location: a visible element is preferred over the structured address.
        const location = $('[class*="location"]').first().text().trim() ||
            entity?.address?.addressLocality || '';
        const structured = {
            name,
            headline,
            description,
            location,
            pageType,
            image: ogImage,
            url,
        };
        let typeLine = 'š';
        if (pageType === 'company') {
            typeLine = 'š¢';
        }
        else if (pageType === 'profile') {
            typeLine = 'š¤';
        }
        const headlineLine = headline ? `\n*${headline}*` : '';
        const locationLine = location ? `\nš ${location}` : '';
        const cleanContent = `# ${typeLine} ${name}${headlineLine}${locationLine}\n\n${description}`;
        return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1992
|
+
// ---------------------------------------------------------------------------
|
|
1993
|
+
// 18. PyPI extractor
|
|
1994
|
+
// ---------------------------------------------------------------------------
|
|
1995
|
+
/**
 * Builds a package summary for a PyPI project page from the PyPI JSON API.
 *
 * Recognises `/project/<name>/` and `/project/<name>/<version>/`. When the URL
 * carries a version segment, that release is requested first; if that lookup
 * fails or returns no `info`, the unversioned endpoint is used instead.
 *
 * NOTE(review): the glyph after `# ` in the heading looks mis-encoded in this
 * view of the file; it is reproduced as-is — confirm against the published file.
 *
 * @param {string} _html - Unused; all data comes from the JSON API.
 * @param {string} url - URL of the PyPI project page.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or null when the URL is not a parseable project URL,
 *   the API returns no `info`, or the request fails.
 */
async function pypiExtractor(_html, url) {
    let path;
    try {
        path = new URL(url).pathname;
    }
    catch {
        // Unparseable URL: report "no match" instead of rejecting.
        return null;
    }
    // Match /project/name or /project/name/version/
    const packageMatch = path.match(/\/project\/([^/]+)(?:\/([^/]+))?/);
    if (!packageMatch)
        return null;
    const [, packageName, version] = packageMatch;
    try {
        const apiBase = `https://pypi.org/pypi/${encodeURIComponent(packageName)}`;
        let data = null;
        if (version) {
            // A versioned page should describe that release, not the latest one.
            try {
                data = await fetchJson(`${apiBase}/${encodeURIComponent(version)}/json`);
            }
            catch {
                // The segment may not be a real version; fall back to the latest release below.
                data = null;
            }
        }
        if (!data?.info)
            data = await fetchJson(`${apiBase}/json`);
        if (!data?.info)
            return null;
        const info = data.info;
        const structured = {
            name: info.name,
            version: info.version,
            description: info.summary || '',
            author: info.author || '',
            authorEmail: info.author_email || '',
            license: info.license || 'N/A',
            homepage: info.home_page || info.project_url || null,
            projectUrls: info.project_urls || {},
            // Keywords are split on commas and/or whitespace.
            keywords: info.keywords ? info.keywords.split(/[,\s]+/).filter(Boolean) : [],
            requiresPython: info.requires_python || '',
            // Long lists are truncated: at most 20 dependencies and 10 classifiers.
            requiresDist: (info.requires_dist || []).slice(0, 20),
            classifiers: (info.classifiers || []).slice(0, 10),
        };
        const installCmd = `pip install ${info.name}`;
        const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
        const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
        const depsLine = structured.requiresDist.length
            ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
            : '';
        // Find project URLs
        const projectUrlLines = Object.entries(structured.projectUrls)
            .map(([label, u]) => `- **${label}:** ${u}`);
        const linksSection = projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : '';
        // Assembled line by line so the output does not depend on source indentation.
        const cleanContent = [
            `# š¦ ${info.name} ${info.version}`,
            '',
            info.summary || '',
            '',
            '```',
            installCmd,
            '```',
            '',
            `**Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}`,
            '',
            `${linksSection}${depsLine}`,
        ].join('\n');
        return { domain: 'pypi.org', type: 'package', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'PyPI API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
|
|
2053
|
+
// ---------------------------------------------------------------------------
|
|
2054
|
+
// 19. Dev.to extractor
|
|
2055
|
+
// ---------------------------------------------------------------------------
|
|
2056
|
+
/**
 * Extract a Dev.to article, preferring the Dev.to articles API and falling
 * back to scraping the supplied HTML.
 *
 * The API is tried first using the first two path segments of the URL
 * (`/username/article-slug`). Any failure on that path is deliberately
 * ignored so the HTML fallback can run. The HTML parser (cheerio) is only
 * loaded when the fallback is actually needed.
 *
 * @param {string} html - Page HTML, used only by the fallback path.
 * @param {string} url - The Dev.to page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or `null` when no title can be found or an
 *   unexpected error occurs.
 */
async function devtoExtractor(html, url) {
    try {
        // Try Dev.to article API if we can get the slug from the URL
        const urlObj = new URL(url);
        const pathParts = urlObj.pathname.split('/').filter(Boolean);
        // Dev.to article URL: /@username/article-slug-id or /username/article-slug-id
        const slug = pathParts.length >= 2
            ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
            : null;
        if (slug) {
            try {
                const apiUrl = `https://dev.to/api/articles/${slug}`;
                const apiData = await fetchJson(apiUrl);
                if (apiData?.title) {
                    // The API is inconsistent about tag fields: depending on the
                    // endpoint, `tag_list`/`tags` may each be an array or a
                    // comma-separated string. Accept either shape.
                    const toTagArray = (value) => {
                        if (Array.isArray(value))
                            return value.map((t) => String(t).trim()).filter(Boolean);
                        if (typeof value === 'string')
                            return value.split(',').map((t) => t.trim()).filter(Boolean);
                        return [];
                    };
                    const tagsFromList = toTagArray(apiData.tag_list);
                    const structured = {
                        title: apiData.title,
                        author: apiData.user?.name || '',
                        authorUsername: apiData.user?.username || '',
                        publishDate: apiData.published_at || '',
                        tags: tagsFromList.length ? tagsFromList : toTagArray(apiData.tags),
                        readingTime: apiData.reading_time_minutes ? `${apiData.reading_time_minutes} min read` : '',
                        reactions: apiData.public_reactions_count || 0,
                        comments: apiData.comments_count || 0,
                        description: apiData.description || '',
                        url: apiData.url || url,
                    };
                    const authorLine = structured.author ? `**Author:** ${structured.author} (@${structured.authorUsername})` : '';
                    const dateLine = structured.publishDate ? `**Published:** ${structured.publishDate.split('T')[0]}` : '';
                    const tagsLine = structured.tags.length ? `**Tags:** ${structured.tags.join(', ')}` : '';
                    const statsLine = `ā¤ļø ${structured.reactions} reactions | š¬ ${structured.comments} comments${structured.readingTime ? ` | ā±ļø ${structured.readingTime}` : ''}`;
                    const metaParts = [authorLine, dateLine, tagsLine, statsLine].filter(Boolean).join('\n');
                    // Use body_html if available for article content
                    let articleContent = '';
                    if (apiData.body_html) {
                        // Strip HTML tags for clean content
                        articleContent = stripHtml(apiData.body_html)
                            .replace(/\n{3,}/g, '\n\n')
                            .substring(0, 8000);
                    }
                    else if (apiData.body_markdown) {
                        articleContent = apiData.body_markdown.substring(0, 8000);
                    }
                    const cleanContent = `# ${structured.title}\n\n${metaParts}\n\n${articleContent || structured.description}`;
                    return { domain: 'dev.to', type: 'article', structured, cleanContent };
                }
            }
            catch { /* fall through to HTML */ }
        }
        // HTML fallback: load the parser only now, so a successful API lookup
        // never pays for (or depends on) parsing the page.
        const { load } = await import('cheerio');
        const $ = load(html);
        const title = $('meta[property="og:title"]').attr('content') ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        const author = $('meta[name="author"]').attr('content') ||
            $('[itemprop="name"]').first().text().trim() || '';
        const description = $('meta[property="og:description"]').attr('content') || '';
        const tags = [];
        $('a[data-no-instant][href*="/t/"]').each((_, el) => {
            const tag = $(el).text().trim().replace('#', '');
            if (tag)
                tags.push(tag);
        });
        // Article body
        let articleBody = '';
        const articleEl = $('article#article-body, .crayons-article__main, #article-body').first();
        if (articleEl.length) {
            const parts = [];
            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                const tag = el.name;
                const text = $(el).text().trim();
                if (!text || text.length < 3)
                    return;
                if (tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        const structured = {
            title, author, description, tags, url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        const tagsLine = tags.length ? `\n**Tags:** ${tags.join(', ')}` : '';
        const cleanContent = `# ${title}${authorLine}${tagsLine}\n\n${articleBody || description}`.substring(0, 10000);
        return { domain: 'dev.to', type: 'article', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
2155
|
+
// ---------------------------------------------------------------------------
|
|
2156
|
+
// 20. Craigslist extractor
|
|
2157
|
+
// ---------------------------------------------------------------------------
|
|
2158
|
+
/**
 * Extract either a single Craigslist posting or a page of search results from
 * the supplied HTML.
 *
 * The URL path decides which: paths shaped like `/d/<title>/<digits>.html` or
 * ending in a 10+ digit `.html` file are treated as an individual posting;
 * everything else is treated as a listing/search page.
 *
 * @param {string} html - Page HTML to parse.
 * @param {string} url - The Craigslist page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   A `listing` result for a posting, a `search` result for a results page, or
 *   `null` when no title/listings are found or any error is thrown.
 */
async function craigslistExtractor(html, url) {
    try {
        const { load: parseHtml } = await import('cheerio');
        const $ = parseHtml(html);
        const { pathname } = new URL(url);
        // Small helper: trimmed text content of a selection.
        const trimmedText = (selection) => selection.text().trim();

        // Individual post: /xxx/yyy/d/title/12345678.html
        const postPatterns = [/\/d\/[^/]+\/\d+\.html/, /\/\d{10,}\.html/];
        const looksLikePost = postPatterns.some((pattern) => pattern.test(pathname));

        if (!looksLikePost) {
            // Listing page (search results)
            const pageTitle = trimmedText($('title')) ||
                $('meta[property="og:title"]').attr('content') ||
                'Craigslist Listings';
            const listings = [];
            $('.result-row, li.cl-static-search-result, .cl-search-result').each((_, el) => {
                const row = $(el);
                const link = row.find('a.titlestring, a[class*="title"], .result-title').first();
                const postTitle = trimmedText(link);
                const postUrl = link.attr('href') || '';
                const postPrice = trimmedText(row.find('.result-price, [class*="price"]').first());
                const postHood = trimmedText(row.find('.result-hood, [class*="hood"]').first()).replace(/[()]/g, '');
                if (!postTitle)
                    return;
                listings.push({ title: postTitle, url: postUrl, price: postPrice, location: postHood });
            });
            if (listings.length === 0)
                return null;
            // Only the first 20 results are rendered; `structured` keeps them all.
            const renderedRows = listings.slice(0, 20).map((l, i) => {
                const pricePart = l.price ? ` ā ${l.price}` : '';
                const locationPart = l.location ? ` (${l.location})` : '';
                const urlPart = l.url ? `\n ${l.url}` : '';
                return `${i + 1}. **${l.title}**${pricePart}${locationPart}${urlPart}`;
            });
            return {
                domain: 'craigslist.org',
                type: 'search',
                structured: { pageTitle, listings, url },
                cleanContent: `# š ${pageTitle}\n\n${renderedRows.join('\n\n')}`,
            };
        }

        // --- Individual posting ---
        const title = trimmedText($('#titletextonly')) ||
            trimmedText($('span#titletextonly')) ||
            $('meta[property="og:title"]').attr('content') ||
            trimmedText($('h2.postingtitle')) ||
            '';
        if (!title)
            return null;
        const price = trimmedText($('.price').first()) ||
            trimmedText($('[class*="price"]').first()) ||
            '';
        const location = trimmedText($('.postingtitletext small')).replace(/[()]/g, '') ||
            $('#map').attr('data-address') ||
            '';
        // First non-empty datetime attribute among the known date elements.
        const postDate = ['#display-date time', 'time.date', 'p.postinginfo time']
            .map((selector) => $(selector).first().attr('datetime'))
            .find(Boolean) || '';

        // Body text: drop print/QR helper nodes before reading the text.
        const postingBody = $('#postingbody');
        postingBody.find('.print-information, .QR-code').remove();
        let bodyText = trimmedText(postingBody);
        bodyText = bodyText.replace(/QR Code Link to This Post/, '');
        bodyText = bodyText.replace(/\n{3,}/g, '\n\n').trim();

        // Images: slideshow images first, then thumbnail ids, de-duplicated.
        const images = [];
        for (const selector of ['img.slide', 'img[id^="ii"]']) {
            $(selector).each((_, el) => {
                const src = $(el).attr('src') || '';
                if (src && !images.includes(src))
                    images.push(src);
            });
        }

        // Attributes: only spans of the exact form "key: value" are kept.
        const attrs = {};
        $('.attrgroup span').each((_, el) => {
            const pieces = trimmedText($(el)).split(':');
            if (pieces.length !== 2)
                return;
            const [rawKey, rawValue] = pieces;
            attrs[rawKey.trim()] = rawValue.trim();
        });

        const structured = {
            title, price, location, postDate,
            bodyText, images, attributes: attrs, url,
        };

        const attrEntries = Object.entries(attrs);
        const headerParts = [
            `# š ${title}`,
            price ? `\n**Price:** ${price}` : '',
            location ? `\n**Location:** ${location}` : '',
            postDate ? `\n**Posted:** ${postDate.split('T')[0]}` : '',
            attrEntries.length
                ? `\n\n## Details\n\n${attrEntries.map(([k, v]) => `- **${k}:** ${v}`).join('\n')}`
                : '',
            images.length ? `\n\nš· ${images.length} image${images.length > 1 ? 's' : ''}` : '',
        ];
        const cleanContent = `${headerParts.join('')}\n\n${bodyText.substring(0, 3000)}`;
        return { domain: 'craigslist.org', type: 'listing', structured, cleanContent };
    }
    catch {
        return null;
    }
}
|
|
1465
2247
|
//# sourceMappingURL=domain-extractors.js.map
|