webpeel 0.17.13 → 0.17.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -71,6 +71,15 @@ const REGISTRY = [
71
71
  { match: (h) => h === 'www.npmjs.com' || h === 'npmjs.com', extractor: npmExtractor },
72
72
  { match: (h) => h === 'www.bestbuy.com' || h === 'bestbuy.com', extractor: bestBuyExtractor },
73
73
  { match: (h) => h === 'www.walmart.com' || h === 'walmart.com', extractor: walmartExtractor },
74
+ { match: (h) => h === 'www.amazon.com' || h === 'amazon.com', extractor: amazonExtractor },
75
+ { match: (h) => h === 'medium.com' || h === 'www.medium.com' || h.endsWith('.medium.com'), extractor: mediumExtractor },
76
+ { match: (h) => h.endsWith('.substack.com'), extractor: substackExtractor },
77
+ { match: (h) => h === 'www.allrecipes.com' || h === 'allrecipes.com', extractor: allrecipesExtractor },
78
+ { match: (h) => h === 'www.imdb.com' || h === 'imdb.com', extractor: imdbExtractor },
79
+ { match: (h) => h === 'www.linkedin.com' || h === 'linkedin.com', extractor: linkedinExtractor },
80
+ { match: (h) => h === 'pypi.org' || h === 'www.pypi.org', extractor: pypiExtractor },
81
+ { match: (h) => h === 'dev.to' || h === 'www.dev.to', extractor: devtoExtractor },
82
+ { match: (h) => h === 'craigslist.org' || h === 'www.craigslist.org' || h.endsWith('.craigslist.org'), extractor: craigslistExtractor },
74
83
  ];
75
84
  /**
76
85
  * Returns the domain extractor for a URL, or null if none matches.
@@ -1098,32 +1107,69 @@ async function youtubeExtractor(_html, url) {
1098
1107
  const title = transcript.title || oembedData?.title || '';
1099
1108
  const channel = transcript.channel || oembedData?.author_name || '';
1100
1109
  const channelUrl = oembedData?.author_url || `https://www.youtube.com/@${channel}`;
1101
- const description = noembedData?.description || oembedData?.description || '';
1110
+ const description = transcript.description || noembedData?.description || oembedData?.description || '';
1102
1111
  const thumbnailUrl = oembedData?.thumbnail_url || '';
1112
+ const publishDate = transcript.publishDate || '';
1113
+ const hasTranscript = transcript.segments.length > 0;
1103
1114
  const structured = {
1104
1115
  title,
1105
1116
  channel,
1106
1117
  channelUrl,
1107
1118
  duration: transcript.duration,
1119
+ publishDate,
1108
1120
  language: transcript.language,
1109
1121
  availableLanguages: transcript.availableLanguages,
1110
1122
  transcriptSegments: transcript.segments.length,
1123
+ wordCount: transcript.wordCount ?? 0,
1111
1124
  description,
1112
1125
  thumbnailUrl,
1126
+ chapters: transcript.chapters ?? [],
1127
+ keyPoints: transcript.keyPoints ?? [],
1113
1128
  source: 'transcript',
1114
1129
  };
1115
- const availLangs = transcript.availableLanguages.length;
1116
- const langLine = `${transcript.language}${availLangs > 1 ? ` (${availLangs} available)` : ''}`;
1117
- const hasTranscript = transcript.segments.length > 0;
1118
- let cleanContent;
1119
- if (hasTranscript) {
1120
- const descSection = description ? `\n\n**Description:** ${description}` : '';
1121
- cleanContent = `## šŸŽ¬ ${title}\n\n**Channel:** [${channel}](${channelUrl})\n**Duration:** ${transcript.duration}\n**Language:** ${langLine}${descSection}\n\n### Transcript\n\n${transcript.fullText}`;
1130
+ // Format the publish date nicely if it's an ISO date
1131
+ let publishStr = '';
1132
+ if (publishDate) {
1133
+ try {
1134
+ const d = new Date(publishDate);
1135
+ publishStr = d.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
1136
+ }
1137
+ catch {
1138
+ publishStr = publishDate;
1139
+ }
1122
1140
  }
1123
- else {
1124
- // No transcript — use description as content
1125
- cleanContent = `## šŸŽ¬ ${title}\n\n**Channel:** [${channel}](${channelUrl})\n**Duration:** ${transcript.duration}\n\n### Description\n\n${transcript.fullText}`;
1141
+ // Build header line
1142
+ const headerParts = [`**Channel:** ${channel}`];
1143
+ if (transcript.duration && transcript.duration !== '0:00')
1144
+ headerParts.push(`**Duration:** ${transcript.duration}`);
1145
+ if (publishStr)
1146
+ headerParts.push(`**Published:** ${publishStr}`);
1147
+ const headerLine = headerParts.join(' | ');
1148
+ const parts = [];
1149
+ parts.push(`# ${title}`);
1150
+ parts.push(headerLine);
1151
+ // Summary section
1152
+ if (transcript.summary && hasTranscript) {
1153
+ parts.push(`## Summary\n\n${transcript.summary}`);
1154
+ }
1155
+ else if (!hasTranscript && transcript.fullText) {
1156
+ parts.push(`## Description\n\n${transcript.fullText}`);
1157
+ }
1158
+ // Key Points section
1159
+ if (transcript.keyPoints && transcript.keyPoints.length > 0) {
1160
+ const kpLines = transcript.keyPoints.map(kp => `- ${kp}`).join('\n');
1161
+ parts.push(`## Key Points\n\n${kpLines}`);
1162
+ }
1163
+ // Chapters section
1164
+ if (transcript.chapters && transcript.chapters.length > 0) {
1165
+ const chLines = transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n');
1166
+ parts.push(`## Chapters\n\n${chLines}`);
1167
+ }
1168
+ // Full Transcript section (only if we have real transcript segments)
1169
+ if (hasTranscript) {
1170
+ parts.push(`## Full Transcript\n\n${transcript.fullText}`);
1126
1171
  }
1172
+ const cleanContent = parts.join('\n\n');
1127
1173
  return { domain: 'youtube.com', type: 'video', structured, cleanContent };
1128
1174
  }
1129
1175
  // Fall back to oEmbed if transcript failed
@@ -1462,4 +1508,740 @@ async function walmartExtractor(_html, url) {
1462
1508
  return null; // API not accessible, fall through to other methods
1463
1509
  }
1464
1510
  }
1511
+ // ---------------------------------------------------------------------------
1512
+ // 12. Amazon Products extractor
1513
+ // ---------------------------------------------------------------------------
1514
/**
 * Extracts product details from an Amazon product page.
 *
 * Values are taken from a JSON-LD `Product` block when one is present, then
 * from well-known element ids/classes, and finally from Open Graph meta tags.
 *
 * @param {string} html - Raw HTML of the product page.
 * @param {string} url - Page URL; the ASIN is derived from its path.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no title can be found or extraction throws.
 */
async function amazonExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // Use the first JSON-LD block that describes a Product.
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (parsed?.['@type'] === 'Product')
                jsonLdData = parsed;
        });
        // Meta tag fallbacks
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        const ogImage = $('meta[property="og:image"]').attr('content') || '';
        const title = jsonLdData?.name ||
            $('#productTitle').text().trim() ||
            $('#title').text().trim() ||
            ogTitle;
        if (!title)
            return null;
        const price = $('#priceblock_ourprice').text().trim() ||
            $('.a-price .a-offscreen').first().text().trim() ||
            $('[data-asin-price]').first().attr('data-asin-price') || '';
        const rating = jsonLdData?.aggregateRating?.ratingValue ||
            $('#acrPopover .a-size-base.a-color-base').first().text().trim() ||
            $('span[data-hook="rating-out-of-text"]').text().trim() || '';
        const reviewCount = jsonLdData?.aggregateRating?.reviewCount ||
            $('#acrCustomerReviewText').text().replace(/[^0-9,]/g, '').trim() || '';
        // `offers` may be a single Offer or a list of them; schema.org URLs are
        // published with both http:// and https:// prefixes.
        const offer = Array.isArray(jsonLdData?.offers) ? jsonLdData.offers[0] : jsonLdData?.offers;
        const ldAvailability = typeof offer?.availability === 'string'
            ? offer.availability.replace(/^https?:\/\/schema\.org\//, '')
            : '';
        const availability = ldAvailability ||
            $('#availability span').first().text().trim() || '';
        const description = jsonLdData?.description ||
            $('#feature-bullets .a-list-item').map((_, el) => $(el).text().trim()).get().join('\n') ||
            $('#productDescription p').text().trim() ||
            ogDescription;
        const features = [];
        $('#feature-bullets li').each((_, el) => {
            const text = $(el).text().trim();
            if (text && !text.includes('Make sure this fits'))
                features.push(text);
        });
        // ASIN from URL: product pages are served under /dp/, /gp/product/ and
        // /gp/aw/d/ (mobile), not only /dp/.
        const asinMatch = url.match(/\/(?:dp|gp\/product|gp\/aw\/d)\/([A-Z0-9]{10})/i);
        const asin = asinMatch?.[1] || '';
        const structured = {
            title,
            price,
            rating,
            reviewCount,
            availability,
            description,
            features,
            asin,
            image: ogImage,
            url,
        };
        const ratingLine = rating ? `\n**Rating:** ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '';
        const priceLine = price ? `\n**Price:** ${price}` : '';
        const availLine = availability ? `\n**Availability:** ${availability}` : '';
        const featuresSection = features.length
            ? `\n\n## Features\n\n${features.map((f) => `- ${f}`).join('\n')}`
            : '';
        // Cap the description so a long marketing blurb does not dominate the output.
        const descSection = description ? `\n\n## Description\n\n${String(description).substring(0, 1000)}` : '';
        const cleanContent = `# 🛒 ${title}${priceLine}${ratingLine}${availLine}${descSection}${featuresSection}`;
        return { domain: 'amazon.com', type: 'product', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1588
+ // ---------------------------------------------------------------------------
1589
+ // 13. Medium Articles extractor
1590
+ // ---------------------------------------------------------------------------
1591
/**
 * Extracts an article from a Medium page.
 *
 * Metadata comes from a JSON-LD `NewsArticle`/`Article` block when present,
 * with meta-tag and DOM fallbacks. The body is rebuilt as Markdown from the
 * first `<article>` element.
 *
 * @param {string} html - Raw HTML of the article page.
 * @param {string} url - Page URL, echoed into the structured result.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no title can be found or extraction throws.
 */
async function mediumExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // JSON-LD
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            const ldType = parsed?.['@type'];
            if (ldType === 'NewsArticle' || ldType === 'Article')
                jsonLdData = parsed;
        });
        const title = jsonLdData?.headline ||
            $('meta[property="og:title"]').attr('content') ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        // JSON-LD `author` may be a single Person or a list of them.
        const ldAuthor = Array.isArray(jsonLdData?.author) ? jsonLdData.author[0] : jsonLdData?.author;
        const author = ldAuthor?.name ||
            $('meta[name="author"]').attr('content') ||
            $('[data-testid="authorName"]').text().trim() ||
            $('a[rel="author"]').first().text().trim() || '';
        const publishDate = jsonLdData?.datePublished ||
            $('meta[property="article:published_time"]').attr('content') || '';
        const readingTime = $('[data-testid="storyReadTime"]').text().trim() ||
            $('span').filter((_, el) => $(el).text().includes('min read')).first().text().trim() || '';
        const description = jsonLdData?.description ||
            $('meta[property="og:description"]').attr('content') || '';
        // Extract article body — Medium puts content in <article> or section
        let articleBody = '';
        const articleEl = $('article').first();
        if (articleEl.length) {
            // Remove nav, aside, buttons
            articleEl.find('nav, aside, button, [data-testid="navbar"]').remove();
            const parts = [];
            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                const node = $(el);
                // A <p> inside a <blockquote>/<li> is already covered by the
                // container's text; emitting both would duplicate the content.
                if (node.parents('blockquote, pre, li').length > 0)
                    return;
                const tag = el.name;
                const text = node.text().trim();
                if (!text || text.length < 5)
                    return;
                if (tag === 'h1' || tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        // Fallback to og:description if no body
        const contentBody = articleBody || description;
        const structured = {
            title,
            author,
            publishDate,
            readingTime,
            description,
            url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        // Keep only the date part of an ISO timestamp.
        const dateLine = publishDate ? `\n**Published:** ${String(publishDate).split('T')[0]}` : '';
        const timeLine = readingTime ? `\n**Reading time:** ${readingTime}` : '';
        const cleanContent = `# ${title}${authorLine}${dateLine}${timeLine}\n\n${String(contentBody).substring(0, 8000)}`;
        return { domain: 'medium.com', type: 'article', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1666
+ // ---------------------------------------------------------------------------
1667
+ // 14. Substack Posts extractor
1668
+ // ---------------------------------------------------------------------------
1669
/**
 * Extracts a post from a Substack publication page.
 *
 * Metadata comes from a JSON-LD `NewsArticle`/`Article` block when present,
 * with meta-tag and DOM fallbacks. The body is rebuilt as Markdown from the
 * first of `.body.markup`, `.post-content` or `<article>`.
 *
 * @param {string} html - Raw HTML of the post page.
 * @param {string} url - Page URL; its hostname is the last-resort publication name.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no title can be found or extraction throws.
 */
async function substackExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // JSON-LD
        let jsonLdData = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLdData)
                return;
            const parsed = tryParseJson($(el).html() || '');
            const ldType = parsed?.['@type'];
            if (ldType === 'NewsArticle' || ldType === 'Article')
                jsonLdData = parsed;
        });
        const title = jsonLdData?.headline ||
            $('meta[property="og:title"]').attr('content') ||
            $('h1.post-title').first().text().trim() ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        // JSON-LD `author` may be a single Person or a list of them.
        const ldAuthor = Array.isArray(jsonLdData?.author) ? jsonLdData.author[0] : jsonLdData?.author;
        const author = ldAuthor?.name ||
            $('meta[name="author"]').attr('content') ||
            $('a.author-name').first().text().trim() ||
            $('[class*="author"]').first().text().trim() || '';
        const publishDate = jsonLdData?.datePublished ||
            $('meta[property="article:published_time"]').attr('content') ||
            $('time').first().attr('datetime') || '';
        const publication = $('meta[property="og:site_name"]').attr('content') ||
            $('a.navbar-title-link').text().trim() ||
            new URL(url).hostname.replace('.substack.com', '');
        const description = jsonLdData?.description ||
            $('meta[property="og:description"]').attr('content') || '';
        // Article content
        let articleBody = '';
        const postContent = $('.body.markup, .post-content, article').first();
        if (postContent.length) {
            postContent.find('script, style, nav, .paywall, .subscribe-widget').remove();
            const parts = [];
            postContent.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                const node = $(el);
                // A <p> inside a <blockquote>/<li> is already covered by the
                // container's text; emitting both would duplicate the content.
                if (node.parents('blockquote, pre, li').length > 0)
                    return;
                const tag = el.name;
                const text = node.text().trim();
                if (!text || text.length < 3)
                    return;
                if (tag === 'h1' || tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        // Fall back to the short description when no body could be rebuilt.
        const contentBody = articleBody || description;
        const structured = {
            title,
            author,
            publication,
            publishDate,
            description,
            url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        const pubLine = publication ? `\n**Publication:** ${publication}` : '';
        // Keep only the date part of an ISO timestamp.
        const dateLine = publishDate ? `\n**Published:** ${String(publishDate).split('T')[0]}` : '';
        const cleanContent = `# ${title}${authorLine}${pubLine}${dateLine}\n\n${String(contentBody).substring(0, 8000)}`;
        return { domain: 'substack.com', type: 'post', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1743
+ // ---------------------------------------------------------------------------
1744
+ // 15. Allrecipes (Recipe Sites) extractor
1745
+ // ---------------------------------------------------------------------------
1746
/**
 * Extracts a recipe from an Allrecipes page.
 *
 * A schema.org `Recipe` in JSON-LD (top level, in an array, or inside
 * `@graph`) is preferred; otherwise ingredients and steps are scraped from
 * elements whose class names mention "ingredient", "instruction" or "step".
 *
 * @param {string} html - Raw HTML of the recipe page.
 * @param {string} url - Page URL, echoed into the structured result.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no title can be found or extraction throws.
 */
async function allrecipesExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        // `@type` may be a string or a list such as ["Recipe", "NewsArticle"].
        const isRecipe = (node) => {
            const ldType = node?.['@type'];
            return ldType === 'Recipe' || (Array.isArray(ldType) && ldType.includes('Recipe'));
        };
        // Try Schema.org Recipe JSON-LD first
        let recipe = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (recipe)
                return;
            const parsed = tryParseJson($(el).html() || '');
            // Can be an array or direct object
            const candidates = Array.isArray(parsed) ? parsed : [parsed];
            for (const item of candidates) {
                if (isRecipe(item)) {
                    recipe = item;
                    break;
                }
                // Sometimes it's nested in @graph
                const graph = item?.['@graph'];
                if (Array.isArray(graph)) {
                    const graphRecipe = graph.find(isRecipe);
                    if (graphRecipe) {
                        recipe = graphRecipe;
                        break;
                    }
                }
            }
        });
        // Parse ISO 8601 duration (PT30M, PT1H30M); zero-valued parts are dropped.
        const parseDuration = (d) => {
            if (!d)
                return '';
            const raw = String(d);
            const h = raw.match(/(\d+)H/)?.[1];
            const m = raw.match(/(\d+)M/)?.[1];
            return [
                h && Number(h) > 0 ? `${Number(h)}h` : '',
                m && Number(m) > 0 ? `${Number(m)}m` : '',
            ].filter(Boolean).join(' ');
        };
        let title;
        let ingredients = [];
        const instructions = [];
        let prepTime = '';
        let cookTime = '';
        let totalTime = '';
        let servings = '';
        let rating = '';
        let reviewCount = '';
        let description = '';
        if (recipe) {
            title = recipe.name || '';
            description = recipe.description || '';
            const rawIngredients = recipe.recipeIngredient ?? [];
            ingredients = (Array.isArray(rawIngredients) ? rawIngredients : [rawIngredients])
                .map((i) => String(i).trim())
                .filter(Boolean);
            const pushStep = (text) => {
                const cleaned = String(text).trim();
                if (cleaned)
                    instructions.push(cleaned);
            };
            // Instructions can be one text blob, a list of strings, HowToStep
            // objects, or HowToSection objects wrapping further steps.
            const rawInstructions = recipe.recipeInstructions ?? [];
            const steps = typeof rawInstructions === 'string'
                ? rawInstructions.split(/\r?\n+/)
                : (Array.isArray(rawInstructions) ? rawInstructions : [rawInstructions]);
            for (const step of steps) {
                if (typeof step === 'string')
                    pushStep(step);
                else if (step?.text)
                    pushStep(step.text);
                else if (step?.['@type'] === 'HowToSection' && Array.isArray(step.itemListElement)) {
                    for (const s of step.itemListElement) {
                        if (s?.text)
                            pushStep(s.text);
                    }
                }
            }
            prepTime = parseDuration(recipe.prepTime || '');
            cookTime = parseDuration(recipe.cookTime || '');
            totalTime = parseDuration(recipe.totalTime || '');
            // recipeYield is often a list like ["4", "4 servings"]; keep the
            // most descriptive (longest) entry instead of a comma-joined string.
            const yieldValue = recipe.recipeYield;
            servings = Array.isArray(yieldValue)
                ? yieldValue.map(String).reduce((best, cur) => (cur.length > best.length ? cur : best), '')
                : String(yieldValue || '');
            rating = recipe.aggregateRating?.ratingValue ? String(recipe.aggregateRating.ratingValue) : '';
            reviewCount = recipe.aggregateRating?.reviewCount ? String(recipe.aggregateRating.reviewCount) : '';
        }
        else {
            // HTML fallback
            title = $('h1').first().text().trim() ||
                $('meta[property="og:title"]').attr('content') || '';
            description = $('meta[property="og:description"]').attr('content') || '';
            $('[class*="ingredient"]').each((_, el) => {
                const text = $(el).text().trim();
                // Long matches are container elements, not single ingredients.
                if (text && text.length < 200)
                    ingredients.push(text);
            });
            $('[class*="instruction"] li, [class*="step"] li').each((_, el) => {
                const text = $(el).text().trim();
                if (text)
                    instructions.push(text);
            });
        }
        if (!title)
            return null;
        const structured = {
            title, description, ingredients, instructions,
            prepTime, cookTime, totalTime, servings, rating, reviewCount, url,
        };
        const timeParts = [
            prepTime ? `Prep: ${prepTime}` : '',
            cookTime ? `Cook: ${cookTime}` : '',
            totalTime ? `Total: ${totalTime}` : '',
        ].filter(Boolean).join(' | ');
        const metaLine = [
            timeParts,
            servings ? `Servings: ${servings}` : '',
            rating ? `Rating: ${rating}${reviewCount ? ` (${reviewCount} reviews)` : ''}` : '',
        ].filter(Boolean).join(' | ');
        const ingredientsMd = ingredients.length
            ? `## Ingredients\n\n${ingredients.map((i) => `- ${i}`).join('\n')}`
            : '';
        const instructionsMd = instructions.length
            ? `## Instructions\n\n${instructions.map((s, i) => `${i + 1}. ${s}`).join('\n')}`
            : '';
        const cleanContent = `# 🍽️ ${title}\n\n${metaLine ? `*${metaLine}*\n\n` : ''}${description ? description + '\n\n' : ''}${ingredientsMd}\n\n${instructionsMd}`.trim();
        return { domain: 'allrecipes.com', type: 'recipe', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1862
+ // ---------------------------------------------------------------------------
1863
+ // 16. IMDB extractor
1864
+ // ---------------------------------------------------------------------------
1865
/**
 * Extracts title metadata from an IMDb page.
 *
 * A JSON-LD `Movie`, `TVSeries` or `TVEpisode` block is preferred; meta tags
 * and `data-testid` selectors are used as fallbacks.
 *
 * @param {string} html - Raw HTML of the title page.
 * @param {string} url - Page URL, echoed into the structured result.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`
 *   where `type` is 'tv_show' for a TVSeries and 'movie' otherwise, or null
 *   when no title can be found or extraction throws.
 */
async function imdbExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const WANTED_TYPES = ['Movie', 'TVSeries', 'TVEpisode'];
        let jsonLd = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLd)
                return;
            const parsed = tryParseJson($(el).html() || '');
            if (WANTED_TYPES.includes(parsed?.['@type']))
                jsonLd = parsed;
        });
        const title = jsonLd?.name ||
            $('meta[property="og:title"]').attr('content')?.replace(/ - IMDb$/, '') ||
            $('h1[data-testid="hero__pageTitle"] span').first().text().trim() || '';
        if (!title)
            return null;
        const description = jsonLd?.description ||
            $('meta[property="og:description"]').attr('content') ||
            $('p[data-testid="plot"]').text().trim() || '';
        const year = jsonLd?.datePublished?.substring(0, 4) ||
            $('a[href*="releaseinfo"]').first().text().trim() || '';
        const ratingValue = jsonLd?.aggregateRating?.ratingValue ||
            $('[data-testid="hero-rating-bar__aggregate-rating__score"] span').first().text().trim() || '';
        const ratingCount = jsonLd?.aggregateRating?.ratingCount || '';
        const contentType = jsonLd?.['@type'] || 'Movie';
        // Genres: copy so the DOM fallback never mutates the parsed JSON-LD.
        const genres = jsonLd?.genre
            ? [...(Array.isArray(jsonLd.genre) ? jsonLd.genre : [jsonLd.genre])]
            : [];
        if (!genres.length) {
            $('[data-testid="genres"] a, a[href*="/search/title?genres"]').each((_, el) => {
                const g = $(el).text().trim();
                if (g && !genres.includes(g))
                    genres.push(g);
            });
        }
        // People are listed as Person objects or plain strings; null entries
        // and entries without a name are skipped rather than crashing or
        // rendering as "[object Object]".
        const asList = (value) => (Array.isArray(value) ? value : [value]);
        const personName = (entry) => (typeof entry === 'string' ? entry : entry?.name) || '';
        // Director
        const director = jsonLd?.director
            ? asList(jsonLd.director).map(personName).filter(Boolean).join(', ')
            : $('a[href*="/name/"][class*="ipc-metadata-list-item__list-content-item"]').first().text().trim() || '';
        // Cast (top few from JSON-LD actor field)
        const cast = jsonLd?.actor
            ? asList(jsonLd.actor).map(personName).filter(Boolean).slice(0, 6)
            : [];
        // Runtime: ISO 8601 duration such as PT2H22M; unknown shapes pass through.
        let runtime = '';
        if (jsonLd?.duration) {
            const rawDuration = String(jsonLd.duration);
            const m = rawDuration.match(/PT(?:(\d+)H)?(?:(\d+)M)?/);
            runtime = m
                ? [m[1] ? `${m[1]}h` : '', m[2] ? `${m[2]}m` : ''].filter(Boolean).join(' ')
                : rawDuration;
        }
        const structured = {
            title, year, contentType, description, ratingValue, ratingCount,
            genres, director, cast, runtime, url,
        };
        const ratingLine = ratingValue ? `⭐ ${ratingValue}/10${ratingCount ? ` (${Number(ratingCount).toLocaleString()} votes)` : ''}` : '';
        const genreLine = genres.length ? genres.join(', ') : '';
        const directorLine = director ? `**Director:** ${director}` : '';
        const castLine = cast.length ? `**Cast:** ${cast.join(', ')}` : '';
        const runtimeLine = runtime ? `**Runtime:** ${runtime}` : '';
        const metaParts = [ratingLine, genreLine, runtimeLine, year ? `**Year:** ${year}` : ''].filter(Boolean).join(' | ');
        const creditsBlock = [directorLine, castLine].filter(Boolean).join('\n');
        // Only emit sections that have content, so missing data does not leave
        // an empty "## Plot" heading or runs of blank lines.
        const cleanContent = [
            `# 🎬 ${title}`,
            metaParts,
            creditsBlock,
            description ? `## Plot\n\n${description}` : '',
        ].filter(Boolean).join('\n\n');
        return { domain: 'imdb.com', type: contentType === 'TVSeries' ? 'tv_show' : 'movie', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1942
+ // ---------------------------------------------------------------------------
1943
+ // 17. LinkedIn extractor
1944
+ // ---------------------------------------------------------------------------
1945
/**
 * Extracts basic profile/company information from a LinkedIn page.
 *
 * Reads a JSON-LD `Person`/`Organization` block when present and falls back
 * to Open Graph and description meta tags. The page type is inferred from the
 * first URL path segment (`company`, `in`, `jobs`).
 *
 * @param {string} html - Raw HTML of the LinkedIn page.
 * @param {string} url - Page URL; must be absolute because it is parsed with `new URL`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no name can be found or extraction throws.
 */
async function linkedinExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        let jsonLd = null;
        $('script[type="application/ld+json"]').each((_, el) => {
            if (jsonLd)
                return;
            const parsed = tryParseJson($(el).html() || '');
            const ldType = parsed?.['@type'];
            if (ldType === 'Person' || ldType === 'Organization')
                jsonLd = parsed;
        });
        const ogTitle = $('meta[property="og:title"]').attr('content') || '';
        const ogDescription = $('meta[property="og:description"]').attr('content') || '';
        const ogImage = $('meta[property="og:image"]').attr('content') || '';
        const name = jsonLd?.name || ogTitle.replace(/ \| LinkedIn$/, '').trim() || '';
        if (!name)
            return null;
        const headline = jsonLd?.jobTitle ||
            $('meta[name="description"]').attr('content')?.split('|')?.[0]?.trim() ||
            ogDescription || '';
        const description = jsonLd?.description || ogDescription || '';
        // Try to detect page type from URL
        const [firstSegment] = new URL(url).pathname.split('/').filter(Boolean);
        const PAGE_TYPES = { company: 'company', in: 'profile', jobs: 'job' };
        const pageType = Object.hasOwn(PAGE_TYPES, firstSegment ?? '') ? PAGE_TYPES[firstSegment] : 'page';
        // Extract any visible structured info from the HTML
        const location = $('[class*="location"]').first().text().trim() ||
            jsonLd?.address?.addressLocality || '';
        const structured = {
            name, headline, description, location, pageType,
            image: ogImage, url,
        };
        const typeLine = pageType === 'company' ? '🏢' : pageType === 'profile' ? '👤' : '🔗';
        const locationLine = location ? `\n📍 ${location}` : '';
        // When headline and description both fell back to the same text, print
        // it once (as the body) instead of twice.
        const headlineLine = headline && headline !== description ? `\n*${headline}*` : '';
        const cleanContent = `# ${typeLine} ${name}${headlineLine}${locationLine}\n\n${description}`;
        return { domain: 'linkedin.com', type: pageType, structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
1992
+ // ---------------------------------------------------------------------------
1993
+ // 18. PyPI extractor
1994
+ // ---------------------------------------------------------------------------
1995
/**
 * Extracts package metadata for a PyPI project page via the PyPI JSON API.
 *
 * Handles `/project/<name>/` and `/project/<name>/<version>/`. When the URL
 * pins a version, that release's metadata is requested first and the latest
 * release is used as a fallback.
 *
 * @param {string} _html - Unused; all data comes from the JSON API.
 * @param {string} url - URL of the PyPI project page.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when the URL is not a project page or the API call fails.
 */
async function pypiExtractor(_html, url) {
    try {
        // Parsed inside the try so a malformed URL yields null like every
        // other failure instead of throwing to the caller.
        const path = new URL(url).pathname;
        // Match /project/name or /project/name/version/
        const packageMatch = path.match(/\/project\/([^/]+)(?:\/([^/]+))?/);
        if (!packageMatch)
            return null;
        const packageName = packageMatch[1];
        const requestedVersion = packageMatch[2] || '';
        const baseUrl = `https://pypi.org/pypi/${encodeURIComponent(packageName)}`;
        let data = null;
        if (requestedVersion) {
            try {
                data = await fetchJson(`${baseUrl}/${encodeURIComponent(requestedVersion)}/json`);
            }
            catch {
                // Unknown release: fall back to the latest one below.
                data = null;
            }
        }
        if (!data?.info)
            data = await fetchJson(`${baseUrl}/json`);
        if (!data?.info)
            return null;
        const info = data.info;
        const structured = {
            name: info.name,
            version: info.version,
            description: info.summary || '',
            author: info.author || '',
            authorEmail: info.author_email || '',
            license: info.license || 'N/A',
            homepage: info.home_page || info.project_url || null,
            projectUrls: info.project_urls || {},
            keywords: info.keywords ? info.keywords.split(/[,\s]+/).filter(Boolean) : [],
            requiresPython: info.requires_python || '',
            // Capped to keep the structured payload small.
            requiresDist: (info.requires_dist || []).slice(0, 20),
            classifiers: (info.classifiers || []).slice(0, 10),
        };
        const installCmd = `pip install ${info.name}`;
        const keywordsLine = structured.keywords.length ? `\n**Keywords:** ${structured.keywords.join(', ')}` : '';
        const pyVersionLine = structured.requiresPython ? `\n**Requires Python:** ${structured.requiresPython}` : '';
        const depsLine = structured.requiresDist.length
            ? `\n\n## Dependencies\n\n${structured.requiresDist.map((d) => `- ${d}`).join('\n')}`
            : '';
        const projectUrlLines = Object.entries(structured.projectUrls)
            .map(([label, u]) => `- **${label}:** ${u}`);
        const linksSection = projectUrlLines.length ? `## Links\n\n${projectUrlLines.join('\n')}\n` : '';
        const cleanContent = [
            `# 📦 ${info.name} ${info.version}`,
            '',
            info.summary || '',
            '',
            '```',
            installCmd,
            '```',
            '',
            `**Author:** ${info.author || 'N/A'} | **License:** ${info.license || 'N/A'}${keywordsLine}${pyVersionLine}`,
            '',
            `${linksSection}${depsLine}`,
        ].join('\n');
        return { domain: 'pypi.org', type: 'package', structured, cleanContent };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'PyPI API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
2053
+ // ---------------------------------------------------------------------------
2054
+ // 19. Dev.to extractor
2055
+ // ---------------------------------------------------------------------------
2056
/**
 * Extracts an article from a dev.to page.
 *
 * When the URL path has at least two segments, they are treated as
 * `username/slug` and the dev.to articles API is tried first. If that yields
 * no title or fails, the article is rebuilt from the page HTML.
 *
 * @param {string} html - Raw HTML of the page (used by the fallback path).
 * @param {string} url - Page URL; must be absolute because it is parsed with `new URL`.
 * @returns {Promise<object|null>} `{ domain, type, structured, cleanContent }`,
 *   or null when no title can be found or extraction throws.
 */
async function devtoExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const pathParts = new URL(url).pathname.split('/').filter(Boolean);
        // Dev.to article URL: /@username/article-slug-id or /username/article-slug-id
        const slug = pathParts.length >= 2
            ? pathParts.slice(0, 2).join('/').replace(/^@/, '')
            : null;
        if (slug) {
            try {
                const apiData = await fetchJson(`https://dev.to/api/articles/${slug}`);
                if (apiData?.title) {
                    const structured = {
                        title: apiData.title,
                        author: apiData.user?.name || '',
                        authorUsername: apiData.user?.username || '',
                        publishDate: apiData.published_at || '',
                        tags: apiData.tag_list || [],
                        readingTime: apiData.reading_time_minutes ? `${apiData.reading_time_minutes} min read` : '',
                        reactions: apiData.public_reactions_count || 0,
                        comments: apiData.comments_count || 0,
                        description: apiData.description || '',
                        url: apiData.url || url,
                    };
                    const authorLine = structured.author ? `**Author:** ${structured.author} (@${structured.authorUsername})` : '';
                    const dateLine = structured.publishDate ? `**Published:** ${structured.publishDate.split('T')[0]}` : '';
                    const tagsLine = structured.tags.length ? `**Tags:** ${structured.tags.join(', ')}` : '';
                    const statsLine = `❤️ ${structured.reactions} reactions | 💬 ${structured.comments} comments${structured.readingTime ? ` | ⏱️ ${structured.readingTime}` : ''}`;
                    const metaParts = [authorLine, dateLine, tagsLine, statsLine].filter(Boolean).join('\n');
                    // Use body_html if available for article content
                    let articleContent = '';
                    if (apiData.body_html) {
                        // Strip HTML tags for clean content
                        articleContent = stripHtml(apiData.body_html)
                            .replace(/\n{3,}/g, '\n\n')
                            .substring(0, 8000);
                    }
                    else if (apiData.body_markdown) {
                        articleContent = apiData.body_markdown.substring(0, 8000);
                    }
                    const cleanContent = `# ${structured.title}\n\n${metaParts}\n\n${articleContent || structured.description}`;
                    return { domain: 'dev.to', type: 'article', structured, cleanContent };
                }
            }
            catch { /* fall through to HTML */ }
        }
        // HTML fallback
        const title = $('meta[property="og:title"]').attr('content') ||
            $('h1').first().text().trim() || '';
        if (!title)
            return null;
        const author = $('meta[name="author"]').attr('content') ||
            $('[itemprop="name"]').first().text().trim() || '';
        const description = $('meta[property="og:description"]').attr('content') || '';
        const tags = [];
        $('a[data-no-instant][href*="/t/"]').each((_, el) => {
            const tag = $(el).text().trim().replace('#', '');
            if (tag)
                tags.push(tag);
        });
        // Article body
        let articleBody = '';
        const articleEl = $('article#article-body, .crayons-article__main, #article-body').first();
        if (articleEl.length) {
            const parts = [];
            articleEl.find('h1, h2, h3, h4, p, blockquote, pre, li').each((_, el) => {
                const node = $(el);
                // A <p> inside a <blockquote>/<li> is already covered by the
                // container's text; emitting both would duplicate the content.
                if (node.parents('blockquote, pre, li').length > 0)
                    return;
                const tag = el.name;
                const text = node.text().trim();
                if (!text || text.length < 3)
                    return;
                if (tag === 'h2')
                    parts.push(`## ${text}`);
                else if (tag === 'h3' || tag === 'h4')
                    parts.push(`### ${text}`);
                else if (tag === 'blockquote')
                    parts.push(`> ${text}`);
                else if (tag === 'pre')
                    parts.push('```\n' + text + '\n```');
                else
                    parts.push(text);
            });
            articleBody = parts.join('\n\n');
        }
        const structured = {
            title, author, description, tags, url,
        };
        const authorLine = author ? `\n**Author:** ${author}` : '';
        const tagsLine = tags.length ? `\n**Tags:** ${tags.join(', ')}` : '';
        const cleanContent = `# ${title}${authorLine}${tagsLine}\n\n${articleBody || description}`.substring(0, 10000);
        return { domain: 'dev.to', type: 'article', structured, cleanContent };
    }
    catch {
        // Best-effort extractor: any failure means "no structured result".
        return null;
    }
}
2155
+ // ---------------------------------------------------------------------------
2156
+ // 20. Craigslist extractor
2157
+ // ---------------------------------------------------------------------------
2158
/**
 * Extracts structured data from a Craigslist page.
 *
 * The URL pathname decides how the page is treated:
 *  - an individual post (`.../d/<slug>/<id>.html`, or a path ending in a
 *    10+ digit id followed by `.html`) yields `type: 'listing'`;
 *  - anything else is treated as a search/listing page and yields
 *    `type: 'search'`.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} url - URL the HTML was fetched from.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The extraction result, or `null` when no title / no listings were found
 *   or when anything throws while parsing.
 */
async function craigslistExtractor(html, url) {
    try {
        const { load } = await import('cheerio');
        const $ = load(html);
        const { pathname } = new URL(url);
        // Individual post: /xxx/yyy/d/title/12345678.html
        const isPost = /\/d\/[^/]+\/\d+\.html/.test(pathname) || /\/\d{10,}\.html/.test(pathname);
        return isPost ? parseCraigslistPost($, url) : parseCraigslistSearch($, url);
    }
    catch {
        // Best-effort extractor: any failure (bad URL, parser error) means
        // "no domain-specific result" rather than an exception for the caller.
        return null;
    }
}

/**
 * Builds the `type: 'listing'` result for a single Craigslist post.
 *
 * @param {Function} $ - Cheerio root loaded with the post HTML.
 * @param {string} url - URL of the post, echoed into `structured.url`.
 * @returns {object | null} Extraction result, or `null` when no title is found.
 */
function parseCraigslistPost($, url) {
    const title = $('#titletextonly').text().trim() ||
        $('meta[property="og:title"]').attr('content') ||
        $('h2.postingtitle').text().trim() || '';
    if (!title) {
        return null;
    }
    const price = $('.price').first().text().trim() ||
        $('[class*="price"]').first().text().trim() || '';
    const location = $('.postingtitletext small').text().trim().replace(/[()]/g, '') ||
        $('#map').attr('data-address') || '';
    const postDate = $('#display-date time').attr('datetime') ||
        $('time.date').first().attr('datetime') ||
        $('p.postinginfo time').first().attr('datetime') || '';

    // Body text: drop the print/QR helper nodes before reading the text.
    const bodyEl = $('#postingbody');
    bodyEl.find('.print-information, .QR-code').remove();
    const bodyText = bodyEl.text().trim()
        .replace(/QR Code Link to This Post/, '')
        .replace(/\n{3,}/g, '\n\n')
        .trim();

    // Images: a Set keeps first-seen order while de-duplicating in O(n).
    // The selectors are visited one after the other so that `img.slide`
    // sources always precede `img[id^="ii"]` sources.
    const seenImages = new Set();
    for (const selector of ['img.slide', 'img[id^="ii"]']) {
        $(selector).each((_, el) => {
            const src = $(el).attr('src') || '';
            if (src) {
                seenImages.add(src);
            }
        });
    }
    const images = [...seenImages];

    // Attributes: "key: value" spans. Split on the FIRST colon only so that
    // values which themselves contain a colon are kept instead of dropped.
    const attrs = {};
    $('.attrgroup span').each((_, el) => {
        const text = $(el).text().trim();
        const separatorIndex = text.indexOf(':');
        if (separatorIndex === -1) {
            return;
        }
        const key = text.slice(0, separatorIndex).trim();
        if (key) {
            attrs[key] = text.slice(separatorIndex + 1).trim();
        }
    });

    const structured = {
        title, price, location, postDate,
        bodyText, images, attributes: attrs, url,
    };
    const priceLine = price ? `\n**Price:** ${price}` : '';
    const locationLine = location ? `\n**Location:** ${location}` : '';
    const dateLine = postDate ? `\n**Posted:** ${postDate.split('T')[0]}` : '';
    const attrEntries = Object.entries(attrs);
    const attrsSection = attrEntries.length
        ? `\n\n## Details\n\n${attrEntries.map(([k, v]) => `- **${k}:** ${v}`).join('\n')}`
        : '';
    const imagesLine = images.length ? `\n\n📷 ${images.length} image${images.length > 1 ? 's' : ''}` : '';
    // Only the first 3000 characters of the body go into the Markdown; the
    // full text stays available in `structured.bodyText`.
    const cleanContent = `# 📋 ${title}${priceLine}${locationLine}${dateLine}${attrsSection}${imagesLine}\n\n${bodyText.substring(0, 3000)}`;
    return { domain: 'craigslist.org', type: 'listing', structured, cleanContent };
}

/**
 * Builds the `type: 'search'` result for a Craigslist listing/search page.
 *
 * @param {Function} $ - Cheerio root loaded with the page HTML.
 * @param {string} url - URL of the page, echoed into `structured.url`.
 * @returns {object | null} Extraction result, or `null` when no result rows
 *   with a title are found.
 */
function parseCraigslistSearch($, url) {
    const pageTitle = $('title').text().trim() ||
        $('meta[property="og:title"]').attr('content') || 'Craigslist Listings';
    const listings = [];
    $('.result-row, li.cl-static-search-result, .cl-search-result').each((_, el) => {
        const row = $(el);
        const titleEl = row.find('a.titlestring, a[class*="title"], .result-title').first();
        const postTitle = titleEl.text().trim();
        if (!postTitle) {
            return;
        }
        listings.push({
            title: postTitle,
            url: titleEl.attr('href') || '',
            price: row.find('.result-price, [class*="price"]').first().text().trim(),
            location: row.find('.result-hood, [class*="hood"]').first().text().trim().replace(/[()]/g, ''),
        });
    });
    if (!listings.length) {
        return null;
    }
    const structured = { pageTitle, listings, url };
    // The Markdown shows at most the first 20 rows; `structured.listings`
    // keeps every row that was found.
    const listMd = listings.slice(0, 20).map((l, i) => `${i + 1}. **${l.title}**${l.price ? ` — ${l.price}` : ''}${l.location ? ` (${l.location})` : ''}${l.url ? `\n ${l.url}` : ''}`).join('\n\n');
    const cleanContent = `# 📋 ${pageTitle}\n\n${listMd}`;
    return { domain: 'craigslist.org', type: 'search', structured, cleanContent };
}
1465
2247
  //# sourceMappingURL=domain-extractors.js.map