webpeel 0.21.64 → 0.21.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -93,6 +93,25 @@ const REGISTRY = [
93
93
  { match: (h) => h === 'www.producthunt.com' || h === 'producthunt.com', extractor: productHuntExtractor },
94
94
  { match: (h) => h === 'substack.com' || h === 'www.substack.com', extractor: substackRootExtractor },
95
95
  { match: (_h, url = '') => /\.pdf(\?|$|#)/i.test(url) || /\/pdf\//i.test(url), extractor: pdfExtractor },
96
+ // ── Prediction markets & trading ─────────────────────────────────────────
97
+ { match: (h) => h === 'polymarket.com' || h === 'www.polymarket.com', extractor: polymarketExtractor },
98
+ { match: (h) => h === 'kalshi.com' || h === 'www.kalshi.com', extractor: kalshiExtractor },
99
+ { match: (h) => h === 'tradingview.com' || h === 'www.tradingview.com', extractor: tradingViewExtractor },
100
+ // ── Sports ───────────────────────────────────────────────────────────────
101
+ { match: (h) => h === 'espn.com' || h === 'www.espn.com', extractor: espnExtractor },
102
+ { match: (h) => h === 'draftkings.com' || h === 'www.draftkings.com' || h === 'sportsbook.draftkings.com', extractor: sportsBettingExtractor },
103
+ { match: (h) => h === 'fanduel.com' || h === 'www.fanduel.com' || h === 'sportsbook.fanduel.com', extractor: sportsBettingExtractor },
104
+ { match: (h) => h === 'betmgm.com' || h === 'www.betmgm.com', extractor: sportsBettingExtractor },
105
+ // ── Academic papers ───────────────────────────────────────────────────────
106
+ { match: (h) => h === 'semanticscholar.org' || h === 'www.semanticscholar.org', extractor: semanticScholarExtractor },
107
+ { match: (h) => h === 'pubmed.ncbi.nlm.nih.gov', extractor: pubmedExtractor },
108
+ // ── Crypto ───────────────────────────────────────────────────────────────
109
+ { match: (h) => h === 'coingecko.com' || h === 'www.coingecko.com', extractor: coinGeckoExtractor },
110
+ { match: (h) => h === 'coinmarketcap.com' || h === 'www.coinmarketcap.com', extractor: coinGeckoExtractor },
111
+ // ── Weather ──────────────────────────────────────────────────────────────
112
+ { match: (h) => h === 'open-meteo.com' || h === 'api.open-meteo.com' || h === 'www.open-meteo.com', extractor: weatherExtractor },
113
+ { match: (h) => h === 'weather.com' || h === 'www.weather.com', extractor: weatherExtractor },
114
+ { match: (h) => h === 'accuweather.com' || h === 'www.accuweather.com', extractor: weatherExtractor },
96
115
  ];
97
116
  /**
98
117
  * Returns the domain extractor for a URL, or null if none matches.
@@ -943,21 +962,13 @@ ${commentsMd || '*No comments.*'}`;
943
962
  if (repoData.message.includes('secondary rate limit') || repoData.message.includes('abuse'))
944
963
  return null;
945
964
  }
946
- const readmeData = await fetchJsonWithRetry(`https://api.github.com/repos/${owner}/${repo}/readme`, ghHeaders, 1, 500).catch(() => null);
947
- // README content is base64 encoded
948
- let readmeText = '';
949
- if (readmeData?.content) {
950
- try {
951
- readmeText = Buffer.from(readmeData.content, 'base64').toString('utf-8').slice(0, 5000);
952
- }
953
- catch { /* ignore */ }
954
- }
955
965
  const structured = {
956
966
  title: `${owner}/${repo}`,
957
967
  name: `${owner}/${repo}`,
958
968
  description: repoData.description || '',
959
969
  stars: repoData.stargazers_count ?? 0,
960
970
  forks: repoData.forks_count ?? 0,
971
+ watchers: repoData.watchers_count ?? 0,
961
972
  language: repoData.language || null,
962
973
  topics: repoData.topics || [],
963
974
  license: repoData.license?.spdx_id || null,
@@ -968,18 +979,27 @@ ${commentsMd || '*No comments.*'}`;
968
979
  homepage: repoData.homepage || null,
969
980
  archived: repoData.archived || false,
970
981
  fork: repoData.fork || false,
971
- readme: readmeText,
982
+ url: repoData.html_url || `https://github.com/${owner}/${repo}`,
972
983
  };
973
- const topicsStr = structured.topics.length ? structured.topics.join(', ') : 'none';
974
- const cleanContent = `## 📦 Repository: ${structured.name}
975
-
976
- ${structured.description || '*No description.*'}
977
-
978
- ⭐ ${structured.stars.toLocaleString()} stars | 🍴 ${structured.forks.toLocaleString()} forks | 💻 ${structured.language || 'N/A'} | 📜 ${structured.license || 'N/A'}
979
- 🏷️ Topics: ${topicsStr}
980
- 🔗 ${structured.homepage || 'No homepage'} | Last push: ${structured.lastPush}${structured.archived ? '\n⚠️ **ARCHIVED**' : ''}
981
-
982
- ${structured.readme ? `### README\n\n${structured.readme}` : ''}`;
984
+ const topicsStr = structured.topics.length ? structured.topics.slice(0, 8).join(', ') : '';
985
+ const updatedDate = structured.lastPush ? structured.lastPush.slice(0, 10) : 'N/A';
986
+ const lines = [
987
+ `# 💻 ${structured.name}`,
988
+ '',
989
+ structured.description ? `**${structured.description}**` : '*No description.*',
990
+ '',
991
+ `- ⭐ Stars: ${structured.stars.toLocaleString()} | 🍴 Forks: ${structured.forks.toLocaleString()} | 📝 Language: ${structured.language || 'N/A'}`,
992
+ `- 📦 License: ${structured.license || 'None'} | 🔄 Updated: ${updatedDate}`,
993
+ `- 📊 Open Issues: ${structured.openIssues}${structured.archived ? ' | ⚠️ ARCHIVED' : ''}`,
994
+ ];
995
+ if (topicsStr)
996
+ lines.push(`- 🏷️ Topics: ${topicsStr}`);
997
+ lines.push('');
998
+ const links = [`[Repository](${structured.url})`];
999
+ if (structured.homepage)
1000
+ links.push(`[Homepage](${structured.homepage})`);
1001
+ lines.push(`**Links:** ${links.join(' · ')}`);
1002
+ const cleanContent = lines.join('\n');
983
1003
  return { domain, type: 'repository', structured, cleanContent };
984
1004
  }
985
1005
  return null;
@@ -1095,11 +1115,18 @@ ${commentsMd || '*No comments found.*'}`;
1095
1115
  commentCount: s.descendants ?? 0,
1096
1116
  url: s.url || `https://news.ycombinator.com/item?id=${s.id}`,
1097
1117
  hnUrl: `https://news.ycombinator.com/item?id=${s.id}`,
1118
+ domain: s.url ? (() => { try {
1119
+ return new URL(s.url).hostname.replace(/^www\./, '');
1120
+ }
1121
+ catch {
1122
+ return '';
1123
+ } })() : '',
1098
1124
  }));
1099
1125
  const structured = { title: 'Hacker News — Front Page', stories };
1126
+ // Compact format: title (domain) | score pts | N comments
1100
1127
  const cleanContent = `## 🟠 Hacker News — Front Page
1101
1128
 
1102
- ${stories.map((s, i) => `${i + 1}. **${s.title}**\n ↑ ${s.score} | 💬 ${s.commentCount} | by ${s.author}\n ${s.url}`).join('\n\n')}`;
1129
+ ${stories.map((s, i) => `${i + 1}. **${s.title}**${s.domain ? ` (${s.domain})` : ''} — ↑${s.score} · 💬${s.commentCount}`).join('\n')}`;
1103
1130
  return { domain, type: 'frontpage', structured, cleanContent };
1104
1131
  }
1105
1132
  // User page: ?id=username
@@ -1141,7 +1168,7 @@ function cleanWikipediaContent(content) {
1141
1168
  .replace(/\n{3,}/g, '\n\n')
1142
1169
  .trim();
1143
1170
  }
1144
- async function wikipediaExtractor(_html, url) {
1171
+ async function wikipediaExtractor(_html, url, options) {
1145
1172
  const urlObj = new URL(url);
1146
1173
  const pathParts = urlObj.pathname.split('/').filter(Boolean);
1147
1174
  // Only handle article pages: /wiki/Article_Title
@@ -1152,56 +1179,70 @@ async function wikipediaExtractor(_html, url) {
1152
1179
  if (articleTitle.includes(':'))
1153
1180
  return null;
1154
1181
  const lang = urlObj.hostname.split('.')[0] || 'en';
1155
- const apiUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(articleTitle)}`;
1182
+ const summaryUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(articleTitle)}`;
1156
1183
  // Wikipedia REST API requires a descriptive User-Agent (https://meta.wikimedia.org/wiki/User-Agent_policy)
1157
1184
  const wikiHeaders = { 'User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me) Node.js', 'Api-User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me)' };
1158
1185
  try {
1159
- const data = await fetchJson(apiUrl, wikiHeaders);
1186
+ const data = await fetchJson(summaryUrl, wikiHeaders);
1160
1187
  if (!data || data.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found')
1161
1188
  return null;
1162
- // For full article content, use the mobile-html endpoint (mobile-sections is deprecated)
1163
- let fullContent = '';
1164
- let mobileHtmlSize;
1165
- try {
1166
- const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(articleTitle)}`;
1167
- const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
1168
- ...wikiHeaders,
1169
- 'Accept': 'text/html',
1170
- });
1171
- if (fullResult?.html) {
1172
- mobileHtmlSize = fullResult.html.length;
1173
- // Parse sections from the mobile HTML
1174
- const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
1175
- for (const section of sectionMatches) {
1176
- // Extract section heading
1177
- const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
1178
- const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
1179
- // Extract paragraphs
1180
- const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
1181
- const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
1182
- if (sectionText) {
1183
- const prefix = heading ? `## ${heading}\n\n` : '';
1184
- fullContent += `\n\n${prefix}${sectionText}`;
1185
- }
1186
- }
1187
- }
1188
- }
1189
- catch (e) {
1190
- // mobile-html failed — use summary extract as fallback
1191
- if (process.env.DEBUG)
1192
- console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
1193
- }
1194
- // Clean Wikipedia-specific noise
1195
- fullContent = cleanWikipediaContent(fullContent);
1196
1189
  const structured = {
1197
1190
  title: data.title || articleTitle.replace(/_/g, ' '),
1198
1191
  description: data.description || '',
1199
1192
  extract: data.extract || '',
1193
+ extractHtml: data.extract_html || '',
1200
1194
  thumbnail: data.thumbnail?.source || null,
1201
1195
  url: data.content_urls?.desktop?.page || url,
1202
1196
  lastModified: data.timestamp || null,
1197
+ coordinates: data.coordinates || null,
1203
1198
  };
1204
- const cleanContent = `# ${structured.title}\n\n${structured.description ? `*${structured.description}*\n\n` : ''}${fullContent || structured.extract}`;
1199
+ // Default: use summary API (200-400 tokens). Only fetch full article if budget > 5000.
1200
+ const budget = options?.budget ?? 0;
1201
+ const useFull = budget > 5000;
1202
+ let bodyContent = structured.extract;
1203
+ let mobileHtmlSize;
1204
+ if (useFull) {
1205
+ try {
1206
+ const fullUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(articleTitle)}`;
1207
+ const fullResult = await simpleFetch(fullUrl, undefined, 15000, {
1208
+ ...wikiHeaders,
1209
+ 'Accept': 'text/html',
1210
+ });
1211
+ if (fullResult?.html) {
1212
+ mobileHtmlSize = fullResult.html.length;
1213
+ let fullContent = '';
1214
+ const sectionMatches = fullResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
1215
+ for (const section of sectionMatches) {
1216
+ const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
1217
+ const heading = headingMatch ? stripHtml(headingMatch[2]).trim() : '';
1218
+ const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
1219
+ const sectionText = paragraphs.map((p) => stripHtml(p).trim()).filter((t) => t.length > 0).join('\n\n');
1220
+ if (sectionText) {
1221
+ const prefix = heading ? `## ${heading}\n\n` : '';
1222
+ fullContent += `\n\n${prefix}${sectionText}`;
1223
+ }
1224
+ }
1225
+ bodyContent = cleanWikipediaContent(fullContent) || structured.extract;
1226
+ }
1227
+ }
1228
+ catch (e) {
1229
+ if (process.env.DEBUG)
1230
+ console.debug('[webpeel]', 'Wikipedia mobile-html failed, using summary:', e instanceof Error ? e.message : e);
1231
+ }
1232
+ }
1233
+ const articleUrl = structured.url;
1234
+ const lines = [
1235
+ `# ${structured.title}`,
1236
+ '',
1237
+ ];
1238
+ if (structured.description)
1239
+ lines.push(`*${structured.description}*`, '');
1240
+ lines.push(bodyContent);
1241
+ if (structured.coordinates) {
1242
+ lines.push('', `📍 Coordinates: ${structured.coordinates.lat}, ${structured.coordinates.lon}`);
1243
+ }
1244
+ lines.push('', `📖 [Read full article on Wikipedia](${articleUrl})`);
1245
+ const cleanContent = lines.join('\n');
1205
1246
  return { domain: 'wikipedia.org', type: 'article', structured, cleanContent, rawHtmlSize: mobileHtmlSize };
1206
1247
  }
1207
1248
  catch (e) {
@@ -1403,6 +1444,67 @@ async function youtubeExtractor(_html, url) {
1403
1444
  async function arxivExtractor(_html, url) {
1404
1445
  const urlObj = new URL(url);
1405
1446
  const path = urlObj.pathname;
1447
+ // --- Search page: /search/?query=... or /search/?searchtype=all&query=... ---
1448
+ if (path.startsWith('/search')) {
1449
+ const rawQuery = urlObj.searchParams.get('query') || '';
1450
+ if (!rawQuery)
1451
+ return null;
1452
+ try {
1453
+ const searchQuery = encodeURIComponent(`all:${rawQuery}`);
1454
+ const apiUrl = `https://export.arxiv.org/api/query?search_query=${searchQuery}&max_results=10&sortBy=relevance`;
1455
+ const result = await simpleFetch(apiUrl, 'WebPeel/0.21', 20000, { Accept: 'application/xml' });
1456
+ if (!result?.html)
1457
+ return null;
1458
+ const xml = result.html;
1459
+ // Parse total results count from opensearch:totalResults
1460
+ const totalMatch = xml.match(/<opensearch:totalResults[^>]*>(\d+)<\/opensearch:totalResults>/);
1461
+ const total = totalMatch ? parseInt(totalMatch[1], 10) : 0;
1462
+ // Parse all entries
1463
+ const entries = [...xml.matchAll(/<entry[\s\S]*?<\/entry>/g)].map(m => m[0]);
1464
+ const papers = entries.map(entryXml => {
1465
+ const getTag = (tag) => {
1466
+ const match = entryXml.match(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`));
1467
+ return match ? stripHtml(match[1]).trim() : '';
1468
+ };
1469
+ const getAllTags = (tag) => {
1470
+ const matches = [...entryXml.matchAll(new RegExp(`<${tag}[^>]*>([\\s\\S]*?)</${tag}>`, 'g'))];
1471
+ return matches.map(m => stripHtml(m[1]).trim()).filter(Boolean);
1472
+ };
1473
+ const title = getTag('title');
1474
+ const published = getTag('published');
1475
+ const authors = getAllTags('name');
1476
+ const summary = getTag('summary');
1477
+ // Extract arXiv ID from <id> tag
1478
+ const idTag = getTag('id');
1479
+ const idMatch2 = idTag.match(/abs\/(\d{4}\.\d{4,5}(?:v\d+)?)/);
1480
+ const paperId2 = idMatch2 ? idMatch2[1] : '';
1481
+ // Categories
1482
+ const cats = [...entryXml.matchAll(/category[^>]*term="([^"]+)"/g)].map(m => m[1]);
1483
+ return { title, published: published?.split('T')[0], authors, summary, paperId: paperId2, categories: cats };
1484
+ }).filter(p => p.title);
1485
+ if (papers.length === 0)
1486
+ return null;
1487
+ const rows = papers.map((p, i) => {
1488
+ const authorLine = p.authors.length === 0 ? '—'
1489
+ : p.authors.length === 1 ? p.authors[0]
1490
+ : `${p.authors[0]} et al.`;
1491
+ const pdfLink = p.paperId ? ` [[PDF](https://arxiv.org/pdf/${p.paperId})]` : '';
1492
+ return `| ${i + 1} | [${p.title}](https://arxiv.org/abs/${p.paperId}) | ${p.published || '?'} | ${authorLine} |${pdfLink}`;
1493
+ }).join('\n');
1494
+ const cleanContent = `# 🔍 arXiv Search — "${rawQuery}"\n\n| # | Paper | Published | Authors |\n|---|-------|-----------|--------|\n${rows}\n\n*Source: arXiv API · Total results: ${total.toLocaleString()}*`;
1495
+ return {
1496
+ domain: 'arxiv.org',
1497
+ type: 'search',
1498
+ structured: { query: rawQuery, total, papers },
1499
+ cleanContent,
1500
+ };
1501
+ }
1502
+ catch (e) {
1503
+ if (process.env.DEBUG)
1504
+ console.debug('[webpeel]', 'ArXiv search failed:', e instanceof Error ? e.message : e);
1505
+ return null;
1506
+ }
1507
+ }
1406
1508
  // Extract paper ID from URL patterns:
1407
1509
  // /abs/2501.12948, /pdf/2501.12948, /abs/2501.12948v2
1408
1510
  const idMatch = path.match(/\/(abs|pdf|html)\/(\d{4}\.\d{4,5}(?:v\d+)?)/);
@@ -1462,7 +1564,7 @@ async function arxivExtractor(_html, url) {
1462
1564
  const authorLine = authors.length <= 5
1463
1565
  ? authors.join(', ')
1464
1566
  : `${authors.slice(0, 5).join(', ')} et al. (${authors.length} authors)`;
1465
- const cleanContent = `# ${title}\n\n**Authors:** ${authorLine}\n**Published:** ${published?.split('T')[0] || 'N/A'}${categories.length ? `\n**Categories:** ${categories.join(', ')}` : ''}${doi ? `\n**DOI:** ${doi}` : ''}${journalRef ? `\n**Journal:** ${journalRef}` : ''}\n\n## Abstract\n\n${summary}\n\n📄 [PDF](${structured.pdfUrl}) | [Abstract](${structured.absUrl})`;
1567
+ const cleanContent = `# 📄 arXiv: ${title} (${paperId})\n\n**Authors:** ${authorLine}\n**Submitted:** ${published?.split('T')[0] || 'N/A'}${categories.length ? `\n**Categories:** ${categories.join(', ')}` : ''}${doi ? `\n**DOI:** ${doi}` : ''}${journalRef ? `\n**Journal:** ${journalRef}` : ''}\n\n## Abstract\n\n${summary}\n\n**PDF:** [Download](${structured.pdfUrl}) | **HTML:** [View](https://arxiv.org/html/${paperId})`;
1466
1568
  return { domain: 'arxiv.org', type: 'paper', structured, cleanContent };
1467
1569
  }
1468
1570
  catch (e) {
@@ -3385,3 +3487,1300 @@ Browse newsletters at:
3385
3487
  *WebPeel works best with individual Substack post URLs, not the root homepage.*`;
3386
3488
  return { domain: 'substack.com', type: 'homepage', structured, cleanContent };
3387
3489
  }
3490
+ // ---------------------------------------------------------------------------
3491
+ // 33. Polymarket extractor — prediction market data via Gamma API
3492
+ // ---------------------------------------------------------------------------
3493
/**
 * Domain extractor for polymarket.com pages.
 *
 * - `/event/<slug>`: looks the event up through the Gamma API and renders every
 *   nested market with its outcome prices. If no event matches the slug, a
 *   keyword search over open markets is attempted with the slug's words.
 * - Any other path (or when the event branch yields nothing / fails): renders
 *   the top open markets ordered by 24h volume.
 *
 * @param {string} _html - Unused; data comes from the API rather than the page.
 * @param {string} url - Absolute Polymarket URL (must be parseable by `new URL`).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` when no API call produced usable data.
 */
async function polymarketExtractor(_html, url) {
  const urlObj = new URL(url);
  const path = urlObj.pathname;
  const domain = 'polymarket.com';

  // Numeric strings are accepted because the original code handled both forms.
  const toNumber = (v) => (typeof v === 'string' ? parseFloat(v) : v);

  // Helper: format a 0..1 price as a percentage; unknown prices render as '?%'
  // instead of being reported as a misleading 0.0%.
  const fmtPct = (p) => {
    const n = toNumber(p);
    if (typeof n !== 'number' || Number.isNaN(n)) {
      return '?%';
    }
    return `${(n * 100).toFixed(1)}%`;
  };

  // Helper: format a dollar amount with K/M suffixes.
  const fmtVol = (v) => {
    const n = toNumber(v);
    if (typeof n !== 'number' || Number.isNaN(n) || n === 0) {
      return '$0';
    }
    if (n >= 1_000_000) {
      return `$${(n / 1_000_000).toFixed(1)}M`;
    }
    if (n >= 1_000) {
      return `$${(n / 1_000).toFixed(1)}K`;
    }
    return `$${n.toFixed(0)}`;
  };

  // Helper: keep only the YYYY-MM-DD prefix of a date string.
  const fmtDate = (d) => (typeof d === 'string' && d ? d.slice(0, 10) : '?');

  // Helper: outcomes/outcomePrices were parsed with JSON.parse in the original,
  // i.e. they are expected to be JSON-encoded arrays. A malformed value must not
  // abort rendering of the whole event, so it degrades to an empty list.
  const parseList = (raw) => {
    if (Array.isArray(raw)) {
      return raw;
    }
    if (typeof raw !== 'string' || raw === '') {
      return [];
    }
    try {
      const parsed = JSON.parse(raw);
      return Array.isArray(parsed) ? parsed : [];
    }
    catch {
      return [];
    }
  };

  // --- Specific event page: /event/<slug> ---
  const eventMatch = path.match(/^\/event\/([^/?#]+)/);
  if (eventMatch) {
    const slug = eventMatch[1];
    try {
      // Fetch event by slug from gamma API
      const events = await fetchJson(`https://gamma-api.polymarket.com/events?slug=${encodeURIComponent(slug)}&limit=1`);
      if (Array.isArray(events) && events.length > 0) {
        const event = events[0];
        const markets = Array.isArray(event.markets) ? event.markets : [];
        const structured = {
          title: event.title || slug,
          slug: event.slug,
          volume: event.volume,
          volume24hr: event.volume24hr,
          endDate: event.endDate,
          markets: markets.map((m) => ({
            question: m.question,
            outcomes: m.outcomes,
            outcomePrices: m.outcomePrices,
            volume: m.volume,
            volume24hr: m.volume24hr,
            endDate: m.endDate,
            bestBid: m.bestBid,
            bestAsk: m.bestAsk,
            lastTradePrice: m.lastTradePrice,
          })),
        };
        const marketsMd = markets.map((m) => {
          const outcomes = parseList(m.outcomes);
          const prices = parseList(m.outcomePrices);
          const priceStr = outcomes.map((o, i) => `${o}: **${fmtPct(prices[i])}**`).join(' | ');
          const vol24 = m.volume24hr ? ` | Vol 24h: ${fmtVol(m.volume24hr)}` : '';
          const endDate = m.endDate ? ` | Ends: ${fmtDate(m.endDate)}` : '';
          return `- **${m.question}**\n ${priceStr}${vol24}${endDate}`;
        }).join('\n\n');
        const totalVol24 = fmtVol(event.volume24hr || 0);
        const totalVol = fmtVol(event.volume || 0);
        const cleanContent = [
          `# 📊 Polymarket: ${event.title || slug}`,
          '',
          `**Volume (24h):** ${totalVol24} | **Total Volume:** ${totalVol} | **Ends:** ${fmtDate(event.endDate)}`,
          '',
          '## Markets',
          '',
          marketsMd || '*No active markets found.*',
          '',
          '---',
          `*Source: [Polymarket](https://polymarket.com/event/${slug}) · Data via Polymarket Gamma API*`,
        ].join('\n');
        return { domain, type: 'event', structured, cleanContent };
      }
      // If event not found by slug, try a keyword search in markets
      const markets = await fetchJson(`https://gamma-api.polymarket.com/markets?closed=false&limit=10&order=volume24hr&ascending=false&q=${encodeURIComponent(slug.replace(/-/g, ' '))}`);
      if (Array.isArray(markets) && markets.length > 0) {
        return buildPolymarketMarketList(markets, domain, `Search: ${slug}`);
      }
    }
    catch (e) {
      // Best effort: a failed event lookup falls through to the generic list below.
      if (process.env.DEBUG)
        console.debug('[webpeel]', 'Polymarket event fetch failed:', e instanceof Error ? e.message : e);
    }
  }

  // --- Main page or /markets: show top markets by 24h volume ---
  try {
    const markets = await fetchJson('https://gamma-api.polymarket.com/markets?closed=false&limit=20&order=volume24hr&ascending=false');
    if (Array.isArray(markets)) {
      return buildPolymarketMarketList(markets, domain, 'Top Markets');
    }
  }
  catch (e) {
    if (process.env.DEBUG)
      console.debug('[webpeel]', 'Polymarket markets fetch failed:', e instanceof Error ? e.message : e);
  }
  return null;
}
3595
/**
 * Renders a list of Polymarket markets as a markdown table plus structured data.
 *
 * At most the first 15 markets are used. The "Yes Price" column shows the price
 * of the first listed outcome (or '?%' when outcomes/prices are missing or
 * unparseable).
 *
 * @param {Array<object>} markets - Market records; the fields read are
 *   `question`, `slug`, `outcomes`, `outcomePrices`, `volume24hr`, `endDate`.
 *   `outcomes`/`outcomePrices` may be JSON-encoded arrays or plain arrays.
 * @param {string} domain - Passed through unchanged into the result.
 * @param {string} title - Suffix for the heading (e.g. 'Top Markets').
 * @returns {{domain: string, type: 'markets', structured: object, cleanContent: string}}
 */
function buildPolymarketMarketList(markets, domain, title) {
  const MAX_ROWS = 15;
  const toNumber = (v) => (typeof v === 'string' ? parseFloat(v) : v);

  // Unknown prices render as '?%' rather than a misleading 0.0%.
  const fmtPct = (p) => {
    const n = toNumber(p);
    if (typeof n !== 'number' || Number.isNaN(n)) {
      return '?%';
    }
    return `${(n * 100).toFixed(1)}%`;
  };

  const fmtVol = (v) => {
    const n = toNumber(v);
    if (typeof n !== 'number' || Number.isNaN(n) || n === 0) {
      return '$0';
    }
    if (n >= 1_000_000) {
      return `$${(n / 1_000_000).toFixed(1)}M`;
    }
    if (n >= 1_000) {
      return `$${(n / 1_000).toFixed(1)}K`;
    }
    return `$${n.toFixed(0)}`;
  };

  // Accepts a JSON-encoded array or an actual array; anything else becomes [].
  const parseList = (raw) => {
    if (Array.isArray(raw)) {
      return raw;
    }
    if (typeof raw !== 'string' || raw === '') {
      return [];
    }
    try {
      const parsed = JSON.parse(raw);
      return Array.isArray(parsed) ? parsed : [];
    }
    catch {
      return [];
    }
  };

  // A raw '|' or line break inside a cell would split the markdown table row.
  const mdCell = (text) => String(text ?? '?')
    .replace(/\s*[\r\n]+\s*/g, ' ')
    .replace(/\|/g, '\\|');

  // Slice once so the table and the structured payload always agree.
  const top = Array.isArray(markets) ? markets.slice(0, MAX_ROWS) : [];

  const rows = top.map((m) => {
    const outcomes = parseList(m.outcomes);
    const prices = parseList(m.outcomePrices);
    const yesPrice = outcomes[0] ? fmtPct(prices[0]) : '?%';
    const vol24 = fmtVol(m.volume24hr || 0);
    const end = typeof m.endDate === 'string' && m.endDate ? m.endDate.slice(0, 10) : '?';
    return `| ${mdCell(m.question)} | ${yesPrice} | ${vol24} | ${end} |`;
  }).join('\n');

  const structured = {
    markets: top.map((m) => ({
      question: m.question,
      slug: m.slug,
      outcomePrices: m.outcomePrices,
      outcomes: m.outcomes,
      volume24hr: m.volume24hr,
      endDate: m.endDate,
    })),
    fetchedAt: new Date().toISOString(),
  };

  const cleanContent = [
    `# 📊 Polymarket — ${title}`,
    '',
    '| Question | Yes Price | Vol 24h | End Date |',
    '|----------|-----------|---------|----------|',
    rows,
    '',
    '---',
    '*Source: [Polymarket](https://polymarket.com) · Data via Polymarket Gamma API*',
  ].join('\n');

  return { domain, type: 'markets', structured, cleanContent };
}
3651
+ // ---------------------------------------------------------------------------
3652
+ // 34. Kalshi extractor — prediction market data via Kalshi Elections API
3653
+ // ---------------------------------------------------------------------------
3654
/**
 * Domain extractor for kalshi.com pages.
 *
 * - `/markets/<ticker>` or `/events/<ticker>`: fetches that event (with nested
 *   markets) from the Kalshi trade API and renders one bullet per market.
 * - Any other path (or when the event branch yields nothing / fails): renders a
 *   table of up to 15 open events.
 *
 * @param {string} _html - Unused; data comes from the API rather than the page.
 * @param {string} url - Absolute Kalshi URL (must be parseable by `new URL`).
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` when no API call produced usable data.
 */
async function kalshiExtractor(_html, url) {
  const urlObj = new URL(url);
  const path = urlObj.pathname;
  const domain = 'kalshi.com';

  const toNumber = (v) => (typeof v === 'string' ? parseFloat(v) : v);

  // Helper: format Kalshi dollar price (they use dollars like 0.78 = 78¢ = 78%)
  const fmtPct = (v) => {
    const n = toNumber(v);
    if (typeof n !== 'number' || Number.isNaN(n)) {
      return '?%';
    }
    return `${(n * 100).toFixed(0)}%`;
  };

  const fmtVol = (v) => {
    const n = toNumber(v);
    if (typeof n !== 'number' || Number.isNaN(n) || n === 0) {
      return '$0';
    }
    if (n >= 1_000_000) {
      return `$${(n / 1_000_000).toFixed(1)}M`;
    }
    if (n >= 1_000) {
      return `$${(n / 1_000).toFixed(1)}K`;
    }
    return `$${n.toFixed(0)}`;
  };

  // A raw '|' or line break inside a cell would split the markdown table row.
  const mdCell = (text) => String(text ?? '?')
    .replace(/\s*[\r\n]+\s*/g, ' ')
    .replace(/\|/g, '\\|');

  // --- Specific market/event page: /markets/<ticker> or /events/<ticker> ---
  const tickerMatch = path.match(/^\/(?:markets|events)\/([^/?#]+)/);
  if (tickerMatch) {
    const ticker = tickerMatch[1].toUpperCase();
    try {
      // Try fetching the specific event by ticker
      const data = await fetchJson(`https://api.elections.kalshi.com/trade-api/v2/events/${ticker}?with_nested_markets=true`);
      const event = data?.event;
      if (event) {
        const markets = Array.isArray(event.markets) ? event.markets : [];
        const structured = {
          title: event.title,
          ticker: event.event_ticker,
          category: event.category,
          markets: markets.map((m) => ({
            title: m.title,
            ticker: m.ticker,
            yes_bid: m.yes_bid_dollars,
            yes_ask: m.yes_ask_dollars,
            volume: m.volume_fp,
            volume_24h: m.volume_24h_fp,
            last_price: m.last_price_dollars,
            expiration: m.expiration_time,
          })),
        };
        const marketsMd = markets.map((m) => {
          const yesBid = fmtPct(m.yes_bid_dollars);
          const yesAsk = fmtPct(m.yes_ask_dollars);
          const vol = fmtVol(m.volume_fp);
          const vol24 = fmtVol(m.volume_24h_fp);
          // A non-string expiry must not throw and discard the whole event page.
          const expiry = typeof m.expiration_time === 'string' && m.expiration_time
            ? m.expiration_time.slice(0, 10)
            : '?';
          return `- **${m.title}**\n Yes: ${yesBid}–${yesAsk} | Vol: ${vol} | Vol 24h: ${vol24} | Expires: ${expiry}`;
        }).join('\n\n');
        const cleanContent = [
          `# 🎯 Kalshi: ${event.title}`,
          '',
          `**Category:** ${event.category || 'General'} | **Ticker:** ${event.event_ticker}`,
          '',
          '## Markets',
          '',
          marketsMd || '*No active markets found.*',
          '',
          '---',
          `*Source: [Kalshi](https://kalshi.com/markets/${ticker.toLowerCase()}) · Data via Kalshi Trade API*`,
        ].join('\n');
        return { domain, type: 'event', structured, cleanContent };
      }
    }
    catch (e) {
      // Best effort: a failed event lookup falls through to the generic list below.
      if (process.env.DEBUG)
        console.debug('[webpeel]', 'Kalshi event fetch failed:', e instanceof Error ? e.message : e);
    }
  }

  // --- Main page or /markets: show top open events ---
  try {
    const data = await fetchJson('https://api.elections.kalshi.com/trade-api/v2/events?limit=20&status=open&with_nested_markets=true');
    const events = Array.isArray(data?.events) ? data.events : [];
    if (events.length > 0) {
      // Slice once so the table and the structured payload always agree.
      const top = events.slice(0, 15);
      const rows = top.map((e) => {
        const markets = Array.isArray(e.markets) ? e.markets : [];
        const firstMkt = markets[0];
        const yesBid = firstMkt ? fmtPct(firstMkt.yes_bid_dollars) : '?%';
        const vol24 = firstMkt ? fmtVol(firstMkt.volume_24h_fp) : '$0';
        const mktCount = markets.length > 1 ? ` (+${markets.length - 1} more)` : '';
        return `| ${mdCell(e.title)} | ${yesBid}${mktCount} | ${vol24} | ${mdCell(e.category || '?')} |`;
      }).join('\n');
      const structured = {
        events: top.map((e) => ({
          title: e.title,
          ticker: e.event_ticker,
          category: e.category,
          markets: (Array.isArray(e.markets) ? e.markets : []).length,
        })),
        fetchedAt: new Date().toISOString(),
      };
      const cleanContent = [
        '# 🎯 Kalshi — Top Open Events',
        '',
        '| Event | Yes Price | Vol 24h | Category |',
        '|-------|-----------|---------|----------|',
        rows,
        '',
        '---',
        '*Source: [Kalshi](https://kalshi.com/markets) · Data via Kalshi Trade API*',
      ].join('\n');
      return { domain, type: 'markets', structured, cleanContent };
    }
  }
  catch (e) {
    if (process.env.DEBUG)
      console.debug('[webpeel]', 'Kalshi markets fetch failed:', e instanceof Error ? e.message : e);
  }
  return null;
}
3765
+ // ---------------------------------------------------------------------------
3766
+ // 35. TradingView extractor — stock/index data via TradingView Scanner API
3767
+ // ---------------------------------------------------------------------------
3768
+ async function tradingViewExtractor(_html, url) {
3769
+ const urlObj = new URL(url);
3770
+ const path = urlObj.pathname;
3771
+ const domain = 'tradingview.com';
3772
+ const scannerHeaders = {
3773
+ 'Origin': 'https://www.tradingview.com',
3774
+ 'Referer': 'https://www.tradingview.com/',
3775
+ 'Content-Type': 'application/json',
3776
+ };
3777
+ // Helper: format price
3778
+ const fmtPrice = (v) => {
3779
+ if (v == null)
3780
+ return '?';
3781
+ if (v >= 1_000_000_000_000)
3782
+ return `${(v / 1_000_000_000_000).toFixed(2)}T`;
3783
+ if (v >= 1_000_000_000)
3784
+ return `${(v / 1_000_000_000).toFixed(2)}B`;
3785
+ if (v >= 1_000_000)
3786
+ return `${(v / 1_000_000).toFixed(2)}M`;
3787
+ if (v >= 1_000)
3788
+ return `${(v / 1_000).toFixed(2)}K`;
3789
+ return v.toFixed(2);
3790
+ };
3791
+ const fmtChange = (c) => {
3792
+ if (c == null)
3793
+ return '';
3794
+ const sign = c >= 0 ? '+' : '';
3795
+ return `${sign}${c.toFixed(2)}%`;
3796
+ };
3797
+ // --- Symbol page: /symbols/<TICKER>/ or /chart?symbol=<TICKER> ---
3798
+ const symbolMatch = path.match(/^\/symbols\/([^/?#]+)\/?/);
3799
+ const chartSymbolParam = urlObj.searchParams.get('symbol');
3800
+ let ticker = symbolMatch?.[1] || chartSymbolParam || null;
3801
+ if (ticker) {
3802
+ ticker = ticker.toUpperCase().replace(/-/g, '');
3803
+ try {
3804
+ // Try symbol search to resolve exchange
3805
+ const searchResp = await fetch(`https://symbol-search.tradingview.com/symbol_search/?text=${encodeURIComponent(ticker)}&hl=0&lang=en&type=stock,fund,crypto,futures,forex&limit=5`, {
3806
+ headers: {
3807
+ 'User-Agent': 'webpeel/0.21 (https://webpeel.dev)',
3808
+ 'Origin': 'https://www.tradingview.com',
3809
+ 'Referer': 'https://www.tradingview.com/',
3810
+ },
3811
+ signal: AbortSignal.timeout(10000),
3812
+ });
3813
+ const searchData = await searchResp.json().catch(() => []);
3814
+ // Find exact match
3815
+ const exactMatch = searchData.find(s => s.symbol === ticker || s.symbol.replace(/<\/?em>/g, '') === ticker);
3816
+ const symbolInfo = exactMatch || searchData[0];
3817
+ if (symbolInfo) {
3818
+ const exchange = symbolInfo.source_id || symbolInfo.exchange || 'NASDAQ';
3819
+ // Fetch quote data via scanner
3820
+ const scannerUrl = exchange === 'CRYPTO' || exchange === 'COINBASE' || exchange === 'BINANCE'
3821
+ ? 'https://scanner.tradingview.com/crypto/scan'
3822
+ : 'https://scanner.tradingview.com/america/scan';
3823
+ const scanBody = {
3824
+ filter: [{ left: 'name', operation: 'equal', right: symbolInfo.symbol?.replace(/<\/?em>/g, '') || ticker }],
3825
+ columns: ['name', 'description', 'close', 'open', 'high', 'low', 'volume', 'change', 'change_abs', 'market_cap_basic', 'sector', 'industry', 'country', 'currency'],
3826
+ range: [0, 1],
3827
+ };
3828
+ const scanResp = await fetch(scannerUrl, {
3829
+ method: 'POST',
3830
+ headers: { ...scannerHeaders, 'User-Agent': 'webpeel/0.21 (https://webpeel.dev)' },
3831
+ body: JSON.stringify(scanBody),
3832
+ signal: AbortSignal.timeout(10000),
3833
+ });
3834
+ const scanData = await scanResp.json().catch(() => null);
3835
+ const row = scanData?.data?.[0]?.d;
3836
+ if (row) {
3837
+ const [name, desc, close, open, high, low, volume, changePct, changeAbs, mktCap, sector, industry, country, currency] = row;
3838
+ const currStr = currency || 'USD';
3839
+ const mktCapStr = mktCap ? fmtPrice(mktCap) : null;
3840
+ const structured = {
3841
+ symbol: name,
3842
+ description: desc,
3843
+ price: close,
3844
+ open,
3845
+ high,
3846
+ low,
3847
+ volume,
3848
+ change_pct: changePct,
3849
+ change_abs: changeAbs,
3850
+ market_cap: mktCap,
3851
+ sector,
3852
+ industry,
3853
+ country,
3854
+ currency: currStr,
3855
+ exchange,
3856
+ fetchedAt: new Date().toISOString(),
3857
+ };
3858
+ const changeStr = fmtChange(changePct);
3859
+ const changeIcon = (changePct ?? 0) >= 0 ? '📈' : '📉';
3860
+ const cleanContent = `# ${changeIcon} TradingView: ${desc || name} (${name})
3861
+
3862
+ ## Quote
3863
+ - **Price:** ${close?.toFixed(2) ?? '?'} ${currStr}
3864
+ - **Change:** ${changeStr} (${changeAbs?.toFixed(2) ?? '?'} ${currStr})
3865
+ - **Open:** ${open?.toFixed(2) ?? '?'} | **High:** ${high?.toFixed(2) ?? '?'} | **Low:** ${low?.toFixed(2) ?? '?'}
3866
+ - **Volume:** ${fmtPrice(volume ?? 0)}
3867
+ ${mktCapStr ? `- **Market Cap:** ${mktCapStr} ${currStr}` : ''}
3868
+
3869
+ ## Details
3870
+ ${sector ? `- **Sector:** ${sector}` : ''}
3871
+ ${industry ? `- **Industry:** ${industry}` : ''}
3872
+ ${country ? `- **Country:** ${country}` : ''}
3873
+ - **Exchange:** ${exchange}
3874
+
3875
+ ---
3876
+ *Source: [TradingView](https://www.tradingview.com/symbols/${name}/) · Data via TradingView Scanner API*`;
3877
+ return { domain, type: 'symbol', structured, cleanContent };
3878
+ }
3879
+ }
3880
+ }
3881
+ catch (e) {
3882
+ if (process.env.DEBUG)
3883
+ console.debug('[webpeel]', 'TradingView symbol fetch failed:', e instanceof Error ? e.message : e);
3884
+ }
3885
+ }
3886
+ // --- Markets overview page or fallback: show major indices ---
3887
+ try {
3888
+ // Fetch major indices + top stocks
3889
+ const scanBody = {
3890
+ filter: [
3891
+ { left: 'name', operation: 'in_range', right: ['SPX', 'NDX', 'DJI', 'RUT', 'VIX', 'AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOGL', 'META', 'TSLA'] },
3892
+ ],
3893
+ columns: ['name', 'description', 'close', 'change', 'volume', 'market_cap_basic'],
3894
+ sort: { sortBy: 'market_cap_basic', sortOrder: 'desc' },
3895
+ range: [0, 20],
3896
+ };
3897
+ const resp = await fetch('https://scanner.tradingview.com/global/scan', {
3898
+ method: 'POST',
3899
+ headers: { ...scannerHeaders, 'User-Agent': 'webpeel/0.21 (https://webpeel.dev)' },
3900
+ body: JSON.stringify(scanBody),
3901
+ signal: AbortSignal.timeout(10000),
3902
+ });
3903
+ const data = await resp.json().catch(() => null);
3904
+ const rows = data?.data || [];
3905
+ if (rows.length > 0) {
3906
+ const tableRows = rows.map((row) => {
3907
+ const [name, desc, close, changePct] = row.d;
3908
+ const changeStr = changePct != null ? `${changePct >= 0 ? '+' : ''}${changePct.toFixed(2)}%` : '?%';
3909
+ const icon = (changePct ?? 0) >= 0 ? '🟢' : '🔴';
3910
+ return `| ${name} | ${desc} | ${close?.toFixed(2) ?? '?'} | ${icon} ${changeStr} |`;
3911
+ }).join('\n');
3912
+ const structured = {
3913
+ symbols: rows.map((r) => ({
3914
+ symbol: r.d[0],
3915
+ description: r.d[1],
3916
+ price: r.d[2],
3917
+ change_pct: r.d[3],
3918
+ })),
3919
+ fetchedAt: new Date().toISOString(),
3920
+ };
3921
+ const now = new Date().toLocaleString('en-US', { timeZone: 'America/New_York', hour12: false });
3922
+ const cleanContent = `# 📈 TradingView — Market Overview
3923
+
3924
+ *As of ${now} ET*
3925
+
3926
+ | Symbol | Name | Price | Change |
3927
+ |--------|------|-------|--------|
3928
+ ${tableRows}
3929
+
3930
+ ---
3931
+ *Source: [TradingView](https://www.tradingview.com/markets/) · Data via TradingView Scanner API*`;
3932
+ return { domain, type: 'markets', structured, cleanContent };
3933
+ }
3934
+ }
3935
+ catch (e) {
3936
+ if (process.env.DEBUG)
3937
+ console.debug('[webpeel]', 'TradingView markets fetch failed:', e instanceof Error ? e.message : e);
3938
+ }
3939
+ return null;
3940
+ }
3941
+ // ---------------------------------------------------------------------------
3942
+ // 36. ESPN extractor — live scores, standings, schedules via ESPN public API
3943
+ // ---------------------------------------------------------------------------
3944
/**
 * Map an ESPN URL to the sport/league identifiers used by the ESPN API.
 *
 * @param {string} url - Page URL to classify.
 * @returns {{sport: string, league: string, type: string, param?: string} | null}
 *   `null` when `url` cannot be parsed or its host is not espn.com (or one of
 *   its subdomains). Otherwise the sport/league pair plus a page `type` of
 *   'standings', 'team' or 'scoreboard'. For team pages `param` is the path
 *   segment that follows `/name/` (undefined when the URL has none).
 */
function matchESPN(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return null;
    }
    // Accept the exact host or a true subdomain only. A substring test would
    // also accept look-alike hosts such as "notespn.com" or "espn.com.example.org".
    const host = parsed.hostname.toLowerCase();
    if (host !== 'espn.com' && !host.endsWith('.espn.com'))
        return null;
    const path = parsed.pathname.toLowerCase();
    // URL path prefix → [sport, default league]
    const sportMap = {
        '/nba': ['basketball', 'nba'],
        '/wnba': ['basketball', 'wnba'],
        '/nfl': ['football', 'nfl'],
        '/mlb': ['baseball', 'mlb'],
        '/nhl': ['hockey', 'nhl'],
        '/college-football': ['football', 'college-football'],
        '/mens-college-basketball': ['basketball', 'mens-college-basketball'],
        '/womens-college-basketball': ['basketball', 'womens-college-basketball'],
        '/soccer': ['soccer', 'eng.1'],
        '/mma': ['mma', 'ufc'],
    };
    for (const [prefix, [sport, defaultLeague]] of Object.entries(sportMap)) {
        if (!path.startsWith(prefix))
            continue;
        // Soccer URLs may name the league explicitly,
        // e.g. /soccer/scoreboard/_/league/usa.1
        const league = sport === 'soccer'
            ? (path.match(/\/league\/([^/?#]+)/)?.[1] ?? defaultLeague)
            : defaultLeague;
        if (path.includes('standings'))
            return { sport, league, type: 'standings' };
        if (path.includes('/team/') || path.includes('/teams/')) {
            const param = path.split('/name/')[1]?.split('/')[0];
            return { sport, league, type: 'team', param };
        }
        // Scores, scoreboard and every other page under the prefix map to the scoreboard.
        return { sport, league, type: 'scoreboard' };
    }
    // espn.com root or an unrecognised path → NBA scoreboard
    return { sport: 'basketball', league: 'nba', type: 'scoreboard' };
}
3992
/**
 * Pick the emoji shown in ESPN headings for a sport/league pair.
 *
 * Every basketball league (including the college leagues) gets the
 * basketball emoji, not just the NBA and WNBA.
 *
 * @param {string} sport - Sport identifier, e.g. 'basketball', 'football'.
 * @param {string} league - League identifier, e.g. 'nba', 'ufc'.
 * @returns {string} A single emoji; '🏆' when the pair is not recognised.
 */
function espnSportEmoji(sport, league) {
    if (sport === 'basketball' || league === 'nba' || league === 'wnba')
        return '🏀';
    if (sport === 'mma' || league === 'ufc')
        return '🥊';
    const bySport = {
        football: '🏈',
        baseball: '⚾',
        hockey: '🏒',
        soccer: '⚽',
    };
    return Object.hasOwn(bySport, sport) ? bySport[sport] : '🏆';
}
4008
/**
 * Format an ISO date string as a US Eastern clock time, e.g. "7:30 PM ET".
 *
 * @param {string} isoDate - Date string accepted by the `Date` constructor.
 * @returns {string} The formatted time followed by " ET". When the input
 *   cannot be parsed the input itself is returned if it is a string,
 *   otherwise the empty string.
 */
function fmtEspnTime(isoDate) {
    const fallback = typeof isoDate === 'string' ? isoDate : '';
    try {
        const d = new Date(isoDate);
        // An unparseable input yields an Invalid Date rather than an exception,
        // which would otherwise be rendered as "Invalid Date ET".
        if (Number.isNaN(d.getTime()))
            return fallback;
        const clock = d.toLocaleTimeString('en-US', {
            timeZone: 'America/New_York',
            hour: 'numeric',
            minute: '2-digit',
            hour12: true,
        });
        return clock + ' ET';
    }
    catch {
        return fallback;
    }
}
4023
/**
 * Format a date as a long US Eastern calendar date, e.g. "March 18, 2026".
 *
 * @param {Date} [date] - Instant to format; defaults to the current time, so
 *   calling with no argument returns today's date in New York.
 * @returns {string} Month name, day and year in the America/New_York zone.
 */
function fmtTodayESPN(date = new Date()) {
    const options = {
        timeZone: 'America/New_York',
        month: 'long',
        day: 'numeric',
        year: 'numeric',
    };
    return date.toLocaleDateString('en-US', options);
}
4032
/**
 * Build the status text for a game that is in progress.
 *
 * Quarter-based leagues keep the "Q<period> <clock>" form. Other sports use
 * the feed's own `status.type.shortDetail` / `detail` text when present
 * (NOTE(review): field names taken from the ESPN scoreboard payload — confirm),
 * and otherwise fall back to a neutral "Period <n> <clock>" label.
 */
function espnLiveStatus(sport, league, status) {
    const period = status?.period ?? '';
    const clock = status?.displayClock ?? '';
    const hasClock = Boolean(period && clock);
    // Men's college basketball is played in halves, so it is not labelled "Q".
    const usesQuarters = sport === 'football' ||
        (sport === 'basketball' && league !== 'mens-college-basketball');
    if (usesQuarters)
        return hasClock ? `Q${period} ${clock}` : 'Live';
    const feedText = status?.type?.shortDetail || status?.type?.detail;
    if (feedText)
        return feedText;
    return hasClock ? `Period ${period} ${clock}` : 'Live';
}
/**
 * Fetch a league scoreboard from the ESPN site API and render it as markdown.
 *
 * @param {string} sport - Sport path segment, e.g. 'basketball'.
 * @param {string} league - League path segment, e.g. 'nba'.
 * @returns {Promise<string|null>} A markdown heading plus a Game/Score/Status
 *   table (or a "no games" note when the feed lists no events); `null` when
 *   the request yields no payload or anything throws.
 */
async function fetchEspnScoreboard(sport, league) {
    try {
        const apiUrl = `https://site.api.espn.com/apis/site/v2/sports/${sport}/${league}/scoreboard`;
        const data = await fetchJson(apiUrl);
        // No payload is a failed fetch, not an empty schedule.
        if (!data)
            return null;
        const events = data.events || [];
        const emoji = espnSportEmoji(sport, league);
        const leagueName = data.leagues?.[0]?.name || league.toUpperCase();
        const today = fmtTodayESPN();
        const heading = `# ${emoji} ${leagueName} Scoreboard — ${today}`;
        if (events.length === 0) {
            return `${heading}\n\n*No games scheduled today.*`;
        }
        const rows = events.map((event) => {
            const comp = event.competitions?.[0] || {};
            const competitors = comp.competitors || [];
            // Away team first, home team second (standard display)
            const away = competitors.find((c) => c.homeAway === 'away') || competitors[0];
            const home = competitors.find((c) => c.homeAway === 'home') || competitors[1];
            const awayName = away?.team?.displayName || away?.team?.name || '?';
            const homeName = home?.team?.displayName || home?.team?.name || '?';
            const gameLabel = `${awayName} at ${homeName}`;
            const state = comp.status?.type?.state || 'pre';
            if (state === 'pre') {
                const kickoff = fmtEspnTime(comp.startDate || event.date || '');
                return `| ${gameLabel} | - | ${kickoff} |`;
            }
            const awayAbbr = away?.team?.abbreviation || '?';
            const homeAbbr = home?.team?.abbreviation || '?';
            const scoreStr = `${awayAbbr} ${away?.score ?? '0'}, ${homeAbbr} ${home?.score ?? '0'}`;
            const statusStr = state === 'in'
                ? espnLiveStatus(sport, league, comp.status)
                // A finished game without a description is reported as "Final".
                : (comp.status?.type?.description || 'Final');
            return `| ${gameLabel} | ${scoreStr} | ${statusStr} |`;
        }).join('\n');
        return `${heading}\n\n| Game | Score | Status |\n|------|-------|--------|\n${rows}`;
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'ESPN scoreboard fetch failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
4089
/**
 * Fetch league standings from the ESPN web API and render them as markdown.
 *
 * Each group in the payload's `children` becomes a "## <name>" section with a
 * table ordered by the `playoffSeed` stat. Entries without that stat keep the
 * order the API returned them in and are numbered by table position.
 *
 * @param {string} sport - Sport path segment, e.g. 'basketball'.
 * @param {string} league - League path segment, e.g. 'nba'.
 * @returns {Promise<string|null>} The markdown document, or `null` when the
 *   payload has no groups or anything throws.
 */
async function fetchEspnStandings(sport, league) {
    try {
        const apiUrl = `https://site.web.api.espn.com/apis/v2/sports/${sport}/${league}/standings?sort=winpercent:desc`;
        const data = await fetchJson(apiUrl);
        const groups = data?.children || [];
        if (groups.length === 0)
            return null;
        const emoji = espnSportEmoji(sport, league);
        const leagueName = data?.name || league.toUpperCase();
        const today = fmtTodayESPN();
        // Index an entry's stats by name once; the first stat with a name wins.
        const indexStats = (stats) => {
            const byName = new Map();
            for (const stat of stats || []) {
                if (!byName.has(stat.name))
                    byName.set(stat.name, stat);
            }
            return byName;
        };
        let output = `# ${emoji} ${leagueName} Standings — ${today}\n\n`;
        for (const group of groups) {
            const groupName = group.name || group.abbreviation || 'Conference';
            output += `## ${groupName}\n\n`;
            output += `| # | Team | W | L | PCT | Streak |\n`;
            output += `|---|------|---|---|-----|--------|\n`;
            const ranked = (group.standings?.entries || [])
                .map((entry) => {
                    const stats = indexStats(entry.stats);
                    return { entry, stats, seed: stats.get('playoffSeed')?.value };
                })
                // Unseeded entries sort last; the sort is stable, so API order is kept among them.
                .sort((a, b) => (a.seed ?? 99) - (b.seed ?? 99));
            ranked.forEach(({ entry, stats, seed }, index) => {
                const display = (name) => stats.get(name)?.displayValue || '?';
                const team = entry.team?.displayName || '?';
                // Leagues without playoff seeds still get a rank: the row's position.
                const rank = seed ?? index + 1;
                output += `| ${rank} | ${team} | ${display('wins')} | ${display('losses')} | ${display('winPercent')} | ${display('streak')} |\n`;
            });
            output += '\n';
        }
        return output.trim();
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'ESPN standings fetch failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
4134
/**
 * Build the espn.com path for a league page in a form `matchESPN` resolves
 * back to the same sport and league.
 *
 * Soccer and MMA leagues are not top-level path prefixes on the site, so
 * `${league}/${page}` would not be recognised for them.
 */
function espnSitePath(sport, league, page) {
    if (sport === 'soccer')
        return `soccer/${page}/_/league/${league}`;
    if (sport === 'mma')
        return `mma/${page}`;
    return `${league}/${page}`;
}
/**
 * Domain extractor for espn.com pages.
 *
 * Classifies the URL with `matchESPN` and answers from ESPN's JSON APIs
 * instead of the page HTML: standings pages render the standings table,
 * team pages render a short team card, and everything else (including a
 * team lookup that fails) renders the league scoreboard.
 *
 * @param {string} _html - Page HTML; unused.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` when the URL is not an ESPN URL or the required data is unavailable.
 */
async function espnExtractor(_html, url) {
    const match = matchESPN(url);
    if (!match)
        return null;
    const { sport, league, type } = match;
    const domain = 'espn.com';
    if (type === 'standings') {
        const standings = await fetchEspnStandings(sport, league);
        if (!standings)
            return null;
        return {
            domain,
            type: 'standings',
            structured: { sport, league, dataType: 'standings' },
            cleanContent: standings,
        };
    }
    if (type === 'team') {
        // Try to get team info from the teams API
        try {
            const teamsUrl = `https://site.api.espn.com/apis/site/v2/sports/${sport}/${league}/teams`;
            const teamsData = await fetchJson(teamsUrl);
            const teams = teamsData?.sports?.[0]?.leagues?.[0]?.teams || [];
            const wanted = match.param?.toLowerCase();
            const matchesWanted = (t) => {
                const td = t.team || t;
                return td.abbreviation?.toLowerCase() === wanted ||
                    td.slug?.toLowerCase() === wanted ||
                    td.displayName?.toLowerCase().includes(wanted);
            };
            // NOTE(review): without a /name/ segment the first team in the
            // list is used, which looks arbitrary for team-listing pages — confirm intent.
            const teamEntry = wanted ? teams.find(matchesWanted) : teams[0];
            if (teamEntry) {
                const td = teamEntry.team || teamEntry;
                const emoji = espnSportEmoji(sport, league);
                const scoreboardUrl = `https://espn.com/${espnSitePath(sport, league, 'scoreboard')}`;
                const standingsUrl = `https://espn.com/${espnSitePath(sport, league, 'standings')}`;
                const content = `# ${emoji} ${td.displayName}\n\n**League:** ${league.toUpperCase()}\n\n*For live scores and standings, use:*\n- \`webpeel "${scoreboardUrl}"\`\n- \`webpeel "${standingsUrl}"\``;
                return {
                    domain,
                    type: 'team',
                    structured: { sport, league, teamName: td.displayName, abbreviation: td.abbreviation },
                    cleanContent: content,
                };
            }
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'ESPN team fetch failed:', e instanceof Error ? e.message : e);
        }
        // Team not found or lookup failed: fall through to the scoreboard.
    }
    // Default: scoreboard
    const scoreboard = await fetchEspnScoreboard(sport, league);
    if (!scoreboard)
        return null;
    return {
        domain,
        type: 'scoreboard',
        structured: { sport, league, dataType: 'scoreboard' },
        cleanContent: scoreboard,
    };
}
4195
+ // ---------------------------------------------------------------------------
4196
+ // 37. Sports betting sites — helpful redirect message
4197
+ // ---------------------------------------------------------------------------
4198
/**
 * Extractor for sportsbook sites (DraftKings, FanDuel, BetMGM).
 *
 * Makes no network request: it returns a fixed "blocked" notice that names
 * the brand and points the user at alternative data sources.
 *
 * @param {string} _html - Page HTML; unused.
 * @param {string} url - Page URL, used only to derive the brand and domain.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string}>}
 *   Always resolves; an unparseable URL yields the generic brand name and
 *   the domain 'sportsbook'.
 */
async function sportsBettingExtractor(_html, url) {
    let brandName = 'Sports Betting Site';
    let domain = 'sportsbook';
    try {
        // Strip only a leading "www." / "sportsbook." label, never a match mid-host.
        const hostname = new URL(url).hostname.replace(/^www\./, '').replace(/^sportsbook\./, '');
        domain = hostname;
        const brands = [
            ['draftkings', 'DraftKings Sportsbook'],
            ['fanduel', 'FanDuel Sportsbook'],
            ['betmgm', 'BetMGM Sportsbook'],
        ];
        const known = brands.find(([needle]) => hostname.includes(needle));
        if (known)
            brandName = known[1];
    }
    catch { /* unparseable URL: keep the generic brand and domain */ }
    const cleanContent = [
        `# ⚠️ ${brandName}`,
        '',
        `${brandName} requires authentication and geo-verification. WebPeel cannot scrape live odds directly.`,
        '',
        '**For live sports odds, use these alternatives:**',
        '- `webpeel "https://espn.com/nba/scoreboard"` — Live scores and schedules',
        '- `webpeel "https://polymarket.com"` — Prediction market prices',
        // The service's site is the-odds-api.com; "theOddsApi.com" was not a valid address for it.
        '- The Odds API (the-odds-api.com) — Aggregated odds from all sportsbooks (requires API key)',
        '',
        '**For team schedules and standings:**',
        '- `webpeel "https://espn.com/nba/standings"` — NBA standings',
        '- `webpeel "https://espn.com/nfl/scoreboard"` — NFL scores',
        '- `webpeel "https://espn.com/mlb/scoreboard"` — MLB scores',
    ].join('\n');
    return {
        domain,
        type: 'blocked',
        structured: { site: brandName, reason: 'authentication and geo-verification required' },
        cleanContent,
    };
}
4232
+ // ---------------------------------------------------------------------------
4233
+ // Semantic Scholar extractor (Semantic Scholar API — free, no key needed)
4234
+ // ---------------------------------------------------------------------------
4235
/**
 * Domain extractor for semanticscholar.org, backed by the Semantic Scholar
 * Graph API.
 *
 * Handles two page kinds:
 *  - paper pages (`/paper/<slug>/<40-hex id>` or `/paper/<40-hex id>`), and
 *  - search pages (`/search?q=...` or `?query=...`).
 *
 * @param {string} _html - Page HTML; unused.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` for unparseable URLs, unsupported paths, rate-limited or empty API
 *   responses, and any thrown error, so the caller can fall back to another
 *   strategy.
 */
async function semanticScholarExtractor(_html, url) {
    let urlObj;
    try {
        urlObj = new URL(url);
    }
    catch {
        // Match the other extractors: a bad URL is "no result", not a rejection.
        return null;
    }
    const path = urlObj.pathname;
    const domain = 'semanticscholar.org';
    // The API signals throttling in the JSON body.
    const isRateLimited = (body) => body.code === '429' ||
        Boolean(body.message && String(body.message).includes('Too Many Requests'));
    // Keep a value on one table row: escape column separators, fold newlines.
    const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/\s*[\r\n]+\s*/g, ' ');
    // --- Paper page: /paper/<title-slug>/<paperId> ---
    const paperMatch = path.match(/^\/paper\/(?:[^/]+\/)?([a-f0-9]{40})/i);
    if (paperMatch) {
        const paperId = paperMatch[1];
        try {
            const fields = 'title,abstract,authors,year,citationCount,referenceCount,url,openAccessPdf,venue,publicationDate,tldr';
            const apiUrl = `https://api.semanticscholar.org/graph/v1/paper/${paperId}?fields=${fields}`;
            const data = await fetchJson(apiUrl);
            if (!data)
                return null;
            // Rate limited — return null so pipeline falls back to browser rendering
            if (isRateLimited(data))
                return null;
            if (!data.title)
                return null;
            const authorNames = (data.authors || []).map((a) => a.name);
            const authorLine = authorNames.length <= 5
                ? authorNames.join(', ')
                : `${authorNames.slice(0, 5).join(', ')} (+${authorNames.length - 5} more)`;
            const pdfUrl = data.openAccessPdf?.url || null;
            const tldrText = data.tldr?.text || null;
            const citStr = data.citationCount != null ? data.citationCount.toLocaleString() : '?';
            const structured = {
                paperId,
                title: data.title,
                authors: authorNames,
                year: data.year,
                venue: data.venue,
                citationCount: data.citationCount,
                referenceCount: data.referenceCount,
                abstract: data.abstract,
                tldr: tldrText,
                pdfUrl,
                url: data.url,
                publicationDate: data.publicationDate,
            };
            const lines = [
                `# 📄 ${data.title}`,
                '',
                `**Authors:** ${authorLine}`,
                `**Year:** ${data.year || '?'} | **Venue:** ${data.venue || 'N/A'} | **Citations:** ${citStr}`,
            ];
            if (data.referenceCount != null)
                lines.push(`**References:** ${data.referenceCount.toLocaleString()}`);
            if (tldrText)
                lines.push('', '## TL;DR', '', tldrText);
            if (data.abstract)
                lines.push('', '## Abstract', '', data.abstract);
            lines.push('');
            if (pdfUrl)
                lines.push(`**PDF:** [Open Access](${pdfUrl})`);
            lines.push(`**Link:** [Semantic Scholar](${data.url || `https://www.semanticscholar.org/paper/${paperId}`})`);
            return {
                domain,
                type: 'paper',
                structured,
                cleanContent: lines.join('\n'),
            };
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'Semantic Scholar paper API failed:', e instanceof Error ? e.message : e);
            return null;
        }
    }
    // --- Search page: /search?q=... ---
    if (path !== '/search' && !path.startsWith('/search/'))
        return null;
    const query = urlObj.searchParams.get('q') || urlObj.searchParams.get('query');
    if (!query)
        return null;
    try {
        const fields = 'title,authors,year,citationCount,url,openAccessPdf';
        const apiUrl = `https://api.semanticscholar.org/graph/v1/paper/search?query=${encodeURIComponent(query)}&limit=10&fields=${fields}`;
        const data = await fetchJson(apiUrl);
        // Rate limited or no data — return null so pipeline falls back to browser rendering
        if (!data)
            return null;
        if (isRateLimited(data))
            return null;
        if (!Array.isArray(data.data))
            return null;
        const papers = data.data;
        const total = data.total || 0;
        const rows = papers.map((p, i) => {
            const authors = p.authors || [];
            let authorLine = '—';
            if (authors.length === 1)
                authorLine = authors[0].name;
            else if (authors.length > 1)
                authorLine = `${authors[0].name} et al.`;
            const paperUrl = p.url || `https://www.semanticscholar.org/paper/${p.paperId}`;
            const cits = p.citationCount != null ? p.citationCount.toLocaleString() : '?';
            return `| ${i + 1} | [${cell(p.title)}](${paperUrl}) | ${p.year || '?'} | ${cits} | ${cell(authorLine)} |`;
        }).join('\n');
        const cleanContent = [
            `# 🔍 Semantic Scholar — "${query}"`,
            '',
            '| # | Paper | Year | Citations | Authors |',
            '|---|-------|------|-----------|---------|',
            rows,
            '',
            `*Source: Semantic Scholar API · Total results: ${total.toLocaleString()}*`,
        ].join('\n');
        return {
            domain,
            type: 'search',
            structured: { query, total, papers },
            cleanContent,
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Semantic Scholar search API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
4362
+ // ---------------------------------------------------------------------------
4363
+ // PubMed extractor (NCBI E-utilities API — free, no key needed)
4364
+ // ---------------------------------------------------------------------------
4365
/**
 * Domain extractor for pubmed.ncbi.nlm.nih.gov, backed by the NCBI
 * E-utilities API.
 *
 * Handles two page kinds:
 *  - article pages (`/<pmid>` or `/<pmid>/`): summary via esummary plus a
 *    best-effort abstract via efetch, and
 *  - search pages (`?term=...` or `?query=...`): esearch for ids, then
 *    esummary for the result rows.
 *
 * @param {string} _html - Page HTML; unused.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` for unparseable URLs, unsupported paths, missing API data and any
 *   thrown error.
 */
async function pubmedExtractor(_html, url) {
    let urlObj;
    try {
        urlObj = new URL(url);
    }
    catch {
        // Match the other extractors: a bad URL is "no result", not a rejection.
        return null;
    }
    const path = urlObj.pathname;
    const domain = 'pubmed.ncbi.nlm.nih.gov';
    // Find the DOI of an esummary record. Prefer an `articleids` entry with
    // idtype 'doi' (NOTE(review): field taken from the esummary JSON — confirm);
    // otherwise pick the DOI out of `elocationid`, which may also carry other
    // identifiers such as "pii: ...". Returns null when there is no DOI.
    const extractDoi = (record) => {
        const listed = Array.isArray(record.articleids)
            ? record.articleids.find((x) => x?.idtype === 'doi')?.value
            : null;
        if (listed)
            return listed;
        const found = /\b10\.\d{4,9}\/\S+/.exec(record.elocationid || '');
        return found ? found[0].replace(/[.,;]+$/, '') : null;
    };
    // Keep a value on one table row: escape column separators, fold newlines.
    const cell = (value) => String(value).replace(/\|/g, '\\|').replace(/\s*[\r\n]+\s*/g, ' ');
    // --- Article page: /XXXXXX/ or /XXXXXX ---
    const pmidMatch = path.match(/^\/(\d+)\/?$/);
    if (pmidMatch) {
        const pmid = pmidMatch[1];
        try {
            // Fetch summary
            const summaryUrl = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=${pmid}&retmode=json`;
            const summaryData = await fetchJson(summaryUrl);
            if (!summaryData?.result)
                return null;
            const article = summaryData.result[pmid];
            if (!article)
                return null;
            // Fetch abstract via efetch
            let abstract = '';
            try {
                const efetchUrl = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=${pmid}&retmode=xml&rettype=abstract`;
                const efetchResult = await simpleFetch(efetchUrl, 'WebPeel/0.21', 15000, { Accept: 'application/xml' });
                const sections = efetchResult?.html
                    ? efetchResult.html.match(/<AbstractText[^>]*>([\s\S]*?)<\/AbstractText>/g)
                    : null;
                if (sections) {
                    abstract = sections.map((section) => {
                        const labelMatch = section.match(/Label="([^"]+)"/);
                        const textMatch = section.match(/<AbstractText[^>]*>([\s\S]*?)<\/AbstractText>/);
                        const text = textMatch ? stripHtml(textMatch[1]).trim() : '';
                        return labelMatch ? `**${labelMatch[1]}:** ${text}` : text;
                    }).join('\n\n');
                }
            }
            catch { /* abstract is optional */ }
            const authorNames = (article.authors || [])
                .filter((a) => a.authtype !== 'CollectiveName')
                .map((a) => a.name);
            const authorLine = authorNames.length <= 6
                ? authorNames.join(', ')
                : `${authorNames.slice(0, 6).join(', ')} et al.`;
            const doi = extractDoi(article);
            const pubDate = article.pubdate || '?';
            const journal = article.source || '?';
            const volume = article.volume ? ` ${article.volume}` : '';
            const issue = article.issue ? `(${article.issue})` : '';
            const pages = article.pages ? `:${article.pages}` : '';
            const structured = {
                pmid,
                title: article.title,
                authors: authorNames,
                journal,
                pubDate,
                volume: article.volume,
                issue: article.issue,
                pages: article.pages,
                doi,
                abstract: abstract || undefined,
                url: `https://pubmed.ncbi.nlm.nih.gov/${pmid}/`,
            };
            const lines = [
                `# 🧬 ${article.title}`,
                '',
                `**Authors:** ${authorLine}`,
                `**Journal:** *${journal}*${volume}${issue}${pages} (${pubDate})`,
                `**PMID:** ${pmid}`,
            ];
            if (doi)
                lines.push(`**DOI:** [${doi}](https://doi.org/${doi})`);
            if (abstract)
                lines.push('', '## Abstract', '', abstract);
            lines.push('', `**Link:** [PubMed](https://pubmed.ncbi.nlm.nih.gov/${pmid}/)`);
            return {
                domain,
                type: 'article',
                structured,
                cleanContent: lines.join('\n'),
            };
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'PubMed article API failed:', e instanceof Error ? e.message : e);
            return null;
        }
    }
    // --- Search page: /?term=... or /?query=... ---
    const term = urlObj.searchParams.get('term') || urlObj.searchParams.get('query');
    if (!term)
        return null;
    try {
        // Step 1: search for IDs
        const searchUrl = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=${encodeURIComponent(term)}&retmode=json&retmax=10`;
        const searchData = await fetchJson(searchUrl);
        if (!searchData?.esearchresult)
            return null;
        const esearch = searchData.esearchresult;
        const ids = esearch.idlist || [];
        const total = parseInt(esearch.count || '0', 10);
        if (ids.length === 0) {
            return {
                domain,
                type: 'search',
                structured: { query: term, total: 0, articles: [] },
                cleanContent: `# 🔍 PubMed — "${term}"\n\n*No results found.*`,
            };
        }
        // Step 2: fetch summaries
        const summaryUrl = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=${ids.join(',')}&retmode=json`;
        const summaryData = await fetchJson(summaryUrl);
        if (!summaryData?.result)
            return null;
        const result = summaryData.result;
        const articles = (result.uids || ids)
            .map((id) => {
                const record = result[id];
                if (!record)
                    return null;
                return {
                    pmid: id,
                    title: record.title,
                    journal: record.source,
                    pubDate: record.pubdate,
                    authors: (record.authors || []).map((x) => x.name),
                    doi: extractDoi(record),
                };
            })
            .filter(Boolean);
        const rows = articles.map((a, i) => {
            let authorLine = '—';
            if (a.authors.length === 1)
                authorLine = a.authors[0];
            else if (a.authors.length > 1)
                authorLine = `${a.authors[0]} et al.`;
            const link = `https://pubmed.ncbi.nlm.nih.gov/${a.pmid}/`;
            return `| ${i + 1} | [${cell(a.title)}](${link}) | *${cell(a.journal)}* | ${a.pubDate} | ${cell(authorLine)} |`;
        }).join('\n');
        const cleanContent = [
            `# 🔍 PubMed — "${term}"`,
            '',
            '| # | Article | Journal | Date | Authors |',
            '|---|---------|---------|------|---------|',
            rows,
            '',
            `*Source: NCBI PubMed E-utilities · Total results: ${total.toLocaleString()}*`,
        ].join('\n');
        return {
            domain,
            type: 'search',
            structured: { query: term, total, articles },
            cleanContent,
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'PubMed search API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
4522
+ // ---------------------------------------------------------------------------
4523
+ // 38. CoinGecko extractor — crypto prices via free CoinGecko API
4524
+ // ---------------------------------------------------------------------------
4525
/**
 * Domain extractor for coingecko.com, backed by the public CoinGecko API.
 *
 * Coin pages (`/coins/<id>`, optionally under a locale prefix such as `/en/`
 * or `/de/`) render a quote card for that coin. Every other path renders a
 * table of the top 15 coins by market cap.
 *
 * @param {string} _html - Page HTML; unused.
 * @param {string} url - Page URL.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   `null` for unparseable URLs, empty or error API responses and any thrown
 *   error.
 */
async function coinGeckoExtractor(_html, url) {
    let urlObj;
    try {
        urlObj = new URL(url);
    }
    catch {
        // Match the other extractors: a bad URL is "no result", not a rejection.
        return null;
    }
    const path = urlObj.pathname;
    const domain = 'coingecko.com';
    const cgHeaders = {
        'Accept': 'application/json',
        'User-Agent': 'webpeel/0.21 (https://webpeel.dev)',
    };
    const twoDecimals = { minimumFractionDigits: 2, maximumFractionDigits: 2 };
    // Compact dollar amount: $1.23T / $4.56B / $7.89M, else a grouped 2-decimal figure.
    const fmtMoney = (v) => {
        if (v == null || isNaN(v))
            return '?';
        if (v >= 1_000_000_000_000)
            return `$${(v / 1_000_000_000_000).toFixed(2)}T`;
        if (v >= 1_000_000_000)
            return `$${(v / 1_000_000_000).toFixed(2)}B`;
        if (v >= 1_000_000)
            return `$${(v / 1_000_000).toFixed(2)}M`;
        return `$${v.toLocaleString('en-US', twoDecimals)}`;
    };
    // Unit price: more decimals the smaller the price.
    const fmtPrice = (v) => {
        if (v == null || isNaN(v))
            return '?';
        if (v >= 1000)
            return `$${v.toLocaleString('en-US', twoDecimals)}`;
        return v >= 1 ? `$${v.toFixed(4)}` : `$${v.toFixed(8)}`;
    };
    // Signed percentage with one decimal.
    const fmtChange = (c) => {
        if (c == null || isNaN(c))
            return '?';
        return `${c >= 0 ? '+' : ''}${c.toFixed(1)}%`;
    };
    // Coin detail page: /coins/<coin-id>, with or without a locale prefix
    // (/en/coins/..., /de/coins/..., /zh-tw/coins/...).
    const coinMatch = path.match(/^\/(?:[a-z]{2}(?:[-_][a-z]{2,4})?\/)?coins\/([^/?#]+)\/?/i);
    if (coinMatch) {
        const coinId = coinMatch[1].toLowerCase();
        try {
            const apiUrl = `https://api.coingecko.com/api/v3/coins/${encodeURIComponent(coinId)}?localization=false&tickers=false&community_data=false&developer_data=false`;
            const data = await fetchJson(apiUrl, cgHeaders);
            if (!data || data.error)
                return null;
            const md = data.market_data || {};
            const price = md.current_price?.usd;
            const change24h = md.price_change_percentage_24h;
            const change7d = md.price_change_percentage_7d;
            const marketCap = md.market_cap?.usd;
            const volume = md.total_volume?.usd;
            const ath = md.ath?.usd;
            const circulatingSupply = md.circulating_supply;
            const name = data.name || coinId;
            const symbol = (data.symbol || '').toUpperCase();
            // First paragraph of the description, tags removed, capped at 500 characters.
            const description = data.description?.en?.replace(/<[^>]+>/g, '').split('\r\n')[0]?.slice(0, 500) || '';
            const updatedAt = data.last_updated || new Date().toISOString();
            const structuredData = {
                id: coinId,
                name,
                symbol,
                price_usd: price,
                change_24h: change24h,
                change_7d: change7d,
                market_cap_usd: marketCap,
                volume_24h_usd: volume,
                ath_usd: ath,
                circulating_supply: circulatingSupply,
                max_supply: md.max_supply,
                last_updated: updatedAt,
            };
            let cleanContent = `# 🪙 ${name} (${symbol})\n\n`;
            cleanContent += `## Quote\n`;
            cleanContent += `- **Price:** ${fmtPrice(price)}\n`;
            cleanContent += `- **24h Change:** ${fmtChange(change24h)}\n`;
            if (change7d != null)
                cleanContent += `- **7d Change:** ${fmtChange(change7d)}\n`;
            cleanContent += `- **Market Cap:** ${fmtMoney(marketCap)}\n`;
            cleanContent += `- **24h Volume:** ${fmtMoney(volume)}\n`;
            if (ath != null)
                cleanContent += `- **All-Time High:** ${fmtPrice(ath)}\n`;
            if (circulatingSupply) {
                let supply;
                if (circulatingSupply >= 1_000_000_000)
                    supply = `${(circulatingSupply / 1_000_000_000).toFixed(2)}B`;
                else if (circulatingSupply >= 1_000_000)
                    supply = `${(circulatingSupply / 1_000_000).toFixed(2)}M`;
                else
                    supply = circulatingSupply.toLocaleString();
                cleanContent += `- **Circulating Supply:** ${supply} ${symbol}\n`;
            }
            if (description) {
                cleanContent += `\n## Description\n${description}\n`;
            }
            cleanContent += `\n---\n*Source: CoinGecko API · Updated: ${updatedAt}*`;
            return { domain, type: 'coin', structured: structuredData, cleanContent };
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'CoinGecko coin API failed:', e instanceof Error ? e.message : e);
            return null;
        }
    }
    // Main page / markets overview: coingecko.com or coingecko.com/en
    try {
        const apiUrl = `https://api.coingecko.com/api/v3/coins/markets?vs_currency=usd&order=market_cap_desc&per_page=15&page=1`;
        const coins = await fetchJson(apiUrl, cgHeaders);
        if (!Array.isArray(coins) || coins.length === 0)
            return null;
        const top = coins.slice(0, 15);
        const rows = top.map((c, i) => {
            const change = c.price_change_percentage_24h;
            const changeStr = change != null ? `${change >= 0 ? '+' : ''}${change.toFixed(1)}%` : '?';
            return `| ${i + 1} | ${c.name} (${(c.symbol || '').toUpperCase()}) | ${fmtPrice(c.current_price)} | ${changeStr} | ${fmtMoney(c.market_cap)} |`;
        });
        const cleanContent = `# 🪙 CoinGecko — Top Cryptocurrencies\n\n` +
            `| # | Coin | Price | 24h | Market Cap |\n` +
            `|---|------|-------|-----|------------|\n` +
            rows.join('\n') +
            `\n\n---\n*Source: CoinGecko API · Updated: ${new Date().toISOString()}*`;
        return {
            domain,
            type: 'markets',
            structured: { coins: top },
            cleanContent,
        };
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'CoinGecko markets API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}
4655
+ // ---------------------------------------------------------------------------
4656
+ // 39. Weather extractor — Open-Meteo free API (no key required)
4657
+ // ---------------------------------------------------------------------------
4658
+ // Weather code descriptions (WMO)
4659
/**
 * WMO weather interpretation codes mapped to short human-readable labels.
 *
 * Looked up by `weatherExtractor` for both the current-conditions line and the
 * per-day forecast rows. Codes absent from this table fall back to
 * 'Unknown' (current) or an empty string (daily) at the lookup site, so the
 * table should cover every code the Open-Meteo API documents:
 * 0-3, 45/48, 51-57, 61-67, 71-77, 80-82, 85/86, 95, 96/99.
 */
const WMO_CODES = {
    0: 'Clear sky', 1: 'Mainly clear', 2: 'Partly cloudy', 3: 'Overcast',
    45: 'Foggy', 48: 'Icy fog',
    51: 'Light drizzle', 53: 'Moderate drizzle', 55: 'Dense drizzle',
    // Freezing drizzle (previously missing → rendered as "Unknown")
    56: 'Light freezing drizzle', 57: 'Dense freezing drizzle',
    61: 'Slight rain', 63: 'Moderate rain', 65: 'Heavy rain',
    // Freezing rain (previously missing → rendered as "Unknown")
    66: 'Light freezing rain', 67: 'Heavy freezing rain',
    71: 'Slight snow', 73: 'Moderate snow', 75: 'Heavy snow',
    // Snow grains (previously missing → rendered as "Unknown")
    77: 'Snow grains',
    80: 'Slight showers', 81: 'Moderate showers', 82: 'Violent showers',
    85: 'Slight snow showers', 86: 'Heavy snow showers',
    95: 'Thunderstorm', 96: 'Thunderstorm w/ hail', 99: 'Thunderstorm w/ heavy hail',
};
4669
/**
 * Emoji shown next to a WMO weather interpretation code.
 *
 * Looked up by `weatherExtractor` for the page heading and for each daily
 * forecast row. Codes absent from this table fall back to a generic icon
 * (heading) or to no icon (daily rows) at the lookup site, so this table
 * covers every code the Open-Meteo API documents, including the freezing
 * drizzle / freezing rain / snow grains codes (56, 57, 66, 67, 77).
 */
const WEATHER_ICONS = {
    0: '☀️', 1: '🌤️', 2: '⛅', 3: '☁️',
    45: '🌫️', 48: '🌫️',
    51: '🌦️', 53: '🌦️', 55: '🌧️',
    56: '🌧️', 57: '🌧️',
    61: '🌦️', 63: '🌧️', 65: '🌧️',
    66: '🌧️', 67: '🌧️',
    71: '🌨️', 73: '❄️', 75: '❄️',
    77: '🌨️',
    80: '🌦️', 81: '🌧️', 82: '⛈️',
    85: '🌨️', 86: '❄️',
    95: '⛈️', 96: '⛈️', 99: '⛈️',
};
4679
+ // Default city coordinates for common weather sites
4680
/**
 * Fallback location for `weatherExtractor`: used whenever the requested URL
 * yields no latitude/longitude. `tz` is the timezone name sent to the
 * forecast API together with these coordinates.
 */
const DEFAULT_CITY = {
    name: 'New York City',
    lat: 40.7128,
    lon: -74.0060,
    tz: 'America/New_York',
};
4681
/**
 * Parse a latitude/longitude pair taken from URL text.
 *
 * @param {string} latText - Raw latitude text.
 * @param {string} lonText - Raw longitude text.
 * @returns {{lat: number, lon: number} | null} The parsed pair, or null when
 *   either value is not a finite number or lies outside ±90 / ±180.
 */
function parseWeatherCoords(latText, lonText) {
    const lat = Number.parseFloat(latText);
    const lon = Number.parseFloat(lonText);
    // parseFloat yields NaN for garbage input; NaN would otherwise slip past a
    // `== null` check and end up in both the label and the API request.
    if (!Number.isFinite(lat) || !Number.isFinite(lon))
        return null;
    if (Math.abs(lat) > 90 || Math.abs(lon) > 180)
        return null;
    return { lat, lon };
}
/**
 * Format coordinates as an unsigned value plus hemisphere letter,
 * e.g. (-33.45, -70.67) → "33.45°S, 70.67°W".
 *
 * @param {number} lat - Latitude in decimal degrees.
 * @param {number} lon - Longitude in decimal degrees.
 * @returns {string} Label with two decimals per coordinate.
 */
function formatWeatherCoordLabel(lat, lon) {
    const ns = lat >= 0 ? 'N' : 'S';
    const ew = lon >= 0 ? 'E' : 'W';
    return `${Math.abs(lat).toFixed(2)}°${ns}, ${Math.abs(lon).toFixed(2)}°${ew}`;
}
/**
 * Build a 7-day weather forecast from the Open-Meteo forecast API.
 *
 * Coordinates are taken from the URL when possible:
 *   - open-meteo.com hosts: `latitude` / `longitude` (and optional `timezone`)
 *     query parameters;
 *   - weather.com / accuweather.com hosts: a `/l/<lat>,<lon>` path segment.
 * Missing, unparsable or out-of-range coordinates fall back to DEFAULT_CITY.
 *
 * When coordinates come from the URL and no timezone was supplied, the API is
 * asked for `timezone=auto` so daily rows follow the location's own calendar
 * day instead of the default city's.
 *
 * @param {string} _html - Page HTML; unused, the data comes from the API.
 * @param {string} url - URL of the page being extracted.
 * @returns {Promise<{domain: string, type: string, structured: object, cleanContent: string} | null>}
 *   The forecast result, or null when the API call fails or reports an error.
 * @throws {TypeError} If `url` is not a valid absolute URL (from `new URL`).
 */
async function weatherExtractor(_html, url) {
    const urlObj = new URL(url);
    const hostname = urlObj.hostname;
    let coords = null;
    let cityName = DEFAULT_CITY.name;
    let timezone = DEFAULT_CITY.tz;
    // Determine lat/lon from URL params (for open-meteo.com direct API links)
    if (hostname.includes('open-meteo.com')) {
        const latParam = urlObj.searchParams.get('latitude');
        const lonParam = urlObj.searchParams.get('longitude');
        const tzParam = urlObj.searchParams.get('timezone');
        if (latParam && lonParam) {
            coords = parseWeatherCoords(latParam, lonParam);
            if (coords) {
                cityName = formatWeatherCoordLabel(coords.lat, coords.lon);
                // Honour an explicit timezone; otherwise let the API resolve it
                // from the coordinates rather than assuming the default city's.
                timezone = tzParam || 'auto';
            }
        }
    }
    // For weather.com / accuweather: try to extract city from URL path
    if (hostname.includes('weather.com') || hostname.includes('accuweather.com')) {
        // weather.com: /weather/today/l/40.71,-74.01:4:US or similar
        const coordMatch = urlObj.pathname.match(/\/l\/(-?\d+\.?\d*),(-?\d+\.?\d*)/);
        const parsed = coordMatch ? parseWeatherCoords(coordMatch[1], coordMatch[2]) : null;
        if (parsed) {
            coords = parsed;
            cityName = `${parsed.lat.toFixed(2)}, ${parsed.lon.toFixed(2)}`;
            timezone = 'auto';
        }
    }
    // Default to NYC if no usable coords were found
    if (coords == null) {
        cityName = DEFAULT_CITY.name;
        timezone = DEFAULT_CITY.tz;
    }
    const { lat, lon } = coords ?? { lat: DEFAULT_CITY.lat, lon: DEFAULT_CITY.lon };
    try {
        const apiUrl = `https://api.open-meteo.com/v1/forecast?latitude=${lat}&longitude=${lon}&current=temperature_2m,relative_humidity_2m,wind_speed_10m,weather_code&daily=temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code&timezone=${encodeURIComponent(timezone)}&forecast_days=7`;
        const data = await fetchJson(apiUrl);
        if (!data || data.error)
            return null;
        const current = data.current || {};
        const daily = data.daily || {};
        const tempC = current.temperature_2m;
        const tempF = tempC != null ? Math.round(tempC * 9 / 5 + 32) : null;
        const humidity = current.relative_humidity_2m;
        const wind = current.wind_speed_10m;
        const wCode = current.weather_code;
        const condition = WMO_CODES[wCode] || 'Unknown';
        const icon = WEATHER_ICONS[wCode] || '🌡️';
        let cleanContent = `# ${icon} Weather Forecast — ${cityName}\n\n`;
        // The "Current" line is only emitted when a temperature is available.
        if (tempC != null) {
            cleanContent += `**Current:** ${tempC}°C (${tempF}°F)`;
            if (wind != null)
                cleanContent += `, Wind: ${wind} km/h`;
            if (humidity != null)
                cleanContent += `, Humidity: ${humidity}%`;
            cleanContent += `, ${condition}\n\n`;
        }
        if (daily.time?.length) {
            cleanContent += `| Date | Low | High | Precip | Condition |\n`;
            cleanContent += `|------|-----|------|--------|----------|\n`;
            const dayCount = Math.min(daily.time.length, 7);
            for (let i = 0; i < dayCount; i++) {
                const date = daily.time[i];
                const low = daily.temperature_2m_min?.[i];
                const high = daily.temperature_2m_max?.[i];
                const precip = daily.precipitation_sum?.[i];
                const dayCode = daily.weather_code?.[i];
                const dayIcon = WEATHER_ICONS[dayCode] || '';
                const dayCondition = WMO_CODES[dayCode] || '';
                const lowStr = low != null ? `${low}°C` : '?';
                const highStr = high != null ? `${high}°C` : '?';
                const precipStr = precip != null ? `${precip}mm` : '0mm';
                cleanContent += `| ${date} | ${lowStr} | ${highStr} | ${precipStr} | ${dayIcon} ${dayCondition} |\n`;
            }
        }
        cleanContent += `\n---\n*Source: Open-Meteo API · Coordinates: ${lat}, ${lon} · Updated: ${data.current?.time || new Date().toISOString()}*`;
        return {
            domain: 'open-meteo.com',
            type: 'forecast',
            structured: {
                city: cityName,
                lat,
                lon,
                // Prefer the zone the API actually resolved (relevant for 'auto').
                timezone: data.timezone || timezone,
                current: {
                    temperature_c: tempC,
                    temperature_f: tempF,
                    humidity,
                    wind_speed_kmh: wind,
                    condition,
                    weather_code: wCode,
                },
                daily: daily,
            },
            cleanContent,
        };
    }
    catch (e) {
        // Best-effort extractor: report the failure only in debug mode.
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'Weather API failed:', e instanceof Error ? e.message : e);
        return null;
    }
}