mcp-scraper 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4628,9 +4628,12 @@ function parsePageData(url, html, status, via) {
4628
4628
  const bodyText = html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
4629
4629
  const wordCount = bodyText.split(" ").filter((w) => w.length > 2).length;
4630
4630
  const schemaTypes = [];
4631
+ const schema = [];
4631
4632
  for (const m of html.matchAll(/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi)) {
4632
4633
  try {
4633
4634
  const parsed = JSON.parse(m[1]);
4635
+ if (Array.isArray(parsed)) schema.push(...parsed);
4636
+ else schema.push(parsed);
4634
4637
  const collect = (obj) => {
4635
4638
  if (!obj || typeof obj !== "object") return;
4636
4639
  const o = obj;
@@ -4642,6 +4645,13 @@ function parsePageData(url, html, status, via) {
4642
4645
  } catch {
4643
4646
  }
4644
4647
  }
4648
+ const mainHtml = (html.match(/<main[^>]*>([\s\S]*?)<\/main>/i)?.[1] ?? html.match(/<article[^>]*>([\s\S]*?)<\/article>/i)?.[1] ?? html.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[1] ?? html).replace(/<(script|style|nav|header|footer|aside|noscript|svg|iframe)[^>]*>[\s\S]*?<\/\1>/gi, "");
4649
+ let bodyMarkdown = "";
4650
+ try {
4651
+ bodyMarkdown = turndown.turndown(mainHtml).replace(/\n{3,}/g, "\n\n").trim().slice(0, MAX_PAGE_MARKDOWN);
4652
+ } catch {
4653
+ bodyMarkdown = bodyText.slice(0, MAX_PAGE_MARKDOWN);
4654
+ }
4645
4655
  let internalLinks = 0;
4646
4656
  let externalLinks = 0;
4647
4657
  for (const m of html.matchAll(/href\s*=\s*["']([^"'\s>]+)/gi)) {
@@ -4652,7 +4662,7 @@ function parsePageData(url, html, status, via) {
4652
4662
  } catch {
4653
4663
  }
4654
4664
  }
4655
- return { url, status, via, title, metaDescription, h1, headings, wordCount, schemaTypes, canonicalUrl, internalLinks, externalLinks };
4665
+ return { url, status, via, title, metaDescription, h1, headings, wordCount, schemaTypes, canonicalUrl, internalLinks, externalLinks, bodyMarkdown, schema };
4656
4666
  }
4657
4667
  async function fetchAndParse(url, kernelApiKey) {
4658
4668
  try {
@@ -4667,13 +4677,13 @@ async function fetchAndParse(url, kernelApiKey) {
4667
4677
  const html = await res.text();
4668
4678
  return parsePageData(url, html, res.status, "fetch");
4669
4679
  }
4670
- return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
4680
+ return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
4671
4681
  }
4672
4682
  if ((res.status === 403 || res.status === 429) && kernelApiKey) {
4673
4683
  const html = await fetchWithKernel(url);
4674
4684
  return parsePageData(url, html, 200, "browser");
4675
4685
  }
4676
- return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
4686
+ return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
4677
4687
  } catch {
4678
4688
  if (kernelApiKey) {
4679
4689
  try {
@@ -4682,7 +4692,7 @@ async function fetchAndParse(url, kernelApiKey) {
4682
4692
  } catch {
4683
4693
  }
4684
4694
  }
4685
- return { url, status: null, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
4695
+ return { url, status: null, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
4686
4696
  }
4687
4697
  }
4688
4698
  async function runWithConcurrency(items, concurrency, fn) {
@@ -4723,14 +4733,17 @@ async function extractSite(opts) {
4723
4733
  browserRetries
4724
4734
  };
4725
4735
  }
4726
- var UA2, EXTRACT_CONCURRENCY;
4736
+ var import_turndown2, UA2, EXTRACT_CONCURRENCY, MAX_PAGE_MARKDOWN, turndown;
4727
4737
  // Initializer for the bundled module keyed "src/api/site-extractor.ts".
  // It assigns the hoisted module-level variables declared just above
  // (import_turndown2, UA2, EXTRACT_CONCURRENCY, MAX_PAGE_MARKDOWN, turndown).
  // NOTE(review): __esm / __toESM look like the bundler's lazy-init and
  // CJS-interop helpers; their definitions are not visible here — confirm.
  var init_site_extractor = __esm({
4728
4738
  "src/api/site-extractor.ts"() {
4729
4739
  "use strict";
4740
  // Load the "turndown" package; the constructor is read from `.default` below.
+ import_turndown2 = __toESM(require("turndown"), 1);
4730
4741
  // Run the initializers of the modules this one depends on before setting
  // this module's own values.
  init_site_mapper();
4731
4742
  init_kernel_fetch();
4732
4743
  // NOTE(review): presumably the User-Agent sent with plain fetches; the
  // usage site is not visible in this view — confirm.
  UA2 = "Mozilla/5.0 (compatible; ThorbitBot/1.0; +https://thorbit.ai)";
4733
4744
  // NOTE(review): presumably the parallelism limit for page extraction; the
  // usage site is not visible in this view — confirm.
  EXTRACT_CONCURRENCY = 6;
4745
  // Upper bound (4e4 = 40000) that parsePageData passes to .slice() when
  // producing bodyMarkdown, on both the turndown path and the plain-text fallback.
+ MAX_PAGE_MARKDOWN = 4e4;
4746
  // Shared converter instance; parsePageData calls turndown.turndown(mainHtml).
  // Options request "atx" headings, "-" bullet markers and fenced code blocks.
+ turndown = new import_turndown2.default({ headingStyle: "atx", bulletListMarker: "-", codeBlockStyle: "fenced" });
4734
4747
  }
4735
4748
  });
4736
4749
 
@@ -14937,6 +14950,9 @@ function truncate(s, max) {
14937
14950
  if (!s) return "";
14938
14951
  return s.length > max ? s.slice(0, max) + "\u2026" : s;
14939
14952
  }
14953
/**
 * Sanitizes a value so it can be embedded in a single Markdown table cell.
 *
 * The value is stringified (null/undefined become ""), every whitespace run
 * (including CR/LF) is collapsed to one space so the row stays on one line,
 * pipes are escaped so they are not read as column delimiters, and the
 * result is trimmed.
 *
 * @param {unknown} s - Value to render inside the cell.
 * @returns {string} Single-line, trimmed text with every pipe escaped.
 */
function cell(s) {
  return String(s ?? "")
    // One pass covers newlines too; a separate CR/LF replacement is redundant.
    .replace(/\s+/g, " ")
    // Backslashes sitting directly in front of a pipe are doubled before the
    // escape is added. Otherwise "a\|b" would become "a\\|b": an escaped
    // backslash followed by a live column delimiter.
    .replace(/(\\*)\|/g, "$1$1\\|")
    .trim();
}
14940
14956
  function debugSection(debug) {
14941
14957
  if (!debug || typeof debug !== "object") return "";
14942
14958
  const request = debug.request ?? {};
@@ -14990,14 +15006,14 @@ function formatHarvestPaa(raw, input) {
14990
15006
  const diagnostics = d.diagnostics;
14991
15007
  const durationMs = d.stats?.durationMs;
14992
15008
  const paaRows = flat.map(
14993
- (r, i) => `| ${i + 1} | ${r.question} | ${truncate(r.answer, 120)} | ${r.source_site ?? ""} |`
15009
+ (r, i) => `| ${i + 1} | ${cell(r.question)} | ${cell(truncate(r.answer, 120))} | ${cell(r.source_title || r.source_site || "")} |`
14994
15010
  ).join("\n");
14995
15011
  const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
14996
15012
  | # | Question | Answer | Source |
14997
15013
  |---|----------|--------|--------|
14998
15014
  ${paaRows}` : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";
14999
15015
  const serpRows = organic.map(
15000
- (r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
15016
+ (r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
15001
15017
  ).join("\n");
15002
15018
  const serpTable = organic.length ? `
15003
15019
  ## Organic Results (${organic.length})
@@ -15045,14 +15061,14 @@ function formatSearchSerp(raw, input) {
15045
15061
  const aiOvw = d.aiOverview;
15046
15062
  const diagnostics = d.diagnostics;
15047
15063
  const serpRows = organic.map(
15048
- (r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
15064
+ (r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
15049
15065
  ).join("\n");
15050
15066
  const serpTable = organic.length ? `## Organic Results (${organic.length})
15051
15067
  | # | Title | URL | Snippet |
15052
15068
  |---|-------|-----|----------|
15053
15069
  ${serpRows}` : "## Organic Results\n*None found*";
15054
15070
  const localRows = localPack.map(
15055
- (b) => `| ${b.position} | ${b.name} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
15071
+ (b) => `| ${b.position} | ${cell(b.name)} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
15056
15072
  ).join("\n");
15057
15073
  const localSection = localPack.length ? `
15058
15074
  ## Local Pack (${localPack.length})
@@ -15213,7 +15229,7 @@ function formatExtractSite(raw, input) {
15213
15229
  const pages = d.pages ?? [];
15214
15230
  const pageRows = pages.map((p, i) => {
15215
15231
  const schemaInfo = p.kpo?.type?.join(", ") ?? (Array.isArray(p.schema) && p.schema.length ? `${p.schema.length} block(s)` : "\u2014");
15216
- return `| ${i + 1} | ${p.title ?? "Untitled"} | ${p.url} | ${schemaInfo} |`;
15232
+ return `| ${i + 1} | ${cell(p.title ?? "Untitled")} | ${p.url} | ${schemaInfo} |`;
15217
15233
  }).join("\n");
15218
15234
  const full = [
15219
15235
  `# Site Extract: ${input.url}`,
@@ -15245,7 +15261,7 @@ function formatYoutubeHarvest(raw, input) {
15245
15261
  const videos = d.videos ?? [];
15246
15262
  const label = input.mode === "channel" ? input.channelHandle ?? "channel" : `"${input.query ?? ""}"`;
15247
15263
  const videoRows = videos.map(
15248
- (v, i) => `| ${i + 1} | ${truncate(v.title, 70)} | ${v.channelName} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
15264
+ (v, i) => `| ${i + 1} | ${cell(truncate(v.title, 70))} | ${cell(v.channelName)} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
15249
15265
  ).join("\n");
15250
15266
  const channelSection = d.channelMeta ? `
15251
15267
  ## Channel
@@ -15288,7 +15304,7 @@ function formatYoutubeTranscribe(raw, input) {
15288
15304
  const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
15289
15305
  const mm = String(Math.floor(sec / 60)).padStart(2, "0");
15290
15306
  const ss = String(sec % 60).padStart(2, "0");
15291
- return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
15307
+ return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
15292
15308
  }).join("\n");
15293
15309
  const full = [
15294
15310
  `# YouTube Transcript: \`${input.videoId}\``,
@@ -15361,7 +15377,7 @@ function formatFacebookAdSearch(raw, input) {
15361
15377
  const d = parsed.data;
15362
15378
  const advertisers = d.results ?? d.advertisers ?? [];
15363
15379
  const rows = advertisers.map(
15364
- (a, i) => `| ${i + 1} | ${a.name} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
15380
+ (a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
15365
15381
  ).join("\n");
15366
15382
  const full = [
15367
15383
  `# Facebook Ad Library Search: "${input.query}"`,
@@ -15558,7 +15574,7 @@ function formatFacebookAdTranscribe(raw, input) {
15558
15574
  const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
15559
15575
  const mm = String(Math.floor(sec / 60)).padStart(2, "0");
15560
15576
  const ss = String(sec % 60).padStart(2, "0");
15561
- return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
15577
+ return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
15562
15578
  }).join("\n");
15563
15579
  const full = [
15564
15580
  `# Facebook Ad Transcript`,