mcp-scraper 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/api-server.cjs +40 -24
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +1 -1
- package/dist/bin/mcp-stdio-server.cjs +12 -9
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/{chunk-4743MZHT.js → chunk-JQKZWEON.js} +13 -10
- package/dist/chunk-JQKZWEON.js.map +1 -0
- package/dist/{server-N7Q6H4OR.js → server-6CHHLOII.js} +29 -16
- package/dist/server-6CHHLOII.js.map +1 -0
- package/package.json +16 -16
- package/dist/chunk-4743MZHT.js.map +0 -1
- package/dist/server-N7Q6H4OR.js.map +0 -1
package/dist/bin/api-server.cjs
CHANGED
|
@@ -4628,9 +4628,12 @@ function parsePageData(url, html, status, via) {
|
|
|
4628
4628
|
const bodyText = html.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
4629
4629
|
const wordCount = bodyText.split(" ").filter((w) => w.length > 2).length;
|
|
4630
4630
|
const schemaTypes = [];
|
|
4631
|
+
const schema = [];
|
|
4631
4632
|
for (const m of html.matchAll(/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi)) {
|
|
4632
4633
|
try {
|
|
4633
4634
|
const parsed = JSON.parse(m[1]);
|
|
4635
|
+
if (Array.isArray(parsed)) schema.push(...parsed);
|
|
4636
|
+
else schema.push(parsed);
|
|
4634
4637
|
const collect = (obj) => {
|
|
4635
4638
|
if (!obj || typeof obj !== "object") return;
|
|
4636
4639
|
const o = obj;
|
|
@@ -4642,6 +4645,13 @@ function parsePageData(url, html, status, via) {
|
|
|
4642
4645
|
} catch {
|
|
4643
4646
|
}
|
|
4644
4647
|
}
|
|
4648
|
+
const mainHtml = (html.match(/<main[^>]*>([\s\S]*?)<\/main>/i)?.[1] ?? html.match(/<article[^>]*>([\s\S]*?)<\/article>/i)?.[1] ?? html.match(/<body[^>]*>([\s\S]*?)<\/body>/i)?.[1] ?? html).replace(/<(script|style|nav|header|footer|aside|noscript|svg|iframe)[^>]*>[\s\S]*?<\/\1>/gi, "");
|
|
4649
|
+
let bodyMarkdown = "";
|
|
4650
|
+
try {
|
|
4651
|
+
bodyMarkdown = turndown.turndown(mainHtml).replace(/\n{3,}/g, "\n\n").trim().slice(0, MAX_PAGE_MARKDOWN);
|
|
4652
|
+
} catch {
|
|
4653
|
+
bodyMarkdown = bodyText.slice(0, MAX_PAGE_MARKDOWN);
|
|
4654
|
+
}
|
|
4645
4655
|
let internalLinks = 0;
|
|
4646
4656
|
let externalLinks = 0;
|
|
4647
4657
|
for (const m of html.matchAll(/href\s*=\s*["']([^"'\s>]+)/gi)) {
|
|
@@ -4652,7 +4662,7 @@ function parsePageData(url, html, status, via) {
|
|
|
4652
4662
|
} catch {
|
|
4653
4663
|
}
|
|
4654
4664
|
}
|
|
4655
|
-
return { url, status, via, title, metaDescription, h1, headings, wordCount, schemaTypes, canonicalUrl, internalLinks, externalLinks };
|
|
4665
|
+
return { url, status, via, title, metaDescription, h1, headings, wordCount, schemaTypes, canonicalUrl, internalLinks, externalLinks, bodyMarkdown, schema };
|
|
4656
4666
|
}
|
|
4657
4667
|
async function fetchAndParse(url, kernelApiKey) {
|
|
4658
4668
|
try {
|
|
@@ -4667,13 +4677,13 @@ async function fetchAndParse(url, kernelApiKey) {
|
|
|
4667
4677
|
const html = await res.text();
|
|
4668
4678
|
return parsePageData(url, html, res.status, "fetch");
|
|
4669
4679
|
}
|
|
4670
|
-
return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
|
|
4680
|
+
return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
|
|
4671
4681
|
}
|
|
4672
4682
|
if ((res.status === 403 || res.status === 429) && kernelApiKey) {
|
|
4673
4683
|
const html = await fetchWithKernel(url);
|
|
4674
4684
|
return parsePageData(url, html, 200, "browser");
|
|
4675
4685
|
}
|
|
4676
|
-
return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
|
|
4686
|
+
return { url, status: res.status, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
|
|
4677
4687
|
} catch {
|
|
4678
4688
|
if (kernelApiKey) {
|
|
4679
4689
|
try {
|
|
@@ -4682,7 +4692,7 @@ async function fetchAndParse(url, kernelApiKey) {
|
|
|
4682
4692
|
} catch {
|
|
4683
4693
|
}
|
|
4684
4694
|
}
|
|
4685
|
-
return { url, status: null, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0 };
|
|
4695
|
+
return { url, status: null, via: "fetch", title: null, metaDescription: null, h1: null, headings: [], wordCount: 0, schemaTypes: [], canonicalUrl: null, internalLinks: 0, externalLinks: 0, bodyMarkdown: "", schema: [] };
|
|
4686
4696
|
}
|
|
4687
4697
|
}
|
|
4688
4698
|
async function runWithConcurrency(items, concurrency, fn) {
|
|
@@ -4723,14 +4733,17 @@ async function extractSite(opts) {
|
|
|
4723
4733
|
browserRetries
|
|
4724
4734
|
};
|
|
4725
4735
|
}
|
|
4726
|
-
var UA2, EXTRACT_CONCURRENCY;
|
|
4736
|
+
var import_turndown2, UA2, EXTRACT_CONCURRENCY, MAX_PAGE_MARKDOWN, turndown;
|
|
4727
4737
|
var init_site_extractor = __esm({
|
|
4728
4738
|
"src/api/site-extractor.ts"() {
|
|
4729
4739
|
"use strict";
|
|
4740
|
+
import_turndown2 = __toESM(require("turndown"), 1);
|
|
4730
4741
|
init_site_mapper();
|
|
4731
4742
|
init_kernel_fetch();
|
|
4732
4743
|
UA2 = "Mozilla/5.0 (compatible; ThorbitBot/1.0; +https://thorbit.ai)";
|
|
4733
4744
|
EXTRACT_CONCURRENCY = 6;
|
|
4745
|
+
MAX_PAGE_MARKDOWN = 4e4;
|
|
4746
|
+
turndown = new import_turndown2.default({ headingStyle: "atx", bulletListMarker: "-", codeBlockStyle: "fenced" });
|
|
4734
4747
|
}
|
|
4735
4748
|
});
|
|
4736
4749
|
|
|
@@ -14937,6 +14950,9 @@ function truncate(s, max) {
|
|
|
14937
14950
|
if (!s) return "";
|
|
14938
14951
|
return s.length > max ? s.slice(0, max) + "\u2026" : s;
|
|
14939
14952
|
}
|
|
14953
|
+
/**
 * Make a value safe to place inside a single Markdown table cell.
 *
 * Nullish input becomes an empty string; anything else is stringified.
 * Line breaks and every other whitespace run collapse to one space so the
 * value cannot spill onto a second table row, and the result is trimmed.
 *
 * Backslashes are doubled before pipes are escaped. Without that step, input
 * that already contains `\|` would be emitted as `\\|`, which GFM parses as an
 * escaped backslash followed by a live column separator, splitting the row.
 *
 * @param {unknown} s - Value to render; `null`/`undefined` yield "".
 * @returns {string} Single-line text with `\` and `|` backslash-escaped.
 */
function cell(s) {
  return String(s ?? "")
    // \s covers \r and \n, so one pass flattens newlines and collapses runs.
    .replace(/\s+/g, " ")
    // Must run before the pipe escape so the backslashes it adds are not doubled.
    .replace(/\\/g, "\\\\")
    .replace(/\|/g, "\\|")
    .trim();
}
|
|
14940
14956
|
function debugSection(debug) {
|
|
14941
14957
|
if (!debug || typeof debug !== "object") return "";
|
|
14942
14958
|
const request = debug.request ?? {};
|
|
@@ -14990,14 +15006,14 @@ function formatHarvestPaa(raw, input) {
|
|
|
14990
15006
|
const diagnostics = d.diagnostics;
|
|
14991
15007
|
const durationMs = d.stats?.durationMs;
|
|
14992
15008
|
const paaRows = flat.map(
|
|
14993
|
-
(r, i) => `| ${i + 1} | ${r.question} | ${truncate(r.answer, 120)} | ${r.source_site
|
|
15009
|
+
(r, i) => `| ${i + 1} | ${cell(r.question)} | ${cell(truncate(r.answer, 120))} | ${cell(r.source_title || r.source_site || "")} |`
|
|
14994
15010
|
).join("\n");
|
|
14995
15011
|
const paaTable = flat.length ? `## People Also Ask (${flat.length} questions)
|
|
14996
15012
|
| # | Question | Answer | Source |
|
|
14997
15013
|
|---|----------|--------|--------|
|
|
14998
15014
|
${paaRows}` : "## People Also Ask\n*Google did not return a People Also Ask block for this query/location. SERP data was extracted successfully when available.*";
|
|
14999
15015
|
const serpRows = organic.map(
|
|
15000
|
-
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
15016
|
+
(r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
|
|
15001
15017
|
).join("\n");
|
|
15002
15018
|
const serpTable = organic.length ? `
|
|
15003
15019
|
## Organic Results (${organic.length})
|
|
@@ -15045,14 +15061,14 @@ function formatSearchSerp(raw, input) {
|
|
|
15045
15061
|
const aiOvw = d.aiOverview;
|
|
15046
15062
|
const diagnostics = d.diagnostics;
|
|
15047
15063
|
const serpRows = organic.map(
|
|
15048
|
-
(r) => `| ${r.position} | ${r.title} | [${r.domain}](${r.url}) | ${truncate(r.snippet, 100)} |`
|
|
15064
|
+
(r) => `| ${r.position} | ${cell(r.title)} | [${cell(r.domain)}](${r.url}) | ${cell(truncate(r.snippet, 100))} |`
|
|
15049
15065
|
).join("\n");
|
|
15050
15066
|
const serpTable = organic.length ? `## Organic Results (${organic.length})
|
|
15051
15067
|
| # | Title | URL | Snippet |
|
|
15052
15068
|
|---|-------|-----|----------|
|
|
15053
15069
|
${serpRows}` : "## Organic Results\n*None found*";
|
|
15054
15070
|
const localRows = localPack.map(
|
|
15055
|
-
(b) => `| ${b.position} | ${b.name} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
|
|
15071
|
+
(b) => `| ${b.position} | ${cell(b.name)} | ${b.rating ?? "\u2014"} (${b.reviewCount ?? "0"}) | ${b.websiteUrl ? `[link](${b.websiteUrl})` : "\u2014"} |`
|
|
15056
15072
|
).join("\n");
|
|
15057
15073
|
const localSection = localPack.length ? `
|
|
15058
15074
|
## Local Pack (${localPack.length})
|
|
@@ -15213,7 +15229,7 @@ function formatExtractSite(raw, input) {
|
|
|
15213
15229
|
const pages = d.pages ?? [];
|
|
15214
15230
|
const pageRows = pages.map((p, i) => {
|
|
15215
15231
|
const schemaInfo = p.kpo?.type?.join(", ") ?? (Array.isArray(p.schema) && p.schema.length ? `${p.schema.length} block(s)` : "\u2014");
|
|
15216
|
-
return `| ${i + 1} | ${p.title ?? "Untitled"} | ${p.url} | ${schemaInfo} |`;
|
|
15232
|
+
return `| ${i + 1} | ${cell(p.title ?? "Untitled")} | ${p.url} | ${schemaInfo} |`;
|
|
15217
15233
|
}).join("\n");
|
|
15218
15234
|
const full = [
|
|
15219
15235
|
`# Site Extract: ${input.url}`,
|
|
@@ -15245,7 +15261,7 @@ function formatYoutubeHarvest(raw, input) {
|
|
|
15245
15261
|
const videos = d.videos ?? [];
|
|
15246
15262
|
const label = input.mode === "channel" ? input.channelHandle ?? "channel" : `"${input.query ?? ""}"`;
|
|
15247
15263
|
const videoRows = videos.map(
|
|
15248
|
-
(v, i) => `| ${i + 1} | ${truncate(v.title, 70)} | ${v.channelName} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
|
|
15264
|
+
(v, i) => `| ${i + 1} | ${cell(truncate(v.title, 70))} | ${cell(v.channelName)} | ${v.views ?? "\u2014"} | ${v.duration ?? "\u2014"} | \`${v.videoId}\` |`
|
|
15249
15265
|
).join("\n");
|
|
15250
15266
|
const channelSection = d.channelMeta ? `
|
|
15251
15267
|
## Channel
|
|
@@ -15288,7 +15304,7 @@ function formatYoutubeTranscribe(raw, input) {
|
|
|
15288
15304
|
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
15289
15305
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
15290
15306
|
const ss = String(sec % 60).padStart(2, "0");
|
|
15291
|
-
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
15307
|
+
return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
|
|
15292
15308
|
}).join("\n");
|
|
15293
15309
|
const full = [
|
|
15294
15310
|
`# YouTube Transcript: \`${input.videoId}\``,
|
|
@@ -15361,7 +15377,7 @@ function formatFacebookAdSearch(raw, input) {
|
|
|
15361
15377
|
const d = parsed.data;
|
|
15362
15378
|
const advertisers = d.results ?? d.advertisers ?? [];
|
|
15363
15379
|
const rows = advertisers.map(
|
|
15364
|
-
(a, i) => `| ${i + 1} | ${a.name} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
15380
|
+
(a, i) => `| ${i + 1} | ${cell(a.name)} | ${a.adCount ?? "\u2014"} | \`${a.libraryId ?? "\u2014"}\` |`
|
|
15365
15381
|
).join("\n");
|
|
15366
15382
|
const full = [
|
|
15367
15383
|
`# Facebook Ad Library Search: "${input.query}"`,
|
|
@@ -15558,7 +15574,7 @@ function formatFacebookAdTranscribe(raw, input) {
|
|
|
15558
15574
|
const sec = Number.isFinite(c.timestamp[0]) ? Math.floor(c.timestamp[0]) : 0;
|
|
15559
15575
|
const mm = String(Math.floor(sec / 60)).padStart(2, "0");
|
|
15560
15576
|
const ss = String(sec % 60).padStart(2, "0");
|
|
15561
|
-
return `| ${mm}:${ss} | ${truncate(c.text, 120)} |`;
|
|
15577
|
+
return `| ${mm}:${ss} | ${cell(truncate(c.text, 120))} |`;
|
|
15562
15578
|
}).join("\n");
|
|
15563
15579
|
const full = [
|
|
15564
15580
|
`# Facebook Ad Transcript`,
|
|
@@ -16463,19 +16479,15 @@ var init_server = __esm({
|
|
|
16463
16479
|
const normalizedEmail = email?.trim().toLowerCase();
|
|
16464
16480
|
if (!normalizedEmail || !password) return c.json({ error: "Email and password required" }, 400);
|
|
16465
16481
|
if (password.length < 8) return c.json({ error: "Password must be at least 8 characters" }, 400);
|
|
16466
|
-
const limited = await enforceRateLimit(c, "auth_register", rateLimitKey(c), 5, 60 * 60);
|
|
16467
|
-
if (limited) return limited;
|
|
16468
16482
|
try {
|
|
16469
16483
|
const existing = await getUserByEmail(normalizedEmail);
|
|
16470
16484
|
if (existing) return c.json({ error: "Email already registered" }, 409);
|
|
16471
|
-
let stripeCustomerId;
|
|
16485
|
+
let stripeCustomerId = null;
|
|
16472
16486
|
try {
|
|
16473
16487
|
stripeCustomerId = await createSignupStripeCustomer(normalizedEmail);
|
|
16474
|
-
} catch {
|
|
16475
|
-
|
|
16476
|
-
|
|
16477
|
-
if (!stripeCustomerId && (process.env.NODE_ENV === "production" || process.env.VERCEL === "1")) {
|
|
16478
|
-
return c.json({ error: "Stripe customer setup failed" }, 503);
|
|
16488
|
+
} catch (err) {
|
|
16489
|
+
console.warn("[auth/register] Stripe customer creation failed; continuing without it (created lazily at checkout):", err instanceof Error ? err.message : String(err));
|
|
16490
|
+
stripeCustomerId = null;
|
|
16479
16491
|
}
|
|
16480
16492
|
const user = await createUser(normalizedEmail, void 0, password, stripeCustomerId ?? void 0);
|
|
16481
16493
|
if (stripeCustomerId) {
|
|
@@ -16536,14 +16548,18 @@ var init_server = __esm({
|
|
|
16536
16548
|
if (process.env.RESEND_API_KEY) {
|
|
16537
16549
|
try {
|
|
16538
16550
|
const resend = new import_resend.Resend(process.env.RESEND_API_KEY);
|
|
16539
|
-
await resend.emails.send({
|
|
16551
|
+
const sent = await resend.emails.send({
|
|
16540
16552
|
from: "MCP Scraper <noreply@updates.mcpscraper.dev>",
|
|
16541
16553
|
to: normalizedEmail,
|
|
16542
16554
|
subject: "Reset your MCP Scraper password",
|
|
16543
16555
|
html: `<p>Hi,</p><p>Click the link below to reset your password. This link expires in 1 hour.</p><p><a href="${resetUrl}">${resetUrl}</a></p><p>If you didn't request this, you can ignore this email.</p>`
|
|
16544
16556
|
});
|
|
16545
|
-
|
|
16557
|
+
if (sent.error) console.error("[auth/forgot-password] Resend rejected the email:", JSON.stringify(sent.error));
|
|
16558
|
+
} catch (err) {
|
|
16559
|
+
console.error("[auth/forgot-password] Resend send threw:", err instanceof Error ? err.message : String(err));
|
|
16546
16560
|
}
|
|
16561
|
+
} else {
|
|
16562
|
+
console.warn("[auth/forgot-password] RESEND_API_KEY not set \u2014 no reset email sent for", normalizedEmail);
|
|
16547
16563
|
}
|
|
16548
16564
|
return c.json({ ok: true });
|
|
16549
16565
|
});
|