freshcontext-mcp 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,8 @@
1
1
  import { chromium } from "playwright";
2
+ import { validateUrl } from "../security.js";
2
3
  export async function githubAdapter(options) {
4
+ const safeUrl = validateUrl(options.url, "github");
5
+ options = { ...options, url: safeUrl };
3
6
  const browser = await chromium.launch({ headless: true });
4
7
  const page = await browser.newPage();
5
8
  // Spoof a real browser UA to avoid bot detection
@@ -1,6 +1,8 @@
1
1
  import { chromium } from "playwright";
2
+ import { validateUrl } from "../security.js";
2
3
  export async function hackerNewsAdapter(options) {
3
- // If it's an Algolia API URL or search query, use the REST API directly (no browser)
4
+ // Validate URL — allow both HN and Algolia domains
5
+ validateUrl(options.url, "hackernews");
4
6
  const url = options.url;
5
7
  if (url.includes("hn.algolia.com/api/") || url.startsWith("hn-search:")) {
6
8
  const query = url.startsWith("hn-search:")
@@ -1,8 +1,8 @@
1
+ import { sanitizePackages } from "../security.js";
1
2
  // Uses npm registry API + PyPI JSON API (no auth needed)
2
3
  export async function packageTrendsAdapter(options) {
3
- // options.url is the package name or a comma-separated list
4
- // e.g. "langchain" or "npm:langchain" or "pypi:langchain"
5
- const raw_input = options.url.replace(/^https?:\/\//, "").trim();
4
+ // Sanitize package input
5
+ const raw_input = sanitizePackages(options.url.replace(/^https?:\/\//, "").trim());
6
6
  // Parse ecosystem prefix
7
7
  const parts = raw_input.split(",").map((s) => s.trim());
8
8
  const results = [];
@@ -1,8 +1,9 @@
1
+ import { sanitizeQuery } from "../security.js";
1
2
  // Uses GitHub Search API (no auth needed for basic search)
2
3
  export async function repoSearchAdapter(options) {
3
- // options.url is treated as the search query string
4
- // e.g. "mcp server typescript" or a full GitHub search URL
5
- let query = options.url;
4
+ // Sanitize query input
5
+ const query_input = sanitizeQuery(options.url);
6
+ let query = query_input;
6
7
  // If it's a full URL, extract the query param
7
8
  try {
8
9
  const parsed = new URL(options.url);
@@ -1,5 +1,8 @@
1
1
  import { chromium } from "playwright";
2
+ import { validateUrl } from "../security.js";
2
3
  export async function scholarAdapter(options) {
4
+ const safeUrl = validateUrl(options.url, "scholar");
5
+ options = { ...options, url: safeUrl };
3
6
  const browser = await chromium.launch({ headless: true });
4
7
  const page = await browser.newPage();
5
8
  await page.setExtraHTTPHeaders({
@@ -1,5 +1,8 @@
1
1
  import { chromium } from "playwright";
2
+ import { validateUrl } from "../security.js";
2
3
  export async function ycAdapter(options) {
4
+ const safeUrl = validateUrl(options.url, "yc");
5
+ options = { ...options, url: safeUrl };
3
6
  const browser = await chromium.launch({ headless: true });
4
7
  const page = await browser.newPage();
5
8
  // YC company directory is React-rendered — wait for network to settle
@@ -0,0 +1,117 @@
1
+ /**
2
+ * freshcontext-mcp security module
3
+ * Input sanitization, domain allowlists, and request validation
4
+ */
5
+ // ─── Allowed domains per adapter ────────────────────────────────────────────
6
/**
 * Hostnames each browser-backed adapter may navigate to. An empty list means
 * the adapter does not drive a browser, so no allowlist is enforced for it.
 */
export const ALLOWED_DOMAINS = {
    github: ["github.com", "raw.githubusercontent.com"],
    scholar: ["scholar.google.com"],
    hackernews: ["news.ycombinator.com", "hn.algolia.com"],
    yc: ["www.ycombinator.com", "ycombinator.com"],
    repoSearch: [], // uses GitHub API directly, no browser
    packageTrends: [], // uses npm/PyPI APIs directly, no browser
};
// ─── Blocked IP ranges and internal hostnames ────────────────────────────────
// Patterns are tested against the hostname AFTER normalizeHostname() has removed
// IPv6 brackets and a trailing dot, so the IPv6 entries are written unbracketed.
const BLOCKED_PATTERNS = [
    /^localhost$/i,
    /^127\.\d+\.\d+\.\d+$/,
    /^10\.\d+\.\d+\.\d+$/,
    /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/,
    /^192\.168\.\d+\.\d+$/,
    /^169\.254\.\d+\.\d+$/, // link-local, includes the AWS metadata endpoint
    /^0\.0\.0\.0$/,
    /^::1?$/, // IPv6 loopback (::1) and unspecified (::)
    /^::ffff:/i, // IPv4-mapped IPv6, e.g. ::ffff:7f00:1 for 127.0.0.1
    /^f[cd][0-9a-f]{2}:/i, // unique local addresses, fc00::/7
    /^fe[89ab][0-9a-f]:/i, // link-local addresses, fe80::/10
];
// ─── Max length limits ────────────────────────────────────────────────────────
export const MAX_URL_LENGTH = 500;
export const MAX_QUERY_LENGTH = 200;
export const MAX_PACKAGES_LENGTH = 300;
// ─── Validation errors ───────────────────────────────────────────────────────
/** Error raised whenever user input fails one of the checks in this module. */
export class SecurityError extends Error {
    constructor(message) {
        super(message);
        this.name = "SecurityError";
    }
}
/**
 * Reduces a URL hostname to the form the block/allow lists expect: lower case,
 * without the square brackets URL puts around IPv6 literals, and without the
 * trailing dot of a fully-qualified name (so "localhost." equals "localhost").
 */
function normalizeHostname(hostname) {
    return hostname
        .toLowerCase()
        .replace(/^\[|\]$/g, "")
        .replace(/\.$/, "");
}
// ─── URL validator ───────────────────────────────────────────────────────────
/**
 * Validates a user-supplied URL for the given adapter.
 *
 * Rejects empty or over-long input, unparsable URLs, non-http(s) schemes,
 * loopback/private/link-local hosts, and, when the adapter has a non-empty
 * allowlist, any host that is not an allowed domain or a subdomain of one.
 *
 * Returns the parsed URL re-serialized as a string.
 * Throws SecurityError when any check fails.
 */
export function validateUrl(rawUrl, adapterName) {
    // Length check
    if (!rawUrl || rawUrl.trim().length === 0) {
        throw new SecurityError("URL cannot be empty");
    }
    if (rawUrl.length > MAX_URL_LENGTH) {
        throw new SecurityError(`URL exceeds maximum length of ${MAX_URL_LENGTH} characters`);
    }
    // Must be a valid URL
    let parsed;
    try {
        parsed = new URL(rawUrl.trim());
    }
    catch {
        throw new SecurityError(`Invalid URL format: ${rawUrl}`);
    }
    // Must use http or https
    if (!["http:", "https:"].includes(parsed.protocol)) {
        throw new SecurityError(`Protocol not allowed: ${parsed.protocol}. Only http/https permitted.`);
    }
    const hostname = normalizeHostname(parsed.hostname);
    // Block internal/private IPs and hostnames
    if (BLOCKED_PATTERNS.some((pattern) => pattern.test(hostname))) {
        throw new SecurityError(`Access to internal/private addresses is not permitted: ${hostname}`);
    }
    // Domain allowlist check (skip if allowlist is empty — means no browser used)
    const allowedDomains = ALLOWED_DOMAINS[adapterName];
    if (allowedDomains && allowedDomains.length > 0) {
        const isAllowed = allowedDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
        if (!isAllowed) {
            throw new SecurityError(`Domain not allowed for ${adapterName} adapter: ${hostname}. ` +
                `Allowed domains: ${allowedDomains.join(", ")}`);
        }
    }
    return parsed.toString();
}
// ─── Query string sanitizer ──────────────────────────────────────────────────
/**
 * Cleans a free-text query: removes control characters (tab, LF and CR are
 * kept), trims surrounding whitespace and truncates to maxLength characters.
 *
 * Control characters are removed before trimming and truncating so that they
 * neither count against the limit nor leave stray leading/trailing whitespace.
 *
 * Throws SecurityError when the query is empty before or after cleaning.
 */
export function sanitizeQuery(query, maxLength = MAX_QUERY_LENGTH) {
    if (!query || query.trim().length === 0) {
        throw new SecurityError("Query cannot be empty");
    }
    // Strip null bytes and control characters
    const cleaned = query
        .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "")
        .trim()
        .slice(0, maxLength);
    if (cleaned.length === 0) {
        throw new SecurityError("Query contains no valid characters after sanitization");
    }
    return cleaned;
}
// ─── Package name sanitizer ──────────────────────────────────────────────────
/**
 * Cleans a comma-separated package list by dropping every character that is
 * not valid in an npm/PyPI package name, a comma, or a colon (npm:/pypi: prefix).
 *
 * Throws SecurityError when the input is empty, longer than
 * MAX_PACKAGES_LENGTH, or has no valid characters left.
 */
export function sanitizePackages(input) {
    if (!input || input.trim().length === 0) {
        throw new SecurityError("Package name cannot be empty");
    }
    if (input.length > MAX_PACKAGES_LENGTH) {
        throw new SecurityError(`Package input exceeds maximum length of ${MAX_PACKAGES_LENGTH} characters`);
    }
    // Removing characters can only shorten the input, so no further length cap is needed.
    const cleaned = input.trim().replace(/[^a-zA-Z0-9@/._\-,:]/g, "");
    if (cleaned.length === 0) {
        throw new SecurityError("Package name contains no valid characters after sanitization");
    }
    return cleaned;
}
// ─── Error formatter ─────────────────────────────────────────────────────────
/**
 * Turns any thrown value into a one-line message: "[Security] …" for
 * SecurityError, "[Error] …" for other Error instances, and a generic
 * fallback for non-Error values.
 */
export function formatSecurityError(err) {
    if (err instanceof SecurityError) {
        return `[Security] ${err.message}`;
    }
    if (err instanceof Error) {
        return `[Error] ${err.message}`;
    }
    return "[Error] Unknown error occurred";
}
package/dist/server.js CHANGED
@@ -8,6 +8,7 @@ import { ycAdapter } from "./adapters/yc.js";
8
8
  import { repoSearchAdapter } from "./adapters/repoSearch.js";
9
9
  import { packageTrendsAdapter } from "./adapters/packageTrends.js";
10
10
  import { stampFreshness, formatForLLM } from "./tools/freshnessStamp.js";
11
+ import { formatSecurityError } from "./security.js";
11
12
  const server = new McpServer({
12
13
  name: "freshcontext-mcp",
13
14
  version: "0.1.0",
@@ -21,9 +22,14 @@ server.registerTool("extract_github", {
21
22
  }),
22
23
  annotations: { readOnlyHint: true, openWorldHint: true },
23
24
  }, async ({ url, max_length }) => {
24
- const result = await githubAdapter({ url, maxLength: max_length });
25
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "github");
26
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
25
+ try {
26
+ const result = await githubAdapter({ url, maxLength: max_length });
27
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "github");
28
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
29
+ }
30
+ catch (err) {
31
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
32
+ }
27
33
  });
28
34
  // ─── Tool: extract_scholar ───────────────────────────────────────────────────
29
35
  server.registerTool("extract_scholar", {
@@ -34,9 +40,14 @@ server.registerTool("extract_scholar", {
34
40
  }),
35
41
  annotations: { readOnlyHint: true, openWorldHint: true },
36
42
  }, async ({ url, max_length }) => {
37
- const result = await scholarAdapter({ url, maxLength: max_length });
38
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "google_scholar");
39
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
43
+ try {
44
+ const result = await scholarAdapter({ url, maxLength: max_length });
45
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "google_scholar");
46
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
47
+ }
48
+ catch (err) {
49
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
50
+ }
40
51
  });
41
52
  // ─── Tool: extract_hackernews ────────────────────────────────────────────────
42
53
  server.registerTool("extract_hackernews", {
@@ -47,9 +58,14 @@ server.registerTool("extract_hackernews", {
47
58
  }),
48
59
  annotations: { readOnlyHint: true, openWorldHint: true },
49
60
  }, async ({ url, max_length }) => {
50
- const result = await hackerNewsAdapter({ url, maxLength: max_length });
51
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "hackernews");
52
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
61
+ try {
62
+ const result = await hackerNewsAdapter({ url, maxLength: max_length });
63
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "hackernews");
64
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
65
+ }
66
+ catch (err) {
67
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
68
+ }
53
69
  });
54
70
  // ─── Tool: extract_yc ──────────────────────────────────────────────────────────
55
71
  server.registerTool("extract_yc", {
@@ -60,9 +76,14 @@ server.registerTool("extract_yc", {
60
76
  }),
61
77
  annotations: { readOnlyHint: true, openWorldHint: true },
62
78
  }, async ({ url, max_length }) => {
63
- const result = await ycAdapter({ url, maxLength: max_length });
64
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "ycombinator");
65
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
79
+ try {
80
+ const result = await ycAdapter({ url, maxLength: max_length });
81
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "ycombinator");
82
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
83
+ }
84
+ catch (err) {
85
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
86
+ }
66
87
  });
67
88
  // ─── Tool: search_repos ──────────────────────────────────────────────────────
68
89
  server.registerTool("search_repos", {
@@ -73,9 +94,14 @@ server.registerTool("search_repos", {
73
94
  }),
74
95
  annotations: { readOnlyHint: true, openWorldHint: true },
75
96
  }, async ({ query, max_length }) => {
76
- const result = await repoSearchAdapter({ url: query, maxLength: max_length });
77
- const ctx = stampFreshness(result, { url: query, maxLength: max_length }, "github_search");
78
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
97
+ try {
98
+ const result = await repoSearchAdapter({ url: query, maxLength: max_length });
99
+ const ctx = stampFreshness(result, { url: query, maxLength: max_length }, "github_search");
100
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
101
+ }
102
+ catch (err) {
103
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
104
+ }
79
105
  });
80
106
  // ─── Tool: package_trends ────────────────────────────────────────────────────
81
107
  server.registerTool("package_trends", {
@@ -86,9 +112,14 @@ server.registerTool("package_trends", {
86
112
  }),
87
113
  annotations: { readOnlyHint: true, openWorldHint: true },
88
114
  }, async ({ packages, max_length }) => {
89
- const result = await packageTrendsAdapter({ url: packages, maxLength: max_length });
90
- const ctx = stampFreshness(result, { url: packages, maxLength: max_length }, "package_registry");
91
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
115
+ try {
116
+ const result = await packageTrendsAdapter({ url: packages, maxLength: max_length });
117
+ const ctx = stampFreshness(result, { url: packages, maxLength: max_length }, "package_registry");
118
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
119
+ }
120
+ catch (err) {
121
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
122
+ }
92
123
  });
93
124
  // ─── Tool: extract_landscape ─────────────────────────────────────────────────
94
125
  server.registerTool("extract_landscape", {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "freshcontext-mcp",
3
- "version": "0.1.2",
3
+ "version": "0.1.4",
4
4
  "description": "Real-time web extraction MCP server with freshness timestamps for AI agents",
5
5
  "keywords": [
6
6
  "mcp",
@@ -1,7 +1,11 @@
1
1
  import { chromium } from "playwright";
2
2
  import { AdapterResult, ExtractOptions } from "../types.js";
3
+ import { validateUrl } from "../security.js";
3
4
 
4
5
  export async function githubAdapter(options: ExtractOptions): Promise<AdapterResult> {
6
+ const safeUrl = validateUrl(options.url, "github");
7
+ options = { ...options, url: safeUrl };
8
+
5
9
  const browser = await chromium.launch({ headless: true });
6
10
  const page = await browser.newPage();
7
11
 
@@ -1,8 +1,10 @@
1
1
  import { chromium } from "playwright";
2
2
  import { AdapterResult, ExtractOptions } from "../types.js";
3
+ import { validateUrl } from "../security.js";
3
4
 
4
5
  export async function hackerNewsAdapter(options: ExtractOptions): Promise<AdapterResult> {
5
- // If it's an Algolia API URL or search query, use the REST API directly (no browser)
6
+ // Validate URL — allow both HN and Algolia domains
7
+ validateUrl(options.url, "hackernews");
6
8
  const url = options.url;
7
9
 
8
10
  if (url.includes("hn.algolia.com/api/") || url.startsWith("hn-search:")) {
@@ -1,10 +1,10 @@
1
1
  import { AdapterResult, ExtractOptions } from "../types.js";
2
+ import { sanitizePackages } from "../security.js";
2
3
 
3
4
  // Uses npm registry API + PyPI JSON API (no auth needed)
4
5
  export async function packageTrendsAdapter(options: ExtractOptions): Promise<AdapterResult> {
5
- // options.url is the package name or a comma-separated list
6
- // e.g. "langchain" or "npm:langchain" or "pypi:langchain"
7
- const raw_input = options.url.replace(/^https?:\/\//, "").trim();
6
+ // Sanitize package input
7
+ const raw_input = sanitizePackages(options.url.replace(/^https?:\/\//, "").trim());
8
8
 
9
9
  // Parse ecosystem prefix
10
10
  const parts = raw_input.split(",").map((s) => s.trim());
@@ -1,10 +1,11 @@
1
1
  import { AdapterResult, ExtractOptions } from "../types.js";
2
+ import { sanitizeQuery } from "../security.js";
2
3
 
3
4
  // Uses GitHub Search API (no auth needed for basic search)
4
5
  export async function repoSearchAdapter(options: ExtractOptions): Promise<AdapterResult> {
5
- // options.url is treated as the search query string
6
- // e.g. "mcp server typescript" or a full GitHub search URL
7
- let query = options.url;
6
+ // Sanitize query input
7
+ const query_input = sanitizeQuery(options.url);
8
+ let query = query_input;
8
9
 
9
10
  // If it's a full URL, extract the query param
10
11
  try {
@@ -1,7 +1,11 @@
1
1
  import { chromium } from "playwright";
2
2
  import { AdapterResult, ExtractOptions } from "../types.js";
3
+ import { validateUrl } from "../security.js";
3
4
 
4
5
  export async function scholarAdapter(options: ExtractOptions): Promise<AdapterResult> {
6
+ const safeUrl = validateUrl(options.url, "scholar");
7
+ options = { ...options, url: safeUrl };
8
+
5
9
  const browser = await chromium.launch({ headless: true });
6
10
  const page = await browser.newPage();
7
11
 
@@ -1,7 +1,11 @@
1
1
  import { chromium } from "playwright";
2
2
  import { AdapterResult, ExtractOptions } from "../types.js";
3
+ import { validateUrl } from "../security.js";
3
4
 
4
5
  export async function ycAdapter(options: ExtractOptions): Promise<AdapterResult> {
6
+ const safeUrl = validateUrl(options.url, "yc");
7
+ options = { ...options, url: safeUrl };
8
+
5
9
  const browser = await chromium.launch({ headless: true });
6
10
  const page = await browser.newPage();
7
11
 
@@ -0,0 +1,161 @@
1
+ /**
2
+ * freshcontext-mcp security module
3
+ * Input sanitization, domain allowlists, and request validation
4
+ */
5
+
6
+ // ─── Allowed domains per adapter ────────────────────────────────────────────
7
+
8
/**
 * Hostnames each browser-backed adapter may navigate to. An empty list means
 * the adapter does not drive a browser, so no allowlist is enforced for it.
 */
export const ALLOWED_DOMAINS: Record<string, string[]> = {
  github: ["github.com", "raw.githubusercontent.com"],
  scholar: ["scholar.google.com"],
  hackernews: ["news.ycombinator.com", "hn.algolia.com"],
  yc: ["www.ycombinator.com", "ycombinator.com"],
  repoSearch: [], // uses GitHub API directly, no browser
  packageTrends: [], // uses npm/PyPI APIs directly, no browser
};

// ─── Blocked IP ranges and internal hostnames ────────────────────────────────
// Patterns are tested against the hostname AFTER normalizeHostname() has removed
// IPv6 brackets and a trailing dot, so the IPv6 entries are written unbracketed.

const BLOCKED_PATTERNS: readonly RegExp[] = [
  /^localhost$/i,
  /^127\.\d+\.\d+\.\d+$/,
  /^10\.\d+\.\d+\.\d+$/,
  /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/,
  /^192\.168\.\d+\.\d+$/,
  /^169\.254\.\d+\.\d+$/, // link-local, includes the AWS metadata endpoint
  /^0\.0\.0\.0$/,
  /^::1?$/, // IPv6 loopback (::1) and unspecified (::)
  /^::ffff:/i, // IPv4-mapped IPv6, e.g. ::ffff:7f00:1 for 127.0.0.1
  /^f[cd][0-9a-f]{2}:/i, // unique local addresses, fc00::/7
  /^fe[89ab][0-9a-f]:/i, // link-local addresses, fe80::/10
];

// ─── Max length limits ────────────────────────────────────────────────────────

export const MAX_URL_LENGTH = 500;
export const MAX_QUERY_LENGTH = 200;
export const MAX_PACKAGES_LENGTH = 300;

// ─── Validation errors ───────────────────────────────────────────────────────

/** Error raised whenever user input fails one of the checks in this module. */
export class SecurityError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "SecurityError";
  }
}

/**
 * Reduces a URL hostname to the form the block/allow lists expect: lower case,
 * without the square brackets `URL` puts around IPv6 literals, and without the
 * trailing dot of a fully-qualified name (so "localhost." equals "localhost").
 */
function normalizeHostname(hostname: string): string {
  return hostname
    .toLowerCase()
    .replace(/^\[|\]$/g, "")
    .replace(/\.$/, "");
}

// ─── URL validator ───────────────────────────────────────────────────────────

/**
 * Validates a user-supplied URL for the given adapter.
 *
 * Rejects empty or over-long input, unparsable URLs, non-http(s) schemes,
 * loopback/private/link-local hosts, and, when the adapter has a non-empty
 * allowlist, any host that is not an allowed domain or a subdomain of one.
 *
 * @param rawUrl - URL exactly as received from the caller.
 * @param adapterName - Key into {@link ALLOWED_DOMAINS}.
 * @returns The parsed URL re-serialized as a string.
 * @throws SecurityError when any check fails.
 */
export function validateUrl(
  rawUrl: string,
  adapterName: keyof typeof ALLOWED_DOMAINS
): string {
  // Length check
  if (!rawUrl || rawUrl.trim().length === 0) {
    throw new SecurityError("URL cannot be empty");
  }
  if (rawUrl.length > MAX_URL_LENGTH) {
    throw new SecurityError(
      `URL exceeds maximum length of ${MAX_URL_LENGTH} characters`
    );
  }

  // Must be a valid URL
  let parsed: URL;
  try {
    parsed = new URL(rawUrl.trim());
  } catch {
    throw new SecurityError(`Invalid URL format: ${rawUrl}`);
  }

  // Must use http or https
  if (!["http:", "https:"].includes(parsed.protocol)) {
    throw new SecurityError(
      `Protocol not allowed: ${parsed.protocol}. Only http/https permitted.`
    );
  }

  const hostname = normalizeHostname(parsed.hostname);

  // Block internal/private IPs and hostnames
  if (BLOCKED_PATTERNS.some((pattern) => pattern.test(hostname))) {
    throw new SecurityError(
      `Access to internal/private addresses is not permitted: ${hostname}`
    );
  }

  // Domain allowlist check (skip if allowlist is empty — means no browser used)
  const allowedDomains = ALLOWED_DOMAINS[adapterName];
  if (allowedDomains && allowedDomains.length > 0) {
    const isAllowed = allowedDomains.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    );
    if (!isAllowed) {
      throw new SecurityError(
        `Domain not allowed for ${adapterName} adapter: ${hostname}. ` +
          `Allowed domains: ${allowedDomains.join(", ")}`
      );
    }
  }

  return parsed.toString();
}

// ─── Query string sanitizer ──────────────────────────────────────────────────

/**
 * Cleans a free-text query: removes control characters (tab, LF and CR are
 * kept), trims surrounding whitespace and truncates to `maxLength` characters.
 *
 * Control characters are removed before trimming and truncating so that they
 * neither count against the limit nor leave stray leading/trailing whitespace.
 *
 * @throws SecurityError when the query is empty before or after cleaning.
 */
export function sanitizeQuery(query: string, maxLength = MAX_QUERY_LENGTH): string {
  if (!query || query.trim().length === 0) {
    throw new SecurityError("Query cannot be empty");
  }

  // Strip null bytes and control characters
  const cleaned = query
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "")
    .trim()
    .slice(0, maxLength);

  if (cleaned.length === 0) {
    throw new SecurityError("Query contains no valid characters after sanitization");
  }

  return cleaned;
}

// ─── Package name sanitizer ──────────────────────────────────────────────────

/**
 * Cleans a comma-separated package list by dropping every character that is
 * not valid in an npm/PyPI package name, a comma, or a colon (npm:/pypi: prefix).
 *
 * @throws SecurityError when the input is empty, longer than
 *   {@link MAX_PACKAGES_LENGTH}, or has no valid characters left.
 */
export function sanitizePackages(input: string): string {
  if (!input || input.trim().length === 0) {
    throw new SecurityError("Package name cannot be empty");
  }

  if (input.length > MAX_PACKAGES_LENGTH) {
    throw new SecurityError(
      `Package input exceeds maximum length of ${MAX_PACKAGES_LENGTH} characters`
    );
  }

  // Removing characters can only shorten the input, so no further length cap is needed.
  const cleaned = input.trim().replace(/[^a-zA-Z0-9@/._\-,:]/g, "");

  if (cleaned.length === 0) {
    throw new SecurityError("Package name contains no valid characters after sanitization");
  }

  return cleaned;
}

// ─── Error formatter ─────────────────────────────────────────────────────────

/**
 * Turns any thrown value into a one-line message: "[Security] …" for
 * {@link SecurityError}, "[Error] …" for other `Error` instances, and a
 * generic fallback for non-Error values.
 */
export function formatSecurityError(err: unknown): string {
  if (err instanceof SecurityError) {
    return `[Security] ${err.message}`;
  }
  if (err instanceof Error) {
    return `[Error] ${err.message}`;
  }
  return "[Error] Unknown error occurred";
}
package/src/server.ts CHANGED
@@ -8,6 +8,7 @@ import { ycAdapter } from "./adapters/yc.js";
8
8
  import { repoSearchAdapter } from "./adapters/repoSearch.js";
9
9
  import { packageTrendsAdapter } from "./adapters/packageTrends.js";
10
10
  import { stampFreshness, formatForLLM } from "./tools/freshnessStamp.js";
11
+ import { SecurityError, formatSecurityError } from "./security.js";
11
12
 
12
13
  const server = new McpServer({
13
14
  name: "freshcontext-mcp",
@@ -27,9 +28,13 @@ server.registerTool(
27
28
  annotations: { readOnlyHint: true, openWorldHint: true },
28
29
  },
29
30
  async ({ url, max_length }) => {
30
- const result = await githubAdapter({ url, maxLength: max_length });
31
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "github");
32
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
31
+ try {
32
+ const result = await githubAdapter({ url, maxLength: max_length });
33
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "github");
34
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
35
+ } catch (err) {
36
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
37
+ }
33
38
  }
34
39
  );
35
40
 
@@ -46,9 +51,13 @@ server.registerTool(
46
51
  annotations: { readOnlyHint: true, openWorldHint: true },
47
52
  },
48
53
  async ({ url, max_length }) => {
49
- const result = await scholarAdapter({ url, maxLength: max_length });
50
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "google_scholar");
51
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
54
+ try {
55
+ const result = await scholarAdapter({ url, maxLength: max_length });
56
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "google_scholar");
57
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
58
+ } catch (err) {
59
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
60
+ }
52
61
  }
53
62
  );
54
63
 
@@ -65,9 +74,13 @@ server.registerTool(
65
74
  annotations: { readOnlyHint: true, openWorldHint: true },
66
75
  },
67
76
  async ({ url, max_length }) => {
68
- const result = await hackerNewsAdapter({ url, maxLength: max_length });
69
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "hackernews");
70
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
77
+ try {
78
+ const result = await hackerNewsAdapter({ url, maxLength: max_length });
79
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "hackernews");
80
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
81
+ } catch (err) {
82
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
83
+ }
71
84
  }
72
85
  );
73
86
 
@@ -84,9 +97,13 @@ server.registerTool(
84
97
  annotations: { readOnlyHint: true, openWorldHint: true },
85
98
  },
86
99
  async ({ url, max_length }) => {
87
- const result = await ycAdapter({ url, maxLength: max_length });
88
- const ctx = stampFreshness(result, { url, maxLength: max_length }, "ycombinator");
89
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
100
+ try {
101
+ const result = await ycAdapter({ url, maxLength: max_length });
102
+ const ctx = stampFreshness(result, { url, maxLength: max_length }, "ycombinator");
103
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
104
+ } catch (err) {
105
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
106
+ }
90
107
  }
91
108
  );
92
109
 
@@ -103,9 +120,13 @@ server.registerTool(
103
120
  annotations: { readOnlyHint: true, openWorldHint: true },
104
121
  },
105
122
  async ({ query, max_length }) => {
106
- const result = await repoSearchAdapter({ url: query, maxLength: max_length });
107
- const ctx = stampFreshness(result, { url: query, maxLength: max_length }, "github_search");
108
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
123
+ try {
124
+ const result = await repoSearchAdapter({ url: query, maxLength: max_length });
125
+ const ctx = stampFreshness(result, { url: query, maxLength: max_length }, "github_search");
126
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
127
+ } catch (err) {
128
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
129
+ }
109
130
  }
110
131
  );
111
132
 
@@ -122,9 +143,13 @@ server.registerTool(
122
143
  annotations: { readOnlyHint: true, openWorldHint: true },
123
144
  },
124
145
  async ({ packages, max_length }) => {
125
- const result = await packageTrendsAdapter({ url: packages, maxLength: max_length });
126
- const ctx = stampFreshness(result, { url: packages, maxLength: max_length }, "package_registry");
127
- return { content: [{ type: "text", text: formatForLLM(ctx) }] };
146
+ try {
147
+ const result = await packageTrendsAdapter({ url: packages, maxLength: max_length });
148
+ const ctx = stampFreshness(result, { url: packages, maxLength: max_length }, "package_registry");
149
+ return { content: [{ type: "text", text: formatForLLM(ctx) }] };
150
+ } catch (err) {
151
+ return { content: [{ type: "text", text: formatSecurityError(err) }] };
152
+ }
128
153
  }
129
154
  );
130
155
 
@@ -3,10 +3,11 @@ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
3
  import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
4
4
  import { z } from "zod";
5
5
 
6
- // ─── Types ───────────────────────────────────────────────────────────────────
6
+ // ─── Types ────────────────────────────────────────────────────────────────────
7
7
 
8
8
  interface Env {
9
9
  BROWSER: Fetcher;
10
+ API_KEY?: string; // Optional: set via `wrangler secret put API_KEY`
10
11
  }
11
12
 
12
13
  interface FreshContext {
@@ -18,9 +19,143 @@ interface FreshContext {
18
19
  adapter: string;
19
20
  }
20
21
 
21
- // ─── Freshness Stamp ─────────────────────────────────────────────────────────
22
+ // ─── Security ─────────────────────────────────────────────────────────────────
22
23
 
23
- function stamp(content: string, url: string, date: string | null, confidence: "high" | "medium" | "low", adapter: string): string {
24
/** Hostnames each adapter may navigate to; subdomains of an entry are also accepted. */
const ALLOWED_DOMAINS: Record<string, string[]> = {
  github: ["github.com", "raw.githubusercontent.com"],
  scholar: ["scholar.google.com"],
  hackernews: ["news.ycombinator.com", "hn.algolia.com"],
  yc: ["www.ycombinator.com", "ycombinator.com"],
};

// Tested against the hostname after IPv6 brackets and a trailing dot have been
// removed, so the IPv6 entries are written unbracketed.
const PRIVATE_IP_PATTERNS: readonly RegExp[] = [
  /^localhost$/i,
  /^127\./,
  /^10\./,
  /^192\.168\./,
  /^172\.(1[6-9]|2\d|3[01])\./,
  /^169\.254\./,
  /^0\.0\.0\.0$/,
  /^::1?$/, // IPv6 loopback (::1) and unspecified (::)
  /^::ffff:/i, // IPv4-mapped IPv6, e.g. ::ffff:7f00:1 for 127.0.0.1
  /^f[cd][0-9a-f]{2}:/i, // unique local addresses, fc00::/7
  /^fe[89ab][0-9a-f]:/i, // link-local addresses, fe80::/10
];

const MAX_URL_LENGTH = 500;
const MAX_QUERY_LENGTH = 200;

/** Error raised when a request fails a validation, rate-limit or auth check. */
class SecurityError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "SecurityError";
  }
}

/**
 * Validates a user-supplied URL for the given adapter.
 *
 * Rejects over-long input, unparsable URLs, non-http(s) schemes,
 * loopback/private/link-local hosts, and, when the adapter has an allowlist,
 * any host that is not an allowed domain or a subdomain of one.
 *
 * @returns The parsed URL re-serialized as a string, so the value handed on to
 *   the browser is the same one the checks were run against.
 * @throws SecurityError when any check fails.
 */
function validateUrl(rawUrl: string, adapter: string): string {
  if (rawUrl.length > MAX_URL_LENGTH)
    throw new SecurityError(`URL too long (max ${MAX_URL_LENGTH} chars)`);

  let parsed: URL;
  try { parsed = new URL(rawUrl); }
  catch { throw new SecurityError("Invalid URL format"); }

  if (!["http:", "https:"].includes(parsed.protocol))
    throw new SecurityError("Only http/https URLs are allowed");

  // URL.hostname wraps IPv6 literals in [] and keeps a trailing dot on FQDNs;
  // strip both so "[::1]" and "localhost." cannot slip past the patterns.
  const hostname = parsed.hostname
    .toLowerCase()
    .replace(/^\[|\]$/g, "")
    .replace(/\.$/, "");

  if (PRIVATE_IP_PATTERNS.some(pattern => pattern.test(hostname)))
    throw new SecurityError("Access to private/internal addresses is not allowed");

  const allowed = ALLOWED_DOMAINS[adapter];
  if (allowed && allowed.length > 0) {
    const ok = allowed.some(d => hostname === d || hostname.endsWith(`.${d}`));
    if (!ok)
      throw new SecurityError(`URL not allowed for ${adapter}. Allowed domains: ${allowed.join(", ")}`);
  }

  return parsed.toString();
}

/**
 * Removes every ASCII control character (including tab and newlines) from a
 * query and trims it.
 *
 * @throws SecurityError when the raw query is longer than `maxLen`.
 */
function sanitizeQuery(query: string, maxLen = MAX_QUERY_LENGTH): string {
  if (query.length > maxLen)
    throw new SecurityError(`Query too long (max ${maxLen} chars)`);
  // Strip null bytes and control characters
  return query.replace(/[\x00-\x1F\x7F]/g, "").trim();
}
87
+
88
+ // ─── Rate Limiting (in-memory, per isolate) ───────────────────────────────────
89
+
90
/** Request counter for one client within the current fixed window. */
interface RateEntry { count: number; windowStart: number; }

/** Counters keyed by client IP; lives in module memory. */
const rateMap = new Map<string, RateEntry>();

const RATE_LIMIT = 20;          // max requests
const RATE_WINDOW_MS = 60_000;  // per 60 seconds

/**
 * Records one request for `ip` and enforces the per-window quota.
 *
 * A client seen for the first time, or whose window has elapsed, starts a new
 * window with a count of 1. Otherwise the count is incremented.
 *
 * @throws SecurityError when the client already made RATE_LIMIT requests in
 *   the current window.
 */
function checkRateLimit(ip: string): void {
  const timestamp = Date.now();
  const record = rateMap.get(ip);

  // Still inside a live window: enforce the cap, then count this request.
  if (record !== undefined && timestamp - record.windowStart <= RATE_WINDOW_MS) {
    if (record.count >= RATE_LIMIT) {
      throw new SecurityError(`Rate limit exceeded. Max ${RATE_LIMIT} requests per minute.`);
    }
    record.count += 1;
    return;
  }

  // No record yet, or the previous window has expired: open a fresh one.
  rateMap.set(ip, { count: 1, windowStart: timestamp });
}

/** Drops every entry whose window has expired so the map cannot grow without bound. */
function pruneRateMap(): void {
  const timestamp = Date.now();
  rateMap.forEach((record, ip) => {
    if (timestamp - record.windowStart > RATE_WINDOW_MS) {
      rateMap.delete(ip);
    }
  });
}
119
+
120
+ // ─── Auth ─────────────────────────────────────────────────────────────────────
121
+
122
/**
 * Compares two strings without returning early at the first differing byte,
 * so the time taken does not reveal how much of a guessed secret was correct.
 * (The lengths are still distinguishable; that is accepted here.)
 */
function constantTimeEquals(a: string, b: string): boolean {
  const encoder = new TextEncoder();
  const left = encoder.encode(a);
  const right = encoder.encode(b);
  let diff = left.length ^ right.length;
  const span = Math.max(left.length, right.length);
  for (let i = 0; i < span; i++) {
    diff |= (left[i] ?? 0) ^ (right[i] ?? 0);
  }
  return diff === 0;
}

/**
 * Enforces bearer-token authentication when `env.API_KEY` is configured.
 *
 * With no key set, every request is accepted. Otherwise the request must carry
 * `Authorization: Bearer <API_KEY>`; the scheme name is matched
 * case-insensitively, as HTTP auth schemes are.
 *
 * @throws SecurityError when a key is configured and the token is missing or wrong.
 */
function checkAuth(request: Request, env: Env): void {
  if (!env.API_KEY) return; // Auth disabled if no key is set

  const authHeader = request.headers.get("Authorization") ?? "";
  const hasBearerScheme = authHeader.slice(0, 7).toLowerCase() === "bearer ";
  const token = hasBearerScheme ? authHeader.slice(7) : "";

  if (!constantTimeEquals(token, env.API_KEY)) {
    throw new SecurityError("Unauthorized. Provide a valid Bearer token.");
  }
}
132
+
133
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
134
+
135
/**
 * Picks the client address for a request: the `CF-Connecting-IP` header when
 * present, otherwise the first entry of `X-Forwarded-For`, otherwise the
 * literal string "unknown".
 */
function getClientIp(request: Request): string {
  const cloudflareIp = request.headers.get("CF-Connecting-IP");
  if (cloudflareIp !== null) {
    return cloudflareIp;
  }

  const forwardedFor = request.headers.get("X-Forwarded-For");
  if (forwardedFor === null) {
    return "unknown";
  }

  // The header is a comma-separated proxy chain; the first entry comes first.
  return forwardedFor.split(",")[0]?.trim() ?? "unknown";
}
142
+
143
/**
 * Builds a JSON error response of the form `{"error": message}` with the
 * given HTTP status code.
 */
function securityErrorResponse(message: string, status: number): Response {
  const payload = JSON.stringify({ error: message });
  const init: ResponseInit = {
    status,
    headers: { "Content-Type": "application/json" },
  };
  return new Response(payload, init);
}
149
+
150
+ // ─── Freshness Stamp ──────────────────────────────────────────────────────────
151
+
152
+ function stamp(
153
+ content: string,
154
+ url: string,
155
+ date: string | null,
156
+ confidence: "high" | "medium" | "low",
157
+ adapter: string
158
+ ): string {
24
159
  const ctx: FreshContext = {
25
160
  content: content.slice(0, 6000),
26
161
  source_url: url,
@@ -44,107 +179,133 @@ function stamp(content: string, url: string, date: string | null, confidence: "h
44
179
  // ─── Server Factory ───────────────────────────────────────────────────────────
45
180
 
46
181
  function createServer(env: Env): McpServer {
47
- const server = new McpServer({ name: "freshcontext-mcp", version: "0.1.0" });
182
+ const server = new McpServer({ name: "freshcontext-mcp", version: "0.1.3" });
48
183
 
49
184
  // ── extract_github ──────────────────────────────────────────────────────────
50
185
  server.registerTool("extract_github", {
51
186
  description: "Extract real-time data from a GitHub repository — README, stars, forks, last commit, topics. Returns timestamped freshcontext.",
52
187
  inputSchema: z.object({
53
- url: z.string().url().describe("Full GitHub repo URL"),
188
+ url: z.string().url().describe("Full GitHub repo URL e.g. https://github.com/owner/repo"),
54
189
  }),
55
190
  annotations: { readOnlyHint: true, openWorldHint: true },
56
191
  }, async ({ url }) => {
57
- const browser = await puppeteer.launch(env.BROWSER);
58
- const page = await browser.newPage();
59
- await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
60
- await page.goto(url, { waitUntil: "domcontentloaded" });
61
-
62
- const data = await page.evaluate(`(function() {
63
- var readme = (document.querySelector('[data-target="readme-toc.content"]') || document.querySelector('.markdown-body') || {}).textContent || null;
64
- var starsEl = document.querySelector('[id="repo-stars-counter-star"]') || document.querySelector('.Counter.js-social-count');
65
- var stars = starsEl ? starsEl.textContent.trim() : null;
66
- var forksEl = document.querySelector('[id="repo-network-counter"]');
67
- var forks = forksEl ? forksEl.textContent.trim() : null;
68
- var commitEl = document.querySelector('relative-time');
69
- var lastCommit = commitEl ? commitEl.getAttribute('datetime') : null;
70
- var descEl = document.querySelector('.f4.my-3');
71
- var description = descEl ? descEl.textContent.trim() : null;
72
- var topics = Array.from(document.querySelectorAll('.topic-tag')).map(function(t) { return t.textContent.trim(); });
73
- var langEl = document.querySelector('.color-fg-default.text-bold.mr-1');
74
- var language = langEl ? langEl.textContent.trim() : null;
75
- return { readme, stars, forks, lastCommit, description, topics, language };
76
- })()`);
77
-
78
- await browser.close();
79
- const d = data as any;
80
- const raw = [`Description: ${d.description ?? "N/A"}`, `Stars: ${d.stars ?? "N/A"} | Forks: ${d.forks ?? "N/A"}`, `Language: ${d.language ?? "N/A"}`, `Last commit: ${d.lastCommit ?? "N/A"}`, `Topics: ${d.topics?.join(", ") ?? "none"}`, `\n--- README ---\n${d.readme ?? "No README"}`].join("\n");
81
- return { content: [{ type: "text", text: stamp(raw, url, d.lastCommit ?? null, d.lastCommit ? "high" : "medium", "github") }] };
192
+ try {
193
+ const safeUrl = validateUrl(url, "github");
194
+ const browser = await puppeteer.launch(env.BROWSER);
195
+ const page = await browser.newPage();
196
+ await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
197
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
198
+
199
+ const data = await page.evaluate(`(function() {
200
+ var readme = (document.querySelector('[data-target="readme-toc.content"]') || document.querySelector('.markdown-body') || {}).textContent || null;
201
+ var starsEl = document.querySelector('[id="repo-stars-counter-star"]') || document.querySelector('.Counter.js-social-count');
202
+ var stars = starsEl ? starsEl.textContent.trim() : null;
203
+ var forksEl = document.querySelector('[id="repo-network-counter"]');
204
+ var forks = forksEl ? forksEl.textContent.trim() : null;
205
+ var commitEl = document.querySelector('relative-time');
206
+ var lastCommit = commitEl ? commitEl.getAttribute('datetime') : null;
207
+ var descEl = document.querySelector('.f4.my-3');
208
+ var description = descEl ? descEl.textContent.trim() : null;
209
+ var topics = Array.from(document.querySelectorAll('.topic-tag')).map(function(t) { return t.textContent.trim(); });
210
+ var langEl = document.querySelector('.color-fg-default.text-bold.mr-1');
211
+ var language = langEl ? langEl.textContent.trim() : null;
212
+ return { readme, stars, forks, lastCommit, description, topics, language };
213
+ })()`);
214
+
215
+ await browser.close();
216
+ const d = data as any;
217
+ const raw = [
218
+ `Description: ${d.description ?? "N/A"}`,
219
+ `Stars: ${d.stars ?? "N/A"} | Forks: ${d.forks ?? "N/A"}`,
220
+ `Language: ${d.language ?? "N/A"}`,
221
+ `Last commit: ${d.lastCommit ?? "N/A"}`,
222
+ `Topics: ${d.topics?.join(", ") ?? "none"}`,
223
+ `\n--- README ---\n${d.readme ?? "No README"}`,
224
+ ].join("\n");
225
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, d.lastCommit ?? null, d.lastCommit ? "high" : "medium", "github") }] };
226
+ } catch (err: any) {
227
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
228
+ }
82
229
  });
83
230
 
84
231
  // ── extract_hackernews ──────────────────────────────────────────────────────
85
232
  server.registerTool("extract_hackernews", {
86
- description: "Extract top stories from Hacker News with real-time timestamps.",
87
- inputSchema: z.object({ url: z.string().url().describe("HN URL") }),
233
+ description: "Extract top stories or search results from Hacker News with real-time timestamps.",
234
+ inputSchema: z.object({ url: z.string().url().describe("HN URL e.g. https://news.ycombinator.com") }),
88
235
  annotations: { readOnlyHint: true, openWorldHint: true },
89
236
  }, async ({ url }) => {
90
- const browser = await puppeteer.launch(env.BROWSER);
91
- const page = await browser.newPage();
92
- await page.goto(url, { waitUntil: "domcontentloaded" });
93
-
94
- const data = await page.evaluate(`(function() {
95
- var items = Array.from(document.querySelectorAll('.athing')).slice(0, 20);
96
- return items.map(function(el) {
97
- var titleLineEl = el.querySelector('.titleline > a');
98
- var title = titleLineEl ? titleLineEl.textContent.trim() : null;
99
- var link = titleLineEl ? titleLineEl.getAttribute('href') : null;
100
- var subtext = el.nextElementSibling;
101
- var scoreEl = subtext ? subtext.querySelector('.score') : null;
102
- var score = scoreEl ? scoreEl.textContent.trim() : null;
103
- var ageEl = subtext ? subtext.querySelector('.age') : null;
104
- var age = ageEl ? ageEl.getAttribute('title') : null;
105
- return { title, link, score, age };
106
- });
107
- })()`);
108
-
109
- await browser.close();
110
- const items = data as any[];
111
- const raw = items.map((r, i) => `[${i + 1}] ${r.title}\nURL: ${r.link}\nScore: ${r.score ?? "N/A"}\nPosted: ${r.age ?? "unknown"}`).join("\n\n");
112
- const newest = items.map(r => r.age).filter(Boolean).sort().reverse()[0] ?? null;
113
- return { content: [{ type: "text", text: stamp(raw, url, newest, newest ? "high" : "medium", "hackernews") }] };
237
+ try {
238
+ const safeUrl = validateUrl(url, "hackernews");
239
+ const browser = await puppeteer.launch(env.BROWSER);
240
+ const page = await browser.newPage();
241
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
242
+
243
+ const data = await page.evaluate(`(function() {
244
+ var items = Array.from(document.querySelectorAll('.athing')).slice(0, 20);
245
+ return items.map(function(el) {
246
+ var titleLineEl = el.querySelector('.titleline > a');
247
+ var title = titleLineEl ? titleLineEl.textContent.trim() : null;
248
+ var link = titleLineEl ? titleLineEl.getAttribute('href') : null;
249
+ var subtext = el.nextElementSibling;
250
+ var scoreEl = subtext ? subtext.querySelector('.score') : null;
251
+ var score = scoreEl ? scoreEl.textContent.trim() : null;
252
+ var ageEl = subtext ? subtext.querySelector('.age') : null;
253
+ var age = ageEl ? ageEl.getAttribute('title') : null;
254
+ return { title, link, score, age };
255
+ });
256
+ })()`);
257
+
258
+ await browser.close();
259
+ const items = data as any[];
260
+ const raw = items.map((r, i) =>
261
+ `[${i + 1}] ${r.title}\nURL: ${r.link}\nScore: ${r.score ?? "N/A"}\nPosted: ${r.age ?? "unknown"}`
262
+ ).join("\n\n");
263
+ const newest = items.map(r => r.age).filter(Boolean).sort().reverse()[0] ?? null;
264
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, newest, newest ? "high" : "medium", "hackernews") }] };
265
+ } catch (err: any) {
266
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
267
+ }
114
268
  });
115
269
 
116
270
  // ── extract_scholar ─────────────────────────────────────────────────────────
117
271
  server.registerTool("extract_scholar", {
118
272
  description: "Extract research results from Google Scholar with publication dates.",
119
- inputSchema: z.object({ url: z.string().url().describe("Google Scholar URL") }),
273
+ inputSchema: z.object({ url: z.string().url().describe("Google Scholar search URL") }),
120
274
  annotations: { readOnlyHint: true, openWorldHint: true },
121
275
  }, async ({ url }) => {
122
- const browser = await puppeteer.launch(env.BROWSER);
123
- const page = await browser.newPage();
124
- await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
125
- await page.goto(url, { waitUntil: "domcontentloaded" });
126
-
127
- const data = await page.evaluate(`(function() {
128
- var items = Array.from(document.querySelectorAll('.gs_r.gs_or.gs_scl'));
129
- return items.map(function(el) {
130
- var titleEl = el.querySelector('.gs_rt');
131
- var title = titleEl ? titleEl.textContent.trim() : null;
132
- var authorsEl = el.querySelector('.gs_a');
133
- var authors = authorsEl ? authorsEl.textContent.trim() : null;
134
- var snippetEl = el.querySelector('.gs_rs');
135
- var snippet = snippetEl ? snippetEl.textContent.trim() : null;
136
- var yearMatch = authors ? authors.match(/\\b(19|20)\\d{2}\\b/) : null;
137
- var year = yearMatch ? yearMatch[0] : null;
138
- return { title, authors, snippet, year };
139
- });
140
- })()`);
141
-
142
- await browser.close();
143
- const items = data as any[];
144
- const raw = items.map((r, i) => `[${i + 1}] ${r.title ?? "Untitled"}\nAuthors: ${r.authors ?? "Unknown"}\nYear: ${r.year ?? "Unknown"}\nSnippet: ${r.snippet ?? "N/A"}`).join("\n\n");
145
- const years = items.map(r => r.year).filter(Boolean).sort().reverse();
146
- const newest = years[0] ?? null;
147
- return { content: [{ type: "text", text: stamp(raw, url, newest ? `${newest}-01-01` : null, newest ? "high" : "low", "google_scholar") }] };
276
+ try {
277
+ const safeUrl = validateUrl(url, "scholar");
278
+ const browser = await puppeteer.launch(env.BROWSER);
279
+ const page = await browser.newPage();
280
+ await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
281
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
282
+
283
+ const data = await page.evaluate(`(function() {
284
+ var items = Array.from(document.querySelectorAll('.gs_r.gs_or.gs_scl'));
285
+ return items.map(function(el) {
286
+ var titleEl = el.querySelector('.gs_rt');
287
+ var title = titleEl ? titleEl.textContent.trim() : null;
288
+ var authorsEl = el.querySelector('.gs_a');
289
+ var authors = authorsEl ? authorsEl.textContent.trim() : null;
290
+ var snippetEl = el.querySelector('.gs_rs');
291
+ var snippet = snippetEl ? snippetEl.textContent.trim() : null;
292
+ var yearMatch = authors ? authors.match(/\\b(19|20)\\d{2}\\b/) : null;
293
+ var year = yearMatch ? yearMatch[0] : null;
294
+ return { title, authors, snippet, year };
295
+ });
296
+ })()`);
297
+
298
+ await browser.close();
299
+ const items = data as any[];
300
+ const raw = items.map((r, i) =>
301
+ `[${i + 1}] ${r.title ?? "Untitled"}\nAuthors: ${r.authors ?? "Unknown"}\nYear: ${r.year ?? "Unknown"}\nSnippet: ${r.snippet ?? "N/A"}`
302
+ ).join("\n\n");
303
+ const years = items.map(r => r.year).filter(Boolean).sort().reverse();
304
+ const newest = years[0] ?? null;
305
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, newest ? `${newest}-01-01` : null, newest ? "high" : "low", "google_scholar") }] };
306
+ } catch (err: any) {
307
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
308
+ }
148
309
  });
149
310
 
150
311
  return server;
@@ -154,6 +315,23 @@ function createServer(env: Env): McpServer {
154
315
 
155
316
  export default {
156
317
  async fetch(request: Request, env: Env): Promise<Response> {
318
+ // Prune stale rate limit entries occasionally
319
+ if (Math.random() < 0.05) pruneRateMap();
320
+
321
+ try {
322
+ // 1. Auth check
323
+ checkAuth(request, env);
324
+
325
+ // 2. Rate limit check
326
+ const ip = getClientIp(request);
327
+ checkRateLimit(ip);
328
+
329
+ } catch (err: any) {
330
+ const status = err.message.startsWith("Unauthorized") ? 401 : 429;
331
+ return securityErrorResponse(err.message, status);
332
+ }
333
+
334
+ // 3. Handle MCP request
157
335
  const transport = new WebStandardStreamableHTTPServerTransport();
158
336
  const server = createServer(env);
159
337
  await server.connect(transport);