freshcontext-mcp 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/worker/src/worker.ts +261 -83
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "freshcontext-mcp",
3
- "version": "0.1.3",
3
+ "version": "0.1.4",
4
4
  "description": "Real-time web extraction MCP server with freshness timestamps for AI agents",
5
5
  "keywords": [
6
6
  "mcp",
@@ -3,10 +3,11 @@ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
3
  import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/webStandardStreamableHttp.js";
4
4
  import { z } from "zod";
5
5
 
6
- // ─── Types ───────────────────────────────────────────────────────────────────
6
+ // ─── Types ────────────────────────────────────────────────────────────────────
7
7
 
8
8
  interface Env {
9
9
  BROWSER: Fetcher;
10
+ API_KEY?: string; // Optional: set via `wrangler secret put API_KEY`
10
11
  }
11
12
 
12
13
  interface FreshContext {
@@ -18,9 +19,143 @@ interface FreshContext {
18
19
  adapter: string;
19
20
  }
20
21
 
21
- // ─── Freshness Stamp ─────────────────────────────────────────────────────────
22
+ // ─── Security ─────────────────────────────────────────────────────────────────
22
23
 
23
- function stamp(content: string, url: string, date: string | null, confidence: "high" | "medium" | "low", adapter: string): string {
24
+ const ALLOWED_DOMAINS: Record<string, string[]> = {
25
+ github: ["github.com", "raw.githubusercontent.com"],
26
+ scholar: ["scholar.google.com"],
27
+ hackernews: ["news.ycombinator.com", "hn.algolia.com"],
28
+ yc: ["www.ycombinator.com", "ycombinator.com"],
29
+ };
30
+
31
+ const PRIVATE_IP_PATTERNS = [
32
+ /^localhost$/i,
33
+ /^127\./,
34
+ /^10\./,
35
+ /^192\.168\./,
36
+ /^172\.(1[6-9]|2\d|3[01])\./,
37
+ /^169\.254\./,
38
+ /^::1$/,
39
+ /^fc00:/i,
40
+ /^fe80:/i,
41
+ ];
42
+
43
+ const MAX_URL_LENGTH = 500;
44
+ const MAX_QUERY_LENGTH = 200;
45
+
46
+ class SecurityError extends Error {
47
+ constructor(message: string) {
48
+ super(message);
49
+ this.name = "SecurityError";
50
+ }
51
+ }
52
+
53
+ function validateUrl(rawUrl: string, adapter: string): string {
54
+ if (rawUrl.length > MAX_URL_LENGTH)
55
+ throw new SecurityError(`URL too long (max ${MAX_URL_LENGTH} chars)`);
56
+
57
+ let parsed: URL;
58
+ try { parsed = new URL(rawUrl); }
59
+ catch { throw new SecurityError("Invalid URL format"); }
60
+
61
+ if (!["http:", "https:"].includes(parsed.protocol))
62
+ throw new SecurityError("Only http/https URLs are allowed");
63
+
64
+ const hostname = parsed.hostname.toLowerCase();
65
+
66
+ for (const pattern of PRIVATE_IP_PATTERNS) {
67
+ if (pattern.test(hostname))
68
+ throw new SecurityError("Access to private/internal addresses is not allowed");
69
+ }
70
+
71
+ const allowed = ALLOWED_DOMAINS[adapter];
72
+ if (allowed && allowed.length > 0) {
73
+ const ok = allowed.some(d => hostname === d || hostname.endsWith(`.${d}`));
74
+ if (!ok)
75
+ throw new SecurityError(`URL not allowed for ${adapter}. Allowed domains: ${allowed.join(", ")}`);
76
+ }
77
+
78
+ return rawUrl;
79
+ }
80
+
81
+ function sanitizeQuery(query: string, maxLen = MAX_QUERY_LENGTH): string {
82
+ if (query.length > maxLen)
83
+ throw new SecurityError(`Query too long (max ${maxLen} chars)`);
84
+ // Strip null bytes and control characters
85
+ return query.replace(/[\x00-\x1F\x7F]/g, "").trim();
86
+ }
87
+
88
+ // ─── Rate Limiting (in-memory, per isolate) ───────────────────────────────────
89
+
90
+ interface RateEntry { count: number; windowStart: number; }
91
+ const rateMap = new Map<string, RateEntry>();
92
+
93
+ const RATE_LIMIT = 20; // max requests
94
+ const RATE_WINDOW_MS = 60_000; // per 60 seconds
95
+
96
+ function checkRateLimit(ip: string): void {
97
+ const now = Date.now();
98
+ const entry = rateMap.get(ip);
99
+
100
+ if (!entry || now - entry.windowStart > RATE_WINDOW_MS) {
101
+ rateMap.set(ip, { count: 1, windowStart: now });
102
+ return;
103
+ }
104
+
105
+ if (entry.count >= RATE_LIMIT) {
106
+ throw new SecurityError(`Rate limit exceeded. Max ${RATE_LIMIT} requests per minute.`);
107
+ }
108
+
109
+ entry.count++;
110
+ }
111
+
112
+ // Prevent the map from growing unboundedly
113
+ function pruneRateMap(): void {
114
+ const now = Date.now();
115
+ for (const [ip, entry] of rateMap) {
116
+ if (now - entry.windowStart > RATE_WINDOW_MS) rateMap.delete(ip);
117
+ }
118
+ }
119
+
120
+ // ─── Auth ─────────────────────────────────────────────────────────────────────
121
+
122
+ function checkAuth(request: Request, env: Env): void {
123
+ if (!env.API_KEY) return; // Auth disabled if no key is set
124
+
125
+ const authHeader = request.headers.get("Authorization") ?? "";
126
+ const token = authHeader.startsWith("Bearer ") ? authHeader.slice(7) : "";
127
+
128
+ if (token !== env.API_KEY) {
129
+ throw new SecurityError("Unauthorized. Provide a valid Bearer token.");
130
+ }
131
+ }
132
+
133
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
134
+
135
+ function getClientIp(request: Request): string {
136
+ return (
137
+ request.headers.get("CF-Connecting-IP") ??
138
+ request.headers.get("X-Forwarded-For")?.split(",")[0]?.trim() ??
139
+ "unknown"
140
+ );
141
+ }
142
+
143
+ function securityErrorResponse(message: string, status: number): Response {
144
+ return new Response(JSON.stringify({ error: message }), {
145
+ status,
146
+ headers: { "Content-Type": "application/json" },
147
+ });
148
+ }
149
+
150
+ // ─── Freshness Stamp ──────────────────────────────────────────────────────────
151
+
152
+ function stamp(
153
+ content: string,
154
+ url: string,
155
+ date: string | null,
156
+ confidence: "high" | "medium" | "low",
157
+ adapter: string
158
+ ): string {
24
159
  const ctx: FreshContext = {
25
160
  content: content.slice(0, 6000),
26
161
  source_url: url,
@@ -44,107 +179,133 @@ function stamp(content: string, url: string, date: string | null, confidence: "h
44
179
  // ─── Server Factory ───────────────────────────────────────────────────────────
45
180
 
46
181
  function createServer(env: Env): McpServer {
47
- const server = new McpServer({ name: "freshcontext-mcp", version: "0.1.0" });
182
+ const server = new McpServer({ name: "freshcontext-mcp", version: "0.1.3" });
48
183
 
49
184
  // ── extract_github ──────────────────────────────────────────────────────────
50
185
  server.registerTool("extract_github", {
51
186
  description: "Extract real-time data from a GitHub repository — README, stars, forks, last commit, topics. Returns timestamped freshcontext.",
52
187
  inputSchema: z.object({
53
- url: z.string().url().describe("Full GitHub repo URL"),
188
+ url: z.string().url().describe("Full GitHub repo URL e.g. https://github.com/owner/repo"),
54
189
  }),
55
190
  annotations: { readOnlyHint: true, openWorldHint: true },
56
191
  }, async ({ url }) => {
57
- const browser = await puppeteer.launch(env.BROWSER);
58
- const page = await browser.newPage();
59
- await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
60
- await page.goto(url, { waitUntil: "domcontentloaded" });
61
-
62
- const data = await page.evaluate(`(function() {
63
- var readme = (document.querySelector('[data-target="readme-toc.content"]') || document.querySelector('.markdown-body') || {}).textContent || null;
64
- var starsEl = document.querySelector('[id="repo-stars-counter-star"]') || document.querySelector('.Counter.js-social-count');
65
- var stars = starsEl ? starsEl.textContent.trim() : null;
66
- var forksEl = document.querySelector('[id="repo-network-counter"]');
67
- var forks = forksEl ? forksEl.textContent.trim() : null;
68
- var commitEl = document.querySelector('relative-time');
69
- var lastCommit = commitEl ? commitEl.getAttribute('datetime') : null;
70
- var descEl = document.querySelector('.f4.my-3');
71
- var description = descEl ? descEl.textContent.trim() : null;
72
- var topics = Array.from(document.querySelectorAll('.topic-tag')).map(function(t) { return t.textContent.trim(); });
73
- var langEl = document.querySelector('.color-fg-default.text-bold.mr-1');
74
- var language = langEl ? langEl.textContent.trim() : null;
75
- return { readme, stars, forks, lastCommit, description, topics, language };
76
- })()`);
77
-
78
- await browser.close();
79
- const d = data as any;
80
- const raw = [`Description: ${d.description ?? "N/A"}`, `Stars: ${d.stars ?? "N/A"} | Forks: ${d.forks ?? "N/A"}`, `Language: ${d.language ?? "N/A"}`, `Last commit: ${d.lastCommit ?? "N/A"}`, `Topics: ${d.topics?.join(", ") ?? "none"}`, `\n--- README ---\n${d.readme ?? "No README"}`].join("\n");
81
- return { content: [{ type: "text", text: stamp(raw, url, d.lastCommit ?? null, d.lastCommit ? "high" : "medium", "github") }] };
192
+ try {
193
+ const safeUrl = validateUrl(url, "github");
194
+ const browser = await puppeteer.launch(env.BROWSER);
195
+ const page = await browser.newPage();
196
+ await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
197
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
198
+
199
+ const data = await page.evaluate(`(function() {
200
+ var readme = (document.querySelector('[data-target="readme-toc.content"]') || document.querySelector('.markdown-body') || {}).textContent || null;
201
+ var starsEl = document.querySelector('[id="repo-stars-counter-star"]') || document.querySelector('.Counter.js-social-count');
202
+ var stars = starsEl ? starsEl.textContent.trim() : null;
203
+ var forksEl = document.querySelector('[id="repo-network-counter"]');
204
+ var forks = forksEl ? forksEl.textContent.trim() : null;
205
+ var commitEl = document.querySelector('relative-time');
206
+ var lastCommit = commitEl ? commitEl.getAttribute('datetime') : null;
207
+ var descEl = document.querySelector('.f4.my-3');
208
+ var description = descEl ? descEl.textContent.trim() : null;
209
+ var topics = Array.from(document.querySelectorAll('.topic-tag')).map(function(t) { return t.textContent.trim(); });
210
+ var langEl = document.querySelector('.color-fg-default.text-bold.mr-1');
211
+ var language = langEl ? langEl.textContent.trim() : null;
212
+ return { readme, stars, forks, lastCommit, description, topics, language };
213
+ })()`);
214
+
215
+ await browser.close();
216
+ const d = data as any;
217
+ const raw = [
218
+ `Description: ${d.description ?? "N/A"}`,
219
+ `Stars: ${d.stars ?? "N/A"} | Forks: ${d.forks ?? "N/A"}`,
220
+ `Language: ${d.language ?? "N/A"}`,
221
+ `Last commit: ${d.lastCommit ?? "N/A"}`,
222
+ `Topics: ${d.topics?.join(", ") ?? "none"}`,
223
+ `\n--- README ---\n${d.readme ?? "No README"}`,
224
+ ].join("\n");
225
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, d.lastCommit ?? null, d.lastCommit ? "high" : "medium", "github") }] };
226
+ } catch (err: any) {
227
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
228
+ }
82
229
  });
83
230
 
84
231
  // ── extract_hackernews ──────────────────────────────────────────────────────
85
232
  server.registerTool("extract_hackernews", {
86
- description: "Extract top stories from Hacker News with real-time timestamps.",
87
- inputSchema: z.object({ url: z.string().url().describe("HN URL") }),
233
+ description: "Extract top stories or search results from Hacker News with real-time timestamps.",
234
+ inputSchema: z.object({ url: z.string().url().describe("HN URL e.g. https://news.ycombinator.com") }),
88
235
  annotations: { readOnlyHint: true, openWorldHint: true },
89
236
  }, async ({ url }) => {
90
- const browser = await puppeteer.launch(env.BROWSER);
91
- const page = await browser.newPage();
92
- await page.goto(url, { waitUntil: "domcontentloaded" });
93
-
94
- const data = await page.evaluate(`(function() {
95
- var items = Array.from(document.querySelectorAll('.athing')).slice(0, 20);
96
- return items.map(function(el) {
97
- var titleLineEl = el.querySelector('.titleline > a');
98
- var title = titleLineEl ? titleLineEl.textContent.trim() : null;
99
- var link = titleLineEl ? titleLineEl.getAttribute('href') : null;
100
- var subtext = el.nextElementSibling;
101
- var scoreEl = subtext ? subtext.querySelector('.score') : null;
102
- var score = scoreEl ? scoreEl.textContent.trim() : null;
103
- var ageEl = subtext ? subtext.querySelector('.age') : null;
104
- var age = ageEl ? ageEl.getAttribute('title') : null;
105
- return { title, link, score, age };
106
- });
107
- })()`);
108
-
109
- await browser.close();
110
- const items = data as any[];
111
- const raw = items.map((r, i) => `[${i + 1}] ${r.title}\nURL: ${r.link}\nScore: ${r.score ?? "N/A"}\nPosted: ${r.age ?? "unknown"}`).join("\n\n");
112
- const newest = items.map(r => r.age).filter(Boolean).sort().reverse()[0] ?? null;
113
- return { content: [{ type: "text", text: stamp(raw, url, newest, newest ? "high" : "medium", "hackernews") }] };
237
+ try {
238
+ const safeUrl = validateUrl(url, "hackernews");
239
+ const browser = await puppeteer.launch(env.BROWSER);
240
+ const page = await browser.newPage();
241
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
242
+
243
+ const data = await page.evaluate(`(function() {
244
+ var items = Array.from(document.querySelectorAll('.athing')).slice(0, 20);
245
+ return items.map(function(el) {
246
+ var titleLineEl = el.querySelector('.titleline > a');
247
+ var title = titleLineEl ? titleLineEl.textContent.trim() : null;
248
+ var link = titleLineEl ? titleLineEl.getAttribute('href') : null;
249
+ var subtext = el.nextElementSibling;
250
+ var scoreEl = subtext ? subtext.querySelector('.score') : null;
251
+ var score = scoreEl ? scoreEl.textContent.trim() : null;
252
+ var ageEl = subtext ? subtext.querySelector('.age') : null;
253
+ var age = ageEl ? ageEl.getAttribute('title') : null;
254
+ return { title, link, score, age };
255
+ });
256
+ })()`);
257
+
258
+ await browser.close();
259
+ const items = data as any[];
260
+ const raw = items.map((r, i) =>
261
+ `[${i + 1}] ${r.title}\nURL: ${r.link}\nScore: ${r.score ?? "N/A"}\nPosted: ${r.age ?? "unknown"}`
262
+ ).join("\n\n");
263
+ const newest = items.map(r => r.age).filter(Boolean).sort().reverse()[0] ?? null;
264
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, newest, newest ? "high" : "medium", "hackernews") }] };
265
+ } catch (err: any) {
266
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
267
+ }
114
268
  });
115
269
 
116
270
  // ── extract_scholar ─────────────────────────────────────────────────────────
117
271
  server.registerTool("extract_scholar", {
118
272
  description: "Extract research results from Google Scholar with publication dates.",
119
- inputSchema: z.object({ url: z.string().url().describe("Google Scholar URL") }),
273
+ inputSchema: z.object({ url: z.string().url().describe("Google Scholar search URL") }),
120
274
  annotations: { readOnlyHint: true, openWorldHint: true },
121
275
  }, async ({ url }) => {
122
- const browser = await puppeteer.launch(env.BROWSER);
123
- const page = await browser.newPage();
124
- await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
125
- await page.goto(url, { waitUntil: "domcontentloaded" });
126
-
127
- const data = await page.evaluate(`(function() {
128
- var items = Array.from(document.querySelectorAll('.gs_r.gs_or.gs_scl'));
129
- return items.map(function(el) {
130
- var titleEl = el.querySelector('.gs_rt');
131
- var title = titleEl ? titleEl.textContent.trim() : null;
132
- var authorsEl = el.querySelector('.gs_a');
133
- var authors = authorsEl ? authorsEl.textContent.trim() : null;
134
- var snippetEl = el.querySelector('.gs_rs');
135
- var snippet = snippetEl ? snippetEl.textContent.trim() : null;
136
- var yearMatch = authors ? authors.match(/\\b(19|20)\\d{2}\\b/) : null;
137
- var year = yearMatch ? yearMatch[0] : null;
138
- return { title, authors, snippet, year };
139
- });
140
- })()`);
141
-
142
- await browser.close();
143
- const items = data as any[];
144
- const raw = items.map((r, i) => `[${i + 1}] ${r.title ?? "Untitled"}\nAuthors: ${r.authors ?? "Unknown"}\nYear: ${r.year ?? "Unknown"}\nSnippet: ${r.snippet ?? "N/A"}`).join("\n\n");
145
- const years = items.map(r => r.year).filter(Boolean).sort().reverse();
146
- const newest = years[0] ?? null;
147
- return { content: [{ type: "text", text: stamp(raw, url, newest ? `${newest}-01-01` : null, newest ? "high" : "low", "google_scholar") }] };
276
+ try {
277
+ const safeUrl = validateUrl(url, "scholar");
278
+ const browser = await puppeteer.launch(env.BROWSER);
279
+ const page = await browser.newPage();
280
+ await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36");
281
+ await page.goto(safeUrl, { waitUntil: "domcontentloaded" });
282
+
283
+ const data = await page.evaluate(`(function() {
284
+ var items = Array.from(document.querySelectorAll('.gs_r.gs_or.gs_scl'));
285
+ return items.map(function(el) {
286
+ var titleEl = el.querySelector('.gs_rt');
287
+ var title = titleEl ? titleEl.textContent.trim() : null;
288
+ var authorsEl = el.querySelector('.gs_a');
289
+ var authors = authorsEl ? authorsEl.textContent.trim() : null;
290
+ var snippetEl = el.querySelector('.gs_rs');
291
+ var snippet = snippetEl ? snippetEl.textContent.trim() : null;
292
+ var yearMatch = authors ? authors.match(/\\b(19|20)\\d{2}\\b/) : null;
293
+ var year = yearMatch ? yearMatch[0] : null;
294
+ return { title, authors, snippet, year };
295
+ });
296
+ })()`);
297
+
298
+ await browser.close();
299
+ const items = data as any[];
300
+ const raw = items.map((r, i) =>
301
+ `[${i + 1}] ${r.title ?? "Untitled"}\nAuthors: ${r.authors ?? "Unknown"}\nYear: ${r.year ?? "Unknown"}\nSnippet: ${r.snippet ?? "N/A"}`
302
+ ).join("\n\n");
303
+ const years = items.map(r => r.year).filter(Boolean).sort().reverse();
304
+ const newest = years[0] ?? null;
305
+ return { content: [{ type: "text", text: stamp(raw, safeUrl, newest ? `${newest}-01-01` : null, newest ? "high" : "low", "google_scholar") }] };
306
+ } catch (err: any) {
307
+ return { content: [{ type: "text", text: `[ERROR] ${err.message}` }] };
308
+ }
148
309
  });
149
310
 
150
311
  return server;
@@ -154,6 +315,23 @@ function createServer(env: Env): McpServer {
154
315
 
155
316
  export default {
156
317
  async fetch(request: Request, env: Env): Promise<Response> {
318
+ // Prune stale rate limit entries occasionally
319
+ if (Math.random() < 0.05) pruneRateMap();
320
+
321
+ try {
322
+ // 1. Auth check
323
+ checkAuth(request, env);
324
+
325
+ // 2. Rate limit check
326
+ const ip = getClientIp(request);
327
+ checkRateLimit(ip);
328
+
329
+ } catch (err: any) {
330
+ const status = err.message.startsWith("Unauthorized") ? 401 : 429;
331
+ return securityErrorResponse(err.message, status);
332
+ }
333
+
334
+ // 3. Handle MCP request
157
335
  const transport = new WebStandardStreamableHTTPServerTransport();
158
336
  const server = createServer(env);
159
337
  await server.connect(transport);