paperplain-mcp 1.1.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/server.js +250 -40
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "paperplain-mcp",
3
- "version": "1.1.2",
3
+ "version": "1.2.3",
4
4
  "description": "MCP server — search 200M+ peer-reviewed papers from PubMed, ArXiv, and Semantic Scholar. Free. No API key.",
5
5
  "type": "module",
6
6
  "bin": {
package/server.js CHANGED
@@ -15,14 +15,17 @@ const PUBMED_PARAMS = "tool=paperplain&email=hello@paperplain.io";
15
15
  const SEMANTIC_SCHOLAR_BASE = "https://api.semanticscholar.org/graph/v1";
16
16
 
17
17
  // ── Domain classifier (keyword-based, no LLM needed) ───────────────────────
18
+ // Note: "energy" intentionally excluded from health — it's more common in
19
+ // CS/engineering contexts (energy management, HEMS, smart grid) than health.
18
20
  const HEALTH_KEYWORDS =
19
- /\b(sleep|insomnia|anxiety|anxious|stress|depress|pain|ache|headache|migraine|diet|nutrition|weight|obese|exercise|fatigue|tired|energy|focus|adhd|autism|cancer|diabetes|blood|pressure|heart|cholesterol|vitamin|supplement|immune|gut|digestion|mental health|therapy|meditation|mindfulness|mood|burnout|inflammation|allergy|asthma|skin|aging|memory|alzheimer|cognitive|brain|alcohol|smoking|addiction|symptoms|treatment|medicine|medication|dose|chronic|surgery|vaccine|antibiot|clinical|patient|disease|disorder|syndrome|injury|rehabilitation|psychiatric|neurol|cardio|oncol|gastro|pediatr|geriatric)\b/i;
21
+ /\b(sleep|insomnia|anxiety|anxious|stress|depress|pain|ache|headache|migraine|diet|nutrition|weight|obese|exercise|fatigue|tired|focus|adhd|autism|cancer|diabetes|blood|pressure|heart|cholesterol|vitamin|supplement|immune|gut|digestion|mental health|therapy|meditation|mindfulness|mood|burnout|inflammation|allergy|asthma|skin|aging|memory|alzheimer|cognitive|brain|alcohol|smoking|addiction|symptoms|treatment|medicine|medication|dose|chronic|surgery|vaccine|antibiot|clinical|patient|disease|disorder|syndrome|injury|rehabilitation|psychiatric|neurol|cardio|oncol|gastro|pediatr|geriatric)\b/i;
20
22
  const CS_KEYWORDS =
21
- /\b(algorithm|neural network|machine learning|deep learning|transformer|llm|language model|reinforcement|classification|clustering|regression|computer vision|nlp|natural language|robotics|autonomous|blockchain|cryptograph|database|distributed|cloud|microservice|compiler|operating system|cybersecurity|quantum comput|software engineer|retrieval|embedding|vector|attention|fine.tun|prompt|inference|benchmark)\b/i;
23
+ /\b(algorithm|neural network|machine learning|deep learning|transformer|llm|large language model|language model|reinforcement|classification|clustering|regression|computer vision|nlp|natural language|robotics|autonomous|blockchain|cryptograph|database|distributed|cloud|microservice|compiler|operating system|cybersecurity|quantum comput|software engineer|retrieval|embedding|vector|attention|fine.tun|prompt|inference|benchmark|agentic|multi.agent|smart grid|demand response|energy management|HEMS|home energy|building energy|V2G|vehicle.to.grid|EV charging|electric vehicle|battery storage|renewable energy|solar|wind power|forecasting|optimization|scheduling|control system|model predictive|reinforcement learning)\b/i;
22
24
 
23
25
  function classifyDomain(query) {
24
- if (HEALTH_KEYWORDS.test(query)) return "health";
26
+ // CS check runs first — engineering/AI topics should not fall into health
25
27
  if (CS_KEYWORDS.test(query)) return "cs";
28
+ if (HEALTH_KEYWORDS.test(query)) return "health";
26
29
  return "general";
27
30
  }
28
31
 
@@ -83,16 +86,31 @@ async function searchArxiv(query, maxResults) {
83
86
  try {
84
87
  const res = await fetchWithTimeout(url);
85
88
  if (!res.ok) return [];
86
- return parseArxivXml(await res.text());
89
+ const papers = parseArxivXml(await res.text());
90
+ // If broad search returns nothing, retry with title-field search
91
+ if (papers.length === 0) {
92
+ const titleUrl = `${ARXIV_BASE}?search_query=ti:${encodeURIComponent(query)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
93
+ const titleRes = await fetchWithTimeout(titleUrl);
94
+ if (titleRes.ok) return parseArxivXml(await titleRes.text());
95
+ }
96
+ return papers;
87
97
  } catch {
88
98
  return [];
89
99
  }
90
100
  }
91
101
 
102
+ function normalizeArxivId(arxivId) {
103
+ return arxivId
104
+ .replace(/^arxiv:/i, "")
105
+ .replace(/^https?:\/\/arxiv\.org\/(abs|pdf)\//, "")
106
+ .replace(/\.pdf$/i, "")
107
+ .trim();
108
+ }
109
+
92
110
  async function fetchArxivById(arxivId) {
93
- const clean = arxivId.replace(/^arxiv:/i, "").replace(/^.*abs\//, "").trim();
111
+ const clean = normalizeArxivId(arxivId);
94
112
  try {
95
- const res = await fetchWithTimeout(`${ARXIV_BASE}?id_list=${clean}`);
113
+ const res = await fetchWithTimeout(`${ARXIV_BASE}?id_list=${encodeURIComponent(clean)}`);
96
114
  if (!res.ok) return null;
97
115
  const papers = parseArxivXml(await res.text());
98
116
  return papers[0] || null;
@@ -101,6 +119,36 @@ async function fetchArxivById(arxivId) {
101
119
  }
102
120
  }
103
121
 
122
+ async function fetchS2ByArxivId(arxivId) {
123
+ // S2 accepts ARXIV: prefix — useful as fallback when ArXiv API is rate-limited
124
+ const clean = normalizeArxivId(arxivId).replace(/v\d+$/i, ""); // strip version for S2
125
+ try {
126
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
127
+ const res = await fetchWithTimeout(
128
+ `${SEMANTIC_SCHOLAR_BASE}/paper/ARXIV:${encodeURIComponent(clean)}?fields=${fields}`
129
+ );
130
+ if (!res.ok) return null;
131
+ const item = await res.json().catch(() => null);
132
+ if (!item || !item.paperId || !item.title) return null;
133
+ const ext = item.externalIds || {};
134
+ const doi = ext.DOI || "";
135
+ return {
136
+ id: `arxiv:${clean}`,
137
+ source: "arxiv",
138
+ title: (item.title || "").replace(/\s+/g, " ").trim(),
139
+ authors: Array.isArray(item.authors) ? item.authors.map((a) => a.name).filter(Boolean) : [],
140
+ abstract: (item.abstract || "").replace(/\s+/g, " ").trim(),
141
+ published: item.year ? `${item.year}` : "",
142
+ doi,
143
+ url: `https://arxiv.org/abs/${clean}`,
144
+ pdf_url: item.openAccessPdf?.url || `https://arxiv.org/pdf/${clean}`,
145
+ citations: typeof item.citationCount === "number" ? item.citationCount : 0,
146
+ };
147
+ } catch {
148
+ return null;
149
+ }
150
+ }
151
+
104
152
  // ── PubMed ─────────────────────────────────────────────────────────────────
105
153
  async function searchPubMed(query, maxResults) {
106
154
  try {
@@ -214,7 +262,7 @@ async function searchSemanticScholar(query, maxResults) {
214
262
  // ── MCP Server ─────────────────────────────────────────────────────────────
215
263
  const server = new McpServer({
216
264
  name: "paperplain",
217
- version: "1.1.0",
265
+ version: "1.2.3",
218
266
  description:
219
267
  "Search 200M+ peer-reviewed papers from PubMed, ArXiv, and Semantic Scholar. Returns papers with full abstracts — use your own model to synthesize findings.",
220
268
  });
@@ -250,54 +298,83 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
250
298
  async ({ query, max_results, domain }) => {
251
299
  const resolvedDomain = domain === "auto" ? classifyDomain(query) : domain;
252
300
  let papers = [];
253
- let sources = [];
301
+ // Track each source: "ok" | "empty" | "error"
302
+ const sourceStatus = {};
303
+
304
+ async function safeArxiv(q, n) {
305
+ try {
306
+ const r = await searchArxiv(q, n);
307
+ sourceStatus.arxiv = r.length ? "ok" : "empty";
308
+ return r;
309
+ } catch { sourceStatus.arxiv = "error"; return []; }
310
+ }
311
+ async function safePubMed(q, n) {
312
+ try {
313
+ const r = await searchPubMed(q, n);
314
+ sourceStatus.pubmed = r.length ? "ok" : "empty";
315
+ return r;
316
+ } catch { sourceStatus.pubmed = "error"; return []; }
317
+ }
318
+ async function safeS2(q, n) {
319
+ try {
320
+ const r = await searchSemanticScholar(q, n);
321
+ sourceStatus.semanticscholar = r.length ? "ok" : "empty";
322
+ return r;
323
+ } catch { sourceStatus.semanticscholar = "error"; return []; }
324
+ }
254
325
 
255
326
  try {
256
327
  if (resolvedDomain === "health") {
257
- // PubMed primary, Semantic Scholar as fill
258
- let pubmedPapers = await searchPubMed(query, max_results);
259
- if (pubmedPapers.length) sources.push("pubmed");
328
+ let pubmedPapers = await safePubMed(query, max_results);
260
329
  if (pubmedPapers.length < max_results) {
261
- const s2 = await searchSemanticScholar(query, max_results - pubmedPapers.length);
262
- if (s2.length) sources.push("semanticscholar");
330
+ const s2 = await safeS2(query, max_results - pubmedPapers.length);
263
331
  const seen = new Set(pubmedPapers.map((p) => p.id));
264
332
  for (const p of s2) if (!seen.has(p.id)) pubmedPapers.push(p);
265
333
  }
266
334
  papers = pubmedPapers.slice(0, max_results);
267
335
  } else if (resolvedDomain === "cs") {
268
- // ArXiv + Semantic Scholar, deduplicate overlaps
269
336
  const [arxiv, s2] = await Promise.all([
270
- searchArxiv(query, max_results),
271
- searchSemanticScholar(query, Math.ceil(max_results / 2)),
337
+ safeArxiv(query, max_results),
338
+ safeS2(query, Math.ceil(max_results / 2)),
272
339
  ]);
273
- if (arxiv.length) sources.push("arxiv");
274
- if (s2.length) sources.push("semanticscholar");
275
340
  const maxArxiv = Math.ceil(max_results * 0.6);
276
- const arxivIds = new Set(arxiv.map((p) => p.id));
277
- const uniqueS2 = s2.filter((p) => !arxivIds.has(p.id));
341
+ // Deduplicate on URL — S2 uses arxiv.org URLs for arXiv papers, matching exactly
342
+ const arxivUrls = new Set(arxiv.map((p) => p.url));
343
+ const uniqueS2 = s2.filter((p) => !arxivUrls.has(p.url));
278
344
  papers = [
279
345
  ...arxiv.slice(0, maxArxiv),
280
346
  ...uniqueS2.slice(0, max_results - Math.min(arxiv.length, maxArxiv)),
281
347
  ].slice(0, max_results);
282
348
  } else {
283
- // General: all three sources interleaved
284
349
  const [arxiv, pubmed, s2] = await Promise.all([
285
- searchArxiv(query, max_results),
286
- searchPubMed(query, max_results),
287
- searchSemanticScholar(query, Math.ceil(max_results / 2)),
350
+ safeArxiv(query, max_results),
351
+ safePubMed(query, max_results),
352
+ safeS2(query, Math.ceil(max_results / 2)),
288
353
  ]);
289
- if (arxiv.length) sources.push("arxiv");
290
- if (pubmed.length) sources.push("pubmed");
291
- if (s2.length) sources.push("semanticscholar");
354
+ // Deduplicate S2 against both ArXiv and PubMed URLs
355
+ const seenUrls = new Set([...arxiv.map((p) => p.url), ...pubmed.map((p) => p.url)]);
356
+ const uniqueS2 = s2.filter((p) => !seenUrls.has(p.url));
292
357
  const maxEach = Math.floor(max_results / 3);
293
358
  const remainder = max_results - maxEach * 3;
294
359
  papers = [
295
360
  ...arxiv.slice(0, maxEach + remainder),
296
361
  ...pubmed.slice(0, maxEach),
297
- ...s2.slice(0, maxEach),
362
+ ...uniqueS2.slice(0, maxEach),
298
363
  ].slice(0, max_results);
299
364
  }
300
365
 
366
+ // Warn if expected sources came back empty or errored
367
+ const warnings = [];
368
+ const expectedSources = resolvedDomain === "health"
369
+ ? ["pubmed", "semanticscholar"]
370
+ : resolvedDomain === "cs"
371
+ ? ["arxiv", "semanticscholar"]
372
+ : ["arxiv", "pubmed", "semanticscholar"];
373
+ for (const src of expectedSources) {
374
+ if (sourceStatus[src] === "empty") warnings.push(`${src}: returned 0 results (API may be rate-limited or query too specific)`);
375
+ if (sourceStatus[src] === "error") warnings.push(`${src}: request failed (API may be temporarily unavailable)`);
376
+ }
377
+
301
378
  return {
302
379
  content: [
303
380
  {
@@ -306,7 +383,8 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
306
383
  {
307
384
  query,
308
385
  domain: resolvedDomain,
309
- sources_searched: sources,
386
+ source_status: sourceStatus,
387
+ ...(warnings.length ? { warnings } : {}),
310
388
  total: papers.length,
311
389
  papers: papers.map((p) => ({
312
390
  id: p.id,
@@ -335,33 +413,79 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
335
413
  }
336
414
  );
337
415
 
416
+ // ── Semantic Scholar single-paper lookup (by DOI or S2 paper ID) ───────────
417
+ async function fetchS2ByDoi(doi) {
418
+ try {
419
+ const clean = doi.replace(/^doi:/i, "").trim();
420
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
421
+ const res = await fetchWithTimeout(
422
+ `${SEMANTIC_SCHOLAR_BASE}/paper/DOI:${encodeURIComponent(clean)}?fields=${fields}`
423
+ );
424
+ if (!res.ok) return null;
425
+ const item = await res.json().catch(() => null);
426
+ if (!item || !item.paperId || !item.title) return null;
427
+ const ext = item.externalIds || {};
428
+ const arxivId = ext.ArXiv || "";
429
+ let url;
430
+ if (arxivId) url = `https://arxiv.org/abs/${arxivId}`;
431
+ else if (clean) url = `https://doi.org/${clean}`;
432
+ else url = `https://www.semanticscholar.org/paper/${item.paperId}`;
433
+ return {
434
+ id: `s2:${item.paperId}`,
435
+ source: "semanticscholar",
436
+ title: (item.title || "").replace(/\s+/g, " ").trim(),
437
+ authors: Array.isArray(item.authors) ? item.authors.map((a) => a.name).filter(Boolean) : [],
438
+ abstract: (item.abstract || "").replace(/\s+/g, " ").trim(),
439
+ published: item.year ? `${item.year}` : "",
440
+ doi: clean,
441
+ url,
442
+ pdf_url: item.openAccessPdf?.url || "",
443
+ citations: typeof item.citationCount === "number" ? item.citationCount : 0,
444
+ };
445
+ } catch {
446
+ return null;
447
+ }
448
+ }
449
+
338
450
  // Tool 2: fetch_paper
339
451
  server.tool(
340
452
  "fetch_paper",
341
- `Fetch the full abstract and metadata for a specific paper by ID.
342
- Supports ArXiv IDs (e.g. '2301.07041' or 'arxiv:2301.07041') and PubMed IDs (e.g. 'pubmed:37183813' or just '37183813').
343
- Use this to get the full abstract of a paper you already know about.`,
453
+ `Fetch the full abstract and metadata for a specific paper by ID or DOI.
454
+ Supports:
455
+ - ArXiv IDs: '2301.07041', 'arxiv:2301.07041v2', 'https://arxiv.org/abs/2301.07041'
456
+ - PubMed IDs: 'pubmed:37183813' or just '37183813'
457
+ - DOIs: '10.1145/3290605.3300857' or 'doi:10.1145/3290605.3300857' (looks up via Semantic Scholar)
458
+ Use this to verify a specific paper you already know about or to retrieve its abstract.`,
344
459
  {
345
460
  paper_id: z
346
461
  .string()
347
462
  .describe(
348
- "ArXiv ID (e.g. '2301.07041') or PubMed ID (e.g. 'pubmed:37183813')"
463
+ "ArXiv ID, PubMed ID, or DOI — e.g. '2301.07041', 'pubmed:37183813', or '10.1145/3290605.3300857'"
349
464
  ),
350
465
  },
351
466
  async ({ paper_id }) => {
352
467
  try {
468
+ const trimmed = paper_id.trim();
353
469
  const isArxiv =
354
- /arxiv:/i.test(paper_id) ||
355
- /^\d{4}\.\d{4,5}$/.test(paper_id.trim()) ||
356
- /arxiv\.org/.test(paper_id);
357
- const isPubMed = /pubmed:/i.test(paper_id) || /^\d{6,9}$/.test(paper_id.trim());
470
+ /arxiv:/i.test(trimmed) ||
471
+ /^\d{4}\.\d{4,5}(v\d+)?$/.test(trimmed) || // 2301.07041 or 2301.07041v2
472
+ /^[a-z-]+(\.[A-Z]+)?\/\d{7}(v\d+)?$/.test(trimmed) || // old format: cs.LG/0504010
473
+ /arxiv\.org/.test(trimmed);
474
+ const isPubMed =
475
+ /pubmed:/i.test(trimmed) || /^\d{6,9}$/.test(trimmed);
476
+ const isDOI =
477
+ /^doi:/i.test(trimmed) || /^10\.\d{4,}\/\S+$/.test(trimmed);
358
478
 
359
479
  let paper = null;
360
480
 
361
481
  if (isArxiv) {
362
- paper = await fetchArxivById(paper_id);
482
+ paper = await fetchArxivById(trimmed);
483
+ // Fallback: ArXiv API rate-limits under parallel load — try S2 ARXIV: endpoint
484
+ if (!paper) paper = await fetchS2ByArxivId(trimmed);
485
+ } else if (isDOI) {
486
+ paper = await fetchS2ByDoi(trimmed);
363
487
  } else if (isPubMed) {
364
- const pmid = paper_id.replace(/^pubmed:/i, "").trim();
488
+ const pmid = trimmed.replace(/^pubmed:/i, "").trim();
365
489
  const abstracts = await fetchPubMedAbstracts([pmid]);
366
490
  const summaryUrl = `${PUBMED_BASE}/esummary.fcgi?db=pubmed&id=${pmid}&retmode=json&${PUBMED_PARAMS}`;
367
491
  const summaryRes = await fetch(summaryUrl);
@@ -385,7 +509,10 @@ Use this to get the full abstract of a paper you already know about.`,
385
509
 
386
510
  if (!paper) {
387
511
  return {
388
- content: [{ type: "text", text: `Paper not found: ${paper_id}` }],
512
+ content: [{
513
+ type: "text",
514
+ text: `Paper not found: ${paper_id}\n\nTip: For arXiv papers, try the bare ID (e.g. '2301.07041'). For journal papers, try the DOI (e.g. '10.1145/3290605.3300857'). For PubMed papers, use the PMID number.`,
515
+ }],
389
516
  isError: true,
390
517
  };
391
518
  }
@@ -402,5 +529,88 @@ Use this to get the full abstract of a paper you already know about.`,
402
529
  }
403
530
  );
404
531
 
532
+ // Tool 3: find_paper_by_title
533
+ server.tool(
534
+ "find_paper_by_title",
535
+ `Find a specific paper when you only know its title (or partial title).
536
+ Uses Semantic Scholar's title-match search. Returns the closest match with full abstract, authors, DOI, and source URL.
537
+ Useful for verifying a citation or retrieving abstract details for a paper you already know exists.`,
538
+ {
539
+ title: z
540
+ .string()
541
+ .describe("The paper title or a key phrase from it, e.g. 'Attention Is All You Need'"),
542
+ year: z
543
+ .number()
544
+ .optional()
545
+ .describe("Publication year to narrow down the match (optional)"),
546
+ },
547
+ async ({ title, year }) => {
548
+ try {
549
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
550
+ const url = `${SEMANTIC_SCHOLAR_BASE}/paper/search?query=${encodeURIComponent(title)}&limit=5&fields=${fields}`;
551
+ const res = await fetchWithTimeout(url);
552
+ if (!res.ok) {
553
+ return {
554
+ content: [{ type: "text", text: `Search failed: Semantic Scholar returned ${res.status}` }],
555
+ isError: true,
556
+ };
557
+ }
558
+ const data = await res.json().catch(() => null);
559
+ if (!data?.data?.length) {
560
+ return {
561
+ content: [{ type: "text", text: `No papers found matching: "${title}"` }],
562
+ isError: true,
563
+ };
564
+ }
565
+
566
+ // Pick best match: prefer year match if provided, otherwise take top result
567
+ let candidates = data.data.filter((p) => p.title && p.abstract);
568
+ if (!candidates.length) candidates = data.data.filter((p) => p.title);
569
+ if (!candidates.length) {
570
+ return {
571
+ content: [{ type: "text", text: `No papers found matching: "${title}"` }],
572
+ isError: true,
573
+ };
574
+ }
575
+
576
+ let best = candidates[0];
577
+ if (year) {
578
+ const yearMatch = candidates.find((p) => p.year === year);
579
+ if (yearMatch) best = yearMatch;
580
+ }
581
+
582
+ const ext = best.externalIds || {};
583
+ const doi = ext.DOI || "";
584
+ const arxivId = ext.ArXiv || "";
585
+ let paperUrl;
586
+ if (arxivId) paperUrl = `https://arxiv.org/abs/${arxivId}`;
587
+ else if (doi) paperUrl = `https://doi.org/${doi}`;
588
+ else paperUrl = `https://www.semanticscholar.org/paper/${best.paperId}`;
589
+
590
+ const paper = {
591
+ id: arxivId ? `arxiv:${arxivId}` : `s2:${best.paperId}`,
592
+ source: arxivId ? "arxiv" : "semanticscholar",
593
+ title: (best.title || "").replace(/\s+/g, " ").trim(),
594
+ authors: Array.isArray(best.authors) ? best.authors.map((a) => a.name).filter(Boolean) : [],
595
+ abstract: (best.abstract || "").replace(/\s+/g, " ").trim(),
596
+ published: best.year ? `${best.year}` : "",
597
+ doi,
598
+ url: paperUrl,
599
+ pdf_url: best.openAccessPdf?.url || "",
600
+ citations: typeof best.citationCount === "number" ? best.citationCount : 0,
601
+ };
602
+
603
+ return {
604
+ content: [{ type: "text", text: JSON.stringify(paper, null, 2) }],
605
+ };
606
+ } catch (err) {
607
+ return {
608
+ content: [{ type: "text", text: `find_paper_by_title failed: ${err.message}` }],
609
+ isError: true,
610
+ };
611
+ }
612
+ }
613
+ );
614
+
405
615
  const transport = new StdioServerTransport();
406
616
  await server.connect(transport);