paperplain-mcp 1.1.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/server.js +250 -40
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "paperplain-mcp",
3
- "version": "1.1.2",
3
+ "version": "1.2.3",
4
4
  "description": "MCP server — search 200M+ peer-reviewed papers from PubMed, ArXiv, and Semantic Scholar. Free. No API key.",
5
5
  "type": "module",
6
6
  "bin": {
package/server.js CHANGED
@@ -15,14 +15,17 @@ const PUBMED_PARAMS = "tool=paperplain&email=hello@paperplain.io";
15
15
  const SEMANTIC_SCHOLAR_BASE = "https://api.semanticscholar.org/graph/v1";
16
16
 
17
17
  // ── Domain classifier (keyword-based, no LLM needed) ───────────────────────
18
+ // Note: "energy" intentionally excluded from health — it's more common in
19
+ // CS/engineering contexts (energy management, HEMS, smart grid) than health.
18
20
  const HEALTH_KEYWORDS =
19
- /\b(sleep|insomnia|anxiety|anxious|stress|depress|pain|ache|headache|migraine|diet|nutrition|weight|obese|exercise|fatigue|tired|energy|focus|adhd|autism|cancer|diabetes|blood|pressure|heart|cholesterol|vitamin|supplement|immune|gut|digestion|mental health|therapy|meditation|mindfulness|mood|burnout|inflammation|allergy|asthma|skin|aging|memory|alzheimer|cognitive|brain|alcohol|smoking|addiction|symptoms|treatment|medicine|medication|dose|chronic|surgery|vaccine|antibiot|clinical|patient|disease|disorder|syndrome|injury|rehabilitation|psychiatric|neurol|cardio|oncol|gastro|pediatr|geriatric)\b/i;
21
+ /\b(sleep|insomnia|anxiety|anxious|stress|depress|pain|ache|headache|migraine|diet|nutrition|weight|obese|exercise|fatigue|tired|focus|adhd|autism|cancer|diabetes|blood|pressure|heart|cholesterol|vitamin|supplement|immune|gut|digestion|mental health|therapy|meditation|mindfulness|mood|burnout|inflammation|allergy|asthma|skin|aging|memory|alzheimer|cognitive|brain|alcohol|smoking|addiction|symptoms|treatment|medicine|medication|dose|chronic|surgery|vaccine|antibiot|clinical|patient|disease|disorder|syndrome|injury|rehabilitation|psychiatric|neurol|cardio|oncol|gastro|pediatr|geriatric)\b/i;
20
22
  const CS_KEYWORDS =
21
- /\b(algorithm|neural network|machine learning|deep learning|transformer|llm|language model|reinforcement|classification|clustering|regression|computer vision|nlp|natural language|robotics|autonomous|blockchain|cryptograph|database|distributed|cloud|microservice|compiler|operating system|cybersecurity|quantum comput|software engineer|retrieval|embedding|vector|attention|fine.tun|prompt|inference|benchmark)\b/i;
23
+ /\b(algorithm|neural network|machine learning|deep learning|transformer|llm|large language model|language model|reinforcement|classification|clustering|regression|computer vision|nlp|natural language|robotics|autonomous|blockchain|cryptograph|database|distributed|cloud|microservice|compiler|operating system|cybersecurity|quantum comput|software engineer|retrieval|embedding|vector|attention|fine.tun|prompt|inference|benchmark|agentic|multi.agent|smart grid|demand response|energy management|HEMS|home energy|building energy|V2G|vehicle.to.grid|EV charging|electric vehicle|battery storage|renewable energy|solar|wind power|forecasting|optimization|scheduling|control system|model predictive|reinforcement learning)\b/i;
22
24
 
23
25
  function classifyDomain(query) {
24
- if (HEALTH_KEYWORDS.test(query)) return "health";
26
+ // CS check runs first — engineering/AI topics should not fall into health
25
27
  if (CS_KEYWORDS.test(query)) return "cs";
28
+ if (HEALTH_KEYWORDS.test(query)) return "health";
26
29
  return "general";
27
30
  }
28
31
 
@@ -83,16 +86,31 @@ async function searchArxiv(query, maxResults) {
83
86
  try {
84
87
  const res = await fetchWithTimeout(url);
85
88
  if (!res.ok) return [];
86
- return parseArxivXml(await res.text());
89
+ const papers = parseArxivXml(await res.text());
90
+ // If broad search returns nothing, retry with title-field search
91
+ if (papers.length === 0) {
92
+ const titleUrl = `${ARXIV_BASE}?search_query=ti:${encodeURIComponent(query)}&start=0&max_results=${maxResults}&sortBy=relevance&sortOrder=descending`;
93
+ const titleRes = await fetchWithTimeout(titleUrl);
94
+ if (titleRes.ok) return parseArxivXml(await titleRes.text());
95
+ }
96
+ return papers;
87
97
  } catch {
88
98
  return [];
89
99
  }
90
100
  }
91
101
 
102
+ function normalizeArxivId(arxivId) {
103
+ return arxivId
104
+ .replace(/^arxiv:/i, "")
105
+ .replace(/^https?:\/\/arxiv\.org\/(abs|pdf)\//, "")
106
+ .replace(/\.pdf$/i, "")
107
+ .trim();
108
+ }
109
+
92
110
  async function fetchArxivById(arxivId) {
93
- const clean = arxivId.replace(/^arxiv:/i, "").replace(/^.*abs\//, "").trim();
111
+ const clean = normalizeArxivId(arxivId);
94
112
  try {
95
- const res = await fetchWithTimeout(`${ARXIV_BASE}?id_list=${clean}`);
113
+ const res = await fetchWithTimeout(`${ARXIV_BASE}?id_list=${encodeURIComponent(clean)}`);
96
114
  if (!res.ok) return null;
97
115
  const papers = parseArxivXml(await res.text());
98
116
  return papers[0] || null;
@@ -101,6 +119,36 @@ async function fetchArxivById(arxivId) {
101
119
  }
102
120
  }
103
121
 
122
+ async function fetchS2ByArxivId(arxivId) {
123
+ // S2 accepts ARXIV: prefix — useful as fallback when ArXiv API is rate-limited
124
+ const clean = normalizeArxivId(arxivId).replace(/v\d+$/i, ""); // strip version for S2
125
+ try {
126
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
127
+ const res = await fetchWithTimeout(
128
+ `${SEMANTIC_SCHOLAR_BASE}/paper/ARXIV:${encodeURIComponent(clean)}?fields=${fields}`
129
+ );
130
+ if (!res.ok) return null;
131
+ const item = await res.json().catch(() => null);
132
+ if (!item || !item.paperId || !item.title) return null;
133
+ const ext = item.externalIds || {};
134
+ const doi = ext.DOI || "";
135
+ return {
136
+ id: `arxiv:${clean}`,
137
+ source: "arxiv",
138
+ title: (item.title || "").replace(/\s+/g, " ").trim(),
139
+ authors: Array.isArray(item.authors) ? item.authors.map((a) => a.name).filter(Boolean) : [],
140
+ abstract: (item.abstract || "").replace(/\s+/g, " ").trim(),
141
+ published: item.year ? `${item.year}` : "",
142
+ doi,
143
+ url: `https://arxiv.org/abs/${clean}`,
144
+ pdf_url: item.openAccessPdf?.url || `https://arxiv.org/pdf/${clean}`,
145
+ citations: typeof item.citationCount === "number" ? item.citationCount : 0,
146
+ };
147
+ } catch {
148
+ return null;
149
+ }
150
+ }
151
+
104
152
  // ── PubMed ─────────────────────────────────────────────────────────────────
105
153
  async function searchPubMed(query, maxResults) {
106
154
  try {
@@ -214,7 +262,7 @@ async function searchSemanticScholar(query, maxResults) {
214
262
  // ── MCP Server ─────────────────────────────────────────────────────────────
215
263
  const server = new McpServer({
216
264
  name: "paperplain",
217
- version: "1.1.0",
265
+ version: "1.2.3",
218
266
  description:
219
267
  "Search 200M+ peer-reviewed papers from PubMed, ArXiv, and Semantic Scholar. Returns papers with full abstracts — use your own model to synthesize findings.",
220
268
  });
@@ -250,54 +298,83 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
250
298
  async ({ query, max_results, domain }) => {
251
299
  const resolvedDomain = domain === "auto" ? classifyDomain(query) : domain;
252
300
  let papers = [];
253
- let sources = [];
301
+ // Track each source: "ok" | "empty" | "error"
302
+ const sourceStatus = {};
303
+
304
+ async function safeArxiv(q, n) {
305
+ try {
306
+ const r = await searchArxiv(q, n);
307
+ sourceStatus.arxiv = r.length ? "ok" : "empty";
308
+ return r;
309
+ } catch { sourceStatus.arxiv = "error"; return []; }
310
+ }
311
+ async function safePubMed(q, n) {
312
+ try {
313
+ const r = await searchPubMed(q, n);
314
+ sourceStatus.pubmed = r.length ? "ok" : "empty";
315
+ return r;
316
+ } catch { sourceStatus.pubmed = "error"; return []; }
317
+ }
318
+ async function safeS2(q, n) {
319
+ try {
320
+ const r = await searchSemanticScholar(q, n);
321
+ sourceStatus.semanticscholar = r.length ? "ok" : "empty";
322
+ return r;
323
+ } catch { sourceStatus.semanticscholar = "error"; return []; }
324
+ }
254
325
 
255
326
  try {
256
327
  if (resolvedDomain === "health") {
257
- // PubMed primary, Semantic Scholar as fill
258
- let pubmedPapers = await searchPubMed(query, max_results);
259
- if (pubmedPapers.length) sources.push("pubmed");
328
+ let pubmedPapers = await safePubMed(query, max_results);
260
329
  if (pubmedPapers.length < max_results) {
261
- const s2 = await searchSemanticScholar(query, max_results - pubmedPapers.length);
262
- if (s2.length) sources.push("semanticscholar");
330
+ const s2 = await safeS2(query, max_results - pubmedPapers.length);
263
331
  const seen = new Set(pubmedPapers.map((p) => p.id));
264
332
  for (const p of s2) if (!seen.has(p.id)) pubmedPapers.push(p);
265
333
  }
266
334
  papers = pubmedPapers.slice(0, max_results);
267
335
  } else if (resolvedDomain === "cs") {
268
- // ArXiv + Semantic Scholar, deduplicate overlaps
269
336
  const [arxiv, s2] = await Promise.all([
270
- searchArxiv(query, max_results),
271
- searchSemanticScholar(query, Math.ceil(max_results / 2)),
337
+ safeArxiv(query, max_results),
338
+ safeS2(query, Math.ceil(max_results / 2)),
272
339
  ]);
273
- if (arxiv.length) sources.push("arxiv");
274
- if (s2.length) sources.push("semanticscholar");
275
340
  const maxArxiv = Math.ceil(max_results * 0.6);
276
- const arxivIds = new Set(arxiv.map((p) => p.id));
277
- const uniqueS2 = s2.filter((p) => !arxivIds.has(p.id));
341
+ // Deduplicate on URL — S2 uses arxiv.org URLs for arXiv papers, matching exactly
342
+ const arxivUrls = new Set(arxiv.map((p) => p.url));
343
+ const uniqueS2 = s2.filter((p) => !arxivUrls.has(p.url));
278
344
  papers = [
279
345
  ...arxiv.slice(0, maxArxiv),
280
346
  ...uniqueS2.slice(0, max_results - Math.min(arxiv.length, maxArxiv)),
281
347
  ].slice(0, max_results);
282
348
  } else {
283
- // General: all three sources interleaved
284
349
  const [arxiv, pubmed, s2] = await Promise.all([
285
- searchArxiv(query, max_results),
286
- searchPubMed(query, max_results),
287
- searchSemanticScholar(query, Math.ceil(max_results / 2)),
350
+ safeArxiv(query, max_results),
351
+ safePubMed(query, max_results),
352
+ safeS2(query, Math.ceil(max_results / 2)),
288
353
  ]);
289
- if (arxiv.length) sources.push("arxiv");
290
- if (pubmed.length) sources.push("pubmed");
291
- if (s2.length) sources.push("semanticscholar");
354
+ // Deduplicate S2 against both ArXiv and PubMed URLs
355
+ const seenUrls = new Set([...arxiv.map((p) => p.url), ...pubmed.map((p) => p.url)]);
356
+ const uniqueS2 = s2.filter((p) => !seenUrls.has(p.url));
292
357
  const maxEach = Math.floor(max_results / 3);
293
358
  const remainder = max_results - maxEach * 3;
294
359
  papers = [
295
360
  ...arxiv.slice(0, maxEach + remainder),
296
361
  ...pubmed.slice(0, maxEach),
297
- ...s2.slice(0, maxEach),
362
+ ...uniqueS2.slice(0, maxEach),
298
363
  ].slice(0, max_results);
299
364
  }
300
365
 
366
+ // Warn if expected sources came back empty or errored
367
+ const warnings = [];
368
+ const expectedSources = resolvedDomain === "health"
369
+ ? ["pubmed", "semanticscholar"]
370
+ : resolvedDomain === "cs"
371
+ ? ["arxiv", "semanticscholar"]
372
+ : ["arxiv", "pubmed", "semanticscholar"];
373
+ for (const src of expectedSources) {
374
+ if (sourceStatus[src] === "empty") warnings.push(`${src}: returned 0 results (API may be rate-limited or query too specific)`);
375
+ if (sourceStatus[src] === "error") warnings.push(`${src}: request failed (API may be temporarily unavailable)`);
376
+ }
377
+
301
378
  return {
302
379
  content: [
303
380
  {
@@ -306,7 +383,8 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
306
383
  {
307
384
  query,
308
385
  domain: resolvedDomain,
309
- sources_searched: sources,
386
+ source_status: sourceStatus,
387
+ ...(warnings.length ? { warnings } : {}),
310
388
  total: papers.length,
311
389
  papers: papers.map((p) => ({
312
390
  id: p.id,
@@ -335,33 +413,79 @@ Use the returned abstracts to synthesize findings, answer the user's question, o
335
413
  }
336
414
  );
337
415
 
416
+ // ── Semantic Scholar single-paper lookup (by DOI or S2 paper ID) ───────────
417
+ async function fetchS2ByDoi(doi) {
418
+ try {
419
+ const clean = doi.replace(/^doi:/i, "").trim();
420
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
421
+ const res = await fetchWithTimeout(
422
+ `${SEMANTIC_SCHOLAR_BASE}/paper/DOI:${encodeURIComponent(clean)}?fields=${fields}`
423
+ );
424
+ if (!res.ok) return null;
425
+ const item = await res.json().catch(() => null);
426
+ if (!item || !item.paperId || !item.title) return null;
427
+ const ext = item.externalIds || {};
428
+ const arxivId = ext.ArXiv || "";
429
+ let url;
430
+ if (arxivId) url = `https://arxiv.org/abs/${arxivId}`;
431
+ else if (clean) url = `https://doi.org/${clean}`;
432
+ else url = `https://www.semanticscholar.org/paper/${item.paperId}`;
433
+ return {
434
+ id: `s2:${item.paperId}`,
435
+ source: "semanticscholar",
436
+ title: (item.title || "").replace(/\s+/g, " ").trim(),
437
+ authors: Array.isArray(item.authors) ? item.authors.map((a) => a.name).filter(Boolean) : [],
438
+ abstract: (item.abstract || "").replace(/\s+/g, " ").trim(),
439
+ published: item.year ? `${item.year}` : "",
440
+ doi: clean,
441
+ url,
442
+ pdf_url: item.openAccessPdf?.url || "",
443
+ citations: typeof item.citationCount === "number" ? item.citationCount : 0,
444
+ };
445
+ } catch {
446
+ return null;
447
+ }
448
+ }
449
+
338
450
  // Tool 2: fetch_paper
339
451
  server.tool(
340
452
  "fetch_paper",
341
- `Fetch the full abstract and metadata for a specific paper by ID.
342
- Supports ArXiv IDs (e.g. '2301.07041' or 'arxiv:2301.07041') and PubMed IDs (e.g. 'pubmed:37183813' or just '37183813').
343
- Use this to get the full abstract of a paper you already know about.`,
453
+ `Fetch the full abstract and metadata for a specific paper by ID or DOI.
454
+ Supports:
455
+ - ArXiv IDs: '2301.07041', 'arxiv:2301.07041v2', 'https://arxiv.org/abs/2301.07041'
456
+ - PubMed IDs: 'pubmed:37183813' or just '37183813'
457
+ - DOIs: '10.1145/3290605.3300857' or 'doi:10.1145/3290605.3300857' (looks up via Semantic Scholar)
458
+ Use this to verify a specific paper you already know about or to retrieve its abstract.`,
344
459
  {
345
460
  paper_id: z
346
461
  .string()
347
462
  .describe(
348
- "ArXiv ID (e.g. '2301.07041') or PubMed ID (e.g. 'pubmed:37183813')"
463
+ "ArXiv ID, PubMed ID, or DOI — e.g. '2301.07041', 'pubmed:37183813', or '10.1145/3290605.3300857'"
349
464
  ),
350
465
  },
351
466
  async ({ paper_id }) => {
352
467
  try {
468
+ const trimmed = paper_id.trim();
353
469
  const isArxiv =
354
- /arxiv:/i.test(paper_id) ||
355
- /^\d{4}\.\d{4,5}$/.test(paper_id.trim()) ||
356
- /arxiv\.org/.test(paper_id);
357
- const isPubMed = /pubmed:/i.test(paper_id) || /^\d{6,9}$/.test(paper_id.trim());
470
+ /arxiv:/i.test(trimmed) ||
471
+ /^\d{4}\.\d{4,5}(v\d+)?$/.test(trimmed) || // 2301.07041 or 2301.07041v2
472
+ /^[a-z-]+(\.[A-Z]+)?\/\d{7}(v\d+)?$/.test(trimmed) || // old format: cs.LG/0504010
473
+ /arxiv\.org/.test(trimmed);
474
+ const isPubMed =
475
+ /pubmed:/i.test(trimmed) || /^\d{6,9}$/.test(trimmed);
476
+ const isDOI =
477
+ /^doi:/i.test(trimmed) || /^10\.\d{4,}\/\S+$/.test(trimmed);
358
478
 
359
479
  let paper = null;
360
480
 
361
481
  if (isArxiv) {
362
- paper = await fetchArxivById(paper_id);
482
+ paper = await fetchArxivById(trimmed);
483
+ // Fallback: ArXiv API rate-limits under parallel load — try S2 ARXIV: endpoint
484
+ if (!paper) paper = await fetchS2ByArxivId(trimmed);
485
+ } else if (isDOI) {
486
+ paper = await fetchS2ByDoi(trimmed);
363
487
  } else if (isPubMed) {
364
- const pmid = paper_id.replace(/^pubmed:/i, "").trim();
488
+ const pmid = trimmed.replace(/^pubmed:/i, "").trim();
365
489
  const abstracts = await fetchPubMedAbstracts([pmid]);
366
490
  const summaryUrl = `${PUBMED_BASE}/esummary.fcgi?db=pubmed&id=${pmid}&retmode=json&${PUBMED_PARAMS}`;
367
491
  const summaryRes = await fetch(summaryUrl);
@@ -385,7 +509,10 @@ Use this to get the full abstract of a paper you already know about.`,
385
509
 
386
510
  if (!paper) {
387
511
  return {
388
- content: [{ type: "text", text: `Paper not found: ${paper_id}` }],
512
+ content: [{
513
+ type: "text",
514
+ text: `Paper not found: ${paper_id}\n\nTip: For arXiv papers, try the bare ID (e.g. '2301.07041'). For journal papers, try the DOI (e.g. '10.1145/3290605.3300857'). For PubMed papers, use the PMID number.`,
515
+ }],
389
516
  isError: true,
390
517
  };
391
518
  }
@@ -402,5 +529,88 @@ Use this to get the full abstract of a paper you already know about.`,
402
529
  }
403
530
  );
404
531
 
532
+ // Tool 3: find_paper_by_title
533
+ server.tool(
534
+ "find_paper_by_title",
535
+ `Find a specific paper when you only know its title (or partial title).
536
+ Uses Semantic Scholar's title-match search. Returns the closest match with full abstract, authors, DOI, and source URL.
537
+ Useful for verifying a citation or retrieving abstract details for a paper you already know exists.`,
538
+ {
539
+ title: z
540
+ .string()
541
+ .describe("The paper title or a key phrase from it, e.g. 'Attention Is All You Need'"),
542
+ year: z
543
+ .number()
544
+ .optional()
545
+ .describe("Publication year to narrow down the match (optional)"),
546
+ },
547
+ async ({ title, year }) => {
548
+ try {
549
+ const fields = "title,abstract,authors,year,citationCount,openAccessPdf,externalIds";
550
+ const url = `${SEMANTIC_SCHOLAR_BASE}/paper/search?query=${encodeURIComponent(title)}&limit=5&fields=${fields}`;
551
+ const res = await fetchWithTimeout(url);
552
+ if (!res.ok) {
553
+ return {
554
+ content: [{ type: "text", text: `Search failed: Semantic Scholar returned ${res.status}` }],
555
+ isError: true,
556
+ };
557
+ }
558
+ const data = await res.json().catch(() => null);
559
+ if (!data?.data?.length) {
560
+ return {
561
+ content: [{ type: "text", text: `No papers found matching: "${title}"` }],
562
+ isError: true,
563
+ };
564
+ }
565
+
566
+ // Pick best match: prefer year match if provided, otherwise take top result
567
+ let candidates = data.data.filter((p) => p.title && p.abstract);
568
+ if (!candidates.length) candidates = data.data.filter((p) => p.title);
569
+ if (!candidates.length) {
570
+ return {
571
+ content: [{ type: "text", text: `No papers found matching: "${title}"` }],
572
+ isError: true,
573
+ };
574
+ }
575
+
576
+ let best = candidates[0];
577
+ if (year) {
578
+ const yearMatch = candidates.find((p) => p.year === year);
579
+ if (yearMatch) best = yearMatch;
580
+ }
581
+
582
+ const ext = best.externalIds || {};
583
+ const doi = ext.DOI || "";
584
+ const arxivId = ext.ArXiv || "";
585
+ let paperUrl;
586
+ if (arxivId) paperUrl = `https://arxiv.org/abs/${arxivId}`;
587
+ else if (doi) paperUrl = `https://doi.org/${doi}`;
588
+ else paperUrl = `https://www.semanticscholar.org/paper/${best.paperId}`;
589
+
590
+ const paper = {
591
+ id: arxivId ? `arxiv:${arxivId}` : `s2:${best.paperId}`,
592
+ source: arxivId ? "arxiv" : "semanticscholar",
593
+ title: (best.title || "").replace(/\s+/g, " ").trim(),
594
+ authors: Array.isArray(best.authors) ? best.authors.map((a) => a.name).filter(Boolean) : [],
595
+ abstract: (best.abstract || "").replace(/\s+/g, " ").trim(),
596
+ published: best.year ? `${best.year}` : "",
597
+ doi,
598
+ url: paperUrl,
599
+ pdf_url: best.openAccessPdf?.url || "",
600
+ citations: typeof best.citationCount === "number" ? best.citationCount : 0,
601
+ };
602
+
603
+ return {
604
+ content: [{ type: "text", text: JSON.stringify(paper, null, 2) }],
605
+ };
606
+ } catch (err) {
607
+ return {
608
+ content: [{ type: "text", text: `find_paper_by_title failed: ${err.message}` }],
609
+ isError: true,
610
+ };
611
+ }
612
+ }
613
+ );
614
+
405
615
  const transport = new StdioServerTransport();
406
616
  await server.connect(transport);