scholar-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,232 @@
1
+ # ScholarMCP
2
+
3
+ ScholarMCP is an MCP server for literature research workflows in coding agents.
4
+
5
+ It gives your agent tools to:
6
+ - search papers across multiple sources
7
+ - ingest and parse full-text PDFs
8
+ - extract structured paper details
9
+ - suggest citations and build references
10
+ - validate manuscript citations
11
+
12
+ ## Who this is for
13
+
14
+ Use this if you want Claude Code, Codex, or any MCP-compatible coding agent to run research tasks directly from chat.
15
+
16
+ ## What you get
17
+
18
+ - Transports: `stdio` (recommended) and HTTP (`/mcp`)
19
+ - Research providers: Google Scholar, OpenAlex, Crossref, Semantic Scholar
20
+ - Full-text parsing pipeline: `grobid -> sidecar -> simple`
21
+ - Tooling for thesis/paper workflows: ingestion, extraction, references, validation
22
+
23
+ ## Quick Start
24
+
25
+ ### 1. Prerequisites
26
+
27
+ - Node.js `>=20`
28
+ - `npm` (for install/publish)
29
+ - `pnpm` (for contributors working from source)
30
+
31
+ ### 2. Install as an npm package (recommended)
32
+
33
+ ```bash
34
+ npm install -g scholar-mcp
35
+ ```
36
+
37
+ One-off run without global install:
38
+
39
+ ```bash
40
+ npx -y scholar-mcp --transport=stdio
41
+ ```
42
+
43
+ ### 3. Run
44
+
45
+ Stdio mode:
46
+
47
+ ```bash
48
+ scholar-mcp --transport=stdio
49
+ ```
50
+
51
+ HTTP mode:
52
+
53
+ ```bash
54
+ scholar-mcp --transport=http
55
+ ```
56
+
57
+ Health check (HTTP mode):
58
+
59
+ ```bash
60
+ curl http://127.0.0.1:3000/health
61
+ ```
62
+
63
+ ### 4. Run from source (contributors)
64
+
65
+ ```bash
66
+ pnpm install
67
+ pnpm dev:stdio
68
+ ```
69
+
70
+ ## Use with Coding Agents
71
+
72
+ ### Claude Code (recommended)
73
+
74
+ Register from globally installed binary:
75
+
76
+ ```bash
77
+ claude mcp add -s user \
78
+ -e SCHOLAR_MCP_TRANSPORT=stdio \
79
+ -e SCHOLAR_REQUEST_DELAY_MS=350 \
80
+ -e RESEARCH_ALLOW_REMOTE_PDFS=true \
81
+ -e RESEARCH_ALLOW_LOCAL_PDFS=true \
82
+ -- scholar_mcp scholar-mcp --transport=stdio
83
+ ```
84
+
85
+ Register without global install:
86
+
87
+ ```bash
88
+ claude mcp add -s user \
89
+ -e SCHOLAR_MCP_TRANSPORT=stdio \
90
+ -e SCHOLAR_REQUEST_DELAY_MS=350 \
91
+ -e RESEARCH_ALLOW_REMOTE_PDFS=true \
92
+ -e RESEARCH_ALLOW_LOCAL_PDFS=true \
93
+ -- scholar_mcp npx -y scholar-mcp --transport=stdio
94
+ ```
95
+
96
+ Check status:
97
+
98
+ ```bash
99
+ claude mcp get scholar_mcp
100
+ ```
101
+
102
+ Notes:
103
+ - Keep the `--` before `scholar_mcp` (required by current Claude CLI parsing for multiple `-e` entries).
104
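+ - If you need to replace config: `claude mcp remove scholar_mcp -s user` (use the same scope you registered with; the commands above use `-s user`), then run `claude mcp add` again.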
105
+
106
+ ### OpenAI Codex App
107
+
108
+ Add to `~/.codex/config.toml`:
109
+
110
+ ```toml
111
+ [mcp_servers.scholar_mcp]
112
+ command = "npx"
113
+ args = ["-y", "scholar-mcp", "--transport=stdio"]
114
+
115
+ [mcp_servers.scholar_mcp.env]
116
+ SCHOLAR_MCP_TRANSPORT = "stdio"
117
+ SCHOLAR_REQUEST_DELAY_MS = "350"
118
+ RESEARCH_ALLOW_REMOTE_PDFS = "true"
119
+ RESEARCH_ALLOW_LOCAL_PDFS = "true"
120
+ ```
121
+
122
+ ### Generic MCP clients
123
+
124
+ - `stdio` command:
125
+ - `scholar-mcp --transport=stdio`
126
+ - Or: `npx -y scholar-mcp --transport=stdio`
127
+ - HTTP endpoint:
128
+ 1. Start server with `SCHOLAR_MCP_TRANSPORT=http scholar-mcp`
129
+ 2. Connect client to `http://127.0.0.1:3000/mcp`
130
+ 3. Optional auth: set `SCHOLAR_MCP_API_KEY` and send bearer auth header from your client
131
+
132
+ ## MCP Tools
133
+
134
+ | Tool | Purpose |
135
+ |---|---|
136
+ | `search_literature_graph` | Federated search over OpenAlex/Crossref/Semantic Scholar (+ optional scholar scrape). |
137
+ | `search_google_scholar_key_words` | Keyword search on Google Scholar. |
138
+ | `search_google_scholar_advanced` | Scholar search with author/year/phrase filters. |
139
+ | `get_author_info` | Resolve author profile and top publications. |
140
+ | `ingest_paper_fulltext` | Start async full-text ingestion from DOI/URL/PDF/local path. |
141
+ | `get_ingestion_status` | Poll ingestion job status and parsed summary. |
142
+ | `extract_granular_paper_details` | Extract methods, claims, datasets, metrics, and references. |
143
+ | `suggest_contextual_citations` | Suggest citations from manuscript context. |
144
+ | `build_reference_list` | Generate formatted bibliography and BibTeX. |
145
+ | `validate_manuscript_citations` | Detect missing/uncited/duplicate citation issues. |
146
+
147
+ ## Example Agent Prompts
148
+
149
+ - "Find 10 recent papers on retrieval-augmented generation and summarize methods and datasets."
150
+ - "Ingest full text for DOI `10.1038/s41467-024-55563-6`, then extract claims and limitations."
151
+ - "Given this draft section, suggest citations in IEEE style and generate BibTeX."
152
+ - "Validate my manuscript citations against this reference list and show missing citations."
153
+
154
+ ## Optional Python Sidecar (better parsing fallback)
155
+
156
+ Run sidecar:
157
+
158
+ ```bash
159
+ cd python-sidecar
160
+ python -m venv .venv
161
+ source .venv/bin/activate
162
+ pip install -r requirements.txt
163
+ uvicorn app:app --host 127.0.0.1 --port 8090
164
+ ```
165
+
166
+ Then set:
167
+
168
+ ```bash
169
+ RESEARCH_PYTHON_SIDECAR_URL=http://127.0.0.1:8090
170
+ ```
171
+
172
+ ## Configuration
173
+
174
+ Most users only need these:
175
+
176
+ - `SCHOLAR_MCP_TRANSPORT`: `stdio` | `http` | `both` (default: `stdio`)
177
+ - `SCHOLAR_REQUEST_DELAY_MS`: request pacing to reduce rate-limit risk (default: `250`)
178
+ - `RESEARCH_ALLOW_REMOTE_PDFS`: allow remote PDF downloads for ingestion (default: `true`)
179
+ - `RESEARCH_ALLOW_LOCAL_PDFS`: allow local PDF ingestion (default: `true`)
180
+ - `SCHOLAR_MCP_API_KEY`: optional bearer token for HTTP mode
181
+ - `RESEARCH_GROBID_URL`: optional GROBID endpoint
182
+ - `RESEARCH_PYTHON_SIDECAR_URL`: optional sidecar endpoint
183
+
184
+ The CLI loads `.env` from the current working directory automatically at startup.
185
+
186
+ Advanced options are defined in `src/config.ts` (published as `dist/config.js`) and cover timeouts, retries, HTTP session capacity/TTL, provider tuning, and cache behavior.
187
+
188
+ ## Troubleshooting
189
+
190
+ - `Invalid environment variable format` in `claude mcp add`:
191
+ - Add `--` before the MCP server name (see Claude setup command above).
192
+ - `Unable to resolve a downloadable PDF URL from input` on DOI ingestion:
193
+ - The DOI landing page may not expose a downloadable PDF.
194
+ - Retry with `pdf_url` (direct PDF) or `local_pdf_path`.
195
+ - Too many Scholar failures or throttling:
196
+ - Increase `SCHOLAR_REQUEST_DELAY_MS` (for example `500` to `1000`).
197
+
198
+ ## Dev Verification
199
+
200
+ ```bash
201
+ pnpm check
202
+ pnpm test
203
+ ```
204
+
205
+ ## Publish Workflow
206
+
207
+ ```bash
208
+ # 1) update version
209
+ npm version patch
210
+
211
+ # 2) verify source quality
212
+ pnpm check
213
+ pnpm test
214
+
215
+ # 3) verify npm package contents and executable bin
216
+ npm run pack:dry-run
217
+ npm pack
218
+
219
+ # 4) publish to npm
220
+ npm publish
221
+ ```
222
+
223
+ Post-publish smoke test:
224
+
225
+ ```bash
226
+ npx -y scholar-mcp --version
227
+ npx -y scholar-mcp --help
228
+ ```
229
+
230
+ ## Usage Notes
231
+
232
+ Google Scholar may throttle automated traffic. Use conservative request pacing, respect provider terms, and avoid abusive query patterns.
@@ -0,0 +1,57 @@
1
/** Transport modes accepted by the `--transport` CLI flag. */
const TRANSPORT_OPTIONS = ['stdio', 'http', 'both'];
const TRANSPORT_SET = new Set(TRANSPORT_OPTIONS);

/** Returns true when `value` is exactly one of the supported transport modes. */
const isTransportMode = (value) => TRANSPORT_SET.has(value);

/**
 * Normalizes (trim + lower-case) and validates a transport mode.
 *
 * @param {string} value - Raw transport text, e.g. " HTTP ".
 * @returns {string} One of the entries of TRANSPORT_OPTIONS.
 * @throws {Error} When the value is not a supported transport mode.
 */
const parseTransport = (value) => {
    // A non-string can never match; let it reach the descriptive error below
    // instead of crashing with a TypeError from `.trim()`.
    const normalized = typeof value === 'string' ? value.trim().toLowerCase() : value;
    if (!isTransportMode(normalized)) {
        throw new Error(`Invalid transport "${value}". Expected one of: ${TRANSPORT_OPTIONS.join(', ')}.`);
    }
    return normalized;
};
11
+ export const CLI_USAGE = `ScholarMCP MCP server
12
+
13
+ Usage:
14
+ scholar-mcp [--transport <stdio|http|both>]
15
+ scholar-mcp --help
16
+ scholar-mcp --version
17
+
18
+ Options:
19
+ --transport <mode> Override SCHOLAR_MCP_TRANSPORT for this run
20
+ -h, --help Show help
21
+ -v, --version Print package version`;
22
/**
 * Parses the command-line arguments understood by the server binary.
 *
 * Recognized arguments: `-h`/`--help`, `-v`/`--version`, `--transport <mode>`
 * and `--transport=<mode>`. Blank entries are ignored.
 *
 * @param {string[]} argv - Argument strings to parse.
 * @returns {{showHelp: boolean, showVersion: boolean, transport?: string}}
 *   Parsed flags; `transport` is only present when the flag was given.
 * @throws {Error} On an unknown argument, a missing `--transport` value, or
 *   an unsupported transport mode.
 */
export const parseCliArgs = (argv) => {
    const args = {
        showHelp: false,
        showVersion: false
    };
    const missingTransportValue = () => new Error('Missing value after --transport.');
    for (let index = 0; index < argv.length; index += 1) {
        const arg = argv[index]?.trim();
        if (!arg) {
            continue;
        }
        if (arg === '-h' || arg === '--help') {
            args.showHelp = true;
            continue;
        }
        if (arg === '-v' || arg === '--version') {
            args.showVersion = true;
            continue;
        }
        if (arg === '--transport') {
            const nextValue = argv[index + 1];
            const candidate = nextValue?.trim();
            // A following flag (e.g. `--transport --help`) is not a value:
            // report it as missing rather than as an invalid transport.
            if (!candidate || candidate.startsWith('-')) {
                throw missingTransportValue();
            }
            args.transport = parseTransport(nextValue);
            index += 1;
            continue;
        }
        if (arg.startsWith('--transport=')) {
            const value = arg.slice('--transport='.length);
            // `--transport=` with nothing after the equals sign is a missing value too.
            if (value.trim() === '') {
                throw missingTransportValue();
            }
            args.transport = parseTransport(value);
            continue;
        }
        throw new Error(`Unknown argument "${arg}".`);
    }
    return args;
};
package/dist/config.js ADDED
@@ -0,0 +1,131 @@
1
+ import { z } from 'zod';
2
+ import { getPackageVersion } from './version.js';
3
/**
 * Treats a blank (empty or whitespace-only) string as "not set".
 * Without this, `z.coerce.number()` would turn `KEY=` into 0, because
 * `Number('')` is 0, and the schema default would never apply.
 */
const blankEnvToUndefined = (value) => (typeof value === 'string' && value.trim() === '' ? undefined : value);

/**
 * Builds a schema for an integer setting read from the environment.
 *
 * @param {number} defaultValue - Value used when the variable is unset or blank.
 * @param {number} min - Smallest accepted value (inclusive).
 * @param {number} max - Largest accepted value (inclusive).
 */
const numberFromEnv = (defaultValue, min, max) => z.preprocess(blankEnvToUndefined, z.coerce.number().int().min(min).max(max).default(defaultValue));

/**
 * Builds a schema for a floating-point setting read from the environment.
 *
 * @param {number} defaultValue - Value used when the variable is unset or blank.
 * @param {number} min - Smallest accepted value (inclusive).
 * @param {number} max - Largest accepted value (inclusive).
 */
const floatFromEnv = (defaultValue, min, max) => z.preprocess(blankEnvToUndefined, z.coerce.number().min(min).max(max).default(defaultValue));
5
/**
 * Builds a schema for a boolean setting read from the environment.
 *
 * Accepted spellings (case-insensitive, surrounding whitespace ignored):
 * `1`/`true`/`yes`/`on` and `0`/`false`/`no`/`off`. Numbers map to
 * `value !== 0`. A blank string counts as "not set" so the default applies.
 * Any other input is passed through unchanged and rejected by `z.boolean()`.
 *
 * @param {boolean} defaultValue - Value used when the variable is unset or blank.
 */
const booleanFromEnv = (defaultValue) => z.preprocess((value) => {
    if (typeof value === 'number') {
        return value !== 0;
    }
    if (typeof value !== 'string') {
        // Booleans, undefined and anything unexpected go straight to validation.
        return value;
    }
    const normalized = value.trim().toLowerCase();
    if (normalized === '') {
        // `KEY=` in a .env file means "unset", not an invalid boolean.
        return undefined;
    }
    if (['1', 'true', 'yes', 'on'].includes(normalized)) {
        return true;
    }
    if (['0', 'false', 'no', 'off'].includes(normalized)) {
        return false;
    }
    return value;
}, z.boolean().default(defaultValue));
23
// Default for SCHOLAR_MCP_SERVER_VERSION when the variable is not set.
const defaultServerVersion = getPackageVersion();

/**
 * Optional URL setting. A blank string (`KEY=`) is treated as unset instead
 * of failing URL validation.
 */
const optionalUrlFromEnv = () => z.preprocess((value) => (typeof value === 'string' && value.trim() === '' ? undefined : value), z.string().url().optional());

/**
 * Transport mode setting. Trimmed and lower-cased before validation so the
 * environment variable accepts the same spellings as the `--transport` CLI
 * flag; a blank string falls back to the default.
 */
const transportFromEnv = () => z.preprocess((value) => {
    if (typeof value !== 'string') {
        return value;
    }
    const normalized = value.trim().toLowerCase();
    return normalized === '' ? undefined : normalized;
}, z.enum(['stdio', 'http', 'both']).default('stdio'));

/**
 * Schema for every environment variable the server reads. Each entry carries
 * its own default, so an empty environment yields a complete configuration.
 */
const envSchema = z.object({
    // Process and logging
    NODE_ENV: z.enum(['development', 'test', 'production']).default('development'),
    LOG_LEVEL: z.enum(['debug', 'info', 'warn', 'error']).default('info'),

    // MCP server identity and transport
    SCHOLAR_MCP_SERVER_NAME: z.string().default('scholar-mcp'),
    SCHOLAR_MCP_SERVER_VERSION: z.string().default(defaultServerVersion),
    SCHOLAR_MCP_TRANSPORT: transportFromEnv(),
    SCHOLAR_MCP_HOST: z.string().default('127.0.0.1'),
    SCHOLAR_MCP_PORT: numberFromEnv(3000, 1, 65535),
    SCHOLAR_MCP_ENDPOINT_PATH: z.string().default('/mcp'),
    SCHOLAR_MCP_HEALTH_PATH: z.string().default('/health'),
    SCHOLAR_MCP_HTTP_SESSION_MODE: z.enum(['stateful', 'stateless']).default('stateful'),
    SCHOLAR_MCP_HTTP_SESSION_TTL_MS: numberFromEnv(30 * 60 * 1000, 10_000, 24 * 60 * 60 * 1000),
    SCHOLAR_MCP_HTTP_MAX_SESSIONS: numberFromEnv(200, 1, 5000),
    // Comma-separated lists; split later by parseConfig.
    SCHOLAR_MCP_ALLOWED_ORIGINS: z.string().optional(),
    SCHOLAR_MCP_ALLOWED_HOSTS: z.string().optional(),
    SCHOLAR_MCP_API_KEY: z.string().optional(),

    // Google Scholar access
    SCHOLAR_BASE_URL: z.string().url().default('https://scholar.google.com'),
    SCHOLAR_LANGUAGE: z.string().default('en'),
    SCHOLAR_TIMEOUT_MS: numberFromEnv(15000, 1000, 120000),
    SCHOLAR_RETRY_ATTEMPTS: numberFromEnv(2, 0, 5),
    SCHOLAR_RETRY_DELAY_MS: numberFromEnv(800, 0, 30000),
    SCHOLAR_REQUEST_DELAY_MS: numberFromEnv(250, 0, 10000),
    SCHOLAR_MAX_RESULTS_PER_REQUEST: numberFromEnv(20, 1, 20),

    // Research metadata providers
    RESEARCH_OPENALEX_BASE_URL: z.string().url().default('https://api.openalex.org'),
    RESEARCH_OPENALEX_API_KEY: z.string().optional(),
    RESEARCH_CROSSREF_BASE_URL: z.string().url().default('https://api.crossref.org'),
    RESEARCH_SEMANTIC_SCHOLAR_BASE_URL: z.string().url().default('https://api.semanticscholar.org/graph/v1'),
    RESEARCH_SEMANTIC_SCHOLAR_API_KEY: z.string().optional(),
    RESEARCH_TIMEOUT_MS: numberFromEnv(20000, 1000, 120000),
    RESEARCH_RETRY_ATTEMPTS: numberFromEnv(2, 0, 5),
    RESEARCH_RETRY_DELAY_MS: numberFromEnv(800, 0, 30000),
    RESEARCH_REQUEST_DELAY_MS: numberFromEnv(100, 0, 10000),

    // Full-text ingestion
    RESEARCH_ALLOW_REMOTE_PDFS: booleanFromEnv(true),
    RESEARCH_ALLOW_LOCAL_PDFS: booleanFromEnv(true),
    RESEARCH_GROBID_URL: optionalUrlFromEnv(),
    RESEARCH_PYTHON_SIDECAR_URL: optionalUrlFromEnv(),

    // Semantic engine
    RESEARCH_SEMANTIC_ENGINE: z.enum(['cloud-llm', 'none']).default('cloud-llm'),
    RESEARCH_CLOUD_MODEL: z.string().default('gpt-4.1-mini'),

    // Literature graph search
    RESEARCH_GRAPH_CACHE_TTL_MS: numberFromEnv(5 * 60 * 1000, 0, 24 * 60 * 60 * 1000),
    RESEARCH_GRAPH_MAX_CACHE_ENTRIES: numberFromEnv(300, 1, 5000),
    RESEARCH_GRAPH_PROVIDER_RESULT_MULTIPLIER: numberFromEnv(2, 1, 5),
    RESEARCH_GRAPH_FUZZY_TITLE_THRESHOLD: floatFromEnv(0.84, 0.6, 0.99)
});
67
/**
 * Splits a comma-separated string into trimmed, non-empty entries.
 *
 * @param {string | undefined} value - Raw comma-separated text.
 * @returns {string[]} Entries in their original order; empty when `value` is falsy.
 */
const splitCsv = (value) => {
    const entries = [];
    if (!value) {
        return entries;
    }
    for (const piece of value.split(',')) {
        const trimmed = piece.trim();
        if (trimmed.length > 0) {
            entries.push(trimmed);
        }
    }
    return entries;
};
76
/**
 * Normalizes a URL path setting: trims surrounding whitespace, guarantees a
 * leading slash and removes every trailing slash (the root path stays `/`).
 *
 * @param {string} value - Raw path, e.g. "mcp", "/mcp/" or "/mcp//".
 * @returns {string} Path that starts with `/` and has no trailing slash unless it is `/`.
 */
const normalizePath = (value) => {
    const trimmed = value.trim();
    const withPrefix = trimmed.startsWith('/') ? trimmed : `/${trimmed}`;
    // Walk back over all trailing slashes but always keep the leading one,
    // so "/mcp//" becomes "/mcp" and "///" collapses to "/".
    let end = withPrefix.length;
    while (end > 1 && withPrefix[end - 1] === '/') {
        end -= 1;
    }
    return withPrefix.slice(0, end);
};
82
/**
 * Builds the runtime configuration from `process.env` plus optional overrides.
 *
 * Overrides take precedence over `process.env`. An override whose value is
 * `undefined` is ignored, so passing an option that was not supplied (for
 * example an absent CLI flag) does not erase the real environment variable.
 *
 * @param {Record<string, unknown>} [overrides] - Values keyed by
 *   environment-variable name.
 * @returns {object} Validated configuration with camelCase keys; endpoint
 *   paths are normalized and the allowed origin/host lists are split on commas.
 * @throws Whatever `envSchema.parse` throws when a value fails validation.
 */
export const parseConfig = (overrides) => {
    const definedOverrides = Object.fromEntries(Object.entries(overrides ?? {}).filter(([, value]) => value !== undefined));
    const env = envSchema.parse({
        ...process.env,
        ...definedOverrides
    });
    return {
        nodeEnv: env.NODE_ENV,
        logLevel: env.LOG_LEVEL,
        serverName: env.SCHOLAR_MCP_SERVER_NAME,
        serverVersion: env.SCHOLAR_MCP_SERVER_VERSION,
        transport: env.SCHOLAR_MCP_TRANSPORT,
        host: env.SCHOLAR_MCP_HOST,
        port: env.SCHOLAR_MCP_PORT,
        endpointPath: normalizePath(env.SCHOLAR_MCP_ENDPOINT_PATH),
        healthPath: normalizePath(env.SCHOLAR_MCP_HEALTH_PATH),
        httpSessionMode: env.SCHOLAR_MCP_HTTP_SESSION_MODE,
        httpSessionTtlMs: env.SCHOLAR_MCP_HTTP_SESSION_TTL_MS,
        httpMaxSessions: env.SCHOLAR_MCP_HTTP_MAX_SESSIONS,
        allowedOrigins: splitCsv(env.SCHOLAR_MCP_ALLOWED_ORIGINS),
        // Host names are lower-cased here; origins are kept as written.
        allowedHosts: splitCsv(env.SCHOLAR_MCP_ALLOWED_HOSTS).map((host) => host.toLowerCase()),
        apiKey: env.SCHOLAR_MCP_API_KEY,
        scholarBaseUrl: env.SCHOLAR_BASE_URL,
        scholarLanguage: env.SCHOLAR_LANGUAGE,
        scholarTimeoutMs: env.SCHOLAR_TIMEOUT_MS,
        scholarRetryAttempts: env.SCHOLAR_RETRY_ATTEMPTS,
        scholarRetryDelayMs: env.SCHOLAR_RETRY_DELAY_MS,
        scholarRequestDelayMs: env.SCHOLAR_REQUEST_DELAY_MS,
        scholarMaxResultsPerRequest: env.SCHOLAR_MAX_RESULTS_PER_REQUEST,
        researchOpenAlexBaseUrl: env.RESEARCH_OPENALEX_BASE_URL,
        researchOpenAlexApiKey: env.RESEARCH_OPENALEX_API_KEY,
        researchCrossrefBaseUrl: env.RESEARCH_CROSSREF_BASE_URL,
        researchSemanticScholarBaseUrl: env.RESEARCH_SEMANTIC_SCHOLAR_BASE_URL,
        researchSemanticScholarApiKey: env.RESEARCH_SEMANTIC_SCHOLAR_API_KEY,
        researchTimeoutMs: env.RESEARCH_TIMEOUT_MS,
        researchRetryAttempts: env.RESEARCH_RETRY_ATTEMPTS,
        researchRetryDelayMs: env.RESEARCH_RETRY_DELAY_MS,
        researchRequestDelayMs: env.RESEARCH_REQUEST_DELAY_MS,
        researchAllowRemotePdfs: env.RESEARCH_ALLOW_REMOTE_PDFS,
        researchAllowLocalPdfs: env.RESEARCH_ALLOW_LOCAL_PDFS,
        researchGrobidUrl: env.RESEARCH_GROBID_URL,
        researchPythonSidecarUrl: env.RESEARCH_PYTHON_SIDECAR_URL,
        researchSemanticEngine: env.RESEARCH_SEMANTIC_ENGINE,
        researchCloudModel: env.RESEARCH_CLOUD_MODEL,
        researchGraphCacheTtlMs: env.RESEARCH_GRAPH_CACHE_TTL_MS,
        researchGraphMaxCacheEntries: env.RESEARCH_GRAPH_MAX_CACHE_ENTRIES,
        researchGraphProviderResultMultiplier: env.RESEARCH_GRAPH_PROVIDER_RESULT_MULTIPLIER,
        researchGraphFuzzyTitleThreshold: env.RESEARCH_GRAPH_FUZZY_TITLE_THRESHOLD
    };
};
@@ -0,0 +1,36 @@
1
// Numeric severity per level; a record is written only when its level is at
// least as severe as the logger's minimum level.
const PRIORITY = {
    debug: 10,
    info: 20,
    warn: 30,
    error: 40
};

/**
 * JSON.stringify replacer for log records.
 *
 * Error instances have no enumerable `message`/`stack`, so by default they
 * serialize to `{}`; expand them explicitly. BigInt values, which
 * JSON.stringify rejects, are written as decimal strings.
 */
const serializeLogValue = (_key, value) => {
    if (value instanceof Error) {
        return {
            ...value,
            name: value.name,
            message: value.message,
            ...(value.stack ? { stack: value.stack } : {}),
            ...(value.cause !== undefined ? { cause: value.cause } : {})
        };
    }
    if (typeof value === 'bigint') {
        return value.toString();
    }
    return value;
};

/**
 * Minimal structured logger that writes one JSON object per line to stderr.
 *
 * Each record has `ts` (ISO timestamp), `level`, `message` and, when a truthy
 * context is supplied, `context`.
 */
export class Logger {
    minLevel;

    /**
     * @param {'debug' | 'info' | 'warn' | 'error'} minLevel - Least severe level that is still written.
     */
    constructor(minLevel) {
        this.minLevel = minLevel;
    }

    debug(message, context) {
        this.log('debug', message, context);
    }

    info(message, context) {
        this.log('info', message, context);
    }

    warn(message, context) {
        this.log('warn', message, context);
    }

    error(message, context) {
        this.log('error', message, context);
    }

    /**
     * Writes a record when `level` passes the minimum-level filter.
     * Never throws because of an unserializable context: the record is still
     * written, with the context replaced by a `serializationError` note.
     */
    log(level, message, context) {
        if (PRIORITY[level] < PRIORITY[this.minLevel]) {
            return;
        }
        const base = {
            ts: new Date().toISOString(),
            level,
            message
        };
        let line;
        try {
            line = JSON.stringify(context ? { ...base, context } : base, serializeLogValue);
        }
        catch (error) {
            // Typically a circular reference inside `context`; keep the log line.
            line = JSON.stringify({
                ...base,
                message: typeof message === 'string' ? message : String(message),
                context: { serializationError: String(error?.message ?? error) }
            });
        }
        process.stderr.write(`${line}\n`);
    }
}