search-paper 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ali Soroush
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: search-paper
3
+ Version: 0.1.1
4
+ Summary: CLI for academic paper search across 9 APIs (OpenAlex, Semantic Scholar, PubMed, arXiv, medRxiv, CrossRef, Google Scholar, ORCID, Unpaywall)
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.14
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: defusedxml>=0.7.1
10
+ Requires-Dist: httpx>=0.28.1
11
+ Requires-Dist: requests>=2.34.2
12
+ Requires-Dist: scholarly>=1.7.11
13
+ Dynamic: license-file
14
+
15
+ # search-paper
16
+
17
+ Single-command CLI for academic paper search across 9 APIs. 25 tools, no API keys.
18
+
19
+ Based on [`alisoroushmd/academic-research-mcp`](https://github.com/alisoroushmd/academic-research-mcp) (MIT).
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ uv tool install search-paper
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ```shell
30
+ search-paper -t smart_search -a '{"query":"deep learning","num_results":5}'
31
+ search-paper -t find_paper -a '{"identifier":"10.1038/nature14539"}'
32
+ search-paper -t search_papers -a '{"query":"cancer","source":"pubmed","num_results":3}'
33
+ search-paper -L # list all 25 tools
34
+ ```
35
+
36
+ ## License
37
+
38
+ MIT
39
+
40
+ ## Credits
41
+
42
+ All API client modules, orchestrator, cache, and review manager are from [`alisoroushmd/academic-research-mcp`](https://github.com/alisoroushmd/academic-research-mcp) (MIT), repackaged as a standalone CLI.
@@ -0,0 +1,28 @@
1
+ # search-paper
2
+
3
+ Single-command CLI for academic paper search across 9 APIs. 25 tools, no API keys.
4
+
5
+ Based on [`alisoroushmd/academic-research-mcp`](https://github.com/alisoroushmd/academic-research-mcp) (MIT).
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ uv tool install search-paper
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```shell
16
+ search-paper -t smart_search -a '{"query":"deep learning","num_results":5}'
17
+ search-paper -t find_paper -a '{"identifier":"10.1038/nature14539"}'
18
+ search-paper -t search_papers -a '{"query":"cancer","source":"pubmed","num_results":3}'
19
+ search-paper -L # list all 25 tools
20
+ ```
21
+
22
+ ## License
23
+
24
+ MIT
25
+
26
+ ## Credits
27
+
28
+ All API client modules, orchestrator, cache, and review manager are from [`alisoroushmd/academic-research-mcp`](https://github.com/alisoroushmd/academic-research-mcp) (MIT), repackaged as a standalone CLI.
@@ -0,0 +1,267 @@
1
+ """
2
+ arXiv API client.
3
+
4
+ Uses the arXiv API (https://info.arxiv.org/help/api/index.html).
5
+ Free, no authentication required. Returns structured metadata including
6
+ abstracts, authors, categories, PDF links, and version history.
7
+
8
+ Rate limit: 1 request per 3 seconds (enforced by the client).
9
+ """
10
+
11
+ import re
12
+ import threading
13
+ import time
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ import defusedxml.ElementTree as ET
17
+
18
+ import cache
19
+ import http_client
20
+
21
+ ARXIV_API_BASE = "https://export.arxiv.org/api/query"
22
+ ATOM_NS = "{http://www.w3.org/2005/Atom}"
23
+ OPENSEARCH_NS = "{http://a9.com/-/spec/opensearch/1.1/}"
24
+ ARXIV_NS = "{http://arxiv.org/schemas/atom}"
25
+
26
+ # Throttle to respect arXiv rate limits (thread-safe)
27
+ _last_request_time = 0
28
+ _throttle_lock = threading.Lock()
29
+
30
+
31
def _throttle():
    """
    Block until at least 3 seconds have passed since the previous request.

    The wait is performed while holding the module-level throttle lock, so
    concurrent callers are released one after another rather than together.
    The timestamp of the release is stored in ``_last_request_time``.
    """
    global _last_request_time
    with _throttle_lock:
        # Use the monotonic clock: a wall-clock adjustment (NTP step, manual
        # change) must not produce a negative elapsed time and with it an
        # arbitrarily long sleep.
        elapsed = time.monotonic() - _last_request_time
        remaining = 3.0 - elapsed
        if remaining > 0:
            # Never wait longer than one full interval, whatever value the
            # stored timestamp happens to hold.
            time.sleep(min(remaining, 3.0))
        _last_request_time = time.monotonic()
39
+
40
+
41
@cache.cached(category="search", ttl=cache.SEARCH_TTL)
def search_arxiv(
    query: str,
    num_results: int = 10,
    sort_by: str = "relevance",
    category: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Query the arXiv API and return the matching papers.

    Parameters:
        query (str): Search expression in arXiv syntax, for example:
            - plain keywords: "computational pathology"
            - author: "au:Campanella"
            - title: "ti:foundation model endoscopy"
            - abstract: "abs:gastric intestinal metaplasia"
            - category: "cat:cs.CV"
            - boolean combinations using AND, OR, ANDNOT
        num_results (int): Upper bound on returned papers (default 10;
            values above 100 are reduced to 100).
        sort_by (str): "relevance", "lastUpdatedDate", or "submittedDate".
            Any other value falls back to "relevance".
        category (str): Optional arXiv category such as "cs.CV", "cs.AI",
            "eess.IV" or "q-bio.QM"; when given it is AND-ed onto the query.

    Returns:
        List of paper dicts as produced by ``_parse_feed``.
    """
    # Narrow the expression to one category when requested.
    search_query = f"({query}) AND cat:{category}" if category else query

    # A multi-word query that carries none of these markers is sent as an
    # all-fields search. The check is a plain, case-sensitive substring test
    # on the caller's query, so any word containing "AND" or "OR" counts too.
    markers = ("au:", "ti:", "abs:", "cat:", "AND", "OR")
    is_plain_phrase = " " in query and all(marker not in query for marker in markers)
    if is_plain_phrase:
        search_query = f"all:{search_query}"

    accepted_sorts = {
        "relevance": "relevance",
        "lastUpdatedDate": "lastUpdatedDate",
        "submittedDate": "submittedDate",
    }
    request_params = {
        "search_query": search_query,
        "start": 0,
        "max_results": min(num_results, 100),
        "sortBy": accepted_sorts.get(sort_by, "relevance"),
        "sortOrder": "descending",
    }

    # Wait for the rate limiter before touching the network.
    _throttle()
    response = http_client.get(ARXIV_API_BASE, params=request_params, timeout=30)
    response.raise_for_status()
    return _parse_feed(response.text)
92
+
93
+
94
@cache.cached(category="paper", ttl=cache.PAPER_TTL)
def get_arxiv_paper(arxiv_id: str) -> Dict[str, Any]:
    """
    Fetch the metadata of one arXiv paper.

    Parameters:
        arxiv_id (str): An arXiv identifier such as "2312.00567" or
            "2312.00567v2", or a full URL like
            "https://arxiv.org/abs/2312.00567".

    Returns:
        The paper dict for the first entry of the response feed, or
        ``{"error": "Paper not found: <id>"}`` when the feed yields no paper.
    """
    # Reduce URLs and surrounding whitespace to the bare identifier.
    paper_id = _clean_arxiv_id(arxiv_id)

    _throttle()
    response = http_client.get(
        ARXIV_API_BASE,
        params={"id_list": paper_id, "max_results": 1},
        timeout=30,
    )
    response.raise_for_status()

    matches = _parse_feed(response.text)
    if not matches:
        return {"error": f"Paper not found: {paper_id}"}
    return matches[0]
122
+
123
+
124
@cache.cached(category="search", ttl=cache.SEARCH_TTL)
def get_arxiv_by_author(
    author_name: str, num_results: int = 10, category: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    List the papers of one author.

    Parameters:
        author_name (str): Author name, e.g. "Campanella, Gabriele". It is
            placed inside a quoted ``au:`` clause.
        num_results (int): Upper bound on returned papers (values above 100
            are reduced to 100).
        category (str): Optional arXiv category AND-ed onto the author clause.

    Returns:
        List of paper dicts, requested from the API in descending
        submission-date order.
    """
    clauses = [f'au:"{author_name}"']
    if category:
        clauses.append(f"cat:{category}")

    request_params = {
        "search_query": " AND ".join(clauses),
        "start": 0,
        "max_results": min(num_results, 100),
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # Wait for the rate limiter before touching the network.
    _throttle()
    response = http_client.get(ARXIV_API_BASE, params=request_params, timeout=30)
    response.raise_for_status()
    return _parse_feed(response.text)
156
+
157
+
158
+ # --- Parsing helpers ---
159
+
160
def _parse_feed(xml_text: str) -> List[Dict[str, Any]]:
    """Convert an arXiv Atom feed into paper dicts, skipping entries without a title."""
    feed = ET.fromstring(xml_text)
    parsed = (_parse_entry(node) for node in feed.findall(f"{ATOM_NS}entry"))
    return [paper for paper in parsed if paper.get("title")]
171
+
172
+
173
def _parse_entry(entry) -> Dict[str, Any]:
    """
    Convert one Atom ``entry`` element into a paper dict.

    Missing or empty elements yield empty strings (or empty lists), so every
    key is always present in the result.

    Returns:
        Dict with the keys arxiv_id, title, authors, abstract, published,
        updated, primary_category, categories, doi, journal_ref, comment,
        arxiv_url and pdf_url.
    """

    def stripped(tag: str) -> str:
        # Text of the first matching child with outer whitespace removed.
        node = entry.find(tag)
        if node is None or not node.text:
            return ""
        return node.text.strip()

    def collapsed(tag: str) -> str:
        # Text of the first matching child with whitespace runs collapsed.
        node = entry.find(tag)
        if node is None or not node.text:
            return ""
        return _clean_text(node.text)

    title = collapsed(f"{ATOM_NS}title")
    abstract = collapsed(f"{ATOM_NS}summary")

    # Authors: one name per <author>, ignoring authors without a name.
    name_nodes = (
        author.find(f"{ATOM_NS}name") for author in entry.findall(f"{ATOM_NS}author")
    )
    authors = [
        node.text.strip() for node in name_nodes if node is not None and node.text
    ]

    # The entry id is a URL; reduce it to the bare arXiv identifier.
    arxiv_id = _clean_arxiv_id(stripped(f"{ATOM_NS}id"))

    # When several links are titled "pdf", the last one is used.
    pdf_candidates = [
        link.get("href", "")
        for link in entry.findall(f"{ATOM_NS}link")
        if link.get("title") == "pdf"
    ]
    pdf_url = pdf_candidates[-1] if pdf_candidates else ""

    # Dates: keep only the first 10 characters of each timestamp.
    published = stripped(f"{ATOM_NS}published")[:10]
    updated = stripped(f"{ATOM_NS}updated")[:10]

    primary_node = entry.find(f"{ARXIV_NS}primary_category")
    primary_category = "" if primary_node is None else primary_node.get("term", "")

    all_terms = (node.get("term", "") for node in entry.findall(f"{ATOM_NS}category"))
    categories = [term for term in all_terms if term]

    # Optional arXiv-namespace extras.
    doi = stripped(f"{ARXIV_NS}doi")
    journal_ref = stripped(f"{ARXIV_NS}journal_ref")
    comment = stripped(f"{ARXIV_NS}comment")

    return {
        "arxiv_id": arxiv_id,
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "published": published,
        "updated": updated,
        "primary_category": primary_category,
        "categories": categories,
        "doi": doi,
        "journal_ref": journal_ref,
        "comment": comment,
        "arxiv_url": f"https://arxiv.org/abs/{arxiv_id}",
        # Fall back to the conventional PDF location when no link was given.
        "pdf_url": pdf_url or f"https://arxiv.org/pdf/{arxiv_id}",
    }
247
+
248
+
249
+ def _clean_arxiv_id(text: str) -> str:
250
+ """Extract a clean arXiv ID from a URL or ID string."""
251
+ text = text.strip()
252
+ # Match patterns like 2312.00567 or 2312.00567v2
253
+ match = re.search(r'(\d{4}\.\d{4,5}(?:v\d+)?)', text)
254
+ if match:
255
+ return match.group(1)
256
+ # Older format like cs/0601001
257
+ match = re.search(r'([a-z-]+/\d{7}(?:v\d+)?)', text)
258
+ if match:
259
+ return match.group(1)
260
+ return text
261
+
262
+
263
+ def _clean_text(text: str) -> str:
264
+ """Clean whitespace from arXiv text fields."""
265
+ if not text:
266
+ return ""
267
+ return " ".join(text.split())
@@ -0,0 +1,268 @@
1
+ """
2
+ Local SQLite cache for academic research API responses.
3
+
4
+ Caches paper metadata, search results, and author profiles to avoid
5
+ redundant API calls. Papers that appear across multiple searches
6
+ (landmark studies, your own work, frequently cited references) are
7
+ fetched once and served from cache thereafter.
8
+
9
+ Cache is stored at ~/.cache/academic-research-mcp/cache.db by default.
10
+ Set ACADEMIC_CACHE_DIR to override.
11
+ """
12
+
13
+ import functools
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ import os
18
+ import time
19
+ from typing import Any, Dict, Optional
20
+
21
+ import db as _db
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Re-export the shared lock so any legacy references still work.
26
+ _lock = _db._lock
27
+
28
+ # Default TTL: 24 hours for search results, 7 days for paper details
29
+ SEARCH_TTL = 24 * 60 * 60 # 24 hours
30
+ PAPER_TTL = 7 * 24 * 60 * 60 # 7 days
31
+ AUTHOR_TTL = 3 * 24 * 60 * 60 # 3 days
32
+
33
+ # Guard so the table DDL only runs once per process.
34
+ _tables_created = False
35
+
36
+
37
def _ensure_cache_table() -> None:
    """
    Make sure the ``cache`` table and its category index exist.

    The DDL is executed at most once per process: the module flag is checked
    first without the lock and again after acquiring it, and is set once the
    statements have been committed.
    """
    global _tables_created
    if not _tables_created:
        conn = _db.get_db()
        with _lock:
            # Another thread may have finished the setup while we waited.
            if not _tables_created:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS cache (
                        key TEXT PRIMARY KEY,
                        value TEXT NOT NULL,
                        category TEXT NOT NULL,
                        created_at REAL NOT NULL,
                        ttl REAL NOT NULL
                    )
                """)
                conn.execute("""
                    CREATE INDEX IF NOT EXISTS idx_cache_category
                    ON cache(category)
                """)
                conn.commit()
                _tables_created = True
61
+
62
+
63
def make_key(prefix: str, *args, **kwargs) -> str:
    """
    Build a deterministic cache key from a prefix and call arguments.

    The prefix, positional arguments and keyword arguments are serialized as
    JSON with sorted keys, and the SHA-256 hex digest of that text is
    returned. Arguments must therefore be JSON-serializable.
    """
    payload = {"prefix": prefix, "args": args, "kwargs": kwargs}
    serialized = json.dumps(payload, sort_keys=True)
    digest = hashlib.sha256(serialized.encode())
    return digest.hexdigest()
67
+
68
+
69
def get(key: str) -> Optional[Any]:
    """
    Look up a cache entry that has not yet expired.

    Parameters:
        key: Cache key.

    Returns:
        The decoded cached value, or None when the key is absent, the entry
        has outlived its TTL (it is then deleted), or any error occurs.
    """
    try:
        _ensure_cache_table()
        conn = _db.get_db()
        with _lock:
            row = conn.execute(
                "SELECT value, created_at, ttl FROM cache WHERE key = ?",
                (key,),
            ).fetchone()

        if row is not None:
            payload, stored_at, lifetime = row
            if time.time() - stored_at > lifetime:
                # Stale entry: drop it and report a miss.
                _delete(key)
            else:
                return json.loads(payload)
        return None
    except Exception:
        # The cache is best-effort; a failure is logged and treated as a miss.
        logger.debug("Cache get failed for key %s", key[:16], exc_info=True)
        return None
101
+
102
+
103
def put(
    key: str, value: Any, category: str = "general", ttl: float = SEARCH_TTL
) -> None:
    """
    Insert or overwrite a cache entry.

    Parameters:
        key: Cache key.
        value: Value to store; it is serialized with ``json.dumps``.
        category: Grouping label (e.g., "search", "paper", "author").
        ttl: Lifetime of the entry in seconds.

    Failures are logged at debug level and otherwise ignored.
    """
    try:
        _ensure_cache_table()
        conn = _db.get_db()
        record = (key, json.dumps(value), category, time.time(), ttl)
        with _lock:
            conn.execute(
                """INSERT OR REPLACE INTO cache (key, value, category, created_at, ttl)
                VALUES (?, ?, ?, ?, ?)""",
                record,
            )
            conn.commit()
    except Exception:
        logger.debug("Cache put failed for key %s", key[:16], exc_info=True)
127
+
128
+
129
def _delete(key: str) -> None:
    """Remove the entry stored under ``key``; failures are logged and ignored."""
    try:
        _ensure_cache_table()
        connection = _db.get_db()
        with _lock:
            connection.execute("DELETE FROM cache WHERE key = ?", (key,))
            connection.commit()
    except Exception:
        logger.debug("Cache delete failed for key %s", key[:16], exc_info=True)
139
+
140
+
141
def clear(category: Optional[str] = None) -> int:
    """
    Delete cache entries.

    Parameters:
        category: When truthy, only entries of this category are deleted;
            otherwise the whole table is emptied.

    Returns:
        Number of rows deleted, or 0 if the operation failed.
    """
    try:
        _ensure_cache_table()
        conn = _db.get_db()
        with _lock:
            if not category:
                cursor = conn.execute("DELETE FROM cache")
            else:
                cursor = conn.execute(
                    "DELETE FROM cache WHERE category = ?", (category,)
                )
            removed = cursor.rowcount
            conn.commit()
        return removed
    except Exception:
        logger.debug("Cache clear failed", exc_info=True)
        return 0
168
+
169
+
170
def stats() -> Dict[str, Any]:
    """
    Summarize the current state of the cache.

    Returns:
        Dict with total, expired and active entry counts, per-category
        counts, the database file size in MB and its path. On failure the
        dict contains a single "error" key with the exception text.
    """
    try:
        _ensure_cache_table()
        conn = _db.get_db()
        with _lock:
            total = conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]

            per_category = {
                row[0]: row[1]
                for row in conn.execute(
                    "SELECT category, COUNT(*) FROM cache GROUP BY category"
                )
            }

            # Entries whose age already exceeds their TTL.
            expired = conn.execute(
                "SELECT COUNT(*) FROM cache WHERE (? - created_at) > ttl",
                (time.time(),),
            ).fetchone()[0]

        db_path = _db.get_db_path()
        if os.path.exists(db_path):
            size_bytes = os.path.getsize(db_path)
        else:
            size_bytes = 0

        return {
            "total_entries": total,
            "expired_entries": expired,
            "active_entries": total - expired,
            "by_category": per_category,
            "cache_size_mb": round(size_bytes / (1024 * 1024), 2),
            "cache_path": db_path,
        }
    except Exception as e:
        return {"error": str(e)}
207
+
208
+
209
def cleanup() -> int:
    """
    Delete every entry whose age exceeds its TTL.

    Returns:
        Number of rows deleted, or 0 if the operation failed.
    """
    try:
        _ensure_cache_table()
        conn = _db.get_db()
        now = time.time()
        with _lock:
            removed = conn.execute(
                "DELETE FROM cache WHERE (? - created_at) > ttl",
                (now,),
            ).rowcount
            conn.commit()
        return removed
    except Exception:
        logger.debug("Cache cleanup failed", exc_info=True)
        return 0
230
+
231
+
232
+ # --- Decorator for easy caching ---
233
+
234
+
235
def cached(category: str = "general", ttl: float = SEARCH_TTL):
    """
    Decorator factory that stores a function's results in the cache.

    Usage:
        @cached(category="search", ttl=SEARCH_TTL)
        def search_papers(query, num_results=10):
            ...

    The key is built from the function's bare ``__name__`` and the arguments
    exactly as passed, so positional and keyword spellings of the same call
    produce different keys. A lookup that yields None counts as a miss.
    Results that look like errors are returned without being stored.
    """

    def _is_error_result(result) -> bool:
        # An error is a dict with an "error" key, or a non-empty list whose
        # first element is such a dict.
        if isinstance(result, dict):
            return "error" in result
        if isinstance(result, list) and result:
            first = result[0]
            return isinstance(first, dict) and "error" in first
        return False

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            key = make_key(func.__name__, *args, **kwargs)
            hit = get(key)
            if hit is not None:
                return hit
            fresh = func(*args, **kwargs)
            if not _is_error_result(fresh):
                put(key, fresh, category=category, ttl=ttl)
            return fresh

        return wrapper

    return decorator