all-in-mcp 0.2.2__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/PKG-INFO +10 -1
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/README.md +9 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/api.md +50 -0
- all_in_mcp-0.2.5/docs/google_scholar.md +215 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/pyproject.toml +1 -1
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/academic_platforms/__init__.py +2 -1
- all_in_mcp-0.2.5/src/all_in_mcp/academic_platforms/crossref.py +328 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/academic_platforms/cryptobib.py +4 -5
- all_in_mcp-0.2.5/src/all_in_mcp/academic_platforms/google_scholar.py +245 -0
- all_in_mcp-0.2.5/src/all_in_mcp/paper.py +201 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/server.py +335 -3
- all_in_mcp-0.2.2/src/all_in_mcp/paper.py +0 -64
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/.gitignore +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/LICENSE +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/configuration.md +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/cryptobib.md +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/development.md +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/iacr.md +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/pypi-setup.md +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/__init__.py +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/academic_platforms/base.py +0 -0
- {all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/academic_platforms/iacr.py +0 -0
{all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: all-in-mcp
-Version: 0.2.2
+Version: 0.2.5
 Summary: An MCP (Model Context Protocol) server providing daily-use utility functions and academic paper search capabilities
 Project-URL: Homepage, https://github.com/jiahaoxiang2000/all-in-mcp
 Project-URL: Repository, https://github.com/jiahaoxiang2000/all-in-mcp
@@ -53,6 +53,8 @@ An MCP (Model Context Protocol) server that provides daily-use utility functions
 
 - **Academic Research**: IACR ePrint Archive paper search, download, and reading
 - **Bibliography Search**: CryptoBib database search for cryptography papers
+- **Google Scholar**: Search academic papers across disciplines with citation data
+- **PDF Reading**: Read and extract text from local and online PDF files
 
 ### Paper Search Capabilities
 
@@ -70,6 +72,13 @@ An MCP (Model Context Protocol) server that provides daily-use utility functions
 - Retrieve structured paper metadata or raw BibTeX entries
 - Support for all major cryptography venues and conferences
 
+#### Google Scholar
+
+- Search academic papers across multiple disciplines
+- Access to citation counts and publication metadata
+- Broad coverage of academic literature from various sources
+- Year-based filtering for targeted searches
+
 ## Quick Start
 
 ### Prerequisites
{all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/README.md

@@ -8,6 +8,8 @@ An MCP (Model Context Protocol) server that provides daily-use utility functions
 
 - **Academic Research**: IACR ePrint Archive paper search, download, and reading
 - **Bibliography Search**: CryptoBib database search for cryptography papers
+- **Google Scholar**: Search academic papers across disciplines with citation data
+- **PDF Reading**: Read and extract text from local and online PDF files
 
 ### Paper Search Capabilities
 
@@ -25,6 +27,13 @@ An MCP (Model Context Protocol) server that provides daily-use utility functions
 - Retrieve structured paper metadata or raw BibTeX entries
 - Support for all major cryptography venues and conferences
 
+#### Google Scholar
+
+- Search academic papers across multiple disciplines
+- Access to citation counts and publication metadata
+- Broad coverage of academic literature from various sources
+- Year-based filtering for targeted searches
+
 ## Quick Start
 
 ### Prerequisites
{all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/docs/api.md

@@ -108,6 +108,56 @@ URL: https://eprint.iacr.org/2023/1234
 [Full extracted text content]
 ```
 
+## Google Scholar Search
+
+### search-google-scholar-papers
+
+Search academic papers from Google Scholar. This provides broad coverage across multiple academic disciplines and includes citation information.
+
+**Parameters:**
+
+- `query` (string, required): Search query string (e.g., 'machine learning', 'neural networks')
+- `max_results` (integer, optional): Maximum number of results to return (default: 10)
+- `year_low` (integer, optional): Minimum publication year for filtering
+- `year_high` (integer, optional): Maximum publication year for filtering
+
+**Returns:**
+
+- List of papers with metadata (title, authors, citations, year, URL, abstract)
+
+**Example:**
+
+```json
+{
+  "name": "search-google-scholar-papers",
+  "arguments": {
+    "query": "deep learning transformers",
+    "max_results": 5,
+    "year_low": 2020,
+    "year_high": 2024
+  }
+}
+```
+
+**Response:**
+
+```
+Found 3 Google Scholar papers for query 'deep learning transformers' in year range (2020-2024):
+
+1. **Attention Is All You Need**
+   - Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar
+   - Citations: 85234
+   - Year: 2017
+   - URL: https://papers.nips.cc/paper/7181-attention-is-all-you-need
+   - Abstract: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks...
+```
+
+**Limitations:**
+
+- No direct PDF downloads (redirects to publisher websites)
+- Rate limiting may apply for frequent requests
+- Results may vary based on geographic location
+
 ## CryptoBib Bibliography Search
 
 ### search-cryptobib-papers
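Aside: the tool call above is expressed as raw MCP JSON. As a minimal sketch of the same call from Python — assuming the official `mcp` SDK and that the server is launched with `uvx all-in-mcp`, neither of which this diff specifies — it could look like:

```python
# Hypothetical client-side sketch: invoking the search-google-scholar-papers
# tool documented in docs/api.md. Assumes the `mcp` Python SDK; the server
# launch command is an assumption for illustration, not taken from this diff.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    server = StdioServerParameters(command="uvx", args=["all-in-mcp"])
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "search-google-scholar-papers",
                arguments={
                    "query": "deep learning transformers",
                    "max_results": 5,
                    "year_low": 2020,
                    "year_high": 2024,
                },
            )
            # Tool results come back as content items; text items carry
            # the formatted listing shown in the Response example above
            for item in result.content:
                print(item.text)


asyncio.run(main())
```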
all_in_mcp-0.2.5/docs/google_scholar.md (new file)

@@ -0,0 +1,215 @@
+# Google Scholar Integration
+
+This document describes the Google Scholar integration in the all-in-mcp server.
+
+## Overview
+
+The Google Scholar integration provides academic paper searching capabilities through Google Scholar's web interface. Unlike other academic platforms, Google Scholar aggregates papers from multiple sources and provides citation counts and broader coverage of academic literature.
+
+## Features
+
+- **Paper Search**: Search for academic papers across multiple disciplines
+- **Citation Information**: Get citation counts for papers
+- **Multiple Source Coverage**: Access papers from various publishers and repositories
+- **Year-based Filtering**: Filter search results by publication year range
+
+## Available Tools
+
+### search-google-scholar-papers
+
+Search for academic papers using Google Scholar.
+
+**Parameters:**
+
+- `query` (required): Search query string (e.g., "machine learning", "neural networks")
+- `max_results` (optional): Maximum number of papers to return (default: 10, max: 100)
+- `year_low` (optional): Minimum publication year for filtering results
+- `year_high` (optional): Maximum publication year for filtering results
+
+**Example Usage:**
+
+```json
+{
+  "query": "deep learning transformers",
+  "max_results": 5,
+  "year_low": 2020,
+  "year_high": 2024
+}
+```
+
+**Returns:**
+
+- Paper title
+- Authors list
+- Citation count (when available)
+- Publication year (when available)
+- Paper URL
+- Abstract (truncated to 300 characters)
+
+## Limitations
+
+### No Direct PDF Access
+
+Google Scholar doesn't provide direct PDF downloads. Users need to:
+
+1. Use the provided paper URL to access the publisher's website
+2. Check if the paper is available through institutional access
+3. Look for open access versions on author websites or repositories
+
+### Rate Limiting
+
+Google Scholar implements rate limiting to prevent automated scraping:
+
+- The implementation includes random delays between requests (1-3 seconds)
+- Multiple rapid requests may result in temporary blocks
+- Consider using other sources (IACR, arXiv) for bulk operations
+
+### Search Result Variability
+
+- Results may vary based on geographic location and Google's algorithms
+- Some papers may not be accessible due to publisher restrictions
+- Citation counts may not be real-time accurate
+
+## Implementation Details
+
+### Web Scraping Approach
+
+The implementation uses web scraping with:
+
+- Random user agent rotation to avoid detection
+- BeautifulSoup for HTML parsing
+- Request session management for cookie handling
+- Error handling for network issues and parsing failures
+
+### Paper Data Extraction
+
+The parser extracts:
+
+- **Title**: From `h3.gs_rt` elements, cleaned of PDF/HTML markers
+- **Authors**: From `div.gs_a` elements, parsed from publication info
+- **Abstract**: From `div.gs_rs` elements when available
+- **Citations**: From citation links in `div.gs_fl` elements
+- **Year**: Extracted from publication information using regex patterns
+- **URL**: From title links to source papers
+
+### Error Handling
+
+- Network timeouts (30 seconds)
+- HTTP error responses (rate limiting, server errors)
+- Parsing failures for malformed HTML
+- Missing required paper elements
+
+## Best Practices
+
+### Responsible Usage
+
+1. **Respect Rate Limits**: Don't make too many requests in quick succession
+2. **Cache Results**: Store search results locally to avoid repeated queries
+3. **Use Appropriate Delays**: The implementation includes built-in delays
+4. **Monitor for Blocks**: Be prepared to handle temporary access restrictions
+
+### Query Optimization
+
+1. **Specific Terms**: Use specific academic terms for better results
+2. **Author Names**: Include author names when searching for specific papers
+3. **Publication Venues**: Include conference or journal names for focused searches
+4. **Year Ranges**: Use year filters to narrow down results
+
+### Integration with Other Sources
+
+Google Scholar works best when combined with other academic sources:
+
+- Use IACR for cryptography papers with PDF access
+- Use arXiv for preprints with full-text access
+- Use institutional repositories for open access papers
+
+## Example Responses
+
+### Successful Search
+
+```
+Found 3 Google Scholar papers for query 'machine learning healthcare':
+
+1. **Machine Learning in Healthcare: A Review**
+   - Authors: John Smith, Jane Doe, Bob Johnson
+   - Citations: 245
+   - Year: 2023
+   - URL: https://example.com/paper1
+   - Abstract: This comprehensive review examines the applications of machine learning techniques in healthcare settings, covering diagnostic imaging, predictive analytics, and treatment optimization...
+
+2. **Deep Learning for Medical Diagnosis**
+   - Authors: Alice Brown, Charlie Wilson
+   - Citations: 156
+   - Year: 2022
+   - URL: https://example.com/paper2
+   - Abstract: We present a novel deep learning framework for automated medical diagnosis using convolutional neural networks...
+```
+
+### Empty Results
+
+```
+No papers found for query: obscure search term in year range (2025-2030)
+```
+
+### Error Response
+
+```
+Error searching Google Scholar: HTTP 429 - Rate limit exceeded. Please try again later.
+```
+
+## Troubleshooting
+
+### Common Issues
+
+**No Results Found**
+
+- Check query spelling and syntax
+- Try broader search terms
+- Remove year filters if too restrictive
+- Verify network connectivity
+
+**Rate Limiting Errors**
+
+- Wait before making additional requests
+- Reduce the frequency of searches
+- Consider using other academic sources
+- Check if IP address is temporarily blocked
+
+**Parsing Errors**
+
+- Usually indicates changes in Google Scholar's HTML structure
+- Check logs for specific parsing failures
+- May require updates to the parsing logic
+
+### Development and Testing
+
+**Testing Considerations**
+
+- Use mock responses for unit tests to avoid rate limiting
+- Test with various query types and edge cases
+- Include tests for error conditions and edge cases
+- Verify handling of malformed or incomplete results
+
+**Debugging Tips**
+
+- Enable detailed logging to see request/response details
+- Check network connectivity and DNS resolution
+- Verify user agent and headers are being sent correctly
+- Test individual parsing functions with real HTML samples
+
+## Future Enhancements
+
+### Potential Improvements
+
+1. **Enhanced Parsing**: Better extraction of publication venues and DOIs
+2. **Citation Tracking**: Track citation networks and related papers
+3. **Advanced Filtering**: Filter by publication type, author affiliation
+4. **Result Caching**: Implement intelligent caching to reduce API calls
+5. **Proxy Support**: Add proxy rotation for high-volume usage
+
+### Integration Opportunities
+
+1. **Cross-Reference Validation**: Verify results against other academic databases
+2. **Full-Text Integration**: Combine with repository APIs for full-text access
+3. **Citation Analysis**: Integrate with citation analysis tools
+4. **Recommendation System**: Suggest related papers based on search history
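Aside: the "Paper Data Extraction" section above lists the CSS selectors the scraper depends on. A minimal sketch of that extraction step, assuming BeautifulSoup and Google Scholar's `gs_ri` result container — the class names are an assumption that changes over time, and the package's actual parser (`google_scholar.py`) is not shown in full in this diff:

```python
# Sketch of the extraction described above, assuming BeautifulSoup and
# Google Scholar's result markup (gs_ri / gs_rt / gs_a / gs_rs / gs_fl).
# The selectors are assumptions; Scholar's HTML structure changes over time.
import re

from bs4 import BeautifulSoup


def parse_results(html: str) -> list[dict]:
    soup = BeautifulSoup(html, "html.parser")
    papers = []
    for result in soup.select("div.gs_ri"):
        title_el = result.select_one("h3.gs_rt")
        if title_el is None:
            continue  # skip results missing the required title element
        # Strip the "[PDF]" / "[HTML]" markers Scholar prepends to titles
        title = re.sub(r"^\[(PDF|HTML)\]\s*", "", title_el.get_text(" ", strip=True))
        # Publication info looks like "A Author, B Author - Venue, 2021 - publisher"
        info = result.select_one("div.gs_a")
        info_text = info.get_text(" ", strip=True) if info else ""
        authors = [a.strip() for a in info_text.split("-")[0].split(",") if a.strip()]
        year_match = re.search(r"\b(19|20)\d{2}\b", info_text)
        abstract_el = result.select_one("div.gs_rs")
        cite_el = result.select_one("div.gs_fl a[href*='cites=']")
        cite_match = re.search(r"\d+", cite_el.get_text()) if cite_el else None
        link = title_el.select_one("a")
        papers.append(
            {
                "title": title,
                "authors": authors,
                "year": int(year_match.group()) if year_match else None,
                # Truncate to 300 characters, matching the documented behavior
                "abstract": abstract_el.get_text(" ", strip=True)[:300]
                if abstract_el
                else "",
                "citations": int(cite_match.group()) if cite_match else 0,
                "url": link["href"] if link and link.has_attr("href") else "",
            }
        )
    return papers
```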
{all_in_mcp-0.2.2 → all_in_mcp-0.2.5}/src/all_in_mcp/academic_platforms/__init__.py

@@ -1,6 +1,7 @@
 # all_in_mcp/academic_platforms/__init__.py
 from .base import PaperSource
 from .cryptobib import CryptoBibSearcher
+from .google_scholar import GoogleScholarSearcher
 from .iacr import IACRSearcher
 
-__all__ = ["CryptoBibSearcher", "IACRSearcher", "PaperSource"]
+__all__ = ["CryptoBibSearcher", "GoogleScholarSearcher", "IACRSearcher", "PaperSource"]
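Aside: with the new export in place, the searcher is importable alongside the existing ones. A sketch, assuming `search()` follows the `PaperSource` interface and accepts the tool parameters documented above as keyword arguments:

```python
# Sketch: importing the newly exported searcher. The search() signature is
# an assumption based on the PaperSource interface and the documented tool
# parameters; google_scholar.py itself is not shown in this diff.
from all_in_mcp.academic_platforms import GoogleScholarSearcher

searcher = GoogleScholarSearcher()
for paper in searcher.search("deep learning transformers", max_results=5):
    print(paper.title)
```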
all_in_mcp-0.2.5/src/all_in_mcp/academic_platforms/crossref.py (new file)

@@ -0,0 +1,328 @@
+# all_in_mcp/academic_platforms/crossref.py
+import logging
+from datetime import datetime
+from typing import Optional
+from urllib.parse import quote_plus
+
+import httpx
+
+from ..paper import Paper
+from .base import PaperSource
+
+logger = logging.getLogger(__name__)
+
+
+class CrossrefSearcher(PaperSource):
+    """Crossref API paper search implementation"""
+
+    BASE_URL = "https://api.crossref.org"
+    WORKS_ENDPOINT = f"{BASE_URL}/works"
+
+    def __init__(self, email: Optional[str] = None):
+        """
+        Initialize Crossref searcher
+
+        Args:
+            email: Optional email for polite API usage (recommended by Crossref)
+        """
+        self.email = email
+        self.client = httpx.Client(timeout=30.0)
+
+    def _get_headers(self) -> dict:
+        """Get headers for API requests"""
+        headers = {
+            "User-Agent": "all-in-mcp/0.1.0 (https://github.com/user/all-in-mcp)"
+        }
+        if self.email:
+            headers["User-Agent"] += f" (mailto:{self.email})"
+        return headers
+
+    def _parse_date(self, date_parts: list) -> Optional[datetime]:
+        """Parse Crossref date parts into datetime"""
+        if not date_parts or not isinstance(date_parts, list):
+            return None
+
+        try:
+            # Crossref provides date as [[year, month, day]] or [[year, month]] or [[year]]
+            if len(date_parts) > 0 and isinstance(date_parts[0], list):
+                parts = date_parts[0]
+                year = parts[0] if len(parts) > 0 else 1
+                month = parts[1] if len(parts) > 1 else 1
+                day = parts[2] if len(parts) > 2 else 1
+                return datetime(year, month, day)
+        except (ValueError, IndexError, TypeError):
+            pass
+        return None
+
+    def _extract_authors(self, authors_data: list) -> list[str]:
+        """Extract author names from Crossref author data"""
+        authors = []
+        for author in authors_data or []:
+            if isinstance(author, dict):
+                given = author.get("given", "")
+                family = author.get("family", "")
+                if given and family:
+                    authors.append(f"{given} {family}")
+                elif family:
+                    authors.append(family)
+                elif given:
+                    authors.append(given)
+        return authors
+
+    def _parse_work(self, work: dict) -> Optional[Paper]:
+        """Parse a single work from Crossref API response"""
+        try:
+            # Extract basic information
+            title_list = work.get("title", [])
+            title = title_list[0] if title_list else ""
+
+            if not title:
+                return None
+
+            doi = work.get("DOI", "")
+            paper_id = doi or work.get("URL", "")
+
+            # Extract authors
+            authors = self._extract_authors(work.get("author", []))
+
+            # Extract abstract
+            abstract = work.get("abstract", "")
+            if abstract:
+                # Remove HTML tags if present
+                import re
+
+                abstract = re.sub(r"<[^>]+>", "", abstract)
+
+            # Extract publication date
+            published_date = (
+                self._parse_date(work.get("published-print", {}).get("date-parts"))
+                or self._parse_date(work.get("published-online", {}).get("date-parts"))
+                or self._parse_date(work.get("created", {}).get("date-parts"))
+            )
+
+            # Extract URLs
+            url = work.get("URL", "")
+            pdf_url = ""
+
+            # Look for PDF in links
+            links = work.get("link", [])
+            for link in links:
+                if link.get("content-type") == "application/pdf":
+                    pdf_url = link.get("URL", "")
+                    break
+
+            # Extract additional metadata
+            container_title = work.get("container-title", [])
+            journal = container_title[0] if container_title else ""
+
+            volume = work.get("volume", "")
+            issue = work.get("issue", "")
+            pages = work.get("page", "")
+
+            # Extract categories/subjects
+            categories = []
+            subjects = work.get("subject", [])
+            if subjects:
+                categories.extend(subjects)
+
+            # Citation count (if available)
+            citations = work.get("is-referenced-by-count", 0)
+
+            # Build extra metadata
+            extra = {
+                "journal": journal,
+                "volume": volume,
+                "issue": issue,
+                "pages": pages,
+                "type": work.get("type", ""),
+                "publisher": work.get("publisher", ""),
+                "issn": work.get("ISSN", []),
+                "isbn": work.get("ISBN", []),
+            }
+
+            # Remove empty values from extra
+            extra = {k: v for k, v in extra.items() if v}
+
+            return Paper(
+                paper_id=paper_id,
+                title=title,
+                authors=authors,
+                abstract=abstract,
+                doi=doi,
+                published_date=published_date or datetime(1900, 1, 1),
+                pdf_url=pdf_url,
+                url=url,
+                source="crossref",
+                categories=categories,
+                citations=citations,
+                extra=extra,
+            )
+
+        except Exception as e:
+            logger.error(f"Error parsing Crossref work: {e}")
+            return None
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        year_min: Optional[int] = None,
+        year_max: Optional[int] = None,
+        sort_by: str = "relevance",
+        **kwargs,
+    ) -> list[Paper]:
+        """
+        Search for papers using Crossref API
+
+        Args:
+            query: Search query string
+            max_results: Maximum number of results to return
+            year_min: Minimum publication year
+            year_max: Maximum publication year
+            sort_by: Sort order (relevance, published, indexed, updated)
+        """
+        if not query.strip():
+            return []
+
+        try:
+            params = {
+                "query": query,
+                "rows": min(max_results, 1000),  # Crossref max is 1000
+                "sort": sort_by,
+                "select": "DOI,title,author,abstract,published-print,published-online,created,URL,container-title,volume,issue,page,subject,is-referenced-by-count,type,publisher,ISSN,ISBN,link",
+            }
+
+            # Add year filters if specified
+            filters = []
+            if year_min:
+                filters.append(f"from-pub-date:{year_min}")
+            if year_max:
+                filters.append(f"until-pub-date:{year_max}")
+
+            if filters:
+                params["filter"] = ",".join(filters)
+
+            response = self.client.get(
+                self.WORKS_ENDPOINT, params=params, headers=self._get_headers()
+            )
+            response.raise_for_status()
+
+            data = response.json()
+            works = data.get("message", {}).get("items", [])
+
+            papers = []
+            for work in works:
+                paper = self._parse_work(work)
+                if paper:
+                    papers.append(paper)
+
+            return papers[:max_results]
+
+        except Exception as e:
+            logger.error(f"Error searching Crossref: {e}")
+            return []
+
+    def download_pdf(self, paper_id: str, save_path: str) -> str:
+        """
+        Download PDF for a paper (limited functionality for Crossref)
+
+        Note: Crossref is primarily a metadata service. PDF downloads
+        depend on publisher policies and may not always be available.
+        """
+        if not paper_id:
+            return "Error: paper_id is required"
+
+        try:
+            # If paper_id is a DOI, try to get work details first
+            if not paper_id.startswith("http"):
+                work_url = f"{self.WORKS_ENDPOINT}/{quote_plus(paper_id)}"
+                response = self.client.get(work_url, headers=self._get_headers())
+                response.raise_for_status()
+
+                work_data = response.json()
+                work = work_data.get("message", {})
+
+                # Look for PDF link
+                links = work.get("link", [])
+                pdf_url = None
+                for link in links:
+                    if link.get("content-type") == "application/pdf":
+                        pdf_url = link.get("URL")
+                        break
+
+                if not pdf_url:
+                    return f"Error: No PDF link found for DOI {paper_id}. Crossref provides metadata; PDFs are hosted by publishers."
+            else:
+                pdf_url = paper_id
+
+            # Attempt to download PDF
+            from pathlib import Path
+
+            save_path_obj = Path(save_path)
+            save_path_obj.mkdir(parents=True, exist_ok=True)
+
+            # Create filename from DOI or URL
+            if paper_id.startswith("10."):
+                filename = (
+                    f"crossref_{paper_id.replace('/', '_').replace('.', '_')}.pdf"
+                )
+            else:
+                filename = f"crossref_paper_{hash(paper_id) % 10000}.pdf"
+
+            file_path = save_path_obj / filename
+
+            pdf_response = self.client.get(pdf_url, headers=self._get_headers())
+            pdf_response.raise_for_status()
+
+            with open(file_path, "wb") as f:
+                f.write(pdf_response.content)
+
+            return str(file_path)
+
+        except Exception as e:
+            return f"Error downloading PDF: {e}"
+
+    def read_paper(self, paper_id: str, save_path: str) -> str:
+        """
+        Read paper text (downloads PDF first if needed)
+
+        Note: Success depends on PDF availability from publishers
+        """
+        if not paper_id:
+            return "Error: paper_id is required"
+
+        try:
+            # First try to download the PDF
+            pdf_path = self.download_pdf(paper_id, save_path)
+
+            if pdf_path.startswith("Error"):
+                return pdf_path
+
+            # Read the PDF using the existing read_pdf function
+            from ..paper import read_pdf
+
+            return read_pdf(pdf_path)
+
+        except Exception as e:
+            return f"Error reading paper: {e}"
+
+    def search_by_doi(self, doi: str) -> Optional[Paper]:
+        """Search for a specific paper by DOI"""
+        try:
+            work_url = f"{self.WORKS_ENDPOINT}/{quote_plus(doi)}"
+            response = self.client.get(work_url, headers=self._get_headers())
+            response.raise_for_status()
+
+            data = response.json()
+            work = data.get("message", {})
+
+            return self._parse_work(work)
+
+        except Exception as e:
+            logger.error(f"Error searching by DOI {doi}: {e}")
+            return None
+
+    def __del__(self):
+        """Clean up HTTP client"""
+        if hasattr(self, "client"):
+            self.client.close()
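Aside: a short usage sketch of the `CrossrefSearcher` added above, using only the methods visible in this hunk; the email address and DOI are placeholders, not values from the package:

```python
# Sketch built from the CrossrefSearcher code above. The email is a
# placeholder; Crossref recommends supplying one for its "polite" pool.
from all_in_mcp.academic_platforms.crossref import CrossrefSearcher

searcher = CrossrefSearcher(email="you@example.com")

# Keyword search constrained to a publication-year window
papers = searcher.search(
    "post-quantum cryptography", max_results=5, year_min=2020, year_max=2024
)
for paper in papers:
    print(f"{paper.title} ({paper.citations} citations) doi:{paper.doi}")

# Direct lookup by DOI (placeholder DOI)
paper = searcher.search_by_doi("10.1000/example-doi")
if paper:
    print(paper.title)
```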