academic_search_mcp-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- academic_search_mcp-0.1.3.dist-info/METADATA +243 -0
- academic_search_mcp-0.1.3.dist-info/RECORD +24 -0
- academic_search_mcp-0.1.3.dist-info/WHEEL +4 -0
- academic_search_mcp-0.1.3.dist-info/entry_points.txt +2 -0
- academic_search_mcp-0.1.3.dist-info/licenses/LICENSE +21 -0
- paper_search_mcp/__init__.py +0 -0
- paper_search_mcp/academic_platforms/__init__.py +0 -0
- paper_search_mcp/academic_platforms/arxiv.py +147 -0
- paper_search_mcp/academic_platforms/biorxiv.py +156 -0
- paper_search_mcp/academic_platforms/core.py +284 -0
- paper_search_mcp/academic_platforms/crossref.py +375 -0
- paper_search_mcp/academic_platforms/cyberleninka.py +396 -0
- paper_search_mcp/academic_platforms/google_scholar.py +249 -0
- paper_search_mcp/academic_platforms/hub.py +0 -0
- paper_search_mcp/academic_platforms/iacr.py +548 -0
- paper_search_mcp/academic_platforms/medrxiv.py +156 -0
- paper_search_mcp/academic_platforms/openalex.py +497 -0
- paper_search_mcp/academic_platforms/pubmed.py +159 -0
- paper_search_mcp/academic_platforms/sci_hub.py +178 -0
- paper_search_mcp/academic_platforms/semantic.py +492 -0
- paper_search_mcp/academic_platforms/ssrn.py +385 -0
- paper_search_mcp/paper.py +69 -0
- paper_search_mcp/pdf_utils.py +67 -0
- paper_search_mcp/server.py +514 -0
paper_search_mcp/academic_platforms/cyberleninka.py
@@ -0,0 +1,396 @@
+"""CyberLeninka integration for Russian academic papers.
+
+CyberLeninka is a Russian open access repository with scientific articles
+from Russian journals. Supports filtering by VAK, RSCI, and SCOPUS indexed journals.
+"""
+from typing import List, Optional, Dict
+from datetime import datetime
+from curl_cffi import requests
+from bs4 import BeautifulSoup
+import re
+import os
+import logging
+import time
+
+from ..paper import Paper
+from ..pdf_utils import extract_text_from_pdf
+
+logger = logging.getLogger(__name__)
+
+
+class CyberLeninkaSearcher:
+    """Searcher for CyberLeninka Russian academic repository.
+
+    Supports filtering by:
+    - VAK (ВАК) - Russian Higher Attestation Commission journals
+    - RSCI (РИНЦ) - Russian Science Citation Index
+    - SCOPUS - Scopus indexed journals
+    - Subject categories (e.g., economics, law, medicine)
+    """
+
+    BASE_URL = "https://cyberleninka.ru"
+    API_URL = f"{BASE_URL}/api/search"
+
+    # Catalog IDs for journal indexing filters
+    CATALOGS = {
+        "vak": 8,  # ВАК
+        "rsci": 22,  # РИНЦ
+        "scopus": 2,  # SCOPUS
+    }
+
+    # Common subject category IDs (terms)
+    CATEGORIES = {
+        "economics": 35,
+        "law": 36,
+        "medicine": 43,
+        "psychology": 46,
+        "sociology": 47,
+        "pedagogy": 44,
+        "philosophy": 48,
+        "history": 41,
+        "politics": 45,
+        "philology": 49,
+    }
+
+    def __init__(self):
+        """Initialize CyberLeninka searcher."""
+        self.impersonate = "chrome"
+        self.last_request_time = 0
+
+    def _rate_limit(self, delay: float = 0.5):
+        """Apply rate limiting."""
+        elapsed = time.time() - self.last_request_time
+        if elapsed < delay:
+            time.sleep(delay - elapsed)
+        self.last_request_time = time.time()
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        date_from: Optional[str] = None,
+        date_to: Optional[str] = None,
+        catalog: Optional[str] = None,
+        category: Optional[str] = None,
+        **kwargs
+    ) -> List[Paper]:
+        """Search CyberLeninka for papers.
+
+        Args:
+            query: Search query string
+            max_results: Maximum number of papers (default: 10, max: 100)
+            date_from: Start date YYYY-MM-DD (only year is used)
+            date_to: End date YYYY-MM-DD (only year is used)
+            catalog: Filter by indexing: 'vak', 'rsci', or 'scopus'
+            category: Filter by subject: 'economics', 'law', 'medicine', etc.
+            **kwargs: Additional parameters
+
+        Returns:
+            List of Paper objects
+        """
+        papers = []
+
+        try:
+            self._rate_limit()
+
+            # Build request payload
+            payload = {
+                "q": query,
+                "mode": "articles",
+                "size": min(max_results, 100),
+                "from": kwargs.get("offset", 0)
+            }
+
+            # Date filters (year only)
+            if date_from:
+                payload["year_from"] = int(date_from[:4])
+            if date_to:
+                payload["year_to"] = int(date_to[:4])
+
+            # Catalog filter (VAK, RSCI, SCOPUS)
+            if catalog and catalog.lower() in self.CATALOGS:
+                payload["catalogs"] = [self.CATALOGS[catalog.lower()]]
+
+            # Category/subject filter
+            if category and category.lower() in self.CATEGORIES:
+                payload["terms"] = [self.CATEGORIES[category.lower()]]
+
+            response = requests.post(
+                self.API_URL,
+                json=payload,
+                timeout=30,
+                impersonate=self.impersonate
+            )
+
+            if response.status_code != 200:
+                logger.error(f"CyberLeninka search failed with status {response.status_code}")
+                return papers
+
+            data = response.json()
+            articles = data.get("articles", [])
+
+            logger.info(f"CyberLeninka search: found {len(articles)} articles for '{query}'")
+
+            for article in articles:
+                try:
+                    paper = self._parse_search_result(article)
+                    if paper:
+                        papers.append(paper)
+                except Exception as e:
+                    logger.warning(f"Error parsing CyberLeninka result: {e}")
+                    continue
+
+        except Exception as e:
+            logger.error(f"CyberLeninka search error: {e}")
+
+        return papers
+
+    def _parse_search_result(self, article: Dict) -> Optional[Paper]:
+        """Parse a search result into a Paper object."""
+        try:
+            # Extract article ID from link
+            link = article.get("link", "")
+            paper_id = link.split("/")[-1] if link else ""
+
+            title = article.get("name", "")
+            if not title:
+                return None
+
+            # Authors
+            authors = []
+            authors_data = article.get("authors", [])
+            if isinstance(authors_data, list):
+                for author in authors_data:
+                    if isinstance(author, dict):
+                        name = author.get("name", "")
+                    else:
+                        name = str(author)
+                    if name:
+                        authors.append(name)
+
+            # Abstract/annotation
+            abstract = article.get("annotation", "") or ""
+
+            # Year
+            year = article.get("year")
+            published_date = datetime(int(year), 1, 1) if year else datetime.min
+
+            # Journal info
+            journal = article.get("journal", {})
+            journal_name = journal.get("name", "") if isinstance(journal, dict) else ""
+
+            # URL
+            url = f"{self.BASE_URL}{link}" if link else ""
+            pdf_url = f"{url}/pdf" if url else ""
+
+            # Keywords
+            keywords = []
+            kw_data = article.get("keywords", [])
+            if isinstance(kw_data, list):
+                keywords = [k for k in kw_data if k]
+
+            return Paper(
+                paper_id=paper_id,
+                title=title,
+                authors=authors,
+                abstract=abstract[:5000] if abstract else "",
+                doi="",  # DOI requires fetching article page
+                published_date=published_date,
+                pdf_url=pdf_url,
+                url=url,
+                source="cyberleninka",
+                categories=[],
+                keywords=keywords[:10],
+                citations=0,
+                references=[],
+                extra={
+                    "journal": journal_name,
+                    "year": year,
+                    "cyberleninka_id": paper_id
+                }
+            )
+
+        except Exception as e:
+            logger.error(f"Error parsing CyberLeninka result: {e}")
+            return None
+
+    def get_paper_by_id(self, paper_id: str) -> Optional[Paper]:
+        """Get a specific paper by its CyberLeninka ID (URL slug).
+
+        Args:
+            paper_id: CyberLeninka article slug (e.g., 'tsifrovoy-suverenitet-i-strany-vostoka')
+
+        Returns:
+            Paper object or None
+        """
+        try:
+            self._rate_limit()
+
+            url = f"{self.BASE_URL}/article/n/{paper_id}"
+            response = requests.get(url, timeout=30, impersonate=self.impersonate)
+
+            if response.status_code == 404:
+                return None
+            response.raise_for_status()
+
+            return self._parse_article_page(response.text, paper_id, url)
+
+        except Exception as e:
+            logger.error(f"Error fetching CyberLeninka paper {paper_id}: {e}")
+            return None
+
+    def _parse_article_page(self, html: str, paper_id: str, url: str) -> Optional[Paper]:
+        """Parse an article page into a Paper object."""
+        try:
+            soup = BeautifulSoup(html, 'lxml')
+
+            # Title from meta tag
+            title = ""
+            title_meta = soup.find("meta", {"name": "citation_title"})
+            if title_meta:
+                title = title_meta.get("content", "")
+            if not title:
+                title_meta = soup.find("meta", {"property": "og:title"})
+                if title_meta:
+                    title = title_meta.get("content", "")
+
+            if not title:
+                return None
+
+            # Authors from meta tags
+            authors = []
+            for meta in soup.find_all("meta", {"name": "citation_author"}):
+                name = meta.get("content", "")
+                if name:
+                    authors.append(name)
+
+            # Abstract from meta description
+            abstract = ""
+            desc_meta = soup.find("meta", {"name": "description"})
+            if desc_meta:
+                abstract = desc_meta.get("content", "")
+
+            # DOI
+            doi = ""
+            doi_div = soup.find("div", {"class": "label-doi"})
+            if doi_div:
+                doi_match = re.search(r'DOI:\s*(10\.\S+)', doi_div.get("title", ""))
+                if doi_match:
+                    doi = doi_match.group(1)
+
+            # Publication date
+            published_date = datetime.min
+            date_meta = soup.find("meta", {"name": "citation_publication_date"})
+            if date_meta:
+                date_str = date_meta.get("content", "")
+                try:
+                    published_date = datetime.strptime(date_str, "%Y")
+                except:
+                    pass
+
+            # Journal
+            journal = ""
+            journal_meta = soup.find("meta", {"name": "citation_journal_title"})
+            if journal_meta:
+                journal = journal_meta.get("content", "")
+
+            # Keywords
+            keywords = []
+            kw_meta = soup.find("meta", {"name": "citation_keywords"})
+            if kw_meta:
+                kw_text = kw_meta.get("content", "")
+                keywords = [k.strip() for k in kw_text.split(",") if k.strip()]
+
+            # PDF URL
+            pdf_url = ""
+            pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
+            if pdf_meta:
+                pdf_url = pdf_meta.get("content", "")
+
+            # ISSN
+            issn = ""
+            issn_meta = soup.find("meta", {"name": "citation_issn"})
+            if issn_meta:
+                issn = issn_meta.get("content", "")
+
+            return Paper(
+                paper_id=paper_id,
+                title=title,
+                authors=authors,
+                abstract=abstract[:5000] if abstract else "",
+                doi=doi,
+                published_date=published_date,
+                pdf_url=pdf_url or f"{url}/pdf",
+                url=url,
+                source="cyberleninka",
+                categories=[],
+                keywords=keywords[:10],
+                citations=0,
+                references=[],
+                extra={
+                    "journal": journal,
+                    "issn": issn,
+                    "cyberleninka_id": paper_id
+                }
+            )
+
+        except Exception as e:
+            logger.error(f"Error parsing CyberLeninka page: {e}")
+            return None
+
+    def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
+        """Download PDF from CyberLeninka.
+
+        Args:
+            paper_id: CyberLeninka article slug
+            save_path: Directory to save
+
+        Returns:
+            Path to PDF or error message
+        """
+        try:
+            os.makedirs(save_path, exist_ok=True)
+
+            self._rate_limit()
+
+            # PDF URL format
+            pdf_url = f"{self.BASE_URL}/article/n/{paper_id}/pdf"
+
+            response = requests.get(pdf_url, timeout=60, impersonate=self.impersonate)
+
+            if response.status_code != 200:
+                return f"PDF download failed for {paper_id}"
+
+            content_type = response.headers.get("Content-Type", "")
+
+            if "pdf" in content_type.lower():
+                filename = f"cyberleninka_{paper_id}.pdf"
+                file_path = os.path.join(save_path, filename)
+
+                with open(file_path, 'wb') as f:
+                    f.write(response.content)
+
+                return file_path
+
+            return f"PDF not available for {paper_id}"
+
+        except Exception as e:
+            logger.error(f"Error downloading CyberLeninka PDF: {e}")
+            return f"Failed to download PDF: {e}"
+
+    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+        """Read and extract text from a CyberLeninka paper PDF.
+
+        Args:
+            paper_id: CyberLeninka article slug
+            save_path: Directory for PDF storage
+
+        Returns:
+            Extracted text or error message
+        """
+        pdf_path = self.download_pdf(paper_id, save_path)
+        if not os.path.exists(pdf_path):
+            return pdf_path  # Return error message
+
+        text = extract_text_from_pdf(pdf_path)
+        return text if text else "Failed to extract text from PDF"
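
For orientation, the CyberLeninkaSearcher methods above compose into a simple search-then-fetch flow. The sketch below is illustrative and not part of the package: it assumes the wheel is installed so that paper_search_mcp.academic_platforms.cyberleninka is importable, and that Paper exposes the title, url, paper_id, doi, and extra fields passed to its constructor in this file.

from paper_search_mcp.academic_platforms.cyberleninka import CyberLeninkaSearcher

searcher = CyberLeninkaSearcher()

# Search VAK-indexed economics articles from 2020 onward; the catalog and
# category names map to the CATALOGS / CATEGORIES IDs sent in the API payload.
papers = searcher.search(
    "цифровая экономика",
    max_results=5,
    date_from="2020-01-01",
    catalog="vak",
    category="economics",
)
for paper in papers:
    print(paper.title, paper.url, paper.extra.get("journal"))

# Fetch richer metadata (DOI, ISSN, keywords) by URL slug, then grab the PDF.
if papers:
    detailed = searcher.get_paper_by_id(papers[0].paper_id)
    if detailed:
        print(detailed.doi, detailed.extra.get("issn"))
    print(searcher.download_pdf(papers[0].paper_id, save_path="./downloads"))
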
paper_search_mcp/academic_platforms/google_scholar.py
@@ -0,0 +1,249 @@
+from typing import List, Optional
+from datetime import datetime
+import requests
+from bs4 import BeautifulSoup
+import time
+import random
+from ..paper import Paper
+import logging
+
+logger = logging.getLogger(__name__)
+
+class PaperSource:
+    """Abstract base class for paper sources"""
+    def search(self, query: str, **kwargs) -> List[Paper]:
+        raise NotImplementedError
+
+    def download_pdf(self, paper_id: str, save_path: str) -> str:
+        raise NotImplementedError
+
+    def read_paper(self, paper_id: str, save_path: str) -> str:
+        raise NotImplementedError
+
+
+class GoogleScholarSearcher(PaperSource):
+    """Custom implementation of Google Scholar paper search"""
+
+    SCHOLAR_URL = "https://scholar.google.com/scholar"
+    BROWSERS = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
+    ]
+
+    def __init__(self):
+        self._setup_session()
+
+    def _setup_session(self):
+        """Initialize session with random user agent"""
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': random.choice(self.BROWSERS),
+            'Accept': 'text/html,application/xhtml+xml',
+            'Accept-Language': 'en-US,en;q=0.9'
+        })
+
+    def _extract_year(self, text: str) -> Optional[int]:
+        """Extract year from publication info"""
+        for word in text.split():
+            if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
+                return int(word)
+        return None
+
+    def _extract_cluster_id(self, item) -> Optional[str]:
+        """Extract Google Scholar cluster ID from result item"""
+        import re
+        # Look for cluster/cites ID in the links (gs_fl div contains "Cited by", "All versions", etc.)
+        links_div = item.find('div', class_='gs_fl')
+        if links_div:
+            for a in links_div.find_all('a', href=True):
+                href = a['href']
+                # Match cluster=ID or cites=ID
+                match = re.search(r'(?:cluster|cites)=(\d+)', href)
+                if match:
+                    return match.group(1)
+
+        # Also check data-cid attribute on the result container
+        if item.get('data-cid'):
+            return item['data-cid']
+
+        return None
+
+    def _extract_citations(self, item) -> int:
+        """Extract citation count from result item"""
+        import re
+        links_div = item.find('div', class_='gs_fl')
+        if links_div:
+            for a in links_div.find_all('a'):
+                text = a.get_text()
+                if 'Cited by' in text:
+                    match = re.search(r'Cited by (\d+)', text)
+                    if match:
+                        return int(match.group(1))
+        return 0
+
+    def _parse_paper(self, item) -> Optional[Paper]:
+        """Parse single paper entry from HTML"""
+        try:
+            # Extract main paper elements
+            title_elem = item.find('h3', class_='gs_rt')
+            info_elem = item.find('div', class_='gs_a')
+            abstract_elem = item.find('div', class_='gs_rs')
+
+            if not title_elem or not info_elem:
+                return None
+
+            # Process title and URL
+            title = title_elem.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '')
+            link = title_elem.find('a', href=True)
+            url = link['href'] if link else ''
+
+            # Extract cluster ID (Google Scholar's unique paper identifier)
+            cluster_id = self._extract_cluster_id(item)
+
+            # Fallback to URL hash if no cluster ID found
+            paper_id = cluster_id if cluster_id else f"gs_{abs(hash(url))}"
+
+            # Note: DOI not available in GS search results (would require extra requests)
+
+            # Process author info
+            info_text = info_elem.get_text()
+            authors = [a.strip() for a in info_text.split('-')[0].split(',')]
+            year = self._extract_year(info_text)
+
+            # Extract citation count
+            citations = self._extract_citations(item)
+
+            # Create paper object
+            return Paper(
+                paper_id=paper_id,
+                title=title,
+                authors=authors,
+                abstract=abstract_elem.get_text() if abstract_elem else "",
+                url=url,
+                pdf_url="",
+                published_date=datetime(year, 1, 1) if year else None,
+                updated_date=None,
+                source="google_scholar",
+                categories=[],
+                keywords=[],
+                doi="",  # Not available in GS search results
+                citations=citations
+            )
+        except Exception as e:
+            logger.warning(f"Failed to parse paper: {e}")
+            return None
+
+    def search(self, query: str, max_results: int = 10, date_from: str = None, date_to: str = None) -> List[Paper]:
+        """
+        Search Google Scholar with custom parameters
+
+        Args:
+            query: Search query string
+            max_results: Maximum number of papers to return
+            date_from: Start date in YYYY-MM-DD format (only year is used)
+            date_to: End date in YYYY-MM-DD format (only year is used)
+        """
+        papers = []
+        start = 0
+        results_per_page = min(10, max_results)
+
+        while len(papers) < max_results:
+            try:
+                # Construct search parameters
+                params = {
+                    'q': query,
+                    'start': start,
+                    'hl': 'en',
+                    'as_sdt': '0,5'  # Include articles and citations
+                }
+
+                # Add year filters if provided (extract year from YYYY-MM-DD format)
+                if date_from:
+                    try:
+                        year_from = int(date_from.split('-')[0])
+                        params['as_ylo'] = year_from
+                    except (ValueError, IndexError):
+                        logger.warning(f"Invalid date_from format: {date_from}")
+
+                if date_to:
+                    try:
+                        year_to = int(date_to.split('-')[0])
+                        params['as_yhi'] = year_to
+                    except (ValueError, IndexError):
+                        logger.warning(f"Invalid date_to format: {date_to}")
+
+                # Make request with random delay
+                time.sleep(random.uniform(1.0, 3.0))
+                response = self.session.get(self.SCHOLAR_URL, params=params)
+
+                if response.status_code != 200:
+                    logger.error(f"Search failed with status {response.status_code}")
+                    break
+
+                # Parse results
+                soup = BeautifulSoup(response.text, 'html.parser')
+                results = soup.find_all('div', class_='gs_ri')
+
+                if not results:
+                    break
+
+                # Process each result
+                for item in results:
+                    if len(papers) >= max_results:
+                        break
+
+                    paper = self._parse_paper(item)
+                    if paper:
+                        papers.append(paper)
+
+                start += results_per_page
+
+            except Exception as e:
+                logger.error(f"Search error: {e}")
+                break
+
+        return papers[:max_results]
+
+    def download_pdf(self, paper_id: str, save_path: str) -> str:
+        """
+        Google Scholar doesn't support direct PDF downloads
+
+        Raises:
+            NotImplementedError: Always raises this error
+        """
+        raise NotImplementedError(
+            "Google Scholar doesn't provide direct PDF downloads. "
+            "Please use the paper URL to access the publisher's website."
+        )
+
+    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+        """
+        Google Scholar doesn't support direct paper reading
+
+        Returns:
+            str: Message indicating the feature is not supported
+        """
+        return (
+            "Google Scholar doesn't support direct paper reading. "
+            "Please use the paper URL to access the full text on the publisher's website."
+        )
+
+if __name__ == "__main__":
+    # Test Google Scholar searcher
+    searcher = GoogleScholarSearcher()
+
+    print("Testing search functionality...")
+    query = "machine learning"
+    max_results = 5
+
+    try:
+        papers = searcher.search(query, max_results=max_results)
+        print(f"\nFound {len(papers)} papers for query '{query}':")
+        for i, paper in enumerate(papers, 1):
+            print(f"\n{i}. {paper.title}")
+            print(f"   Authors: {', '.join(paper.authors)}")
+            print(f"   Citations: {paper.citations}")
+            print(f"   URL: {paper.url}")
+    except Exception as e:
+        print(f"Error during search: {e}")
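
The __main__ block above exercises only a bare query; the year filters and the intentionally unsupported PDF path are easy to miss. Below is a minimal sketch under the same assumptions as the earlier example (installed package, attribute-style Paper fields); note that Google Scholar may throttle or block scraped requests, which surfaces here simply as an empty result list.

from paper_search_mcp.academic_platforms.google_scholar import GoogleScholarSearcher

searcher = GoogleScholarSearcher()

# date_from/date_to are reduced to years and sent as the as_ylo/as_yhi parameters.
papers = searcher.search(
    "transformer interpretability",
    max_results=3,
    date_from="2021-01-01",
    date_to="2023-12-31",
)
for paper in papers:
    print(paper.title, paper.published_date, paper.citations)

# PDF download is deliberately unsupported for this source.
if papers:
    try:
        searcher.download_pdf(papers[0].paper_id, "./downloads")
    except NotImplementedError as err:
        print(err)
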