academic-search-mcp 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
+ """CyberLeninka integration for Russian academic papers.
+
+ CyberLeninka is a Russian open access repository with scientific articles
+ from Russian journals. Supports filtering by VAK, RSCI, and SCOPUS indexed journals.
+ """
+ from typing import List, Optional, Dict
+ from datetime import datetime
+ from curl_cffi import requests
+ from bs4 import BeautifulSoup
+ import re
+ import os
+ import logging
+ import time
+
+ from ..paper import Paper
+ from ..pdf_utils import extract_text_from_pdf
+
+ logger = logging.getLogger(__name__)
+
+
+ class CyberLeninkaSearcher:
+     """Searcher for CyberLeninka Russian academic repository.
+
+     Supports filtering by:
+     - VAK (ВАК) - Russian Higher Attestation Commission journals
+     - RSCI (РИНЦ) - Russian Science Citation Index
+     - SCOPUS - Scopus indexed journals
+     - Subject categories (e.g., economics, law, medicine)
+     """
+
+     BASE_URL = "https://cyberleninka.ru"
+     API_URL = f"{BASE_URL}/api/search"
+
+     # Catalog IDs for journal indexing filters
+     CATALOGS = {
+         "vak": 8,       # ВАК
+         "rsci": 22,     # РИНЦ
+         "scopus": 2,    # SCOPUS
+     }
+
+     # Common subject category IDs (terms)
+     CATEGORIES = {
+         "economics": 35,
+         "law": 36,
+         "medicine": 43,
+         "psychology": 46,
+         "sociology": 47,
+         "pedagogy": 44,
+         "philosophy": 48,
+         "history": 41,
+         "politics": 45,
+         "philology": 49,
+     }
+
+     def __init__(self):
+         """Initialize CyberLeninka searcher."""
+         self.impersonate = "chrome"
+         self.last_request_time = 0
+
+     def _rate_limit(self, delay: float = 0.5):
+         """Apply rate limiting."""
+         elapsed = time.time() - self.last_request_time
+         if elapsed < delay:
+             time.sleep(delay - elapsed)
+         self.last_request_time = time.time()
+
+     def search(
+         self,
+         query: str,
+         max_results: int = 10,
+         date_from: Optional[str] = None,
+         date_to: Optional[str] = None,
+         catalog: Optional[str] = None,
+         category: Optional[str] = None,
+         **kwargs
+     ) -> List[Paper]:
+         """Search CyberLeninka for papers.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of papers (default: 10, max: 100)
+             date_from: Start date YYYY-MM-DD (only year is used)
+             date_to: End date YYYY-MM-DD (only year is used)
+             catalog: Filter by indexing: 'vak', 'rsci', or 'scopus'
+             category: Filter by subject: 'economics', 'law', 'medicine', etc.
+             **kwargs: Additional parameters
+
+         Returns:
+             List of Paper objects
+         """
+         papers = []
+
+         try:
+             self._rate_limit()
+
+             # Build request payload
+             payload = {
+                 "q": query,
+                 "mode": "articles",
+                 "size": min(max_results, 100),
+                 "from": kwargs.get("offset", 0)
+             }
+
+             # Date filters (year only)
+             if date_from:
+                 payload["year_from"] = int(date_from[:4])
+             if date_to:
+                 payload["year_to"] = int(date_to[:4])
+
+             # Catalog filter (VAK, RSCI, SCOPUS)
+             if catalog and catalog.lower() in self.CATALOGS:
+                 payload["catalogs"] = [self.CATALOGS[catalog.lower()]]
+
+             # Category/subject filter
+             if category and category.lower() in self.CATEGORIES:
+                 payload["terms"] = [self.CATEGORIES[category.lower()]]
+
+             response = requests.post(
+                 self.API_URL,
+                 json=payload,
+                 timeout=30,
+                 impersonate=self.impersonate
+             )
+
+             if response.status_code != 200:
+                 logger.error(f"CyberLeninka search failed with status {response.status_code}")
+                 return papers
+
+             data = response.json()
+             articles = data.get("articles", [])
+
+             logger.info(f"CyberLeninka search: found {len(articles)} articles for '{query}'")
+
+             for article in articles:
+                 try:
+                     paper = self._parse_search_result(article)
+                     if paper:
+                         papers.append(paper)
+                 except Exception as e:
+                     logger.warning(f"Error parsing CyberLeninka result: {e}")
+                     continue
+
+         except Exception as e:
+             logger.error(f"CyberLeninka search error: {e}")
+
+         return papers
+
+     def _parse_search_result(self, article: Dict) -> Optional[Paper]:
+         """Parse a search result into a Paper object."""
+         try:
+             # Extract article ID from link
+             link = article.get("link", "")
+             paper_id = link.split("/")[-1] if link else ""
+
+             title = article.get("name", "")
+             if not title:
+                 return None
+
+             # Authors
+             authors = []
+             authors_data = article.get("authors", [])
+             if isinstance(authors_data, list):
+                 for author in authors_data:
+                     if isinstance(author, dict):
+                         name = author.get("name", "")
+                     else:
+                         name = str(author)
+                     if name:
+                         authors.append(name)
+
+             # Abstract/annotation
+             abstract = article.get("annotation", "") or ""
+
+             # Year
+             year = article.get("year")
+             published_date = datetime(int(year), 1, 1) if year else datetime.min
+
+             # Journal info
+             journal = article.get("journal", {})
+             journal_name = journal.get("name", "") if isinstance(journal, dict) else ""
+
+             # URL
+             url = f"{self.BASE_URL}{link}" if link else ""
+             pdf_url = f"{url}/pdf" if url else ""
+
+             # Keywords
+             keywords = []
+             kw_data = article.get("keywords", [])
+             if isinstance(kw_data, list):
+                 keywords = [k for k in kw_data if k]
+
+             return Paper(
+                 paper_id=paper_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract[:5000] if abstract else "",
+                 doi="",  # DOI requires fetching article page
+                 published_date=published_date,
+                 pdf_url=pdf_url,
+                 url=url,
+                 source="cyberleninka",
+                 categories=[],
+                 keywords=keywords[:10],
+                 citations=0,
+                 references=[],
+                 extra={
+                     "journal": journal_name,
+                     "year": year,
+                     "cyberleninka_id": paper_id
+                 }
+             )
+
+         except Exception as e:
+             logger.error(f"Error parsing CyberLeninka result: {e}")
+             return None
+
+     def get_paper_by_id(self, paper_id: str) -> Optional[Paper]:
+         """Get a specific paper by its CyberLeninka ID (URL slug).
+
+         Args:
+             paper_id: CyberLeninka article slug (e.g., 'tsifrovoy-suverenitet-i-strany-vostoka')
+
+         Returns:
+             Paper object or None
+         """
+         try:
+             self._rate_limit()
+
+             url = f"{self.BASE_URL}/article/n/{paper_id}"
+             response = requests.get(url, timeout=30, impersonate=self.impersonate)
+
+             if response.status_code == 404:
+                 return None
+             response.raise_for_status()
+
+             return self._parse_article_page(response.text, paper_id, url)
+
+         except Exception as e:
+             logger.error(f"Error fetching CyberLeninka paper {paper_id}: {e}")
+             return None
+
+     def _parse_article_page(self, html: str, paper_id: str, url: str) -> Optional[Paper]:
+         """Parse an article page into a Paper object."""
+         try:
+             soup = BeautifulSoup(html, 'lxml')
+
+             # Title from meta tag
+             title = ""
+             title_meta = soup.find("meta", {"name": "citation_title"})
+             if title_meta:
+                 title = title_meta.get("content", "")
+             if not title:
+                 title_meta = soup.find("meta", {"property": "og:title"})
+                 if title_meta:
+                     title = title_meta.get("content", "")
+
+             if not title:
+                 return None
+
+             # Authors from meta tags
+             authors = []
+             for meta in soup.find_all("meta", {"name": "citation_author"}):
+                 name = meta.get("content", "")
+                 if name:
+                     authors.append(name)
+
+             # Abstract from meta description
+             abstract = ""
+             desc_meta = soup.find("meta", {"name": "description"})
+             if desc_meta:
+                 abstract = desc_meta.get("content", "")
+
+             # DOI
+             doi = ""
+             doi_div = soup.find("div", {"class": "label-doi"})
+             if doi_div:
+                 doi_match = re.search(r'DOI:\s*(10\.\S+)', doi_div.get("title", ""))
+                 if doi_match:
+                     doi = doi_match.group(1)
+
+             # Publication date
+             published_date = datetime.min
+             date_meta = soup.find("meta", {"name": "citation_publication_date"})
+             if date_meta:
+                 date_str = date_meta.get("content", "")
+                 try:
+                     published_date = datetime.strptime(date_str, "%Y")
+                 except ValueError:  # date not in plain "YYYY" form
+                     pass
+
+             # Journal
+             journal = ""
+             journal_meta = soup.find("meta", {"name": "citation_journal_title"})
+             if journal_meta:
+                 journal = journal_meta.get("content", "")
+
+             # Keywords
+             keywords = []
+             kw_meta = soup.find("meta", {"name": "citation_keywords"})
+             if kw_meta:
+                 kw_text = kw_meta.get("content", "")
+                 keywords = [k.strip() for k in kw_text.split(",") if k.strip()]
+
+             # PDF URL
+             pdf_url = ""
+             pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
+             if pdf_meta:
+                 pdf_url = pdf_meta.get("content", "")
+
+             # ISSN
+             issn = ""
+             issn_meta = soup.find("meta", {"name": "citation_issn"})
+             if issn_meta:
+                 issn = issn_meta.get("content", "")
+
+             return Paper(
+                 paper_id=paper_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract[:5000] if abstract else "",
+                 doi=doi,
+                 published_date=published_date,
+                 pdf_url=pdf_url or f"{url}/pdf",
+                 url=url,
+                 source="cyberleninka",
+                 categories=[],
+                 keywords=keywords[:10],
+                 citations=0,
+                 references=[],
+                 extra={
+                     "journal": journal,
+                     "issn": issn,
+                     "cyberleninka_id": paper_id
+                 }
+             )
+
+         except Exception as e:
+             logger.error(f"Error parsing CyberLeninka page: {e}")
+             return None
+
+     def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """Download PDF from CyberLeninka.
+
+         Args:
+             paper_id: CyberLeninka article slug
+             save_path: Directory to save
+
+         Returns:
+             Path to PDF or error message
+         """
+         try:
+             os.makedirs(save_path, exist_ok=True)
+
+             self._rate_limit()
+
+             # PDF URL format
+             pdf_url = f"{self.BASE_URL}/article/n/{paper_id}/pdf"
+
+             response = requests.get(pdf_url, timeout=60, impersonate=self.impersonate)
+
+             if response.status_code != 200:
+                 return f"PDF download failed for {paper_id}"
+
+             content_type = response.headers.get("Content-Type", "")
+
+             if "pdf" in content_type.lower():
+                 filename = f"cyberleninka_{paper_id}.pdf"
+                 file_path = os.path.join(save_path, filename)
+
+                 with open(file_path, 'wb') as f:
+                     f.write(response.content)
+
+                 return file_path
+
+             return f"PDF not available for {paper_id}"
+
+         except Exception as e:
+             logger.error(f"Error downloading CyberLeninka PDF: {e}")
+             return f"Failed to download PDF: {e}"
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """Read and extract text from a CyberLeninka paper PDF.
+
+         Args:
+             paper_id: CyberLeninka article slug
+             save_path: Directory for PDF storage
+
+         Returns:
+             Extracted text or error message
+         """
+         pdf_path = self.download_pdf(paper_id, save_path)
+         if not os.path.exists(pdf_path):
+             return pdf_path  # Return error message
+
+         text = extract_text_from_pdf(pdf_path)
+         return text if text else "Failed to extract text from PDF"
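For orientation, here is a minimal usage sketch of the CyberLeninkaSearcher defined above. It is not part of the wheel: the import path is an assumption about the package layout, and only methods and parameters shown in the diff are used.

# Hypothetical usage sketch; the import path below is assumed, not confirmed by the wheel.
from academic_search_mcp.sources.cyberleninka import CyberLeninkaSearcher

searcher = CyberLeninkaSearcher()

# Search RSCI-indexed economics articles published from 2020 onwards.
papers = searcher.search(
    "цифровая экономика",
    max_results=5,
    date_from="2020-01-01",
    catalog="rsci",
    category="economics",
)
for paper in papers:
    print(paper.title, paper.url)

# The search-result paper_id is the article slug, so it can be fed back into
# get_paper_by_id() for DOI/ISSN metadata or read_paper() for the PDF text.
if papers:
    detailed = searcher.get_paper_by_id(papers[0].paper_id)
    text = searcher.read_paper(papers[0].paper_id, save_path="./downloads")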
@@ -0,0 +1,249 @@
+ from typing import List, Optional
+ from datetime import datetime
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ import random
+ from ..paper import Paper
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class PaperSource:
+     """Abstract base class for paper sources"""
+     def search(self, query: str, **kwargs) -> List[Paper]:
+         raise NotImplementedError
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+     def read_paper(self, paper_id: str, save_path: str) -> str:
+         raise NotImplementedError
+
+
+ class GoogleScholarSearcher(PaperSource):
+     """Custom implementation of Google Scholar paper search"""
+
+     SCHOLAR_URL = "https://scholar.google.com/scholar"
+     BROWSERS = [
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
+     ]
+
+     def __init__(self):
+         self._setup_session()
+
+     def _setup_session(self):
+         """Initialize session with random user agent"""
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': random.choice(self.BROWSERS),
+             'Accept': 'text/html,application/xhtml+xml',
+             'Accept-Language': 'en-US,en;q=0.9'
+         })
+
+     def _extract_year(self, text: str) -> Optional[int]:
+         """Extract year from publication info"""
+         for word in text.split():
+             if word.isdigit() and 1900 <= int(word) <= datetime.now().year:
+                 return int(word)
+         return None
+
+     def _extract_cluster_id(self, item) -> Optional[str]:
+         """Extract Google Scholar cluster ID from result item"""
+         import re
+         # Look for cluster/cites ID in the links (gs_fl div contains "Cited by", "All versions", etc.)
+         links_div = item.find('div', class_='gs_fl')
+         if links_div:
+             for a in links_div.find_all('a', href=True):
+                 href = a['href']
+                 # Match cluster=ID or cites=ID
+                 match = re.search(r'(?:cluster|cites)=(\d+)', href)
+                 if match:
+                     return match.group(1)
+
+         # Also check data-cid attribute on the result container
+         if item.get('data-cid'):
+             return item['data-cid']
+
+         return None
+
+     def _extract_citations(self, item) -> int:
+         """Extract citation count from result item"""
+         import re
+         links_div = item.find('div', class_='gs_fl')
+         if links_div:
+             for a in links_div.find_all('a'):
+                 text = a.get_text()
+                 if 'Cited by' in text:
+                     match = re.search(r'Cited by (\d+)', text)
+                     if match:
+                         return int(match.group(1))
+         return 0
+
+     def _parse_paper(self, item) -> Optional[Paper]:
+         """Parse single paper entry from HTML"""
+         try:
+             # Extract main paper elements
+             title_elem = item.find('h3', class_='gs_rt')
+             info_elem = item.find('div', class_='gs_a')
+             abstract_elem = item.find('div', class_='gs_rs')
+
+             if not title_elem or not info_elem:
+                 return None
+
+             # Process title and URL
+             title = title_elem.get_text(strip=True).replace('[PDF]', '').replace('[HTML]', '')
+             link = title_elem.find('a', href=True)
+             url = link['href'] if link else ''
+
+             # Extract cluster ID (Google Scholar's unique paper identifier)
+             cluster_id = self._extract_cluster_id(item)
+
+             # Fallback to URL hash if no cluster ID found
+             paper_id = cluster_id if cluster_id else f"gs_{abs(hash(url))}"
+
+             # Note: DOI not available in GS search results (would require extra requests)
+
+             # Process author info
+             info_text = info_elem.get_text()
+             authors = [a.strip() for a in info_text.split('-')[0].split(',')]
+             year = self._extract_year(info_text)
+
+             # Extract citation count
+             citations = self._extract_citations(item)
+
+             # Create paper object
+             return Paper(
+                 paper_id=paper_id,
+                 title=title,
+                 authors=authors,
+                 abstract=abstract_elem.get_text() if abstract_elem else "",
+                 url=url,
+                 pdf_url="",
+                 published_date=datetime(year, 1, 1) if year else None,
+                 updated_date=None,
+                 source="google_scholar",
+                 categories=[],
+                 keywords=[],
+                 doi="",  # Not available in GS search results
+                 citations=citations
+             )
+         except Exception as e:
+             logger.warning(f"Failed to parse paper: {e}")
+             return None
+
+     def search(self, query: str, max_results: int = 10, date_from: Optional[str] = None, date_to: Optional[str] = None) -> List[Paper]:
+         """
+         Search Google Scholar with custom parameters
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of papers to return
+             date_from: Start date in YYYY-MM-DD format (only year is used)
+             date_to: End date in YYYY-MM-DD format (only year is used)
+         """
+         papers = []
+         start = 0
+         results_per_page = min(10, max_results)
+
+         while len(papers) < max_results:
+             try:
+                 # Construct search parameters
+                 params = {
+                     'q': query,
+                     'start': start,
+                     'hl': 'en',
+                     'as_sdt': '0,5'  # Include articles and citations
+                 }
+
+                 # Add year filters if provided (extract year from YYYY-MM-DD format)
+                 if date_from:
+                     try:
+                         year_from = int(date_from.split('-')[0])
+                         params['as_ylo'] = year_from
+                     except (ValueError, IndexError):
+                         logger.warning(f"Invalid date_from format: {date_from}")
+
+                 if date_to:
+                     try:
+                         year_to = int(date_to.split('-')[0])
+                         params['as_yhi'] = year_to
+                     except (ValueError, IndexError):
+                         logger.warning(f"Invalid date_to format: {date_to}")
+
+                 # Make request with random delay
+                 time.sleep(random.uniform(1.0, 3.0))
+                 response = self.session.get(self.SCHOLAR_URL, params=params)
+
+                 if response.status_code != 200:
+                     logger.error(f"Search failed with status {response.status_code}")
+                     break
+
+                 # Parse results
+                 soup = BeautifulSoup(response.text, 'html.parser')
+                 results = soup.find_all('div', class_='gs_ri')
+
+                 if not results:
+                     break
+
+                 # Process each result
+                 for item in results:
+                     if len(papers) >= max_results:
+                         break
+
+                     paper = self._parse_paper(item)
+                     if paper:
+                         papers.append(paper)
+
+                 start += results_per_page
+
+             except Exception as e:
+                 logger.error(f"Search error: {e}")
+                 break
+
+         return papers[:max_results]
+
+     def download_pdf(self, paper_id: str, save_path: str) -> str:
+         """
+         Google Scholar doesn't support direct PDF downloads
+
+         Raises:
+             NotImplementedError: Always raises this error
+         """
+         raise NotImplementedError(
+             "Google Scholar doesn't provide direct PDF downloads. "
+             "Please use the paper URL to access the publisher's website."
+         )
+
+     def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
+         """
+         Google Scholar doesn't support direct paper reading
+
+         Returns:
+             str: Message indicating the feature is not supported
+         """
+         return (
+             "Google Scholar doesn't support direct paper reading. "
+             "Please use the paper URL to access the full text on the publisher's website."
+         )
+
+ if __name__ == "__main__":
+     # Test Google Scholar searcher
+     searcher = GoogleScholarSearcher()
+
+     print("Testing search functionality...")
+     query = "machine learning"
+     max_results = 5
+
+     try:
+         papers = searcher.search(query, max_results=max_results)
+         print(f"\nFound {len(papers)} papers for query '{query}':")
+         for i, paper in enumerate(papers, 1):
+             print(f"\n{i}. {paper.title}")
+             print(f" Authors: {', '.join(paper.authors)}")
+             print(f" Citations: {paper.citations}")
+             print(f" URL: {paper.url}")
+     except Exception as e:
+         print(f"Error during search: {e}")
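As a complement to the built-in __main__ test above, the sketch below illustrates how the two searchers might be driven through the shared PaperSource interface. The import paths and the combined_search helper are illustrative assumptions, not APIs shipped by the package.

# Illustrative sketch only: import paths are assumed, and this aggregation
# helper is not part of the released wheel.
from academic_search_mcp.sources.google_scholar import GoogleScholarSearcher
from academic_search_mcp.sources.cyberleninka import CyberLeninkaSearcher

def combined_search(query: str, max_results: int = 10):
    """Query both sources and drop near-duplicate titles."""
    seen = set()
    merged = []
    for searcher in (GoogleScholarSearcher(), CyberLeninkaSearcher()):
        for paper in searcher.search(query, max_results=max_results):
            key = paper.title.strip().lower()
            if key and key not in seen:
                seen.add(key)
                merged.append(paper)
    return merged[:max_results]

results = combined_search("digital sovereignty", max_results=10)
for paper in results:
    print(f"[{paper.source}] {paper.title}")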