paper-search-cli 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paper_search/__init__.py +3 -0
- paper_search/academic_platforms/__init__.py +0 -0
- paper_search/academic_platforms/acm.py +113 -0
- paper_search/academic_platforms/arxiv.py +157 -0
- paper_search/academic_platforms/base.py +54 -0
- paper_search/academic_platforms/base_search.py +253 -0
- paper_search/academic_platforms/biorxiv.py +144 -0
- paper_search/academic_platforms/chemrxiv.py +183 -0
- paper_search/academic_platforms/citeseerx.py +407 -0
- paper_search/academic_platforms/core.py +470 -0
- paper_search/academic_platforms/crossref.py +354 -0
- paper_search/academic_platforms/dblp.py +387 -0
- paper_search/academic_platforms/doaj.py +476 -0
- paper_search/academic_platforms/europepmc.py +430 -0
- paper_search/academic_platforms/google_scholar.py +233 -0
- paper_search/academic_platforms/hal.py +259 -0
- paper_search/academic_platforms/iacr.py +499 -0
- paper_search/academic_platforms/ieee.py +107 -0
- paper_search/academic_platforms/medrxiv.py +145 -0
- paper_search/academic_platforms/oaipmh.py +467 -0
- paper_search/academic_platforms/openaire.py +718 -0
- paper_search/academic_platforms/openalex.py +188 -0
- paper_search/academic_platforms/pmc.py +413 -0
- paper_search/academic_platforms/pubmed.py +162 -0
- paper_search/academic_platforms/sci_hub.py +178 -0
- paper_search/academic_platforms/semantic.py +531 -0
- paper_search/academic_platforms/ssrn.py +365 -0
- paper_search/academic_platforms/unpaywall.py +227 -0
- paper_search/academic_platforms/zenodo.py +271 -0
- paper_search/cli.py +227 -0
- paper_search/config.py +89 -0
- paper_search/engine.py +341 -0
- paper_search/paper.py +59 -0
- paper_search/utils.py +8 -0
- paper_search_cli-1.0.2.dist-info/METADATA +191 -0
- paper_search_cli-1.0.2.dist-info/RECORD +39 -0
- paper_search_cli-1.0.2.dist-info/WHEEL +4 -0
- paper_search_cli-1.0.2.dist-info/entry_points.txt +2 -0
- paper_search_cli-1.0.2.dist-info/licenses/LICENSE +21 -0
paper_search/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""ACM Digital Library connector — optional, requires API key env.
|
|
2
|
+
|
|
3
|
+
This module is a **skeleton only**. No real ACM DL API requests are made
|
|
4
|
+
unless the ``PAPER_SEARCH_MCP_ACM_API_KEY`` (or legacy ``ACM_API_KEY``)
|
|
5
|
+
environment variable is configured. All methods
|
|
6
|
+
raise :class:`NotImplementedError` with a descriptive message when accessed
|
|
7
|
+
without a valid key so that the rest of the platform continues to work without
|
|
8
|
+
any paid credentials.
|
|
9
|
+
|
|
10
|
+
Enable usage::
|
|
11
|
+
|
|
12
|
+
export PAPER_SEARCH_MCP_ACM_API_KEY=<your_acm_api_key>
|
|
13
|
+
|
|
14
|
+
.. note::
|
|
15
|
+
ACM recently opened a limited metadata API. Check
|
|
16
|
+
https://libraries.acm.org/digital-library/acm-open for Open Access content
|
|
17
|
+
that does NOT require a key. Full-text/PDF download requires ACM membership
|
|
18
|
+
or institutional access.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
from typing import List
|
|
25
|
+
|
|
26
|
+
from .base import PaperSource
|
|
27
|
+
from ..paper import Paper
|
|
28
|
+
from ..config import get_env
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
_NOT_CONFIGURED_MSG = (
|
|
33
|
+
"ACM Digital Library is not configured. Set PAPER_SEARCH_MCP_ACM_API_KEY "
|
|
34
|
+
"(or legacy ACM_API_KEY) environment "
|
|
35
|
+
"variable to enable ACM DL search. "
|
|
36
|
+
"See https://libraries.acm.org/digital-library/acm-open for access options."
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ACMSearcher(PaperSource):
    """Skeleton connector for ACM Digital Library.

    Instantiating this class without an ACM API key logs a warning but does
    NOT raise. No operation is implemented yet: every public operation raises
    :class:`NotImplementedError` — with a "not configured" message when the
    key is missing, and with a "not yet implemented" message otherwise.
    """

    # ACM DL base URL (placeholder — real endpoint TBD once API key is available)
    BASE_URL = "https://dl.acm.org/action/doSearch"

    def __init__(self) -> None:
        # The key is looked up under the name "ACM_API_KEY" via the project's
        # get_env helper. NOTE(review): presumably get_env also resolves the
        # PAPER_SEARCH_MCP_-prefixed variable — confirm against config.get_env.
        self.api_key: str = get_env("ACM_API_KEY", "")
        if not self.api_key:
            logger.warning(
                "ACMSearcher initialised without PAPER_SEARCH_MCP_ACM_API_KEY/ACM_API_KEY. "
                "All calls will raise NotImplementedError until the key is set."
            )

    # ------------------------------------------------------------------
    # Public helpers
    # ------------------------------------------------------------------

    def is_configured(self) -> bool:
        """Return True only when a non-empty ACM API key is available."""
        return bool(self.api_key)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _require_key(self) -> None:
        """Raise NotImplementedError with the shared message if no key is set."""
        if not self.is_configured():
            raise NotImplementedError(_NOT_CONFIGURED_MSG)

    # ------------------------------------------------------------------
    # PaperSource interface
    # ------------------------------------------------------------------

    def search(self, query: str, max_results: int = 10, **kwargs) -> List[Paper]:  # type: ignore[override]
        """Search ACM Digital Library — requires PAPER_SEARCH_MCP_ACM_API_KEY or ACM_API_KEY.

        Args:
            query: Search query string (currently unused).
            max_results: Maximum number of results (currently unused).
            **kwargs: Accepted for interface compatibility (currently unused).

        Raises:
            NotImplementedError: Always. The message says the source is not
                configured when the key is missing, otherwise that the search
                is not yet implemented.
        """
        self._require_key()

        # TODO: implement real ACM DL API call here once key is available
        raise NotImplementedError(
            "ACM DL search is not yet implemented. "
            # Project URL as advertised in the package's own User-Agent string;
            # replaces the unfilled "your-repo" placeholder.
            "Contribute at https://github.com/openags/paper-search-cli."
        )

    def download_pdf(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Download a PDF from ACM DL — requires ACM API key env and institutional access.

        Args:
            paper_id: ACM paper identifier (currently unused).
            save_path: Target directory (currently unused).

        Raises:
            NotImplementedError: Always, until key + download logic are implemented.
        """
        self._require_key()

        raise NotImplementedError(
            "ACM DL PDF download is not yet implemented. "
            "Note: full-text access also requires ACM membership or institutional access."
        )

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read paper content from ACM DL — requires ACM API key env.

        Args:
            paper_id: ACM paper identifier (currently unused).
            save_path: Directory where a PDF would be stored (currently unused).

        Raises:
            NotImplementedError: Always, until download + extraction are implemented.
        """
        self._require_key()

        raise NotImplementedError(
            "ACM DL paper reading is not yet implemented."
        )
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# paper_search/academic_platforms/arxiv.py
|
|
2
|
+
from typing import List
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
import requests
|
|
5
|
+
import feedparser
|
|
6
|
+
import time
|
|
7
|
+
from ..paper import Paper
|
|
8
|
+
from ..utils import extract_doi
|
|
9
|
+
from .base import PaperSource
|
|
10
|
+
from PyPDF2 import PdfReader
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
class ArxivSearcher(PaperSource):
    """Searcher for arXiv papers"""

    BASE_URL = "http://export.arxiv.org/api/query"

    # Retry policy for the search request: transient statuses are retried
    # with a linearly growing pause between attempts.
    _MAX_ATTEMPTS = 3
    _RETRY_STATUSES = (429, 500, 502, 503, 504)
    # Timestamp layout used for <published>/<updated> in the Atom feed.
    _DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'paper-search-cli/1.0.0 (mailto:openags@example.com)',
            'Accept': 'application/atom+xml, application/xml;q=0.9, */*;q=0.8',
        })

    def search(self, query: str, max_results: int = 10, **kwargs) -> List[Paper]:
        """Search arXiv and return the newest matching papers.

        Args:
            query: Free-text query, sent as ``all:<query>``.
            max_results: Maximum number of entries requested from the API.
            **kwargs: Accepted so the signature matches ``PaperSource.search``;
                no extra parameters are used by this source.

        Returns:
            Parsed papers, newest submission first. An empty list is returned
            when the request ultimately fails or yields a non-200 status.
            Entries that cannot be parsed are reported via ``print`` and
            skipped.
        """
        params = {
            'search_query': f'all:{query}',
            'max_results': max_results,
            'sortBy': 'submittedDate',
            'sortOrder': 'descending'
        }
        response = self._fetch_feed(params)
        if response is None or response.status_code != 200:
            return []

        feed = feedparser.parse(response.content)
        papers = []
        for entry in feed.entries:
            # Best-effort: one malformed entry must not discard the others.
            try:
                papers.append(self._paper_from_entry(entry))
            except Exception as e:
                print(f"Error parsing arXiv entry: {e}")
        return papers

    def _fetch_feed(self, params):
        """Request the feed with retries; return the last response or None."""
        response = None
        last_attempt = self._MAX_ATTEMPTS - 1
        for attempt in range(self._MAX_ATTEMPTS):
            try:
                response = self.session.get(self.BASE_URL, params=params, timeout=30)
            except requests.RequestException:
                should_retry = True
            else:
                if response.status_code == 200:
                    return response
                should_retry = response.status_code in self._RETRY_STATUSES
            if not should_retry:
                break
            # Pausing after the final attempt would only delay the failure.
            if attempt < last_attempt:
                time.sleep((attempt + 1) * 1.5)
        return response

    def _paper_from_entry(self, entry) -> Paper:
        """Convert one feedparser entry into a Paper (raises on malformed data)."""
        authors = [author.name for author in entry.authors]
        published = datetime.strptime(entry.published, self._DATE_FORMAT)
        updated = datetime.strptime(entry.updated, self._DATE_FORMAT)
        pdf_url = next(
            (link.href for link in entry.links if link.get('type') == 'application/pdf'),
            ''
        )

        # Prefer a DOI declared in the feed (feedparser exposes <arxiv:doi> as
        # 'arxiv_doi'), then fall back to scanning the summary, id and links.
        doi = (
            entry.get('doi', '')
            or entry.get('arxiv_doi', '')
            or extract_doi(entry.summary)
            or extract_doi(entry.id)
        )
        for link in entry.links:
            if link.get('title') == 'doi':
                doi = doi or extract_doi(link.href)

        return Paper(
            # Everything after "/abs/" keeps the archive prefix of old-style
            # identifiers (e.g. "hep-th/9901001v1"); for new-style identifiers
            # this equals the last path segment.
            paper_id=entry.id.split('/abs/')[-1],
            title=entry.title,
            authors=authors,
            abstract=entry.summary,
            url=entry.id,
            pdf_url=pdf_url,
            published_date=published,
            updated_date=updated,
            source='arxiv',
            categories=[tag.term for tag in entry.tags],
            keywords=[],
            doi=doi
        )

    @staticmethod
    def _pdf_path(paper_id: str, save_path: str) -> str:
        """Return the local file path used for a paper's PDF.

        Slashes in old-style identifiers are replaced so the file always lands
        directly inside ``save_path``.
        """
        return f"{save_path}/{paper_id.replace('/', '_')}.pdf"

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download the PDF of an arXiv paper.

        Args:
            paper_id: arXiv paper ID
            save_path: Directory where the PDF is saved (created if missing)

        Returns:
            str: Path of the written PDF file

        Raises:
            requests.RequestException: On network failure, timeout, or an HTTP
                error status (so an error page is never saved as a PDF).
        """
        pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        # Without a timeout a stalled connection would block forever.
        response = requests.get(pdf_url, timeout=60)
        response.raise_for_status()
        os.makedirs(save_path, exist_ok=True)
        output_file = self._pdf_path(paper_id, save_path)
        with open(output_file, 'wb') as f:
            f.write(response.content)
        return output_file

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read a paper and convert it to text format.

        Args:
            paper_id: arXiv paper ID
            save_path: Directory where the PDF is/will be saved

        Returns:
            str: The extracted text content of the paper, or an empty string
            when the PDF is unavailable (HTTP error) or cannot be parsed.
        """
        # First ensure we have the PDF
        pdf_path = self._pdf_path(paper_id, save_path)
        if not os.path.exists(pdf_path):
            try:
                pdf_path = self.download_pdf(paper_id, save_path)
            except requests.HTTPError as e:
                # Keep the "empty string on unreadable paper" contract for
                # missing/forbidden PDFs; other network errors still propagate.
                print(f"Error reading PDF for paper {paper_id}: {e}")
                return ""

        # Read the PDF
        try:
            reader = PdfReader(pdf_path)
            # extract_text() may yield nothing for image-only pages.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
            return text.strip()
        except Exception as e:
            print(f"Error reading PDF for paper {paper_id}: {e}")
            return ""
|
|
118
|
+
|
|
119
|
+
if __name__ == "__main__":
    # Manual smoke test for ArxivSearcher.
    searcher = ArxivSearcher()

    # Exercise the search functionality.
    print("Testing search functionality...")
    query = "machine learning"
    max_results = 5
    # Pre-bind so the later steps are skipped (instead of failing with a
    # NameError) when the search itself raises.
    papers = []
    try:
        papers = searcher.search(query, max_results=max_results)
        print(f"Found {len(papers)} papers for query '{query}':")
        for i, paper in enumerate(papers, 1):
            print(f"{i}. {paper.title} (ID: {paper.paper_id})")
    except Exception as e:
        print(f"Error during search: {e}")

    # Exercise the PDF download functionality.
    if papers:
        print("\nTesting PDF download functionality...")
        paper_id = papers[0].paper_id
        save_path = "./downloads"  # make sure this directory exists
        try:
            os.makedirs(save_path, exist_ok=True)
            pdf_path = searcher.download_pdf(paper_id, save_path)
            print(f"PDF downloaded successfully: {pdf_path}")
        except Exception as e:
            print(f"Error during PDF download: {e}")

    # Exercise the paper reading functionality.
    if papers:
        print("\nTesting paper reading functionality...")
        paper_id = papers[0].paper_id
        try:
            text_content = searcher.read_paper(paper_id)
            print("\nFirst 500 characters of the paper content:")
            print(text_content[:500] + "...")
            print(f"\nTotal length of extracted text: {len(text_content)} characters")
        except Exception as e:
            print(f"Error during paper reading: {e}")
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Base class for all academic paper source searchers."""
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import List
|
|
4
|
+
from ..paper import Paper
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PaperSource(ABC):
    """Common interface implemented by every academic paper source.

    Only :meth:`search` must be provided by subclasses. The PDF-related
    operations are optional: their defaults raise ``NotImplementedError``
    naming the concrete class.
    """

    @abstractmethod
    def search(self, query: str, **kwargs) -> List[Paper]:
        """Return the papers that match ``query``.

        Args:
            query: Search query string.
            **kwargs: Source-specific parameters (e.g., max_results, year).

        Returns:
            List of Paper objects.
        """

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Fetch the PDF of a paper and store it on disk.

        Args:
            paper_id: Platform-specific paper identifier.
            save_path: Directory to save the downloaded PDF.

        Returns:
            Path to the saved PDF file.

        Raises:
            NotImplementedError: If the source does not support PDF downloads
                (this default implementation always raises).
        """
        source_name = self.__class__.__name__
        raise NotImplementedError(f"{source_name} does not support PDF downloads.")

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Obtain a paper's PDF and return its text content.

        Args:
            paper_id: Platform-specific paper identifier.
            save_path: Directory where the PDF is/will be saved.

        Returns:
            Extracted text content of the paper.

        Raises:
            NotImplementedError: If the source does not support paper reading
                (this default implementation always raises).
        """
        source_name = self.__class__.__name__
        message = f"{source_name} does not support reading paper content."
        raise NotImplementedError(message)
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# paper_search/academic_platforms/base_search.py
|
|
2
|
+
"""Searcher for BASE (Bielefeld Academic Search Engine).
|
|
3
|
+
|
|
4
|
+
BASE is one of the world's most voluminous search engines especially for
|
|
5
|
+
academic open access web resources. It provides OAI-PMH access to metadata
|
|
6
|
+
from thousands of repositories.
|
|
7
|
+
|
|
8
|
+
OAI-PMH Endpoint: https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi
|
|
9
|
+
Documentation: https://www.base-search.net/about/en/about_sources_date.php
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import List, Optional, Dict, Any
|
|
13
|
+
import logging
|
|
14
|
+
from .oaipmh import OAIPMHSearcher
|
|
15
|
+
from ..paper import Paper
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BASESearcher(OAIPMHSearcher):
    """Searcher for BASE (Bielefeld Academic Search Engine)."""

    # Set spec added when the caller asks for open-access / full-text records.
    _OPEN_ACCESS_SET = 'dcterms:accessRights:open'
    # Dublin Core element namespace in ElementTree's "{uri}tag" notation.
    _DC_NS = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self):
        """Initialize BASE searcher with OAI-PMH endpoint."""
        super().__init__(
            base_url="https://api.base-search.net/cgi-bin/BaseHttpSearchInterface.fcgi",
            metadata_prefix="oai_dc"
        )
        # Update User-Agent for BASE
        self.session.headers.update({
            'User-Agent': 'paper-search-cli/1.0.0 (BASE OAI-PMH client; https://github.com/openags/paper-search-cli)'
        })

    def search(self, query: str, max_results: int = 10, **kwargs) -> List[Paper]:
        """Search BASE using OAI-PMH with query filtering.

        Args:
            query: Search query string
            max_results: Maximum number of results to return
            **kwargs: Additional parameters:
                - set: OAI-PMH set specification (e.g., 'pubtype:article')
                - from_date: Harvest from date (YYYY-MM-DD)
                - until_date: Harvest until date (YYYY-MM-DD)
                - language: Filter by language (e.g., 'en', 'de')
                - subject: Filter by subject category
                - has_fulltext: Filter for fulltext availability (True/False)
                - open_access: Filter for open access content (True/False)

        Returns:
            List of Paper objects
        """
        # BASE-specific sets: both flags map to the same open-access set, so
        # add it once, without a stray leading space, and tolerate set=None.
        if kwargs.get('has_fulltext') or kwargs.get('open_access'):
            set_parts = (kwargs.get('set') or '').split()
            if self._OPEN_ACCESS_SET not in set_parts:
                set_parts.append(self._OPEN_ACCESS_SET)
            kwargs['set'] = ' '.join(set_parts)

        # Call parent OAI-PMH search
        papers = super().search(query, max_results, **kwargs)

        # Apply additional BASE-specific filtering
        filtered_papers = []
        for paper in papers:
            if self._filter_paper(paper, kwargs):
                filtered_papers.append(paper)
                if len(filtered_papers) >= max_results:
                    break

        # The slice still matters when max_results <= 0.
        return filtered_papers[:max_results]

    def _filter_paper(self, paper: Paper, filters: Dict[str, Any]) -> bool:
        """Apply BASE-specific filters to paper.

        Args:
            paper: Paper object
            filters: Filter parameters (``language``, ``subject`` and
                ``has_fulltext`` are inspected; other keys are ignored)

        Returns:
            True if paper passes all filters
        """
        # Language filter: exact, case-insensitive match; papers without a
        # recorded language are rejected.
        wanted_language = filters.get('language')
        if wanted_language:
            extra = paper.extra or {}
            paper_lang = (extra.get('language') or '').lower()
            if not paper_lang or paper_lang != wanted_language.lower():
                return False

        # Subject filter: substring match against categories or keywords.
        wanted_subject = filters.get('subject')
        if wanted_subject:
            subject_lower = wanted_subject.lower()
            in_categories = any(subject_lower in cat.lower() for cat in (paper.categories or []))
            in_keywords = any(subject_lower in kw.lower() for kw in (paper.keywords or []))
            if not in_categories and not in_keywords:
                return False

        # Open access filter (already handled in OAI-PMH set)
        # Fulltext filter: require at least some link to the document.
        if filters.get('has_fulltext'):
            if not paper.pdf_url and not paper.url:
                return False

        return True

    def _enrich_paper_from_oai(self, paper: Paper, dc_root):
        """Enrich Paper object with BASE-specific metadata.

        Overrides parent method to extract BASE-specific fields.

        Args:
            paper: Paper object to enrich
            dc_root: Dublin Core XML element
        """
        super()._enrich_paper_from_oai(paper, dc_root)

        # BASE-specific fields
        if not paper.extra:
            paper.extra = {}

        # Extract BASE-specific identifiers (namespaced lookup first, then
        # plain tag names for records without namespace information).
        identifiers = (
            dc_root.findall(f'.//{self._DC_NS}identifier')
            or dc_root.findall('identifier')
        )

        for ident_elem in identifiers:
            if ident_elem.text:
                # NOTE(review): the identifier is stored lowercased, as in the
                # original implementation — confirm that consumers do not need
                # the original casing.
                ident_text = ident_elem.text.lower()
                if 'base-search.net' in ident_text:
                    paper.extra['base_id'] = ident_text
                elif 'urn:nbn:' in ident_text:
                    paper.extra['urn'] = ident_text
                elif 'hdl.handle.net' in ident_text:
                    paper.extra['handle'] = ident_text

        # Extract rights information
        rights_elems = (
            dc_root.findall(f'.//{self._DC_NS}rights')
            or dc_root.findall('rights')
        )
        if rights_elems:
            paper.extra['rights'] = [elem.text for elem in rights_elems if elem.text]

        # Extract source repository
        source_elems = (
            dc_root.findall(f'.//{self._DC_NS}source')
            or dc_root.findall('source')
        )
        if source_elems:
            paper.extra['repository'] = source_elems[0].text if source_elems[0].text else ''

        # Try to extract PDF URL from identifiers
        if not paper.pdf_url:
            for ident_elem in identifiers:
                if ident_elem.text and ident_elem.text.lower().endswith('.pdf'):
                    paper.pdf_url = ident_elem.text
                    break

        # Extract BASE relevance score if available
        # (BASE doesn't provide relevance scores in OAI-PMH, but we might add it from other sources)

    def download_pdf(self, paper_id: str, save_path: str) -> str:
        """Download PDF for a BASE record.

        BASE often provides direct PDF links in metadata.

        Args:
            paper_id: BASE identifier or OAI-PMH identifier
            save_path: Directory to save PDF

        Returns:
            Path to saved PDF file

        Raises:
            ValueError: If the fallback search finds no record for ``paper_id``
            NotImplementedError: If the record has no PDF link
        """
        # Try parent method first (searches for PDF URL); any failure there is
        # logged and the fallback below is attempted instead.
        try:
            return super().download_pdf(paper_id, save_path)
        except Exception as e:
            logger.warning(f"Parent download failed: {e}")

        # Try alternative approach: search for paper and use first PDF link
        papers = self.search(paper_id, max_results=1)
        if not papers:
            raise ValueError(f"BASE record not found: {paper_id}")

        paper = papers[0]
        if paper.pdf_url:
            import os  # local import: this module does not import os at file level

            response = self.session.get(paper.pdf_url, timeout=30)
            response.raise_for_status()
            os.makedirs(save_path, exist_ok=True)

            # Create safe filename
            safe_id = paper_id.replace('/', '_').replace(':', '_')
            filename = f"base_{safe_id}.pdf"
            output_file = os.path.join(save_path, filename)

            with open(output_file, 'wb') as f:
                f.write(response.content)

            logger.info(f"Downloaded PDF to {output_file}")
            return output_file

        raise NotImplementedError(
            f"No PDF available for BASE record: {paper_id}"
        )

    def read_paper(self, paper_id: str, save_path: str = "./downloads") -> str:
        """Read paper text from PDF.

        Args:
            paper_id: Paper identifier
            save_path: Directory where PDF is/will be saved

        Returns:
            Extracted text content

        Raises:
            NotImplementedError: If PDF cannot be read (the underlying error is
                attached as ``__cause__``)
        """
        try:
            return super().read_paper(paper_id, save_path)
        except Exception as e:
            logger.error(f"Error reading BASE paper {paper_id}: {e}")
            raise NotImplementedError(
                f"Cannot read paper from BASE: {e}"
            ) from e
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
if __name__ == "__main__":
    # Manual smoke test for BASESearcher.
    logging.basicConfig(level=logging.INFO)

    base_searcher = BASESearcher()

    print("Testing BASE search...")

    # Candidate queries; only the first one is actually executed below.
    test_queries = [
        "machine learning",
        "artificial intelligence",
        "data science"
    ]

    for query in test_queries[:1]:
        print(f"\nSearching BASE for: '{query}'")
        results = base_searcher.search(query, max_results=3)
        print(f"Found {len(results)} papers")
        for position, paper in enumerate(results, start=1):
            leading_authors = ', '.join(paper.authors[:3])
            pdf_flag = 'Yes' if paper.pdf_url else 'No'
            print(f"{position}. {paper.title}")
            print(f" Authors: {leading_authors}")
            print(f" Source: {paper.source}")
            print(f" PDF: {pdf_flag}")
            print(f" URL: {paper.url}")
            print()
|