foundry-mcp 0.7.0__py3-none-any.whl → 0.8.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry_mcp/cli/__init__.py +0 -13
- foundry_mcp/cli/commands/session.py +1 -8
- foundry_mcp/cli/context.py +39 -0
- foundry_mcp/config.py +381 -7
- foundry_mcp/core/batch_operations.py +1196 -0
- foundry_mcp/core/discovery.py +1 -1
- foundry_mcp/core/llm_config.py +8 -0
- foundry_mcp/core/naming.py +25 -2
- foundry_mcp/core/prometheus.py +0 -13
- foundry_mcp/core/providers/__init__.py +12 -0
- foundry_mcp/core/providers/base.py +39 -0
- foundry_mcp/core/providers/claude.py +45 -1
- foundry_mcp/core/providers/codex.py +64 -3
- foundry_mcp/core/providers/cursor_agent.py +22 -3
- foundry_mcp/core/providers/detectors.py +34 -7
- foundry_mcp/core/providers/gemini.py +63 -1
- foundry_mcp/core/providers/opencode.py +95 -71
- foundry_mcp/core/providers/package-lock.json +4 -4
- foundry_mcp/core/providers/package.json +1 -1
- foundry_mcp/core/providers/validation.py +128 -0
- foundry_mcp/core/research/memory.py +103 -0
- foundry_mcp/core/research/models.py +783 -0
- foundry_mcp/core/research/providers/__init__.py +40 -0
- foundry_mcp/core/research/providers/base.py +242 -0
- foundry_mcp/core/research/providers/google.py +507 -0
- foundry_mcp/core/research/providers/perplexity.py +442 -0
- foundry_mcp/core/research/providers/semantic_scholar.py +544 -0
- foundry_mcp/core/research/providers/tavily.py +383 -0
- foundry_mcp/core/research/workflows/__init__.py +5 -2
- foundry_mcp/core/research/workflows/base.py +106 -12
- foundry_mcp/core/research/workflows/consensus.py +160 -17
- foundry_mcp/core/research/workflows/deep_research.py +4020 -0
- foundry_mcp/core/responses.py +240 -0
- foundry_mcp/core/spec.py +1 -0
- foundry_mcp/core/task.py +141 -12
- foundry_mcp/core/validation.py +6 -1
- foundry_mcp/server.py +0 -52
- foundry_mcp/tools/unified/__init__.py +37 -18
- foundry_mcp/tools/unified/authoring.py +0 -33
- foundry_mcp/tools/unified/environment.py +202 -29
- foundry_mcp/tools/unified/plan.py +20 -1
- foundry_mcp/tools/unified/provider.py +0 -40
- foundry_mcp/tools/unified/research.py +644 -19
- foundry_mcp/tools/unified/review.py +5 -2
- foundry_mcp/tools/unified/review_helpers.py +16 -1
- foundry_mcp/tools/unified/server.py +9 -24
- foundry_mcp/tools/unified/task.py +528 -9
- {foundry_mcp-0.7.0.dist-info → foundry_mcp-0.8.10.dist-info}/METADATA +2 -1
- {foundry_mcp-0.7.0.dist-info → foundry_mcp-0.8.10.dist-info}/RECORD +52 -46
- foundry_mcp/cli/flags.py +0 -266
- foundry_mcp/core/feature_flags.py +0 -592
- {foundry_mcp-0.7.0.dist-info → foundry_mcp-0.8.10.dist-info}/WHEEL +0 -0
- {foundry_mcp-0.7.0.dist-info → foundry_mcp-0.8.10.dist-info}/entry_points.txt +0 -0
- {foundry_mcp-0.7.0.dist-info → foundry_mcp-0.8.10.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,544 @@
|
|
|
1
|
+
"""Semantic Scholar provider for academic paper search.
|
|
2
|
+
|
|
3
|
+
This module implements SemanticScholarProvider, which wraps the Semantic Scholar
|
|
4
|
+
Academic Graph API to provide academic paper search capabilities for the deep
|
|
5
|
+
research workflow.
|
|
6
|
+
|
|
7
|
+
Semantic Scholar API documentation:
|
|
8
|
+
https://api.semanticscholar.org/api-docs/
|
|
9
|
+
|
|
10
|
+
Example usage:
|
|
11
|
+
provider = SemanticScholarProvider(api_key="optional-key")
|
|
12
|
+
sources = await provider.search("transformer architecture", max_results=10)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from typing import Any, Optional
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
|
|
23
|
+
from foundry_mcp.core.research.models import ResearchSource, SourceType
|
|
24
|
+
from foundry_mcp.core.research.providers.base import (
|
|
25
|
+
AuthenticationError,
|
|
26
|
+
RateLimitError,
|
|
27
|
+
SearchProvider,
|
|
28
|
+
SearchProviderError,
|
|
29
|
+
SearchResult,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)

# Semantic Scholar API constants
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Bulk search endpoint: returns up to 1000 papers per request, no relevance scores.
PAPER_SEARCH_ENDPOINT = "/paper/search/bulk"
DEFAULT_TIMEOUT = 30.0  # seconds, per-request httpx timeout
DEFAULT_MAX_RETRIES = 3
DEFAULT_RATE_LIMIT = 1.0  # requests per second

# Fields to request from the API
# See: https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/post_graph_get_papers
DEFAULT_FIELDS = (
    "paperId,title,abstract,authors,citationCount,year,"
    "externalIds,url,openAccessPdf,publicationDate"
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class SemanticScholarProvider(SearchProvider):
    """Semantic Scholar Academic Graph API provider for paper search.

    Wraps the Semantic Scholar API to provide academic paper search capabilities.
    API keys are optional but recommended for higher rate limits.

    Without API key: Shared rate limit among all unauthenticated users
    With API key: 1 request per second guaranteed

    Attributes:
        api_key: Semantic Scholar API key (optional)
        base_url: API base URL (default: https://api.semanticscholar.org/graph/v1)
        timeout: Request timeout in seconds (default: 30.0)
        max_retries: Maximum retry attempts for rate limits (default: 3)

    Example:
        provider = SemanticScholarProvider(api_key="your-key")
        sources = await provider.search(
            "deep learning for NLP",
            max_results=10,
            year="2020-2024",
        )
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = SEMANTIC_SCHOLAR_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
    ):
        """Initialize Semantic Scholar search provider.

        Args:
            api_key: Semantic Scholar API key. If not provided, reads from
                SEMANTIC_SCHOLAR_API_KEY env var. API key is optional but
                recommended for higher rate limits.
            base_url: API base URL (default: https://api.semanticscholar.org/graph/v1)
            timeout: Request timeout in seconds (default: 30.0)
            max_retries: Maximum retry attempts for rate limits (default: 3)
        """
        # Explicit argument wins over the environment variable.
        self._api_key = api_key or os.environ.get("SEMANTIC_SCHOLAR_API_KEY")
        # Normalize so endpoint concatenation never produces a double slash.
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._max_retries = max_retries
        self._rate_limit_value = DEFAULT_RATE_LIMIT

    def get_provider_name(self) -> str:
        """Return the provider identifier.

        Returns:
            "semantic_scholar"
        """
        return "semantic_scholar"

    @property
    def rate_limit(self) -> Optional[float]:
        """Return the rate limit in requests per second.

        Returns:
            1.0 (one request per second)
        """
        return self._rate_limit_value

    async def search(
        self,
        query: str,
        max_results: int = 10,
        **kwargs: Any,
    ) -> list[ResearchSource]:
        """Execute an academic paper search via Semantic Scholar API.

        Args:
            query: The search query string. Supports quoted phrases for exact match.
            max_results: Maximum number of results to return (default: 10, max: 1000)
            **kwargs: Additional Semantic Scholar options:
                - year: Filter by year range (e.g., "2020-2024", "2020-", "-2024")
                - fields_of_study: Filter by fields (e.g., ["Computer Science", "Medicine"])
                - open_access_pdf: Only include papers with free PDFs (bool)
                - min_citation_count: Minimum citation count filter
                - sub_query_id: SubQuery ID for source tracking

        Returns:
            List of ResearchSource objects with source_type='academic'

        Raises:
            AuthenticationError: If API key is invalid
            RateLimitError: If rate limit exceeded after all retries
            SearchProviderError: For other API errors
        """
        # Extract Semantic Scholar-specific options
        year = kwargs.get("year")
        fields_of_study = kwargs.get("fields_of_study")
        open_access_pdf = kwargs.get("open_access_pdf")
        min_citation_count = kwargs.get("min_citation_count")
        sub_query_id = kwargs.get("sub_query_id")

        # Build query parameters
        params: dict[str, Any] = {
            "query": query,
            "limit": min(max_results, 1000),  # API max is 1000
            "fields": DEFAULT_FIELDS,
        }

        if year:
            params["year"] = year
        if fields_of_study:
            params["fieldsOfStudy"] = ",".join(fields_of_study)
        if open_access_pdf:
            params["openAccessPdf"] = ""  # Empty string means filter to only open access
        if min_citation_count:
            params["minCitationCount"] = min_citation_count

        # Execute with retry logic
        response_data = await self._execute_with_retry(params)

        # Parse results
        return self._parse_response(response_data, sub_query_id)

    async def _execute_with_retry(
        self,
        params: dict[str, Any],
    ) -> dict[str, Any]:
        """Execute API request with exponential backoff retry.

        Args:
            params: Query parameters

        Returns:
            Parsed JSON response

        Raises:
            AuthenticationError: If API key is invalid
            RateLimitError: If rate limit exceeded after all retries
            SearchProviderError: For other API errors
        """
        url = f"{self._base_url}{PAPER_SEARCH_ENDPOINT}"
        headers: dict[str, str] = {}

        # Add API key header if available
        if self._api_key:
            headers["x-api-key"] = self._api_key

        last_error: Optional[Exception] = None

        for attempt in range(self._max_retries):
            try:
                # A fresh client per attempt keeps each retry fully isolated.
                async with httpx.AsyncClient(timeout=self._timeout) as client:
                    response = await client.get(url, params=params, headers=headers)

                    # Handle authentication errors (not retryable)
                    if response.status_code == 401:
                        raise AuthenticationError(
                            provider="semantic_scholar",
                            message="Invalid API key",
                        )

                    # Handle forbidden (invalid API key format)
                    if response.status_code == 403:
                        raise AuthenticationError(
                            provider="semantic_scholar",
                            message="Access forbidden - check API key",
                        )

                    # Handle rate limiting (429)
                    if response.status_code == 429:
                        retry_after = self._parse_retry_after(response)
                        if attempt < self._max_retries - 1:
                            # Prefer the server-provided delay; otherwise exponential backoff.
                            wait_time = retry_after or (2**attempt)
                            logger.warning(
                                f"Semantic Scholar rate limit hit, waiting {wait_time}s "
                                f"(attempt {attempt + 1}/{self._max_retries})"
                            )
                            await asyncio.sleep(wait_time)
                            continue
                        raise RateLimitError(
                            provider="semantic_scholar",
                            retry_after=retry_after,
                        )

                    # Handle other errors
                    if response.status_code >= 400:
                        error_msg = self._parse_error_response(response)
                        raise SearchProviderError(
                            provider="semantic_scholar",
                            message=f"API error {response.status_code}: {error_msg}",
                            # 5xx is a server-side fault and may succeed on retry; 4xx will not.
                            retryable=response.status_code >= 500,
                        )

                    return response.json()

            except httpx.TimeoutException as e:
                last_error = e
                if attempt < self._max_retries - 1:
                    wait_time = 2**attempt
                    logger.warning(
                        f"Semantic Scholar request timeout, retrying in {wait_time}s "
                        f"(attempt {attempt + 1}/{self._max_retries})"
                    )
                    await asyncio.sleep(wait_time)
                    continue

            except httpx.RequestError as e:
                last_error = e
                if attempt < self._max_retries - 1:
                    wait_time = 2**attempt
                    logger.warning(
                        f"Semantic Scholar request error: {e}, retrying in {wait_time}s "
                        f"(attempt {attempt + 1}/{self._max_retries})"
                    )
                    await asyncio.sleep(wait_time)
                    continue

            # Our own typed errors must propagate unchanged, not be retried.
            except (AuthenticationError, RateLimitError, SearchProviderError):
                raise

        # All retries exhausted
        raise SearchProviderError(
            provider="semantic_scholar",
            message=f"Request failed after {self._max_retries} attempts",
            retryable=False,
            original_error=last_error,
        )

    def _parse_retry_after(self, response: httpx.Response) -> Optional[float]:
        """Parse Retry-After header from response.

        Args:
            response: HTTP response

        Returns:
            Seconds to wait, or None if not provided

        Note:
            Only the delta-seconds form is handled; an HTTP-date value
            (also allowed by the header spec) falls through to None.
        """
        retry_after = response.headers.get("Retry-After")
        if retry_after:
            try:
                return float(retry_after)
            except ValueError:
                pass
        return None

    def _parse_error_response(self, response: httpx.Response) -> str:
        """Extract error message from Semantic Scholar API error response.

        Args:
            response: HTTP response

        Returns:
            Error message string
        """
        try:
            data = response.json()
            # Semantic Scholar returns {"error": "message"} or {"message": "..."}
            return data.get("error", data.get("message", str(data)))
        except Exception:
            # Non-JSON body: return a bounded slice of raw text instead.
            return response.text[:200] if response.text else "Unknown error"

    def _parse_response(
        self,
        data: dict[str, Any],
        sub_query_id: Optional[str] = None,
    ) -> list[ResearchSource]:
        """Parse Semantic Scholar API response into ResearchSource objects.

        Semantic Scholar response structure:
            {
                "total": 12345,
                "token": "...",  # pagination token
                "data": [
                    {
                        "paperId": "abc123",
                        "title": "...",
                        "abstract": "...",
                        "authors": [{"authorId": "...", "name": "John Doe"}],
                        "citationCount": 42,
                        "year": 2023,
                        "externalIds": {"DOI": "10.1234/...", "ArXiv": "2301.12345"},
                        "url": "https://www.semanticscholar.org/paper/...",
                        "openAccessPdf": {"url": "https://..."},
                        "publicationDate": "2023-01-15"
                    }
                ]
            }

        Args:
            data: Semantic Scholar API response JSON
            sub_query_id: SubQuery ID for source tracking

        Returns:
            List of ResearchSource objects with source_type='academic'
        """
        sources: list[ResearchSource] = []
        papers = data.get("data", [])

        for paper in papers:
            # Extract external IDs (DOI, arXiv, etc.)
            external_ids = self._extract_external_ids(paper.get("externalIds", {}))

            # Format authors as comma-separated names
            authors = self._format_authors(paper.get("authors", []))

            # Extract open access PDF URL if available
            open_access = paper.get("openAccessPdf")
            pdf_url = open_access.get("url") if isinstance(open_access, dict) else None

            # Parse publication date
            pub_date = self._parse_date(paper.get("publicationDate"))

            # Build the primary URL (prefer DOI link if available)
            primary_url = self._get_primary_url(paper, external_ids)

            # Create SearchResult from Semantic Scholar response
            search_result = SearchResult(
                url=primary_url,
                title=paper.get("title", "Untitled"),
                snippet=self._truncate_abstract(paper.get("abstract")),
                content=paper.get("abstract"),  # Full abstract as content
                score=None,  # Semantic Scholar doesn't provide relevance scores in bulk search
                published_date=pub_date,
                source="Semantic Scholar",
                metadata={
                    "paper_id": paper.get("paperId"),
                    "authors": authors,
                    "citation_count": paper.get("citationCount"),
                    "year": paper.get("year"),
                    "doi": external_ids.get("doi"),
                    "arxiv_id": external_ids.get("arxiv"),
                    "pdf_url": pdf_url,
                    "semantic_scholar_url": paper.get("url"),
                    # Remaining IDs (pubmed, dblp, ...) merged in under their own keys.
                    **{k: v for k, v in external_ids.items() if k not in ("doi", "arxiv")},
                },
            )

            # Convert to ResearchSource with ACADEMIC type
            research_source = search_result.to_research_source(
                source_type=SourceType.ACADEMIC,
                sub_query_id=sub_query_id,
            )
            sources.append(research_source)

        return sources

    def _extract_external_ids(
        self,
        external_ids: dict[str, Any],
    ) -> dict[str, str]:
        """Extract and normalize external IDs from Semantic Scholar response.

        Args:
            external_ids: Raw externalIds object from API response

        Returns:
            Dict with normalized keys (doi, arxiv, pubmed, etc.)
        """
        result: dict[str, str] = {}

        # Map common ID types to normalized keys
        id_mapping = {
            "DOI": "doi",
            "ArXiv": "arxiv",
            "PubMed": "pubmed",
            "PubMedCentral": "pmc",
            "MAG": "mag",  # Microsoft Academic Graph
            "CorpusId": "corpus_id",
            "DBLP": "dblp",
            "ACL": "acl",
        }

        for api_key, normalized_key in id_mapping.items():
            # Skip missing or falsy IDs; stringify the rest (CorpusId is an int).
            if api_key in external_ids and external_ids[api_key]:
                result[normalized_key] = str(external_ids[api_key])

        return result

    def _format_authors(self, authors: list[dict[str, Any]]) -> str:
        """Format author list as comma-separated names.

        Args:
            authors: List of author objects from API response

        Returns:
            Comma-separated author names (e.g., "John Doe, Jane Smith")
        """
        if not authors:
            return ""

        names = [a.get("name", "") for a in authors if a.get("name")]

        # Limit to first 5 authors with "et al." if more
        if len(names) > 5:
            return ", ".join(names[:5]) + " et al."

        return ", ".join(names)

    def _get_primary_url(
        self,
        paper: dict[str, Any],
        external_ids: dict[str, str],
    ) -> str:
        """Get the best primary URL for the paper.

        Priority:
            1. DOI link (most stable)
            2. arXiv link (commonly used in ML/AI)
            3. Semantic Scholar URL (always available)

        Args:
            paper: Paper object from API response
            external_ids: Extracted external IDs

        Returns:
            Best available URL for the paper
        """
        # DOI link
        if external_ids.get("doi"):
            return f"https://doi.org/{external_ids['doi']}"

        # arXiv link
        if external_ids.get("arxiv"):
            return f"https://arxiv.org/abs/{external_ids['arxiv']}"

        # Fall back to Semantic Scholar URL
        return paper.get("url", "")

    def _truncate_abstract(
        self,
        abstract: Optional[str],
        max_length: int = 500,
    ) -> Optional[str]:
        """Truncate abstract for snippet field.

        Args:
            abstract: Full abstract text
            max_length: Maximum snippet length

        Returns:
            Truncated abstract or None
        """
        if not abstract:
            return None

        if len(abstract) <= max_length:
            return abstract

        # Truncate at word boundary
        truncated = abstract[:max_length]
        last_space = truncated.rfind(" ")
        # Only back up to the word boundary if it doesn't cost more than 20%
        # of the snippet; otherwise keep the hard cut.
        if last_space > max_length * 0.8:
            truncated = truncated[:last_space]

        return truncated + "..."

    def _parse_date(self, date_str: Optional[str]) -> Optional[datetime]:
        """Parse date string from Semantic Scholar response.

        Args:
            date_str: Date string in YYYY-MM-DD or YYYY format

        Returns:
            Parsed datetime or None

        Note:
            Returns a naive datetime (no tzinfo); year-only input parses
            to January 1 of that year.
        """
        if not date_str:
            return None

        # Try full date format first
        try:
            return datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            pass

        # Try year-only format
        try:
            return datetime.strptime(date_str, "%Y")
        except ValueError:
            pass

        return None

    async def health_check(self) -> bool:
        """Check if Semantic Scholar API is accessible.

        Performs a lightweight search to verify connectivity (and API key if set).

        Returns:
            True if provider is healthy, False otherwise

        Note:
            Issues a real API request, so it consumes rate-limit quota.
        """
        try:
            # Perform minimal search to verify connectivity
            await self.search("test", max_results=1)
            return True
        except AuthenticationError:
            logger.error("Semantic Scholar health check failed: invalid API key")
            return False
        except Exception as e:
            logger.warning(f"Semantic Scholar health check failed: {e}")
            return False