local-deep-research 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. local_deep_research/config.py +8 -8
  2. local_deep_research/defaults/search_engines.toml +39 -18
  3. local_deep_research/search_system.py +16 -10
  4. local_deep_research/utilties/enums.py +4 -4
  5. local_deep_research/web/app.py +6 -21
  6. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
  7. local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
  8. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
  9. local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
  10. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
  11. local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
  12. local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
  13. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
  14. local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
  15. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
  16. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
  17. local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
  18. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
  19. local_deep_research/web_search_engines/search_engine_base.py +12 -4
  20. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/METADATA +1 -1
  21. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/RECORD +25 -25
  22. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
  23. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/WHEEL +0 -0
  24. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/entry_points.txt +0 -0
  25. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/licenses/LICENSE +0 -0
  26. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/top_level.txt +0 -0
local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py (new file)
@@ -0,0 +1,1128 @@
+ import requests
+ import logging
+ import json
+ from typing import Dict, List, Any, Optional, Tuple, Union
+ from langchain_core.language_models import BaseLLM
+ import time
+ import re
+ from datetime import datetime
+ from requests.adapters import HTTPAdapter
+ from urllib3.util import Retry
+
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
+ from local_deep_research import config
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class SemanticScholarSearchEngine(BaseSearchEngine):
+     """
+     Semantic Scholar search engine implementation with two-phase approach.
+     Provides efficient access to scientific literature across all fields.
+     """
+
+     def __init__(self,
+                  max_results: int = 10,
+                  api_key: Optional[str] = None,
+                  year_range: Optional[Tuple[int, int]] = None,
+                  get_abstracts: bool = True,
+                  get_references: bool = False,
+                  get_citations: bool = False,
+                  get_embeddings: bool = False,
+                  get_tldr: bool = True,
+                  citation_limit: int = 10,
+                  reference_limit: int = 10,
+                  llm: Optional[BaseLLM] = None,
+                  max_filtered_results: Optional[int] = None,
+                  optimize_queries: bool = True,
+                  max_retries: int = 5,
+                  retry_backoff_factor: float = 1.0,
+                  fields_of_study: Optional[List[str]] = None,
+                  publication_types: Optional[List[str]] = None):
+         """
+         Initialize the Semantic Scholar search engine.
+
+         Args:
+             max_results: Maximum number of search results
+             api_key: Semantic Scholar API key for higher rate limits (optional)
+             year_range: Optional tuple of (start_year, end_year) to filter results
+             get_abstracts: Whether to fetch abstracts for all results
+             get_references: Whether to fetch references for papers
+             get_citations: Whether to fetch citations for papers
+             get_embeddings: Whether to fetch SPECTER embeddings for papers
+             get_tldr: Whether to fetch TLDR summaries for papers
+             citation_limit: Maximum number of citations to fetch per paper
+             reference_limit: Maximum number of references to fetch per paper
+             llm: Language model for relevance filtering
+             max_filtered_results: Maximum number of results to keep after filtering
+             optimize_queries: Whether to optimize natural language queries
+             max_retries: Maximum number of retries for API requests
+             retry_backoff_factor: Backoff factor for retries
+             fields_of_study: List of fields of study to filter results
+             publication_types: List of publication types to filter results
+         """
+         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+         super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+         self.api_key = api_key
+         self.year_range = year_range
+         self.get_abstracts = get_abstracts
+         self.get_references = get_references
+         self.get_citations = get_citations
+         self.get_embeddings = get_embeddings
+         self.get_tldr = get_tldr
+         self.citation_limit = citation_limit
+         self.reference_limit = reference_limit
+         self.optimize_queries = optimize_queries
+         self.max_retries = max_retries
+         self.retry_backoff_factor = retry_backoff_factor
+         self.fields_of_study = fields_of_study
+         self.publication_types = publication_types
+
+         # Base API URLs
+         self.base_url = "https://api.semanticscholar.org/graph/v1"
+         self.paper_search_url = f"{self.base_url}/paper/search"
+         self.paper_bulk_search_url = f"{self.base_url}/paper/search/bulk"
+         self.paper_batch_url = f"{self.base_url}/paper/batch"
+         self.paper_details_url = f"{self.base_url}/paper"
+         self.author_search_url = f"{self.base_url}/author/search"
+         self.author_details_url = f"{self.base_url}/author"
+         self.recommendations_url = "https://api.semanticscholar.org/recommendations/v1/papers"
+         self.datasets_url = "https://api.semanticscholar.org/datasets/v1"
+
+         # Create a session with retry capabilities
+         self.session = self._create_session()
+
+         # Rate limiting
+         self.rate_limit_wait = 1.0  # Default 1 second between requests
+         self.last_request_time = 0
+
+     def _create_session(self) -> requests.Session:
+         """Create and configure a requests session with retry capabilities"""
+         session = requests.Session()
+
+         # Configure automatic retries with exponential backoff
+         retry_strategy = Retry(
+             total=self.max_retries,
+             backoff_factor=self.retry_backoff_factor,
+             status_forcelist=[429, 500, 502, 503, 504],
+             allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
+         )
+
+         adapter = HTTPAdapter(max_retries=retry_strategy)
+         session.mount("https://", adapter)
+
+         # Set up headers
+         headers = {"Accept": "application/json"}
+         if self.api_key:
+             headers["x-api-key"] = self.api_key
+
+         session.headers.update(headers)
+
+         return session
+
+     def _respect_rate_limit(self):
+         """Apply rate limiting between requests"""
+         current_time = time.time()
+         elapsed = current_time - self.last_request_time
+
+         if elapsed < self.rate_limit_wait:
+             wait_time = self.rate_limit_wait - elapsed
+             logger.debug(f"Rate limiting: waiting {wait_time:.2f}s")
+             time.sleep(wait_time)
+
+         self.last_request_time = time.time()
+
+     def _get_headers(self) -> Dict[str, str]:
+         """Get the headers for API requests"""
+         headers = {"Accept": "application/json"}
+
+         if self.api_key:
+             headers["x-api-key"] = self.api_key
+
+         return headers
+
+     def _make_request(self, url: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
+                       method: str = "GET") -> Dict:
+         """
+         Make a request to the Semantic Scholar API.
+
+         Args:
+             url: API endpoint URL
+             params: Query parameters
+             data: JSON data for POST requests
+             method: HTTP method (GET or POST)
+
+         Returns:
+             API response as dictionary
+         """
+         self._respect_rate_limit()
+
+         try:
+             if method.upper() == "GET":
+                 response = self.session.get(url, params=params, timeout=30)
+             elif method.upper() == "POST":
+                 response = self.session.post(url, params=params, json=data, timeout=30)
+             else:
+                 raise ValueError(f"Unsupported HTTP method: {method}")
+
+             # Handle rate limiting manually if retry strategy fails
+             if response.status_code == 429:
+                 logger.warning("Rate limit exceeded, waiting and retrying...")
+                 time.sleep(2.0)  # Wait longer on rate limit
+                 self.rate_limit_wait *= 1.5  # Increase wait time for future requests
+                 return self._make_request(url, params, data, method)  # Retry
+
+             response.raise_for_status()
+             return response.json()
+         except requests.RequestException as e:
+             logger.error(f"API request failed: {e}")
+             return {}
+
+     def _optimize_query(self, query: str) -> str:
+         """
+         Optimize a natural language query for Semantic Scholar search.
+         If LLM is available, uses it to extract key terms and concepts.
+
+         Args:
+             query: Natural language query
+
+         Returns:
+             Optimized query string
+         """
+         if not self.llm or not self.optimize_queries:
+             return query
+
+         try:
+             prompt = f"""Transform this natural language question into an optimized academic search query.
+
+ Original query: "{query}"
+
+ INSTRUCTIONS:
+ 1. Extract key academic concepts, technical terms, and proper nouns
+ 2. Remove generic words, filler words, and non-technical terms
+ 3. Add quotation marks around specific phrases that should be kept together
+ 4. Return ONLY the optimized search query with no explanation
+ 5. Keep it under 100 characters if possible
+
+ EXAMPLE TRANSFORMATIONS:
+ "What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
+ "How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
+ "Tell me about quantum computing approaches for encryption" → "quantum computing encryption"
+
+ Return ONLY the optimized search query with no explanation.
+ """
+
+             response = self.llm.invoke(prompt)
+             optimized_query = response.content.strip()
+
+             # Clean up the query - remove any explanations
+             lines = optimized_query.split('\n')
+             optimized_query = lines[0].strip()
+
+             # Safety check - if query looks too much like an explanation, use original
+             if len(optimized_query.split()) > 15 or ":" in optimized_query:
+                 logger.warning("Query optimization result looks too verbose, using original")
+                 return query
+
+             logger.info(f"Original query: '{query}'")
+             logger.info(f"Optimized for Semantic Scholar: '{optimized_query}'")
+
+             return optimized_query
+         except Exception as e:
+             logger.error(f"Error optimizing query: {e}")
+             return query  # Fall back to original query on error
+
+     def _search_papers(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Search for papers matching the query.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of paper dictionaries
+         """
+         try:
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+             params = {
+                 "query": query,
+                 "limit": min(self.max_results, 100),  # Regular search API can return up to 100 results
+                 "fields": ",".join(fields)
+             }
+
+             # Add year filter if specified
+             if self.year_range:
+                 start_year, end_year = self.year_range
+                 params["year"] = f"{start_year}-{end_year}"
+
+             # Add fields of study filter if specified
+             if self.fields_of_study:
+                 params["fieldsOfStudy"] = ",".join(self.fields_of_study)
+
+             # Add publication types filter if specified
+             if self.publication_types:
+                 params["publicationTypes"] = ",".join(self.publication_types)
+
+             response = self._make_request(self.paper_search_url, params)
+
+             if "data" in response:
+                 papers = response["data"]
+                 logger.info(f"Found {len(papers)} papers matching query: '{query}'")
+                 return papers
+             else:
+                 logger.warning(f"No data in response for query: '{query}'")
+                 return []
+
+         except Exception as e:
+             logger.error(f"Error searching papers: {e}")
+             return []
+
+     def _search_papers_bulk(self, query: str, limit: int = 1000) -> List[Dict[str, Any]]:
+         """
+         Search for papers using the bulk search API, which can return up to 1000 papers.
+
+         Args:
+             query: The search query
+             limit: Maximum number of results (up to 1000)
+
+         Returns:
+             List of paper dictionaries
+         """
+         try:
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors",
+                 "fieldsOfStudy"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+             params = {
+                 "query": query,
+                 "limit": min(limit, 1000),  # Bulk search API can return up to 1000 results
+                 "fields": ",".join(fields)
+             }
+
+             # Add year filter if specified
+             if self.year_range:
+                 start_year, end_year = self.year_range
+                 params["year"] = f"{start_year}-{end_year}"
+
+             # Add fields of study filter if specified
+             if self.fields_of_study:
+                 params["fieldsOfStudy"] = ",".join(self.fields_of_study)
+
+             # Add publication types filter if specified
+             if self.publication_types:
+                 params["publicationTypes"] = ",".join(self.publication_types)
+
+             response = self._make_request(self.paper_bulk_search_url, params)
+
+             if "data" in response:
+                 papers = response["data"]
+                 logger.info(f"Found {len(papers)} papers using bulk search for query: '{query}'")
+                 total_count = response.get("total", 0)
+                 logger.info(f"Total available results: {total_count}")
+
+                 # Handle continuation token for pagination if needed
+                 if "token" in response and len(papers) < min(total_count, limit):
+                     token = response["token"]
+                     logger.info(f"Continuation token available: {token}")
+                     # The caller would need to handle continuation tokens for pagination
+
+                 return papers
+             else:
+                 logger.warning(f"No data in response for bulk query: '{query}'")
+                 return []
+
+         except Exception as e:
+             logger.error(f"Error in bulk paper search: {e}")
+             return []
+
+     def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
+         """
+         Get detailed information about a specific paper.
+
+         Args:
+             paper_id: Semantic Scholar Paper ID
+
+         Returns:
+             Dictionary with paper details
+         """
+         try:
+             # Construct fields parameter
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "corpusId",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors",
+                 "fieldsOfStudy"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+             if self.get_embeddings:
+                 fields.append("embedding")
+
+             # Add citation and reference fields if requested
+             if self.get_citations:
+                 fields.append(f"citations.limit({self.citation_limit})")
+
+             if self.get_references:
+                 fields.append(f"references.limit({self.reference_limit})")
+
+             # Make the request
+             url = f"{self.paper_details_url}/{paper_id}"
+             params = {"fields": ",".join(fields)}
+
+             return self._make_request(url, params)
+
+         except Exception as e:
+             logger.error(f"Error getting paper details for {paper_id}: {e}")
+             return {}
+
+
+     def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
+         """
+         Perform an adaptive search that adjusts based on result volume.
+         Uses LLM to generate better fallback queries when available.
+
+         Args:
+             query: The search query (already optimized)
+
+         Returns:
+             Tuple of (list of paper results, search strategy used)
+         """
+         # Start with a standard search
+         papers = self._search_papers(query)
+         strategy = "standard"
+
+         # If no results, try different variations
+         if not papers:
+             # Try removing quotes to broaden search
+             if '"' in query:
+                 unquoted_query = query.replace('"', '')
+                 logger.info(f"No results with quoted terms, trying without quotes: {unquoted_query}")
+                 papers = self._search_papers(unquoted_query)
+
+                 if papers:
+                     strategy = "unquoted"
+                     return papers, strategy
+
+             # If LLM is available, use it to generate better fallback queries
+             if self.llm:
+                 try:
+                     # Generate alternate search queries focusing on core concepts
+                     prompt = f"""You are helping refine a search query for academic papers related to cancer research that returned no results.
+
+ Original query: "{query}"
+
+ The query might be too specific, contain future dates, or use natural language phrasing that doesn't match academic paper keywords.
+
+ Please provide THREE alternative search queries that:
+ 1. Focus on the core academic concepts about cancer treatment, research, or therapies
+ 2. Remove future dates or references to "latest" or "current" (replace with terms like "recent" or "novel")
+ 3. Use precise medical/scientific terminology commonly found in academic papers
+ 4. Break down complex queries into more searchable components
+ 5. Format each as a concise keyword-focused search term (not a natural language question)
+
+ Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
+ """
+                     # Get the LLM's response
+                     response = self.llm.invoke(prompt)
+
+                     # Extract the alternative queries
+                     alt_queries = []
+                     if hasattr(response, 'content'):  # Handle various LLM response formats
+                         content = response.content
+                         alt_queries = [q.strip() for q in content.strip().split('\n') if q.strip()]
+                     elif isinstance(response, str):
+                         alt_queries = [q.strip() for q in response.strip().split('\n') if q.strip()]
+
+                     # Try each alternative query
+                     for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
+                         logger.info(f"Trying LLM-suggested query: {alt_query}")
+                         alt_papers = self._search_papers(alt_query)
+
+                         if alt_papers:
+                             logger.info(f"Found {len(alt_papers)} papers using LLM-suggested query: {alt_query}")
+                             strategy = "llm_alternative"
+                             return alt_papers, strategy
+                 except Exception as e:
+                     logger.error(f"Error using LLM for query refinement: {e}")
+                     # Fall through to simpler strategies
+
+             # Fallback 1: Try extracting important cancer-related terms
+             cancer_terms = ["cancer", "tumor", "oncology", "carcinoma", "sarcoma", "leukemia",
+                             "lymphoma", "metastasis", "therapy", "immunotherapy", "targeted",
+                             "treatment", "drug", "clinical", "trial", "biomarker"]
+
+             words = re.findall(r'\b\w+\b', query.lower())
+             important_terms = [word for word in words if word in cancer_terms or len(word) > 7]
+
+             if important_terms:
+                 important_query = ' '.join(important_terms[:5])  # Limit to 5 terms
+                 logger.info(f"Trying with important cancer terms: {important_query}")
+                 papers = self._search_papers(important_query)
+
+                 if papers:
+                     strategy = "cancer_terms"
+                     return papers, strategy
+
+             # Fallback 2: Try with just specific cancer types or treatment modalities
+             cancer_types = ["breast", "lung", "colorectal", "prostate", "melanoma", "lymphoma",
+                             "leukemia", "myeloma", "sarcoma", "glioblastoma"]
+             treatment_types = ["immunotherapy", "chemotherapy", "radiotherapy", "targeted",
+                                "surgery", "vaccine", "antibody", "CAR-T", "inhibitor"]
+
+             cancer_matches = [word for word in words if word in cancer_types]
+             treatment_matches = [word for word in words if word in treatment_types]
+
+             if cancer_matches and treatment_matches:
+                 specific_query = f"{cancer_matches[0]} {treatment_matches[0]}"
+                 logger.info(f"Trying with specific cancer-treatment pair: {specific_query}")
+                 papers = self._search_papers(specific_query)
+
+                 if papers:
+                     strategy = "specific_pair"
+                     return papers, strategy
+
+             # Fallback 3: Extract the longest word (likely a specific term)
+             longest_word = max(re.findall(r'\w+', query), key=len, default='')
+             if len(longest_word) > 6:
+                 logger.info(f"Trying with primary keyword: {longest_word}")
+                 papers = self._search_papers(longest_word)
+
+                 if papers:
+                     strategy = "primary_keyword"
+                     return papers, strategy
+
+         return papers, strategy
+
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information for Semantic Scholar papers.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         logger.info(f"Getting Semantic Scholar previews for query: {query}")
+
+         # Optimize the query if LLM is available
+         optimized_query = self._optimize_query(query)
+
+         # Perform adaptive search
+         papers, strategy = self._adaptive_search(optimized_query)
+
+         if not papers:
+             logger.warning(f"No Semantic Scholar results found using strategy: {strategy}")
+             return []
+
+         # Format as previews
+         previews = []
+         for paper in papers:
+             try:
+                 # Format authors - ensure we have a valid list with string values
+                 authors = []
+                 if "authors" in paper and paper["authors"]:
+                     authors = [author.get("name", "") for author in paper["authors"] if author and author.get("name")]
+
+                 # Ensure we have valid strings for all fields
+                 paper_id = paper.get("paperId", "")
+                 title = paper.get("title", "")
+                 url = paper.get("url", "")
+
+                 # Handle abstract safely, ensuring we always have a string
+                 abstract = paper.get("abstract")
+                 snippet = ""
+                 if abstract:
+                     snippet = abstract[:250] + "..." if len(abstract) > 250 else abstract
+
+                 venue = paper.get("venue", "")
+                 year = paper.get("year")
+                 external_ids = paper.get("externalIds", {})
+
+                 # Handle TLDR safely
+                 tldr_text = ""
+                 if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
+                     tldr_text = paper.get("tldr", {}).get("text", "")
+
+                 # Create preview with basic information, ensuring no None values
+                 preview = {
+                     "id": paper_id if paper_id else "",
+                     "title": title if title else "",
+                     "link": url if url else "",
+                     "snippet": snippet,  # Already handled above
+                     "authors": authors,  # List of strings, safe to use directly
+                     "venue": venue if venue else "",
+                     "year": year,  # Can be None, handled in downstream processing
+                     "external_ids": external_ids if external_ids else {},
+                     "source": "Semantic Scholar",
+                     "_paper_id": paper_id if paper_id else "",
+                     "_search_strategy": strategy,
+                     "tldr": tldr_text
+                 }
+
+                 # Store the full paper object for later reference
+                 preview["_full_paper"] = paper
+
+                 previews.append(preview)
+             except Exception as e:
+                 logger.error(f"Error processing paper preview: {e}")
+                 # Continue with the next paper
+
+         logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
+         return previews
+
+     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant Semantic Scholar papers.
+         Gets additional details like citations, references, and full metadata.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should add full content
+         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+             logger.info("Snippet-only mode, skipping full content retrieval")
+             return relevant_items
+
+         logger.info(f"Getting content for {len(relevant_items)} Semantic Scholar papers")
+
+         results = []
+         for item in relevant_items:
+             result = item.copy()
+             paper_id = item.get("_paper_id", "")
+
+             # Skip if no paper ID
+             if not paper_id:
+                 results.append(result)
+                 continue
+
+             # Get paper details if citations or references are requested
+             if self.get_citations or self.get_references or self.get_embeddings:
+                 paper_details = self._get_paper_details(paper_id)
+
+                 if paper_details:
+                     # Add citation information
+                     if self.get_citations and "citations" in paper_details:
+                         result["citations"] = paper_details["citations"]
+
+                     # Add reference information
+                     if self.get_references and "references" in paper_details:
+                         result["references"] = paper_details["references"]
+
+                     # Add embedding if available
+                     if self.get_embeddings and "embedding" in paper_details:
+                         result["embedding"] = paper_details["embedding"]
+
+                     # Add fields of study
+                     if "fieldsOfStudy" in paper_details:
+                         result["fields_of_study"] = paper_details["fieldsOfStudy"]
+
+             # Remove temporary fields
+             if "_paper_id" in result:
+                 del result["_paper_id"]
+             if "_search_strategy" in result:
+                 del result["_search_strategy"]
+             if "_full_paper" in result:
+                 del result["_full_paper"]
+
+             results.append(result)
+
+         return results
+
+     def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers by a specific author.
+
+         Args:
+             author_name: Name of the author
+             max_results: Maximum number of results (defaults to self.max_results)
+
+         Returns:
+             List of papers by the author
+         """
+         original_max_results = self.max_results
+
+         try:
+             if max_results:
+                 self.max_results = max_results
+
+             # First search for the author
+             params = {
+                 "query": author_name,
+                 "limit": 5  # Limit to top 5 author matches
+             }
+
+             response = self._make_request(self.author_search_url, params)
+
+             if "data" not in response or not response["data"]:
+                 logger.warning(f"No authors found matching: {author_name}")
+                 return []
+
+             # Use the first (best) author match
+             author = response["data"][0]
+             author_id = author.get("authorId")
+
+             if not author_id:
+                 logger.warning(f"No valid author ID found for: {author_name}")
+                 return []
+
+             # Get the author's papers
+             fields = [
+                 "papers.paperId",
+                 "papers.title",
+                 "papers.abstract",
+                 "papers.venue",
+                 "papers.year",
+                 "papers.authors"
+             ]
+
+             if self.get_tldr:
+                 fields.append("papers.tldr")
+
+             url = f"{self.author_details_url}/{author_id}"
+             author_params = {
+                 "fields": ",".join(fields)
+             }
+
+             author_data = self._make_request(url, author_params)
+
+             if "papers" not in author_data or not author_data["papers"]:
+                 logger.warning(f"No papers found for author: {author_name}")
+                 return []
+
+             # Format as paper results
+             papers = author_data["papers"][:self.max_results]
+
+             # Convert to standard results format
+             results = []
+             for paper in papers:
+                 # Format authors
+                 authors = []
+                 if "authors" in paper and paper["authors"]:
+                     authors = [author.get("name", "") for author in paper["authors"]]
+
+                 result = {
+                     "id": paper.get("paperId", ""),
+                     "title": paper.get("title", ""),
+                     "link": f"https://www.semanticscholar.org/paper/{paper.get('paperId', '')}",
+                     "snippet": paper.get("abstract", "")[:250] + "..." if paper.get("abstract", "") and len(paper.get("abstract", "")) > 250 else paper.get("abstract", ""),
+                     "authors": authors,
+                     "venue": paper.get("venue", ""),
+                     "year": paper.get("year"),
+                     "source": "Semantic Scholar",
+
+                     # Include TLDR if available
+                     "tldr": paper.get("tldr", {}).get("text", "") if paper.get("tldr") else ""
+                 }
+
+                 results.append(result)
+
+             # Add citations and references if needed
+             if self.get_citations or self.get_references:
+                 results = self._get_full_content(results)
+
+             return results
+
+         finally:
+             # Restore original value
+             self.max_results = original_max_results
+
+     def search_by_venue(self, venue_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers in a specific venue.
+
+         Args:
+             venue_name: Name of the venue (conference or journal)
+             max_results: Maximum number of results (defaults to self.max_results)
+
+         Returns:
+             List of papers from the venue
+         """
+         original_max_results = self.max_results
+
+         try:
+             if max_results:
+                 self.max_results = max_results
+
+             # Semantic Scholar doesn't have a dedicated venue search API
+             # So we search for papers with the venue in the query
+             query = f'venue:"{venue_name}"'
+             return self.run(query)
+
+         finally:
+             # Restore original value
+             self.max_results = original_max_results
+
+     def search_by_year(self, query: str, year: int, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers from a specific year matching the query.
+
+         Args:
+             query: The search query
+             year: Publication year
+             max_results: Maximum number of results (defaults to self.max_results)
+
+         Returns:
+             List of papers from the specified year matching the query
+         """
+         original_max_results = self.max_results
+         original_year_range = self.year_range
+
+         try:
+             if max_results:
+                 self.max_results = max_results
+
+             # Set year range for this search
+             self.year_range = (year, year)
+
+             return self.run(query)
+
+         finally:
+             # Restore original values
+             self.max_results = original_max_results
+             self.year_range = original_year_range
+
+     def search_by_field(self, query: str, field_of_study: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Search for papers in a specific field of study.
+
+         Args:
+             query: The search query
+             field_of_study: Field of study (e.g., "Computer Science", "Medicine")
+             max_results: Maximum number of results (defaults to self.max_results)
+
+         Returns:
+             List of papers in the specified field matching the query
+         """
+         original_max_results = self.max_results
+
+         try:
+             if max_results:
+                 self.max_results = max_results
+
+             # Add field of study to query
+             field_query = f'{query} fieldofstudy:"{field_of_study}"'
+             return self.run(field_query)
+
+         finally:
+             # Restore original value
+             self.max_results = original_max_results
+
+     def get_paper_by_id(self, paper_id: str) -> Dict[str, Any]:
+         """
+         Get a specific paper by its Semantic Scholar ID.
+
+         Args:
+             paper_id: Semantic Scholar paper ID
+
+         Returns:
+             Dictionary with paper information
+         """
+         paper_details = self._get_paper_details(paper_id)
+
+         if not paper_details:
+             return {}
+
+         # Format authors
+         authors = []
+         if "authors" in paper_details and paper_details["authors"]:
+             authors = [author.get("name", "") for author in paper_details["authors"]]
+
+         # Create formatted result
+         result = {
+             "id": paper_details.get("paperId", ""),
+             "title": paper_details.get("title", ""),
+             "link": paper_details.get("url", ""),
+             "abstract": paper_details.get("abstract", ""),
+             "authors": authors,
+             "venue": paper_details.get("venue", ""),
+             "year": paper_details.get("year"),
+             "fields_of_study": paper_details.get("fieldsOfStudy", []),
+             "external_ids": paper_details.get("externalIds", {}),
+             "source": "Semantic Scholar",
+
+             # Include TLDR if available
+             "tldr": paper_details.get("tldr", {}).get("text", "") if paper_details.get("tldr") else ""
+         }
+
+         # Add citations and references if requested
+         if self.get_citations and "citations" in paper_details:
+             result["citations"] = paper_details["citations"]
+
+         if self.get_references and "references" in paper_details:
+             result["references"] = paper_details["references"]
+
+         # Add embedding if requested
+         if self.get_embeddings and "embedding" in paper_details:
+             result["embedding"] = paper_details["embedding"]
+
+         return result
+
+     def get_paper_by_doi(self, doi: str) -> Dict[str, Any]:
+         """
+         Get a paper by its DOI.
+
+         Args:
+             doi: Digital Object Identifier
+
+         Returns:
+             Dictionary with paper information
+         """
+         try:
+             # The Semantic Scholar API supports DOI lookup
+             url = f"{self.paper_details_url}/DOI:{doi}"
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors",
+                 "fieldsOfStudy"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+             if self.get_embeddings:
+                 fields.append("embedding")
+
+             # Add citation and reference fields if requested
+             if self.get_citations:
+                 fields.append(f"citations.limit({self.citation_limit})")
+
+             if self.get_references:
+                 fields.append(f"references.limit({self.reference_limit})")
+
+             params = {"fields": ",".join(fields)}
+             paper_details = self._make_request(url, params)
+
+             if not paper_details:
+                 return {}
+
+             # Format the paper info the same way as get_paper_by_id
+             # Format authors
+             authors = []
+             if "authors" in paper_details and paper_details["authors"]:
+                 authors = [author.get("name", "") for author in paper_details["authors"]]
+
+             # Create formatted result
+             result = {
+                 "id": paper_details.get("paperId", ""),
+                 "title": paper_details.get("title", ""),
+                 "link": paper_details.get("url", ""),
+                 "abstract": paper_details.get("abstract", ""),
+                 "authors": authors,
+                 "venue": paper_details.get("venue", ""),
+                 "year": paper_details.get("year"),
+                 "fields_of_study": paper_details.get("fieldsOfStudy", []),
+                 "external_ids": paper_details.get("externalIds", {}),
+                 "source": "Semantic Scholar",
+
+                 # Include TLDR if available
+                 "tldr": paper_details.get("tldr", {}).get("text", "") if paper_details.get("tldr") else ""
+             }
+
+             # Add citations and references if requested
+             if self.get_citations and "citations" in paper_details:
+                 result["citations"] = paper_details["citations"]
+
+             if self.get_references and "references" in paper_details:
+                 result["references"] = paper_details["references"]
+
+             # Add embedding if requested
+             if self.get_embeddings and "embedding" in paper_details:
+                 result["embedding"] = paper_details["embedding"]
+
+             return result
+
+         except Exception as e:
+             logger.error(f"Error getting paper by DOI {doi}: {e}")
+             return {}
+
+     def get_papers_batch(self, paper_ids: List[str], fields: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """
+         Get details for multiple papers in a single batch request.
+
+         Args:
+             paper_ids: List of paper IDs (Semantic Scholar IDs, DOIs, arXiv IDs, etc.)
+             fields: Fields to include in the response
+
+         Returns:
+             List of paper details
+         """
+         if not paper_ids:
+             return []
+
+         if fields is None:
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors",
+                 "referenceCount",
+                 "citationCount"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+         try:
+             # Construct request params
+             params = {
+                 "fields": ",".join(fields)
+             }
+
+             # Make POST request with paper IDs in the body
+             response = self._make_request(
+                 self.paper_batch_url,
+                 params=params,
+                 data={"ids": paper_ids},
+                 method="POST"
+             )
+
+             if isinstance(response, list):
+                 return response
+             else:
+                 logger.warning("Unexpected response format from batch API")
+                 return []
+
+         except Exception as e:
+             logger.error(f"Error in batch paper lookup: {e}")
+             return []
+
+     def get_paper_recommendations(self,
+                                   positive_paper_ids: List[str],
+                                   negative_paper_ids: Optional[List[str]] = None,
+                                   max_results: Optional[int] = None) -> List[Dict[str, Any]]:
+         """
+         Get recommended papers based on positive and negative examples.
+
+         Args:
+             positive_paper_ids: List of paper IDs to use as positive examples
+             negative_paper_ids: Optional list of paper IDs to use as negative examples
+             max_results: Maximum number of recommendations to return
+
+         Returns:
+             List of recommended papers
+         """
+         if not positive_paper_ids:
+             return []
+
+         limit = max_results or self.max_results
+
+         try:
+             # Construct the request payload
+             payload = {
+                 "positivePaperIds": positive_paper_ids
+             }
+
+             if negative_paper_ids:
+                 payload["negativePaperIds"] = negative_paper_ids
+
+             # Define fields to include in the response
+             fields = [
+                 "paperId",
+                 "externalIds",
+                 "url",
+                 "title",
+                 "abstract",
+                 "venue",
+                 "year",
+                 "authors"
+             ]
+
+             if self.get_tldr:
+                 fields.append("tldr")
+
+             # Request parameters
+             params = {
+                 "fields": ",".join(fields),
+                 "limit": limit
+             }
+
+             # Make POST request to recommendations endpoint
+             response = self._make_request(
+                 self.recommendations_url,
+                 params=params,
+                 data=payload,
+                 method="POST"
+             )
+
+             if "recommendedPapers" not in response:
+                 return []
+
+             papers = response["recommendedPapers"]
+
+             # Format as standard results
+             results = []
+             for paper in papers:
+                 # Format authors
+                 authors = []
+                 if "authors" in paper and paper["authors"]:
+                     authors = [author.get("name", "") for author in paper["authors"]]
+
+                 result = {
+                     "id": paper.get("paperId", ""),
+                     "title": paper.get("title", ""),
+                     "link": paper.get("url", ""),
+                     "snippet": paper.get("abstract", "")[:250] + "..." if paper.get("abstract", "") and len(paper.get("abstract", "")) > 250 else paper.get("abstract", ""),
+                     "authors": authors,
+                     "venue": paper.get("venue", ""),
+                     "year": paper.get("year"),
+                     "source": "Semantic Scholar",
+
+                     # Include TLDR if available
+                     "tldr": paper.get("tldr", {}).get("text", "") if paper.get("tldr") else ""
+                 }
+
+                 results.append(result)
+
+             return results
+
+         except Exception as e:
+             logger.error(f"Error getting paper recommendations: {e}")
+             return []
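
For orientation, a minimal usage sketch of the new engine follows (not part of the diff itself). It assumes the module path shown in the file list above and relies on the run() entry point inherited from BaseSearchEngine, which this class itself calls in search_by_venue() and search_by_year(); the query string, author name, and DOI are illustrative placeholders.

# Hypothetical usage sketch; not part of the released wheel contents above.
from local_deep_research.web_search_engines.engines.search_engine_semantic_scholar import (
    SemanticScholarSearchEngine,
)

# An API key is optional and only raises the Semantic Scholar rate limits.
engine = SemanticScholarSearchEngine(
    max_results=5,
    year_range=(2020, 2024),  # optional (start_year, end_year) filter
    get_tldr=True,            # include Semantic Scholar TLDR summaries
)

# run() is inherited from BaseSearchEngine and presumably drives the
# two-phase flow: _get_previews() first, then _get_full_content() for
# the items that survive relevance filtering.
results = engine.run("mRNA vaccine efficacy")
for r in results:
    print(r["title"], r["link"])

# Convenience lookups defined directly on this class:
by_author = engine.search_by_author("Jane Doe", max_results=3)
paper = engine.get_paper_by_doi("10.1234/example-doi")  # placeholder DOI

Note that the convenience methods temporarily override self.max_results (and, for search_by_year, self.year_range) and restore the originals in a finally block, so a one-off limit does not leak into later searches.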