local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. local_deep_research/config.py +8 -8
  2. local_deep_research/defaults/search_engines.toml +39 -18
  3. local_deep_research/search_system.py +15 -9
  4. local_deep_research/utilties/enums.py +4 -4
  5. local_deep_research/web/app.py +3 -2
  6. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
  7. local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
  8. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
  9. local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
  10. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
  11. local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
  12. local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
  13. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
  14. local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
  15. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
  16. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
  17. local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
  18. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
  19. local_deep_research/web_search_engines/search_engine_base.py +12 -4
  20. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
  21. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
  22. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
  23. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
  24. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
  25. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
  26. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
@@ -1,623 +0,0 @@
- from typing import Dict, List, Any, Optional
- from langchain_core.language_models import BaseLLM
- import requests
- import logging
- import re
- import time
- from datetime import datetime
- import xml.etree.ElementTree as ET
- from urllib.parse import quote
-
- from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
- from local_deep_research import config
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- class MedRxivSearchEngine(BaseSearchEngine):
-     """medRxiv search engine implementation with two-phase approach"""
-
-     def __init__(self,
-                  max_results: int = 10,
-                  sort_by: str = "relevance_score",
-                  sort_order: str = "desc",
-                  include_full_text: bool = False,
-                  download_dir: Optional[str] = None,
-                  max_full_text: int = 1,
-                  llm: Optional[BaseLLM] = None,
-                  max_filtered_results: Optional[int] = None,
-                  days_limit: Optional[int] = None,
-                  optimize_queries: bool = True):
-         """
-         Initialize the medRxiv search engine.
-
-         Args:
-             max_results: Maximum number of search results
-             sort_by: Sorting criteria ('relevance_score', 'date', or 'date_posted')
-             sort_order: Sort order ('desc' or 'asc')
-             include_full_text: Whether to include full paper content in results (downloads PDF)
-             download_dir: Directory to download PDFs to (if include_full_text is True)
-             max_full_text: Maximum number of PDFs to download and process (default: 1)
-             llm: Language model for relevance filtering
-             max_filtered_results: Maximum number of results to keep after filtering
-             days_limit: Limit results to papers from the last N days
-         """
-         # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-         self.max_results = max_results
-         self.sort_by = sort_by
-         self.sort_order = sort_order
-         self.include_full_text = include_full_text
-         self.download_dir = download_dir
-         self.max_full_text = max_full_text
-         self.days_limit = days_limit
-         self.optimize_queries = optimize_queries
-
-         # medRxiv API endpoints
-         self.search_base_url = "https://api.biorxiv.org/covid19/{}/{{}}".format(
-             "desc" if sort_order.lower() == "desc" else "asc"
-         )
-         self.medrxiv_api_url = "https://api.biorxiv.org/details/medrxiv/{}/"
-
-         # medRxiv base URL for papers
-         self.medrxiv_base_url = "https://www.medrxiv.org/content/"
-
-     def _optimize_query_for_medrxiv(self, query: str) -> str:
-         """
-         Optimize a natural language query for medRxiv search.
-         Uses LLM to transform questions into effective keyword-based queries.
-
-         Args:
-             query: Natural language query
-
-         Returns:
-             Optimized query string for medRxiv
-         """
-         if not self.llm or not self.optimize_queries:
-             # Return original query if no LLM available or optimization disabled
-             return query
-
-         try:
-             # Prompt for query optimization
-             prompt = f"""Transform this natural language question into an optimized search query for medRxiv (a medical preprint server).
-
- Original query: "{query}"
-
- CRITICAL RULES:
- 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
- 2. Focus on clear medical terminology and keywords
- 3. Keep it concise but comprehensive (typically 2-5 key terms)
- 4. Include specific medical conditions, treatments, or methodologies
- 5. Use Boolean operators (AND, OR) when appropriate
- 6. Include common medical acronyms where relevant (e.g., COVID-19 instead of coronavirus disease)
- 7. Put multi-word phrases in quotes (e.g., "long covid")
- 8. Prioritize specific medical terms over general descriptions
-
- EXAMPLE CONVERSIONS:
- ✓ "what are the neurological effects of long COVID?" → "long covid" AND neurological OR "nervous system"
- ✓ "newest vaccine development for covid variants" → COVID-19 AND vaccine AND variant
- ✗ BAD: "Here's a query to find information about..."
- ✗ BAD: "The most effective search query would be..."
-
- Return ONLY the search query without any explanations.
- """
-
-             # Get response from LLM
-             response = self.llm.invoke(prompt)
-             optimized_query = response.content.strip()
-
-             # Clean up the query - remove any explanations
-             lines = optimized_query.split('\n')
-             for line in lines:
-                 line = line.strip()
-                 if line and not line.lower().startswith(('here', 'i would', 'the best', 'this query')):
-                     optimized_query = line
-                     break
-
-             # Remove any quotes that wrap the entire query
-             if optimized_query.startswith('"') and optimized_query.endswith('"'):
-                 optimized_query = optimized_query[1:-1]
-
-             logger.info(f"Original query: '{query}'")
-             logger.info(f"Optimized for medRxiv: '{optimized_query}'")
-
-             return optimized_query
-
-         except Exception as e:
-             logger.error(f"Error optimizing query: {e}")
-             return query  # Fall back to original query on error
-
-     def _search_medrxiv(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Search medRxiv using their API.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of paper dictionaries
-         """
-         results = []
-         cursor = 0
-         max_per_page = 50  # medRxiv API typically returns 50 results per page
-         total_fetched = 0
-
-         try:
-             # URL encode the query
-             encoded_query = quote(query)
-
-             # Format the URL based on sorting and query
-             url = self.search_base_url.format(encoded_query)
-
-             # Add time restriction if specified
-             if self.days_limit:
-                 # Calculate date range using days_limit
-                 from datetime import datetime, timedelta
-                 end_date = datetime.now()
-                 start_date = end_date - timedelta(days=self.days_limit)
-
-                 # Format dates for the API (YYYY-MM-DD)
-                 start_date_str = start_date.strftime("%Y-%m-%d")
-                 end_date_str = end_date.strftime("%Y-%m-%d")
-
-                 # Add date parameters to URL
-                 url += f"/{start_date_str}/{end_date_str}"
-                 logger.info(f"Using date range filter: {start_date_str} to {end_date_str}")
-
-             while total_fetched < self.max_results:
-                 # Add cursor to URL
-                 page_url = f"{url}/{cursor}"
-
-                 # Make the request
-                 logger.debug(f"Requesting: {page_url}")
-                 response = requests.get(page_url)
-                 if response.status_code != 200:
-                     logger.error(f"Error searching medRxiv: {response.status_code}")
-                     break
-
-                 data = response.json()
-
-                 # Check if we have results
-                 collection = data.get("collection", [])
-                 if not collection:
-                     break
-
-                 # Extract results
-                 for paper in collection:
-                     if paper.get("server") == "medRxiv":  # Ensure we're only getting medRxiv papers
-                         results.append(paper)
-                         total_fetched += 1
-
-                         if total_fetched >= self.max_results:
-                             break
-
-                 # Check if we should continue to next page
-                 if len(collection) < max_per_page or total_fetched >= self.max_results:
-                     break
-
-                 cursor += max_per_page
-                 time.sleep(0.5)  # Be respectful with API requests
-
-             logger.info(f"Found {len(results)} papers from medRxiv for query: {query}")
-             return results
-
-         except Exception as e:
-             logger.error(f"Error searching medRxiv: {e}")
-             return []
-
-     def _get_paper_details(self, doi: str) -> Dict[str, Any]:
-         """
-         Get detailed information about a specific paper using its DOI.
-
-         Args:
-             doi: Digital Object Identifier for the paper
-
-         Returns:
-             Dictionary with paper details
-         """
-         try:
-             # Format the DOI for the API
-             formatted_doi = doi.replace("10.1101/", "")
-
-             # Get paper details from the API
-             url = self.medrxiv_api_url.format(formatted_doi)
-             response = requests.get(url)
-
-             if response.status_code != 200:
-                 logger.error(f"Error getting paper details: {response.status_code}")
-                 return {}
-
-             data = response.json()
-
-             # Extract the paper details
-             collection = data.get("collection", [])
-             if not collection:
-                 return {}
-
-             return collection[0]
-
-         except Exception as e:
-             logger.error(f"Error getting paper details: {e}")
-             return {}
-
-     def _get_full_text_url(self, doi: str) -> Optional[str]:
-         """
-         Get the URL for the full text PDF of a paper.
-
-         Args:
-             doi: Digital Object Identifier for the paper
-
-         Returns:
-             URL to the PDF or None if not available
-         """
-         pdf_url = None
-
-         try:
-             # Format the DOI for the URL
-             formatted_doi = doi.replace("10.1101/", "")
-
-             # Construct the PDF URL
-             # Note: This is a typical pattern for medRxiv PDFs, but may need adjustment
-             pdf_url = f"https://www.medrxiv.org/content/10.1101/{formatted_doi}.full.pdf"
-
-             # Verify the URL is valid (optional)
-             response = requests.head(pdf_url)
-             if response.status_code != 200:
-                 logger.warning(f"PDF not available at {pdf_url}")
-                 return None
-
-             return pdf_url
-
-         except Exception as e:
-             logger.error(f"Error getting PDF URL: {e}")
-             return None
-
-     def _download_pdf(self, pdf_url: str, file_name: str) -> Optional[str]:
-         """
-         Download a PDF from a URL to the specified download directory.
-
-         Args:
-             pdf_url: URL to the PDF
-             file_name: Name to save the file as
-
-         Returns:
-             Path to the downloaded file or None if download failed
-         """
-         if not self.download_dir:
-             return None
-
-         import os
-
-         try:
-             # Create download directory if it doesn't exist
-             os.makedirs(self.download_dir, exist_ok=True)
-
-             # Clean the filename
-             safe_name = re.sub(r'[^\w\-_\.]', '_', file_name)
-             file_path = os.path.join(self.download_dir, safe_name)
-
-             # Download the file
-             response = requests.get(pdf_url, stream=True)
-             response.raise_for_status()
-
-             with open(file_path, 'wb') as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-
-             logger.info(f"Downloaded PDF to {file_path}")
-             return file_path
-
-         except Exception as e:
-             logger.error(f"Error downloading PDF: {e}")
-             return None
-
-     def _extract_text_from_pdf(self, pdf_path: str) -> str:
-         """
-         Extract text from a PDF file.
-
-         Args:
-             pdf_path: Path to the PDF file
-
-         Returns:
-             Extracted text or empty string if extraction failed
-         """
-         text = ""
-
-         try:
-             # First try PyPDF2
-             try:
-                 import PyPDF2
-                 with open(pdf_path, 'rb') as pdf_file:
-                     pdf_reader = PyPDF2.PdfReader(pdf_file)
-                     for page in pdf_reader.pages:
-                         text += page.extract_text() + "\n\n"
-             except (ImportError, Exception) as e1:
-                 # Fall back to pdfplumber
-                 try:
-                     import pdfplumber
-                     with pdfplumber.open(pdf_path) as pdf:
-                         for page in pdf.pages:
-                             text += page.extract_text() + "\n\n"
-                 except (ImportError, Exception) as e2:
-                     logger.error(f"PDF extraction failed with both methods: {e1}, then {e2}")
-                     return ""
-
-             return text
-
-         except Exception as e:
-             logger.error(f"Error extracting text from PDF: {e}")
-             return ""
-
-     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Get preview information for medRxiv papers.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of preview dictionaries
-         """
-         logger.info(f"Getting medRxiv previews for query: {query}")
-
-         # Optimize the query for medRxiv if LLM is available
-         if self.optimize_queries and self.llm:
-             optimized_query = self._optimize_query_for_medrxiv(query)
-
-             # Store original and optimized queries for potential fallback
-             self._original_query = query
-             self._optimized_query = optimized_query
-
-             # Store for simplification if needed
-             self._simplify_query_cache = optimized_query
-
-             # Use the optimized query for adaptive search
-             papers, strategy = self._adaptive_search(optimized_query)
-         else:
-             # Use the original query directly with adaptive search
-             papers, strategy = self._adaptive_search(query)
-
-         # If no results, return empty list
-         if not papers:
-             logger.warning(f"No medRxiv results found using strategy: {strategy}")
-             return []
-
-         # Store the paper objects for later use
-         self._papers = {paper.get("doi"): paper for paper in papers}
-         self._search_strategy = strategy
-
-         # Format results as previews
-         previews = []
-         for paper in papers:
-             # Extract the data
-             doi = paper.get("doi", "")
-             title = paper.get("title", "")
-             abstract = paper.get("abstract", "")
-             authors = paper.get("authors", "")
-             date = paper.get("date", "")
-
-             # Create a preview
-             preview = {
-                 "id": doi,  # Use DOI as ID
-                 "title": title,
-                 "link": f"https://www.medrxiv.org/content/{doi}v1",
-                 "snippet": abstract[:250] + "..." if len(abstract) > 250 else abstract,
-                 "authors": authors.split("; ") if authors else [],
-                 "published": date,
-                 "doi": doi,
-                 "source": "medRxiv",
-                 "_search_strategy": strategy  # Store search strategy for analytics
-             }
-
-             previews.append(preview)
-
-         logger.info(f"Found {len(previews)} medRxiv previews using strategy: {strategy}")
-         return previews
-
-     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Get full content for the relevant medRxiv papers.
-
-         Args:
-             relevant_items: List of relevant preview dictionaries
-
-         Returns:
-             List of result dictionaries with full content
-         """
-         # Check if we should get full content
-         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
-             logger.info("Snippet-only mode, skipping full content retrieval")
-             return relevant_items
-
-         logger.info(f"Getting full content for {len(relevant_items)} medRxiv papers")
-
-         results = []
-         pdf_count = 0  # Track number of PDFs processed
-
-         for item in relevant_items:
-             # Start with the preview data
-             result = item.copy()
-
-             # Get the paper DOI
-             doi = item.get("id") or item.get("doi")
-
-             if not doi:
-                 results.append(result)
-                 continue
-
-             # Try to get the cached paper details
-             paper = None
-             if hasattr(self, '_papers') and doi in self._papers:
-                 paper = self._papers[doi]
-             else:
-                 # Get the paper details from the API
-                 paper = self._get_paper_details(doi)
-
-             if paper:
-                 # Update with more complete information
-                 result.update({
-                     "title": paper.get("title", result.get("title", "")),
-                     "authors": paper.get("authors", "").split("; ") if paper.get("authors") else result.get("authors", []),
-                     "published": paper.get("date", result.get("published", "")),
-                     "abstract": paper.get("abstract", ""),
-                     "doi": paper.get("doi", doi),
-                     "category": paper.get("category", ""),
-                     "journal": "medRxiv",  # It's a preprint server
-                     "version": paper.get("version", "1"),
-                     "type": paper.get("type", "new_result"),
-                 })
-
-                 # Use abstract as content by default
-                 result["content"] = paper.get("abstract", "")
-                 result["full_content"] = paper.get("abstract", "")
-
-             # Add search strategy if available
-             if "_search_strategy" in item:
-                 result["search_strategy"] = item["_search_strategy"]
-                 # Remove temporary field
-                 if "_search_strategy" in result:
-                     del result["_search_strategy"]
-
-             # Download PDF and extract text if requested and within limit
-             if (self.include_full_text and self.download_dir and
-                     pdf_count < self.max_full_text):
-                 try:
-                     # Get the PDF URL
-                     pdf_url = self._get_full_text_url(doi)
-
-                     if pdf_url:
-                         # Download the PDF
-                         pdf_count += 1
-                         safe_name = f"medrxiv_{doi.replace('/', '_')}.pdf"
-                         pdf_path = self._download_pdf(pdf_url, safe_name)
-
-                         if pdf_path:
-                             result["pdf_path"] = pdf_path
-
-                             # Extract text from PDF
-                             pdf_text = self._extract_text_from_pdf(pdf_path)
-
-                             if pdf_text:
-                                 result["content"] = pdf_text
-                                 result["full_content"] = pdf_text
-                                 result["content_type"] = "full_text"
-                             else:
-                                 result["content_type"] = "abstract"
-                 except Exception as e:
-                     logger.error(f"Error processing PDF for {doi}: {e}")
-                     result["content_type"] = "abstract"
-
-             results.append(result)
-
-         return results
-
-     def run(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Execute a search using medRxiv with the two-phase approach.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of search results
-         """
-         logger.info(f"---Execute a search using medRxiv---")
-
-         # Use the implementation from the parent class which handles all phases
-         # _get_previews will handle query optimization and adaptive search
-         results = super().run(query)
-
-         # Clean up temporary variables
-         if hasattr(self, '_papers'):
-             del self._papers
-         if hasattr(self, '_original_query'):
-             del self._original_query
-         if hasattr(self, '_optimized_query'):
-             del self._optimized_query
-         if hasattr(self, '_simplify_query_cache'):
-             del self._simplify_query_cache
-         if hasattr(self, '_search_strategy'):
-             del self._search_strategy
-
-         return results
-
-     def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for papers by a specific author.
-
-         Args:
-             author_name: Name of the author
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of papers by the author
-         """
-         original_max_results = self.max_results
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             # medRxiv API doesn't have direct author search, so we include in query
-             query = f"author:{author_name}"
-             return self.run(query)
-
-         finally:
-             # Restore original value
-             self.max_results = original_max_results
-
-     def search_by_topic(self, topic: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for papers on a specific topic.
-
-         Args:
-             topic: Topic to search for
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of papers on the topic
-         """
-         original_max_results = self.max_results
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             return self.run(topic)
-
-         finally:
-             # Restore original value
-             self.max_results = original_max_results
-
-     def search_recent(self, days: int = 30, topic: Optional[str] = None, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for recent papers, optionally filtered by topic.
-
-         Args:
-             days: Number of days to look back
-             topic: Optional topic filter
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of recent papers
-         """
-         original_max_results = self.max_results
-         original_days_limit = self.days_limit
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             # Set days limit for this search
-             self.days_limit = days
-
-             # If topic is provided, use it as query, otherwise use a broad query
-             query = topic if topic else "covid"  # Default to COVID which will have many papers
-             return self.run(query)
-
-         finally:
-             # Restore original values
-             self.max_results = original_max_results
-             self.days_limit = original_days_limit
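
The hunk above is the entire medRxiv engine removed in this release. For reference, a minimal sketch of how the class was driven on 0.1.13 (the import path below no longer resolves after this diff); the constructor values and query string are arbitrary examples, and no llm is passed, so the LLM-dependent query optimization and relevance filtering paths are skipped:

    from local_deep_research.web_search_engines.engines.search_engine_medrxiv import (
        MedRxivSearchEngine,
    )

    # No llm supplied: run() searches with the raw query and returns the
    # result dictionaries built in _get_previews / _get_full_content.
    engine = MedRxivSearchEngine(
        max_results=5,   # cap on papers collected from the API
        days_limit=90,   # only papers posted in the last 90 days
    )
    for paper in engine.run("long covid neurological symptoms"):
        print(paper["title"], paper["link"])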
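The engine's detail lookups went through the public bioRxiv/medRxiv details endpoint. Below is a standalone sketch of the request that _get_paper_details issued, mirroring the removed code's behavior of stripping the "10.1101/" prefix before substituting the DOI into medrxiv_api_url; the DOI is a hypothetical placeholder, and the timeout is an addition the original did not set:

    import requests

    doi = "10.1101/2024.01.01.24300001"  # hypothetical medRxiv DOI
    url = "https://api.biorxiv.org/details/medrxiv/{}/".format(doi.replace("10.1101/", ""))
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    collection = response.json().get("collection", [])
    paper = collection[0] if collection else {}
    print(paper.get("title", "no result"))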
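Full-text support rested on a two-library fallback: try PyPDF2 first, and if it is missing or fails, try pdfplumber. The same pattern from _extract_text_from_pdf, condensed into a standalone helper under the assumption that at least one of the two libraries is installed; pdf_to_text and its argument are illustrative names:

    def pdf_to_text(pdf_path: str) -> str:
        # First attempt: PyPDF2, as the removed engine tried first.
        try:
            from PyPDF2 import PdfReader
            return "\n\n".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)
        except Exception:
            # Fallback: pdfplumber, covering both ImportError and parse failures.
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                return "\n\n".join(page.extract_text() or "" for page in pdf.pages)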