local-deep-research 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,992 @@
1
+ import requests
2
+ import logging
3
+ import xml.etree.ElementTree as ET
4
+ from typing import Dict, List, Any, Optional, Tuple
5
+ from langchain_core.language_models import BaseLLM
6
+ import time
7
+ import re
8
+ from datetime import datetime
9
+
10
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
11
+ from local_deep_research import config
12
+
13
# Setup logging
# NOTE(review): configuring the root logger at import time affects the whole
# host application; libraries conventionally leave configuration to the caller
# and only create their own module logger — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
+ class PubMedSearchEngine(BaseSearchEngine):
18
+ """
19
+ PubMed search engine implementation with two-phase approach and adaptive search.
20
+ Provides efficient access to biomedical literature while minimizing API usage.
21
+ """
22
+
23
def __init__(self,
             max_results: int = 10,
             api_key: Optional[str] = None,
             days_limit: Optional[int] = None,
             get_abstracts: bool = True,
             get_full_text: bool = False,
             full_text_limit: int = 3,
             llm: Optional[BaseLLM] = None,
             max_filtered_results: Optional[int] = None,
             optimize_queries: bool = True):
    """
    Create a PubMed search engine instance.

    Args:
        max_results: Maximum number of search results
        api_key: NCBI API key for higher rate limits (optional)
        days_limit: Limit results to N days (optional)
        get_abstracts: Whether to fetch abstracts for all results
        get_full_text: Whether to fetch full text content (when available in PMC)
        full_text_limit: Max number of full-text articles to retrieve
        llm: Language model for relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries for PubMed
    """
    # Relevance filtering is handled by the base class
    super().__init__(llm=llm, max_filtered_results=max_filtered_results)

    # Search behaviour
    self.max_results = max_results
    self.api_key = api_key
    self.days_limit = days_limit
    self.optimize_queries = optimize_queries

    # Content retrieval behaviour
    self.get_abstracts = get_abstracts
    self.get_full_text = get_full_text
    self.full_text_limit = full_text_limit

    # NCBI E-utilities endpoints
    self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    self.search_url = f"{self.base_url}/esearch.fcgi"
    self.summary_url = f"{self.base_url}/esummary.fcgi"
    self.fetch_url = f"{self.base_url}/efetch.fcgi"
    self.link_url = f"{self.base_url}/elink.fcgi"

    # PMC base URL for full text
    self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
67
+
68
def _get_result_count(self, query: str) -> int:
    """
    Get the total number of results for a query without retrieving the
    results themselves (retmax=0 keeps the response tiny).

    Args:
        query: The search query

    Returns:
        Total number of matching results, or 0 on any error.
    """
    try:
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 0  # Don't need actual results, just the count
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.search_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        count = int(data["esearchresult"]["count"])

        logger.info(f"Query '{query}' has {count} total results in PubMed")
        return count

    except Exception as e:
        logger.error(f"Error getting result count: {e}")
        return 0
105
+
106
+ def _extract_core_terms(self, query: str) -> str:
107
+ """
108
+ Extract core terms from a complex query for volume estimation.
109
+
110
+ Args:
111
+ query: PubMed query string
112
+
113
+ Returns:
114
+ Simplified query with core terms
115
+ """
116
+ # Remove field specifications and operators
117
+ simplified = re.sub(r'\[\w+\]', '', query) # Remove [Field] tags
118
+ simplified = re.sub(r'\b(AND|OR|NOT)\b', '', simplified) # Remove operators
119
+
120
+ # Remove quotes and parentheses
121
+ simplified = simplified.replace('"', '').replace('(', '').replace(')', '')
122
+
123
+ # Split by whitespace and join terms with 4+ chars (likely meaningful)
124
+ terms = [term for term in simplified.split() if len(term) >= 4]
125
+
126
+ # Join with AND to create a basic search
127
+ return " ".join(terms[:5]) # Limit to top 5 terms
128
+
129
+ def _expand_time_window(self, time_filter: str) -> str:
130
+ """
131
+ Expand a time window to get more results.
132
+
133
+ Args:
134
+ time_filter: Current time filter
135
+
136
+ Returns:
137
+ Expanded time filter
138
+ """
139
+ # Parse current time window
140
+ import re
141
+ match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
142
+ if not match:
143
+ return '"last 10 years"[pdat]'
144
+
145
+ amount, unit = int(match.group(1)), match.group(2)
146
+
147
+ # Expand based on current unit
148
+ if unit == "months" or unit == "month":
149
+ if amount < 6:
150
+ return '"last 6 months"[pdat]'
151
+ elif amount < 12:
152
+ return '"last 1 year"[pdat]'
153
+ else:
154
+ return '"last 2 years"[pdat]'
155
+ elif unit == "years" or unit == "year":
156
+ if amount < 2:
157
+ return '"last 2 years"[pdat]'
158
+ elif amount < 5:
159
+ return '"last 5 years"[pdat]'
160
+ else:
161
+ return '"last 10 years"[pdat]'
162
+
163
+ return '"last 10 years"[pdat]'
164
+
165
+ def _optimize_query_for_pubmed(self, query: str) -> str:
166
+ """
167
+ Optimize a natural language query for PubMed search.
168
+ Uses LLM to transform questions into effective keyword-based queries.
169
+
170
+ Args:
171
+ query: Natural language query
172
+
173
+ Returns:
174
+ Optimized query string for PubMed
175
+ """
176
+ if not self.llm or not self.optimize_queries:
177
+ # Return original query if no LLM available or optimization disabled
178
+ return query
179
+
180
+ try:
181
+ # Prompt for query optimization
182
+ prompt = f"""Transform this natural language question into an optimized PubMed search query.
183
+
184
+ Original query: "{query}"
185
+
186
+ CRITICAL RULES:
187
+ 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
188
+ 2. DO NOT wrap the entire query in quotes
189
+ 3. DO NOT include ANY date restrictions or year filters
190
+ 4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
191
+ 5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
192
+ 6. KEEP IT SIMPLE - use 2-3 main concepts maximum
193
+ 7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
194
+ 8. Use wildcards for variations: vaccin*[Title/Abstract]
195
+
196
+ EXAMPLE QUERIES:
197
+ ✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
198
+ ✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
199
+ ✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
200
+ ✗ BAD: "Here's a query to find articles about vaccines..."
201
+
202
+ Return ONLY the search query without any explanations.
203
+ """
204
+
205
+ # Get response from LLM
206
+ response = self.llm.invoke(prompt)
207
+ raw_response = response.content.strip()
208
+
209
+ # Clean up the query - extract only the actual query and remove any explanations
210
+ # First check if there are multiple lines and take the first non-empty line
211
+ lines = raw_response.split('\n')
212
+ cleaned_lines = [line.strip() for line in lines if line.strip()]
213
+
214
+ if cleaned_lines:
215
+ optimized_query = cleaned_lines[0]
216
+
217
+ # Remove any quotes that wrap the entire query
218
+ if optimized_query.startswith('"') and optimized_query.endswith('"'):
219
+ optimized_query = optimized_query[1:-1]
220
+
221
+ # Remove any explanation phrases that might be at the beginning
222
+ explanation_starters = ["here is", "here's", "this query", "the following"]
223
+ for starter in explanation_starters:
224
+ if optimized_query.lower().startswith(starter):
225
+ # Find the actual query part - typically after a colon
226
+ colon_pos = optimized_query.find(':')
227
+ if colon_pos > 0:
228
+ optimized_query = optimized_query[colon_pos + 1:].strip()
229
+
230
+ # Check if the query still seems to contain explanations
231
+ if len(optimized_query) > 200 or "this query will" in optimized_query.lower():
232
+ # It's probably still an explanation - try to extract just the query part
233
+ # Look for common patterns in the explanation like parentheses
234
+ pattern = r'\([^)]+\)\s+AND\s+'
235
+ import re
236
+ matches = re.findall(pattern, optimized_query)
237
+ if matches:
238
+ # Extract just the query syntax parts
239
+ query_parts = []
240
+ for part in re.split(r'\.\s+', optimized_query):
241
+ if '(' in part and ')' in part and ('AND' in part or 'OR' in part):
242
+ query_parts.append(part)
243
+ if query_parts:
244
+ optimized_query = ' '.join(query_parts)
245
+ else:
246
+ # Fall back to original query if cleaning fails
247
+ logger.warning("Failed to extract a clean query from LLM response")
248
+ optimized_query = query
249
+
250
+ # Final safety check - if query looks too much like an explanation, use original
251
+ if len(optimized_query.split()) > 30:
252
+ logger.warning("Query too verbose, falling back to simpler form")
253
+ # Create a simple query from the original
254
+ words = [w for w in query.split() if len(w) > 3 and w.lower() not in ('what', 'are', 'the', 'and', 'for', 'with', 'from', 'have', 'been', 'recent')]
255
+ optimized_query = ' AND '.join(words[:3])
256
+
257
+ # Safety check for invalid or overly complex MeSH terms
258
+ # This helps prevent errors with non-existent or complex MeSH terms
259
+ import re
260
+ mesh_terms = re.findall(r'"[^"]+"[Mesh]', optimized_query)
261
+ known_valid_mesh = ["Vaccines", "COVID-19", "Influenza", "Infectious Disease Medicine",
262
+ "Communicable Diseases", "RNA, Messenger", "Vaccination",
263
+ "Immunization"]
264
+
265
+ # Replace potentially problematic MeSH terms with Title/Abstract searches
266
+ for term in mesh_terms:
267
+ term_name = term.split('"')[1] # Extract term name without quotes and [Mesh]
268
+ if not any(valid in term_name for valid in known_valid_mesh):
269
+ # Replace with Title/Abstract search
270
+ replacement = f"{term_name.lower()}[Title/Abstract]"
271
+ optimized_query = optimized_query.replace(term, replacement)
272
+
273
+ # Simplify the query if still no results are found
274
+ self._simplify_query_cache = optimized_query
275
+
276
+ # Log original and optimized queries
277
+ logger.info(f"Original query: '{query}'")
278
+ logger.info(f"Optimized for PubMed: '{optimized_query}'")
279
+
280
+ return optimized_query
281
+
282
+ except Exception as e:
283
+ logger.error(f"Error optimizing query: {e}")
284
+ return query # Fall back to original query on error
285
+
286
+ def _simplify_query(self, query: str) -> str:
287
+ """
288
+ Simplify a PubMed query that returned no results.
289
+ Progressively removes elements to get a more basic query.
290
+
291
+ Args:
292
+ query: The original query that returned no results
293
+
294
+ Returns:
295
+ Simplified query
296
+ """
297
+ logger.info(f"Simplifying query: {query}")
298
+
299
+ # Attempt different simplification strategies
300
+
301
+ # 1. Remove any MeSH terms and replace with Title/Abstract
302
+ import re
303
+ simplified = re.sub(r'"[^"]+"[Mesh]', lambda m: m.group(0).split('"')[1].lower() + "[Title/Abstract]", query)
304
+
305
+ # 2. If that doesn't work, focus on just mRNA and vaccines - the core concepts
306
+ if simplified == query: # No changes were made
307
+ simplified = "(mRNA[Title/Abstract] OR \"messenger RNA\"[Title/Abstract]) AND vaccin*[Title/Abstract]"
308
+
309
+ logger.info(f"Simplified query: {simplified}")
310
+ return simplified
311
+
312
+ def _is_historical_focused(self, query: str) -> bool:
313
+ """
314
+ Determine if a query is specifically focused on historical/older information using LLM.
315
+ Default assumption is that queries should prioritize recent information unless
316
+ explicitly asking for historical content.
317
+
318
+ Args:
319
+ query: The search query
320
+
321
+ Returns:
322
+ Boolean indicating if the query is focused on historical information
323
+ """
324
+ if not self.llm:
325
+ # Fall back to basic keyword check if no LLM available
326
+ historical_terms = ["history", "historical", "early", "initial", "first", "original",
327
+ "before", "prior to", "origins", "evolution", "development"]
328
+ historical_years = [str(year) for year in range(1900, 2020)]
329
+
330
+ query_lower = query.lower()
331
+ has_historical_term = any(term in query_lower for term in historical_terms)
332
+ has_past_year = any(year in query for year in historical_years)
333
+
334
+ return has_historical_term or has_past_year
335
+
336
+ try:
337
+ # Use LLM to determine if the query is focused on historical information
338
+ prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
339
+
340
+ Query: "{query}"
341
+
342
+ Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
343
+ Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
344
+
345
+ The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
346
+ """
347
+
348
+ response = self.llm.invoke(prompt)
349
+ answer = response.content.strip().lower()
350
+
351
+ # Log the determination
352
+ logger.info(f"Historical focus determination for query: '{query}'")
353
+ logger.info(f"LLM determined historical focus: {answer}")
354
+
355
+ return "yes" in answer
356
+
357
+ except Exception as e:
358
+ logger.error(f"Error determining historical focus: {e}")
359
+ # Fall back to basic keyword check
360
+ historical_terms = ["history", "historical", "early", "initial", "first", "original",
361
+ "before", "prior to", "origins", "evolution", "development"]
362
+ return any(term in query.lower() for term in historical_terms)
363
+
364
+ def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
365
+ """
366
+ Perform an adaptive search that adjusts based on topic volume and whether
367
+ the query focuses on historical information.
368
+
369
+ Args:
370
+ query: The search query (already optimized)
371
+
372
+ Returns:
373
+ Tuple of (list of PMIDs, search strategy used)
374
+ """
375
+ # Estimate topic volume
376
+ estimated_volume = self._get_result_count(query)
377
+
378
+ # Determine if the query is focused on historical information
379
+ is_historical_focused = self._is_historical_focused(query)
380
+
381
+ if is_historical_focused:
382
+ # User wants historical information - no date filtering
383
+ time_filter = None
384
+ strategy = "historical_focus"
385
+ elif estimated_volume > 5000:
386
+ # Very common topic - use tighter recency filter
387
+ time_filter = '"last 1 year"[pdat]'
388
+ strategy = "high_volume"
389
+ elif estimated_volume > 1000:
390
+ # Common topic
391
+ time_filter = '"last 3 years"[pdat]'
392
+ strategy = "common_topic"
393
+ elif estimated_volume > 100:
394
+ # Moderate volume
395
+ time_filter = '"last 5 years"[pdat]'
396
+ strategy = "moderate_volume"
397
+ else:
398
+ # Rare topic - still use recency but with wider range
399
+ time_filter = '"last 10 years"[pdat]'
400
+ strategy = "rare_topic"
401
+
402
+ # Run search based on strategy
403
+ if time_filter:
404
+ # Try with adaptive time filter
405
+ query_with_time = f"({query}) AND {time_filter}"
406
+ logger.info(f"Using adaptive search strategy: {strategy} with filter: {time_filter}")
407
+ results = self._search_pubmed(query_with_time)
408
+
409
+ # If too few results, gradually expand time window
410
+ if len(results) < 5 and not '"last 10 years"[pdat]' in time_filter:
411
+ logger.info(f"Insufficient results ({len(results)}), expanding time window")
412
+ expanded_time = self._expand_time_window(time_filter)
413
+ query_with_expanded_time = f"({query}) AND {expanded_time}"
414
+ expanded_results = self._search_pubmed(query_with_expanded_time)
415
+
416
+ if len(expanded_results) > len(results):
417
+ logger.info(f"Expanded time window yielded {len(expanded_results)} results")
418
+ return expanded_results, f"{strategy}_expanded"
419
+
420
+ # If still no results, try without time filter
421
+ if not results:
422
+ logger.info("No results with time filter, trying without time restrictions")
423
+ results = self._search_pubmed(query)
424
+ strategy = "no_time_filter"
425
+ else:
426
+ # Historical query - run without time filter
427
+ logger.info(f"Using historical search strategy without date filtering")
428
+ results = self._search_pubmed(query)
429
+
430
+ return results, strategy
431
+
432
def _search_pubmed(self, query: str) -> List[str]:
    """
    Search PubMed and return a list of article IDs.

    Args:
        query: The search query

    Returns:
        List of PubMed IDs matching the query (empty on error)
    """
    try:
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": self.max_results,
            "usehistory": "y"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # Add date restriction if specified
        if self.days_limit:
            params["reldate"] = self.days_limit
            params["datetype"] = "pdat"  # Publication date

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.search_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        id_list = data["esearchresult"]["idlist"]

        logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
        return id_list

    except Exception as e:
        logger.error(f"Error searching PubMed: {e}")
        return []
475
+
476
def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
    """
    Get summaries for a list of PubMed article IDs via ESummary.

    Args:
        id_list: List of PubMed IDs

    Returns:
        List of article summary dictionaries (empty on error or empty input)
    """
    if not id_list:
        return []

    try:
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "json",
            "rettype": "summary"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.summary_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        summaries = []

        # Preserve the requested order; skip IDs absent from the response
        for pmid in id_list:
            if pmid in data["result"]:
                article = data["result"][pmid]

                # Extract authors (if available)
                authors = []
                if "authors" in article:
                    authors = [author["name"] for author in article["authors"]]

                summaries.append({
                    "id": pmid,
                    "title": article.get("title", ""),
                    "pubdate": article.get("pubdate", ""),
                    "source": article.get("source", ""),
                    "authors": authors,
                    "journal": article.get("fulljournalname", ""),
                    "doi": article.get("doi", ""),
                    "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
                })

        return summaries

    except Exception as e:
        logger.error(f"Error getting article summaries: {e}")
        return []
538
+
539
def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
    """
    Get abstracts for a list of PubMed article IDs via EFetch (XML).

    BUG FIX: the original implementation seeded the abstract with the
    first <AbstractText> node and then appended EVERY <AbstractText>
    node again in a loop, so the first section always appeared twice.
    The abstract is now assembled from the section loop alone.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their abstracts (articles with
        no abstract text are omitted; empty dict on error).
    """
    if not id_list:
        return {}

    try:
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "abstract"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.fetch_url, params=params, timeout=30)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)
        abstracts = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            # Assemble the abstract from its sections; structured
            # abstracts carry a Label attribute per section.
            abstract_text = ""
            for section in article.findall(".//AbstractText"):
                label = section.get("Label")
                section_text = section.text or ""
                if not section_text:
                    continue
                piece = f"{label}: {section_text}" if label else section_text
                abstract_text = f"{abstract_text}\n\n{piece}" if abstract_text else piece

            if abstract_text:
                abstracts[pmid] = abstract_text

        return abstracts

    except Exception as e:
        logger.error(f"Error getting article abstracts: {e}")
        return {}
615
+
616
def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
    """
    Find PMC IDs for the given PubMed IDs (for full-text access) via ELink.

    Args:
        pmid_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their PMC IDs (if available);
        empty when full-text retrieval is disabled or on error.
    """
    if not pmid_list or not self.get_full_text:
        return {}

    try:
        params = {
            "dbfrom": "pubmed",
            "db": "pmc",
            "linkname": "pubmed_pmc",
            "id": ",".join(pmid_list),
            "retmode": "json"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.link_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()

        # Map PubMed IDs to PMC IDs
        pmid_to_pmcid = {}

        for linkset in data.get("linksets", []):
            pmid = linkset.get("ids", [None])[0]

            if not pmid:
                continue

            for link in linkset.get("linksetdbs", []):
                if link.get("linkname") == "pubmed_pmc":
                    pmcids = link.get("links", [])
                    if pmcids:
                        # Keep the first linked PMC record for this PMID
                        pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"

        logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
        return pmid_to_pmcid

    except Exception as e:
        logger.error(f"Error finding PMC IDs: {e}")
        return {}
671
+
672
def _get_pmc_full_text(self, pmcid: str) -> str:
    """
    Get full text for a PMC article via EFetch and flatten it to a
    Markdown-like string (title heading, abstract, body sections).

    Args:
        pmcid: PMC ID of the article (e.g. "PMC1234567")

    Returns:
        Full text content, or an empty string on error / when unavailable.
    """
    try:
        params = {
            "db": "pmc",
            "id": pmcid,
            "retmode": "xml",
            "rettype": "full"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.fetch_url, params=params, timeout=30)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)
        full_text = []

        # Article title becomes the top-level heading
        title_elem = root.find(".//article-title")
        if title_elem is not None and title_elem.text:
            full_text.append(f"# {title_elem.text}")

        # Abstract paragraphs (itertext flattens inline markup)
        abstract_paras = root.findall(".//abstract//p")
        if abstract_paras:
            full_text.append("\n## Abstract\n")
            for p in abstract_paras:
                text = ''.join(p.itertext())
                if text:
                    full_text.append(text)

        # Body sections with their titles
        body = root.find(".//body")
        if body is not None:
            for section in body.findall(".//sec"):
                title = section.find(".//title")
                if title is not None and title.text:
                    full_text.append(f"\n## {title.text}\n")

                for p in section.findall(".//p"):
                    text = ''.join(p.itertext())
                    if text:
                        full_text.append(text)

        return "\n\n".join(full_text)

    except Exception as e:
        logger.error(f"Error getting PMC full text: {e}")
        return ""
739
+
740
+ def _get_previews(self, query: str) -> List[Dict[str, Any]]:
741
+ """
742
+ Get preview information for PubMed articles.
743
+
744
+ Args:
745
+ query: The search query
746
+
747
+ Returns:
748
+ List of preview dictionaries
749
+ """
750
+ logger.info(f"Getting PubMed previews for query: {query}")
751
+
752
+ # Optimize the query for PubMed if LLM is available
753
+ optimized_query = self._optimize_query_for_pubmed(query)
754
+
755
+ # Perform adaptive search
756
+ pmid_list, strategy = self._adaptive_search(optimized_query)
757
+
758
+ # If no results, try a simplified query
759
+ if not pmid_list:
760
+ logger.warning(f"No PubMed results found using strategy: {strategy}")
761
+ simplified_query = self._simplify_query(optimized_query)
762
+ if simplified_query != optimized_query:
763
+ logger.info(f"Trying with simplified query: {simplified_query}")
764
+ pmid_list, strategy = self._adaptive_search(simplified_query)
765
+ if pmid_list:
766
+ logger.info(f"Simplified query found {len(pmid_list)} results")
767
+
768
+ if not pmid_list:
769
+ logger.warning(f"No PubMed results found after query simplification")
770
+ return []
771
+
772
+ # Get article summaries
773
+ summaries = self._get_article_summaries(pmid_list)
774
+
775
+ # Rate limit compliance (NCBI allows 10 requests per second with an API key, 3 without)
776
+ time.sleep(0.1 if self.api_key else 0.33)
777
+
778
+ # Format as previews
779
+ previews = []
780
+ for summary in summaries:
781
+ # Authors formatting
782
+ authors_text = ", ".join(summary.get("authors", []))
783
+ if len(authors_text) > 100:
784
+ # Truncate long author lists
785
+ authors_text = authors_text[:97] + "..."
786
+
787
+ # Create preview with basic information
788
+ preview = {
789
+ "id": summary["id"],
790
+ "title": summary["title"],
791
+ "link": summary["link"],
792
+ "snippet": f"{authors_text}. {summary.get('journal', '')}. {summary.get('pubdate', '')}",
793
+ "authors": summary.get("authors", []),
794
+ "journal": summary.get("journal", ""),
795
+ "pubdate": summary.get("pubdate", ""),
796
+ "doi": summary.get("doi", ""),
797
+ "source": "PubMed",
798
+ "_pmid": summary["id"], # Store PMID for later use
799
+ "_search_strategy": strategy # Store search strategy for analytics
800
+ }
801
+
802
+ previews.append(preview)
803
+
804
+ logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
805
+ return previews
806
+
807
+ def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
808
+ """
809
+ Get full content for the relevant PubMed articles.
810
+ Efficiently manages which content to retrieve (abstracts and/or full text).
811
+
812
+ Args:
813
+ relevant_items: List of relevant preview dictionaries
814
+
815
+ Returns:
816
+ List of result dictionaries with full content
817
+ """
818
+ # Check if we should add full content
819
+ if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
820
+ logger.info("Snippet-only mode, skipping full content retrieval")
821
+ return relevant_items
822
+
823
+ logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
824
+
825
+ # Collect all PMIDs for relevant items
826
+ pmids = []
827
+ for item in relevant_items:
828
+ if "_pmid" in item:
829
+ pmids.append(item["_pmid"])
830
+
831
+ # Get abstracts if requested and PMIDs exist
832
+ abstracts = {}
833
+ if self.get_abstracts and pmids:
834
+ abstracts = self._get_article_abstracts(pmids)
835
+
836
+ # Find PMC IDs for full-text retrieval (if enabled)
837
+ pmid_to_pmcid = {}
838
+ if self.get_full_text and pmids:
839
+ pmid_to_pmcid = self._find_pmc_ids(pmids)
840
+
841
+ # Add content to results
842
+ results = []
843
+ for item in relevant_items:
844
+ result = item.copy()
845
+ pmid = item.get("_pmid", "")
846
+
847
+ # Add abstract if available
848
+ if pmid in abstracts:
849
+ result["abstract"] = abstracts[pmid]
850
+
851
+ # Use abstract as content if no full text
852
+ if pmid not in pmid_to_pmcid:
853
+ result["full_content"] = abstracts[pmid]
854
+ result["content"] = abstracts[pmid]
855
+ result["content_type"] = "abstract"
856
+
857
+ # Add full text for a limited number of top articles
858
+ if (pmid in pmid_to_pmcid and
859
+ self.get_full_text and
860
+ len([r for r in results if r.get("content_type") == "full_text"]) < self.full_text_limit):
861
+
862
+ # Get full text content
863
+ pmcid = pmid_to_pmcid[pmid]
864
+ full_text = self._get_pmc_full_text(pmcid)
865
+
866
+ if full_text:
867
+ result["full_content"] = full_text
868
+ result["content"] = full_text
869
+ result["content_type"] = "full_text"
870
+ result["pmcid"] = pmcid
871
+ elif pmid in abstracts:
872
+ # Fall back to abstract if full text retrieval fails
873
+ result["full_content"] = abstracts[pmid]
874
+ result["content"] = abstracts[pmid]
875
+ result["content_type"] = "abstract"
876
+
877
+ # Remove temporary fields
878
+ if "_pmid" in result:
879
+ del result["_pmid"]
880
+ if "_search_strategy" in result:
881
+ del result["_search_strategy"]
882
+
883
+ results.append(result)
884
+
885
+ return results
886
+
887
+ def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
888
+ """
889
+ Search for articles by a specific author.
890
+
891
+ Args:
892
+ author_name: Name of the author
893
+ max_results: Maximum number of results (defaults to self.max_results)
894
+
895
+ Returns:
896
+ List of articles by the author
897
+ """
898
+ original_max_results = self.max_results
899
+
900
+ try:
901
+ if max_results:
902
+ self.max_results = max_results
903
+
904
+ query = f"{author_name}[Author]"
905
+ return self.run(query)
906
+
907
+ finally:
908
+ # Restore original value
909
+ self.max_results = original_max_results
910
+
911
+ def search_by_journal(self, journal_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
912
+ """
913
+ Search for articles in a specific journal.
914
+
915
+ Args:
916
+ journal_name: Name of the journal
917
+ max_results: Maximum number of results (defaults to self.max_results)
918
+
919
+ Returns:
920
+ List of articles from the journal
921
+ """
922
+ original_max_results = self.max_results
923
+
924
+ try:
925
+ if max_results:
926
+ self.max_results = max_results
927
+
928
+ query = f"{journal_name}[Journal]"
929
+ return self.run(query)
930
+
931
+ finally:
932
+ # Restore original value
933
+ self.max_results = original_max_results
934
+
935
+ def search_recent(self, query: str, days: int = 30, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
936
+ """
937
+ Search for recent articles matching the query.
938
+
939
+ Args:
940
+ query: The search query
941
+ days: Number of days to look back
942
+ max_results: Maximum number of results (defaults to self.max_results)
943
+
944
+ Returns:
945
+ List of recent articles matching the query
946
+ """
947
+ original_max_results = self.max_results
948
+ original_days_limit = self.days_limit
949
+
950
+ try:
951
+ if max_results:
952
+ self.max_results = max_results
953
+
954
+ # Set days limit for this search
955
+ self.days_limit = days
956
+
957
+ return self.run(query)
958
+
959
+ finally:
960
+ # Restore original values
961
+ self.max_results = original_max_results
962
+ self.days_limit = original_days_limit
963
+
964
+ def advanced_search(self, terms: Dict[str, str], max_results: Optional[int] = None) -> List[Dict[str, Any]]:
965
+ """
966
+ Perform an advanced search with field-specific terms.
967
+
968
+ Args:
969
+ terms: Dictionary mapping fields to search terms
970
+ Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
971
+ max_results: Maximum number of results (defaults to self.max_results)
972
+
973
+ Returns:
974
+ List of articles matching the advanced query
975
+ """
976
+ original_max_results = self.max_results
977
+
978
+ try:
979
+ if max_results:
980
+ self.max_results = max_results
981
+
982
+ # Build advanced query string
983
+ query_parts = []
984
+ for field, term in terms.items():
985
+ query_parts.append(f"{term}[{field}]")
986
+
987
+ query = " AND ".join(query_parts)
988
+ return self.run(query)
989
+
990
+ finally:
991
+ # Restore original value
992
+ self.max_results = original_max_results