local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +96 -84
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +72 -44
- local_deep_research/search_system.py +147 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1592 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +211 -159
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/METADATA +177 -97
- local_deep_research-0.2.0.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_pubmed.py

@@ -1,38 +1,41 @@
-import requests
 import logging
+import re
+import time
 import xml.etree.ElementTree as ET
-from typing import Dict, List,
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
 from langchain_core.language_models import BaseLLM
-import time
-import re
-from datetime import datetime
 
-from
-from
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class PubMedSearchEngine(BaseSearchEngine):
     """
     PubMed search engine implementation with two-phase approach and adaptive search.
     Provides efficient access to biomedical literature while minimizing API usage.
     """
-
-    def __init__(
-
-
-
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        api_key: Optional[str] = None,
+        days_limit: Optional[int] = None,
+        get_abstracts: bool = True,
+        get_full_text: bool = False,
+        full_text_limit: int = 3,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        optimize_queries: bool = True,
+    ):
         """
         Initialize the PubMed search engine.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: NCBI API key for higher rate limits (optional)
@@ -45,32 +48,34 @@ class PubMedSearchEngine(BaseSearchEngine):
             optimize_queries: Whether to optimize natural language queries for PubMed
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
-
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
+        self.max_results = max(self.max_results, 25)
         self.api_key = api_key
         self.days_limit = days_limit
         self.get_abstracts = get_abstracts
         self.get_full_text = get_full_text
         self.full_text_limit = full_text_limit
         self.optimize_queries = optimize_queries
-
+
         # Base API URLs
         self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
         self.search_url = f"{self.base_url}/esearch.fcgi"
         self.summary_url = f"{self.base_url}/esummary.fcgi"
         self.fetch_url = f"{self.base_url}/efetch.fcgi"
         self.link_url = f"{self.base_url}/elink.fcgi"
-
+
         # PMC base URL for full text
         self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
-
+
     def _get_result_count(self, query: str) -> int:
         """
         Get the total number of results for a query without retrieving the results themselves.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Total number of matching results
         """
@@ -80,69 +85,70 @@ class PubMedSearchEngine(BaseSearchEngine):
                 "db": "pubmed",
                 "term": query,
                 "retmode": "json",
-                "retmax": 0 # Don't need actual results, just the count
+                "retmax": 0,  # Don't need actual results, just the count
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute search request
             response = requests.get(self.search_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             count = int(data["esearchresult"]["count"])
-
-            logger.info(
+
+            logger.info("Query '%s' has %s total results in PubMed", query, count)
             return count
-
+
         except Exception as e:
             logger.error(f"Error getting result count: {e}")
             return 0
-
+
     def _extract_core_terms(self, query: str) -> str:
         """
         Extract core terms from a complex query for volume estimation.
-
+
         Args:
             query: PubMed query string
-
+
         Returns:
             Simplified query with core terms
         """
         # Remove field specifications and operators
-        simplified = re.sub(r
-        simplified = re.sub(r
-
+        simplified = re.sub(r"\[\w+\]", "", query)  # Remove [Field] tags
+        simplified = re.sub(r"\b(AND|OR|NOT)\b", "", simplified)  # Remove operators
+
         # Remove quotes and parentheses
-        simplified = simplified.replace('"',
-
+        simplified = simplified.replace('"', "").replace("(", "").replace(")", "")
+
         # Split by whitespace and join terms with 4+ chars (likely meaningful)
         terms = [term for term in simplified.split() if len(term) >= 4]
-
+
         # Join with AND to create a basic search
         return " ".join(terms[:5])  # Limit to top 5 terms
-
+
     def _expand_time_window(self, time_filter: str) -> str:
         """
         Expand a time window to get more results.
-
+
         Args:
             time_filter: Current time filter
-
+
         Returns:
             Expanded time filter
         """
         # Parse current time window
         import re
+
         match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
         if not match:
             return '"last 10 years"[pdat]'
-
+
         amount, unit = int(match.group(1)), match.group(2)
-
+
         # Expand based on current unit
         if unit == "months" or unit == "month":
             if amount < 6:
@@ -158,24 +164,24 @@ class PubMedSearchEngine(BaseSearchEngine):
                 return '"last 5 years"[pdat]'
             else:
                 return '"last 10 years"[pdat]'
-
+
         return '"last 10 years"[pdat]'
-
+
     def _optimize_query_for_pubmed(self, query: str) -> str:
         """
         Optimize a natural language query for PubMed search.
         Uses LLM to transform questions into effective keyword-based queries.
-
+
         Args:
             query: Natural language query
-
+
         Returns:
             Optimized query string for PubMed
         """
         if not self.llm or not self.optimize_queries:
             # Return original query if no LLM available or optimization disabled
             return query
-
+
         try:
             # Prompt for query optimization
             prompt = f"""Transform this natural language question into an optimized PubMed search query.
@@ -200,138 +206,194 @@ EXAMPLE QUERIES:
 
 Return ONLY the search query without any explanations.
 """
-
+
             # Get response from LLM
             response = self.llm.invoke(prompt)
             raw_response = response.content.strip()
-
+
             # Clean up the query - extract only the actual query and remove any explanations
             # First check if there are multiple lines and take the first non-empty line
-            lines = raw_response.split(
+            lines = raw_response.split("\n")
             cleaned_lines = [line.strip() for line in lines if line.strip()]
-
+
             if cleaned_lines:
                 optimized_query = cleaned_lines[0]
-
+
                 # Remove any quotes that wrap the entire query
                 if optimized_query.startswith('"') and optimized_query.endswith('"'):
                     optimized_query = optimized_query[1:-1]
-
+
                 # Remove any explanation phrases that might be at the beginning
-                explanation_starters = [
+                explanation_starters = [
+                    "here is",
+                    "here's",
+                    "this query",
+                    "the following",
+                ]
                 for starter in explanation_starters:
                     if optimized_query.lower().startswith(starter):
                         # Find the actual query part - typically after a colon
-                        colon_pos = optimized_query.find(
+                        colon_pos = optimized_query.find(":")
                         if colon_pos > 0:
-                            optimized_query = optimized_query[colon_pos + 1:].strip()
-
+                            optimized_query = optimized_query[colon_pos + 1 :].strip()
+
                 # Check if the query still seems to contain explanations
-                if
+                if (
+                    len(optimized_query) > 200
+                    or "this query will" in optimized_query.lower()
+                ):
                     # It's probably still an explanation - try to extract just the query part
                     # Look for common patterns in the explanation like parentheses
-                    pattern = r
+                    pattern = r"\([^)]+\)\s+AND\s+"
                     import re
+
                     matches = re.findall(pattern, optimized_query)
                     if matches:
                         # Extract just the query syntax parts
                         query_parts = []
-                        for part in re.split(r
-                            if
+                        for part in re.split(r"\.\s+", optimized_query):
+                            if (
+                                "(" in part
+                                and ")" in part
+                                and ("AND" in part or "OR" in part)
+                            ):
                                 query_parts.append(part)
                         if query_parts:
-                            optimized_query =
+                            optimized_query = " ".join(query_parts)
             else:
                 # Fall back to original query if cleaning fails
                 logger.warning("Failed to extract a clean query from LLM response")
                 optimized_query = query
-
+
             # Final safety check - if query looks too much like an explanation, use original
             if len(optimized_query.split()) > 30:
                 logger.warning("Query too verbose, falling back to simpler form")
                 # Create a simple query from the original
-                words = [
-
-
-
+                words = [
+                    w
+                    for w in query.split()
+                    if len(w) > 3
+                    and w.lower()
+                    not in (
+                        "what",
+                        "are",
+                        "the",
+                        "and",
+                        "for",
+                        "with",
+                        "from",
+                        "have",
+                        "been",
+                        "recent",
+                    )
+                ]
+                optimized_query = " AND ".join(words[:3])
+
+            # Safety check for invalid or overly complex MeSH terms
             # This helps prevent errors with non-existent or complex MeSH terms
             import re
+
             mesh_terms = re.findall(r'"[^"]+"[Mesh]', optimized_query)
-            known_valid_mesh = [
-
-
-
+            known_valid_mesh = [
+                "Vaccines",
+                "COVID-19",
+                "Influenza",
+                "Infectious Disease Medicine",
+                "Communicable Diseases",
+                "RNA, Messenger",
+                "Vaccination",
+                "Immunization",
+            ]
+
             # Replace potentially problematic MeSH terms with Title/Abstract searches
             for term in mesh_terms:
-                term_name = term.split('"')[
+                term_name = term.split('"')[
+                    1
+                ]  # Extract term name without quotes and [Mesh]
                 if not any(valid in term_name for valid in known_valid_mesh):
                     # Replace with Title/Abstract search
                     replacement = f"{term_name.lower()}[Title/Abstract]"
                     optimized_query = optimized_query.replace(term, replacement)
-
+
             # Simplify the query if still no results are found
             self._simplify_query_cache = optimized_query
-
+
             # Log original and optimized queries
-            logger.info(
+            logger.info("Original query: '%s'", query)
             logger.info(f"Optimized for PubMed: '{optimized_query}'")
-
+
             return optimized_query
-
+
         except Exception as e:
             logger.error(f"Error optimizing query: {e}")
             return query  # Fall back to original query on error
-
+
     def _simplify_query(self, query: str) -> str:
         """
         Simplify a PubMed query that returned no results.
         Progressively removes elements to get a more basic query.
-
+
         Args:
             query: The original query that returned no results
-
+
         Returns:
             Simplified query
         """
         logger.info(f"Simplifying query: {query}")
-
+
         # Attempt different simplification strategies
-
+
         # 1. Remove any MeSH terms and replace with Title/Abstract
         import re
-
-
+
+        simplified = re.sub(
+            r'"[^"]+"[Mesh]',
+            lambda m: m.group(0).split('"')[1].lower() + "[Title/Abstract]",
+            query,
+        )
+
         # 2. If that doesn't work, focus on just mRNA and vaccines - the core concepts
         if simplified == query:  # No changes were made
-            simplified =
-
+            simplified = '(mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]'
+
         logger.info(f"Simplified query: {simplified}")
         return simplified
-
+
     def _is_historical_focused(self, query: str) -> bool:
         """
         Determine if a query is specifically focused on historical/older information using LLM.
         Default assumption is that queries should prioritize recent information unless
         explicitly asking for historical content.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Boolean indicating if the query is focused on historical information
         """
         if not self.llm:
             # Fall back to basic keyword check if no LLM available
-            historical_terms = [
-
+            historical_terms = [
+                "history",
+                "historical",
+                "early",
+                "initial",
+                "first",
+                "original",
+                "before",
+                "prior to",
+                "origins",
+                "evolution",
+                "development",
+            ]
             historical_years = [str(year) for year in range(1900, 2020)]
-
+
             query_lower = query.lower()
             has_historical_term = any(term in query_lower for term in historical_terms)
             has_past_year = any(year in query for year in historical_years)
-
+
             return has_historical_term or has_past_year
-
+
         try:
             # Use LLM to determine if the query is focused on historical information
             prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
@@ -343,40 +405,51 @@ Answer ONLY "no" if the query is asking about recent, current, or new informatio
 
 The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
 """
-
+
             response = self.llm.invoke(prompt)
             answer = response.content.strip().lower()
-
+
             # Log the determination
             logger.info(f"Historical focus determination for query: '{query}'")
             logger.info(f"LLM determined historical focus: {answer}")
-
+
             return "yes" in answer
-
+
         except Exception as e:
             logger.error(f"Error determining historical focus: {e}")
             # Fall back to basic keyword check
-            historical_terms = [
-
+            historical_terms = [
+                "history",
+                "historical",
+                "early",
+                "initial",
+                "first",
+                "original",
+                "before",
+                "prior to",
+                "origins",
+                "evolution",
+                "development",
+            ]
             return any(term in query.lower() for term in historical_terms)
-
+
     def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
         """
         Perform an adaptive search that adjusts based on topic volume and whether
         the query focuses on historical information.
-
+
         Args:
             query: The search query (already optimized)
-
+
         Returns:
             Tuple of (list of PMIDs, search strategy used)
         """
         # Estimate topic volume
         estimated_volume = self._get_result_count(query)
-
+
         # Determine if the query is focused on historical information
         is_historical_focused = self._is_historical_focused(query)
-
+
         if is_historical_focused:
             # User wants historical information - no date filtering
             time_filter = None
@@ -397,44 +470,52 @@ The default assumption should be that medical and scientific queries want RECENT
             # Rare topic - still use recency but with wider range
             time_filter = '"last 10 years"[pdat]'
             strategy = "rare_topic"
-
+
         # Run search based on strategy
         if time_filter:
             # Try with adaptive time filter
             query_with_time = f"({query}) AND {time_filter}"
-            logger.info(
+            logger.info(
+                f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
+            )
             results = self._search_pubmed(query_with_time)
-
+
             # If too few results, gradually expand time window
-            if len(results) < 5 and
-                logger.info(
+            if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
+                logger.info(
+                    f"Insufficient results ({len(results)}), expanding time window"
+                )
                 expanded_time = self._expand_time_window(time_filter)
                 query_with_expanded_time = f"({query}) AND {expanded_time}"
                 expanded_results = self._search_pubmed(query_with_expanded_time)
-
+
                 if len(expanded_results) > len(results):
-                    logger.info(
+                    logger.info(
+                        f"Expanded time window yielded {len(expanded_results)} results"
+                    )
                     return expanded_results, f"{strategy}_expanded"
-
+
             # If still no results, try without time filter
             if not results:
-                logger.info(
+                logger.info(
+                    "No results with time filter, trying without time restrictions"
+                )
                 results = self._search_pubmed(query)
                 strategy = "no_time_filter"
         else:
             # Historical query - run without time filter
-            logger.info(
+            logger.info("Using historical search strategy without date filtering")
             results = self._search_pubmed(query)
-
+
         return results, strategy
-
+
     def _search_pubmed(self, query: str) -> List[str]:
         """
         Search PubMed and return a list of article IDs.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of PubMed IDs matching the query
         """
@@ -445,76 +526,76 @@ The default assumption should be that medical and scientific queries want RECENT
                 "term": query,
                 "retmode": "json",
                 "retmax": self.max_results,
-                "usehistory": "y"
+                "usehistory": "y",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Add date restriction if specified
             if self.days_limit:
                 params["reldate"] = self.days_limit
                 params["datetype"] = "pdat"  # Publication date
-
+
             # Execute search request
             response = requests.get(self.search_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             id_list = data["esearchresult"]["idlist"]
-
+
             logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
             return id_list
-
+
         except Exception as e:
             logger.error(f"Error searching PubMed: {e}")
             return []
-
+
     def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
         """
         Get summaries for a list of PubMed article IDs.
-
+
         Args:
             id_list: List of PubMed IDs
-
+
         Returns:
             List of article summary dictionaries
         """
         if not id_list:
             return []
-
+
         try:
             # Prepare parameters
             params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "json",
-                "rettype": "summary"
+                "rettype": "summary",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.summary_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             summaries = []
-
+
             for pmid in id_list:
                 if pmid in data["result"]:
                     article = data["result"][pmid]
-
+
                     # Extract authors (if available)
                     authors = []
                     if "authors" in article:
                         authors = [author["name"] for author in article["authors"]]
-
+
                     # Create summary dictionary
                     summary = {
                         "id": pmid,
@@ -524,73 +605,73 @@ The default assumption should be that medical and scientific queries want RECENT
                         "authors": authors,
                         "journal": article.get("fulljournalname", ""),
                         "doi": article.get("doi", ""),
-                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
+                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                     }
-
+
                     summaries.append(summary)
-
+
             return summaries
-
+
         except Exception as e:
             logger.error(f"Error getting article summaries: {e}")
             return []
-
+
     def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
         """
         Get abstracts for a list of PubMed article IDs.
-
+
         Args:
             id_list: List of PubMed IDs
-
+
         Returns:
             Dictionary mapping PubMed IDs to their abstracts
         """
         if not id_list:
             return {}
-
+
         try:
             # Prepare parameters
             params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "xml",
-                "rettype": "abstract"
+                "rettype": "abstract",
            }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.fetch_url, params=params)
             response.raise_for_status()
-
+
             # Parse XML response
             root = ET.fromstring(response.text)
-
+
             # Extract abstracts
             abstracts = {}
-
+
             for article in root.findall(".//PubmedArticle"):
                 pmid_elem = article.find(".//PMID")
                 pmid = pmid_elem.text if pmid_elem is not None else None
-
+
                 if pmid is None:
                     continue
-
+
                 # Find abstract text
                 abstract_text = ""
                 abstract_elem = article.find(".//AbstractText")
-
+
                 if abstract_elem is not None:
                     abstract_text = abstract_elem.text or ""
-
+
                 # Some abstracts are split into multiple sections
                 for section in article.findall(".//AbstractText"):
                     # Get section label if it exists
                     label = section.get("Label")
                     section_text = section.text or ""
-
+
                     if label and section_text:
                         if abstract_text:
                             abstract_text += f"\n\n{label}: {section_text}"
@@ -601,30 +682,30 @@ The default assumption should be that medical and scientific queries want RECENT
                             abstract_text += f"\n\n{section_text}"
                         else:
                             abstract_text = section_text
-
+
                 # Store in dictionary
                 if pmid and abstract_text:
                     abstracts[pmid] = abstract_text
-
+
             return abstracts
-
+
         except Exception as e:
             logger.error(f"Error getting article abstracts: {e}")
             return {}
-
+
     def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
         """
         Find PMC IDs for the given PubMed IDs (for full-text access).
-
+
         Args:
             pmid_list: List of PubMed IDs
-
+
         Returns:
             Dictionary mapping PubMed IDs to their PMC IDs (if available)
         """
         if not pmid_list or not self.get_full_text:
             return {}
-
+
         try:
             # Prepare parameters
             params = {
@@ -632,89 +713,84 @@ The default assumption should be that medical and scientific queries want RECENT
                 "db": "pmc",
                 "linkname": "pubmed_pmc",
                 "id": ",".join(pmid_list),
-                "retmode": "json"
+                "retmode": "json",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.link_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
-
+
             # Map PubMed IDs to PMC IDs
             pmid_to_pmcid = {}
-
+
             for linkset in data.get("linksets", []):
                 pmid = linkset.get("ids", [None])[0]
-
+
                 if not pmid:
                     continue
-
+
                 for link in linkset.get("linksetdbs", []):
                     if link.get("linkname") == "pubmed_pmc":
                         pmcids = link.get("links", [])
                         if pmcids:
                             pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
-
+
             logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
             return pmid_to_pmcid
-
+
         except Exception as e:
             logger.error(f"Error finding PMC IDs: {e}")
             return {}
-
+
     def _get_pmc_full_text(self, pmcid: str) -> str:
         """
         Get full text for a PMC article.
-
+
         Args:
             pmcid: PMC ID of the article
-
+
         Returns:
             Full text content or empty string if not available
         """
         try:
             # Prepare parameters
-            params = {
-
-                "id": pmcid,
-                "retmode": "xml",
-                "rettype": "full"
-            }
-
+            params = {"db": "pmc", "id": pmcid, "retmode": "xml", "rettype": "full"}
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.fetch_url, params=params)
             response.raise_for_status()
-
+
             # Parse XML response
             root = ET.fromstring(response.text)
-
+
             # Extract full text
             full_text = []
-
+
             # Extract article title
             title_elem = root.find(".//article-title")
             if title_elem is not None and title_elem.text:
                 full_text.append(f"# {title_elem.text}")
-
+
             # Extract abstract
             abstract_paras = root.findall(".//abstract//p")
             if abstract_paras:
                 full_text.append("\n## Abstract\n")
                 for p in abstract_paras:
-                    text =
+                    text = "".join(p.itertext())
                     if text:
                         full_text.append(text)
-
+
             # Extract body content
             body = root.find(".//body")
             if body is not None:
@@ -723,37 +799,37 @@ The default assumption should be that medical and scientific queries want RECENT
                     title = section.find(".//title")
                     if title is not None and title.text:
                         full_text.append(f"\n## {title.text}\n")
-
+
                     # Get paragraphs
                     for p in section.findall(".//p"):
-                        text =
+                        text = "".join(p.itertext())
                         if text:
                             full_text.append(text)
-
+
             return "\n\n".join(full_text)
-
+
         except Exception as e:
             logger.error(f"Error getting PMC full text: {e}")
             return ""
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for PubMed articles.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting PubMed previews for query: {query}")
-
+
         # Optimize the query for PubMed if LLM is available
         optimized_query = self._optimize_query_for_pubmed(query)
-
+
         # Perform adaptive search
         pmid_list, strategy = self._adaptive_search(optimized_query)
-
+
         # If no results, try a simplified query
         if not pmid_list:
             logger.warning(f"No PubMed results found using strategy: {strategy}")
@@ -763,17 +839,17 @@ The default assumption should be that medical and scientific queries want RECENT
             pmid_list, strategy = self._adaptive_search(simplified_query)
             if pmid_list:
                 logger.info(f"Simplified query found {len(pmid_list)} results")
-
+
         if not pmid_list:
-            logger.warning(
+            logger.warning("No PubMed results found after query simplification")
             return []
-
+
         # Get article summaries
         summaries = self._get_article_summaries(pmid_list)
-
+
         # Rate limit compliance (NCBI allows 10 requests per second with an API key, 3 without)
         time.sleep(0.1 if self.api_key else 0.33)
-
+
         # Format as previews
         previews = []
         for summary in summaries:
@@ -782,7 +858,7 @@ The default assumption should be that medical and scientific queries want RECENT
             if len(authors_text) > 100:
                 # Truncate long author lists
                 authors_text = authors_text[:97] + "..."
-
+
             # Create preview with basic information
             preview = {
                 "id": summary["id"],
@@ -795,73 +871,81 @@ The default assumption should be that medical and scientific queries want RECENT
                 "doi": summary.get("doi", ""),
                 "source": "PubMed",
                 "_pmid": summary["id"],  # Store PMID for later use
-                "_search_strategy": strategy # Store search strategy for analytics
+                "_search_strategy": strategy,  # Store search strategy for analytics
             }
-
+
             previews.append(preview)
-
+
         logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
         return previews
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant PubMed articles.
         Efficiently manages which content to retrieve (abstracts and/or full text).
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
+
         logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
-
+
         # Collect all PMIDs for relevant items
         pmids = []
         for item in relevant_items:
             if "_pmid" in item:
                 pmids.append(item["_pmid"])
-
+
         # Get abstracts if requested and PMIDs exist
         abstracts = {}
         if self.get_abstracts and pmids:
             abstracts = self._get_article_abstracts(pmids)
-
+
         # Find PMC IDs for full-text retrieval (if enabled)
         pmid_to_pmcid = {}
         if self.get_full_text and pmids:
             pmid_to_pmcid = self._find_pmc_ids(pmids)
-
+
         # Add content to results
         results = []
         for item in relevant_items:
            result = item.copy()
            pmid = item.get("_pmid", "")
-
+
            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]
-
+
                # Use abstract as content if no full text
                if pmid not in pmid_to_pmcid:
                    result["full_content"] = abstracts[pmid]
                    result["content"] = abstracts[pmid]
                    result["content_type"] = "abstract"
-
+
            # Add full text for a limited number of top articles
-            if (
-
-
-
+            if (
+                pmid in pmid_to_pmcid
+                and self.get_full_text
+                and len([r for r in results if r.get("content_type") == "full_text"])
+                < self.full_text_limit
+            ):
+
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)
-
+
                if full_text:
                    result["full_content"] = full_text
                    result["content"] = full_text
@@ -872,120 +956,128 @@ The default assumption should be that medical and scientific queries want RECENT
                     result["full_content"] = abstracts[pmid]
                     result["content"] = abstracts[pmid]
                     result["content_type"] = "abstract"
-
+
             # Remove temporary fields
             if "_pmid" in result:
                 del result["_pmid"]
             if "_search_strategy" in result:
                 del result["_search_strategy"]
-
+
             results.append(result)
-
+
         return results
-
-    def search_by_author(
+
+    def search_by_author(
+        self, author_name: str, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for articles by a specific author.
-
+
         Args:
             author_name: Name of the author
             max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of articles by the author
         """
         original_max_results = self.max_results
-
+
         try:
             if max_results:
                 self.max_results = max_results
-
+
             query = f"{author_name}[Author]"
             return self.run(query)
-
+
         finally:
             # Restore original value
             self.max_results = original_max_results
-
-    def search_by_journal(
+
+    def search_by_journal(
+        self, journal_name: str, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for articles in a specific journal.
-
+
         Args:
             journal_name: Name of the journal
             max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of articles from the journal
         """
         original_max_results = self.max_results
-
+
         try:
             if max_results:
                 self.max_results = max_results
-
+
             query = f"{journal_name}[Journal]"
             return self.run(query)
-
+
         finally:
             # Restore original value
             self.max_results = original_max_results
-
-    def search_recent(
+
+    def search_recent(
+        self, query: str, days: int = 30, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for recent articles matching the query.
-
+
         Args:
             query: The search query
             days: Number of days to look back
             max_results: Maximum number of results (defaults to self.max_results)
-
+
        Returns:
            List of recent articles matching the query
        """
        original_max_results = self.max_results
        original_days_limit = self.days_limit
-
+
        try:
            if max_results:
                self.max_results = max_results
-
+
            # Set days limit for this search
            self.days_limit = days
-
+
            return self.run(query)
-
+
        finally:
            # Restore original values
            self.max_results = original_max_results
            self.days_limit = original_days_limit
-
-    def advanced_search(
+
+    def advanced_search(
+        self, terms: Dict[str, str], max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
        """
        Perform an advanced search with field-specific terms.
-
+
        Args:
            terms: Dictionary mapping fields to search terms
                Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
            max_results: Maximum number of results (defaults to self.max_results)
-
+
        Returns:
            List of articles matching the advanced query
        """
        original_max_results = self.max_results
-
+
        try:
            if max_results:
                self.max_results = max_results
-
+
            # Build advanced query string
            query_parts = []
            for field, term in terms.items():
                query_parts.append(f"{term}[{field}]")
-
+
            query = " AND ".join(query_parts)
            return self.run(query)
-
+
        finally:
            # Restore original value
-            self.max_results = original_max_results
+            self.max_results = original_max_results