local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
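Several of the renames above change import paths: the misspelled `utilties` package becomes `utilities`, the top-level `config.py` becomes `config/config_files.py` inside a new `config` package, and `defaults/llm_config.py` moves to `config/llm_config.py` alongside a new `config/search_config.py`. A minimal sketch of the resulting module paths (paths are taken from the file list above; no specific symbols inside the modules are assumed):

```python
# Module paths after the 0.1.26 -> 0.2.2 restructuring; each "was" comment
# gives the old location. Purely illustrative of the renames listed above.
import local_deep_research.utilities.enums          # was local_deep_research.utilties.enums
import local_deep_research.utilities.llm_utils      # was local_deep_research.utilties.llm_utils
import local_deep_research.config.config_files      # was local_deep_research.config (config.py)
import local_deep_research.config.llm_config        # was local_deep_research.defaults.llm_config
import local_deep_research.config.search_config     # new module in 0.2.2
```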
local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py

@@ -1,48 +1,50 @@
-import requests
 import logging
-import json
-from typing import Dict, List, Any, Optional, Tuple, Union
-from langchain_core.language_models import BaseLLM
-import time
 import re
-
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+from langchain_core.language_models import BaseLLM
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry

-from
-from
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class SemanticScholarSearchEngine(BaseSearchEngine):
     """
     Semantic Scholar search engine implementation with two-phase approach.
     Provides efficient access to scientific literature across all fields.
     """
-
-    def __init__(
-        [17 lines truncated in the source rendering]
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        api_key: Optional[str] = None,
+        year_range: Optional[Tuple[int, int]] = None,
+        get_abstracts: bool = True,
+        get_references: bool = False,
+        get_citations: bool = False,
+        get_embeddings: bool = False,
+        get_tldr: bool = True,
+        citation_limit: int = 10,
+        reference_limit: int = 10,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        optimize_queries: bool = True,
+        max_retries: int = 5,
+        retry_backoff_factor: float = 1.0,
+        fields_of_study: Optional[List[str]] = None,
+        publication_types: Optional[List[str]] = None,
+    ):
         """
         Initialize the Semantic Scholar search engine.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: Semantic Scholar API key for higher rate limits (optional)
@@ -63,8 +65,10 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             publication_types: List of publication types to filter results
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
-        [line truncated in the source rendering]
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
+
         self.api_key = api_key
         self.year_range = year_range
         self.get_abstracts = get_abstracts
@@ -79,71 +83,76 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
         self.retry_backoff_factor = retry_backoff_factor
         self.fields_of_study = fields_of_study
         self.publication_types = publication_types
-
+
         # Base API URLs
         self.base_url = "https://api.semanticscholar.org/graph/v1"
         self.paper_search_url = f"{self.base_url}/paper/search"
         self.paper_details_url = f"{self.base_url}/paper"
-
+
         # Create a session with retry capabilities
         self.session = self._create_session()
-
+
         # Rate limiting
         self.rate_limit_wait = 1.0  # Default 1 second between requests
         self.last_request_time = 0
-
+
     def _create_session(self) -> requests.Session:
         """Create and configure a requests session with retry capabilities"""
         session = requests.Session()
-
+
         # Configure automatic retries with exponential backoff
         retry_strategy = Retry(
             total=self.max_retries,
             backoff_factor=self.retry_backoff_factor,
             status_forcelist=[429, 500, 502, 503, 504],
-            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
+            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
         )
-
+
         adapter = HTTPAdapter(max_retries=retry_strategy)
         session.mount("https://", adapter)
-
+
         # Set up headers
         headers = {"Accept": "application/json"}
         if self.api_key:
             headers["x-api-key"] = self.api_key
-
+
         session.headers.update(headers)
-
+
         return session
-
+
     def _respect_rate_limit(self):
         """Apply rate limiting between requests"""
         current_time = time.time()
        elapsed = current_time - self.last_request_time
-
+
        if elapsed < self.rate_limit_wait:
            wait_time = self.rate_limit_wait - elapsed
-            logger.debug(
+            logger.debug("Rate limiting: waiting %.2f s", wait_time)
            time.sleep(wait_time)
-
+
        self.last_request_time = time.time()
-
-    def _make_request(
-        [line truncated in the source rendering]
+
+    def _make_request(
+        self,
+        url: str,
+        params: Optional[Dict] = None,
+        data: Optional[Dict] = None,
+        method: str = "GET",
+    ) -> Dict:
         """
         Make a request to the Semantic Scholar API.
-
+
         Args:
             url: API endpoint URL
             params: Query parameters
             data: JSON data for POST requests
             method: HTTP method (GET or POST)
-
+
         Returns:
             API response as dictionary
         """
         self._respect_rate_limit()
-
+
         try:
             if method.upper() == "GET":
                 response = self.session.get(url, params=params, timeout=30)
@@ -151,34 +160,34 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
                 response = self.session.post(url, params=params, json=data, timeout=30)
             else:
                 raise ValueError(f"Unsupported HTTP method: {method}")
-
+
             # Handle rate limiting manually if retry strategy fails
             if response.status_code == 429:
                 logger.warning("Rate limit exceeded, waiting and retrying...")
                 time.sleep(2.0)  # Wait longer on rate limit
                 self.rate_limit_wait *= 1.5  # Increase wait time for future requests
                 return self._make_request(url, params, data, method)  # Retry
-
+
             response.raise_for_status()
             return response.json()
         except requests.RequestException as e:
             logger.error(f"API request failed: {e}")
             return {}
-
+
     def _optimize_query(self, query: str) -> str:
         """
         Optimize a natural language query for Semantic Scholar search.
         If LLM is available, uses it to extract key terms and concepts.
-
+
         Args:
             query: Natural language query
-
+
         Returns:
             Optimized query string
         """
         if not self.llm or not self.optimize_queries:
             return query
-
+
         try:
             prompt = f"""Transform this natural language question into an optimized academic search query.

@@ -198,113 +207,122 @@ EXAMPLE TRANSFORMATIONS:

 Return ONLY the optimized search query with no explanation.
 """
-
+
             response = self.llm.invoke(prompt)
             optimized_query = response.content.strip()
-
+
             # Clean up the query - remove any explanations
-            lines = optimized_query.split(
+            lines = optimized_query.split("\n")
             optimized_query = lines[0].strip()
-
+
             # Safety check - if query looks too much like an explanation, use original
             if len(optimized_query.split()) > 15 or ":" in optimized_query:
-                logger.warning(
+                logger.warning(
+                    "Query optimization result looks too verbose, using original"
+                )
                 return query
-
+
             logger.info(f"Original query: '{query}'")
             logger.info(f"Optimized for search: '{optimized_query}'")
-
+
             return optimized_query
         except Exception as e:
             logger.error(f"Error optimizing query: {e}")
             return query  # Fall back to original query on error
-
+
     def _direct_search(self, query: str) -> List[Dict[str, Any]]:
         """
         Make a direct search request to the Semantic Scholar API.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of paper dictionaries
         """
         try:
             # Configure fields to retrieve
             fields = [
-                "paperId",
-                "externalIds",
-                "url",
-                "title",
-                "abstract",
-                "venue",
-                "year",
-                "authors"
+                "paperId",
+                "externalIds",
+                "url",
+                "title",
+                "abstract",
+                "venue",
+                "year",
+                "authors",
             ]
-
+
             if self.get_tldr:
                 fields.append("tldr")
-
+
             params = {
                 "query": query,
                 "limit": min(self.max_results, 100),  # API limit is 100 per request
-                "fields": ",".join(fields)
+                "fields": ",".join(fields),
             }
-
+
             # Add year filter if specified
             if self.year_range:
                 start_year, end_year = self.year_range
                 params["year"] = f"{start_year}-{end_year}"
-
+
             # Add fields of study filter if specified
             if self.fields_of_study:
                 params["fieldsOfStudy"] = ",".join(self.fields_of_study)
-
+
             # Add publication types filter if specified
             if self.publication_types:
                 params["publicationTypes"] = ",".join(self.publication_types)
-
+
             response = self._make_request(self.paper_search_url, params)
-
+
             if "data" in response:
                 papers = response["data"]
-                logger.info(
+                logger.info(
+                    f"Found {len(papers)} papers with direct search for query: '{query}'"
+                )
                 return papers
             else:
-                logger.warning(
+                logger.warning(
+                    f"No data in response for direct search query: '{query}'"
+                )
                 return []
-
+
         except Exception as e:
             logger.error(f"Error in direct search: {e}")
             return []
-
+
     def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
         """
         Perform an adaptive search that adjusts based on result volume.
         Uses LLM to generate better fallback queries when available.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Tuple of (list of paper results, search strategy used)
         """
         # Start with a standard search
         papers = self._direct_search(query)
         strategy = "standard"
-
+
         # If no results, try different variations
         if not papers:
             # Try removing quotes to broaden search
             if '"' in query:
-                unquoted_query = query.replace('"',
-                logger.info(
+                unquoted_query = query.replace('"', "")
+                logger.info(
+                    "No results with quoted terms, trying without quotes: %s",
+                    unquoted_query,
+                )
                 papers = self._direct_search(unquoted_query)
-
+
                 if papers:
                     strategy = "unquoted"
                     return papers, strategy
-
+
             # If LLM is available, use it to generate better fallback queries
             if self.llm:
                 try:
@@ -325,99 +343,109 @@ Format each query on a new line with no numbering or explanation. Keep each quer
 """
                     # Get the LLM's response
                     response = self.llm.invoke(prompt)
-
+
                     # Extract the alternative queries
                     alt_queries = []
-                    if hasattr(
+                    if hasattr(
+                        response, "content"
+                    ):  # Handle various LLM response formats
                         content = response.content
-                        alt_queries = [
+                        alt_queries = [
+                            q.strip() for q in content.strip().split("\n") if q.strip()
+                        ]
                     elif isinstance(response, str):
-                        alt_queries = [
-                        [line truncated in the source rendering]
+                        alt_queries = [
+                            q.strip() for q in response.strip().split("\n") if q.strip()
+                        ]
+
                     # Try each alternative query
                     for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
-                        logger.info(
+                        logger.info("Trying LLM-suggested query: %s", alt_query)
                         alt_papers = self._direct_search(alt_query)
-
+
                         if alt_papers:
-                            logger.info(
+                            logger.info(
+                                "Found %s papers using LLM-suggested query: %s",
+                                len(alt_papers),
+                                alt_query,
+                            )
                             strategy = "llm_alternative"
                             return alt_papers, strategy
                 except Exception as e:
-                    logger.error(
+                    logger.error("Error using LLM for query refinement: %s", e)
                     # Fall through to simpler strategies
-
+
             # Fallback: Try with the longest words (likely specific terms)
-            words = re.findall(r
+            words = re.findall(r"\w+", query)
             longer_words = [word for word in words if len(word) > 6]
             if longer_words:
                 # Use up to 3 of the longest words
                 longer_words = sorted(longer_words, key=len, reverse=True)[:3]
-                key_terms_query =
-                logger.info(
+                key_terms_query = " ".join(longer_words)
+                logger.info("Trying with key terms: %s", key_terms_query)
                 papers = self._direct_search(key_terms_query)
-
+
                 if papers:
                     strategy = "key_terms"
                     return papers, strategy
-
+
             # Final fallback: Try with just the longest word
             if words:
                 longest_word = max(words, key=len)
                 if len(longest_word) > 5:  # Only use if it's reasonably long
-                    logger.info(
+                    logger.info("Trying with single key term: %s", longest_word)
                     papers = self._direct_search(longest_word)
-
+
                     if papers:
                         strategy = "single_term"
                         return papers, strategy
-
+
         return papers, strategy
-
+
     def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
         """
         Get detailed information about a specific paper.
-
+
         Args:
             paper_id: Semantic Scholar Paper ID
-
+
         Returns:
             Dictionary with paper details
         """
         try:
             # Construct fields parameter
             fields = [
-                "paperId",
-                "externalIds",
-                "corpusId",
-                "url",
-                "title",
-                "abstract",
-                "venue",
-                "year",
-                "authors",
-                "fieldsOfStudy"
+                "paperId",
+                "externalIds",
+                "corpusId",
+                "url",
+                "title",
+                "abstract",
+                "venue",
+                "year",
+                "authors",
+                "fieldsOfStudy",
             ]
-
+
             if self.get_tldr:
                 fields.append("tldr")
-
+
             if self.get_embeddings:
                 fields.append("embedding")
-
+
             # Add citation and reference fields if requested
             if self.get_citations:
                 fields.append(f"citations.limit({self.citation_limit})")
-
+
             if self.get_references:
                 fields.append(f"references.limit({self.reference_limit})")
-
+
             # Make the request
             url = f"{self.paper_details_url}/{paper_id}"
             params = {"fields": ",".join(fields)}
-
+
             return self._make_request(url, params)
-
+
         except Exception as e:
             logger.error(f"Error getting paper details for {paper_id}: {e}")
             return {}
@@ -425,25 +453,25 @@ Format each query on a new line with no numbering or explanation. Keep each quer
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Semantic Scholar papers.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting Semantic Scholar previews for query: {query}")
-
+
         # Optimize the query if LLM is available
         optimized_query = self._optimize_query(query)
-
+
         # Use the adaptive search approach
         papers, strategy = self._adaptive_search(optimized_query)
-
+
         if not papers:
-            logger.warning(
+            logger.warning("No Semantic Scholar results found")
             return []
-
+
         # Format as previews
         previews = []
         for paper in papers:
@@ -451,28 +479,34 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                 # Format authors - ensure we have a valid list with string values
                 authors = []
                 if "authors" in paper and paper["authors"]:
-                    authors = [
-                    [line truncated in the source rendering]
+                    authors = [
+                        author.get("name", "")
+                        for author in paper["authors"]
+                        if author and author.get("name")
+                    ]
+
                 # Ensure we have valid strings for all fields
                 paper_id = paper.get("paperId", "")
                 title = paper.get("title", "")
                 url = paper.get("url", "")
-
+
                 # Handle abstract safely, ensuring we always have a string
                 abstract = paper.get("abstract")
                 snippet = ""
                 if abstract:
-                    snippet =
-                    [line truncated in the source rendering]
+                    snippet = (
+                        abstract[:250] + "..." if len(abstract) > 250 else abstract
+                    )
+
                 venue = paper.get("venue", "")
                 year = paper.get("year")
                 external_ids = paper.get("externalIds", {})
-
+
                 # Handle TLDR safely
                 tldr_text = ""
                 if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
                     tldr_text = paper.get("tldr", {}).get("text", "")
-
+
                 # Create preview with basic information, ensuring no None values
                 preview = {
                     "id": paper_id if paper_id else "",
@@ -486,76 +520,85 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                     "source": "Semantic Scholar",
                     "_paper_id": paper_id if paper_id else "",
                     "_search_strategy": strategy,
-                    "tldr": tldr_text
+                    "tldr": tldr_text,
                 }
-
+
                 # Store the full paper object for later reference
                 preview["_full_paper"] = paper
-
+
                 previews.append(preview)
             except Exception as e:
                 logger.error(f"Error processing paper preview: {e}")
                 # Continue with the next paper
-
+
         # Sort by year (newer first) if available
         previews = sorted(
             previews,
             key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
-            reverse=True
+            reverse=True,
+        )
+
+        logger.info(
+            f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
         )
-
-        logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
         return previews
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Semantic Scholar papers.
         Gets additional details like citations, references, and full metadata.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
-        logger.info(
-        [line truncated in the source rendering]
+
+        logger.info(
+            f"Getting content for {len(relevant_items)} Semantic Scholar papers"
+        )
+
         results = []
         for item in relevant_items:
             result = item.copy()
             paper_id = item.get("_paper_id", "")
-
+
             # Skip if no paper ID
             if not paper_id:
                 results.append(result)
                 continue
-
+
             # Get paper details if citations or references are requested
             if self.get_citations or self.get_references or self.get_embeddings:
                 paper_details = self._get_paper_details(paper_id)
-
+
                 if paper_details:
                     # Add citation information
                     if self.get_citations and "citations" in paper_details:
                         result["citations"] = paper_details["citations"]
-
+
                     # Add reference information
                     if self.get_references and "references" in paper_details:
                         result["references"] = paper_details["references"]
-
+
                     # Add embedding if available
                     if self.get_embeddings and "embedding" in paper_details:
                         result["embedding"] = paper_details["embedding"]
-
+
                    # Add fields of study
                    if "fieldsOfStudy" in paper_details:
                        result["fields_of_study"] = paper_details["fieldsOfStudy"]
-
+
            # Remove temporary fields
            if "_paper_id" in result:
                del result["_paper_id"]
@@ -563,7 +606,7 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                del result["_search_strategy"]
            if "_full_paper" in result:
                del result["_full_paper"]
-
+
            results.append(result)
-
-        return results
+
+        return results