local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_arxiv.py

The absolute imports (`from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine`, `from local_deep_research import config`) are replaced with relative ones (`from ..search_engine_base import BaseSearchEngine`, `from ...config import search_config`), an optional `llm` and `max_filtered_results` are now passed through to the `BaseSearchEngine` constructor, and the module is reformatted. The 0.2.2 version of the changed region:

```python
import logging
from typing import Any, Dict, List, Optional

import arxiv
from langchain_core.language_models import BaseLLM

from ...config import search_config
from ..search_engine_base import BaseSearchEngine

logger = logging.getLogger(__name__)


class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with two-phase approach"""

    def __init__(
        self,
        max_results: int = 10,
        sort_by: str = "relevance",
        sort_order: str = "descending",
        include_full_text: bool = False,
        download_dir: Optional[str] = None,
        max_full_text: int = 1,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
    ):  # Added this parameter
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results
            sort_by: Sorting criteria ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort direction ('ascending' or 'descending')
            include_full_text: Whether to download PDFs and extract full text
            download_dir: Directory for downloaded PDFs
            max_full_text: Maximum number of PDFs to download and process
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
        )
        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            "relevance": arxiv.SortCriterion.Relevance,
            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
            "submittedDate": arxiv.SortCriterion.SubmittedDate,
        }

        self.sort_directions = {
            "ascending": arxiv.SortOrder.Ascending,
            "descending": arxiv.SortOrder.Descending,
        }

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Configure the search client
        sort_criteria = self.sort_criteria.get(
            self.sort_by, arxiv.SortCriterion.Relevance
        )
        sort_order = self.sort_directions.get(
            self.sort_order, arxiv.SortOrder.Descending
        )

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order,
        )

        # Get the search results
        papers = list(client.results(search))

        return papers

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            previews = []
            for paper in papers:
                preview = {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": (
                        paper.summary[:250] + "..."
                        if len(paper.summary) > 250
                        else paper.summary
                    ),
                    "authors": [
                        author.name for author in paper.authors[:3]
                    ],  # First 3 authors
                    "published": (
                        paper.published.strftime("%Y-%m-%d")
                        if paper.published
                        else None
                    ),
                }

                previews.append(preview)

            return previews

        except Exception as e:
            logger.error(f"Error getting arXiv previews: {e}")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should get full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Getting full content for relevant arXiv papers")

        results = []
        pdf_count = 0  # Track number of PDFs processed

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Get the paper ID
            paper_id = item.get("id")

            # Try to get the full paper from our cache
            paper = None
            if hasattr(self, "_papers") and paper_id in self._papers:
                paper = self._papers[paper_id]

            if paper:
                # Add complete paper information
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "journal_ref": paper.journal_ref,
                        "doi": paper.doi,
                    }
                )

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                if (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count < self.max_full_text
                ):
                    try:
                        # Download the paper
                        pdf_count += 1  # Increment counter before attempting download
                        paper_path = paper.download_pdf(dirpath=self.download_dir)
                        result["pdf_path"] = str(paper_path)

                        # Extract text from PDF
                        try:
                            # Try PyPDF2 first
                            try:
                                import PyPDF2

                                with open(paper_path, "rb") as pdf_file:
                                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                                    pdf_text = ""
                                    for page in pdf_reader.pages:
                                        pdf_text += page.extract_text() + "\n\n"

                                if (
                                    pdf_text.strip()
                                ):  # Only use if we got meaningful text
                                    result["content"] = pdf_text
                                    result["full_content"] = pdf_text
                                    logger.info(
                                        "Successfully extracted text from PDF using PyPDF2"
                                    )
                            except (ImportError, Exception) as e1:
                                # Fall back to pdfplumber
                                try:
                                    import pdfplumber

                                    with pdfplumber.open(paper_path) as pdf:
                                        pdf_text = ""
                                        for page in pdf.pages:
                                            pdf_text += page.extract_text() + "\n\n"

                                    if (
                                        pdf_text.strip()
                                    ):  # Only use if we got meaningful text
                                        result["content"] = pdf_text
                                        result["full_content"] = pdf_text
                                        logger.info(
                                            "Successfully extracted text from PDF using pdfplumber"
                                        )
                                except (ImportError, Exception) as e2:
                                    logger.error(
                                        f"PDF text extraction failed: {str(e1)}, then {str(e2)}"
                                    )
                                    logger.error(
                                        "Using paper summary as content instead"
                                    )
                        except Exception as e:
                            logger.error(f"Error extracting text from PDF: {e}")
                            logger.error("Using paper summary as content instead")
                    except Exception as e:
                        logger.error(f"Error downloading paper {paper.title}: {e}")
                        result["pdf_path"] = None
                        pdf_count -= 1  # Decrement counter if download fails
                elif (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count >= self.max_full_text
                ):
                    # Reached PDF limit
                    logger.info(
                        f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs."
                    )
                    result["content"] = paper.summary
                    result["full_content"] = paper.summary

            results.append(result)

        return results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        logger.info("---Execute a search using arXiv---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query)

        # Clean up
        if hasattr(self, "_papers"):
            del self._papers

        return results

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": (
                    paper.summary[:250] + "..."
                    if len(paper.summary) > 250
                    else paper.summary
                ),
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
            }

            # Add full content if not in snippet-only mode
            if (
                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
                or not search_config.SEARCH_SNIPPETS_ONLY
            ):
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "journal_ref": paper.journal_ref,
                        "doi": paper.doi,
                        "content": paper.summary,  # Use summary as content
                        "full_content": paper.summary,  # For consistency
                    }
                )

            # Download PDF if requested
            if self.include_full_text and self.download_dir:
                try:
                    # Download the paper
                    paper_path = paper.download_pdf(dirpath=self.download_dir)
                    result["pdf_path"] = str(paper_path)
                except Exception as e:
                    logger.error(f"Error downloading paper: {e}")

            return result

        except Exception as e:
            logger.error(f"Error getting paper details: {e}")
            return {}

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f'au:"{author_name}"'
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_category(
        self, category: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"cat:{category}"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results
```