local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/config.py +8 -8
- local_deep_research/defaults/search_engines.toml +39 -18
- local_deep_research/search_system.py +15 -9
- local_deep_research/utilties/enums.py +4 -4
- local_deep_research/web/app.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
- local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
- local_deep_research/web_search_engines/search_engine_base.py +12 -4
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
local_deep_research/config.py
CHANGED
@@ -91,15 +91,15 @@ def get_search(search_tool=None):
     params = {
         "search_tool": tool,
         "llm_instance": get_llm(),
-        "max_results": settings.
-        "region": settings.
-        "time_period": settings.
-        "safe_search": settings.
-        "search_snippets_only": settings.
-        "search_language": settings.
-        "max_filtered_results": settings.
+        "max_results": settings.search.max_results,
+        "region": settings.search.region,
+        "time_period": settings.search.time_period,
+        "safe_search": settings.search.safe_search,
+        "search_snippets_only": settings.search.snippets_only,
+        "search_language": settings.search.search_language,
+        "max_filtered_results": settings.search.max_filtered_results
     }
-
+    logger.info(f"Search config params: {params}")
     # Create and return search engine
     return factory_get_search(**params)
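The new `settings.search.*` reads assume a `[search]` table in the package's settings file. As a hedged illustration only (this diff does not show how `settings` is constructed; a Dynaconf-style object is an assumption), the same keys could be read with explicit fallbacks:

```python
# Hedged sketch, not code from this package: assumes `settings` is a Dynaconf
# object backed by a settings.toml that contains a [search] table.
from dynaconf import Dynaconf

settings = Dynaconf(settings_files=["settings.toml"])

# Dotted .get() lookups with defaults avoid AttributeError if a key is missing.
max_results = settings.get("search.max_results", 10)
snippets_only = settings.get("search.snippets_only", True)
```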
local_deep_research/defaults/search_engines.toml
CHANGED
@@ -186,9 +186,48 @@ strengths = ["searches all local collections", "personal documents", "offline ac
 weaknesses = ["may return too many results", "requires indexing"]
 requires_llm = true
 
+[semantic_scholar]
+module_path = "local_deep_research.web_search_engines.engines.search_engine_semantic_scholar"
+class_name = "SemanticScholarSearchEngine"
+requires_api_key = false
+api_key_env = "S2_API_KEY"
+reliability = 0.95
+strengths = [
+    "comprehensive scientific literature",
+    "extensive citation network",
+    "AI-generated summaries (TLDRs)",
+    "academic paper metadata",
+    "cross-disciplinary coverage",
+    "200M+ papers across all fields",
+    "usable without API key"
+]
+weaknesses = [
+    "rate limited (1000 requests/day) without API key",
+    "limited to academic content"
+]
+supports_full_search = true
+requires_llm = false
+
+[semantic_scholar.default_params]
+max_results = 20
+get_abstracts = true
+get_tldr = true
+get_references = false
+get_citations = false
+get_embeddings = false
+citation_limit = 10
+reference_limit = 10
+optimize_queries = true
+max_retries = 5
+retry_backoff_factor = 1.0
+
 # Default search engine to use if none specified
 DEFAULT_SEARCH_ENGINE = "wikipedia"
 
+
+
+
+
 # Additional search engines can be added below
 # Uncomment and modify these templates as needed
@@ -224,21 +263,3 @@ DEFAULT_SEARCH_ENGINE = "wikipedia"
 #
 # [guardian.default_params]
 # order_by = "relevance"
-
-# [medrxiv]
-# module_path = "local_deep_research.web_search_engines.engines.search_engine_medrxiv"
-# class_name = "MedRxivSearchEngine"
-# requires_api_key = false
-# reliability = 0.85
-# strengths = [
-#     "medical preprints", "health research", "covid-19 research",
-#     "clinical studies", "medical sciences", "preliminary results"
-# ]
-# weaknesses = ["not peer-reviewed", "preliminary findings", "limited to medical research"]
-# requires_llm = true
-#
-# [medrxiv.default_params]
-# sort_by = "relevance_score"
-# sort_order = "desc"
-# include_full_text = false
-# optimize_queries = true
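For context on the `[semantic_scholar.default_params]` values added above, here is a hedged sketch of how they might map onto a raw request against the public Semantic Scholar Graph API. This is illustrative only and is not code from `search_engine_semantic_scholar.py`; the endpoint, `x-api-key` header, and field names are assumptions about the public API, not about this package.

```python
# Hedged sketch: translate max_results / max_retries / retry_backoff_factor
# into a plain Graph API search with retry on rate limiting.
import os
import time
import requests

def s2_search(query: str, max_results: int = 20, max_retries: int = 5,
              retry_backoff_factor: float = 1.0) -> list:
    headers = {}
    api_key = os.getenv("S2_API_KEY")  # matches api_key_env in the TOML above
    if api_key:
        headers["x-api-key"] = api_key
    params = {
        "query": query,
        "limit": max_results,
        "fields": "title,abstract,tldr,year,authors",  # abstracts + TLDRs, as configured
    }
    for attempt in range(max_retries):
        resp = requests.get("https://api.semanticscholar.org/graph/v1/paper/search",
                            headers=headers, params=params, timeout=30)
        if resp.status_code == 429:  # rate limited, e.g. when no API key is set
            time.sleep(retry_backoff_factor * 2 ** attempt)
            continue
        resp.raise_for_status()
        return resp.json().get("data", [])
    return []
```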
local_deep_research/search_system.py
CHANGED
@@ -216,11 +216,12 @@ class AdvancedSearchSystem:
             }
         )
 
-        if settings.general.knowledge_accumulation != KnowledgeAccumulationApproach.NO_KNOWLEDGE:
+        if settings.general.knowledge_accumulation != str(KnowledgeAccumulationApproach.NO_KNOWLEDGE.value):
             current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links
 
-
-        if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.QUESTION:
+        logger.info(settings.general.knowledge_accumulation)
+        if settings.general.knowledge_accumulation == str(KnowledgeAccumulationApproach.QUESTION.value):
+            logger.info("Compressing knowledge")
             self._update_progress(f"Compress Knowledge for: {question}",
                                   int(question_progress_base + 0),
                                   {"phase": "analysis"})
@@ -240,10 +241,14 @@ class AdvancedSearchSystem:
             self._update_progress(f"Compressing knowledge after iteration {iteration}",
                                   int((iteration / total_iterations) * 100 - 5),
                                   {"phase": "knowledge_compression"})
-
-
+            logger.info(str(iteration))
+            logger.info(settings.general.knowledge_accumulation)
+            logger.info(str(KnowledgeAccumulationApproach.ITERATION.value))
+            if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION.value:
                 try:
+                    logger.info("ITERATION - Compressing Knowledge")
                     current_knowledge = self._compress_knowledge(current_knowledge , query, section_links)
+                    logger.info("FINISHED ITERATION - Compressing Knowledge")
                 except Exception as e:
                     error_msg = f"Error compressing knowledge: {str(e)}"
                     print(f"COMPRESSION ERROR: {error_msg}")
@@ -278,6 +283,7 @@ class AdvancedSearchSystem:
         }
 
     def _save_findings(self, findings: List[Dict], current_knowledge: str, query: str):
+        logger.info("Saving findings ...")
         self._update_progress("Saving research findings...", None)
 
         formatted_findings = format_findings_to_text(
@@ -287,15 +293,15 @@
             :50
         ]
         safe_query = safe_query.replace(" ", "_").lower()
-
-        output_dir = "research_outputs"
+        import local_deep_research.config as conf
+        output_dir = f"{conf.get_config_dir()}/research_outputs"
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
-
+
         filename = os.path.join(output_dir, f"formatted_output_{safe_query}.txt")
 
         with open(filename, "w", encoding="utf-8") as text_file:
             text_file.write(formatted_findings)
-
+        logger.info("Saved findings")
         self._update_progress("Research findings saved", None, {"filename": filename})
         return formatted_findings
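`get_config_dir()` is referenced here but its implementation is not part of this diff. A minimal, hypothetical sketch of such a helper, assuming a `platformdirs`-style per-user directory (an assumption for illustration, not the package's actual code):

```python
# Hypothetical sketch only; the real get_config_dir() in config.py is not
# shown in this diff, and platformdirs is an assumed dependency.
from pathlib import Path
from platformdirs import user_config_dir

def get_config_dir() -> Path:
    # e.g. ~/.config/local_deep_research on Linux
    return Path(user_config_dir("local_deep_research"))
```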
local_deep_research/utilties/enums.py
CHANGED
@@ -2,8 +2,8 @@
 from enum import Enum, auto
 
 class KnowledgeAccumulationApproach(Enum):
-    QUESTION =
-    ITERATION =
-    NO_KNOWLEDGE =
-    MAX_NR_OF_CHARACTERS =
+    QUESTION = "QUESTION"
+    ITERATION = "ITERATION"
+    NO_KNOWLEDGE = "NO_KNOWLEDGE"
+    MAX_NR_OF_CHARACTERS = "MAX_NR_OF_CHARACTERS"
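Because the enum members are now plain strings, search_system.py above compares them via `str(...value)`. A common alternative (not what this package does, per the diff) is a `str`-mixin enum, which makes direct comparison against strings work without any wrapping:

```python
from enum import Enum

class KnowledgeAccumulationApproach(str, Enum):
    QUESTION = "QUESTION"
    ITERATION = "ITERATION"
    NO_KNOWLEDGE = "NO_KNOWLEDGE"
    MAX_NR_OF_CHARACTERS = "MAX_NR_OF_CHARACTERS"

# Members are also str instances, so no .value / str() conversion is needed:
assert KnowledgeAccumulationApproach.QUESTION == "QUESTION"
assert KnowledgeAccumulationApproach.QUESTION.value == "QUESTION"
```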
local_deep_research/web/app.py
CHANGED
@@ -1009,14 +1009,15 @@ def run_research_process(research_id, query, mode):
 
                 # Check if formatted_findings exists in results
                 if 'formatted_findings' not in results:
-
+                    logger.info("WARNING: 'formatted_findings' not found in results, using fallback")
                     # Create fallback formatted findings from available data
                     raw_formatted_findings = "# Research Findings\n\n"
+                    raw_formatted_findings = raw_formatted_findings + str(results.get('current_knowledge'))
                     for i, finding in enumerate(results.get('findings', [])):
                         raw_formatted_findings += f"## Finding {i+1}\n\n{finding.get('content', '')}\n\n"
                 else:
                     raw_formatted_findings = results['formatted_findings']
-
+                    logger.info(f"Found formatted_findings of length: {len(str(raw_formatted_findings))}")
 
                 try:
                     # ADDED CODE: Convert debug output to clean markdown
local_deep_research/web_search_engines/engines/search_engine_arxiv.py
CHANGED
@@ -32,11 +32,9 @@ class ArXivSearchEngine(BaseSearchEngine):
             llm: Language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
         """
-        # Initialize the BaseSearchEngine with
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        #max_results = min(max_results, 20) # required for arxiv
-        self.max_results = 20 # TODO this needs to be corrected.
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+        self.max_results=max(self.max_results,25)
         self.sort_by = sort_by
         self.sort_order = sort_order
         self.include_full_text = include_full_text
local_deep_research/web_search_engines/engines/search_engine_brave.py
CHANGED
@@ -19,7 +19,7 @@ class BraveSearchEngine(BaseSearchEngine):
                  api_key: Optional[str] = None,
                  language_code_mapping: Optional[Dict[str, str]] = None,
                  llm: Optional[BaseLLM] = None,
-                 include_full_content: bool =
+                 include_full_content: bool = True,
                  max_filtered_results: Optional[int] = None,
                  **kwargs):
         """
@@ -38,10 +38,8 @@ class BraveSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum number of results to keep after filtering
             **kwargs: Additional parameters (ignored but accepted for compatibility)
         """
-        # Initialize the BaseSearchEngine with
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.include_full_content = include_full_content
 
         # Set up language code mapping
local_deep_research/web_search_engines/engines/search_engine_ddg.py
CHANGED
@@ -14,7 +14,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
                  safe_search: bool = True,
                  llm: Optional[BaseLLM] = None,
                  language: str = "English",
-                 include_full_content: bool = False
+                 include_full_content: bool = False,
+                 max_filtered_results=5):
         """
         Initialize the DuckDuckGo search engine.
 
@@ -26,8 +27,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
             language: Language for content processing
             include_full_content: Whether to include full webpage content in results
         """
-
-
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.region = region
         self.safe_search = safe_search
         self.language = language
local_deep_research/web_search_engines/engines/search_engine_github.py
CHANGED
@@ -39,10 +39,8 @@ class GitHubSearchEngine(BaseSearchEngine):
             llm: Language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
         """
-        # Initialize the BaseSearchEngine with
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.api_key = api_key or os.getenv("GITHUB_API_KEY")
         self.search_type = search_type
         self.include_readme = include_readme
local_deep_research/web_search_engines/engines/search_engine_google_pse.py
CHANGED
@@ -47,10 +47,8 @@ class GooglePSESearchEngine(BaseSearchEngine):
             retry_delay: Base delay in seconds between retry attempts
             **kwargs: Additional parameters (ignored but accepted for compatibility)
         """
-        # Initialize the BaseSearchEngine with
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.include_full_content = include_full_content
 
         # Retry configuration