local-deep-research 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (26)
  1. local_deep_research/config.py +8 -8
  2. local_deep_research/defaults/search_engines.toml +39 -18
  3. local_deep_research/search_system.py +16 -10
  4. local_deep_research/utilties/enums.py +4 -4
  5. local_deep_research/web/app.py +6 -21
  6. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
  7. local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
  8. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
  9. local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
  10. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
  11. local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
  12. local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
  13. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
  14. local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
  15. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
  16. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
  17. local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
  18. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
  19. local_deep_research/web_search_engines/search_engine_base.py +12 -4
  20. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/METADATA +1 -1
  21. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/RECORD +25 -25
  22. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
  23. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/WHEEL +0 -0
  24. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/entry_points.txt +0 -0
  25. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/licenses/LICENSE +0 -0
  26. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.15.dist-info}/top_level.txt +0 -0
@@ -91,15 +91,15 @@ def get_search(search_tool=None):
  params = {
      "search_tool": tool,
      "llm_instance": get_llm(),
-     "max_results": settings.get("max_results"),
-     "region": settings.get("region"),
-     "time_period": settings.get("time_period"),
-     "safe_search": settings.get("safe_search"),
-     "search_snippets_only": settings.get("snippets_only"),
-     "search_language": settings.get("search_language"),
-     "max_filtered_results": settings.get("max_filtered_results")
+     "max_results": settings.search.max_results,
+     "region": settings.search.region,
+     "time_period": settings.search.time_period,
+     "safe_search": settings.search.safe_search,
+     "search_snippets_only": settings.search.snippets_only,
+     "search_language": settings.search.search_language,
+     "max_filtered_results": settings.search.max_filtered_results
  }
-
+ logger.info(f"Search config params: {params}")
  # Create and return search engine
  return factory_get_search(**params)

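Worth noting for the get_search() hunk above: the settings lookups move from flat dict-style `.get()` calls to attribute access on a nested `search` section, and the assembled params are now logged. A minimal sketch of the behavioural difference, using a stand-in namespace object rather than the project's real settings loader (which is not part of this diff):

```python
from types import SimpleNamespace

# Stand-in for a nested settings object with a [search] section
# (hypothetical values; the real settings loader is not shown in this diff).
settings = SimpleNamespace(search=SimpleNamespace(max_results=10, region="us"))

print(settings.search.max_results)   # 10 - a typo here raises AttributeError

flat = {"max_results": 10, "region": "us"}
print(flat.get("max_reslts"))        # None - dict .get() hides the same typo silently
```

Attribute access fails loudly on missing or misspelled keys, whereas the old `.get()` calls would quietly pass `None` into the search engine factory.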
@@ -186,9 +186,48 @@ strengths = ["searches all local collections", "personal documents", "offline ac
  weaknesses = ["may return too many results", "requires indexing"]
  requires_llm = true

+ [semantic_scholar]
+ module_path = "local_deep_research.web_search_engines.engines.search_engine_semantic_scholar"
+ class_name = "SemanticScholarSearchEngine"
+ requires_api_key = false
+ api_key_env = "S2_API_KEY"
+ reliability = 0.95
+ strengths = [
+     "comprehensive scientific literature",
+     "extensive citation network",
+     "AI-generated summaries (TLDRs)",
+     "academic paper metadata",
+     "cross-disciplinary coverage",
+     "200M+ papers across all fields",
+     "usable without API key"
+ ]
+ weaknesses = [
+     "rate limited (1000 requests/day) without API key",
+     "limited to academic content"
+ ]
+ supports_full_search = true
+ requires_llm = false
+
+ [semantic_scholar.default_params]
+ max_results = 20
+ get_abstracts = true
+ get_tldr = true
+ get_references = false
+ get_citations = false
+ get_embeddings = false
+ citation_limit = 10
+ reference_limit = 10
+ optimize_queries = true
+ max_retries = 5
+ retry_backoff_factor = 1.0
+
  # Default search engine to use if none specified
  DEFAULT_SEARCH_ENGINE = "wikipedia"

+
+
+
+
  # Additional search engines can be added below
  # Uncomment and modify these templates as needed

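The new [semantic_scholar] entry follows the same module_path/class_name convention as the other engines in this TOML file. As an illustrative sketch only, this is how such a pair could be resolved into an engine class; the project's real factory (factory_get_search) is not shown in this diff:

```python
import importlib

# Illustrative sketch: resolve a module_path/class_name pair like the
# [semantic_scholar] entry above into an engine class.
def load_engine_class(module_path: str, class_name: str):
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# Example call with the values from the new TOML section (left commented out,
# since it requires the package to be installed):
# engine_cls = load_engine_class(
#     "local_deep_research.web_search_engines.engines.search_engine_semantic_scholar",
#     "SemanticScholarSearchEngine",
# )
```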
@@ -224,21 +263,3 @@ DEFAULT_SEARCH_ENGINE = "wikipedia"
  #
  # [guardian.default_params]
  # order_by = "relevance"
-
- # [medrxiv]
- # module_path = "local_deep_research.web_search_engines.engines.search_engine_medrxiv"
- # class_name = "MedRxivSearchEngine"
- # requires_api_key = false
- # reliability = 0.85
- # strengths = [
- #     "medical preprints", "health research", "covid-19 research",
- #     "clinical studies", "medical sciences", "preliminary results"
- # ]
- # weaknesses = ["not peer-reviewed", "preliminary findings", "limited to medical research"]
- # requires_llm = true
- #
- # [medrxiv.default_params]
- # sort_by = "relevance_score"
- # sort_order = "desc"
- # include_full_text = false
- # optimize_queries = true
@@ -216,11 +216,12 @@ class AdvancedSearchSystem:
          }
      )

  if settings.general.knowledge_accumulation != KnowledgeAccumulationApproach.NO_KNOWLEDGE:
+ if settings.general.knowledge_accumulation != str(KnowledgeAccumulationApproach.NO_KNOWLEDGE.value):
      current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links

- print(current_knowledge)
- if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.QUESTION:
+ logger.info(settings.general.knowledge_accumulation)
+ if settings.general.knowledge_accumulation == str(KnowledgeAccumulationApproach.QUESTION.value):
+     logger.info("Compressing knowledge")
      self._update_progress(f"Compress Knowledge for: {question}",
                            int(question_progress_base + 0),
                            {"phase": "analysis"})
@@ -240,10 +241,14 @@ class AdvancedSearchSystem:
  self._update_progress(f"Compressing knowledge after iteration {iteration}",
                        int((iteration / total_iterations) * 100 - 5),
                        {"phase": "knowledge_compression"})
-
- if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION:
+ logger.info(str(iteration))
+ logger.info(settings.general.knowledge_accumulation)
+ logger.info(str(KnowledgeAccumulationApproach.ITERATION.value))
+ if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION.value:
      try:
+         logger.info("ITERATION - Compressing Knowledge")
          current_knowledge = self._compress_knowledge(current_knowledge , query, section_links)
+         logger.info("FINISHED ITERATION - Compressing Knowledge")
      except Exception as e:
          error_msg = f"Error compressing knowledge: {str(e)}"
          print(f"COMPRESSION ERROR: {error_msg}")
@@ -273,11 +278,12 @@ class AdvancedSearchSystem:
          "findings": findings,
          "iterations": iteration,
          "questions": self.questions_by_iteration,
-         "formatted_findings": formatted_findings if 'formatted_findings' in locals() else "Error: Findings not available.",
+         "formatted_findings": formatted_findings,
          "current_knowledge": current_knowledge
      }

  def _save_findings(self, findings: List[Dict], current_knowledge: str, query: str):
+     logger.info("Saving findings ...")
      self._update_progress("Saving research findings...", None)

      formatted_findings = format_findings_to_text(
@@ -287,15 +293,15 @@ class AdvancedSearchSystem:
          :50
      ]
      safe_query = safe_query.replace(" ", "_").lower()
-
-     output_dir = "research_outputs"
+     import local_deep_research.config as conf
+     output_dir = f"{conf.get_config_dir()}/research_outputs"
      if not os.path.exists(output_dir):
          os.makedirs(output_dir)
-
+
      filename = os.path.join(output_dir, f"formatted_output_{safe_query}.txt")

      with open(filename, "w", encoding="utf-8") as text_file:
          text_file.write(formatted_findings)
-
+     logger.info("Saved findings")
      self._update_progress("Research findings saved", None, {"filename": filename})
      return formatted_findings
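The _save_findings hunk stops writing into a relative research_outputs folder under the current working directory and anchors the output under the package's config directory instead. A small sketch of the resulting path handling, with an assumed location standing in for conf.get_config_dir(), whose implementation is not part of this diff:

```python
import os

# Sketch only: conf.get_config_dir() lives in local_deep_research.config and is
# not shown in this hunk; the path below is an assumed example location.
def get_config_dir() -> str:
    return os.path.expanduser("~/.config/local_deep_research")

output_dir = f"{get_config_dir()}/research_outputs"
os.makedirs(output_dir, exist_ok=True)  # one-call equivalent of the exists()/makedirs() pair
```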
@@ -2,8 +2,8 @@
  from enum import Enum, auto

  class KnowledgeAccumulationApproach(Enum):
-     QUESTION = auto()
-     ITERATION = auto()
-     NO_KNOWLEDGE = auto()
-     MAX_NR_OF_CHARACTERS = auto()
+     QUESTION = "QUESTION"
+     ITERATION = "ITERATION"
+     NO_KNOWLEDGE = "NO_KNOWLEDGE"
+     MAX_NR_OF_CHARACTERS = "MAX_NR_OF_CHARACTERS"

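The switch from auto() to string values goes hand in hand with the comparison changes in search_system.py earlier in this diff: a setting that arrives as plain text never equals an Enum member directly, only its .value. A minimal sketch of the distinction (OldApproach/NewApproach are hypothetical names used only for illustration):

```python
from enum import Enum, auto

class OldApproach(Enum):          # mirrors the 0.1.13 definition
    ITERATION = auto()            # value is the integer 1

class NewApproach(Enum):          # mirrors the 0.1.15 definition
    ITERATION = "ITERATION"       # value is the string itself

stored = "ITERATION"              # e.g. a setting loaded as plain text

print(stored == OldApproach.ITERATION)        # False
print(stored == NewApproach.ITERATION)        # False - still an Enum member, not a str
print(stored == NewApproach.ITERATION.value)  # True  - matches the .value comparisons above
```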
@@ -1001,26 +1001,14 @@ def run_research_process(research_id, query, mode):
  if mode == 'quick':
      # Quick Summary
      if results.get('findings'):
-         #initial_analysis = [finding['content'] for finding in results['findings']]
-         summary = ""
-
-         # Safer access to formatted_findings with logging
-         print(f"Results keys: {list(results.keys())}")
-
-         # Check if formatted_findings exists in results
-         if 'formatted_findings' not in results:
-             print("WARNING: 'formatted_findings' not found in results, using fallback")
-             # Create fallback formatted findings from available data
-             raw_formatted_findings = "# Research Findings\n\n"
-             for i, finding in enumerate(results.get('findings', [])):
-                 raw_formatted_findings += f"## Finding {i+1}\n\n{finding.get('content', '')}\n\n"
-         else:
-             raw_formatted_findings = results['formatted_findings']
-             print(f"Found formatted_findings of length: {len(str(raw_formatted_findings))}")
+
+         raw_formatted_findings = results['formatted_findings']
+         logger.info(f"Found formatted_findings of length: {len(str(raw_formatted_findings))}")

          try:
+             clean_markdown = raw_formatted_findings
              # ADDED CODE: Convert debug output to clean markdown
-             clean_markdown = convert_debug_to_markdown(raw_formatted_findings, query)
+             #clean_markdown = convert_debug_to_markdown(raw_formatted_findings, query)
              print(f"Successfully converted to clean markdown of length: {len(clean_markdown)}")

              # First send a progress update for generating the summary
@@ -1692,10 +1680,7 @@ def convert_debug_to_markdown(raw_text, query):
  lines_after = len(content.split("\n"))
  print(f"Removed {lines_before - lines_after} divider lines")

- # If COMPLETE RESEARCH OUTPUT exists, remove that section
- if "COMPLETE RESEARCH OUTPUT" in content:
-     print("Found and removing COMPLETE RESEARCH OUTPUT section")
-     content = content.split("COMPLETE RESEARCH OUTPUT")[0].strip()
+

  # Remove SEARCH QUESTIONS BY ITERATION section
  if "SEARCH QUESTIONS BY ITERATION:" in content:
@@ -32,11 +32,9 @@ class ArXivSearchEngine(BaseSearchEngine):
      llm: Language model for relevance filtering
      max_filtered_results: Maximum number of results to keep after filtering
  """
- # Initialize the BaseSearchEngine with the LLM and max_filtered_results
- super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
- #max_results = min(max_results, 20) # required for arxiv
- self.max_results = 20 # TODO this needs to be corrected.
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+ self.max_results=max(self.max_results,25)
  self.sort_by = sort_by
  self.sort_order = sort_order
  self.include_full_text = include_full_text
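Note the direction of the change in the ArXiv hunk: the removed code pinned max_results to a hard-coded 20 (the commented-out min() call hints at a former cap), while the new max() call enforces a floor of 25 and lets larger requests pass through unchanged:

```python
# max() raises small values to 25; it does not cap large ones.
print(max(5, 25))    # 25
print(max(100, 25))  # 100
```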
@@ -19,7 +19,7 @@ class BraveSearchEngine(BaseSearchEngine):
  api_key: Optional[str] = None,
  language_code_mapping: Optional[Dict[str, str]] = None,
  llm: Optional[BaseLLM] = None,
- include_full_content: bool = False,
+ include_full_content: bool = True,
  max_filtered_results: Optional[int] = None,
  **kwargs):
  """
@@ -38,10 +38,8 @@ class BraveSearchEngine(BaseSearchEngine):
  max_filtered_results: Maximum number of results to keep after filtering
  **kwargs: Additional parameters (ignored but accepted for compatibility)
  """
- # Initialize the BaseSearchEngine with the LLM and max_filtered_results
- super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
- self.max_results = max_results
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
  self.include_full_content = include_full_content

  # Set up language code mapping
@@ -14,7 +14,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
  safe_search: bool = True,
  llm: Optional[BaseLLM] = None,
  language: str = "English",
- include_full_content: bool = False):
+ include_full_content: bool = False,
+ max_filtered_results=5):
  """
  Initialize the DuckDuckGo search engine.

@@ -26,8 +27,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
  language: Language for content processing
  include_full_content: Whether to include full webpage content in results
  """
- super().__init__(llm=llm)
- self.max_results = max_results
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
  self.region = region
  self.safe_search = safe_search
  self.language = language
@@ -39,10 +39,8 @@ class GitHubSearchEngine(BaseSearchEngine):
  llm: Language model for relevance filtering
  max_filtered_results: Maximum number of results to keep after filtering
  """
- # Initialize the BaseSearchEngine with the LLM and max_filtered_results
- super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
- self.max_results = max_results
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
  self.api_key = api_key or os.getenv("GITHUB_API_KEY")
  self.search_type = search_type
  self.include_readme = include_readme
@@ -47,10 +47,8 @@ class GooglePSESearchEngine(BaseSearchEngine):
  retry_delay: Base delay in seconds between retry attempts
  **kwargs: Additional parameters (ignored but accepted for compatibility)
  """
- # Initialize the BaseSearchEngine with the LLM and max_filtered_results
- super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
- self.max_results = max_results
+ # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+ super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
  self.include_full_content = include_full_content

  # Retry configuration
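The same constructor pattern recurs across the Brave, DuckDuckGo, GitHub and Google PSE hunks above: each engine now forwards max_results to BaseSearchEngine instead of assigning it locally. The corresponding search_engine_base.py change (+12 -4) is not included in this diff, so the following is only a plausible sketch of the shared constructor, with assumed parameter names and defaults:

```python
from typing import Optional

class BaseSearchEngine:
    def __init__(self, llm=None, max_filtered_results: Optional[int] = None,
                 max_results: int = 10, **kwargs):
        # Shared attributes each engine previously set by hand.
        self.llm = llm
        self.max_filtered_results = max_filtered_results
        self.max_results = max_results


class ExampleEngine(BaseSearchEngine):
    def __init__(self, max_results: int = 10, llm=None,
                 max_filtered_results: Optional[int] = None, **kwargs):
        # Engines now forward max_results instead of assigning self.max_results themselves.
        super().__init__(llm=llm, max_filtered_results=max_filtered_results,
                         max_results=max_results)
```

Centralising the assignment keeps the per-engine constructors to their engine-specific options and makes defaults consistent across engines.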