local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. local_deep_research/config.py +8 -8
  2. local_deep_research/defaults/search_engines.toml +39 -18
  3. local_deep_research/search_system.py +15 -9
  4. local_deep_research/utilties/enums.py +4 -4
  5. local_deep_research/web/app.py +3 -2
  6. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
  7. local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
  8. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
  9. local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
  10. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
  11. local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
  12. local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
  13. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
  14. local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
  15. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
  16. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
  17. local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
  18. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
  19. local_deep_research/web_search_engines/search_engine_base.py +12 -4
  20. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
  21. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
  22. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
  23. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
  24. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
  25. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
  26. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
local_deep_research/config.py
@@ -91,15 +91,15 @@ def get_search(search_tool=None):
     params = {
         "search_tool": tool,
         "llm_instance": get_llm(),
-        "max_results": settings.get("max_results"),
-        "region": settings.get("region"),
-        "time_period": settings.get("time_period"),
-        "safe_search": settings.get("safe_search"),
-        "search_snippets_only": settings.get("snippets_only"),
-        "search_language": settings.get("search_language"),
-        "max_filtered_results": settings.get("max_filtered_results")
+        "max_results": settings.search.max_results,
+        "region": settings.search.region,
+        "time_period": settings.search.time_period,
+        "safe_search": settings.search.safe_search,
+        "search_snippets_only": settings.search.snippets_only,
+        "search_language": settings.search.search_language,
+        "max_filtered_results": settings.search.max_filtered_results
     }
-
+    logger.info(f"Search config params: {params}")
 
     # Create and return search engine
     return factory_get_search(**params)
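The switch from `settings.get(...)` to `settings.search.<key>` matters because these keys live under a `[search]` section of the settings file: a flat `get` on the top-level object returns `None` for nested keys, silently blanking every search option. A minimal sketch of the difference, using a plain namespace as a stand-in for the real settings object (an assumption; the actual loader is not shown in this diff):

```python
from types import SimpleNamespace

# Stand-in for the loaded settings; the real object exposes TOML
# sections as attributes (assumed here).
settings = SimpleNamespace(
    search=SimpleNamespace(max_results=10, region="us", safe_search=True)
)

# Old pattern: a flat lookup misses keys nested under [search].
flat = {"search": {"max_results": 10}}
print(flat.get("max_results"))      # None -> option silently dropped

# New pattern: attribute access into the [search] section.
print(settings.search.max_results)  # 10
```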
local_deep_research/defaults/search_engines.toml
@@ -186,9 +186,48 @@ strengths = ["searches all local collections", "personal documents", "offline ac
 weaknesses = ["may return too many results", "requires indexing"]
 requires_llm = true
 
+[semantic_scholar]
+module_path = "local_deep_research.web_search_engines.engines.search_engine_semantic_scholar"
+class_name = "SemanticScholarSearchEngine"
+requires_api_key = false
+api_key_env = "S2_API_KEY"
+reliability = 0.95
+strengths = [
+    "comprehensive scientific literature",
+    "extensive citation network",
+    "AI-generated summaries (TLDRs)",
+    "academic paper metadata",
+    "cross-disciplinary coverage",
+    "200M+ papers across all fields",
+    "usable without API key"
+]
+weaknesses = [
+    "rate limited (1000 requests/day) without API key",
+    "limited to academic content"
+]
+supports_full_search = true
+requires_llm = false
+
+[semantic_scholar.default_params]
+max_results = 20
+get_abstracts = true
+get_tldr = true
+get_references = false
+get_citations = false
+get_embeddings = false
+citation_limit = 10
+reference_limit = 10
+optimize_queries = true
+max_retries = 5
+retry_backoff_factor = 1.0
+
 # Default search engine to use if none specified
 DEFAULT_SEARCH_ENGINE = "wikipedia"
 
+
+
+
+
 # Additional search engines can be added below
 # Uncomment and modify these templates as needed
@@ -224,21 +263,3 @@ DEFAULT_SEARCH_ENGINE = "wikipedia"
 #
 # [guardian.default_params]
 # order_by = "relevance"
-
-# [medrxiv]
-# module_path = "local_deep_research.web_search_engines.engines.search_engine_medrxiv"
-# class_name = "MedRxivSearchEngine"
-# requires_api_key = false
-# reliability = 0.85
-# strengths = [
-#     "medical preprints", "health research", "covid-19 research",
-#     "clinical studies", "medical sciences", "preliminary results"
-# ]
-# weaknesses = ["not peer-reviewed", "preliminary findings", "limited to medical research"]
-# requires_llm = true
-#
-# [medrxiv.default_params]
-# sort_by = "relevance_score"
-# sort_order = "desc"
-# include_full_text = false
-# optimize_queries = true
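For context, a registry entry like `[semantic_scholar]` above is typically consumed by importing `module_path`, looking up `class_name`, and instantiating it with `default_params`. A minimal sketch of that pattern (hypothetical loader; the actual factory in `local_deep_research.web_search_engines` is not part of this diff and also injects an LLM instance and user overrides):

```python
import importlib
import tomllib  # Python 3.11+

with open("search_engines.toml", "rb") as f:
    registry = tomllib.load(f)

entry = registry["semantic_scholar"]

# Resolve the engine class from its dotted module path and class name.
module = importlib.import_module(entry["module_path"])
engine_cls = getattr(module, entry["class_name"])

# Instantiate with the declared defaults (hypothetical call signature).
engine = engine_cls(**entry.get("default_params", {}))
```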
local_deep_research/search_system.py
@@ -216,11 +216,12 @@ class AdvancedSearchSystem:
                     }
                 )
 
-                if settings.general.knowledge_accumulation != KnowledgeAccumulationApproach.NO_KNOWLEDGE:
+                if settings.general.knowledge_accumulation != str(KnowledgeAccumulationApproach.NO_KNOWLEDGE.value):
                     current_knowledge = current_knowledge + "\n\n\n New: \n" + results_with_links
 
-                print(current_knowledge)
-                if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.QUESTION:
+                logger.info(settings.general.knowledge_accumulation)
+                if settings.general.knowledge_accumulation == str(KnowledgeAccumulationApproach.QUESTION.value):
+                    logger.info("Compressing knowledge")
                     self._update_progress(f"Compress Knowledge for: {question}",
                                           int(question_progress_base + 0),
                                           {"phase": "analysis"})
@@ -240,10 +241,14 @@ class AdvancedSearchSystem:
             self._update_progress(f"Compressing knowledge after iteration {iteration}",
                                   int((iteration / total_iterations) * 100 - 5),
                                   {"phase": "knowledge_compression"})
-
-            if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION:
+            logger.info(str(iteration))
+            logger.info(settings.general.knowledge_accumulation)
+            logger.info(str(KnowledgeAccumulationApproach.ITERATION.value))
+            if settings.general.knowledge_accumulation == KnowledgeAccumulationApproach.ITERATION.value:
                 try:
+                    logger.info("ITERATION - Compressing Knowledge")
                     current_knowledge = self._compress_knowledge(current_knowledge , query, section_links)
+                    logger.info("FINISHED ITERATION - Compressing Knowledge")
                 except Exception as e:
                     error_msg = f"Error compressing knowledge: {str(e)}"
                     print(f"COMPRESSION ERROR: {error_msg}")
@@ -278,6 +283,7 @@ class AdvancedSearchSystem:
         }
 
     def _save_findings(self, findings: List[Dict], current_knowledge: str, query: str):
+        logger.info("Saving findings ...")
        self._update_progress("Saving research findings...", None)
 
         formatted_findings = format_findings_to_text(
@@ -287,15 +293,15 @@ class AdvancedSearchSystem:
             :50
         ]
         safe_query = safe_query.replace(" ", "_").lower()
-
-        output_dir = "research_outputs"
+        import local_deep_research.config as conf
+        output_dir = f"{conf.get_config_dir()}/research_outputs"
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
-
+
         filename = os.path.join(output_dir, f"formatted_output_{safe_query}.txt")
 
         with open(filename, "w", encoding="utf-8") as text_file:
             text_file.write(formatted_findings)
-
+        logger.info("Saved findings")
         self._update_progress("Research findings saved", None, {"filename": filename})
         return formatted_findings
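The `output_dir` change means findings now land under the user's config directory instead of a `research_outputs` folder relative to the current working directory. Assuming `conf.get_config_dir()` returns a filesystem path (as its use above implies), the same steps can be written more idiomatically with `os.path.join` and `exist_ok=True`:

```python
import os
import local_deep_research.config as conf

# Build the path with os.path.join rather than an f-string, and let
# makedirs tolerate an existing directory instead of checking first.
output_dir = os.path.join(conf.get_config_dir(), "research_outputs")
os.makedirs(output_dir, exist_ok=True)
```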
local_deep_research/utilties/enums.py
@@ -2,8 +2,8 @@
 from enum import Enum, auto
 
 class KnowledgeAccumulationApproach(Enum):
-    QUESTION = auto()
-    ITERATION = auto()
-    NO_KNOWLEDGE = auto()
-    MAX_NR_OF_CHARACTERS = auto()
+    QUESTION = "QUESTION"
+    ITERATION = "ITERATION"
+    NO_KNOWLEDGE = "NO_KNOWLEDGE"
+    MAX_NR_OF_CHARACTERS = "MAX_NR_OF_CHARACTERS"
 
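The move from `auto()` to explicit string values is what makes the comparisons in `search_system.py` above work: configuration values parsed from TOML arrive as plain strings, which never compare equal to the integers `auto()` assigns. A small self-contained demo of the before/after behavior:

```python
from enum import Enum, auto

class OldApproach(Enum):
    ITERATION = auto()       # value is the integer 1

class NewApproach(Enum):
    ITERATION = "ITERATION"  # value is the string itself

setting = "ITERATION"  # as it would arrive from a TOML settings file

print(setting == OldApproach.ITERATION.value)  # False (1 != "ITERATION")
print(setting == NewApproach.ITERATION.value)  # True
```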
local_deep_research/web/app.py
@@ -1009,14 +1009,15 @@ def run_research_process(research_id, query, mode):
 
         # Check if formatted_findings exists in results
         if 'formatted_findings' not in results:
-            print("WARNING: 'formatted_findings' not found in results, using fallback")
+            logger.info("WARNING: 'formatted_findings' not found in results, using fallback")
             # Create fallback formatted findings from available data
             raw_formatted_findings = "# Research Findings\n\n"
+            raw_formatted_findings = raw_formatted_findings + str(results.get('current_knowledge'))
             for i, finding in enumerate(results.get('findings', [])):
                 raw_formatted_findings += f"## Finding {i+1}\n\n{finding.get('content', '')}\n\n"
         else:
             raw_formatted_findings = results['formatted_findings']
-            print(f"Found formatted_findings of length: {len(str(raw_formatted_findings))}")
+            logger.info(f"Found formatted_findings of length: {len(str(raw_formatted_findings))}")
 
         try:
             # ADDED CODE: Convert debug output to clean markdown
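One subtlety in the new fallback line: `str(results.get('current_knowledge'))` embeds the literal text "None" in the report when the key is absent, since `str(None)` is `"None"`. A tiny demo of the edge case, with a guarded alternative:

```python
results = {}  # no 'current_knowledge' key

raw = "# Research Findings\n\n"
raw = raw + str(results.get('current_knowledge'))
print(raw)  # ends with the literal text "None"

# Guarded alternative that skips the missing value instead:
raw = "# Research Findings\n\n" + (results.get('current_knowledge') or "")
```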
local_deep_research/web_search_engines/engines/search_engine_arxiv.py
@@ -32,11 +32,9 @@ class ArXivSearchEngine(BaseSearchEngine):
             llm: Language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        #max_results = min(max_results, 20) # required for arxiv
-        self.max_results = 20 # TODO this needs to be corrected.
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
+        self.max_results=max(self.max_results,25)
         self.sort_by = sort_by
         self.sort_order = sort_order
         self.include_full_text = include_full_text
local_deep_research/web_search_engines/engines/search_engine_brave.py
@@ -19,7 +19,7 @@ class BraveSearchEngine(BaseSearchEngine):
                  api_key: Optional[str] = None,
                  language_code_mapping: Optional[Dict[str, str]] = None,
                  llm: Optional[BaseLLM] = None,
-                 include_full_content: bool = False,
+                 include_full_content: bool = True,
                  max_filtered_results: Optional[int] = None,
                  **kwargs):
         """
@@ -38,10 +38,8 @@ class BraveSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum number of results to keep after filtering
             **kwargs: Additional parameters (ignored but accepted for compatibility)
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.include_full_content = include_full_content
 
         # Set up language code mapping
local_deep_research/web_search_engines/engines/search_engine_ddg.py
@@ -14,7 +14,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
                  safe_search: bool = True,
                  llm: Optional[BaseLLM] = None,
                  language: str = "English",
-                 include_full_content: bool = False):
+                 include_full_content: bool = False,
+                 max_filtered_results=5):
         """
         Initialize the DuckDuckGo search engine.
 
@@ -26,8 +27,8 @@ class DuckDuckGoSearchEngine(BaseSearchEngine):
             language: Language for content processing
             include_full_content: Whether to include full webpage content in results
         """
-        super().__init__(llm=llm)
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.region = region
         self.safe_search = safe_search
         self.language = language
local_deep_research/web_search_engines/engines/search_engine_github.py
@@ -39,10 +39,8 @@ class GitHubSearchEngine(BaseSearchEngine):
             llm: Language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.api_key = api_key or os.getenv("GITHUB_API_KEY")
         self.search_type = search_type
         self.include_readme = include_readme
local_deep_research/web_search_engines/engines/search_engine_google_pse.py
@@ -47,10 +47,8 @@ class GooglePSESearchEngine(BaseSearchEngine):
             retry_delay: Base delay in seconds between retry attempts
             **kwargs: Additional parameters (ignored but accepted for compatibility)
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(llm=llm, max_filtered_results=max_filtered_results, max_results=max_results)
         self.include_full_content = include_full_content
 
         # Retry configuration
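All of these engine constructors now follow the same pattern: `max_results` is forwarded to `BaseSearchEngine.__init__` instead of being set separately on each subclass. The corresponding base-class change (`search_engine_base.py`, +12 -4) is not shown in this section, so the following is only a sketch of what the updated constructor presumably looks like; the default value and attribute names are assumptions:

```python
from typing import Optional

class BaseSearchEngine:
    """Sketch of the assumed updated base-class constructor."""

    def __init__(self,
                 llm=None,
                 max_filtered_results: Optional[int] = None,
                 max_results: int = 10,  # assumed default, not shown in this diff
                 **kwargs):
        self.llm = llm
        self.max_filtered_results = max_filtered_results
        self.max_results = max_results
```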