local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. local_deep_research/__init__.py +7 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
  4. local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
  5. local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
  6. local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
  7. local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
  8. local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
  9. local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
  10. local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
  11. local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
  12. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
  13. local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
  14. local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
  15. local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
  16. local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
  17. local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
  18. local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
  19. local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
  20. local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
  21. local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
  22. local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
  23. local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
  24. local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
  25. local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
  26. local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
  27. local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
  28. local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
  29. local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
  30. local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
  31. local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
  32. local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
  33. local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
  34. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
  35. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
  36. local_deep_research/advanced_search_system/findings/repository.py +54 -17
  37. local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
  38. local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
  39. local_deep_research/advanced_search_system/questions/__init__.py +16 -0
  40. local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
  41. local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
  42. local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
  43. local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
  44. local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
  45. local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
  46. local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
  47. local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
  48. local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
  49. local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
  50. local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
  51. local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
  52. local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
  53. local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
  54. local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
  55. local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
  56. local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
  57. local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
  58. local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
  59. local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
  60. local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
  61. local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
  62. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
  63. local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
  64. local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
  65. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
  66. local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
  67. local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
  68. local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
  69. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
  70. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
  71. local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
  72. local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
  73. local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
  74. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
  75. local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
  76. local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
  77. local_deep_research/api/benchmark_functions.py +6 -2
  78. local_deep_research/api/research_functions.py +10 -4
  79. local_deep_research/benchmarks/__init__.py +9 -7
  80. local_deep_research/benchmarks/benchmark_functions.py +6 -2
  81. local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
  82. local_deep_research/benchmarks/cli.py +38 -13
  83. local_deep_research/benchmarks/comparison/__init__.py +4 -2
  84. local_deep_research/benchmarks/comparison/evaluator.py +316 -239
  85. local_deep_research/benchmarks/datasets/__init__.py +1 -1
  86. local_deep_research/benchmarks/datasets/base.py +91 -72
  87. local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
  88. local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
  89. local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
  90. local_deep_research/benchmarks/datasets/utils.py +48 -29
  91. local_deep_research/benchmarks/datasets.py +4 -11
  92. local_deep_research/benchmarks/efficiency/__init__.py +8 -4
  93. local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
  94. local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
  95. local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
  96. local_deep_research/benchmarks/evaluators/composite.py +6 -2
  97. local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
  98. local_deep_research/benchmarks/graders.py +32 -10
  99. local_deep_research/benchmarks/metrics/README.md +1 -1
  100. local_deep_research/benchmarks/metrics/calculation.py +25 -10
  101. local_deep_research/benchmarks/metrics/reporting.py +7 -3
  102. local_deep_research/benchmarks/metrics/visualization.py +42 -23
  103. local_deep_research/benchmarks/metrics.py +1 -1
  104. local_deep_research/benchmarks/optimization/__init__.py +3 -1
  105. local_deep_research/benchmarks/optimization/api.py +7 -1
  106. local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
  107. local_deep_research/benchmarks/runners.py +48 -15
  108. local_deep_research/citation_handler.py +65 -92
  109. local_deep_research/citation_handlers/__init__.py +15 -0
  110. local_deep_research/citation_handlers/base_citation_handler.py +70 -0
  111. local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
  112. local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
  113. local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
  114. local_deep_research/config/llm_config.py +271 -169
  115. local_deep_research/config/search_config.py +14 -5
  116. local_deep_research/defaults/__init__.py +0 -1
  117. local_deep_research/metrics/__init__.py +13 -0
  118. local_deep_research/metrics/database.py +58 -0
  119. local_deep_research/metrics/db_models.py +115 -0
  120. local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
  121. local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
  122. local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
  123. local_deep_research/metrics/migrate_research_ratings.py +31 -0
  124. local_deep_research/metrics/models.py +61 -0
  125. local_deep_research/metrics/pricing/__init__.py +12 -0
  126. local_deep_research/metrics/pricing/cost_calculator.py +237 -0
  127. local_deep_research/metrics/pricing/pricing_cache.py +143 -0
  128. local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
  129. local_deep_research/metrics/query_utils.py +51 -0
  130. local_deep_research/metrics/search_tracker.py +380 -0
  131. local_deep_research/metrics/token_counter.py +1078 -0
  132. local_deep_research/migrate_db.py +3 -1
  133. local_deep_research/report_generator.py +22 -8
  134. local_deep_research/search_system.py +390 -9
  135. local_deep_research/test_migration.py +15 -5
  136. local_deep_research/utilities/db_utils.py +7 -4
  137. local_deep_research/utilities/es_utils.py +115 -104
  138. local_deep_research/utilities/llm_utils.py +15 -5
  139. local_deep_research/utilities/log_utils.py +151 -0
  140. local_deep_research/utilities/search_cache.py +387 -0
  141. local_deep_research/utilities/search_utilities.py +14 -6
  142. local_deep_research/utilities/threading_utils.py +92 -0
  143. local_deep_research/utilities/url_utils.py +6 -0
  144. local_deep_research/web/api.py +347 -0
  145. local_deep_research/web/app.py +13 -17
  146. local_deep_research/web/app_factory.py +71 -66
  147. local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
  148. local_deep_research/web/database/migrations.py +20 -3
  149. local_deep_research/web/database/models.py +74 -25
  150. local_deep_research/web/database/schema_upgrade.py +49 -29
  151. local_deep_research/web/models/database.py +63 -83
  152. local_deep_research/web/routes/api_routes.py +56 -22
  153. local_deep_research/web/routes/benchmark_routes.py +4 -1
  154. local_deep_research/web/routes/globals.py +22 -0
  155. local_deep_research/web/routes/history_routes.py +71 -46
  156. local_deep_research/web/routes/metrics_routes.py +1155 -0
  157. local_deep_research/web/routes/research_routes.py +192 -54
  158. local_deep_research/web/routes/settings_routes.py +156 -55
  159. local_deep_research/web/services/research_service.py +412 -251
  160. local_deep_research/web/services/resource_service.py +36 -11
  161. local_deep_research/web/services/settings_manager.py +55 -17
  162. local_deep_research/web/services/settings_service.py +12 -4
  163. local_deep_research/web/services/socket_service.py +295 -188
  164. local_deep_research/web/static/css/custom_dropdown.css +180 -0
  165. local_deep_research/web/static/css/styles.css +39 -1
  166. local_deep_research/web/static/js/components/detail.js +633 -267
  167. local_deep_research/web/static/js/components/details.js +751 -0
  168. local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
  169. local_deep_research/web/static/js/components/fallback/ui.js +23 -23
  170. local_deep_research/web/static/js/components/history.js +76 -76
  171. local_deep_research/web/static/js/components/logpanel.js +61 -13
  172. local_deep_research/web/static/js/components/progress.js +13 -2
  173. local_deep_research/web/static/js/components/research.js +99 -12
  174. local_deep_research/web/static/js/components/results.js +239 -106
  175. local_deep_research/web/static/js/main.js +40 -40
  176. local_deep_research/web/static/js/services/audio.js +1 -1
  177. local_deep_research/web/static/js/services/formatting.js +11 -11
  178. local_deep_research/web/static/js/services/keyboard.js +157 -0
  179. local_deep_research/web/static/js/services/pdf.js +80 -80
  180. local_deep_research/web/static/sounds/README.md +1 -1
  181. local_deep_research/web/templates/base.html +1 -0
  182. local_deep_research/web/templates/components/log_panel.html +7 -1
  183. local_deep_research/web/templates/components/mobile_nav.html +1 -1
  184. local_deep_research/web/templates/components/sidebar.html +3 -0
  185. local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
  186. local_deep_research/web/templates/pages/details.html +325 -24
  187. local_deep_research/web/templates/pages/history.html +1 -1
  188. local_deep_research/web/templates/pages/metrics.html +1929 -0
  189. local_deep_research/web/templates/pages/progress.html +2 -2
  190. local_deep_research/web/templates/pages/research.html +53 -17
  191. local_deep_research/web/templates/pages/results.html +12 -1
  192. local_deep_research/web/templates/pages/star_reviews.html +803 -0
  193. local_deep_research/web/utils/formatters.py +9 -3
  194. local_deep_research/web_search_engines/default_search_engines.py +5 -3
  195. local_deep_research/web_search_engines/engines/full_search.py +8 -2
  196. local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
  197. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
  198. local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
  199. local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
  200. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
  201. local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
  202. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
  203. local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
  204. local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
  205. local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
  206. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
  207. local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
  208. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
  209. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
  210. local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
  211. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
  212. local_deep_research/web_search_engines/search_engine_base.py +83 -35
  213. local_deep_research/web_search_engines/search_engine_factory.py +25 -8
  214. local_deep_research/web_search_engines/search_engines_config.py +9 -3
  215. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
  216. local_deep_research-0.5.2.dist-info/RECORD +265 -0
  217. local_deep_research-0.4.4.dist-info/RECORD +0 -177
  218. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
  219. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
  220. {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_local.py
@@ -56,7 +56,9 @@ def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
             return UnstructuredExcelLoader(str(file_path))
         else:
             # Try the text loader as a fallback for unknown extensions
-            logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
+            logger.warning(
+                f"Unknown file extension for {file_path}, trying TextLoader"
+            )
             return TextLoader(str(file_path), encoding="utf-8")
     except Exception:
         logger.exception(f"Error creating loader for {file_path}")
@@ -208,7 +210,9 @@ class LocalEmbeddingManager:
 
         # Check if vector store exists and is up to date
         if vector_store_path.exists() and not self._check_folders_modified():
-            logger.info(f"Loading existing vector store from {vector_store_path}")
+            logger.info(
+                f"Loading existing vector store from {vector_store_path}"
+            )
             try:
                 vector_store = FAISS.load_local(
                     str(vector_store_path),
@@ -306,9 +310,13 @@ class LocalEmbeddingManager:
             last_indexed = 0
             indexed_files = set()
         else:
-            last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)
+            last_indexed = self.indexed_folders[folder_hash].get(
+                "last_indexed", 0
+            )
             indexed_files = (
-                self.indexed_folders[folder_hash].get("indexed_files", {}).keys()
+                self.indexed_folders[folder_hash]
+                .get("indexed_files", {})
+                .keys()
             )
 
         # Check if any file in the folder has been modified since last indexing
@@ -345,11 +353,15 @@ class LocalEmbeddingManager:
             self.chunk_overlap,
             self.embedding_model,
         ):
-            logger.info("Embedding configuration has changed, re-indexing folder.")
+            logger.info(
+                "Embedding configuration has changed, re-indexing folder."
+            )
             return True
         return False
 
-    def index_folder(self, folder_path: str, force_reindex: bool = False) -> bool:
+    def index_folder(
+        self, folder_path: str, force_reindex: bool = False
+    ) -> bool:
         """
         Index all documents in a folder for vector search.
 
@@ -434,7 +446,9 @@ class LocalEmbeddingManager:
         # Split documents into chunks
         logger.info(f"Splitting {len(all_docs)} documents into chunks")
         splits = self.text_splitter.split_documents(all_docs)
-        logger.info(f"Created {len(splits)} chunks from {len(modified_files)} files")
+        logger.info(
+            f"Created {len(splits)} chunks from {len(modified_files)} files"
+        )
 
         # Create vector store
         ids = []
@@ -448,10 +462,14 @@ class LocalEmbeddingManager:
         indexed_files = {}
         if folder_hash in self.indexed_folders:
             indexed_files = (
-                self.indexed_folders[folder_hash].get("indexed_files", {}).copy()
+                self.indexed_folders[folder_hash]
+                .get("indexed_files", {})
+                .copy()
             )
         for split_id, split in zip(ids, splits):
-            split_source = str(Path(split.metadata["source"]).relative_to(folder_path))
+            split_source = str(
+                Path(split.metadata["source"]).relative_to(folder_path)
+            )
             id_list = indexed_files.setdefault(split_source, [])
             id_list.append(split_id)
 
@@ -465,7 +483,8 @@ class LocalEmbeddingManager:
                 delete_paths.append(relative_path)
         if delete_ids:
             logger.info(
-                f"Deleting {len(delete_paths)} non-existent files from the " f"index."
+                f"Deleting {len(delete_paths)} non-existent files from the "
+                f"index."
             )
             self.vector_stores[folder_hash].delete(delete_ids)
             for path in delete_paths:
@@ -543,7 +562,9 @@ class LocalEmbeddingManager:
             if path.exists() and path.is_dir():
                 valid_folder_paths.append(path)
             else:
-                logger.warning(f"Skipping non-existent folder in search: {path}")
+                logger.warning(
+                    f"Skipping non-existent folder in search: {path}"
+                )
 
         # If no valid folders, return empty results
         if not valid_folder_paths:
@@ -578,8 +599,10 @@ class LocalEmbeddingManager:
             vector_store = self.vector_stores[folder_hash]
 
             try:
-                docs_with_scores = vector_store.similarity_search_with_relevance_scores(
-                    query, k=limit
+                docs_with_scores = (
+                    vector_store.similarity_search_with_relevance_scores(
+                        query, k=limit
+                    )
                 )
 
                 for doc, similarity in docs_with_scores:
@@ -685,7 +708,9 @@ class LocalSearchEngine(BaseSearchEngine):
             if os.path.exists(path) and os.path.isdir(path):
                 self.valid_folder_paths.append(path)
             else:
-                logger.warning(f"Folder not found or is not a directory: {path}")
+                logger.warning(
+                    f"Folder not found or is not a directory: {path}"
+                )
 
         # If no valid folders, log a clear message
         if not self.valid_folder_paths and paths:
@@ -768,7 +793,9 @@ class LocalSearchEngine(BaseSearchEngine):
                 if name in self.collections
             }
             if not collections_to_search:
-                logger.warning(f"No valid collections found among: {collection_names}")
+                logger.warning(
+                    f"No valid collections found among: {collection_names}"
+                )
                 return []
         else:
             # Search in all collections
@@ -811,13 +838,13 @@ class LocalSearchEngine(BaseSearchEngine):
         previews = []
         for i, result in enumerate(raw_results):
             # Create a unique ID
-            result_id = (
-                f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
-            )
+            result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
 
             # Extract filename and path
             source_path = result["metadata"].get("source", "Unknown")
-            filename = result["metadata"].get("filename", os.path.basename(source_path))
+            filename = result["metadata"].get(
+                "filename", os.path.basename(source_path)
+            )
 
             # Create preview snippet (first ~200 chars of content)
             snippet = (
@@ -845,10 +872,12 @@ class LocalSearchEngine(BaseSearchEngine):
                 "similarity": result["similarity"],
                 "folder": folder_path.as_posix(),
                 "collection": collection_name,
-                "collection_description": self.collections.get(collection_name, {}).get(
-                    "description", ""
-                ),
-                "_full_content": result["content"],  # Store full content for later
+                "collection_description": self.collections.get(
+                    collection_name, {}
+                ).get("description", ""),
+                "_full_content": result[
+                    "content"
+                ],  # Store full content for later
                 "_metadata": result["metadata"],  # Store metadata for later
             }
 
@@ -933,7 +962,9 @@ class LocalSearchEngine(BaseSearchEngine):
                 if collection_name in self.collections:
                     specified_collections.append(collection_name)
                     # Remove this part from the query
-                    remaining_query = remaining_query.replace(part, "", 1).strip()
+                    remaining_query = remaining_query.replace(
+                        part, "", 1
+                    ).strip()
 
         # If collections were specified in the query, they override the parameter
         if specified_collections:
@@ -1047,7 +1078,9 @@ class LocalSearchEngine(BaseSearchEngine):
         success = True
 
         for path in paths:
-            if not self.embedding_manager.index_folder(path, force_reindex=True):
+            if not self.embedding_manager.index_folder(
+                path, force_reindex=True
+            ):
                 success = False
 
         return success
local_deep_research/web_search_engines/engines/search_engine_local_all.py
@@ -37,7 +37,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
 
         # Find all local collection search engines
@@ -76,7 +78,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
         Returns:
             List of preview dictionaries
         """
-        logger.info(f"Searching across all local collections for query: {query}")
+        logger.info(
+            f"Searching across all local collections for query: {query}"
+        )
 
         all_previews = []
 
@@ -91,18 +95,24 @@ class LocalAllSearchEngine(BaseSearchEngine):
                 for preview in previews:
                     preview["collection_id"] = collection_id
                     preview["collection_name"] = engine_info["name"]
-                    preview["collection_description"] = engine_info["description"]
+                    preview["collection_description"] = engine_info[
+                        "description"
+                    ]
 
                 all_previews.extend(previews)
             except Exception:
-                logger.exception(f"Error searching collection '{collection_id}'")
+                logger.exception(
+                    f"Error searching collection '{collection_id}'"
+                )
 
         if not all_previews:
            logger.info(f"No local documents found for query: {query}")
            return []
 
         # Sort by similarity score if available
-        all_previews.sort(key=lambda x: float(x.get("similarity", 0)), reverse=True)
+        all_previews.sort(
+            key=lambda x: float(x.get("similarity", 0)), reverse=True
+        )
 
         # Limit to max_results
         return all_previews[: self.max_results]
local_deep_research/web_search_engines/engines/search_engine_pubmed.py
@@ -49,7 +49,9 @@ class PubMedSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.max_results = max(self.max_results, 25)
         self.api_key = api_key
@@ -100,7 +102,9 @@ class PubMedSearchEngine(BaseSearchEngine):
             data = response.json()
             count = int(data["esearchresult"]["count"])
 
-            logger.info("Query '%s' has %s total results in PubMed", query, count)
+            logger.info(
+                "Query '%s' has %s total results in PubMed", query, count
+            )
             return count
 
         except Exception as e:
@@ -119,10 +123,14 @@ class PubMedSearchEngine(BaseSearchEngine):
         """
         # Remove field specifications and operators
         simplified = re.sub(r"\[\w+\]", "", query)  # Remove [Field] tags
-        simplified = re.sub(r"\b(AND|OR|NOT)\b", "", simplified)  # Remove operators
+        simplified = re.sub(
+            r"\b(AND|OR|NOT)\b", "", simplified
+        )  # Remove operators
 
         # Remove quotes and parentheses
-        simplified = simplified.replace('"', "").replace("(", "").replace(")", "")
+        simplified = (
+            simplified.replace('"', "").replace("(", "").replace(")", "")
+        )
 
         # Split by whitespace and join terms with 4+ chars (likely meaningful)
         terms = [term for term in simplified.split() if len(term) >= 4]
@@ -220,7 +228,9 @@ Return ONLY the search query without any explanations.
             optimized_query = cleaned_lines[0]
 
             # Remove any quotes that wrap the entire query
-            if optimized_query.startswith('"') and optimized_query.endswith('"'):
+            if optimized_query.startswith('"') and optimized_query.endswith(
+                '"'
+            ):
                 optimized_query = optimized_query[1:-1]
 
             # Remove any explanation phrases that might be at the beginning
@@ -235,7 +245,9 @@ Return ONLY the search query without any explanations.
                     # Find the actual query part - typically after a colon
                     colon_pos = optimized_query.find(":")
                     if colon_pos > 0:
-                        optimized_query = optimized_query[colon_pos + 1 :].strip()
+                        optimized_query = optimized_query[
+                            colon_pos + 1 :
+                        ].strip()
 
                 # Check if the query still seems to contain explanations
                 if (
@@ -262,12 +274,16 @@ Return ONLY the search query without any explanations.
                     optimized_query = " ".join(query_parts)
             else:
                 # Fall back to original query if cleaning fails
-                logger.warning("Failed to extract a clean query from LLM response")
+                logger.warning(
+                    "Failed to extract a clean query from LLM response"
+                )
                 optimized_query = query
 
             # Final safety check - if query looks too much like an explanation, use original
             if len(optimized_query.split()) > 30:
-                logger.warning("Query too verbose, falling back to simpler form")
+                logger.warning(
+                    "Query too verbose, falling back to simpler form"
+                )
                 # Create a simple query from the original
                 words = [
                     w
@@ -389,7 +405,9 @@ Return ONLY the search query without any explanations.
         historical_years = [str(year) for year in range(1900, 2020)]
 
         query_lower = query.lower()
-        has_historical_term = any(term in query_lower for term in historical_terms)
+        has_historical_term = any(
+            term in query_lower for term in historical_terms
+        )
         has_past_year = any(year in query for year in historical_years)
 
         return has_historical_term or has_past_year
@@ -504,7 +522,9 @@ The default assumption should be that medical and scientific queries want RECENT
                 strategy = "no_time_filter"
         else:
             # Historical query - run without time filter
-            logger.info("Using historical search strategy without date filtering")
+            logger.info(
+                "Using historical search strategy without date filtering"
+            )
             results = self._search_pubmed(query)
 
         return results, strategy
@@ -546,14 +566,18 @@ The default assumption should be that medical and scientific queries want RECENT
             data = response.json()
             id_list = data["esearchresult"]["idlist"]
 
-            logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
+            logger.info(
+                f"PubMed search for '{query}' found {len(id_list)} results"
+            )
             return id_list
 
         except Exception as e:
             logger.error(f"Error searching PubMed: {e}")
             return []
 
-    def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
+    def _get_article_summaries(
+        self, id_list: List[str]
+    ) -> List[Dict[str, Any]]:
         """
         Get summaries for a list of PubMed article IDs.
 
@@ -594,7 +618,9 @@ The default assumption should be that medical and scientific queries want RECENT
                 # Extract authors (if available)
                 authors = []
                 if "authors" in article:
-                    authors = [author["name"] for author in article["authors"]]
+                    authors = [
+                        author["name"] for author in article["authors"]
+                    ]
 
                 # Create summary dictionary
                 summary = {
@@ -742,7 +768,9 @@ The default assumption should be that medical and scientific queries want RECENT
                     if pmcids:
                         pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
 
-            logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
+            logger.info(
+                f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
+            )
             return pmid_to_pmcid
 
         except Exception as e:
@@ -761,7 +789,12 @@ The default assumption should be that medical and scientific queries want RECENT
         """
         try:
             # Prepare parameters
-            params = {"db": "pmc", "id": pmcid, "retmode": "xml", "rettype": "full"}
+            params = {
+                "db": "pmc",
+                "id": pmcid,
+                "retmode": "xml",
+                "rettype": "full",
+            }
 
             # Add API key if available
             if self.api_key:
@@ -832,13 +865,17 @@ The default assumption should be that medical and scientific queries want RECENT
 
         # If no results, try a simplified query
         if not pmid_list:
-            logger.warning(f"No PubMed results found using strategy: {strategy}")
+            logger.warning(
+                f"No PubMed results found using strategy: {strategy}"
+            )
             simplified_query = self._simplify_query(optimized_query)
             if simplified_query != optimized_query:
                 logger.info(f"Trying with simplified query: {simplified_query}")
                 pmid_list, strategy = self._adaptive_search(simplified_query)
                 if pmid_list:
-                    logger.info(f"Simplified query found {len(pmid_list)} results")
+                    logger.info(
+                        f"Simplified query found {len(pmid_list)} results"
+                    )
 
         if not pmid_list:
             logger.warning("No PubMed results found after query simplification")
@@ -876,7 +913,9 @@ The default assumption should be that medical and scientific queries want RECENT
 
             previews.append(preview)
 
-        logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
+        logger.info(
+            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
+        )
         return previews
 
     def _get_full_content(
@@ -900,7 +939,9 @@ The default assumption should be that medical and scientific queries want RECENT
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
 
-        logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
+        logger.info(
+            f"Getting content for {len(relevant_items)} PubMed articles"
+        )
 
         # Collect all PMIDs for relevant items
         pmids = []
@@ -938,10 +979,11 @@ The default assumption should be that medical and scientific queries want RECENT
             if (
                 pmid in pmid_to_pmcid
                 and self.get_full_text
-                and len([r for r in results if r.get("content_type") == "full_text"])
+                and len(
+                    [r for r in results if r.get("content_type") == "full_text"]
+                )
                 < self.full_text_limit
             ):
-
                 # Get full text content
                 pmcid = pmid_to_pmcid[pmid]
                 full_text = self._get_pmc_full_text(pmcid)
local_deep_research/web_search_engines/engines/search_engine_searxng.py
@@ -63,12 +63,16 @@ class SearXNGSearchEngine(BaseSearchEngine):
 
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
        )
 
         # Validate and normalize the instance URL if provided
         self.instance_url = instance_url.rstrip("/")
-        logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
+        logger.info(
+            f"SearXNG initialized with instance URL: {self.instance_url}"
+        )
         try:
             # Make sure it's accessible.
             response = requests.get(self.instance_url, timeout=5)
@@ -97,8 +101,14 @@ class SearXNGSearchEngine(BaseSearchEngine):
         self.engines = engines
         self.language = language
         try:
-            self.safe_search = SafeSearchSetting[safe_search]
-        except KeyError:
+            # Handle both string names and integer values
+            if isinstance(safe_search, int) or (
+                isinstance(safe_search, str) and str(safe_search).isdigit()
+            ):
+                self.safe_search = SafeSearchSetting(int(safe_search))
+            else:
+                self.safe_search = SafeSearchSetting[safe_search]
+        except (ValueError, KeyError):
             logger.error(
                 "'{}' is not a valid safe search setting. Disabling safe search",
                 safe_search,
@@ -207,7 +217,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 "Upgrade-Insecure-Requests": "1",
             }
 
-            logger.info(f"Sending request to SearXNG instance at {self.instance_url}")
+            logger.info(
+                f"Sending request to SearXNG instance at {self.instance_url}"
+            )
             response = requests.get(
                 self.search_url,
                 params=params,
@@ -237,7 +249,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 )
                 result_elements = soup.select('div[id^="result"]')
 
-                logger.info(f"Found {len(result_elements)} search result elements")
+                logger.info(
+                    f"Found {len(result_elements)} search result elements"
+                )
 
                 for idx, result_element in enumerate(result_elements):
                     if idx >= self.max_results:
@@ -264,7 +278,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                     )
 
                     title = (
-                        title_element.get_text(strip=True) if title_element else ""
+                        title_element.get_text(strip=True)
+                        if title_element
+                        else ""
                     )
 
                     url = ""
@@ -279,7 +295,11 @@ class SearXNGSearchEngine(BaseSearchEngine):
                         else ""
                     )
 
-                    if not url and title_element and title_element.has_attr("href"):
+                    if (
+                        not url
+                        and title_element
+                        and title_element.has_attr("href")
+                    ):
                         url = title_element["href"]
 
                     logger.debug(
@@ -310,7 +330,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 logger.exception("Error parsing HTML results")
                 return []
             else:
-                logger.error(f"SearXNG returned status code {response.status_code}")
+                logger.error(
+                    f"SearXNG returned status code {response.status_code}"
+                )
                 return []
 
         except Exception:
@@ -328,7 +350,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
             List of preview dictionaries
         """
         if not self.is_available:
-            logger.warning("SearXNG engine is disabled (no instance URL provided)")
+            logger.warning(
+                "SearXNG engine is disabled (no instance URL provided)"
+            )
             return []
 
         logger.info(f"Getting SearXNG previews for query: {query}")
@@ -383,7 +407,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
         logger.info("Retrieving full webpage content")
 
         try:
-            results_with_content = self.full_search._get_full_content(relevant_items)
+            results_with_content = self.full_search._get_full_content(
+                relevant_items
+            )
             return results_with_content
 
         except Exception:
local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py
@@ -66,7 +66,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
 
         self.api_key = api_key
@@ -157,7 +159,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             if method.upper() == "GET":
                 response = self.session.get(url, params=params, timeout=30)
             elif method.upper() == "POST":
-                response = self.session.post(url, params=params, json=data, timeout=30)
+                response = self.session.post(
+                    url, params=params, json=data, timeout=30
+                )
             else:
                 raise ValueError(f"Unsupported HTTP method: {method}")
 
@@ -165,7 +169,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             if response.status_code == 429:
                 logger.warning("Rate limit exceeded, waiting and retrying...")
                 time.sleep(2.0)  # Wait longer on rate limit
-                self.rate_limit_wait *= 1.5  # Increase wait time for future requests
+                self.rate_limit_wait *= (
+                    1.5  # Increase wait time for future requests
+                )
                 return self._make_request(url, params, data, method)  # Retry
 
             response.raise_for_status()
@@ -258,7 +264,9 @@ Return ONLY the optimized search query with no explanation.
 
         params = {
             "query": query,
-            "limit": min(self.max_results, 100),  # API limit is 100 per request
+            "limit": min(
+                self.max_results, 100
+            ),  # API limit is 100 per request
             "fields": ",".join(fields),
         }
 
@@ -351,15 +359,21 @@ Format each query on a new line with no numbering or explanation. Keep each quer
             ):  # Handle various LLM response formats
                 content = response.content
                 alt_queries = [
-                    q.strip() for q in content.strip().split("\n") if q.strip()
+                    q.strip()
+                    for q in content.strip().split("\n")
+                    if q.strip()
                 ]
             elif isinstance(response, str):
                 alt_queries = [
-                    q.strip() for q in response.strip().split("\n") if q.strip()
+                    q.strip()
+                    for q in response.strip().split("\n")
+                    if q.strip()
                 ]
 
             # Try each alternative query
-            for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
+            for alt_query in alt_queries[
+                :3
+            ]:  # Limit to first 3 alternatives
                 logger.info("Trying LLM-suggested query: %s", alt_query)
                 alt_papers = self._direct_search(alt_query)
 
@@ -495,7 +509,9 @@ Format each query on a new line with no numbering or explanation. Keep each quer
             snippet = ""
             if abstract:
                 snippet = (
-                    abstract[:250] + "..." if len(abstract) > 250 else abstract
+                    abstract[:250] + "..."
+                    if len(abstract) > 250
+                    else abstract
                 )
 
             venue = paper.get("venue", "")
@@ -597,7 +613,9 @@ Format each query on a new line with no numbering or explanation. Keep each quer
 
                 # Add fields of study
                 if "fieldsOfStudy" in paper_details:
-                    result["fields_of_study"] = paper_details["fieldsOfStudy"]
+                    result["fields_of_study"] = paper_details[
+                        "fieldsOfStudy"
+                    ]
 
                 # Remove temporary fields
                 if "_paper_id" in result: