local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to its public registry. It is provided for informational purposes only.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_local.py

```diff
@@ -56,7 +56,9 @@ def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
             return UnstructuredExcelLoader(str(file_path))
         else:
             # Try the text loader as a fallback for unknown extensions
-            logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
+            logger.warning(
+                f"Unknown file extension for {file_path}, trying TextLoader"
+            )
             return TextLoader(str(file_path), encoding="utf-8")
     except Exception:
         logger.exception(f"Error creating loader for {file_path}")
@@ -208,7 +210,9 @@ class LocalEmbeddingManager:
 
         # Check if vector store exists and is up to date
         if vector_store_path.exists() and not self._check_folders_modified():
-            logger.info(f"Loading existing vector store from {vector_store_path}")
+            logger.info(
+                f"Loading existing vector store from {vector_store_path}"
+            )
             try:
                 vector_store = FAISS.load_local(
                     str(vector_store_path),
@@ -306,9 +310,13 @@ class LocalEmbeddingManager:
             last_indexed = 0
             indexed_files = set()
         else:
-            last_indexed = self.indexed_folders[folder_hash].get("last_indexed", 0)
+            last_indexed = self.indexed_folders[folder_hash].get(
+                "last_indexed", 0
+            )
             indexed_files = (
-                self.indexed_folders[folder_hash].get("indexed_files", {}).keys()
+                self.indexed_folders[folder_hash]
+                .get("indexed_files", {})
+                .keys()
             )
 
         # Check if any file in the folder has been modified since last indexing
@@ -345,11 +353,15 @@ class LocalEmbeddingManager:
             self.chunk_overlap,
             self.embedding_model,
         ):
-            logger.info("Embedding configuration has changed, re-indexing folder.")
+            logger.info(
+                "Embedding configuration has changed, re-indexing folder."
+            )
             return True
         return False
 
-    def index_folder(self, folder_path: str, force_reindex: bool = False) -> bool:
+    def index_folder(
+        self, folder_path: str, force_reindex: bool = False
+    ) -> bool:
         """
         Index all documents in a folder for vector search.
 
@@ -434,7 +446,9 @@ class LocalEmbeddingManager:
         # Split documents into chunks
         logger.info(f"Splitting {len(all_docs)} documents into chunks")
         splits = self.text_splitter.split_documents(all_docs)
-        logger.info(f"Created {len(splits)} chunks from {len(modified_files)} files")
+        logger.info(
+            f"Created {len(splits)} chunks from {len(modified_files)} files"
+        )
 
         # Create vector store
         ids = []
@@ -448,10 +462,14 @@ class LocalEmbeddingManager:
         indexed_files = {}
         if folder_hash in self.indexed_folders:
             indexed_files = (
-                self.indexed_folders[folder_hash].get("indexed_files", {}).copy()
+                self.indexed_folders[folder_hash]
+                .get("indexed_files", {})
+                .copy()
             )
         for split_id, split in zip(ids, splits):
-            split_source = str(Path(split.metadata["source"]).relative_to(folder_path))
+            split_source = str(
+                Path(split.metadata["source"]).relative_to(folder_path)
+            )
             id_list = indexed_files.setdefault(split_source, [])
             id_list.append(split_id)
 
@@ -465,7 +483,8 @@ class LocalEmbeddingManager:
                 delete_paths.append(relative_path)
         if delete_ids:
             logger.info(
-                f"Deleting {len(delete_paths)} non-existent files from the index."
+                f"Deleting {len(delete_paths)} non-existent files from the "
+                f"index."
             )
             self.vector_stores[folder_hash].delete(delete_ids)
             for path in delete_paths:
@@ -543,7 +562,9 @@ class LocalEmbeddingManager:
             if path.exists() and path.is_dir():
                 valid_folder_paths.append(path)
             else:
-                logger.warning(f"Skipping non-existent folder in search: {path}")
+                logger.warning(
+                    f"Skipping non-existent folder in search: {path}"
+                )
 
         # If no valid folders, return empty results
         if not valid_folder_paths:
@@ -578,8 +599,10 @@ class LocalEmbeddingManager:
             vector_store = self.vector_stores[folder_hash]
 
             try:
-                docs_with_scores = vector_store.similarity_search_with_relevance_scores(
-                    query, k=limit
+                docs_with_scores = (
+                    vector_store.similarity_search_with_relevance_scores(
+                        query, k=limit
+                    )
                 )
 
                 for doc, similarity in docs_with_scores:
@@ -685,7 +708,9 @@ class LocalSearchEngine(BaseSearchEngine):
             if os.path.exists(path) and os.path.isdir(path):
                 self.valid_folder_paths.append(path)
             else:
-                logger.warning(f"Folder not found or is not a directory: {path}")
+                logger.warning(
+                    f"Folder not found or is not a directory: {path}"
+                )
 
         # If no valid folders, log a clear message
         if not self.valid_folder_paths and paths:
@@ -768,7 +793,9 @@ class LocalSearchEngine(BaseSearchEngine):
                 if name in self.collections
             }
             if not collections_to_search:
-                logger.warning(f"No valid collections found among: {collection_names}")
+                logger.warning(
+                    f"No valid collections found among: {collection_names}"
+                )
                 return []
         else:
             # Search in all collections
@@ -811,13 +838,13 @@ class LocalSearchEngine(BaseSearchEngine):
         previews = []
         for i, result in enumerate(raw_results):
             # Create a unique ID
-            result_id = (
-                f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
-            )
+            result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode()).hexdigest()}"
 
             # Extract filename and path
             source_path = result["metadata"].get("source", "Unknown")
-            filename = result["metadata"].get("filename", os.path.basename(source_path))
+            filename = result["metadata"].get(
+                "filename", os.path.basename(source_path)
+            )
 
             # Create preview snippet (first ~200 chars of content)
             snippet = (
@@ -845,10 +872,12 @@ class LocalSearchEngine(BaseSearchEngine):
                 "similarity": result["similarity"],
                 "folder": folder_path.as_posix(),
                 "collection": collection_name,
-                "collection_description": self.collections.get(collection_name, {}).get(
-                    "description", ""
-                ),
-                "_full_content": result["content"],  # Store full content for later
+                "collection_description": self.collections.get(
+                    collection_name, {}
+                ).get("description", ""),
+                "_full_content": result[
+                    "content"
+                ],  # Store full content for later
                 "_metadata": result["metadata"],  # Store metadata for later
             }
 
@@ -933,7 +962,9 @@ class LocalSearchEngine(BaseSearchEngine):
                 if collection_name in self.collections:
                     specified_collections.append(collection_name)
                     # Remove this part from the query
-                    remaining_query = remaining_query.replace(part, "", 1).strip()
+                    remaining_query = remaining_query.replace(
+                        part, "", 1
+                    ).strip()
 
         # If collections were specified in the query, they override the parameter
         if specified_collections:
@@ -1047,7 +1078,9 @@ class LocalSearchEngine(BaseSearchEngine):
         success = True
 
         for path in paths:
-            if not self.embedding_manager.index_folder(path, force_reindex=True):
+            if not self.embedding_manager.index_folder(
+                path, force_reindex=True
+            ):
                 success = False
 
         return success
```
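Most of these hunks are pure line-length reflows, but the `index_folder` hunks also show how incremental re-indexing is bookkept: each indexed file, keyed by its path relative to the indexed folder, maps to the vector-store ids of its chunks, which is what later allows stale files to be deleted by id. A minimal sketch of that mapping, using a hypothetical `Chunk` stand-in for the langchain document splits:

```python
from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class Chunk:
    """Stand-in for a langchain Document split; only metadata matters here."""

    metadata: dict = field(default_factory=dict)


def update_index_bookkeeping(
    indexed_files: dict[str, list[str]],
    ids: list[str],
    splits: list[Chunk],
    folder_path: str,
) -> dict[str, list[str]]:
    """Map each source file (relative to the indexed folder) to the
    vector-store ids of its chunks, as the index_folder hunks do."""
    for split_id, split in zip(ids, splits):
        split_source = str(
            Path(split.metadata["source"]).relative_to(folder_path)
        )
        indexed_files.setdefault(split_source, []).append(split_id)
    return indexed_files


chunks = [Chunk({"source": "/docs/notes/a.md"}), Chunk({"source": "/docs/notes/a.md"})]
print(update_index_bookkeeping({}, ["id-1", "id-2"], chunks, "/docs"))
# -> {'notes/a.md': ['id-1', 'id-2']}
```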
local_deep_research/web_search_engines/engines/search_engine_local_all.py

```diff
@@ -37,7 +37,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
 
         # Find all local collection search engines
@@ -76,7 +78,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
         Returns:
             List of preview dictionaries
         """
-        logger.info(f"Searching across all local collections for query: {query}")
+        logger.info(
+            f"Searching across all local collections for query: {query}"
+        )
 
         all_previews = []
 
@@ -91,18 +95,24 @@ class LocalAllSearchEngine(BaseSearchEngine):
                 for preview in previews:
                     preview["collection_id"] = collection_id
                     preview["collection_name"] = engine_info["name"]
-                    preview["collection_description"] = engine_info["description"]
+                    preview["collection_description"] = engine_info[
+                        "description"
+                    ]
 
                 all_previews.extend(previews)
             except Exception:
-                logger.exception(f"Error searching collection '{collection_id}'")
+                logger.exception(
+                    f"Error searching collection '{collection_id}'"
+                )
 
         if not all_previews:
             logger.info(f"No local documents found for query: {query}")
             return []
 
         # Sort by similarity score if available
-        all_previews.sort(key=lambda x: float(x.get("similarity", 0)), reverse=True)
+        all_previews.sort(
+            key=lambda x: float(x.get("similarity", 0)), reverse=True
+        )
 
         # Limit to max_results
         return all_previews[: self.max_results]
```
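The hunks above trace the aggregation flow in `LocalAllSearchEngine`: tag each preview with its collection, pool everything, sort by `similarity` descending, and cap at `max_results`. A compact sketch of the same flow; the function name and input shape are illustrative, not the package's API:

```python
def merge_collection_previews(
    previews_by_collection: dict[str, list[dict]], max_results: int
) -> list[dict]:
    """Pool previews from every collection, best matches first."""
    all_previews = []
    for collection_id, previews in previews_by_collection.items():
        for preview in previews:
            preview["collection_id"] = collection_id  # tag the origin
        all_previews.extend(previews)
    # Sort by similarity score if available, highest first
    all_previews.sort(key=lambda x: float(x.get("similarity", 0)), reverse=True)
    return all_previews[:max_results]


merged = merge_collection_previews(
    {
        "notes": [{"title": "A", "similarity": 0.91}],
        "papers": [{"title": "B", "similarity": 0.97}],
    },
    max_results=5,
)
print([p["title"] for p in merged])  # ['B', 'A']
```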
local_deep_research/web_search_engines/engines/search_engine_pubmed.py

```diff
@@ -49,7 +49,9 @@ class PubMedSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.max_results = max(self.max_results, 25)
         self.api_key = api_key
@@ -100,7 +102,9 @@ class PubMedSearchEngine(BaseSearchEngine):
             data = response.json()
             count = int(data["esearchresult"]["count"])
 
-            logger.info("Query '%s' has %s total results in PubMed", query, count)
+            logger.info(
+                "Query '%s' has %s total results in PubMed", query, count
+            )
             return count
 
         except Exception as e:
@@ -119,10 +123,14 @@ class PubMedSearchEngine(BaseSearchEngine):
         """
         # Remove field specifications and operators
         simplified = re.sub(r"\[\w+\]", "", query)  # Remove [Field] tags
-        simplified = re.sub(r"\b(AND|OR|NOT)\b", "", simplified)  # Remove operators
+        simplified = re.sub(
+            r"\b(AND|OR|NOT)\b", "", simplified
+        )  # Remove operators
 
         # Remove quotes and parentheses
-        simplified = simplified.replace('"', "").replace("(", "").replace(")", "")
+        simplified = (
+            simplified.replace('"', "").replace("(", "").replace(")", "")
+        )
 
         # Split by whitespace and join terms with 4+ chars (likely meaningful)
         terms = [term for term in simplified.split() if len(term) >= 4]
@@ -220,7 +228,9 @@ Return ONLY the search query without any explanations.
             optimized_query = cleaned_lines[0]
 
             # Remove any quotes that wrap the entire query
-            if optimized_query.startswith('"') and optimized_query.endswith('"'):
+            if optimized_query.startswith('"') and optimized_query.endswith(
+                '"'
+            ):
                 optimized_query = optimized_query[1:-1]
 
             # Remove any explanation phrases that might be at the beginning
@@ -235,7 +245,9 @@ Return ONLY the search query without any explanations.
                 # Find the actual query part - typically after a colon
                 colon_pos = optimized_query.find(":")
                 if colon_pos > 0:
-                    optimized_query = optimized_query[colon_pos + 1 :].strip()
+                    optimized_query = optimized_query[
+                        colon_pos + 1 :
+                    ].strip()
 
             # Check if the query still seems to contain explanations
             if (
@@ -262,12 +274,16 @@ Return ONLY the search query without any explanations.
                 optimized_query = " ".join(query_parts)
             else:
                 # Fall back to original query if cleaning fails
-                logger.warning("Failed to extract a clean query from LLM response")
+                logger.warning(
+                    "Failed to extract a clean query from LLM response"
+                )
                 optimized_query = query
 
         # Final safety check - if query looks too much like an explanation, use original
         if len(optimized_query.split()) > 30:
-            logger.warning("Query too verbose, falling back to simpler form")
+            logger.warning(
+                "Query too verbose, falling back to simpler form"
+            )
             # Create a simple query from the original
             words = [
                 w
@@ -389,7 +405,9 @@ Return ONLY the search query without any explanations.
         historical_years = [str(year) for year in range(1900, 2020)]
 
         query_lower = query.lower()
-        has_historical_term = any(term in query_lower for term in historical_terms)
+        has_historical_term = any(
+            term in query_lower for term in historical_terms
+        )
         has_past_year = any(year in query for year in historical_years)
 
         return has_historical_term or has_past_year
@@ -504,7 +522,9 @@ The default assumption should be that medical and scientific queries want RECENT
                 strategy = "no_time_filter"
         else:
             # Historical query - run without time filter
-            logger.info("Using historical search strategy without date filtering")
+            logger.info(
+                "Using historical search strategy without date filtering"
+            )
             results = self._search_pubmed(query)
 
         return results, strategy
@@ -546,14 +566,18 @@ The default assumption should be that medical and scientific queries want RECENT
             data = response.json()
             id_list = data["esearchresult"]["idlist"]
 
-            logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
+            logger.info(
+                f"PubMed search for '{query}' found {len(id_list)} results"
+            )
             return id_list
 
         except Exception as e:
             logger.error(f"Error searching PubMed: {e}")
             return []
 
-    def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
+    def _get_article_summaries(
+        self, id_list: List[str]
+    ) -> List[Dict[str, Any]]:
         """
         Get summaries for a list of PubMed article IDs.
 
@@ -594,7 +618,9 @@ The default assumption should be that medical and scientific queries want RECENT
                 # Extract authors (if available)
                 authors = []
                 if "authors" in article:
-                    authors = [author["name"] for author in article["authors"]]
+                    authors = [
+                        author["name"] for author in article["authors"]
+                    ]
 
                 # Create summary dictionary
                 summary = {
@@ -742,7 +768,9 @@ The default assumption should be that medical and scientific queries want RECENT
                     if pmcids:
                         pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
 
-            logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
+            logger.info(
+                f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
+            )
             return pmid_to_pmcid
 
         except Exception as e:
@@ -761,7 +789,12 @@ The default assumption should be that medical and scientific queries want RECENT
         """
         try:
             # Prepare parameters
-            params = {"db": "pmc", "id": pmcid, "retmode": "xml", "rettype": "full"}
+            params = {
+                "db": "pmc",
+                "id": pmcid,
+                "retmode": "xml",
+                "rettype": "full",
+            }
 
             # Add API key if available
             if self.api_key:
@@ -832,13 +865,17 @@ The default assumption should be that medical and scientific queries want RECENT
 
         # If no results, try a simplified query
         if not pmid_list:
-            logger.warning(f"No PubMed results found using strategy: {strategy}")
+            logger.warning(
+                f"No PubMed results found using strategy: {strategy}"
+            )
             simplified_query = self._simplify_query(optimized_query)
             if simplified_query != optimized_query:
                 logger.info(f"Trying with simplified query: {simplified_query}")
                 pmid_list, strategy = self._adaptive_search(simplified_query)
                 if pmid_list:
-                    logger.info(f"Simplified query found {len(pmid_list)} results")
+                    logger.info(
+                        f"Simplified query found {len(pmid_list)} results"
+                    )
 
         if not pmid_list:
             logger.warning("No PubMed results found after query simplification")
@@ -876,7 +913,9 @@ The default assumption should be that medical and scientific queries want RECENT
 
             previews.append(preview)
 
-        logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
+        logger.info(
+            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
+        )
         return previews
 
     def _get_full_content(
@@ -900,7 +939,9 @@ The default assumption should be that medical and scientific queries want RECENT
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
 
-        logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
+        logger.info(
+            f"Getting content for {len(relevant_items)} PubMed articles"
+        )
 
         # Collect all PMIDs for relevant items
         pmids = []
@@ -938,10 +979,11 @@ The default assumption should be that medical and scientific queries want RECENT
             if (
                 pmid in pmid_to_pmcid
                 and self.get_full_text
-                and len([r for r in results if r.get("content_type") == "full_text"])
+                and len(
+                    [r for r in results if r.get("content_type") == "full_text"]
+                )
                 < self.full_text_limit
             ):
-
                 # Get full text content
                 pmcid = pmid_to_pmcid[pmid]
                 full_text = self._get_pmc_full_text(pmcid)
```
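Among these hunks, `_simplify_query` (the `@@ -119,10 +123,14 @@` hunk) is the fallback used when a PubMed search returns nothing: strip `[Field]` tags and boolean operators, drop quotes and parentheses, and keep only terms of four or more characters. A sketch of that cleanup, assuming the surviving terms are re-joined with spaces:

```python
import re


def simplify_pubmed_query(query: str) -> str:
    """Fallback simplification, as in the diff: drop [Field] tags, boolean
    operators, and quotes/parentheses, then keep terms of 4+ characters."""
    simplified = re.sub(r"\[\w+\]", "", query)  # remove [Field] tags
    simplified = re.sub(r"\b(AND|OR|NOT)\b", "", simplified)  # remove operators
    simplified = simplified.replace('"', "").replace("(", "").replace(")", "")
    terms = [term for term in simplified.split() if len(term) >= 4]
    return " ".join(terms)


print(simplify_pubmed_query('"aspirin"[Title] AND (stroke OR "heart attack")'))
# -> 'aspirin stroke heart attack'
```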
local_deep_research/web_search_engines/engines/search_engine_searxng.py

```diff
@@ -63,12 +63,16 @@ class SearXNGSearchEngine(BaseSearchEngine):
 
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
 
         # Validate and normalize the instance URL if provided
         self.instance_url = instance_url.rstrip("/")
-        logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
+        logger.info(
+            f"SearXNG initialized with instance URL: {self.instance_url}"
+        )
         try:
             # Make sure it's accessible.
             response = requests.get(self.instance_url, timeout=5)
@@ -97,8 +101,14 @@ class SearXNGSearchEngine(BaseSearchEngine):
         self.engines = engines
         self.language = language
         try:
-
-
+            # Handle both string names and integer values
+            if isinstance(safe_search, int) or (
+                isinstance(safe_search, str) and str(safe_search).isdigit()
+            ):
+                self.safe_search = SafeSearchSetting(int(safe_search))
+            else:
+                self.safe_search = SafeSearchSetting[safe_search]
+        except (ValueError, KeyError):
             logger.error(
                 "'{}' is not a valid safe search setting. Disabling safe search",
                 safe_search,
@@ -207,7 +217,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                 "Upgrade-Insecure-Requests": "1",
             }
 
-            logger.info(f"Sending request to SearXNG instance at {self.instance_url}")
+            logger.info(
+                f"Sending request to SearXNG instance at {self.instance_url}"
+            )
             response = requests.get(
                 self.search_url,
                 params=params,
@@ -237,7 +249,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                     )
                     result_elements = soup.select('div[id^="result"]')
 
-                    logger.info(f"Found {len(result_elements)} search result elements")
+                    logger.info(
+                        f"Found {len(result_elements)} search result elements"
+                    )
 
                     for idx, result_element in enumerate(result_elements):
                         if idx >= self.max_results:
@@ -264,7 +278,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                         )
 
                         title = (
-                            title_element.get_text(strip=True) if title_element else ""
+                            title_element.get_text(strip=True)
+                            if title_element
+                            else ""
                         )
 
                         url = ""
@@ -279,7 +295,11 @@ class SearXNGSearchEngine(BaseSearchEngine):
                             else ""
                         )
 
-                        if not url and title_element and title_element.has_attr("href"):
+                        if (
+                            not url
+                            and title_element
+                            and title_element.has_attr("href")
+                        ):
                             url = title_element["href"]
 
                         logger.debug(
@@ -310,7 +330,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
                     logger.exception("Error parsing HTML results")
                     return []
             else:
-                logger.error(f"SearXNG returned status code {response.status_code}")
+                logger.error(
+                    f"SearXNG returned status code {response.status_code}"
+                )
                 return []
 
         except Exception:
@@ -328,7 +350,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
             List of preview dictionaries
         """
         if not self.is_available:
-            logger.warning("SearXNG engine is disabled (no instance URL provided)")
+            logger.warning(
+                "SearXNG engine is disabled (no instance URL provided)"
+            )
             return []
 
         logger.info(f"Getting SearXNG previews for query: {query}")
@@ -383,7 +407,9 @@ class SearXNGSearchEngine(BaseSearchEngine):
             logger.info("Retrieving full webpage content")
 
             try:
-                results_with_content = self.full_search._get_full_content(relevant_items)
+                results_with_content = self.full_search._get_full_content(
+                    relevant_items
+                )
                 return results_with_content
 
             except Exception:
```
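The one behavioral change in this file is the `safe_search` handling (the `@@ -97,8 +101,14 @@` hunk): the constructor now accepts either a numeric level (an int or a digit string, resolved by enum value) or a member name (resolved by enum lookup), falling back to disabled on `ValueError`/`KeyError`. A self-contained sketch of that parsing; the `SafeSearchSetting` members below are assumptions, not the package's actual enum:

```python
from enum import IntEnum


class SafeSearchSetting(IntEnum):
    """Hypothetical members; the real enum ships with the package."""

    OFF = 0
    MODERATE = 1
    STRICT = 2


def parse_safe_search(value) -> SafeSearchSetting:
    """Accept 0/1/2 (int or digit string) by value, or a name like "STRICT"."""
    try:
        if isinstance(value, int) or (
            isinstance(value, str) and str(value).isdigit()
        ):
            return SafeSearchSetting(int(value))  # lookup by numeric value
        return SafeSearchSetting[value]  # lookup by member name
    except (ValueError, KeyError):
        return SafeSearchSetting.OFF  # invalid input disables safe search


assert parse_safe_search("2") is SafeSearchSetting.STRICT
assert parse_safe_search("MODERATE") is SafeSearchSetting.MODERATE
assert parse_safe_search("bogus") is SafeSearchSetting.OFF
```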
local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py

```diff
@@ -66,7 +66,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
 
         self.api_key = api_key
@@ -157,7 +159,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             if method.upper() == "GET":
                 response = self.session.get(url, params=params, timeout=30)
             elif method.upper() == "POST":
-                response = self.session.post(url, params=params, json=data, timeout=30)
+                response = self.session.post(
+                    url, params=params, json=data, timeout=30
+                )
             else:
                 raise ValueError(f"Unsupported HTTP method: {method}")
 
@@ -165,7 +169,9 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             if response.status_code == 429:
                 logger.warning("Rate limit exceeded, waiting and retrying...")
                 time.sleep(2.0)  # Wait longer on rate limit
-                self.rate_limit_wait *= 1.5  # Increase wait time for future requests
+                self.rate_limit_wait *= (
+                    1.5  # Increase wait time for future requests
+                )
                 return self._make_request(url, params, data, method)  # Retry
 
             response.raise_for_status()
@@ -258,7 +264,9 @@ Return ONLY the optimized search query with no explanation.
 
         params = {
             "query": query,
-            "limit": min(self.max_results, 100),  # API limit is 100 per request
+            "limit": min(
+                self.max_results, 100
+            ),  # API limit is 100 per request
             "fields": ",".join(fields),
         }
 
@@ -351,15 +359,21 @@ Format each query on a new line with no numbering or explanation. Keep each quer
             ):  # Handle various LLM response formats
                 content = response.content
                 alt_queries = [
-                    q.strip() for q in content.strip().split("\n") if q.strip()
+                    q.strip()
+                    for q in content.strip().split("\n")
+                    if q.strip()
                 ]
             elif isinstance(response, str):
                 alt_queries = [
-                    q.strip() for q in response.strip().split("\n") if q.strip()
+                    q.strip()
+                    for q in response.strip().split("\n")
+                    if q.strip()
                 ]
 
             # Try each alternative query
-            for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
+            for alt_query in alt_queries[
+                :3
+            ]:  # Limit to first 3 alternatives
                 logger.info("Trying LLM-suggested query: %s", alt_query)
                 alt_papers = self._direct_search(alt_query)
 
@@ -495,7 +509,9 @@ Format each query on a new line with no numbering or explanation. Keep each quer
             snippet = ""
             if abstract:
                 snippet = (
-                    abstract[:250] + "..." if len(abstract) > 250 else abstract
+                    abstract[:250] + "..."
+                    if len(abstract) > 250
+                    else abstract
                 )
 
             venue = paper.get("venue", "")
@@ -597,7 +613,9 @@ Format each query on a new line with no numbering or explanation. Keep each quer
 
             # Add fields of study
             if "fieldsOfStudy" in paper_details:
-                result["fields_of_study"] = paper_details["fieldsOfStudy"]
+                result["fields_of_study"] = paper_details[
+                    "fieldsOfStudy"
+                ]
 
             # Remove temporary fields
             if "_paper_id" in result:
```
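The `_make_request` hunk (`@@ -165,7 +169,9 @@`) shows the rate-limit handling: on HTTP 429, sleep, grow `rate_limit_wait` by 1.5x, and retry recursively. A standalone sketch of the same backoff; unlike the diff's recursive version, this one adds an explicit retry cap, which is an assumption, not in the source:

```python
import time

import requests


def get_with_backoff(
    session: requests.Session,
    url: str,
    params: dict,
    wait: float = 1.0,
    max_retries: int = 5,
) -> requests.Response:
    """Retry on HTTP 429, multiplying the wait by 1.5 each attempt."""
    for _ in range(max_retries):
        response = session.get(url, params=params, timeout=30)
        if response.status_code != 429:
            response.raise_for_status()
            return response
        time.sleep(wait)
        wait *= 1.5  # increase wait time for future requests, as in the diff
    raise RuntimeError("Rate limit: retries exhausted")
```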