local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,515 @@
|
|
1
|
+
"""
|
2
|
+
Smart query generation strategy that works for any type of search target.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import concurrent.futures
|
6
|
+
from typing import Dict, List
|
7
|
+
|
8
|
+
from loguru import logger
|
9
|
+
|
10
|
+
from ..constraints.base_constraint import Constraint
|
11
|
+
from ..constraints.constraint_analyzer import ConstraintType
|
12
|
+
from .early_stop_constrained_strategy import EarlyStopConstrainedStrategy
|
13
|
+
|
14
|
+
|
15
|
+
class SmartQueryStrategy(EarlyStopConstrainedStrategy):
|
16
|
+
"""
|
17
|
+
Enhanced strategy with intelligent query generation that:
|
18
|
+
1. Analyzes constraints to identify key search terms
|
19
|
+
2. Uses LLM to suggest search queries based on constraint meaning
|
20
|
+
3. Generates multiple query variations for better coverage
|
21
|
+
"""
|
22
|
+
|
23
|
+
def __init__(
    self,
    *args,
    use_llm_query_generation: bool = True,
    queries_per_combination: int = 3,
    use_entity_seeding: bool = True,
    use_direct_property_search: bool = True,
    **kwargs,
):
    """Set up the strategy's feature switches and query bookkeeping.

    Positional arguments and any keyword arguments not listed below are
    forwarded unchanged to the parent strategy.

    Args:
        use_llm_query_generation: When true, queries are requested from the
            LLM; otherwise they are assembled from constraint values.
        queries_per_combination: How many query variations to request for
            one constraint combination.
        use_entity_seeding: Stored switch for LLM-based entity seeding.
        use_direct_property_search: Stored switch for direct property
            searches.
    """
    super().__init__(*args, **kwargs)

    # Feature switches, stored exactly as given.
    self.use_direct_property_search = use_direct_property_search
    self.use_entity_seeding = use_entity_seeding
    self.queries_per_combination = queries_per_combination
    self.use_llm_query_generation = use_llm_query_generation

    # Entity names suggested by the LLM for targeted searches.
    self.entity_seeds = []

    # Normalised queries already executed / already proposed, kept so the
    # same search is not issued twice.
    self.query_variations = set()
    self.searched_queries = set()
|
42
|
+
|
43
|
+
def _build_query(self, constraints: List[Constraint]) -> str:
|
44
|
+
"""Build intelligent queries using constraint analysis."""
|
45
|
+
if self.use_llm_query_generation:
|
46
|
+
# Use LLM to generate smart queries
|
47
|
+
return self._generate_smart_query(constraints)
|
48
|
+
else:
|
49
|
+
# Fallback to improved standard approach
|
50
|
+
return self._build_standard_query(constraints)
|
51
|
+
|
52
|
+
def _generate_smart_query(self, constraints: List[Constraint]) -> str:
    """Ask the model for one search query covering ``constraints``.

    The prompt lists each constraint with its weight, the target entity
    type (``"unknown"`` when ``entity_type`` is not set) and a sample of
    previously searched queries to steer away from.

    Returns:
        The stripped model response, or the result of
        ``_build_standard_query`` when the response duplicates an earlier
        query or when anything in the LLM path raises.
    """
    constraint_text = "\n".join(
        f"- {c.type.value}: {c.value} (weight: {c.weight})"
        for c in constraints
    )

    # searched_queries is a set, so this is an arbitrary sample of at most
    # ten earlier queries rather than the ten most recent ones.
    sample = list(self.searched_queries)[:10]
    if sample:
        already_searched = "\n".join(f"- {q}" for q in sample)
    else:
        already_searched = "None"

    prompt = f"""
Analyze these search constraints and generate an optimal web search query:

Constraints:
{constraint_text}

Target type: {getattr(self, "entity_type", "unknown")}

Already searched queries (avoid these):
{already_searched}

Generate a single search query that would most effectively find results matching these constraints.
The query should:
1. Include the most identifying/unique terms
2. Use appropriate search operators (quotes, AND, OR)
3. Be specific enough to find relevant results but not too narrow
4. Focus on the highest weighted constraints
5. Be different from already searched queries

Return only the search query, nothing else.
"""

    try:
        query = self.model.invoke(prompt).content.strip()

        # Searched queries are stored lower-cased, so compare in that form.
        is_repeat = query.lower() in self.searched_queries
        if is_repeat:
            logger.info(
                f"LLM generated duplicate query, using fallback: {query}"
            )
            return self._build_standard_query(constraints)

        logger.info(f"LLM generated query: {query}")
        return query
    except Exception as e:
        logger.error(f"Failed to generate smart query: {e}")
        return self._build_standard_query(constraints)
|
107
|
+
|
108
|
+
def _build_standard_query(self, constraints: List[Constraint]) -> str:
|
109
|
+
"""Improved standard query building."""
|
110
|
+
# Group constraints by importance
|
111
|
+
critical_terms = []
|
112
|
+
supplementary_terms = []
|
113
|
+
|
114
|
+
for c in constraints:
|
115
|
+
term = c.value
|
116
|
+
|
117
|
+
# Quote multi-word terms
|
118
|
+
if " " in term and not term.startswith('"'):
|
119
|
+
term = f'"{term}"'
|
120
|
+
|
121
|
+
if c.weight > 0.7:
|
122
|
+
critical_terms.append(term)
|
123
|
+
else:
|
124
|
+
supplementary_terms.append(term)
|
125
|
+
|
126
|
+
# Build query with critical terms required, supplementary optional
|
127
|
+
query_parts = []
|
128
|
+
|
129
|
+
# Add entity type if known
|
130
|
+
entity_type = getattr(self, "entity_type", None)
|
131
|
+
if entity_type and entity_type != "unknown entity":
|
132
|
+
query_parts.append(entity_type)
|
133
|
+
|
134
|
+
# Add critical terms
|
135
|
+
if critical_terms:
|
136
|
+
query_parts.extend(critical_terms)
|
137
|
+
|
138
|
+
# Add some supplementary terms
|
139
|
+
if supplementary_terms:
|
140
|
+
query_parts.extend(
|
141
|
+
supplementary_terms[:2]
|
142
|
+
) # Limit to avoid overly specific queries
|
143
|
+
|
144
|
+
return " ".join(query_parts)
|
145
|
+
|
146
|
+
def _execute_combination_search(self, combo) -> List:
|
147
|
+
"""Override to generate multiple query variations per combination."""
|
148
|
+
all_candidates = []
|
149
|
+
|
150
|
+
if self.use_llm_query_generation:
|
151
|
+
# Generate multiple query variations
|
152
|
+
queries = self._generate_query_variations(combo.constraints)
|
153
|
+
|
154
|
+
# Execute searches in parallel
|
155
|
+
with concurrent.futures.ThreadPoolExecutor(
|
156
|
+
max_workers=self.queries_per_combination
|
157
|
+
) as executor:
|
158
|
+
futures = []
|
159
|
+
for query in queries:
|
160
|
+
# Check if we've already searched this query
|
161
|
+
normalized_query = query.strip().lower()
|
162
|
+
if normalized_query in self.searched_queries:
|
163
|
+
logger.info(f"Skipping duplicate query: '{query}'")
|
164
|
+
continue
|
165
|
+
|
166
|
+
self.searched_queries.add(normalized_query)
|
167
|
+
future = executor.submit(self._execute_search, query)
|
168
|
+
futures.append((query, future))
|
169
|
+
|
170
|
+
for query, future in futures:
|
171
|
+
try:
|
172
|
+
results = future.result()
|
173
|
+
candidates = self._extract_candidates_from_results(
|
174
|
+
results
|
175
|
+
)
|
176
|
+
all_candidates.extend(candidates)
|
177
|
+
|
178
|
+
logger.info(
|
179
|
+
f"Query '{query}' found {len(candidates)} candidates"
|
180
|
+
)
|
181
|
+
except Exception as e:
|
182
|
+
logger.error(f"Search failed for query '{query}': {e}")
|
183
|
+
else:
|
184
|
+
# Use single query from parent implementation
|
185
|
+
candidates = super()._execute_combination_search(combo)
|
186
|
+
all_candidates.extend(candidates)
|
187
|
+
|
188
|
+
return all_candidates
|
189
|
+
|
190
|
+
def _generate_query_variations(
    self, constraints: List[Constraint]
) -> List[str]:
    """Ask the model for several distinct queries for ``constraints``.

    The model is asked for ``queries_per_combination`` queries, one per
    line. Lines that match (case-insensitively) a query in
    ``searched_queries`` or ``query_variations`` are dropped. Accepted
    queries are recorded in ``query_variations``.

    Args:
        constraints: A list of constraints; a single ``Constraint`` is also
            accepted and wrapped in a list.

    Returns:
        At most ``queries_per_combination`` queries. When every model
        suggestion was a duplicate, the standard query is returned instead
        if it is non-empty and has not been searched. When the LLM path
        raises, a one-element list with the standard query is returned.
    """
    # Handle single constraint case.
    if isinstance(constraints, Constraint):
        constraints = [constraints]

    constraint_text = "\n".join(
        [f"- {c.type.value}: {c.value}" for c in constraints]
    )

    # searched_queries is a set, so this is an arbitrary sample of at most
    # twenty earlier queries, not necessarily the most recent ones.
    searched_list = list(self.searched_queries)[:20]
    already_searched = (
        "\n".join([f"- {q}" for q in searched_list])
        if searched_list
        else "None"
    )

    prompt = f"""
Generate {self.queries_per_combination} different search queries for these constraints:

{constraint_text}

Already searched queries (avoid these):
{already_searched}

Each query should:
- Approach the search from a different angle
- Use different search terms or operators
- Target different aspects of the constraints
- Be distinctly different from already searched queries

Provide each query on a separate line.
"""

    try:
        response = self.model.invoke(prompt).content
        limit = self.queries_per_combination

        unique_queries = []
        for line in response.split("\n"):
            query = line.strip()
            if not query:
                continue
            if len(unique_queries) >= limit:
                # Stop before recording surplus suggestions: they will not
                # be searched now, and marking them as seen would make
                # later calls reject them as duplicates.
                break

            normalized = query.lower()
            if (
                normalized in self.searched_queries
                or normalized in self.query_variations
            ):
                logger.info(
                    f"Filtering out duplicate query variation: {query}"
                )
                continue

            unique_queries.append(query)
            self.query_variations.add(normalized)

        # If all queries were duplicates, fall back to the standard query,
        # unless it is blank or has itself been searched already.
        if not unique_queries:
            fallback = self._build_standard_query(constraints)
            normalized_fallback = fallback.strip().lower()
            if (
                normalized_fallback
                and normalized_fallback not in self.searched_queries
            ):
                unique_queries = [fallback]

        return unique_queries[:limit]
    except Exception as e:
        logger.error(f"Failed to generate query variations: {e}")
        # Fallback to single query.
        return [self._build_standard_query(constraints)]
|
257
|
+
|
258
|
+
def _extract_candidates_from_results(self, results: Dict) -> List:
|
259
|
+
"""Improved candidate extraction that's more generic."""
|
260
|
+
candidates = []
|
261
|
+
content = results.get("current_knowledge", "")
|
262
|
+
|
263
|
+
if not content:
|
264
|
+
return candidates
|
265
|
+
|
266
|
+
# Use LLM to extract relevant entities/topics from the content
|
267
|
+
prompt = f"""
|
268
|
+
From the following search results, extract all relevant entities, topics, or answers that match our search target type: {getattr(self, "entity_type", "unknown")}
|
269
|
+
|
270
|
+
Content:
|
271
|
+
{content}
|
272
|
+
|
273
|
+
List each potential match on a separate line.
|
274
|
+
Include only names/titles/identifiers, not descriptions.
|
275
|
+
"""
|
276
|
+
|
277
|
+
try:
|
278
|
+
response = self.model.invoke(prompt).content
|
279
|
+
entity_names = [
|
280
|
+
name.strip() for name in response.split("\n") if name.strip()
|
281
|
+
]
|
282
|
+
|
283
|
+
# Create candidates from extracted names
|
284
|
+
from ..candidates.base_candidate import Candidate
|
285
|
+
|
286
|
+
for name in entity_names:
|
287
|
+
if name and len(name) < 100: # Basic validation
|
288
|
+
candidate = Candidate(name=name)
|
289
|
+
candidates.append(candidate)
|
290
|
+
|
291
|
+
logger.info(f"Extracted {len(candidates)} candidates from results")
|
292
|
+
|
293
|
+
except Exception as e:
|
294
|
+
logger.error(f"Error extracting candidates: {e}")
|
295
|
+
|
296
|
+
return candidates
|
297
|
+
|
298
|
+
def _should_use_entity_seeding(self) -> bool:
|
299
|
+
"""Determine if entity seeding would be beneficial."""
|
300
|
+
entity_type = getattr(self, "entity_type", "").lower()
|
301
|
+
return (
|
302
|
+
"character" in entity_type
|
303
|
+
or "person" in entity_type
|
304
|
+
or "hero" in entity_type
|
305
|
+
)
|
306
|
+
|
307
|
+
def _perform_entity_seeding(self):
    """Ask the model for concrete entity names and search for them.

    The values of all constraints in ``constraint_ranking`` with weight
    above 0.7 are sent to the model as properties. The returned names are
    stored in ``entity_seeds`` (list markers removed, duplicates dropped,
    order preserved) and ``_search_entity_seeds`` is then called. Nothing
    happens when no constraint is heavy enough. Errors in the LLM or
    search path are logged, not raised.
    """
    import re

    logger.info("Performing entity seeding based on constraints")

    # Extract key properties from the high-weight constraints.
    key_properties = [
        constraint.value
        for constraint in self.constraint_ranking
        if constraint.weight > 0.7
    ]

    if not key_properties:
        return

    properties_text = "\n".join([f"- {prop}" for prop in key_properties])

    prompt = f"""
Based on these properties, suggest 5-10 specific {self.entity_type} names that might match:

Properties:
{properties_text}

For example, if looking for a scientist from the 19th century, you might suggest:
- Charles Darwin
- Marie Curie
- Louis Pasteur
- Thomas Edison

Provide one name per line. Be specific with actual character/entity names.
"""

    # The example in the prompt uses "- " bullets, and replies may also be
    # numbered ("1. " / "1) "). Such markers are not part of the name and
    # would end up inside the quoted seed query, so they are removed.
    list_marker = re.compile(r"^(?:[-*\u2022]\s+|\d+[.)]\s+)")

    try:
        response = self.model.invoke(prompt).content
        cleaned = (
            list_marker.sub("", line.strip()).strip()
            for line in response.split("\n")
        )
        # dict.fromkeys drops repeated names while keeping first-seen order.
        self.entity_seeds = list(
            dict.fromkeys(name for name in cleaned if name)
        )
        logger.info(f"Generated entity seeds: {self.entity_seeds}")

        # Immediately search for these seeds.
        self._search_entity_seeds()

    except Exception as e:
        logger.error(f"Error generating entity seeds: {e}")
|
349
|
+
|
350
|
+
def _search_entity_seeds(self):
|
351
|
+
"""Search for the entity seeds directly."""
|
352
|
+
if not self.entity_seeds:
|
353
|
+
return
|
354
|
+
|
355
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
356
|
+
futures = []
|
357
|
+
for seed in self.entity_seeds[:5]: # Limit to top 5
|
358
|
+
query = f'"{seed}"'
|
359
|
+
if query.lower() not in self.searched_queries:
|
360
|
+
self.searched_queries.add(query.lower())
|
361
|
+
future = executor.submit(self._execute_search, query)
|
362
|
+
futures.append((seed, future))
|
363
|
+
|
364
|
+
for seed, future in futures:
|
365
|
+
try:
|
366
|
+
results = future.result()
|
367
|
+
candidates = self._extract_candidates_from_results(results)
|
368
|
+
|
369
|
+
# Look for exact matches
|
370
|
+
for candidate in candidates:
|
371
|
+
if seed.lower() in candidate.name.lower():
|
372
|
+
logger.info(
|
373
|
+
f"Found seeded entity: {candidate.name}"
|
374
|
+
)
|
375
|
+
# Evaluate immediately
|
376
|
+
if hasattr(self, "_evaluate_candidate_immediately"):
|
377
|
+
self._evaluate_candidate_immediately(candidate)
|
378
|
+
else:
|
379
|
+
# Add to candidates list
|
380
|
+
if not hasattr(self, "candidates"):
|
381
|
+
self.candidates = []
|
382
|
+
self.candidates.append(candidate)
|
383
|
+
|
384
|
+
except Exception as e:
|
385
|
+
logger.error(f"Error searching for seed {seed}: {e}")
|
386
|
+
|
387
|
+
def _try_direct_property_search(self):
|
388
|
+
"""Try direct searches for high-weight property constraints."""
|
389
|
+
property_queries = []
|
390
|
+
|
391
|
+
for constraint in self.constraint_ranking:
|
392
|
+
if (
|
393
|
+
constraint.weight > 0.7
|
394
|
+
and constraint.type == ConstraintType.PROPERTY
|
395
|
+
):
|
396
|
+
# Create specific property-based queries
|
397
|
+
if (
|
398
|
+
"elastic" in constraint.value.lower()
|
399
|
+
or "stretch" in constraint.value.lower()
|
400
|
+
):
|
401
|
+
property_queries.extend(
|
402
|
+
[
|
403
|
+
f'"{constraint.value}" superhero character',
|
404
|
+
f'characters with "{constraint.value}"',
|
405
|
+
f"list of {self.entity_type} {constraint.value}",
|
406
|
+
]
|
407
|
+
)
|
408
|
+
elif (
|
409
|
+
"voice" in constraint.value.lower()
|
410
|
+
or "actor" in constraint.value.lower()
|
411
|
+
):
|
412
|
+
property_queries.append(
|
413
|
+
f"{constraint.value} {self.entity_type}"
|
414
|
+
)
|
415
|
+
|
416
|
+
# Execute property searches
|
417
|
+
if property_queries:
|
418
|
+
logger.info(
|
419
|
+
f"Executing direct property searches: {property_queries}"
|
420
|
+
)
|
421
|
+
with concurrent.futures.ThreadPoolExecutor(
|
422
|
+
max_workers=3
|
423
|
+
) as executor:
|
424
|
+
futures = []
|
425
|
+
for query in property_queries[
|
426
|
+
:3
|
427
|
+
]: # Limit to avoid too many searches
|
428
|
+
if query.lower() not in self.searched_queries:
|
429
|
+
self.searched_queries.add(query.lower())
|
430
|
+
future = executor.submit(self._execute_search, query)
|
431
|
+
futures.append(future)
|
432
|
+
|
433
|
+
for future in futures:
|
434
|
+
try:
|
435
|
+
results = future.result()
|
436
|
+
candidates = self._extract_candidates_from_results(
|
437
|
+
results
|
438
|
+
)
|
439
|
+
|
440
|
+
for candidate in candidates:
|
441
|
+
if hasattr(self, "_evaluate_candidate_immediately"):
|
442
|
+
self._evaluate_candidate_immediately(candidate)
|
443
|
+
|
444
|
+
except Exception as e:
|
445
|
+
logger.error(f"Property search error: {e}")
|
446
|
+
|
447
|
+
def _perform_entity_name_search(self):
    """Fallback pass: query each seeded entity name alongside key constraints.

    For each of the first three entries in ``self.entity_seeds``, a query
    is built from the quoted entity name followed by the values of those
    of the first two ``self.constraint_ranking`` entries whose weight
    exceeds 0.5. Queries already present (lower-cased) in
    ``self.searched_queries`` are skipped; new ones are recorded there
    before being executed.

    Candidates extracted from the results are only considered when their
    name contains the seed name (case-insensitive). Each such candidate
    is passed to ``_evaluate_candidate_immediately`` when that method
    exists, and the whole fallback stops as soon as ``self.best_score``
    (when present) reaches 0.99.

    Any exception raised while searching or evaluating one seed is logged
    and the loop moves on to the next seed.
    """
    logger.info("Performing entity name search fallback")

    for entity_name in self.entity_seeds[:3]:  # only the top 3 seeds
        # Values of the (at most two) top-ranked constraints that carry
        # enough weight to be worth adding to the query.
        constraint_terms = [
            ranked.value
            for ranked in self.constraint_ranking[:2]
            if ranked.weight > 0.5
        ]
        if not constraint_terms:
            continue

        joined_terms = " ".join(constraint_terms)
        query = f'"{entity_name}" {joined_terms}'
        query_key = query.lower()
        if query_key in self.searched_queries:
            # This exact query has already been issued.
            continue

        logger.info(f"Trying targeted entity search: {query}")
        self.searched_queries.add(query_key)

        try:
            results = self._execute_search(query)
            seed_lower = entity_name.lower()

            for candidate in self._extract_candidates_from_results(results):
                if seed_lower not in candidate.name.lower():
                    continue

                logger.info(
                    f"Found target entity in fallback: {candidate.name}"
                )
                if hasattr(self, "_evaluate_candidate_immediately"):
                    self._evaluate_candidate_immediately(candidate)

                # Stop the whole fallback once a near-perfect score exists.
                target_reached = (
                    hasattr(self, "best_score") and self.best_score >= 0.99
                )
                if target_reached:
                    return

        except Exception as e:
            logger.error(f"Entity name search error: {e}")
|
491
|
+
|
492
|
+
def _progressive_constraint_search(self):
    """Wrap the parent's progressive search with extra pre/post steps.

    Steps, in order:

    1. Detect the entity type and store it on ``self.entity_type``.
    2. Run entity seeding when ``self.use_entity_seeding`` is set and
       ``_should_use_entity_seeding()`` agrees.
    3. Run the direct property search when
       ``self.use_direct_property_search`` is set.
    4. Delegate to the parent class's ``_progressive_constraint_search``.
    5. If a ``best_score`` attribute exists, is still below 0.9, and
       entity seeds are available, run the entity-name fallback search.
    """
    # The entity type must be known before any of the optional searches.
    self.entity_type = self._detect_entity_type()
    logger.info(f"Detected entity type: {self.entity_type}")

    # Optional step: seed concrete entities when enabled and appropriate.
    if self.use_entity_seeding:
        if self._should_use_entity_seeding():
            self._perform_entity_seeding()

    # Optional step: direct searches for high-weight properties.
    if self.use_direct_property_search:
        self._try_direct_property_search()

    # The regular progressive search from the parent class.
    super()._progressive_constraint_search()

    # Name-based fallback, only when a score exists and is still weak.
    if not hasattr(self, "best_score"):
        return
    if self.best_score < 0.9 and self.entity_seeds:
        self._perform_entity_name_search()
|
@@ -1,18 +1,19 @@
|
|
1
1
|
import concurrent.futures
|
2
|
-
import logging
|
3
2
|
from typing import Dict
|
4
3
|
|
4
|
+
from loguru import logger
|
5
|
+
|
5
6
|
from ...citation_handler import CitationHandler
|
6
7
|
from ...config.llm_config import get_llm
|
7
8
|
from ...config.search_config import get_search
|
8
9
|
from ...utilities.db_utils import get_db_setting
|
10
|
+
from ...utilities.threading_utils import thread_context, thread_with_app_context
|
9
11
|
from ..filters.cross_engine_filter import CrossEngineFilter
|
10
12
|
from ..findings.repository import FindingsRepository
|
13
|
+
from ..questions.atomic_fact_question import AtomicFactQuestionGenerator
|
11
14
|
from ..questions.standard_question import StandardQuestionGenerator
|
12
15
|
from .base_strategy import BaseSearchStrategy
|
13
16
|
|
14
|
-
logger = logging.getLogger(__name__)
|
15
|
-
|
16
17
|
|
17
18
|
class SourceBasedSearchStrategy(BaseSearchStrategy):
|
18
19
|
"""
|
@@ -31,6 +32,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
31
32
|
filter_reindex: bool = True,
|
32
33
|
cross_engine_max_results: int = None,
|
33
34
|
all_links_of_system=None,
|
35
|
+
use_atomic_facts: bool = False,
|
34
36
|
):
|
35
37
|
"""Initialize with optional dependency injection for testing."""
|
36
38
|
# Pass the links list to the parent class
|
@@ -61,7 +63,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
61
63
|
self.citation_handler = citation_handler or CitationHandler(self.model)
|
62
64
|
|
63
65
|
# Initialize components
|
64
|
-
|
66
|
+
if use_atomic_facts:
|
67
|
+
self.question_generator = AtomicFactQuestionGenerator(self.model)
|
68
|
+
else:
|
69
|
+
self.question_generator = StandardQuestionGenerator(self.model)
|
65
70
|
self.findings_repository = FindingsRepository(self.model)
|
66
71
|
|
67
72
|
def _format_search_results_as_context(self, search_results):
|
@@ -87,9 +92,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
87
92
|
Analyze a topic using source-based search strategy.
|
88
93
|
"""
|
89
94
|
logger.info(f"Starting source-based research on topic: {query}")
|
90
|
-
accumulated_search_results_across_all_iterations =
|
91
|
-
[]
|
92
|
-
) # tracking links across iterations but not global
|
95
|
+
accumulated_search_results_across_all_iterations = [] # tracking links across iterations but not global
|
93
96
|
findings = []
|
94
97
|
total_citation_count_before_this_search = len(self.all_links_of_system)
|
95
98
|
|
@@ -120,10 +123,14 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
120
123
|
iterations_to_run = int(iterations_to_run)
|
121
124
|
try:
|
122
125
|
filtered_search_results = []
|
123
|
-
total_citation_count_before_this_search = len(
|
126
|
+
total_citation_count_before_this_search = len(
|
127
|
+
self.all_links_of_system
|
128
|
+
)
|
124
129
|
# Run each iteration
|
125
130
|
for iteration in range(1, iterations_to_run + 1):
|
126
|
-
iteration_progress_base = 5 + (iteration - 1) * (
|
131
|
+
iteration_progress_base = 5 + (iteration - 1) * (
|
132
|
+
70 / iterations_to_run
|
133
|
+
)
|
127
134
|
|
128
135
|
self._update_progress(
|
129
136
|
f"Starting iteration {iteration}/{iterations_to_run}",
|
@@ -141,7 +148,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
141
148
|
# For first iteration, use initial query
|
142
149
|
if iteration == 1:
|
143
150
|
# Generate questions for first iteration
|
144
|
-
context =
|
151
|
+
context = (
|
152
|
+
f"""Iteration: {iteration} of {iterations_to_run}"""
|
153
|
+
)
|
145
154
|
questions = self.question_generator.generate_questions(
|
146
155
|
current_knowledge=context,
|
147
156
|
query=query,
|
@@ -171,7 +180,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
171
180
|
elif iterations_to_run == 1:
|
172
181
|
context = ""
|
173
182
|
else:
|
174
|
-
context =
|
183
|
+
context = (
|
184
|
+
f"""Iteration: {iteration} of {iterations_to_run}"""
|
185
|
+
)
|
175
186
|
# Use standard question generator with search results as context
|
176
187
|
questions = self.question_generator.generate_questions(
|
177
188
|
current_knowledge=context,
|
@@ -199,6 +210,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
199
210
|
)
|
200
211
|
|
201
212
|
# Function for thread pool
|
213
|
+
@thread_with_app_context
|
202
214
|
def search_question(q):
|
203
215
|
try:
|
204
216
|
result = self.search.run(q)
|
@@ -212,7 +224,8 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
212
224
|
max_workers=len(all_questions)
|
213
225
|
) as executor:
|
214
226
|
futures = [
|
215
|
-
executor.submit(search_question,
|
227
|
+
executor.submit(search_question, thread_context(), q)
|
228
|
+
for q in all_questions
|
216
229
|
]
|
217
230
|
iteration_search_dict = {}
|
218
231
|
iteration_search_results = []
|
@@ -227,7 +240,7 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
227
240
|
iteration_search_dict[question] = search_results
|
228
241
|
|
229
242
|
self._update_progress(
|
230
|
-
f"Completed search {i + 1} of {len(all_questions)}: {question[:
|
243
|
+
f"Completed search {i + 1} of {len(all_questions)}: {question[:3000]}",
|
231
244
|
iteration_progress_base
|
232
245
|
+ 10
|
233
246
|
+ ((i + 1) / len(all_questions) * 30),
|
@@ -245,7 +258,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
245
258
|
self._update_progress(
|
246
259
|
f"Filtering search results for iteration {iteration}",
|
247
260
|
iteration_progress_base + 45,
|
248
|
-
{
|
261
|
+
{
|
262
|
+
"phase": "cross_engine_filtering",
|
263
|
+
"iteration": iteration,
|
264
|
+
},
|
249
265
|
)
|
250
266
|
|
251
267
|
existing_link_count = len(self.all_links_of_system)
|
@@ -301,13 +317,17 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
301
317
|
80,
|
302
318
|
{"phase": "final_filtering"},
|
303
319
|
)
|
304
|
-
final_filtered_results =
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
320
|
+
final_filtered_results = (
|
321
|
+
self.cross_engine_filter.filter_results(
|
322
|
+
accumulated_search_results_across_all_iterations,
|
323
|
+
query,
|
324
|
+
reorder=True, # Always reorder in final filtering
|
325
|
+
reindex=True, # Always reindex in final filtering
|
326
|
+
max_results=int(
|
327
|
+
get_db_setting("search.final_max_results") or 100
|
328
|
+
),
|
329
|
+
start_index=len(self.all_links_of_system),
|
330
|
+
)
|
311
331
|
)
|
312
332
|
self._update_progress(
|
313
333
|
f"Filtered from {len(accumulated_search_results_across_all_iterations)} to {len(final_filtered_results)} results",
|
@@ -341,7 +361,9 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
341
361
|
synthesized_content = final_citation_result["content"]
|
342
362
|
documents = final_citation_result.get("documents", [])
|
343
363
|
else:
|
344
|
-
synthesized_content =
|
364
|
+
synthesized_content = (
|
365
|
+
"No relevant results found in final synthesis."
|
366
|
+
)
|
345
367
|
documents = []
|
346
368
|
|
347
369
|
# Add a final synthesis finding
|
@@ -363,8 +385,10 @@ class SourceBasedSearchStrategy(BaseSearchStrategy):
|
|
363
385
|
)
|
364
386
|
|
365
387
|
# Format findings
|
366
|
-
formatted_findings =
|
367
|
-
|
388
|
+
formatted_findings = (
|
389
|
+
self.findings_repository.format_findings_to_text(
|
390
|
+
findings, synthesized_content
|
391
|
+
)
|
368
392
|
)
|
369
393
|
|
370
394
|
except Exception as e:
|