local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1031 @@
|
|
1
|
+
"""
|
2
|
+
BrowseComp Entity-Focused Search Strategy
|
3
|
+
|
4
|
+
This strategy is specifically designed for BrowseComp questions that require finding
|
5
|
+
specific entities (companies, people, events) that match multiple constraints.
|
6
|
+
|
7
|
+
Key features:
|
8
|
+
1. Entity extraction and progressive search
|
9
|
+
2. Knowledge graph building approach
|
10
|
+
3. Multi-constraint verification with caching
|
11
|
+
4. Specialized search patterns for different entity types
|
12
|
+
"""
|
13
|
+
|
14
|
+
import asyncio
import json
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple

from loguru import logger

from ...utilities.search_cache import get_search_cache
from ..candidate_exploration import ConstraintGuidedExplorer
from ..constraint_checking import DualConfidenceChecker
from ..constraints import Constraint, ConstraintAnalyzer
from ..questions import BrowseCompQuestionGenerator
from .base_strategy import BaseSearchStrategy
|
29
|
+
|
30
|
+
|
31
|
+
@dataclass
class EntityCandidate:
    """Enhanced candidate with entity-specific metadata.

    Attributes:
        name: Canonical (official/full) name of the entity.
        entity_type: Category of the entity (company, person, event, etc.).
        aliases: Alternate names and abbreviations for the entity.
        properties: Free-form key/value facts gathered about the entity.
        sources: URLs of search results the entity was extracted from.
        confidence: Aggregate confidence score in [0, 1].
        constraint_matches: Per-constraint match scores, keyed by the
            constraint's value string.
    """

    name: str
    entity_type: str  # company, person, event, etc.
    # field(default_factory=...) gives each instance its own container
    # instead of the old `= None` placeholders.
    aliases: List[str] = field(default_factory=list)
    properties: Dict[str, Any] = field(default_factory=dict)
    sources: List[str] = field(default_factory=list)
    confidence: float = 0.0
    constraint_matches: Dict[str, float] = field(default_factory=dict)

    def __post_init__(self):
        # Backward compatibility: callers that still pass None explicitly
        # (the old default) get empty containers, as before.
        self.aliases = self.aliases or []
        self.properties = self.properties or {}
        self.sources = self.sources or []
        self.constraint_matches = self.constraint_matches or {}
|
48
|
+
|
49
|
+
|
50
|
+
class EntityKnowledgeGraph:
    """Build and maintain knowledge about discovered entities."""

    def __init__(self):
        # Canonical entity name -> EntityCandidate record.
        self.entities = {}
        # Constraint value -> {entity name -> evidence dict}.
        self.constraint_evidence = defaultdict(dict)
        # Search query -> raw result list (avoids repeat searches).
        self.search_cache = {}

    def add_entity(self, entity: EntityCandidate):
        """Add or update an entity in the knowledge graph."""
        known = self.entities.get(entity.name)
        if known is None:
            self.entities[entity.name] = entity
            return
        # Merge the new sighting into the record we already hold:
        # union the alias/source lists, overlay properties and scores.
        known.aliases = list(set(known.aliases + entity.aliases))
        known.properties.update(entity.properties)
        known.sources = list(set(known.sources + entity.sources))
        known.constraint_matches.update(entity.constraint_matches)

    def add_constraint_evidence(
        self, constraint: str, entity_name: str, evidence: Dict
    ):
        """Add evidence for a constraint-entity pair."""
        self.constraint_evidence[constraint][entity_name] = evidence

    def get_entities_by_constraint(
        self, constraint: str, min_confidence: float = 0.5
    ) -> List[EntityCandidate]:
        """Get entities that match a constraint above confidence threshold."""
        hits = [
            candidate
            for candidate in self.entities.values()
            if constraint in candidate.constraint_matches
            and candidate.constraint_matches[constraint] >= min_confidence
        ]
        # Best-scoring candidates first.
        hits.sort(
            key=lambda c: c.constraint_matches.get(constraint, 0),
            reverse=True,
        )
        return hits
|
94
|
+
|
95
|
+
|
96
|
+
class BrowseCompEntityStrategy(BaseSearchStrategy):
|
97
|
+
"""
|
98
|
+
Entity-focused search strategy for BrowseComp questions.
|
99
|
+
|
100
|
+
This strategy:
|
101
|
+
1. Extracts key entities from the query
|
102
|
+
2. Performs broad entity discovery searches
|
103
|
+
3. Builds a knowledge graph of candidates
|
104
|
+
4. Progressively verifies constraints
|
105
|
+
5. Uses caching to avoid redundant searches
|
106
|
+
"""
|
107
|
+
|
108
|
+
def __init__(
|
109
|
+
self, model=None, search=None, all_links_of_system=None, **kwargs
|
110
|
+
):
|
111
|
+
super().__init__(all_links_of_system=all_links_of_system)
|
112
|
+
|
113
|
+
# Store model and search engine
|
114
|
+
self.model = model
|
115
|
+
self.search_engine = search
|
116
|
+
|
117
|
+
# Initialize components that depend on model/search
|
118
|
+
if self.model:
|
119
|
+
self.constraint_analyzer = ConstraintAnalyzer(model=self.model)
|
120
|
+
self.question_generator = BrowseCompQuestionGenerator()
|
121
|
+
else:
|
122
|
+
logger.warning("No model provided to BrowseCompEntityStrategy")
|
123
|
+
|
124
|
+
self.knowledge_graph = EntityKnowledgeGraph()
|
125
|
+
|
126
|
+
# Initialize constraint checker with entity-aware settings
|
127
|
+
if self.model:
|
128
|
+
self.constraint_checker = DualConfidenceChecker(
|
129
|
+
evidence_gatherer=self._gather_entity_evidence,
|
130
|
+
negative_threshold=0.3, # More lenient for entities
|
131
|
+
positive_threshold=0.4,
|
132
|
+
uncertainty_penalty=0.1,
|
133
|
+
negative_weight=1.0,
|
134
|
+
)
|
135
|
+
|
136
|
+
# Initialize specialized explorer
|
137
|
+
if self.search_engine and self.model:
|
138
|
+
self.explorer = ConstraintGuidedExplorer(
|
139
|
+
search_engine=self.search_engine, model=self.model
|
140
|
+
)
|
141
|
+
|
142
|
+
# Entity type patterns
|
143
|
+
self.entity_patterns = {
|
144
|
+
"company": [
|
145
|
+
"company",
|
146
|
+
"corporation",
|
147
|
+
"group",
|
148
|
+
"firm",
|
149
|
+
"business",
|
150
|
+
"conglomerate",
|
151
|
+
],
|
152
|
+
"person": ["person", "individual", "character", "figure", "people"],
|
153
|
+
"event": [
|
154
|
+
"event",
|
155
|
+
"incident",
|
156
|
+
"occurrence",
|
157
|
+
"game",
|
158
|
+
"match",
|
159
|
+
"competition",
|
160
|
+
],
|
161
|
+
"location": [
|
162
|
+
"place",
|
163
|
+
"location",
|
164
|
+
"city",
|
165
|
+
"country",
|
166
|
+
"region",
|
167
|
+
"area",
|
168
|
+
],
|
169
|
+
"product": ["product", "item", "device", "software", "app", "tool"],
|
170
|
+
}
|
171
|
+
|
172
|
+
async def search(
|
173
|
+
self,
|
174
|
+
query: str,
|
175
|
+
search_engines: List[str] = None,
|
176
|
+
progress_callback=None,
|
177
|
+
**kwargs,
|
178
|
+
) -> Tuple[str, Dict]:
|
179
|
+
"""Execute entity-focused search strategy."""
|
180
|
+
try:
|
181
|
+
logger.info(f"🎯 Starting BrowseComp Entity Search for: {query}")
|
182
|
+
|
183
|
+
# Phase 1: Constraint and entity analysis
|
184
|
+
if progress_callback:
|
185
|
+
progress_callback(
|
186
|
+
{
|
187
|
+
"phase": "entity_analysis",
|
188
|
+
"progress": 10,
|
189
|
+
"message": "Analyzing query for entities and constraints",
|
190
|
+
}
|
191
|
+
)
|
192
|
+
|
193
|
+
constraints = self.constraint_analyzer.extract_constraints(query)
|
194
|
+
entity_type = self._identify_entity_type(query)
|
195
|
+
logger.info(
|
196
|
+
f"Identified entity type: {entity_type}, {len(constraints)} constraints"
|
197
|
+
)
|
198
|
+
|
199
|
+
# Phase 2: Initial entity discovery
|
200
|
+
if progress_callback:
|
201
|
+
progress_callback(
|
202
|
+
{
|
203
|
+
"phase": "entity_discovery",
|
204
|
+
"progress": 25,
|
205
|
+
"message": f"Searching for {entity_type} entities",
|
206
|
+
}
|
207
|
+
)
|
208
|
+
|
209
|
+
initial_entities = await self._discover_entities(
|
210
|
+
query,
|
211
|
+
entity_type,
|
212
|
+
constraints[:2], # Use first 2 constraints for initial search
|
213
|
+
)
|
214
|
+
logger.info(f"Discovered {len(initial_entities)} initial entities")
|
215
|
+
|
216
|
+
# Phase 3: Progressive constraint verification
|
217
|
+
best_candidate = None
|
218
|
+
iteration = 0
|
219
|
+
max_iterations = 10
|
220
|
+
|
221
|
+
while iteration < max_iterations:
|
222
|
+
iteration += 1
|
223
|
+
|
224
|
+
if progress_callback:
|
225
|
+
progress_callback(
|
226
|
+
{
|
227
|
+
"phase": "constraint_verification",
|
228
|
+
"progress": 25 + (iteration * 50 / max_iterations),
|
229
|
+
"message": f"Verifying constraints (iteration {iteration}/{max_iterations})",
|
230
|
+
}
|
231
|
+
)
|
232
|
+
|
233
|
+
# Generate targeted searches based on current knowledge
|
234
|
+
questions = self.question_generator.generate_questions(
|
235
|
+
current_knowledge=self._summarize_knowledge(),
|
236
|
+
query=query,
|
237
|
+
questions_per_iteration=5,
|
238
|
+
iteration=iteration,
|
239
|
+
)
|
240
|
+
|
241
|
+
# Search for evidence
|
242
|
+
new_entities = await self._search_with_questions(
|
243
|
+
questions, entity_type
|
244
|
+
)
|
245
|
+
|
246
|
+
# Add to knowledge graph
|
247
|
+
for entity in new_entities:
|
248
|
+
self.knowledge_graph.add_entity(entity)
|
249
|
+
|
250
|
+
# Evaluate all entities against constraints
|
251
|
+
evaluated = await self._evaluate_entities(constraints)
|
252
|
+
|
253
|
+
# Check for high-confidence matches
|
254
|
+
if evaluated:
|
255
|
+
best_candidate = evaluated[0]
|
256
|
+
if best_candidate.confidence > 0.8:
|
257
|
+
logger.info(
|
258
|
+
f"✅ Found high-confidence match: {best_candidate.name} ({best_candidate.confidence:.2%})"
|
259
|
+
)
|
260
|
+
break
|
261
|
+
|
262
|
+
# Early stopping if no progress
|
263
|
+
if iteration > 3 and not self.knowledge_graph.entities:
|
264
|
+
logger.warning(
|
265
|
+
"No entities found after 3 iterations, stopping"
|
266
|
+
)
|
267
|
+
break
|
268
|
+
|
269
|
+
# Phase 4: Generate final answer
|
270
|
+
if progress_callback:
|
271
|
+
progress_callback(
|
272
|
+
{
|
273
|
+
"phase": "answer_generation",
|
274
|
+
"progress": 90,
|
275
|
+
"message": "Generating final answer",
|
276
|
+
}
|
277
|
+
)
|
278
|
+
|
279
|
+
if best_candidate and best_candidate.confidence > 0.5:
|
280
|
+
answer = await self._generate_entity_answer(
|
281
|
+
query, best_candidate, constraints
|
282
|
+
)
|
283
|
+
else:
|
284
|
+
answer = await self._generate_uncertain_answer(
|
285
|
+
query, evaluated[:3] if evaluated else []
|
286
|
+
)
|
287
|
+
|
288
|
+
# Prepare metadata
|
289
|
+
metadata = {
|
290
|
+
"strategy": "browsecomp_entity",
|
291
|
+
"entity_type": entity_type,
|
292
|
+
"entities_discovered": len(self.knowledge_graph.entities),
|
293
|
+
"iterations": iteration,
|
294
|
+
"best_candidate": best_candidate.name
|
295
|
+
if best_candidate
|
296
|
+
else None,
|
297
|
+
"confidence": best_candidate.confidence
|
298
|
+
if best_candidate
|
299
|
+
else 0.0,
|
300
|
+
"constraint_count": len(constraints),
|
301
|
+
"cached_searches": len(self.knowledge_graph.search_cache),
|
302
|
+
}
|
303
|
+
|
304
|
+
return answer, metadata
|
305
|
+
|
306
|
+
except Exception as e:
|
307
|
+
logger.error(
|
308
|
+
f"Error in BrowseComp entity search: {e}", exc_info=True
|
309
|
+
)
|
310
|
+
return f"Search failed: {str(e)}", {"error": str(e)}
|
311
|
+
|
312
|
+
def _identify_entity_type(self, query: str) -> str:
|
313
|
+
"""Identify what type of entity we're looking for."""
|
314
|
+
query_lower = query.lower()
|
315
|
+
|
316
|
+
for entity_type, keywords in self.entity_patterns.items():
|
317
|
+
if any(keyword in query_lower for keyword in keywords):
|
318
|
+
return entity_type
|
319
|
+
|
320
|
+
# Default based on common patterns
|
321
|
+
if "who" in query_lower:
|
322
|
+
return "person"
|
323
|
+
elif "which" in query_lower:
|
324
|
+
return "product"
|
325
|
+
elif "what" in query_lower and "company" in query_lower:
|
326
|
+
return "company"
|
327
|
+
else:
|
328
|
+
return "entity"
|
329
|
+
|
330
|
+
async def _discover_entities(
|
331
|
+
self,
|
332
|
+
query: str,
|
333
|
+
entity_type: str,
|
334
|
+
initial_constraints: List[Constraint],
|
335
|
+
) -> List[EntityCandidate]:
|
336
|
+
"""Discover initial entity candidates."""
|
337
|
+
entities = []
|
338
|
+
|
339
|
+
# Generate entity-focused search queries
|
340
|
+
search_queries = self._generate_entity_searches(
|
341
|
+
entity_type, initial_constraints
|
342
|
+
)
|
343
|
+
|
344
|
+
# Execute searches in parallel
|
345
|
+
search_tasks = []
|
346
|
+
for search_query in search_queries[:5]: # Limit initial searches
|
347
|
+
if search_query not in self.knowledge_graph.search_cache:
|
348
|
+
search_tasks.append(self._cached_search(search_query))
|
349
|
+
|
350
|
+
results = await asyncio.gather(*search_tasks)
|
351
|
+
|
352
|
+
# Extract entities from results
|
353
|
+
for query_results in results:
|
354
|
+
extracted = await self._extract_entities_from_results(
|
355
|
+
query_results, entity_type
|
356
|
+
)
|
357
|
+
entities.extend(extracted)
|
358
|
+
|
359
|
+
return entities
|
360
|
+
|
361
|
+
def _generate_entity_searches(
|
362
|
+
self, entity_type: str, constraints: List[Constraint]
|
363
|
+
) -> List[str]:
|
364
|
+
"""Generate search queries for entity discovery."""
|
365
|
+
searches = []
|
366
|
+
|
367
|
+
# Type-specific base queries
|
368
|
+
if entity_type == "company":
|
369
|
+
searches.extend(
|
370
|
+
[
|
371
|
+
"largest companies conglomerates groups",
|
372
|
+
"major corporation multinational business",
|
373
|
+
"company group founded",
|
374
|
+
]
|
375
|
+
)
|
376
|
+
elif entity_type == "person":
|
377
|
+
searches.extend(
|
378
|
+
[
|
379
|
+
"famous people individuals",
|
380
|
+
"notable person character",
|
381
|
+
"who known for",
|
382
|
+
]
|
383
|
+
)
|
384
|
+
elif entity_type == "event":
|
385
|
+
searches.extend(
|
386
|
+
[
|
387
|
+
"major events competitions",
|
388
|
+
"historical event game match",
|
389
|
+
"significant occurrence",
|
390
|
+
]
|
391
|
+
)
|
392
|
+
|
393
|
+
# Add constraint-based searches
|
394
|
+
for constraint in constraints:
|
395
|
+
if constraint.type.value == "TEMPORAL":
|
396
|
+
# Extract years/dates
|
397
|
+
years = re.findall(r"\b(19\d{2}|20\d{2})\b", constraint.value)
|
398
|
+
for year in years:
|
399
|
+
searches.append(f"{entity_type} {year}")
|
400
|
+
elif constraint.type.value == "LOCATION":
|
401
|
+
# Extract location names
|
402
|
+
locations = re.findall(
|
403
|
+
r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", constraint.value
|
404
|
+
)
|
405
|
+
for location in locations:
|
406
|
+
searches.append(f"{entity_type} {location}")
|
407
|
+
elif constraint.type.value == "STATISTIC":
|
408
|
+
# Extract numbers
|
409
|
+
numbers = re.findall(r"\b\d+\b", constraint.value)
|
410
|
+
for number in numbers:
|
411
|
+
searches.append(f"{entity_type} {number}")
|
412
|
+
|
413
|
+
return searches
|
414
|
+
|
415
|
+
async def _extract_entities_from_results(
|
416
|
+
self, results: List[Dict], entity_type: str
|
417
|
+
) -> List[EntityCandidate]:
|
418
|
+
"""Extract entity candidates from search results."""
|
419
|
+
if not results:
|
420
|
+
return []
|
421
|
+
|
422
|
+
# Use LLM to extract entities
|
423
|
+
results_text = "\n".join(
|
424
|
+
[
|
425
|
+
f"- {r.get('title', '')}: {r.get('snippet', '')[:200]}"
|
426
|
+
for r in results[:10]
|
427
|
+
]
|
428
|
+
)
|
429
|
+
|
430
|
+
prompt = f"""Extract {entity_type} entities from these search results.
|
431
|
+
|
432
|
+
Search Results:
|
433
|
+
{results_text}
|
434
|
+
|
435
|
+
For each entity found, provide:
|
436
|
+
1. Name (official/full name)
|
437
|
+
2. Aliases (other names, abbreviations)
|
438
|
+
3. Key properties (founding year, location, size, etc.)
|
439
|
+
|
440
|
+
Format as JSON:
|
441
|
+
[
|
442
|
+
{{
|
443
|
+
"name": "Entity Name",
|
444
|
+
"aliases": ["alias1", "alias2"],
|
445
|
+
"properties": {{"key": "value"}}
|
446
|
+
}}
|
447
|
+
]
|
448
|
+
|
449
|
+
Return only entities that are clearly {entity_type} entities."""
|
450
|
+
|
451
|
+
response = await self.model.ainvoke(prompt)
|
452
|
+
|
453
|
+
try:
|
454
|
+
entities_data = json.loads(response.content)
|
455
|
+
entities = []
|
456
|
+
|
457
|
+
for data in entities_data:
|
458
|
+
entity = EntityCandidate(
|
459
|
+
name=data["name"],
|
460
|
+
entity_type=entity_type,
|
461
|
+
aliases=data.get("aliases", []),
|
462
|
+
properties=data.get("properties", {}),
|
463
|
+
sources=[r.get("url", "") for r in results[:3]],
|
464
|
+
)
|
465
|
+
entities.append(entity)
|
466
|
+
|
467
|
+
return entities
|
468
|
+
|
469
|
+
except json.JSONDecodeError:
|
470
|
+
logger.warning("Failed to parse entity extraction response")
|
471
|
+
return []
|
472
|
+
|
473
|
+
async def _search_with_questions(
    self, questions: List[str], entity_type: str
) -> List["EntityCandidate"]:
    """Run searches for the given questions and extract entities.

    Questions already present in the knowledge graph's search cache are
    skipped; the remaining searches are issued concurrently and every
    result set is mined for entity candidates.
    """
    # Kick off one cached search per not-yet-seen question.
    pending = [
        self._cached_search(question)
        for question in questions
        if question not in self.knowledge_graph.search_cache
    ]

    result_sets = await asyncio.gather(*pending)

    # Mine each result set for entity candidates.
    collected = []
    for result_set in result_sets:
        extracted = await self._extract_entities_from_results(
            result_set, entity_type
        )
        collected.extend(extracted)

    return collected
|
495
|
+
|
496
|
+
async def _evaluate_entities(
    self, constraints: List["Constraint"]
) -> List["EntityCandidate"]:
    """Evaluate all known entities against the given constraints.

    For each entity, cached evidence is reused where available;
    otherwise the constraint is verified with a fresh search. Entities
    scoring above a minimal confidence threshold are returned sorted by
    confidence, highest first.
    """
    # Guard the weighted average: the original divided by
    # sum(c.weight ...) and crashed with ZeroDivisionError on an empty
    # constraint list (or all-zero weights).
    total_weight = sum(c.weight for c in constraints)
    if total_weight <= 0:
        return []

    evaluated = []

    for entity_name, entity in self.knowledge_graph.entities.items():
        total_score = 0.0
        constraint_scores = {}

        for constraint in constraints:
            # Reuse cached evidence for this constraint/entity pair if
            # it was gathered earlier; otherwise verify afresh.
            cached = self.knowledge_graph.constraint_evidence.get(
                constraint.value, {}
            ).get(entity_name)
            if cached is not None:
                score = cached.get("score", 0.0)
            else:
                score = await self._verify_entity_constraint(
                    entity, constraint
                )

            constraint_scores[constraint.value] = score
            total_score += score * constraint.weight

        # Record per-constraint scores and the weighted average.
        entity.constraint_matches = constraint_scores
        entity.confidence = total_score / total_weight

        if entity.confidence > 0.3:  # Only keep reasonable candidates
            evaluated.append(entity)

    # Sort by confidence
    return sorted(evaluated, key=lambda e: e.confidence, reverse=True)
|
542
|
+
|
543
|
+
async def _verify_entity_constraint(
    self, entity: "EntityCandidate", constraint: "Constraint"
) -> float:
    """Verify whether an entity satisfies a constraint.

    Searches for evidence using the entity's name (and up to two
    aliases) combined with constraint terms, asks the LLM for a
    confidence score, caches the gathered evidence, and returns the
    best score seen across search terms.
    """
    # Build targeted search queries from the name plus two aliases.
    search_terms = [entity.name] + entity.aliases[:2]
    constraint_terms = self._extract_constraint_terms(constraint)

    best_score = 0.0
    for term in search_terms:
        query = f'"{term}" {" ".join(constraint_terms)}'

        # Search for evidence
        results = await self._cached_search(query)
        if not results:
            continue

        # Quick verification with the LLM on the top snippets.
        evidence_text = " ".join(
            [r.get("snippet", "") for r in results[:3]]
        )

        prompt = f"""Does {entity.name} satisfy this constraint?

Constraint: {constraint.description}
Evidence: {evidence_text}

Answer with a confidence score from 0.0 to 1.0 and brief explanation.
Format: SCORE: X.X | REASON: explanation"""

        response = await self.model.ainvoke(prompt)
        content = response.content

        # Extract score
        score_match = re.search(r"SCORE:\s*([\d.]+)", content)
        if score_match:
            score = float(score_match.group(1))
            best_score = max(best_score, score)

            # Cache the evidence only when a score was actually
            # parsed; otherwise `score` would be undefined (or stale
            # from a previous loop iteration).
            self.knowledge_graph.add_constraint_evidence(
                constraint.value,
                entity.name,
                {
                    "score": score,
                    "evidence": evidence_text,
                    "reason": content,
                },
            )

    return best_score
|
593
|
+
|
594
|
+
def _extract_constraint_terms(self, constraint: Constraint) -> List[str]:
|
595
|
+
"""Extract searchable terms from a constraint."""
|
596
|
+
terms = []
|
597
|
+
|
598
|
+
# Remove common prefixes
|
599
|
+
value = constraint.value
|
600
|
+
for prefix in ["The answer must", "Must be", "Should be", "Is"]:
|
601
|
+
if value.startswith(prefix):
|
602
|
+
value = value[len(prefix) :].strip()
|
603
|
+
break
|
604
|
+
|
605
|
+
# Extract specific terms based on constraint type
|
606
|
+
if constraint.type.value == "TEMPORAL":
|
607
|
+
# Extract years
|
608
|
+
terms.extend(re.findall(r"\b(19\d{2}|20\d{2})\b", value))
|
609
|
+
elif constraint.type.value == "STATISTIC":
|
610
|
+
# Extract numbers
|
611
|
+
terms.extend(re.findall(r"\b\d+\b", value))
|
612
|
+
elif constraint.type.value == "LOCATION":
|
613
|
+
# Extract proper nouns
|
614
|
+
terms.extend(
|
615
|
+
re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", value)
|
616
|
+
)
|
617
|
+
|
618
|
+
# Add key descriptive words
|
619
|
+
words = value.split()
|
620
|
+
for word in words:
|
621
|
+
if len(word) > 4 and word.lower() not in [
|
622
|
+
"must",
|
623
|
+
"should",
|
624
|
+
"would",
|
625
|
+
"could",
|
626
|
+
]:
|
627
|
+
terms.append(word)
|
628
|
+
|
629
|
+
return terms[:5] # Limit to avoid overly long queries
|
630
|
+
|
631
|
+
def extract_entity_candidates(
    self, constraints: List["Constraint"]
) -> List[str]:
    """
    Extract potential entity names using constraint analysis.
    Implements progressive entity discovery from improvement strategy.
    """
    proper_noun_re = r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b"
    company_patterns = [
        r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:Group|Inc|Ltd|Corp|Company|Corporation)",
        r"([A-Z][a-z]+(?:-[A-Z][a-z]+)*)\s+(?:Group|Inc|Ltd|Corp)",
        r"([A-Z]{2,}(?:-[A-Z]{2,})*)",  # Acronyms like PRAN-RFL
    ]

    candidates = []
    for constraint in constraints:
        # Proper nouns are likely entity names.
        candidates += re.findall(proper_noun_re, constraint.value)

        # Company-style names: suffixed phrases and all-caps acronyms.
        for pattern in company_patterns:
            candidates += re.findall(pattern, constraint.value)

    # Deduplicate, then put the most specific names first so they are
    # searched before generic ones.
    return sorted(set(candidates), key=self.entity_specificity, reverse=True)
|
663
|
+
|
664
|
+
def entity_specificity(self, entity: str) -> float:
    """
    Score entity specificity for search prioritization.
    Higher scores = more specific entities to search first.
    """
    # Length and word count: longer, multi-word names rank higher.
    score = len(entity) * 0.1 + len(entity.split()) * 2.0

    # Corporate suffixes strongly suggest a concrete organization.
    corporate_suffixes = (
        "Group",
        "Inc",
        "Ltd",
        "Corp",
        "Corporation",
        "Company",
        "Conglomerate",
    )
    if any(suffix in entity for suffix in corporate_suffixes):
        score += 10.0

    # Hyphenated names (like PRAN-RFL) are often specific.
    if "-" in entity:
        score += 5.0

    # All-caps acronyms of three or more characters are specific too.
    if entity.isupper() and len(entity) >= 3:
        score += 8.0

    return score
|
700
|
+
|
701
|
+
def _gather_entity_evidence(self, candidate, constraint):
    """Evidence gatherer function for constraint checker.

    Normalizes *candidate* to an EntityCandidate, serves cached
    evidence when available, otherwise runs a targeted search and
    converts the top results into evidence dicts.
    """
    # Normalize to an EntityCandidate so downstream code is uniform.
    if isinstance(candidate, EntityCandidate):
        entity = candidate
    else:
        entity = EntityCandidate(
            name=candidate.name
            if hasattr(candidate, "name")
            else str(candidate),
            entity_type="unknown",
        )

    # Serve cached evidence for this constraint/entity pair if present.
    if (
        constraint.value in self.knowledge_graph.constraint_evidence
        and entity.name
        in self.knowledge_graph.constraint_evidence[constraint.value]
    ):
        evidence_data = self.knowledge_graph.constraint_evidence[
            constraint.value
        ][entity.name]
        return [
            {
                "text": evidence_data.get("evidence", ""),
                "source": "cache",
                "confidence": evidence_data.get("score", 0.5),
            }
        ]

    # Build a targeted query from the entity name and constraint terms.
    terms = self._extract_constraint_terms(constraint)
    query = f'"{entity.name}" {" ".join(terms)}'

    # Best-effort search: engines without a run() method yield nothing.
    if hasattr(self.search_engine, "run"):
        results = self.search_engine.run(query)
    else:
        results = []

    # Top results become evidence dicts with decaying confidence.
    return [
        {
            "text": result.get("snippet", ""),
            "source": result.get("url", f"result_{idx}"),
            "confidence": 0.7 - (idx * 0.1),
        }
        for idx, result in enumerate(results[:3])
    ]
|
754
|
+
|
755
|
+
def _summarize_knowledge(self) -> str:
|
756
|
+
"""Summarize current knowledge for question generation."""
|
757
|
+
summary_parts = []
|
758
|
+
|
759
|
+
# Top entities by confidence
|
760
|
+
entities_by_confidence = sorted(
|
761
|
+
self.knowledge_graph.entities.values(),
|
762
|
+
key=lambda e: e.confidence,
|
763
|
+
reverse=True,
|
764
|
+
)[:5]
|
765
|
+
|
766
|
+
if entities_by_confidence:
|
767
|
+
summary_parts.append("Top candidates found:")
|
768
|
+
for entity in entities_by_confidence:
|
769
|
+
summary_parts.append(
|
770
|
+
f"- {entity.name} ({entity.entity_type}): {entity.confidence:.2%} confidence"
|
771
|
+
)
|
772
|
+
if entity.properties:
|
773
|
+
props = ", ".join(
|
774
|
+
f"{k}={v}"
|
775
|
+
for k, v in list(entity.properties.items())[:3]
|
776
|
+
)
|
777
|
+
summary_parts.append(f" Properties: {props}")
|
778
|
+
|
779
|
+
# Constraint satisfaction summary
|
780
|
+
if self.knowledge_graph.constraint_evidence:
|
781
|
+
summary_parts.append("\nConstraint verification status:")
|
782
|
+
for constraint, entities in list(
|
783
|
+
self.knowledge_graph.constraint_evidence.items()
|
784
|
+
)[:3]:
|
785
|
+
summary_parts.append(f"- {constraint[:50]}...")
|
786
|
+
for entity_name, evidence in list(entities.items())[:2]:
|
787
|
+
score = evidence.get("score", 0)
|
788
|
+
summary_parts.append(f" {entity_name}: {score:.2%}")
|
789
|
+
|
790
|
+
return "\n".join(summary_parts)
|
791
|
+
|
792
|
+
async def _cached_search(self, query: str) -> List[Dict]:
    """Perform search with caching support.

    Serves *query* from the shared search cache when possible; on a
    miss, runs the search via whatever interface the engine exposes,
    normalizes the results to a list, and caches them for 30 minutes.
    """
    cache = get_search_cache()

    # Fast path: a cached result set.
    cached_results = cache.get(query, "browsecomp_entity")
    if cached_results is not None:
        logger.debug(f"Using cached search results for: {query[:50]}...")
        return cached_results

    try:
        # The engine may expose run(), search(), or be directly callable.
        if hasattr(self.search_engine, "run"):
            raw = self.search_engine.run(query)
        elif hasattr(self.search_engine, "search"):
            raw = self.search_engine.search(query)
        elif callable(self.search_engine):
            raw = self.search_engine(query)
        else:
            logger.warning("Search engine has no callable method")
            return []

        # Normalize to a plain list of result dicts.
        if isinstance(raw, list):
            normalized_results = raw
        elif isinstance(raw, dict):
            normalized_results = raw.get("results", [])
        else:
            normalized_results = []

        cache.put(
            query, normalized_results, "browsecomp_entity", ttl=1800
        )  # 30 minutes

        logger.debug(f"Cached new search results for: {query[:50]}...")
        return normalized_results

    except Exception as e:
        logger.error(f"Search failed for query '{query}': {e}")
        return []
|
833
|
+
|
834
|
+
async def _generate_entity_answer(
|
835
|
+
self,
|
836
|
+
query: str,
|
837
|
+
best_entity: EntityCandidate,
|
838
|
+
constraints: List[Constraint],
|
839
|
+
) -> str:
|
840
|
+
"""Generate answer for the best matching entity."""
|
841
|
+
constraint_details = []
|
842
|
+
for constraint in constraints:
|
843
|
+
score = best_entity.constraint_matches.get(constraint.value, 0)
|
844
|
+
constraint_details.append(
|
845
|
+
f"- {constraint.description}: {score:.2%} confidence"
|
846
|
+
)
|
847
|
+
|
848
|
+
prompt = f"""Based on the search results, provide the answer to: {query}
|
849
|
+
|
850
|
+
Best matching {best_entity.entity_type}: {best_entity.name}
|
851
|
+
Overall confidence: {best_entity.confidence:.2%}
|
852
|
+
|
853
|
+
Aliases/Other names: {", ".join(best_entity.aliases[:3]) if best_entity.aliases else "None found"}
|
854
|
+
|
855
|
+
Properties:
|
856
|
+
{json.dumps(best_entity.properties, indent=2) if best_entity.properties else "No properties found"}
|
857
|
+
|
858
|
+
Constraint satisfaction:
|
859
|
+
{chr(10).join(constraint_details)}
|
860
|
+
|
861
|
+
Provide a clear, confident answer that explains why this entity matches the constraints."""
|
862
|
+
|
863
|
+
response = await self.model.ainvoke(prompt)
|
864
|
+
return response.content
|
865
|
+
|
866
|
+
async def _generate_uncertain_answer(
|
867
|
+
self, query: str, top_entities: List[EntityCandidate]
|
868
|
+
) -> str:
|
869
|
+
"""Generate answer when no high-confidence match is found."""
|
870
|
+
if not top_entities:
|
871
|
+
return "Unable to find any entities matching the specified constraints."
|
872
|
+
|
873
|
+
candidates_info = []
|
874
|
+
for entity in top_entities:
|
875
|
+
candidates_info.append(
|
876
|
+
f"- {entity.name}: {entity.confidence:.2%} confidence"
|
877
|
+
)
|
878
|
+
|
879
|
+
prompt = f"""Based on the search results for: {query}
|
880
|
+
|
881
|
+
Found these potential matches but with low confidence:
|
882
|
+
{chr(10).join(candidates_info)}
|
883
|
+
|
884
|
+
The search was unable to find a definitive answer matching all constraints.
|
885
|
+
|
886
|
+
Provide a helpful response explaining what was found and why no definitive answer could be determined."""
|
887
|
+
|
888
|
+
response = await self.model.ainvoke(prompt)
|
889
|
+
return response.content
|
890
|
+
|
891
|
+
def analyze_topic(self, query: str) -> Dict:
    """
    Analyze a topic using entity-focused BrowseComp approach.

    Args:
        query: The research query to analyze

    Returns:
        Dict containing findings, iterations, and questions
    """
    import asyncio

    try:
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                # A loop is already running in this thread; hand the
                # coroutine to a worker thread with its own loop.
                import concurrent.futures

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(
                        asyncio.run, self._analyze_topic_async(query)
                    )
                    return future.result()
            return loop.run_until_complete(
                self._analyze_topic_async(query)
            )
        except RuntimeError:
            # No usable event loop -- create a fresh one.
            return asyncio.run(self._analyze_topic_async(query))

    except Exception as e:
        logger.error(f"Error in analyze_topic: {e}")
        return {
            "findings": [f"Error analyzing query: {str(e)}"],
            "iterations": 0,
            "questions": {},
            "entities_found": 0,
            "confidence": 0.0,
        }
|
933
|
+
|
934
|
+
async def _analyze_topic_async(self, query: str) -> Dict:
    """Async implementation of topic analysis.

    Parses constraints from the query, iteratively discovers candidate
    entities (stopping early on a high-confidence hit), evaluates them
    against the constraints, and produces a final answer plus
    bookkeeping data (iterations, questions, confidence).
    """
    try:
        self._update_progress("Starting entity-focused analysis...", 0)

        # Parse constraints from query
        constraint_analyzer = ConstraintAnalyzer()
        constraints = constraint_analyzer.analyze_query(query)

        self._update_progress(
            f"Identified {len(constraints)} constraints", 10
        )

        # Generate initial search questions
        question_generator = BrowseCompQuestionGenerator()
        initial_questions = question_generator.generate_questions(
            query, constraints
        )

        self._update_progress("Generated initial questions", 20)

        # Progressive entity discovery: up to max_iterations passes,
        # stopping early once a high-confidence entity appears.
        all_entities = []
        max_iterations = 3
        # Counts passes actually executed. The original reported
        # `iteration + 1`, which over-counted by one (4 for 3 passes)
        # whenever the loop ran to completion without an early break.
        iterations_run = 0

        for iteration in range(max_iterations):
            self._update_progress(
                f"Iteration {iteration + 1}: Discovering entities...",
                30 + iteration * 20,
            )
            iterations_run = iteration + 1

            # Search for entities
            entities = await self._discover_entities_from_questions(
                initial_questions, self._determine_entity_type(query)
            )
            all_entities.extend(entities)

            # Break if we found high-confidence entities
            if any(e.confidence > 0.8 for e in entities):
                logger.info(
                    f"Found high-confidence entities in iteration {iteration + 1}"
                )
                break

        self._update_progress(
            "Evaluating entities against constraints...", 80
        )

        # Evaluate entities
        evaluated_entities = await self._evaluate_entities(constraints)

        # Generate final answer
        best_entity = None
        if evaluated_entities:
            best_entity = max(
                evaluated_entities, key=lambda e: e.confidence
            )
            if best_entity.confidence > 0.6:
                answer = await self._generate_entity_answer(
                    query, best_entity, constraints
                )
            else:
                answer = await self._generate_uncertain_answer(
                    query, evaluated_entities[:3]
                )
        else:
            answer = (
                "No entities were found matching the specified constraints."
            )

        self._update_progress("Analysis complete", 100)

        # Return results in expected format
        return {
            "findings": [answer],
            "iterations": iterations_run,
            "questions": {
                f"iteration_{i}": initial_questions
                for i in range(iterations_run)
            },
            "entities_found": len(evaluated_entities),
            "confidence": best_entity.confidence if best_entity else 0.0,
            "strategy": "browsecomp_entity",
        }

    except Exception as e:
        logger.error(f"Error in async topic analysis: {e}")
        return {
            "findings": [f"Analysis failed: {str(e)}"],
            "iterations": 0,
            "questions": {},
            "entities_found": 0,
            "confidence": 0.0,
        }
|