local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1348 @@
|
|
1
|
+
"""
|
2
|
+
Constrained search strategy that progressively narrows down candidates.
|
3
|
+
|
4
|
+
This strategy mimics human problem-solving by:
|
5
|
+
1. Starting with the most restrictive constraints
|
6
|
+
2. Finding candidates that match those constraints
|
7
|
+
3. Progressively checking additional constraints
|
8
|
+
4. Narrowing down the candidate pool step by step
|
9
|
+
"""
|
10
|
+
|
11
|
+
from datetime import datetime
|
12
|
+
from typing import Any, Dict, List
|
13
|
+
|
14
|
+
from langchain_core.language_models import BaseChatModel
|
15
|
+
from loguru import logger
|
16
|
+
|
17
|
+
from ...utilities.search_utilities import remove_think_tags
|
18
|
+
from ..candidates.base_candidate import Candidate
|
19
|
+
from ..constraints.base_constraint import Constraint, ConstraintType
|
20
|
+
from ..evidence.base_evidence import Evidence, EvidenceType
|
21
|
+
from .evidence_based_strategy import EvidenceBasedStrategy
|
22
|
+
|
23
|
+
|
24
|
+
class ConstrainedSearchStrategy(EvidenceBasedStrategy):
|
25
|
+
"""
|
26
|
+
Strategy that progressively narrows down candidates using constraints.
|
27
|
+
|
28
|
+
Key approach:
|
29
|
+
1. Rank constraints by restrictiveness
|
30
|
+
2. Start with most restrictive constraint
|
31
|
+
3. Find candidates matching that constraint
|
32
|
+
4. Progressively filter using additional constraints
|
33
|
+
5. Gather evidence only for promising candidates
|
34
|
+
"""
|
35
|
+
|
36
|
+
def __init__(
|
37
|
+
self,
|
38
|
+
model: BaseChatModel,
|
39
|
+
search: Any,
|
40
|
+
all_links_of_system: List[str],
|
41
|
+
max_iterations: int = 20,
|
42
|
+
confidence_threshold: float = 0.85,
|
43
|
+
candidate_limit: int = 100, # Increased to get more candidates
|
44
|
+
evidence_threshold: float = 0.6,
|
45
|
+
max_search_iterations: int = 2,
|
46
|
+
questions_per_iteration: int = 3,
|
47
|
+
min_candidates_per_stage: int = 20, # Need more candidates before filtering
|
48
|
+
):
|
49
|
+
"""Initialize the constrained search strategy."""
|
50
|
+
super().__init__(
|
51
|
+
model=model,
|
52
|
+
search=search,
|
53
|
+
all_links_of_system=all_links_of_system,
|
54
|
+
max_iterations=max_iterations,
|
55
|
+
confidence_threshold=confidence_threshold,
|
56
|
+
candidate_limit=candidate_limit,
|
57
|
+
evidence_threshold=evidence_threshold,
|
58
|
+
max_search_iterations=max_search_iterations,
|
59
|
+
questions_per_iteration=questions_per_iteration,
|
60
|
+
)
|
61
|
+
|
62
|
+
self.min_candidates_per_stage = min_candidates_per_stage
|
63
|
+
self.constraint_ranking: List[Constraint] = []
|
64
|
+
self.stage_candidates: Dict[int, List[Candidate]] = {}
|
65
|
+
|
66
|
+
# Enable direct search optimization for entity identification
|
67
|
+
# Note: parent class already sets this, but we ensure it's True
|
68
|
+
self.use_direct_search = True
|
69
|
+
logger.info(
|
70
|
+
f"ConstrainedSearchStrategy init: use_direct_search={self.use_direct_search}"
|
71
|
+
)
|
72
|
+
|
73
|
+
def analyze_topic(self, query: str) -> Dict:
|
74
|
+
"""Analyze topic using progressive constraint narrowing."""
|
75
|
+
# Initialize
|
76
|
+
self.all_links_of_system.clear()
|
77
|
+
self.questions_by_iteration = []
|
78
|
+
self.findings = []
|
79
|
+
self.iteration = 0
|
80
|
+
|
81
|
+
if self.progress_callback:
|
82
|
+
self.progress_callback(
|
83
|
+
"Analyzing query to identify constraints and rank by restrictiveness",
|
84
|
+
2,
|
85
|
+
{"phase": "initialization", "strategy": "constrained_search"},
|
86
|
+
)
|
87
|
+
|
88
|
+
# Extract and rank constraints
|
89
|
+
self.constraints = self.constraint_analyzer.extract_constraints(query)
|
90
|
+
self.constraint_ranking = self._rank_constraints_by_restrictiveness()
|
91
|
+
|
92
|
+
if self.progress_callback:
|
93
|
+
ranking_summary = ", ".join(
|
94
|
+
[
|
95
|
+
f"{i + 1}. {c.description[:30]}..."
|
96
|
+
for i, c in enumerate(self.constraint_ranking[:3])
|
97
|
+
]
|
98
|
+
)
|
99
|
+
self.progress_callback(
|
100
|
+
f"Found {len(self.constraints)} constraints. Order: {ranking_summary}",
|
101
|
+
5,
|
102
|
+
{
|
103
|
+
"phase": "constraint_ranking",
|
104
|
+
"constraint_count": len(self.constraints),
|
105
|
+
"ranking": [
|
106
|
+
(c.description, c.type.value)
|
107
|
+
for c in self.constraint_ranking
|
108
|
+
],
|
109
|
+
},
|
110
|
+
)
|
111
|
+
|
112
|
+
# Add initial analysis finding
|
113
|
+
initial_finding = {
|
114
|
+
"phase": "Constraint Analysis",
|
115
|
+
"content": self._format_constraint_analysis(),
|
116
|
+
"timestamp": self._get_timestamp(),
|
117
|
+
}
|
118
|
+
self.findings.append(initial_finding)
|
119
|
+
|
120
|
+
# Progressive constraint search
|
121
|
+
self._progressive_constraint_search()
|
122
|
+
|
123
|
+
# Add search summary finding
|
124
|
+
search_finding = {
|
125
|
+
"phase": "Progressive Search Summary",
|
126
|
+
"content": self._format_search_summary(),
|
127
|
+
"timestamp": self._get_timestamp(),
|
128
|
+
}
|
129
|
+
self.findings.append(search_finding)
|
130
|
+
|
131
|
+
# Evidence gathering for final candidates
|
132
|
+
self._focused_evidence_gathering()
|
133
|
+
|
134
|
+
# Add evidence summary finding
|
135
|
+
evidence_finding = {
|
136
|
+
"phase": "Evidence Summary",
|
137
|
+
"content": self._format_evidence_summary(),
|
138
|
+
"timestamp": self._get_timestamp(),
|
139
|
+
}
|
140
|
+
self.findings.append(evidence_finding)
|
141
|
+
|
142
|
+
# Add comprehensive debug summary
|
143
|
+
debug_finding = {
|
144
|
+
"phase": "Debug Summary",
|
145
|
+
"content": self._format_debug_summary(),
|
146
|
+
"timestamp": self._get_timestamp(),
|
147
|
+
"metadata": {
|
148
|
+
"total_searches": (
|
149
|
+
len(self.search_history)
|
150
|
+
if hasattr(self, "search_history")
|
151
|
+
else 0
|
152
|
+
),
|
153
|
+
"final_candidates": len(self.candidates),
|
154
|
+
"constraints_used": len(self.constraints),
|
155
|
+
"stages_completed": len(self.stage_candidates),
|
156
|
+
},
|
157
|
+
}
|
158
|
+
self.findings.append(debug_finding)
|
159
|
+
|
160
|
+
# Final synthesis
|
161
|
+
return self._synthesize_final_answer(query)
|
162
|
+
|
163
|
+
def _rank_constraints_by_restrictiveness(self) -> List[Constraint]:
|
164
|
+
"""Rank constraints from most to least restrictive."""
|
165
|
+
# Scoring system for restrictiveness
|
166
|
+
restrictiveness_scores = []
|
167
|
+
|
168
|
+
for constraint in self.constraints:
|
169
|
+
score = 0
|
170
|
+
|
171
|
+
# Type-based scoring
|
172
|
+
if constraint.type == ConstraintType.STATISTIC:
|
173
|
+
score += 10 # Numbers are usually very restrictive
|
174
|
+
elif constraint.type == ConstraintType.EVENT:
|
175
|
+
score += 8 # Events/time periods are restrictive
|
176
|
+
elif constraint.type == ConstraintType.LOCATION:
|
177
|
+
score += 6 # Locations are moderately restrictive
|
178
|
+
elif constraint.type == ConstraintType.PROPERTY:
|
179
|
+
score += 4 # Properties are less restrictive
|
180
|
+
|
181
|
+
# Specificity scoring
|
182
|
+
if constraint.value:
|
183
|
+
# Check for specific markers
|
184
|
+
if any(char.isdigit() for char in constraint.value):
|
185
|
+
score += 5 # Contains numbers
|
186
|
+
if len(constraint.value.split()) > 3:
|
187
|
+
score += 3 # Longer, more specific
|
188
|
+
if any(
|
189
|
+
term in constraint.value.lower()
|
190
|
+
for term in ["specific", "exact", "only", "must"]
|
191
|
+
):
|
192
|
+
score += 2 # Explicit specificity
|
193
|
+
|
194
|
+
restrictiveness_scores.append((constraint, score))
|
195
|
+
|
196
|
+
# Sort by score (highest first)
|
197
|
+
ranked = sorted(
|
198
|
+
restrictiveness_scores, key=lambda x: x[1], reverse=True
|
199
|
+
)
|
200
|
+
return [constraint for constraint, _ in ranked]
|
201
|
+
|
202
|
+
def _progressive_constraint_search(self):
|
203
|
+
"""Progressively search using constraints from most to least restrictive."""
|
204
|
+
current_candidates = []
|
205
|
+
|
206
|
+
for stage, constraint in enumerate(self.constraint_ranking):
|
207
|
+
self.current_stage = stage # Track current stage for logging
|
208
|
+
if self.progress_callback:
|
209
|
+
stage_desc = f"[{constraint.type.value}] {constraint.value} ({len(current_candidates)} candidates)"
|
210
|
+
self.progress_callback(
|
211
|
+
f"Stage {stage + 1}/{len(self.constraint_ranking)}: {stage_desc}",
|
212
|
+
10 + (stage * 15),
|
213
|
+
{
|
214
|
+
"phase": "progressive_search",
|
215
|
+
"stage": stage + 1,
|
216
|
+
"total_stages": len(self.constraint_ranking),
|
217
|
+
"constraint": constraint.description,
|
218
|
+
"constraint_type": constraint.type.value,
|
219
|
+
"constraint_value": constraint.value,
|
220
|
+
"current_candidates": len(current_candidates),
|
221
|
+
"search_intent": f"Finding entities matching: {constraint.value}",
|
222
|
+
},
|
223
|
+
)
|
224
|
+
|
225
|
+
if stage == 0:
|
226
|
+
# First stage - find initial candidates
|
227
|
+
current_candidates = self._search_with_single_constraint(
|
228
|
+
constraint
|
229
|
+
)
|
230
|
+
else:
|
231
|
+
# Subsequent stages - filter existing candidates
|
232
|
+
current_candidates = self._filter_candidates_with_constraint(
|
233
|
+
current_candidates, constraint
|
234
|
+
)
|
235
|
+
|
236
|
+
# Store stage results
|
237
|
+
self.stage_candidates[stage] = current_candidates.copy()
|
238
|
+
|
239
|
+
if self.progress_callback:
|
240
|
+
candidate_names = ", ".join(
|
241
|
+
[c.name for c in current_candidates[:3]]
|
242
|
+
)
|
243
|
+
more = (
|
244
|
+
f" (+{len(current_candidates) - 3})"
|
245
|
+
if len(current_candidates) > 3
|
246
|
+
else ""
|
247
|
+
)
|
248
|
+
change = len(current_candidates) - len(
|
249
|
+
self.stage_candidates.get(stage - 1, [])
|
250
|
+
)
|
251
|
+
change_str = f" (Δ{change:+d})" if stage > 0 else ""
|
252
|
+
|
253
|
+
self.progress_callback(
|
254
|
+
f"Stage {stage + 1} complete: {len(current_candidates)} candidates{change_str}. {candidate_names}{more}",
|
255
|
+
None,
|
256
|
+
{
|
257
|
+
"phase": "stage_complete",
|
258
|
+
"stage": stage + 1,
|
259
|
+
"candidates_found": len(current_candidates),
|
260
|
+
"candidates_delta": change if stage > 0 else 0,
|
261
|
+
"sample": [c.name for c in current_candidates[:10]],
|
262
|
+
},
|
263
|
+
)
|
264
|
+
|
265
|
+
# Add stage finding
|
266
|
+
stage_finding = {
|
267
|
+
"phase": f"Stage {stage + 1} - {constraint.type.value}",
|
268
|
+
"content": self._format_stage_results(
|
269
|
+
stage, constraint, current_candidates
|
270
|
+
),
|
271
|
+
"timestamp": self._get_timestamp(),
|
272
|
+
}
|
273
|
+
self.findings.append(stage_finding)
|
274
|
+
|
275
|
+
# Continue applying constraints unless we have very few candidates
|
276
|
+
if len(current_candidates) <= 3:
|
277
|
+
if self.progress_callback:
|
278
|
+
self.progress_callback(
|
279
|
+
f"Too few candidates ({len(current_candidates)}) - stopping constraint application",
|
280
|
+
None,
|
281
|
+
{
|
282
|
+
"phase": "early_stop",
|
283
|
+
"candidates_remaining": len(current_candidates),
|
284
|
+
},
|
285
|
+
)
|
286
|
+
break
|
287
|
+
|
288
|
+
# Stop if no candidates remain
|
289
|
+
if not current_candidates:
|
290
|
+
# Backtrack to previous stage if possible
|
291
|
+
if stage > 0:
|
292
|
+
current_candidates = self.stage_candidates[stage - 1]
|
293
|
+
break
|
294
|
+
|
295
|
+
self.candidates = current_candidates[: self.candidate_limit]
|
296
|
+
|
297
|
+
def _search_with_single_constraint(
|
298
|
+
self, constraint: Constraint
|
299
|
+
) -> List[Candidate]:
|
300
|
+
"""Search for candidates using a single constraint."""
|
301
|
+
candidates = []
|
302
|
+
|
303
|
+
# Generate targeted queries for this constraint
|
304
|
+
queries = self._generate_constraint_specific_queries(constraint)
|
305
|
+
|
306
|
+
# Add more diverse query patterns
|
307
|
+
additional_queries = self._generate_additional_queries(constraint)
|
308
|
+
queries.extend(additional_queries)
|
309
|
+
|
310
|
+
# Diversify query execution order
|
311
|
+
import random
|
312
|
+
|
313
|
+
random.shuffle(queries)
|
314
|
+
|
315
|
+
for i, query in enumerate(queries[:20]): # Increased query limit
|
316
|
+
if self.progress_callback:
|
317
|
+
# Show query and what we're looking for
|
318
|
+
self.progress_callback(
|
319
|
+
f"Q{i + 1}/{min(20, len(queries))}: '{query}' | Found: {len(candidates)} candidates",
|
320
|
+
None,
|
321
|
+
{
|
322
|
+
"phase": "constraint_search",
|
323
|
+
"query": query,
|
324
|
+
"query_index": i + 1,
|
325
|
+
"total_queries": min(20, len(queries)),
|
326
|
+
"constraint_type": constraint.type.value,
|
327
|
+
"constraint_value": constraint.value,
|
328
|
+
"candidates_so_far": len(candidates),
|
329
|
+
"search_context": f"Stage {getattr(self, 'current_stage', 0) + 1}: {constraint.value}",
|
330
|
+
},
|
331
|
+
)
|
332
|
+
|
333
|
+
results = self._execute_search(query)
|
334
|
+
|
335
|
+
# Validate search results before extraction
|
336
|
+
if self._validate_search_results(results, constraint):
|
337
|
+
extracted = self._extract_relevant_candidates(
|
338
|
+
results, constraint
|
339
|
+
)
|
340
|
+
candidates.extend(extracted)
|
341
|
+
|
342
|
+
# Track stage information in search history
|
343
|
+
if self.search_history:
|
344
|
+
self.search_history[-1]["stage"] = getattr(
|
345
|
+
self, "current_stage", 0
|
346
|
+
)
|
347
|
+
self.search_history[-1]["results_count"] = len(extracted)
|
348
|
+
self.search_history[-1]["results_preview"] = results.get(
|
349
|
+
"current_knowledge", ""
|
350
|
+
)[:200]
|
351
|
+
else:
|
352
|
+
logger.info(f"Skipping invalid results for query: {query}")
|
353
|
+
|
354
|
+
# Continue searching to build a comprehensive list
|
355
|
+
# Don't stop too early - we want diversity
|
356
|
+
if len(candidates) >= self.candidate_limit * 2:
|
357
|
+
break
|
358
|
+
|
359
|
+
return self._deduplicate_candidates(candidates)
|
360
|
+
|
361
|
+
def _generate_additional_queries(self, constraint: Constraint) -> List[str]:
|
362
|
+
"""Generate additional diverse queries for better coverage."""
|
363
|
+
queries = []
|
364
|
+
base_value = constraint.value
|
365
|
+
|
366
|
+
# Add reference source queries
|
367
|
+
queries.extend(
|
368
|
+
[
|
369
|
+
f"reference {base_value}",
|
370
|
+
f"authoritative {base_value}",
|
371
|
+
f"official {base_value}",
|
372
|
+
]
|
373
|
+
)
|
374
|
+
|
375
|
+
# Add structured data queries
|
376
|
+
if constraint.type == ConstraintType.STATISTIC:
|
377
|
+
queries.extend(
|
378
|
+
[
|
379
|
+
f"spreadsheet {base_value}",
|
380
|
+
f"dataset {base_value}",
|
381
|
+
f"statistical analysis {base_value}",
|
382
|
+
f"quantitative {base_value}",
|
383
|
+
]
|
384
|
+
)
|
385
|
+
elif constraint.type == ConstraintType.PROPERTY:
|
386
|
+
queries.extend(
|
387
|
+
[
|
388
|
+
f"characterized by {base_value}",
|
389
|
+
f"known for {base_value}",
|
390
|
+
f"featuring {base_value}",
|
391
|
+
]
|
392
|
+
)
|
393
|
+
else:
|
394
|
+
# Generic comprehensive queries
|
395
|
+
queries.extend(
|
396
|
+
[
|
397
|
+
f"exhaustive {base_value}",
|
398
|
+
f"thorough {base_value}",
|
399
|
+
f"detailed {base_value}",
|
400
|
+
]
|
401
|
+
)
|
402
|
+
|
403
|
+
return queries
|
404
|
+
|
405
|
+
def _generate_constraint_specific_queries(
|
406
|
+
self, constraint: Constraint
|
407
|
+
) -> List[str]:
|
408
|
+
"""Generate queries specific to a constraint type."""
|
409
|
+
queries = []
|
410
|
+
base_value = constraint.value
|
411
|
+
|
412
|
+
# Add context from other constraints for more targeted searches
|
413
|
+
context_parts = []
|
414
|
+
if hasattr(self, "constraints") and self.constraints:
|
415
|
+
for other_constraint in self.constraints[
|
416
|
+
:2
|
417
|
+
]: # Use top 2 constraints for context
|
418
|
+
if other_constraint.id != constraint.id:
|
419
|
+
context_parts.append(other_constraint.value)
|
420
|
+
|
421
|
+
# Base queries using the constraint description
|
422
|
+
if hasattr(constraint, "description") and constraint.description:
|
423
|
+
queries.append(constraint.description)
|
424
|
+
if context_parts:
|
425
|
+
queries.append(f"{constraint.description} {context_parts[0]}")
|
426
|
+
|
427
|
+
# Type-specific patterns
|
428
|
+
if constraint.type == ConstraintType.STATISTIC:
|
429
|
+
# Numeric constraints - look for quantitative information
|
430
|
+
queries.extend(
|
431
|
+
[
|
432
|
+
f"list {base_value}",
|
433
|
+
f"complete {base_value}",
|
434
|
+
f"all {base_value}",
|
435
|
+
f"comprehensive {base_value}",
|
436
|
+
f"database {base_value}",
|
437
|
+
f"statistics {base_value}",
|
438
|
+
f"data {base_value}",
|
439
|
+
f"comparison {base_value}",
|
440
|
+
]
|
441
|
+
)
|
442
|
+
|
443
|
+
elif (
|
444
|
+
constraint.type == ConstraintType.EVENT
|
445
|
+
or hasattr(constraint.type, "value")
|
446
|
+
and constraint.type.value == "temporal"
|
447
|
+
):
|
448
|
+
# Time-based constraints
|
449
|
+
queries.extend(
|
450
|
+
[
|
451
|
+
f"during {base_value}",
|
452
|
+
f"in {base_value}",
|
453
|
+
f"list {base_value}",
|
454
|
+
f"comprehensive {base_value}",
|
455
|
+
f"all from {base_value}",
|
456
|
+
f"complete list {base_value}",
|
457
|
+
f"history {base_value}",
|
458
|
+
f"timeline {base_value}",
|
459
|
+
]
|
460
|
+
)
|
461
|
+
|
462
|
+
elif constraint.type == ConstraintType.PROPERTY:
|
463
|
+
# Property constraints - characteristics and attributes
|
464
|
+
queries.extend(
|
465
|
+
[
|
466
|
+
f"with {base_value}",
|
467
|
+
f"having {base_value}",
|
468
|
+
f"characterized by {base_value}",
|
469
|
+
f"examples {base_value}",
|
470
|
+
f"instances {base_value}",
|
471
|
+
f"who {base_value}",
|
472
|
+
f"which {base_value}",
|
473
|
+
f"known for {base_value}",
|
474
|
+
]
|
475
|
+
)
|
476
|
+
else:
|
477
|
+
# Generic queries
|
478
|
+
queries.extend(
|
479
|
+
[
|
480
|
+
f"{base_value}",
|
481
|
+
f"list {base_value}",
|
482
|
+
f"examples {base_value}",
|
483
|
+
f"all {base_value}",
|
484
|
+
f"complete {base_value}",
|
485
|
+
]
|
486
|
+
)
|
487
|
+
|
488
|
+
# Add combined queries with other constraints
|
489
|
+
if context_parts:
|
490
|
+
queries.extend(
|
491
|
+
[
|
492
|
+
f"{base_value} {context_parts[0]}",
|
493
|
+
f"list {base_value} with {context_parts[0]}",
|
494
|
+
f"{base_value} and {context_parts[0]}",
|
495
|
+
]
|
496
|
+
)
|
497
|
+
|
498
|
+
return queries
|
499
|
+
|
500
|
+
def _filter_candidates_with_constraint(
|
501
|
+
self, candidates: List[Candidate], constraint: Constraint
|
502
|
+
) -> List[Candidate]:
|
503
|
+
"""Filter existing candidates using an additional constraint."""
|
504
|
+
filtered = []
|
505
|
+
|
506
|
+
for candidate in candidates:
|
507
|
+
# Check if candidate matches the constraint
|
508
|
+
query = f"{candidate.name} {constraint.value}"
|
509
|
+
|
510
|
+
results = self._execute_search(query)
|
511
|
+
|
512
|
+
# Quick evidence check
|
513
|
+
evidence = self._quick_evidence_check(
|
514
|
+
results, candidate, constraint
|
515
|
+
)
|
516
|
+
|
517
|
+
if evidence.confidence >= 0.5: # Lower threshold for filtering
|
518
|
+
candidate.add_evidence(constraint.id, evidence)
|
519
|
+
filtered.append(candidate)
|
520
|
+
|
521
|
+
return filtered
|
522
|
+
|
523
|
+
def _extract_relevant_candidates(
|
524
|
+
self, results: Dict, constraint: Constraint
|
525
|
+
) -> List[Candidate]:
|
526
|
+
"""Extract candidates relevant to a specific constraint."""
|
527
|
+
content = results.get("current_knowledge", "")
|
528
|
+
|
529
|
+
# If no content, return empty list
|
530
|
+
if not content or "Error" in content or "No results found" in content:
|
531
|
+
logger.warning(
|
532
|
+
f"No valid content to extract candidates from for constraint: {constraint.description}"
|
533
|
+
)
|
534
|
+
return []
|
535
|
+
|
536
|
+
# Determine what type of entity we're looking for
|
537
|
+
entity_type = getattr(self, "entity_type", "entity")
|
538
|
+
|
539
|
+
# Use LLM to extract entities matching the constraint
|
540
|
+
prompt = f"""Analyze these search results and extract {entity_type} names that could satisfy this constraint:
|
541
|
+
|
542
|
+
Constraint: {constraint.description}
|
543
|
+
Type: {constraint.type.value}
|
544
|
+
Value: {constraint.value}
|
545
|
+
|
546
|
+
Search Results:
|
547
|
+
{content}
|
548
|
+
|
549
|
+
Your task:
|
550
|
+
1. Understand what the constraint is asking for
|
551
|
+
2. Identify mentions of specific {entity_type} names in the search results
|
552
|
+
3. Extract only those names that could potentially satisfy the constraint
|
553
|
+
4. Focus on proper nouns and specific names
|
554
|
+
|
555
|
+
Important:
|
556
|
+
- Extract actual {entity_type} names, not descriptions or categories
|
557
|
+
- If the search results mention a specific {entity_type} that matches the constraint criteria, extract it
|
558
|
+
- Be thorough - don't miss names that are mentioned in passing
|
559
|
+
- Consider the context to determine if a name is relevant to the constraint
|
560
|
+
|
561
|
+
Return one {entity_type} name per line. Only include names that could satisfy the constraint."""
|
562
|
+
|
563
|
+
try:
|
564
|
+
response = self.model.invoke(prompt)
|
565
|
+
extracted_text = remove_think_tags(response.content)
|
566
|
+
|
567
|
+
candidates = []
|
568
|
+
seen_names = set() # Track unique names
|
569
|
+
|
570
|
+
for line in extracted_text.strip().split("\n"):
|
571
|
+
name = line.strip()
|
572
|
+
# Remove common list markers and clean up
|
573
|
+
name = name.lstrip("- •·*0123456789.").strip()
|
574
|
+
|
575
|
+
# Skip empty lines or very short names
|
576
|
+
if not name or len(name) <= 2:
|
577
|
+
continue
|
578
|
+
|
579
|
+
# Normalize for deduplication
|
580
|
+
normalized_name = name.lower()
|
581
|
+
if normalized_name in seen_names:
|
582
|
+
continue
|
583
|
+
|
584
|
+
# Exclude meta-commentary patterns
|
585
|
+
exclude_patterns = [
|
586
|
+
"search result",
|
587
|
+
"based on",
|
588
|
+
"provided",
|
589
|
+
"found",
|
590
|
+
"does not",
|
591
|
+
"doesn't",
|
592
|
+
"cannot",
|
593
|
+
"there are no",
|
594
|
+
"according to",
|
595
|
+
"mentions",
|
596
|
+
"discusses",
|
597
|
+
"shows that",
|
598
|
+
"indicates",
|
599
|
+
"suggests",
|
600
|
+
"appears",
|
601
|
+
"seems",
|
602
|
+
"search",
|
603
|
+
"constraint",
|
604
|
+
"extract",
|
605
|
+
"entity",
|
606
|
+
]
|
607
|
+
|
608
|
+
# Check if it's meta-commentary
|
609
|
+
is_meta = any(
|
610
|
+
pattern in name.lower() for pattern in exclude_patterns
|
611
|
+
)
|
612
|
+
is_too_long = (
|
613
|
+
len(name.split()) > 10
|
614
|
+
) # Very long strings are usually explanations
|
615
|
+
is_sentence = name.endswith(".") and len(name.split()) > 5
|
616
|
+
|
617
|
+
if not is_meta and not is_too_long and not is_sentence:
|
618
|
+
# Accept various name formats
|
619
|
+
if (
|
620
|
+
name[0].isupper() # Capitalized
|
621
|
+
or any(c.isupper() for c in name) # Has capitals
|
622
|
+
or any(c.isdigit() for c in name) # Contains numbers
|
623
|
+
or any(
|
624
|
+
c in name
|
625
|
+
for c in ["-", "&", "/", ":", "(", ")", '"', "'"]
|
626
|
+
) # Special chars
|
627
|
+
or len(name.split()) <= 6
|
628
|
+
): # Reasonable length phrases
|
629
|
+
candidate = Candidate(name=name)
|
630
|
+
candidates.append(candidate)
|
631
|
+
seen_names.add(normalized_name)
|
632
|
+
|
633
|
+
# Log extraction results for debugging
|
634
|
+
logger.info(
|
635
|
+
f"Extracted {len(candidates)} candidates for constraint: {constraint.description}"
|
636
|
+
)
|
637
|
+
if candidates:
|
638
|
+
logger.debug(
|
639
|
+
f"Sample candidates: {[c.name for c in candidates[:5]]}"
|
640
|
+
)
|
641
|
+
|
642
|
+
return candidates[:50] # Limit per search
|
643
|
+
|
644
|
+
except Exception as e:
|
645
|
+
logger.error(f"Error extracting candidates: {e}")
|
646
|
+
import traceback
|
647
|
+
|
648
|
+
logger.error(traceback.format_exc())
|
649
|
+
return []
|
650
|
+
|
651
|
+
def _quick_evidence_check(
|
652
|
+
self, results: Dict, candidate: Candidate, constraint: Constraint
|
653
|
+
) -> Evidence:
|
654
|
+
"""Quick evidence check for filtering with enhanced scoring."""
|
655
|
+
content = results.get("current_knowledge", "")
|
656
|
+
search_results = results.get("search_results", [])
|
657
|
+
|
658
|
+
# Initialize confidence components
|
659
|
+
name_presence = 0.0
|
660
|
+
constraint_presence = 0.0
|
661
|
+
co_occurrence = 0.0
|
662
|
+
context_quality = 0.0
|
663
|
+
|
664
|
+
candidate_lower = candidate.name.lower()
|
665
|
+
value_lower = constraint.value.lower()
|
666
|
+
content_lower = content.lower()
|
667
|
+
|
668
|
+
# Check candidate name presence
|
669
|
+
if candidate_lower in content_lower:
|
670
|
+
name_count = content_lower.count(candidate_lower)
|
671
|
+
name_presence = min(
|
672
|
+
0.3 + (name_count * 0.05), 0.4
|
673
|
+
) # More occurrences = higher confidence
|
674
|
+
|
675
|
+
# Check constraint value presence
|
676
|
+
if value_lower in content_lower:
|
677
|
+
value_count = content_lower.count(value_lower)
|
678
|
+
constraint_presence = min(0.3 + (value_count * 0.05), 0.4)
|
679
|
+
|
680
|
+
# Check co-occurrence and proximity
|
681
|
+
if name_presence > 0 and constraint_presence > 0:
|
682
|
+
# Find all positions
|
683
|
+
name_positions = []
|
684
|
+
start = 0
|
685
|
+
while start < len(content_lower):
|
686
|
+
pos = content_lower.find(candidate_lower, start)
|
687
|
+
if pos == -1:
|
688
|
+
break
|
689
|
+
name_positions.append(pos)
|
690
|
+
start = pos + 1
|
691
|
+
|
692
|
+
value_positions = []
|
693
|
+
start = 0
|
694
|
+
while start < len(content_lower):
|
695
|
+
pos = content_lower.find(value_lower, start)
|
696
|
+
if pos == -1:
|
697
|
+
break
|
698
|
+
value_positions.append(pos)
|
699
|
+
start = pos + 1
|
700
|
+
|
701
|
+
# Calculate minimum distance
|
702
|
+
if name_positions and value_positions:
|
703
|
+
min_distance = min(
|
704
|
+
abs(n - v) for n in name_positions for v in value_positions
|
705
|
+
)
|
706
|
+
|
707
|
+
if min_distance < 100: # Very close proximity
|
708
|
+
co_occurrence = 0.2
|
709
|
+
elif min_distance < 200: # Close proximity
|
710
|
+
co_occurrence = 0.15
|
711
|
+
elif min_distance < 500: # Moderate proximity
|
712
|
+
co_occurrence = 0.1
|
713
|
+
else: # Same document
|
714
|
+
co_occurrence = 0.05
|
715
|
+
|
716
|
+
# Check result quality
|
717
|
+
if search_results:
|
718
|
+
# Count how many results mention both candidate and constraint
|
719
|
+
relevant_results = 0
|
720
|
+
for result in search_results[:10]:
|
721
|
+
title = result.get("title", "").lower()
|
722
|
+
snippet = result.get("snippet", "").lower()
|
723
|
+
|
724
|
+
if (
|
725
|
+
candidate_lower in title or candidate_lower in snippet
|
726
|
+
) and (value_lower in title or value_lower in snippet):
|
727
|
+
relevant_results += 1
|
728
|
+
|
729
|
+
context_quality = min(relevant_results * 0.05, 0.2)
|
730
|
+
|
731
|
+
# Calculate final confidence
|
732
|
+
confidence = (
|
733
|
+
name_presence
|
734
|
+
+ constraint_presence
|
735
|
+
+ co_occurrence
|
736
|
+
+ context_quality
|
737
|
+
)
|
738
|
+
|
739
|
+
# Apply constraint type weight
|
740
|
+
if constraint.type == ConstraintType.STATISTIC:
|
741
|
+
confidence *= 1.1 # Numeric constraints need precise matching
|
742
|
+
elif constraint.type == ConstraintType.PROPERTY:
|
743
|
+
confidence *= 0.95 # Properties can be more flexible
|
744
|
+
|
745
|
+
return Evidence(
|
746
|
+
claim=f"Evidence for {candidate.name} matching {constraint.description}",
|
747
|
+
confidence=min(confidence, 1.0),
|
748
|
+
type=EvidenceType.INFERENCE,
|
749
|
+
source="quick_evidence_check",
|
750
|
+
metadata={
|
751
|
+
"name_presence": name_presence,
|
752
|
+
"constraint_presence": constraint_presence,
|
753
|
+
"co_occurrence": co_occurrence,
|
754
|
+
"context_quality": context_quality,
|
755
|
+
},
|
756
|
+
)
|
757
|
+
|
758
|
+
def _focused_evidence_gathering(self):
|
759
|
+
"""Gather detailed evidence for the narrowed candidates."""
|
760
|
+
if self.progress_callback:
|
761
|
+
constraint_count = len(self.constraints)
|
762
|
+
evidence_needed = len(self.candidates) * constraint_count
|
763
|
+
self.progress_callback(
|
764
|
+
f"Verifying {len(self.candidates)} candidates against {constraint_count} constraints ({evidence_needed} checks)",
|
765
|
+
80,
|
766
|
+
{
|
767
|
+
"phase": "evidence_gathering",
|
768
|
+
"candidate_count": len(self.candidates),
|
769
|
+
"constraint_count": constraint_count,
|
770
|
+
"total_evidence_needed": evidence_needed,
|
771
|
+
},
|
772
|
+
)
|
773
|
+
|
774
|
+
for i, candidate in enumerate(self.candidates):
|
775
|
+
for j, constraint in enumerate(self.constraints):
|
776
|
+
# Skip if we already have evidence from filtering
|
777
|
+
if constraint.id in candidate.evidence:
|
778
|
+
continue
|
779
|
+
|
780
|
+
# Detailed evidence search
|
781
|
+
query = f'"{candidate.name}" {constraint.value} verification'
|
782
|
+
results = self._execute_search(query)
|
783
|
+
|
784
|
+
evidence = self.evidence_evaluator.extract_evidence(
|
785
|
+
results.get("current_knowledge", ""),
|
786
|
+
candidate.name,
|
787
|
+
constraint,
|
788
|
+
)
|
789
|
+
|
790
|
+
candidate.add_evidence(constraint.id, evidence)
|
791
|
+
|
792
|
+
if (
|
793
|
+
self.progress_callback and i < 5
|
794
|
+
): # Report progress for top candidates
|
795
|
+
conf_emoji = (
|
796
|
+
"✓"
|
797
|
+
if evidence.confidence >= self.evidence_threshold
|
798
|
+
else "○"
|
799
|
+
)
|
800
|
+
self.progress_callback(
|
801
|
+
f"{conf_emoji} {candidate.name} | {constraint.type.value}: {evidence.confidence:.0%}",
|
802
|
+
None,
|
803
|
+
{
|
804
|
+
"phase": "evidence_detail",
|
805
|
+
"candidate": candidate.name,
|
806
|
+
"constraint": constraint.description,
|
807
|
+
"constraint_type": constraint.type.value,
|
808
|
+
"confidence": evidence.confidence,
|
809
|
+
"evidence_type": evidence.type.value,
|
810
|
+
"meets_threshold": evidence.confidence
|
811
|
+
>= self.evidence_threshold,
|
812
|
+
},
|
813
|
+
)
|
814
|
+
|
815
|
+
# Final scoring
|
816
|
+
for candidate in self.candidates:
|
817
|
+
candidate.calculate_score(self.constraints)
|
818
|
+
|
819
|
+
# Sort by score
|
820
|
+
self.candidates.sort(key=lambda c: c.score, reverse=True)
|
821
|
+
|
822
|
+
def _deduplicate_candidates(
|
823
|
+
self, candidates: List[Candidate]
|
824
|
+
) -> List[Candidate]:
|
825
|
+
"""Remove duplicate candidates."""
|
826
|
+
seen = {}
|
827
|
+
unique = []
|
828
|
+
|
829
|
+
for candidate in candidates:
|
830
|
+
key = candidate.name.lower().strip()
|
831
|
+
if key not in seen:
|
832
|
+
seen[key] = candidate
|
833
|
+
unique.append(candidate)
|
834
|
+
|
835
|
+
return unique
|
836
|
+
|
837
|
+
def _format_constraint_analysis(self) -> str:
|
838
|
+
"""Format initial constraint analysis."""
|
839
|
+
analysis = "**Query Constraint Analysis**\n\n"
|
840
|
+
analysis += f"Total constraints identified: {len(self.constraints)}\n\n"
|
841
|
+
analysis += "**Constraint Ranking (by restrictiveness):**\n"
|
842
|
+
|
843
|
+
for i, constraint in enumerate(self.constraint_ranking):
|
844
|
+
score = self._calculate_restrictiveness_score(constraint)
|
845
|
+
analysis += (
|
846
|
+
f"{i + 1}. [{constraint.type.value}] {constraint.description}\n"
|
847
|
+
)
|
848
|
+
analysis += f" Restrictiveness score: {score}\n"
|
849
|
+
analysis += f" Value: {constraint.value}\n\n"
|
850
|
+
|
851
|
+
return analysis
|
852
|
+
|
853
|
+
def _format_debug_summary(self) -> str:
|
854
|
+
"""Format comprehensive debug summary."""
|
855
|
+
summary = "**Debug Summary**\n\n"
|
856
|
+
|
857
|
+
# Constraint analysis
|
858
|
+
summary += "**Constraint Processing:**\n"
|
859
|
+
for i, constraint in enumerate(self.constraint_ranking):
|
860
|
+
score = self._calculate_restrictiveness_score(constraint)
|
861
|
+
summary += f"{i + 1}. [{constraint.type.value}] {constraint.value} (score: {score})\n"
|
862
|
+
|
863
|
+
# Search progression
|
864
|
+
summary += "\n**Search Progression:**\n"
|
865
|
+
if hasattr(self, "stage_candidates"):
|
866
|
+
for stage, candidates in self.stage_candidates.items():
|
867
|
+
summary += f"Stage {stage + 1}: {len(candidates)} candidates\n"
|
868
|
+
|
869
|
+
# Evidence coverage
|
870
|
+
summary += "\n**Evidence Coverage:**\n"
|
871
|
+
|
872
|
+
for i, candidate in enumerate(self.candidates[:5]):
|
873
|
+
evidence_count = len(candidate.evidence)
|
874
|
+
satisfied = sum(
|
875
|
+
1
|
876
|
+
for c in self.constraints
|
877
|
+
if c.id in candidate.evidence
|
878
|
+
and candidate.evidence[c.id].confidence
|
879
|
+
>= self.evidence_threshold
|
880
|
+
)
|
881
|
+
|
882
|
+
summary += f"{i + 1}. {candidate.name}: {evidence_count} evidence, "
|
883
|
+
summary += f"{satisfied}/{len(self.constraints)} constraints\n"
|
884
|
+
|
885
|
+
# Search statistics
|
886
|
+
summary += "\n**Search Statistics:**\n"
|
887
|
+
total_discovered = (
|
888
|
+
sum(len(c) for c in self.stage_candidates.values())
|
889
|
+
if hasattr(self, "stage_candidates")
|
890
|
+
else 0
|
891
|
+
)
|
892
|
+
summary += f"Total candidates discovered: {total_discovered}\n"
|
893
|
+
summary += f"Final candidates: {len(self.candidates)}\n"
|
894
|
+
summary += f"Constraints: {len(self.constraints)}\n"
|
895
|
+
|
896
|
+
return summary
|
897
|
+
|
898
|
+
def _calculate_restrictiveness_score(self, constraint: Constraint) -> int:
|
899
|
+
"""Calculate restrictiveness score for a constraint."""
|
900
|
+
score = 0
|
901
|
+
|
902
|
+
# Type-based scoring
|
903
|
+
if constraint.type == ConstraintType.STATISTIC:
|
904
|
+
score += 10
|
905
|
+
elif constraint.type == ConstraintType.EVENT:
|
906
|
+
score += 8
|
907
|
+
elif constraint.type == ConstraintType.LOCATION:
|
908
|
+
score += 6
|
909
|
+
elif constraint.type == ConstraintType.PROPERTY:
|
910
|
+
score += 4
|
911
|
+
|
912
|
+
# Specificity scoring
|
913
|
+
if constraint.value:
|
914
|
+
if any(char.isdigit() for char in constraint.value):
|
915
|
+
score += 5
|
916
|
+
if len(constraint.value.split()) > 3:
|
917
|
+
score += 3
|
918
|
+
if any(
|
919
|
+
term in constraint.value.lower()
|
920
|
+
for term in ["specific", "exact", "only", "must"]
|
921
|
+
):
|
922
|
+
score += 2
|
923
|
+
|
924
|
+
return score
|
925
|
+
|
926
|
+
def _format_stage_results(
|
927
|
+
self, stage: int, constraint: Constraint, candidates: List[Candidate]
|
928
|
+
) -> str:
|
929
|
+
"""Format results for a search stage with detailed information."""
|
930
|
+
result = f"**Search Stage {stage + 1}**\n\n"
|
931
|
+
result += f"Constraint: {constraint.description}\n"
|
932
|
+
result += f"Type: {constraint.type.value}\n"
|
933
|
+
result += f"Search Value: {constraint.value}\n"
|
934
|
+
result += f"Candidates found: {len(candidates)}\n\n"
|
935
|
+
|
936
|
+
# Add search statistics
|
937
|
+
result += "**Search Statistics:**\n"
|
938
|
+
if hasattr(self, "search_history"):
|
939
|
+
stage_searches = [
|
940
|
+
s for s in self.search_history if s.get("stage", -1) == stage
|
941
|
+
]
|
942
|
+
result += f"- Queries executed: {len(stage_searches)}\n"
|
943
|
+
result += f"- Total results analyzed: {getattr(self, f'stage_{stage}_results_count', 0)}\n"
|
944
|
+
|
945
|
+
result += f"- Candidates before filtering: {getattr(self, f'stage_{stage}_raw_candidates', len(candidates))}\n"
|
946
|
+
result += f"- Candidates after deduplication: {len(candidates)}\n\n"
|
947
|
+
|
948
|
+
if candidates:
|
949
|
+
result += "**Top Candidates:**\n"
|
950
|
+
# Group candidates to show variety
|
951
|
+
grouped = self._group_similar_candidates(candidates[:20])
|
952
|
+
for group_name, group_items in grouped.items():
|
953
|
+
result += f"\n{group_name} ({len(group_items)} items):\n"
|
954
|
+
for i, candidate in enumerate(group_items[:5]):
|
955
|
+
result += f" {i + 1}. {candidate.name}\n"
|
956
|
+
if len(group_items) > 5:
|
957
|
+
result += f" ... and {len(group_items) - 5} more\n"
|
958
|
+
else:
|
959
|
+
result += "No candidates found for this constraint.\n"
|
960
|
+
|
961
|
+
# Add sample search results for debugging
|
962
|
+
if hasattr(self, "search_history") and candidates:
|
963
|
+
result += "\n**Sample Search Results:**\n"
|
964
|
+
recent_searches = [
|
965
|
+
s
|
966
|
+
for s in self.search_history[-3:]
|
967
|
+
if s.get("stage", -1) == stage
|
968
|
+
]
|
969
|
+
for search in recent_searches[:2]:
|
970
|
+
result += f"- Query: '{search.get('query', '')}'\n"
|
971
|
+
if "results_preview" in search:
|
972
|
+
result += (
|
973
|
+
f" Preview: {search['results_preview'][:100]}...\n"
|
974
|
+
)
|
975
|
+
|
976
|
+
return result
|
977
|
+
|
978
|
+
def _format_search_summary(self) -> str:
|
979
|
+
"""Format progressive search summary."""
|
980
|
+
summary = "**Progressive Search Summary**\n\n"
|
981
|
+
|
982
|
+
# Show search progression
|
983
|
+
summary += "**Stage-by-Stage Filtering:**\n"
|
984
|
+
prev_count = 0
|
985
|
+
|
986
|
+
for stage, candidates in self.stage_candidates.items():
|
987
|
+
constraint = (
|
988
|
+
self.constraint_ranking[stage]
|
989
|
+
if stage < len(self.constraint_ranking)
|
990
|
+
else None
|
991
|
+
)
|
992
|
+
if constraint:
|
993
|
+
count = len(candidates)
|
994
|
+
change = count - prev_count if stage > 0 else count
|
995
|
+
change_str = f" ({change:+d})" if stage > 0 else ""
|
996
|
+
|
997
|
+
summary += f"\nStage {stage + 1} [{constraint.type.value}]: {constraint.value[:40]}\n"
|
998
|
+
summary += f" Results: {count} candidates{change_str}\n"
|
999
|
+
|
1000
|
+
if candidates:
|
1001
|
+
# Group candidates by type
|
1002
|
+
grouped = self._group_similar_candidates(candidates[:20])
|
1003
|
+
for group_name, group_items in grouped.items():
|
1004
|
+
summary += f" {group_name}: {len(group_items)} items\n"
|
1005
|
+
for item in group_items[:3]:
|
1006
|
+
summary += f" • {item.name}\n"
|
1007
|
+
if len(group_items) > 3:
|
1008
|
+
summary += (
|
1009
|
+
f" ... and {len(group_items) - 3} more\n"
|
1010
|
+
)
|
1011
|
+
|
1012
|
+
prev_count = count
|
1013
|
+
|
1014
|
+
summary += (
|
1015
|
+
f"\n**Final Result: {len(self.candidates)} candidates selected**\n"
|
1016
|
+
)
|
1017
|
+
|
1018
|
+
return summary
|
1019
|
+
|
1020
|
+
def _format_evidence_summary(self) -> str:
|
1021
|
+
"""Format evidence gathering summary."""
|
1022
|
+
summary = "**Evidence Gathering Summary**\n\n"
|
1023
|
+
|
1024
|
+
for i, candidate in enumerate(self.candidates[:5]):
|
1025
|
+
summary += f"**{i + 1}. {candidate.name}**\n"
|
1026
|
+
|
1027
|
+
for constraint in self.constraints:
|
1028
|
+
evidence = candidate.evidence.get(constraint.id)
|
1029
|
+
if evidence:
|
1030
|
+
conf_str = f"{evidence.confidence:.0%}"
|
1031
|
+
summary += (
|
1032
|
+
f" • {constraint.description[:40]}...: {conf_str}\n"
|
1033
|
+
)
|
1034
|
+
else:
|
1035
|
+
summary += (
|
1036
|
+
f" • {constraint.description[:40]}...: No evidence\n"
|
1037
|
+
)
|
1038
|
+
|
1039
|
+
summary += f" Overall Score: {candidate.score:.2f}\n\n"
|
1040
|
+
|
1041
|
+
return summary
|
1042
|
+
+    # Commented out to use the parent's optimized _execute_search method
+    '''def _execute_search(self, search_query: str) -> Dict:
+        """Execute a comprehensive search using source-based strategy for complex queries."""
+        if not hasattr(self, "search_history"):
+            self.search_history = []
+
+        self.search_history.append(
+            {
+                "query": search_query,
+                "timestamp": self._get_timestamp(),
+                "iteration": getattr(self, "iteration", 0),
+            }
+        )
+
+        # Debug: Check if search engine is available
+        if not hasattr(self, "search") or self.search is None:
+            logger.error(f"No search engine configured for query: {search_query}")
+            logger.error(f"Strategy attributes: {list(self.__dict__.keys())}")
+            return {"current_knowledge": "", "search_results": []}
+
+        try:
+            # Log that we're attempting to use source-based strategy
+            logger.info(f"Attempting source-based search for: {search_query}")
+
+            # For complex queries, use source-based strategy with multiple iterations
+            if hasattr(self, "source_strategy"):
+                source_strategy = self.source_strategy
+            else:
+                logger.info("Creating new SourceBasedSearchStrategy instance")
+                source_strategy = SourceBasedSearchStrategy(
+                    model=self.model,
+                    search=self.search,
+                    all_links_of_system=self.all_links_of_system,
+                    include_text_content=True,
+                    use_cross_engine_filter=False,  # We'll handle filtering ourselves
+                    use_atomic_facts=False,
+                )
+                source_strategy.max_iterations = (
+                    1  # More efficient with single iteration
+                )
+                source_strategy.questions_per_iteration = (
+                    9  # More questions for broader coverage
+                )
+
+            # Use source-based strategy for complex search
+            try:
+                # Set a simple progress callback if we have one
+                if self.progress_callback:
+
+                    def sub_callback(msg, prog, data):
+                        # Don't propagate all sub-progress updates
+                        if "phase" in data and data["phase"] in [
+                            "search_complete",
+                            "final_results",
+                        ]:
+                            self.progress_callback(f"Sub-search: {msg}", None, data)
+
+                    source_strategy.set_progress_callback(sub_callback)
+
+                logger.info("Executing source-based search...")
+                # Run the search
+                result = source_strategy.analyze_topic(search_query)
+
+                if (
+                    result
+                    and "current_knowledge" in result
+                    and "all_links_of_system" in result
+                ):
+                    search_results = result.get("all_links_of_system", [])
+
+                    # Extract the most relevant information from the findings
+                    knowledge_parts = []
+                    if "findings" in result:
+                        for finding in result["findings"]:
+                            if "content" in finding and finding["content"]:
+                                knowledge_parts.append(finding["content"])
+
+                    # Also include search results summaries
+                    for i, link in enumerate(search_results[:15]):  # More results
+                        if isinstance(link, dict):
+                            title = link.get("title", "")
+                            snippet = link.get("snippet", "")
+                            content = link.get("content", "")
+                            url = link.get("link", link.get("url", ""))
+
+                            if title or snippet:
+                                result_text = f"\nResult {i+1}: {title}"
+                                if url:
+                                    result_text += f"\nURL: {url}"
+                                if snippet:
+                                    result_text += f"\nSnippet: {snippet}"
+                                if content and content != snippet:
+                                    result_text += f"\nContent: {content[:500]}..."
+                                knowledge_parts.append(result_text)
+
+                    current_knowledge = "\n\n".join(knowledge_parts)
+
+                    return {
+                        "current_knowledge": current_knowledge,
+                        "search_results": search_results,
+                        "detailed_findings": result.get("findings", []),
+                    }
+                else:
+                    # Fallback to simple search
+                    logger.warning(
+                        "Source-based search returned empty results, falling back to simple search"
+                    )
+                    return self._simple_search(search_query)
+
+            except Exception as e:
+                logger.error(f"Source-based search failed with error: {e}")
+                logger.error(f"Error type: {type(e).__name__}")
+                import traceback
+
+                logger.error(f"Traceback: {traceback.format_exc()}")
+                logger.warning("Falling back to simple search")
+                return self._simple_search(search_query)
+
+        except Exception as e:
+            logger.error(f"Error during search for '{search_query}': {str(e)}")
+            return {
+                "current_knowledge": f"Error during search: {str(e)}",
+                "search_results": [],
+            }'''
+
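Although disabled, the block above still documents the intended pattern: delegate to SourceBasedSearchStrategy and, on empty results or any exception, fall back to _simple_search, with both paths honoring one result contract. A hedged sketch of that contract, with keys taken from the code above and values invented:

# Shape of the dict every search path returns (values illustrative only).
example_result = {
    "current_knowledge": "Result 1: Some Title\nURL: https://example.org\nSnippet: ...",
    "search_results": [
        {"title": "Some Title", "snippet": "...", "link": "https://example.org"},
    ],
    "detailed_findings": [],  # present only on the source-based path
}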
+    def _simple_search(self, search_query: str) -> Dict:
+        """Fallback simple search using search engine directly."""
+        try:
+            # Use the search engine directly for simple queries
+            search_results = self.search.run(search_query)
+
+            if search_results and isinstance(search_results, list):
+                # Format search results into a knowledge string
+                content_parts = []
+
+                for i, result in enumerate(search_results[:15]):  # More results
+                    title = result.get("title", "Untitled")
+                    snippet = result.get("snippet", "")
+                    content = result.get("content", "")
+                    url = result.get("link", result.get("url", ""))
+
+                    content_parts.append(f"Result {i + 1}: {title}")
+                    if url:
+                        content_parts.append(f"URL: {url}")
+                    if snippet:
+                        content_parts.append(f"Snippet: {snippet}")
+                    if content and content != snippet:
+                        content_parts.append(
+                            f"Content preview: {content[:300]}..."
+                        )
+                    content_parts.append("")  # Empty line between results
+
+                current_knowledge = "\n".join(content_parts)
+
+                return {
+                    "current_knowledge": current_knowledge,
+                    "search_results": search_results,
+                }
+            else:
+                # Return empty knowledge if no results
+                return {
+                    "current_knowledge": f"No results found for query: {search_query}",
+                    "search_results": [],
+                }
+        except Exception as e:
+            logger.error(f"Simple search error: {e}")
+            return {
+                "current_knowledge": f"Search error: {str(e)}",
+                "search_results": [],
+            }
+
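A minimal usage sketch, assuming a stub engine whose run() returns the list-of-dicts shape the method expects; the stub is hypothetical, standing in for the package's real search engines.

class StubSearch:
    """Hypothetical stand-in for a search engine exposing .run()."""

    def run(self, query):
        return [
            {
                "title": "Example",
                "snippet": "An example hit.",
                "link": "https://example.org",
            }
        ]


# With self.search = StubSearch(), _simple_search("anything") would return a
# populated "current_knowledge" string plus the raw "search_results" list.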
+    def _validate_search_results(
+        self, results: Dict, constraint: Constraint
+    ) -> bool:
+        """Validate that search results contain relevant information."""
+        if not results:
+            return False
+
+        content = results.get("current_knowledge", "")
+        search_results = results.get("search_results", [])
+
+        # Basic validation checks
+        if not content or len(content) < 50:  # Too short to be meaningful
+            logger.debug(f"Content too short: {len(content)} characters")
+            return False
+
+        if "Error" in content and len(content) < 100:
+            logger.debug(f"Error in results: {content[:100]}")
+            return False
+
+        if "No results found" in content:
+            logger.debug("No results found")
+            return False
+
+        # For stats/numeric constraints, check for related terms
+        if constraint.type == ConstraintType.STATISTIC:
+            # Look for related terms about TV shows, episodes, etc
+            relevant_terms = [
+                "tv",
+                "show",
+                "series",
+                "episode",
+                "season",
+                "program",
+                "character",
+                "fiction",
+            ]
+            content_lower = content.lower()
+
+            term_found = any(term in content_lower for term in relevant_terms)
+            if not term_found:
+                logger.debug(
+                    "No relevant TV/show terms found for statistic constraint"
+                )
+                return False
+        else:
+            # Check for relevance to constraint using key terms
+            constraint_terms = [
+                term
+                for term in constraint.value.lower().split()
+                if len(term) > 2
+                and term
+                not in ["the", "and", "with", "for", "had", "his", "her"]
+            ]
+            content_lower = content.lower()
+
+            # Count how many meaningful terms appear
+            if constraint_terms:
+                term_matches = sum(
+                    1 for term in constraint_terms if term in content_lower
+                )
+                relevance_ratio = term_matches / len(constraint_terms)
+
+                # Require at least one term match
+                if relevance_ratio < 0.2:
+                    logger.debug(
+                        f"Low relevance: {relevance_ratio:.0%} term matches"
+                    )
+                    return False
+
+        # Check search results quality
+        if search_results and isinstance(search_results, list):
+            valid_results = sum(
+                1
+                for r in search_results
+                if isinstance(r, dict) and (r.get("title") or r.get("snippet"))
+            )
+            if valid_results < 1:
+                logger.debug("No valid search results with title/snippet")
+                return False
+
+        return True
+
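A worked example of the relevance gate for non-statistic constraints; the constraint text and content are invented, but the arithmetic follows the code above.

# Invented walk-through of the 0.2 relevance threshold.
constraint_value = "french actress born 1975"
# Stopwords and words of <= 2 chars are dropped:
constraint_terms = ["french", "actress", "born", "1975"]
content_lower = "a french film star, born in 1975, known for ..."
term_matches = 3              # "french", "born", "1975" appear in the content
relevance_ratio = 3 / 4       # 0.75 >= 0.2, so the results pass the gate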
+    def _get_timestamp(self) -> str:
+        """Get current timestamp."""
+        return datetime.utcnow().isoformat()
+
+    def _group_similar_candidates(
+        self, candidates: List[Candidate]
+    ) -> Dict[str, List[Candidate]]:
+        """Group candidates by similar characteristics."""
+        grouped = {}
+
+        for candidate in candidates:
+            # Try to determine group type based on name patterns
+            name = candidate.name.lower()
+
+            if any(
+                keyword in name
+                for keyword in ["model", "llm", "gpt", "claude", "gemini"]
+            ):
+                group = "AI Models"
+            elif any(
+                keyword in name
+                for keyword in ["country", "nation", "republic", "kingdom"]
+            ):
+                group = "Countries"
+            elif any(
+                keyword in name for keyword in ["city", "town", "village"]
+            ):
+                group = "Cities"
+            elif any(
+                keyword in name for keyword in ["year", "century", "decade"]
+            ):
+                group = "Time Periods"
+            elif any(
+                keyword in name
+                for keyword in ["person", "mr", "ms", "dr", "prof"]
+            ):
+                group = "People"
+            elif any(c.isdigit() for c in name):
+                group = "Numeric Items"
+            else:
+                # Default grouping based on first word
+                first_word = (
+                    candidate.name.split()[0]
+                    if candidate.name.split()
+                    else "Other"
+                )
+                group = f"{first_word} Items"
+
+            if group not in grouped:
+                grouped[group] = []
+            grouped[group].append(candidate)
+
+        return grouped
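A short usage sketch with toy candidates; SimpleNamespace stands in for the package's Candidate class, since only .name is consulted when grouping.

from types import SimpleNamespace

toy_candidates = [
    SimpleNamespace(name="GPT-4 model"),
    SimpleNamespace(name="Paris city"),
    SimpleNamespace(name="1999"),
]
# _group_similar_candidates would bucket these roughly as:
#   {"AI Models": [...], "Cities": [...], "Numeric Items": [...]}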