local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +20 -3
- local_deep_research/web/database/models.py +74 -25
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +63 -83
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +192 -54
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +412 -251
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/METADATA +7 -1
- local_deep_research-0.5.2.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,424 @@
|
|
1
|
+
"""
|
2
|
+
Dual confidence constraint checker implementation.
|
3
|
+
|
4
|
+
This implementation uses dual confidence scoring (positive/negative/uncertainty)
|
5
|
+
to evaluate constraints and make rejection decisions.
|
6
|
+
"""
|
7
|
+
|
8
|
+
from typing import Dict, List, Tuple
|
9
|
+
|
10
|
+
from loguru import logger
|
11
|
+
|
12
|
+
from ..candidates.base_candidate import Candidate
|
13
|
+
from ..constraints.base_constraint import Constraint
|
14
|
+
from .base_constraint_checker import (
|
15
|
+
BaseConstraintChecker,
|
16
|
+
ConstraintCheckResult,
|
17
|
+
)
|
18
|
+
from .evidence_analyzer import ConstraintEvidence, EvidenceAnalyzer
|
19
|
+
|
20
|
+
|
21
|
+
class DualConfidenceChecker(BaseConstraintChecker):
|
22
|
+
"""
|
23
|
+
Constraint checker using dual confidence scoring.
|
24
|
+
|
25
|
+
This checker:
|
26
|
+
1. Analyzes evidence using positive/negative/uncertainty scores
|
27
|
+
2. Makes rejection decisions based on confidence thresholds
|
28
|
+
3. Provides detailed scoring breakdown
|
29
|
+
"""
|
30
|
+
|
31
|
+
def __init__(
    self,
    *args,
    negative_threshold: float = 0.25,  # negative evidence above 25% rejects
    positive_threshold: float = 0.4,  # positive evidence below 40% rejects
    uncertainty_penalty: float = 0.2,
    negative_weight: float = 0.5,
    uncertainty_threshold: float = 0.6,  # uncertainty above 60% triggers a retry
    max_reevaluations: int = 2,  # upper bound on re-evaluation rounds
    **kwargs,
):
    """
    Configure thresholds, scoring weights and the evidence analyzer.

    Positional arguments and any extra keyword arguments are forwarded
    unchanged to the parent constructor.

    Args:
        negative_threshold: Average negative confidence above which a
            candidate is rejected.
        positive_threshold: Average positive confidence below which a
            candidate is rejected.
        uncertainty_penalty: Penalty passed to the evidence analyzer for
            uncertain evidence; also subtracted from 0.5 when no evidence
            is found at all.
        negative_weight: Weight passed to the evidence analyzer for
            negative evidence.
        uncertainty_threshold: Average uncertainty above which a
            constraint is evaluated again.
        max_reevaluations: Maximum number of re-evaluation rounds per
            constraint.
    """
    super().__init__(*args, **kwargs)

    # Scoring knobs forwarded to the evidence analyzer.
    self.uncertainty_penalty = uncertainty_penalty
    self.negative_weight = negative_weight

    # Hard rejection limits.
    self.negative_threshold = negative_threshold
    self.positive_threshold = positive_threshold

    # Re-evaluation policy.
    self.uncertainty_threshold = uncertainty_threshold
    self.max_reevaluations = max_reevaluations

    # self.model is not assigned here; it is expected to be provided by
    # the parent constructor called above.
    self.evidence_analyzer = EvidenceAnalyzer(self.model)
|
64
|
+
|
65
|
+
def check_candidate(
    self,
    candidate: Candidate,
    constraints: List[Constraint],
    original_query: str = None,
) -> ConstraintCheckResult:
    """
    Check a candidate against all constraints using dual confidence scores.

    A single LLM pre-screen runs first; if it rejects the candidate the
    per-constraint evidence gathering is skipped entirely. Otherwise each
    constraint is evaluated (with re-evaluation when uncertainty is high),
    the first rejection reason is recorded, and a weighted total score is
    computed from the per-constraint scores.

    Args:
        candidate: Candidate being checked.
        constraints: Constraints the candidate must satisfy.
        original_query: Original user question, used only by the
            pre-screen. When falsy the pre-screen accepts the candidate.

    Returns:
        A ConstraintCheckResult. ``total_score`` is 0.0 when the candidate
        is rejected or when there are no constraints.
    """
    logger.info(f"Checking candidate: {candidate.name} (dual confidence)")

    # LLM PRE-SCREENING: one model call up front so that clearly bad
    # candidates never trigger the per-constraint evidence searches.
    pre_screen_result = self._llm_prescreen_candidate(
        candidate, constraints, original_query
    )
    if pre_screen_result["should_reject"]:
        logger.info(
            f"🚫 LLM pre-screen rejected {candidate.name}: {pre_screen_result['reason']}"
        )
        # Populate the same fields as the regular return below so callers
        # always receive a fully-formed result, whichever path produced it.
        return ConstraintCheckResult(
            candidate=candidate,
            total_score=0.0,
            constraint_scores={},
            should_reject=True,
            rejection_reason=pre_screen_result["reason"],
            detailed_results=pre_screen_result["detailed_results"],
        )

    constraint_scores = {}
    detailed_results = []
    rejection_reason = None
    should_reject = False

    for constraint in constraints:
        # Initial evaluation, repeated while the result stays too uncertain.
        result = self._evaluate_constraint_with_reevaluation(
            candidate, constraint
        )

        avg_positive = result["positive"]
        avg_negative = result["negative"]
        avg_uncertainty = result["uncertainty"]
        score = result["score"]
        reevaluation_count = result.get("reevaluation_count", 0)

        # Check for rejection based on the final averages.
        reject, reason = self.should_reject_candidate_from_averages(
            candidate, constraint, avg_positive, avg_negative
        )

        # Only the first rejection is recorded; later constraints are
        # still evaluated so the detailed breakdown stays complete.
        if reject and not should_reject:
            should_reject = True
            rejection_reason = reason

        constraint_scores[constraint.value] = {
            "total": score,
            "positive": avg_positive,
            "negative": avg_negative,
            "uncertainty": avg_uncertainty,
            "weight": constraint.weight,
            "reevaluation_count": reevaluation_count,
        }

        detailed_results.append(
            {
                "constraint": constraint.value,
                "score": score,
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "weight": constraint.weight,
                "type": constraint.type.value,
                "reevaluation_count": reevaluation_count,
            }
        )

        self._log_constraint_result_detailed(
            candidate,
            constraint,
            score,
            avg_positive,
            avg_negative,
            avg_uncertainty,
            reevaluation_count,
        )

    # A rejected candidate, or one with nothing to score, gets 0.0.
    if should_reject or not detailed_results:
        total_score = 0.0
    else:
        weights = [r["weight"] for r in detailed_results]
        scores = [r["score"] for r in detailed_results]
        total_score = self._calculate_weighted_score(scores, weights)

    logger.info(f"Final score for {candidate.name}: {total_score:.2%}")

    return ConstraintCheckResult(
        candidate=candidate,
        total_score=total_score,
        constraint_scores=constraint_scores,
        should_reject=should_reject,
        rejection_reason=rejection_reason,
        detailed_results=detailed_results,
    )
|
170
|
+
|
171
|
+
def _evaluate_constraint_with_reevaluation(
|
172
|
+
self, candidate: Candidate, constraint: Constraint
|
173
|
+
) -> Dict:
|
174
|
+
"""Evaluate constraint with potential re-evaluation for uncertain results."""
|
175
|
+
reevaluation_count = 0
|
176
|
+
evidence_list = []
|
177
|
+
|
178
|
+
while reevaluation_count <= self.max_reevaluations:
|
179
|
+
# Gather evidence (fresh each time for re-evaluation)
|
180
|
+
evidence_list = self._gather_evidence_for_constraint(
|
181
|
+
candidate, constraint
|
182
|
+
)
|
183
|
+
|
184
|
+
if not evidence_list:
|
185
|
+
# No evidence found
|
186
|
+
return {
|
187
|
+
"positive": 0.0,
|
188
|
+
"negative": 0.0,
|
189
|
+
"uncertainty": 1.0,
|
190
|
+
"score": 0.5 - self.uncertainty_penalty,
|
191
|
+
"evidence_list": [],
|
192
|
+
"reevaluation_count": reevaluation_count,
|
193
|
+
}
|
194
|
+
|
195
|
+
# Analyze with dual confidence
|
196
|
+
dual_evidence = [
|
197
|
+
self.evidence_analyzer.analyze_evidence_dual_confidence(
|
198
|
+
e, constraint
|
199
|
+
)
|
200
|
+
for e in evidence_list
|
201
|
+
]
|
202
|
+
|
203
|
+
# Calculate averages
|
204
|
+
avg_positive = sum(
|
205
|
+
e.positive_confidence for e in dual_evidence
|
206
|
+
) / len(dual_evidence)
|
207
|
+
avg_negative = sum(
|
208
|
+
e.negative_confidence for e in dual_evidence
|
209
|
+
) / len(dual_evidence)
|
210
|
+
avg_uncertainty = sum(e.uncertainty for e in dual_evidence) / len(
|
211
|
+
dual_evidence
|
212
|
+
)
|
213
|
+
|
214
|
+
# Calculate score
|
215
|
+
score = self.evidence_analyzer.evaluate_evidence_list(
|
216
|
+
evidence_list,
|
217
|
+
constraint,
|
218
|
+
self.uncertainty_penalty,
|
219
|
+
self.negative_weight,
|
220
|
+
)
|
221
|
+
|
222
|
+
# Check if we need re-evaluation
|
223
|
+
if (
|
224
|
+
reevaluation_count < self.max_reevaluations
|
225
|
+
and avg_uncertainty > self.uncertainty_threshold
|
226
|
+
and not self._should_early_reject(avg_positive, avg_negative)
|
227
|
+
):
|
228
|
+
reevaluation_count += 1
|
229
|
+
logger.info(
|
230
|
+
f"🔄 Re-evaluating {candidate.name} | {constraint.value} "
|
231
|
+
f"(round {reevaluation_count}) - high uncertainty: {avg_uncertainty:.0%}"
|
232
|
+
)
|
233
|
+
continue
|
234
|
+
else:
|
235
|
+
# Final result or early rejection
|
236
|
+
if reevaluation_count > 0:
|
237
|
+
logger.info(
|
238
|
+
f"✅ Final evaluation for {candidate.name} | {constraint.value} "
|
239
|
+
f"after {reevaluation_count} re-evaluation(s)"
|
240
|
+
)
|
241
|
+
|
242
|
+
return {
|
243
|
+
"positive": avg_positive,
|
244
|
+
"negative": avg_negative,
|
245
|
+
"uncertainty": avg_uncertainty,
|
246
|
+
"score": score,
|
247
|
+
"evidence_list": evidence_list,
|
248
|
+
"reevaluation_count": reevaluation_count,
|
249
|
+
}
|
250
|
+
|
251
|
+
# Should not reach here, but fallback
|
252
|
+
return {
|
253
|
+
"positive": avg_positive,
|
254
|
+
"negative": avg_negative,
|
255
|
+
"uncertainty": avg_uncertainty,
|
256
|
+
"score": score,
|
257
|
+
"evidence_list": evidence_list,
|
258
|
+
"reevaluation_count": reevaluation_count,
|
259
|
+
}
|
260
|
+
|
261
|
+
def _should_early_reject(
|
262
|
+
self, avg_positive: float, avg_negative: float
|
263
|
+
) -> bool:
|
264
|
+
"""Check if candidate should be rejected early (before re-evaluation)."""
|
265
|
+
return (
|
266
|
+
avg_negative > self.negative_threshold
|
267
|
+
or avg_positive < self.positive_threshold
|
268
|
+
)
|
269
|
+
|
270
|
+
def should_reject_candidate_from_averages(
    self,
    candidate: Candidate,
    constraint: Constraint,
    avg_positive: float,
    avg_negative: float,
) -> Tuple[bool, str]:
    """
    Decide rejection from pre-computed average confidence scores.

    High negative evidence is checked first and takes precedence over
    low positive evidence.

    Returns:
        ``(True, reason)`` when the candidate is rejected, otherwise
        ``(False, "")``.
    """
    if avg_negative > self.negative_threshold:
        # PRIMARY REJECTION: too much evidence against the constraint.
        reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
    elif avg_positive < self.positive_threshold:
        # SECONDARY REJECTION: not enough evidence for the constraint.
        reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
    else:
        return False, ""

    logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
    return True, reason
|
291
|
+
|
292
|
+
def should_reject_candidate(
    self,
    candidate: Candidate,
    constraint: Constraint,
    dual_evidence: List[ConstraintEvidence],
) -> Tuple[bool, str]:
    """
    Decide rejection from a list of analysed evidence items.

    The positive and negative confidences are averaged over all items and
    compared against the configured thresholds. An empty list never
    causes a rejection.

    Returns:
        ``(True, reason)`` when the candidate is rejected, otherwise
        ``(False, "")``.
    """
    if not dual_evidence:
        return False, ""

    sample_count = len(dual_evidence)
    avg_negative = (
        sum(item.negative_confidence for item in dual_evidence) / sample_count
    )
    avg_positive = (
        sum(item.positive_confidence for item in dual_evidence) / sample_count
    )

    rejected = True
    if avg_negative > self.negative_threshold:
        # PRIMARY REJECTION: high negative evidence wins over everything.
        reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
    elif avg_positive < self.positive_threshold:
        # SECONDARY REJECTION: positive evidence is too weak.
        reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
    else:
        rejected = False
        reason = ""

    if rejected:
        logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
    return rejected, reason
|
323
|
+
|
324
|
+
def _log_constraint_result_detailed(
    self,
    candidate,
    constraint,
    score,
    positive,
    negative,
    uncertainty,
    reevaluation_count=0,
):
    """
    Log a one-line summary of a constraint evaluation.

    The line starts with a symbol for the score band (>= 0.8, >= 0.5,
    lower), shows the score and the positive/negative/uncertainty shares
    as whole percentages, and ends with ``[R<n>]`` when the constraint
    was re-evaluated ``n`` times.
    """
    if score >= 0.8:
        symbol = "✓"
    elif score >= 0.5:
        symbol = "○"
    else:
        symbol = "✗"

    # Only mention re-evaluation when it actually happened.
    reeval_indicator = ""
    if reevaluation_count > 0:
        reeval_indicator = f" [R{reevaluation_count}]"

    headline = f"{symbol} {candidate.name} | {constraint.value}: {int(score * 100)}% "
    breakdown = f"(+{int(positive * 100)}% -{int(negative * 100)}% ?{int(uncertainty * 100)}%){reeval_indicator}"
    logger.info(headline + breakdown)
|
346
|
+
|
347
|
+
def _llm_prescreen_candidate(
|
348
|
+
self, candidate, constraints, original_query=None
|
349
|
+
):
|
350
|
+
"""Simple quality check for answer candidates."""
|
351
|
+
|
352
|
+
if not original_query:
|
353
|
+
return {
|
354
|
+
"should_reject": False,
|
355
|
+
"reason": "No original query provided",
|
356
|
+
"detailed_results": [],
|
357
|
+
}
|
358
|
+
|
359
|
+
prompt = f"""Question: {original_query}
|
360
|
+
Answer: {candidate.name}
|
361
|
+
|
362
|
+
Is this a good answer to the question? Rate 0-100 where:
|
363
|
+
- 90-100: Excellent direct answer
|
364
|
+
- 70-89: Good answer
|
365
|
+
- 50-69: Partial answer
|
366
|
+
- 30-49: Weak answer
|
367
|
+
- 0-29: Poor/wrong answer
|
368
|
+
|
369
|
+
Just give the number:"""
|
370
|
+
|
371
|
+
try:
|
372
|
+
response = self.model.generate(prompt)
|
373
|
+
|
374
|
+
# Parse confidence score
|
375
|
+
import re
|
376
|
+
|
377
|
+
confidence_match = re.search(r"(\d{1,3})", response.strip())
|
378
|
+
|
379
|
+
if confidence_match:
|
380
|
+
quality_score = int(confidence_match.group(1))
|
381
|
+
|
382
|
+
# Accept good answers (50+ out of 100)
|
383
|
+
if quality_score >= 50:
|
384
|
+
return {
|
385
|
+
"should_reject": False,
|
386
|
+
"reason": f"Good answer quality: {quality_score}%",
|
387
|
+
"detailed_results": [
|
388
|
+
{
|
389
|
+
"constraint": "answer_quality",
|
390
|
+
"positive_confidence": quality_score / 100.0,
|
391
|
+
"source": "answer_quality_check",
|
392
|
+
}
|
393
|
+
],
|
394
|
+
}
|
395
|
+
else:
|
396
|
+
return {
|
397
|
+
"should_reject": True,
|
398
|
+
"reason": f"Poor answer quality: {quality_score}%",
|
399
|
+
"detailed_results": [
|
400
|
+
{
|
401
|
+
"constraint": "answer_quality",
|
402
|
+
"negative_confidence": (100 - quality_score)
|
403
|
+
/ 100.0,
|
404
|
+
"source": "answer_quality_check",
|
405
|
+
}
|
406
|
+
],
|
407
|
+
}
|
408
|
+
|
409
|
+
# Parsing failed - accept by default
|
410
|
+
return {
|
411
|
+
"should_reject": False,
|
412
|
+
"reason": "Could not parse quality score - accepting",
|
413
|
+
"detailed_results": [],
|
414
|
+
}
|
415
|
+
|
416
|
+
except Exception as e:
|
417
|
+
logger.warning(
|
418
|
+
f"Fast LLM pre-screening failed for {candidate.name}: {e}"
|
419
|
+
)
|
420
|
+
return {
|
421
|
+
"should_reject": False,
|
422
|
+
"reason": "",
|
423
|
+
"detailed_results": [],
|
424
|
+
}
|
@@ -0,0 +1,174 @@
|
|
1
|
+
"""
|
2
|
+
Evidence analysis for constraint checking.
|
3
|
+
|
4
|
+
This module provides dual confidence evidence analysis that separates
|
5
|
+
positive evidence, negative evidence, and uncertainty.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from dataclasses import dataclass
|
10
|
+
from typing import Dict, List
|
11
|
+
|
12
|
+
from langchain_core.language_models import BaseChatModel
|
13
|
+
from loguru import logger
|
14
|
+
|
15
|
+
from ..constraints.base_constraint import Constraint
|
16
|
+
|
17
|
+
|
18
|
+
@dataclass
class ConstraintEvidence:
    """One piece of evidence scored against a constraint.

    Support and contradiction are tracked as two separate confidences
    instead of a single score, with the remaining doubt recorded
    explicitly as uncertainty.
    """

    # Confidence that the constraint IS satisfied.
    positive_confidence: float
    # Confidence that the constraint is NOT satisfied.
    negative_confidence: float
    # Degree to which the evidence is inconclusive (neither positive nor
    # negative).
    uncertainty: float
    # Text of the evidence the scores refer to.
    evidence_text: str
    # Label identifying where the evidence came from.
    source: str
|
29
|
+
|
30
|
+
|
31
|
+
class EvidenceAnalyzer:
    """
    Analyzes evidence using dual confidence scoring.

    This approach separates:
    - Positive confidence: Evidence that constraint IS satisfied
    - Negative confidence: Evidence that constraint is NOT satisfied
    - Uncertainty: Lack of clear evidence either way

    The scores are obtained by prompting the chat model and parsing the
    labelled numbers out of its reply.
    """

    # Labels looked up in the model reply, in (positive, negative,
    # uncertainty) order.
    _SCORE_LABELS = ("POSITIVE", "NEGATIVE", "UNCERTAINTY")
    # (positive, negative, uncertainty) used whenever no usable scores are
    # available: little is known, so most of the mass goes to uncertainty.
    _FALLBACK_SCORES = (0.1, 0.1, 0.8)
    # Value substituted for a single label that is missing from the reply.
    _MISSING_SCORE = 0.1

    def __init__(self, model: "BaseChatModel"):
        """Initialize the evidence analyzer.

        Args:
            model: Chat model used to judge each piece of evidence.
        """
        self.model = model

    def analyze_evidence_dual_confidence(
        self, evidence: Dict, constraint: "Constraint"
    ) -> "ConstraintEvidence":
        """Analyze evidence to extract dual confidence scores.

        Args:
            evidence: Evidence dictionary; the ``"text"`` and ``"source"``
                keys are read, both optional.
            constraint: The constraint the evidence is judged against.

        Returns:
            ConstraintEvidence: Normalized scores plus the first 500
            characters of the evidence text. If the model call fails or its
            reply contains no scores, the high-uncertainty fallback
            (0.1 / 0.1 / 0.8) is returned instead.
        """
        # ``or ""`` also covers a "text" key that is present but None.
        text = evidence.get("text") or ""

        # Use LLM to analyze evidence with dual confidence
        prompt = f"""
Analyze this evidence for the constraint "{constraint.value}" (type: {constraint.type.value}).

Evidence:
{text[:1000]}

Provide three confidence scores (0-1):
1. POSITIVE_CONFIDENCE: How confident are you that this constraint IS satisfied?
2. NEGATIVE_CONFIDENCE: How confident are you that this constraint is NOT satisfied?
3. UNCERTAINTY: How uncertain are you (lack of clear evidence)?

The three scores should approximately sum to 1.0.

Format:
POSITIVE: [score]
NEGATIVE: [score]
UNCERTAINTY: [score]
"""

        try:
            response = self.model.invoke(prompt).content
            positive, negative, uncertainty = self._parse_scores(response)
        except Exception as e:
            # Best-effort boundary around the model call: any failure is
            # logged and degrades to "we do not know".
            logger.error(f"Error analyzing evidence: {e}")
            positive, negative, uncertainty = self._FALLBACK_SCORES

        return ConstraintEvidence(
            positive_confidence=positive,
            negative_confidence=negative,
            uncertainty=uncertainty,
            evidence_text=text[:500],
            source=evidence.get("source", "search"),
        )

    @staticmethod
    def _score_pattern(label: str) -> "re.Pattern":
        """Build the case-insensitive regex that captures the number for *label*.

        Accepts both ``LABEL: 0.7`` and ``LABEL_CONFIDENCE: 0.7`` (the prompt
        uses both spellings), with optional square brackets around the value.
        """
        return re.compile(
            rf"{re.escape(label)}(?:_CONFIDENCE)?\s*:\s*\[?\s*(\d*\.?\d+)",
            re.IGNORECASE,
        )

    def _parse_scores(self, response: str) -> List[float]:
        """Parse and normalize the three scores from a model reply.

        Args:
            response: Raw text returned by the model.

        Returns:
            ``[positive, negative, uncertainty]`` scaled to sum to 1.0, or
            the high-uncertainty fallback when the reply contains none of
            the labels or all parsed scores are zero.
        """
        labels = self._SCORE_LABELS

        # A reply with no recognizable score at all carries no information;
        # normalizing three placeholder values would wrongly report 1/3 each.
        if not any(self._score_pattern(label).search(response) for label in labels):
            return list(self._FALLBACK_SCORES)

        scores = [self._extract_score(response, label) for label in labels]
        total = sum(scores)
        if total <= 0:
            return list(self._FALLBACK_SCORES)

        # Dividing by the total also rescales percentage-style replies
        # (e.g. 60 / 20 / 20) into the 0-1 range.
        return [score / total for score in scores]

    def _extract_score(self, text: str, label: str) -> float:
        """Extract confidence score from LLM response.

        Args:
            text: Raw model reply.
            label: Score label to look for, e.g. ``"POSITIVE"``.

        Returns:
            The parsed number, or 0.1 when the label is not found.
        """
        match = self._score_pattern(label).search(text)
        if match is None:
            return self._MISSING_SCORE  # Default low score
        # The captured group is always digits with an optional decimal
        # point, so float() cannot fail here.
        return float(match.group(1))

    def evaluate_evidence_list(
        self,
        evidence_list: List[Dict],
        constraint: "Constraint",
        uncertainty_penalty: float = 0.2,
        negative_weight: float = 0.5,
    ) -> float:
        """
        Evaluate a list of evidence using dual confidence scoring.

        Args:
            evidence_list: List of evidence dictionaries
            constraint: The constraint being evaluated
            uncertainty_penalty: Penalty for uncertainty
            negative_weight: Weight for negative evidence

        Returns:
            float: Overall score between 0.0 and 1.0
        """
        if not evidence_list:
            # No evidence means high uncertainty. Clamped so that a large
            # penalty cannot push the result outside the documented range.
            return max(0.0, min(1.0, 0.5 - uncertainty_penalty))

        # Convert evidence to dual confidence format (one model call each).
        analyzed = [
            self.analyze_evidence_dual_confidence(evidence, constraint)
            for evidence in evidence_list
        ]

        # Average each component over all pieces of evidence.
        count = len(analyzed)
        avg_positive = sum(e.positive_confidence for e in analyzed) / count
        avg_negative = sum(e.negative_confidence for e in analyzed) / count
        avg_uncertainty = sum(e.uncertainty for e in analyzed) / count

        # Calculate final score
        # High positive + low negative = high score
        # Low positive + high negative = low score
        # High uncertainty = penalty
        score = (
            avg_positive
            - (avg_negative * negative_weight)
            - (avg_uncertainty * uncertainty_penalty)
        )

        # Clamp to [0, 1]
        return max(0.0, min(1.0, score))
|