local-deep-research 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +7 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/answer_decoding/__init__.py +5 -0
- local_deep_research/advanced_search_system/answer_decoding/browsecomp_answer_decoder.py +421 -0
- local_deep_research/advanced_search_system/candidate_exploration/README.md +219 -0
- local_deep_research/advanced_search_system/candidate_exploration/__init__.py +25 -0
- local_deep_research/advanced_search_system/candidate_exploration/adaptive_explorer.py +329 -0
- local_deep_research/advanced_search_system/candidate_exploration/base_explorer.py +341 -0
- local_deep_research/advanced_search_system/candidate_exploration/constraint_guided_explorer.py +436 -0
- local_deep_research/advanced_search_system/candidate_exploration/diversity_explorer.py +457 -0
- local_deep_research/advanced_search_system/candidate_exploration/parallel_explorer.py +250 -0
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +255 -0
- local_deep_research/advanced_search_system/candidates/__init__.py +5 -0
- local_deep_research/advanced_search_system/candidates/base_candidate.py +59 -0
- local_deep_research/advanced_search_system/constraint_checking/README.md +150 -0
- local_deep_research/advanced_search_system/constraint_checking/__init__.py +35 -0
- local_deep_research/advanced_search_system/constraint_checking/base_constraint_checker.py +122 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_checker.py +223 -0
- local_deep_research/advanced_search_system/constraint_checking/constraint_satisfaction_tracker.py +387 -0
- local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py +424 -0
- local_deep_research/advanced_search_system/constraint_checking/evidence_analyzer.py +174 -0
- local_deep_research/advanced_search_system/constraint_checking/intelligent_constraint_relaxer.py +503 -0
- local_deep_research/advanced_search_system/constraint_checking/rejection_engine.py +143 -0
- local_deep_research/advanced_search_system/constraint_checking/strict_checker.py +259 -0
- local_deep_research/advanced_search_system/constraint_checking/threshold_checker.py +213 -0
- local_deep_research/advanced_search_system/constraints/__init__.py +6 -0
- local_deep_research/advanced_search_system/constraints/base_constraint.py +58 -0
- local_deep_research/advanced_search_system/constraints/constraint_analyzer.py +143 -0
- local_deep_research/advanced_search_system/evidence/__init__.py +12 -0
- local_deep_research/advanced_search_system/evidence/base_evidence.py +57 -0
- local_deep_research/advanced_search_system/evidence/evaluator.py +159 -0
- local_deep_research/advanced_search_system/evidence/requirements.py +122 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +3 -1
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +8 -2
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +43 -29
- local_deep_research/advanced_search_system/findings/repository.py +54 -17
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +3 -1
- local_deep_research/advanced_search_system/query_generation/adaptive_query_generator.py +405 -0
- local_deep_research/advanced_search_system/questions/__init__.py +16 -0
- local_deep_research/advanced_search_system/questions/atomic_fact_question.py +171 -0
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +287 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +13 -4
- local_deep_research/advanced_search_system/questions/entity_aware_question.py +184 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +9 -3
- local_deep_research/advanced_search_system/search_optimization/cross_constraint_manager.py +624 -0
- local_deep_research/advanced_search_system/source_management/diversity_manager.py +613 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +42 -0
- local_deep_research/advanced_search_system/strategies/adaptive_decomposition_strategy.py +564 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +4 -4
- local_deep_research/advanced_search_system/strategies/browsecomp_entity_strategy.py +1031 -0
- local_deep_research/advanced_search_system/strategies/browsecomp_optimized_strategy.py +778 -0
- local_deep_research/advanced_search_system/strategies/concurrent_dual_confidence_strategy.py +446 -0
- local_deep_research/advanced_search_system/strategies/constrained_search_strategy.py +1348 -0
- local_deep_research/advanced_search_system/strategies/constraint_parallel_strategy.py +522 -0
- local_deep_research/advanced_search_system/strategies/direct_search_strategy.py +217 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_strategy.py +320 -0
- local_deep_research/advanced_search_system/strategies/dual_confidence_with_rejection.py +219 -0
- local_deep_research/advanced_search_system/strategies/early_stop_constrained_strategy.py +369 -0
- local_deep_research/advanced_search_system/strategies/entity_aware_source_strategy.py +140 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy.py +1248 -0
- local_deep_research/advanced_search_system/strategies/evidence_based_strategy_v2.py +1337 -0
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +537 -0
- local_deep_research/advanced_search_system/strategies/improved_evidence_based_strategy.py +782 -0
- local_deep_research/advanced_search_system/strategies/iterative_reasoning_strategy.py +760 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +55 -21
- local_deep_research/advanced_search_system/strategies/llm_driven_modular_strategy.py +865 -0
- local_deep_research/advanced_search_system/strategies/modular_strategy.py +1142 -0
- local_deep_research/advanced_search_system/strategies/parallel_constrained_strategy.py +506 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +34 -16
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +29 -9
- local_deep_research/advanced_search_system/strategies/recursive_decomposition_strategy.py +492 -0
- local_deep_research/advanced_search_system/strategies/smart_decomposition_strategy.py +284 -0
- local_deep_research/advanced_search_system/strategies/smart_query_strategy.py +515 -0
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +48 -24
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +34 -14
- local_deep_research/advanced_search_system/tools/base_tool.py +7 -2
- local_deep_research/api/benchmark_functions.py +6 -2
- local_deep_research/api/research_functions.py +10 -4
- local_deep_research/benchmarks/__init__.py +9 -7
- local_deep_research/benchmarks/benchmark_functions.py +6 -2
- local_deep_research/benchmarks/cli/benchmark_commands.py +27 -10
- local_deep_research/benchmarks/cli.py +38 -13
- local_deep_research/benchmarks/comparison/__init__.py +4 -2
- local_deep_research/benchmarks/comparison/evaluator.py +316 -239
- local_deep_research/benchmarks/datasets/__init__.py +1 -1
- local_deep_research/benchmarks/datasets/base.py +91 -72
- local_deep_research/benchmarks/datasets/browsecomp.py +54 -33
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +19 -19
- local_deep_research/benchmarks/datasets/simpleqa.py +14 -14
- local_deep_research/benchmarks/datasets/utils.py +48 -29
- local_deep_research/benchmarks/datasets.py +4 -11
- local_deep_research/benchmarks/efficiency/__init__.py +8 -4
- local_deep_research/benchmarks/efficiency/resource_monitor.py +223 -171
- local_deep_research/benchmarks/efficiency/speed_profiler.py +62 -48
- local_deep_research/benchmarks/evaluators/browsecomp.py +3 -1
- local_deep_research/benchmarks/evaluators/composite.py +6 -2
- local_deep_research/benchmarks/evaluators/simpleqa.py +36 -13
- local_deep_research/benchmarks/graders.py +32 -10
- local_deep_research/benchmarks/metrics/README.md +1 -1
- local_deep_research/benchmarks/metrics/calculation.py +25 -10
- local_deep_research/benchmarks/metrics/reporting.py +7 -3
- local_deep_research/benchmarks/metrics/visualization.py +42 -23
- local_deep_research/benchmarks/metrics.py +1 -1
- local_deep_research/benchmarks/optimization/__init__.py +3 -1
- local_deep_research/benchmarks/optimization/api.py +7 -1
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +75 -26
- local_deep_research/benchmarks/runners.py +48 -15
- local_deep_research/citation_handler.py +65 -92
- local_deep_research/citation_handlers/__init__.py +15 -0
- local_deep_research/citation_handlers/base_citation_handler.py +70 -0
- local_deep_research/citation_handlers/forced_answer_citation_handler.py +179 -0
- local_deep_research/citation_handlers/precision_extraction_handler.py +550 -0
- local_deep_research/citation_handlers/standard_citation_handler.py +80 -0
- local_deep_research/config/llm_config.py +271 -169
- local_deep_research/config/search_config.py +14 -5
- local_deep_research/defaults/__init__.py +0 -1
- local_deep_research/metrics/__init__.py +13 -0
- local_deep_research/metrics/database.py +58 -0
- local_deep_research/metrics/db_models.py +115 -0
- local_deep_research/metrics/migrate_add_provider_to_token_usage.py +148 -0
- local_deep_research/metrics/migrate_call_stack_tracking.py +105 -0
- local_deep_research/metrics/migrate_enhanced_tracking.py +75 -0
- local_deep_research/metrics/migrate_research_ratings.py +31 -0
- local_deep_research/metrics/models.py +61 -0
- local_deep_research/metrics/pricing/__init__.py +12 -0
- local_deep_research/metrics/pricing/cost_calculator.py +237 -0
- local_deep_research/metrics/pricing/pricing_cache.py +143 -0
- local_deep_research/metrics/pricing/pricing_fetcher.py +240 -0
- local_deep_research/metrics/query_utils.py +51 -0
- local_deep_research/metrics/search_tracker.py +380 -0
- local_deep_research/metrics/token_counter.py +1078 -0
- local_deep_research/migrate_db.py +3 -1
- local_deep_research/report_generator.py +22 -8
- local_deep_research/search_system.py +390 -9
- local_deep_research/test_migration.py +15 -5
- local_deep_research/utilities/db_utils.py +7 -4
- local_deep_research/utilities/es_utils.py +115 -104
- local_deep_research/utilities/llm_utils.py +15 -5
- local_deep_research/utilities/log_utils.py +151 -0
- local_deep_research/utilities/search_cache.py +387 -0
- local_deep_research/utilities/search_utilities.py +14 -6
- local_deep_research/utilities/threading_utils.py +92 -0
- local_deep_research/utilities/url_utils.py +6 -0
- local_deep_research/web/api.py +347 -0
- local_deep_research/web/app.py +13 -17
- local_deep_research/web/app_factory.py +71 -66
- local_deep_research/web/database/migrate_to_ldr_db.py +12 -4
- local_deep_research/web/database/migrations.py +5 -3
- local_deep_research/web/database/models.py +51 -2
- local_deep_research/web/database/schema_upgrade.py +49 -29
- local_deep_research/web/models/database.py +51 -61
- local_deep_research/web/routes/api_routes.py +56 -22
- local_deep_research/web/routes/benchmark_routes.py +4 -1
- local_deep_research/web/routes/globals.py +22 -0
- local_deep_research/web/routes/history_routes.py +71 -46
- local_deep_research/web/routes/metrics_routes.py +1155 -0
- local_deep_research/web/routes/research_routes.py +227 -41
- local_deep_research/web/routes/settings_routes.py +156 -55
- local_deep_research/web/services/research_service.py +310 -103
- local_deep_research/web/services/resource_service.py +36 -11
- local_deep_research/web/services/settings_manager.py +55 -17
- local_deep_research/web/services/settings_service.py +12 -4
- local_deep_research/web/services/socket_service.py +295 -188
- local_deep_research/web/static/css/custom_dropdown.css +180 -0
- local_deep_research/web/static/css/styles.css +39 -1
- local_deep_research/web/static/js/components/detail.js +633 -267
- local_deep_research/web/static/js/components/details.js +751 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +11 -11
- local_deep_research/web/static/js/components/fallback/ui.js +23 -23
- local_deep_research/web/static/js/components/history.js +76 -76
- local_deep_research/web/static/js/components/logpanel.js +61 -13
- local_deep_research/web/static/js/components/progress.js +13 -2
- local_deep_research/web/static/js/components/research.js +99 -12
- local_deep_research/web/static/js/components/results.js +239 -106
- local_deep_research/web/static/js/main.js +40 -40
- local_deep_research/web/static/js/services/audio.js +1 -1
- local_deep_research/web/static/js/services/formatting.js +11 -11
- local_deep_research/web/static/js/services/keyboard.js +157 -0
- local_deep_research/web/static/js/services/pdf.js +80 -80
- local_deep_research/web/static/sounds/README.md +1 -1
- local_deep_research/web/templates/base.html +1 -0
- local_deep_research/web/templates/components/log_panel.html +7 -1
- local_deep_research/web/templates/components/mobile_nav.html +1 -1
- local_deep_research/web/templates/components/sidebar.html +3 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1245 -0
- local_deep_research/web/templates/pages/details.html +325 -24
- local_deep_research/web/templates/pages/history.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +1929 -0
- local_deep_research/web/templates/pages/progress.html +2 -2
- local_deep_research/web/templates/pages/research.html +53 -17
- local_deep_research/web/templates/pages/results.html +12 -1
- local_deep_research/web/templates/pages/star_reviews.html +803 -0
- local_deep_research/web/utils/formatters.py +9 -3
- local_deep_research/web_search_engines/default_search_engines.py +5 -3
- local_deep_research/web_search_engines/engines/full_search.py +8 -2
- local_deep_research/web_search_engines/engines/meta_search_engine.py +59 -20
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +19 -6
- local_deep_research/web_search_engines/engines/search_engine_brave.py +6 -2
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +3 -1
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +81 -58
- local_deep_research/web_search_engines/engines/search_engine_github.py +46 -15
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +16 -6
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +39 -15
- local_deep_research/web_search_engines/engines/search_engine_local.py +58 -25
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +15 -5
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +63 -21
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +37 -11
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +27 -9
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +12 -4
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +31 -10
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +12 -3
- local_deep_research/web_search_engines/search_engine_base.py +83 -35
- local_deep_research/web_search_engines/search_engine_factory.py +25 -8
- local_deep_research/web_search_engines/search_engines_config.py +9 -3
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/METADATA +7 -1
- local_deep_research-0.5.0.dist-info/RECORD +265 -0
- local_deep_research-0.4.4.dist-info/RECORD +0 -177
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.4.4.dist-info → local_deep_research-0.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,7 @@ of different components and processes in the research system.
|
|
8
8
|
import logging
|
9
9
|
import time
|
10
10
|
from contextlib import contextmanager
|
11
|
-
from typing import
|
11
|
+
from typing import Any, Callable, Dict
|
12
12
|
|
13
13
|
logger = logging.getLogger(__name__)
|
14
14
|
|
@@ -16,57 +16,57 @@ logger = logging.getLogger(__name__)
|
|
16
16
|
class SpeedProfiler:
|
17
17
|
"""
|
18
18
|
Profiler for tracking execution speed of components.
|
19
|
-
|
19
|
+
|
20
20
|
This class provides methods for timing operations and
|
21
21
|
collecting performance statistics for later analysis.
|
22
22
|
"""
|
23
|
-
|
23
|
+
|
24
24
|
def __init__(self):
|
25
25
|
"""Initialize the profiler with empty timing data."""
|
26
26
|
self.timings = {}
|
27
27
|
self.current_timers = {}
|
28
28
|
self.total_start_time = None
|
29
29
|
self.total_end_time = None
|
30
|
-
|
30
|
+
|
31
31
|
def start(self):
|
32
32
|
"""Start the global profiling session."""
|
33
33
|
self.timings = {}
|
34
34
|
self.current_timers = {}
|
35
35
|
self.total_start_time = time.time()
|
36
|
-
|
36
|
+
|
37
37
|
def stop(self):
|
38
38
|
"""Stop the global profiling session."""
|
39
39
|
self.total_end_time = time.time()
|
40
|
-
|
40
|
+
|
41
41
|
# Stop any timers that are still running
|
42
42
|
for name in list(self.current_timers.keys()):
|
43
43
|
self.stop_timer(name)
|
44
|
-
|
44
|
+
|
45
45
|
def start_timer(self, name: str):
|
46
46
|
"""
|
47
47
|
Start a named timer.
|
48
|
-
|
48
|
+
|
49
49
|
Args:
|
50
50
|
name: Name of the timer to start
|
51
51
|
"""
|
52
52
|
if name in self.current_timers:
|
53
53
|
logger.warning(f"Timer '{name}' is already running. Restarting.")
|
54
|
-
|
54
|
+
|
55
55
|
self.current_timers[name] = time.time()
|
56
|
-
|
56
|
+
|
57
57
|
def stop_timer(self, name: str):
|
58
58
|
"""
|
59
59
|
Stop a named timer and record the elapsed time.
|
60
|
-
|
60
|
+
|
61
61
|
Args:
|
62
62
|
name: Name of the timer to stop
|
63
63
|
"""
|
64
64
|
if name not in self.current_timers:
|
65
65
|
logger.warning(f"Timer '{name}' was not started.")
|
66
66
|
return
|
67
|
-
|
67
|
+
|
68
68
|
elapsed = time.time() - self.current_timers[name]
|
69
|
-
|
69
|
+
|
70
70
|
if name not in self.timings:
|
71
71
|
self.timings[name] = {
|
72
72
|
"total": elapsed,
|
@@ -74,7 +74,7 @@ class SpeedProfiler:
|
|
74
74
|
"min": elapsed,
|
75
75
|
"max": elapsed,
|
76
76
|
"starts": [self.current_timers[name]],
|
77
|
-
"durations": [elapsed]
|
77
|
+
"durations": [elapsed],
|
78
78
|
}
|
79
79
|
else:
|
80
80
|
self.timings[name]["total"] += elapsed
|
@@ -83,17 +83,17 @@ class SpeedProfiler:
|
|
83
83
|
self.timings[name]["max"] = max(self.timings[name]["max"], elapsed)
|
84
84
|
self.timings[name]["starts"].append(self.current_timers[name])
|
85
85
|
self.timings[name]["durations"].append(elapsed)
|
86
|
-
|
86
|
+
|
87
87
|
del self.current_timers[name]
|
88
|
-
|
88
|
+
|
89
89
|
@contextmanager
|
90
90
|
def timer(self, name: str):
|
91
91
|
"""
|
92
92
|
Context manager for timing a block of code.
|
93
|
-
|
93
|
+
|
94
94
|
Args:
|
95
95
|
name: Name of the timer
|
96
|
-
|
96
|
+
|
97
97
|
Example:
|
98
98
|
with profiler.timer("my_operation"):
|
99
99
|
# Code to time
|
@@ -104,23 +104,26 @@ class SpeedProfiler:
|
|
104
104
|
yield
|
105
105
|
finally:
|
106
106
|
self.stop_timer(name)
|
107
|
-
|
107
|
+
|
108
108
|
def get_timings(self) -> Dict[str, Any]:
|
109
109
|
"""
|
110
110
|
Get all recorded timings.
|
111
|
-
|
111
|
+
|
112
112
|
Returns:
|
113
113
|
Dictionary of timing data for all measured operations
|
114
114
|
"""
|
115
115
|
result = self.timings.copy()
|
116
|
-
|
116
|
+
|
117
117
|
# Add averages
|
118
118
|
for name, data in result.items():
|
119
119
|
if data["count"] > 0:
|
120
120
|
data["avg"] = data["total"] / data["count"]
|
121
|
-
|
121
|
+
|
122
122
|
# Add total duration
|
123
|
-
if
|
123
|
+
if (
|
124
|
+
self.total_start_time is not None
|
125
|
+
and self.total_end_time is not None
|
126
|
+
):
|
124
127
|
result["total"] = {
|
125
128
|
"total": self.total_end_time - self.total_start_time,
|
126
129
|
"count": 1,
|
@@ -128,87 +131,98 @@ class SpeedProfiler:
|
|
128
131
|
"max": self.total_end_time - self.total_start_time,
|
129
132
|
"avg": self.total_end_time - self.total_start_time,
|
130
133
|
"starts": [self.total_start_time],
|
131
|
-
"durations": [self.total_end_time - self.total_start_time]
|
134
|
+
"durations": [self.total_end_time - self.total_start_time],
|
132
135
|
}
|
133
|
-
|
136
|
+
|
134
137
|
return result
|
135
|
-
|
138
|
+
|
136
139
|
def get_summary(self) -> Dict[str, float]:
|
137
140
|
"""
|
138
141
|
Get a summary of timing information.
|
139
|
-
|
142
|
+
|
140
143
|
Returns:
|
141
144
|
Dictionary with summary statistics
|
142
145
|
"""
|
143
146
|
timings = self.get_timings()
|
144
147
|
summary = {}
|
145
|
-
|
148
|
+
|
146
149
|
# Total duration
|
147
150
|
if "total" in timings:
|
148
151
|
summary["total_duration"] = timings["total"]["total"]
|
149
|
-
elif
|
150
|
-
|
152
|
+
elif (
|
153
|
+
self.total_start_time is not None
|
154
|
+
and self.total_end_time is not None
|
155
|
+
):
|
156
|
+
summary["total_duration"] = (
|
157
|
+
self.total_end_time - self.total_start_time
|
158
|
+
)
|
151
159
|
else:
|
152
|
-
summary["total_duration"] = sum(
|
153
|
-
|
160
|
+
summary["total_duration"] = sum(
|
161
|
+
t["total"] for t in timings.values()
|
162
|
+
)
|
163
|
+
|
154
164
|
# Component durations
|
155
165
|
for name, data in timings.items():
|
156
166
|
if name != "total":
|
157
167
|
summary[f"{name}_duration"] = data["total"]
|
158
168
|
summary[f"{name}_percent"] = (
|
159
|
-
data["total"] / summary["total_duration"] * 100
|
160
|
-
if summary["total_duration"] > 0
|
169
|
+
data["total"] / summary["total_duration"] * 100
|
170
|
+
if summary["total_duration"] > 0
|
171
|
+
else 0
|
161
172
|
)
|
162
|
-
|
173
|
+
|
163
174
|
# Per-operation breakdowns
|
164
175
|
for name, data in timings.items():
|
165
176
|
if data["count"] > 0:
|
166
177
|
summary[f"{name}_per_operation"] = data["total"] / data["count"]
|
167
|
-
|
178
|
+
|
168
179
|
return summary
|
169
|
-
|
180
|
+
|
170
181
|
def print_summary(self):
|
171
182
|
"""Print a formatted summary of timing information."""
|
172
183
|
summary = self.get_summary()
|
173
184
|
total = summary.get("total_duration", 0)
|
174
|
-
|
185
|
+
|
175
186
|
print("\n===== SPEED PROFILE SUMMARY =====")
|
176
187
|
print(f"Total execution time: {total:.2f} seconds")
|
177
188
|
print("\n--- Component Breakdown ---")
|
178
|
-
|
189
|
+
|
179
190
|
# Print each component's timing
|
180
191
|
for name, data in self.timings.items():
|
181
192
|
if name != "total":
|
182
193
|
percent = data["total"] / total * 100 if total > 0 else 0
|
183
|
-
print(
|
184
|
-
|
185
|
-
|
194
|
+
print(
|
195
|
+
f"{name}: {data['total']:.2f}s ({percent:.1f}%) - "
|
196
|
+
f"{data['count']} calls, avg {data['total'] / data['count']:.3f}s per call"
|
197
|
+
)
|
198
|
+
|
186
199
|
print("\n==============================")
|
187
200
|
|
188
201
|
|
189
202
|
def time_function(func: Callable) -> Callable:
|
190
203
|
"""
|
191
204
|
Decorator to time a function's execution.
|
192
|
-
|
205
|
+
|
193
206
|
Args:
|
194
207
|
func: Function to time
|
195
|
-
|
208
|
+
|
196
209
|
Returns:
|
197
210
|
Wrapped function that logs its execution time
|
198
|
-
|
211
|
+
|
199
212
|
Example:
|
200
213
|
@time_function
|
201
214
|
def my_slow_function():
|
202
215
|
# Some slow code
|
203
216
|
pass
|
204
217
|
"""
|
218
|
+
|
205
219
|
def wrapper(*args, **kwargs):
|
206
220
|
start_time = time.time()
|
207
221
|
result = func(*args, **kwargs)
|
208
222
|
elapsed = time.time() - start_time
|
209
|
-
|
223
|
+
|
210
224
|
logger.info(f"{func.__name__} took {elapsed:.3f} seconds")
|
211
|
-
|
225
|
+
|
212
226
|
return result
|
213
|
-
|
227
|
+
|
214
228
|
return wrapper
|
@@ -47,7 +47,9 @@ class BrowseCompEvaluator(BaseBenchmarkEvaluator):
|
|
47
47
|
benchmark_dir = self._create_subdirectory(output_dir)
|
48
48
|
|
49
49
|
# Log benchmark execution
|
50
|
-
logger.info(
|
50
|
+
logger.info(
|
51
|
+
f"Running BrowseComp benchmark with {num_examples} examples"
|
52
|
+
)
|
51
53
|
|
52
54
|
try:
|
53
55
|
# Run BrowseComp benchmark
|
@@ -54,7 +54,9 @@ class CompositeBenchmarkEvaluator:
|
|
54
54
|
}
|
55
55
|
|
56
56
|
# Log the weights being used
|
57
|
-
logger.info(
|
57
|
+
logger.info(
|
58
|
+
f"Using normalized benchmark weights: {self.normalized_weights}"
|
59
|
+
)
|
58
60
|
|
59
61
|
def evaluate(
|
60
62
|
self,
|
@@ -105,7 +107,9 @@ class CompositeBenchmarkEvaluator:
|
|
105
107
|
combined_score += weighted_contribution
|
106
108
|
|
107
109
|
except Exception as e:
|
108
|
-
logger.error(
|
110
|
+
logger.error(
|
111
|
+
f"Error running {benchmark_name} benchmark: {str(e)}"
|
112
|
+
)
|
109
113
|
all_results[benchmark_name] = {
|
110
114
|
"benchmark_type": benchmark_name,
|
111
115
|
"error": str(e),
|
@@ -9,9 +9,9 @@ import json
|
|
9
9
|
import logging
|
10
10
|
import os
|
11
11
|
import time
|
12
|
-
from typing import Any, Dict
|
12
|
+
from typing import Any, Dict
|
13
|
+
|
13
14
|
|
14
|
-
from local_deep_research.api import quick_summary
|
15
15
|
from ..datasets.base import DatasetRegistry
|
16
16
|
from ..metrics import calculate_metrics, generate_report
|
17
17
|
from ..runners import run_simpleqa_benchmark # Keep for backward compatibility
|
@@ -134,9 +134,15 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
134
134
|
|
135
135
|
# Set up output files
|
136
136
|
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
137
|
-
results_file = os.path.join(
|
138
|
-
|
139
|
-
|
137
|
+
results_file = os.path.join(
|
138
|
+
output_dir, f"simpleqa_{timestamp}_results.jsonl"
|
139
|
+
)
|
140
|
+
evaluation_file = os.path.join(
|
141
|
+
output_dir, f"simpleqa_{timestamp}_evaluation.jsonl"
|
142
|
+
)
|
143
|
+
report_file = os.path.join(
|
144
|
+
output_dir, f"simpleqa_{timestamp}_report.md"
|
145
|
+
)
|
140
146
|
|
141
147
|
# Process each example
|
142
148
|
results = []
|
@@ -146,7 +152,9 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
146
152
|
question = dataset_instance.get_question(example)
|
147
153
|
correct_answer = dataset_instance.get_answer(example)
|
148
154
|
|
149
|
-
logger.info(
|
155
|
+
logger.info(
|
156
|
+
f"Processing {i + 1}/{len(examples)}: {question[:50]}..."
|
157
|
+
)
|
150
158
|
|
151
159
|
try:
|
152
160
|
# Format query based on dataset type
|
@@ -158,18 +166,25 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
158
166
|
# Create search config from system_config
|
159
167
|
search_params = {
|
160
168
|
"iterations": system_config.get("iterations", 3),
|
161
|
-
"questions_per_iteration": system_config.get(
|
162
|
-
|
169
|
+
"questions_per_iteration": system_config.get(
|
170
|
+
"questions_per_iteration", 3
|
171
|
+
),
|
172
|
+
"search_tool": system_config.get(
|
173
|
+
"search_tool", "searxng"
|
174
|
+
),
|
163
175
|
# Note: search_strategy is stored in the config but not passed to quick_summary
|
164
176
|
# as it's not supported by the underlying API
|
165
177
|
}
|
166
178
|
|
167
179
|
# Get response from LDR
|
168
180
|
from local_deep_research.api import quick_summary
|
181
|
+
|
169
182
|
search_result = quick_summary(
|
170
183
|
query=formatted_query,
|
171
184
|
iterations=search_params.get("iterations"),
|
172
|
-
questions_per_iteration=search_params.get(
|
185
|
+
questions_per_iteration=search_params.get(
|
186
|
+
"questions_per_iteration"
|
187
|
+
),
|
173
188
|
search_tool=search_params.get("search_tool"),
|
174
189
|
)
|
175
190
|
|
@@ -181,7 +196,10 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
181
196
|
|
182
197
|
# Extract structured answer
|
183
198
|
from ..graders import extract_answer_from_response
|
184
|
-
|
199
|
+
|
200
|
+
extracted = extract_answer_from_response(
|
201
|
+
response, "simpleqa"
|
202
|
+
)
|
185
203
|
|
186
204
|
# Format result
|
187
205
|
result = {
|
@@ -224,7 +242,8 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
224
242
|
|
225
243
|
# Grade results
|
226
244
|
from ..graders import grade_results
|
227
|
-
|
245
|
+
|
246
|
+
grade_results(
|
228
247
|
results_file=results_file,
|
229
248
|
output_file=evaluation_file,
|
230
249
|
dataset_type="simpleqa",
|
@@ -244,9 +263,13 @@ class SimpleQAEvaluator(BaseBenchmarkEvaluator):
|
|
244
263
|
"Dataset": "SimpleQA",
|
245
264
|
"Examples": len(examples),
|
246
265
|
"Iterations": search_params.get("iterations", 3),
|
247
|
-
"Questions per iteration": search_params.get(
|
266
|
+
"Questions per iteration": search_params.get(
|
267
|
+
"questions_per_iteration", 3
|
268
|
+
),
|
248
269
|
"Search tool": search_params.get("search_tool", "searxng"),
|
249
|
-
"Search strategy": search_params.get(
|
270
|
+
"Search strategy": search_params.get(
|
271
|
+
"search_strategy", "source_based"
|
272
|
+
),
|
250
273
|
},
|
251
274
|
)
|
252
275
|
|
@@ -59,7 +59,9 @@ def get_evaluation_llm(custom_config: Optional[Dict[str, Any]] = None):
|
|
59
59
|
"api_key",
|
60
60
|
}
|
61
61
|
|
62
|
-
filtered_config = {
|
62
|
+
filtered_config = {
|
63
|
+
k: v for k, v in config.items() if k in ldr_supported_params
|
64
|
+
}
|
63
65
|
|
64
66
|
# Check if we're using openai_endpoint but don't have an API key configured
|
65
67
|
if filtered_config.get("provider") == "openai_endpoint":
|
@@ -182,7 +184,9 @@ def grade_results(
|
|
182
184
|
|
183
185
|
try:
|
184
186
|
# Grade using LLM
|
185
|
-
if hasattr(evaluation_llm, "invoke") and callable(
|
187
|
+
if hasattr(evaluation_llm, "invoke") and callable(
|
188
|
+
evaluation_llm.invoke
|
189
|
+
):
|
186
190
|
if hasattr(evaluation_llm, "chat_messages"):
|
187
191
|
# Handle ChatOpenAI and similar models that use messages
|
188
192
|
grading_response = evaluation_llm.invoke(
|
@@ -214,7 +218,9 @@ def grade_results(
|
|
214
218
|
grading_response,
|
215
219
|
re.DOTALL,
|
216
220
|
)
|
217
|
-
reasoning =
|
221
|
+
reasoning = (
|
222
|
+
reasoning_match.group(1).strip() if reasoning_match else ""
|
223
|
+
)
|
218
224
|
|
219
225
|
correct_match = re.search(
|
220
226
|
r"correct:\s*(yes|no)", grading_response, re.IGNORECASE
|
@@ -225,8 +231,12 @@ def grade_results(
|
|
225
231
|
else False
|
226
232
|
)
|
227
233
|
|
228
|
-
confidence_match = re.search(
|
229
|
-
|
234
|
+
confidence_match = re.search(
|
235
|
+
r"confidence:\s*(\d+)", grading_response
|
236
|
+
)
|
237
|
+
confidence = (
|
238
|
+
confidence_match.group(1) if confidence_match else "100"
|
239
|
+
)
|
230
240
|
else:
|
231
241
|
# SimpleQA extraction
|
232
242
|
extracted_answer_match = re.search(
|
@@ -239,9 +249,13 @@ def grade_results(
|
|
239
249
|
)
|
240
250
|
|
241
251
|
reasoning_match = re.search(
|
242
|
-
r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)",
|
252
|
+
r"Reasoning:\s*(.*?)(?:\nCorrect:|\Z)",
|
253
|
+
grading_response,
|
254
|
+
re.DOTALL,
|
255
|
+
)
|
256
|
+
reasoning = (
|
257
|
+
reasoning_match.group(1).strip() if reasoning_match else ""
|
243
258
|
)
|
244
|
-
reasoning = reasoning_match.group(1).strip() if reasoning_match else ""
|
245
259
|
|
246
260
|
correct_match = re.search(
|
247
261
|
r"Correct:\s*(yes|no)", grading_response, re.IGNORECASE
|
@@ -304,7 +318,11 @@ def grade_results(
|
|
304
318
|
progress_callback(
|
305
319
|
idx,
|
306
320
|
len(results),
|
307
|
-
{
|
321
|
+
{
|
322
|
+
"status": "error",
|
323
|
+
"error": str(e),
|
324
|
+
"result": error_result,
|
325
|
+
},
|
308
326
|
)
|
309
327
|
|
310
328
|
accuracy = correct_count / len(results) if results else 0
|
@@ -366,7 +384,9 @@ def human_evaluation(
|
|
366
384
|
# Get human judgment
|
367
385
|
while True:
|
368
386
|
judgment = (
|
369
|
-
input("\nIs the model's answer correct? (y/n): ")
|
387
|
+
input("\nIs the model's answer correct? (y/n): ")
|
388
|
+
.strip()
|
389
|
+
.lower()
|
370
390
|
)
|
371
391
|
if judgment in ["y", "n"]:
|
372
392
|
break
|
@@ -375,7 +395,9 @@ def human_evaluation(
|
|
375
395
|
is_correct = judgment == "y"
|
376
396
|
|
377
397
|
# Get reasoning
|
378
|
-
reasoning = input(
|
398
|
+
reasoning = input(
|
399
|
+
"Please provide reasoning for your judgment: "
|
400
|
+
).strip()
|
379
401
|
else:
|
380
402
|
# Non-interactive mode - placeholder for API/UI implementation
|
381
403
|
# In a real implementation, this would be filled by UI actions
|
@@ -11,7 +11,7 @@ import os
|
|
11
11
|
import tempfile
|
12
12
|
import time
|
13
13
|
from datetime import datetime
|
14
|
-
from typing import Any, Dict,
|
14
|
+
from typing import Any, Dict, Optional
|
15
15
|
|
16
16
|
logger = logging.getLogger(__name__)
|
17
17
|
|
@@ -50,7 +50,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
|
|
50
50
|
processing_times = [
|
51
51
|
r.get("processing_time", 0) for r in results if "processing_time" in r
|
52
52
|
]
|
53
|
-
avg_time =
|
53
|
+
avg_time = (
|
54
|
+
sum(processing_times) / len(processing_times) if processing_times else 0
|
55
|
+
)
|
54
56
|
|
55
57
|
# Average confidence if available
|
56
58
|
confidence_values = []
|
@@ -62,7 +64,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
|
|
62
64
|
pass
|
63
65
|
|
64
66
|
avg_confidence = (
|
65
|
-
sum(confidence_values) / len(confidence_values)
|
67
|
+
sum(confidence_values) / len(confidence_values)
|
68
|
+
if confidence_values
|
69
|
+
else 0
|
66
70
|
)
|
67
71
|
|
68
72
|
# Calculate error rate
|
@@ -100,7 +104,9 @@ def calculate_metrics(results_file: str) -> Dict[str, Any]:
|
|
100
104
|
"total": counts["total"],
|
101
105
|
"correct": counts["correct"],
|
102
106
|
"accuracy": (
|
103
|
-
counts["correct"] / counts["total"]
|
107
|
+
counts["correct"] / counts["total"]
|
108
|
+
if counts["total"]
|
109
|
+
else 0
|
104
110
|
),
|
105
111
|
}
|
106
112
|
metrics["categories"] = category_metrics
|
@@ -136,7 +142,9 @@ def evaluate_benchmark_quality(
|
|
136
142
|
# Create search configuration from system config
|
137
143
|
search_config = {
|
138
144
|
"iterations": system_config.get("iterations", 2),
|
139
|
-
"questions_per_iteration": system_config.get(
|
145
|
+
"questions_per_iteration": system_config.get(
|
146
|
+
"questions_per_iteration", 2
|
147
|
+
),
|
140
148
|
"search_strategy": system_config.get("search_strategy", "iterdrag"),
|
141
149
|
"search_tool": system_config.get("search_tool", "searxng"),
|
142
150
|
"model_name": system_config.get("model_name"),
|
@@ -174,7 +182,9 @@ def evaluate_benchmark_quality(
|
|
174
182
|
try:
|
175
183
|
shutil.rmtree(temp_dir)
|
176
184
|
except Exception as e:
|
177
|
-
logger.warning(
|
185
|
+
logger.warning(
|
186
|
+
f"Failed to clean up temporary directory: {str(e)}"
|
187
|
+
)
|
178
188
|
|
179
189
|
|
180
190
|
def measure_execution_time(
|
@@ -216,7 +226,7 @@ def measure_execution_time(
|
|
216
226
|
|
217
227
|
try:
|
218
228
|
for i in range(num_runs):
|
219
|
-
logger.info(f"Executing speed test run {i+1}/{num_runs}")
|
229
|
+
logger.info(f"Executing speed test run {i + 1}/{num_runs}")
|
220
230
|
start_time = time.time()
|
221
231
|
system.search(query, full_response=False)
|
222
232
|
end_time = time.time()
|
@@ -264,7 +274,9 @@ def calculate_quality_metrics(
|
|
264
274
|
"""
|
265
275
|
# Run quality evaluation
|
266
276
|
quality_results = evaluate_benchmark_quality(
|
267
|
-
system_config=system_config,
|
277
|
+
system_config=system_config,
|
278
|
+
num_examples=num_examples,
|
279
|
+
output_dir=output_dir,
|
268
280
|
)
|
269
281
|
|
270
282
|
# Return normalized quality score
|
@@ -337,7 +349,10 @@ def calculate_resource_metrics(
|
|
337
349
|
# Normalize to 0-1 scale (lower is better)
|
338
350
|
resource_score = 1.0 / (1.0 + (complexity / 4.0))
|
339
351
|
|
340
|
-
return {
|
352
|
+
return {
|
353
|
+
"resource_score": resource_score,
|
354
|
+
"estimated_complexity": complexity,
|
355
|
+
}
|
341
356
|
|
342
357
|
|
343
358
|
def calculate_combined_score(
|
@@ -382,4 +397,4 @@ def calculate_combined_score(
|
|
382
397
|
resource_score = metrics["resource"].get("resource_score", 0.0)
|
383
398
|
score += resource_score * norm_weights["resource"]
|
384
399
|
|
385
|
-
return score
|
400
|
+
return score
|
@@ -46,7 +46,9 @@ def generate_report(
|
|
46
46
|
# Sample up to 5 correct and 5 incorrect examples
|
47
47
|
correct_examples = [r for r in results if r.get("is_correct", False)][:5]
|
48
48
|
incorrect_examples = [
|
49
|
-
r
|
49
|
+
r
|
50
|
+
for r in results
|
51
|
+
if "is_correct" in r and not r.get("is_correct", False)
|
50
52
|
][:5]
|
51
53
|
|
52
54
|
# Create report
|
@@ -67,7 +69,9 @@ def generate_report(
|
|
67
69
|
)
|
68
70
|
|
69
71
|
if "average_confidence" in metrics:
|
70
|
-
report.append(
|
72
|
+
report.append(
|
73
|
+
f"- **Average Confidence**: {metrics['average_confidence']:.2f}%"
|
74
|
+
)
|
71
75
|
|
72
76
|
if "error_count" in metrics and metrics["error_count"] > 0:
|
73
77
|
report.append(f"- **Error Count**: {metrics['error_count']}")
|
@@ -152,4 +156,4 @@ def generate_report(
|
|
152
156
|
f.write("\n".join(report))
|
153
157
|
|
154
158
|
logger.info(f"Report saved to {output_file}")
|
155
|
-
return output_file
|
159
|
+
return output_file
|