local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +7 -9
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +47 -57
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +32 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/efficiency/speed_profiler.py

@@ -0,0 +1,214 @@
+"""
+Speed profiling tools for Local Deep Research.
+
+This module provides functionality for measuring execution time
+of different components and processes in the research system.
+"""
+
+import logging
+import time
+from contextlib import contextmanager
+from typing import Dict, List, Optional, Any, Callable
+
+logger = logging.getLogger(__name__)
+
+
+class SpeedProfiler:
+    """
+    Profiler for tracking execution speed of components.
+
+    This class provides methods for timing operations and
+    collecting performance statistics for later analysis.
+    """
+
+    def __init__(self):
+        """Initialize the profiler with empty timing data."""
+        self.timings = {}
+        self.current_timers = {}
+        self.total_start_time = None
+        self.total_end_time = None
+
+    def start(self):
+        """Start the global profiling session."""
+        self.timings = {}
+        self.current_timers = {}
+        self.total_start_time = time.time()
+
+    def stop(self):
+        """Stop the global profiling session."""
+        self.total_end_time = time.time()
+
+        # Stop any timers that are still running
+        for name in list(self.current_timers.keys()):
+            self.stop_timer(name)
+
+    def start_timer(self, name: str):
+        """
+        Start a named timer.
+
+        Args:
+            name: Name of the timer to start
+        """
+        if name in self.current_timers:
+            logger.warning(f"Timer '{name}' is already running. Restarting.")
+
+        self.current_timers[name] = time.time()
+
+    def stop_timer(self, name: str):
+        """
+        Stop a named timer and record the elapsed time.
+
+        Args:
+            name: Name of the timer to stop
+        """
+        if name not in self.current_timers:
+            logger.warning(f"Timer '{name}' was not started.")
+            return
+
+        elapsed = time.time() - self.current_timers[name]
+
+        if name not in self.timings:
+            self.timings[name] = {
+                "total": elapsed,
+                "count": 1,
+                "min": elapsed,
+                "max": elapsed,
+                "starts": [self.current_timers[name]],
+                "durations": [elapsed]
+            }
+        else:
+            self.timings[name]["total"] += elapsed
+            self.timings[name]["count"] += 1
+            self.timings[name]["min"] = min(self.timings[name]["min"], elapsed)
+            self.timings[name]["max"] = max(self.timings[name]["max"], elapsed)
+            self.timings[name]["starts"].append(self.current_timers[name])
+            self.timings[name]["durations"].append(elapsed)
+
+        del self.current_timers[name]
+
+    @contextmanager
+    def timer(self, name: str):
+        """
+        Context manager for timing a block of code.
+
+        Args:
+            name: Name of the timer
+
+        Example:
+            with profiler.timer("my_operation"):
+                # Code to time
+                do_something()
+        """
+        self.start_timer(name)
+        try:
+            yield
+        finally:
+            self.stop_timer(name)
+
+    def get_timings(self) -> Dict[str, Any]:
+        """
+        Get all recorded timings.
+
+        Returns:
+            Dictionary of timing data for all measured operations
+        """
+        result = self.timings.copy()
+
+        # Add averages
+        for name, data in result.items():
+            if data["count"] > 0:
+                data["avg"] = data["total"] / data["count"]
+
+        # Add total duration
+        if self.total_start_time is not None and self.total_end_time is not None:
+            result["total"] = {
+                "total": self.total_end_time - self.total_start_time,
+                "count": 1,
+                "min": self.total_end_time - self.total_start_time,
+                "max": self.total_end_time - self.total_start_time,
+                "avg": self.total_end_time - self.total_start_time,
+                "starts": [self.total_start_time],
+                "durations": [self.total_end_time - self.total_start_time]
+            }
+
+        return result
+
+    def get_summary(self) -> Dict[str, float]:
+        """
+        Get a summary of timing information.
+
+        Returns:
+            Dictionary with summary statistics
+        """
+        timings = self.get_timings()
+        summary = {}
+
+        # Total duration
+        if "total" in timings:
+            summary["total_duration"] = timings["total"]["total"]
+        elif self.total_start_time is not None and self.total_end_time is not None:
+            summary["total_duration"] = self.total_end_time - self.total_start_time
+        else:
+            summary["total_duration"] = sum(t["total"] for t in timings.values())
+
+        # Component durations
+        for name, data in timings.items():
+            if name != "total":
+                summary[f"{name}_duration"] = data["total"]
+                summary[f"{name}_percent"] = (
+                    data["total"] / summary["total_duration"] * 100
+                    if summary["total_duration"] > 0 else 0
+                )
+
+        # Per-operation breakdowns
+        for name, data in timings.items():
+            if data["count"] > 0:
+                summary[f"{name}_per_operation"] = data["total"] / data["count"]
+
+        return summary
+
+    def print_summary(self):
+        """Print a formatted summary of timing information."""
+        summary = self.get_summary()
+        total = summary.get("total_duration", 0)
+
+        print("\n===== SPEED PROFILE SUMMARY =====")
+        print(f"Total execution time: {total:.2f} seconds")
+        print("\n--- Component Breakdown ---")
+
+        # Print each component's timing
+        for name, data in self.timings.items():
+            if name != "total":
+                percent = data["total"] / total * 100 if total > 0 else 0
+                print(f"{name}: {data['total']:.2f}s ({percent:.1f}%) - "
+                      f"{data['count']} calls, avg {data['total'] / data['count']:.3f}s per call")
+
+        print("\n==============================")
+
+
+def time_function(func: Callable) -> Callable:
+    """
+    Decorator to time a function's execution.
+
+    Args:
+        func: Function to time
+
+    Returns:
+        Wrapped function that logs its execution time
+
+    Example:
+        @time_function
+        def my_slow_function():
+            # Some slow code
+            pass
+    """
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        elapsed = time.time() - start_time
+
+        logger.info(f"{func.__name__} took {elapsed:.3f} seconds")
+
+        return result
+
+    return wrapper
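For orientation, the new `SpeedProfiler` exposes both a context-manager API and a logging decorator. The following is a minimal usage sketch; the timer names and the `time.sleep` stand-ins are illustrative only, not part of the package.

```python
# Hedged usage sketch for the new SpeedProfiler and time_function.
import time

from local_deep_research.benchmarks.efficiency.speed_profiler import (
    SpeedProfiler,
    time_function,
)

profiler = SpeedProfiler()
profiler.start()

with profiler.timer("search"):      # records the block's duration under "search"
    time.sleep(0.2)                 # stand-in for a search call

with profiler.timer("synthesis"):
    time.sleep(0.1)                 # stand-in for answer synthesis

profiler.stop()
profiler.print_summary()            # per-component totals, percentages, call counts


@time_function                      # logs "example_step took X.XXX seconds" at INFO level
def example_step():
    time.sleep(0.05)


example_step()
```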
local_deep_research/benchmarks/evaluators/__init__.py

@@ -0,0 +1,18 @@
+"""
+Benchmark evaluation package.
+
+This package provides evaluators for different benchmark types and
+a composite evaluator for weighted multi-benchmark evaluation.
+"""
+
+from .base import BaseBenchmarkEvaluator
+from .browsecomp import BrowseCompEvaluator
+from .composite import CompositeBenchmarkEvaluator
+from .simpleqa import SimpleQAEvaluator
+
+__all__ = [
+    "BaseBenchmarkEvaluator",
+    "SimpleQAEvaluator",
+    "BrowseCompEvaluator",
+    "CompositeBenchmarkEvaluator",
+]
local_deep_research/benchmarks/evaluators/base.py

@@ -0,0 +1,74 @@
+"""
+Base class for benchmark evaluators.
+
+This module defines the abstract base class that all benchmark evaluators
+must implement, establishing a common interface for different benchmark types.
+"""
+
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict
+
+logger = logging.getLogger(__name__)
+
+
+class BaseBenchmarkEvaluator(ABC):
+    """
+    Abstract base class for benchmark evaluators.
+
+    All benchmark evaluator implementations must inherit from this class and
+    implement the evaluate method to run their specific benchmark type.
+    """
+
+    def __init__(self, name: str):
+        """
+        Initialize benchmark evaluator with a name.
+
+        Args:
+            name: Unique identifier for this benchmark type
+        """
+        self.name = name
+
+    def get_name(self) -> str:
+        """
+        Get the benchmark name.
+
+        Returns:
+            The benchmark identifier
+        """
+        return self.name
+
+    @abstractmethod
+    def evaluate(
+        self,
+        system_config: Dict[str, Any],
+        num_examples: int,
+        output_dir: str,
+    ) -> Dict[str, Any]:
+        """
+        Run benchmark evaluation with given system configuration.
+
+        Args:
+            system_config: Configuration parameters for the system under test
+            num_examples: Number of benchmark examples to evaluate
+            output_dir: Directory to save evaluation results
+
+        Returns:
+            Dictionary with evaluation metrics including quality_score (0-1)
+        """
+        pass
+
+    def _create_subdirectory(self, output_dir: str) -> str:
+        """
+        Create a benchmark-specific subdirectory for output.
+
+        Args:
+            output_dir: Parent directory for output
+
+        Returns:
+            Path to the benchmark-specific directory
+        """
+        benchmark_dir = os.path.join(output_dir, self.name)
+        os.makedirs(benchmark_dir, exist_ok=True)
+        return benchmark_dir
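The abstract interface above shows how additional benchmarks would plug in: subclass, pick a name, and return a `quality_score` in [0, 1]. A rough sketch follows; the `myqa` benchmark and its placeholder scoring are hypothetical and not defined anywhere in the package.

```python
# Rough sketch of a custom evaluator built on the new base class.
# "myqa" and the scoring logic are hypothetical, not part of the package.
import json
import os
from typing import Any, Dict

from local_deep_research.benchmarks.evaluators import BaseBenchmarkEvaluator


class MyQAEvaluator(BaseBenchmarkEvaluator):
    def __init__(self):
        super().__init__("myqa")

    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        # _create_subdirectory() yields <output_dir>/myqa, created on demand
        benchmark_dir = self._create_subdirectory(output_dir)

        # Placeholder score; a real evaluator would run its benchmark here
        accuracy = 0.0

        with open(os.path.join(benchmark_dir, "results.json"), "w") as f:
            json.dump({"accuracy": accuracy, "num_examples": num_examples}, f)

        # quality_score in [0, 1] is what the composite evaluator consumes
        return {
            "benchmark_type": self.name,
            "accuracy": accuracy,
            "quality_score": accuracy,
        }
```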
local_deep_research/benchmarks/evaluators/browsecomp.py

@@ -0,0 +1,83 @@
+"""
+BrowseComp benchmark evaluator.
+
+This module provides a benchmark evaluator implementation for the BrowseComp
+benchmark, which tests browsing comprehension capabilities.
+"""
+
+import logging
+from typing import Any, Dict
+
+from ..runners import run_browsecomp_benchmark
+from .base import BaseBenchmarkEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class BrowseCompEvaluator(BaseBenchmarkEvaluator):
+    """
+    Evaluator for the BrowseComp benchmark.
+
+    This evaluator runs the BrowseComp benchmark, which tests a system's ability
+    to accurately comprehend and answer questions from web browsing.
+    """
+
+    def __init__(self):
+        """Initialize the BrowseComp evaluator."""
+        super().__init__("browsecomp")
+
+    def evaluate(
+        self,
+        system_config: Dict[str, Any],
+        num_examples: int,
+        output_dir: str,
+    ) -> Dict[str, Any]:
+        """
+        Run BrowseComp benchmark and return metrics.
+
+        Args:
+            system_config: Search and LLM configuration parameters
+            num_examples: Number of benchmark examples to run
+            output_dir: Directory to save evaluation results
+
+        Returns:
+            Dictionary with metrics including quality_score based on accuracy
+        """
+        # Create benchmark-specific directory
+        benchmark_dir = self._create_subdirectory(output_dir)
+
+        # Log benchmark execution
+        logger.info(f"Running BrowseComp benchmark with {num_examples} examples")
+
+        try:
+            # Run BrowseComp benchmark
+            results = run_browsecomp_benchmark(
+                num_examples=num_examples,
+                output_dir=benchmark_dir,
+                search_config=system_config,
+                run_evaluation=True,
+            )
+
+            # Extract metrics
+            metrics = results.get("metrics", {})
+            accuracy = metrics.get("accuracy", 0.0)
+
+            # Return evaluation results with quality score
+            return {
+                "benchmark_type": self.name,
+                "accuracy": accuracy,
+                "quality_score": accuracy,  # Map accuracy directly to quality score
+                "raw_results": results,
+                "report_path": results.get("report_path"),
+            }
+
+        except Exception as e:
+            logger.error(f"Error in BrowseComp evaluation: {str(e)}")
+
+            # Return error information
+            return {
+                "benchmark_type": self.name,
+                "error": str(e),
+                "quality_score": 0.0,
+                "accuracy": 0.0,
+            }
local_deep_research/benchmarks/evaluators/composite.py

@@ -0,0 +1,121 @@
+"""
+Composite benchmark evaluator.
+
+This module provides a composite evaluator that can run multiple benchmarks
+with weighted scores to provide a comprehensive evaluation.
+"""
+
+import logging
+from typing import Any, Dict, Optional
+
+# Import specific evaluator implementations
+from .browsecomp import BrowseCompEvaluator
+from .simpleqa import SimpleQAEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class CompositeBenchmarkEvaluator:
+    """
+    Evaluator that combines multiple benchmarks with weighted scores.
+
+    This evaluator runs multiple benchmark types and combines their scores
+    according to specified weights, enabling comprehensive evaluation across
+    different metrics and tasks.
+    """
+
+    def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None):
+        """
+        Initialize with benchmark weights.
+
+        Args:
+            benchmark_weights: Dictionary mapping benchmark names to weights
+                Default: {"simpleqa": 1.0}
+        """
+        # Default to SimpleQA only if no weights provided
+        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
+
+        # Create evaluators for available benchmarks
+        self.evaluators = {
+            "simpleqa": SimpleQAEvaluator(),
+            "browsecomp": BrowseCompEvaluator(),
+        }
+
+        # Normalize weights to sum to 1.0
+        total_weight = sum(self.benchmark_weights.values())
+        if total_weight <= 0:
+            logger.warning(
+                "Total benchmark weight is zero or negative. Using default weights."
+            )
+            self.normalized_weights = {"simpleqa": 1.0}
+        else:
+            self.normalized_weights = {
+                k: w / total_weight for k, w in self.benchmark_weights.items()
+            }
+
+        # Log the weights being used
+        logger.info(f"Using normalized benchmark weights: {self.normalized_weights}")
+
+    def evaluate(
+        self,
+        system_config: Dict[str, Any],
+        num_examples: int,
+        output_dir: str,
+    ) -> Dict[str, Any]:
+        """
+        Run all requested benchmarks and compute weighted score.
+
+        Args:
+            system_config: Configuration parameters for the system under test
+            num_examples: Number of benchmark examples to evaluate
+            output_dir: Directory to save evaluation results
+
+        Returns:
+            Dictionary with combined metrics and individual benchmark results
+        """
+        all_results = {}
+        combined_score = 0.0
+
+        # Run each benchmark with weight > 0
+        for benchmark_name, weight in self.normalized_weights.items():
+            if weight > 0 and benchmark_name in self.evaluators:
+                evaluator = self.evaluators[benchmark_name]
+
+                try:
+                    # Run benchmark evaluation
+                    result = evaluator.evaluate(
+                        system_config=system_config,
+                        num_examples=num_examples,
+                        output_dir=output_dir,
+                    )
+
+                    # Store individual results
+                    all_results[benchmark_name] = result
+
+                    # Calculate weighted contribution to combined score
+                    quality_score = result.get("quality_score", 0.0)
+                    weighted_contribution = quality_score * weight
+
+                    logger.info(
+                        f"Benchmark {benchmark_name}: score={quality_score:.4f}, "
+                        f"weight={weight:.2f}, contribution={weighted_contribution:.4f}"
+                    )
+
+                    # Add to combined score
+                    combined_score += weighted_contribution
+
+                except Exception as e:
+                    logger.error(f"Error running {benchmark_name} benchmark: {str(e)}")
+                    all_results[benchmark_name] = {
+                        "benchmark_type": benchmark_name,
+                        "error": str(e),
+                        "quality_score": 0.0,
+                    }
+
+        # Return combined results
+        return {
+            "quality_score": combined_score,
+            "benchmark_results": all_results,
+            "benchmark_weights": self.normalized_weights,
+            "combined_score": combined_score,
+        }
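Taken together, the composite evaluator normalizes whatever weights it is given and sums weight × quality_score across benchmarks. A small hedged sketch is below; the weights, example count, output directory, and `system_config` keys are arbitrary illustrations, and actually running it would invoke the SimpleQA/BrowseComp runners, which in turn require a configured LLM and search backend.

```python
# Hedged usage sketch for CompositeBenchmarkEvaluator; all concrete values
# here (weights, config keys, paths) are illustrative, not package defaults.
from local_deep_research.benchmarks.evaluators import CompositeBenchmarkEvaluator

# Raw weights 3:1 are normalized internally to {"simpleqa": 0.75, "browsecomp": 0.25}
evaluator = CompositeBenchmarkEvaluator(
    benchmark_weights={"simpleqa": 3.0, "browsecomp": 1.0}
)

results = evaluator.evaluate(
    system_config={"iterations": 2},  # illustrative config; passed through to the runners
    num_examples=10,
    output_dir="benchmark_output",
)

# combined quality_score = 0.75 * simpleqa_score + 0.25 * browsecomp_score
print(results["quality_score"])
print(results["benchmark_weights"])
```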