local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +7 -9
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +47 -57
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +32 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/metrics/README.md
@@ -0,0 +1,80 @@
# Unified Metrics Module

This module provides a unified approach to metrics calculation, reporting, and visualization for both standard benchmarks and parameter optimization.

## Overview

The metrics module consists of three primary components:

1. **Calculation**: Core functions for calculating metrics from benchmark results and system configurations
2. **Reporting**: Functions for generating detailed reports from benchmark results
3. **Visualization**: Utilities for creating visualizations of optimization results

## Usage

### Basic Metrics Calculation

```python
from local_deep_research.benchmarks.metrics import calculate_metrics

# Calculate metrics from a results file
metrics = calculate_metrics("path/to/results.jsonl")
```

### Generating Reports

```python
from local_deep_research.benchmarks.metrics import generate_report

# Generate a detailed report
report_path = generate_report(
    metrics=metrics,
    results_file="path/to/results.jsonl",
    output_file="report.md",
    dataset_name="SimpleQA",
    config_info={"Dataset": "SimpleQA", "Examples": 100}
)
```

### Optimization Metrics

```python
from local_deep_research.benchmarks.metrics import (
    calculate_quality_metrics,
    calculate_speed_metrics,
    calculate_resource_metrics,
    calculate_combined_score
)

# Calculate quality metrics for a configuration
quality_metrics = calculate_quality_metrics(
    system_config={"iterations": 3, "questions_per_iteration": 3}
)

# Calculate a combined score using multiple metrics
combined_score = calculate_combined_score(
    metrics={
        "quality": quality_metrics,
        "speed": speed_metrics,
        "resource": resource_metrics
    },
    weights={"quality": 0.6, "speed": 0.3, "resource": 0.1}
)
```

### Visualization

```python
from local_deep_research.benchmarks.metrics.visualization import (
    plot_optimization_history,
    plot_parameter_importance,
    plot_quality_vs_speed
)

# Plot optimization history
fig = plot_optimization_history(
    trial_values=[0.5, 0.6, 0.7, 0.65, 0.8],
    best_values=[0.5, 0.6, 0.7, 0.7, 0.8],
    output_file="optimization_history.png"
)
```
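The "Optimization Metrics" example in this README uses `speed_metrics` and `resource_metrics` without constructing them. Below is a minimal end-to-end sketch based on the function signatures added in `calculation.py` later in this diff; note that the quality and speed calculators actually run SimpleQA examples and live searches, so they require a working LLM and search backend, and the configuration values here are illustrative only.

```python
from local_deep_research.benchmarks.metrics import (
    calculate_quality_metrics,
    calculate_speed_metrics,
    calculate_resource_metrics,
    calculate_combined_score,
)

# Illustrative configuration; missing keys fall back to the defaults
# used in calculation.py (iterations=2, questions_per_iteration=2, searxng, ...).
config = {"iterations": 2, "questions_per_iteration": 2, "search_tool": "searxng"}

quality_metrics = calculate_quality_metrics(system_config=config)    # runs SimpleQA examples
speed_metrics = calculate_speed_metrics(system_config=config)        # times real searches
resource_metrics = calculate_resource_metrics(system_config=config)  # heuristic, no searches

combined_score = calculate_combined_score(
    metrics={
        "quality": quality_metrics,
        "speed": speed_metrics,
        "resource": resource_metrics,
    },
    weights={"quality": 0.6, "speed": 0.3, "resource": 0.1},
)
print(f"Combined score: {combined_score:.3f}")
```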
local_deep_research/benchmarks/metrics/__init__.py
@@ -0,0 +1,24 @@
"""
Unified metrics module for benchmark evaluation and optimization.

This package provides metrics calculation, reporting, and visualization
functionality for both regular benchmarks and parameter optimization.
"""

from .calculation import (
    calculate_combined_score,
    calculate_metrics,
    calculate_quality_metrics,
    calculate_resource_metrics,
    calculate_speed_metrics,
)
from .reporting import generate_report

__all__ = [
    "calculate_metrics",
    "calculate_quality_metrics",
    "calculate_speed_metrics",
    "calculate_resource_metrics",
    "calculate_combined_score",
    "generate_report",
]
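As this `__init__.py` shows, only the calculation and reporting helpers are re-exported at the package root; the plotting utilities are not, which is why the README imports them from the `visualization` submodule. A short illustration of the two import paths:

```python
# Re-exported at the package root via __all__:
from local_deep_research.benchmarks.metrics import calculate_metrics, generate_report

# Plotting helpers stay in the submodule and are imported from there,
# as in the README above:
from local_deep_research.benchmarks.metrics.visualization import plot_optimization_history
```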
local_deep_research/benchmarks/metrics/calculation.py
@@ -0,0 +1,385 @@
"""
Unified metrics calculation module.

This module provides functions for calculating metrics for both
standard benchmarks and optimization tasks.
"""

import json
import logging
import os
import tempfile
import time
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

logger = logging.getLogger(__name__)


def calculate_metrics(results_file: str) -> Dict[str, Any]:
    """
    Calculate evaluation metrics from results.

    Args:
        results_file: Path to results file

    Returns:
        Dictionary of metrics
    """
    # Load results
    results = []
    try:
        with open(results_file, "r") as f:
            for line in f:
                if line.strip():
                    results.append(json.loads(line))
    except Exception as e:
        logger.error(f"Error loading results file: {e}")
        return {"error": str(e)}

    if not results:
        return {"error": "No results found"}

    # Calculate accuracy
    graded_results = [r for r in results if "is_correct" in r]
    correct_count = sum(1 for r in graded_results if r.get("is_correct", False))
    total_graded = len(graded_results)
    accuracy = correct_count / total_graded if total_graded else 0

    # Calculate average processing time if available
    processing_times = [
        r.get("processing_time", 0) for r in results if "processing_time" in r
    ]
    avg_time = sum(processing_times) / len(processing_times) if processing_times else 0

    # Average confidence if available
    confidence_values = []
    for r in results:
        if "confidence" in r and r["confidence"]:
            try:
                confidence_values.append(int(r["confidence"]))
            except (ValueError, TypeError):
                pass

    avg_confidence = (
        sum(confidence_values) / len(confidence_values) if confidence_values else 0
    )

    # Calculate error rate
    error_count = sum(1 for r in results if "error" in r)
    error_rate = error_count / len(results) if results else 0

    # Basic metrics
    metrics = {
        "total_examples": len(results),
        "graded_examples": total_graded,
        "correct": correct_count,
        "accuracy": accuracy,
        "average_processing_time": avg_time,
        "average_confidence": avg_confidence,
        "error_count": error_count,
        "error_rate": error_rate,
        "timestamp": datetime.now().isoformat(),
    }

    # If we have category information, calculate per-category metrics
    categories = {}
    for r in graded_results:
        if "category" in r:
            category = r["category"]
            if category not in categories:
                categories[category] = {"total": 0, "correct": 0}
            categories[category]["total"] += 1
            if r.get("is_correct", False):
                categories[category]["correct"] += 1

    if categories:
        category_metrics = {}
        for category, counts in categories.items():
            category_metrics[category] = {
                "total": counts["total"],
                "correct": counts["correct"],
                "accuracy": (
                    counts["correct"] / counts["total"] if counts["total"] else 0
                ),
            }
        metrics["categories"] = category_metrics

    return metrics


def evaluate_benchmark_quality(
    system_config: Dict[str, Any],
    num_examples: int = 10,
    output_dir: Optional[str] = None,
) -> Dict[str, float]:
    """
    Evaluate quality using SimpleQA benchmark.

    Args:
        system_config: Configuration parameters to evaluate
        num_examples: Number of benchmark examples to use
        output_dir: Directory to save results (temporary if None)

    Returns:
        Dictionary with benchmark metrics
    """
    from ..runners import run_simpleqa_benchmark

    # Create temporary directory if not provided
    temp_dir = None
    if output_dir is None:
        temp_dir = tempfile.mkdtemp(prefix="ldr_benchmark_")
        output_dir = temp_dir

    try:
        # Create search configuration from system config
        search_config = {
            "iterations": system_config.get("iterations", 2),
            "questions_per_iteration": system_config.get("questions_per_iteration", 2),
            "search_strategy": system_config.get("search_strategy", "iterdrag"),
            "search_tool": system_config.get("search_tool", "searxng"),
            "model_name": system_config.get("model_name"),
            "provider": system_config.get("provider"),
        }

        # Run benchmark
        logger.info(f"Running SimpleQA benchmark with {num_examples} examples")
        benchmark_results = run_simpleqa_benchmark(
            num_examples=num_examples,
            output_dir=output_dir,
            search_config=search_config,
            run_evaluation=True,
        )

        # Extract key metrics
        metrics = benchmark_results.get("metrics", {})
        accuracy = metrics.get("accuracy", 0.0)

        # Return only the most relevant metrics
        return {
            "accuracy": accuracy,
            "quality_score": accuracy,  # Map accuracy directly to quality score
        }

    except Exception as e:
        logger.error(f"Error in benchmark evaluation: {str(e)}")
        return {"accuracy": 0.0, "quality_score": 0.0, "error": str(e)}

    finally:
        # Clean up temporary directory if we created it
        if temp_dir and os.path.exists(temp_dir):
            import shutil

            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                logger.warning(f"Failed to clean up temporary directory: {str(e)}")


def measure_execution_time(
    system_config: Dict[str, Any],
    query: str = "test query",
    search_tool: Optional[str] = None,
    num_runs: int = 1,
) -> Dict[str, float]:
    """
    Measure execution time for a given configuration.

    Args:
        system_config: Configuration parameters to evaluate
        query: Query to use for timing tests
        search_tool: Override search tool
        num_runs: Number of runs to average time over

    Returns:
        Dictionary with speed metrics
    """
    from local_deep_research.search_system import SearchSystem

    if search_tool:
        system_config["search_tool"] = search_tool

    # Configure system
    system = SearchSystem(
        iterations=system_config.get("iterations", 2),
        questions_per_iteration=system_config.get("questions_per_iteration", 2),
        search_strategy=system_config.get("search_strategy", "iterdrag"),
        search_tool=system_config.get("search_tool", "searxng"),
        model_name=system_config.get("model_name"),
        provider=system_config.get("provider"),
    )

    # Run multiple times and calculate average
    total_time = 0
    times = []

    try:
        for i in range(num_runs):
            logger.info(f"Executing speed test run {i+1}/{num_runs}")
            start_time = time.time()
            system.search(query, full_response=False)
            end_time = time.time()
            run_time = end_time - start_time
            times.append(run_time)
            total_time += run_time

        # Calculate metrics
        average_time = total_time / num_runs

        # Calculate speed score (0-1 scale, lower times are better)
        # Using sigmoid-like normalization where:
        # - Times around 30s get ~0.5 score
        # - Times under 10s get >0.8 score
        # - Times over 2min get <0.2 score
        speed_score = 1.0 / (1.0 + (average_time / 30.0))

        return {
            "average_time": average_time,
            "min_time": min(times),
            "max_time": max(times),
            "speed_score": speed_score,
        }

    except Exception as e:
        logger.error(f"Error in speed measurement: {str(e)}")
        return {"average_time": 0.0, "speed_score": 0.0, "error": str(e)}


def calculate_quality_metrics(
    system_config: Dict[str, Any],
    num_examples: int = 2,  # Reduced for quicker demo
    output_dir: Optional[str] = None,
) -> Dict[str, float]:
    """
    Calculate quality-related metrics for a configuration.

    Args:
        system_config: Configuration parameters to evaluate
        num_examples: Number of benchmark examples to use
        output_dir: Directory to save results (temporary if None)

    Returns:
        Dictionary with quality metrics
    """
    # Run quality evaluation
    quality_results = evaluate_benchmark_quality(
        system_config=system_config, num_examples=num_examples, output_dir=output_dir
    )

    # Return normalized quality score
    return {
        "quality_score": quality_results.get("quality_score", 0.0),
        "accuracy": quality_results.get("accuracy", 0.0),
    }


def calculate_speed_metrics(
    system_config: Dict[str, Any],
    query: str = "test query",
    search_tool: Optional[str] = None,
    num_runs: int = 1,
) -> Dict[str, float]:
    """
    Calculate speed-related metrics for a configuration.

    Args:
        system_config: Configuration parameters to evaluate
        query: Query to use for timing tests
        search_tool: Override search tool
        num_runs: Number of runs to average time over

    Returns:
        Dictionary with speed metrics
    """
    # Run speed measurement
    speed_results = measure_execution_time(
        system_config=system_config,
        query=query,
        search_tool=search_tool,
        num_runs=num_runs,
    )

    # Return normalized speed score
    return {
        "speed_score": speed_results.get("speed_score", 0.0),
        "average_time": speed_results.get("average_time", 0.0),
    }


def calculate_resource_metrics(
    system_config: Dict[str, Any],
    query: str = "test query",
    search_tool: Optional[str] = None,
) -> Dict[str, float]:
    """
    Calculate resource usage metrics for a configuration.

    Args:
        system_config: Configuration parameters to evaluate
        query: Query to use for resource tests
        search_tool: Override search tool

    Returns:
        Dictionary with resource metrics
    """
    # This is a simplified version - in a real implementation,
    # you would measure memory usage, API call counts, etc.

    # For now, we'll use a heuristic based on configuration values
    iterations = system_config.get("iterations", 2)
    questions = system_config.get("questions_per_iteration", 2)
    max_results = system_config.get("max_results", 50)

    # Simple heuristic: more iterations, questions, and results = more resources
    complexity = iterations * questions * (max_results / 50)

    # Normalize to 0-1 scale (lower is better)
    resource_score = 1.0 / (1.0 + (complexity / 4.0))

    return {"resource_score": resource_score, "estimated_complexity": complexity}


def calculate_combined_score(
    metrics: Dict[str, Dict[str, float]], weights: Dict[str, float] = None
) -> float:
    """
    Calculate a combined optimization score from multiple metrics.

    Args:
        metrics: Dictionary of metric categories and their values
        weights: Dictionary of weights for each metric category

    Returns:
        Combined score between 0 and 1
    """
    # Default weights if not provided
    if weights is None:
        weights = {"quality": 0.6, "speed": 0.3, "resource": 0.1}

    # Normalize weights to sum to 1
    total_weight = sum(weights.values())
    if total_weight == 0:
        return 0.0

    norm_weights = {k: v / total_weight for k, v in weights.items()}

    # Calculate weighted score
    score = 0.0

    # Quality component
    if "quality" in metrics and "quality" in norm_weights:
        quality_score = metrics["quality"].get("quality_score", 0.0)
        score += quality_score * norm_weights["quality"]

    # Speed component
    if "speed" in metrics and "speed" in norm_weights:
        speed_score = metrics["speed"].get("speed_score", 0.0)
        score += speed_score * norm_weights["speed"]

    # Resource component
    if "resource" in metrics and "resource" in norm_weights:
        resource_score = metrics["resource"].get("resource_score", 0.0)
        score += resource_score * norm_weights["resource"]

    return score
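To make the scoring in this file concrete, here is a small worked check (editor's arithmetic, not part of the package): a 30-second average search gives speed_score = 1 / (1 + 30/30) = 0.5; the default configuration (iterations=2, questions_per_iteration=2, max_results=50) gives complexity = 4 and resource_score = 1 / (1 + 4/4) = 0.5; an accuracy of 0.8 maps directly to quality_score = 0.8. With the default weights the combined score is 0.6 * 0.8 + 0.3 * 0.5 + 0.1 * 0.5 = 0.68.

```python
from local_deep_research.benchmarks.metrics import calculate_combined_score

# Hand-built metric dicts standing in for real benchmark output.
metrics = {
    "quality": {"quality_score": 0.8},    # e.g. 80% accuracy on SimpleQA
    "speed": {"speed_score": 0.5},        # 30 s average -> 1 / (1 + 30/30)
    "resource": {"resource_score": 0.5},  # default config -> complexity 4
}

# Default weights {quality: 0.6, speed: 0.3, resource: 0.1} give
# 0.6 * 0.8 + 0.3 * 0.5 + 0.1 * 0.5 = 0.68
print(f"{calculate_combined_score(metrics):.2f}")  # 0.68
```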
local_deep_research/benchmarks/metrics/reporting.py
@@ -0,0 +1,155 @@
"""
Report generation for benchmark results.

This module provides functions for generating detailed reports from benchmark results.
"""

import json
import logging
from datetime import datetime
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)


def generate_report(
    metrics: Dict[str, Any],
    results_file: str,
    output_file: str = "evaluation_report.md",
    dataset_name: str = "Unknown",
    config_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Generate a detailed report from evaluation results.

    Args:
        metrics: Dictionary of evaluation metrics
        results_file: Path to results file
        output_file: Path to save report
        dataset_name: Name of dataset
        config_info: Optional configuration information

    Returns:
        Path to the generated report file
    """
    # Load a sample of results for examples
    results = []
    try:
        with open(results_file, "r") as f:
            for line in f:
                if line.strip():
                    results.append(json.loads(line))
    except Exception as e:
        logger.error(f"Error loading results for report: {e}")
        results = []

    # Sample up to 5 correct and 5 incorrect examples
    correct_examples = [r for r in results if r.get("is_correct", False)][:5]
    incorrect_examples = [
        r for r in results if "is_correct" in r and not r.get("is_correct", False)
    ][:5]

    # Create report
    report = [
        f"# Evaluation Report: {dataset_name}",
        "",
        "## Summary",
        "",
        f"- **Total Examples**: {metrics.get('total_examples', 0)}",
        f"- **Graded Examples**: {metrics.get('graded_examples', 0)}",
        f"- **Correct Answers**: {metrics.get('correct', 0)}",
        f"- **Accuracy**: {metrics.get('accuracy', 0):.3f}",
    ]

    if "average_processing_time" in metrics:
        report.append(
            f"- **Average Processing Time**: {metrics['average_processing_time']:.2f} seconds"
        )

    if "average_confidence" in metrics:
        report.append(f"- **Average Confidence**: {metrics['average_confidence']:.2f}%")

    if "error_count" in metrics and metrics["error_count"] > 0:
        report.append(f"- **Error Count**: {metrics['error_count']}")
        report.append(f"- **Error Rate**: {metrics['error_rate']:.3f}")

    report.append("")

    # Add per-category metrics if available
    if "categories" in metrics:
        report.extend(["## Category Performance", ""])

        for category, category_metrics in metrics["categories"].items():
            report.append(f"### {category}")
            report.append("")
            report.append(f"- **Total**: {category_metrics['total']}")
            report.append(f"- **Correct**: {category_metrics['correct']}")
            report.append(f"- **Accuracy**: {category_metrics['accuracy']:.3f}")
            report.append("")

    # Add configuration info if provided
    if config_info:
        report.extend(["## Configuration", ""])

        for key, value in config_info.items():
            report.append(f"- **{key}**: {value}")

        report.append("")

    # Add example sections
    if correct_examples:
        report.extend(["## Example Correct Answers", ""])

        for idx, example in enumerate(correct_examples):
            report.extend(
                [
                    f"### Example {idx + 1}",
                    "",
                    f"**Question**: {example.get('problem', '')}",
                    "",
                    f"**Correct Answer**: {example.get('correct_answer', '')}",
                    "",
                    f"**Model Answer**: {example.get('extracted_answer', '')}",
                    "",
                    f"**Reasoning**: {example.get('reasoning', '')}",
                    "",
                ]
            )

    if incorrect_examples:
        report.extend(["## Example Incorrect Answers", ""])

        for idx, example in enumerate(incorrect_examples):
            report.extend(
                [
                    f"### Example {idx + 1}",
                    "",
                    f"**Question**: {example.get('problem', '')}",
                    "",
                    f"**Correct Answer**: {example.get('correct_answer', '')}",
                    "",
                    f"**Model Answer**: {example.get('extracted_answer', '')}",
                    "",
                    f"**Reasoning**: {example.get('reasoning', '')}",
                    "",
                ]
            )

    # Add timestamp
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    report.extend(
        [
            "## Metadata",
            "",
            f"- **Generated**: {timestamp}",
            f"- **Dataset**: {dataset_name}",
            "",
        ]
    )

    # Write report to file
    with open(output_file, "w") as f:
        f.write("\n".join(report))

    logger.info(f"Report saved to {output_file}")
    return output_file