local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/api/benchmark_functions.py
@@ -0,0 +1,288 @@
+"""
+API functions for benchmarking.
+
+This module provides functions for running benchmarks programmatically.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from ..benchmarks import (
+    calculate_metrics,
+    generate_report,
+    run_benchmark,
+    run_browsecomp_benchmark,
+    run_simpleqa_benchmark,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate_simpleqa(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+) -> Dict[str, Any]:
+    """
+    Run SimpleQA benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+    }
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {}
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+
+    # Run the benchmark
+    results = run_simpleqa_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def evaluate_browsecomp(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+) -> Dict[str, Any]:
+    """
+    Run BrowseComp benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+    }
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {}
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+
+    # Run the benchmark
+    results = run_browsecomp_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def get_available_benchmarks() -> List[Dict[str, str]]:
+    """
+    Get information about available benchmarks.
+
+    Returns:
+        List of dictionaries with benchmark information
+    """
+    return [
+        {
+            "id": "simpleqa",
+            "name": "SimpleQA",
+            "description": "Benchmark for factual question answering",
+            "recommended_examples": 100,
+        },
+        {
+            "id": "browsecomp",
+            "name": "BrowseComp",
+            "description": "Benchmark for web browsing comprehension",
+            "recommended_examples": 100,
+        },
+    ]
+
+
+def compare_configurations(
+    dataset_type: str = "simpleqa",
+    num_examples: int = 20,
+    configurations: List[Dict[str, Any]] = None,
+    output_dir: str = "benchmark_comparisons",
+) -> Dict[str, Any]:
+    """
+    Compare multiple search configurations on the same benchmark.
+
+    Args:
+        dataset_type: Type of dataset to use
+        num_examples: Number of examples to evaluate
+        configurations: List of search configurations to compare
+        output_dir: Directory to save results
+
+    Returns:
+        Dictionary with comparison results
+    """
+    if not configurations:
+        # Default configurations to compare
+        configurations = [
+            {
+                "name": "Base Config",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Iterations",
+                "search_tool": "searxng",
+                "iterations": 3,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Questions",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 5,
+            },
+        ]
+
+    # Create output directory
+    import os
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Run benchmarks for each configuration
+    results = []
+    for config in configurations:
+        config_name = config.pop("name", f"Config-{len(results)}")
+
+        logger.info(f"Running benchmark with configuration: {config_name}")
+
+        search_config = {
+            "iterations": config.pop("iterations", 1),
+            "questions_per_iteration": config.pop("questions_per_iteration", 3),
+            "search_tool": config.pop("search_tool", "searxng"),
+        }
+
+        # Add any remaining config items
+        for key, value in config.items():
+            search_config[key] = value
+
+        # Run benchmark with this configuration
+        benchmark_result = run_benchmark(
+            dataset_type=dataset_type,
+            num_examples=num_examples,
+            output_dir=os.path.join(output_dir, config_name.replace(" ", "_")),
+            search_config=search_config,
+            run_evaluation=True,
+        )
+
+        # Add configuration name to results
+        benchmark_result["configuration_name"] = config_name
+        benchmark_result["search_config"] = search_config
+
+        results.append(benchmark_result)
+
+    # Generate comparison report
+    import time
+
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    report_file = os.path.join(output_dir, f"comparison_{dataset_type}_{timestamp}.md")
+
+    with open(report_file, "w") as f:
+        f.write(f"# Configuration Comparison - {dataset_type.capitalize()}\n\n")
+
+        # Write summary table
+        f.write("## Summary\n\n")
+        f.write("| Configuration | Accuracy | Avg. Time | Examples |\n")
+        f.write("|---------------|----------|-----------|----------|\n")
+
+        for result in results:
+            accuracy = result.get("metrics", {}).get("accuracy", 0)
+            avg_time = result.get("metrics", {}).get("average_processing_time", 0)
+            examples = result.get("total_examples", 0)
+
+            f.write(
+                f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"
+            )
+
+        f.write("\n## Configuration Details\n\n")
+
+        for result in results:
+            f.write(f"### {result['configuration_name']}\n\n")
+
+            config = result.get("search_config", {})
+            f.write("```\n")
+            for key, value in config.items():
+                f.write(f"{key}: {value}\n")
+            f.write("```\n\n")
+
+    logger.info(f"Comparison report saved to {report_file}")
+
+    return {
+        "status": "complete",
+        "dataset_type": dataset_type,
+        "configurations_tested": len(configurations),
+        "report_path": report_file,
+        "results": results,
+    }
+
+
+# Export the API functions
+__all__ = [
+    "evaluate_simpleqa",
+    "evaluate_browsecomp",
+    "get_available_benchmarks",
+    "compare_configurations",
+    "run_benchmark",  # For advanced users
+    "calculate_metrics",
+    "generate_report",
+]
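For orientation, the sketch below shows how this new module might be called from user code. It is illustrative only: it assumes the 0.4.1 wheel is installed and that the default `searxng` search tool is reachable; the function names, keyword arguments, and result keys follow the code in the hunk above.

```python
# Minimal usage sketch for the new benchmark API (assumes local-deep-research
# 0.4.1 is installed and a SearXNG instance is configured).
from local_deep_research.api.benchmark_functions import (
    evaluate_simpleqa,
    get_available_benchmarks,
)

# List the benchmarks exposed by this release (SimpleQA and BrowseComp).
for bench in get_available_benchmarks():
    print(f"{bench['id']}: {bench['description']}")

# Run a small SimpleQA evaluation with a custom grading model.
result = evaluate_simpleqa(
    num_examples=5,
    search_iterations=1,
    questions_per_iteration=3,
    search_tool="searxng",
    evaluation_model="gpt-4o",        # optional override of the default grader
    evaluation_provider="openai",
    output_dir="benchmark_results",
)
print(result.get("metrics", {}).get("accuracy"))
```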
local_deep_research/api/research_functions.py
@@ -3,17 +3,16 @@ API module for Local Deep Research.
 Provides programmatic access to search and research capabilities.
 """
 
-import logging
 from typing import Any, Callable, Dict, Optional
 
+from loguru import logger
+
 from ..config.llm_config import get_llm
 from ..config.search_config import get_search
 from ..report_generator import IntegratedReportGenerator
 from ..search_system import AdvancedSearchSystem
 from ..utilities.search_utilities import remove_think_tags
 
-logger = logging.getLogger(__name__)
-
 
 def _init_search_system(
     model_name: str | None = None,
@@ -24,6 +23,7 @@ def _init_search_system(
     search_tool: Optional[str] = None,
     iterations: int = 1,
     questions_per_iteration: int = 1,
+    search_strategy: str = "source_based",
 ) -> AdvancedSearchSystem:
     """
     Initializes the advanced search system with specified parameters. This function sets up
@@ -41,6 +41,7 @@ def _init_search_system(
         search_tool: Search engine to use (auto, wikipedia, arxiv, etc.). If None, uses default
         iterations: Number of research cycles to perform
         questions_per_iteration: Number of questions to generate per cycle
+        search_strategy: The name of the search strategy to use.
 
     Returns:
         AdvancedSearchSystem: An instance of the configured AdvancedSearchSystem.
@@ -64,7 +65,10 @@ def _init_search_system(
     )
 
     # Create search system with custom parameters
-    system = AdvancedSearchSystem(llm=llm, search=search_engine)
+    logger.info("Search strategy: {}", search_strategy)
+    system = AdvancedSearchSystem(
+        llm=llm, search=search_engine, strategy_name=search_strategy
+    )
 
     # Override default settings with user-provided values
     system.max_iterations = iterations
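The change above routes a new `search_strategy` argument into the `AdvancedSearchSystem` constructor via `strategy_name`. Below is a minimal sketch of that construction pattern in isolation; the argument-free `get_llm()`/`get_search()` calls are simplifications for illustration, and only `"source_based"` (the default) is confirmed by the diff itself.

```python
# Sketch of the construction pattern introduced above. In research_functions.py
# get_llm()/get_search() receive the user-supplied model and search-tool
# settings; they are called with defaults here for brevity.
from local_deep_research.config.llm_config import get_llm
from local_deep_research.config.search_config import get_search
from local_deep_research.search_system import AdvancedSearchSystem

llm = get_llm()
search_engine = get_search()

# strategy_name selects one of the strategies under
# advanced_search_system/strategies/; "source_based" is the default used by
# _init_search_system().
system = AdvancedSearchSystem(
    llm=llm, search=search_engine, strategy_name="source_based"
)
system.max_iterations = 2  # overridden after construction, as in the code above
```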
local_deep_research/benchmarks/README.md
@@ -0,0 +1,162 @@
+# LDR Benchmarking Framework
+
+The Local Deep Research (LDR) Benchmarking Framework allows you to evaluate and compare LDR's performance on standardized benchmarks.
+
+## Features
+
+- Run benchmarks on SimpleQA and BrowseComp datasets
+- Configure search parameters (iterations, questions per iteration, search tool)
+- Use automated evaluation with Claude 3.7 Sonnet (default) or other models
+- Generate detailed reports with metrics and examples
+- Compare multiple configurations to find optimal settings
+- API, CLI, and web interface integration
+
+## Benchmark Datasets
+
+### SimpleQA
+
+A straightforward question-answering benchmark with factual questions. This benchmark tests LDR's ability to find and synthesize factual information.
+
+### BrowseComp
+
+A web browsing comprehension benchmark with more complex questions requiring synthesis across multiple sources. This benchmark tests LDR's ability to understand and navigate complex information needs.
+
+## Usage
+
+### Programmatic API
+
+```python
+from local_deep_research.api.benchmark_functions import evaluate_simpleqa
+
+# Run SimpleQA benchmark with 20 examples
+result = evaluate_simpleqa(
+    num_examples=20,
+    search_iterations=3,
+    questions_per_iteration=3,
+    search_tool="searxng"
+)
+
+# Print accuracy
+print(f"Accuracy: {result['metrics']['accuracy']:.3f}")
+```
+
+### Command Line Interface
+
+```bash
+# Run SimpleQA benchmark
+python -m local_deep_research.cli.benchmark_commands simpleqa --examples 20 --iterations 3
+
+# Run BrowseComp benchmark
+python -m local_deep_research.cli.benchmark_commands browsecomp --examples 10 --search-tool wikipedia
+
+# Compare configurations
+python -m local_deep_research.cli.benchmark_commands compare --dataset simpleqa --examples 5
+```
+
+### Web Interface
+
+The benchmark dashboard is available at `/benchmark` in the LDR web interface. You can:
+
+1. Select a benchmark to run
+2. Configure parameters
+3. Run the benchmark
+4. View results and reports
+
+## Evaluation
+
+By default, benchmarks are evaluated using Claude 3.7 Sonnet via OpenRouter. You can customize the evaluation model:
+
+```python
+# Use a different model for evaluation
+result = evaluate_simpleqa(
+    num_examples=10,
+    evaluation_model="gpt-4o",
+    evaluation_provider="openai"
+)
+```
+
+You can also use human evaluation:
+
+```python
+# Use human evaluation
+result = evaluate_simpleqa(
+    num_examples=5,
+    human_evaluation=True
+)
+```
+
+## Configuration Comparison
+
+Compare multiple configurations to find optimal settings:
+
+```python
+from local_deep_research.api.benchmark_functions import compare_configurations
+
+# Define configurations to compare
+configurations = [
+    {
+        "name": "Base Config",
+        "search_tool": "searxng",
+        "iterations": 1,
+        "questions_per_iteration": 3
+    },
+    {
+        "name": "More Iterations",
+        "search_tool": "searxng",
+        "iterations": 3,
+        "questions_per_iteration": 3
+    },
+    {
+        "name": "Different Search Engine",
+        "search_tool": "wikipedia",
+        "iterations": 1,
+        "questions_per_iteration": 3
+    }
+]
+
+# Run comparison
+result = compare_configurations(
+    dataset_type="simpleqa",
+    num_examples=10,
+    configurations=configurations
+)
+```
+
+## Output Format
+
+Benchmark results include:
+
+- **metrics**: Accuracy, processing time, confidence scores
+- **report_path**: Path to generated report
+- **results_path**: Path to raw results file
+- **total_examples**: Number of examples processed
+- **status**: Completion status
+
+## Example Reports
+
+Reports include:
+
+- Overall accuracy and metrics
+- Configuration details
+- Example correct and incorrect answers
+- Time and date information
+
+## Integration with LDR Web App
+
+The benchmarking framework is fully integrated with the LDR web interface. You can run benchmarks and view results directly in the web app.
+
+## Adding Custom Benchmarks
+
+To add a custom benchmark:
+
+1. Create a dataset loader in `datasets.py`
+2. Add evaluation templates in `templates.py`
+3. Create benchmark runners in `runners.py`
+4. Expose the benchmark through the API in `api/benchmark_functions.py`
+
+## Performance Considerations
+
+- Running benchmarks can be resource-intensive
+- Start with a small number of examples for testing
+- Full benchmarks with 100+ examples may take several hours to complete
+- Consider using a more powerful machine for large benchmarks
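To make step 1 of the README's "Adding Custom Benchmarks" section concrete, here is a hypothetical dataset loader. The function name, file format, and field names are assumptions for illustration and do not reflect the actual loader interface in `benchmarks/datasets.py` or the `benchmarks/datasets/` package.

```python
# Hypothetical loader sketch (not the package's actual interface): reads
# question/answer pairs from a JSONL file and truncates to num_examples.
import json
from typing import Any, Dict, List, Optional


def load_my_custom_dataset(
    path: str, num_examples: Optional[int] = None
) -> List[Dict[str, Any]]:
    """Load question/answer records for benchmarking from a JSONL file."""
    examples: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            # Keep only the fields a grader needs: the question text and the
            # reference answer it will be compared against.
            examples.append(
                {"question": record["question"], "answer": record["answer"]}
            )
            if num_examples is not None and len(examples) >= num_examples:
                break
    return examples
```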
local_deep_research/benchmarks/__init__.py
@@ -0,0 +1,51 @@
+"""
+Benchmarking module for Local Deep Research.
+
+This module provides tools for evaluating LDR's performance on standard benchmarks
+and for optimizing performance through parameter tuning.
+"""
+
+__version__ = "0.2.0"
+
+# Core benchmark functionality
+from .datasets import get_available_datasets, load_dataset
+from .metrics import (
+    calculate_metrics,
+    calculate_quality_metrics,
+    calculate_speed_metrics,
+    calculate_resource_metrics,
+    calculate_combined_score,
+    generate_report,
+)
+from .runners import run_benchmark, run_browsecomp_benchmark, run_simpleqa_benchmark
+
+# Optimization functionality
+from .optimization import (
+    optimize_parameters,
+    optimize_for_quality,
+    optimize_for_speed,
+    optimize_for_efficiency,
+)
+
+__all__ = [
+    # Core benchmark functionality
+    "run_benchmark",
+    "run_simpleqa_benchmark",
+    "run_browsecomp_benchmark",
+    "load_dataset",
+    "get_available_datasets",
+    "calculate_metrics",
+    "generate_report",
+
+    # Metrics for optimization
+    "calculate_quality_metrics",
+    "calculate_speed_metrics",
+    "calculate_resource_metrics",
+    "calculate_combined_score",
+
+    # Optimization functionality
+    "optimize_parameters",
+    "optimize_for_quality",
+    "optimize_for_speed",
+    "optimize_for_efficiency",
+]
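Based on the exports declared in this `__init__.py`, the lower-level runners can also be driven directly from the `benchmarks` package. A short sketch follows: the `search_config` keys and `run_simpleqa_benchmark` keyword arguments are taken from `api/benchmark_functions.py` earlier in this diff, while the argument-free `get_available_datasets()` call is an assumption.

```python
# Sketch of driving the benchmarks package directly; details beyond the
# documented search_config keys are assumptions.
from local_deep_research.benchmarks import (
    get_available_datasets,
    run_simpleqa_benchmark,
)

print(get_available_datasets())  # assumed to take no arguments

results = run_simpleqa_benchmark(
    num_examples=10,
    output_dir="benchmark_results",
    search_config={
        "iterations": 1,
        "questions_per_iteration": 3,
        "search_tool": "searxng",
    },
)
```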