local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +7 -9
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +47 -57
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +32 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,353 @@
+"""
+API functions for benchmarking.
+
+This module provides functions for running benchmarks programmatically.
+"""
+
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+from ..benchmarks import (
+    calculate_metrics,
+    generate_report,
+    run_benchmark,
+    run_browsecomp_benchmark,
+    run_simpleqa_benchmark,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate_simpleqa(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+    search_model: Optional[str] = None,
+    search_provider: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    search_strategy: str = "source_based",
+) -> Dict[str, Any]:
+    """
+    Run SimpleQA benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+        search_model: Optional model to use for the search system
+        search_provider: Optional provider to use for the search system
+        endpoint_url: Optional endpoint URL for OpenRouter or other API services
+        search_strategy: Search strategy to use (default: 'source_based')
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+        "search_strategy": search_strategy,
+    }
+
+    # Add model configurations if provided
+    if search_model:
+        search_config["model_name"] = search_model
+    if search_provider:
+        search_config["provider"] = search_provider
+    if endpoint_url:
+        search_config["openai_endpoint_url"] = endpoint_url
+
+    # Check environment variables for additional configuration
+    if env_model := os.environ.get("LDR_SEARCH_MODEL"):
+        search_config["model_name"] = env_model
+    if env_provider := os.environ.get("LDR_SEARCH_PROVIDER"):
+        search_config["provider"] = env_provider
+    if env_url := os.environ.get("LDR_ENDPOINT_URL"):
+        search_config["openai_endpoint_url"] = env_url
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {
+            "temperature": 0  # Always use zero temperature for evaluation
+        }
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+        # Add OpenRouter URL if using openai_endpoint
+        if evaluation_provider == "openai_endpoint":
+            evaluation_config["openai_endpoint_url"] = (
+                "https://openrouter.ai/api/v1"
+            )
+
+    # Run the benchmark
+    results = run_simpleqa_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def evaluate_browsecomp(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+    search_model: Optional[str] = None,
+    search_provider: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    search_strategy: str = "source_based",
+) -> Dict[str, Any]:
+    """
+    Run BrowseComp benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+        search_model: Optional model to use for the search system
+        search_provider: Optional provider to use for the search system
+        endpoint_url: Optional endpoint URL for OpenRouter or other API services
+        search_strategy: Search strategy to use (default: 'source_based')
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+        "search_strategy": search_strategy,
+    }
+
+    # Add model configurations if provided
+    if search_model:
+        search_config["model_name"] = search_model
+    if search_provider:
+        search_config["provider"] = search_provider
+    if endpoint_url:
+        search_config["openai_endpoint_url"] = endpoint_url
+
+    # Check environment variables for additional configuration
+    if env_model := os.environ.get("LDR_SEARCH_MODEL"):
+        search_config["model_name"] = env_model
+    if env_provider := os.environ.get("LDR_SEARCH_PROVIDER"):
+        search_config["provider"] = env_provider
+    if env_url := os.environ.get("LDR_ENDPOINT_URL"):
+        search_config["openai_endpoint_url"] = env_url
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {
+            "temperature": 0  # Always use zero temperature for evaluation
+        }
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+        # Add OpenRouter URL if using openai_endpoint
+        if evaluation_provider == "openai_endpoint":
+            evaluation_config["openai_endpoint_url"] = (
+                "https://openrouter.ai/api/v1"
+            )
+
+    # Run the benchmark
+    results = run_browsecomp_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def get_available_benchmarks() -> List[Dict[str, str]]:
+    """
+    Get information about available benchmarks.
+
+    Returns:
+        List of dictionaries with benchmark information
+    """
+    return [
+        {
+            "id": "simpleqa",
+            "name": "SimpleQA",
+            "description": "Benchmark for factual question answering",
+            "recommended_examples": 100,
+        },
+        {
+            "id": "browsecomp",
+            "name": "BrowseComp",
+            "description": "Benchmark for web browsing comprehension",
+            "recommended_examples": 100,
+        },
+    ]
+
+
+def compare_configurations(
+    dataset_type: str = "simpleqa",
+    num_examples: int = 20,
+    configurations: List[Dict[str, Any]] = None,
+    output_dir: str = "benchmark_comparisons",
+) -> Dict[str, Any]:
+    """
+    Compare multiple search configurations on the same benchmark.
+
+    Args:
+        dataset_type: Type of dataset to use
+        num_examples: Number of examples to evaluate
+        configurations: List of search configurations to compare
+        output_dir: Directory to save results
+
+    Returns:
+        Dictionary with comparison results
+    """
+    if not configurations:
+        # Default configurations to compare
+        configurations = [
+            {
+                "name": "Base Config",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Iterations",
+                "search_tool": "searxng",
+                "iterations": 3,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Questions",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 5,
+            },
+        ]
+
+    # Create output directory
+    import os
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Run benchmarks for each configuration
+    results = []
+    for config in configurations:
+        config_name = config.pop("name", f"Config-{len(results)}")
+
+        logger.info(f"Running benchmark with configuration: {config_name}")
+
+        search_config = {
+            "iterations": config.pop("iterations", 1),
+            "questions_per_iteration": config.pop("questions_per_iteration", 3),
+            "search_tool": config.pop("search_tool", "searxng"),
+        }
+
+        # Add any remaining config items
+        for key, value in config.items():
+            search_config[key] = value
+
+        # Run benchmark with this configuration
+        benchmark_result = run_benchmark(
+            dataset_type=dataset_type,
+            num_examples=num_examples,
+            output_dir=os.path.join(output_dir, config_name.replace(" ", "_")),
+            search_config=search_config,
+            run_evaluation=True,
+        )
+
+        # Add configuration name to results
+        benchmark_result["configuration_name"] = config_name
+        benchmark_result["search_config"] = search_config
+
+        results.append(benchmark_result)
+
+    # Generate comparison report
+    import time
+
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    report_file = os.path.join(output_dir, f"comparison_{dataset_type}_{timestamp}.md")
+
+    with open(report_file, "w") as f:
+        f.write(f"# Configuration Comparison - {dataset_type.capitalize()}\n\n")
+
+        # Write summary table
+        f.write("## Summary\n\n")
+        f.write("| Configuration | Accuracy | Avg. Time | Examples |\n")
+        f.write("|---------------|----------|-----------|----------|\n")
+
+        for result in results:
+            accuracy = result.get("metrics", {}).get("accuracy", 0)
+            avg_time = result.get("metrics", {}).get("average_processing_time", 0)
+            examples = result.get("total_examples", 0)
+
+            f.write(
+                f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"
+            )
+
+        f.write("\n## Configuration Details\n\n")
+
+        for result in results:
+            f.write(f"### {result['configuration_name']}\n\n")
+
+            config = result.get("search_config", {})
+            f.write("```\n")
+            for key, value in config.items():
+                f.write(f"{key}: {value}\n")
+            f.write("```\n\n")
+
+    logger.info(f"Comparison report saved to {report_file}")
+
+    return {
+        "status": "complete",
+        "dataset_type": dataset_type,
+        "configurations_tested": len(configurations),
+        "report_path": report_file,
+        "results": results,
+    }
+
+
+# Export the API functions
+__all__ = [
+    "evaluate_simpleqa",
+    "evaluate_browsecomp",
+    "get_available_benchmarks",
+    "compare_configurations",
+    "run_benchmark",  # For advanced users
+    "calculate_metrics",
+    "generate_report",
+]
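The hunk above (353 added lines, matching `local_deep_research/benchmarks/benchmark_functions.py` in the file list; `local_deep_research/api/benchmark_functions.py` appears to expose the same API) defines the programmatic benchmark entry points. Below is a minimal usage sketch, not part of the diff, assuming the functions are importable from the `api` path that the CLI module further down also imports from; model and provider names are illustrative only.

```python
# Sketch only: assumes local-deep-research 0.4.0 is installed, that these
# functions are exposed at local_deep_research.api.benchmark_functions, and
# that a SearXNG instance is reachable. Model/provider names are illustrative.
from local_deep_research.api.benchmark_functions import (
    compare_configurations,
    evaluate_simpleqa,
)

# Small SimpleQA run with an explicit model for the search system.
results = evaluate_simpleqa(
    num_examples=10,
    search_iterations=1,
    questions_per_iteration=3,
    search_tool="searxng",
    search_model="gemma3:12b",  # hypothetical model name
    search_provider="ollama",   # hypothetical provider name
    output_dir="benchmark_results",
)
# The CLI handlers below read accuracy from result["metrics"] when evaluation
# ran, so the same key is assumed here.
print(results.get("metrics", {}).get("accuracy"))

# Compare the three built-in default configurations on 20 SimpleQA examples.
comparison = compare_configurations(dataset_type="simpleqa", num_examples=20)
print(comparison["report_path"])
```

Note that `LDR_SEARCH_MODEL`, `LDR_SEARCH_PROVIDER`, and `LDR_ENDPOINT_URL` are read after the keyword arguments are applied, so those environment variables override whatever is passed in.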
@@ -0,0 +1,16 @@
+"""
+CLI module for benchmark functionality.
+
+This package provides command-line interface tools for
+running benchmarks and optimization tasks.
+"""
+
+from .benchmark_commands import main as benchmark_main
+from .benchmark_commands import (
+    setup_benchmark_parser,
+)
+
+__all__ = [
+    "benchmark_main",
+    "setup_benchmark_parser",
+]
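This 16-line hunk matches `local_deep_research/benchmarks/cli/__init__.py`: it only re-exports the CLI entry points. A minimal sketch of driving the CLI programmatically through those exports, under the same installation assumption as above:

```python
# Sketch only: benchmark_main() parses sys.argv, so set it before calling.
import sys

from local_deep_research.benchmarks.cli import benchmark_main

# Equivalent to running the benchmark CLI with the "list" subcommand.
sys.argv = ["ldr-benchmark", "list"]
benchmark_main()
```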
@@ -0,0 +1,338 @@
+"""
+Command-line interface for benchmarking.
+
+This module provides CLI commands for running benchmarks.
+"""
+
+import argparse
+import logging
+
+from .. import (
+    get_available_datasets,
+    run_browsecomp_benchmark,
+    run_simpleqa_benchmark,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def setup_benchmark_parser(subparsers):
+    """
+    Set up the benchmark CLI commands.
+
+    Args:
+        subparsers: argparse subparsers object to add commands to
+    """
+    # Common benchmark arguments
+    benchmark_parent = argparse.ArgumentParser(add_help=False)
+    benchmark_parent.add_argument(
+        "--examples",
+        type=int,
+        default=100,
+        help="Number of examples to run (default: 100)",
+    )
+    benchmark_parent.add_argument(
+        "--iterations",
+        type=int,
+        default=3,
+        help="Number of search iterations (default: 3)",
+    )
+    benchmark_parent.add_argument(
+        "--questions", type=int, default=3, help="Questions per iteration (default: 3)"
+    )
+    benchmark_parent.add_argument(
+        "--search-tool",
+        type=str,
+        default="searxng",
+        help="Search tool to use (default: searxng)",
+    )
+    benchmark_parent.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/benchmark_results",
+        help="Directory to save results (default: data/benchmark_results)",
+    )
+    benchmark_parent.add_argument(
+        "--human-eval",
+        action="store_true",
+        help="Use human evaluation instead of automated",
+    )
+    benchmark_parent.add_argument(
+        "--eval-model", type=str, help="Model to use for evaluation"
+    )
+    benchmark_parent.add_argument(
+        "--eval-provider", type=str, help="Provider to use for evaluation"
+    )
+    benchmark_parent.add_argument(
+        "--custom-dataset", type=str, help="Path to custom dataset"
+    )
+    benchmark_parent.add_argument(
+        "--no-eval", action="store_true", help="Skip evaluation phase"
+    )
+
+    # Add model configuration options for the search system
+    benchmark_parent.add_argument(
+        "--search-model", type=str, help="Model to use for the search system"
+    )
+    benchmark_parent.add_argument(
+        "--search-provider", type=str, help="Provider to use for the search system"
+    )
+    benchmark_parent.add_argument(
+        "--endpoint-url", type=str, help="Endpoint URL for OpenRouter or other API services"
+    )
+    benchmark_parent.add_argument(
+        "--search-strategy", type=str, default="source_based",
+        choices=["source_based", "standard", "rapid", "parallel", "iterdrag"],
+        help="Search strategy to use (default: source_based)"
+    )
+
+    # SimpleQA benchmark command
+    simpleqa_parser = subparsers.add_parser(
+        "simpleqa", parents=[benchmark_parent], help="Run SimpleQA benchmark"
+    )
+    simpleqa_parser.set_defaults(func=run_simpleqa_cli)
+
+    # BrowseComp benchmark command
+    browsecomp_parser = subparsers.add_parser(
+        "browsecomp", parents=[benchmark_parent], help="Run BrowseComp benchmark"
+    )
+    browsecomp_parser.set_defaults(func=run_browsecomp_cli)
+
+    # List available benchmarks command
+    list_parser = subparsers.add_parser("list", help="List available benchmarks")
+    list_parser.set_defaults(func=list_benchmarks_cli)
+
+    # Compare configurations command
+    compare_parser = subparsers.add_parser(
+        "compare", help="Compare multiple search configurations"
+    )
+    compare_parser.add_argument(
+        "--dataset",
+        type=str,
+        default="simpleqa",
+        choices=["simpleqa", "browsecomp"],
+        help="Dataset to use for comparison",
+    )
+    compare_parser.add_argument(
+        "--examples",
+        type=int,
+        default=20,
+        help="Number of examples for each configuration (default: 20)",
+    )
+    compare_parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/benchmark_results/comparison",
+        help="Directory to save comparison results",
+    )
+    compare_parser.set_defaults(func=compare_configs_cli)
+
+
+def run_simpleqa_cli(args):
+    """
+    CLI handler for SimpleQA benchmark.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Set up search configuration
+    search_config = {
+        "iterations": args.iterations,
+        "questions_per_iteration": args.questions,
+        "search_tool": args.search_tool,
+    }
+
+    # Add model configuration if provided
+    if hasattr(args, "search_model") and args.search_model:
+        search_config["model_name"] = args.search_model
+    if hasattr(args, "search_provider") and args.search_provider:
+        search_config["provider"] = args.search_provider
+    if hasattr(args, "endpoint_url") and args.endpoint_url:
+        search_config["openai_endpoint_url"] = args.endpoint_url
+    if hasattr(args, "search_strategy") and args.search_strategy:
+        search_config["search_strategy"] = args.search_strategy
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if args.eval_model or args.eval_provider:
+        evaluation_config = {}
+        if args.eval_model:
+            evaluation_config["model_name"] = args.eval_model
+        if args.eval_provider:
+            evaluation_config["provider"] = args.eval_provider
+
+    # Run the benchmark
+    result = run_simpleqa_benchmark(
+        num_examples=args.examples,
+        dataset_path=args.custom_dataset,
+        output_dir=args.output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=args.human_eval,
+        run_evaluation=not args.no_eval,
+    )
+
+    # Print results summary
+    if "metrics" in result:
+        print("\nSimpleQA Benchmark Results:")
+        print(f" Accuracy: {result['metrics'].get('accuracy', 0):.3f}")
+        print(f" Total examples: {result['total_examples']}")
+        print(f" Correct answers: {result['metrics'].get('correct', 0)}")
+        print(
+            f" Average time: {result['metrics'].get('average_processing_time', 0):.2f}s"
+        )
+        print(f"\nReport saved to: {result.get('report_path', 'N/A')}")
+    else:
+        print("\nSimpleQA Benchmark Completed (no evaluation)")
+        print(f" Total examples: {result['total_examples']}")
+        print(f" Results saved to: {result.get('results_path', 'N/A')}")
+
+
+def run_browsecomp_cli(args):
+    """
+    CLI handler for BrowseComp benchmark.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Set up search configuration
+    search_config = {
+        "iterations": args.iterations,
+        "questions_per_iteration": args.questions,
+        "search_tool": args.search_tool,
+    }
+
+    # Add model configuration if provided
+    if hasattr(args, "search_model") and args.search_model:
+        search_config["model_name"] = args.search_model
+    if hasattr(args, "search_provider") and args.search_provider:
+        search_config["provider"] = args.search_provider
+    if hasattr(args, "endpoint_url") and args.endpoint_url:
+        search_config["openai_endpoint_url"] = args.endpoint_url
+    if hasattr(args, "search_strategy") and args.search_strategy:
+        search_config["search_strategy"] = args.search_strategy
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if args.eval_model or args.eval_provider:
+        evaluation_config = {}
+        if args.eval_model:
+            evaluation_config["model_name"] = args.eval_model
+        if args.eval_provider:
+            evaluation_config["provider"] = args.eval_provider
+
+    # Run the benchmark
+    result = run_browsecomp_benchmark(
+        num_examples=args.examples,
+        dataset_path=args.custom_dataset,
+        output_dir=args.output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=args.human_eval,
+        run_evaluation=not args.no_eval,
+    )
+
+    # Print results summary
+    if "metrics" in result:
+        print("\nBrowseComp Benchmark Results:")
+        print(f" Accuracy: {result['metrics'].get('accuracy', 0):.3f}")
+        print(f" Total examples: {result['total_examples']}")
+        print(f" Correct answers: {result['metrics'].get('correct', 0)}")
+        print(
+            f" Average time: {result['metrics'].get('average_processing_time', 0):.2f}s"
+        )
+        print(f"\nReport saved to: {result.get('report_path', 'N/A')}")
+    else:
+        print("\nBrowseComp Benchmark Completed (no evaluation)")
+        print(f" Total examples: {result['total_examples']}")
+        print(f" Results saved to: {result.get('results_path', 'N/A')}")
+
+
+def list_benchmarks_cli(args):
+    """
+    CLI handler for listing available benchmarks.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    datasets = get_available_datasets()
+
+    print("\nAvailable Benchmarks:")
+    for dataset in datasets:
+        print(f" {dataset['id']}: {dataset['name']}")
+        print(f" {dataset['description']}")
+        print(f" URL: {dataset['url']}")
+        print()
+
+
+def compare_configs_cli(args):
+    """
+    CLI handler for comparing multiple configurations.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Import the compare configurations function
+    from ...api.benchmark_functions import compare_configurations
+
+    # Run the comparison
+    result = compare_configurations(
+        dataset_type=args.dataset,
+        num_examples=args.examples,
+        output_dir=args.output_dir,
+    )
+
+    # Print results summary
+    print("\nConfiguration Comparison Results:")
+    print(f" Dataset: {args.dataset}")
+    print(f" Configurations tested: {result['configurations_tested']}")
+    print(f" Report saved to: {result['report_path']}")
+
+    # Print brief comparison table
+    print("\nResults Summary:")
+    print("Configuration | Accuracy | Avg. Time")
+    print("--------------- | -------- | ---------")
+    for res in result["results"]:
+        name = res["configuration_name"]
+        acc = res.get("metrics", {}).get("accuracy", 0)
+        time = res.get("metrics", {}).get("average_processing_time", 0)
+        print(f"{name:15} | {acc:.3f} | {time:.2f}s")
+
+
+def main():
+    """
+    Main entry point for benchmark CLI.
+    """
+    parser = argparse.ArgumentParser(
+        description="Local Deep Research Benchmarking Tool", prog="ldr-benchmark"
+    )
+
+    # Set up logging
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+
+    # Create subparsers
+    subparsers = parser.add_subparsers(
+        dest="command", help="Command to run", required=True
+    )
+
+    # Set up commands
+    setup_benchmark_parser(subparsers)
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Set up logging
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(
+        level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    # Run command
+    if hasattr(args, "func"):
+        args.func(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
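This hunk's 338 added lines match `local_deep_research/benchmarks/cli/benchmark_commands.py`. Because `setup_benchmark_parser` takes an argparse subparsers object rather than a parser, the benchmark subcommands can also be mounted inside another CLI and dispatched through the `func` default each subcommand registers. A short sketch, not part of the diff, under the same import-path assumption:

```python
# Sketch only: mounts the benchmark subcommands in a host argparse CLI.
import argparse

from local_deep_research.benchmarks.cli import setup_benchmark_parser

parser = argparse.ArgumentParser(prog="my-tool")  # hypothetical host CLI
subparsers = parser.add_subparsers(dest="command", required=True)
setup_benchmark_parser(subparsers)

# Parse a SimpleQA invocation explicitly (no sys.argv), then dispatch to the
# handler that set_defaults(func=...) attached to the "simpleqa" subcommand.
args = parser.parse_args(
    ["simpleqa", "--examples", "5", "--iterations", "1", "--no-eval"]
)
args.func(args)  # runs run_simpleqa_cli with the parsed arguments
```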