local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. local_deep_research/__init__.py +1 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  4. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  5. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  6. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  7. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  8. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  9. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  10. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  11. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  12. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  13. local_deep_research/api/benchmark_functions.py +288 -0
  14. local_deep_research/api/research_functions.py +8 -4
  15. local_deep_research/benchmarks/README.md +162 -0
  16. local_deep_research/benchmarks/__init__.py +51 -0
  17. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  18. local_deep_research/benchmarks/cli/__init__.py +16 -0
  19. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  20. local_deep_research/benchmarks/cli.py +347 -0
  21. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  22. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  23. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  24. local_deep_research/benchmarks/datasets/base.py +295 -0
  25. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  26. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  27. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  28. local_deep_research/benchmarks/datasets/utils.py +116 -0
  29. local_deep_research/benchmarks/datasets.py +31 -0
  30. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  31. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  32. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  33. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  34. local_deep_research/benchmarks/evaluators/base.py +74 -0
  35. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  36. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  37. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  38. local_deep_research/benchmarks/graders.py +410 -0
  39. local_deep_research/benchmarks/metrics/README.md +80 -0
  40. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  41. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  42. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  43. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  44. local_deep_research/benchmarks/metrics.py +11 -0
  45. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  46. local_deep_research/benchmarks/optimization/api.py +274 -0
  47. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  48. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  49. local_deep_research/benchmarks/runners.py +434 -0
  50. local_deep_research/benchmarks/templates.py +65 -0
  51. local_deep_research/config/llm_config.py +26 -23
  52. local_deep_research/config/search_config.py +1 -5
  53. local_deep_research/defaults/default_settings.json +108 -7
  54. local_deep_research/search_system.py +16 -8
  55. local_deep_research/utilities/db_utils.py +3 -6
  56. local_deep_research/utilities/es_utils.py +441 -0
  57. local_deep_research/utilities/log_utils.py +36 -0
  58. local_deep_research/utilities/search_utilities.py +8 -9
  59. local_deep_research/web/app.py +15 -10
  60. local_deep_research/web/app_factory.py +9 -12
  61. local_deep_research/web/database/migrations.py +8 -5
  62. local_deep_research/web/database/models.py +20 -0
  63. local_deep_research/web/database/schema_upgrade.py +5 -8
  64. local_deep_research/web/models/database.py +15 -18
  65. local_deep_research/web/routes/benchmark_routes.py +427 -0
  66. local_deep_research/web/routes/research_routes.py +13 -17
  67. local_deep_research/web/routes/settings_routes.py +264 -67
  68. local_deep_research/web/services/research_service.py +58 -73
  69. local_deep_research/web/services/settings_manager.py +1 -4
  70. local_deep_research/web/services/settings_service.py +4 -6
  71. local_deep_research/web/static/css/styles.css +12 -0
  72. local_deep_research/web/static/js/components/logpanel.js +164 -155
  73. local_deep_research/web/static/js/components/research.js +44 -3
  74. local_deep_research/web/static/js/components/settings.js +27 -0
  75. local_deep_research/web/static/js/services/socket.js +47 -0
  76. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  77. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  78. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  79. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  80. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  81. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  82. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  83. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  84. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  85. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  86. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  87. local_deep_research/web_search_engines/search_engine_factory.py +30 -11
  88. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
  91. local_deep_research/app.py +0 -8
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
  93. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
  94. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,353 @@
+"""
+API functions for benchmarking.
+
+This module provides functions for running benchmarks programmatically.
+"""
+
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+from ..benchmarks import (
+    calculate_metrics,
+    generate_report,
+    run_benchmark,
+    run_browsecomp_benchmark,
+    run_simpleqa_benchmark,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def evaluate_simpleqa(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+    search_model: Optional[str] = None,
+    search_provider: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    search_strategy: str = "source_based",
+) -> Dict[str, Any]:
+    """
+    Run SimpleQA benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+        search_model: Optional model to use for the search system
+        search_provider: Optional provider to use for the search system
+        endpoint_url: Optional endpoint URL for OpenRouter or other API services
+        search_strategy: Search strategy to use (default: 'source_based')
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+        "search_strategy": search_strategy,
+    }
+
+    # Add model configurations if provided
+    if search_model:
+        search_config["model_name"] = search_model
+    if search_provider:
+        search_config["provider"] = search_provider
+    if endpoint_url:
+        search_config["openai_endpoint_url"] = endpoint_url
+
+    # Check environment variables for additional configuration
+    if env_model := os.environ.get("LDR_SEARCH_MODEL"):
+        search_config["model_name"] = env_model
+    if env_provider := os.environ.get("LDR_SEARCH_PROVIDER"):
+        search_config["provider"] = env_provider
+    if env_url := os.environ.get("LDR_ENDPOINT_URL"):
+        search_config["openai_endpoint_url"] = env_url
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {
+            "temperature": 0  # Always use zero temperature for evaluation
+        }
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+        # Add OpenRouter URL if using openai_endpoint
+        if evaluation_provider == "openai_endpoint":
+            evaluation_config["openai_endpoint_url"] = (
+                "https://openrouter.ai/api/v1"
+            )
+
+    # Run the benchmark
+    results = run_simpleqa_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def evaluate_browsecomp(
+    num_examples: int = 100,
+    search_iterations: int = 3,
+    questions_per_iteration: int = 3,
+    search_tool: str = "searxng",
+    human_evaluation: bool = False,
+    evaluation_model: Optional[str] = None,
+    evaluation_provider: Optional[str] = None,
+    output_dir: str = "benchmark_results",
+    search_model: Optional[str] = None,
+    search_provider: Optional[str] = None,
+    endpoint_url: Optional[str] = None,
+    search_strategy: str = "source_based",
+) -> Dict[str, Any]:
+    """
+    Run BrowseComp benchmark evaluation.
+
+    Args:
+        num_examples: Number of examples to evaluate
+        search_iterations: Number of search iterations per query
+        questions_per_iteration: Number of questions per iteration
+        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+        human_evaluation: Whether to use human evaluation
+        evaluation_model: Optional custom model for evaluation
+        evaluation_provider: Optional custom provider for evaluation
+        output_dir: Directory to save results
+        search_model: Optional model to use for the search system
+        search_provider: Optional provider to use for the search system
+        endpoint_url: Optional endpoint URL for OpenRouter or other API services
+        search_strategy: Search strategy to use (default: 'source_based')
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
+
+    # Set up search configuration
+    search_config = {
+        "iterations": search_iterations,
+        "questions_per_iteration": questions_per_iteration,
+        "search_tool": search_tool,
+        "search_strategy": search_strategy,
+    }
+
+    # Add model configurations if provided
+    if search_model:
+        search_config["model_name"] = search_model
+    if search_provider:
+        search_config["provider"] = search_provider
+    if endpoint_url:
+        search_config["openai_endpoint_url"] = endpoint_url
+
+    # Check environment variables for additional configuration
+    if env_model := os.environ.get("LDR_SEARCH_MODEL"):
+        search_config["model_name"] = env_model
+    if env_provider := os.environ.get("LDR_SEARCH_PROVIDER"):
+        search_config["provider"] = env_provider
+    if env_url := os.environ.get("LDR_ENDPOINT_URL"):
+        search_config["openai_endpoint_url"] = env_url
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if evaluation_model or evaluation_provider:
+        evaluation_config = {
+            "temperature": 0  # Always use zero temperature for evaluation
+        }
+        if evaluation_model:
+            evaluation_config["model_name"] = evaluation_model
+        if evaluation_provider:
+            evaluation_config["provider"] = evaluation_provider
+        # Add OpenRouter URL if using openai_endpoint
+        if evaluation_provider == "openai_endpoint":
+            evaluation_config["openai_endpoint_url"] = (
+                "https://openrouter.ai/api/v1"
+            )
+
+    # Run the benchmark
+    results = run_browsecomp_benchmark(
+        num_examples=num_examples,
+        output_dir=output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=human_evaluation,
+    )
+
+    return results
+
+
+def get_available_benchmarks() -> List[Dict[str, str]]:
+    """
+    Get information about available benchmarks.
+
+    Returns:
+        List of dictionaries with benchmark information
+    """
+    return [
+        {
+            "id": "simpleqa",
+            "name": "SimpleQA",
+            "description": "Benchmark for factual question answering",
+            "recommended_examples": 100,
+        },
+        {
+            "id": "browsecomp",
+            "name": "BrowseComp",
+            "description": "Benchmark for web browsing comprehension",
+            "recommended_examples": 100,
+        },
+    ]
+
+
+def compare_configurations(
+    dataset_type: str = "simpleqa",
+    num_examples: int = 20,
+    configurations: List[Dict[str, Any]] = None,
+    output_dir: str = "benchmark_comparisons",
+) -> Dict[str, Any]:
+    """
+    Compare multiple search configurations on the same benchmark.
+
+    Args:
+        dataset_type: Type of dataset to use
+        num_examples: Number of examples to evaluate
+        configurations: List of search configurations to compare
+        output_dir: Directory to save results
+
+    Returns:
+        Dictionary with comparison results
+    """
+    if not configurations:
+        # Default configurations to compare
+        configurations = [
+            {
+                "name": "Base Config",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Iterations",
+                "search_tool": "searxng",
+                "iterations": 3,
+                "questions_per_iteration": 3,
+            },
+            {
+                "name": "More Questions",
+                "search_tool": "searxng",
+                "iterations": 1,
+                "questions_per_iteration": 5,
+            },
+        ]
+
+    # Create output directory
+    import os
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Run benchmarks for each configuration
+    results = []
+    for config in configurations:
+        config_name = config.pop("name", f"Config-{len(results)}")
+
+        logger.info(f"Running benchmark with configuration: {config_name}")
+
+        search_config = {
+            "iterations": config.pop("iterations", 1),
+            "questions_per_iteration": config.pop("questions_per_iteration", 3),
+            "search_tool": config.pop("search_tool", "searxng"),
+        }
+
+        # Add any remaining config items
+        for key, value in config.items():
+            search_config[key] = value
+
+        # Run benchmark with this configuration
+        benchmark_result = run_benchmark(
+            dataset_type=dataset_type,
+            num_examples=num_examples,
+            output_dir=os.path.join(output_dir, config_name.replace(" ", "_")),
+            search_config=search_config,
+            run_evaluation=True,
+        )
+
+        # Add configuration name to results
+        benchmark_result["configuration_name"] = config_name
+        benchmark_result["search_config"] = search_config
+
+        results.append(benchmark_result)
+
+    # Generate comparison report
+    import time
+
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    report_file = os.path.join(output_dir, f"comparison_{dataset_type}_{timestamp}.md")
+
+    with open(report_file, "w") as f:
+        f.write(f"# Configuration Comparison - {dataset_type.capitalize()}\n\n")
+
+        # Write summary table
+        f.write("## Summary\n\n")
+        f.write("| Configuration | Accuracy | Avg. Time | Examples |\n")
+        f.write("|---------------|----------|-----------|----------|\n")
+
+        for result in results:
+            accuracy = result.get("metrics", {}).get("accuracy", 0)
+            avg_time = result.get("metrics", {}).get("average_processing_time", 0)
+            examples = result.get("total_examples", 0)
+
+            f.write(
+                f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"
+            )

+        f.write("\n## Configuration Details\n\n")
+
+        for result in results:
+            f.write(f"### {result['configuration_name']}\n\n")
+
+            config = result.get("search_config", {})
+            f.write("```\n")
+            for key, value in config.items():
+                f.write(f"{key}: {value}\n")
+            f.write("```\n\n")
+
+    logger.info(f"Comparison report saved to {report_file}")
+
+    return {
+        "status": "complete",
+        "dataset_type": dataset_type,
+        "configurations_tested": len(configurations),
+        "report_path": report_file,
+        "results": results,
+    }
+
+
+# Export the API functions
+__all__ = [
+    "evaluate_simpleqa",
+    "evaluate_browsecomp",
+    "get_available_benchmarks",
+    "compare_configurations",
+    "run_benchmark",  # For advanced users
+    "calculate_metrics",
+    "generate_report",
+]
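For orientation, here is a minimal usage sketch of the new programmatic benchmark API added above. The sketch is not part of the diff: it assumes these functions are importable from local_deep_research.api.benchmark_functions (the module path the CLI hunk further down uses for compare_configurations), that a SearXNG instance is reachable, and that the argument values are purely illustrative.

    # Hypothetical usage sketch; import path and values are assumptions, not part of the release.
    from local_deep_research.api.benchmark_functions import (
        compare_configurations,
        evaluate_simpleqa,
    )

    # Small SimpleQA run against a local SearXNG instance.
    results = evaluate_simpleqa(
        num_examples=10,
        search_iterations=2,
        questions_per_iteration=3,
        search_tool="searxng",
        output_dir="benchmark_results",
    )
    print(results.get("metrics", {}).get("accuracy"))

    # Compare the built-in default configurations on the same dataset.
    comparison = compare_configurations(
        dataset_type="simpleqa",
        num_examples=5,
        output_dir="benchmark_comparisons",
    )
    print(comparison["report_path"])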
@@ -0,0 +1,16 @@
+"""
+CLI module for benchmark functionality.
+
+This package provides command-line interface tools for
+running benchmarks and optimization tasks.
+"""
+
+from .benchmark_commands import main as benchmark_main
+from .benchmark_commands import (
+    setup_benchmark_parser,
+)
+
+__all__ = [
+    "benchmark_main",
+    "setup_benchmark_parser",
+]
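This +16 hunk matches local_deep_research/benchmarks/cli/__init__.py in the file list; it only re-exports benchmark_main and setup_benchmark_parser. A short sketch (assumed, not taken from the diff) of how a host CLI could mount the benchmark subcommands through the re-exported setup_benchmark_parser:

    # Hedged sketch: embedding the benchmark subcommands in another argparse-based CLI.
    import argparse

    from local_deep_research.benchmarks.cli import setup_benchmark_parser

    parser = argparse.ArgumentParser(prog="my-tool")
    subparsers = parser.add_subparsers(dest="command", required=True)
    setup_benchmark_parser(subparsers)  # adds simpleqa, browsecomp, list, compare

    # Parsing only; args.func(args) would dispatch to the chosen benchmark command.
    args = parser.parse_args(["simpleqa", "--examples", "5", "--no-eval"])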
@@ -0,0 +1,338 @@
+"""
+Command-line interface for benchmarking.
+
+This module provides CLI commands for running benchmarks.
+"""
+
+import argparse
+import logging
+
+from .. import (
+    get_available_datasets,
+    run_browsecomp_benchmark,
+    run_simpleqa_benchmark,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def setup_benchmark_parser(subparsers):
+    """
+    Set up the benchmark CLI commands.
+
+    Args:
+        subparsers: argparse subparsers object to add commands to
+    """
+    # Common benchmark arguments
+    benchmark_parent = argparse.ArgumentParser(add_help=False)
+    benchmark_parent.add_argument(
+        "--examples",
+        type=int,
+        default=100,
+        help="Number of examples to run (default: 100)",
+    )
+    benchmark_parent.add_argument(
+        "--iterations",
+        type=int,
+        default=3,
+        help="Number of search iterations (default: 3)",
+    )
+    benchmark_parent.add_argument(
+        "--questions", type=int, default=3, help="Questions per iteration (default: 3)"
+    )
+    benchmark_parent.add_argument(
+        "--search-tool",
+        type=str,
+        default="searxng",
+        help="Search tool to use (default: searxng)",
+    )
+    benchmark_parent.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/benchmark_results",
+        help="Directory to save results (default: data/benchmark_results)",
+    )
+    benchmark_parent.add_argument(
+        "--human-eval",
+        action="store_true",
+        help="Use human evaluation instead of automated",
+    )
+    benchmark_parent.add_argument(
+        "--eval-model", type=str, help="Model to use for evaluation"
+    )
+    benchmark_parent.add_argument(
+        "--eval-provider", type=str, help="Provider to use for evaluation"
+    )
+    benchmark_parent.add_argument(
+        "--custom-dataset", type=str, help="Path to custom dataset"
+    )
+    benchmark_parent.add_argument(
+        "--no-eval", action="store_true", help="Skip evaluation phase"
+    )
+
+    # Add model configuration options for the search system
+    benchmark_parent.add_argument(
+        "--search-model", type=str, help="Model to use for the search system"
+    )
+    benchmark_parent.add_argument(
+        "--search-provider", type=str, help="Provider to use for the search system"
+    )
+    benchmark_parent.add_argument(
+        "--endpoint-url", type=str, help="Endpoint URL for OpenRouter or other API services"
+    )
+    benchmark_parent.add_argument(
+        "--search-strategy", type=str, default="source_based",
+        choices=["source_based", "standard", "rapid", "parallel", "iterdrag"],
+        help="Search strategy to use (default: source_based)"
+    )
+
+    # SimpleQA benchmark command
+    simpleqa_parser = subparsers.add_parser(
+        "simpleqa", parents=[benchmark_parent], help="Run SimpleQA benchmark"
+    )
+    simpleqa_parser.set_defaults(func=run_simpleqa_cli)
+
+    # BrowseComp benchmark command
+    browsecomp_parser = subparsers.add_parser(
+        "browsecomp", parents=[benchmark_parent], help="Run BrowseComp benchmark"
+    )
+    browsecomp_parser.set_defaults(func=run_browsecomp_cli)
+
+    # List available benchmarks command
+    list_parser = subparsers.add_parser("list", help="List available benchmarks")
+    list_parser.set_defaults(func=list_benchmarks_cli)
+
+    # Compare configurations command
+    compare_parser = subparsers.add_parser(
+        "compare", help="Compare multiple search configurations"
+    )
+    compare_parser.add_argument(
+        "--dataset",
+        type=str,
+        default="simpleqa",
+        choices=["simpleqa", "browsecomp"],
+        help="Dataset to use for comparison",
+    )
+    compare_parser.add_argument(
+        "--examples",
+        type=int,
+        default=20,
+        help="Number of examples for each configuration (default: 20)",
+    )
+    compare_parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/benchmark_results/comparison",
+        help="Directory to save comparison results",
+    )
+    compare_parser.set_defaults(func=compare_configs_cli)
+
+
+def run_simpleqa_cli(args):
+    """
+    CLI handler for SimpleQA benchmark.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Set up search configuration
+    search_config = {
+        "iterations": args.iterations,
+        "questions_per_iteration": args.questions,
+        "search_tool": args.search_tool,
+    }
+
+    # Add model configuration if provided
+    if hasattr(args, "search_model") and args.search_model:
+        search_config["model_name"] = args.search_model
+    if hasattr(args, "search_provider") and args.search_provider:
+        search_config["provider"] = args.search_provider
+    if hasattr(args, "endpoint_url") and args.endpoint_url:
+        search_config["openai_endpoint_url"] = args.endpoint_url
+    if hasattr(args, "search_strategy") and args.search_strategy:
+        search_config["search_strategy"] = args.search_strategy
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if args.eval_model or args.eval_provider:
+        evaluation_config = {}
+        if args.eval_model:
+            evaluation_config["model_name"] = args.eval_model
+        if args.eval_provider:
+            evaluation_config["provider"] = args.eval_provider
+
+    # Run the benchmark
+    result = run_simpleqa_benchmark(
+        num_examples=args.examples,
+        dataset_path=args.custom_dataset,
+        output_dir=args.output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=args.human_eval,
+        run_evaluation=not args.no_eval,
+    )
+
+    # Print results summary
+    if "metrics" in result:
+        print("\nSimpleQA Benchmark Results:")
+        print(f"  Accuracy: {result['metrics'].get('accuracy', 0):.3f}")
+        print(f"  Total examples: {result['total_examples']}")
+        print(f"  Correct answers: {result['metrics'].get('correct', 0)}")
+        print(
+            f"  Average time: {result['metrics'].get('average_processing_time', 0):.2f}s"
+        )
+        print(f"\nReport saved to: {result.get('report_path', 'N/A')}")
+    else:
+        print("\nSimpleQA Benchmark Completed (no evaluation)")
+        print(f"  Total examples: {result['total_examples']}")
+        print(f"  Results saved to: {result.get('results_path', 'N/A')}")
+
+
+def run_browsecomp_cli(args):
+    """
+    CLI handler for BrowseComp benchmark.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Set up search configuration
+    search_config = {
+        "iterations": args.iterations,
+        "questions_per_iteration": args.questions,
+        "search_tool": args.search_tool,
+    }
+
+    # Add model configuration if provided
+    if hasattr(args, "search_model") and args.search_model:
+        search_config["model_name"] = args.search_model
+    if hasattr(args, "search_provider") and args.search_provider:
+        search_config["provider"] = args.search_provider
+    if hasattr(args, "endpoint_url") and args.endpoint_url:
+        search_config["openai_endpoint_url"] = args.endpoint_url
+    if hasattr(args, "search_strategy") and args.search_strategy:
+        search_config["search_strategy"] = args.search_strategy
+
+    # Set up evaluation configuration if needed
+    evaluation_config = None
+    if args.eval_model or args.eval_provider:
+        evaluation_config = {}
+        if args.eval_model:
+            evaluation_config["model_name"] = args.eval_model
+        if args.eval_provider:
+            evaluation_config["provider"] = args.eval_provider
+
+    # Run the benchmark
+    result = run_browsecomp_benchmark(
+        num_examples=args.examples,
+        dataset_path=args.custom_dataset,
+        output_dir=args.output_dir,
+        search_config=search_config,
+        evaluation_config=evaluation_config,
+        human_evaluation=args.human_eval,
+        run_evaluation=not args.no_eval,
+    )
+
+    # Print results summary
+    if "metrics" in result:
+        print("\nBrowseComp Benchmark Results:")
+        print(f"  Accuracy: {result['metrics'].get('accuracy', 0):.3f}")
+        print(f"  Total examples: {result['total_examples']}")
+        print(f"  Correct answers: {result['metrics'].get('correct', 0)}")
+        print(
+            f"  Average time: {result['metrics'].get('average_processing_time', 0):.2f}s"
+        )
+        print(f"\nReport saved to: {result.get('report_path', 'N/A')}")
+    else:
+        print("\nBrowseComp Benchmark Completed (no evaluation)")
+        print(f"  Total examples: {result['total_examples']}")
+        print(f"  Results saved to: {result.get('results_path', 'N/A')}")
+
+
+def list_benchmarks_cli(args):
+    """
+    CLI handler for listing available benchmarks.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    datasets = get_available_datasets()
+
+    print("\nAvailable Benchmarks:")
+    for dataset in datasets:
+        print(f"  {dataset['id']}: {dataset['name']}")
+        print(f"    {dataset['description']}")
+        print(f"    URL: {dataset['url']}")
+        print()
+
+
+def compare_configs_cli(args):
+    """
+    CLI handler for comparing multiple configurations.
+
+    Args:
+        args: Parsed command-line arguments
+    """
+    # Import the compare configurations function
+    from ...api.benchmark_functions import compare_configurations
+
+    # Run the comparison
+    result = compare_configurations(
+        dataset_type=args.dataset,
+        num_examples=args.examples,
+        output_dir=args.output_dir,
+    )
+
+    # Print results summary
+    print("\nConfiguration Comparison Results:")
+    print(f"  Dataset: {args.dataset}")
+    print(f"  Configurations tested: {result['configurations_tested']}")
+    print(f"  Report saved to: {result['report_path']}")
+
+    # Print brief comparison table
+    print("\nResults Summary:")
+    print("Configuration | Accuracy | Avg. Time")
+    print("--------------- | -------- | ---------")
+    for res in result["results"]:
+        name = res["configuration_name"]
+        acc = res.get("metrics", {}).get("accuracy", 0)
+        time = res.get("metrics", {}).get("average_processing_time", 0)
+        print(f"{name:15} | {acc:.3f} | {time:.2f}s")
+
+
+def main():
+    """
+    Main entry point for benchmark CLI.
+    """
+    parser = argparse.ArgumentParser(
+        description="Local Deep Research Benchmarking Tool", prog="ldr-benchmark"
+    )
+
+    # Set up logging
+    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
+
+    # Create subparsers
+    subparsers = parser.add_subparsers(
+        dest="command", help="Command to run", required=True
+    )
+
+    # Set up commands
+    setup_benchmark_parser(subparsers)
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Set up logging
+    log_level = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig(
+        level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+
+    # Run command
+    if hasattr(args, "func"):
+        args.func(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
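A hedged sketch of invoking this CLI module directly. The parser declares prog="ldr-benchmark", but entry_points.txt is unchanged in this diff, so whether a console script of that name is actually registered is not shown here; the sketch therefore calls main() in-process with a simulated argv. Module path and argument values are assumptions for illustration.

    # Assumed invocation path; the module location is taken from the file list above.
    import sys

    from local_deep_research.benchmarks.cli.benchmark_commands import main

    # Roughly equivalent to: ldr-benchmark simpleqa --examples 5 --search-tool searxng --no-eval
    sys.argv = [
        "ldr-benchmark",
        "simpleqa",
        "--examples", "5",
        "--search-tool", "searxng",
        "--no-eval",
    ]
    main()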