local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. local_deep_research/__init__.py +1 -0
  2. local_deep_research/__version__.py +1 -1
  3. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  4. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  5. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  6. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  7. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  8. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  9. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  10. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  11. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  12. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  13. local_deep_research/api/benchmark_functions.py +288 -0
  14. local_deep_research/api/research_functions.py +8 -4
  15. local_deep_research/benchmarks/README.md +162 -0
  16. local_deep_research/benchmarks/__init__.py +51 -0
  17. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  18. local_deep_research/benchmarks/cli/__init__.py +16 -0
  19. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  20. local_deep_research/benchmarks/cli.py +347 -0
  21. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  22. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  23. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  24. local_deep_research/benchmarks/datasets/base.py +295 -0
  25. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  26. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  27. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  28. local_deep_research/benchmarks/datasets/utils.py +116 -0
  29. local_deep_research/benchmarks/datasets.py +31 -0
  30. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  31. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  32. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  33. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  34. local_deep_research/benchmarks/evaluators/base.py +74 -0
  35. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  36. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  37. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  38. local_deep_research/benchmarks/graders.py +410 -0
  39. local_deep_research/benchmarks/metrics/README.md +80 -0
  40. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  41. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  42. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  43. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  44. local_deep_research/benchmarks/metrics.py +11 -0
  45. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  46. local_deep_research/benchmarks/optimization/api.py +274 -0
  47. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  48. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  49. local_deep_research/benchmarks/runners.py +434 -0
  50. local_deep_research/benchmarks/templates.py +65 -0
  51. local_deep_research/config/llm_config.py +26 -23
  52. local_deep_research/config/search_config.py +1 -5
  53. local_deep_research/defaults/default_settings.json +108 -7
  54. local_deep_research/search_system.py +16 -8
  55. local_deep_research/utilities/db_utils.py +3 -6
  56. local_deep_research/utilities/es_utils.py +441 -0
  57. local_deep_research/utilities/log_utils.py +36 -0
  58. local_deep_research/utilities/search_utilities.py +8 -9
  59. local_deep_research/web/app.py +15 -10
  60. local_deep_research/web/app_factory.py +9 -12
  61. local_deep_research/web/database/migrations.py +8 -5
  62. local_deep_research/web/database/models.py +20 -0
  63. local_deep_research/web/database/schema_upgrade.py +5 -8
  64. local_deep_research/web/models/database.py +15 -18
  65. local_deep_research/web/routes/benchmark_routes.py +427 -0
  66. local_deep_research/web/routes/research_routes.py +13 -17
  67. local_deep_research/web/routes/settings_routes.py +264 -67
  68. local_deep_research/web/services/research_service.py +58 -73
  69. local_deep_research/web/services/settings_manager.py +1 -4
  70. local_deep_research/web/services/settings_service.py +4 -6
  71. local_deep_research/web/static/css/styles.css +12 -0
  72. local_deep_research/web/static/js/components/logpanel.js +164 -155
  73. local_deep_research/web/static/js/components/research.js +44 -3
  74. local_deep_research/web/static/js/components/settings.js +27 -0
  75. local_deep_research/web/static/js/services/socket.js +47 -0
  76. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  77. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  78. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  79. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  80. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  81. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  82. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  83. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  84. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  85. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  86. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  87. local_deep_research/web_search_engines/search_engine_factory.py +30 -11
  88. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
  91. local_deep_research/app.py +0 -8
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
  93. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
  94. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/api/benchmark_functions.py
@@ -0,0 +1,288 @@
+ """
+ API functions for benchmarking.
+
+ This module provides functions for running benchmarks programmatically.
+ """
+
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ from ..benchmarks import (
+     calculate_metrics,
+     generate_report,
+     run_benchmark,
+     run_browsecomp_benchmark,
+     run_simpleqa_benchmark,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def evaluate_simpleqa(
+     num_examples: int = 100,
+     search_iterations: int = 3,
+     questions_per_iteration: int = 3,
+     search_tool: str = "searxng",
+     human_evaluation: bool = False,
+     evaluation_model: Optional[str] = None,
+     evaluation_provider: Optional[str] = None,
+     output_dir: str = "benchmark_results",
+ ) -> Dict[str, Any]:
+     """
+     Run SimpleQA benchmark evaluation.
+
+     Args:
+         num_examples: Number of examples to evaluate
+         search_iterations: Number of search iterations per query
+         questions_per_iteration: Number of questions per iteration
+         search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+         human_evaluation: Whether to use human evaluation
+         evaluation_model: Optional custom model for evaluation
+         evaluation_provider: Optional custom provider for evaluation
+         output_dir: Directory to save results
+
+     Returns:
+         Dictionary with benchmark results
+     """
+     logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
+
+     # Set up search configuration
+     search_config = {
+         "iterations": search_iterations,
+         "questions_per_iteration": questions_per_iteration,
+         "search_tool": search_tool,
+     }
+
+     # Set up evaluation configuration if needed
+     evaluation_config = None
+     if evaluation_model or evaluation_provider:
+         evaluation_config = {}
+         if evaluation_model:
+             evaluation_config["model_name"] = evaluation_model
+         if evaluation_provider:
+             evaluation_config["provider"] = evaluation_provider
+
+     # Run the benchmark
+     results = run_simpleqa_benchmark(
+         num_examples=num_examples,
+         output_dir=output_dir,
+         search_config=search_config,
+         evaluation_config=evaluation_config,
+         human_evaluation=human_evaluation,
+     )
+
+     return results
+
+
+ def evaluate_browsecomp(
+     num_examples: int = 100,
+     search_iterations: int = 3,
+     questions_per_iteration: int = 3,
+     search_tool: str = "searxng",
+     human_evaluation: bool = False,
+     evaluation_model: Optional[str] = None,
+     evaluation_provider: Optional[str] = None,
+     output_dir: str = "benchmark_results",
+ ) -> Dict[str, Any]:
+     """
+     Run BrowseComp benchmark evaluation.
+
+     Args:
+         num_examples: Number of examples to evaluate
+         search_iterations: Number of search iterations per query
+         questions_per_iteration: Number of questions per iteration
+         search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
+         human_evaluation: Whether to use human evaluation
+         evaluation_model: Optional custom model for evaluation
+         evaluation_provider: Optional custom provider for evaluation
+         output_dir: Directory to save results
+
+     Returns:
+         Dictionary with benchmark results
+     """
+     logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
+
+     # Set up search configuration
+     search_config = {
+         "iterations": search_iterations,
+         "questions_per_iteration": questions_per_iteration,
+         "search_tool": search_tool,
+     }
+
+     # Set up evaluation configuration if needed
+     evaluation_config = None
+     if evaluation_model or evaluation_provider:
+         evaluation_config = {}
+         if evaluation_model:
+             evaluation_config["model_name"] = evaluation_model
+         if evaluation_provider:
+             evaluation_config["provider"] = evaluation_provider
+
+     # Run the benchmark
+     results = run_browsecomp_benchmark(
+         num_examples=num_examples,
+         output_dir=output_dir,
+         search_config=search_config,
+         evaluation_config=evaluation_config,
+         human_evaluation=human_evaluation,
+     )
+
+     return results
+
+
+ def get_available_benchmarks() -> List[Dict[str, str]]:
+     """
+     Get information about available benchmarks.
+
+     Returns:
+         List of dictionaries with benchmark information
+     """
+     return [
+         {
+             "id": "simpleqa",
+             "name": "SimpleQA",
+             "description": "Benchmark for factual question answering",
+             "recommended_examples": 100,
+         },
+         {
+             "id": "browsecomp",
+             "name": "BrowseComp",
+             "description": "Benchmark for web browsing comprehension",
+             "recommended_examples": 100,
+         },
+     ]
+
+
+ def compare_configurations(
+     dataset_type: str = "simpleqa",
+     num_examples: int = 20,
+     configurations: List[Dict[str, Any]] = None,
+     output_dir: str = "benchmark_comparisons",
+ ) -> Dict[str, Any]:
+     """
+     Compare multiple search configurations on the same benchmark.
+
+     Args:
+         dataset_type: Type of dataset to use
+         num_examples: Number of examples to evaluate
+         configurations: List of search configurations to compare
+         output_dir: Directory to save results
+
+     Returns:
+         Dictionary with comparison results
+     """
+     if not configurations:
+         # Default configurations to compare
+         configurations = [
+             {
+                 "name": "Base Config",
+                 "search_tool": "searxng",
+                 "iterations": 1,
+                 "questions_per_iteration": 3,
+             },
+             {
+                 "name": "More Iterations",
+                 "search_tool": "searxng",
+                 "iterations": 3,
+                 "questions_per_iteration": 3,
+             },
+             {
+                 "name": "More Questions",
+                 "search_tool": "searxng",
+                 "iterations": 1,
+                 "questions_per_iteration": 5,
+             },
+         ]
+
+     # Create output directory
+     import os
+
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Run benchmarks for each configuration
+     results = []
+     for config in configurations:
+         config_name = config.pop("name", f"Config-{len(results)}")
+
+         logger.info(f"Running benchmark with configuration: {config_name}")
+
+         search_config = {
+             "iterations": config.pop("iterations", 1),
+             "questions_per_iteration": config.pop("questions_per_iteration", 3),
+             "search_tool": config.pop("search_tool", "searxng"),
+         }
+
+         # Add any remaining config items
+         for key, value in config.items():
+             search_config[key] = value
+
+         # Run benchmark with this configuration
+         benchmark_result = run_benchmark(
+             dataset_type=dataset_type,
+             num_examples=num_examples,
+             output_dir=os.path.join(output_dir, config_name.replace(" ", "_")),
+             search_config=search_config,
+             run_evaluation=True,
+         )
+
+         # Add configuration name to results
+         benchmark_result["configuration_name"] = config_name
+         benchmark_result["search_config"] = search_config
+
+         results.append(benchmark_result)
+
+     # Generate comparison report
+     import time
+
+     timestamp = time.strftime("%Y%m%d_%H%M%S")
+     report_file = os.path.join(output_dir, f"comparison_{dataset_type}_{timestamp}.md")
+
+     with open(report_file, "w") as f:
+         f.write(f"# Configuration Comparison - {dataset_type.capitalize()}\n\n")
+
+         # Write summary table
+         f.write("## Summary\n\n")
+         f.write("| Configuration | Accuracy | Avg. Time | Examples |\n")
+         f.write("|---------------|----------|-----------|----------|\n")
+
+         for result in results:
+             accuracy = result.get("metrics", {}).get("accuracy", 0)
+             avg_time = result.get("metrics", {}).get("average_processing_time", 0)
+             examples = result.get("total_examples", 0)
+
+             f.write(
+                 f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"
+             )
+
+         f.write("\n## Configuration Details\n\n")
+
+         for result in results:
+             f.write(f"### {result['configuration_name']}\n\n")
+
+             config = result.get("search_config", {})
+             f.write("```\n")
+             for key, value in config.items():
+                 f.write(f"{key}: {value}\n")
+             f.write("```\n\n")
+
+     logger.info(f"Comparison report saved to {report_file}")
+
+     return {
+         "status": "complete",
+         "dataset_type": dataset_type,
+         "configurations_tested": len(configurations),
+         "report_path": report_file,
+         "results": results,
+     }
+
+
+ # Export the API functions
+ __all__ = [
+     "evaluate_simpleqa",
+     "evaluate_browsecomp",
+     "get_available_benchmarks",
+     "compare_configurations",
+     "run_benchmark",  # For advanced users
+     "calculate_metrics",
+     "generate_report",
+ ]
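
Note: the dictionary returned by `compare_configurations` above exposes `status`, `report_path`, and a `results` list whose entries carry `configuration_name` and `metrics`. A minimal consumption sketch, using only keys defined in the new module (it is illustrative and not part of the package):

```python
from local_deep_research.api.benchmark_functions import compare_configurations

# Run the built-in default configurations on a small SimpleQA sample.
comparison = compare_configurations(dataset_type="simpleqa", num_examples=5)

if comparison["status"] == "complete":
    print(f"Report written to: {comparison['report_path']}")
    for run in comparison["results"]:
        # Each entry carries the configuration name plus its metrics dict.
        accuracy = run.get("metrics", {}).get("accuracy", 0)
        print(f"{run['configuration_name']}: accuracy={accuracy:.3f}")
```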
local_deep_research/api/research_functions.py
@@ -3,17 +3,16 @@ API module for Local Deep Research.
  Provides programmatic access to search and research capabilities.
  """

- import logging
  from typing import Any, Callable, Dict, Optional

+ from loguru import logger
+
  from ..config.llm_config import get_llm
  from ..config.search_config import get_search
  from ..report_generator import IntegratedReportGenerator
  from ..search_system import AdvancedSearchSystem
  from ..utilities.search_utilities import remove_think_tags

- logger = logging.getLogger(__name__)
-

  def _init_search_system(
      model_name: str | None = None,
@@ -24,6 +23,7 @@ def _init_search_system(
      search_tool: Optional[str] = None,
      iterations: int = 1,
      questions_per_iteration: int = 1,
+     search_strategy: str = "source_based",
  ) -> AdvancedSearchSystem:
      """
      Initializes the advanced search system with specified parameters. This function sets up
@@ -41,6 +41,7 @@ def _init_search_system(
          search_tool: Search engine to use (auto, wikipedia, arxiv, etc.). If None, uses default
          iterations: Number of research cycles to perform
          questions_per_iteration: Number of questions to generate per cycle
+         search_strategy: The name of the search strategy to use.

      Returns:
          AdvancedSearchSystem: An instance of the configured AdvancedSearchSystem.
@@ -64,7 +65,10 @@ def _init_search_system(
      )

      # Create search system with custom parameters
-     system = AdvancedSearchSystem(llm=llm, search=search_engine)
+     logger.info("Search strategy: {}", search_strategy)
+     system = AdvancedSearchSystem(
+         llm=llm, search=search_engine, strategy_name=search_strategy
+     )

      # Override default settings with user-provided values
      system.max_iterations = iterations
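
The change above routes a `search_strategy` name from the API layer into `AdvancedSearchSystem` via its `strategy_name` keyword. A minimal sketch of the equivalent direct construction, assuming `get_llm()` and `get_search()` accept their defaults (the diffed module passes explicit parameters to both):

```python
from local_deep_research.config.llm_config import get_llm
from local_deep_research.config.search_config import get_search
from local_deep_research.search_system import AdvancedSearchSystem

# Build the LLM and search engine; default arguments are assumed here.
llm = get_llm()
search_engine = get_search()

# strategy_name selects the search strategy, e.g. the new "source_based" default.
system = AdvancedSearchSystem(
    llm=llm, search=search_engine, strategy_name="source_based"
)
system.max_iterations = 2  # same override pattern the API helper uses
```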
local_deep_research/benchmarks/README.md
@@ -0,0 +1,162 @@
+ # LDR Benchmarking Framework
+
+ The Local Deep Research (LDR) Benchmarking Framework allows you to evaluate and compare LDR's performance on standardized benchmarks.
+
+ ## Features
+
+ - Run benchmarks on SimpleQA and BrowseComp datasets
+ - Configure search parameters (iterations, questions per iteration, search tool)
+ - Use automated evaluation with Claude 3.7 Sonnet (default) or other models
+ - Generate detailed reports with metrics and examples
+ - Compare multiple configurations to find optimal settings
+ - API, CLI, and web interface integration
+
+ ## Benchmark Datasets
+
+ ### SimpleQA
+
+ A straightforward question-answering benchmark with factual questions. This benchmark tests LDR's ability to find and synthesize factual information.
+
+ ### BrowseComp
+
+ A web browsing comprehension benchmark with more complex questions requiring synthesis across multiple sources. This benchmark tests LDR's ability to understand and navigate complex information needs.
+
+ ## Usage
+
+ ### Programmatic API
+
+ ```python
+ from local_deep_research.api.benchmark_functions import evaluate_simpleqa
+
+ # Run SimpleQA benchmark with 20 examples
+ result = evaluate_simpleqa(
+     num_examples=20,
+     search_iterations=3,
+     questions_per_iteration=3,
+     search_tool="searxng"
+ )
+
+ # Print accuracy
+ print(f"Accuracy: {result['metrics']['accuracy']:.3f}")
+ ```
+
+ ### Command Line Interface
+
+ ```bash
+ # Run SimpleQA benchmark
+ python -m local_deep_research.cli.benchmark_commands simpleqa --examples 20 --iterations 3
+
+ # Run BrowseComp benchmark
+ python -m local_deep_research.cli.benchmark_commands browsecomp --examples 10 --search-tool wikipedia
+
+ # Compare configurations
+ python -m local_deep_research.cli.benchmark_commands compare --dataset simpleqa --examples 5
+ ```
+
+ ### Web Interface
+
+ The benchmark dashboard is available at `/benchmark` in the LDR web interface. You can:
+
+ 1. Select a benchmark to run
+ 2. Configure parameters
+ 3. Run the benchmark
+ 4. View results and reports
+
+ ## Evaluation
+
+ By default, benchmarks are evaluated using Claude 3.7 Sonnet via OpenRouter. You can customize the evaluation model:
+
+ ```python
+ # Use a different model for evaluation
+ result = evaluate_simpleqa(
+     num_examples=10,
+     evaluation_model="gpt-4o",
+     evaluation_provider="openai"
+ )
+ ```
+
+ You can also use human evaluation:
+
+ ```python
+ # Use human evaluation
+ result = evaluate_simpleqa(
+     num_examples=5,
+     human_evaluation=True
+ )
+ ```
+
+ ## Configuration Comparison
+
+ Compare multiple configurations to find optimal settings:
+
+ ```python
+ from local_deep_research.api.benchmark_functions import compare_configurations
+
+ # Define configurations to compare
+ configurations = [
+     {
+         "name": "Base Config",
+         "search_tool": "searxng",
+         "iterations": 1,
+         "questions_per_iteration": 3
+     },
+     {
+         "name": "More Iterations",
+         "search_tool": "searxng",
+         "iterations": 3,
+         "questions_per_iteration": 3
+     },
+     {
+         "name": "Different Search Engine",
+         "search_tool": "wikipedia",
+         "iterations": 1,
+         "questions_per_iteration": 3
+     }
+ ]
+
+ # Run comparison
+ result = compare_configurations(
+     dataset_type="simpleqa",
+     num_examples=10,
+     configurations=configurations
+ )
+ ```
+
+ ## Output Format
+
+ Benchmark results include:
+
+ - **metrics**: Accuracy, processing time, confidence scores
+ - **report_path**: Path to generated report
+ - **results_path**: Path to raw results file
+ - **total_examples**: Number of examples processed
+ - **status**: Completion status
+
+ ## Example Reports
+
+ Reports include:
+
+ - Overall accuracy and metrics
+ - Configuration details
+ - Example correct and incorrect answers
+ - Time and date information
+
+ ## Integration with LDR Web App
+
+ The benchmarking framework is fully integrated with the LDR web interface. You can run benchmarks and view results directly in the web app.
+
+ ## Adding Custom Benchmarks
+
+ To add a custom benchmark:
+
+ 1. Create a dataset loader in `datasets.py`
+ 2. Add evaluation templates in `templates.py`
+ 3. Create benchmark runners in `runners.py`
+ 4. Expose the benchmark through the API in `api/benchmark_functions.py`
+
+ ## Performance Considerations
+
+ - Running benchmarks can be resource-intensive
+ - Start with a small number of examples for testing
+ - Full benchmarks with 100+ examples may take several hours to complete
+ - Consider using a more powerful machine for large benchmarks
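
The README's "Adding Custom Benchmarks" steps above are intentionally high-level. As a purely illustrative sketch (every name below is hypothetical, not an interface defined by the package), a dataset loader for step 1 might return question/answer records like this:

```python
from typing import Any, Dict, List


def load_my_custom_dataset(num_examples: int = 20) -> List[Dict[str, Any]]:
    """Hypothetical loader returning benchmark examples as plain dicts."""
    examples = [
        {"id": "ex-1", "question": "Which element has the atomic number 26?",
         "answer": "Iron"},
        {"id": "ex-2", "question": "In what year was the first LIGO detection announced?",
         "answer": "2016"},
    ]
    return examples[:num_examples]
```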
local_deep_research/benchmarks/__init__.py
@@ -0,0 +1,51 @@
+ """
+ Benchmarking module for Local Deep Research.
+
+ This module provides tools for evaluating LDR's performance on standard benchmarks
+ and for optimizing performance through parameter tuning.
+ """
+
+ __version__ = "0.2.0"
+
+ # Core benchmark functionality
+ from .datasets import get_available_datasets, load_dataset
+ from .metrics import (
+     calculate_metrics,
+     calculate_quality_metrics,
+     calculate_speed_metrics,
+     calculate_resource_metrics,
+     calculate_combined_score,
+     generate_report,
+ )
+ from .runners import run_benchmark, run_browsecomp_benchmark, run_simpleqa_benchmark
+
+ # Optimization functionality
+ from .optimization import (
+     optimize_parameters,
+     optimize_for_quality,
+     optimize_for_speed,
+     optimize_for_efficiency,
+ )
+
+ __all__ = [
+     # Core benchmark functionality
+     "run_benchmark",
+     "run_simpleqa_benchmark",
+     "run_browsecomp_benchmark",
+     "load_dataset",
+     "get_available_datasets",
+     "calculate_metrics",
+     "generate_report",
+
+     # Metrics for optimization
+     "calculate_quality_metrics",
+     "calculate_speed_metrics",
+     "calculate_resource_metrics",
+     "calculate_combined_score",
+
+     # Optimization functionality
+     "optimize_parameters",
+     "optimize_for_quality",
+     "optimize_for_speed",
+     "optimize_for_efficiency",
+ ]
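
Finally, the `benchmarks` package above re-exports the runner functions directly. A minimal sketch of calling `run_simpleqa_benchmark` with the same keyword arguments that `evaluate_simpleqa` passes earlier in this diff (any additional runner parameters are not shown and are not assumed here):

```python
from local_deep_research.benchmarks import run_simpleqa_benchmark

# Mirrors the call made by evaluate_simpleqa() in api/benchmark_functions.py.
results = run_simpleqa_benchmark(
    num_examples=10,
    output_dir="benchmark_results",
    search_config={
        "iterations": 1,
        "questions_per_iteration": 3,
        "search_tool": "searxng",
    },
    evaluation_config=None,  # fall back to the default evaluation model
    human_evaluation=False,
)

print(results.get("metrics", {}).get("accuracy"))
```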