local-deep-research 0.3.12__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
- local_deep_research/__init__.py +1 -0
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
- local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
- local_deep_research/advanced_search_system/findings/repository.py +0 -3
- local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
- local_deep_research/api/benchmark_functions.py +288 -0
- local_deep_research/api/research_functions.py +8 -4
- local_deep_research/benchmarks/README.md +162 -0
- local_deep_research/benchmarks/__init__.py +51 -0
- local_deep_research/benchmarks/benchmark_functions.py +353 -0
- local_deep_research/benchmarks/cli/__init__.py +16 -0
- local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
- local_deep_research/benchmarks/cli.py +347 -0
- local_deep_research/benchmarks/comparison/__init__.py +12 -0
- local_deep_research/benchmarks/comparison/evaluator.py +768 -0
- local_deep_research/benchmarks/datasets/__init__.py +53 -0
- local_deep_research/benchmarks/datasets/base.py +295 -0
- local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
- local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
- local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
- local_deep_research/benchmarks/datasets/utils.py +116 -0
- local_deep_research/benchmarks/datasets.py +31 -0
- local_deep_research/benchmarks/efficiency/__init__.py +14 -0
- local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
- local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
- local_deep_research/benchmarks/evaluators/__init__.py +18 -0
- local_deep_research/benchmarks/evaluators/base.py +74 -0
- local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
- local_deep_research/benchmarks/evaluators/composite.py +121 -0
- local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
- local_deep_research/benchmarks/graders.py +410 -0
- local_deep_research/benchmarks/metrics/README.md +80 -0
- local_deep_research/benchmarks/metrics/__init__.py +24 -0
- local_deep_research/benchmarks/metrics/calculation.py +385 -0
- local_deep_research/benchmarks/metrics/reporting.py +155 -0
- local_deep_research/benchmarks/metrics/visualization.py +205 -0
- local_deep_research/benchmarks/metrics.py +11 -0
- local_deep_research/benchmarks/optimization/__init__.py +32 -0
- local_deep_research/benchmarks/optimization/api.py +274 -0
- local_deep_research/benchmarks/optimization/metrics.py +20 -0
- local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
- local_deep_research/benchmarks/runners.py +434 -0
- local_deep_research/benchmarks/templates.py +65 -0
- local_deep_research/config/llm_config.py +26 -23
- local_deep_research/config/search_config.py +1 -5
- local_deep_research/defaults/default_settings.json +108 -7
- local_deep_research/search_system.py +16 -8
- local_deep_research/utilities/db_utils.py +3 -6
- local_deep_research/utilities/es_utils.py +441 -0
- local_deep_research/utilities/log_utils.py +36 -0
- local_deep_research/utilities/search_utilities.py +8 -9
- local_deep_research/web/app.py +15 -10
- local_deep_research/web/app_factory.py +9 -12
- local_deep_research/web/database/migrations.py +8 -5
- local_deep_research/web/database/models.py +20 -0
- local_deep_research/web/database/schema_upgrade.py +5 -8
- local_deep_research/web/models/database.py +15 -18
- local_deep_research/web/routes/benchmark_routes.py +427 -0
- local_deep_research/web/routes/research_routes.py +13 -17
- local_deep_research/web/routes/settings_routes.py +264 -67
- local_deep_research/web/services/research_service.py +58 -73
- local_deep_research/web/services/settings_manager.py +1 -4
- local_deep_research/web/services/settings_service.py +4 -6
- local_deep_research/web/static/css/styles.css +12 -0
- local_deep_research/web/static/js/components/logpanel.js +164 -155
- local_deep_research/web/static/js/components/research.js +44 -3
- local_deep_research/web/static/js/components/settings.js +27 -0
- local_deep_research/web/static/js/services/socket.js +47 -0
- local_deep_research/web_search_engines/default_search_engines.py +38 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
- local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
- local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
- local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
- local_deep_research/web_search_engines/search_engine_base.py +22 -5
- local_deep_research/web_search_engines/search_engine_factory.py +30 -11
- local_deep_research/web_search_engines/search_engines_config.py +14 -1
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/METADATA +10 -2
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/RECORD +93 -51
- local_deep_research/app.py +0 -8
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/runners.py (new file)

@@ -0,0 +1,434 @@
+"""
+Benchmark runners for Local Deep Research.
+
+This module provides the main functions for running benchmarks using LDR.
+"""
+
+import json
+import logging
+import os
+import time
+from typing import Any, Callable, Dict, Optional
+
+from ..api import quick_summary
+from .datasets import DEFAULT_DATASET_URLS, load_dataset
+from .datasets.base import DatasetRegistry
+from .graders import extract_answer_from_response, grade_results
+from .metrics import calculate_metrics, generate_report
+from .templates import BROWSECOMP_QUERY_TEMPLATE
+
+logger = logging.getLogger(__name__)
+
+
+def format_query(question: str, dataset_type: str = "simpleqa") -> str:
+    """
+    Format query based on dataset type.
+
+    Args:
+        question: Original question
+        dataset_type: Type of dataset
+
+    Returns:
+        Formatted query for LDR
+    """
+    if dataset_type.lower() == "browsecomp":
+        # BrowseComp requires specific formatting
+        return BROWSECOMP_QUERY_TEMPLATE.format(question=question)
+
+    # Simple format for SimpleQA
+    return question
+
+
+def run_benchmark(
+    dataset_type: str,
+    dataset_path: Optional[str] = None,
+    num_examples: Optional[int] = None,
+    output_dir: str = "benchmark_results",
+    run_evaluation: bool = True,
+    evaluation_config: Optional[Dict[str, Any]] = None,
+    search_config: Optional[Dict[str, Any]] = None,
+    human_evaluation: bool = False,
+    progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
+    seed: int = 42,
+) -> Dict[str, Any]:
+    """
+    Run a benchmark on the specified dataset.
+
+    Args:
+        dataset_type: Type of dataset ("simpleqa" or "browsecomp")
+        dataset_path: Optional custom dataset path
+        num_examples: Number of examples to use
+        output_dir: Directory to save results
+        run_evaluation: Whether to evaluate results
+        evaluation_config: Custom LLM config for evaluation
+        search_config: Custom search parameters
+        human_evaluation: Whether to use human evaluation
+        progress_callback: Optional callback for progress updates
+        seed: Random seed for reproducibility
+
+    Returns:
+        Dictionary with benchmark results and metrics
+    """
+    # Ensure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Default search configuration
+    if not search_config:
+        search_config = {
+            "iterations": 3,
+            "questions_per_iteration": 3,
+            "search_tool": "searxng",
+        }
+
+    # Load dataset using the class-based approach
+    try:
+        # Create the dataset instance from registry
+        dataset_instance = DatasetRegistry.create_dataset(
+            dataset_id=dataset_type.lower(),
+            dataset_path=dataset_path,
+            num_examples=num_examples,
+            seed=seed,
+        )
+        # Load the examples
+        dataset = dataset_instance.load()
+
+        logger.info(f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}")
+    except Exception as e:
+        # Fallback to legacy function if there's any issue
+        logger.warning(f"Error using dataset class: {e}. Falling back to legacy function.")
+        dataset = load_dataset(
+            dataset_type=dataset_type,
+            dataset_path=dataset_path,
+            num_examples=num_examples,
+            seed=seed,
+        )
+
+    # Set up output files
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    results_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_results.jsonl")
+    evaluation_file = os.path.join(
+        output_dir, f"{dataset_type}_{timestamp}_evaluation.jsonl"
+    )
+    report_file = os.path.join(output_dir, f"{dataset_type}_{timestamp}_report.md")
+
+    # Make sure output files don't exist
+    for file in [results_file, evaluation_file, report_file]:
+        if os.path.exists(file):
+            os.remove(file)
+
+    # Progress tracking
+    total_examples = len(dataset)
+
+    if progress_callback:
+        progress_callback(
+            "Starting benchmark",
+            0,
+            {
+                "status": "started",
+                "dataset_type": dataset_type,
+                "total_examples": total_examples,
+            },
+        )
+
+    # Process each example
+    results = []
+
+    for i, example in enumerate(dataset):
+        # Extract question and answer in a way that uses the dataset class when available
+        if 'dataset_instance' in locals() and isinstance(dataset_instance, DatasetRegistry.get_dataset_class(dataset_type.lower())):
+            # Use the dataset class methods to extract question and answer
+            question = dataset_instance.get_question(example)
+            correct_answer = dataset_instance.get_answer(example)
+            logger.debug(f"Using dataset class methods to extract question and answer")
+        else:
+            # Fallback to the legacy approach
+            if dataset_type.lower() == "simpleqa":
+                question = example.get("problem", "")
+                correct_answer = example.get("answer", "")
+            else:  # browsecomp
+                question = example.get("problem", "")
+                # For BrowseComp, the answer should be in "correct_answer" after decryption
+                correct_answer = example.get("correct_answer", "")
+                if not correct_answer and "answer" in example:
+                    # Fallback to "answer" field if "correct_answer" is not available
+                    correct_answer = example.get("answer", "")
+
+        # Update progress
+        if progress_callback:
+            progress_callback(
+                f"Processing example {i + 1}/{total_examples}",
+                int(i / total_examples * 50),
+                {
+                    "status": "processing",
+                    "current": i + 1,
+                    "total": total_examples,
+                    "question": (
+                        question[:50] + "..." if len(question) > 50 else question
+                    ),
+                },
+            )
+
+        logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")
+
+        try:
+            # Format query based on dataset type
+            formatted_query = format_query(question, dataset_type)
+
+            # Time the search
+            start_time = time.time()
+
+            # Get response from LDR
+            search_result = quick_summary(
+                query=formatted_query,
+                iterations=search_config.get("iterations", 3),
+                questions_per_iteration=search_config.get("questions_per_iteration", 3),
+                search_tool=search_config.get("search_tool", "searxng"),
+            )
+
+            end_time = time.time()
+            processing_time = end_time - start_time
+
+            # Extract response and search info
+            response = search_result.get("summary", "")
+
+            # Extract structured information
+            extracted = extract_answer_from_response(response, dataset_type)
+
+            # Format result
+            result = {
+                "id": example.get("id", f"example_{i}"),
+                "problem": question,
+                "correct_answer": correct_answer,
+                "response": response,
+                "extracted_answer": extracted["extracted_answer"],
+                "confidence": extracted["confidence"],
+                "processing_time": processing_time,
+                "sources": search_result.get("sources", []),
+                "search_config": search_config,
+            }
+
+            # Add to results list
+            results.append(result)
+
+            # Write result to file
+            with open(results_file, "a") as f:
+                f.write(json.dumps(result) + "\n")
+
+            # Update progress
+            if progress_callback:
+                progress_callback(
+                    f"Completed example {i + 1}/{total_examples}",
+                    int((i + 0.5) / total_examples * 50),
+                    {
+                        "status": "completed_example",
+                        "current": i + 1,
+                        "total": total_examples,
+                        "result": result,
+                    },
+                )
+
+        except Exception as e:
+            logger.error(f"Error processing example {i + 1}: {str(e)}")
+
+            # Create error result
+            error_result = {
+                "id": example.get("id", f"example_{i}"),
+                "problem": question,
+                "correct_answer": correct_answer,
+                "error": str(e),
+                "processing_time": (
+                    time.time() - start_time if "start_time" in locals() else 0
+                ),
+            }
+
+            # Add to results list
+            results.append(error_result)
+
+            # Write error result to file
+            with open(results_file, "a") as f:
+                f.write(json.dumps(error_result) + "\n")
+
+            # Update progress
+            if progress_callback:
+                progress_callback(
+                    f"Error processing example {i + 1}/{total_examples}",
+                    int((i + 0.5) / total_examples * 50),
+                    {
+                        "status": "error",
+                        "current": i + 1,
+                        "total": total_examples,
+                        "error": str(e),
+                        "result": error_result,
+                    },
+                )
+
+    logger.info(f"Completed processing {total_examples} examples")
+
+    # Run evaluation if requested
+    if run_evaluation:
+        if progress_callback:
+            progress_callback(
+                "Starting evaluation",
+                50,
+                {"status": "evaluating", "results_file": results_file},
+            )
+
+        if human_evaluation:
+            from .graders import human_evaluation as evaluate
+
+            logger.info("Running human evaluation...")
+            evaluation_results = evaluate(
+                results_file=results_file, output_file=evaluation_file, interactive=True
+            )
+        else:
+            logger.info("Running automated evaluation...")
+            try:
+                evaluation_results = grade_results(
+                    results_file=results_file,
+                    output_file=evaluation_file,
+                    dataset_type=dataset_type,
+                    evaluation_config=evaluation_config,
+                    progress_callback=lambda current, total, meta: (
+                        progress_callback(
+                            f"Evaluating {current + 1}/{total}",
+                            50 + int((current + 0.5) / total * 40),
+                            {**meta, "status": "evaluating"},
+                        )
+                        if progress_callback
+                        else None
+                    ),
+                )
+            except Exception as e:
+                logger.error(f"Automated evaluation failed: {str(e)}")
+
+                if progress_callback:
+                    progress_callback(
+                        "Automated evaluation failed. Falling back to human evaluation.",
+                        60,
+                        {"status": "evaluation_fallback", "error": str(e)},
+                    )
+
+                # Ask if user wants to fall back to human evaluation
+                fallback_to_human = False
+                print("\nAutomated evaluation failed with error:", str(e))
+                response = input(
+                    "Do you want to fall back to human evaluation? (y/n): "
+                )
+                fallback_to_human = response.strip().lower() == "y"
+
+                if fallback_to_human:
+                    logger.info("Falling back to human evaluation...")
+                    from .graders import human_evaluation as evaluate
+
+                    evaluation_results = evaluate(
+                        results_file=results_file,
+                        output_file=evaluation_file,
+                        interactive=True,
+                    )
+                else:
+                    logger.info("Skipping evaluation due to error.")
+                    # Create an empty evaluation file to prevent issues
+                    with open(evaluation_file, "w") as f:
+                        f.write("")
+
+                    return {
+                        "status": "evaluation_error",
+                        "dataset_type": dataset_type,
+                        "results_path": results_file,
+                        "evaluation_error": str(e),
+                        "total_examples": total_examples,
+                    }
+
+        # Calculate metrics
+        if progress_callback:
+            progress_callback(
+                "Calculating metrics", 90, {"status": "calculating_metrics"}
+            )
+
+        metrics = calculate_metrics(evaluation_file)
+
+        # Generate report
+        if progress_callback:
+            progress_callback("Generating report", 95, {"status": "generating_report"})
+
+        dataset_name = dataset_type.capitalize()
+        report_path = generate_report(
+            metrics=metrics,
+            results_file=evaluation_file,
+            output_file=report_file,
+            dataset_name=dataset_name,
+            config_info={
+                "Dataset": dataset_path
+                or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
+                "Examples": total_examples,
+                "Iterations": search_config.get("iterations", 3),
+                "Questions per iteration": search_config.get(
+                    "questions_per_iteration", 3
+                ),
+                "Search tool": search_config.get("search_tool", "searxng"),
+                "Evaluation method": "Human" if human_evaluation else "Automated",
+            },
+        )
+
+        # Mark as complete
+        if progress_callback:
+            progress_callback(
+                "Benchmark complete",
+                100,
+                {"status": "complete", "metrics": metrics, "report_path": report_path},
+            )
+
+        return {
+            "status": "complete",
+            "dataset_type": dataset_type,
+            "results_path": results_file,
+            "evaluation_path": evaluation_file,
+            "report_path": report_path,
+            "metrics": metrics,
+            "total_examples": total_examples,
+            "accuracy": metrics.get("accuracy", 0),
+        }
+
+    else:
+        # No evaluation, just return results
+        if progress_callback:
+            progress_callback(
+                "Benchmark complete (no evaluation)",
+                100,
+                {"status": "complete_no_eval", "results_path": results_file},
+            )
+
+        return {
+            "status": "complete_no_eval",
+            "dataset_type": dataset_type,
+            "results_path": results_file,
+            "total_examples": total_examples,
+        }
+
+
+def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+    """
+    Run SimpleQA benchmark with default settings.
+
+    Args:
+        num_examples: Number of examples to process
+        **kwargs: Additional arguments to pass to run_benchmark
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    return run_benchmark(dataset_type="simpleqa", num_examples=num_examples, **kwargs)
+
+
+def run_browsecomp_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
+    """
+    Run BrowseComp benchmark with default settings.
+
+    Args:
+        num_examples: Number of examples to process
+        **kwargs: Additional arguments to pass to run_benchmark
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    return run_benchmark(dataset_type="browsecomp", num_examples=num_examples, **kwargs)
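The new runners expose a small calling surface: run_benchmark plus the run_simpleqa_benchmark and run_browsecomp_benchmark convenience wrappers. Below is a minimal usage sketch, assuming the import path local_deep_research.benchmarks.runners from the file list above and the keyword arguments defined in run_benchmark; the callback name and the reduced example and iteration counts are illustrative only.

```python
# Hypothetical usage sketch for the new benchmark runners (not part of the diff).
from local_deep_research.benchmarks.runners import run_simpleqa_benchmark


def print_progress(message: str, percent: int, metadata: dict) -> None:
    # Progress callbacks receive (message, percent, metadata), matching the
    # Callable[[str, int, Dict], None] signature declared in run_benchmark.
    print(f"[{percent:3d}%] {message}")


results = run_simpleqa_benchmark(
    num_examples=10,                 # illustrative; default is 100
    output_dir="benchmark_results",
    search_config={
        "iterations": 2,             # illustrative; default is 3
        "questions_per_iteration": 2,
        "search_tool": "searxng",
    },
    progress_callback=print_progress,
)

# "status" is present in every return path; "accuracy" only when evaluation ran.
print(results["status"], results.get("accuracy"))
```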
local_deep_research/benchmarks/templates.py (new file)

@@ -0,0 +1,65 @@
+"""
+Templates for benchmark evaluation.
+
+This module contains prompt templates used for evaluating model outputs
+against reference answers.
+"""
+
+# Template for SimpleQA grading
+SIMPLEQA_GRADER_TEMPLATE = """
+You are evaluating a question-answering system. Given a question, the correct answer,
+and the model's response, determine if the model's answer is correct.
+
+Question: {question}
+
+Correct Answer: {correct_answer}
+
+Model Response: {response}
+
+First, extract the model's final answer, ignoring explanations or reasoning.
+Then determine if the model's answer is correct by comparing it to the correct answer.
+Allow for minor variations in wording but be strict about factual accuracy.
+
+Provide your evaluation in this format:
+Extracted Answer: [The model's final answer, extracted from the full response]
+Reasoning: [Your explanation for why the answer is correct or incorrect]
+Correct: [yes/no]
+"""
+
+# Template for BrowseComp grading
+BROWSECOMP_GRADER_TEMPLATE = """
+Judge whether the following [response] to [question] is correct or not based on
+the precise and unambiguous [correct_answer] below.
+
+[question]: {question}
+
+[response]: {response}
+
+Your judgement must be in the format and criteria specified below:
+
+extracted_final_answer: The final exact answer extracted from the [response].
+Put the extracted answer as 'None' if there is no exact, final answer to extract
+from the response.
+
+[correct_answer]: {correct_answer}
+
+reasoning: Explain why the extracted_final_answer is correct or incorrect based
+on [correct_answer], focusing only on if there are meaningful differences between
+[correct_answer] and the extracted_final_answer.
+
+correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given
+above, or is within a small margin of error for numerical problems. Answer 'no' otherwise.
+
+confidence: The extracted confidence score between 0% and 100% from [response].
+Put 100 if there is no confidence score available.
+"""
+
+# Template for formatted BrowseComp queries
+BROWSECOMP_QUERY_TEMPLATE = """
+{question}
+
+Your response should be in the following format:
+Explanation: {{your explanation for your final answer}}
+Exact Answer: {{your succinct, final answer}}
+Confidence: {{your confidence score between 0% and 100% for your answer}}
+"""
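These grader templates are plain str.format templates with {question}, {correct_answer}, and {response} placeholders; the doubled braces in BROWSECOMP_QUERY_TEMPLATE survive formatting as literal braces. A rough sketch of how a grading prompt might be assembled is shown below, with made-up example strings; the actual grading call lives in benchmarks/graders.py, which is not shown in this diff.

```python
# Sketch of assembling a SimpleQA grading prompt from the template above.
# The question, answer, and response strings are illustrative placeholders.
from local_deep_research.benchmarks.templates import SIMPLEQA_GRADER_TEMPLATE

grading_prompt = SIMPLEQA_GRADER_TEMPLATE.format(
    question="In what year did the Apollo 11 mission land on the Moon?",
    correct_answer="1969",
    response="Apollo 11 landed on the Moon in July 1969.",
)

# The filled-in prompt is then sent to the evaluation LLM, whose reply is
# parsed for the "Extracted Answer:", "Reasoning:", and "Correct:" fields.
print(grading_prompt)
```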
local_deep_research/config/llm_config.py

@@ -1,4 +1,3 @@
-import logging
 import os
 
 from langchain_anthropic import ChatAnthropic
@@ -6,14 +5,12 @@ from langchain_community.llms import VLLM
 from langchain_core.language_models import FakeListChatModel
 from langchain_ollama import ChatOllama
 from langchain_openai import ChatOpenAI
+from loguru import logger
 
 from ..utilities.db_utils import get_db_setting
 from ..utilities.search_utilities import remove_think_tags
 from ..utilities.url_utils import normalize_url
 
-# Setup logging
-logger = logging.getLogger(__name__)
-
 # Valid provider options
 VALID_PROVIDERS = [
     "ollama",
@@ -67,7 +64,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
         raise ValueError(
             f"Invalid provider: {provider}. Must be one of: {VALID_PROVIDERS}"
         )
-
+    logger.info(
         f"Getting LLM with model: {model_name}, temperature: {temperature}, provider: {provider}"
     )
 
@@ -75,8 +72,16 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
     common_params = {
         "temperature": temperature,
     }
+
+    # Get context window size from settings
+    context_window_size = get_db_setting("llm.context_window_size", 32000)
+
     if get_db_setting("llm.supports_max_tokens", True):
-
+        # Use 80% of context window to leave room for prompts
+        max_tokens = min(
+            get_db_setting("llm.max_tokens", 30000), int(context_window_size * 0.8)
+        )
+        common_params["max_tokens"] = max_tokens
 
     # Handle different providers
     if provider == "anthropic":
@@ -134,9 +139,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                 temperature=temperature,
             )
             return wrap_llm_without_think_tags(llm)
-        except Exception
-            logger.
-            logger.warning("Falling back.")
+        except Exception:
+            logger.exception("Error loading VLLM model")
             return get_fallback_model(temperature)
 
     elif provider == "ollama":
@@ -184,10 +188,8 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                     f"Model '{model_name}' not found in Ollama. Available models: {', '.join(model_names[:5])}"
                 )
                 return get_fallback_model(temperature)
-        except Exception
-            logger.
-                f"Error checking for model '{model_name}' in Ollama: {str(model_check_error)}"
-            )
+        except Exception:
+            logger.exception(f"Error checking for model '{model_name}' in Ollama")
             # Continue anyway, let ChatOllama handle potential errors
 
         logger.info(
@@ -202,11 +204,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
                     f"Ollama test successful. Response type: {type(test_result)}"
                 )
                 return wrap_llm_without_think_tags(llm)
-            except Exception
-                logger.
+            except Exception:
+                logger.exception("Error creating or testing ChatOllama")
                 return get_fallback_model(temperature)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error in Ollama provider section")
             return get_fallback_model(temperature)
 
     elif provider == "lmstudio":
@@ -218,7 +220,7 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
             api_key="lm-studio",  # LM Studio doesn't require a real API key
             base_url=f"{lmstudio_url}/v1",  # Use the configured URL with /v1 endpoint
             temperature=temperature,
-            max_tokens=
+            max_tokens=max_tokens,  # Use calculated max_tokens based on context size
         )
         return wrap_llm_without_think_tags(llm)
 
@@ -260,10 +262,11 @@ def get_llm(model_name=None, temperature=None, provider=None, openai_endpoint_ur
         llm = LlamaCpp(
             model_path=model_path,
             temperature=temperature,
-            max_tokens=
+            max_tokens=max_tokens,  # Use calculated max_tokens
             n_gpu_layers=n_gpu_layers,
             n_batch=n_batch,
             f16_kv=f16_kv,
+            n_ctx=context_window_size,  # Set context window size directly
             verbose=True,
         )
 
@@ -398,11 +401,11 @@ def is_ollama_available():
         except requests.exceptions.RequestException as req_error:
             logger.error(f"Request error when checking Ollama: {str(req_error)}")
             return False
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Unexpected error when checking Ollama")
             return False
-    except Exception
-        logger.
+    except Exception:
+        logger.exception("Error in is_ollama_available")
         return False
 
 
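The llm_config.py changes above read a llm.context_window_size setting and cap max_tokens at 80% of it before handing the value to providers such as LM Studio and LlamaCpp. A standalone sketch of that calculation follows, with literal values standing in for the get_db_setting lookups.

```python
# Illustrative recreation of the max_tokens capping added to get_llm();
# the constants stand in for the "llm.context_window_size" and "llm.max_tokens"
# settings that are normally read via get_db_setting().
context_window_size = 32000    # default shown in the diff
requested_max_tokens = 30000   # default shown in the diff

# Reserve roughly 20% of the context window for the prompt itself.
max_tokens = min(requested_max_tokens, int(context_window_size * 0.8))
print(max_tokens)  # 25600
```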
local_deep_research/config/search_config.py

@@ -1,14 +1,10 @@
 # local_deep_research/config.py
-import logging
+from loguru import logger
 
 from ..utilities.db_utils import get_db_setting
 from ..web_search_engines.search_engine_factory import get_search as factory_get_search
 from .llm_config import get_llm
 
-# Setup logging
-logger = logging.getLogger(__name__)
-
-
 # Whether to check the quality search results using the LLM.
 QUALITY_CHECK_DDG_URLS = True
 # Whether to only retrieve snippets instead of full search results.