local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
- local_deep_research/api/__init__.py +2 -0
- local_deep_research/api/research_functions.py +177 -3
- local_deep_research/benchmarks/graders.py +150 -5
- local_deep_research/benchmarks/models/__init__.py +19 -0
- local_deep_research/benchmarks/models/benchmark_models.py +283 -0
- local_deep_research/benchmarks/ui/__init__.py +1 -0
- local_deep_research/benchmarks/web_api/__init__.py +6 -0
- local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
- local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
- local_deep_research/config/llm_config.py +106 -21
- local_deep_research/defaults/default_settings.json +447 -2
- local_deep_research/error_handling/report_generator.py +10 -0
- local_deep_research/llm/__init__.py +19 -0
- local_deep_research/llm/llm_registry.py +155 -0
- local_deep_research/metrics/db_models.py +3 -7
- local_deep_research/metrics/search_tracker.py +25 -11
- local_deep_research/search_system.py +12 -9
- local_deep_research/utilities/log_utils.py +23 -10
- local_deep_research/utilities/thread_context.py +99 -0
- local_deep_research/web/app_factory.py +32 -8
- local_deep_research/web/database/benchmark_schema.py +230 -0
- local_deep_research/web/database/convert_research_id_to_string.py +161 -0
- local_deep_research/web/database/models.py +55 -1
- local_deep_research/web/database/schema_upgrade.py +397 -2
- local_deep_research/web/database/uuid_migration.py +265 -0
- local_deep_research/web/routes/api_routes.py +62 -31
- local_deep_research/web/routes/history_routes.py +13 -6
- local_deep_research/web/routes/metrics_routes.py +264 -4
- local_deep_research/web/routes/research_routes.py +45 -18
- local_deep_research/web/routes/route_registry.py +352 -0
- local_deep_research/web/routes/settings_routes.py +382 -22
- local_deep_research/web/services/research_service.py +22 -29
- local_deep_research/web/services/settings_manager.py +53 -0
- local_deep_research/web/services/settings_service.py +2 -0
- local_deep_research/web/static/css/styles.css +8 -0
- local_deep_research/web/static/js/components/detail.js +7 -14
- local_deep_research/web/static/js/components/details.js +8 -10
- local_deep_research/web/static/js/components/fallback/ui.js +4 -4
- local_deep_research/web/static/js/components/history.js +6 -6
- local_deep_research/web/static/js/components/logpanel.js +14 -11
- local_deep_research/web/static/js/components/progress.js +51 -46
- local_deep_research/web/static/js/components/research.js +250 -89
- local_deep_research/web/static/js/components/results.js +5 -7
- local_deep_research/web/static/js/components/settings.js +32 -26
- local_deep_research/web/static/js/components/settings_sync.js +24 -23
- local_deep_research/web/static/js/config/urls.js +285 -0
- local_deep_research/web/static/js/main.js +8 -8
- local_deep_research/web/static/js/research_form.js +267 -12
- local_deep_research/web/static/js/services/api.js +18 -18
- local_deep_research/web/static/js/services/keyboard.js +8 -8
- local_deep_research/web/static/js/services/socket.js +53 -35
- local_deep_research/web/static/js/services/ui.js +1 -1
- local_deep_research/web/templates/base.html +4 -1
- local_deep_research/web/templates/components/custom_dropdown.html +5 -3
- local_deep_research/web/templates/components/mobile_nav.html +3 -3
- local_deep_research/web/templates/components/sidebar.html +9 -3
- local_deep_research/web/templates/pages/benchmark.html +2697 -0
- local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
- local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +212 -39
- local_deep_research/web/templates/pages/research.html +8 -6
- local_deep_research/web/templates/pages/star_reviews.html +1 -1
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
- local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
- local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
- local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
- local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
- local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
- local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
- local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
- local_deep_research/web_search_engines/retriever_registry.py +108 -0
- local_deep_research/web_search_engines/search_engine_base.py +161 -43
- local_deep_research/web_search_engines/search_engine_factory.py +14 -0
- local_deep_research/web_search_engines/search_engines_config.py +20 -0
- local_deep_research-0.6.0.dist-info/METADATA +374 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +89 -64
- local_deep_research-0.5.9.dist-info/METADATA +0 -420
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,920 @@
"""Benchmark service for handling web-based benchmark execution."""

import hashlib
import json
import threading
import time
from datetime import datetime
from typing import Dict, List, Optional, Any

from loguru import logger

from ..models.benchmark_models import (
    BenchmarkRun,
    BenchmarkResult,
    BenchmarkStatus,
    DatasetType,
)
from ..datasets import load_dataset
from ..graders import extract_answer_from_response, grade_single_result
from ..runners import format_query
from ...api.research_functions import quick_summary
from ...utilities.db_utils import get_db_session
from ...web.services.socket_service import SocketIOService


class BenchmarkService:
    """Service for managing benchmark runs through the web interface."""

    def __init__(self, socket_service=None):
        self.active_runs: Dict[int, Dict] = {}
        self.socket_service = socket_service or self._get_socket_service()
        self.rate_limit_detected: Dict[
            int, bool
        ] = {}  # Track rate limiting per benchmark run

    def _get_socket_service(self):
        """Get socket service instance, handling cases where Flask app is not available."""
        try:
            return SocketIOService()
        except Exception:
            # Return a mock socket service for testing/standalone use
            class MockSocketService:
                def emit_to_room(self, room, event, data):
                    pass

            return MockSocketService()

    def generate_config_hash(self, search_config: Dict[str, Any]) -> str:
        """Generate a hash for search configuration compatibility checking."""
        relevant_params = {
            "iterations": search_config.get("iterations"),
            "questions_per_iteration": search_config.get(
                "questions_per_iteration"
            ),
            "search_tool": search_config.get("search_tool"),
            "search_strategy": search_config.get("search_strategy"),
            "model_name": search_config.get("model_name"),
            "provider": search_config.get("provider"),
        }
        # Remove None values
        relevant_params = {
            k: v for k, v in relevant_params.items() if v is not None
        }
        config_str = json.dumps(relevant_params, sort_keys=True)
        return hashlib.md5(config_str.encode()).hexdigest()[:8]

    def generate_query_hash(self, question: str, dataset_type: str) -> str:
        """Generate a hash for a query to enable deduplication."""
        query_content = f"{question.strip()}|{dataset_type.lower()}"
        return hashlib.md5(query_content.encode()).hexdigest()

    def create_benchmark_run(
        self,
        run_name: Optional[str],
        search_config: Dict[str, Any],
        evaluation_config: Dict[str, Any],
        datasets_config: Dict[str, Dict],
    ) -> int:
        """Create a new benchmark run in the database."""
        session = get_db_session()

        try:
            config_hash = self.generate_config_hash(search_config)

            # Calculate total examples
            total_examples = sum(
                config.get("count", 0) for config in datasets_config.values()
            )

            benchmark_run = BenchmarkRun(
                run_name=run_name,
                config_hash=config_hash,
                query_hash_list=[],  # Will be populated as we process
                search_config=search_config,
                evaluation_config=evaluation_config,
                datasets_config=datasets_config,
                total_examples=total_examples,
                status=BenchmarkStatus.PENDING,
            )

            session.add(benchmark_run)
            session.commit()

            logger.info(
                f"Created benchmark run {benchmark_run.id} with config hash {config_hash}"
            )
            return benchmark_run.id

        except Exception:
            session.rollback()
            logger.exception("Error creating benchmark run")
            raise
        finally:
            session.close()

    def get_existing_results(self, config_hash: str) -> Dict[str, Dict]:
        """Get existing results with compatible configuration."""
        session = get_db_session()

        try:
            # Find compatible runs
            compatible_runs = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.config_hash == config_hash)
                .filter(BenchmarkRun.status == BenchmarkStatus.COMPLETED)
                .all()
            )

            existing_results = {}
            for run in compatible_runs:
                results = (
                    session.query(BenchmarkResult)
                    .filter(BenchmarkResult.benchmark_run_id == run.id)
                    .filter(
                        BenchmarkResult.is_correct.isnot(None)
                    )  # Only completed evaluations
                    .all()
                )

                for result in results:
                    existing_results[result.query_hash] = {
                        "id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "problem": result.question,
                        "correct_answer": result.correct_answer,
                        "response": result.response,
                        "extracted_answer": result.extracted_answer,
                        "confidence": result.confidence,
                        "processing_time": result.processing_time,
                        "sources": result.sources,
                        "is_correct": result.is_correct,
                        "graded_confidence": result.graded_confidence,
                        "grader_response": result.grader_response,
                        "query_hash": result.query_hash,
                    }

            logger.info(
                f"Found {len(existing_results)} existing results for config hash {config_hash}"
            )
            return existing_results

        except Exception:
            logger.exception("Error loading existing results")
            return {}
        finally:
            session.close()

    def start_benchmark(self, benchmark_run_id: int) -> bool:
        """Start a benchmark run in a background thread."""
        try:
            # Mark as in progress
            self.update_benchmark_status(
                benchmark_run_id, BenchmarkStatus.IN_PROGRESS
            )

            # Start background thread
            thread = threading.Thread(
                target=self._run_benchmark_thread,
                args=(benchmark_run_id,),
                daemon=True,
            )
            thread.start()

            self.active_runs[benchmark_run_id] = {
                "thread": thread,
                "start_time": datetime.now(),
                "status": "running",
            }

            logger.info(f"Started benchmark run {benchmark_run_id}")
            return True

        except Exception as e:
            logger.exception(f"Error starting benchmark {benchmark_run_id}")
            self.update_benchmark_status(
                benchmark_run_id, BenchmarkStatus.FAILED, str(e)
            )
            return False

    def _run_benchmark_thread(self, benchmark_run_id: int):
        """Main benchmark execution thread."""
        session = get_db_session()

        try:
            # Get benchmark run details
            benchmark_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )
            if not benchmark_run:
                raise ValueError(f"Benchmark run {benchmark_run_id} not found")

            # Load existing results for deduplication
            existing_results = self.get_existing_results(
                benchmark_run.config_hash
            )

            # Create task queue
            task_queue = self._create_task_queue(
                benchmark_run.datasets_config,
                existing_results,
                benchmark_run_id,
            )

            # Update total with new tasks only
            benchmark_run.total_examples = len(task_queue) + len(
                existing_results
            )
            benchmark_run.completed_examples = len(existing_results)
            benchmark_run.start_time = datetime.now()
            session.commit()

            # Process tasks
            for i, task in enumerate(task_queue):
                try:
                    # Process single task
                    result = self._process_benchmark_task(
                        task,
                        benchmark_run.search_config,
                        benchmark_run.evaluation_config,
                    )

                    # Save result
                    self._save_benchmark_result(result, benchmark_run_id)

                    # Update progress
                    benchmark_run.completed_examples += 1
                    session.commit()

                    # Send real-time update
                    self._send_progress_update(
                        benchmark_run_id,
                        benchmark_run.completed_examples,
                        benchmark_run.total_examples,
                    )

                except Exception as e:
                    logger.exception(f"Error processing task {i}")
                    benchmark_run.failed_examples += 1
                    session.commit()

                    # Check if this is a rate limiting error
                    error_str = str(e).lower()
                    if (
                        "403" in error_str
                        or "rate limit" in error_str
                        or "forbidden" in error_str
                    ):
                        self.rate_limit_detected[benchmark_run_id] = True
                        # Send rate limit warning via WebSocket
                        self.socket_service.emit_to_subscribers(
                            "research_progress",
                            benchmark_run_id,
                            {
                                "rate_limit_detected": True,
                                "message": "SearXNG rate limiting detected",
                            },
                        )

            # Mark as completed
            benchmark_run.end_time = datetime.now()
            benchmark_run.status = BenchmarkStatus.COMPLETED

            # Calculate final accuracy
            self._calculate_final_accuracy(benchmark_run_id)
            session.commit()

            # Send completion notification
            self.socket_service.emit_to_subscribers(
                "research_progress",
                benchmark_run_id,
                {
                    "status": "completed",
                    "message": "Benchmark completed successfully",
                    "progress": 100,
                    "benchmark_run_id": benchmark_run_id,
                },
            )

        except Exception as e:
            logger.exception(f"Benchmark run {benchmark_run_id} failed")
            self.update_benchmark_status(
                benchmark_run_id, BenchmarkStatus.FAILED, str(e)
            )
        finally:
            session.close()
            if benchmark_run_id in self.active_runs:
                del self.active_runs[benchmark_run_id]

    def _create_task_queue(
        self,
        datasets_config: Dict,
        existing_results: Dict,
        benchmark_run_id: int,
    ) -> List[Dict]:
        """Create list of tasks to process, excluding existing results."""
        tasks = []

        for dataset_name, config in datasets_config.items():
            if config.get("count", 0) > 0:
                dataset = load_dataset(
                    dataset_type=dataset_name,
                    num_examples=config["count"],
                    seed=None,
                )

                for i, example in enumerate(dataset):
                    # Extract question based on dataset type
                    if dataset_name.lower() == "simpleqa":
                        question = example.get("problem", "")
                        correct_answer = example.get("answer", "")
                    else:  # browsecomp
                        question = example.get("problem", "")
                        correct_answer = example.get("answer", "")

                    # Generate query hash
                    query_hash = self.generate_query_hash(
                        question, dataset_name
                    )

                    # Skip if already processed
                    if query_hash in existing_results:
                        continue

                    tasks.append(
                        {
                            "benchmark_run_id": benchmark_run_id,
                            "example_id": example.get("id", f"example_{i}"),
                            "dataset_type": dataset_name,
                            "question": question,
                            "correct_answer": correct_answer,
                            "query_hash": query_hash,
                            "task_index": len(tasks),
                        }
                    )

        return tasks

    def _process_benchmark_task(
        self, task: Dict, search_config: Dict, evaluation_config: Dict
    ) -> Dict:
        """Process a single benchmark task."""
        try:
            # Generate a unique tracking ID for this benchmark task
            import uuid

            tracking_id = str(uuid.uuid4())

            # Format query
            formatted_query = format_query(
                task["question"], task["dataset_type"]
            )

            # Run research with progress callback for WebSocket updates
            start_time = time.time()

            def benchmark_progress_callback(
                status: str, progress: int, data: dict
            ):
                """Progress callback to emit detailed research progress via WebSocket"""
                try:
                    timestamp = datetime.now().isoformat()

                    # Create research-compatible log entry
                    log_entry = {
                        "time": timestamp,
                        "message": f"Example {task['example_id']}: {status}",
                        "progress": progress,
                        "metadata": {
                            "phase": data.get("phase", "benchmark_processing"),
                            "type": data.get("type", "info"),
                            "example_id": task["example_id"],
                            "benchmark_run_id": task["benchmark_run_id"],
                            **data,  # Include all other data
                        },
                    }

                    # Determine log type based on status/message content
                    if (
                        "complete" in status.lower()
                        or "finished" in status.lower()
                    ):
                        log_entry["metadata"]["type"] = "milestone"
                    elif (
                        "error" in status.lower() or "failed" in status.lower()
                    ):
                        log_entry["metadata"]["type"] = "error"
                    elif (
                        "starting" in status.lower()
                        or "begin" in status.lower()
                    ):
                        log_entry["metadata"]["type"] = "milestone"

                    # Create progress data in research format
                    progress_data = {
                        "progress": progress,
                        "message": status,
                        "status": "in_progress",
                        "log_entry": log_entry,
                        "progress_log": json.dumps(
                            [log_entry]
                        ),  # Array format expected by socket.js
                    }

                    # Emit using research_progress format that the UI expects
                    self.socket_service.emit_to_subscribers(
                        "research_progress",
                        task["benchmark_run_id"],
                        progress_data,
                    )

                except Exception:
                    logger.exception("Error sending benchmark progress update")

            search_result = quick_summary(
                query=formatted_query,
                research_id=tracking_id,  # Pass the tracking ID
                iterations=search_config.get("iterations", 8),
                questions_per_iteration=search_config.get(
                    "questions_per_iteration", 5
                ),
                search_tool=search_config.get("search_tool", "searxng"),
                search_strategy=search_config.get(
                    "search_strategy", "focused_iteration"
                ),
                progress_callback=benchmark_progress_callback,
            )
            processing_time = time.time() - start_time

            # Extract answer
            response = search_result.get("summary", "")
            extracted_data = extract_answer_from_response(
                response, task["dataset_type"]
            )
            extracted_answer = (
                extracted_data.get("extracted_answer", "")
                if isinstance(extracted_data, dict)
                else str(extracted_data)
            )

            # Extract sources - handle both direct sources and all_links_of_system
            sources = search_result.get("sources", [])
            if not sources and "all_links_of_system" in search_result:
                sources = search_result.get("all_links_of_system", [])

            # Log for debugging
            logger.debug(f"Search result keys: {list(search_result.keys())}")
            logger.debug(f"Sources found: {len(sources)} items")

            # Prepare result
            result = {
                **task,
                "response": response,
                "extracted_answer": extracted_answer,
                "confidence": str(
                    extracted_data.get("confidence", "100")
                    if isinstance(extracted_data, dict)
                    else "100"
                ),
                "processing_time": processing_time,
                "sources": json.dumps(sources),  # Convert to JSON string
                "completed_at": datetime.now(),
                "research_id": tracking_id,  # Store the UUID in the research_id field
            }

            # Evaluate result - requires proper grading model
            try:
                # Check if we have a proper evaluation model configured
                eval_provider = evaluation_config.get("provider", "").lower()
                eval_model = evaluation_config.get("model_name", "")

                if (
                    eval_provider in ["ollama", "local"]
                    or "gemma" in eval_model.lower()
                ):
                    # Local models are not reliable enough for grading
                    result.update(
                        {
                            "is_correct": None,
                            "graded_confidence": "0",
                            "grader_response": "🔑 Evaluation requires OpenRouter API key. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet for accurate grading via OpenRouter.",
                            "evaluation_error": "Local models not suitable for grading",
                        }
                    )
                else:
                    # Try to evaluate with proper model
                    result_data = {
                        "id": task["example_id"],
                        "problem": task["question"],
                        "correct_answer": task["correct_answer"],
                        "response": response,
                        "extracted_answer": extracted_answer,
                    }

                    eval_result = grade_single_result(
                        result_data, task["dataset_type"], evaluation_config
                    )
                    if eval_result and not eval_result.get("grading_error"):
                        result.update(
                            {
                                "is_correct": eval_result.get(
                                    "is_correct", False
                                ),
                                "graded_confidence": eval_result.get(
                                    "graded_confidence", "0"
                                ),
                                "grader_response": eval_result.get(
                                    "grader_response", ""
                                ),
                            }
                        )
                    else:
                        error_msg = (
                            eval_result.get(
                                "grading_error", "Unknown evaluation error"
                            )
                            if eval_result
                            else "No evaluation results returned"
                        )
                        result.update(
                            {
                                "is_correct": None,
                                "graded_confidence": "0",
                                "grader_response": f"🔑 Evaluation failed: {error_msg}. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet via OpenRouter.",
                                "evaluation_error": error_msg,
                            }
                        )

            except Exception as e:
                logger.exception("Evaluation error")
                result.update(
                    {
                        "is_correct": None,
                        "graded_confidence": "0",
                        "grader_response": f"🔑 Evaluation failed: {str(e)}. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet via OpenRouter.",
                        "evaluation_error": str(e),
                    }
                )

            return result

        except Exception as e:
            logger.exception("Research error")
            return {
                **task,
                "research_error": str(e),
                "completed_at": datetime.now(),
            }

    def _save_benchmark_result(self, result: Dict, benchmark_run_id: int):
        """Save benchmark result to database."""
        session = get_db_session()

        try:
            benchmark_result = BenchmarkResult(
                benchmark_run_id=benchmark_run_id,
                example_id=result["example_id"],
                query_hash=result["query_hash"],
                dataset_type=DatasetType(result["dataset_type"]),
                research_id=result.get(
                    "research_id"
                ),  # Include the research_id (UUID)
                question=result["question"],
                correct_answer=result["correct_answer"],
                response=result.get("response"),
                extracted_answer=result.get("extracted_answer"),
                confidence=result.get("confidence"),
                processing_time=result.get("processing_time"),
                sources=result.get("sources"),
                is_correct=result.get("is_correct"),
                graded_confidence=result.get("graded_confidence"),
                grader_response=result.get("grader_response"),
                completed_at=result.get("completed_at"),
                research_error=result.get("research_error"),
                evaluation_error=result.get("evaluation_error"),
                task_index=result.get("task_index"),
            )

            session.add(benchmark_result)
            session.commit()

        except Exception:
            session.rollback()
            logger.exception("Error saving benchmark result")
            raise
        finally:
            session.close()

    def _send_progress_update(
        self, benchmark_run_id: int, completed: int, total: int
    ):
        """Send real-time progress update via websocket."""
        try:
            percentage = (completed / total * 100) if total > 0 else 0

            # Create log entry for milestone progress
            log_entry = {
                "time": datetime.now().isoformat(),
                "message": f"Completed {completed}/{total} examples ({percentage:.1f}%)",
                "progress": percentage,
                "metadata": {
                    "phase": "benchmark_progress",
                    "type": "milestone",
                    "completed": completed,
                    "total": total,
                    "benchmark_run_id": benchmark_run_id,
                },
            }

            progress_data = {
                "status": "in_progress",
                "message": f"Processing examples: {completed}/{total}",
                "progress": percentage,
                "completed": completed,
                "total": total,
                "benchmark_run_id": benchmark_run_id,
                "log_entry": log_entry,
                "progress_log": json.dumps([log_entry]),
            }

            self.socket_service.emit_to_subscribers(
                "research_progress", benchmark_run_id, progress_data
            )

        except Exception:
            logger.exception("Error sending progress update")

    def _calculate_final_accuracy(self, benchmark_run_id: int):
        """Calculate and save final accuracy metrics."""
        session = get_db_session()

        try:
            # Get all results for this run
            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .filter(BenchmarkResult.is_correct.isnot(None))
                .all()
            )

            if results:
                correct_count = sum(1 for r in results if r.is_correct)
                overall_accuracy = (correct_count / len(results)) * 100

                # Calculate processing rate
                total_time = sum(r.processing_time or 0 for r in results)
                processing_rate = (
                    (len(results) / (total_time / 60)) if total_time > 0 else 0
                )

                # Update benchmark run
                benchmark_run = (
                    session.query(BenchmarkRun)
                    .filter(BenchmarkRun.id == benchmark_run_id)
                    .first()
                )
                if benchmark_run:
                    benchmark_run.overall_accuracy = overall_accuracy
                    benchmark_run.processing_rate = processing_rate
                    session.commit()

        except Exception:
            logger.exception("Error calculating final accuracy")
        finally:
            session.close()

    def update_benchmark_status(
        self,
        benchmark_run_id: int,
        status: BenchmarkStatus,
        error_message: str = None,
    ):
        """Update benchmark run status."""
        session = get_db_session()

        try:
            benchmark_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )
            if benchmark_run:
                benchmark_run.status = status
                benchmark_run.updated_at = datetime.now()

                if error_message:
                    benchmark_run.error_message = error_message

                if (
                    status == BenchmarkStatus.IN_PROGRESS
                    and not benchmark_run.start_time
                ):
                    benchmark_run.start_time = datetime.now()
                elif (
                    status
                    in [BenchmarkStatus.COMPLETED, BenchmarkStatus.FAILED]
                    and not benchmark_run.end_time
                ):
                    benchmark_run.end_time = datetime.now()

                session.commit()

        except Exception:
            session.rollback()
            logger.exception("Error updating benchmark status")
        finally:
            session.close()

    def get_benchmark_status(self, benchmark_run_id: int) -> Optional[Dict]:
        """Get current status of a benchmark run."""
        session = get_db_session()

        try:
            benchmark_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )
            if not benchmark_run:
                return None

            # Calculate running accuracy from current results AND reused results from compatible runs
            # First get results specifically for this benchmark run
            current_results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .filter(BenchmarkResult.is_correct.isnot(None))
                .all()
            )

            # Then get reused results from compatible benchmark runs (same config hash)
            # Only count results up to the number we say we've "completed"
            if benchmark_run.completed_examples > len(current_results):
                # We have reused results, get them from compatible runs
                reused_count_needed = benchmark_run.completed_examples - len(
                    current_results
                )

                compatible_results = (
                    session.query(BenchmarkResult)
                    .join(
                        BenchmarkRun,
                        BenchmarkResult.benchmark_run_id == BenchmarkRun.id,
                    )
                    .filter(
                        BenchmarkRun.config_hash == benchmark_run.config_hash
                    )
                    .filter(
                        BenchmarkRun.id != benchmark_run_id
                    )  # Exclude current run
                    .filter(BenchmarkRun.status == BenchmarkStatus.COMPLETED)
                    .filter(BenchmarkResult.is_correct.isnot(None))
                    .order_by(BenchmarkResult.id)  # Consistent ordering
                    .limit(reused_count_needed)
                    .all()
                )

                # Combine current and reused results
                results = (
                    current_results + compatible_results[:reused_count_needed]
                )
            else:
                # No reused results, just use current results
                results = current_results

            running_accuracy = None
            simpleqa_accuracy = None
            browsecomp_accuracy = None

            if results:
                # Overall running accuracy
                correct_count = sum(1 for r in results if r.is_correct)
                running_accuracy = (correct_count / len(results)) * 100

                # Per-dataset accuracy
                simpleqa_results = [
                    r for r in results if r.dataset_type.value == "simpleqa"
                ]
                if simpleqa_results:
                    simpleqa_correct = sum(
                        1 for r in simpleqa_results if r.is_correct
                    )
                    simpleqa_accuracy = (
                        simpleqa_correct / len(simpleqa_results)
                    ) * 100

                browsecomp_results = [
                    r for r in results if r.dataset_type.value == "browsecomp"
                ]
                if browsecomp_results:
                    browsecomp_correct = sum(
                        1 for r in browsecomp_results if r.is_correct
                    )
                    browsecomp_accuracy = (
                        browsecomp_correct / len(browsecomp_results)
                    ) * 100

            # Calculate time estimates and reliability metrics
            estimated_time_remaining = None
            total_elapsed_time = None
            avg_time_per_example = None
            accuracy_confidence = None

            if (
                benchmark_run.start_time
                and benchmark_run.completed_examples > 0
            ):
                # Calculate elapsed time
                current_time = datetime.now()
                total_elapsed_time = (
                    current_time - benchmark_run.start_time
                ).total_seconds()

                # Calculate average processing time per example
                avg_time_per_example = (
                    total_elapsed_time / benchmark_run.completed_examples
                )

                # Estimate remaining time
                remaining_examples = (
                    benchmark_run.total_examples
                    - benchmark_run.completed_examples
                )
                if remaining_examples > 0:
                    estimated_time_remaining = (
                        avg_time_per_example * remaining_examples
                    )

            # Calculate accuracy confidence interval (95% confidence)
            if results and len(results) >= 3:
                import math

                n = len(results)
                p = running_accuracy / 100 if running_accuracy else 0
                # Standard error for proportion
                se = math.sqrt(p * (1 - p) / n)
                # 95% confidence interval (±1.96 * SE)
                margin_of_error = 1.96 * se * 100
                accuracy_confidence = {
                    "lower_bound": max(0, running_accuracy - margin_of_error),
                    "upper_bound": min(100, running_accuracy + margin_of_error),
                    "margin_of_error": margin_of_error,
                    "sample_size": n,
                }

            return {
                "id": benchmark_run.id,
                "run_name": benchmark_run.run_name,
                "status": benchmark_run.status.value,
                "completed_examples": benchmark_run.completed_examples,
                "total_examples": benchmark_run.total_examples,
                "failed_examples": benchmark_run.failed_examples,
                "overall_accuracy": benchmark_run.overall_accuracy
                or running_accuracy,  # Use running accuracy if final not calculated
                "running_accuracy": running_accuracy,  # Current running accuracy
                "simpleqa_accuracy": simpleqa_accuracy,  # Per-dataset accuracy
                "browsecomp_accuracy": browsecomp_accuracy,
                "processing_rate": benchmark_run.processing_rate,
                "estimated_time_remaining": estimated_time_remaining,  # seconds
                "total_elapsed_time": total_elapsed_time,  # seconds
                "avg_time_per_example": avg_time_per_example,  # seconds
                "accuracy_confidence": accuracy_confidence,  # confidence interval
                "created_at": benchmark_run.created_at.isoformat()
                if benchmark_run.created_at
                else None,
                "start_time": benchmark_run.start_time.isoformat()
                if benchmark_run.start_time
                else None,
                "end_time": benchmark_run.end_time.isoformat()
                if benchmark_run.end_time
                else None,
                "error_message": benchmark_run.error_message,
            }

        except Exception:
            logger.exception("Error getting benchmark status")
            return None
        finally:
            session.close()

    def cancel_benchmark(self, benchmark_run_id: int) -> bool:
        """Cancel a running benchmark."""
        try:
            if benchmark_run_id in self.active_runs:
                self.active_runs[benchmark_run_id]["status"] = "cancelled"

            self.update_benchmark_status(
                benchmark_run_id, BenchmarkStatus.CANCELLED
            )
            logger.info(f"Cancelled benchmark run {benchmark_run_id}")
            return True

        except Exception:
            logger.exception(f"Error cancelling benchmark {benchmark_run_id}")
            return False


# Global service instance
benchmark_service = BenchmarkService()