local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
Files changed (90)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +447 -2
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/search_system.py +12 -9
  23. local_deep_research/utilities/log_utils.py +23 -10
  24. local_deep_research/utilities/thread_context.py +99 -0
  25. local_deep_research/web/app_factory.py +32 -8
  26. local_deep_research/web/database/benchmark_schema.py +230 -0
  27. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  28. local_deep_research/web/database/models.py +55 -1
  29. local_deep_research/web/database/schema_upgrade.py +397 -2
  30. local_deep_research/web/database/uuid_migration.py +265 -0
  31. local_deep_research/web/routes/api_routes.py +62 -31
  32. local_deep_research/web/routes/history_routes.py +13 -6
  33. local_deep_research/web/routes/metrics_routes.py +264 -4
  34. local_deep_research/web/routes/research_routes.py +45 -18
  35. local_deep_research/web/routes/route_registry.py +352 -0
  36. local_deep_research/web/routes/settings_routes.py +382 -22
  37. local_deep_research/web/services/research_service.py +22 -29
  38. local_deep_research/web/services/settings_manager.py +53 -0
  39. local_deep_research/web/services/settings_service.py +2 -0
  40. local_deep_research/web/static/css/styles.css +8 -0
  41. local_deep_research/web/static/js/components/detail.js +7 -14
  42. local_deep_research/web/static/js/components/details.js +8 -10
  43. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  44. local_deep_research/web/static/js/components/history.js +6 -6
  45. local_deep_research/web/static/js/components/logpanel.js +14 -11
  46. local_deep_research/web/static/js/components/progress.js +51 -46
  47. local_deep_research/web/static/js/components/research.js +250 -89
  48. local_deep_research/web/static/js/components/results.js +5 -7
  49. local_deep_research/web/static/js/components/settings.js +32 -26
  50. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  51. local_deep_research/web/static/js/config/urls.js +285 -0
  52. local_deep_research/web/static/js/main.js +8 -8
  53. local_deep_research/web/static/js/research_form.js +267 -12
  54. local_deep_research/web/static/js/services/api.js +18 -18
  55. local_deep_research/web/static/js/services/keyboard.js +8 -8
  56. local_deep_research/web/static/js/services/socket.js +53 -35
  57. local_deep_research/web/static/js/services/ui.js +1 -1
  58. local_deep_research/web/templates/base.html +4 -1
  59. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  60. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  61. local_deep_research/web/templates/components/sidebar.html +9 -3
  62. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  63. local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
  64. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  65. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  66. local_deep_research/web/templates/pages/metrics.html +212 -39
  67. local_deep_research/web/templates/pages/research.html +8 -6
  68. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  69. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  70. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  71. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  72. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  73. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  74. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  75. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  76. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  77. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  78. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  79. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  80. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  81. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  82. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  83. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  84. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  85. local_deep_research-0.6.0.dist-info/METADATA +374 -0
  86. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +89 -64
  87. local_deep_research-0.5.9.dist-info/METADATA +0 -420
  88. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
  89. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
  90. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,920 @@
+"""Benchmark service for handling web-based benchmark execution."""
+
+import hashlib
+import json
+import threading
+import time
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+
+from loguru import logger
+
+from ..models.benchmark_models import (
+    BenchmarkRun,
+    BenchmarkResult,
+    BenchmarkStatus,
+    DatasetType,
+)
+from ..datasets import load_dataset
+from ..graders import extract_answer_from_response, grade_single_result
+from ..runners import format_query
+from ...api.research_functions import quick_summary
+from ...utilities.db_utils import get_db_session
+from ...web.services.socket_service import SocketIOService
+
+
+class BenchmarkService:
+    """Service for managing benchmark runs through the web interface."""
+
+    def __init__(self, socket_service=None):
+        self.active_runs: Dict[int, Dict] = {}
+        self.socket_service = socket_service or self._get_socket_service()
+        self.rate_limit_detected: Dict[
+            int, bool
+        ] = {}  # Track rate limiting per benchmark run
+
+    def _get_socket_service(self):
+        """Get socket service instance, handling cases where Flask app is not available."""
+        try:
+            return SocketIOService()
+        except Exception:
+            # Return a mock socket service for testing/standalone use
+            class MockSocketService:
+                def emit_to_room(self, room, event, data):
+                    pass
+
+            return MockSocketService()
+
+    def generate_config_hash(self, search_config: Dict[str, Any]) -> str:
+        """Generate a hash for search configuration compatibility checking."""
+        relevant_params = {
+            "iterations": search_config.get("iterations"),
+            "questions_per_iteration": search_config.get(
+                "questions_per_iteration"
+            ),
+            "search_tool": search_config.get("search_tool"),
+            "search_strategy": search_config.get("search_strategy"),
+            "model_name": search_config.get("model_name"),
+            "provider": search_config.get("provider"),
+        }
+        # Remove None values
+        relevant_params = {
+            k: v for k, v in relevant_params.items() if v is not None
+        }
+        config_str = json.dumps(relevant_params, sort_keys=True)
+        return hashlib.md5(config_str.encode()).hexdigest()[:8]
+
+    def generate_query_hash(self, question: str, dataset_type: str) -> str:
+        """Generate a hash for a query to enable deduplication."""
+        query_content = f"{question.strip()}|{dataset_type.lower()}"
+        return hashlib.md5(query_content.encode()).hexdigest()
+
+    def create_benchmark_run(
+        self,
+        run_name: Optional[str],
+        search_config: Dict[str, Any],
+        evaluation_config: Dict[str, Any],
+        datasets_config: Dict[str, Dict],
+    ) -> int:
+        """Create a new benchmark run in the database."""
+        session = get_db_session()
+
+        try:
+            config_hash = self.generate_config_hash(search_config)
+
+            # Calculate total examples
+            total_examples = sum(
+                config.get("count", 0) for config in datasets_config.values()
+            )
+
+            benchmark_run = BenchmarkRun(
+                run_name=run_name,
+                config_hash=config_hash,
+                query_hash_list=[],  # Will be populated as we process
+                search_config=search_config,
+                evaluation_config=evaluation_config,
+                datasets_config=datasets_config,
+                total_examples=total_examples,
+                status=BenchmarkStatus.PENDING,
+            )
+
+            session.add(benchmark_run)
+            session.commit()
+
+            logger.info(
+                f"Created benchmark run {benchmark_run.id} with config hash {config_hash}"
+            )
+            return benchmark_run.id
+
+        except Exception:
+            session.rollback()
+            logger.exception("Error creating benchmark run")
+            raise
+        finally:
+            session.close()
+
+    def get_existing_results(self, config_hash: str) -> Dict[str, Dict]:
+        """Get existing results with compatible configuration."""
+        session = get_db_session()
+
+        try:
+            # Find compatible runs
+            compatible_runs = (
+                session.query(BenchmarkRun)
+                .filter(BenchmarkRun.config_hash == config_hash)
+                .filter(BenchmarkRun.status == BenchmarkStatus.COMPLETED)
+                .all()
+            )
+
+            existing_results = {}
+            for run in compatible_runs:
+                results = (
+                    session.query(BenchmarkResult)
+                    .filter(BenchmarkResult.benchmark_run_id == run.id)
+                    .filter(
+                        BenchmarkResult.is_correct.isnot(None)
+                    )  # Only completed evaluations
+                    .all()
+                )
+
+                for result in results:
+                    existing_results[result.query_hash] = {
+                        "id": result.example_id,
+                        "dataset_type": result.dataset_type.value,
+                        "problem": result.question,
+                        "correct_answer": result.correct_answer,
+                        "response": result.response,
+                        "extracted_answer": result.extracted_answer,
+                        "confidence": result.confidence,
+                        "processing_time": result.processing_time,
+                        "sources": result.sources,
+                        "is_correct": result.is_correct,
+                        "graded_confidence": result.graded_confidence,
+                        "grader_response": result.grader_response,
+                        "query_hash": result.query_hash,
+                    }
+
+            logger.info(
+                f"Found {len(existing_results)} existing results for config hash {config_hash}"
+            )
+            return existing_results
+
+        except Exception:
+            logger.exception("Error loading existing results")
+            return {}
+        finally:
+            session.close()
+
+    def start_benchmark(self, benchmark_run_id: int) -> bool:
+        """Start a benchmark run in a background thread."""
+        try:
+            # Mark as in progress
+            self.update_benchmark_status(
+                benchmark_run_id, BenchmarkStatus.IN_PROGRESS
+            )
+
+            # Start background thread
+            thread = threading.Thread(
+                target=self._run_benchmark_thread,
+                args=(benchmark_run_id,),
+                daemon=True,
+            )
+            thread.start()
+
+            self.active_runs[benchmark_run_id] = {
+                "thread": thread,
+                "start_time": datetime.now(),
+                "status": "running",
+            }
+
+            logger.info(f"Started benchmark run {benchmark_run_id}")
+            return True
+
+        except Exception as e:
+            logger.exception(f"Error starting benchmark {benchmark_run_id}")
+            self.update_benchmark_status(
+                benchmark_run_id, BenchmarkStatus.FAILED, str(e)
+            )
+            return False
+
+    def _run_benchmark_thread(self, benchmark_run_id: int):
+        """Main benchmark execution thread."""
+        session = get_db_session()
+
+        try:
+            # Get benchmark run details
+            benchmark_run = (
+                session.query(BenchmarkRun)
+                .filter(BenchmarkRun.id == benchmark_run_id)
+                .first()
+            )
+            if not benchmark_run:
+                raise ValueError(f"Benchmark run {benchmark_run_id} not found")
+
+            # Load existing results for deduplication
+            existing_results = self.get_existing_results(
+                benchmark_run.config_hash
+            )
+
+            # Create task queue
+            task_queue = self._create_task_queue(
+                benchmark_run.datasets_config,
+                existing_results,
+                benchmark_run_id,
+            )
+
+            # Update total with new tasks only
+            benchmark_run.total_examples = len(task_queue) + len(
+                existing_results
+            )
+            benchmark_run.completed_examples = len(existing_results)
+            benchmark_run.start_time = datetime.now()
+            session.commit()
+
+            # Process tasks
+            for i, task in enumerate(task_queue):
+                try:
+                    # Process single task
+                    result = self._process_benchmark_task(
+                        task,
+                        benchmark_run.search_config,
+                        benchmark_run.evaluation_config,
+                    )
+
+                    # Save result
+                    self._save_benchmark_result(result, benchmark_run_id)
+
+                    # Update progress
+                    benchmark_run.completed_examples += 1
+                    session.commit()
+
+                    # Send real-time update
+                    self._send_progress_update(
+                        benchmark_run_id,
+                        benchmark_run.completed_examples,
+                        benchmark_run.total_examples,
+                    )
+
+                except Exception as e:
+                    logger.exception(f"Error processing task {i}")
+                    benchmark_run.failed_examples += 1
+                    session.commit()
+
+                    # Check if this is a rate limiting error
+                    error_str = str(e).lower()
+                    if (
+                        "403" in error_str
+                        or "rate limit" in error_str
+                        or "forbidden" in error_str
+                    ):
+                        self.rate_limit_detected[benchmark_run_id] = True
+                        # Send rate limit warning via WebSocket
+                        self.socket_service.emit_to_subscribers(
+                            "research_progress",
+                            benchmark_run_id,
+                            {
+                                "rate_limit_detected": True,
+                                "message": "SearXNG rate limiting detected",
+                            },
+                        )
+
+            # Mark as completed
+            benchmark_run.end_time = datetime.now()
+            benchmark_run.status = BenchmarkStatus.COMPLETED
+
+            # Calculate final accuracy
+            self._calculate_final_accuracy(benchmark_run_id)
+            session.commit()
+
+            # Send completion notification
+            self.socket_service.emit_to_subscribers(
+                "research_progress",
+                benchmark_run_id,
+                {
+                    "status": "completed",
+                    "message": "Benchmark completed successfully",
+                    "progress": 100,
+                    "benchmark_run_id": benchmark_run_id,
+                },
+            )
+
+        except Exception as e:
+            logger.exception(f"Benchmark run {benchmark_run_id} failed")
+            self.update_benchmark_status(
+                benchmark_run_id, BenchmarkStatus.FAILED, str(e)
+            )
+        finally:
+            session.close()
+            if benchmark_run_id in self.active_runs:
+                del self.active_runs[benchmark_run_id]
+
+    def _create_task_queue(
+        self,
+        datasets_config: Dict,
+        existing_results: Dict,
+        benchmark_run_id: int,
+    ) -> List[Dict]:
+        """Create list of tasks to process, excluding existing results."""
+        tasks = []
+
+        for dataset_name, config in datasets_config.items():
+            if config.get("count", 0) > 0:
+                dataset = load_dataset(
+                    dataset_type=dataset_name,
+                    num_examples=config["count"],
+                    seed=None,
+                )
+
+                for i, example in enumerate(dataset):
+                    # Extract question based on dataset type
+                    if dataset_name.lower() == "simpleqa":
+                        question = example.get("problem", "")
+                        correct_answer = example.get("answer", "")
+                    else:  # browsecomp
+                        question = example.get("problem", "")
+                        correct_answer = example.get("answer", "")
+
+                    # Generate query hash
+                    query_hash = self.generate_query_hash(
+                        question, dataset_name
+                    )
+
+                    # Skip if already processed
+                    if query_hash in existing_results:
+                        continue
+
+                    tasks.append(
+                        {
+                            "benchmark_run_id": benchmark_run_id,
+                            "example_id": example.get("id", f"example_{i}"),
+                            "dataset_type": dataset_name,
+                            "question": question,
+                            "correct_answer": correct_answer,
+                            "query_hash": query_hash,
+                            "task_index": len(tasks),
+                        }
+                    )
+
+        return tasks
+
+    def _process_benchmark_task(
+        self, task: Dict, search_config: Dict, evaluation_config: Dict
+    ) -> Dict:
+        """Process a single benchmark task."""
+        try:
+            # Generate a unique tracking ID for this benchmark task
+            import uuid
+
+            tracking_id = str(uuid.uuid4())
+
+            # Format query
+            formatted_query = format_query(
+                task["question"], task["dataset_type"]
+            )
+
+            # Run research with progress callback for WebSocket updates
+            start_time = time.time()
+
+            def benchmark_progress_callback(
+                status: str, progress: int, data: dict
+            ):
+                """Progress callback to emit detailed research progress via WebSocket"""
+                try:
+                    timestamp = datetime.now().isoformat()
+
+                    # Create research-compatible log entry
+                    log_entry = {
+                        "time": timestamp,
+                        "message": f"Example {task['example_id']}: {status}",
+                        "progress": progress,
+                        "metadata": {
+                            "phase": data.get("phase", "benchmark_processing"),
+                            "type": data.get("type", "info"),
+                            "example_id": task["example_id"],
+                            "benchmark_run_id": task["benchmark_run_id"],
+                            **data,  # Include all other data
+                        },
+                    }
+
+                    # Determine log type based on status/message content
+                    if (
+                        "complete" in status.lower()
+                        or "finished" in status.lower()
+                    ):
+                        log_entry["metadata"]["type"] = "milestone"
+                    elif (
+                        "error" in status.lower() or "failed" in status.lower()
+                    ):
+                        log_entry["metadata"]["type"] = "error"
+                    elif (
+                        "starting" in status.lower()
+                        or "begin" in status.lower()
+                    ):
+                        log_entry["metadata"]["type"] = "milestone"
+
+                    # Create progress data in research format
+                    progress_data = {
+                        "progress": progress,
+                        "message": status,
+                        "status": "in_progress",
+                        "log_entry": log_entry,
+                        "progress_log": json.dumps(
+                            [log_entry]
+                        ),  # Array format expected by socket.js
+                    }
+
+                    # Emit using research_progress format that the UI expects
+                    self.socket_service.emit_to_subscribers(
+                        "research_progress",
+                        task["benchmark_run_id"],
+                        progress_data,
+                    )
+
+                except Exception:
+                    logger.exception("Error sending benchmark progress update")
+
+            search_result = quick_summary(
+                query=formatted_query,
+                research_id=tracking_id,  # Pass the tracking ID
+                iterations=search_config.get("iterations", 8),
+                questions_per_iteration=search_config.get(
+                    "questions_per_iteration", 5
+                ),
+                search_tool=search_config.get("search_tool", "searxng"),
+                search_strategy=search_config.get(
+                    "search_strategy", "focused_iteration"
+                ),
+                progress_callback=benchmark_progress_callback,
+            )
+            processing_time = time.time() - start_time
+
+            # Extract answer
+            response = search_result.get("summary", "")
+            extracted_data = extract_answer_from_response(
+                response, task["dataset_type"]
+            )
+            extracted_answer = (
+                extracted_data.get("extracted_answer", "")
+                if isinstance(extracted_data, dict)
+                else str(extracted_data)
+            )
+
+            # Extract sources - handle both direct sources and all_links_of_system
+            sources = search_result.get("sources", [])
+            if not sources and "all_links_of_system" in search_result:
+                sources = search_result.get("all_links_of_system", [])
+
+            # Log for debugging
+            logger.debug(f"Search result keys: {list(search_result.keys())}")
+            logger.debug(f"Sources found: {len(sources)} items")
+
+            # Prepare result
+            result = {
+                **task,
+                "response": response,
+                "extracted_answer": extracted_answer,
+                "confidence": str(
+                    extracted_data.get("confidence", "100")
+                    if isinstance(extracted_data, dict)
+                    else "100"
+                ),
+                "processing_time": processing_time,
+                "sources": json.dumps(sources),  # Convert to JSON string
+                "completed_at": datetime.now(),
+                "research_id": tracking_id,  # Store the UUID in the research_id field
+            }
+
+            # Evaluate result - requires proper grading model
+            try:
+                # Check if we have a proper evaluation model configured
+                eval_provider = evaluation_config.get("provider", "").lower()
+                eval_model = evaluation_config.get("model_name", "")
+
+                if (
+                    eval_provider in ["ollama", "local"]
+                    or "gemma" in eval_model.lower()
+                ):
+                    # Local models are not reliable enough for grading
+                    result.update(
+                        {
+                            "is_correct": None,
+                            "graded_confidence": "0",
+                            "grader_response": "🔑 Evaluation requires OpenRouter API key. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet for accurate grading via OpenRouter.",
+                            "evaluation_error": "Local models not suitable for grading",
+                        }
+                    )
+                else:
+                    # Try to evaluate with proper model
+                    result_data = {
+                        "id": task["example_id"],
+                        "problem": task["question"],
+                        "correct_answer": task["correct_answer"],
+                        "response": response,
+                        "extracted_answer": extracted_answer,
+                    }
+
+                    eval_result = grade_single_result(
+                        result_data, task["dataset_type"], evaluation_config
+                    )
+                    if eval_result and not eval_result.get("grading_error"):
+                        result.update(
+                            {
+                                "is_correct": eval_result.get(
+                                    "is_correct", False
+                                ),
+                                "graded_confidence": eval_result.get(
+                                    "graded_confidence", "0"
+                                ),
+                                "grader_response": eval_result.get(
+                                    "grader_response", ""
+                                ),
+                            }
+                        )
+                    else:
+                        error_msg = (
+                            eval_result.get(
+                                "grading_error", "Unknown evaluation error"
+                            )
+                            if eval_result
+                            else "No evaluation results returned"
+                        )
+                        result.update(
+                            {
+                                "is_correct": None,
+                                "graded_confidence": "0",
+                                "grader_response": f"🔑 Evaluation failed: {error_msg}. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet via OpenRouter.",
+                                "evaluation_error": error_msg,
+                            }
+                        )
+
+            except Exception as e:
+                logger.exception("Evaluation error")
+                result.update(
+                    {
+                        "is_correct": None,
+                        "graded_confidence": "0",
+                        "grader_response": f"🔑 Evaluation failed: {str(e)}. Set llm.openai_endpoint.api_key in database settings to use Claude 3.7 Sonnet via OpenRouter.",
+                        "evaluation_error": str(e),
+                    }
+                )
+
+            return result
+
+        except Exception as e:
+            logger.exception("Research error")
+            return {
+                **task,
+                "research_error": str(e),
+                "completed_at": datetime.now(),
+            }
+
+    def _save_benchmark_result(self, result: Dict, benchmark_run_id: int):
+        """Save benchmark result to database."""
+        session = get_db_session()
+
+        try:
+            benchmark_result = BenchmarkResult(
+                benchmark_run_id=benchmark_run_id,
+                example_id=result["example_id"],
+                query_hash=result["query_hash"],
+                dataset_type=DatasetType(result["dataset_type"]),
+                research_id=result.get(
+                    "research_id"
+                ),  # Include the research_id (UUID)
+                question=result["question"],
+                correct_answer=result["correct_answer"],
+                response=result.get("response"),
+                extracted_answer=result.get("extracted_answer"),
+                confidence=result.get("confidence"),
+                processing_time=result.get("processing_time"),
+                sources=result.get("sources"),
+                is_correct=result.get("is_correct"),
+                graded_confidence=result.get("graded_confidence"),
+                grader_response=result.get("grader_response"),
+                completed_at=result.get("completed_at"),
+                research_error=result.get("research_error"),
+                evaluation_error=result.get("evaluation_error"),
+                task_index=result.get("task_index"),
+            )
+
+            session.add(benchmark_result)
+            session.commit()
+
+        except Exception:
+            session.rollback()
+            logger.exception("Error saving benchmark result")
+            raise
+        finally:
+            session.close()
+
+    def _send_progress_update(
+        self, benchmark_run_id: int, completed: int, total: int
+    ):
+        """Send real-time progress update via websocket."""
+        try:
+            percentage = (completed / total * 100) if total > 0 else 0
+
+            # Create log entry for milestone progress
+            log_entry = {
+                "time": datetime.now().isoformat(),
+                "message": f"Completed {completed}/{total} examples ({percentage:.1f}%)",
+                "progress": percentage,
+                "metadata": {
+                    "phase": "benchmark_progress",
+                    "type": "milestone",
+                    "completed": completed,
+                    "total": total,
+                    "benchmark_run_id": benchmark_run_id,
+                },
+            }
+
+            progress_data = {
+                "status": "in_progress",
+                "message": f"Processing examples: {completed}/{total}",
+                "progress": percentage,
+                "completed": completed,
+                "total": total,
+                "benchmark_run_id": benchmark_run_id,
+                "log_entry": log_entry,
+                "progress_log": json.dumps([log_entry]),
+            }
+
+            self.socket_service.emit_to_subscribers(
+                "research_progress", benchmark_run_id, progress_data
+            )
+
+        except Exception:
+            logger.exception("Error sending progress update")
+
+    def _calculate_final_accuracy(self, benchmark_run_id: int):
+        """Calculate and save final accuracy metrics."""
+        session = get_db_session()
+
+        try:
+            # Get all results for this run
+            results = (
+                session.query(BenchmarkResult)
+                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
+                .filter(BenchmarkResult.is_correct.isnot(None))
+                .all()
+            )
+
+            if results:
+                correct_count = sum(1 for r in results if r.is_correct)
+                overall_accuracy = (correct_count / len(results)) * 100
+
+                # Calculate processing rate
+                total_time = sum(r.processing_time or 0 for r in results)
+                processing_rate = (
+                    (len(results) / (total_time / 60)) if total_time > 0 else 0
+                )
+
+                # Update benchmark run
+                benchmark_run = (
+                    session.query(BenchmarkRun)
+                    .filter(BenchmarkRun.id == benchmark_run_id)
+                    .first()
+                )
+                if benchmark_run:
+                    benchmark_run.overall_accuracy = overall_accuracy
+                    benchmark_run.processing_rate = processing_rate
+                    session.commit()
+
+        except Exception:
+            logger.exception("Error calculating final accuracy")
+        finally:
+            session.close()
+
+    def update_benchmark_status(
+        self,
+        benchmark_run_id: int,
+        status: BenchmarkStatus,
+        error_message: str = None,
+    ):
+        """Update benchmark run status."""
+        session = get_db_session()
+
+        try:
+            benchmark_run = (
+                session.query(BenchmarkRun)
+                .filter(BenchmarkRun.id == benchmark_run_id)
+                .first()
+            )
+            if benchmark_run:
+                benchmark_run.status = status
+                benchmark_run.updated_at = datetime.now()
+
+                if error_message:
+                    benchmark_run.error_message = error_message
+
+                if (
+                    status == BenchmarkStatus.IN_PROGRESS
+                    and not benchmark_run.start_time
+                ):
+                    benchmark_run.start_time = datetime.now()
+                elif (
+                    status
+                    in [BenchmarkStatus.COMPLETED, BenchmarkStatus.FAILED]
+                    and not benchmark_run.end_time
+                ):
+                    benchmark_run.end_time = datetime.now()
+
+                session.commit()
+
+        except Exception:
+            session.rollback()
+            logger.exception("Error updating benchmark status")
+        finally:
+            session.close()
+
+    def get_benchmark_status(self, benchmark_run_id: int) -> Optional[Dict]:
+        """Get current status of a benchmark run."""
+        session = get_db_session()
+
+        try:
+            benchmark_run = (
+                session.query(BenchmarkRun)
+                .filter(BenchmarkRun.id == benchmark_run_id)
+                .first()
+            )
+            if not benchmark_run:
+                return None
+
+            # Calculate running accuracy from current results AND reused results from compatible runs
+            # First get results specifically for this benchmark run
+            current_results = (
+                session.query(BenchmarkResult)
+                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
+                .filter(BenchmarkResult.is_correct.isnot(None))
+                .all()
+            )
+
+            # Then get reused results from compatible benchmark runs (same config hash)
+            # Only count results up to the number we say we've "completed"
+            if benchmark_run.completed_examples > len(current_results):
+                # We have reused results, get them from compatible runs
+                reused_count_needed = benchmark_run.completed_examples - len(
+                    current_results
+                )
+
+                compatible_results = (
+                    session.query(BenchmarkResult)
+                    .join(
+                        BenchmarkRun,
+                        BenchmarkResult.benchmark_run_id == BenchmarkRun.id,
+                    )
+                    .filter(
+                        BenchmarkRun.config_hash == benchmark_run.config_hash
+                    )
+                    .filter(
+                        BenchmarkRun.id != benchmark_run_id
+                    )  # Exclude current run
+                    .filter(BenchmarkRun.status == BenchmarkStatus.COMPLETED)
+                    .filter(BenchmarkResult.is_correct.isnot(None))
+                    .order_by(BenchmarkResult.id)  # Consistent ordering
+                    .limit(reused_count_needed)
+                    .all()
+                )
+
+                # Combine current and reused results
+                results = (
+                    current_results + compatible_results[:reused_count_needed]
+                )
+            else:
+                # No reused results, just use current results
+                results = current_results
+
+            running_accuracy = None
+            simpleqa_accuracy = None
+            browsecomp_accuracy = None
+
+            if results:
+                # Overall running accuracy
+                correct_count = sum(1 for r in results if r.is_correct)
+                running_accuracy = (correct_count / len(results)) * 100
+
+                # Per-dataset accuracy
+                simpleqa_results = [
+                    r for r in results if r.dataset_type.value == "simpleqa"
+                ]
+                if simpleqa_results:
+                    simpleqa_correct = sum(
+                        1 for r in simpleqa_results if r.is_correct
+                    )
+                    simpleqa_accuracy = (
+                        simpleqa_correct / len(simpleqa_results)
+                    ) * 100
+
+                browsecomp_results = [
+                    r for r in results if r.dataset_type.value == "browsecomp"
+                ]
+                if browsecomp_results:
+                    browsecomp_correct = sum(
+                        1 for r in browsecomp_results if r.is_correct
+                    )
+                    browsecomp_accuracy = (
+                        browsecomp_correct / len(browsecomp_results)
+                    ) * 100
+
+            # Calculate time estimates and reliability metrics
+            estimated_time_remaining = None
+            total_elapsed_time = None
+            avg_time_per_example = None
+            accuracy_confidence = None
+
+            if (
+                benchmark_run.start_time
+                and benchmark_run.completed_examples > 0
+            ):
+                # Calculate elapsed time
+                current_time = datetime.now()
+                total_elapsed_time = (
+                    current_time - benchmark_run.start_time
+                ).total_seconds()
+
+                # Calculate average processing time per example
+                avg_time_per_example = (
+                    total_elapsed_time / benchmark_run.completed_examples
+                )
+
+                # Estimate remaining time
+                remaining_examples = (
+                    benchmark_run.total_examples
+                    - benchmark_run.completed_examples
+                )
+                if remaining_examples > 0:
+                    estimated_time_remaining = (
+                        avg_time_per_example * remaining_examples
+                    )
+
+            # Calculate accuracy confidence interval (95% confidence)
+            if results and len(results) >= 3:
+                import math
+
+                n = len(results)
+                p = running_accuracy / 100 if running_accuracy else 0
+                # Standard error for proportion
+                se = math.sqrt(p * (1 - p) / n)
+                # 95% confidence interval (±1.96 * SE)
+                margin_of_error = 1.96 * se * 100
+                accuracy_confidence = {
+                    "lower_bound": max(0, running_accuracy - margin_of_error),
+                    "upper_bound": min(100, running_accuracy + margin_of_error),
+                    "margin_of_error": margin_of_error,
+                    "sample_size": n,
+                }
+
+            return {
+                "id": benchmark_run.id,
+                "run_name": benchmark_run.run_name,
+                "status": benchmark_run.status.value,
+                "completed_examples": benchmark_run.completed_examples,
+                "total_examples": benchmark_run.total_examples,
+                "failed_examples": benchmark_run.failed_examples,
+                "overall_accuracy": benchmark_run.overall_accuracy
+                or running_accuracy,  # Use running accuracy if final not calculated
+                "running_accuracy": running_accuracy,  # Current running accuracy
+                "simpleqa_accuracy": simpleqa_accuracy,  # Per-dataset accuracy
+                "browsecomp_accuracy": browsecomp_accuracy,
+                "processing_rate": benchmark_run.processing_rate,
+                "estimated_time_remaining": estimated_time_remaining,  # seconds
+                "total_elapsed_time": total_elapsed_time,  # seconds
+                "avg_time_per_example": avg_time_per_example,  # seconds
+                "accuracy_confidence": accuracy_confidence,  # confidence interval
+                "created_at": benchmark_run.created_at.isoformat()
+                if benchmark_run.created_at
+                else None,
+                "start_time": benchmark_run.start_time.isoformat()
+                if benchmark_run.start_time
+                else None,
+                "end_time": benchmark_run.end_time.isoformat()
+                if benchmark_run.end_time
+                else None,
+                "error_message": benchmark_run.error_message,
+            }
+
+        except Exception:
+            logger.exception("Error getting benchmark status")
+            return None
+        finally:
+            session.close()
+
+    def cancel_benchmark(self, benchmark_run_id: int) -> bool:
+        """Cancel a running benchmark."""
+        try:
+            if benchmark_run_id in self.active_runs:
+                self.active_runs[benchmark_run_id]["status"] = "cancelled"
+
+            self.update_benchmark_status(
+                benchmark_run_id, BenchmarkStatus.CANCELLED
+            )
+            logger.info(f"Cancelled benchmark run {benchmark_run_id}")
+            return True
+
+        except Exception:
+            logger.exception(f"Error cancelling benchmark {benchmark_run_id}")
+            return False
+
+
+# Global service instance
+benchmark_service = BenchmarkService()