local-deep-research 0.5.9-py3-none-any.whl → 0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
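If you want to confirm which of the two versions is installed in a given environment before reading the diff, the standard library metadata API is sufficient (a small sketch; nothing here is specific to this package beyond its distribution name):

    from importlib.metadata import version

    # Prints the installed distribution version, e.g. "0.5.9" or "0.6.1"
    print(version("local-deep-research"))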
Files changed (90)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +447 -2
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/search_system.py +12 -9
  23. local_deep_research/utilities/log_utils.py +23 -10
  24. local_deep_research/utilities/thread_context.py +99 -0
  25. local_deep_research/web/app_factory.py +32 -8
  26. local_deep_research/web/database/benchmark_schema.py +230 -0
  27. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  28. local_deep_research/web/database/models.py +55 -1
  29. local_deep_research/web/database/schema_upgrade.py +397 -2
  30. local_deep_research/web/database/uuid_migration.py +265 -0
  31. local_deep_research/web/routes/api_routes.py +62 -31
  32. local_deep_research/web/routes/history_routes.py +13 -6
  33. local_deep_research/web/routes/metrics_routes.py +264 -4
  34. local_deep_research/web/routes/research_routes.py +45 -18
  35. local_deep_research/web/routes/route_registry.py +352 -0
  36. local_deep_research/web/routes/settings_routes.py +382 -22
  37. local_deep_research/web/services/research_service.py +22 -29
  38. local_deep_research/web/services/settings_manager.py +53 -0
  39. local_deep_research/web/services/settings_service.py +2 -0
  40. local_deep_research/web/static/css/styles.css +8 -0
  41. local_deep_research/web/static/js/components/detail.js +7 -14
  42. local_deep_research/web/static/js/components/details.js +8 -10
  43. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  44. local_deep_research/web/static/js/components/history.js +6 -6
  45. local_deep_research/web/static/js/components/logpanel.js +14 -11
  46. local_deep_research/web/static/js/components/progress.js +51 -46
  47. local_deep_research/web/static/js/components/research.js +250 -89
  48. local_deep_research/web/static/js/components/results.js +5 -7
  49. local_deep_research/web/static/js/components/settings.js +32 -26
  50. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  51. local_deep_research/web/static/js/config/urls.js +285 -0
  52. local_deep_research/web/static/js/main.js +8 -8
  53. local_deep_research/web/static/js/research_form.js +267 -12
  54. local_deep_research/web/static/js/services/api.js +18 -18
  55. local_deep_research/web/static/js/services/keyboard.js +8 -8
  56. local_deep_research/web/static/js/services/socket.js +53 -35
  57. local_deep_research/web/static/js/services/ui.js +1 -1
  58. local_deep_research/web/templates/base.html +4 -1
  59. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  60. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  61. local_deep_research/web/templates/components/sidebar.html +9 -3
  62. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  63. local_deep_research/web/templates/pages/benchmark_results.html +1274 -0
  64. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  65. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  66. local_deep_research/web/templates/pages/metrics.html +212 -39
  67. local_deep_research/web/templates/pages/research.html +8 -6
  68. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  69. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  70. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  71. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  72. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  73. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  74. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  75. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  76. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  77. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  78. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  79. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  80. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  81. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  82. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  83. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  84. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  85. local_deep_research-0.6.1.dist-info/METADATA +374 -0
  86. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/RECORD +89 -64
  87. local_deep_research-0.5.9.dist-info/METADATA +0 -420
  88. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/WHEEL +0 -0
  89. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/entry_points.txt +0 -0
  90. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/licenses/LICENSE +0 -0
local_deep_research/benchmarks/web_api/benchmark_routes.py (new file)
@@ -0,0 +1,862 @@
+ """Flask routes for benchmark web interface."""
+
+ import time
+ from flask import Blueprint, request, jsonify
+ from loguru import logger
+
+ from .benchmark_service import benchmark_service
+ from ...web.utils.templates import render_template_with_defaults
+ from ...web.services.settings_manager import SettingsManager
+ from ...utilities.db_utils import get_db_session, get_db_setting
+
+ # Create blueprint for benchmark routes
+ benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")
+
+
+ @benchmark_bp.route("/")
+ def index():
+     """Benchmark dashboard page."""
+     # Load evaluation settings from database
+     eval_settings = {
+         "evaluation_provider": get_db_setting(
+             "benchmark.evaluation.provider", "openai_endpoint"
+         ),
+         "evaluation_model": get_db_setting("benchmark.evaluation.model", ""),
+         "evaluation_endpoint_url": get_db_setting(
+             "benchmark.evaluation.endpoint_url", ""
+         ),
+         "evaluation_temperature": get_db_setting(
+             "benchmark.evaluation.temperature", 0
+         ),
+     }
+
+     return render_template_with_defaults(
+         "pages/benchmark.html", eval_settings=eval_settings
+     )
+
+
+ @benchmark_bp.route("/results")
+ def results():
+     """Benchmark results history page."""
+     return render_template_with_defaults("pages/benchmark_results.html")
+
+
+ @benchmark_bp.route("/api/start", methods=["POST"])
+ def start_benchmark():
+     """Start a new benchmark run."""
+     try:
+         data = request.get_json()
+
+         if not data:
+             return jsonify({"error": "No data provided"}), 400
+
+         # Extract configuration
+         run_name = data.get("run_name")
+
+         # Get search config from database instead of request
+         from ...web.services.settings_manager import SettingsManager
+         from ...utilities.db_utils import get_db_session
+
+         session = get_db_session()
+         settings_manager = SettingsManager(db_session=session)
+
+         # Build search config from database settings
+         search_config = {
+             "iterations": int(
+                 settings_manager.get_setting("search.iterations", 8)
+             ),
+             "questions_per_iteration": int(
+                 settings_manager.get_setting(
+                     "search.questions_per_iteration", 5
+                 )
+             ),
+             "search_tool": settings_manager.get_setting(
+                 "search.tool", "searxng"
+             ),
+             "search_strategy": settings_manager.get_setting(
+                 "search.search_strategy", "focused_iteration"
+             ),
+             "model_name": settings_manager.get_setting("llm.model"),
+             "provider": settings_manager.get_setting("llm.provider"),
+             "temperature": float(
+                 settings_manager.get_setting("llm.temperature", 0.7)
+             ),
+         }
+
+         # Add provider-specific settings
+         provider = search_config.get("provider")
+         if provider == "openai_endpoint":
+             search_config["openai_endpoint_url"] = settings_manager.get_setting(
+                 "llm.openai_endpoint.url"
+             )
+             search_config["openai_endpoint_api_key"] = (
+                 settings_manager.get_setting("llm.openai_endpoint.api_key")
+             )
+         elif provider == "openai":
+             search_config["openai_api_key"] = settings_manager.get_setting(
+                 "llm.openai.api_key"
+             )
+         elif provider == "anthropic":
+             search_config["anthropic_api_key"] = settings_manager.get_setting(
+                 "llm.anthropic.api_key"
+             )
+
+         # Get evaluation config from database settings or request
+         if "evaluation_config" in data:
+             evaluation_config = data["evaluation_config"]
+         else:
+             # Read evaluation config from database settings
+             evaluation_provider = settings_manager.get_setting(
+                 "benchmark.evaluation.provider", "openai_endpoint"
+             )
+             evaluation_model = settings_manager.get_setting(
+                 "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
+             )
+             evaluation_temperature = float(
+                 settings_manager.get_setting(
+                     "benchmark.evaluation.temperature", 0
+                 )
+             )
+
+             evaluation_config = {
+                 "provider": evaluation_provider,
+                 "model_name": evaluation_model,
+                 "temperature": evaluation_temperature,
+             }
+
+             # Add provider-specific settings for evaluation
+             if evaluation_provider == "openai_endpoint":
+                 evaluation_config["openai_endpoint_url"] = (
+                     settings_manager.get_setting(
+                         "benchmark.evaluation.endpoint_url",
+                         "https://openrouter.ai/api/v1",
+                     )
+                 )
+                 evaluation_config["openai_endpoint_api_key"] = (
+                     settings_manager.get_setting("llm.openai_endpoint.api_key")
+                 )
+             elif evaluation_provider == "openai":
+                 evaluation_config["openai_api_key"] = (
+                     settings_manager.get_setting("llm.openai.api_key")
+                 )
+             elif evaluation_provider == "anthropic":
+                 evaluation_config["anthropic_api_key"] = (
+                     settings_manager.get_setting("llm.anthropic.api_key")
+                 )
+         datasets_config = data.get("datasets_config", {})
+
+         # Close database session
+         session.close()
+
+         # Validate datasets config
+         if not datasets_config or not any(
+             config.get("count", 0) > 0 for config in datasets_config.values()
+         ):
+             return jsonify(
+                 {
+                     "error": "At least one dataset with count > 0 must be specified"
+                 }
+             ), 400
+
+         # Create benchmark run
+         benchmark_run_id = benchmark_service.create_benchmark_run(
+             run_name=run_name,
+             search_config=search_config,
+             evaluation_config=evaluation_config,
+             datasets_config=datasets_config,
+         )
+
+         # Start benchmark
+         success = benchmark_service.start_benchmark(benchmark_run_id)
+
+         if success:
+             return jsonify(
+                 {
+                     "success": True,
+                     "benchmark_run_id": benchmark_run_id,
+                     "message": "Benchmark started successfully",
+                 }
+             )
+         else:
+             return jsonify(
+                 {"success": False, "error": "Failed to start benchmark"}
+             ), 500
+
+     except Exception:
+         logger.exception("Error starting benchmark")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/running", methods=["GET"])
+ def get_running_benchmark():
+     """Check if there's a running benchmark and return its ID."""
+     try:
+         from ...utilities.db_utils import get_db_session
+         from ..models.benchmark_models import BenchmarkRun, BenchmarkStatus
+
+         session = get_db_session()
+
+         # Find any benchmark that's currently running
+         running_benchmark = (
+             session.query(BenchmarkRun)
+             .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
+             .order_by(BenchmarkRun.created_at.desc())
+             .first()
+         )
+
+         session.close()
+
+         if running_benchmark:
+             return jsonify(
+                 {
+                     "success": True,
+                     "benchmark_run_id": running_benchmark.id,
+                     "run_name": running_benchmark.run_name,
+                     "total_examples": running_benchmark.total_examples,
+                     "completed_examples": running_benchmark.completed_examples,
+                 }
+             )
+         else:
+             return jsonify(
+                 {"success": False, "message": "No running benchmark found"}
+             )
+
+     except Exception:
+         logger.exception("Error checking for running benchmark")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
+ def get_benchmark_status(benchmark_run_id: int):
+     """Get status of a benchmark run."""
+     try:
+         status = benchmark_service.get_benchmark_status(benchmark_run_id)
+
+         if status:
+             return jsonify({"success": True, "status": status})
+         else:
+             return jsonify(
+                 {"success": False, "error": "Benchmark run not found"}
+             ), 404
+
+     except Exception:
+         logger.exception("Error getting benchmark status")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
+ def cancel_benchmark(benchmark_run_id: int):
+     """Cancel a running benchmark."""
+     try:
+         success = benchmark_service.cancel_benchmark(benchmark_run_id)
+
+         if success:
+             return jsonify(
+                 {"success": True, "message": "Benchmark cancelled successfully"}
+             )
+         else:
+             return jsonify(
+                 {"success": False, "error": "Failed to cancel benchmark"}
+             ), 500
+
+     except Exception:
+         logger.exception("Error cancelling benchmark")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/history", methods=["GET"])
+ def get_benchmark_history():
+     """Get list of recent benchmark runs."""
+     try:
+         from ...utilities.db_utils import get_db_session
+         from ..models.benchmark_models import BenchmarkRun
+
+         session = get_db_session()
+
+         # Get all benchmark runs (completed, failed, cancelled, or in-progress)
+         runs = (
+             session.query(BenchmarkRun)
+             .order_by(BenchmarkRun.created_at.desc())
+             .limit(50)
+             .all()
+         )
+
+         # Format runs for display
+         formatted_runs = []
+         for run in runs:
+             # Calculate average processing time from results
+             avg_processing_time = None
+             avg_search_results = None
+             try:
+                 from ..models.benchmark_models import BenchmarkResult
+                 from sqlalchemy import func
+
+                 avg_result = (
+                     session.query(func.avg(BenchmarkResult.processing_time))
+                     .filter(
+                         BenchmarkResult.benchmark_run_id == run.id,
+                         BenchmarkResult.processing_time.isnot(None),
+                         BenchmarkResult.processing_time > 0,
+                     )
+                     .scalar()
+                 )
+
+                 if avg_result:
+                     avg_processing_time = float(avg_result)
+             except Exception as e:
+                 logger.warning(
+                     f"Error calculating avg processing time for run {run.id}: {e}"
+                 )
+
+             # Calculate average search results and total search requests from metrics
+             total_search_requests = None
+             try:
+                 from ...metrics.search_tracker import get_search_tracker
+                 from ...metrics.db_models import SearchCall
+
+                 # Get all results for this run to find research_ids
+                 results = (
+                     session.query(BenchmarkResult)
+                     .filter(BenchmarkResult.benchmark_run_id == run.id)
+                     .all()
+                 )
+
+                 research_ids = [r.research_id for r in results if r.research_id]
+
+                 if research_ids:
+                     tracker = get_search_tracker()
+                     with tracker.db.get_session() as metric_session:
+                         # Get all search calls for these research_ids
+                         search_calls = (
+                             metric_session.query(SearchCall)
+                             .filter(SearchCall.research_id.in_(research_ids))
+                             .all()
+                         )
+
+                         # Group by research_id and calculate metrics per research session
+                         research_results = {}
+                         research_requests = {}
+
+                         for call in search_calls:
+                             if call.research_id:
+                                 if call.research_id not in research_results:
+                                     research_results[call.research_id] = 0
+                                     research_requests[call.research_id] = 0
+                                 research_results[call.research_id] += (
+                                     call.results_count or 0
+                                 )
+                                 research_requests[call.research_id] += 1
+
+                         # Calculate averages across research sessions
+                         if research_results:
+                             total_results = sum(research_results.values())
+                             avg_search_results = total_results / len(
+                                 research_results
+                             )
+
+                             total_requests = sum(research_requests.values())
+                             total_search_requests = total_requests / len(
+                                 research_requests
+                             )
+
+             except Exception as e:
+                 logger.warning(
+                     f"Error calculating search metrics for run {run.id}: {e}"
+                 )
+
+             formatted_runs.append(
+                 {
+                     "id": run.id,
+                     "run_name": run.run_name or f"Benchmark #{run.id}",
+                     "created_at": run.created_at.isoformat(),
+                     "total_examples": run.total_examples,
+                     "completed_examples": run.completed_examples,
+                     "overall_accuracy": run.overall_accuracy,
+                     "status": run.status.value,
+                     "search_config": run.search_config,
+                     "evaluation_config": run.evaluation_config,
+                     "datasets_config": run.datasets_config,
+                     "avg_processing_time": avg_processing_time,
+                     "avg_search_results": avg_search_results,
+                     "total_search_requests": total_search_requests,
+                 }
+             )
+
+         session.close()
+
+         return jsonify({"success": True, "runs": formatted_runs})
+
+     except Exception:
+         logger.exception("Error getting benchmark history")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
+ def get_benchmark_results(benchmark_run_id: int):
+     """Get detailed results for a benchmark run."""
+     try:
+         from ...utilities.db_utils import get_db_session
+         from ..models.benchmark_models import BenchmarkResult
+
+         logger.info(f"Getting results for benchmark {benchmark_run_id}")
+         session = get_db_session()
+
+         # Get recent results (limit to last 10)
+         limit = int(request.args.get("limit", 10))
+
+         results = (
+             session.query(BenchmarkResult)
+             .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
+             # Temporarily show all results including pending evaluations
+             # .filter(
+             #     BenchmarkResult.is_correct.isnot(None)
+             # )  # Only completed evaluations
+             .order_by(BenchmarkResult.id.desc())  # Most recent first
+             .limit(limit)
+             .all()
+         )
+
+         logger.info(f"Found {len(results)} results")
+
+         # Build a map of research_id to total search results
+         search_results_by_research_id = {}
+         try:
+             from ...metrics.search_tracker import get_search_tracker
+             from ...metrics.db_models import SearchCall
+
+             tracker = get_search_tracker()
+
+             # Get all unique research_ids from our results
+             research_ids = [r.research_id for r in results if r.research_id]
+
+             if research_ids:
+                 with tracker.db.get_session() as metric_session:
+                     # Get all search calls for these research_ids
+                     all_search_calls = (
+                         metric_session.query(SearchCall)
+                         .filter(SearchCall.research_id.in_(research_ids))
+                         .all()
+                     )
+
+                     # Group search results by research_id
+                     for call in all_search_calls:
+                         if call.research_id:
+                             if (
+                                 call.research_id
+                                 not in search_results_by_research_id
+                             ):
+                                 search_results_by_research_id[
+                                     call.research_id
+                                 ] = 0
+                             search_results_by_research_id[call.research_id] += (
+                                 call.results_count or 0
+                             )
+
+                     logger.info(
+                         f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
+                     )
+                     logger.debug(
+                         f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
+                     )
+                     logger.debug(
+                         f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
+                     )
+         except Exception:
+             logger.exception(
+                 f"Error getting search metrics for benchmark {benchmark_run_id}"
+             )
+
+         # Format results for UI display
+         formatted_results = []
+         for result in results:
+             # Get search result count using research_id
+             search_result_count = 0
+
+             try:
+                 if (
+                     result.research_id
+                     and result.research_id in search_results_by_research_id
+                 ):
+                     search_result_count = search_results_by_research_id[
+                         result.research_id
+                     ]
+                     logger.debug(
+                         f"Found {search_result_count} search results for research_id {result.research_id}"
+                     )
+
+             except Exception:
+                 logger.exception(
+                     f"Error getting search results for result {result.example_id}"
+                 )
+
+             # Fallback to sources if available and we didn't find metrics
+             if search_result_count == 0 and result.sources:
+                 try:
+                     if isinstance(result.sources, list):
+                         search_result_count = len(result.sources)
+                     elif (
+                         isinstance(result.sources, dict)
+                         and "all_links_of_system" in result.sources
+                     ):
+                         search_result_count = len(
+                             result.sources["all_links_of_system"]
+                         )
+                 except:
+                     pass
+
+             formatted_results.append(
+                 {
+                     "example_id": result.example_id,
+                     "dataset_type": result.dataset_type.value,
+                     "question": result.question,
+                     "correct_answer": result.correct_answer,
+                     "model_answer": result.extracted_answer,
+                     "full_response": result.response,
+                     "is_correct": result.is_correct,
+                     "confidence": result.confidence,
+                     "grader_response": result.grader_response,
+                     "processing_time": result.processing_time,
+                     "search_result_count": search_result_count,
+                     "sources": result.sources,
+                     "completed_at": result.completed_at.isoformat()
+                     if result.completed_at
+                     else None,
+                 }
+             )
+
+         session.close()
+
+         return jsonify({"success": True, "results": formatted_results})
+
+     except Exception:
+         logger.exception("Error getting benchmark results")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/configs", methods=["GET"])
+ def get_saved_configs():
+     """Get list of saved benchmark configurations."""
+     try:
+         # TODO: Implement saved configs retrieval from database
+         # For now return default configs
+         default_configs = [
+             {
+                 "id": 1,
+                 "name": "Quick Test",
+                 "description": "Fast benchmark with minimal examples",
+                 "search_config": {
+                     "iterations": 3,
+                     "questions_per_iteration": 3,
+                     "search_tool": "searxng",
+                     "search_strategy": "focused_iteration",
+                 },
+                 "datasets_config": {
+                     "simpleqa": {"count": 10},
+                     "browsecomp": {"count": 5},
+                 },
+             },
+             {
+                 "id": 2,
+                 "name": "Standard Evaluation",
+                 "description": "Comprehensive benchmark with standard settings",
+                 "search_config": {
+                     "iterations": 8,
+                     "questions_per_iteration": 5,
+                     "search_tool": "searxng",
+                     "search_strategy": "focused_iteration",
+                 },
+                 "datasets_config": {
+                     "simpleqa": {"count": 50},
+                     "browsecomp": {"count": 25},
+                 },
+             },
+         ]
+
+         return jsonify({"success": True, "configs": default_configs})
+
+     except Exception:
+         logger.exception("Error getting saved configs")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/start-simple", methods=["POST"])
+ def start_benchmark_simple():
+     """Start a benchmark using current database settings."""
+     try:
+         data = request.get_json()
+         datasets_config = data.get("datasets_config", {})
+
+         # Validate datasets
+         if not datasets_config or not any(
+             config.get("count", 0) > 0 for config in datasets_config.values()
+         ):
+             return jsonify(
+                 {
+                     "error": "At least one dataset with count > 0 must be specified"
+                 }
+             ), 400
+
+         # Get current settings from database
+         session = get_db_session()
+         settings_manager = SettingsManager(db_session=session)
+
+         # Build search config from database settings
+         search_config = {
+             "iterations": int(
+                 settings_manager.get_setting("search.iterations", 8)
+             ),
+             "questions_per_iteration": int(
+                 settings_manager.get_setting(
+                     "search.questions_per_iteration", 5
+                 )
+             ),
+             "search_tool": settings_manager.get_setting(
+                 "search.tool", "searxng"
+             ),
+             "search_strategy": settings_manager.get_setting(
+                 "search.search_strategy", "focused_iteration"
+             ),
+             "model_name": settings_manager.get_setting("llm.model"),
+             "provider": settings_manager.get_setting("llm.provider"),
+             "temperature": float(
+                 settings_manager.get_setting("llm.temperature", 0.7)
+             ),
+         }
+
+         # Add provider-specific settings
+         provider = search_config.get("provider")
+         if provider == "openai_endpoint":
+             search_config["openai_endpoint_url"] = settings_manager.get_setting(
+                 "llm.openai_endpoint.url"
+             )
+             search_config["openai_endpoint_api_key"] = (
+                 settings_manager.get_setting("llm.openai_endpoint.api_key")
+             )
+         elif provider == "openai":
+             search_config["openai_api_key"] = settings_manager.get_setting(
+                 "llm.openai.api_key"
+             )
+         elif provider == "anthropic":
+             search_config["anthropic_api_key"] = settings_manager.get_setting(
+                 "llm.anthropic.api_key"
+             )
+
+         # Read evaluation config from database settings
+         evaluation_provider = settings_manager.get_setting(
+             "benchmark.evaluation.provider", "openai_endpoint"
+         )
+         evaluation_model = settings_manager.get_setting(
+             "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
+         )
+         evaluation_temperature = float(
+             settings_manager.get_setting("benchmark.evaluation.temperature", 0)
+         )
+
+         evaluation_config = {
+             "provider": evaluation_provider,
+             "model_name": evaluation_model,
+             "temperature": evaluation_temperature,
+         }
+
+         # Add provider-specific settings for evaluation
+         if evaluation_provider == "openai_endpoint":
+             evaluation_config["openai_endpoint_url"] = (
+                 settings_manager.get_setting(
+                     "benchmark.evaluation.endpoint_url",
+                     "https://openrouter.ai/api/v1",
+                 )
+             )
+             evaluation_config["openai_endpoint_api_key"] = (
+                 settings_manager.get_setting("llm.openai_endpoint.api_key")
+             )
+         elif evaluation_provider == "openai":
+             evaluation_config["openai_api_key"] = settings_manager.get_setting(
+                 "llm.openai.api_key"
+             )
+         elif evaluation_provider == "anthropic":
+             evaluation_config["anthropic_api_key"] = (
+                 settings_manager.get_setting("llm.anthropic.api_key")
+             )
+
+         session.close()
+
+         # Create and start benchmark
+         benchmark_run_id = benchmark_service.create_benchmark_run(
+             run_name=f"Quick Benchmark - {data.get('run_name', '')}",
+             search_config=search_config,
+             evaluation_config=evaluation_config,
+             datasets_config=datasets_config,
+         )
+
+         success = benchmark_service.start_benchmark(benchmark_run_id)
+
+         if success:
+             return jsonify(
+                 {
+                     "success": True,
+                     "benchmark_run_id": benchmark_run_id,
+                     "message": "Benchmark started with current settings",
+                 }
+             )
+         else:
+             return jsonify(
+                 {"success": False, "error": "Failed to start benchmark"}
+             ), 500
+
+     except Exception:
+         logger.exception("Error starting simple benchmark")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/validate-config", methods=["POST"])
+ def validate_config():
+     """Validate a benchmark configuration."""
+     try:
+         data = request.get_json()
+
+         if not data:
+             return jsonify({"valid": False, "errors": ["No data provided"]})
+
+         errors = []
+
+         # Validate search config
+         search_config = data.get("search_config", {})
+         if not search_config.get("search_tool"):
+             errors.append("Search tool is required")
+         if not search_config.get("search_strategy"):
+             errors.append("Search strategy is required")
+
+         # Validate datasets config
+         datasets_config = data.get("datasets_config", {})
+         if not datasets_config:
+             errors.append("At least one dataset must be configured")
+
+         total_examples = sum(
+             config.get("count", 0) for config in datasets_config.values()
+         )
+         if total_examples == 0:
+             errors.append("Total examples must be greater than 0")
+
+         if total_examples > 1000:
+             errors.append(
+                 "Total examples should not exceed 1000 for web interface"
+             )
+
+         return jsonify(
+             {
+                 "valid": len(errors) == 0,
+                 "errors": errors,
+                 "total_examples": total_examples,
+             }
+         )
+
+     except Exception:
+         logger.exception("Error validating config")
+         return jsonify(
+             {"valid": False, "errors": ["An internal error has occurred."]}
+         ), 500
+
+
+ @benchmark_bp.route("/api/search-quality", methods=["GET"])
+ def get_search_quality():
+     """Get current search quality metrics from rate limiting tracker."""
+     try:
+         from ...web_search_engines.rate_limiting import get_tracker
+
+         tracker = get_tracker()
+         quality_stats = tracker.get_search_quality_stats()
+
+         return jsonify(
+             {
+                 "success": True,
+                 "search_quality": quality_stats,
+                 "timestamp": time.time(),
+             }
+         )
+
+     except Exception:
+         logger.exception("Error getting search quality")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
+
+
+ @benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
+ def delete_benchmark_run(benchmark_run_id: int):
+     """Delete a benchmark run and all its results."""
+     try:
+         from ...utilities.db_utils import get_db_session
+         from ..models.benchmark_models import (
+             BenchmarkRun,
+             BenchmarkResult,
+             BenchmarkProgress,
+         )
+
+         session = get_db_session()
+
+         # Check if benchmark run exists
+         benchmark_run = (
+             session.query(BenchmarkRun)
+             .filter(BenchmarkRun.id == benchmark_run_id)
+             .first()
+         )
+
+         if not benchmark_run:
+             session.close()
+             return jsonify(
+                 {"success": False, "error": "Benchmark run not found"}
+             ), 404
+
+         # Prevent deletion of running benchmarks
+         if benchmark_run.status.value == "in_progress":
+             session.close()
+             return jsonify(
+                 {
+                     "success": False,
+                     "error": "Cannot delete a running benchmark. Cancel it first.",
+                 }
+             ), 400
+
+         # Delete related records (cascade should handle this, but being explicit)
+         session.query(BenchmarkResult).filter(
+             BenchmarkResult.benchmark_run_id == benchmark_run_id
+         ).delete()
+
+         session.query(BenchmarkProgress).filter(
+             BenchmarkProgress.benchmark_run_id == benchmark_run_id
+         ).delete()
+
+         # Delete the benchmark run
+         session.delete(benchmark_run)
+         session.commit()
+         session.close()
+
+         logger.info(f"Deleted benchmark run {benchmark_run_id}")
+         return jsonify(
+             {
+                 "success": True,
+                 "message": f"Benchmark run {benchmark_run_id} deleted successfully",
+             }
+         )
+
+     except Exception:
+         logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
+         return jsonify(
+             {"success": False, "error": "An internal error has occurred."}
+         ), 500
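Taken together, these routes expose the benchmark workflow over HTTP under the /benchmark prefix registered on the blueprint above. A minimal client sketch for exercising them is shown below; the base URL is an assumption about a local deployment and is not part of the diff, while the paths, request keys, and response keys come from the handlers above.

    # Illustrative client for the new /benchmark API (not part of the package).
    import requests

    BASE_URL = "http://localhost:5000"  # assumption: adjust to your deployment

    # Start a small run that reuses the LLM/search settings stored in the database.
    start = requests.post(
        f"{BASE_URL}/benchmark/api/start-simple",
        json={"run_name": "smoke test", "datasets_config": {"simpleqa": {"count": 1}}},
        timeout=30,
    )
    start.raise_for_status()
    run_id = start.json()["benchmark_run_id"]

    # The status payload shape is defined by benchmark_service.get_benchmark_status;
    # here we simply print whatever it returns.
    status = requests.get(f"{BASE_URL}/benchmark/api/status/{run_id}", timeout=30)
    print(status.json())

    # Recent per-example results (the handler limits this to 10 by default).
    results = requests.get(f"{BASE_URL}/benchmark/api/results/{run_id}", timeout=30)
    print(results.json()["results"])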