local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +447 -2
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/search_system.py +12 -9
  23. local_deep_research/utilities/log_utils.py +23 -10
  24. local_deep_research/utilities/thread_context.py +99 -0
  25. local_deep_research/web/app_factory.py +32 -8
  26. local_deep_research/web/database/benchmark_schema.py +230 -0
  27. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  28. local_deep_research/web/database/models.py +55 -1
  29. local_deep_research/web/database/schema_upgrade.py +397 -2
  30. local_deep_research/web/database/uuid_migration.py +265 -0
  31. local_deep_research/web/routes/api_routes.py +62 -31
  32. local_deep_research/web/routes/history_routes.py +13 -6
  33. local_deep_research/web/routes/metrics_routes.py +264 -4
  34. local_deep_research/web/routes/research_routes.py +45 -18
  35. local_deep_research/web/routes/route_registry.py +352 -0
  36. local_deep_research/web/routes/settings_routes.py +382 -22
  37. local_deep_research/web/services/research_service.py +22 -29
  38. local_deep_research/web/services/settings_manager.py +53 -0
  39. local_deep_research/web/services/settings_service.py +2 -0
  40. local_deep_research/web/static/css/styles.css +8 -0
  41. local_deep_research/web/static/js/components/detail.js +7 -14
  42. local_deep_research/web/static/js/components/details.js +8 -10
  43. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  44. local_deep_research/web/static/js/components/history.js +6 -6
  45. local_deep_research/web/static/js/components/logpanel.js +14 -11
  46. local_deep_research/web/static/js/components/progress.js +51 -46
  47. local_deep_research/web/static/js/components/research.js +250 -89
  48. local_deep_research/web/static/js/components/results.js +5 -7
  49. local_deep_research/web/static/js/components/settings.js +32 -26
  50. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  51. local_deep_research/web/static/js/config/urls.js +285 -0
  52. local_deep_research/web/static/js/main.js +8 -8
  53. local_deep_research/web/static/js/research_form.js +267 -12
  54. local_deep_research/web/static/js/services/api.js +18 -18
  55. local_deep_research/web/static/js/services/keyboard.js +8 -8
  56. local_deep_research/web/static/js/services/socket.js +53 -35
  57. local_deep_research/web/static/js/services/ui.js +1 -1
  58. local_deep_research/web/templates/base.html +4 -1
  59. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  60. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  61. local_deep_research/web/templates/components/sidebar.html +9 -3
  62. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  63. local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
  64. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  65. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  66. local_deep_research/web/templates/pages/metrics.html +212 -39
  67. local_deep_research/web/templates/pages/research.html +8 -6
  68. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  69. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  70. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  71. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  72. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  73. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  74. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  75. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  76. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  77. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  78. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  79. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  80. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  81. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  82. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  83. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  84. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  85. local_deep_research-0.6.0.dist-info/METADATA +374 -0
  86. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +89 -64
  87. local_deep_research-0.5.9.dist-info/METADATA +0 -420
  88. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
  89. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
  90. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,2697 @@
1
+ {% extends "base.html" %}
2
+ {% from "components/custom_dropdown.html" import render_dropdown %}
3
+
4
+ {% set active_page = 'benchmark' %}
5
+
6
+ {% block title %}Benchmark Configuration - Deep Research System{% endblock %}
7
+
8
+ {% block extra_head %}
9
+ <meta name="csrf-token" content="{{ csrf_token() }}">
10
+ <link rel="stylesheet" href="{{ url_for('research.serve_static', path='css/custom_dropdown.css') }}">
11
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
12
+ <style>
13
+ .benchmark-card {
14
+ width: 100%;
15
+ margin: 0;
16
+ }
17
+
18
+ .dataset-config {
19
+ border: 1px solid var(--border-color);
20
+ border-radius: 8px;
21
+ padding: 20px;
22
+ margin-bottom: 20px;
23
+ background: var(--card-bg);
24
+ }
25
+
26
+ .dataset-header {
27
+ display: flex;
28
+ justify-content: space-between;
29
+ align-items: center;
30
+ margin-bottom: 15px;
31
+ }
32
+
33
+ .dataset-toggle {
34
+ display: flex;
35
+ align-items: center;
36
+ gap: 10px;
37
+ }
38
+
39
+ .benchmark-progress {
40
+ margin-top: 20px;
41
+ padding: 20px;
42
+ background: var(--card-bg);
43
+ border-radius: 8px;
44
+ border: 1px solid var(--border-color);
45
+ display: none;
46
+ }
47
+
48
+ .progress-header {
49
+ display: flex;
50
+ justify-content: space-between;
51
+ align-items: center;
52
+ margin-bottom: 15px;
53
+ }
54
+
55
+ .progress-bar {
56
+ width: 100%;
57
+ height: 20px;
58
+ background: var(--bg-color);
59
+ border-radius: 10px;
60
+ overflow: hidden;
61
+ margin-bottom: 15px;
62
+ }
63
+
64
+ .progress-fill {
65
+ height: 100%;
66
+ background: linear-gradient(90deg, var(--primary-color), var(--accent-color));
67
+ width: 0%;
68
+ transition: width 0.3s ease;
69
+ }
70
+
71
+ .metrics-grid {
72
+ display: grid;
73
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
74
+ gap: 15px;
75
+ margin-top: 15px;
76
+ }
77
+
78
+ .metric-card {
79
+ padding: 15px;
80
+ background: var(--bg-color);
81
+ border-radius: 6px;
82
+ text-align: center;
83
+ }
84
+
85
+ .metric-value {
86
+ font-size: 1.5rem;
87
+ font-weight: bold;
88
+ color: var(--primary-color);
89
+ }
90
+
91
+ .metric-label {
92
+ font-size: 0.9rem;
93
+ color: var(--text-muted);
94
+ margin-top: 5px;
95
+ }
96
+
97
+ .dataset-accuracy {
98
+ display: flex;
99
+ justify-content: space-between;
100
+ margin-top: 10px;
101
+ padding: 10px;
102
+ background: var(--bg-color);
103
+ border-radius: 6px;
104
+ }
105
+
106
+ .alert {
107
+ padding: 15px;
108
+ border-radius: 6px;
109
+ margin-bottom: 15px;
110
+ }
111
+
112
+ .alert-warning {
113
+ background-color: #fff3cd;
114
+ border: 1px solid #ffeaa7;
115
+ color: #856404;
116
+ }
117
+
118
+ .alert i {
119
+ margin-right: 8px;
120
+ }
121
+
122
+ /* Question and Results Display Styles */
123
+ .benchmark-section {
124
+ margin-top: 20px;
125
+ }
126
+
127
+ .question-card {
128
+ background: #1a1a1a;
129
+ border: 1px solid #333;
130
+ border-radius: 8px;
131
+ padding: 15px;
132
+ margin-bottom: 10px;
133
+ }
134
+
135
+ .question-content {
136
+ margin-bottom: 10px;
137
+ }
138
+
139
+ .question-text {
140
+ font-size: 1rem;
141
+ line-height: 1.4;
142
+ color: #e0e0e0;
143
+ margin-bottom: 8px;
144
+ padding: 10px;
145
+ background: #2a2a2a;
146
+ border-radius: 4px;
147
+ border-left: 4px solid var(--primary-color);
148
+ }
149
+
150
+ .question-meta {
151
+ display: flex;
152
+ gap: 10px;
153
+ font-size: 0.85rem;
154
+ color: var(--text-muted);
155
+ }
156
+
157
+ .dataset-badge {
158
+ background: var(--primary-color);
159
+ color: white;
160
+ padding: 2px 8px;
161
+ border-radius: 12px;
162
+ font-size: 0.8rem;
163
+ font-weight: 500;
164
+ }
165
+
166
+ .search-count-badge {
167
+ color: white;
168
+ padding: 2px 6px;
169
+ border-radius: 10px;
170
+ font-size: 0.75rem;
171
+ font-weight: 500;
172
+ margin-left: 8px;
173
+ }
174
+
175
+ .search-count-badge.critical {
176
+ background: #f44336; /* Red for 0-1 results */
177
+ }
178
+
179
+ .search-count-badge.warning {
180
+ background: #ff9800; /* Orange for 2-4 results */
181
+ }
182
+
183
+ .search-count-badge.good {
184
+ background: #4caf50; /* Green for 5+ results */
185
+ }
186
+
187
+ .processing-status {
188
+ padding: 8px 12px;
189
+ background: var(--bg-secondary);
190
+ border-radius: 4px;
191
+ font-size: 0.9rem;
192
+ color: var(--text-muted);
193
+ }
194
+
195
+ .processing-status.processing {
196
+ background: #ff9800;
197
+ color: #ffffff;
198
+ }
199
+
200
+ .processing-status.completed {
201
+ background: #e8f5e8;
202
+ color: #2e7d32;
203
+ }
204
+
205
+ .result-card {
206
+ background: #1a1a1a;
207
+ border: 1px solid #333;
208
+ border-radius: 6px;
209
+ padding: 12px;
210
+ margin-bottom: 8px;
211
+ transition: border-color 0.2s;
212
+ }
213
+
214
+ .result-card.correct {
215
+ border-left: 4px solid #4caf50;
216
+ }
217
+
218
+ .result-card.incorrect {
219
+ border-left: 4px solid #f44336;
220
+ }
221
+
222
+ .result-header {
223
+ display: flex;
224
+ justify-content: space-between;
225
+ align-items: center;
226
+ margin-bottom: 8px;
227
+ font-size: 0.85rem;
228
+ color: #a0a0a0;
229
+ }
230
+
231
+ .result-status {
232
+ font-weight: 600;
233
+ }
234
+
235
+ .result-status.correct {
236
+ color: #4caf50;
237
+ }
238
+
239
+ .result-status.incorrect {
240
+ color: #f44336;
241
+ }
242
+
243
+ .answer-comparison {
244
+ display: grid;
245
+ gap: 8px;
246
+ }
247
+
248
+ .answer-box {
249
+ padding: 12px;
250
+ border-radius: 4px;
251
+ font-size: 0.95rem;
252
+ line-height: 1.5;
253
+ white-space: pre-wrap;
254
+ word-break: break-word;
255
+ min-height: 60px;
256
+ color: #e0e0e0 !important;
257
+ }
258
+
259
+ .answer-box > div {
260
+ margin-top: 5px;
261
+ color: #e0e0e0 !important;
262
+ }
263
+
264
+ .model-answer {
265
+ background: #1e2a3a;
266
+ border-left: 4px solid #2196f3;
267
+ color: #e0e0e0 !important;
268
+ }
269
+
270
+ .correct-answer {
271
+ background: #1e3a1e;
272
+ border-left: 4px solid #4caf50;
273
+ color: #e0e0e0 !important;
274
+ }
275
+
276
+ .answer-label {
277
+ font-size: 0.75rem;
278
+ font-weight: 600;
279
+ color: #a0a0a0;
280
+ margin-bottom: 4px;
281
+ text-transform: uppercase;
282
+ letter-spacing: 0.5px;
283
+ }
284
+
285
+ .no-results {
286
+ text-align: center;
287
+ color: var(--text-muted);
288
+ padding: 20px;
289
+ font-style: italic;
290
+ }
291
+
292
+ #recent-results-container {
293
+ max-height: 600px;
294
+ overflow-y: auto;
295
+ }
296
+
297
+ /* Improved layout structure */
298
+ .page-content {
299
+ width: 100%;
300
+ max-width: none;
301
+ }
302
+
303
+ .benchmark-progress .card {
304
+ width: 100%;
305
+ max-width: none;
306
+ }
307
+
308
+ .form-group {
309
+ width: 100%;
310
+ }
311
+
312
+ /* Better visual hierarchy */
313
+ .benchmark-guidelines {
314
+ background: linear-gradient(135deg, #1e1e1e 0%, #2a2a2a 100%);
315
+ border: 1px solid #404040;
316
+ border-left: 4px solid var(--primary-color);
317
+ border-radius: 8px;
318
+ margin-bottom: 25px;
319
+ box-shadow: 0 2px 8px rgba(0,0,0,0.3);
320
+ }
321
+
322
+ .guidelines-content {
323
+ display: grid;
324
+ grid-template-columns: 1fr auto;
325
+ gap: 25px;
326
+ align-items: start;
327
+ }
328
+
329
+ .guidelines-text {
330
+ padding: 25px;
331
+ }
332
+
333
+ .guidelines-sidebar {
334
+ min-width: 200px;
335
+ background: rgba(var(--primary-color-rgb), 0.1);
336
+ padding: 20px;
337
+ border-radius: 0 8px 8px 0;
338
+ text-align: center;
339
+ border-left: 1px solid rgba(var(--primary-color-rgb), 0.2);
340
+ }
341
+
342
+ /* Enhanced form sections */
343
+ .form-section {
344
+ background: #1a1a1a;
345
+ border: 1px solid #333;
346
+ border-radius: 8px;
347
+ margin-bottom: 20px;
348
+ overflow: hidden;
349
+ }
350
+
351
+ .form-section-header {
352
+ background: linear-gradient(90deg, #2a2a2a 0%, #333 100%);
353
+ padding: 15px 20px;
354
+ border-bottom: 1px solid #404040;
355
+ }
356
+
357
+ .form-section-title {
358
+ color: var(--primary-color);
359
+ font-size: 1.1rem;
360
+ font-weight: 600;
361
+ margin: 0;
362
+ display: flex;
363
+ align-items: center;
364
+ gap: 8px;
365
+ }
366
+
367
+ .form-section-content {
368
+ padding: 20px;
369
+ }
370
+
371
+ /* Improved dataset configuration cards */
372
+ .dataset-grid {
373
+ display: grid;
374
+ grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
375
+ gap: 20px;
376
+ }
377
+
378
+ .dataset-card {
379
+ background: #1e1e1e;
380
+ border: 2px solid #333;
381
+ border-radius: 10px;
382
+ padding: 20px;
383
+ transition: all 0.3s ease;
384
+ position: relative;
385
+ }
386
+
387
+ .dataset-card:hover {
388
+ border-color: var(--primary-color);
389
+ box-shadow: 0 4px 12px rgba(var(--primary-color-rgb), 0.2);
390
+ }
391
+
392
+ .dataset-card.disabled {
393
+ opacity: 0.6;
394
+ border-color: #555;
395
+ }
396
+
397
+ .dataset-header {
398
+ display: flex;
399
+ justify-content: space-between;
400
+ align-items: flex-start;
401
+ margin-bottom: 15px;
402
+ }
403
+
404
+ .dataset-info h3 {
405
+ color: #e0e0e0;
406
+ margin: 0 0 5px 0;
407
+ font-size: 1.2rem;
408
+ }
409
+
410
+ .dataset-info p {
411
+ color: #a0a0a0;
412
+ margin: 0;
413
+ font-size: 0.9rem;
414
+ }
415
+
416
+ .dataset-toggle {
417
+ display: flex;
418
+ align-items: center;
419
+ gap: 8px;
420
+ }
421
+
422
+ /* Modern toggle switch */
423
+ .toggle-switch {
424
+ position: relative;
425
+ width: 50px;
426
+ height: 24px;
427
+ background: #555;
428
+ border-radius: 12px;
429
+ cursor: pointer;
430
+ transition: background 0.3s;
431
+ }
432
+
433
+ .toggle-switch.active {
434
+ background: var(--primary-color);
435
+ }
436
+
437
+ .toggle-switch::after {
438
+ content: '';
439
+ position: absolute;
440
+ top: 2px;
441
+ left: 2px;
442
+ width: 20px;
443
+ height: 20px;
444
+ background: white;
445
+ border-radius: 50%;
446
+ transition: transform 0.3s;
447
+ }
448
+
449
+ .toggle-switch.active::after {
450
+ transform: translateX(26px);
451
+ }
452
+
453
+ /* Enhanced input styling */
454
+ .form-control {
455
+ background: #2a2a2a;
456
+ border: 2px solid #404040;
457
+ border-radius: 6px;
458
+ padding: 10px 12px;
459
+ color: #e0e0e0;
460
+ font-size: 0.95rem;
461
+ transition: border-color 0.3s, box-shadow 0.3s;
462
+ }
463
+
464
+ .form-control:focus {
465
+ border-color: var(--primary-color);
466
+ box-shadow: 0 0 0 3px rgba(var(--primary-color-rgb), 0.2);
467
+ outline: none;
468
+ }
469
+
470
+ /* Responsive improvements */
471
+ @media (max-width: 1200px) {
472
+ .guidelines-content {
473
+ grid-template-columns: 1fr;
474
+ }
475
+
476
+ .guidelines-sidebar {
477
+ border-radius: 0 0 8px 8px;
478
+ border-left: none;
479
+ border-top: 1px solid rgba(var(--primary-color-rgb), 0.2);
480
+ }
481
+ }
482
+
483
+ @media (max-width: 768px) {
484
+ .dataset-grid {
485
+ grid-template-columns: 1fr;
486
+ }
487
+
488
+ .guidelines-text {
489
+ padding: 20px;
490
+ }
491
+
492
+ .form-section-content {
493
+ padding: 15px;
494
+ }
495
+ }
496
+
497
+
498
+ /* Performance Charts Styles */
499
+ .charts-section {
500
+ margin-top: 20px;
501
+ }
502
+
503
+ .charts-grid {
504
+ display: grid;
505
+ grid-template-columns: 1fr 1fr;
506
+ gap: 20px;
507
+ margin-top: 15px;
508
+ }
509
+
510
+ .chart-container {
511
+ background: #1a1a1a;
512
+ border: 1px solid #333;
513
+ border-radius: 8px;
514
+ padding: 15px;
515
+ height: 300px;
516
+ }
517
+
518
+ .chart-title {
519
+ color: #e0e0e0;
520
+ font-size: 1rem;
521
+ font-weight: 600;
522
+ margin-bottom: 10px;
523
+ text-align: center;
524
+ }
525
+
526
+ .chart-canvas {
527
+ width: 100% !important;
528
+ height: 250px !important;
529
+ }
530
+
531
+ @media (max-width: 768px) {
532
+ .charts-grid {
533
+ grid-template-columns: 1fr;
534
+ }
535
+ }
536
+
537
+ /* Evaluation Settings Styles */
538
+ .form-row {
539
+ display: grid;
540
+ grid-template-columns: 1fr 1fr;
541
+ gap: 20px;
542
+ margin-bottom: 15px;
543
+ }
544
+
545
+ .form-group.half {
546
+ margin-bottom: 0;
547
+ }
548
+
549
+ @media (max-width: 768px) {
550
+ .form-row {
551
+ grid-template-columns: 1fr;
552
+ gap: 15px;
553
+ }
554
+ }
555
+ </style>
556
+ {% endblock %}
557
+
558
+ {% block content %}
559
+ <div class="page active" id="benchmark">
560
+ <div class="page-header">
561
+ <h1>Benchmark Configuration</h1>
562
+ <p class="page-subtitle">Test and optimize your search configurations</p>
563
+ <div style="margin-top: 10px;">
564
+ <a href="{{ url_for('benchmark.results') }}" class="btn btn-secondary">
565
+ <i class="fas fa-chart-line"></i> View Past Results
566
+ </a>
567
+ </div>
568
+ </div>
569
+
570
+ <!-- Benchmark Usage Guidelines -->
571
+ <div class="benchmark-guidelines">
572
+ <div class="guidelines-content">
573
+ <div class="guidelines-text">
574
+ <h3 style="color: var(--primary-color); margin-bottom: 15px; font-size: 1.3rem;">
575
+ <i class="fas fa-info-circle"></i> Benchmark Guidelines
576
+ </h3>
577
+ <p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
578
+ <strong>Purpose:</strong> Benchmarks are designed to help you evaluate if your configuration works well, not for research papers or production use.
579
+ </p>
580
+ <p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
581
+ <strong>Responsible Usage:</strong> Please use reasonable example counts to avoid overwhelming search engines. The default of 50 examples provides a good balance for configuration testing.
582
+ </p>
583
+ <p style="margin-bottom: 18px; line-height: 1.6; color: #e0e0e0;">
584
+ <strong>Requirements:</strong> Benchmarks require an evaluation model for grading results. You can configure your preferred provider and model in the Evaluation Settings below. The default uses OpenRouter with Claude 3.7 Sonnet, but you can choose from various providers including OpenAI, Anthropic, or local models.
585
+ </p>
586
+ <div style="background: rgba(255, 167, 38, 0.1); border: 1px solid rgba(255, 167, 38, 0.3); padding: 15px; border-radius: 8px; margin-top: 15px;">
587
+ <h4 style="color: #ffa726; margin-bottom: 10px; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
588
+ <i class="fas fa-search"></i> Search Engine Recommendations
589
+ </h4>
590
+ <ul style="margin: 0; padding-left: 20px; font-size: 0.95rem; line-height: 1.5; color: #e0e0e0;">
591
+ <li style="margin-bottom: 8px;"><strong style="color: #4caf50;">Tavily:</strong> Recommended for general knowledge benchmarks - AI-optimized search API, reliable results</li>
592
+ <li style="margin-bottom: 8px;"><strong style="color: #2196f3;">Brave:</strong> Independent search engine; its benchmark performance is lower for reasons that are not yet known - possibly a smaller index, a different ranking algorithm, or API limitations</li>
593
+ <li style="margin-bottom: 8px;"><strong style="color: #ff9800;">SearXNG:</strong> Often outperforms commercial APIs by aggregating multiple sources - shared resource, use moderate example counts</li>
594
+ <li style="margin-bottom: 8px;"><strong style="color: #f44336;">Specialized engines (ArXiv, PubMed, Wikipedia):</strong> Shared resources that are useless for general SimpleQA questions - should not be used for this test</li>
595
+ </ul>
596
+ <div style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); padding: 12px; border-radius: 6px; margin-top: 12px;">
597
+ <p style="margin: 0; font-size: 0.9rem; color: #e0e0e0;">
598
+ <strong style="color: #2196f3;">🔧 For Shared Resources:</strong> When using SearXNG or other shared engines, reduce iterations and questions per iteration in Settings to minimize load on shared infrastructure.
599
+ </p>
600
+ </div>
601
+ </div>
602
+ </div>
603
+ <div class="guidelines-sidebar">
604
+ <div style="font-size: 2.5rem; color: var(--primary-color); margin-bottom: 12px;">
605
+ <i class="fas fa-tachometer-alt"></i>
606
+ </div>
607
+ <div style="font-size: 1.2rem; font-weight: 600; color: #e0e0e0; margin-bottom: 8px;">
608
+ Quick Check
609
+ </div>
610
+ <div style="font-size: 0.9rem; color: #a0a0a0; line-height: 1.4; margin-bottom: 15px;">
611
+ Test your config with reasonable limits
612
+ </div>
613
+ <div style="background: rgba(var(--primary-color-rgb), 0.2); padding: 8px 12px; border-radius: 6px; font-size: 0.85rem; color: var(--primary-color); font-weight: 500;">
614
+ 🎯 Configuration Testing
615
+ </div>
616
+ </div>
617
+ </div>
618
+ </div>
619
+
620
+ <!-- Alert container -->
621
+ <div id="benchmark-alert" class="settings-alert-container" style="display:none"></div>
622
+
623
+ <div class="card benchmark-card">
624
+ <div class="card-content">
625
+ <form id="benchmark-form">
626
+
627
+ <!-- Benchmark Name -->
628
+ <div class="form-group">
629
+ <label for="run_name">Benchmark Name (Optional)</label>
630
+ <input type="text" id="run_name" name="run_name" class="form-control" placeholder="e.g., 'Test new search strategy'">
631
+ <span class="input-help">Give your benchmark run a descriptive name</span>
632
+ </div>
633
+
634
+ <!-- Dataset Configuration -->
635
+ <div class="form-group">
636
+ <fieldset>
637
+ <legend>Dataset Selection</legend>
638
+
639
+ <!-- SimpleQA Dataset -->
640
+ <div class="dataset-config">
641
+ <div class="dataset-header">
642
+ <div>
643
+ <h3>SimpleQA</h3>
644
+ <p>Fact-based questions with clear answers</p>
645
+ </div>
646
+ <div class="dataset-toggle">
647
+ <input type="checkbox" id="simpleqa_enabled" checked>
648
+ <label for="simpleqa_enabled">Enable</label>
649
+ </div>
650
+ </div>
651
+ <div class="form-group">
652
+ <label for="simpleqa_count">Number of Examples</label>
653
+ <input type="number" id="simpleqa_count" name="simpleqa_count" value="50" min="1" max="500" class="form-control">
654
+ <span class="input-help">Recommended: 50 examples provides a good balance for configuration testing</span>
655
+ </div>
656
+ </div>
657
+
658
+ <!-- BrowseComp Dataset -->
659
+ <div class="dataset-config" style="border: 2px solid #f44336; background: #2a1e1e;">
660
+ <div class="dataset-header">
661
+ <div>
662
+ <h3 style="color: #f44336;">BrowseComp</h3>
663
+ <p style="color: #ccc;">Complex browsing and comparison tasks</p>
664
+ <div style="background: #3a1e1e; border: 1px solid #f44336; color: #f44336; padding: 10px 12px; border-radius: 4px; margin-top: 10px; font-size: 0.85rem; line-height: 1.4;">
665
+ <i class="fas fa-exclamation-triangle"></i> <strong>Poor Performance Warning:</strong> We currently achieve close to 0% accuracy on BrowseComp.
666
+ <br><strong>For testing only:</strong> Limited to 20 examples max to see what this benchmark is about.
667
+ </div>
668
+ </div>
669
+ <div class="dataset-toggle">
670
+ <input type="checkbox" id="browsecomp_enabled">
671
+ <label for="browsecomp_enabled">Enable (Testing Only)</label>
672
+ </div>
673
+ </div>
674
+ <div class="form-group">
675
+ <label for="browsecomp_count">Number of Examples (Max 20)</label>
676
+ <input type="number" id="browsecomp_count" name="browsecomp_count" value="0" min="0" max="20" class="form-control" disabled>
677
+ <span class="input-help" style="color: #f44336;">Restricted to max 20 examples due to poor performance - for curiosity testing only</span>
678
+ </div>
679
+ </div>
680
+ </fieldset>
681
+ </div>
682
+
683
+ <!-- Current Database Settings -->
684
+ <div class="form-group">
685
+ <fieldset>
686
+ <legend>Current Configuration</legend>
687
+ <div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
688
+ <div class="dataset-header">
689
+ <div>
690
+ <h3>Active Database Settings</h3>
691
+ <p>Benchmark will use all settings from your database configuration</p>
692
+ </div>
693
+ </div>
694
+
695
+ <div id="current-settings-display" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin-bottom: 15px;">
696
+ <div class="metric-card" style="text-align: left; padding: 10px;">
697
+ <div class="metric-label">Provider</div>
698
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-provider">Loading...</div>
699
+ </div>
700
+ <div class="metric-card" style="text-align: left; padding: 10px;">
701
+ <div class="metric-label">Model</div>
702
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-model">Loading...</div>
703
+ </div>
704
+ <div class="metric-card" style="text-align: left; padding: 10px;">
705
+ <div class="metric-label">Search Tool</div>
706
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-search-tool">Loading...</div>
707
+ </div>
708
+ <div class="metric-card" style="text-align: left; padding: 10px;">
709
+ <div class="metric-label">Iterations</div>
710
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-iterations">Loading...</div>
711
+ </div>
712
+ <div class="metric-card" style="text-align: left; padding: 10px;">
713
+ <div class="metric-label">Questions/Iter</div>
714
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-questions">Loading...</div>
715
+ </div>
716
+ <div class="metric-card" style="text-align: left; padding: 10px;">
717
+ <div class="metric-label">Strategy</div>
718
+ <div class="metric-value" style="font-size: 0.9rem;" id="current-strategy">Loading...</div>
719
+ </div>
720
+ </div>
721
+
722
+ <div style="font-size: 0.9rem; color: var(--text-muted); text-align: center;">
723
+ <i class="fas fa-info-circle"></i> To change any settings, go to <a href="/research/" target="_blank" style="color: var(--primary-color);">Settings Dashboard</a>
724
+ </div>
725
+ </div>
726
+ </fieldset>
727
+ </div>
728
+
729
+ <!-- Evaluation Model Settings -->
730
+ <div class="form-group">
731
+ <fieldset>
732
+ <legend>Evaluation Model Settings</legend>
733
+ <div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
734
+ <div class="dataset-header">
735
+ <div>
736
+ <h3>Benchmark Evaluation Configuration</h3>
737
+ <p>Configure the model used to grade benchmark results</p>
738
+ </div>
739
+ </div>
740
+ <div class="form-row">
741
+ <!-- Evaluation Provider Selection -->
742
+ <div class="form-group half">
743
+ <label for="evaluation_provider">Evaluation Provider</label>
744
+ <select id="evaluation_provider" name="evaluation_provider" class="form-control" data-initial-value="{{ eval_settings.evaluation_provider }}">
745
+ <option value="">Loading providers...</option>
746
+ </select>
747
+ <span class="input-help">Provider for the evaluation model</span>
748
+ </div>
749
+
750
+ <!-- Evaluation Model Selection -->
751
+ <div class="form-group half">
752
+ {{ render_dropdown(
753
+ input_id="evaluation_model",
754
+ dropdown_id="evaluation-model-dropdown",
755
+ placeholder="Enter or select evaluation model",
756
+ label="Evaluation Model",
757
+ help_text="Model to grade benchmark results",
758
+ allow_custom=true,
759
+ show_refresh=true,
760
+ refresh_aria_label="Refresh evaluation model list",
761
+ data_initial_value=eval_settings.evaluation_model
762
+ ) }}
763
+ </div>
764
+ </div>
765
+
766
+ <div class="form-row">
767
+ <!-- Evaluation Endpoint URL -->
768
+ <div class="form-group half">
769
+ <label for="evaluation_endpoint_url">Endpoint URL</label>
770
+ <input type="text" id="evaluation_endpoint_url" name="evaluation_endpoint_url" class="form-control" placeholder="https://openrouter.ai/api/v1" value="{{ eval_settings.evaluation_endpoint_url }}">
771
+ <span class="input-help">API endpoint for evaluation model</span>
772
+ </div>
773
+
774
+ <!-- Evaluation Temperature -->
775
+ <div class="form-group half">
776
+ <label for="evaluation_temperature">Temperature</label>
777
+ <input type="range" id="evaluation_temperature" name="evaluation_temperature" class="form-control" min="0" max="1" step="0.1" value="{{ eval_settings.evaluation_temperature }}">
778
+ <span class="input-help">0 recommended for consistent evaluation</span>
779
+ </div>
780
+ </div>
781
+
782
+ <div class="alert" style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); color: #ffffff; padding: 15px; border-radius: 6px; margin-top: 15px;">
783
+ <i class="fas fa-info-circle" style="color: #2196f3; margin-right: 8px;"></i>
784
+ <strong style="color: #2196f3;">Evaluation Model Selection:</strong>
785
+ For accurate benchmark grading, use flagship models from major providers like Claude Sonnet series or GPT-4 class models.
786
+ Local models and smaller cloud models may produce inconsistent evaluations, affecting benchmark accuracy scores.
787
+ However, preliminary tests indicate that local models might be adequate for performance evaluation if the highest grading standards are not required.
788
+ </div>
789
+ </div>
790
+ </fieldset>
791
+ </div>
792
+
793
+ <!-- Search Engine Warning -->
794
+ <div class="form-group" id="search-engine-warning" style="display: none;">
795
+ <div class="alert" style="background: #2d1b1b; border: 1px solid #f44336; color: #ffffff; padding: 15px; border-radius: 6px;">
796
+ <i class="fas fa-exclamation-triangle" style="color: #f44336; margin-right: 8px;"></i>
797
+ <strong style="color: #f44336;">Search Engine Notice:</strong>
798
+ <span id="search-warning-text" style="color: #ffffff;"></span>
799
+ </div>
800
+ </div>
801
+
802
+ <!-- Configuration Summary -->
803
+ <div class="form-group">
804
+ <div id="config-summary" class="metric-card">
805
+ <div class="metric-value" id="total-examples">50</div>
806
+ <div class="metric-label">Total Examples</div>
807
+ <div style="margin-top: 10px; font-size: 0.9rem; color: var(--text-muted);">
808
+ Estimated time: <span id="estimated-time">40-60 minutes</span>
809
+ </div>
810
+ </div>
811
+ </div>
812
+
813
+
814
+ <!-- Action Buttons -->
815
+ <div class="form-actions">
816
+ <button type="button" id="validate-config-btn" class="btn btn-secondary">
817
+ <i class="fas fa-check-circle"></i> Validate Configuration
818
+ </button>
819
+ <button type="submit" id="start-benchmark-btn" class="btn btn-primary">
820
+ <i class="fas fa-play"></i> Start Benchmark
821
+ </button>
822
+ </div>
823
+ </form>
824
+ </div>
825
+ </div>
826
+
827
+ <!-- Progress Panel - Reusing research progress component -->
828
+ <div id="benchmark-progress" class="benchmark-progress">
829
+ <div class="card benchmark-card">
830
+ <div class="card-content">
831
+ <div class="progress-info">
832
+ <div class="current-query-container">
833
+ <div class="current-query-label">Current Benchmark:</div>
834
+ <div id="current-benchmark" class="current-query"></div>
835
+ </div>
836
+ <div class="progress-container">
837
+ <div class="progress-bar">
838
+ <div id="progress-bar" class="progress-fill"></div>
839
+ </div>
840
+ <div id="progress-percentage" class="progress-percentage">0%</div>
841
+ </div>
842
+ <div class="status-container">
843
+ <div class="status-label">Status:</div>
844
+ <div id="status-text" class="status-indicator">Initializing</div>
845
+ </div>
846
+ <div class="task-container">
847
+ <div class="task-label">Current Task:</div>
848
+ <div id="current-task" class="task-text">Starting benchmark...</div>
849
+ </div>
850
+
851
+ <!-- Benchmark-specific metrics -->
852
+ <div class="metrics-grid">
853
+ <div class="metric-card">
854
+ <div class="metric-value" id="overall-accuracy">--%</div>
855
+ <div class="metric-label">Overall Accuracy</div>
856
+ <div class="metric-subtitle" id="accuracy-confidence" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
857
+ </div>
858
+ <div class="metric-card">
859
+ <div class="metric-value" id="estimated-time">--</div>
860
+ <div class="metric-label">Est. Time Left</div>
861
+ <div class="metric-subtitle" id="elapsed-time" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
862
+ </div>
863
+ <div class="metric-card">
864
+ <div class="metric-value" id="completed-count">0</div>
865
+ <div class="metric-label">Completed</div>
866
+ </div>
867
+ <div class="metric-card">
868
+ <div class="metric-value" id="processing-rate">--</div>
869
+ <div class="metric-label">Avg Time/Example</div>
870
+ </div>
871
+ </div>
872
+
873
+ <!-- SearXNG Rate Limiting Warning in Progress -->
874
+ <div id="rate-limit-warning" class="alert alert-warning" style="margin-top: 15px; margin-bottom: 15px; display: none;">
875
+ <i class="fas fa-exclamation-triangle"></i>
876
+ <strong>Rate Limiting Detected!</strong> SearXNG is blocking requests due to too many parallel searches.
877
+ <br><small><strong>Quick fix:</strong> <code>docker restart searxng</code> or wait 5-10 minutes for limits to reset.</small>
878
+ <br><small><strong>Prevention:</strong> Reduce iterations/questions per iteration in Settings.</small>
879
+ </div>
880
+
881
+ <div id="dataset-accuracies">
882
+ <div class="dataset-accuracy">
883
+ <span>SimpleQA: <strong id="simpleqa-accuracy">--%</strong></span>
884
+ <span>BrowseComp: <strong id="browsecomp-accuracy">--%</strong></span>
885
+ </div>
886
+ </div>
887
+
888
+ <!-- Benchmark Control Actions -->
889
+ <div class="progress-actions" style="margin: 20px 0; text-align: center;">
890
+ <button id="cancel-benchmark-btn" class="btn btn-outline terminate-btn">
891
+ <i class="fas fa-stop-circle"></i> Cancel Benchmark
892
+ </button>
893
+ <button id="view-results-btn" class="btn btn-primary" style="display: none;">
894
+ <i class="fas fa-eye"></i> View Results
895
+ </button>
896
+ </div>
897
+
898
+ <!-- Current Question Display -->
899
+ <div id="current-question-section" class="benchmark-section" style="margin-top: 20px;">
900
+ <h4 style="color: var(--primary-color); margin-bottom: 15px;">
901
+ <i class="fas fa-question-circle"></i> Current Question
902
+ </h4>
903
+ <div id="current-question-card" class="question-card">
904
+ <div class="question-content">
905
+ <div class="question-text" id="current-question-text">No question being processed...</div>
906
+ <div class="question-meta">
907
+ <span class="dataset-badge" id="current-dataset">--</span>
908
+ <span class="example-id" id="current-example-id">--</span>
909
+ </div>
910
+ </div>
911
+ <div class="processing-status" id="current-processing-status">
912
+ <i class="fas fa-clock"></i> Waiting for benchmark to start...
913
+ </div>
914
+ </div>
915
+ </div>
916
+
917
+ <!-- Performance Charts -->
918
+ <div id="performance-charts-section" class="charts-section" style="display: none;">
919
+ <h4 style="color: var(--primary-color); margin-bottom: 15px;">
920
+ <i class="fas fa-chart-line"></i> Performance Analysis
921
+ </h4>
922
+ <div class="charts-grid">
923
+ <div class="chart-container">
924
+ <div class="chart-title">Accuracy Trend</div>
925
+ <canvas id="accuracy-chart" class="chart-canvas"></canvas>
926
+ </div>
927
+ <div class="chart-container">
928
+ <div class="chart-title">Processing Time per Example</div>
929
+ <canvas id="timing-chart" class="chart-canvas"></canvas>
930
+ </div>
931
+ </div>
932
+ <div class="charts-grid" style="margin-top: 20px;">
933
+ <div class="chart-container">
934
+ <div class="chart-title">Search Results Count</div>
935
+ <canvas id="search-results-chart" class="chart-canvas"></canvas>
936
+ </div>
937
+ <div class="chart-container">
938
+ <div class="chart-title">Search Quality Alert</div>
939
+ <div id="search-quality-status" style="padding: 20px; text-align: center; color: #e0e0e0;">
940
+ <div id="search-status-icon" style="font-size: 2rem; margin-bottom: 10px;">
941
+ <i class="fas fa-search"></i>
942
+ </div>
943
+ <div id="search-status-text" style="font-size: 1.1rem; margin-bottom: 10px;">
944
+ Waiting for data...
945
+ </div>
946
+ <div id="search-status-details" style="font-size: 0.9rem; color: #a0a0a0;">
947
+ Search result monitoring will begin when benchmark starts
948
+ </div>
949
+ </div>
950
+ </div>
951
+ </div>
952
+ </div>
953
+
954
+ <!-- All Results Display -->
955
+ <div id="recent-results-section" class="benchmark-section" style="margin-top: 20px;">
956
+ <h4 style="color: var(--primary-color); margin-bottom: 15px;">
957
+ <i class="fas fa-history"></i> All Results
958
+ </h4>
959
+ <div id="recent-results-container">
960
+ <div class="no-results">No results yet...</div>
961
+ </div>
962
+ </div>
963
+ </div>
964
+ </div>
965
+ </div>
966
+ </div>
967
+ </div>
968
+
969
+ <script>
970
// ---- Page-level state shared by the benchmark UI functions ----

// Id of the run currently being tracked, and the handle of its polling timer.
let currentBenchmarkId = null;
let progressInterval = null;

// Parsed responses of the evaluation-settings requests
// (assigned by loadCurrentSettings).
let evalProviderData = null,
    evalModelData = null,
    evalTempData = null,
    evalEndpointData = null;

// Chart instances (destroyed and nulled again by resetForm) and the
// collected series that belong to them.
let accuracyChart = null,
    timingChart = null,
    searchResultsChart = null;
let chartData = {
    examples: [],
    accuracies: [],
    processingTimes: [],
    timestamps: [],
    searchResultCounts: []
};

// State of the search quality monitor.
let recentSearchCounts = [];
let searchQualityAlert = false;
996
+
997
// Page bootstrap: wire everything up once the DOM has been parsed.
document.addEventListener('DOMContentLoaded', () => {
    // The socket service comes first, but a failure here is only logged so
    // that the rest of the page still initializes.
    const socketService = window.socket;
    const hasInitializer = socketService && typeof socketService.initializeSocket === 'function';
    if (hasInitializer) {
        try {
            socketService.initializeSocket();
        } catch (e) {
            console.warn('Socket initialization failed, continuing without real-time updates');
        }
    }

    initializeBenchmarkForm();
    initializeEvaluationSettings();
    loadCurrentSettings();
    updateConfigSummary();
    checkForRunningBenchmark();
});
1013
+
1014
/**
 * Attach the event handlers of the benchmark configuration form:
 * submit, validate, cancel, summary refresh and the two dataset toggles.
 */
function initializeBenchmarkForm() {
    const benchmarkForm = document.getElementById('benchmark-form');
    const validateButton = document.getElementById('validate-config-btn');
    const cancelButton = document.getElementById('cancel-benchmark-btn');

    // Submitting starts a benchmark instead of navigating away.
    benchmarkForm.addEventListener('submit', (event) => {
        event.preventDefault();
        startBenchmark();
    });

    validateButton.addEventListener('click', validateConfiguration);
    cancelButton.addEventListener('click', cancelBenchmark);

    // Every input/select of the form refreshes the summary card on change.
    for (const field of benchmarkForm.querySelectorAll('input, select')) {
        field.addEventListener('change', updateConfigSummary);
    }

    // SimpleQA checkbox: the count field is only editable while it is ticked.
    document.getElementById('simpleqa_enabled').addEventListener('change', (event) => {
        const enabled = event.currentTarget.checked;
        document.getElementById('simpleqa_count').disabled = !enabled;
        updateConfigSummary();
    });

    // BrowseComp checkbox: same enable/disable, plus the count is reset to 0
    // when switched off and to 5 when switched on.
    document.getElementById('browsecomp_enabled').addEventListener('change', (event) => {
        const enabled = event.currentTarget.checked;
        const countField = document.getElementById('browsecomp_count');
        countField.disabled = !enabled;
        countField.value = enabled ? 5 : 0;
        updateConfigSummary();
    });
}
1056
+
1057
/**
 * Recompute the configuration summary card: the total number of examples
 * over the enabled datasets and a rough run-time estimate.
 *
 * The estimate uses 1.5 minutes per example. Below one hour it is shown in
 * minutes, otherwise in whole hours with a correctly pluralised unit
 * (previously 90-119 minutes rendered as "2 hour").
 */
function updateConfigSummary() {
    // Example count of one dataset: 0 unless its checkbox is ticked and the
    // count field parses as a base-10 integer.
    const enabledCount = (checkboxId, countId) => {
        if (!document.getElementById(checkboxId).checked) return 0;
        return parseInt(document.getElementById(countId).value, 10) || 0;
    };

    const totalExamples =
        enabledCount('simpleqa_enabled', 'simpleqa_count') +
        enabledCount('browsecomp_enabled', 'browsecomp_count');
    document.getElementById('total-examples').textContent = totalExamples;

    // Estimate time (roughly 1-2 minutes per example)
    const estimatedMinutes = Math.round(totalExamples * 1.5);
    let estimatedTime;
    if (estimatedMinutes < 60) {
        estimatedTime = `${estimatedMinutes} minutes`;
    } else {
        // Pluralise on the rounded hour value that is actually displayed.
        const hours = Math.round(estimatedMinutes / 60);
        estimatedTime = `${hours} hour${hours === 1 ? '' : 's'}`;
    }

    // The id "estimated-time" is used twice in this template (summary card and
    // progress panel), so look it up inside the summary card explicitly and
    // fall back to the document-wide lookup if the card is not found.
    const summaryCard = document.getElementById('config-summary');
    const estimateEl = (summaryCard && summaryCard.querySelector('#estimated-time')) ||
        document.getElementById('estimated-time');
    estimateEl.textContent = estimatedTime;
}
1073
+
1074
/**
 * POST the current form configuration to the validation endpoint and show
 * the outcome (valid, list of errors, or request failure) as an alert.
 */
function validateConfiguration() {
    const requestOptions = {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(getConfigurationData())
    };

    // Translate the endpoint's answer into a user-facing alert.
    const reportOutcome = (result) => {
        if (!result.valid) {
            showAlert('Configuration errors: ' + result.errors.join(', '), 'error');
            return;
        }
        showAlert('Configuration is valid! Ready to start benchmark.', 'success');
    };

    // Network/parse failures and exceptions raised while reporting end up here.
    const reportFailure = (error) => {
        console.error('Validation error:', error);
        showAlert('Error validating configuration: ' + error.message, 'error');
    };

    fetch('/benchmark/api/validate-config', requestOptions)
        .then((response) => response.json())
        .then(reportOutcome)
        .catch(reportFailure);
}
1097
+
1098
/**
 * Collect the request payload from the form.
 *
 * @returns {{run_name: string, datasets_config: Object}} the run name plus one
 *     `{count}` entry per enabled dataset ("simpleqa", "browsecomp").
 */
function getConfigurationData() {
    // [payload key, checkbox id, count field id] for every supported dataset.
    const datasetFields = [
        ['simpleqa', 'simpleqa_enabled', 'simpleqa_count'],
        ['browsecomp', 'browsecomp_enabled', 'browsecomp_count']
    ];

    // Read both checkboxes first, then the counts of the enabled ones.
    const enabledDatasets = datasetFields.filter(
        ([, checkboxId]) => document.getElementById(checkboxId).checked
    );

    const datasets_config = {};
    enabledDatasets.forEach(([key, , countId]) => {
        datasets_config[key] = {
            count: parseInt(document.getElementById(countId).value) || 0
        };
    });

    // All other config will be taken from database by the backend
    const payload = {
        run_name: document.getElementById('run_name').value,
        datasets_config: datasets_config
    };
    return payload;
}
1120
+
1121
/**
 * Start a benchmark run with the current form configuration.
 *
 * Swaps the form for the progress panel right away, then POSTs the
 * configuration. On success the returned run id is stored and progress
 * tracking begins; on any failure an alert is shown and the form is restored.
 */
function startBenchmark() {
    const payload = getConfigurationData();

    // Swap panels before the request is sent.
    document.getElementById('benchmark-form').style.display = 'none';
    document.getElementById('benchmark-progress').style.display = 'block';

    // Shared failure path: tell the user and bring the form back.
    const abortStart = (reason) => {
        showAlert('Error starting benchmark: ' + reason, 'error');
        resetForm();
    };

    fetch('/benchmark/api/start', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(payload)
    })
        .then((response) => response.json())
        .then((result) => {
            if (!result.success) {
                abortStart(result.error);
                return;
            }
            currentBenchmarkId = result.benchmark_run_id;
            showAlert('Benchmark started successfully!', 'success');
            startProgressTracking();
        })
        .catch((error) => {
            console.error('Start error:', error);
            abortStart(error.message);
        });
}
1152
+
1153
/**
 * Begin tracking the run stored in `currentBenchmarkId`.
 *
 * Initializes and reveals the charts, schedules a load of historical chart
 * data, starts the 3-second status polling and subscribes to socket progress
 * events. Does nothing when no run id is set.
 *
 * Compared with the previous version this function
 *  - clears an already running polling interval before creating a new one, so
 *    a second call cannot leave two pollers running, and
 *  - lets the delayed callbacks run only if the tracked run is still the
 *    current one, so a reset during the delay no longer subscribes with a
 *    null id.
 */
function startProgressTracking() {
    if (!currentBenchmarkId) return;

    // The run this call belongs to; the delayed callbacks compare against it.
    const trackedId = currentBenchmarkId;

    // Initialize charts
    initializeCharts();

    // Show charts section
    document.getElementById('performance-charts-section').style.display = 'block';

    // Load historical data if reconnecting to running benchmark
    setTimeout(() => {
        if (currentBenchmarkId === trackedId) {
            loadHistoricalChartData();
        }
    }, 1000);

    // Never keep an older poller alive next to the new one.
    if (progressInterval) {
        clearInterval(progressInterval);
    }
    progressInterval = setInterval(() => {
        updateBenchmarkProgress();
    }, 3000); // Update every 3 seconds

    // Initialize socket if not already done
    if (!window.socket || !window.socket.initializeSocket) {
        console.log('Socket service not available');
    } else if (!window.socket.socket) {
        console.log('Socket not initialized, initializing now...');
        window.socket.initializeSocket();
    }

    // Connect to WebSocket for detailed progress updates (reuse socket service)
    setTimeout(() => {
        // The run was reset or replaced while waiting: nothing to subscribe to.
        if (currentBenchmarkId !== trackedId) return;

        if (window.socket && typeof window.socket.subscribeToResearch === 'function') {
            console.log('Subscribing to benchmark progress for ID:', trackedId);
            // Subscribe to benchmark progress events using research subscription (same format)
            window.socket.subscribeToResearch(trackedId, (eventData) => {
                handleDetailedProgress(eventData);
            });
        } else {
            console.warn('Socket service not available, falling back to polling only');
        }
    }, 500); // Small delay to ensure socket is ready
}
1192
+
1193
/**
 * Socket progress handler: show the event's example id and status in the
 * "Current Task" line. Events without a status are ignored.
 */
function handleDetailedProgress(data) {
    const taskEl = document.getElementById('current-task');
    if (!taskEl || !data.status) {
        return;
    }
    taskEl.textContent = `Example ${data.example_id}: ${data.status}`;
}
1201
+
1202
// Timestamps (ms since epoch) of the last results-list and chart refresh,
// used to throttle those two updates.
let lastResultsUpdate = 0;
let lastChartsUpdate = 0;

/**
 * Poll the status endpoint once and refresh the progress UI.
 *
 * The progress display and current-question card are refreshed on every
 * successful poll. The results list is refreshed at most every 10 seconds and
 * the charts at most every 5 seconds. When the run reports a terminal state
 * ('completed', 'failed', 'cancelled') polling stops and an alert is shown.
 *
 * Changes from the previous version:
 *  - the results list is refreshed unconditionally on a terminal state, so
 *    the throttle can no longer drop the final results;
 *  - a response that arrives after the tracked run changed (e.g. after
 *    resetForm) is ignored instead of repainting the UI.
 */
function updateBenchmarkProgress() {
    if (!currentBenchmarkId) return;

    // Run this request belongs to; checked again when the response arrives.
    const requestedId = currentBenchmarkId;

    fetch(`/benchmark/api/status/${requestedId}`)
        .then(response => response.json())
        .then(data => {
            if (requestedId !== currentBenchmarkId) return; // stale response
            if (!data.success) return;

            const status = data.status;
            const finished = status.status === 'completed' ||
                status.status === 'failed' ||
                status.status === 'cancelled';

            updateProgressDisplay(status);

            // Update question/answer displays
            updateCurrentQuestion(status);

            // Results: throttled to every 10 seconds, but always on the final poll
            const now = Date.now();
            if (finished || now - lastResultsUpdate > 10000) {
                updateRecentResults();
                lastResultsUpdate = now;
            }

            // Update charts every 5 seconds
            if (now - lastChartsUpdate > 5000) {
                updateCharts(status);
                lastChartsUpdate = now;
            }

            // Update search result monitoring
            updateSearchQualityMonitoring();

            // Update rate limiting status
            updateRateLimitingStatus();

            if (finished) {
                clearInterval(progressInterval);
                progressInterval = null;

                if (status.status === 'completed') {
                    showAlert('Benchmark completed successfully!', 'success');
                } else {
                    showAlert(`Benchmark ${status.status}: ${status.error_message || ''}`, 'error');
                }
            }
        })
        .catch(error => {
            console.error('Progress update error:', error);
        });
}
1254
+
1255
/**
 * Render one status snapshot into the progress panel.
 *
 * Fields read from `status`: total_examples, completed_examples, status,
 * run_name, overall_accuracy, accuracy_confidence ({margin_of_error,
 * sample_size}), estimated_time_remaining, total_elapsed_time,
 * avg_time_per_example, simpleqa_accuracy and browsecomp_accuracy.
 * The three time values are divided by 60 and labelled "m", so they are
 * presumably seconds - confirm against the status endpoint.
 *
 * Fix: the template contains two elements with id "estimated-time" (one in
 * the configuration summary, one in this panel). A document-wide
 * getElementById returned the summary one, so the "Est. Time Left" metric was
 * never updated. The element is now looked up inside #benchmark-progress.
 *
 * @param {Object} status - status object delivered by the status endpoint.
 */
function updateProgressDisplay(status) {
    const percentage = status.total_examples > 0
        ? (status.completed_examples / status.total_examples * 100)
        : 0;

    // Update progress bar (using research progress component IDs)
    const progressBar = document.getElementById('progress-bar');
    const progressPercentage = document.getElementById('progress-percentage');
    const statusText = document.getElementById('status-text');
    const currentTask = document.getElementById('current-task');
    const currentBenchmark = document.getElementById('current-benchmark');

    if (progressBar) progressBar.style.width = percentage + '%';
    if (progressPercentage) progressPercentage.textContent = Math.round(percentage) + '%';
    if (statusText) statusText.textContent = status.status || 'Running';
    if (currentTask) currentTask.textContent = `Processing example ${status.completed_examples} of ${status.total_examples}`;
    if (currentBenchmark && status.run_name) currentBenchmark.textContent = status.run_name;

    // Update benchmark-specific metrics
    const progressPanel = document.getElementById('benchmark-progress');
    const overallAccuracy = document.getElementById('overall-accuracy');
    const accuracyConfidence = document.getElementById('accuracy-confidence');
    // Scoped lookup: see the note about the duplicated id in the header comment.
    const estimatedTime = progressPanel
        ? progressPanel.querySelector('#estimated-time')
        : null;
    const elapsedTime = document.getElementById('elapsed-time');
    const processingRate = document.getElementById('processing-rate');
    const completedCount = document.getElementById('completed-count');

    // Overall accuracy with confidence interval.
    // NOTE(review): a reported accuracy of exactly 0 is falsy and therefore
    // rendered as '--%'; confirm that this is intended.
    if (overallAccuracy) overallAccuracy.textContent =
        status.overall_accuracy ? status.overall_accuracy.toFixed(1) + '%' : '--%';

    if (accuracyConfidence && status.accuracy_confidence) {
        const conf = status.accuracy_confidence;
        accuracyConfidence.textContent =
            `±${conf.margin_of_error.toFixed(1)}% (95% CI, n=${conf.sample_size})`;
    } else if (accuracyConfidence) {
        accuracyConfidence.textContent = '--';
    }

    // Time estimates
    if (estimatedTime && status.estimated_time_remaining) {
        const minutes = Math.round(status.estimated_time_remaining / 60);
        estimatedTime.textContent = minutes > 0 ? `${minutes}m` : '<1m';
    } else if (estimatedTime) {
        estimatedTime.textContent = '--';
    }

    if (elapsedTime && status.total_elapsed_time) {
        const minutes = Math.round(status.total_elapsed_time / 60);
        elapsedTime.textContent = `${minutes}m elapsed`;
    } else if (elapsedTime) {
        elapsedTime.textContent = '--';
    }

    // Average processing time per example
    if (processingRate && status.avg_time_per_example) {
        const avgMinutes = (status.avg_time_per_example / 60).toFixed(1);
        processingRate.textContent = `${avgMinutes}m`;
    } else if (processingRate) {
        processingRate.textContent = '--';
    }

    if (completedCount) completedCount.textContent = status.completed_examples;

    // Update per-dataset accuracy displays
    const simpleqaAccuracy = document.getElementById('simpleqa-accuracy');
    const browsecompAccuracy = document.getElementById('browsecomp-accuracy');

    if (simpleqaAccuracy) simpleqaAccuracy.textContent =
        status.simpleqa_accuracy ? status.simpleqa_accuracy.toFixed(1) + '%' : '--%';
    if (browsecompAccuracy) browsecompAccuracy.textContent =
        status.browsecomp_accuracy ? status.browsecomp_accuracy.toFixed(1) + '%' : '--%';
}
1326
+
1327
/**
 * Ask the backend to cancel the tracked run. On success polling is stopped
 * and the form is restored; otherwise the error is shown as an alert.
 * Does nothing when no run is being tracked.
 */
function cancelBenchmark() {
    if (!currentBenchmarkId) return;

    const cancelUrl = `/benchmark/api/cancel/${currentBenchmarkId}`;

    fetch(cancelUrl, { method: 'POST' })
        .then((response) => response.json())
        .then((result) => {
            if (!result.success) {
                showAlert('Error cancelling benchmark: ' + result.error, 'error');
                return;
            }
            showAlert('Benchmark cancelled successfully.', 'info');
            clearInterval(progressInterval);
            progressInterval = null;
            resetForm();
        })
        .catch((error) => {
            console.error('Cancel error:', error);
            showAlert('Error cancelling benchmark: ' + error.message, 'error');
        });
}
1349
+
1350
/**
 * Return the page to its initial state: show the form, hide the progress
 * panel and charts section, stop polling, clear the collected chart and
 * search-monitoring data, destroy the chart instances and unsubscribe from
 * the socket events of the run that was being tracked.
 *
 * Fix: the run id is captured before `currentBenchmarkId` is cleared.
 * Previously the id was nulled first, so the unsubscribe call always received
 * null and the real subscription was never removed. The unsubscribe is also
 * skipped when there was no tracked run.
 */
function resetForm() {
    // Must be read before currentBenchmarkId is cleared below.
    const trackedId = currentBenchmarkId;

    document.getElementById('benchmark-form').style.display = 'block';
    document.getElementById('benchmark-progress').style.display = 'none';
    document.getElementById('performance-charts-section').style.display = 'none';
    currentBenchmarkId = null;

    // Clear any running intervals
    if (progressInterval) {
        clearInterval(progressInterval);
        progressInterval = null;
    }

    // Reset loading flags
    window.modelsLoading = false;

    // Reset chart data
    chartData = {
        examples: [],
        accuracies: [],
        processingTimes: [],
        timestamps: [],
        searchResultCounts: []
    };

    // Reset search quality monitoring
    recentSearchCounts = [];
    searchQualityAlert = false;

    // Destroy existing charts
    if (accuracyChart) {
        accuracyChart.destroy();
        accuracyChart = null;
    }
    if (timingChart) {
        timingChart.destroy();
        timingChart = null;
    }
    if (searchResultsChart) {
        searchResultsChart.destroy();
        searchResultsChart = null;
    }

    // Unsubscribe from the socket events of the run that was being tracked
    if (trackedId && window.socket && window.socket.unsubscribeFromResearch) {
        window.socket.unsubscribeFromResearch(trackedId);
    }
}
1397
+
1398
/**
 * Show a dismissible alert in the #benchmark-alert container, replacing any
 * alert that is currently displayed. Alerts of type 'success' hide
 * themselves after 5 seconds.
 *
 * Changes from the previous version:
 *  - `message` is inserted as text, not as HTML. Callers pass strings that
 *    come from server responses (error texts), which must not be interpreted
 *    as markup.
 *  - A pending auto-hide timer is cancelled whenever a new alert is shown, so
 *    the timer of an earlier success alert can no longer hide a later alert.
 *
 * @param {string} message - text to display.
 * @param {string} type - suffix of the `alert-<type>` CSS class, e.g.
 *     'success', 'error', 'info' or 'warning'.
 */
function showAlert(message, type) {
    const alertContainer = document.getElementById('benchmark-alert');

    // Cancel the auto-hide of a previously shown success alert.
    if (showAlert.hideTimer) {
        clearTimeout(showAlert.hideTimer);
        showAlert.hideTimer = null;
    }

    const alertBox = document.createElement('div');
    alertBox.className = `settings-alert alert-${type}`;

    const messageEl = document.createElement('span');
    messageEl.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.className = 'close-alert';
    closeButton.innerHTML = '<i class="fas fa-times"></i>'; // static markup only
    closeButton.addEventListener('click', () => {
        alertContainer.style.display = 'none';
    });

    alertBox.appendChild(messageEl);
    alertBox.appendChild(closeButton);

    alertContainer.textContent = ''; // drop the previous alert
    alertContainer.appendChild(alertBox);
    alertContainer.style.display = 'block';

    // Auto-hide success messages
    if (type === 'success') {
        showAlert.hideTimer = setTimeout(() => {
            alertContainer.style.display = 'none';
            showAlert.hideTimer = null;
        }, 5000);
    }
}
1417
+
1418
/**
 * On page load, ask the backend whether a benchmark is already running and,
 * if one is, reattach the UI to it (progress panel instead of form, progress
 * tracking started). A failed request is only logged.
 */
function checkForRunningBenchmark() {
    // Reattach to the reported run, if the answer names one.
    const resumeTracking = (data) => {
        if (!(data.success && data.benchmark_run_id)) {
            return;
        }

        currentBenchmarkId = data.benchmark_run_id;
        showAlert(`Reconnected to running benchmark #${currentBenchmarkId}`, 'info');

        // Show progress panel and hide form
        document.getElementById('benchmark-form').style.display = 'none';
        document.getElementById('benchmark-progress').style.display = 'block';

        startProgressTracking();
    };

    fetch('/benchmark/api/running')
        .then((response) => response.json())
        .then(resumeTracking)
        .catch(() => {
            console.log('No running benchmark found (this is normal)');
        });
}
1439
+
1440
/**
 * Fetch the settings a benchmark run will use and show them on the page.
 *
 * Ten settings endpoints are requested in parallel. The LLM provider/model,
 * search tool, iterations, questions per iteration and search strategy are
 * written into their display elements (with fallback texts when a value is
 * missing). The four evaluation settings are stored in the page-level
 * eval*Data variables and logged. Any failure marks all metric values as
 * 'Error loading' and raises a warning alert.
 */
async function loadCurrentSettings() {
    console.log('Starting loadCurrentSettings...');

    // Write `text` into the element with the given id.
    const display = (elementId, text) => {
        document.getElementById(elementId).textContent = text;
    };

    try {
        // Request order matters: the responses are destructured positionally.
        const settingUrls = [
            '/settings/api/llm.provider',
            '/settings/api/llm.model',
            '/settings/api/search.tool',
            '/settings/api/search.iterations',
            '/settings/api/search.questions_per_iteration',
            '/settings/api/search.search_strategy',
            '/settings/api/benchmark.evaluation.provider',
            '/settings/api/benchmark.evaluation.model',
            '/settings/api/benchmark.evaluation.temperature',
            '/settings/api/benchmark.evaluation.endpoint_url'
        ];
        const [
            providerRes,
            modelRes,
            toolRes,
            iterationsRes,
            questionsRes,
            strategyRes,
            evalProviderRes,
            evalModelRes,
            evalTempRes,
            evalEndpointRes
        ] = await Promise.all(settingUrls.map((url) => fetch(url)));

        // Parse the bodies one after another, in request order.
        const providerSetting = await providerRes.json();
        const modelSetting = await modelRes.json();
        const toolSetting = await toolRes.json();
        const iterationsSetting = await iterationsRes.json();
        const questionsSetting = await questionsRes.json();
        const strategySetting = await strategyRes.json();
        evalProviderData = await evalProviderRes.json();
        evalModelData = await evalModelRes.json();
        evalTempData = await evalTempRes.json();
        evalEndpointData = await evalEndpointRes.json();

        // LLM provider and model. Problems here are logged without aborting
        // the remaining displays.
        try {
            const providerEl = document.getElementById('current-provider');
            const modelEl = document.getElementById('current-model');

            const providerValue = providerSetting?.settings?.value;
            if (providerEl) {
                providerEl.textContent = providerValue ? providerValue.toUpperCase() : 'Not set';
            }

            const modelValue = modelSetting?.settings?.value;
            if (modelEl) {
                modelEl.textContent = modelValue || 'Not set';
            }
        } catch (e) {
            console.error('Error setting LLM display:', e);
        }

        // Search tool, followed by the search engine warning check when a
        // tool is configured.
        const searchTool = toolSetting?.settings?.value;
        display('current-search-tool', searchTool || 'Not set');
        if (searchTool) {
            checkSearchEngineWarnings(searchTool);
        }

        // Numeric search settings; a missing or falsy value shows the default.
        display('current-iterations', iterationsSetting?.settings?.value || '8');
        display('current-questions', questionsSetting?.settings?.value || '5');

        // Search strategy with its default.
        display('current-strategy', strategySetting?.settings?.value || 'focused_iteration');

        // The evaluation settings have no display elements; they are only logged.
        // TODO: Add evaluation settings display section if needed
        console.log('Evaluation settings loaded:', {
            provider: evalProviderData?.settings?.value || 'openai_endpoint',
            model: evalModelData?.settings?.value || 'anthropic/claude-3.7-sonnet',
            temperature: evalTempData?.settings?.value || 0,
            endpoint: evalEndpointData?.settings?.value || 'https://openrouter.ai/api/v1'
        });

    } catch (error) {
        console.error('Error loading current settings:', error);
        console.error('Error details:', error.message);
        console.error('Error stack:', error.stack);

        // Set error text on all metric values
        const metricValues = document.querySelectorAll('#current-settings-display .metric-value');
        for (const metricEl of metricValues) {
            metricEl.textContent = 'Error loading';
        }

        showAlert('Could not load current settings. Check console for details.', 'warning');
    }
}
1565
+
1566
/**
 * Refresh the "Current Question" card from a status snapshot.
 *
 * 'in_progress' shows a spinner and the number of the example being worked
 * on, 'completed' shows the final tally, and every other state shows the idle
 * placeholder texts.
 *
 * @param {Object} status - reads status, completed_examples, total_examples.
 */
function updateCurrentQuestion(status) {
    const questionEl = document.getElementById('current-question-text');
    const datasetEl = document.getElementById('current-dataset');
    const exampleEl = document.getElementById('current-example-id');
    const processingEl = document.getElementById('current-processing-status');

    // Decide what to show first, then apply it in one place.
    let view;
    switch (status.status) {
        case 'in_progress': {
            const exampleNumber = status.completed_examples + 1;
            view = {
                statusHtml: '<i class="fas fa-spinner fa-spin"></i> Processing...',
                statusClass: 'processing-status processing',
                question: `Processing example ${exampleNumber} of ${status.total_examples}`,
                dataset: 'Active',
                example: `Example ${exampleNumber}`
            };
            break;
        }
        case 'completed':
            view = {
                statusHtml: '<i class="fas fa-check-circle"></i> Benchmark completed!',
                statusClass: 'processing-status completed',
                question: 'All questions processed successfully.',
                dataset: 'Completed',
                example: `${status.completed_examples}/${status.total_examples}`
            };
            break;
        default:
            view = {
                statusHtml: '<i class="fas fa-clock"></i> Waiting...',
                statusClass: 'processing-status',
                question: 'No question being processed...',
                dataset: '--',
                example: '--'
            };
    }

    processingEl.innerHTML = view.statusHtml;
    processingEl.className = view.statusClass;
    questionEl.textContent = view.question;
    datasetEl.textContent = view.dataset;
    exampleEl.textContent = view.example;
}
1595
+
1596
// Most recently rendered results; lets an unchanged response skip re-rendering.
let lastResultsData = null;

/**
 * Fetch up to 50 results of the tracked run and re-render the results list,
 * but only when the payload differs from the one rendered last.
 * Does nothing when no run is being tracked.
 */
function updateRecentResults() {
    if (!currentBenchmarkId) return;

    const resultsUrl = `/benchmark/api/results/${currentBenchmarkId}?limit=50`;

    fetch(resultsUrl)
        .then((response) => response.json())
        .then((payload) => {
            if (!(payload.success && payload.results)) {
                return;
            }

            // Compare serialized forms to detect a change in content.
            const incoming = JSON.stringify(payload.results);
            const rendered = JSON.stringify(lastResultsData);
            if (incoming === rendered) {
                return;
            }

            lastResultsData = payload.results;
            displayRecentResults(payload.results);
        })
        .catch((error) => {
            console.error('Error fetching all results:', error);
        });
}
1620
+
1621
// Map a search-result count to the CSS severity class used by the count badge:
// 'critical' for count <= 1, 'warning' for count <= 4, otherwise 'good'.
function getSearchCountClass(count) {
    let severity = 'good';
    if (count <= 1) {
        severity = 'critical';
    } else if (count <= 4) {
        severity = 'warning';
    }
    return severity;
}
1626
+
1627
// Render the given benchmark results as cards inside #recent-results-container.
//
// results: array of result objects; the fields read here are is_correct,
//          dataset_type, example_id, search_result_count, question,
//          model_answer and correct_answer.
//
// Long answers (> 200 chars) are rendered truncated with a "Show more" link
// wired to toggleText(); the expanded/collapsed state of each answer is
// carried over across re-renders (keyed by the card's position in the list).
//
// All result-supplied text is HTML-escaped before being placed into innerHTML,
// because questions and answers are free text and must not be interpreted as
// markup. Does nothing if the container element is not present.
function displayRecentResults(results) {
    const container = document.getElementById('recent-results-container');
    if (!container) {
        return;
    }

    if (!results || results.length === 0) {
        container.innerHTML = '<div class="no-results">No results yet...</div>';
        return;
    }

    // Escape a value for safe inclusion in HTML text or attribute context.
    const escapeHtml = value => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Save expanded states before re-rendering
    const expandedStates = {};
    container.querySelectorAll('[id^="toggle-"]').forEach(toggle => {
        const id = toggle.id.replace('toggle-', '');
        const fullTextElement = document.getElementById(`full-${id}`);
        if (fullTextElement && fullTextElement.style.display !== 'none') {
            expandedStates[id] = true;
        }
    });

    // Build the (escaped) markup for a possibly long piece of text. `id` is
    // generated locally (never from result data), so it is safe inside the
    // inline onclick handler.
    const createExpandableText = (text, id, maxLength = 200) => {
        if (!text) return 'No answer provided';

        // Coerce so that non-string answers (e.g. numbers) do not throw.
        const fullText = String(text);
        if (fullText.length <= maxLength) return escapeHtml(fullText);

        const truncated = escapeHtml(fullText.substring(0, maxLength)) + '...';
        const isExpanded = expandedStates[id] || false;

        return `
            <span id="truncated-${id}" style="display: ${isExpanded ? 'none' : 'inline'};">${truncated}</span>
            <span id="full-${id}" style="display: ${isExpanded ? 'inline' : 'none'};">${escapeHtml(fullText)}</span>
            <a href="#" onclick="toggleText('${id}'); return false;" id="toggle-${id}" style="color: #2196f3; font-size: 0.85rem; margin-left: 5px; text-decoration: underline;">${isExpanded ? 'Show less' : 'Show more'}</a>
        `;
    };

    const resultsHtml = results.map((result, index) => {
        const statusClass = result.is_correct ? 'correct' : 'incorrect';
        const statusIcon = result.is_correct ? '<i class="fas fa-check-circle"></i>' : '<i class="fas fa-times-circle"></i>';
        const statusText = result.is_correct ? 'Correct' : 'Incorrect';

        // Treat null like undefined: no badge rather than a "null results" badge.
        const hasSearchCount = result.search_result_count !== undefined && result.search_result_count !== null;
        const searchCountBadge = hasSearchCount ?
            `<span class="search-count-badge ${getSearchCountClass(result.search_result_count)}" title="Search results found">${escapeHtml(result.search_result_count)} results</span>` :
            '';

        return `
            <div class="result-card ${statusClass}">
                <div class="result-header">
                    <div>
                        <span class="dataset-badge">${escapeHtml(result.dataset_type)}</span>
                        <span class="example-id">${escapeHtml(result.example_id)}</span>
                        ${searchCountBadge}
                    </div>
                    <span class="result-status ${statusClass}">
                        ${statusIcon} ${statusText}
                    </span>
                </div>
                <div class="question-text" style="margin-bottom: 10px; font-size: 0.9rem;">
                    ${escapeHtml(result.question || 'No question provided')}
                </div>
                <div class="answer-comparison">
                    <div class="answer-box model-answer">
                        <div class="answer-label">Model Answer</div>
                        <div>${createExpandableText(result.model_answer || 'No answer provided', `model-${index}`)}</div>
                    </div>
                    <div class="answer-box correct-answer">
                        <div class="answer-label">Correct Answer</div>
                        <div>${createExpandableText(result.correct_answer || 'No correct answer available', `correct-${index}`)}</div>
                    </div>
                </div>
            </div>
        `;
    }).join('');

    container.innerHTML = resultsHtml;
}
1700
+
1701
+ // Toggle function for expandable text
1702
// Flip an expandable answer between its truncated and full form.
// id: the suffix shared by the truncated-<id>, full-<id> and toggle-<id> elements.
function toggleText(id) {
    const [shortSpan, longSpan, link] = ['truncated', 'full', 'toggle']
        .map(prefix => document.getElementById(`${prefix}-${id}`));

    // The truncated span being hidden means the full text is currently shown.
    const collapsing = shortSpan.style.display === 'none';

    shortSpan.style.display = collapsing ? 'inline' : 'none';
    longSpan.style.display = collapsing ? 'none' : 'inline';
    link.textContent = collapsing ? 'Show more' : 'Show less';
}
1717
+
1718
+ // Chart initialization and management
1719
// Create the three line charts (accuracy, timing, search results) on their
// canvases and store them in accuracyChart, timingChart and searchResultsChart
// (variables declared outside this function). All charts start empty and share
// the same dark-theme styling; only the y-axis bounds and tick labels differ.
function initializeCharts() {
    const TICK_COLOR = '#a0a0a0';
    const GRID_COLOR = '#333';

    // Options shared by every chart.
    const chartOptions = {
        responsive: true,
        maintainAspectRatio: false,
        plugins: {
            legend: {
                labels: {
                    color: '#e0e0e0'
                }
            }
        },
        scales: {
            x: {
                ticks: { color: TICK_COLOR },
                grid: { color: GRID_COLOR }
            },
            y: {
                ticks: { color: TICK_COLOR },
                grid: { color: GRID_COLOR }
            }
        }
    };

    // Shared options with the y axis overridden: extra bounds (min/max) plus a
    // tick formatter.
    const optionsWithYAxis = (yBounds, formatTick) => ({
        ...chartOptions,
        scales: {
            ...chartOptions.scales,
            y: {
                ...chartOptions.scales.y,
                ...yBounds,
                ticks: {
                    ...chartOptions.scales.y.ticks,
                    callback: formatTick
                }
            }
        }
    });

    // One empty line series.
    const lineSeries = (label, borderColor, backgroundColor, fill) => ({
        label,
        data: [],
        borderColor,
        backgroundColor,
        tension: 0.4,
        fill
    });

    // Instantiate an empty line chart on the canvas with the given id.
    const createLineChart = (canvasId, datasets, options) => {
        const ctx = document.getElementById(canvasId).getContext('2d');
        return new Chart(ctx, {
            type: 'line',
            data: {
                labels: [],
                datasets
            },
            options
        });
    };

    // Accuracy Chart: percentages on a fixed 0-100 axis
    accuracyChart = createLineChart(
        'accuracy-chart',
        [
            lineSeries('Overall Accuracy', '#4caf50', 'rgba(76, 175, 80, 0.1)', true),
            lineSeries('SimpleQA Accuracy', '#2196f3', 'rgba(33, 150, 243, 0.1)', false),
            lineSeries('BrowseComp Accuracy', '#ff9800', 'rgba(255, 152, 0, 0.1)', false)
        ],
        optionsWithYAxis({ min: 0, max: 100 }, value => value + '%')
    );

    // Timing Chart: seconds, open-ended upper bound
    timingChart = createLineChart(
        'timing-chart',
        [
            lineSeries('Processing Time (seconds)', '#e91e63', 'rgba(233, 30, 99, 0.1)', true)
        ],
        optionsWithYAxis({ min: 0 }, value => value + 's')
    );

    // Search Results Chart: result counts, rounded for display
    searchResultsChart = createLineChart(
        'search-results-chart',
        [
            lineSeries('Search Results Count', '#9c27b0', 'rgba(156, 39, 176, 0.1)', true)
        ],
        optionsWithYAxis({ min: 0 }, value => Math.round(value) + ' results')
    );
}
1864
+
1865
// Push the latest benchmark status into the accuracy and timing charts.
//
// status: status object; reads completed_examples (used as the x-axis label),
//         overall_accuracy, simpleqa_accuracy, browsecomp_accuracy and
//         avg_time_per_example.
//
// A point is added per distinct completed_examples value; if that label is
// already plotted its values are overwritten instead. Each chart keeps at most
// 50 points. No-op until all three charts exist and at least one example is done.
function updateCharts(status) {
    if (!accuracyChart || !timingChart || !searchResultsChart || !status) return;

    const currentExample = status.completed_examples;
    if (currentExample <= 0) return;

    const MAX_POINTS = 50;

    // Store values[i] in dataset i at the slot labelled currentExample,
    // appending a new slot when that label is not on the chart yet.
    const upsertPoint = (chart, values) => {
        const labels = chart.data.labels;
        const series = chart.data.datasets;

        if (labels.includes(currentExample)) {
            const slot = labels.indexOf(currentExample);
            if (slot >= 0) {
                values.forEach((value, i) => {
                    series[i].data[slot] = value;
                });
            }
            return;
        }

        labels.push(currentExample);
        values.forEach((value, i) => {
            series[i].data.push(value);
        });
    };

    // Update accuracy chart
    if (status.overall_accuracy !== undefined) {
        upsertPoint(accuracyChart, [
            status.overall_accuracy || 0,
            status.simpleqa_accuracy || 0,
            status.browsecomp_accuracy || 0
        ]);

        // Keep only last 50 data points
        const accuracyLabels = accuracyChart.data.labels;
        if (accuracyLabels.length > MAX_POINTS) {
            accuracyLabels.shift();
            for (const dataset of accuracyChart.data.datasets) {
                dataset.data.shift();
            }
        }

        // 'none' is passed as the update mode
        accuracyChart.update('none');
    }

    // Update timing chart
    if (status.avg_time_per_example !== undefined) {
        upsertPoint(timingChart, [status.avg_time_per_example || 0]);

        // Keep only last 50 data points
        const timingLabels = timingChart.data.labels;
        if (timingLabels.length > MAX_POINTS) {
            timingLabels.shift();
            timingChart.data.datasets[0].data.shift();
        }

        timingChart.update('none');
    }
}
1922
+
1923
+ // Load historical chart data when reconnecting to running benchmark
1924
// Pre-fill the accuracy and timing charts when reconnecting to a benchmark
// that is already running.
//
// The status endpoint only reports current aggregates, so the "history" is an
// approximation: the current values are repeated at evenly spaced example
// numbers (about 20 points). Real per-example history would have to be stored
// server-side. Errors are logged and otherwise ignored.
async function loadHistoricalChartData() {
    if (!currentBenchmarkId || !accuracyChart || !timingChart) return;

    try {
        // Get benchmark status to populate initial chart data
        const response = await fetch(`/benchmark/api/status/${currentBenchmarkId}`);
        const data = await response.json();

        const hasProgress = data.success && data.status.completed_examples > 0;
        if (!hasProgress) {
            return;
        }

        const status = data.status;
        const totalCompleted = status.completed_examples;

        // Spacing between simulated points; never less than one example.
        const step = Math.max(1, Math.floor(totalCompleted / 20));

        // Current aggregates, reused for every simulated point.
        const overall = status.overall_accuracy || 0;
        const simpleqa = status.simpleqa_accuracy || 0;
        const browsecomp = status.browsecomp_accuracy || 0;
        const avgTime = status.avg_time_per_example || 0;

        for (let example = 1; example <= totalCompleted; example += step) {
            accuracyChart.data.labels.push(example);
            accuracyChart.data.datasets[0].data.push(overall);
            accuracyChart.data.datasets[1].data.push(simpleqa);
            accuracyChart.data.datasets[2].data.push(browsecomp);

            timingChart.data.labels.push(example);
            timingChart.data.datasets[0].data.push(avgTime);
        }

        accuracyChart.update();
        timingChart.update();
    } catch (error) {
        console.error('Error loading historical chart data:', error);
    }
}
1957
+
1958
+ // Search quality monitoring functions
1959
// Sample the five most recent results of the current benchmark, average their
// search_result_count values and feed that average to the search-results chart
// and the search-quality alert. Results without a count are left out of the
// average; nothing is updated when no result carries a count.
async function updateSearchQualityMonitoring() {
    if (!currentBenchmarkId) return;

    try {
        // Fetch recent results to get search counts
        const response = await fetch(`/benchmark/api/results/${currentBenchmarkId}?limit=5`);
        const data = await response.json();

        const hasResults = data.success && data.results && data.results.length > 0;
        if (!hasResults) {
            return;
        }

        // Search result counts are already calculated by the backend.
        const counts = data.results
            .map(result => result.search_result_count)
            .filter(count => count !== undefined && count !== null);

        if (counts.length > 0) {
            const total = counts.reduce((sum, count) => sum + count, 0);
            const avgSearchResults = total / counts.length;
            updateSearchResultsChart(avgSearchResults);
            updateSearchQualityAlert(avgSearchResults);
        }
    } catch (error) {
        console.error('Error updating search quality monitoring:', error);
    }
}
1990
+
1991
// Append one sample to the search-results chart, labelled with the current
// local time, and record it in recentSearchCounts for alert monitoring.
// The chart keeps the latest 20 samples, recentSearchCounts the latest 10.
function updateSearchResultsChart(avgSearchResults) {
    if (!searchResultsChart || !currentBenchmarkId) return;

    const CHART_WINDOW = 20;
    const ALERT_WINDOW = 10;

    const timeLabels = searchResultsChart.data.labels;
    const samples = searchResultsChart.data.datasets[0].data;

    // Always add new data point with timestamp
    timeLabels.push(new Date().toLocaleTimeString());
    samples.push(avgSearchResults);

    // Drop the oldest point once the window is exceeded, for readability
    if (timeLabels.length > CHART_WINDOW) {
        timeLabels.shift();
        samples.shift();
    }

    // Store for alert monitoring
    recentSearchCounts.push(avgSearchResults);
    if (recentSearchCounts.length > ALERT_WINDOW) {
        recentSearchCounts.shift();
    }

    searchResultsChart.update('none');
}
2016
+
2017
// Update the search-quality status widget for the given average number of
// search results per query, and raise a one-off alert when quality degrades.
//
// Levels: < 2 critical, < 5 warning, < 10 caution, otherwise good.
// The rate-limit warning banner is shown on critical and hidden again only
// when the level returns to good. The searchQualityAlert flag suppresses
// repeated alerts until the level is good again.
//
// No-op if any of the three status elements is missing. The rate-limit banner
// is optional: when it is absent the status widget is still updated.
function updateSearchQualityAlert(avgSearchResults) {
    const statusIcon = document.getElementById('search-status-icon');
    const statusText = document.getElementById('search-status-text');
    const statusDetails = document.getElementById('search-status-details');

    if (!statusIcon || !statusText || !statusDetails) return;

    const perQuery = avgSearchResults.toFixed(1);

    // Determine alert level based on search result count
    let alertLevel;
    let alertMessage;
    let alertDetails;
    let alertIcon;
    let alertColor;

    if (avgSearchResults < 2) {
        alertLevel = 'critical';
        alertMessage = 'CRITICAL: Very few search results';
        alertDetails = `Only ${perQuery} results per query. Accuracy likely severely degraded.`;
        alertIcon = 'fas fa-exclamation-triangle';
        alertColor = '#f44336';
    } else if (avgSearchResults < 5) {
        alertLevel = 'warning';
        alertMessage = 'WARNING: Low search results';
        alertDetails = `${perQuery} results per query. Consider restarting SearXNG.`;
        alertIcon = 'fas fa-exclamation-circle';
        alertColor = '#ff9800';
    } else if (avgSearchResults < 10) {
        alertLevel = 'caution';
        alertMessage = 'CAUTION: Moderate search results';
        alertDetails = `${perQuery} results per query. Performance may be affected.`;
        alertIcon = 'fas fa-info-circle';
        alertColor = '#2196f3';
    } else {
        alertLevel = 'good';
        alertMessage = 'GOOD: Healthy search results';
        alertDetails = `${perQuery} results per query. Search engines working well.`;
        alertIcon = 'fas fa-check-circle';
        alertColor = '#4caf50';
    }

    // Show the rate limit warning on critical, hide it once results are healthy.
    // Guarded like the status elements above so a missing banner cannot abort
    // the rest of the update.
    const rateLimitWarning = document.getElementById('rate-limit-warning');
    if (rateLimitWarning) {
        if (alertLevel === 'critical') {
            rateLimitWarning.style.display = 'block';
        } else if (alertLevel === 'good') {
            rateLimitWarning.style.display = 'none';
        }
    }

    // Update UI
    statusIcon.innerHTML = `<i class="${alertIcon}"></i>`;
    statusIcon.style.color = alertColor;
    statusText.textContent = alertMessage;
    statusText.style.color = alertColor;
    statusDetails.textContent = alertDetails;

    // Trigger alert if we detect degradation
    if (alertLevel === 'critical' && !searchQualityAlert) {
        searchQualityAlert = true;
        showAlert('Search engine performance critically degraded! Consider restarting SearXNG.', 'error');
    } else if (alertLevel === 'warning' && !searchQualityAlert) {
        searchQualityAlert = true;
        showAlert('Search engine performance is declining. Monitor closely.', 'warning');
    } else if (alertLevel === 'good') {
        searchQualityAlert = false; // Reset alert flag when performance improves
    }
}
2084
+
2085
+ // Rate limiting status monitoring (simplified)
2086
// Rate limiting status monitoring (simplified).
// Query the search-quality endpoint, pick the first engine whose engine_type
// contains "searxng" and, when its recent average result count is below 2,
// log a warning and append a "Very low results" note to the status details
// (once; the note is not appended again while it is still present).
async function updateRateLimitingStatus() {
    try {
        const response = await fetch('/benchmark/api/search-quality');
        const data = await response.json();

        const hasStats = data.success && data.search_quality && data.search_quality.length > 0;
        if (!hasStats) {
            return;
        }

        // SearXNG is singled out as the most critical engine for benchmarks
        const isSearxng = stat => stat.engine_type.toLowerCase().includes('searxng');
        const searxngStats = data.search_quality.find(isSearxng);

        const criticallyLow = searxngStats && searxngStats.recent_avg_results < 2;
        if (!criticallyLow) {
            return;
        }

        console.warn('Low search results detected:', searxngStats);

        const statusDetails = document.getElementById('search-status-details');
        if (statusDetails && !statusDetails.textContent.includes('Very low results')) {
            statusDetails.textContent += ` Very low results: ${searxngStats.recent_avg_results.toFixed(1)} avg.`;
        }
    } catch (error) {
        console.error('Error fetching rate limiting status:', error);
    }
}
2111
+
2112
+ // Check for search engine warnings and display appropriate messages
2113
// Show or hide the search-engine warning banner for the selected search tool.
// searchTool: engine name (matched case-insensitively); engines without an
//             entry below, or a missing name, hide the banner.
function checkSearchEngineWarnings(searchTool) {
    const warningContainer = document.getElementById('search-engine-warning');
    const warningText = document.getElementById('search-warning-text');

    // Warning text per engine, keyed by lower-case engine name.
    const warningsByEngine = new Map([
        ['searxng', 'SearXNG is a shared resource. Please use reasonable example counts to avoid affecting other users. Consider shorter benchmarks for testing.'],
        ['arxiv', 'ArXiv is a shared resource containing only academic papers - benchmarking with SimpleQA is useless as it will find zero relevant results for general knowledge questions. Should not be used for this test. Use Tavily instead.'],
        ['pubmed', 'PubMed is a shared resource containing only medical literature - benchmarking with SimpleQA is absolutely useless as general knowledge questions will find zero relevant results. Should not be used for this test. Use Tavily instead.'],
        ['semanticscholar', 'Semantic Scholar is a shared resource specialized for academic research - not suitable for general SimpleQA questions and should not be used for this test. Use Tavily instead.'],
        ['wikipedia', 'Wikipedia is a shared resource with limited coverage - benchmarking with it is useless for comprehensive testing and should not be used for this test. Use Tavily instead.']
    ]);

    const message = warningsByEngine.get(searchTool?.toLowerCase());

    if (message === undefined) {
        warningContainer.style.display = 'none';
        return;
    }

    warningText.textContent = message;
    warningContainer.style.display = 'block';
}
2152
+
2153
+ // ==============================================
2154
+ // Evaluation Settings Functionality
2155
+ // (Reusing research page model functionality)
2156
+ // ==============================================
2157
+
2158
+ // Evaluation settings DOM elements
2159
// Form controls of the evaluation settings section; looked up by
// initializeEvaluationSettings() and null until then.
let evaluationProviderSelect = null,
    evaluationModelInput = null,
    evaluationEndpointInput = null,
    evaluationTemperatureInput = null;

// Wire up the evaluation settings section: look up its form controls, fill the
// provider select, set up the model dropdown and event handlers, start loading
// the model list, and finally load the saved settings.
function initializeEvaluationSettings() {
    console.log('Initializing evaluation settings...');

    // Models per provider live on window.evaluationModels; keep any existing value.
    window.evaluationModels = window.evaluationModels || {};

    const byId = elementId => document.getElementById(elementId);
    evaluationProviderSelect = byId('evaluation_provider');
    evaluationModelInput = byId('evaluation_model');
    evaluationEndpointInput = byId('evaluation_endpoint_url');
    evaluationTemperatureInput = byId('evaluation_temperature');

    console.log('DOM elements found:', {
        provider: Boolean(evaluationProviderSelect),
        model: Boolean(evaluationModelInput),
        endpoint: Boolean(evaluationEndpointInput),
        temperature: Boolean(evaluationTemperatureInput)
    });

    // Order matters: the dropdown and handlers read the provider select.
    populateEvaluationProviders();
    setupEvaluationModelDropdown();
    setupEvaluationEventHandlers();

    // Populates window.evaluationModels once the API answers.
    loadEvaluationModelsFromAPI();

    // Load settings with a small delay to ensure DOM is ready
    setTimeout(() => loadEvaluationSettings(), 100);
}
2200
+
2201
// Rebuild the evaluation provider <select>: replace its options with the fixed
// provider list, select the value from its data-initial-value attribute
// (falling back to 'openai_endpoint'), and show the endpoint URL field only
// for the custom OpenAI endpoint provider.
function populateEvaluationProviders() {
    const select = evaluationProviderSelect;
    if (!select) return;

    // Provider options (same as research page), as [value, label] pairs
    const providerChoices = [
        ['ollama', 'Ollama (Local)'],
        ['openai', 'OpenAI (Cloud)'],
        ['anthropic', 'Anthropic (Cloud)'],
        ['openai_endpoint', 'Custom OpenAI Endpoint'],
        ['vllm', 'vLLM (Local)'],
        ['lmstudio', 'LM Studio (Local)'],
        ['llamacpp', 'Llama.cpp (Local)']
    ];

    // Clear existing options, then add the fresh ones
    select.innerHTML = '';
    for (const [value, label] of providerChoices) {
        const option = document.createElement('option');
        option.value = value;
        option.textContent = label;
        select.appendChild(option);
    }

    // Set initial value from data attribute
    const initialProvider = select.getAttribute('data-initial-value') || 'openai_endpoint';
    console.log('Setting initial evaluation provider to:', initialProvider);
    select.value = initialProvider;

    // Show/hide endpoint field based on initial provider
    const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
    if (endpointWrapper) {
        endpointWrapper.style.display = initialProvider === 'openai_endpoint' ? 'block' : 'none';
    }

    console.log('Populated evaluation providers with initial value:', initialProvider);
}
2239
+
2240
// Attach the custom dropdown component to the evaluation model input and wire
// the refresh button, which force-reloads the model list from the settings API.
// Does nothing when the model input or its dropdown list element is missing.
function setupEvaluationModelDropdown() {
    if (!evaluationModelInput) return;

    const dropdownList = document.getElementById('evaluation-model-dropdown-list');
    if (!dropdownList) return;

    // Option source for the dropdown: models loaded for the current provider,
    // or the built-in defaults when none are loaded.
    const currentModelOptions = () => {
        const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';

        const loaded = window.evaluationModels && window.evaluationModels[provider];
        if (loaded) {
            console.log(`Returning ${window.evaluationModels[provider].length} loaded models for ${provider}`);
            return window.evaluationModels[provider];
        }

        return getEvaluationModelOptions();
    };

    // Selection callback: mirror the value into the hidden input and persist it.
    const onModelSelected = (value, item) => {
        const hiddenInput = document.getElementById('evaluation_model_hidden');
        if (hiddenInput) {
            hiddenInput.value = value;
        }
        saveEvaluationSetting('benchmark.evaluation.model', value);
    };

    // Setup custom dropdown using the existing component
    if (window.setupCustomDropdown) {
        window.evaluationDropdownInstance = window.setupCustomDropdown(
            evaluationModelInput,
            dropdownList,
            currentModelOptions,
            onModelSelected,
            true, // Allow custom values
            'No models available'
        );
    }

    // Convert the API's { <provider>_models: [...] } map into
    // window.evaluationModels[<provider>] = [{ value, label }, ...].
    const storeProviderModels = providers => {
        window.evaluationModels = {};

        for (const [providerKey, models] of Object.entries(providers)) {
            if (!Array.isArray(models)) continue;

            // Map provider keys to expected provider names
            const providerName = providerKey.replace('_models', '').toLowerCase();

            window.evaluationModels[providerName] = models.map(model => ({
                value: model.value || model.id,
                label: model.label || model.name || model.value
            }));
            console.log(`Loaded ${models.length} models for ${providerName}`);
        }
    };

    // Setup refresh button
    const refreshBtn = document.querySelector('[data-target="evaluation-model-dropdown"] .refresh-btn');
    if (!refreshBtn) return;

    refreshBtn.addEventListener('click', event => {
        event.preventDefault();
        console.log('Refresh button clicked, force reloading models...');

        // Show loading state
        const icon = refreshBtn.querySelector('i');
        if (icon) {
            icon.classList.add('fa-spin');
        }

        // Force reload models from API
        window.modelsLoading = false; // Reset the flag

        fetch('/settings/api/available-models?force_refresh=true')
            .then(response => response.json())
            .then(data => {
                console.log('Force refresh received model data:', data);

                if (data && data.providers) {
                    storeProviderModels(data.providers);

                    // Update dropdown with new data
                    refreshEvaluationModels();
                }
            })
            .catch(error => {
                console.error('Error loading evaluation models:', error);
            })
            .finally(() => {
                // Remove loading state
                if (icon) {
                    icon.classList.remove('fa-spin');
                }
                window.modelsLoading = false;
            });
    });
}
2338
+
2339
// Return the model options for the currently selected evaluation provider.
//
// Returns the models cached in window.evaluationModels for that provider when
// there are any. Otherwise it requests a load from the API (unless one is
// already in flight) and returns a small built-in default list for the
// provider, or an empty array for providers without defaults.
//
// Returns: array of { value, label } objects.
function getEvaluationModelOptions() {
    const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';
    console.log('Getting evaluation model options for provider:', provider);

    // Check if we have loaded models
    const cached = window.evaluationModels && window.evaluationModels[provider];
    if (cached && cached.length > 0) {
        console.log(`Returning ${cached.length} cached models for ${provider}`);
        return cached;
    }

    // Load models from API if not already loading.
    // The loader owns window.modelsLoading. Setting the flag here first would
    // make the loader believe a load is already running, so it would skip the
    // fetch and never clear the flag, blocking every later attempt.
    if (!window.modelsLoading) {
        loadEvaluationModelsFromAPI();
    }

    // Return minimal defaults while loading
    console.log(`No models loaded yet for ${provider}, returning defaults`);
    if (provider === 'openai_endpoint') {
        return [
            { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
            { value: 'openai/gpt-4o', label: 'GPT-4o' },
            { value: '01-ai/yi-large', label: 'Yi Large' }
        ];
    } else if (provider === 'openai') {
        return [
            { value: 'gpt-4o', label: 'GPT-4o' },
            { value: 'gpt-4', label: 'GPT-4' },
            { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
        ];
    } else if (provider === 'anthropic') {
        return [
            { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
            { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
        ];
    }

    // No built-in defaults for other providers
    return [];
}
2379
+
2380
+ // Debounce function to prevent too many API calls
2381
// Wrap func so that it only runs once calls have stopped for `wait`
// milliseconds; every call restarts the timer and the last call's arguments
// are the ones passed to func. Used to prevent too many API calls.
function debounce(func, wait) {
    let pendingTimer;

    return function executedFunction(...args) {
        // Cancel the run scheduled by the previous call, if any.
        clearTimeout(pendingTimer);

        pendingTimer = setTimeout(() => {
            func(...args);
        }, wait);
    };
}
2392
+
2393
+ // Debounced version of loadEvaluationModelsFromAPI
2394
// Debounced (500 ms) loader for the evaluation model lists.
//
// forceRefresh: when true, adds force_refresh=true to the request and runs
//               even if window.modelsLoading says a load is in progress.
//
// On success window.evaluationModels is replaced with a map of
// provider name -> [{ value, label }, ...] and the dropdown is refreshed.
// window.modelsLoading is set for the duration of the request.
const loadEvaluationModelsFromAPI = debounce(function(forceRefresh = false) {
    console.log('Loading evaluation models from API...', forceRefresh ? '(force refresh)' : '');

    // Prevent multiple simultaneous loads
    const alreadyLoading = window.modelsLoading && !forceRefresh;
    if (alreadyLoading) {
        console.log('Models already loading, skipping...');
        return;
    }

    window.modelsLoading = true;

    // Use the correct API endpoint with optional force_refresh parameter
    let url = '/settings/api/available-models';
    if (forceRefresh) {
        url = '/settings/api/available-models?force_refresh=true';
    }

    const storeModels = data => {
        console.log('Received model data:', data);

        if (!data || !data.providers) {
            return;
        }

        window.evaluationModels = {};

        // Process each provider's models
        for (const [providerKey, models] of Object.entries(data.providers)) {
            if (!Array.isArray(models)) continue;

            // Map provider keys ("<provider>_models") to expected provider names
            const providerName = providerKey.replace('_models', '').toLowerCase();

            window.evaluationModels[providerName] = models.map(model => ({
                value: model.value || model.id,
                label: model.label || model.name || model.value
            }));
            console.log(`Loaded ${models.length} models for ${providerName}`);
        }

        // Update dropdown with new data
        refreshEvaluationModels();
    };

    fetch(url)
        .then(response => response.json())
        .then(storeModels)
        .catch(error => {
            console.error('Error loading evaluation models:', error);
        })
        .finally(() => {
            window.modelsLoading = false;
        });
}, 500); // Wait 500ms before making the API call
2447
+
2448
// Select the models belonging to `provider` (compared case-insensitively on
// each model's `provider` field) and normalize them to { value, label }.
// When nothing matches, a built-in default list is returned for
// openai_endpoint, openai and anthropic; other providers get an empty array.
function filterModelsForProvider(models, provider) {
    const wanted = provider.toUpperCase();

    const matching = models.filter(model => (model.provider || '').toUpperCase() === wanted);

    // If no models found for provider, return some defaults
    if (matching.length === 0) {
        switch (wanted) {
            case 'OPENAI_ENDPOINT':
                return [
                    { value: '01-ai/yi-large', label: 'Yi Large' },
                    { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
                    { value: 'openai/gpt-4o', label: 'GPT-4o' }
                ];
            case 'OPENAI':
                return [
                    { value: 'gpt-4o', label: 'GPT-4o' },
                    { value: 'gpt-4', label: 'GPT-4' },
                    { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
                ];
            case 'ANTHROPIC':
                return [
                    { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
                    { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
                ];
            default:
                // No defaults for this provider: fall through to the (empty) mapping
                break;
        }
    }

    return matching.map(model => {
        const value = model.value || model.id;
        const label = model.label || model.name || model.value;
        return { value, label };
    });
}
2488
+
2489
// Push the models loaded for the currently selected provider into the model
// dropdown and, when there is at least one option, click the model input
// shortly afterwards so the dropdown shows the updated options.
function refreshEvaluationModels() {
    const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';
    const options = (window.evaluationModels && window.evaluationModels[provider]) || [];

    console.log(`Refreshing evaluation dropdown with ${options.length} options for provider ${provider}`);

    if (!evaluationModelInput) {
        return;
    }

    // Update the dropdown with the actual loaded models, if the helper exists
    if (window.updateDropdownOptions) {
        window.updateDropdownOptions(evaluationModelInput, options);
    }

    // Trigger a click to show the dropdown with updated options
    if (options.length > 0) {
        setTimeout(() => {
            evaluationModelInput.click();
        }, 100);
    }
}
2510
+
2511
+
2512
/**
 * Attach `change` listeners to the evaluation form controls so that every
 * edit is persisted through `saveEvaluationSetting`.
 *
 * Controls that are not present are skipped. A provider change also
 * toggles the endpoint URL field and refreshes the model options.
 */
function setupEvaluationEventHandlers() {
    // Provider change handler
    if (evaluationProviderSelect) {
        evaluationProviderSelect.addEventListener('change', function() {
            const provider = this.value;
            console.log('Evaluation provider changed to:', provider);

            // The endpoint URL field is only shown for 'openai_endpoint'.
            if (evaluationEndpointInput && evaluationEndpointInput.parentNode) {
                evaluationEndpointInput.parentNode.style.display =
                    provider === 'openai_endpoint' ? 'block' : 'none';
            }

            // Update model options for the new provider
            refreshEvaluationModels();

            // Save provider setting
            saveEvaluationSetting('benchmark.evaluation.provider', provider);
        });
    }

    // Model input change handler
    // NOTE(review): this.value is the visible input text, and
    // loadEvaluationSettings may place a model *label* there - confirm the
    // value saved here is the model id the backend expects.
    if (evaluationModelInput) {
        evaluationModelInput.addEventListener('change', function() {
            saveEvaluationSetting('benchmark.evaluation.model', this.value);
        });
    }

    // Endpoint URL change handler
    if (evaluationEndpointInput) {
        evaluationEndpointInput.addEventListener('change', function() {
            saveEvaluationSetting('benchmark.evaluation.endpoint_url', this.value);
        });
    }

    // Temperature change handler
    if (evaluationTemperatureInput) {
        evaluationTemperatureInput.addEventListener('change', function() {
            const temperature = parseFloat(this.value);

            // parseFloat returns NaN for empty or non-numeric text, and
            // JSON.stringify turns NaN into null; skip the save rather than
            // overwrite the stored temperature with null.
            if (!Number.isFinite(temperature)) {
                console.warn('Ignoring invalid evaluation temperature:', this.value);
                return;
            }

            saveEvaluationSetting('benchmark.evaluation.temperature', temperature);
        });
    }
}
2554
+
2555
/**
 * Copy the previously fetched evaluation settings (provider, model,
 * endpoint URL, temperature) into their form controls.
 *
 * Does nothing beyond logging when any of the four settings payloads is
 * still missing. Each control is filled only if it exists and its payload
 * has a `settings` object; empty values fall back to built-in defaults.
 */
function loadEvaluationSettings() {
    console.log('Loading evaluation settings...');
    console.log('Current DOM elements state:', {
        provider: Boolean(evaluationProviderSelect),
        model: Boolean(evaluationModelInput),
        endpoint: Boolean(evaluationEndpointInput),
        temperature: Boolean(evaluationTemperatureInput)
    });

    // All four payloads must have been loaded before anything is applied.
    const everythingLoaded =
        evalProviderData && evalModelData && evalEndpointData && evalTempData;
    if (!everythingLoaded) {
        console.log('Evaluation settings not loaded yet, skipping...');
        return;
    }

    // Provider
    const providerSettings = evalProviderData.settings;
    if (evaluationProviderSelect && providerSettings) {
        const storedProvider = providerSettings.value || 'openai_endpoint';
        console.log('Setting evaluation provider to:', storedProvider);
        evaluationProviderSelect.value = storedProvider;

        // The endpoint URL field is only shown for 'openai_endpoint'.
        const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
        if (endpointWrapper) {
            if (storedProvider === 'openai_endpoint') {
                endpointWrapper.style.display = 'block';
            } else {
                endpointWrapper.style.display = 'none';
            }
        }
    }

    // Model
    const modelSettings = evalModelData.settings;
    if (evaluationModelInput && modelSettings) {
        const storedModel = modelSettings.value || 'anthropic/claude-3.7-sonnet';
        console.log('Setting evaluation model to:', storedModel);

        // Writes the stored model into the visible input, the hidden input
        // and the dropdown instance, using whatever models are loaded for
        // the currently selected provider at the time it runs.
        const applyStoredModel = function() {
            let activeProvider = 'openai_endpoint';
            if (evaluationProviderSelect) {
                activeProvider = evaluationProviderSelect.value;
            }

            let knownModels = [];
            if (window.evaluationModels && window.evaluationModels[activeProvider]) {
                knownModels = window.evaluationModels[activeProvider];
            }

            // Show the human-readable label when the model is known,
            // otherwise show the raw stored value.
            const match = knownModels.find(entry => entry.value === storedModel);
            if (!match) {
                evaluationModelInput.value = storedModel;
                console.log('No matching model found, setting raw value:', storedModel);
            } else {
                evaluationModelInput.value = match.label;
                console.log('Found matching model, setting label:', match.label);
            }

            // The hidden input always carries the raw model value.
            const hiddenField = document.getElementById('evaluation_model_hidden');
            if (hiddenField) {
                hiddenField.value = storedModel;
            }

            // Keep the dropdown component in sync when it offers setValue.
            const dropdown = window.evaluationDropdownInstance;
            if (dropdown && dropdown.setValue) {
                dropdown.setValue(storedModel, false);
            }
        };

        // Apply now if any models are loaded; otherwise retry once after 1s.
        const modelsReady =
            window.evaluationModels && Object.keys(window.evaluationModels).length > 0;
        if (modelsReady) {
            applyStoredModel();
        } else {
            setTimeout(applyStoredModel, 1000);
        }
    }

    // Endpoint URL
    const endpointSettings = evalEndpointData.settings;
    if (evaluationEndpointInput && endpointSettings) {
        const storedEndpoint = endpointSettings.value || 'https://openrouter.ai/api/v1';
        console.log('Setting evaluation endpoint to:', storedEndpoint);
        evaluationEndpointInput.value = storedEndpoint;
    }

    // Temperature
    const temperatureSettings = evalTempData.settings;
    if (evaluationTemperatureInput && temperatureSettings) {
        const storedTemperature = temperatureSettings.value || 0;
        console.log('Setting evaluation temperature to:', storedTemperature);
        evaluationTemperatureInput.value = storedTemperature;
    }
}
2642
+
2643
/**
 * Reset the evaluation form controls to their built-in defaults:
 * provider 'openai_endpoint' (with the endpoint field made visible),
 * model 'anthropic/claude-3.7-sonnet', the OpenRouter endpoint URL and
 * temperature 0. Controls that are not present are skipped.
 */
function setEvaluationDefaults() {
    console.log('Setting evaluation defaults');

    if (evaluationProviderSelect) {
        evaluationProviderSelect.value = 'openai_endpoint';

        // The default provider uses an endpoint URL, so reveal that field.
        const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
        if (endpointWrapper) {
            endpointWrapper.style.display = 'block';
        }
    }

    // Remaining controls are plain value assignments, applied in order.
    const fieldDefaults = [
        [evaluationModelInput, 'anthropic/claude-3.7-sonnet'],
        [evaluationEndpointInput, 'https://openrouter.ai/api/v1'],
        [evaluationTemperatureInput, 0]
    ];
    for (const [field, initialValue] of fieldDefaults) {
        if (field) {
            field.value = initialValue;
        }
    }
}
2656
+
2657
/**
 * Persist one evaluation setting by sending `{ value }` as JSON in a PUT
 * request to `/settings/api/<key>`.
 *
 * The request is fire-and-forget: the outcome is only reported on the
 * console and nothing is returned to the caller.
 *
 * @param {string} key - Setting key, appended to the settings API path.
 * @param {*} value - JSON-serialisable value to store.
 */
function saveEvaluationSetting(key, value) {
    console.log('Saving evaluation setting:', key, '=', value);

    // CSRF token from the page's meta tag; empty string when the tag is absent.
    const csrfToken = document.querySelector('meta[name="csrf-token"]')?.getAttribute('content') || '';

    fetch(`/settings/api/${key}`, {
        method: 'PUT',
        headers: {
            'Content-Type': 'application/json',
            'X-CSRFToken': csrfToken
        },
        body: JSON.stringify({ value: value })
    })
    .then(response =>
        // fetch() resolves for HTTP error statuses too, and such responses
        // may carry a non-JSON body. Report the status in that case instead
        // of an opaque JSON parse error.
        response.json().catch(() => {
            throw new Error(`Unexpected non-JSON response (HTTP ${response.status})`);
        })
    )
    .then(data => {
        // A JSON body of `null` is valid JSON, so guard before reading fields.
        if (data && data.success) {
            console.log('Successfully saved evaluation setting:', key);
        } else {
            console.error('Failed to save evaluation setting:', data ? data.error : data);
        }
    })
    .catch(error => {
        console.error('Error saving evaluation setting:', error);
    });
}
2683
+
2684
+
2685
+ </script>
2686
+ {% endblock %}
2687
+
2688
+ {% block page_scripts %}
2689
+ <!-- Load required services for progress tracking -->
2690
+ <script src="{{ url_for('research.serve_static', path='js/services/audio.js') }}"></script>
2691
+ <script src="{{ url_for('research.serve_static', path='js/services/ui.js') }}"></script>
2692
+ <script src="{{ url_for('research.serve_static', path='js/services/formatting.js') }}"></script>
2693
+ <script src="{{ url_for('research.serve_static', path='js/services/api.js') }}"></script>
2694
+ <script src="{{ url_for('research.serve_static', path='js/services/socket.js') }}"></script>
2695
+ <!-- Load custom dropdown component for evaluation model selection -->
2696
+ <script src="{{ url_for('research.serve_static', path='js/components/custom_dropdown.js') }}"></script>
2697
+ {% endblock %}