local-deep-research 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
- local_deep_research/api/__init__.py +2 -0
- local_deep_research/api/research_functions.py +177 -3
- local_deep_research/benchmarks/graders.py +150 -5
- local_deep_research/benchmarks/models/__init__.py +19 -0
- local_deep_research/benchmarks/models/benchmark_models.py +283 -0
- local_deep_research/benchmarks/ui/__init__.py +1 -0
- local_deep_research/benchmarks/web_api/__init__.py +6 -0
- local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
- local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
- local_deep_research/config/llm_config.py +106 -21
- local_deep_research/defaults/default_settings.json +447 -2
- local_deep_research/error_handling/report_generator.py +10 -0
- local_deep_research/llm/__init__.py +19 -0
- local_deep_research/llm/llm_registry.py +155 -0
- local_deep_research/metrics/db_models.py +3 -7
- local_deep_research/metrics/search_tracker.py +25 -11
- local_deep_research/search_system.py +12 -9
- local_deep_research/utilities/log_utils.py +23 -10
- local_deep_research/utilities/thread_context.py +99 -0
- local_deep_research/web/app_factory.py +32 -8
- local_deep_research/web/database/benchmark_schema.py +230 -0
- local_deep_research/web/database/convert_research_id_to_string.py +161 -0
- local_deep_research/web/database/models.py +55 -1
- local_deep_research/web/database/schema_upgrade.py +397 -2
- local_deep_research/web/database/uuid_migration.py +265 -0
- local_deep_research/web/routes/api_routes.py +62 -31
- local_deep_research/web/routes/history_routes.py +13 -6
- local_deep_research/web/routes/metrics_routes.py +264 -4
- local_deep_research/web/routes/research_routes.py +45 -18
- local_deep_research/web/routes/route_registry.py +352 -0
- local_deep_research/web/routes/settings_routes.py +382 -22
- local_deep_research/web/services/research_service.py +22 -29
- local_deep_research/web/services/settings_manager.py +53 -0
- local_deep_research/web/services/settings_service.py +2 -0
- local_deep_research/web/static/css/styles.css +8 -0
- local_deep_research/web/static/js/components/detail.js +7 -14
- local_deep_research/web/static/js/components/details.js +8 -10
- local_deep_research/web/static/js/components/fallback/ui.js +4 -4
- local_deep_research/web/static/js/components/history.js +6 -6
- local_deep_research/web/static/js/components/logpanel.js +14 -11
- local_deep_research/web/static/js/components/progress.js +51 -46
- local_deep_research/web/static/js/components/research.js +250 -89
- local_deep_research/web/static/js/components/results.js +5 -7
- local_deep_research/web/static/js/components/settings.js +32 -26
- local_deep_research/web/static/js/components/settings_sync.js +24 -23
- local_deep_research/web/static/js/config/urls.js +285 -0
- local_deep_research/web/static/js/main.js +8 -8
- local_deep_research/web/static/js/research_form.js +267 -12
- local_deep_research/web/static/js/services/api.js +18 -18
- local_deep_research/web/static/js/services/keyboard.js +8 -8
- local_deep_research/web/static/js/services/socket.js +53 -35
- local_deep_research/web/static/js/services/ui.js +1 -1
- local_deep_research/web/templates/base.html +4 -1
- local_deep_research/web/templates/components/custom_dropdown.html +5 -3
- local_deep_research/web/templates/components/mobile_nav.html +3 -3
- local_deep_research/web/templates/components/sidebar.html +9 -3
- local_deep_research/web/templates/pages/benchmark.html +2697 -0
- local_deep_research/web/templates/pages/benchmark_results.html +1274 -0
- local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +212 -39
- local_deep_research/web/templates/pages/research.html +8 -6
- local_deep_research/web/templates/pages/star_reviews.html +1 -1
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
- local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
- local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
- local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
- local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
- local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
- local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
- local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
- local_deep_research/web_search_engines/retriever_registry.py +108 -0
- local_deep_research/web_search_engines/search_engine_base.py +161 -43
- local_deep_research/web_search_engines/search_engine_factory.py +14 -0
- local_deep_research/web_search_engines/search_engines_config.py +20 -0
- local_deep_research-0.6.1.dist-info/METADATA +374 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/RECORD +89 -64
- local_deep_research-0.5.9.dist-info/METADATA +0 -420
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/WHEEL +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,2697 @@
|
|
1
|
+
{% extends "base.html" %}
|
2
|
+
{% from "components/custom_dropdown.html" import render_dropdown %}
|
3
|
+
|
4
|
+
{% set active_page = 'benchmark' %}
|
5
|
+
|
6
|
+
{% block title %}Benchmark Configuration - Deep Research System{% endblock %}
|
7
|
+
|
8
|
+
{% block extra_head %}
|
9
|
+
<meta name="csrf-token" content="{{ csrf_token() }}">
|
10
|
+
<link rel="stylesheet" href="{{ url_for('research.serve_static', path='css/custom_dropdown.css') }}">
|
11
|
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
12
|
+
<style>
|
13
|
+
.benchmark-card {
|
14
|
+
width: 100%;
|
15
|
+
margin: 0;
|
16
|
+
}
|
17
|
+
|
18
|
+
.dataset-config {
|
19
|
+
border: 1px solid var(--border-color);
|
20
|
+
border-radius: 8px;
|
21
|
+
padding: 20px;
|
22
|
+
margin-bottom: 20px;
|
23
|
+
background: var(--card-bg);
|
24
|
+
}
|
25
|
+
|
26
|
+
.dataset-header {
|
27
|
+
display: flex;
|
28
|
+
justify-content: space-between;
|
29
|
+
align-items: center;
|
30
|
+
margin-bottom: 15px;
|
31
|
+
}
|
32
|
+
|
33
|
+
.dataset-toggle {
|
34
|
+
display: flex;
|
35
|
+
align-items: center;
|
36
|
+
gap: 10px;
|
37
|
+
}
|
38
|
+
|
39
|
+
.benchmark-progress {
|
40
|
+
margin-top: 20px;
|
41
|
+
padding: 20px;
|
42
|
+
background: var(--card-bg);
|
43
|
+
border-radius: 8px;
|
44
|
+
border: 1px solid var(--border-color);
|
45
|
+
display: none;
|
46
|
+
}
|
47
|
+
|
48
|
+
.progress-header {
|
49
|
+
display: flex;
|
50
|
+
justify-content: space-between;
|
51
|
+
align-items: center;
|
52
|
+
margin-bottom: 15px;
|
53
|
+
}
|
54
|
+
|
55
|
+
.progress-bar {
|
56
|
+
width: 100%;
|
57
|
+
height: 20px;
|
58
|
+
background: var(--bg-color);
|
59
|
+
border-radius: 10px;
|
60
|
+
overflow: hidden;
|
61
|
+
margin-bottom: 15px;
|
62
|
+
}
|
63
|
+
|
64
|
+
.progress-fill {
|
65
|
+
height: 100%;
|
66
|
+
background: linear-gradient(90deg, var(--primary-color), var(--accent-color));
|
67
|
+
width: 0%;
|
68
|
+
transition: width 0.3s ease;
|
69
|
+
}
|
70
|
+
|
71
|
+
.metrics-grid {
|
72
|
+
display: grid;
|
73
|
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
74
|
+
gap: 15px;
|
75
|
+
margin-top: 15px;
|
76
|
+
}
|
77
|
+
|
78
|
+
.metric-card {
|
79
|
+
padding: 15px;
|
80
|
+
background: var(--bg-color);
|
81
|
+
border-radius: 6px;
|
82
|
+
text-align: center;
|
83
|
+
}
|
84
|
+
|
85
|
+
.metric-value {
|
86
|
+
font-size: 1.5rem;
|
87
|
+
font-weight: bold;
|
88
|
+
color: var(--primary-color);
|
89
|
+
}
|
90
|
+
|
91
|
+
.metric-label {
|
92
|
+
font-size: 0.9rem;
|
93
|
+
color: var(--text-muted);
|
94
|
+
margin-top: 5px;
|
95
|
+
}
|
96
|
+
|
97
|
+
.dataset-accuracy {
|
98
|
+
display: flex;
|
99
|
+
justify-content: space-between;
|
100
|
+
margin-top: 10px;
|
101
|
+
padding: 10px;
|
102
|
+
background: var(--bg-color);
|
103
|
+
border-radius: 6px;
|
104
|
+
}
|
105
|
+
|
106
|
+
.alert {
|
107
|
+
padding: 15px;
|
108
|
+
border-radius: 6px;
|
109
|
+
margin-bottom: 15px;
|
110
|
+
}
|
111
|
+
|
112
|
+
.alert-warning {
|
113
|
+
background-color: #fff3cd;
|
114
|
+
border: 1px solid #ffeaa7;
|
115
|
+
color: #856404;
|
116
|
+
}
|
117
|
+
|
118
|
+
.alert i {
|
119
|
+
margin-right: 8px;
|
120
|
+
}
|
121
|
+
|
122
|
+
/* Question and Results Display Styles */
|
123
|
+
.benchmark-section {
|
124
|
+
margin-top: 20px;
|
125
|
+
}
|
126
|
+
|
127
|
+
.question-card {
|
128
|
+
background: #1a1a1a;
|
129
|
+
border: 1px solid #333;
|
130
|
+
border-radius: 8px;
|
131
|
+
padding: 15px;
|
132
|
+
margin-bottom: 10px;
|
133
|
+
}
|
134
|
+
|
135
|
+
.question-content {
|
136
|
+
margin-bottom: 10px;
|
137
|
+
}
|
138
|
+
|
139
|
+
.question-text {
|
140
|
+
font-size: 1rem;
|
141
|
+
line-height: 1.4;
|
142
|
+
color: #e0e0e0;
|
143
|
+
margin-bottom: 8px;
|
144
|
+
padding: 10px;
|
145
|
+
background: #2a2a2a;
|
146
|
+
border-radius: 4px;
|
147
|
+
border-left: 4px solid var(--primary-color);
|
148
|
+
}
|
149
|
+
|
150
|
+
.question-meta {
|
151
|
+
display: flex;
|
152
|
+
gap: 10px;
|
153
|
+
font-size: 0.85rem;
|
154
|
+
color: var(--text-muted);
|
155
|
+
}
|
156
|
+
|
157
|
+
.dataset-badge {
|
158
|
+
background: var(--primary-color);
|
159
|
+
color: white;
|
160
|
+
padding: 2px 8px;
|
161
|
+
border-radius: 12px;
|
162
|
+
font-size: 0.8rem;
|
163
|
+
font-weight: 500;
|
164
|
+
}
|
165
|
+
|
166
|
+
.search-count-badge {
|
167
|
+
color: white;
|
168
|
+
padding: 2px 6px;
|
169
|
+
border-radius: 10px;
|
170
|
+
font-size: 0.75rem;
|
171
|
+
font-weight: 500;
|
172
|
+
margin-left: 8px;
|
173
|
+
}
|
174
|
+
|
175
|
+
.search-count-badge.critical {
|
176
|
+
background: #f44336; /* Red for 0-1 results */
|
177
|
+
}
|
178
|
+
|
179
|
+
.search-count-badge.warning {
|
180
|
+
background: #ff9800; /* Orange for 2-4 results */
|
181
|
+
}
|
182
|
+
|
183
|
+
.search-count-badge.good {
|
184
|
+
background: #4caf50; /* Green for 5+ results */
|
185
|
+
}
|
186
|
+
|
187
|
+
.processing-status {
|
188
|
+
padding: 8px 12px;
|
189
|
+
background: var(--bg-secondary);
|
190
|
+
border-radius: 4px;
|
191
|
+
font-size: 0.9rem;
|
192
|
+
color: var(--text-muted);
|
193
|
+
}
|
194
|
+
|
195
|
+
.processing-status.processing {
|
196
|
+
background: #ff9800;
|
197
|
+
color: #ffffff;
|
198
|
+
}
|
199
|
+
|
200
|
+
.processing-status.completed {
|
201
|
+
background: #e8f5e8;
|
202
|
+
color: #2e7d32;
|
203
|
+
}
|
204
|
+
|
205
|
+
.result-card {
|
206
|
+
background: #1a1a1a;
|
207
|
+
border: 1px solid #333;
|
208
|
+
border-radius: 6px;
|
209
|
+
padding: 12px;
|
210
|
+
margin-bottom: 8px;
|
211
|
+
transition: border-color 0.2s;
|
212
|
+
}
|
213
|
+
|
214
|
+
.result-card.correct {
|
215
|
+
border-left: 4px solid #4caf50;
|
216
|
+
}
|
217
|
+
|
218
|
+
.result-card.incorrect {
|
219
|
+
border-left: 4px solid #f44336;
|
220
|
+
}
|
221
|
+
|
222
|
+
.result-header {
|
223
|
+
display: flex;
|
224
|
+
justify-content: space-between;
|
225
|
+
align-items: center;
|
226
|
+
margin-bottom: 8px;
|
227
|
+
font-size: 0.85rem;
|
228
|
+
color: #a0a0a0;
|
229
|
+
}
|
230
|
+
|
231
|
+
.result-status {
|
232
|
+
font-weight: 600;
|
233
|
+
}
|
234
|
+
|
235
|
+
.result-status.correct {
|
236
|
+
color: #4caf50;
|
237
|
+
}
|
238
|
+
|
239
|
+
.result-status.incorrect {
|
240
|
+
color: #f44336;
|
241
|
+
}
|
242
|
+
|
243
|
+
.answer-comparison {
|
244
|
+
display: grid;
|
245
|
+
gap: 8px;
|
246
|
+
}
|
247
|
+
|
248
|
+
.answer-box {
|
249
|
+
padding: 12px;
|
250
|
+
border-radius: 4px;
|
251
|
+
font-size: 0.95rem;
|
252
|
+
line-height: 1.5;
|
253
|
+
white-space: pre-wrap;
|
254
|
+
word-break: break-word;
|
255
|
+
min-height: 60px;
|
256
|
+
color: #e0e0e0 !important;
|
257
|
+
}
|
258
|
+
|
259
|
+
.answer-box > div {
|
260
|
+
margin-top: 5px;
|
261
|
+
color: #e0e0e0 !important;
|
262
|
+
}
|
263
|
+
|
264
|
+
.model-answer {
|
265
|
+
background: #1e2a3a;
|
266
|
+
border-left: 4px solid #2196f3;
|
267
|
+
color: #e0e0e0 !important;
|
268
|
+
}
|
269
|
+
|
270
|
+
.correct-answer {
|
271
|
+
background: #1e3a1e;
|
272
|
+
border-left: 4px solid #4caf50;
|
273
|
+
color: #e0e0e0 !important;
|
274
|
+
}
|
275
|
+
|
276
|
+
.answer-label {
|
277
|
+
font-size: 0.75rem;
|
278
|
+
font-weight: 600;
|
279
|
+
color: #a0a0a0;
|
280
|
+
margin-bottom: 4px;
|
281
|
+
text-transform: uppercase;
|
282
|
+
letter-spacing: 0.5px;
|
283
|
+
}
|
284
|
+
|
285
|
+
.no-results {
|
286
|
+
text-align: center;
|
287
|
+
color: var(--text-muted);
|
288
|
+
padding: 20px;
|
289
|
+
font-style: italic;
|
290
|
+
}
|
291
|
+
|
292
|
+
#recent-results-container {
|
293
|
+
max-height: 600px;
|
294
|
+
overflow-y: auto;
|
295
|
+
}
|
296
|
+
|
297
|
+
/* Improved layout structure */
|
298
|
+
.page-content {
|
299
|
+
width: 100%;
|
300
|
+
max-width: none;
|
301
|
+
}
|
302
|
+
|
303
|
+
.benchmark-progress .card {
|
304
|
+
width: 100%;
|
305
|
+
max-width: none;
|
306
|
+
}
|
307
|
+
|
308
|
+
.form-group {
|
309
|
+
width: 100%;
|
310
|
+
}
|
311
|
+
|
312
|
+
/* Better visual hierarchy */
|
313
|
+
.benchmark-guidelines {
|
314
|
+
background: linear-gradient(135deg, #1e1e1e 0%, #2a2a2a 100%);
|
315
|
+
border: 1px solid #404040;
|
316
|
+
border-left: 4px solid var(--primary-color);
|
317
|
+
border-radius: 8px;
|
318
|
+
margin-bottom: 25px;
|
319
|
+
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
|
320
|
+
}
|
321
|
+
|
322
|
+
.guidelines-content {
|
323
|
+
display: grid;
|
324
|
+
grid-template-columns: 1fr auto;
|
325
|
+
gap: 25px;
|
326
|
+
align-items: start;
|
327
|
+
}
|
328
|
+
|
329
|
+
.guidelines-text {
|
330
|
+
padding: 25px;
|
331
|
+
}
|
332
|
+
|
333
|
+
.guidelines-sidebar {
|
334
|
+
min-width: 200px;
|
335
|
+
background: rgba(var(--primary-color-rgb), 0.1);
|
336
|
+
padding: 20px;
|
337
|
+
border-radius: 0 8px 8px 0;
|
338
|
+
text-align: center;
|
339
|
+
border-left: 1px solid rgba(var(--primary-color-rgb), 0.2);
|
340
|
+
}
|
341
|
+
|
342
|
+
/* Enhanced form sections */
|
343
|
+
.form-section {
|
344
|
+
background: #1a1a1a;
|
345
|
+
border: 1px solid #333;
|
346
|
+
border-radius: 8px;
|
347
|
+
margin-bottom: 20px;
|
348
|
+
overflow: hidden;
|
349
|
+
}
|
350
|
+
|
351
|
+
.form-section-header {
|
352
|
+
background: linear-gradient(90deg, #2a2a2a 0%, #333 100%);
|
353
|
+
padding: 15px 20px;
|
354
|
+
border-bottom: 1px solid #404040;
|
355
|
+
}
|
356
|
+
|
357
|
+
.form-section-title {
|
358
|
+
color: var(--primary-color);
|
359
|
+
font-size: 1.1rem;
|
360
|
+
font-weight: 600;
|
361
|
+
margin: 0;
|
362
|
+
display: flex;
|
363
|
+
align-items: center;
|
364
|
+
gap: 8px;
|
365
|
+
}
|
366
|
+
|
367
|
+
.form-section-content {
|
368
|
+
padding: 20px;
|
369
|
+
}
|
370
|
+
|
371
|
+
/* Improved dataset configuration cards */
|
372
|
+
.dataset-grid {
|
373
|
+
display: grid;
|
374
|
+
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
|
375
|
+
gap: 20px;
|
376
|
+
}
|
377
|
+
|
378
|
+
.dataset-card {
|
379
|
+
background: #1e1e1e;
|
380
|
+
border: 2px solid #333;
|
381
|
+
border-radius: 10px;
|
382
|
+
padding: 20px;
|
383
|
+
transition: all 0.3s ease;
|
384
|
+
position: relative;
|
385
|
+
}
|
386
|
+
|
387
|
+
.dataset-card:hover {
|
388
|
+
border-color: var(--primary-color);
|
389
|
+
box-shadow: 0 4px 12px rgba(var(--primary-color-rgb), 0.2);
|
390
|
+
}
|
391
|
+
|
392
|
+
.dataset-card.disabled {
|
393
|
+
opacity: 0.6;
|
394
|
+
border-color: #555;
|
395
|
+
}
|
396
|
+
|
397
|
+
.dataset-header {
|
398
|
+
display: flex;
|
399
|
+
justify-content: space-between;
|
400
|
+
align-items: flex-start;
|
401
|
+
margin-bottom: 15px;
|
402
|
+
}
|
403
|
+
|
404
|
+
.dataset-info h3 {
|
405
|
+
color: #e0e0e0;
|
406
|
+
margin: 0 0 5px 0;
|
407
|
+
font-size: 1.2rem;
|
408
|
+
}
|
409
|
+
|
410
|
+
.dataset-info p {
|
411
|
+
color: #a0a0a0;
|
412
|
+
margin: 0;
|
413
|
+
font-size: 0.9rem;
|
414
|
+
}
|
415
|
+
|
416
|
+
.dataset-toggle {
|
417
|
+
display: flex;
|
418
|
+
align-items: center;
|
419
|
+
gap: 8px;
|
420
|
+
}
|
421
|
+
|
422
|
+
/* Modern toggle switch */
|
423
|
+
.toggle-switch {
|
424
|
+
position: relative;
|
425
|
+
width: 50px;
|
426
|
+
height: 24px;
|
427
|
+
background: #555;
|
428
|
+
border-radius: 12px;
|
429
|
+
cursor: pointer;
|
430
|
+
transition: background 0.3s;
|
431
|
+
}
|
432
|
+
|
433
|
+
.toggle-switch.active {
|
434
|
+
background: var(--primary-color);
|
435
|
+
}
|
436
|
+
|
437
|
+
.toggle-switch::after {
|
438
|
+
content: '';
|
439
|
+
position: absolute;
|
440
|
+
top: 2px;
|
441
|
+
left: 2px;
|
442
|
+
width: 20px;
|
443
|
+
height: 20px;
|
444
|
+
background: white;
|
445
|
+
border-radius: 50%;
|
446
|
+
transition: transform 0.3s;
|
447
|
+
}
|
448
|
+
|
449
|
+
.toggle-switch.active::after {
|
450
|
+
transform: translateX(26px);
|
451
|
+
}
|
452
|
+
|
453
|
+
/* Enhanced input styling */
|
454
|
+
.form-control {
|
455
|
+
background: #2a2a2a;
|
456
|
+
border: 2px solid #404040;
|
457
|
+
border-radius: 6px;
|
458
|
+
padding: 10px 12px;
|
459
|
+
color: #e0e0e0;
|
460
|
+
font-size: 0.95rem;
|
461
|
+
transition: border-color 0.3s, box-shadow 0.3s;
|
462
|
+
}
|
463
|
+
|
464
|
+
.form-control:focus {
|
465
|
+
border-color: var(--primary-color);
|
466
|
+
box-shadow: 0 0 0 3px rgba(var(--primary-color-rgb), 0.2);
|
467
|
+
outline: none;
|
468
|
+
}
|
469
|
+
|
470
|
+
/* Responsive improvements */
|
471
|
+
@media (max-width: 1200px) {
|
472
|
+
.guidelines-content {
|
473
|
+
grid-template-columns: 1fr;
|
474
|
+
}
|
475
|
+
|
476
|
+
.guidelines-sidebar {
|
477
|
+
border-radius: 0 0 8px 8px;
|
478
|
+
border-left: none;
|
479
|
+
border-top: 1px solid rgba(var(--primary-color-rgb), 0.2);
|
480
|
+
}
|
481
|
+
}
|
482
|
+
|
483
|
+
@media (max-width: 768px) {
|
484
|
+
.dataset-grid {
|
485
|
+
grid-template-columns: 1fr;
|
486
|
+
}
|
487
|
+
|
488
|
+
.guidelines-text {
|
489
|
+
padding: 20px;
|
490
|
+
}
|
491
|
+
|
492
|
+
.form-section-content {
|
493
|
+
padding: 15px;
|
494
|
+
}
|
495
|
+
}
|
496
|
+
|
497
|
+
|
498
|
+
/* Performance Charts Styles */
|
499
|
+
.charts-section {
|
500
|
+
margin-top: 20px;
|
501
|
+
}
|
502
|
+
|
503
|
+
.charts-grid {
|
504
|
+
display: grid;
|
505
|
+
grid-template-columns: 1fr 1fr;
|
506
|
+
gap: 20px;
|
507
|
+
margin-top: 15px;
|
508
|
+
}
|
509
|
+
|
510
|
+
.chart-container {
|
511
|
+
background: #1a1a1a;
|
512
|
+
border: 1px solid #333;
|
513
|
+
border-radius: 8px;
|
514
|
+
padding: 15px;
|
515
|
+
height: 300px;
|
516
|
+
}
|
517
|
+
|
518
|
+
.chart-title {
|
519
|
+
color: #e0e0e0;
|
520
|
+
font-size: 1rem;
|
521
|
+
font-weight: 600;
|
522
|
+
margin-bottom: 10px;
|
523
|
+
text-align: center;
|
524
|
+
}
|
525
|
+
|
526
|
+
.chart-canvas {
|
527
|
+
width: 100% !important;
|
528
|
+
height: 250px !important;
|
529
|
+
}
|
530
|
+
|
531
|
+
@media (max-width: 768px) {
|
532
|
+
.charts-grid {
|
533
|
+
grid-template-columns: 1fr;
|
534
|
+
}
|
535
|
+
}
|
536
|
+
|
537
|
+
/* Evaluation Settings Styles */
|
538
|
+
.form-row {
|
539
|
+
display: grid;
|
540
|
+
grid-template-columns: 1fr 1fr;
|
541
|
+
gap: 20px;
|
542
|
+
margin-bottom: 15px;
|
543
|
+
}
|
544
|
+
|
545
|
+
.form-group.half {
|
546
|
+
margin-bottom: 0;
|
547
|
+
}
|
548
|
+
|
549
|
+
@media (max-width: 768px) {
|
550
|
+
.form-row {
|
551
|
+
grid-template-columns: 1fr;
|
552
|
+
gap: 15px;
|
553
|
+
}
|
554
|
+
}
|
555
|
+
</style>
|
556
|
+
{% endblock %}
|
557
|
+
|
558
|
+
{% block content %}
|
559
|
+
<div class="page active" id="benchmark">
|
560
|
+
<div class="page-header">
|
561
|
+
<h1>Benchmark Configuration</h1>
|
562
|
+
<p class="page-subtitle">Test and optimize your search configurations</p>
|
563
|
+
<div style="margin-top: 10px;">
|
564
|
+
<a href="{{ url_for('benchmark.results') }}" class="btn btn-secondary">
|
565
|
+
<i class="fas fa-chart-line"></i> View Past Results
|
566
|
+
</a>
|
567
|
+
</div>
|
568
|
+
</div>
|
569
|
+
|
570
|
+
<!-- Benchmark Usage Guidelines -->
|
571
|
+
<div class="benchmark-guidelines">
|
572
|
+
<div class="guidelines-content">
|
573
|
+
<div class="guidelines-text">
|
574
|
+
<h3 style="color: var(--primary-color); margin-bottom: 15px; font-size: 1.3rem;">
|
575
|
+
<i class="fas fa-info-circle"></i> Benchmark Guidelines
|
576
|
+
</h3>
|
577
|
+
<p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
|
578
|
+
<strong>Purpose:</strong> Benchmarks are designed to help you evaluate if your configuration works well, not for research papers or production use.
|
579
|
+
</p>
|
580
|
+
<p style="margin-bottom: 12px; line-height: 1.6; color: #e0e0e0;">
|
581
|
+
<strong>Responsible Usage:</strong> Please use reasonable example counts to avoid overwhelming search engines. The default of 50 examples provides a good balance for configuration testing.
|
582
|
+
</p>
|
583
|
+
<p style="margin-bottom: 18px; line-height: 1.6; color: #e0e0e0;">
|
584
|
+
<strong>Requirements:</strong> Benchmarks require an evaluation model for grading results. You can configure your preferred provider and model in the Evaluation Settings below. The default uses OpenRouter with Claude 3.7 Sonnet, but you can choose from various providers including OpenAI, Anthropic, or local models.
|
585
|
+
</p>
|
586
|
+
<div style="background: rgba(255, 167, 38, 0.1); border: 1px solid rgba(255, 167, 38, 0.3); padding: 15px; border-radius: 8px; margin-top: 15px;">
|
587
|
+
<h4 style="color: #ffa726; margin-bottom: 10px; font-size: 1rem; display: flex; align-items: center; gap: 8px;">
|
588
|
+
<i class="fas fa-search"></i> Search Engine Recommendations
|
589
|
+
</h4>
|
590
|
+
<ul style="margin: 0; padding-left: 20px; font-size: 0.95rem; line-height: 1.5; color: #e0e0e0;">
|
591
|
+
<li style="margin-bottom: 8px;"><strong style="color: #4caf50;">Tavily:</strong> Recommended for general knowledge benchmarks - AI-optimized search API, reliable results</li>
|
592
|
+
<li style="margin-bottom: 8px;"><strong style="color: #2196f3;">Brave:</strong> Independent search engine with lower benchmark performance; the cause is unknown - possibly a smaller index, a different ranking algorithm, or API limitations</li>
|
593
|
+
<li style="margin-bottom: 8px;"><strong style="color: #ff9800;">SearXNG:</strong> Often outperforms commercial APIs by aggregating multiple sources - shared resource, use moderate example counts</li>
|
594
|
+
<li style="margin-bottom: 8px;"><strong style="color: #f44336;">Specialized engines (ArXiv, PubMed, Wikipedia):</strong> Shared resources that are useless for general SimpleQA questions - should not be used for this test</li>
|
595
|
+
</ul>
|
596
|
+
<div style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); padding: 12px; border-radius: 6px; margin-top: 12px;">
|
597
|
+
<p style="margin: 0; font-size: 0.9rem; color: #e0e0e0;">
|
598
|
+
<strong style="color: #2196f3;">🔧 For Shared Resources:</strong> When using SearXNG or other shared engines, reduce iterations and questions per iteration in Settings to minimize load on shared infrastructure.
|
599
|
+
</p>
|
600
|
+
</div>
|
601
|
+
</div>
|
602
|
+
</div>
|
603
|
+
<div class="guidelines-sidebar">
|
604
|
+
<div style="font-size: 2.5rem; color: var(--primary-color); margin-bottom: 12px;">
|
605
|
+
<i class="fas fa-tachometer-alt"></i>
|
606
|
+
</div>
|
607
|
+
<div style="font-size: 1.2rem; font-weight: 600; color: #e0e0e0; margin-bottom: 8px;">
|
608
|
+
Quick Check
|
609
|
+
</div>
|
610
|
+
<div style="font-size: 0.9rem; color: #a0a0a0; line-height: 1.4; margin-bottom: 15px;">
|
611
|
+
Test your config with reasonable limits
|
612
|
+
</div>
|
613
|
+
<div style="background: rgba(var(--primary-color-rgb), 0.2); padding: 8px 12px; border-radius: 6px; font-size: 0.85rem; color: var(--primary-color); font-weight: 500;">
|
614
|
+
🎯 Configuration Testing
|
615
|
+
</div>
|
616
|
+
</div>
|
617
|
+
</div>
|
618
|
+
</div>
|
619
|
+
|
620
|
+
<!-- Alert container -->
|
621
|
+
<div id="benchmark-alert" class="settings-alert-container" style="display:none"></div>
|
622
|
+
|
623
|
+
<div class="card benchmark-card">
|
624
|
+
<div class="card-content">
|
625
|
+
<form id="benchmark-form">
|
626
|
+
|
627
|
+
<!-- Benchmark Name -->
|
628
|
+
<div class="form-group">
|
629
|
+
<label for="run_name">Benchmark Name (Optional)</label>
|
630
|
+
<input type="text" id="run_name" name="run_name" class="form-control" placeholder="e.g., 'Test new search strategy'">
|
631
|
+
<span class="input-help">Give your benchmark run a descriptive name</span>
|
632
|
+
</div>
|
633
|
+
|
634
|
+
<!-- Dataset Configuration -->
|
635
|
+
<div class="form-group">
|
636
|
+
<fieldset>
|
637
|
+
<legend>Dataset Selection</legend>
|
638
|
+
|
639
|
+
<!-- SimpleQA Dataset -->
|
640
|
+
<div class="dataset-config">
|
641
|
+
<div class="dataset-header">
|
642
|
+
<div>
|
643
|
+
<h3>SimpleQA</h3>
|
644
|
+
<p>Fact-based questions with clear answers</p>
|
645
|
+
</div>
|
646
|
+
<div class="dataset-toggle">
|
647
|
+
<input type="checkbox" id="simpleqa_enabled" checked>
|
648
|
+
<label for="simpleqa_enabled">Enable</label>
|
649
|
+
</div>
|
650
|
+
</div>
|
651
|
+
<div class="form-group">
|
652
|
+
<label for="simpleqa_count">Number of Examples</label>
|
653
|
+
<input type="number" id="simpleqa_count" name="simpleqa_count" value="50" min="1" max="500" class="form-control">
|
654
|
+
<span class="input-help">Recommended: 50 examples provides good balance for configuration testing</span>
|
655
|
+
</div>
|
656
|
+
</div>
|
657
|
+
|
658
|
+
<!-- BrowseComp Dataset -->
|
659
|
+
<div class="dataset-config" style="border: 2px solid #f44336; background: #2a1e1e;">
|
660
|
+
<div class="dataset-header">
|
661
|
+
<div>
|
662
|
+
<h3 style="color: #f44336;">BrowseComp</h3>
|
663
|
+
<p style="color: #ccc;">Complex browsing and comparison tasks</p>
|
664
|
+
<div style="background: #3a1e1e; border: 1px solid #f44336; color: #f44336; padding: 10px 12px; border-radius: 4px; margin-top: 10px; font-size: 0.85rem; line-height: 1.4;">
|
665
|
+
<i class="fas fa-exclamation-triangle"></i> <strong>Poor Performance Warning:</strong> We currently achieve close to 0% accuracy on BrowseComp.
|
666
|
+
<br><strong>For testing only:</strong> Limited to 20 examples max to see what this benchmark is about.
|
667
|
+
</div>
|
668
|
+
</div>
|
669
|
+
<div class="dataset-toggle">
|
670
|
+
<input type="checkbox" id="browsecomp_enabled">
|
671
|
+
<label for="browsecomp_enabled">Enable (Testing Only)</label>
|
672
|
+
</div>
|
673
|
+
</div>
|
674
|
+
<div class="form-group">
|
675
|
+
<label for="browsecomp_count">Number of Examples (Max 20)</label>
|
676
|
+
<input type="number" id="browsecomp_count" name="browsecomp_count" value="0" min="0" max="20" class="form-control" disabled>
|
677
|
+
<span class="input-help" style="color: #f44336;">Restricted to max 20 examples due to poor performance - for curiosity testing only</span>
|
678
|
+
</div>
|
679
|
+
</div>
|
680
|
+
</fieldset>
|
681
|
+
</div>
|
682
|
+
|
683
|
+
<!-- Current Database Settings -->
|
684
|
+
<div class="form-group">
|
685
|
+
<fieldset>
|
686
|
+
<legend>Current Configuration</legend>
|
687
|
+
<div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
|
688
|
+
<div class="dataset-header">
|
689
|
+
<div>
|
690
|
+
<h3>Active Database Settings</h3>
|
691
|
+
<p>Benchmark will use all settings from your database configuration</p>
|
692
|
+
</div>
|
693
|
+
</div>
|
694
|
+
|
695
|
+
<div id="current-settings-display" style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px; margin-bottom: 15px;">
|
696
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
697
|
+
<div class="metric-label">Provider</div>
|
698
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-provider">Loading...</div>
|
699
|
+
</div>
|
700
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
701
|
+
<div class="metric-label">Model</div>
|
702
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-model">Loading...</div>
|
703
|
+
</div>
|
704
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
705
|
+
<div class="metric-label">Search Tool</div>
|
706
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-search-tool">Loading...</div>
|
707
|
+
</div>
|
708
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
709
|
+
<div class="metric-label">Iterations</div>
|
710
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-iterations">Loading...</div>
|
711
|
+
</div>
|
712
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
713
|
+
<div class="metric-label">Questions/Iter</div>
|
714
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-questions">Loading...</div>
|
715
|
+
</div>
|
716
|
+
<div class="metric-card" style="text-align: left; padding: 10px;">
|
717
|
+
<div class="metric-label">Strategy</div>
|
718
|
+
<div class="metric-value" style="font-size: 0.9rem;" id="current-strategy">Loading...</div>
|
719
|
+
</div>
|
720
|
+
</div>
|
721
|
+
|
722
|
+
<div style="font-size: 0.9rem; color: var(--text-muted); text-align: center;">
|
723
|
+
<i class="fas fa-info-circle"></i> To change any settings, go to <a href="/research/" target="_blank" style="color: var(--primary-color);">Settings Dashboard</a>
|
724
|
+
</div>
|
725
|
+
</div>
|
726
|
+
</fieldset>
|
727
|
+
</div>
|
728
|
+
|
729
|
+
<!-- Evaluation Model Settings -->
|
730
|
+
<div class="form-group">
|
731
|
+
<fieldset>
|
732
|
+
<legend>Evaluation Model Settings</legend>
|
733
|
+
<div class="dataset-config" style="background: var(--bg-color); border: 1px solid var(--border-color);">
|
734
|
+
<div class="dataset-header">
|
735
|
+
<div>
|
736
|
+
<h3>Benchmark Evaluation Configuration</h3>
|
737
|
+
<p>Configure the model used to grade benchmark results</p>
|
738
|
+
</div>
|
739
|
+
</div>
|
740
|
+
<div class="form-row">
|
741
|
+
<!-- Evaluation Provider Selection -->
|
742
|
+
<div class="form-group half">
|
743
|
+
<label for="evaluation_provider">Evaluation Provider</label>
|
744
|
+
<select id="evaluation_provider" name="evaluation_provider" class="form-control" data-initial-value="{{ eval_settings.evaluation_provider }}">
|
745
|
+
<option value="">Loading providers...</option>
|
746
|
+
</select>
|
747
|
+
<span class="input-help">Provider for the evaluation model</span>
|
748
|
+
</div>
|
749
|
+
|
750
|
+
<!-- Evaluation Model Selection -->
|
751
|
+
<div class="form-group half">
|
752
|
+
{{ render_dropdown(
|
753
|
+
input_id="evaluation_model",
|
754
|
+
dropdown_id="evaluation-model-dropdown",
|
755
|
+
placeholder="Enter or select evaluation model",
|
756
|
+
label="Evaluation Model",
|
757
|
+
help_text="Model to grade benchmark results",
|
758
|
+
allow_custom=true,
|
759
|
+
show_refresh=true,
|
760
|
+
refresh_aria_label="Refresh evaluation model list",
|
761
|
+
data_initial_value=eval_settings.evaluation_model
|
762
|
+
) }}
|
763
|
+
</div>
|
764
|
+
</div>
|
765
|
+
|
766
|
+
<div class="form-row">
|
767
|
+
<!-- Evaluation Endpoint URL -->
|
768
|
+
<div class="form-group half">
|
769
|
+
<label for="evaluation_endpoint_url">Endpoint URL</label>
|
770
|
+
<input type="text" id="evaluation_endpoint_url" name="evaluation_endpoint_url" class="form-control" placeholder="https://openrouter.ai/api/v1" value="{{ eval_settings.evaluation_endpoint_url }}">
|
771
|
+
<span class="input-help">API endpoint for evaluation model</span>
|
772
|
+
</div>
|
773
|
+
|
774
|
+
<!-- Evaluation Temperature -->
|
775
|
+
<div class="form-group half">
|
776
|
+
<label for="evaluation_temperature">Temperature</label>
|
777
|
+
<input type="range" id="evaluation_temperature" name="evaluation_temperature" class="form-control" min="0" max="1" step="0.1" value="{{ eval_settings.evaluation_temperature }}">
|
778
|
+
<span class="input-help">0 recommended for consistent evaluation</span>
|
779
|
+
</div>
|
780
|
+
</div>
|
781
|
+
|
782
|
+
<div class="alert" style="background: rgba(33, 150, 243, 0.1); border: 1px solid rgba(33, 150, 243, 0.3); color: #ffffff; padding: 15px; border-radius: 6px; margin-top: 15px;">
|
783
|
+
<i class="fas fa-info-circle" style="color: #2196f3; margin-right: 8px;"></i>
|
784
|
+
<strong style="color: #2196f3;">Evaluation Model Selection:</strong>
|
785
|
+
For accurate benchmark grading, use flagship models from major providers like Claude Sonnet series or GPT-4 class models.
|
786
|
+
Local models and smaller cloud models may produce inconsistent evaluations, affecting benchmark accuracy scores.
|
787
|
+
However, preliminary tests indicate that local models may be adequate for performance evaluation if the highest grading standards are not required.
|
788
|
+
</div>
|
789
|
+
</div>
|
790
|
+
</fieldset>
|
791
|
+
</div>
|
792
|
+
|
793
|
+
<!-- Search Engine Warning -->
|
794
|
+
<div class="form-group" id="search-engine-warning" style="display: none;">
|
795
|
+
<div class="alert" style="background: #2d1b1b; border: 1px solid #f44336; color: #ffffff; padding: 15px; border-radius: 6px;">
|
796
|
+
<i class="fas fa-exclamation-triangle" style="color: #f44336; margin-right: 8px;"></i>
|
797
|
+
<strong style="color: #f44336;">Search Engine Notice:</strong>
|
798
|
+
<span id="search-warning-text" style="color: #ffffff;"></span>
|
799
|
+
</div>
|
800
|
+
</div>
|
801
|
+
|
802
|
+
<!-- Configuration Summary -->
|
803
|
+
<div class="form-group">
|
804
|
+
<div id="config-summary" class="metric-card">
|
805
|
+
<div class="metric-value" id="total-examples">50</div>
|
806
|
+
<div class="metric-label">Total Examples</div>
|
807
|
+
<div style="margin-top: 10px; font-size: 0.9rem; color: var(--text-muted);">
|
808
|
+
Estimated time: <span id="estimated-time">40-60 minutes</span>
|
809
|
+
</div>
|
810
|
+
</div>
|
811
|
+
</div>
|
812
|
+
|
813
|
+
|
814
|
+
<!-- Action Buttons -->
|
815
|
+
<div class="form-actions">
|
816
|
+
<button type="button" id="validate-config-btn" class="btn btn-secondary">
|
817
|
+
<i class="fas fa-check-circle"></i> Validate Configuration
|
818
|
+
</button>
|
819
|
+
<button type="submit" id="start-benchmark-btn" class="btn btn-primary">
|
820
|
+
<i class="fas fa-play"></i> Start Benchmark
|
821
|
+
</button>
|
822
|
+
</div>
|
823
|
+
</form>
|
824
|
+
</div>
|
825
|
+
</div>
|
826
|
+
|
827
|
+
<!-- Progress Panel - Reusing research progress component -->
|
828
|
+
<div id="benchmark-progress" class="benchmark-progress">
|
829
|
+
<div class="card benchmark-card">
|
830
|
+
<div class="card-content">
|
831
|
+
<div class="progress-info">
|
832
|
+
<div class="current-query-container">
|
833
|
+
<div class="current-query-label">Current Benchmark:</div>
|
834
|
+
<div id="current-benchmark" class="current-query"></div>
|
835
|
+
</div>
|
836
|
+
<div class="progress-container">
|
837
|
+
<div class="progress-bar">
|
838
|
+
<div id="progress-bar" class="progress-fill"></div>
|
839
|
+
</div>
|
840
|
+
<div id="progress-percentage" class="progress-percentage">0%</div>
|
841
|
+
</div>
|
842
|
+
<div class="status-container">
|
843
|
+
<div class="status-label">Status:</div>
|
844
|
+
<div id="status-text" class="status-indicator">Initializing</div>
|
845
|
+
</div>
|
846
|
+
<div class="task-container">
|
847
|
+
<div class="task-label">Current Task:</div>
|
848
|
+
<div id="current-task" class="task-text">Starting benchmark...</div>
|
849
|
+
</div>
|
850
|
+
|
851
|
+
<!-- Benchmark-specific metrics -->
|
852
|
+
<div class="metrics-grid">
|
853
|
+
<div class="metric-card">
|
854
|
+
<div class="metric-value" id="overall-accuracy">--%</div>
|
855
|
+
<div class="metric-label">Overall Accuracy</div>
|
856
|
+
<div class="metric-subtitle" id="accuracy-confidence" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
|
857
|
+
</div>
|
858
|
+
<div class="metric-card">
|
859
|
+
<div class="metric-value" id="estimated-time">--</div>
|
860
|
+
<div class="metric-label">Est. Time Left</div>
|
861
|
+
<div class="metric-subtitle" id="elapsed-time" style="font-size: 0.8rem; color: var(--text-muted); margin-top: 2px;">--</div>
|
862
|
+
</div>
|
863
|
+
<div class="metric-card">
|
864
|
+
<div class="metric-value" id="completed-count">0</div>
|
865
|
+
<div class="metric-label">Completed</div>
|
866
|
+
</div>
|
867
|
+
<div class="metric-card">
|
868
|
+
<div class="metric-value" id="processing-rate">--</div>
|
869
|
+
<div class="metric-label">Avg Time/Example</div>
|
870
|
+
</div>
|
871
|
+
</div>
|
872
|
+
|
873
|
+
<!-- SearXNG Rate Limiting Warning in Progress -->
|
874
|
+
<div id="rate-limit-warning" class="alert alert-warning" style="margin-top: 15px; margin-bottom: 15px; display: none;">
|
875
|
+
<i class="fas fa-exclamation-triangle"></i>
|
876
|
+
<strong>Rate Limiting Detected!</strong> SearXNG is blocking requests due to too many parallel searches.
|
877
|
+
<br><small><strong>Quick fix:</strong> <code>docker restart searxng</code> or wait 5-10 minutes for limits to reset.</small>
|
878
|
+
<br><small><strong>Prevention:</strong> Reduce iterations/questions per iteration in Settings.</small>
|
879
|
+
</div>
|
880
|
+
|
881
|
+
<div id="dataset-accuracies">
|
882
|
+
<div class="dataset-accuracy">
|
883
|
+
<span>SimpleQA: <strong id="simpleqa-accuracy">--%</strong></span>
|
884
|
+
<span>BrowseComp: <strong id="browsecomp-accuracy">--%</strong></span>
|
885
|
+
</div>
|
886
|
+
</div>
|
887
|
+
|
888
|
+
<!-- Benchmark Control Actions -->
|
889
|
+
<div class="progress-actions" style="margin: 20px 0; text-align: center;">
|
890
|
+
<button id="cancel-benchmark-btn" class="btn btn-outline terminate-btn">
|
891
|
+
<i class="fas fa-stop-circle"></i> Cancel Benchmark
|
892
|
+
</button>
|
893
|
+
<button id="view-results-btn" class="btn btn-primary" style="display: none;">
|
894
|
+
<i class="fas fa-eye"></i> View Results
|
895
|
+
</button>
|
896
|
+
</div>
|
897
|
+
|
898
|
+
<!-- Current Question Display -->
|
899
|
+
<div id="current-question-section" class="benchmark-section" style="margin-top: 20px;">
|
900
|
+
<h4 style="color: var(--primary-color); margin-bottom: 15px;">
|
901
|
+
<i class="fas fa-question-circle"></i> Current Question
|
902
|
+
</h4>
|
903
|
+
<div id="current-question-card" class="question-card">
|
904
|
+
<div class="question-content">
|
905
|
+
<div class="question-text" id="current-question-text">No question being processed...</div>
|
906
|
+
<div class="question-meta">
|
907
|
+
<span class="dataset-badge" id="current-dataset">--</span>
|
908
|
+
<span class="example-id" id="current-example-id">--</span>
|
909
|
+
</div>
|
910
|
+
</div>
|
911
|
+
<div class="processing-status" id="current-processing-status">
|
912
|
+
<i class="fas fa-clock"></i> Waiting for benchmark to start...
|
913
|
+
</div>
|
914
|
+
</div>
|
915
|
+
</div>
|
916
|
+
|
917
|
+
<!-- Performance Charts -->
|
918
|
+
<div id="performance-charts-section" class="charts-section" style="display: none;">
|
919
|
+
<h4 style="color: var(--primary-color); margin-bottom: 15px;">
|
920
|
+
<i class="fas fa-chart-line"></i> Performance Analysis
|
921
|
+
</h4>
|
922
|
+
<div class="charts-grid">
|
923
|
+
<div class="chart-container">
|
924
|
+
<div class="chart-title">Accuracy Trend</div>
|
925
|
+
<canvas id="accuracy-chart" class="chart-canvas"></canvas>
|
926
|
+
</div>
|
927
|
+
<div class="chart-container">
|
928
|
+
<div class="chart-title">Processing Time per Example</div>
|
929
|
+
<canvas id="timing-chart" class="chart-canvas"></canvas>
|
930
|
+
</div>
|
931
|
+
</div>
|
932
|
+
<div class="charts-grid" style="margin-top: 20px;">
|
933
|
+
<div class="chart-container">
|
934
|
+
<div class="chart-title">Search Results Count</div>
|
935
|
+
<canvas id="search-results-chart" class="chart-canvas"></canvas>
|
936
|
+
</div>
|
937
|
+
<div class="chart-container">
|
938
|
+
<div class="chart-title">Search Quality Alert</div>
|
939
|
+
<div id="search-quality-status" style="padding: 20px; text-align: center; color: #e0e0e0;">
|
940
|
+
<div id="search-status-icon" style="font-size: 2rem; margin-bottom: 10px;">
|
941
|
+
<i class="fas fa-search"></i>
|
942
|
+
</div>
|
943
|
+
<div id="search-status-text" style="font-size: 1.1rem; margin-bottom: 10px;">
|
944
|
+
Waiting for data...
|
945
|
+
</div>
|
946
|
+
<div id="search-status-details" style="font-size: 0.9rem; color: #a0a0a0;">
|
947
|
+
Search result monitoring will begin when benchmark starts
|
948
|
+
</div>
|
949
|
+
</div>
|
950
|
+
</div>
|
951
|
+
</div>
|
952
|
+
</div>
|
953
|
+
|
954
|
+
<!-- All Results Display -->
|
955
|
+
<div id="recent-results-section" class="benchmark-section" style="margin-top: 20px;">
|
956
|
+
<h4 style="color: var(--primary-color); margin-bottom: 15px;">
|
957
|
+
<i class="fas fa-history"></i> All Results
|
958
|
+
</h4>
|
959
|
+
<div id="recent-results-container">
|
960
|
+
<div class="no-results">No results yet...</div>
|
961
|
+
</div>
|
962
|
+
</div>
|
963
|
+
</div>
|
964
|
+
</div>
|
965
|
+
</div>
|
966
|
+
</div>
|
967
|
+
</div>
|
968
|
+
|
969
|
+
<script>
|
970
|
+
// ---------------------------------------------------------------------------
// Page-level state shared by the benchmark functions below
// ---------------------------------------------------------------------------

// ID of the benchmark run currently being tracked (null when idle) and the
// handle of the polling timer that refreshes its progress.
let currentBenchmarkId = null,
    progressInterval = null;

// Raw evaluation-setting responses; filled in by loadCurrentSettings().
let evalProviderData = null,
    evalModelData = null,
    evalTempData = null,
    evalEndpointData = null;

// Chart instances for performance tracking; resetForm() destroys them and
// sets them back to null.
let accuracyChart = null,
    timingChart = null,
    searchResultsChart = null;

// Data series backing the performance charts.
let chartData = {
    examples: [],
    accuracies: [],
    processingTimes: [],
    timestamps: [],
    searchResultCounts: []
};

// Search quality monitoring state.
let recentSearchCounts = [];
let searchQualityAlert = false;
|
996
|
+
|
997
|
+
// Page bootstrap: bring up the optional socket service, then wire the form,
// load the displayed settings and re-attach to any benchmark already running.
document.addEventListener('DOMContentLoaded', () => {
    const socketService = window.socket;
    const canInitSocket = Boolean(socketService) &&
        typeof socketService.initializeSocket === 'function';

    // The socket is a nice-to-have: a failure here must not keep the rest of
    // the page from initializing.
    if (canInitSocket) {
        try {
            socketService.initializeSocket();
        } catch (e) {
            console.warn('Socket initialization failed, continuing without real-time updates');
        }
    }

    initializeBenchmarkForm();
    initializeEvaluationSettings();
    loadCurrentSettings();
    updateConfigSummary();
    checkForRunningBenchmark();
});
|
1013
|
+
|
1014
|
+
/**
 * Attach all event handlers of the benchmark configuration form: submit,
 * validate, cancel, summary refresh on any input change, and the dataset
 * enable/disable checkboxes.
 */
function initializeBenchmarkForm() {
    const byId = (id) => document.getElementById(id);
    const benchmarkForm = byId('benchmark-form');
    const validateButton = byId('validate-config-btn');
    const cancelButton = byId('cancel-benchmark-btn');

    // Submitting the form starts a benchmark instead of posting the page.
    benchmarkForm.addEventListener('submit', (event) => {
        event.preventDefault();
        startBenchmark();
    });

    validateButton.addEventListener('click', validateConfiguration);
    cancelButton.addEventListener('click', cancelBenchmark);

    // Any change to an input or select refreshes the summary card.
    for (const field of benchmarkForm.querySelectorAll('input, select')) {
        field.addEventListener('change', updateConfigSummary);
    }

    // SimpleQA checkbox: only enables/disables its count input.
    byId('simpleqa_enabled').addEventListener('change', (event) => {
        byId('simpleqa_count').disabled = !event.currentTarget.checked;
        updateConfigSummary();
    });

    // BrowseComp checkbox: also resets the count (0 when disabled, 5 as a
    // reasonable default when enabled).
    byId('browsecomp_enabled').addEventListener('change', (event) => {
        const browsecompCount = byId('browsecomp_count');
        const isEnabled = event.currentTarget.checked;
        browsecompCount.disabled = !isEnabled;
        browsecompCount.value = isEnabled ? 5 : 0;
        updateConfigSummary();
    });
}
|
1056
|
+
|
1057
|
+
/**
 * Refresh the configuration summary card: total number of examples across
 * the enabled datasets and a rough run-time estimate.
 */
function updateConfigSummary() {
    // Count for one dataset: 0 when its checkbox is off or the input does
    // not hold a number.
    const datasetCount = (checkboxId, inputId) => {
        if (!document.getElementById(checkboxId).checked) return 0;
        return parseInt(document.getElementById(inputId).value, 10) || 0;
    };

    const totalExamples =
        datasetCount('simpleqa_enabled', 'simpleqa_count') +
        datasetCount('browsecomp_enabled', 'browsecomp_count');
    document.getElementById('total-examples').textContent = totalExamples;

    // Estimate time (roughly 1-2 minutes per example, 1.5 on average).
    const estimatedMinutes = Math.round(totalExamples * 1.5);
    let estimatedTime;
    if (estimatedMinutes < 60) {
        estimatedTime = `${estimatedMinutes} minutes`;
    } else {
        // Pluralize on the rounded hour value that is actually displayed;
        // keying it on minutes produced "2 hour" for 90-119 minutes.
        const estimatedHours = Math.round(estimatedMinutes / 60);
        estimatedTime = `${estimatedHours} hour${estimatedHours === 1 ? '' : 's'}`;
    }

    // The id "estimated-time" is also used inside the progress panel, so
    // look it up within the summary card; fall back to the plain id lookup.
    const estimateEl =
        document.querySelector('#config-summary #estimated-time') ||
        document.getElementById('estimated-time');
    estimateEl.textContent = estimatedTime;
}
|
1073
|
+
|
1074
|
+
/**
 * Send the current form configuration to the validation endpoint and show
 * the outcome as an alert.
 */
function validateConfiguration() {
    const config = getConfigurationData();

    fetch('/benchmark/api/validate-config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(config)
    })
    .then(response => response.json())
    .then(data => {
        if (data.valid) {
            showAlert('Configuration is valid! Ready to start benchmark.', 'success');
            return;
        }

        // A failure response may carry no `errors` array; calling .join on
        // undefined used to surface a confusing TypeError to the user.
        // NOTE(review): the single `error` field is assumed from the other
        // benchmark endpoints on this page - confirm against the backend.
        const errors = Array.isArray(data.errors) && data.errors.length > 0
            ? data.errors
            : [data.error || 'Unknown validation error'];
        showAlert('Configuration errors: ' + errors.join(', '), 'error');
    })
    .catch(error => {
        console.error('Validation error:', error);
        showAlert('Error validating configuration: ' + error.message, 'error');
    });
}
|
1097
|
+
|
1098
|
+
/**
 * Collect the benchmark request payload from the form.
 *
 * @returns {{run_name: string, datasets_config: Object}} the run name plus
 *     one `{count}` entry per enabled dataset.
 */
function getConfigurationData() {
    const datasets_config = {};

    // Only enabled datasets appear in the payload; a non-numeric count
    // becomes 0.
    for (const dataset of ['simpleqa', 'browsecomp']) {
        const enabled = document.getElementById(dataset + '_enabled').checked;
        if (!enabled) continue;

        const rawCount = document.getElementById(dataset + '_count').value;
        datasets_config[dataset] = { count: parseInt(rawCount) || 0 };
    }

    // All other config will be taken from database by the backend
    const run_name = document.getElementById('run_name').value;
    return { run_name: run_name, datasets_config: datasets_config };
}
|
1120
|
+
|
1121
|
+
/**
 * Start a benchmark run with the current form configuration and switch the
 * page from the form to the progress panel.
 */
function startBenchmark() {
    const payload = getConfigurationData();

    // Swap the form for the progress panel before the request is sent.
    document.getElementById('benchmark-form').style.display = 'none';
    document.getElementById('benchmark-progress').style.display = 'block';

    // Shared failure path: tell the user and bring the form back.
    const reportFailure = (reason) => {
        showAlert('Error starting benchmark: ' + reason, 'error');
        resetForm();
    };

    const request = {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(payload)
    };

    fetch('/benchmark/api/start', request)
        .then((response) => response.json())
        .then((result) => {
            if (!result.success) {
                reportFailure(result.error);
                return;
            }
            currentBenchmarkId = result.benchmark_run_id;
            showAlert('Benchmark started successfully!', 'success');
            startProgressTracking();
        })
        .catch((err) => {
            console.error('Start error:', err);
            reportFailure(err.message);
        });
}
|
1152
|
+
|
1153
|
+
/**
 * Begin tracking the run identified by `currentBenchmarkId`: set up the
 * charts, start polling the status endpoint, and subscribe to socket
 * progress events when the socket service is available.
 * Does nothing when no benchmark ID is set.
 */
function startProgressTracking() {
    if (!currentBenchmarkId) return;

    // Initialize charts
    initializeCharts();

    // Show charts section
    document.getElementById('performance-charts-section').style.display = 'block';

    // Load historical data if reconnecting to running benchmark
    setTimeout(() => {
        loadHistoricalChartData();
    }, 1000);

    // Never stack pollers: a second call would otherwise leave the previous
    // timer running with no handle left to clear it.
    if (progressInterval) {
        clearInterval(progressInterval);
    }
    progressInterval = setInterval(() => {
        updateBenchmarkProgress();
    }, 3000); // Update every 3 seconds (reduced from 2 for better performance)

    // Initialize socket if not already done
    if (!window.socket || !window.socket.initializeSocket) {
        console.log('Socket service not available');
    } else if (!window.socket.socket) {
        console.log('Socket not initialized, initializing now...');
        // Guarded like the page-load initialization: a socket failure must
        // not propagate into the caller's error path, which would reset the
        // form even though the benchmark itself is running.
        try {
            window.socket.initializeSocket();
        } catch (e) {
            console.warn('Socket initialization failed, continuing without real-time updates');
        }
    }

    // Connect to WebSocket for detailed progress updates (reuse socket service)
    setTimeout(() => {
        if (window.socket && typeof window.socket.subscribeToResearch === 'function') {
            console.log('Subscribing to benchmark progress for ID:', currentBenchmarkId);
            // Subscribe to benchmark progress events using research subscription (same format)
            window.socket.subscribeToResearch(currentBenchmarkId, (eventData) => {
                handleDetailedProgress(eventData);
            });
        } else {
            console.warn('Socket service not available, falling back to polling only');
        }
    }, 500); // Small delay to ensure socket is ready
}
|
1192
|
+
|
1193
|
+
/**
 * Show a socket progress event in the "Current Task" line.
 *
 * @param {Object} data progress event; `status` and `example_id` are read.
 */
function handleDetailedProgress(data) {
    const taskLine = document.getElementById('current-task');

    // Nothing to show without the target element or a status text.
    if (!taskLine || !data.status) {
        return;
    }

    taskLine.textContent = `Example ${data.example_id}: ${data.status}`;
}
|
1201
|
+
|
1202
|
+
// Timestamps (ms since epoch) of the last results/charts refresh, used to
// throttle the heavier updates below.
let lastResultsUpdate = 0;
let lastChartsUpdate = 0;

/**
 * Poll the status endpoint for the tracked benchmark and refresh the
 * progress panel. Stops the polling timer once the run reports a terminal
 * status (completed, failed or cancelled).
 */
function updateBenchmarkProgress() {
    if (!currentBenchmarkId) return;

    const terminalStates = ['completed', 'failed', 'cancelled'];

    fetch(`/benchmark/api/status/${currentBenchmarkId}`)
        .then((response) => response.json())
        .then((payload) => {
            if (!payload.success) return;

            const runStatus = payload.status;
            updateProgressDisplay(runStatus);

            // Question/answer displays
            updateCurrentQuestion(runStatus);

            // Results list: at most every 10 seconds to avoid performance issues
            const timestamp = Date.now();
            if (timestamp - lastResultsUpdate > 10000) {
                updateRecentResults();
                lastResultsUpdate = timestamp;
            }

            // Charts: at most every 5 seconds
            if (timestamp - lastChartsUpdate > 5000) {
                updateCharts(runStatus);
                lastChartsUpdate = timestamp;
            }

            // Search result monitoring and rate limiting status
            updateSearchQualityMonitoring();
            updateRateLimitingStatus();

            if (!terminalStates.includes(runStatus.status)) return;

            // Run is over: stop polling and report the outcome.
            clearInterval(progressInterval);
            progressInterval = null;

            if (runStatus.status === 'completed') {
                showAlert('Benchmark completed successfully!', 'success');
            } else {
                showAlert(`Benchmark ${runStatus.status}: ${runStatus.error_message || ''}`, 'error');
            }
        })
        .catch((err) => {
            console.error('Progress update error:', err);
        });
}
|
1254
|
+
|
1255
|
+
/**
 * Render one status snapshot into the progress panel: progress bar, status
 * and task lines, accuracy, time estimates and per-dataset accuracies.
 * Elements that are missing from the page are skipped.
 *
 * @param {Object} status status object returned by the status endpoint.
 */
function updateProgressDisplay(status) {
    const byId = (id) => document.getElementById(id);
    const setText = (el, text) => { if (el) el.textContent = text; };
    // Falsy accuracy values (including 0) are shown as the '--%' placeholder.
    const asPercent = (value) => (value ? value.toFixed(1) + '%' : '--%');

    const percentage = status.total_examples > 0 ?
        (status.completed_examples / status.total_examples * 100) : 0;

    // Update progress bar (using research progress component IDs)
    const progressBar = byId('progress-bar');
    if (progressBar) progressBar.style.width = percentage + '%';
    setText(byId('progress-percentage'), Math.round(percentage) + '%');
    setText(byId('status-text'), status.status || 'Running');
    setText(byId('current-task'),
        `Processing example ${status.completed_examples} of ${status.total_examples}`);
    if (status.run_name) setText(byId('current-benchmark'), status.run_name);

    // Overall accuracy with confidence interval
    setText(byId('overall-accuracy'), asPercent(status.overall_accuracy));

    const accuracyConfidence = byId('accuracy-confidence');
    if (accuracyConfidence && status.accuracy_confidence) {
        const conf = status.accuracy_confidence;
        accuracyConfidence.textContent =
            `±${conf.margin_of_error.toFixed(1)}% (95% CI, n=${conf.sample_size})`;
    } else if (accuracyConfidence) {
        accuracyConfidence.textContent = '--';
    }

    // Time estimates.
    // The id "estimated-time" also exists in the configuration summary, and
    // getElementById returns that earlier element. Scope the lookup to the
    // progress panel so the "Est. Time Left" card is the one updated.
    const estimatedTime = document.querySelector('#benchmark-progress #estimated-time');
    if (estimatedTime && status.estimated_time_remaining) {
        const minutes = Math.round(status.estimated_time_remaining / 60);
        estimatedTime.textContent = minutes > 0 ? `${minutes}m` : '<1m';
    } else if (estimatedTime) {
        estimatedTime.textContent = '--';
    }

    const elapsedTime = byId('elapsed-time');
    if (elapsedTime && status.total_elapsed_time) {
        const minutes = Math.round(status.total_elapsed_time / 60);
        elapsedTime.textContent = `${minutes}m elapsed`;
    } else if (elapsedTime) {
        elapsedTime.textContent = '--';
    }

    // Average processing time per example
    const processingRate = byId('processing-rate');
    if (processingRate && status.avg_time_per_example) {
        const avgMinutes = (status.avg_time_per_example / 60).toFixed(1);
        processingRate.textContent = `${avgMinutes}m`;
    } else if (processingRate) {
        processingRate.textContent = '--';
    }

    setText(byId('completed-count'), status.completed_examples);

    // Update per-dataset accuracy displays
    setText(byId('simpleqa-accuracy'), asPercent(status.simpleqa_accuracy));
    setText(byId('browsecomp-accuracy'), asPercent(status.browsecomp_accuracy));
}
|
1326
|
+
|
1327
|
+
/**
 * Ask the backend to cancel the tracked benchmark. On success the polling
 * timer is stopped and the page returns to the configuration form.
 * Does nothing when no benchmark is being tracked.
 */
function cancelBenchmark() {
    if (!currentBenchmarkId) return;

    const cancelUrl = `/benchmark/api/cancel/${currentBenchmarkId}`;

    fetch(cancelUrl, { method: 'POST' })
        .then((response) => response.json())
        .then((result) => {
            if (!result.success) {
                showAlert('Error cancelling benchmark: ' + result.error, 'error');
                return;
            }

            showAlert('Benchmark cancelled successfully.', 'info');
            clearInterval(progressInterval);
            progressInterval = null;
            resetForm();
        })
        .catch((err) => {
            console.error('Cancel error:', err);
            showAlert('Error cancelling benchmark: ' + err.message, 'error');
        });
}
|
1349
|
+
|
1350
|
+
/**
 * Return the page to its idle state: show the form, hide the progress and
 * chart panels, stop polling, clear chart/search-monitoring state, destroy
 * the charts and unsubscribe from socket events of the run that was tracked.
 */
function resetForm() {
    // Remember the run being tracked BEFORE clearing it; the socket
    // unsubscribe at the end needs the real ID (it used to receive null).
    const trackedBenchmarkId = currentBenchmarkId;

    document.getElementById('benchmark-form').style.display = 'block';
    document.getElementById('benchmark-progress').style.display = 'none';
    document.getElementById('performance-charts-section').style.display = 'none';
    currentBenchmarkId = null;

    // Clear any running intervals
    if (progressInterval) {
        clearInterval(progressInterval);
        progressInterval = null;
    }

    // Reset loading flags
    window.modelsLoading = false;

    // Reset chart data
    chartData = {
        examples: [],
        accuracies: [],
        processingTimes: [],
        timestamps: [],
        searchResultCounts: []
    };

    // Reset search quality monitoring
    recentSearchCounts = [];
    searchQualityAlert = false;

    // Destroy existing charts
    if (accuracyChart) {
        accuracyChart.destroy();
        accuracyChart = null;
    }
    if (timingChart) {
        timingChart.destroy();
        timingChart = null;
    }
    if (searchResultsChart) {
        searchResultsChart.destroy();
        searchResultsChart = null;
    }

    // Unsubscribe from socket events if connected and a run was tracked
    if (trackedBenchmarkId && window.socket && window.socket.unsubscribeFromResearch) {
        window.socket.unsubscribeFromResearch(trackedBenchmarkId);
    }
}
|
1397
|
+
|
1398
|
+
/**
 * Show a dismissible alert in the `benchmark-alert` container, replacing any
 * alert already shown. Success alerts hide themselves after 5 seconds.
 *
 * The message is inserted as plain text: callers pass server-provided error
 * strings, which must not be interpreted as HTML.
 *
 * @param {string} message text to display.
 * @param {string} type alert kind; used as the `alert-<type>` CSS class
 *     (values seen on this page: 'success', 'error', 'info').
 */
function showAlert(message, type) {
    const alertContainer = document.getElementById('benchmark-alert');

    // Cancel a pending auto-hide from an earlier success alert so it cannot
    // hide the alert we are about to show.
    if (showAlert.hideTimer) {
        clearTimeout(showAlert.hideTimer);
        showAlert.hideTimer = null;
    }

    const alertBox = document.createElement('div');
    alertBox.className = `settings-alert alert-${type}`;

    const text = document.createElement('span');
    text.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.className = 'close-alert';
    closeButton.addEventListener('click', () => {
        alertContainer.style.display = 'none';
    });

    const closeIcon = document.createElement('i');
    closeIcon.className = 'fas fa-times';
    closeButton.appendChild(closeIcon);

    alertBox.appendChild(text);
    alertBox.appendChild(closeButton);

    alertContainer.textContent = '';
    alertContainer.appendChild(alertBox);
    alertContainer.style.display = 'block';

    // Auto-hide success messages
    if (type === 'success') {
        showAlert.hideTimer = setTimeout(() => {
            alertContainer.style.display = 'none';
            showAlert.hideTimer = null;
        }, 5000);
    }
}
|
1417
|
+
|
1418
|
+
/**
 * On page load, ask the backend whether a benchmark is already running and,
 * if so, re-attach the page to it (progress panel shown, tracking started).
 */
function checkForRunningBenchmark() {
    const reconnect = (runId) => {
        currentBenchmarkId = runId;
        showAlert(`Reconnected to running benchmark #${currentBenchmarkId}`, 'info');

        // Show progress panel and hide form
        document.getElementById('benchmark-form').style.display = 'none';
        document.getElementById('benchmark-progress').style.display = 'block';

        // Start tracking progress
        startProgressTracking();
    };

    fetch('/benchmark/api/running')
        .then((response) => response.json())
        .then((result) => {
            if (!result.success || !result.benchmark_run_id) return;
            reconnect(result.benchmark_run_id);
        })
        .catch(() => {
            // Any failure on this path is only logged, not shown to the user.
            console.log('No running benchmark found (this is normal)');
        });
}
|
1439
|
+
|
1440
|
+
// Load current settings from database and display them
|
1441
|
+
async function loadCurrentSettings() {
|
1442
|
+
console.log('Starting loadCurrentSettings...');
|
1443
|
+
|
1444
|
+
try {
|
1445
|
+
// Load settings individually
|
1446
|
+
const [
|
1447
|
+
llmProviderResp,
|
1448
|
+
llmModelResp,
|
1449
|
+
searchToolResp,
|
1450
|
+
iterationsResp,
|
1451
|
+
questionsResp,
|
1452
|
+
strategyResp,
|
1453
|
+
evalProviderResp,
|
1454
|
+
evalModelResp,
|
1455
|
+
evalTempResp,
|
1456
|
+
evalEndpointResp
|
1457
|
+
] = await Promise.all([
|
1458
|
+
fetch('/settings/api/llm.provider'),
|
1459
|
+
fetch('/settings/api/llm.model'),
|
1460
|
+
fetch('/settings/api/search.tool'),
|
1461
|
+
fetch('/settings/api/search.iterations'),
|
1462
|
+
fetch('/settings/api/search.questions_per_iteration'),
|
1463
|
+
fetch('/settings/api/search.search_strategy'),
|
1464
|
+
fetch('/settings/api/benchmark.evaluation.provider'),
|
1465
|
+
fetch('/settings/api/benchmark.evaluation.model'),
|
1466
|
+
fetch('/settings/api/benchmark.evaluation.temperature'),
|
1467
|
+
fetch('/settings/api/benchmark.evaluation.endpoint_url')
|
1468
|
+
]);
|
1469
|
+
|
1470
|
+
// Parse responses
|
1471
|
+
const llmProviderData = await llmProviderResp.json();
|
1472
|
+
const llmModelData = await llmModelResp.json();
|
1473
|
+
const searchToolData = await searchToolResp.json();
|
1474
|
+
const iterationsData = await iterationsResp.json();
|
1475
|
+
const questionsData = await questionsResp.json();
|
1476
|
+
const strategyData = await strategyResp.json();
|
1477
|
+
evalProviderData = await evalProviderResp.json();
|
1478
|
+
evalModelData = await evalModelResp.json();
|
1479
|
+
evalTempData = await evalTempResp.json();
|
1480
|
+
evalEndpointData = await evalEndpointResp.json();
|
1481
|
+
|
1482
|
+
// Display LLM settings with error handling
|
1483
|
+
try {
|
1484
|
+
const providerEl = document.getElementById('current-provider');
|
1485
|
+
const modelEl = document.getElementById('current-model');
|
1486
|
+
|
1487
|
+
// Set provider
|
1488
|
+
if (llmProviderData && llmProviderData.settings && llmProviderData.settings.value) {
|
1489
|
+
const provider = llmProviderData.settings.value;
|
1490
|
+
if (providerEl) providerEl.textContent = provider ? provider.toUpperCase() : 'Not set';
|
1491
|
+
} else {
|
1492
|
+
if (providerEl) providerEl.textContent = 'Not set';
|
1493
|
+
}
|
1494
|
+
|
1495
|
+
// Set model
|
1496
|
+
if (llmModelData && llmModelData.settings && llmModelData.settings.value) {
|
1497
|
+
const model = llmModelData.settings.value;
|
1498
|
+
if (modelEl) modelEl.textContent = model || 'Not set';
|
1499
|
+
} else {
|
1500
|
+
if (modelEl) modelEl.textContent = 'Not set';
|
1501
|
+
}
|
1502
|
+
} catch (e) {
|
1503
|
+
console.error('Error setting LLM display:', e);
|
1504
|
+
}
|
1505
|
+
|
1506
|
+
// Display search tool and check for warnings
|
1507
|
+
if (searchToolData && searchToolData.settings && searchToolData.settings.value) {
|
1508
|
+
const searchTool = searchToolData.settings.value || 'Not set';
|
1509
|
+
document.getElementById('current-search-tool').textContent = searchTool;
|
1510
|
+
|
1511
|
+
// Check for search engine warnings
|
1512
|
+
checkSearchEngineWarnings(searchTool);
|
1513
|
+
} else {
|
1514
|
+
document.getElementById('current-search-tool').textContent = 'Not set';
|
1515
|
+
}
|
1516
|
+
|
1517
|
+
// Display search iterations
|
1518
|
+
if (iterationsData && iterationsData.settings && iterationsData.settings.value !== null) {
|
1519
|
+
document.getElementById('current-iterations').textContent =
|
1520
|
+
iterationsData.settings.value || '8';
|
1521
|
+
} else {
|
1522
|
+
document.getElementById('current-iterations').textContent = '8'; // default
|
1523
|
+
}
|
1524
|
+
|
1525
|
+
// Display questions per iteration
|
1526
|
+
if (questionsData && questionsData.settings && questionsData.settings.value !== null) {
|
1527
|
+
document.getElementById('current-questions').textContent =
|
1528
|
+
questionsData.settings.value || '5';
|
1529
|
+
} else {
|
1530
|
+
document.getElementById('current-questions').textContent = '5'; // default
|
1531
|
+
}
|
1532
|
+
|
1533
|
+
// Display search strategy
|
1534
|
+
if (strategyData && strategyData.settings && strategyData.settings.value) {
|
1535
|
+
document.getElementById('current-strategy').textContent =
|
1536
|
+
strategyData.settings.value || 'focused_iteration';
|
1537
|
+
} else {
|
1538
|
+
document.getElementById('current-strategy').textContent = 'focused_iteration'; // default
|
1539
|
+
}
|
1540
|
+
|
1541
|
+
// Display evaluation settings (commented out as these elements don't exist)
|
1542
|
+
// TODO: Add evaluation settings display section if needed
|
1543
|
+
|
1544
|
+
// For now, just log the evaluation settings
|
1545
|
+
console.log('Evaluation settings loaded:', {
|
1546
|
+
provider: evalProviderData?.settings?.value || 'openai_endpoint',
|
1547
|
+
model: evalModelData?.settings?.value || 'anthropic/claude-3.7-sonnet',
|
1548
|
+
temperature: evalTempData?.settings?.value || 0,
|
1549
|
+
endpoint: evalEndpointData?.settings?.value || 'https://openrouter.ai/api/v1'
|
1550
|
+
});
|
1551
|
+
|
1552
|
+
} catch (error) {
|
1553
|
+
console.error('Error loading current settings:', error);
|
1554
|
+
console.error('Error details:', error.message);
|
1555
|
+
console.error('Error stack:', error.stack);
|
1556
|
+
|
1557
|
+
// Set error text on all metric values
|
1558
|
+
document.querySelectorAll('#current-settings-display .metric-value').forEach(el => {
|
1559
|
+
el.textContent = 'Error loading';
|
1560
|
+
});
|
1561
|
+
|
1562
|
+
showAlert('Could not load current settings. Check console for details.', 'warning');
|
1563
|
+
}
|
1564
|
+
}
|
1565
|
+
|
1566
|
+
/**
 * Refresh the "current question" panel from a benchmark status payload.
 *
 * @param {Object} status - Status object; `status.status` selects the
 *     branch, and `completed_examples` / `total_examples` feed the
 *     progress text.
 */
function updateCurrentQuestion(status) {
    const questionEl = document.getElementById('current-question-text');
    const datasetEl = document.getElementById('current-dataset');
    const exampleEl = document.getElementById('current-example-id');
    const stateEl = document.getElementById('current-processing-status');

    // Decide what every field should show, then write them all in one place.
    let view;
    switch (status.status) {
        case 'in_progress':
            view = {
                html: '<i class="fas fa-spinner fa-spin"></i> Processing...',
                cssClass: 'processing-status processing',
                // The example being worked on is the one after the last completed.
                question: `Processing example ${status.completed_examples + 1} of ${status.total_examples}`,
                dataset: 'Active',
                example: `Example ${status.completed_examples + 1}`
            };
            break;
        case 'completed':
            view = {
                html: '<i class="fas fa-check-circle"></i> Benchmark completed!',
                cssClass: 'processing-status completed',
                question: 'All questions processed successfully.',
                dataset: 'Completed',
                example: `${status.completed_examples}/${status.total_examples}`
            };
            break;
        default:
            // Any other state is shown as idle.
            view = {
                html: '<i class="fas fa-clock"></i> Waiting...',
                cssClass: 'processing-status',
                question: 'No question being processed...',
                dataset: '--',
                example: '--'
            };
    }

    stateEl.innerHTML = view.html;
    stateEl.className = view.cssClass;
    questionEl.textContent = view.question;
    datasetEl.textContent = view.dataset;
    exampleEl.textContent = view.example;
}
|
1595
|
+
|
1596
|
+
// Results payload that was rendered last; each poll is compared against it
// so the list is only rebuilt when something actually changed.
let lastResultsData = null;

/**
 * Fetch the latest results for the active benchmark and re-render the
 * results list when the payload differs from the previously rendered one.
 * Does nothing when no benchmark is selected.
 */
function updateRecentResults() {
    if (!currentBenchmarkId) return;

    const renderIfChanged = payload => {
        if (!payload.success || !payload.results) return;

        // Compare serialized forms to detect any change in the payload.
        const incoming = JSON.stringify(payload.results);
        const previous = JSON.stringify(lastResultsData);
        if (incoming === previous) return;

        lastResultsData = payload.results;
        displayRecentResults(payload.results);
    };

    const logFailure = error => {
        console.error('Error fetching all results:', error);
    };

    fetch(`/benchmark/api/results/${currentBenchmarkId}?limit=50`) // Reduced from 100 for performance
        .then(response => response.json())
        .then(renderIfChanged)
        .catch(logFailure);
}
|
1620
|
+
|
1621
|
+
/**
 * Map a search-result count to a severity CSS class.
 *
 * @param {number} count - Number of search results.
 * @returns {string} 'critical' for count <= 1, 'warning' for count <= 4,
 *     otherwise 'good'.
 */
function getSearchCountClass(count) {
    return count <= 1 ? 'critical'
        : count <= 4 ? 'warning'
        : 'good';
}
|
1626
|
+
|
1627
|
+
/**
 * Render benchmark result cards into `#recent-results-container`.
 *
 * Every value taken from `results` is HTML-escaped before it is placed in
 * the markup, because the cards are written through `innerHTML` and the
 * questions and answers are free text that must not be parsed as HTML.
 * The expanded/collapsed state of long answers is read from the current
 * DOM before re-rendering and restored afterwards.
 *
 * @param {Array<Object>} results - Result rows. Reads `is_correct`,
 *     `dataset_type`, `example_id`, `search_result_count`, `question`,
 *     `model_answer` and `correct_answer` from each row.
 */
function displayRecentResults(results) {
    const container = document.getElementById('recent-results-container');

    if (!results || results.length === 0) {
        container.innerHTML = '<div class="no-results">No results yet...</div>';
        return;
    }

    // Minimal HTML escaping for text and double-quoted attribute contexts.
    const HTML_ENTITIES = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    const escapeHtml = value =>
        String(value).replace(/[&<>"']/g, ch => HTML_ENTITIES[ch]);

    // Save expanded states before re-rendering
    const expandedIds = new Set();
    container.querySelectorAll('[id^="toggle-"]').forEach(toggle => {
        const id = toggle.id.replace('toggle-', '');
        const fullTextElement = document.getElementById(`full-${id}`);
        if (fullTextElement && fullTextElement.style.display !== 'none') {
            expandedIds.add(id);
        }
    });

    // Build the markup for a possibly-long text: short text is returned as
    // is (escaped); long text gets a truncated and a full span plus a toggle
    // link wired to toggleText(). `id` is generated locally below, so it is
    // safe to embed in attributes.
    const createExpandableText = (text, id, maxLength = 200) => {
        if (!text) return 'No answer provided';

        // Coerce so non-string answers (e.g. numbers) do not break substring().
        const raw = String(text);
        if (raw.length <= maxLength) return escapeHtml(raw);

        // Truncate the raw text first so an entity is never cut in half.
        const truncated = escapeHtml(raw.substring(0, maxLength) + '...');
        const isExpanded = expandedIds.has(id);

        return `
            <span id="truncated-${id}" style="display: ${isExpanded ? 'none' : 'inline'};">${truncated}</span>
            <span id="full-${id}" style="display: ${isExpanded ? 'inline' : 'none'};">${escapeHtml(raw)}</span>
            <a href="#" onclick="toggleText('${id}'); return false;" id="toggle-${id}" style="color: #2196f3; font-size: 0.85rem; margin-left: 5px; text-decoration: underline;">${isExpanded ? 'Show less' : 'Show more'}</a>
        `;
    };

    const resultsHtml = results.map((result, index) => {
        const statusClass = result.is_correct ? 'correct' : 'incorrect';
        const statusIcon = result.is_correct ? '<i class="fas fa-check-circle"></i>' : '<i class="fas fa-times-circle"></i>';
        const statusText = result.is_correct ? 'Correct' : 'Incorrect';

        // Skip the badge when the count is missing (undefined or null)
        // instead of rendering "null results".
        const count = result.search_result_count;
        const countBadge = count !== undefined && count !== null
            ? `<span class="search-count-badge ${getSearchCountClass(count)}" title="Search results found">${escapeHtml(count)} results</span>`
            : '';

        return `
            <div class="result-card ${statusClass}">
                <div class="result-header">
                    <div>
                        <span class="dataset-badge">${escapeHtml(result.dataset_type)}</span>
                        <span class="example-id">${escapeHtml(result.example_id)}</span>
                        ${countBadge}
                    </div>
                    <span class="result-status ${statusClass}">
                        ${statusIcon} ${statusText}
                    </span>
                </div>
                <div class="question-text" style="margin-bottom: 10px; font-size: 0.9rem;">
                    ${escapeHtml(result.question || 'No question provided')}
                </div>
                <div class="answer-comparison">
                    <div class="answer-box model-answer">
                        <div class="answer-label">Model Answer</div>
                        <div>${createExpandableText(result.model_answer || 'No answer provided', `model-${index}`)}</div>
                    </div>
                    <div class="answer-box correct-answer">
                        <div class="answer-label">Correct Answer</div>
                        <div>${createExpandableText(result.correct_answer || 'No correct answer available', `correct-${index}`)}</div>
                    </div>
                </div>
            </div>
        `;
    }).join('');

    container.innerHTML = resultsHtml;
}
|
1700
|
+
|
1701
|
+
// Toggle function for expandable text
|
1702
|
+
/**
 * Flip an expandable text between its truncated and full form.
 *
 * @param {string} id - Suffix shared by the `truncated-`, `full-` and
 *     `toggle-` element ids.
 */
function toggleText(id) {
    const shortEl = document.getElementById(`truncated-${id}`);
    const longEl = document.getElementById(`full-${id}`);
    const linkEl = document.getElementById(`toggle-${id}`);

    // A hidden truncated span means the full text is showing: collapse it.
    const collapsing = shortEl.style.display === 'none';

    shortEl.style.display = collapsing ? 'inline' : 'none';
    longEl.style.display = collapsing ? 'none' : 'inline';
    linkEl.textContent = collapsing ? 'Show more' : 'Show less';
}
|
1717
|
+
|
1718
|
+
// Chart initialization and management
|
1719
|
+
/**
 * Create the three line charts (accuracy, timing, search results) on their
 * canvases and store them in `accuracyChart`, `timingChart` and
 * `searchResultsChart`. All charts start with empty labels and data.
 */
function initializeCharts() {
    // Tick/grid colours used by both axes.
    const makeAxis = () => ({
        ticks: {
            color: '#a0a0a0'
        },
        grid: {
            color: '#333'
        }
    });

    // Base options shared by every chart.
    const chartOptions = {
        responsive: true,
        maintainAspectRatio: false,
        plugins: {
            legend: {
                labels: {
                    color: '#e0e0e0'
                }
            }
        },
        scales: {
            x: makeAxis(),
            y: makeAxis()
        }
    };

    // Base options with the y axis given extra bounds and a tick formatter.
    const optionsWithY = (yBounds, formatTick) => ({
        ...chartOptions,
        scales: {
            ...chartOptions.scales,
            y: {
                ...chartOptions.scales.y,
                ...yBounds,
                ticks: {
                    ...chartOptions.scales.y.ticks,
                    callback: formatTick
                }
            }
        }
    });

    // One empty line series.
    const makeSeries = (label, borderColor, backgroundColor, fill) => ({
        label,
        data: [],
        borderColor,
        backgroundColor,
        tension: 0.4,
        fill
    });

    // Full Chart.js config for an empty line chart.
    const lineConfig = (datasets, options) => ({
        type: 'line',
        data: {
            labels: [],
            datasets
        },
        options
    });

    // Accuracy Chart
    const accuracyCtx = document.getElementById('accuracy-chart').getContext('2d');
    accuracyChart = new Chart(accuracyCtx, lineConfig(
        [
            makeSeries('Overall Accuracy', '#4caf50', 'rgba(76, 175, 80, 0.1)', true),
            makeSeries('SimpleQA Accuracy', '#2196f3', 'rgba(33, 150, 243, 0.1)', false),
            makeSeries('BrowseComp Accuracy', '#ff9800', 'rgba(255, 152, 0, 0.1)', false)
        ],
        optionsWithY({ min: 0, max: 100 }, value => value + '%')
    ));

    // Timing Chart
    const timingCtx = document.getElementById('timing-chart').getContext('2d');
    timingChart = new Chart(timingCtx, lineConfig(
        [
            makeSeries('Processing Time (seconds)', '#e91e63', 'rgba(233, 30, 99, 0.1)', true)
        ],
        optionsWithY({ min: 0 }, value => value + 's')
    ));

    // Search Results Chart
    const searchResultsCtx = document.getElementById('search-results-chart').getContext('2d');
    searchResultsChart = new Chart(searchResultsCtx, lineConfig(
        [
            makeSeries('Search Results Count', '#9c27b0', 'rgba(156, 39, 176, 0.1)', true)
        ],
        optionsWithY({ min: 0 }, value => Math.round(value) + ' results')
    ));
}
|
1864
|
+
|
1865
|
+
/**
 * Push the latest accuracy and timing figures from a status payload onto
 * the accuracy and timing charts, keyed by the completed-example count.
 * Each chart keeps at most 50 points.
 *
 * @param {Object} status - Reads `completed_examples`, `overall_accuracy`,
 *     `simpleqa_accuracy`, `browsecomp_accuracy`, `avg_time_per_example`.
 */
function updateCharts(status) {
    if (!(accuracyChart && timingChart && searchResultsChart && status)) return;

    const exampleNo = status.completed_examples;
    if (exampleNo <= 0) return;

    const MAX_POINTS = 50;

    // Append a point for exampleNo, or overwrite the existing one.
    // values[i] goes to datasets[i].
    const recordPoint = (chart, values) => {
        const { labels, datasets } = chart.data;
        if (!labels.includes(exampleNo)) {
            labels.push(exampleNo);
            values.forEach((value, i) => datasets[i].data.push(value));
            return;
        }
        const at = labels.indexOf(exampleNo);
        if (at >= 0) {
            values.forEach((value, i) => {
                datasets[i].data[at] = value;
            });
        }
    };

    // Update accuracy chart
    if (status.overall_accuracy !== undefined) {
        recordPoint(accuracyChart, [
            status.overall_accuracy || 0,
            status.simpleqa_accuracy || 0,
            status.browsecomp_accuracy || 0
        ]);

        // Drop the oldest point from every series once over the cap.
        if (accuracyChart.data.labels.length > MAX_POINTS) {
            accuracyChart.data.labels.shift();
            accuracyChart.data.datasets.forEach(series => series.data.shift());
        }

        accuracyChart.update('none');
    }

    // Update timing chart
    if (status.avg_time_per_example !== undefined) {
        recordPoint(timingChart, [status.avg_time_per_example || 0]);

        if (timingChart.data.labels.length > MAX_POINTS) {
            timingChart.data.labels.shift();
            timingChart.data.datasets[0].data.shift();
        }

        timingChart.update('none');
    }
}
|
1922
|
+
|
1923
|
+
// Load historical chart data when reconnecting to running benchmark
|
1924
|
+
/**
 * Seed the accuracy and timing charts after reconnecting to a benchmark.
 *
 * No per-example history is fetched: the current status values are
 * repeated at sampled example numbers, so the seeded lines are flat
 * approximations. Errors are logged and otherwise ignored.
 */
async function loadHistoricalChartData() {
    if (!currentBenchmarkId || !accuracyChart || !timingChart) return;

    try {
        // Get benchmark status to populate initial chart data
        const response = await fetch(`/benchmark/api/status/${currentBenchmarkId}`);
        const data = await response.json();

        if (!data.success || !(data.status.completed_examples > 0)) return;

        const snapshot = data.status;
        const completed = snapshot.completed_examples;

        // Sample roughly 20 points across the completed range, never less
        // than one example apart.
        const stride = Math.max(1, Math.floor(completed / 20));
        const accuracySeries = accuracyChart.data.datasets;

        for (let exampleNo = 1; exampleNo <= completed; exampleNo += stride) {
            accuracyChart.data.labels.push(exampleNo);
            // Current values stand in for the unknown historical ones.
            accuracySeries[0].data.push(snapshot.overall_accuracy || 0);
            accuracySeries[1].data.push(snapshot.simpleqa_accuracy || 0);
            accuracySeries[2].data.push(snapshot.browsecomp_accuracy || 0);

            timingChart.data.labels.push(exampleNo);
            timingChart.data.datasets[0].data.push(snapshot.avg_time_per_example || 0);
        }

        accuracyChart.update();
        timingChart.update();
    } catch (error) {
        console.error('Error loading historical chart data:', error);
    }
}
|
1957
|
+
|
1958
|
+
// Search quality monitoring functions
|
1959
|
+
/**
 * Fetch the five most recent results of the active benchmark, average
 * their `search_result_count` values and feed that average to the search
 * results chart and the search quality alert. Errors are logged.
 */
async function updateSearchQualityMonitoring() {
    if (!currentBenchmarkId) return;

    try {
        // Fetch recent results to get search counts
        const response = await fetch(`/benchmark/api/results/${currentBenchmarkId}?limit=5`);
        const data = await response.json();

        if (!(data.success && data.results && data.results.length > 0)) return;

        // Only rows that actually carry a count take part in the average.
        const counted = data.results.filter(
            row => row.search_result_count !== undefined && row.search_result_count !== null
        );
        if (counted.length === 0) return;

        const total = counted.reduce((sum, row) => sum + row.search_result_count, 0);
        const avgSearchResults = total / counted.length;

        updateSearchResultsChart(avgSearchResults);
        updateSearchQualityAlert(avgSearchResults);
    } catch (error) {
        console.error('Error updating search quality monitoring:', error);
    }
}
|
1990
|
+
|
1991
|
+
/**
 * Append an average search-result count to the search results chart,
 * labelled with the current local time, and remember it in
 * `recentSearchCounts`.
 *
 * @param {number} avgSearchResults - Average results per query to plot.
 */
function updateSearchResultsChart(avgSearchResults) {
    if (!searchResultsChart || !currentBenchmarkId) return;

    const MAX_CHART_POINTS = 20;
    const MAX_REMEMBERED = 10;

    const timestamp = new Date().toLocaleTimeString();
    const chartData = searchResultsChart.data;
    const series = chartData.datasets[0].data;

    // Every call adds a point; the x axis is wall-clock time.
    chartData.labels.push(timestamp);
    series.push(avgSearchResults);

    // Keep only the most recent points for readability.
    if (chartData.labels.length > MAX_CHART_POINTS) {
        chartData.labels.shift();
        series.shift();
    }

    // Store for alert monitoring
    recentSearchCounts.push(avgSearchResults);
    if (recentSearchCounts.length > MAX_REMEMBERED) {
        recentSearchCounts.shift();
    }

    searchResultsChart.update('none');
}
|
2016
|
+
|
2017
|
+
/**
 * Update the search-status indicator for an average result count and raise
 * a one-time alert when quality drops.
 *
 * Levels: below 2 is critical, below 5 warning, below 10 caution, anything
 * else good. The rate-limit warning is shown on critical and hidden on
 * good; the other levels leave it as it is. `searchQualityAlert` stops
 * repeated alerts and is reset only when the level returns to good.
 *
 * @param {number} avgSearchResults - Average results per query.
 */
function updateSearchQualityAlert(avgSearchResults) {
    const iconEl = document.getElementById('search-status-icon');
    const textEl = document.getElementById('search-status-text');
    const detailsEl = document.getElementById('search-status-details');

    if (!(iconEl && textEl && detailsEl)) return;

    // Degraded tiers, checked in order; the first whose bound exceeds the
    // average wins.
    const degradedTiers = [
        {
            below: 2,
            level: 'critical',
            message: 'CRITICAL: Very few search results',
            describe: shown => `Only ${shown} results per query. Accuracy likely severely degraded.`,
            icon: 'fas fa-exclamation-triangle',
            color: '#f44336',
            rateLimitDisplay: 'block'
        },
        {
            below: 5,
            level: 'warning',
            message: 'WARNING: Low search results',
            describe: shown => `${shown} results per query. Consider restarting SearXNG.`,
            icon: 'fas fa-exclamation-circle',
            color: '#ff9800'
        },
        {
            below: 10,
            level: 'caution',
            message: 'CAUTION: Moderate search results',
            describe: shown => `${shown} results per query. Performance may be affected.`,
            icon: 'fas fa-info-circle',
            color: '#2196f3'
        }
    ];

    // Used when no degraded tier matches.
    const healthyTier = {
        level: 'good',
        message: 'GOOD: Healthy search results',
        describe: shown => `${shown} results per query. Search engines working well.`,
        icon: 'fas fa-check-circle',
        color: '#4caf50',
        rateLimitDisplay: 'none'
    };

    const tier = degradedTiers.find(candidate => avgSearchResults < candidate.below) || healthyTier;
    const alertDetails = tier.describe(avgSearchResults.toFixed(1));

    // Only the extreme tiers change the rate-limit warning's visibility.
    if (tier.rateLimitDisplay) {
        document.getElementById('rate-limit-warning').style.display = tier.rateLimitDisplay;
    }

    // Update UI
    iconEl.innerHTML = `<i class="${tier.icon}"></i>`;
    iconEl.style.color = tier.color;
    textEl.textContent = tier.message;
    textEl.style.color = tier.color;
    detailsEl.textContent = alertDetails;

    // Trigger alert if we detect degradation
    if (tier.level === 'good') {
        searchQualityAlert = false; // Reset alert flag when performance improves
        return;
    }
    if (searchQualityAlert) return;

    if (tier.level === 'critical') {
        searchQualityAlert = true;
        showAlert('Search engine performance critically degraded! Consider restarting SearXNG.', 'error');
    } else if (tier.level === 'warning') {
        searchQualityAlert = true;
        showAlert('Search engine performance is declining. Monitor closely.', 'warning');
    }
}
|
2084
|
+
|
2085
|
+
// Rate limiting status monitoring (simplified)
|
2086
|
+
/**
 * Query the search-quality endpoint and, when the engine whose type
 * contains "searxng" reports fewer than 2 recent average results, log a
 * warning and append a note to the search status details (once).
 * Errors are logged and otherwise ignored.
 */
async function updateRateLimitingStatus() {
    try {
        const response = await fetch('/benchmark/api/search-quality');
        const data = await response.json();

        const engineStats = data.success && data.search_quality;
        if (!engineStats || !(engineStats.length > 0)) return;

        // Find SearXNG engine specifically (most critical for benchmarks)
        const searxngStats = engineStats.find(
            entry => entry.engine_type.toLowerCase().includes('searxng')
        );
        if (!searxngStats || !(searxngStats.recent_avg_results < 2)) return;

        // Show warning when search results are critically low
        console.warn('Low search results detected:', searxngStats);

        const detailsEl = document.getElementById('search-status-details');
        // The marker text check keeps the note from being appended twice.
        const alreadyNoted = detailsEl && detailsEl.textContent.includes('Very low results');
        if (detailsEl && !alreadyNoted) {
            detailsEl.textContent += ` Very low results: ${searxngStats.recent_avg_results.toFixed(1)} avg.`;
        }
    } catch (error) {
        console.error('Error fetching rate limiting status:', error);
    }
}
|
2111
|
+
|
2112
|
+
// Check for search engine warnings and display appropriate messages
|
2113
|
+
/**
 * Show or hide the search-engine warning banner for the selected tool.
 *
 * @param {string} searchTool - Search tool name; matched
 *     case-insensitively against the engines that have a warning.
 */
function checkSearchEngineWarnings(searchTool) {
    const bannerEl = document.getElementById('search-engine-warning');
    const bannerTextEl = document.getElementById('search-warning-text');

    // Warning text per engine; engines not listed here get no banner.
    const warningsByEngine = new Map([
        ['searxng', 'SearXNG is a shared resource. Please use reasonable example counts to avoid affecting other users. Consider shorter benchmarks for testing.'],
        ['arxiv', 'ArXiv is a shared resource containing only academic papers - benchmarking with SimpleQA is useless as it will find zero relevant results for general knowledge questions. Should not be used for this test. Use Tavily instead.'],
        ['pubmed', 'PubMed is a shared resource containing only medical literature - benchmarking with SimpleQA is absolutely useless as general knowledge questions will find zero relevant results. Should not be used for this test. Use Tavily instead.'],
        ['semanticscholar', 'Semantic Scholar is a shared resource specialized for academic research - not suitable for general SimpleQA questions and should not be used for this test. Use Tavily instead.'],
        ['wikipedia', 'Wikipedia is a shared resource with limited coverage - benchmarking with it is useless for comprehensive testing and should not be used for this test. Use Tavily instead.']
    ]);

    const message = warningsByEngine.get(searchTool?.toLowerCase());

    if (message === undefined) {
        bannerEl.style.display = 'none';
        return;
    }

    bannerTextEl.textContent = message;
    bannerEl.style.display = 'block';
}
|
2152
|
+
|
2153
|
+
// ==============================================
|
2154
|
+
// Evaluation Settings Functionality
|
2155
|
+
// (Reusing research page model functionality)
|
2156
|
+
// ==============================================
|
2157
|
+
|
2158
|
+
// Handles to the evaluation settings form controls; null until
// initializeEvaluationSettings() looks them up.
let evaluationProviderSelect = null,
    evaluationModelInput = null,
    evaluationEndpointInput = null,
    evaluationTemperatureInput = null;

/**
 * Wire up the evaluation settings section: look up its form controls,
 * fill the provider list, set up the model dropdown and event handlers,
 * start loading the model list, and load the saved settings after a short
 * delay.
 */
function initializeEvaluationSettings() {
    console.log('Initializing evaluation settings...');

    // Make sure the shared per-provider model cache exists.
    if (!window.evaluationModels) {
        window.evaluationModels = {};
    }

    const byId = id => document.getElementById(id);
    evaluationProviderSelect = byId('evaluation_provider');
    evaluationModelInput = byId('evaluation_model');
    evaluationEndpointInput = byId('evaluation_endpoint_url');
    evaluationTemperatureInput = byId('evaluation_temperature');

    console.log('DOM elements found:', {
        provider: Boolean(evaluationProviderSelect),
        model: Boolean(evaluationModelInput),
        endpoint: Boolean(evaluationEndpointInput),
        temperature: Boolean(evaluationTemperatureInput)
    });

    populateEvaluationProviders();
    setupEvaluationModelDropdown();
    setupEvaluationEventHandlers();

    // Load models from API - this will populate window.evaluationModels
    loadEvaluationModelsFromAPI();

    // Load settings with a small delay to ensure DOM is ready
    setTimeout(() => loadEvaluationSettings(), 100);
}
|
2200
|
+
|
2201
|
+
/**
 * Rebuild the evaluation provider <select>: replace its options with the
 * fixed provider list, select the value from its `data-initial-value`
 * attribute (default 'openai_endpoint'), and show the endpoint field only
 * for the custom OpenAI endpoint provider.
 */
function populateEvaluationProviders() {
    if (!evaluationProviderSelect) return;

    // [value, label] pairs (same as research page)
    const providerChoices = [
        ['ollama', 'Ollama (Local)'],
        ['openai', 'OpenAI (Cloud)'],
        ['anthropic', 'Anthropic (Cloud)'],
        ['openai_endpoint', 'Custom OpenAI Endpoint'],
        ['vllm', 'vLLM (Local)'],
        ['lmstudio', 'LM Studio (Local)'],
        ['llamacpp', 'Llama.cpp (Local)']
    ];

    // Start from an empty list, then add one <option> per provider.
    evaluationProviderSelect.innerHTML = '';
    for (const [value, label] of providerChoices) {
        const optionEl = document.createElement('option');
        optionEl.value = value;
        optionEl.textContent = label;
        evaluationProviderSelect.appendChild(optionEl);
    }

    // Set initial value from data attribute
    const initialProvider = evaluationProviderSelect.getAttribute('data-initial-value') || 'openai_endpoint';
    console.log('Setting initial evaluation provider to:', initialProvider);
    evaluationProviderSelect.value = initialProvider;

    // The endpoint URL field's wrapper is shown only for the custom endpoint.
    if (evaluationEndpointInput && evaluationEndpointInput.parentNode) {
        let wrapperDisplay = 'none';
        if (initialProvider === 'openai_endpoint') {
            wrapperDisplay = 'block';
        }
        evaluationEndpointInput.parentNode.style.display = wrapperDisplay;
    }

    console.log('Populated evaluation providers with initial value:', initialProvider);
}
|
2239
|
+
|
2240
|
+
/**
 * Attach the custom dropdown component to the evaluation model input and
 * wire the refresh button that force-reloads the model list from the
 * settings API.
 *
 * Returns early (doing nothing) when the model input or the dropdown list
 * element is missing. The dropdown is only created when
 * `window.setupCustomDropdown` is available.
 */
function setupEvaluationModelDropdown() {
    if (!evaluationModelInput) return;

    const dropdownList = document.getElementById('evaluation-model-dropdown-list');
    if (!dropdownList) return;

    // Setup custom dropdown using the existing component
    if (window.setupCustomDropdown) {
        window.evaluationDropdownInstance = window.setupCustomDropdown(
            evaluationModelInput,
            dropdownList,
            function() {
                // Get models dynamically based on current provider
                const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';

                // Use loaded models if available
                if (window.evaluationModels && window.evaluationModels[provider]) {
                    console.log(`Returning ${window.evaluationModels[provider].length} loaded models for ${provider}`);
                    return window.evaluationModels[provider];
                }

                // Otherwise return defaults
                return getEvaluationModelOptions();
            },
            function(value, item) {
                // On selection callback: mirror the value into the hidden
                // input (when present) and persist it.
                const hiddenInput = document.getElementById('evaluation_model_hidden');
                if (hiddenInput) {
                    hiddenInput.value = value;
                }
                saveEvaluationSetting('benchmark.evaluation.model', value);
            },
            true, // Allow custom values
            'No models available'
        );
    }

    // Setup refresh button
    const refreshBtn = document.querySelector('[data-target="evaluation-model-dropdown"] .refresh-btn');
    if (!refreshBtn) return;

    refreshBtn.addEventListener('click', function(e) {
        e.preventDefault();
        console.log('Refresh button clicked, force reloading models...');

        // Show loading state
        const icon = this.querySelector('i');
        if (icon) {
            icon.classList.add('fa-spin');
        }

        // Mark a load as in flight for the duration of this request; the
        // flag is always cleared again in `finally`, which also recovers
        // from a previously stuck flag.
        window.modelsLoading = true;

        fetch('/settings/api/available-models?force_refresh=true')
            .then(response => {
                // fetch() only rejects on network failure, so HTTP errors
                // must be detected explicitly.
                if (!response.ok) {
                    throw new Error(`Model refresh failed with HTTP ${response.status}`);
                }
                return response.json();
            })
            .then(data => {
                console.log('Force refresh received model data:', data);

                if (data && data.providers) {
                    // Build the new map locally and publish it in one step
                    // so a failure midway cannot leave a half-filled global.
                    const loadedModels = {};

                    Object.entries(data.providers).forEach(([providerKey, models]) => {
                        if (Array.isArray(models)) {
                            // Map provider keys to expected provider names
                            const providerName = providerKey.replace('_models', '').toLowerCase();

                            loadedModels[providerName] = models.map(model => ({
                                value: model.value || model.id,
                                // Fall back to the id too, so models that
                                // only carry `id` still get a visible label.
                                label: model.label || model.name || model.value || model.id
                            }));
                            console.log(`Loaded ${models.length} models for ${providerName}`);
                        }
                    });

                    window.evaluationModels = loadedModels;

                    // Update dropdown with new data
                    refreshEvaluationModels();
                }
            })
            .catch(error => {
                console.error('Error loading evaluation models:', error);
            })
            .finally(() => {
                // Remove loading state
                if (icon) {
                    icon.classList.remove('fa-spin');
                }
                window.modelsLoading = false;
            });
    });
}
|
2338
|
+
|
2339
|
+
/**
 * Return the model options to show for the currently selected evaluation
 * provider.
 *
 * Uses the cached list in `window.evaluationModels` when it is non-empty.
 * Otherwise it requests a (debounced) load from the API and returns a small
 * hard-coded default list for the provider, or an empty array for providers
 * without defaults.
 *
 * @returns {Array<{value: string, label: string}>} Options for the dropdown.
 */
function getEvaluationModelOptions() {
    const provider = evaluationProviderSelect ? evaluationProviderSelect.value : 'openai_endpoint';
    console.log('Getting evaluation model options for provider:', provider);

    // Check if we have loaded models
    const cachedModels = window.evaluationModels && window.evaluationModels[provider];
    if (cachedModels && cachedModels.length > 0) {
        console.log(`Returning ${cachedModels.length} cached models for ${provider}`);
        return cachedModels;
    }

    // Load models from API if not already loading.
    // The flag must NOT be set here: the loader sets `window.modelsLoading`
    // itself and returns early when it finds the flag already raised, so
    // pre-setting it made every load requested from here a no-op and left
    // the flag stuck at true. Repeated calls are coalesced by the debounce.
    if (!window.modelsLoading) {
        loadEvaluationModelsFromAPI();
    }

    // Return minimal defaults while loading
    console.log(`No models loaded yet for ${provider}, returning defaults`);
    if (provider === 'openai_endpoint') {
        return [
            { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
            { value: 'openai/gpt-4o', label: 'GPT-4o' },
            { value: '01-ai/yi-large', label: 'Yi Large' }
        ];
    } else if (provider === 'openai') {
        return [
            { value: 'gpt-4o', label: 'GPT-4o' },
            { value: 'gpt-4', label: 'GPT-4' },
            { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
        ];
    } else if (provider === 'anthropic') {
        return [
            { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
            { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
        ];
    }

    // Return empty array for other providers
    return [];
}
|
2379
|
+
|
2380
|
+
// Debounce function to prevent too many API calls
|
2381
|
+
/**
 * Wrap `func` so that it only runs once calls have stopped for `wait`
 * milliseconds; each new call cancels the pending one and restarts the wait.
 * The wrapped function is invoked with the arguments of the latest call.
 *
 * @param {Function} func - Function to delay.
 * @param {number} wait - Quiet period in milliseconds.
 * @returns {Function} The debounced wrapper.
 */
function debounce(func, wait) {
    let pendingTimer;
    return function executedFunction(...callArgs) {
        // Drop any call that is still waiting, then schedule this one.
        clearTimeout(pendingTimer);
        pendingTimer = setTimeout(function () {
            clearTimeout(pendingTimer);
            func(...callArgs);
        }, wait);
    };
}
|
2392
|
+
|
2393
|
+
// Debounced version of loadEvaluationModelsFromAPI
|
2394
|
+
/**
 * Debounced loader that fetches the available models from the settings API,
 * stores them per provider in `window.evaluationModels` and refreshes the
 * evaluation dropdown.
 *
 * A call is skipped when `window.modelsLoading` is already set, unless
 * `forceRefresh` is true. The flag is raised for the duration of the request
 * and cleared again when it settles.
 *
 * @param {boolean} [forceRefresh=false] - Append `force_refresh=true` to the
 *   request and bypass the "already loading" guard.
 */
const loadEvaluationModelsFromAPI = debounce(function(forceRefresh = false) {
    console.log('Loading evaluation models from API...', forceRefresh ? '(force refresh)' : '');

    // Prevent multiple simultaneous loads
    if (window.modelsLoading && !forceRefresh) {
        console.log('Models already loading, skipping...');
        return;
    }

    window.modelsLoading = true;

    // Use the correct API endpoint with optional force_refresh parameter
    const url = forceRefresh ? '/settings/api/available-models?force_refresh=true' : '/settings/api/available-models';

    fetch(url)
        .then(response => {
            // fetch() only rejects on network failure, so HTTP errors must
            // be detected explicitly.
            if (!response.ok) {
                throw new Error(`Model load failed with HTTP ${response.status}`);
            }
            return response.json();
        })
        .then(data => {
            console.log('Received model data:', data);

            if (data && data.providers) {
                // Build the new map in a temporary object and publish it in
                // one step, so a failure midway cannot leave a half-filled
                // global behind.
                const loadedModels = {};

                // Process each provider's models
                Object.entries(data.providers).forEach(([providerKey, models]) => {
                    if (Array.isArray(models)) {
                        // Map provider keys to expected provider names
                        const providerName = providerKey.replace('_models', '').toLowerCase();

                        loadedModels[providerName] = models.map(model => ({
                            value: model.value || model.id,
                            // Fall back to the id too, so models that only
                            // carry `id` still get a visible label.
                            label: model.label || model.name || model.value || model.id
                        }));
                        console.log(`Loaded ${models.length} models for ${providerName}`);
                    }
                });

                window.evaluationModels = loadedModels;

                // Update dropdown with new data
                refreshEvaluationModels();
            }
        })
        .catch(error => {
            console.error('Error loading evaluation models:', error);
        })
        .finally(() => {
            window.modelsLoading = false;
        });
}, 500); // Wait 500ms before making the API call
|
2447
|
+
|
2448
|
+
/**
 * Select the models that belong to `provider` (case-insensitive match on
 * each model's `provider` field) and normalise them to `{value, label}`.
 *
 * When nothing matches, a small hard-coded default list is returned for
 * `openai_endpoint`, `openai` and `anthropic`; any other provider yields an
 * empty array. A missing `provider` or a non-array `models` is treated as
 * empty input instead of throwing.
 *
 * @param {Array<Object>} models - Raw model objects (`provider`, `value`/`id`,
 *   `label`/`name`).
 * @param {string} provider - Provider name to filter by.
 * @returns {Array<{value: string, label: string}>} Normalised options.
 */
function filterModelsForProvider(models, provider) {
    const providerUpper = String(provider || '').toUpperCase();
    const candidates = Array.isArray(models) ? models : [];

    // A single case-insensitive comparison covers every provider; the
    // per-provider special cases were all equivalent to it.
    const filtered = candidates.filter(model =>
        String((model && model.provider) || '').toUpperCase() === providerUpper
    );

    // If no models found for provider, return some defaults
    if (filtered.length === 0) {
        if (providerUpper === 'OPENAI_ENDPOINT') {
            return [
                { value: '01-ai/yi-large', label: 'Yi Large' },
                { value: 'anthropic/claude-3.5-sonnet', label: 'Claude 3.5 Sonnet' },
                { value: 'openai/gpt-4o', label: 'GPT-4o' }
            ];
        } else if (providerUpper === 'OPENAI') {
            return [
                { value: 'gpt-4o', label: 'GPT-4o' },
                { value: 'gpt-4', label: 'GPT-4' },
                { value: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo' }
            ];
        } else if (providerUpper === 'ANTHROPIC') {
            return [
                { value: 'claude-3-5-sonnet-latest', label: 'Claude 3.5 Sonnet' },
                { value: 'claude-3-opus-20240229', label: 'Claude 3 Opus' }
            ];
        }
    }

    return filtered.map(model => ({
        value: model.value || model.id,
        // Fall back to the id too, so models that only carry `id` still get
        // a visible label.
        label: model.label || model.name || model.value || model.id
    }));
}
|
2488
|
+
|
2489
|
+
/**
 * Push the models cached for the currently selected provider into the
 * evaluation dropdown and, when there is at least one option, re-open the
 * dropdown by clicking the model input shortly afterwards.
 */
function refreshEvaluationModels() {
    let provider = 'openai_endpoint';
    if (evaluationProviderSelect) {
        provider = evaluationProviderSelect.value;
    }

    // Fall back to an empty list when nothing is cached for this provider.
    const cache = window.evaluationModels;
    const options = (cache && cache[provider]) ? cache[provider] : [];

    console.log(`Refreshing evaluation dropdown with ${options.length} options for provider ${provider}`);

    // Hand the loaded models to the dropdown component, if it is available.
    if (window.updateDropdownOptions && evaluationModelInput) {
        window.updateDropdownOptions(evaluationModelInput, options);
    }

    if (!evaluationModelInput || options.length === 0) {
        return;
    }

    // Trigger a click to show the dropdown with updated options
    setTimeout(function () {
        evaluationModelInput.click();
    }, 100);
}
|
2510
|
+
|
2511
|
+
|
2512
|
+
/**
 * Register `change` listeners on the evaluation provider, model, endpoint
 * and temperature controls. Each listener persists the new value through
 * `saveEvaluationSetting`; the provider listener additionally toggles the
 * endpoint field and refreshes the model options.
 *
 * Controls that are not present are skipped.
 */
function setupEvaluationEventHandlers() {
    // Provider change handler
    if (evaluationProviderSelect) {
        evaluationProviderSelect.addEventListener('change', function() {
            const provider = this.value;
            console.log('Evaluation provider changed to:', provider);

            // Show/hide endpoint URL field
            if (evaluationEndpointInput && evaluationEndpointInput.parentNode) {
                evaluationEndpointInput.parentNode.style.display =
                    provider === 'openai_endpoint' ? 'block' : 'none';
            }

            // Update model options for new provider
            refreshEvaluationModels();

            // Save provider setting
            saveEvaluationSetting('benchmark.evaluation.provider', provider);
        });
    }

    // Model input change handler
    if (evaluationModelInput) {
        evaluationModelInput.addEventListener('change', function() {
            saveEvaluationSetting('benchmark.evaluation.model', this.value);
        });
    }

    // Endpoint URL change handler
    if (evaluationEndpointInput) {
        evaluationEndpointInput.addEventListener('change', function() {
            saveEvaluationSetting('benchmark.evaluation.endpoint_url', this.value);
        });
    }

    // Temperature change handler
    if (evaluationTemperatureInput) {
        evaluationTemperatureInput.addEventListener('change', function() {
            const temperature = parseFloat(this.value);

            // An empty or non-numeric field parses to NaN, which
            // JSON.stringify would send as null; keep the stored value
            // instead of overwriting it with null.
            if (!Number.isFinite(temperature)) {
                console.warn('Ignoring invalid evaluation temperature:', this.value);
                return;
            }

            saveEvaluationSetting('benchmark.evaluation.temperature', temperature);
        });
    }
}
|
2554
|
+
|
2555
|
+
/**
 * Copy the already-fetched evaluation settings (provider, model, endpoint
 * URL, temperature) into their form controls.
 *
 * Bails out when any of the four settings payloads is still missing. The
 * model value is applied immediately if models are cached, otherwise after
 * a one-second delay.
 */
function loadEvaluationSettings() {
    console.log('Loading evaluation settings...');
    console.log('Current DOM elements state:', {
        provider: !!evaluationProviderSelect,
        model: !!evaluationModelInput,
        endpoint: !!evaluationEndpointInput,
        temperature: !!evaluationTemperatureInput
    });

    // All four payloads must be present before anything is applied.
    const allLoaded = evalProviderData && evalModelData && evalEndpointData && evalTempData;
    if (!allLoaded) {
        console.log('Evaluation settings not loaded yet, skipping...');
        return;
    }

    // Provider, plus visibility of the endpoint field that depends on it.
    if (evaluationProviderSelect && evalProviderData.settings) {
        const providerValue = evalProviderData.settings.value || 'openai_endpoint';
        console.log('Setting evaluation provider to:', providerValue);
        evaluationProviderSelect.value = providerValue;

        if (evaluationEndpointInput && evaluationEndpointInput.parentNode) {
            const showEndpoint = providerValue === 'openai_endpoint';
            evaluationEndpointInput.parentNode.style.display = showEndpoint ? 'block' : 'none';
        }
    }

    // Model
    if (evaluationModelInput && evalModelData.settings) {
        const modelValue = evalModelData.settings.value || 'anthropic/claude-3.7-sonnet';
        console.log('Setting evaluation model to:', modelValue);

        // Applies the stored model to the visible input, the hidden input
        // and the dropdown instance.
        function applyModelValue() {
            let provider = 'openai_endpoint';
            if (evaluationProviderSelect) {
                provider = evaluationProviderSelect.value;
            }

            const cache = window.evaluationModels;
            const models = (cache && cache[provider]) ? cache[provider] : [];

            // Show the label when the stored value is a known model,
            // otherwise show the raw value.
            const matchingModel = models.find(candidate => candidate.value === modelValue);
            if (!matchingModel) {
                evaluationModelInput.value = modelValue;
                console.log('No matching model found, setting raw value:', modelValue);
            } else {
                evaluationModelInput.value = matchingModel.label;
                console.log('Found matching model, setting label:', matchingModel.label);
            }

            // Update hidden input
            const hiddenInput = document.getElementById('evaluation_model_hidden');
            if (hiddenInput) {
                hiddenInput.value = modelValue;
            }

            // Use the dropdown instance's setValue method if available
            const dropdown = window.evaluationDropdownInstance;
            if (dropdown && dropdown.setValue) {
                dropdown.setValue(modelValue, false);
            }
        }

        const modelsReady = window.evaluationModels && Object.keys(window.evaluationModels).length > 0;
        if (modelsReady) {
            // Models are already loaded, set immediately
            applyModelValue();
        } else {
            // Otherwise wait for models to load
            setTimeout(applyModelValue, 1000);
        }
    }

    // Endpoint URL
    if (evaluationEndpointInput && evalEndpointData.settings) {
        const endpointValue = evalEndpointData.settings.value || 'https://openrouter.ai/api/v1';
        console.log('Setting evaluation endpoint to:', endpointValue);
        evaluationEndpointInput.value = endpointValue;
    }

    // Temperature
    if (evaluationTemperatureInput && evalTempData.settings) {
        const tempValue = evalTempData.settings.value || 0;
        console.log('Setting evaluation temperature to:', tempValue);
        evaluationTemperatureInput.value = tempValue;
    }
}
|
2642
|
+
|
2643
|
+
/**
 * Reset the evaluation form controls to their built-in defaults:
 * provider `openai_endpoint` (with the endpoint field shown), the default
 * model, the default endpoint URL and temperature 0. Missing controls are
 * skipped.
 */
function setEvaluationDefaults() {
    console.log('Setting evaluation defaults');

    if (evaluationProviderSelect) {
        evaluationProviderSelect.value = 'openai_endpoint';

        // The default provider uses a custom endpoint, so make sure the
        // endpoint field is visible.
        const endpointWrapper = evaluationEndpointInput && evaluationEndpointInput.parentNode;
        if (endpointWrapper) {
            endpointWrapper.style.display = 'block';
        }
    }

    if (evaluationModelInput) {
        evaluationModelInput.value = 'anthropic/claude-3.7-sonnet';
    }
    if (evaluationEndpointInput) {
        evaluationEndpointInput.value = 'https://openrouter.ai/api/v1';
    }
    if (evaluationTemperatureInput) {
        evaluationTemperatureInput.value = 0;
    }
}
|
2656
|
+
|
2657
|
+
/**
 * Persist one evaluation setting by sending a PUT request with a JSON body
 * of the form `{ value }` to `/settings/api/<key>`, including the CSRF token
 * from the page's meta tag (empty string when the tag is absent).
 * The outcome is only logged to the console.
 *
 * @param {string} key - Setting key, used as the last URL segment.
 * @param {*} value - New value for the setting.
 */
function saveEvaluationSetting(key, value) {
    console.log('Saving evaluation setting:', key, '=', value);

    // Get CSRF token
    const csrfMeta = document.querySelector('meta[name="csrf-token"]');
    const csrfToken = (csrfMeta ? csrfMeta.getAttribute('content') : undefined) || '';

    const requestOptions = {
        method: 'PUT',
        headers: {
            'Content-Type': 'application/json',
            'X-CSRFToken': csrfToken
        },
        body: JSON.stringify({ value: value })
    };

    fetch(`/settings/api/${key}`, requestOptions)
        .then(function (response) {
            return response.json();
        })
        .then(function (data) {
            if (!data.success) {
                console.error('Failed to save evaluation setting:', data.error);
                return;
            }
            console.log('Successfully saved evaluation setting:', key);
        })
        .catch(function (error) {
            console.error('Error saving evaluation setting:', error);
        });
}
|
2683
|
+
|
2684
|
+
|
2685
|
+
</script>
|
2686
|
+
{% endblock %}
|
2687
|
+
|
2688
|
+
{% block page_scripts %}
|
2689
|
+
<!-- Load required services for progress tracking -->
|
2690
|
+
<script src="{{ url_for('research.serve_static', path='js/services/audio.js') }}"></script>
|
2691
|
+
<script src="{{ url_for('research.serve_static', path='js/services/ui.js') }}"></script>
|
2692
|
+
<script src="{{ url_for('research.serve_static', path='js/services/formatting.js') }}"></script>
|
2693
|
+
<script src="{{ url_for('research.serve_static', path='js/services/api.js') }}"></script>
|
2694
|
+
<script src="{{ url_for('research.serve_static', path='js/services/socket.js') }}"></script>
|
2695
|
+
<!-- Load custom dropdown component for evaluation model selection -->
|
2696
|
+
<script src="{{ url_for('research.serve_static', path='js/components/custom_dropdown.js') }}"></script>
|
2697
|
+
{% endblock %}
|