local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__version__.py +1 -1
- local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
- local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
- local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
- local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
- local_deep_research/api/__init__.py +2 -0
- local_deep_research/api/research_functions.py +177 -3
- local_deep_research/benchmarks/graders.py +150 -5
- local_deep_research/benchmarks/models/__init__.py +19 -0
- local_deep_research/benchmarks/models/benchmark_models.py +283 -0
- local_deep_research/benchmarks/ui/__init__.py +1 -0
- local_deep_research/benchmarks/web_api/__init__.py +6 -0
- local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
- local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
- local_deep_research/config/llm_config.py +106 -21
- local_deep_research/defaults/default_settings.json +447 -2
- local_deep_research/error_handling/report_generator.py +10 -0
- local_deep_research/llm/__init__.py +19 -0
- local_deep_research/llm/llm_registry.py +155 -0
- local_deep_research/metrics/db_models.py +3 -7
- local_deep_research/metrics/search_tracker.py +25 -11
- local_deep_research/search_system.py +12 -9
- local_deep_research/utilities/log_utils.py +23 -10
- local_deep_research/utilities/thread_context.py +99 -0
- local_deep_research/web/app_factory.py +32 -8
- local_deep_research/web/database/benchmark_schema.py +230 -0
- local_deep_research/web/database/convert_research_id_to_string.py +161 -0
- local_deep_research/web/database/models.py +55 -1
- local_deep_research/web/database/schema_upgrade.py +397 -2
- local_deep_research/web/database/uuid_migration.py +265 -0
- local_deep_research/web/routes/api_routes.py +62 -31
- local_deep_research/web/routes/history_routes.py +13 -6
- local_deep_research/web/routes/metrics_routes.py +264 -4
- local_deep_research/web/routes/research_routes.py +45 -18
- local_deep_research/web/routes/route_registry.py +352 -0
- local_deep_research/web/routes/settings_routes.py +382 -22
- local_deep_research/web/services/research_service.py +22 -29
- local_deep_research/web/services/settings_manager.py +53 -0
- local_deep_research/web/services/settings_service.py +2 -0
- local_deep_research/web/static/css/styles.css +8 -0
- local_deep_research/web/static/js/components/detail.js +7 -14
- local_deep_research/web/static/js/components/details.js +8 -10
- local_deep_research/web/static/js/components/fallback/ui.js +4 -4
- local_deep_research/web/static/js/components/history.js +6 -6
- local_deep_research/web/static/js/components/logpanel.js +14 -11
- local_deep_research/web/static/js/components/progress.js +51 -46
- local_deep_research/web/static/js/components/research.js +250 -89
- local_deep_research/web/static/js/components/results.js +5 -7
- local_deep_research/web/static/js/components/settings.js +32 -26
- local_deep_research/web/static/js/components/settings_sync.js +24 -23
- local_deep_research/web/static/js/config/urls.js +285 -0
- local_deep_research/web/static/js/main.js +8 -8
- local_deep_research/web/static/js/research_form.js +267 -12
- local_deep_research/web/static/js/services/api.js +18 -18
- local_deep_research/web/static/js/services/keyboard.js +8 -8
- local_deep_research/web/static/js/services/socket.js +53 -35
- local_deep_research/web/static/js/services/ui.js +1 -1
- local_deep_research/web/templates/base.html +4 -1
- local_deep_research/web/templates/components/custom_dropdown.html +5 -3
- local_deep_research/web/templates/components/mobile_nav.html +3 -3
- local_deep_research/web/templates/components/sidebar.html +9 -3
- local_deep_research/web/templates/pages/benchmark.html +2697 -0
- local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
- local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
- local_deep_research/web/templates/pages/cost_analytics.html +1 -1
- local_deep_research/web/templates/pages/metrics.html +212 -39
- local_deep_research/web/templates/pages/research.html +8 -6
- local_deep_research/web/templates/pages/star_reviews.html +1 -1
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
- local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
- local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
- local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
- local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
- local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
- local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
- local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
- local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
- local_deep_research/web_search_engines/retriever_registry.py +108 -0
- local_deep_research/web_search_engines/search_engine_base.py +161 -43
- local_deep_research/web_search_engines/search_engine_factory.py +14 -0
- local_deep_research/web_search_engines/search_engines_config.py +20 -0
- local_deep_research-0.6.0.dist-info/METADATA +374 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +89 -64
- local_deep_research-0.5.9.dist-info/METADATA +0 -420
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
- {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1136 @@
|
|
1
|
+
{% extends "base.html" %}
|
2
|
+
|
3
|
+
{% set active_page = 'benchmark-results' %}
|
4
|
+
|
5
|
+
{% block title %}Benchmark Results History - Deep Research System{% endblock %}
|
6
|
+
|
7
|
+
{% block extra_head %}
|
8
|
+
<meta name="csrf-token" content="{{ csrf_token() }}">
|
9
|
+
<style>
|
10
|
+
.benchmark-results-card {
|
11
|
+
width: 100%;
|
12
|
+
margin: 0;
|
13
|
+
padding: 0;
|
14
|
+
background: transparent;
|
15
|
+
border: none;
|
16
|
+
box-shadow: none;
|
17
|
+
}
|
18
|
+
|
19
|
+
.card-content {
|
20
|
+
padding: 0;
|
21
|
+
}
|
22
|
+
|
23
|
+
.run-card {
|
24
|
+
background: #1a1a1a;
|
25
|
+
border: 1px solid #333;
|
26
|
+
border-radius: 8px;
|
27
|
+
padding: 20px;
|
28
|
+
margin-bottom: 20px;
|
29
|
+
cursor: pointer;
|
30
|
+
transition: border-color 0.2s, background-color 0.2s;
|
31
|
+
}
|
32
|
+
|
33
|
+
.run-card:hover {
|
34
|
+
border-color: var(--primary-color);
|
35
|
+
background: #1e1e1e;
|
36
|
+
}
|
37
|
+
|
38
|
+
.run-card.expanded {
|
39
|
+
border-color: var(--primary-color);
|
40
|
+
}
|
41
|
+
|
42
|
+
.run-header {
|
43
|
+
display: flex;
|
44
|
+
justify-content: space-between;
|
45
|
+
align-items: center;
|
46
|
+
margin-bottom: 10px;
|
47
|
+
}
|
48
|
+
|
49
|
+
.run-title {
|
50
|
+
font-size: 1.2rem;
|
51
|
+
font-weight: bold;
|
52
|
+
color: #e0e0e0;
|
53
|
+
}
|
54
|
+
|
55
|
+
.run-date {
|
56
|
+
color: #a0a0a0;
|
57
|
+
font-size: 0.9rem;
|
58
|
+
}
|
59
|
+
|
60
|
+
.run-summary {
|
61
|
+
display: grid;
|
62
|
+
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
|
63
|
+
gap: 12px;
|
64
|
+
margin-bottom: 15px;
|
65
|
+
}
|
66
|
+
|
67
|
+
/* Responsive grid adjustments for more columns on wider screens */
|
68
|
+
@media (min-width: 768px) {
|
69
|
+
.run-summary {
|
70
|
+
grid-template-columns: repeat(auto-fit, minmax(130px, 1fr));
|
71
|
+
gap: 15px;
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
@media (min-width: 1200px) {
|
76
|
+
.run-summary {
|
77
|
+
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
78
|
+
gap: 16px;
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
@media (min-width: 1600px) {
|
83
|
+
.run-summary {
|
84
|
+
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
85
|
+
gap: 18px;
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
@media (min-width: 1920px) {
|
90
|
+
.run-summary {
|
91
|
+
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
|
92
|
+
gap: 20px;
|
93
|
+
}
|
94
|
+
}
|
95
|
+
|
96
|
+
.summary-item {
|
97
|
+
text-align: center;
|
98
|
+
padding: 10px;
|
99
|
+
background: #2a2a2a;
|
100
|
+
border-radius: 6px;
|
101
|
+
}
|
102
|
+
|
103
|
+
.summary-value {
|
104
|
+
font-size: 1.4rem;
|
105
|
+
font-weight: bold;
|
106
|
+
color: var(--primary-color);
|
107
|
+
}
|
108
|
+
|
109
|
+
.summary-label {
|
110
|
+
font-size: 0.85rem;
|
111
|
+
color: #a0a0a0;
|
112
|
+
margin-top: 5px;
|
113
|
+
}
|
114
|
+
|
115
|
+
.accuracy-indicator {
|
116
|
+
display: inline-block;
|
117
|
+
padding: 4px 8px;
|
118
|
+
border-radius: 12px;
|
119
|
+
font-size: 0.85rem;
|
120
|
+
font-weight: bold;
|
121
|
+
}
|
122
|
+
|
123
|
+
.accuracy-high {
|
124
|
+
background: #1e3a1e;
|
125
|
+
color: #4caf50;
|
126
|
+
}
|
127
|
+
|
128
|
+
.accuracy-medium {
|
129
|
+
background: #3a2a1e;
|
130
|
+
color: #ff9800;
|
131
|
+
}
|
132
|
+
|
133
|
+
.accuracy-low {
|
134
|
+
background: #3a1e1e;
|
135
|
+
color: #f44336;
|
136
|
+
}
|
137
|
+
|
138
|
+
.status-indicator {
|
139
|
+
display: inline-block;
|
140
|
+
padding: 4px 8px;
|
141
|
+
border-radius: 12px;
|
142
|
+
font-size: 0.85rem;
|
143
|
+
font-weight: bold;
|
144
|
+
}
|
145
|
+
|
146
|
+
.status-completed {
|
147
|
+
background: #1e3a1e;
|
148
|
+
color: #4caf50;
|
149
|
+
}
|
150
|
+
|
151
|
+
.status-in-progress {
|
152
|
+
background: #1e2a3a;
|
153
|
+
color: #2196f3;
|
154
|
+
}
|
155
|
+
|
156
|
+
.status-failed {
|
157
|
+
background: #3a1e1e;
|
158
|
+
color: #f44336;
|
159
|
+
}
|
160
|
+
|
161
|
+
.status-cancelled {
|
162
|
+
background: #2a2a2a;
|
163
|
+
color: #999;
|
164
|
+
}
|
165
|
+
|
166
|
+
.run-config {
|
167
|
+
display: grid;
|
168
|
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
169
|
+
gap: 10px;
|
170
|
+
margin-bottom: 15px;
|
171
|
+
padding: 15px;
|
172
|
+
background: #242424;
|
173
|
+
border-radius: 6px;
|
174
|
+
}
|
175
|
+
|
176
|
+
.config-item {
|
177
|
+
display: flex;
|
178
|
+
justify-content: space-between;
|
179
|
+
padding: 5px 0;
|
180
|
+
border-bottom: 1px solid #333;
|
181
|
+
}
|
182
|
+
|
183
|
+
.config-label {
|
184
|
+
color: #a0a0a0;
|
185
|
+
font-size: 0.9rem;
|
186
|
+
}
|
187
|
+
|
188
|
+
.config-value {
|
189
|
+
color: #e0e0e0;
|
190
|
+
font-weight: 500;
|
191
|
+
}
|
192
|
+
|
193
|
+
.results-section {
|
194
|
+
margin-top: 20px;
|
195
|
+
display: none;
|
196
|
+
}
|
197
|
+
|
198
|
+
.results-section.visible {
|
199
|
+
display: block;
|
200
|
+
}
|
201
|
+
|
202
|
+
.examples-grid {
|
203
|
+
display: grid;
|
204
|
+
gap: 15px;
|
205
|
+
margin-top: 15px;
|
206
|
+
}
|
207
|
+
|
208
|
+
.example-card {
|
209
|
+
background: #1a1a1a;
|
210
|
+
border: 1px solid #333;
|
211
|
+
border-radius: 6px;
|
212
|
+
padding: 15px;
|
213
|
+
}
|
214
|
+
|
215
|
+
.example-card.correct {
|
216
|
+
border-left: 4px solid #4caf50;
|
217
|
+
}
|
218
|
+
|
219
|
+
.example-card.incorrect {
|
220
|
+
border-left: 4px solid #f44336;
|
221
|
+
}
|
222
|
+
|
223
|
+
.example-header {
|
224
|
+
display: flex;
|
225
|
+
justify-content: space-between;
|
226
|
+
align-items: center;
|
227
|
+
margin-bottom: 10px;
|
228
|
+
}
|
229
|
+
|
230
|
+
.example-status {
|
231
|
+
display: flex;
|
232
|
+
align-items: center;
|
233
|
+
gap: 5px;
|
234
|
+
font-weight: 600;
|
235
|
+
}
|
236
|
+
|
237
|
+
.example-status.correct {
|
238
|
+
color: #4caf50;
|
239
|
+
}
|
240
|
+
|
241
|
+
.example-status.incorrect {
|
242
|
+
color: #f44336;
|
243
|
+
}
|
244
|
+
|
245
|
+
.example-question {
|
246
|
+
background: #2a2a2a;
|
247
|
+
padding: 12px;
|
248
|
+
border-radius: 4px;
|
249
|
+
border-left: 4px solid var(--primary-color);
|
250
|
+
margin-bottom: 12px;
|
251
|
+
color: #e0e0e0;
|
252
|
+
}
|
253
|
+
|
254
|
+
.example-answers {
|
255
|
+
display: grid;
|
256
|
+
grid-template-columns: 1fr 1fr;
|
257
|
+
gap: 12px;
|
258
|
+
}
|
259
|
+
|
260
|
+
.answer-section {
|
261
|
+
padding: 10px;
|
262
|
+
border-radius: 4px;
|
263
|
+
font-size: 0.9rem;
|
264
|
+
line-height: 1.4;
|
265
|
+
}
|
266
|
+
|
267
|
+
.model-answer-section {
|
268
|
+
background: #1e2a3a;
|
269
|
+
border-left: 3px solid #2196f3;
|
270
|
+
}
|
271
|
+
|
272
|
+
.correct-answer-section {
|
273
|
+
background: #1e3a1e;
|
274
|
+
border-left: 3px solid #4caf50;
|
275
|
+
}
|
276
|
+
|
277
|
+
.answer-label {
|
278
|
+
font-size: 0.75rem;
|
279
|
+
font-weight: 600;
|
280
|
+
color: #a0a0a0;
|
281
|
+
text-transform: uppercase;
|
282
|
+
margin-bottom: 6px;
|
283
|
+
}
|
284
|
+
|
285
|
+
.answer-text {
|
286
|
+
color: #e0e0e0;
|
287
|
+
}
|
288
|
+
|
289
|
+
.no-results {
|
290
|
+
text-align: center;
|
291
|
+
color: #a0a0a0;
|
292
|
+
padding: 40px;
|
293
|
+
font-style: italic;
|
294
|
+
}
|
295
|
+
|
296
|
+
.expand-indicator {
|
297
|
+
color: #a0a0a0;
|
298
|
+
font-size: 0.9rem;
|
299
|
+
margin-top: 10px;
|
300
|
+
text-align: center;
|
301
|
+
}
|
302
|
+
|
303
|
+
.dataset-breakdown {
|
304
|
+
display: grid;
|
305
|
+
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
|
306
|
+
gap: 10px;
|
307
|
+
margin-top: 10px;
|
308
|
+
}
|
309
|
+
|
310
|
+
.dataset-item {
|
311
|
+
text-align: center;
|
312
|
+
padding: 8px;
|
313
|
+
background: #2a2a2a;
|
314
|
+
border-radius: 4px;
|
315
|
+
}
|
316
|
+
|
317
|
+
.dataset-name {
|
318
|
+
font-size: 0.8rem;
|
319
|
+
color: #a0a0a0;
|
320
|
+
margin-bottom: 4px;
|
321
|
+
}
|
322
|
+
|
323
|
+
.dataset-accuracy {
|
324
|
+
font-weight: bold;
|
325
|
+
color: var(--primary-color);
|
326
|
+
}
|
327
|
+
|
328
|
+
.loading {
|
329
|
+
text-align: center;
|
330
|
+
padding: 40px;
|
331
|
+
color: #a0a0a0;
|
332
|
+
}
|
333
|
+
|
334
|
+
.pagination {
|
335
|
+
display: flex;
|
336
|
+
justify-content: center;
|
337
|
+
align-items: center;
|
338
|
+
gap: 10px;
|
339
|
+
margin-top: 30px;
|
340
|
+
}
|
341
|
+
|
342
|
+
.pagination button {
|
343
|
+
padding: 8px 12px;
|
344
|
+
background: #2a2a2a;
|
345
|
+
border: 1px solid #333;
|
346
|
+
border-radius: 4px;
|
347
|
+
color: #e0e0e0;
|
348
|
+
cursor: pointer;
|
349
|
+
}
|
350
|
+
|
351
|
+
.pagination button:hover {
|
352
|
+
background: var(--primary-color);
|
353
|
+
}
|
354
|
+
|
355
|
+
.pagination button:disabled {
|
356
|
+
opacity: 0.5;
|
357
|
+
cursor: not-allowed;
|
358
|
+
}
|
359
|
+
|
360
|
+
.delete-btn {
|
361
|
+
background: #3a1e1e !important;
|
362
|
+
border-color: #f44336 !important;
|
363
|
+
color: #f44336 !important;
|
364
|
+
font-size: 0.8rem;
|
365
|
+
padding: 4px 8px;
|
366
|
+
transition: all 0.2s;
|
367
|
+
}
|
368
|
+
|
369
|
+
.delete-btn:hover:not(:disabled) {
|
370
|
+
background: #f44336 !important;
|
371
|
+
color: white !important;
|
372
|
+
}
|
373
|
+
|
374
|
+
.delete-btn:disabled {
|
375
|
+
background: #2a2a2a !important;
|
376
|
+
border-color: #555 !important;
|
377
|
+
color: #888 !important;
|
378
|
+
cursor: not-allowed;
|
379
|
+
}
|
380
|
+
|
381
|
+
.filters {
|
382
|
+
display: grid;
|
383
|
+
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
384
|
+
gap: 15px;
|
385
|
+
margin-bottom: 20px;
|
386
|
+
}
|
387
|
+
|
388
|
+
/* Responsive filter adjustments for full-width layout */
|
389
|
+
@media (max-width: 767px) {
|
390
|
+
.filters {
|
391
|
+
grid-template-columns: 1fr 1fr;
|
392
|
+
gap: 12px;
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
@media (min-width: 768px) {
|
397
|
+
.filters {
|
398
|
+
grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
|
399
|
+
gap: 16px;
|
400
|
+
}
|
401
|
+
}
|
402
|
+
|
403
|
+
@media (min-width: 1200px) {
|
404
|
+
.filters {
|
405
|
+
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
|
406
|
+
gap: 18px;
|
407
|
+
}
|
408
|
+
}
|
409
|
+
|
410
|
+
@media (min-width: 1600px) {
|
411
|
+
.filters {
|
412
|
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
413
|
+
gap: 20px;
|
414
|
+
}
|
415
|
+
}
|
416
|
+
|
417
|
+
.processing-time {
|
418
|
+
background: #2a3f2a;
|
419
|
+
color: #90ee90;
|
420
|
+
padding: 2px 6px;
|
421
|
+
border-radius: 3px;
|
422
|
+
font-size: 0.8rem;
|
423
|
+
font-weight: 500;
|
424
|
+
white-space: nowrap;
|
425
|
+
}
|
426
|
+
|
427
|
+
.filter-group {
|
428
|
+
display: flex;
|
429
|
+
flex-direction: column;
|
430
|
+
gap: 5px;
|
431
|
+
}
|
432
|
+
|
433
|
+
.filter-group label {
|
434
|
+
font-size: 0.85rem;
|
435
|
+
color: #a0a0a0;
|
436
|
+
}
|
437
|
+
|
438
|
+
.filter-group select,
|
439
|
+
.filter-group input {
|
440
|
+
padding: 6px 10px;
|
441
|
+
background: #2a2a2a;
|
442
|
+
border: 1px solid #333;
|
443
|
+
border-radius: 4px;
|
444
|
+
color: #e0e0e0;
|
445
|
+
}
|
446
|
+
|
447
|
+
.search-stats-section {
|
448
|
+
margin-bottom: 20px;
|
449
|
+
padding: 15px;
|
450
|
+
background: #1e1e1e;
|
451
|
+
border: 1px solid #333;
|
452
|
+
border-radius: 6px;
|
453
|
+
}
|
454
|
+
|
455
|
+
.section-title {
|
456
|
+
display: flex;
|
457
|
+
align-items: center;
|
458
|
+
gap: 8px;
|
459
|
+
margin-bottom: 15px;
|
460
|
+
color: #e0e0e0;
|
461
|
+
font-size: 1.1rem;
|
462
|
+
font-weight: 600;
|
463
|
+
}
|
464
|
+
|
465
|
+
.search-stats-grid {
|
466
|
+
display: grid;
|
467
|
+
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
|
468
|
+
gap: 15px;
|
469
|
+
}
|
470
|
+
|
471
|
+
.stat-item {
|
472
|
+
text-align: center;
|
473
|
+
padding: 12px;
|
474
|
+
background: #2a2a2a;
|
475
|
+
border-radius: 6px;
|
476
|
+
}
|
477
|
+
|
478
|
+
.stat-value {
|
479
|
+
font-size: 1.4rem;
|
480
|
+
font-weight: bold;
|
481
|
+
color: var(--primary-color);
|
482
|
+
margin-bottom: 5px;
|
483
|
+
}
|
484
|
+
|
485
|
+
.stat-label {
|
486
|
+
font-size: 0.85rem;
|
487
|
+
color: #a0a0a0;
|
488
|
+
}
|
489
|
+
|
490
|
+
.results-divider {
|
491
|
+
height: 1px;
|
492
|
+
background: #333;
|
493
|
+
margin: 20px 0;
|
494
|
+
}
|
495
|
+
|
496
|
+
.examples-section {
|
497
|
+
margin-top: 15px;
|
498
|
+
}
|
499
|
+
|
500
|
+
.example-metrics {
|
501
|
+
display: flex;
|
502
|
+
gap: 10px;
|
503
|
+
align-items: center;
|
504
|
+
}
|
505
|
+
|
506
|
+
.search-results-count {
|
507
|
+
background: #2a3a3a;
|
508
|
+
color: #81c784;
|
509
|
+
padding: 2px 6px;
|
510
|
+
border-radius: 3px;
|
511
|
+
font-size: 0.8rem;
|
512
|
+
font-weight: 500;
|
513
|
+
white-space: nowrap;
|
514
|
+
}
|
515
|
+
</style>
|
516
|
+
{% endblock %}
|
517
|
+
|
518
|
+
{% block content %}
|
519
|
+
<div class="page active" id="benchmark-results">
|
520
|
+
<div class="page-header">
|
521
|
+
<h1>Benchmark Results History</h1>
|
522
|
+
<p class="page-subtitle">Compare accuracy across different models, search engines, and strategies</p>
|
523
|
+
</div>
|
524
|
+
|
525
|
+
<div class="card benchmark-results-card">
|
526
|
+
<div class="card-content">
|
527
|
+
<!-- Filters -->
|
528
|
+
<div class="filters">
|
529
|
+
<div class="filter-group">
|
530
|
+
<label for="accuracy-filter">Accuracy Range</label>
|
531
|
+
<select id="accuracy-filter">
|
532
|
+
<option value="">All</option>
|
533
|
+
<option value="high">90%+ (High)</option>
|
534
|
+
<option value="medium">70-90% (Medium)</option>
|
535
|
+
<option value="low">&lt;70% (Low)</option>
|
536
|
+
</select>
|
537
|
+
</div>
|
538
|
+
<div class="filter-group">
|
539
|
+
<label for="model-filter">Model</label>
|
540
|
+
<select id="model-filter">
|
541
|
+
<option value="">All Models</option>
|
542
|
+
</select>
|
543
|
+
</div>
|
544
|
+
<div class="filter-group">
|
545
|
+
<label for="strategy-filter">Strategy</label>
|
546
|
+
<select id="strategy-filter">
|
547
|
+
<option value="">All Strategies</option>
|
548
|
+
</select>
|
549
|
+
</div>
|
550
|
+
<div class="filter-group">
|
551
|
+
<label for="status-filter">Status</label>
|
552
|
+
<select id="status-filter">
|
553
|
+
<option value="">All Statuses</option>
|
554
|
+
<option value="completed">Completed</option>
|
555
|
+
<option value="in_progress">In Progress</option>
|
556
|
+
<option value="failed">Failed</option>
|
557
|
+
<option value="cancelled">Cancelled</option>
|
558
|
+
</select>
|
559
|
+
</div>
|
560
|
+
<div class="filter-group">
|
561
|
+
<label for="date-filter">Date Range</label>
|
562
|
+
<input type="date" id="date-from">
|
563
|
+
<input type="date" id="date-to">
|
564
|
+
</div>
|
565
|
+
</div>
|
566
|
+
|
567
|
+
<!-- Results List -->
|
568
|
+
<div id="results-container">
|
569
|
+
<div class="loading">
|
570
|
+
<i class="fas fa-spinner fa-spin"></i> Loading benchmark results...
|
571
|
+
</div>
|
572
|
+
</div>
|
573
|
+
|
574
|
+
<!-- Pagination -->
|
575
|
+
<div class="pagination" id="pagination" style="display: none;">
|
576
|
+
<button id="prev-page">← Previous</button>
|
577
|
+
<span id="page-info">Page 1 of 1</span>
|
578
|
+
<button id="next-page">Next →</button>
|
579
|
+
</div>
|
580
|
+
</div>
|
581
|
+
</div>
|
582
|
+
</div>
|
583
|
+
|
584
|
+
<script>
|
585
|
+
// Page-level state shared by the functions in this script.
let benchmarkRuns = [];      // all runs as returned by the history endpoint
let filteredRuns = [];       // runs that pass the currently selected filters
let currentPage = 1;         // 1-based index of the page being shown
const itemsPerPage = 20;     // number of run cards rendered per page

// Once the DOM is ready, start loading the history and hook up the filters.
document.addEventListener('DOMContentLoaded', () => {
    loadBenchmarkHistory();
    setupFilters();
});
|
594
|
+
|
595
|
+
/**
 * Fetch the benchmark run history and render it.
 *
 * On a successful payload the runs are stored in `benchmarkRuns`, copied into
 * `filteredRuns`, and the filter dropdowns and result list are (re)built.
 * Any failure — a payload without `success`, a network error, or a body that
 * is not JSON — replaces the results container with an error message.
 */
async function loadBenchmarkHistory() {
    // Single place that renders the "could not load" message.
    const showLoadError = () => {
        document.getElementById('results-container').innerHTML =
            '<div class="no-results">Error loading benchmark results</div>';
    };

    try {
        const reply = await fetch('/benchmark/api/history');
        const payload = await reply.json();

        if (!payload.success) {
            showLoadError();
            return;
        }

        benchmarkRuns = payload.runs;
        filteredRuns = [...benchmarkRuns];
        populateFilters();
        displayResults();
    } catch (err) {
        console.error('Error loading benchmark history:', err);
        showLoadError();
    }
}
|
615
|
+
|
616
|
+
/**
 * Fill the model and strategy dropdowns with the distinct values found in
 * `benchmarkRuns`. Options are appended after whatever the <select> already
 * contains, in order of first appearance.
 */
function populateFilters() {
    // Distinct truthy values of search_config[key], in first-seen order.
    const distinctConfigValues = (key) => {
        const seen = new Set();
        for (const run of benchmarkRuns) {
            const value = run.search_config?.[key];
            if (value) {
                seen.add(value);
            }
        }
        return Array.from(seen);
    };

    // Append one <option> to the given <select>.
    const addOption = (select, value, label) => {
        const opt = document.createElement('option');
        opt.value = value;
        opt.textContent = label;
        select.appendChild(opt);
    };

    // Models are shown verbatim.
    const modelNames = distinctConfigValues('model_name');
    const modelSelect = document.getElementById('model-filter');
    for (const name of modelNames) {
        addOption(modelSelect, name, name);
    }

    // Strategies are shown in Title Case with underscores turned into spaces.
    const strategyNames = distinctConfigValues('search_strategy');
    const strategySelect = document.getElementById('strategy-filter');
    for (const name of strategyNames) {
        const label = name.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
        addOption(strategySelect, name, label);
    }
}
|
639
|
+
|
640
|
+
/**
 * Re-run `applyFilters` whenever any of the filter controls changes.
 */
function setupFilters() {
    const filterControlIds = [
        'accuracy-filter',
        'model-filter',
        'strategy-filter',
        'status-filter',
        'date-from',
        'date-to',
    ];

    for (const controlId of filterControlIds) {
        document.getElementById(controlId).addEventListener('change', applyFilters);
    }
}
|
648
|
+
|
649
|
+
/**
 * Recompute `filteredRuns` from `benchmarkRuns` using the current values of
 * the filter controls, reset to the first page, and re-render the list.
 *
 * Date filtering compares against the run's *local* calendar date so that it
 * agrees with the locally formatted date shown on each run card. A run whose
 * `created_at` cannot be parsed is only excluded when a date filter is
 * active; it no longer aborts filtering with a RangeError.
 */
function applyFilters() {
    const accuracyFilter = document.getElementById('accuracy-filter').value;
    const modelFilter = document.getElementById('model-filter').value;
    const strategyFilter = document.getElementById('strategy-filter').value;
    const statusFilter = document.getElementById('status-filter').value;
    const dateFrom = document.getElementById('date-from').value;
    const dateTo = document.getElementById('date-to').value;

    // Local calendar date as 'YYYY-MM-DD' (same shape as <input type="date">
    // values, so plain string comparison orders correctly), or null when the
    // timestamp is missing or unparseable.
    const toLocalDateKey = (timestamp) => {
        const parsed = new Date(timestamp);
        if (Number.isNaN(parsed.getTime())) {
            return null;
        }
        const pad = (n) => String(n).padStart(2, '0');
        return `${parsed.getFullYear()}-${pad(parsed.getMonth() + 1)}-${pad(parsed.getDate())}`;
    };

    filteredRuns = benchmarkRuns.filter(run => {
        // Accuracy filter (a missing accuracy is treated as 0)
        if (accuracyFilter) {
            const accuracy = run.overall_accuracy || 0;
            if (accuracyFilter === 'high' && accuracy < 90) return false;
            if (accuracyFilter === 'medium' && (accuracy < 70 || accuracy >= 90)) return false;
            if (accuracyFilter === 'low' && accuracy >= 70) return false;
        }

        // Model filter
        if (modelFilter && run.search_config?.model_name !== modelFilter) return false;

        // Strategy filter
        if (strategyFilter && run.search_config?.search_strategy !== strategyFilter) return false;

        // Status filter
        if (statusFilter && run.status !== statusFilter) return false;

        // Date filters: only parse the timestamp when a bound is actually set.
        if (dateFrom || dateTo) {
            const runDate = toLocalDateKey(run.created_at);
            if (runDate === null) return false;
            if (dateFrom && runDate < dateFrom) return false;
            if (dateTo && runDate > dateTo) return false;
        }

        return true;
    });

    currentPage = 1;
    displayResults();
}
|
686
|
+
|
687
|
+
/**
 * Render the current page of `filteredRuns` into the results container and
 * refresh the pagination controls. When nothing matches, show a placeholder
 * message and hide the pagination bar instead.
 */
function displayResults() {
    const resultsHost = document.getElementById('results-container');

    if (filteredRuns.length === 0) {
        resultsHost.innerHTML = '<div class="no-results">No benchmark results found</div>';
        document.getElementById('pagination').style.display = 'none';
        return;
    }

    // slice() clamps the end index to the array length on its own.
    const firstIndex = (currentPage - 1) * itemsPerPage;
    const runsOnPage = filteredRuns.slice(firstIndex, firstIndex + itemsPerPage);

    let markup = '';
    for (const run of runsOnPage) {
        markup += createRunCard(run);
    }
    resultsHost.innerHTML = markup;

    // Setup pagination
    setupPagination();
}
|
706
|
+
|
707
|
+
/**
 * Build the HTML markup for a single benchmark run card.
 *
 * The returned string is assigned to `innerHTML` by the caller, so every
 * server-supplied value is HTML-escaped before interpolation, and the run id
 * is emitted as an escaped JSON literal inside the inline `onclick` handlers.
 * The output of the `formatAvg*` helpers is interpolated as-is.
 *
 * @param {object} run - One entry of the history payload (reads `id`,
 *   `run_name`, `status`, `created_at`, `overall_accuracy`,
 *   `completed_examples`, `total_examples` and `search_config`).
 * @returns {string} Markup for the card, including its collapsed details.
 */
function createRunCard(run) {
    // Escape a value for use in HTML text or a double-quoted attribute.
    const esc = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const config = run.search_config || {};

    const accuracy = Number(run.overall_accuracy) || 0;
    const accuracyClass = accuracy >= 90 ? 'accuracy-high' :
                          accuracy >= 70 ? 'accuracy-medium' : 'accuracy-low';

    // Status handling: every underscore becomes a hyphen for the CSS class,
    // and anything that is not a safe class-name character is dropped.
    const status = String(run.status || 'unknown');
    const statusClass = `status-${status.replace(/_/g, '-').replace(/[^A-Za-z0-9-]/g, '')}`;
    const formattedStatus = status.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());

    const createdAt = new Date(run.created_at);
    const date = createdAt.toLocaleDateString();
    const time = createdAt.toLocaleTimeString();

    // Progress calculation (missing counts are treated as 0)
    const completedExamples = Number(run.completed_examples) || 0;
    const totalExamples = Number(run.total_examples) || 0;
    const progress = totalExamples > 0 ?
        ((completedExamples / totalExamples) * 100).toFixed(1) : 0;

    // Accuracy is only meaningful for completed runs that produced a score.
    const hasAccuracy = status === 'completed' && accuracy > 0;

    // Run id as a JavaScript literal for the inline handlers, and as plain
    // text for element ids.
    const idLiteral = esc(JSON.stringify(run.id));
    const idText = esc(run.id);

    const strategyLabel = String(config.search_strategy || 'unknown').replace(/_/g, ' ');

    return `
        <div class="run-card" onclick="toggleRunDetails(${idLiteral})">
            <div class="run-header">
                <div class="run-title">${esc(run.run_name)}</div>
                <div class="run-date">${esc(date)} ${esc(time)}</div>
            </div>

            <div class="run-summary">
                <div class="summary-item">
                    <div class="summary-value ${statusClass} status-indicator">${esc(formattedStatus)}</div>
                    <div class="summary-label">Status</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${completedExamples}/${totalExamples}</div>
                    <div class="summary-label">Progress (${progress}%)</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value ${hasAccuracy ? accuracyClass + ' accuracy-indicator' : ''}">${hasAccuracy ? accuracy.toFixed(1) + '%' : 'N/A'}</div>
                    <div class="summary-label">Accuracy</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${esc(config.model_name || 'Unknown')}</div>
                    <div class="summary-label">Model</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${esc(strategyLabel)}</div>
                    <div class="summary-label">Strategy</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${esc(config.search_tool || 'Unknown')}</div>
                    <div class="summary-label">Search Engine</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${formatAvgSearchResults(run)}</div>
                    <div class="summary-label">Avg Search Results</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${formatAvgSearchRequests(run)}</div>
                    <div class="summary-label">Avg Search Requests</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${formatAvgProcessingTime(run)}</div>
                    <div class="summary-label">Avg Time/Question</div>
                </div>
                <div class="summary-item">
                    ${status === 'in_progress' ?
                        `<button class="btn btn-outline btn-sm delete-btn" onclick="event.stopPropagation(); cancelAndDeleteBenchmarkRun(${idLiteral})" style="background: #3a1e1e !important; border-color: #f44336 !important; color: #f44336 !important;">
                            <i class="fas fa-stop"></i> Cancel & Delete
                        </button>` :
                        `<button class="btn btn-outline btn-sm delete-btn" onclick="event.stopPropagation(); deleteBenchmarkRun(${idLiteral})">
                            <i class="fas fa-trash"></i> Delete
                        </button>`
                    }
                    <div class="summary-label">Actions</div>
                </div>
            </div>

            <div class="expand-indicator">
                <i class="fas fa-chevron-down"></i> Click to view detailed results and examples
            </div>

            <div class="results-section" id="results-${idText}">
                <div class="run-config">
                    <div class="config-item">
                        <span class="config-label">Iterations:</span>
                        <span class="config-value">${esc(config.iterations || 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Questions/Iteration:</span>
                        <span class="config-value">${esc(config.questions_per_iteration || 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Temperature:</span>
                        <span class="config-value">${esc(config.temperature ?? 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Provider:</span>
                        <span class="config-value">${esc(config.provider || 'N/A')}</span>
                    </div>
                </div>

                <div id="examples-${idText}">
                    <div class="loading">Loading detailed results...</div>
                </div>
            </div>
        </div>
    `;
}
|
815
|
+
|
816
|
+
/**
 * Expand or collapse the details panel of a run card.
 *
 * When expanding, the run's examples are fetched via `loadExamples` as long
 * as the examples container still shows its "Loading detailed results..."
 * placeholder.
 *
 * @param runId - Id used in the `results-<id>` / `examples-<id>` element ids.
 */
async function toggleRunDetails(runId) {
    const detailsPanel = document.getElementById(`results-${runId}`);
    const card = detailsPanel.closest('.run-card');

    const wasOpen = detailsPanel.classList.contains('visible');
    detailsPanel.classList.toggle('visible', !wasOpen);
    card.classList.toggle('expanded', !wasOpen);

    if (wasOpen) {
        // Panel has just been collapsed; nothing to load.
        return;
    }

    // Load examples if not already loaded
    const examplesHost = document.getElementById(`examples-${runId}`);
    const stillPlaceholder = examplesHost.innerHTML.includes('Loading detailed results...');
    if (stillPlaceholder) {
        await loadExamples(runId);
    }
}
|
835
|
+
|
836
|
+
/**
 * Fetch up to 50 per-question results for a run and render them.
 *
 * Requests `/benchmark/api/results/<runId>?limit=50`, then fills the
 * `examples-<runId>` element with a search-result statistics section
 * followed by one card per result (built by `createExampleCard`). When the
 * response is unsuccessful or empty a "no results" notice is shown; any
 * thrown error is logged and replaced by an error notice.
 *
 * @param {*} runId - Identifier of the benchmark run.
 * @returns {Promise<void>}
 */
async function loadExamples(runId) {
    try {
        const reply = await fetch(`/benchmark/api/results/${runId}?limit=50`);
        const payload = await reply.json();

        const host = document.getElementById(`examples-${runId}`);

        if (!(payload.success && payload.results.length > 0)) {
            host.innerHTML = '<div class="no-results">No detailed results available</div>';
            return;
        }

        // Search-result statistics across the returned examples.
        const counts = payload.results.map(entry => entry.search_result_count || 0);
        const hasCounts = counts.length > 0;

        let total = 0;
        for (const value of counts) {
            total = total + value;
        }

        const avgSearchResults = hasCounts ? (total / counts.length).toFixed(1) : 'N/A';
        const minSearchResults = hasCounts ? Math.min(...counts) : 'N/A';
        const maxSearchResults = hasCounts ? Math.max(...counts) : 'N/A';

        const statTile = (value, label) => `
                    <div class="stat-item">
                        <div class="stat-value">${value}</div>
                        <div class="stat-label">${label}</div>
                    </div>`;

        const statsHtml = `
            <div class="search-stats-section">
                <h4 class="section-title">
                    <i class="fas fa-search"></i> Search Results Statistics
                </h4>
                <div class="search-stats-grid">${statTile(avgSearchResults, 'Avg Search Results')}${statTile(minSearchResults, 'Min Results')}${statTile(maxSearchResults, 'Max Results')}${statTile(payload.results.length, 'Total Queries')}
                </div>
            </div>
        `;

        // One card per returned example.
        const cards = [];
        for (const entry of payload.results) {
            cards.push(createExampleCard(entry));
        }
        const examplesHtml = cards.join('');

        host.innerHTML = `
            ${statsHtml}
            <div class="results-divider"></div>
            <div class="examples-section">
                <h4 class="section-title">
                    <i class="fas fa-list"></i> Individual Query Results (${payload.results.length} shown)
                </h4>
                <div class="examples-grid">${examplesHtml}</div>
            </div>
        `;
    } catch (error) {
        console.error('Error loading examples:', error);
        const host = document.getElementById(`examples-${runId}`);
        host.innerHTML = '<div class="no-results">Error loading examples</div>';
    }
}
|
901
|
+
|
902
|
+
/**
 * Format a run's `avg_search_results` value for display.
 *
 * @param {Object} run - Benchmark run record.
 * @returns {string} The value rounded to the nearest integer, or 'N/A'
 *   when it is missing, zero, or negative.
 */
function formatAvgSearchResults(run) {
    const average = run.avg_search_results;

    // Same acceptance test as a "missing or <= 0" rejection, expressed positively.
    if (average && !(average <= 0)) {
        return String(Math.round(average));
    }
    return 'N/A';
}
|
909
|
+
|
910
|
+
/**
 * Format a run's `total_search_requests` value for display.
 *
 * @param {Object} run - Benchmark run record.
 * @returns {string} The value rounded to the nearest integer, or 'N/A'
 *   when it is missing, zero, or negative.
 */
function formatAvgSearchRequests(run) {
    const requests = run.total_search_requests;

    // Same acceptance test as a "missing or <= 0" rejection, expressed positively.
    if (requests && !(requests <= 0)) {
        return String(Math.round(requests));
    }
    return 'N/A';
}
|
917
|
+
|
918
|
+
/**
 * Format a run's average per-example processing time for display.
 *
 * Uses `run.avg_processing_time` when it is truthy; otherwise averages the
 * positive `processing_time` values found in `run.results`.
 *
 * Output shapes: "12.3s" below one minute, "4m 5s" / "4m" below one hour,
 * "2h 15m" / "2h" otherwise, and 'N/A' when no positive time is available.
 *
 * @param {Object} run - Benchmark run record.
 * @returns {string} Human-readable duration or 'N/A'.
 */
function formatAvgProcessingTime(run) {
    let avgTime = run.avg_processing_time;

    // Fall back to averaging the individual results that carry a positive time.
    if (!avgTime && run.results) {
        const times = run.results
            .filter(r => r.processing_time && r.processing_time > 0)
            .map(r => r.processing_time);

        if (times.length === 0) return 'N/A';
        avgTime = times.reduce((sum, time) => sum + time, 0) / times.length;
    }

    if (!avgTime || avgTime <= 0) return 'N/A';

    // Values from 59.95 up would print as "60.0s"; hand them to the minute branch.
    if (avgTime < 59.95) {
        return `${avgTime.toFixed(1)}s`;
    }

    // Round the TOTAL first and split afterwards. Rounding only the remainder
    // produced impossible values such as "1m 60s" or "1h 60m".
    const totalSeconds = Math.round(avgTime);
    if (totalSeconds < 3600) {
        const minutes = Math.floor(totalSeconds / 60);
        const seconds = totalSeconds % 60;
        return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
    }

    const totalMinutes = Math.round(avgTime / 60);
    const hours = Math.floor(totalMinutes / 60);
    const minutes = totalMinutes % 60;
    return minutes > 0 ? `${hours}h ${minutes}m` : `${hours}h`;
}
|
950
|
+
|
951
|
+
/**
 * Build the HTML markup for one benchmark example (question/answer) card.
 *
 * The returned string is meant to be assigned to `innerHTML`, so every
 * value taken from `result` is HTML-escaped before interpolation: questions
 * and answers are free text and must not be interpreted as markup.
 *
 * @param {Object} result - One result record. Fields read: `is_correct`,
 *   `processing_time`, `search_result_count`, `dataset_type`, `question`,
 *   `model_answer`, `correct_answer`.
 * @returns {string} HTML for the card.
 */
function createExampleCard(result) {
    // Minimal HTML escaper for text and attribute contexts.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => htmlEntities[ch]);

    const statusClass = result.is_correct ? 'correct' : 'incorrect';
    const statusIcon = result.is_correct
        ? '<i class="fas fa-check-circle"></i>'
        : '<i class="fas fa-times-circle"></i>';
    const statusText = result.is_correct ? 'Correct' : 'Incorrect';

    // Format the processing time. Coerce first so a non-numeric value falls
    // back to 'N/A' instead of throwing on toFixed().
    const elapsed = Number(result.processing_time);
    let processingTime = 'N/A';
    if (elapsed > 0) {
        if (elapsed < 59.95) {
            processingTime = `${elapsed.toFixed(1)}s`;
        } else {
            // Round the total before splitting so the seconds part can never be 60.
            const totalSeconds = Math.round(elapsed);
            processingTime = `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
        }
    }

    const searchResultCount = result.search_result_count || 0;

    return `
        <div class="example-card ${statusClass}">
            <div class="example-header">
                <span class="dataset-badge">${escapeHtml(result.dataset_type)}</span>
                <span class="example-status ${statusClass}">
                    ${statusIcon} ${statusText}
                </span>
                <div class="example-metrics">
                    <span class="processing-time">⏱️ ${processingTime}</span>
                    <span class="search-results-count">🔍 ${escapeHtml(searchResultCount)} results</span>
                </div>
            </div>

            <div class="example-question">
                <strong>Question:</strong> ${escapeHtml(result.question)}
            </div>

            <div class="example-answers">
                <div class="answer-section model-answer-section">
                    <div class="answer-label">Model Answer</div>
                    <div class="answer-text">${escapeHtml(result.model_answer || 'No answer provided')}</div>
                </div>
                <div class="answer-section correct-answer-section">
                    <div class="answer-label">Expected Answer</div>
                    <div class="answer-text">${escapeHtml(result.correct_answer || 'No expected answer')}</div>
                </div>
            </div>
        </div>
    `;
}
|
996
|
+
|
997
|
+
/**
 * Refresh the pagination controls for the current filtered run list.
 *
 * Reads the file-level `filteredRuns`, `itemsPerPage` and `currentPage`
 * variables. Hides the `pagination` element when there is at most one page;
 * otherwise shows it, updates the page label and button disabled states,
 * and (re)binds the previous/next handlers, which adjust `currentPage` and
 * call `displayResults()`.
 */
function setupPagination() {
    const pageCount = Math.ceil(filteredRuns.length / itemsPerPage);
    const container = document.getElementById('pagination');

    if (pageCount <= 1) {
        container.style.display = 'none';
        return;
    }
    container.style.display = 'flex';

    const previousButton = document.getElementById('prev-page');
    const nextButton = document.getElementById('next-page');
    const label = document.getElementById('page-info');

    previousButton.disabled = currentPage === 1;
    nextButton.disabled = currentPage === pageCount;
    label.textContent = `Page ${currentPage} of ${pageCount}`;

    // Handlers are assigned (not added) so repeated calls never stack listeners.
    previousButton.onclick = function () {
        if (!(currentPage > 1)) {
            return;
        }
        currentPage--;
        displayResults();
    };

    nextButton.onclick = function () {
        if (!(currentPage < pageCount)) {
            return;
        }
        currentPage++;
        displayResults();
    };
}
|
1030
|
+
|
1031
|
+
/**
 * Cancel a running benchmark and then delete it, after user confirmation.
 *
 * POSTs to `/benchmark/api/cancel/<runId>`. On a successful reply it shows
 * an info alert, waits one second for the cancellation to be processed, and
 * delegates to `deleteBenchmarkRun`. Failures are reported via `showAlert`.
 *
 * @param {*} runId - Identifier of the benchmark run.
 * @returns {Promise<void>}
 */
async function cancelAndDeleteBenchmarkRun(runId) {
    const confirmed = confirm('Are you sure you want to cancel and delete this running benchmark? This action cannot be undone.');
    if (!confirmed) {
        return;
    }

    const pause = ms => new Promise(resolve => setTimeout(resolve, ms));

    try {
        // Step 1: ask the server to cancel the run.
        const requestOptions = {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            }
        };
        const reply = await fetch(`/benchmark/api/cancel/${runId}`, requestOptions);
        const outcome = await reply.json();

        if (!outcome.success) {
            showAlert('Error cancelling benchmark: ' + outcome.error, 'error');
            return;
        }

        showAlert('Benchmark cancelled successfully. Deleting...', 'info');

        // Step 2: give the cancellation a moment to take effect.
        await pause(1000);

        // Step 3: remove the run.
        await deleteBenchmarkRun(runId);
    } catch (error) {
        console.error('Error cancelling benchmark:', error);
        showAlert('Error cancelling benchmark: ' + error.message, 'error');
    }
}
|
1063
|
+
|
1064
|
+
/**
 * Delete a benchmark run on the server and drop it from the local list.
 *
 * Sends DELETE to `/benchmark/api/delete/<runId>`. On success it shows a
 * success alert, removes the run from the file-level `benchmarkRuns` array
 * and calls `applyFilters()` to redraw; otherwise it reports the error via
 * `showAlert`.
 *
 * @param {*} runId - Identifier of the benchmark run; compared to `run.id`
 *   with strict inequality.
 * @returns {Promise<void>}
 */
async function deleteBenchmarkRun(runId) {
    try {
        const requestOptions = {
            method: 'DELETE',
            headers: {
                'Content-Type': 'application/json',
            }
        };
        const reply = await fetch(`/benchmark/api/delete/${runId}`, requestOptions);
        const outcome = await reply.json();

        if (!outcome.success) {
            showAlert('Error deleting benchmark run: ' + outcome.error, 'error');
            return;
        }

        showAlert('Benchmark run deleted successfully!', 'success');

        // Keep every run except the deleted one, then re-render through the filters.
        benchmarkRuns = benchmarkRuns.filter(function (candidate) {
            return candidate.id !== runId;
        });
        applyFilters();
    } catch (error) {
        console.error('Error deleting benchmark run:', error);
        showAlert('Error deleting benchmark run: ' + error.message, 'error');
    }
}
|
1092
|
+
|
1093
|
+
/**
 * Show a transient toast-style alert in the top-right corner of the page.
 *
 * The message is inserted as plain text (never parsed as HTML) because
 * callers pass server-provided error strings and exception messages.
 * The alert can be dismissed with its close button and removes itself
 * after five seconds.
 *
 * @param {string} message - Text to display.
 * @param {string} type - One of 'success', 'error', 'warning', 'info';
 *   unknown values use the 'info' colour. Also used in the CSS class
 *   `alert-<type>`.
 */
function showAlert(message, type) {
    const colors = {
        success: '#4caf50',
        error: '#f44336',
        warning: '#ff9800',
        info: '#2196f3'
    };

    // Outer container
    const alertDiv = document.createElement('div');
    alertDiv.className = `alert alert-${type}`;
    alertDiv.style.cssText = `
        position: fixed;
        top: 20px;
        right: 20px;
        z-index: 1000;
        max-width: 400px;
        padding: 15px;
        border-radius: 6px;
        color: white;
        font-weight: 500;
        box-shadow: 0 4px 12px rgba(0,0,0,0.3);
    `;
    alertDiv.style.backgroundColor = colors[type] || colors.info;

    // Content row: message text plus a close button. Built with DOM APIs so
    // the message can never inject markup or script.
    const row = document.createElement('div');
    row.style.cssText = 'display: flex; align-items: center; justify-content: space-between;';

    const text = document.createElement('span');
    text.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.textContent = '×';
    closeButton.style.cssText = 'background: none; border: none; color: white; font-size: 1.2rem; cursor: pointer; margin-left: 10px;';
    closeButton.addEventListener('click', () => alertDiv.remove());

    row.appendChild(text);
    row.appendChild(closeButton);
    alertDiv.appendChild(row);

    document.body.appendChild(alertDiv);

    // Auto-remove after 5 seconds unless the user already dismissed it.
    setTimeout(() => {
        if (alertDiv.parentElement) {
            alertDiv.remove();
        }
    }, 5000);
}
|
1135
|
+
</script>
|
1136
|
+
{% endblock %}
|