local-deep-research 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +447 -2
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/search_system.py +12 -9
  23. local_deep_research/utilities/log_utils.py +23 -10
  24. local_deep_research/utilities/thread_context.py +99 -0
  25. local_deep_research/web/app_factory.py +32 -8
  26. local_deep_research/web/database/benchmark_schema.py +230 -0
  27. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  28. local_deep_research/web/database/models.py +55 -1
  29. local_deep_research/web/database/schema_upgrade.py +397 -2
  30. local_deep_research/web/database/uuid_migration.py +265 -0
  31. local_deep_research/web/routes/api_routes.py +62 -31
  32. local_deep_research/web/routes/history_routes.py +13 -6
  33. local_deep_research/web/routes/metrics_routes.py +264 -4
  34. local_deep_research/web/routes/research_routes.py +45 -18
  35. local_deep_research/web/routes/route_registry.py +352 -0
  36. local_deep_research/web/routes/settings_routes.py +382 -22
  37. local_deep_research/web/services/research_service.py +22 -29
  38. local_deep_research/web/services/settings_manager.py +53 -0
  39. local_deep_research/web/services/settings_service.py +2 -0
  40. local_deep_research/web/static/css/styles.css +8 -0
  41. local_deep_research/web/static/js/components/detail.js +7 -14
  42. local_deep_research/web/static/js/components/details.js +8 -10
  43. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  44. local_deep_research/web/static/js/components/history.js +6 -6
  45. local_deep_research/web/static/js/components/logpanel.js +14 -11
  46. local_deep_research/web/static/js/components/progress.js +51 -46
  47. local_deep_research/web/static/js/components/research.js +250 -89
  48. local_deep_research/web/static/js/components/results.js +5 -7
  49. local_deep_research/web/static/js/components/settings.js +32 -26
  50. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  51. local_deep_research/web/static/js/config/urls.js +285 -0
  52. local_deep_research/web/static/js/main.js +8 -8
  53. local_deep_research/web/static/js/research_form.js +267 -12
  54. local_deep_research/web/static/js/services/api.js +18 -18
  55. local_deep_research/web/static/js/services/keyboard.js +8 -8
  56. local_deep_research/web/static/js/services/socket.js +53 -35
  57. local_deep_research/web/static/js/services/ui.js +1 -1
  58. local_deep_research/web/templates/base.html +4 -1
  59. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  60. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  61. local_deep_research/web/templates/components/sidebar.html +9 -3
  62. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  63. local_deep_research/web/templates/pages/benchmark_results.html +1136 -0
  64. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  65. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  66. local_deep_research/web/templates/pages/metrics.html +212 -39
  67. local_deep_research/web/templates/pages/research.html +8 -6
  68. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  69. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  70. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  71. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  72. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  73. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  74. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  75. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  76. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  77. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  78. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  79. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  80. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  81. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  82. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  83. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  84. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  85. local_deep_research-0.6.0.dist-info/METADATA +374 -0
  86. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/RECORD +89 -64
  87. local_deep_research-0.5.9.dist-info/METADATA +0 -420
  88. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/WHEEL +0 -0
  89. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/entry_points.txt +0 -0
  90. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1136 @@
1
+ {% extends "base.html" %}
2
+
3
+ {% set active_page = 'benchmark-results' %}
4
+
5
+ {% block title %}Benchmark Results History - Deep Research System{% endblock %}
6
+
7
+ {% block extra_head %}
8
+ <meta name="csrf-token" content="{{ csrf_token() }}">
9
+ <style>
10
+ .benchmark-results-card {
11
+ width: 100%;
12
+ margin: 0;
13
+ padding: 0;
14
+ background: transparent;
15
+ border: none;
16
+ box-shadow: none;
17
+ }
18
+
19
+ .card-content {
20
+ padding: 0;
21
+ }
22
+
23
+ .run-card {
24
+ background: #1a1a1a;
25
+ border: 1px solid #333;
26
+ border-radius: 8px;
27
+ padding: 20px;
28
+ margin-bottom: 20px;
29
+ cursor: pointer;
30
+ transition: border-color 0.2s, background-color 0.2s;
31
+ }
32
+
33
+ .run-card:hover {
34
+ border-color: var(--primary-color);
35
+ background: #1e1e1e;
36
+ }
37
+
38
+ .run-card.expanded {
39
+ border-color: var(--primary-color);
40
+ }
41
+
42
+ .run-header {
43
+ display: flex;
44
+ justify-content: space-between;
45
+ align-items: center;
46
+ margin-bottom: 10px;
47
+ }
48
+
49
+ .run-title {
50
+ font-size: 1.2rem;
51
+ font-weight: bold;
52
+ color: #e0e0e0;
53
+ }
54
+
55
+ .run-date {
56
+ color: #a0a0a0;
57
+ font-size: 0.9rem;
58
+ }
59
+
60
+ .run-summary {
61
+ display: grid;
62
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
63
+ gap: 12px;
64
+ margin-bottom: 15px;
65
+ }
66
+
67
+ /* Responsive grid adjustments for more columns on wider screens */
68
+ @media (min-width: 768px) {
69
+ .run-summary {
70
+ grid-template-columns: repeat(auto-fit, minmax(130px, 1fr));
71
+ gap: 15px;
72
+ }
73
+ }
74
+
75
+ @media (min-width: 1200px) {
76
+ .run-summary {
77
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
78
+ gap: 16px;
79
+ }
80
+ }
81
+
82
+ @media (min-width: 1600px) {
83
+ .run-summary {
84
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
85
+ gap: 18px;
86
+ }
87
+ }
88
+
89
+ @media (min-width: 1920px) {
90
+ .run-summary {
91
+ grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
92
+ gap: 20px;
93
+ }
94
+ }
95
+
96
+ .summary-item {
97
+ text-align: center;
98
+ padding: 10px;
99
+ background: #2a2a2a;
100
+ border-radius: 6px;
101
+ }
102
+
103
+ .summary-value {
104
+ font-size: 1.4rem;
105
+ font-weight: bold;
106
+ color: var(--primary-color);
107
+ }
108
+
109
+ .summary-label {
110
+ font-size: 0.85rem;
111
+ color: #a0a0a0;
112
+ margin-top: 5px;
113
+ }
114
+
115
+ .accuracy-indicator {
116
+ display: inline-block;
117
+ padding: 4px 8px;
118
+ border-radius: 12px;
119
+ font-size: 0.85rem;
120
+ font-weight: bold;
121
+ }
122
+
123
+ .accuracy-high {
124
+ background: #1e3a1e;
125
+ color: #4caf50;
126
+ }
127
+
128
+ .accuracy-medium {
129
+ background: #3a2a1e;
130
+ color: #ff9800;
131
+ }
132
+
133
+ .accuracy-low {
134
+ background: #3a1e1e;
135
+ color: #f44336;
136
+ }
137
+
138
+ .status-indicator {
139
+ display: inline-block;
140
+ padding: 4px 8px;
141
+ border-radius: 12px;
142
+ font-size: 0.85rem;
143
+ font-weight: bold;
144
+ }
145
+
146
+ .status-completed {
147
+ background: #1e3a1e;
148
+ color: #4caf50;
149
+ }
150
+
151
+ .status-in-progress {
152
+ background: #1e2a3a;
153
+ color: #2196f3;
154
+ }
155
+
156
+ .status-failed {
157
+ background: #3a1e1e;
158
+ color: #f44336;
159
+ }
160
+
161
+ .status-cancelled {
162
+ background: #2a2a2a;
163
+ color: #999;
164
+ }
165
+
166
+ .run-config {
167
+ display: grid;
168
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
169
+ gap: 10px;
170
+ margin-bottom: 15px;
171
+ padding: 15px;
172
+ background: #242424;
173
+ border-radius: 6px;
174
+ }
175
+
176
+ .config-item {
177
+ display: flex;
178
+ justify-content: space-between;
179
+ padding: 5px 0;
180
+ border-bottom: 1px solid #333;
181
+ }
182
+
183
+ .config-label {
184
+ color: #a0a0a0;
185
+ font-size: 0.9rem;
186
+ }
187
+
188
+ .config-value {
189
+ color: #e0e0e0;
190
+ font-weight: 500;
191
+ }
192
+
193
+ .results-section {
194
+ margin-top: 20px;
195
+ display: none;
196
+ }
197
+
198
+ .results-section.visible {
199
+ display: block;
200
+ }
201
+
202
+ .examples-grid {
203
+ display: grid;
204
+ gap: 15px;
205
+ margin-top: 15px;
206
+ }
207
+
208
+ .example-card {
209
+ background: #1a1a1a;
210
+ border: 1px solid #333;
211
+ border-radius: 6px;
212
+ padding: 15px;
213
+ }
214
+
215
+ .example-card.correct {
216
+ border-left: 4px solid #4caf50;
217
+ }
218
+
219
+ .example-card.incorrect {
220
+ border-left: 4px solid #f44336;
221
+ }
222
+
223
+ .example-header {
224
+ display: flex;
225
+ justify-content: space-between;
226
+ align-items: center;
227
+ margin-bottom: 10px;
228
+ }
229
+
230
+ .example-status {
231
+ display: flex;
232
+ align-items: center;
233
+ gap: 5px;
234
+ font-weight: 600;
235
+ }
236
+
237
+ .example-status.correct {
238
+ color: #4caf50;
239
+ }
240
+
241
+ .example-status.incorrect {
242
+ color: #f44336;
243
+ }
244
+
245
+ .example-question {
246
+ background: #2a2a2a;
247
+ padding: 12px;
248
+ border-radius: 4px;
249
+ border-left: 4px solid var(--primary-color);
250
+ margin-bottom: 12px;
251
+ color: #e0e0e0;
252
+ }
253
+
254
+ .example-answers {
255
+ display: grid;
256
+ grid-template-columns: 1fr 1fr;
257
+ gap: 12px;
258
+ }
259
+
260
+ .answer-section {
261
+ padding: 10px;
262
+ border-radius: 4px;
263
+ font-size: 0.9rem;
264
+ line-height: 1.4;
265
+ }
266
+
267
+ .model-answer-section {
268
+ background: #1e2a3a;
269
+ border-left: 3px solid #2196f3;
270
+ }
271
+
272
+ .correct-answer-section {
273
+ background: #1e3a1e;
274
+ border-left: 3px solid #4caf50;
275
+ }
276
+
277
+ .answer-label {
278
+ font-size: 0.75rem;
279
+ font-weight: 600;
280
+ color: #a0a0a0;
281
+ text-transform: uppercase;
282
+ margin-bottom: 6px;
283
+ }
284
+
285
+ .answer-text {
286
+ color: #e0e0e0;
287
+ }
288
+
289
+ .no-results {
290
+ text-align: center;
291
+ color: #a0a0a0;
292
+ padding: 40px;
293
+ font-style: italic;
294
+ }
295
+
296
+ .expand-indicator {
297
+ color: #a0a0a0;
298
+ font-size: 0.9rem;
299
+ margin-top: 10px;
300
+ text-align: center;
301
+ }
302
+
303
+ .dataset-breakdown {
304
+ display: grid;
305
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
306
+ gap: 10px;
307
+ margin-top: 10px;
308
+ }
309
+
310
+ .dataset-item {
311
+ text-align: center;
312
+ padding: 8px;
313
+ background: #2a2a2a;
314
+ border-radius: 4px;
315
+ }
316
+
317
+ .dataset-name {
318
+ font-size: 0.8rem;
319
+ color: #a0a0a0;
320
+ margin-bottom: 4px;
321
+ }
322
+
323
+ .dataset-accuracy {
324
+ font-weight: bold;
325
+ color: var(--primary-color);
326
+ }
327
+
328
+ .loading {
329
+ text-align: center;
330
+ padding: 40px;
331
+ color: #a0a0a0;
332
+ }
333
+
334
+ .pagination {
335
+ display: flex;
336
+ justify-content: center;
337
+ align-items: center;
338
+ gap: 10px;
339
+ margin-top: 30px;
340
+ }
341
+
342
+ .pagination button {
343
+ padding: 8px 12px;
344
+ background: #2a2a2a;
345
+ border: 1px solid #333;
346
+ border-radius: 4px;
347
+ color: #e0e0e0;
348
+ cursor: pointer;
349
+ }
350
+
351
+ .pagination button:hover {
352
+ background: var(--primary-color);
353
+ }
354
+
355
+ .pagination button:disabled {
356
+ opacity: 0.5;
357
+ cursor: not-allowed;
358
+ }
359
+
360
+ .delete-btn {
361
+ background: #3a1e1e !important;
362
+ border-color: #f44336 !important;
363
+ color: #f44336 !important;
364
+ font-size: 0.8rem;
365
+ padding: 4px 8px;
366
+ transition: all 0.2s;
367
+ }
368
+
369
+ .delete-btn:hover:not(:disabled) {
370
+ background: #f44336 !important;
371
+ color: white !important;
372
+ }
373
+
374
+ .delete-btn:disabled {
375
+ background: #2a2a2a !important;
376
+ border-color: #555 !important;
377
+ color: #888 !important;
378
+ cursor: not-allowed;
379
+ }
380
+
381
+ .filters {
382
+ display: grid;
383
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
384
+ gap: 15px;
385
+ margin-bottom: 20px;
386
+ }
387
+
388
+ /* Responsive filter adjustments for full-width layout */
389
+ @media (max-width: 767px) {
390
+ .filters {
391
+ grid-template-columns: 1fr 1fr;
392
+ gap: 12px;
393
+ }
394
+ }
395
+
396
+ @media (min-width: 768px) {
397
+ .filters {
398
+ grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
399
+ gap: 16px;
400
+ }
401
+ }
402
+
403
+ @media (min-width: 1200px) {
404
+ .filters {
405
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
406
+ gap: 18px;
407
+ }
408
+ }
409
+
410
+ @media (min-width: 1600px) {
411
+ .filters {
412
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
413
+ gap: 20px;
414
+ }
415
+ }
416
+
417
+ .processing-time {
418
+ background: #2a3f2a;
419
+ color: #90ee90;
420
+ padding: 2px 6px;
421
+ border-radius: 3px;
422
+ font-size: 0.8rem;
423
+ font-weight: 500;
424
+ white-space: nowrap;
425
+ }
426
+
427
+ .filter-group {
428
+ display: flex;
429
+ flex-direction: column;
430
+ gap: 5px;
431
+ }
432
+
433
+ .filter-group label {
434
+ font-size: 0.85rem;
435
+ color: #a0a0a0;
436
+ }
437
+
438
+ .filter-group select,
439
+ .filter-group input {
440
+ padding: 6px 10px;
441
+ background: #2a2a2a;
442
+ border: 1px solid #333;
443
+ border-radius: 4px;
444
+ color: #e0e0e0;
445
+ }
446
+
447
+ .search-stats-section {
448
+ margin-bottom: 20px;
449
+ padding: 15px;
450
+ background: #1e1e1e;
451
+ border: 1px solid #333;
452
+ border-radius: 6px;
453
+ }
454
+
455
+ .section-title {
456
+ display: flex;
457
+ align-items: center;
458
+ gap: 8px;
459
+ margin-bottom: 15px;
460
+ color: #e0e0e0;
461
+ font-size: 1.1rem;
462
+ font-weight: 600;
463
+ }
464
+
465
+ .search-stats-grid {
466
+ display: grid;
467
+ grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
468
+ gap: 15px;
469
+ }
470
+
471
+ .stat-item {
472
+ text-align: center;
473
+ padding: 12px;
474
+ background: #2a2a2a;
475
+ border-radius: 6px;
476
+ }
477
+
478
+ .stat-value {
479
+ font-size: 1.4rem;
480
+ font-weight: bold;
481
+ color: var(--primary-color);
482
+ margin-bottom: 5px;
483
+ }
484
+
485
+ .stat-label {
486
+ font-size: 0.85rem;
487
+ color: #a0a0a0;
488
+ }
489
+
490
+ .results-divider {
491
+ height: 1px;
492
+ background: #333;
493
+ margin: 20px 0;
494
+ }
495
+
496
+ .examples-section {
497
+ margin-top: 15px;
498
+ }
499
+
500
+ .example-metrics {
501
+ display: flex;
502
+ gap: 10px;
503
+ align-items: center;
504
+ }
505
+
506
+ .search-results-count {
507
+ background: #2a3a3a;
508
+ color: #81c784;
509
+ padding: 2px 6px;
510
+ border-radius: 3px;
511
+ font-size: 0.8rem;
512
+ font-weight: 500;
513
+ white-space: nowrap;
514
+ }
515
+ </style>
516
+ {% endblock %}
517
+
518
+ {% block content %}
519
+ <div class="page active" id="benchmark-results">
520
+ <div class="page-header">
521
+ <h1>Benchmark Results History</h1>
522
+ <p class="page-subtitle">Compare accuracy across different models, search engines, and strategies</p>
523
+ </div>
524
+
525
+ <div class="card benchmark-results-card">
526
+ <div class="card-content">
527
+ <!-- Filters -->
528
+ <div class="filters">
529
+ <div class="filter-group">
530
+ <label for="accuracy-filter">Accuracy Range</label>
531
+ <select id="accuracy-filter">
532
+ <option value="">All</option>
533
+ <option value="high">90%+ (High)</option>
534
+ <option value="medium">70-90% (Medium)</option>
535
+ <option value="low">&lt;70% (Low)</option>
536
+ </select>
537
+ </div>
538
+ <div class="filter-group">
539
+ <label for="model-filter">Model</label>
540
+ <select id="model-filter">
541
+ <option value="">All Models</option>
542
+ </select>
543
+ </div>
544
+ <div class="filter-group">
545
+ <label for="strategy-filter">Strategy</label>
546
+ <select id="strategy-filter">
547
+ <option value="">All Strategies</option>
548
+ </select>
549
+ </div>
550
+ <div class="filter-group">
551
+ <label for="status-filter">Status</label>
552
+ <select id="status-filter">
553
+ <option value="">All Statuses</option>
554
+ <option value="completed">Completed</option>
555
+ <option value="in_progress">In Progress</option>
556
+ <option value="failed">Failed</option>
557
+ <option value="cancelled">Cancelled</option>
558
+ </select>
559
+ </div>
560
+ <div class="filter-group">
561
+ <label for="date-from">Date Range</label>
562
+ <input type="date" id="date-from">
563
+ <input type="date" id="date-to">
564
+ </div>
565
+ </div>
566
+
567
+ <!-- Results List -->
568
+ <div id="results-container">
569
+ <div class="loading">
570
+ <i class="fas fa-spinner fa-spin"></i> Loading benchmark results...
571
+ </div>
572
+ </div>
573
+
574
+ <!-- Pagination -->
575
+ <div class="pagination" id="pagination" style="display: none;">
576
+ <button id="prev-page">← Previous</button>
577
+ <span id="page-info">Page 1 of 1</span>
578
+ <button id="next-page">Next →</button>
579
+ </div>
580
+ </div>
581
+ </div>
582
+ </div>
583
+
584
+ <script>
585
// Page-level state shared by the benchmark-history functions in this script.
let benchmarkRuns = [];    // all runs assigned by loadBenchmarkHistory()
let filteredRuns = [];     // subset of benchmarkRuns that passes the active filters
let currentPage = 1;       // 1-based page index used when slicing filteredRuns
const itemsPerPage = 20;   // number of run cards rendered per page

// Once the DOM is parsed, start loading the history and attach the filter listeners.
document.addEventListener('DOMContentLoaded', () => {
    loadBenchmarkHistory();
    setupFilters();
});
594
+
595
/**
 * Fetch the benchmark run history and render it.
 *
 * On a successful payload (`success` truthy) the runs are stored in
 * `benchmarkRuns`, copied into `filteredRuns`, and the filter dropdowns and
 * result list are (re)built. On an unsuccessful payload, or on any thrown
 * error, an error placeholder is written into `#results-container`.
 */
async function loadBenchmarkHistory() {
    // Shared by the "success: false" path and the exception path.
    const showLoadError = () => {
        document.getElementById('results-container').innerHTML =
            '<div class="no-results">Error loading benchmark results</div>';
    };

    try {
        const response = await fetch('/benchmark/api/history');
        const payload = await response.json();

        if (!payload.success) {
            showLoadError();
            return;
        }

        benchmarkRuns = payload.runs;
        filteredRuns = [...benchmarkRuns];
        populateFilters();
        displayResults();
    } catch (error) {
        console.error('Error loading benchmark history:', error);
        showLoadError();
    }
}
615
+
616
/**
 * Fill the model and strategy dropdowns with the distinct values found in
 * `benchmarkRuns`.
 *
 * Options are appended after whatever the `<select>` already contains, so the
 * static "All ..." entry stays first. Strategy labels are prettified
 * (underscores become spaces, each word capitalised) while the option value
 * keeps the raw strategy name that applyFilters() compares against.
 */
function populateFilters() {
    // Distinct truthy values of one search_config field, in first-seen order.
    const distinctConfigValues = (field) =>
        Array.from(new Set(
            benchmarkRuns.map(run => run.search_config?.[field]).filter(Boolean)
        ));

    // Append one <option> per value to the <select> with the given id.
    const appendOptions = (selectId, values, toLabel) => {
        const select = document.getElementById(selectId);
        for (const value of values) {
            const option = document.createElement('option');
            option.value = value;
            option.textContent = toLabel(value);
            select.appendChild(option);
        }
    };

    appendOptions('model-filter', distinctConfigValues('model_name'), name => name);

    appendOptions(
        'strategy-filter',
        distinctConfigValues('search_strategy'),
        name => name.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())
    );
}
639
+
640
/**
 * Re-run applyFilters() whenever any filter control changes.
 */
function setupFilters() {
    // Ids of every control that participates in filtering, in DOM order.
    const filterControlIds = [
        'accuracy-filter',
        'model-filter',
        'strategy-filter',
        'status-filter',
        'date-from',
        'date-to',
    ];

    for (const controlId of filterControlIds) {
        document.getElementById(controlId).addEventListener('change', applyFilters);
    }
}
648
+
649
/**
 * Recompute `filteredRuns` from `benchmarkRuns` using the current values of
 * the filter controls, reset to the first page, and re-render.
 *
 * Accuracy bands: "high" is >= 90, "medium" is 70 <= accuracy < 90, "low" is
 * < 70. A run without `overall_accuracy` is treated as 0.
 *
 * Date filtering compares the run's creation date in the browser's local
 * calendar, matching both the `<input type="date">` values and the date shown
 * on the run card. The timestamp is only parsed when a date bound is set, so a
 * run with a missing or unparseable `created_at` no longer breaks the other
 * filters; such a run is excluded while a date bound is active.
 */
function applyFilters() {
    const controlValue = (id) => document.getElementById(id).value;

    const accuracyFilter = controlValue('accuracy-filter');
    const modelFilter = controlValue('model-filter');
    const strategyFilter = controlValue('strategy-filter');
    const statusFilter = controlValue('status-filter');
    const dateFrom = controlValue('date-from');
    const dateTo = controlValue('date-to');
    const dateFilterActive = Boolean(dateFrom || dateTo);

    // Local-calendar "YYYY-MM-DD" for a timestamp, or null when it cannot be parsed.
    // Zero-padded so plain string comparison orders dates correctly.
    const toLocalDateKey = (value) => {
        if (value === null || value === undefined) return null;
        const parsed = new Date(value);
        if (Number.isNaN(parsed.getTime())) return null;
        const pad = (n) => String(n).padStart(2, '0');
        return `${parsed.getFullYear()}-${pad(parsed.getMonth() + 1)}-${pad(parsed.getDate())}`;
    };

    filteredRuns = benchmarkRuns.filter(run => {
        // Accuracy filter
        if (accuracyFilter) {
            const accuracy = run.overall_accuracy || 0;
            if (accuracyFilter === 'high' && accuracy < 90) return false;
            if (accuracyFilter === 'medium' && (accuracy < 70 || accuracy >= 90)) return false;
            if (accuracyFilter === 'low' && accuracy >= 70) return false;
        }

        // Model filter
        if (modelFilter && run.search_config?.model_name !== modelFilter) return false;

        // Strategy filter
        if (strategyFilter && run.search_config?.search_strategy !== strategyFilter) return false;

        // Status filter
        if (statusFilter && run.status !== statusFilter) return false;

        // Date filters (skipped entirely when neither bound is set)
        if (dateFilterActive) {
            const runDate = toLocalDateKey(run.created_at);
            if (runDate === null) return false;
            if (dateFrom && runDate < dateFrom) return false;
            if (dateTo && runDate > dateTo) return false;
        }

        return true;
    });

    currentPage = 1;
    displayResults();
}
686
+
687
/**
 * Render the current page of `filteredRuns` into `#results-container`.
 *
 * With no matching runs a placeholder is shown and the pagination bar is
 * hidden; otherwise the page slice is rendered as run cards and
 * setupPagination() is called.
 */
function displayResults() {
    const container = document.getElementById('results-container');

    if (filteredRuns.length === 0) {
        container.innerHTML = '<div class="no-results">No benchmark results found</div>';
        document.getElementById('pagination').style.display = 'none';
        return;
    }

    // Half-open window [firstIndex, lastIndex) of the runs shown on this page.
    const firstIndex = (currentPage - 1) * itemsPerPage;
    const lastIndex = Math.min(firstIndex + itemsPerPage, filteredRuns.length);

    container.innerHTML = filteredRuns
        .slice(firstIndex, lastIndex)
        .map(run => createRunCard(run))
        .join('');

    // Setup pagination
    setupPagination();
}
706
+
707
/**
 * Build the HTML markup for one benchmark run card.
 *
 * The returned string is assigned to `innerHTML` by displayResults(), so every
 * value taken from `run` is HTML-escaped before interpolation; run names,
 * model names and similar fields would otherwise be parsed as markup.
 *
 * @param {Object} run - Run record from the history payload. Fields read here:
 *   id, run_name, status, created_at, overall_accuracy, completed_examples,
 *   total_examples and search_config (model_name, search_strategy,
 *   search_tool, iterations, questions_per_iteration, temperature, provider).
 * @returns {string} Markup for a `.run-card` element including its collapsed
 *   `#results-<id>` details section.
 */
function createRunCard(run) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);

    const config = run.search_config || {};

    // Number() keeps toFixed() below from throwing if the accuracy arrives as a string.
    const accuracy = Number(run.overall_accuracy) || 0;
    const accuracyClass = accuracy >= 90 ? 'accuracy-high' :
        accuracy >= 70 ? 'accuracy-medium' : 'accuracy-low';

    // Status handling: replace every underscore, not just the first one.
    const status = String(run.status || 'unknown');
    const statusClass = `status-${status.replace(/_/g, '-')}`;
    const formattedStatus = status.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());

    const createdAt = new Date(run.created_at);
    const date = createdAt.toLocaleDateString();
    const time = createdAt.toLocaleTimeString();

    // Progress calculation
    const progress = run.total_examples > 0 ?
        ((run.completed_examples / run.total_examples) * 100).toFixed(1) : 0;

    // Accuracy is only shown for completed runs that have a positive score.
    const showAccuracy = status === 'completed' && accuracy > 0;
    const accuracyClasses = showAccuracy ? `${accuracyClass} accuracy-indicator` : '';
    const accuracyText = showAccuracy ? `${accuracy.toFixed(1)}%` : 'N/A';

    // The id is used both inside element ids and as a JavaScript argument in
    // inline handlers. JSON.stringify yields a valid JS literal for numeric and
    // string ids alike; escaping then makes it safe inside the attribute.
    const idAttr = escapeHtml(run.id);
    const idArg = escapeHtml(JSON.stringify(run.id));

    const actionButton = status === 'in_progress' ?
        `<button class="btn btn-outline btn-sm delete-btn" onclick="event.stopPropagation(); cancelAndDeleteBenchmarkRun(${idArg})" style="background: #3a1e1e !important; border-color: #f44336 !important; color: #f44336 !important;">
                        <i class="fas fa-stop"></i> Cancel & Delete
                    </button>` :
        `<button class="btn btn-outline btn-sm delete-btn" onclick="event.stopPropagation(); deleteBenchmarkRun(${idArg})">
                        <i class="fas fa-trash"></i> Delete
                    </button>`;

    return `
        <div class="run-card" onclick="toggleRunDetails(${idArg})">
            <div class="run-header">
                <div class="run-title">${escapeHtml(run.run_name)}</div>
                <div class="run-date">${escapeHtml(date)} ${escapeHtml(time)}</div>
            </div>

            <div class="run-summary">
                <div class="summary-item">
                    <div class="summary-value ${escapeHtml(statusClass)} status-indicator">${escapeHtml(formattedStatus)}</div>
                    <div class="summary-label">Status</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(run.completed_examples)}/${escapeHtml(run.total_examples)}</div>
                    <div class="summary-label">Progress (${escapeHtml(progress)}%)</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value ${accuracyClasses}">${accuracyText}</div>
                    <div class="summary-label">Accuracy</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(config.model_name || 'Unknown')}</div>
                    <div class="summary-label">Model</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(String(config.search_strategy || 'unknown').replace(/_/g, ' '))}</div>
                    <div class="summary-label">Strategy</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(config.search_tool || 'Unknown')}</div>
                    <div class="summary-label">Search Engine</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(formatAvgSearchResults(run))}</div>
                    <div class="summary-label">Avg Search Results</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(formatAvgSearchRequests(run))}</div>
                    <div class="summary-label">Avg Search Requests</div>
                </div>
                <div class="summary-item">
                    <div class="summary-value">${escapeHtml(formatAvgProcessingTime(run))}</div>
                    <div class="summary-label">Avg Time/Question</div>
                </div>
                <div class="summary-item">
                    ${actionButton}
                    <div class="summary-label">Actions</div>
                </div>
            </div>

            <div class="expand-indicator">
                <i class="fas fa-chevron-down"></i> Click to view detailed results and examples
            </div>

            <div class="results-section" id="results-${idAttr}">
                <div class="run-config">
                    <div class="config-item">
                        <span class="config-label">Iterations:</span>
                        <span class="config-value">${escapeHtml(config.iterations || 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Questions/Iteration:</span>
                        <span class="config-value">${escapeHtml(config.questions_per_iteration || 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Temperature:</span>
                        <span class="config-value">${escapeHtml(config.temperature ?? 'N/A')}</span>
                    </div>
                    <div class="config-item">
                        <span class="config-label">Provider:</span>
                        <span class="config-value">${escapeHtml(config.provider || 'N/A')}</span>
                    </div>
                </div>

                <div id="examples-${idAttr}">
                    <div class="loading">Loading detailed results...</div>
                </div>
            </div>
        </div>
    `;
}
815
+
816
/**
 * Expand or collapse the details section of a run card.
 *
 * On expand, the run's examples are fetched through loadExamples() the first
 * time only; "first time" is detected by the loading placeholder still being
 * present in `#examples-<runId>`.
 *
 * @param runId - Id used to build the `results-<runId>` / `examples-<runId>` element ids.
 */
async function toggleRunDetails(runId) {
    const detailsPanel = document.getElementById(`results-${runId}`);
    const card = detailsPanel.closest('.run-card');

    // Opening when the panel is currently hidden, closing otherwise.
    const opening = !detailsPanel.classList.contains('visible');
    detailsPanel.classList.toggle('visible', opening);
    card.classList.toggle('expanded', opening);

    if (!opening) {
        return;
    }

    // Load examples if not already loaded
    const examplesBox = document.getElementById(`examples-${runId}`);
    const placeholderStillShown = examplesBox.innerHTML.includes('Loading detailed results...');
    if (placeholderStillShown) {
        await loadExamples(runId);
    }
}
835
+
836
/**
 * Fetch up to 50 per-question results for a run and render them, preceded by
 * a small block of search-result statistics, into `#examples-<runId>`.
 *
 * A payload that is unsuccessful or has no results renders a "no detailed
 * results" placeholder; any thrown error is logged and renders an error
 * placeholder instead.
 *
 * @param runId - Id of the run, interpolated into the request URL and element id.
 */
async function loadExamples(runId) {
    try {
        const response = await fetch(`/benchmark/api/results/${runId}?limit=50`);
        const payload = await response.json();

        const target = document.getElementById(`examples-${runId}`);

        const hasResults = payload.success && payload.results.length > 0;
        if (!hasResults) {
            target.innerHTML = '<div class="no-results">No detailed results available</div>';
            return;
        }

        const results = payload.results;

        // Calculate search result statistics. The list is known to be non-empty
        // here, so no empty-list fallback is needed; a missing count is treated as 0.
        const counts = results.map(r => r.search_result_count || 0);
        const countTotal = counts.reduce((sum, count) => sum + count, 0);
        const avgSearchResults = (countTotal / counts.length).toFixed(1);
        const minSearchResults = Math.min(...counts);
        const maxSearchResults = Math.max(...counts);

        // Create search results statistics section
        const statsHtml = `
            <div class="search-stats-section">
                <h4 class="section-title">
                    <i class="fas fa-search"></i> Search Results Statistics
                </h4>
                <div class="search-stats-grid">
                    <div class="stat-item">
                        <div class="stat-value">${avgSearchResults}</div>
                        <div class="stat-label">Avg Search Results</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value">${minSearchResults}</div>
                        <div class="stat-label">Min Results</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value">${maxSearchResults}</div>
                        <div class="stat-label">Max Results</div>
                    </div>
                    <div class="stat-item">
                        <div class="stat-value">${results.length}</div>
                        <div class="stat-label">Total Queries</div>
                    </div>
                </div>
            </div>
        `;

        // Create examples grid
        const examplesHtml = results.map(result => createExampleCard(result)).join('');

        target.innerHTML = `
            ${statsHtml}
            <div class="results-divider"></div>
            <div class="examples-section">
                <h4 class="section-title">
                    <i class="fas fa-list"></i> Individual Query Results (${results.length} shown)
                </h4>
                <div class="examples-grid">${examplesHtml}</div>
            </div>
        `;
    } catch (error) {
        console.error('Error loading examples:', error);
        document.getElementById(`examples-${runId}`).innerHTML =
            '<div class="no-results">Error loading examples</div>';
    }
}
901
+
902
/**
 * Render a run's average search-result count for display.
 *
 * @param {Object} run - Benchmark run record; only `run.avg_search_results` is read.
 * @returns {string} The value rounded to the nearest integer, or 'N/A' when the
 *     value is falsy (missing, 0, NaN, ...) or not greater than zero.
 */
function formatAvgSearchResults(run) {
    const average = run.avg_search_results;

    // Same acceptance test as a "reject when falsy or <= 0" guard, inverted.
    const isDisplayable = Boolean(average) && !(average <= 0);

    return isDisplayable ? String(Math.round(average)) : 'N/A';
}
909
+
910
/**
 * Render a run's search-request count for display.
 *
 * Note: despite the "Avg" in the name, the value read is
 * `run.total_search_requests`.
 *
 * @param {Object} run - Benchmark run record; only `run.total_search_requests` is read.
 * @returns {string} The value rounded to the nearest integer, or 'N/A' when the
 *     value is falsy (missing, 0, NaN, ...) or not greater than zero.
 */
function formatAvgSearchRequests(run) {
    const requests = run.total_search_requests;

    // Same acceptance test as a "reject when falsy or <= 0" guard, inverted.
    const isDisplayable = Boolean(requests) && !(requests <= 0);

    return isDisplayable ? String(Math.round(requests)) : 'N/A';
}
917
+
918
/**
 * Format a benchmark run's average per-example processing time.
 *
 * `run.avg_processing_time` is used when it is truthy; otherwise the average is
 * computed from the positive `processing_time` values in `run.results`.
 *
 * @param {Object} run - Benchmark run record.
 * @param {number|string} [run.avg_processing_time] - Average time in seconds.
 * @param {Array<Object>} [run.results] - Per-example results; each may carry
 *     `processing_time` in seconds.
 * @returns {string} 'N/A' when no positive duration is available, otherwise a
 *     duration such as '12.3s', '2m 5s', '3m', '1h 10m' or '2h'.
 */
function formatAvgProcessingTime(run) {
    const hasResults = Array.isArray(run.results) && run.results.length > 0;
    if (!run.avg_processing_time && !hasResults) {
        return 'N/A';
    }

    let avgTime = run.avg_processing_time;
    if (!avgTime) {
        // Fall back to averaging the per-example timings; coerce to numbers so
        // numeric strings are summed rather than concatenated.
        const positiveTimes = run.results
            .map(r => Number(r.processing_time))
            .filter(t => Number.isFinite(t) && t > 0);

        if (positiveTimes.length === 0) return 'N/A';
        avgTime = positiveTimes.reduce((sum, t) => sum + t, 0) / positiveTimes.length;
    }

    avgTime = Number(avgTime);
    if (!Number.isFinite(avgTime) || avgTime <= 0) return 'N/A';

    // Sub-minute values keep one decimal. Compare the *rounded* text so that
    // e.g. 59.97 does not render as "60.0s" but falls through to "1m".
    const secondsText = avgTime.toFixed(1);
    if (Number(secondsText) < 60) {
        return `${secondsText}s`;
    }

    // Round the total first and then split it; rounding the remainder on its
    // own can yield impossible parts such as "1m 60s" or "1h 60m".
    const totalSeconds = Math.round(avgTime);
    if (totalSeconds < 3600) {
        const minutes = Math.floor(totalSeconds / 60);
        const seconds = totalSeconds % 60;
        return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
    }

    const totalMinutes = Math.round(avgTime / 60);
    const hours = Math.floor(totalMinutes / 60);
    const minutes = totalMinutes % 60;
    return minutes > 0 ? `${hours}h ${minutes}m` : `${hours}h`;
}
950
+
951
/**
 * Build the HTML markup for one benchmark example card.
 *
 * The returned string is inserted into the page through `innerHTML`, so every
 * value taken from `result` is HTML-escaped before interpolation.
 *
 * @param {Object} result - One per-example benchmark result.
 * @param {boolean} [result.is_correct] - Truthy selects the "Correct" styling.
 * @param {string} [result.dataset_type] - Shown in the dataset badge.
 * @param {string} [result.question] - Question text.
 * @param {string} [result.model_answer] - Falls back to 'No answer provided'.
 * @param {string} [result.correct_answer] - Falls back to 'No expected answer'.
 * @param {number} [result.processing_time] - Seconds; 'N/A' when missing or <= 0.
 * @param {number} [result.search_result_count] - Falls back to 0.
 * @returns {string} HTML fragment for the card.
 */
function createExampleCard(result) {
    const HTML_ENTITIES = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    // Neutralise markup-significant characters in text that may originate from
    // datasets, model output or scraped search content.
    const escapeHtml = (value) =>
        String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

    const statusClass = result.is_correct ? 'correct' : 'incorrect';
    const statusIcon = result.is_correct
        ? '<i class="fas fa-check-circle"></i>'
        : '<i class="fas fa-times-circle"></i>';
    const statusText = result.is_correct ? 'Correct' : 'Incorrect';

    // Format processing time for the individual result.
    let processingTime = 'N/A';
    const elapsed = Number(result.processing_time);
    if (Number.isFinite(elapsed) && elapsed > 0) {
        const secondsText = elapsed.toFixed(1);
        if (Number(secondsText) < 60) {
            processingTime = `${secondsText}s`;
        } else {
            // Round the total before splitting so the seconds part can never be 60.
            const totalSeconds = Math.round(elapsed);
            processingTime = `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
        }
    }

    const searchResultCount = result.search_result_count || 0;

    return `
        <div class="example-card ${statusClass}">
            <div class="example-header">
                <span class="dataset-badge">${escapeHtml(result.dataset_type)}</span>
                <span class="example-status ${statusClass}">
                    ${statusIcon} ${statusText}
                </span>
                <div class="example-metrics">
                    <span class="processing-time">⏱️ ${processingTime}</span>
                    <span class="search-results-count">🔍 ${escapeHtml(searchResultCount)} results</span>
                </div>
            </div>

            <div class="example-question">
                <strong>Question:</strong> ${escapeHtml(result.question)}
            </div>

            <div class="example-answers">
                <div class="answer-section model-answer-section">
                    <div class="answer-label">Model Answer</div>
                    <div class="answer-text">${escapeHtml(result.model_answer || 'No answer provided')}</div>
                </div>
                <div class="answer-section correct-answer-section">
                    <div class="answer-label">Expected Answer</div>
                    <div class="answer-text">${escapeHtml(result.correct_answer || 'No expected answer')}</div>
                </div>
            </div>
        </div>
    `;
}
996
+
997
/**
 * Synchronise the pagination controls with the current filtered result set.
 *
 * Reads the file-level `filteredRuns`, `itemsPerPage` and `currentPage`
 * variables. When there is at most one page the `#pagination` container is
 * hidden and nothing else is touched; otherwise it is shown, the prev/next
 * buttons and page label are refreshed, and the button click handlers are
 * (re)assigned so they step `currentPage` and call `displayResults()`.
 */
function setupPagination() {
    const pageCount = Math.ceil(filteredRuns.length / itemsPerPage);
    const pager = document.getElementById('pagination');

    if (pageCount <= 1) {
        pager.style.display = 'none';
        return;
    }

    pager.style.display = 'flex';

    const previousButton = document.getElementById('prev-page');
    const followingButton = document.getElementById('next-page');
    const pageLabel = document.getElementById('page-info');

    previousButton.disabled = currentPage === 1;
    followingButton.disabled = currentPage === pageCount;
    pageLabel.textContent = `Page ${currentPage} of ${pageCount}`;

    // Assign via the onclick property (not addEventListener) so repeated calls
    // replace the previous handler instead of stacking handlers.
    previousButton.onclick = () => {
        const canGoBack = currentPage > 1;
        if (canGoBack) {
            currentPage--;
            displayResults();
        }
    };

    followingButton.onclick = () => {
        const canGoForward = currentPage < pageCount;
        if (canGoForward) {
            currentPage++;
            displayResults();
        }
    };
}
1030
+
1031
/**
 * Cancel a running benchmark and then delete it, after user confirmation.
 *
 * POSTs to `/benchmark/api/cancel/<runId>`; when the JSON reply has a truthy
 * `success`, waits one second and delegates to `deleteBenchmarkRun(runId)`.
 * Failures (reply `error` or a thrown exception) are reported via `showAlert`.
 *
 * @param {*} runId - Identifier interpolated into the cancel URL.
 * @returns {Promise<void>}
 */
async function cancelAndDeleteBenchmarkRun(runId) {
    const userConfirmed = confirm('Are you sure you want to cancel and delete this running benchmark? This action cannot be undone.');
    if (!userConfirmed) {
        return;
    }

    try {
        // Step 1: ask the server to cancel the run.
        const requestOptions = {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            }
        };
        const cancelResponse = await fetch(`/benchmark/api/cancel/${runId}`, requestOptions);
        const cancelData = await cancelResponse.json();

        if (!cancelData.success) {
            showAlert('Error cancelling benchmark: ' + cancelData.error, 'error');
            return;
        }

        showAlert('Benchmark cancelled successfully. Deleting...', 'info');

        // Step 2: give the cancellation a moment to process.
        const pause = new Promise(resolve => setTimeout(resolve, 1000));
        await pause;

        // Step 3: remove the run.
        await deleteBenchmarkRun(runId);
    } catch (error) {
        console.error('Error cancelling benchmark:', error);
        showAlert('Error cancelling benchmark: ' + error.message, 'error');
    }
}
1063
+
1064
/**
 * Delete a benchmark run on the server and drop it from the local list.
 *
 * Sends DELETE to `/benchmark/api/delete/<runId>`. When the JSON reply has a
 * truthy `success`, shows a success alert, removes the run from the file-level
 * `benchmarkRuns` array and calls `applyFilters()`. Otherwise, or when an
 * exception is thrown, an error alert is shown.
 *
 * @param {*} runId - Identifier of the run; compared to `run.id` with `!==`.
 * @returns {Promise<void>}
 */
async function deleteBenchmarkRun(runId) {
    try {
        const requestOptions = {
            method: 'DELETE',
            headers: {
                'Content-Type': 'application/json',
            }
        };
        const response = await fetch(`/benchmark/api/delete/${runId}`, requestOptions);
        const data = await response.json();

        if (!data.success) {
            showAlert('Error deleting benchmark run: ' + data.error, 'error');
            return;
        }

        showAlert('Benchmark run deleted successfully!', 'success');

        // Keep every run except the deleted one, then refresh the view.
        benchmarkRuns = benchmarkRuns.filter(run => run.id !== runId);
        applyFilters();
    } catch (error) {
        console.error('Error deleting benchmark run:', error);
        showAlert('Error deleting benchmark run: ' + error.message, 'error');
    }
}
1092
+
1093
/**
 * Show a transient, dismissible toast in the top-right corner of the page.
 *
 * The message is inserted as plain text (via `textContent`), so server-provided
 * error strings cannot inject markup. The toast removes itself after 5 seconds
 * if it is still attached.
 *
 * @param {string} message - Text to display.
 * @param {string} type - One of 'success', 'error', 'warning', 'info'; any
 *     other value uses the 'info' colour. Also used in the `alert-<type>` class.
 */
function showAlert(message, type) {
    const colors = {
        success: '#4caf50',
        error: '#f44336',
        warning: '#ff9800',
        info: '#2196f3'
    };

    const alertDiv = document.createElement('div');
    alertDiv.className = `alert alert-${type}`;
    alertDiv.style.cssText = `
        position: fixed;
        top: 20px;
        right: 20px;
        z-index: 1000;
        max-width: 400px;
        padding: 15px;
        border-radius: 6px;
        color: white;
        font-weight: 500;
        box-shadow: 0 4px 12px rgba(0,0,0,0.3);
    `;

    // Own-property check so keys inherited from Object.prototype
    // (e.g. "constructor") never resolve to a non-colour value.
    const hasColor = Object.prototype.hasOwnProperty.call(colors, type);
    alertDiv.style.backgroundColor = hasColor ? colors[type] : colors.info;

    const row = document.createElement('div');
    row.style.cssText = 'display: flex; align-items: center; justify-content: space-between;';

    // textContent, not innerHTML: the message may contain server-supplied text.
    const text = document.createElement('span');
    text.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.setAttribute('aria-label', 'Close');
    closeButton.textContent = '\u00D7'; // multiplication sign, same glyph as &times;
    closeButton.style.cssText = 'background: none; border: none; color: white; font-size: 1.2rem; cursor: pointer; margin-left: 10px;';
    closeButton.addEventListener('click', () => alertDiv.remove());

    row.appendChild(text);
    row.appendChild(closeButton);
    alertDiv.appendChild(row);
    document.body.appendChild(alertDiv);

    // Auto-remove after 5 seconds unless the user already dismissed it.
    setTimeout(() => {
        if (alertDiv.parentElement) {
            alertDiv.remove();
        }
    }, 5000);
}
1135
+ </script>
1136
+ {% endblock %}