eval-ai-library: eval_ai_library-0.3.10-py3-none-any.whl → eval_ai_library-0.3.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eval-ai-library might be problematic. Click here for more details.

eval_ai_library-0.3.12.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-ai-library
3
- Version: 0.3.10
3
+ Version: 0.3.12
4
4
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
5
5
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
6
6
  License: MIT
eval_ai_library-0.3.12.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
1
- eval_ai_library-0.3.10.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
2
- eval_lib/__init__.py,sha256=OMrncAoUbbrJXfaYf8k2wJEGw1e2r9k-s1uXkerZ9mE,3204
3
- eval_lib/cli.py,sha256=Fvnj6HgCQ3lhx28skweALgHSm3FMEpavQCB3o_sQhtE,4731
4
- eval_lib/dashboard_server.py,sha256=6ND7ujtzN0PdMyVmJFnKDWrIf4kaodnetLZRPUhYHas,6751
1
+ eval_ai_library-0.3.12.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
2
+ eval_lib/__init__.py,sha256=Q4pyJAdbOOGyYJG9U2nrcm6koA4YRl-vwtStysEFjok,3204
3
+ eval_lib/cli.py,sha256=cRjEZhDVpRaP8jnGva-Fv1dHfcQ2h8OBAmNxxcXf_ww,5440
4
+ eval_lib/dashboard_server.py,sha256=kVkXihQh7WwoWBxsdt9jADOwCJtuAsjIqw9eaoNpUqI,6768
5
5
  eval_lib/evaluate.py,sha256=LEjwPsuuPGpdwes-xXesCKtKlBFFMF5X1CpIGJIrZ20,12630
6
6
  eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
7
- eval_lib/html.py,sha256=_tBTtwxZpjIwc3TVOyLGDw2VFD77aAeA47JdovoZ0CI,24094
8
- eval_lib/llm_client.py,sha256=eeTVhCLR1uYbhqOEOSBt3wWPKuzgzA9v8m0F9f-4Gqg,14910
7
+ eval_lib/html.py,sha256=N4lSBI1LKuZ3Iqgm_Vjy2F1o1qb0kT0fgXukSYqDido,1709
8
+ eval_lib/llm_client.py,sha256=Emv4nq80VquvB0FMzLJt66UqimFNfRGOZ6__0ULTPAo,15813
9
9
  eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
10
10
  eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
11
11
  eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
@@ -31,8 +31,8 @@ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK
31
31
  eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
32
32
  eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
33
33
  eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
34
- eval_ai_library-0.3.10.dist-info/METADATA,sha256=pevxrimXqbreKbRwHZ0GBu_VXsfGhles6OMN2SBOJHo,47969
35
- eval_ai_library-0.3.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
- eval_ai_library-0.3.10.dist-info/entry_points.txt,sha256=VTDuJiTezDkBLQw1NWcRoOOuZPHqYgOCcVIoYno-L00,47
37
- eval_ai_library-0.3.10.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
38
- eval_ai_library-0.3.10.dist-info/RECORD,,
34
+ eval_ai_library-0.3.12.dist-info/METADATA,sha256=9nKU4DquxCgqMbqQ9DlEs7ex7uAw-qPUwV1bJleORXg,47969
35
+ eval_ai_library-0.3.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
+ eval_ai_library-0.3.12.dist-info/entry_points.txt,sha256=VTDuJiTezDkBLQw1NWcRoOOuZPHqYgOCcVIoYno-L00,47
37
+ eval_ai_library-0.3.12.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
38
+ eval_ai_library-0.3.12.dist-info/RECORD,,
eval_lib/__init__.py CHANGED
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
7
7
  and a wide range of evaluation metrics for RAG systems and AI agents.
8
8
  """
9
9
 
10
- __version__ = "0.3.10"
10
+ __version__ = "0.3.12"
11
11
  __author__ = "Aleksandr Meshkov"
12
12
 
13
13
  # Core evaluation functions
eval_lib/cli.py CHANGED
@@ -6,6 +6,8 @@ Command-line interface for Eval AI Library
6
6
  import argparse
7
7
  import sys
8
8
  from pathlib import Path
9
+ import os
10
+ import json
9
11
 
10
12
 
11
13
  def run_dashboard():
@@ -70,8 +72,11 @@ def run_dashboard():
70
72
  print(f" Press Ctrl+C to stop\n")
71
73
  print("="*70 + "\n")
72
74
 
73
- app = Flask(__name__)
75
+ static_folder = os.path.join(os.path.dirname(__file__), 'static')
76
+
77
+ app = Flask(__name__, static_folder=static_folder)
74
78
  app.config['WTF_CSRF_ENABLED'] = False
79
+ app.config['JSON_SORT_KEYS'] = False
75
80
 
76
81
  @app.route('/')
77
82
  def index():
@@ -93,7 +98,13 @@ def run_dashboard():
93
98
  cache = get_fresh_cache()
94
99
  latest = cache.get_latest()
95
100
  if latest:
96
- return jsonify(latest)
101
+ json_str = json.dumps(latest, ensure_ascii=False, sort_keys=False)
102
+ from flask import Response
103
+ return Response(
104
+ json_str,
105
+ mimetype='application/json',
106
+ headers={'Content-Type': 'application/json; charset=utf-8'}
107
+ )
97
108
  return jsonify({'error': 'No results available'}), 404
98
109
 
99
110
  @app.route('/api/sessions')
@@ -114,7 +125,13 @@ def run_dashboard():
114
125
  cache = get_fresh_cache()
115
126
  session = cache.get_by_session(session_id)
116
127
  if session:
117
- return jsonify(session)
128
+ json_str = json.dumps(session, ensure_ascii=False, sort_keys=False)
129
+ from flask import Response
130
+ return Response(
131
+ json_str,
132
+ mimetype='application/json',
133
+ headers={'Content-Type': 'application/json; charset=utf-8'}
134
+ )
118
135
  return jsonify({'error': 'Session not found'}), 404
119
136
 
120
137
  @app.route('/api/clear')
eval_lib/dashboard_server.py CHANGED
@@ -31,7 +31,7 @@ class DashboardCache:
31
31
  try:
32
32
  with open(self.cache_file, 'w', encoding='utf-8') as f:
33
33
  json.dump(self.results_history, f,
34
- indent=2, ensure_ascii=False)
34
+ indent=2, ensure_ascii=False, sort_keys=False)
35
35
  except Exception as e:
36
36
  print(f"Warning: Could not save cache: {e}")
37
37
 
eval_lib/html.py CHANGED
@@ -4,395 +4,24 @@ HTML_TEMPLATE = """
4
4
  <head>
5
5
  <meta charset="UTF-8">
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
- <title>Eval AI Library - Interactive Dashboard</title>
7
+ <title>Eval AI Library - Dashboard</title>
8
8
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
- <style>
10
- * {
11
- margin: 0;
12
- padding: 0;
13
- box-sizing: border-box;
14
- }
15
-
16
- body {
17
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
18
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
19
- padding: 20px;
20
- min-height: 100vh;
21
- }
22
-
23
- .container {
24
- max-width: 1400px;
25
- margin: 0 auto;
26
- background: white;
27
- border-radius: 20px;
28
- padding: 30px;
29
- box-shadow: 0 20px 60px rgba(0,0,0,0.3);
30
- }
31
-
32
- header {
33
- display: flex;
34
- justify-content: space-between;
35
- align-items: center;
36
- margin-bottom: 40px;
37
- padding-bottom: 20px;
38
- border-bottom: 3px solid #667eea;
39
- }
40
-
41
- h1 {
42
- color: #667eea;
43
- font-size: 2.5em;
44
- }
45
-
46
- .controls {
47
- display: flex;
48
- gap: 10px;
49
- align-items: center;
50
- }
51
-
52
- select, button {
53
- padding: 10px 20px;
54
- border-radius: 8px;
55
- border: 2px solid #667eea;
56
- background: white;
57
- color: #667eea;
58
- font-weight: 600;
59
- cursor: pointer;
60
- transition: all 0.3s;
61
- }
62
-
63
- button:hover {
64
- background: #667eea;
65
- color: white;
66
- }
67
-
68
- .timestamp {
69
- color: #666;
70
- font-size: 0.9em;
71
- margin-left: 20px;
72
- }
73
-
74
- .summary {
75
- display: grid;
76
- grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
77
- gap: 20px;
78
- margin-bottom: 40px;
79
- }
80
-
81
- .summary-card {
82
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
83
- color: white;
84
- padding: 25px;
85
- border-radius: 15px;
86
- text-align: center;
87
- box-shadow: 0 5px 15px rgba(0,0,0,0.2);
88
- transition: transform 0.3s;
89
- }
90
-
91
- .summary-card:hover {
92
- transform: translateY(-5px);
93
- }
94
-
95
- .summary-card h3 {
96
- font-size: 0.9em;
97
- margin-bottom: 10px;
98
- opacity: 0.9;
99
- }
100
-
101
- .summary-card .value {
102
- font-size: 2em;
103
- font-weight: bold;
104
- }
105
-
106
- .metrics-grid {
107
- display: grid;
108
- grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
109
- gap: 20px;
110
- margin-bottom: 40px;
111
- }
112
-
113
- .metric-card {
114
- background: #f8f9fa;
115
- border-radius: 15px;
116
- padding: 20px;
117
- box-shadow: 0 3px 10px rgba(0,0,0,0.1);
118
- transition: transform 0.3s;
119
- }
120
-
121
- .metric-card:hover {
122
- transform: translateY(-5px);
123
- }
124
-
125
- .metric-card h3 {
126
- color: #667eea;
127
- margin-bottom: 15px;
128
- font-size: 1.1em;
129
- }
130
-
131
- .metric-score {
132
- font-size: 2.5em;
133
- font-weight: bold;
134
- color: #764ba2;
135
- margin-bottom: 15px;
136
- }
137
-
138
- .metric-details p {
139
- margin: 8px 0;
140
- color: #555;
141
- font-size: 0.9em;
142
- }
143
-
144
- .charts {
145
- display: grid;
146
- grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
147
- gap: 30px;
148
- margin-bottom: 40px;
149
- }
150
-
151
- .chart-container {
152
- background: #f8f9fa;
153
- border-radius: 15px;
154
- padding: 20px;
155
- box-shadow: 0 3px 10px rgba(0,0,0,0.1);
156
- }
157
-
158
- .chart-container h2 {
159
- color: #667eea;
160
- margin-bottom: 20px;
161
- font-size: 1.3em;
162
- }
163
-
164
- table {
165
- width: 100%;
166
- border-collapse: collapse;
167
- background: white;
168
- border-radius: 10px;
169
- overflow: hidden;
170
- box-shadow: 0 3px 10px rgba(0,0,0,0.1);
171
- }
172
-
173
- th {
174
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
175
- color: white;
176
- padding: 15px;
177
- text-align: left;
178
- font-weight: 600;
179
- cursor: pointer;
180
- user-select: none;
181
- }
182
-
183
- th:hover {
184
- background: linear-gradient(135deg, #5568d3 0%, #653a8b 100%);
185
- }
186
-
187
- td {
188
- padding: 12px 15px;
189
- border-bottom: 1px solid #eee;
190
- font-size: 0.9em;
191
- }
192
-
193
- tr.success {
194
- background: #f0fdf4;
195
- }
196
-
197
- tr.failed {
198
- background: #fef2f2;
199
- }
200
-
201
- tr:hover {
202
- background: #f8f9fa !important;
203
- }
204
-
205
- .reason {
206
- max-width: 300px;
207
- color: #666;
208
- }
209
-
210
- .view-details-btn {
211
- background: #667eea;
212
- color: white;
213
- border: none;
214
- padding: 5px 12px;
215
- border-radius: 5px;
216
- cursor: pointer;
217
- font-size: 0.85em;
218
- transition: all 0.3s;
219
- }
220
-
221
- .view-details-btn:hover {
222
- background: #5568d3;
223
- transform: scale(1.05);
224
- }
225
-
226
- /* Modal styles */
227
- .modal {
228
- display: none;
229
- position: fixed;
230
- z-index: 1000;
231
- left: 0;
232
- top: 0;
233
- width: 100%;
234
- height: 100%;
235
- overflow: auto;
236
- background-color: rgba(0,0,0,0.7);
237
- animation: fadeIn 0.3s;
238
- }
239
-
240
- @keyframes fadeIn {
241
- from { opacity: 0; }
242
- to { opacity: 1; }
243
- }
244
-
245
- .modal-content {
246
- background-color: #fefefe;
247
- margin: 2% auto;
248
- padding: 30px;
249
- border-radius: 15px;
250
- width: 90%;
251
- max-width: 900px;
252
- max-height: 90vh;
253
- overflow-y: auto;
254
- box-shadow: 0 20px 60px rgba(0,0,0,0.3);
255
- animation: slideIn 0.3s;
256
- }
257
-
258
- @keyframes slideIn {
259
- from {
260
- transform: translateY(-50px);
261
- opacity: 0;
262
- }
263
- to {
264
- transform: translateY(0);
265
- opacity: 1;
266
- }
267
- }
268
-
269
- .modal-header {
270
- display: flex;
271
- justify-content: space-between;
272
- align-items: center;
273
- margin-bottom: 20px;
274
- padding-bottom: 15px;
275
- border-bottom: 2px solid #667eea;
276
- }
277
-
278
- .modal-header h2 {
279
- color: #667eea;
280
- margin: 0;
281
- }
282
-
283
- .close {
284
- color: #aaa;
285
- font-size: 35px;
286
- font-weight: bold;
287
- cursor: pointer;
288
- transition: color 0.3s;
289
- }
290
-
291
- .close:hover {
292
- color: #667eea;
293
- }
294
-
295
- .detail-section {
296
- margin: 20px 0;
297
- padding: 15px;
298
- background: #f8f9fa;
299
- border-radius: 10px;
300
- border-left: 4px solid #667eea;
301
- }
302
-
303
- .detail-section h3 {
304
- color: #667eea;
305
- margin-bottom: 10px;
306
- font-size: 1.1em;
307
- }
308
-
309
- .detail-section pre {
310
- background: white;
311
- padding: 15px;
312
- border-radius: 8px;
313
- overflow-x: auto;
314
- font-size: 0.85em;
315
- line-height: 1.5;
316
- }
317
-
318
- .detail-section p {
319
- margin: 8px 0;
320
- color: #555;
321
- line-height: 1.6;
322
- }
323
-
324
- .badge {
325
- display: inline-block;
326
- padding: 4px 10px;
327
- border-radius: 12px;
328
- font-size: 0.8em;
329
- font-weight: 600;
330
- margin-right: 8px;
331
- }
332
-
333
- .badge-success {
334
- background: #d1fae5;
335
- color: #065f46;
336
- }
337
-
338
- .badge-failed {
339
- background: #fee2e2;
340
- color: #991b1b;
341
- }
342
-
343
- .loading {
344
- text-align: center;
345
- padding: 40px;
346
- color: #667eea;
347
- font-size: 1.2em;
348
- }
349
-
350
- .no-data {
351
- text-align: center;
352
- padding: 60px;
353
- color: #999;
354
- }
355
-
356
- .no-data h2 {
357
- color: #667eea;
358
- margin-bottom: 20px;
359
- }
360
-
361
- @media (max-width: 768px) {
362
- .charts {
363
- grid-template-columns: 1fr;
364
- }
365
-
366
- .metrics-grid {
367
- grid-template-columns: 1fr;
368
- }
369
-
370
- header {
371
- flex-direction: column;
372
- gap: 15px;
373
- }
374
-
375
- .modal-content {
376
- width: 95%;
377
- margin: 5% auto;
378
- padding: 20px;
379
- }
380
- }
381
- </style>
9
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
10
+ <link rel="stylesheet" href="{{ url_for('static', filename='dashboard.css') }}">
382
11
  </head>
383
12
  <body>
384
13
  <div class="container">
385
14
  <header>
386
15
  <div>
387
- <h1>📊 Eval AI Library Dashboard</h1>
388
- <span class="timestamp" id="timestamp">Loading...</span>
16
+ <h1>Eval AI Library Dashboard</h1>
17
+ <div class="timestamp" id="timestamp">Loading...</div>
389
18
  </div>
390
19
  <div class="controls">
391
20
  <select id="sessionSelect" onchange="loadSession()">
392
21
  <option value="">Loading sessions...</option>
393
22
  </select>
394
- <button onclick="refreshData()">🔄 Refresh</button>
395
- <button onclick="clearCache()">🗑️ Clear Cache</button>
23
+ <button onclick="refreshData()">Refresh</button>
24
+ <button class="primary" onclick="clearCache()">Clear Cache</button>
396
25
  </div>
397
26
  </header>
398
27
 
@@ -401,336 +30,20 @@ HTML_TEMPLATE = """
401
30
  </div>
402
31
  </div>
403
32
 
404
- <!-- Modal для детальной информации -->
33
+ <!-- Modal for detailed information -->
405
34
  <div id="detailsModal" class="modal">
406
35
  <div class="modal-content">
407
36
  <div class="modal-header">
408
- <h2>📋 Evaluation Details</h2>
37
+ <div class="test-status">
38
+ <h2 id="modalTitle">Test Details</h2>
39
+ </div>
409
40
  <span class="close" onclick="closeModal()">&times;</span>
410
41
  </div>
411
- <div id="modalBody"></div>
42
+ <div class="modal-body" id="modalBody"></div>
412
43
  </div>
413
44
  </div>
414
45
 
415
- <script>
416
- let currentData = null;
417
- let scoresChart = null;
418
- let successChart = null;
419
-
420
- // Загрузить список сессий
421
- async function loadSessions() {
422
- try {
423
- const response = await fetch('/api/sessions');
424
- const sessions = await response.json();
425
-
426
- const select = document.getElementById('sessionSelect');
427
- select.innerHTML = '<option value="latest">Latest Results</option>';
428
-
429
- sessions.reverse().forEach(session => {
430
- const option = document.createElement('option');
431
- option.value = session.session_id;
432
- option.textContent = `${session.session_id} (${session.timestamp}) - ${session.total_tests} tests`;
433
- select.appendChild(option);
434
- });
435
- } catch (error) {
436
- console.error('Error loading sessions:', error);
437
- }
438
- }
439
-
440
- // Загрузить данные сессии
441
- async function loadSession() {
442
- const select = document.getElementById('sessionSelect');
443
- const sessionId = select.value;
444
-
445
- try {
446
- let url = '/api/latest';
447
- if (sessionId && sessionId !== 'latest') {
448
- url = `/api/session/${sessionId}`;
449
- }
450
-
451
- const response = await fetch(url);
452
- if (!response.ok) {
453
- showNoData();
454
- return;
455
- }
456
-
457
- const session = await response.json();
458
- currentData = session.data;
459
- renderDashboard(session);
460
- } catch (error) {
461
- console.error('Error loading session:', error);
462
- showNoData();
463
- }
464
- }
465
-
466
- // Показать "нет данных"
467
- function showNoData() {
468
- document.getElementById('content').innerHTML = `
469
- <div class="no-data">
470
- <h2>No evaluation results available</h2>
471
- <p>Run an evaluation with <code>show_dashboard=True</code> to see results here.</p>
472
- </div>
473
- `;
474
- }
475
-
476
- // Отрисовать дашборд
477
- function renderDashboard(session) {
478
- const data = session.data;
479
- document.getElementById('timestamp').textContent = `Generated: ${session.timestamp}`;
480
-
481
- const metricsLabels = Object.keys(data.metrics_summary);
482
- const metricsScores = metricsLabels.map(m => data.metrics_summary[m].avg_score);
483
- const metricsSuccessRates = metricsLabels.map(m => data.metrics_summary[m].success_rate);
484
-
485
- let metricCards = '';
486
- for (const [metricName, metricData] of Object.entries(data.metrics_summary)) {
487
- metricCards += `
488
- <div class="metric-card">
489
- <h3>${metricName}</h3>
490
- <div class="metric-score">${metricData.avg_score.toFixed(3)}</div>
491
- <div class="metric-details">
492
- <p>✅ Passed: ${metricData.passed}</p>
493
- <p>❌ Failed: ${metricData.failed}</p>
494
- <p>📊 Success Rate: ${metricData.success_rate.toFixed(1)}%</p>
495
- <p>🎯 Threshold: ${metricData.threshold}</p>
496
- <p>🤖 Model: ${metricData.model}</p>
497
- <p>💰 Total Cost: $${metricData.total_cost.toFixed(6)}</p>
498
- </div>
499
- </div>
500
- `;
501
- }
502
-
503
- let tableRows = '';
504
- data.test_cases.forEach((testCase, tcIdx) => {
505
- testCase.metrics.forEach((metric, mIdx) => {
506
- const statusEmoji = metric.success ? '✅' : '❌';
507
- const statusClass = metric.success ? 'success' : 'failed';
508
-
509
- tableRows += `
510
- <tr class="${statusClass}">
511
- <td>${testCase.test_index}</td>
512
- <td>${testCase.input}</td>
513
- <td>${metric.name}</td>
514
- <td>${metric.score.toFixed(3)}</td>
515
- <td>${metric.threshold}</td>
516
- <td>${statusEmoji}</td>
517
- <td>${metric.evaluation_model}</td>
518
- <td>$${(metric.evaluation_cost || 0).toFixed(6)}</td>
519
- <td>
520
- <button class="view-details-btn" onclick="showDetails(${tcIdx}, ${mIdx})">
521
- View Details
522
- </button>
523
- </td>
524
- </tr>
525
- `;
526
- });
527
- });
528
-
529
- document.getElementById('content').innerHTML = `
530
- <div class="summary">
531
- <div class="summary-card">
532
- <h3>Total Tests</h3>
533
- <div class="value">${data.total_tests}</div>
534
- </div>
535
- <div class="summary-card">
536
- <h3>Total Cost</h3>
537
- <div class="value">$${data.total_cost.toFixed(6)}</div>
538
- </div>
539
- <div class="summary-card">
540
- <h3>Metrics</h3>
541
- <div class="value">${metricsLabels.length}</div>
542
- </div>
543
- </div>
544
-
545
- <h2 style="color: #667eea; margin-bottom: 20px;">📈 Metrics Summary</h2>
546
- <div class="metrics-grid">
547
- ${metricCards}
548
- </div>
549
-
550
- <h2 style="color: #667eea; margin-bottom: 20px;">📊 Charts</h2>
551
- <div class="charts">
552
- <div class="chart-container">
553
- <h2>Average Scores by Metric</h2>
554
- <canvas id="scoresChart"></canvas>
555
- </div>
556
- <div class="chart-container">
557
- <h2>Success Rate by Metric</h2>
558
- <canvas id="successChart"></canvas>
559
- </div>
560
- </div>
561
-
562
- <h2 style="color: #667eea; margin: 40px 0 20px 0;">📋 Detailed Results</h2>
563
- <table>
564
- <thead>
565
- <tr>
566
- <th>Test #</th>
567
- <th>Input</th>
568
- <th>Metric</th>
569
- <th>Score</th>
570
- <th>Threshold</th>
571
- <th>Status</th>
572
- <th>Model</th>
573
- <th>Cost</th>
574
- <th>Actions</th>
575
- </tr>
576
- </thead>
577
- <tbody>
578
- ${tableRows}
579
- </tbody>
580
- </table>
581
- `;
582
-
583
- renderCharts(metricsLabels, metricsScores, metricsSuccessRates);
584
- }
585
-
586
- // Показать детали в модальном окне
587
- function showDetails(testCaseIdx, metricIdx) {
588
- const testCase = currentData.test_cases[testCaseIdx];
589
- const metric = testCase.metrics[metricIdx];
590
-
591
- const statusBadge = metric.success
592
- ? '<span class="badge badge-success">✅ PASSED</span>'
593
- : '<span class="badge badge-failed">❌ FAILED</span>';
594
-
595
- let modalContent = `
596
- <div class="detail-section">
597
- <h3>Test Case #${testCase.test_index}</h3>
598
- <p><strong>Input:</strong> ${testCase.input_full}</p>
599
- <p><strong>Actual Output:</strong> ${testCase.actual_output_full || 'N/A'}</p>
600
- <p><strong>Expected Output:</strong> ${testCase.expected_output_full || 'N/A'}</p>
601
- </div>
602
-
603
- <div class="detail-section">
604
- <h3>Metric: ${metric.name}</h3>
605
- ${statusBadge}
606
- <p><strong>Score:</strong> ${metric.score.toFixed(3)} / ${metric.threshold}</p>
607
- <p><strong>Model:</strong> ${metric.evaluation_model}</p>
608
- <p><strong>Cost:</strong> $${(metric.evaluation_cost || 0).toFixed(6)}</p>
609
- </div>
610
-
611
- <div class="detail-section">
612
- <h3>Reason</h3>
613
- <p>${metric.reason_full || metric.reason}</p>
614
- </div>
615
- `;
616
-
617
- // Добавляем retrieval context если есть
618
- if (testCase.retrieval_context && testCase.retrieval_context.length > 0) {
619
- modalContent += `
620
- <div class="detail-section">
621
- <h3>Retrieval Context (${testCase.retrieval_context.length} chunks)</h3>
622
- ${testCase.retrieval_context.map((ctx, idx) => `
623
- <p><strong>Chunk ${idx + 1}:</strong></p>
624
- <p style="margin-left: 20px; color: #666;">${ctx.substring(0, 300)}${ctx.length > 300 ? '...' : ''}</p>
625
- `).join('')}
626
- </div>
627
- `;
628
- }
629
-
630
- // Добавляем evaluation log если есть
631
- if (metric.evaluation_log) {
632
- modalContent += `
633
- <div class="detail-section">
634
- <h3>Evaluation Log</h3>
635
- <pre>${JSON.stringify(metric.evaluation_log, null, 2)}</pre>
636
- </div>
637
- `;
638
- }
639
-
640
- document.getElementById('modalBody').innerHTML = modalContent;
641
- document.getElementById('detailsModal').style.display = 'block';
642
- }
643
-
644
- // Закрыть модальное окно
645
- function closeModal() {
646
- document.getElementById('detailsModal').style.display = 'none';
647
- }
648
-
649
- // Закрытие по клику вне модального окна
650
- window.onclick = function(event) {
651
- const modal = document.getElementById('detailsModal');
652
- if (event.target == modal) {
653
- closeModal();
654
- }
655
- }
656
-
657
- // Отрисовать графики
658
- function renderCharts(labels, scores, successRates) {
659
- if (scoresChart) scoresChart.destroy();
660
- if (successChart) successChart.destroy();
661
-
662
- const scoresCtx = document.getElementById('scoresChart').getContext('2d');
663
- scoresChart = new Chart(scoresCtx, {
664
- type: 'bar',
665
- data: {
666
- labels: labels,
667
- datasets: [{
668
- label: 'Average Score',
669
- data: scores,
670
- backgroundColor: 'rgba(102, 126, 234, 0.8)',
671
- borderColor: 'rgba(102, 126, 234, 1)',
672
- borderWidth: 2
673
- }]
674
- },
675
- options: {
676
- responsive: true,
677
- scales: {
678
- y: {
679
- beginAtZero: true,
680
- max: 1.0
681
- }
682
- }
683
- }
684
- });
685
-
686
- const successCtx = document.getElementById('successChart').getContext('2d');
687
- successChart = new Chart(successCtx, {
688
- type: 'doughnut',
689
- data: {
690
- labels: labels,
691
- datasets: [{
692
- label: 'Success Rate (%)',
693
- data: successRates,
694
- backgroundColor: [
695
- 'rgba(102, 126, 234, 0.8)',
696
- 'rgba(118, 75, 162, 0.8)',
697
- 'rgba(237, 100, 166, 0.8)',
698
- 'rgba(255, 154, 158, 0.8)',
699
- 'rgba(250, 208, 196, 0.8)'
700
- ],
701
- borderWidth: 2
702
- }]
703
- },
704
- options: {
705
- responsive: true
706
- }
707
- });
708
- }
709
-
710
- // Обновить данные
711
- function refreshData() {
712
- loadSessions();
713
- loadSession();
714
- }
715
-
716
- // Очистить кеш
717
- async function clearCache() {
718
- if (confirm('Are you sure you want to clear all cached results?')) {
719
- try {
720
- await fetch('/api/clear');
721
- alert('Cache cleared!');
722
- refreshData();
723
- } catch (error) {
724
- console.error('Error clearing cache:', error);
725
- alert('Error clearing cache');
726
- }
727
- }
728
- }
729
-
730
- // Инициализация
731
- loadSessions();
732
- loadSession();
733
- </script>
46
+ <script src="{{ url_for('static', filename='dashboard.js') }}"></script>
734
47
  </body>
735
48
  </html>
736
49
  """
eval_lib/llm_client.py CHANGED
@@ -47,6 +47,29 @@ class CustomLLMClient(ABC):
47
47
  """
48
48
  pass
49
49
 
50
+ async def get_embeddings(
51
+ self,
52
+ texts: list[str],
53
+ model: str = "text-embedding-3-small"
54
+ ) -> tuple[list[list[float]], Optional[float]]:
55
+ """
56
+ Get embeddings for texts (optional implementation).
57
+
58
+ Args:
59
+ texts: List of texts to embed
60
+ model: Embedding model name
61
+
62
+ Returns:
63
+ Tuple of (embeddings_list, cost_in_usd)
64
+
65
+ Raises:
66
+ NotImplementedError: If custom client doesn't support embeddings
67
+ """
68
+ raise NotImplementedError(
69
+ f"{self.__class__.__name__} does not support embeddings. "
70
+ "Implement get_embeddings() method or use OpenAI for embeddings."
71
+ )
72
+
50
73
  @abstractmethod
51
74
  def get_model_name(self) -> str:
52
75
  """Return the model name for logging/tracking purposes."""
@@ -405,14 +428,14 @@ def _calculate_cost(llm: LLMDescriptor, usage) -> Optional[float]:
405
428
 
406
429
 
407
430
  async def get_embeddings(
408
- model: str | tuple[str, str] | LLMDescriptor,
431
+ model: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
409
432
  texts: list[str],
410
433
  ) -> tuple[list[list[float]], Optional[float]]:
411
434
  """
412
- Get embeddings for a list of texts using OpenAI models.
435
+ Get embeddings for a list of texts.
413
436
 
414
437
  Args:
415
- model: Model specification (e.g., "openai:text-embedding-3-small")
438
+ model: Model specification or CustomLLMClient instance
416
439
  texts: List of texts to embed
417
440
 
418
441
  Returns:
@@ -420,8 +443,13 @@ async def get_embeddings(
420
443
 
421
444
  Raises:
422
445
  LLMConfigurationError: If required API keys are missing
423
- ValueError: If non-OpenAI provider is specified
446
+ ValueError: If provider doesn't support embeddings
447
+ NotImplementedError: If CustomLLMClient doesn't implement get_embeddings
424
448
  """
449
+ # Handle custom LLM clients
450
+ if isinstance(model, CustomLLMClient):
451
+ return await model.get_embeddings(texts)
452
+
425
453
  llm = LLMDescriptor.parse(model)
426
454
 
427
455
  if llm.provider != Provider.OPENAI: