rakam-eval-sdk 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/PKG-INFO +1 -1
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/pyproject.toml +1 -1
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/cli.py +163 -25
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/client.py +37 -31
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/schema.py +21 -0
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/README.md +0 -0
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/__init__.py +0 -0
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/decorators.py +0 -0
- {rakam_eval_sdk-0.2.3 → rakam_eval_sdk-0.2.4}/src/rakam_eval_sdk/utils/decorator_utils.py +0 -0
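
The headline changes in 0.2.4 are paginated output for the run-listing command, a richer `compare` command with colorized diff/summary rendering, and new `MetricDiff` / `TestCaseComparison` models in `schema.py`. Below is a minimal sketch of the updated listing flow; it assumes the backend URL and `EVALFRAMEWORK_API_KEY` are already configured in the environment, and the `items`/`total` response shape is taken from the new CLI code shown further down.

```python
# Sketch of the 0.2.4 paginated listing flow (not part of the package itself).
# Assumes the backend URL and EVALFRAMEWORK_API_KEY are already configured.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# New in 0.2.4: the response carries items plus pagination metadata.
response = client.list_evaluation_testcases(limit=10, offset=0, raise_exception=True)
items = response.get("items", [])
total = response.get("total", 0)

for run in items:
    print(run.get("id"), run.get("tag") or "-", run.get("label") or "-", run.get("created_at"))

if len(items) < total:
    print(f"Showing {len(items)} of {total} runs")
```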
src/rakam_eval_sdk/cli.py (+163 -25)

```diff
@@ -6,16 +6,18 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional
 
 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
+from typer import secho
 
 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
@@ -163,19 +165,22 @@ def list_runs(
     """
     client = DeepEvalClient()
 
-
+    response = client.list_evaluation_testcases(
         limit=limit,
         offset=offset,
         raise_exception=True,
     )
 
-
+    items = response.get("items", [])
+    total = response.get("total", 0)
+
+    if not items:
         typer.echo("No evaluation runs found.")
         return
 
-    typer.echo(f"[id]
-
-    for run in
+    typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+    for run in items:
         run_id = run.get("id")
         label = run.get("label") or "-"
         uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
         except ValueError:
             pass
 
-        typer.echo(
-
+        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+    shown = offset + len(items)
+    if shown < total:
+        typer.echo()
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")
 
 
 @app.command()
@@ -410,8 +419,7 @@ def _print_and_save(
         return
 
     if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)
 
     out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +430,109 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")
 
 
+def pct_change(a: float | None, b: float | None) -> str | None:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: float | None) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff):
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False):
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def print_summary(metrics: Any):
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
 @app.command()
 def compare(
     tag: list[str] = typer.Option(
         [],
         "--tag",
-        help="
+        help="Label identifying a reference testcase",
     ),
     run: list[int] = typer.Option(
         [],
@@ -437,7 +542,12 @@ def compare(
     pretty: bool = typer.Option(
         True,
         "--pretty/--raw",
-        help="Pretty
+        help="Pretty diff output (default) or raw JSON",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        help="Show summary only (no per-metric diff)",
     ),
     raise_exception: bool = typer.Option(
         False,
@@ -457,31 +567,39 @@ def compare(
     ),
 ) -> None:
     """
-    Compare two evaluation testcases
+    Compare two evaluation testcases (runs and/or labels).
+
+    Output:
+    - Unified diff-style view by default
+    - Raw JSON with --raw
+    - Summary of improvements / regressions
     """
 
-    targets = []
+    targets: list[tuple[str, str | int]] = []
 
     for r in run:
         targets.append(("run", r))
 
-    for
-        targets.append(("label",
+    for t in tag:
+        targets.append(("label", t))
 
     if len(targets) != 2:
-        typer.
-            "❌ Provide exactly two targets using --run and/or --
+        typer.secho(
+            "❌ Provide exactly two targets using --run and/or --tag",
+            fg="red",
+            bold=True,
         )
         raise typer.Exit(code=1)
 
-    client = DeepEvalClient()
-
     (type_a, value_a), (type_b, value_b) = targets
 
-    typer.
-        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+    typer.secho(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+        bold=True,
     )
 
+    client = DeepEvalClient()
+
     kwargs = {"raise_exception": raise_exception}
 
     if type_a == "run":
@@ -497,14 +615,34 @@ def compare(
     try:
         resp = client.compare_testcases(**kwargs)
     except Exception as e:
-        typer.
+        typer.secho(f"❌ Request failed: {e}", fg="red")
        raise typer.Exit(code=1)
 
     if not resp:
-        typer.
+        typer.secho("⚠️ No response received", fg="yellow")
         raise typer.Exit(code=1)
 
-
+    if out:
+        if out.exists() and not overwrite:
+            typer.secho(
+                f"❌ File already exists: {out} (use --overwrite)",
+                fg="red",
+            )
+            raise typer.Exit(code=1)
+
+        out.write_text(json.dumps(resp, indent=2))
+        typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+    if not pretty:
+        typer.echo(json.dumps(resp, indent=2))
+        return
+
+    comparison = TestCaseComparison(**resp)
+
+    pretty_print_comparison(
+        comparison,
+        summary_only=summary,
+    )
 
 
 @app.command(hidden=True)
```
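The helpers added to cli.py classify each metric delta and format its relative change; on the command line they back the new `--summary` and `--pretty/--raw` switches of `compare`. A small illustration of their behavior, importing from `rakam_eval_sdk.cli` (this assumes the CLI module imports cleanly outside of Typer):

```python
# Illustration of the helper functions introduced in cli.py (0.2.4).
from rakam_eval_sdk.cli import metric_direction, pct_change

print(metric_direction(0.05))   # "improved"  (positive delta)
print(metric_direction(-0.10))  # "regressed" (negative delta)
print(metric_direction(None))   # "unchanged" (no delta available)

# Relative change between two scores; returns None if either side is
# missing or the baseline score is 0.
print(pct_change(0.80, 0.90))   # "+12.50%"
print(pct_change(0.0, 0.90))    # None
```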
src/rakam_eval_sdk/client.py (+37 -31)

```diff
@@ -40,8 +40,7 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
-                "EVALFRAMEWORK_API_KEY", "")
+            api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout
 
@@ -50,10 +49,10 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json:
-        params:
+        json: Dict | None = None,
+        params: Dict | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"
 
         headers = {
@@ -90,16 +89,16 @@ class DeepEvalClient:
                 "raw": resp.text,
             }
 
-    def _get(self, endpoint: str, params:
+    def _get(self, endpoint: str, params: Dict, *args, **kw):
         return self._request("GET", endpoint, params=params, *args, **kw)
 
-    def _post(self, endpoint: str, payload:
+    def _post(self, endpoint: str, payload: Dict, *args, **kw):
         return self._request("POST", endpoint, json=payload, *args, **kw)
 
-    def _patch(self, endpoint: str, payload:
+    def _patch(self, endpoint: str, payload: Dict, *args, **kw):
         return self._request("PATCH", endpoint, json=payload, *args, **kw)
 
-    def _delete(self, endpoint: str, payload:
+    def _delete(self, endpoint: str, payload: Dict, *args, **kw):
         return self._request("DELETE", endpoint, json=payload, *args, **kw)
 
     def update_evaluation_testcase_tag(
@@ -133,10 +132,11 @@ class DeepEvalClient:
         limit: int = 10,
         offset: int = 0,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         List evaluation testcases for the current API token only.
         Sorted by created_at DESC (newest first).
+        Returns items + pagination metadata.
         """
         return self._get(
             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -202,7 +202,7 @@ class DeepEvalClient:
         testcase_b_id: int | None = None,
         testcase_b_tag: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare two evaluation testcases using IDs or tags.
         Exactly one identifier (id or tag) must be provided per testcase.
@@ -210,9 +210,7 @@ class DeepEvalClient:
 
         def validate(id_, tag, name: str):
             if bool(id_) == bool(tag):
-                raise ValueError(
-                    f"Provide exactly one of {name}_id or {name}_tag"
-                )
+                raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
 
         validate(testcase_a_id, testcase_a_tag, "testcase_a")
         validate(testcase_b_id, testcase_b_tag, "testcase_b")
@@ -241,7 +239,7 @@ class DeepEvalClient:
         label_a: str,
         label_b: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the latest evaluation testcases for two labels.
         """
@@ -259,7 +257,7 @@ class DeepEvalClient:
         *,
         label: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the last two evaluation testcases for a given label.
         """
@@ -277,7 +275,7 @@ class DeepEvalClient:
         config: EvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def text_eval(
@@ -288,7 +286,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def text_eval(
         self,
@@ -299,7 +297,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             config = EvalConfig(
                 data=data,
@@ -309,7 +307,9 @@ class DeepEvalClient:
             )
 
         return self._post(
-            endpoint="/deepeval/text-eval",
+            endpoint="/deepeval/text-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )
 
     def text_eval_background(
@@ -319,12 +319,16 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._post(
+        return self._post(
+            endpoint="/deepeval/text-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
+        )
 
     @overload
     def schema_eval(
@@ -335,7 +339,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def schema_eval(
@@ -343,7 +347,7 @@ class DeepEvalClient:
         config: SchemaEvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def schema_eval(
         self,
@@ -354,7 +358,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             if data is None or metrics is None:
                 raise ValueError(
@@ -381,13 +385,15 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._post(
-            endpoint="/deepeval/schema-eval/background",
+            endpoint="/deepeval/schema-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
         )
 
     def maybe_text_eval(
@@ -398,7 +404,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -421,7 +427,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -440,7 +446,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -463,7 +469,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
```
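With the helper return types now annotated as `Optional[Dict]`, the comparison endpoint is called the same way as before; the sketch below exercises the id/tag validation in `compare_testcases` (the ids and tag are placeholders, and the call goes to whatever backend the client is configured against):

```python
# Sketch of calling the comparison API (placeholder id/tag values).
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# Exactly one identifier (id or tag) must be provided per testcase.
resp = client.compare_testcases(
    testcase_a_id=12,
    testcase_b_tag="baseline",
    raise_exception=True,
)

# Passing both (or neither) identifier for one side raises ValueError:
#   "Provide exactly one of testcase_a_id or testcase_a_tag"
```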
src/rakam_eval_sdk/schema.py (+21 -0)

```diff
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+class MetricDiff(BaseModel):
+    metric: str
+    score_a: Optional[float]
+    score_b: Optional[float]
+    delta: Optional[float]
+
+    success_a: Optional[bool]
+    success_b: Optional[bool]
+
+    threshold_a: Optional[float]
+    threshold_b: Optional[float]
+
+    status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+class TestCaseComparison(BaseModel):
+    testcase_a_id: int
+    testcase_b_id: int
+    metrics: List[MetricDiff]
```
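These models validate the payload returned by the comparison endpoints, which the CLI then feeds to its diff printer. A hand-rolled example with made-up values:

```python
# Hand-rolled example of the new 0.2.4 schema models (values are made up).
from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

comparison = TestCaseComparison(
    testcase_a_id=12,
    testcase_b_id=15,
    metrics=[
        MetricDiff(
            metric="faithfulness",
            score_a=0.80, score_b=0.90, delta=0.10,
            success_a=True, success_b=True,
            threshold_a=0.70, threshold_b=0.70,
            status="changed",
        ),
    ],
)

# The CLI builds the same object from the raw response and renders it:
#   comparison = TestCaseComparison(**resp)
#   pretty_print_comparison(comparison, summary_only=False)
```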