rakam-eval-sdk 0.2.3__py3-none-any.whl → 0.2.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +182 -41
- rakam_eval_sdk/client.py +104 -52
- rakam_eval_sdk/schema.py +22 -1
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4rc1.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.2.3.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
@@ -6,16 +6,18 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional,
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
+from typer import secho
 
 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
@@ -35,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")
 
 
-def extract_metric_names(config: Any) -> list[tuple[str, str]]:
+def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
     """
     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
     """
     if not hasattr(config, "metrics"):
         return []
 
-    results: list[tuple[str, str]] = []
+    results: list[tuple[str, Optional[str]]] = []
 
     for metric in config.metrics or []:
         metric_type = getattr(metric, "type", None)
@@ -75,7 +77,7 @@ def metrics(
     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
     TARGET_DECORATOR = eval_run.__name__
 
-    all_metrics: set[tuple[str, str]] = set()
+    all_metrics: set[tuple[str, Optional[str]]] = set()
     found_any = False
 
     for file in sorted(files):
@@ -157,25 +159,28 @@ def list_evals(
 def list_runs(
     limit: int = typer.Option(20, help="Max number of runs"),
     offset: int = typer.Option(0, help="Pagination offset"),
-):
+) -> None:
     """
     List evaluation runs (newest first).
     """
     client = DeepEvalClient()
 
-    runs = client.list_evaluation_testcases(
+    response = client.list_evaluation_testcases(
         limit=limit,
         offset=offset,
         raise_exception=True,
     )
+    assert response is not None
+    items = response.get("items", [])
+    total = response.get("total", 0)
 
-    if not runs:
+    if not items:
         typer.echo("No evaluation runs found.")
         return
 
-    typer.echo(f"[id]
-
-    for run in runs:
+    typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+    for run in items:
         run_id = run.get("id")
         label = run.get("label") or "-"
         uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
         except ValueError:
             pass
 
-        typer.echo(
-
+        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+    shown = offset + len(items)
+    if shown < total:
+        typer.echo()
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")
 
 
 @app.command()
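A note on the new pagination footer in list_runs: the command now reads items and total from the response and prints the hint only when the current page is not the last one. The arithmetic, with made-up numbers (a server holding 45 runs, queried with limit=20 and offset=20):

items = ["run"] * 20  # the 20 runs returned for this page (illustrative)
total = 45
offset = 20

shown = offset + len(items)  # 40
if shown < total:
    print(f"Showing {shown} of {total} runs. Use --limit to see more.")
# -> Showing 40 of 45 runs. Use --limit to see more.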
@@ -210,15 +219,15 @@ def show(
         "--raw",
         help="Print raw JSON instead of formatted output",
     ),
-):
+) -> None:
     """
     Show a single evaluation testcase by ID or tag.
     """
     if not id and not tag:
-        raise typer.BadParameter("You must provide either --id or --
+        raise typer.BadParameter("You must provide either --id or --tag")
 
     if id and tag:
-        raise typer.BadParameter("Provide only one of --id or --
+        raise typer.BadParameter("Provide only one of --id or --tag")
 
     client = DeepEvalClient()
 
@@ -226,6 +235,7 @@ def show(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
+        assert tag is not None
         result = client.get_evaluation_testcase_by_tag(tag)
         identifier = f"tag={tag}"
 
@@ -397,7 +407,7 @@ def run(
 def _print_and_save(
     resp: dict,
     pretty: bool,
-    out: Path,
+    out: Optional[Path],
     overwrite: bool,
 ) -> None:
     if pretty:
@@ -410,8 +420,7 @@ def _print_and_save(
         return
 
     if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)
 
     out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +431,109 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")
 
 
+def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: Optional[float]) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff) -> None:
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def print_summary(metrics: Any) -> None:
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
 @app.command()
 def compare(
     tag: list[str] = typer.Option(
         [],
         "--tag",
-        help="
+        help="Label identifying a reference testcase",
     ),
     run: list[int] = typer.Option(
         [],
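The diff helpers added above are pure functions, which makes their conventions easy to pin down: pct_change returns a signed percentage string and refuses to divide by a zero or missing baseline, and metric_direction treats a positive delta as an improvement, i.e. higher scores are assumed better. A quick check with invented scores:

from rakam_eval_sdk.cli import metric_direction, pct_change

print(pct_change(0.80, 0.92))   # +15.00%
print(pct_change(0.0, 0.5))     # None (zero baseline, avoids division by zero)
print(pct_change(None, 0.5))    # None (missing score)
print(metric_direction(0.12))   # improved
print(metric_direction(-0.07))  # regressed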
@@ -437,14 +543,19 @@ def compare(
     pretty: bool = typer.Option(
         True,
         "--pretty/--raw",
-        help="Pretty
+        help="Pretty diff output (default) or raw JSON",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        help="Show summary only (no per-metric diff)",
     ),
     raise_exception: bool = typer.Option(
         False,
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -457,32 +568,40 @@ def compare(
     ),
 ) -> None:
     """
-    Compare two evaluation testcases
+    Compare two evaluation testcases (runs and/or labels).
+
+    Output:
+    - Unified diff-style view by default
+    - Raw JSON with --raw
+    - Summary of improvements / regressions
     """
 
-    targets = []
+    targets: List[Tuple[str, Union[str, int]]] = []
 
     for r in run:
         targets.append(("run", r))
 
-    for
-        targets.append(("label",
+    for t in tag:
+        targets.append(("label", t))
 
     if len(targets) != 2:
-        typer.echo(
-            "❌ Provide exactly two targets using --run and/or --
+        typer.secho(
+            "❌ Provide exactly two targets using --run and/or --tag",
+            fg="red",
+            bold=True,
         )
         raise typer.Exit(code=1)
 
-    client = DeepEvalClient()
-
     (type_a, value_a), (type_b, value_b) = targets
 
-    typer.echo(
-        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+    typer.secho(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+        bold=True,
     )
 
-    kwargs = {"raise_exception": raise_exception}
+    client = DeepEvalClient()
+
+    kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
 
     if type_a == "run":
         kwargs["testcase_a_id"] = value_a
@@ -497,14 +616,34 @@ def compare(
     try:
         resp = client.compare_testcases(**kwargs)
     except Exception as e:
-        typer.
+        typer.secho(f"❌ Request failed: {e}", fg="red")
         raise typer.Exit(code=1)
 
     if not resp:
-        typer.
+        typer.secho("⚠️ No response received", fg="yellow")
         raise typer.Exit(code=1)
 
-
+    if out:
+        if out.exists() and not overwrite:
+            typer.secho(
+                f"❌ File already exists: {out} (use --overwrite)",
+                fg="red",
+            )
+            raise typer.Exit(code=1)
+
+        out.write_text(json.dumps(resp, indent=2))
+        typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+    if not pretty:
+        typer.echo(json.dumps(resp, indent=2))
+        return
+
+    comparison = TestCaseComparison(**resp)
+
+    pretty_print_comparison(
+        comparison,
+        summary_only=summary,
+    )
 
 
 @app.command(hidden=True)
@@ -527,7 +666,7 @@ def compare_label_latest(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -579,7 +718,7 @@ def compare_last(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -631,7 +770,7 @@ def tag_command(
         "--delete",
         help="Delete a tag",
     ),
-):
+) -> None:
     """
     Assign a tag to a run or delete a tag.
     """
@@ -648,8 +787,9 @@ def tag_command(
 
     client = DeepEvalClient()
 
-    # --- delete mode ---
     if delete:
+        assert run_id is not None
+
         result = client.update_evaluation_testcase_tag(
             testcase_id=run_id,
             tag=delete,
@@ -658,13 +798,14 @@ def tag_command(
         typer.echo("🗑️ Tag deleted successfully")
         typer.echo(f"Tag: {delete}")
         return
-
-
+    assert run_id is not None
+    assert tag is not None
     result = client.update_evaluation_testcase_tag(
         testcase_id=run_id,
         tag=tag,
         raise_exception=True,
     )
+    assert result is not None
 
     typer.echo("✅ Tag assigned successfully")
     typer.echo(f"Run ID: {run_id}")
rakam_eval_sdk/client.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
 
 import requests
 
@@ -40,8 +40,7 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
-                "EVALFRAMEWORK_API_KEY", "")
+            api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout
 
@@ -50,10 +49,10 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json:
-        params:
+        json: Optional[Dict] = None,
+        params: Optional[Dict] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"
 
         headers = {
@@ -90,17 +89,63 @@ class DeepEvalClient:
             "raw": resp.text,
         }
 
-    def _get(
-
+    def _get(
+        self,
+        endpoint: str,
+        params: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _post(
-
+    def _post(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _patch(
-
+    def _patch(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "PATCH",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
-    def _delete(
-
+    def _delete(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "DELETE",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
     def update_evaluation_testcase_tag(
         self,
@@ -133,10 +178,11 @@ class DeepEvalClient:
         limit: int = 10,
         offset: int = 0,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         List evaluation testcases for the current API token only.
         Sorted by created_at DESC (newest first).
+        Returns items + pagination metadata.
         """
         return self._get(
             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -197,27 +243,25 @@ class DeepEvalClient:
     def compare_testcases(
         self,
         *,
-        testcase_a_id: int
-        testcase_a_tag: str
-        testcase_b_id: int
-        testcase_b_tag: str
+        testcase_a_id: Optional[int] = None,
+        testcase_a_tag: Optional[str] = None,
+        testcase_b_id: Optional[int] = None,
+        testcase_b_tag: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare two evaluation testcases using IDs or tags.
         Exactly one identifier (id or tag) must be provided per testcase.
         """
 
-        def validate(id_, tag, name: str):
+        def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
             if bool(id_) == bool(tag):
-                raise ValueError(
-                    f"Provide exactly one of {name}_id or {name}_tag"
-                )
+                raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
 
         validate(testcase_a_id, testcase_a_tag, "testcase_a")
         validate(testcase_b_id, testcase_b_tag, "testcase_b")
 
-        params:
+        params: Dict[str, Union[int, str]] = {}
 
         if testcase_a_id is not None:
             params["testcase_a_id"] = testcase_a_id
@@ -241,7 +285,7 @@ class DeepEvalClient:
         label_a: str,
         label_b: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the latest evaluation testcases for two labels.
         """
@@ -259,7 +303,7 @@ class DeepEvalClient:
         *,
         label: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the last two evaluation testcases for a given label.
         """
@@ -277,7 +321,7 @@ class DeepEvalClient:
         config: EvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def text_eval(
@@ -286,20 +330,20 @@ class DeepEvalClient:
         data: List[TextInputItem],
         metrics: List[MetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def text_eval(
         self,
-        config: EvalConfig
+        config: Optional[EvalConfig] = None,
         *,
-        data: List[TextInputItem]
-        metrics: List[MetricConfig]
+        data: Optional[List[TextInputItem]] = None,
+        metrics: Optional[List[MetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             config = EvalConfig(
                 data=data,
@@ -309,7 +353,9 @@ class DeepEvalClient:
             )
 
         return self._post(
-            endpoint="/deepeval/text-eval",
+            endpoint="/deepeval/text-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )
 
     def text_eval_background(
@@ -319,12 +365,16 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._post(
+        return self._post(
+            endpoint="/deepeval/text-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
+        )
 
     @overload
     def schema_eval(
@@ -333,9 +383,9 @@ class DeepEvalClient:
         data: List[SchemaInputItem],
         metrics: List[SchemaMetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     @overload
     def schema_eval(
@@ -343,18 +393,18 @@ class DeepEvalClient:
         config: SchemaEvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...
 
     def schema_eval(
         self,
-        config: SchemaEvalConfig
+        config: Optional[SchemaEvalConfig] = None,
         *,
-        data: List[SchemaInputItem]
-        metrics: List[SchemaMetricConfig]
+        data: Optional[List[SchemaInputItem]] = None,
+        metrics: Optional[List[SchemaMetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             if data is None or metrics is None:
                 raise ValueError(
@@ -381,13 +431,15 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._post(
-            endpoint="/deepeval/schema-eval/background",
+            endpoint="/deepeval/schema-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
         )
 
     def maybe_text_eval(
@@ -398,7 +450,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -421,7 +473,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -440,7 +492,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -463,7 +515,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
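All four maybe_* wrappers share one pattern: validate chance, then run the underlying eval with that probability, so chance=0.1 evaluates roughly one call in ten. The diff only shows their signatures, so this is a simplified sketch of the idiom, not the SDK's exact code:

import random

def maybe_run(chance: float):
    if not 0 <= chance <= 1:  # stands in for _validate_chance
        raise ValueError("chance must be between 0 and 1")
    if random.random() < chance:
        return "ran evaluation"  # stands in for text_eval(...)
    return None  # sampled out: no request is made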
rakam_eval_sdk/schema.py
CHANGED
@@ -1,6 +1,6 @@
 # Common base class for all metric configs
 import sys
-from typing import
+from typing import Any, Dict, List, Literal, Optional, Union
 
 # Base class (you can keep this abstract)
 from pydantic import BaseModel, Field
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+class MetricDiff(BaseModel):
+    metric: str
+    score_a: Optional[float]
+    score_b: Optional[float]
+    delta: Optional[float]
+
+    success_a: Optional[bool]
+    success_b: Optional[bool]
+
+    threshold_a: Optional[float]
+    threshold_b: Optional[float]
+
+    status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+class TestCaseComparison(BaseModel):
+    testcase_a_id: int
+    testcase_b_id: int
+    metrics: List[MetricDiff]
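The two new models give the compare endpoint's payload a typed shape. Since status is documented as a closed set of four strings, a Literal type could enforce it, but the diff keeps a plain str. A minimal round-trip with invented values:

from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

diff = MetricDiff(
    metric="faithfulness",
    score_a=0.71, score_b=0.64, delta=-0.07,
    success_a=True, success_b=False,
    threshold_a=0.7, threshold_b=0.7,
    status="changed",
)
comparison = TestCaseComparison(testcase_a_id=101, testcase_b_id=102, metrics=[diff])
print(comparison.model_dump())  # pydantic v2 serialization, as used elsewhere in the SDK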
rakam_eval_sdk-0.2.4rc1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=HjymI0UTKRgJb-ewDuINuTtH605MkcB9nD0qNlbdFyc,21905
+rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
+rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+rakam_eval_sdk/schema.py,sha256=DT-uQsE3XB7AAii68QACwIslLqXRQFb1c6efkqLQysI,3960
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk-0.2.4rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.4rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.4rc1.dist-info/METADATA,sha256=Sy2Ghp5wcrmS7AzBkhp8C8LVb0dTS7tSNtgEAJCfbU8,6022
+rakam_eval_sdk-0.2.4rc1.dist-info/RECORD,,
rakam_eval_sdk-0.2.3.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=gV4FsAde1WB5rRPLW2mRqTO4X0SZhvepcWH10KE5IIY,17642
-rakam_eval_sdk/client.py,sha256=sEXogMqtXgKGt_1Xuwg-S2Q2zRC6Y0ZVG_Wy1djbi28,14585
-rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
-rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
-rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
-rakam_eval_sdk-0.2.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.2.3.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.2.3.dist-info/METADATA,sha256=ybu9LsKuQw0gfhJGPW-wTO6LC4bJw7sDlVuUQJQRqZ8,6019
-rakam_eval_sdk-0.2.3.dist-info/RECORD,,
{rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/WHEEL
File without changes

{rakam_eval_sdk-0.2.3.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/entry_points.txt
File without changes