rakam-eval-sdk 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +306 -87
- rakam_eval_sdk/client.py +85 -50
- rakam_eval_sdk/schema.py +21 -0
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.2.2.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
@@ -6,16 +6,18 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional

 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
+from typer import secho

 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
@@ -35,55 +37,88 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")


+def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+    """
+    Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+    """
+    if not hasattr(config, "metrics"):
+        return []
+
+    results: list[tuple[str, str | None]] = []
+
+    for metric in config.metrics or []:
+        metric_type = getattr(metric, "type", None)
+        metric_name = getattr(metric, "name", None)
+        if metric_type:
+            results.append((metric_type, metric_name))
+
+    return results
+
+
 @metrics_app.command("list")
-def
-
-
-
-
+def metrics(
+    directory: Path = typer.Argument(
+        Path("./eval"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory to scan (default: ./eval)",
     ),
-
+    recursive: bool = typer.Option(
+        False,
+        "-r",
+        "--recursive",
+        help="Recursively search for Python files",
+    ),
+) -> None:
     """
-    List
+    List all metric types used by loaded eval configs.
     """
-
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+    TARGET_DECORATOR = eval_run.__name__

-
-
-        offset=0,
-        raise_exception=True,
-    )
+    all_metrics: set[tuple[str, str | None]] = set()
+    found_any = False

-
-
-
+    for file in sorted(files):
+        functions = find_decorated_functions(file, TARGET_DECORATOR)
+        if not functions:
+            continue

-
+        typer.echo(f"\n📄 {file}")

-
-
-
-
-
-            name = metric.get("name")
-            if name:
-                metric_names.add(name)
+        try:
+            module = load_module_from_path(file)
+        except Exception as e:
+            typer.echo(f" ❌ Failed to import module: {e}")
+            continue

-
+        for fn_name in functions:
+            try:
+                func = getattr(module, fn_name)
+                result = func()

-
+                metrics = extract_metric_names(result)
+                if not metrics:
+                    continue

-
-
-
+                found_any = True
+                for metric_type, metric_name in metrics:
+                    all_metrics.add((metric_type, metric_name))

-
-
-
-
+                    if metric_name:
+                        typer.echo(f" • {metric_type} (alias: {metric_name})")
+                    else:
+                        typer.echo(f" • {metric_type}")
+
+            except Exception as e:
+                typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+    if not found_any:
+        typer.echo("\nNo metrics found.")
+        raise typer.Exit(code=0)

-
-    typer.echo(f"- {name}")
+    typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


 @list_app.command("evals")
@@ -130,19 +165,22 @@ def list_runs(
     """
     client = DeepEvalClient()

-
+    response = client.list_evaluation_testcases(
         limit=limit,
         offset=offset,
         raise_exception=True,
     )

-
+    items = response.get("items", [])
+    total = response.get("total", 0)
+
+    if not items:
         typer.echo("No evaluation runs found.")
         return

-    typer.echo(f"[id]
-
-    for run in
+    typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+    for run in items:
         run_id = run.get("id")
         label = run.get("label") or "-"
         uid = run.get("tag") or "-"
@@ -156,8 +194,12 @@ def list_runs(
         except ValueError:
             pass

-        typer.echo(
-
+        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+    shown = offset + len(items)
+    if shown < total:
+        typer.echo()
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


 @app.command()
@@ -165,12 +207,12 @@ def show(
     id: Optional[int] = typer.Option(
         None,
         "--id",
-        help="
+        help="Run ID",
     ),
     tag: Optional[str] = typer.Option(
         None,
         "--tag",
-        help="
+        help="Run tag",
     ),
     raw: bool = typer.Option(
         False,
@@ -377,8 +419,7 @@ def _print_and_save(
         return

     if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)

     out.parent.mkdir(parents=True, exist_ok=True)
@@ -389,20 +430,124 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")


+def pct_change(a: float | None, b: float | None) -> str | None:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: float | None) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff):
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False):
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def print_summary(metrics: Any):
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
 @app.command()
 def compare(
-
-
-
+    tag: list[str] = typer.Option(
+        [],
+        "--tag",
+        help="Label identifying a reference testcase",
     ),
-
-
-
+    run: list[int] = typer.Option(
+        [],
+        "--run",
+        help="Run ID identifying an evaluation testcase",
     ),
     pretty: bool = typer.Option(
         True,
         "--pretty/--raw",
-        help="Pretty
+        help="Pretty diff output (default) or raw JSON",
+    ),
+    summary: bool = typer.Option(
+        False,
+        "--summary",
+        help="Show summary only (no per-metric diff)",
     ),
     raise_exception: bool = typer.Option(
         False,
@@ -422,29 +567,85 @@ def compare(
     ),
 ) -> None:
     """
-    Compare two
+    Compare two evaluation testcases (runs and/or labels).
+
+    Output:
+    - Unified diff-style view by default
+    - Raw JSON with --raw
+    - Summary of improvements / regressions
     """
+
+    targets: list[tuple[str, str | int]] = []
+
+    for r in run:
+        targets.append(("run", r))
+
+    for t in tag:
+        targets.append(("label", t))
+
+    if len(targets) != 2:
+        typer.secho(
+            "❌ Provide exactly two targets using --run and/or --tag",
+            fg="red",
+            bold=True,
+        )
+        raise typer.Exit(code=1)
+
+    (type_a, value_a), (type_b, value_b) = targets
+
+    typer.secho(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+        bold=True,
+    )
+
     client = DeepEvalClient()

-
+    kwargs = {"raise_exception": raise_exception}
+
+    if type_a == "run":
+        kwargs["testcase_a_id"] = value_a
+    else:
+        kwargs["testcase_a_tag"] = value_a
+
+    if type_b == "run":
+        kwargs["testcase_b_id"] = value_b
+    else:
+        kwargs["testcase_b_tag"] = value_b

     try:
-        resp = client.compare_testcases(
-            testcase_a_id=testcase_a_id,
-            testcase_b_id=testcase_b_id,
-            raise_exception=raise_exception,
-        )
+        resp = client.compare_testcases(**kwargs)
     except Exception as e:
-        typer.
+        typer.secho(f"❌ Request failed: {e}", fg="red")
         raise typer.Exit(code=1)

     if not resp:
-        typer.
+        typer.secho("⚠️ No response received", fg="yellow")
         raise typer.Exit(code=1)
-    _print_and_save(resp, pretty, out, overwrite)

+    if out:
+        if out.exists() and not overwrite:
+            typer.secho(
+                f"❌ File already exists: {out} (use --overwrite)",
+                fg="red",
+            )
+            raise typer.Exit(code=1)

-
+        out.write_text(json.dumps(resp, indent=2))
+        typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+    if not pretty:
+        typer.echo(json.dumps(resp, indent=2))
+        return
+
+    comparison = TestCaseComparison(**resp)
+
+    pretty_print_comparison(
+        comparison,
+        summary_only=summary,
+    )
+
+
+@app.command(hidden=True)
 def compare_label_latest(
     label_a: str = typer.Argument(
         ...,
@@ -500,7 +701,7 @@ def compare_label_latest(
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_last(
     label: str = typer.Argument(
         ...,
@@ -551,44 +752,62 @@ def compare_last(
     _print_and_save(resp, pretty, out, overwrite)


-@
-def
-    run_id: int = typer.
+@app.command("tag")
+def tag_command(
+    run_id: Optional[int] = typer.Option(
+        None,
+        "--run",
+        help="Evaluation run ID",
+    ),
     tag: Optional[str] = typer.Option(
         None,
         "--tag",
-        "
-        help="Tag to add or update",
+        help="Tag to assign to the run",
     ),
-
-
-        "--
-        help="
+    delete: Optional[str] = typer.Option(
+        None,
+        "--delete",
+        help="Delete a tag",
     ),
 ):
     """
-
+    Assign a tag to a run or delete a tag.
     """
-    if not tag and not remove:
-        typer.echo("❌ You must provide --tag or --remove")
-        raise typer.Exit(code=1)

-
-
-
+    # --- validation ---
+    if delete:
+        if run_id or tag:
+            typer.echo("❌ --delete cannot be used with --run or --tag")
+            raise typer.Exit(code=1)
+    else:
+        if not run_id or not tag:
+            typer.echo("❌ Use --run and --tag together, or --delete")
+            raise typer.Exit(code=1)

     client = DeepEvalClient()

+    # --- delete mode ---
+    if delete:
+        result = client.update_evaluation_testcase_tag(
+            testcase_id=run_id,
+            tag=delete,
+            raise_exception=True,
+        )
+        typer.echo("🗑️ Tag deleted successfully")
+        typer.echo(f"Tag: {delete}")
+        return
+
+    # --- assign/update mode ---
     result = client.update_evaluation_testcase_tag(
         testcase_id=run_id,
-        tag=
+        tag=tag,
         raise_exception=True,
     )

-
-    typer.echo(f"✅ Tag {action} successfully")
+    typer.echo("✅ Tag assigned successfully")
     typer.echo(f"Run ID: {run_id}")
-    typer.echo(f"Tag: {result.get('tag')
+    typer.echo(f"Tag: {result.get('tag')}")
+

 def main() -> None:
     app()
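
The diff above adds a compare command driven by repeatable --run/--tag options, a tag command for assigning or deleting tags, and a metrics list scanner. A minimal sketch of how these commands might be exercised from Python via Typer's test runner follows; the run IDs, tag names, and the ./eval directory are placeholders, and a configured base URL plus EVALFRAMEWORK_API_KEY are assumed.

# Hedged usage sketch for the new CLI commands (placeholder IDs, tags, and paths).
from typer.testing import CliRunner

from rakam_eval_sdk.cli import app

runner = CliRunner()

# Compare run 42 against the testcase tagged "baseline", printing only the summary block.
result = runner.invoke(app, ["compare", "--run", "42", "--tag", "baseline", "--summary"])
print(result.output)

# Assign a tag to a run, then delete that tag again.
runner.invoke(app, ["tag", "--run", "42", "--tag", "nightly"])
runner.invoke(app, ["tag", "--delete", "nightly"])

# List metric types used by @eval_run-decorated configs under ./eval (the directory must exist).
runner.invoke(app, ["metrics", "list", "./eval", "--recursive"])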
rakam_eval_sdk/client.py
CHANGED
@@ -1,7 +1,6 @@
-from typing import Optional, Dict
 import os
 import random
-from typing import Any, Dict, List, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

 import requests

@@ -13,7 +12,6 @@ from .schema import (
     SchemaMetricConfig,
     TextInputItem,
 )
-from typing import Optional, Literal, cast

 HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]

@@ -42,8 +40,7 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
-                "EVALFRAMEWORK_API_KEY", "")
+            api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout

@@ -52,10 +49,10 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json:
-        params:
+        json: Dict | None = None,
+        params: Dict | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"

         headers = {
@@ -92,14 +89,17 @@ class DeepEvalClient:
                 "raw": resp.text,
             }

-    def _get(self, endpoint: str, params:
-        return self._request("GET", endpoint, params=params, **kw)
+    def _get(self, endpoint: str, params: Dict, *args, **kw):
+        return self._request("GET", endpoint, params=params, *args, **kw)
+
+    def _post(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("POST", endpoint, json=payload, *args, **kw)

-    def
-        return self._request("
+    def _patch(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("PATCH", endpoint, json=payload, *args, **kw)

-    def
-        return self._request("
+    def _delete(self, endpoint: str, payload: Dict, *args, **kw):
+        return self._request("DELETE", endpoint, json=payload, *args, **kw)

     def update_evaluation_testcase_tag(
         self,
@@ -114,8 +114,14 @@ class DeepEvalClient:
         - tag="smoke" → add / update tag
         - tag=None → remove tag
         """
+        if testcase_id is None:
+            return self._delete(
+                endpoint=f"/deepeval/tag/{tag}",
+                payload={},
+                raise_exception=raise_exception,
+            )
         return self._patch(
-            f"/
+            endpoint=f"/deepeval/{testcase_id}/tag",
             payload={"tag": tag},
             raise_exception=raise_exception,
         )
@@ -126,13 +132,14 @@ class DeepEvalClient:
         limit: int = 10,
         offset: int = 0,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         List evaluation testcases for the current API token only.
         Sorted by created_at DESC (newest first).
+        Returns items + pagination metadata.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/token",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/token",
             params={
                 "limit": limit,
                 "offset": offset,
@@ -150,7 +157,7 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by numeric ID.
         """
         return self._get(
-            f"/eval-framework/deepeval/id/{testcase_id}",
+            endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
             params={},
             raise_exception=raise_exception,
         )
@@ -165,7 +172,7 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by tag.
         """
         return self._get(
-            f"/eval-framework/deepeval/tag/{tag}",
+            endpoint=f"/eval-framework/deepeval/tag/{tag}",
             params={},
             raise_exception=raise_exception,
         )
@@ -190,19 +197,39 @@ class DeepEvalClient:
     def compare_testcases(
         self,
         *,
-        testcase_a_id: int,
-
+        testcase_a_id: int | None = None,
+        testcase_a_tag: str | None = None,
+        testcase_b_id: int | None = None,
+        testcase_b_tag: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
-        Compare two evaluation testcases.
+        Compare two evaluation testcases using IDs or tags.
+        Exactly one identifier (id or tag) must be provided per testcase.
         """
+
+        def validate(id_, tag, name: str):
+            if bool(id_) == bool(tag):
+                raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
+
+        validate(testcase_a_id, testcase_a_tag, "testcase_a")
+        validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+        params: dict[str, int | str] = {}
+
+        if testcase_a_id is not None:
+            params["testcase_a_id"] = testcase_a_id
+        else:
+            params["testcase_a_tag"] = testcase_a_tag  # type: ignore
+
+        if testcase_b_id is not None:
+            params["testcase_b_id"] = testcase_b_id
+        else:
+            params["testcase_b_tag"] = testcase_b_tag  # type: ignore
+
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare",
-            params=
-                "testcase_a_id": testcase_a_id,
-                "testcase_b_id": testcase_b_id,
-            },
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+            params=params,
             raise_exception=raise_exception,
         )

@@ -212,12 +239,12 @@ class DeepEvalClient:
         label_a: str,
         label_b: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the latest evaluation testcases for two labels.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
             params={
                 "label_a": label_a,
                 "label_b": label_b,
@@ -230,12 +257,12 @@ class DeepEvalClient:
         *,
         label: str,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """
         Compare the last two evaluation testcases for a given label.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
             params={
                 "label": label,
             },
@@ -248,7 +275,7 @@ class DeepEvalClient:
         config: EvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     @overload
     def text_eval(
@@ -259,7 +286,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     def text_eval(
         self,
@@ -270,7 +297,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             config = EvalConfig(
                 data=data,
@@ -280,7 +307,9 @@ class DeepEvalClient:
             )

         return self._post(
-            "/deepeval/text-eval",
+            endpoint="/deepeval/text-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def text_eval_background(
@@ -290,12 +319,16 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._post(
+        return self._post(
+            endpoint="/deepeval/text-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
+        )

     @overload
     def schema_eval(
@@ -306,7 +339,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     @overload
     def schema_eval(
@@ -314,7 +347,7 @@ class DeepEvalClient:
         config: SchemaEvalConfig,
         *,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]: ...

     def schema_eval(
         self,
@@ -325,7 +358,7 @@ class DeepEvalClient:
         component: str = "unknown",
         label: str | None = None,
         raise_exception: bool = False,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         if config is None:
             if data is None or metrics is None:
                 raise ValueError(
@@ -340,9 +373,9 @@ class DeepEvalClient:
             )

         return self._post(
-            "/deepeval/schema-eval",
-            config.model_dump(),
-            raise_exception,
+            endpoint="/deepeval/schema-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def schema_eval_background(
@@ -352,13 +385,15 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._post(
-            "/deepeval/schema-eval/background",
+            endpoint="/deepeval/schema-eval/background",
+            payload=payload,
+            raise_exception=raise_exception,
         )

     def maybe_text_eval(
@@ -369,7 +404,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -392,7 +427,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -411,7 +446,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
@@ -434,7 +469,7 @@ class DeepEvalClient:
         raise_exception: bool = False,
         component: str = "unknown",
         label: Union[str, None] = None,
-    ) -> Optional[
+    ) -> Optional[Dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
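
For reference, a short sketch of the updated DeepEvalClient surface follows, using the keyword arguments shown in the diff above; the numeric IDs and tag names are invented, and a configured base URL plus EVALFRAMEWORK_API_KEY are assumed.

# Hedged sketch of the 0.2.4 client API; the IDs and tags below are placeholders.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# list_evaluation_testcases now returns items plus pagination metadata.
page = client.list_evaluation_testcases(limit=5, offset=0, raise_exception=True)
print(page["total"], [run["id"] for run in page["items"]])

# compare_testcases accepts IDs and/or tags; exactly one identifier per testcase.
diff = client.compare_testcases(
    testcase_a_id=42,
    testcase_b_tag="baseline",
    raise_exception=True,
)

# Tag a run; per the new code path, passing testcase_id=None deletes the tag instead.
client.update_evaluation_testcase_tag(testcase_id=42, tag="nightly", raise_exception=True)
client.update_evaluation_testcase_tag(testcase_id=None, tag="nightly", raise_exception=True)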
rakam_eval_sdk/schema.py
CHANGED
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+class MetricDiff(BaseModel):
+    metric: str
+    score_a: Optional[float]
+    score_b: Optional[float]
+    delta: Optional[float]
+
+    success_a: Optional[bool]
+    success_b: Optional[bool]
+
+    threshold_a: Optional[float]
+    threshold_b: Optional[float]
+
+    status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+class TestCaseComparison(BaseModel):
+    testcase_a_id: int
+    testcase_b_id: int
+    metrics: List[MetricDiff]
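
A small illustration of the new comparison models is given below; every ID, score, threshold, and metric name is fabricated, and the pretty printer import relies on the cli.py helpers added in this release.

# Hedged example of the new schema models; all values are made up.
from rakam_eval_sdk.cli import pretty_print_comparison
from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

comparison = TestCaseComparison(
    testcase_a_id=41,
    testcase_b_id=42,
    metrics=[
        MetricDiff(
            metric="answer_relevancy",
            score_a=0.71, score_b=0.78, delta=0.07,
            success_a=True, success_b=True,
            threshold_a=0.7, threshold_b=0.7,
            status="changed",
        ),
        MetricDiff(
            metric="faithfulness",
            score_a=None, score_b=0.9, delta=None,
            success_a=None, success_b=True,
            threshold_a=None, threshold_b=0.8,
            status="added",
        ),
    ],
)

# Render the same summary block the `compare --summary` command prints.
pretty_print_comparison(comparison, summary_only=True)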
rakam_eval_sdk-0.2.4.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=IvC45LyALCCoxZ7ZNifVCbPs6UveGtUCfjr1q9e1A_Y,21658
+rakam_eval_sdk/client.py,sha256=7spK7P3h5uiA3Q3erOHOwf04N1_RT1pysWyt_E-cz0Q,14666
+rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+rakam_eval_sdk/schema.py,sha256=YfZgHtPh41xFYdQTsbS0OSvNdI8yIv5zoh76v21hemo,3971
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk-0.2.4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.4.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.4.dist-info/METADATA,sha256=XOTmtgO7NiG8NuLn4B_yJujUJONSJCvwGQ9ak3zIhv4,6019
+rakam_eval_sdk-0.2.4.dist-info/RECORD,,
rakam_eval_sdk-0.2.2.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=LQBaTJGpXGRMNPkkqw_NzK4d1dbF8p4A4_8SN_XALgI,15500
-rakam_eval_sdk/client.py,sha256=e15d7ycm8G7oJuX_Ie9cB39TK33taMjFvVwc_4kLMww,13255
-rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
-rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
-rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
-rakam_eval_sdk-0.2.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.2.2.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.2.2.dist-info/METADATA,sha256=2KBfQDA4cIq634ZFZ0z--FtOtty_PG_T8Pgh8a9CLJk,6019
-rakam_eval_sdk-0.2.2.dist-info/RECORD,,
{rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/WHEEL
File without changes
{rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/entry_points.txt
File without changes