rakam-eval-sdk 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rakam_eval_sdk/cli.py CHANGED
@@ -6,16 +6,18 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional, Sequence
+ from typing import Any, Dict, Optional

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
+ from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
      find_decorated_functions,
      load_module_from_path,
@@ -35,55 +37,88 @@ metrics_app = typer.Typer(help="Metrics utilities")
  app.add_typer(metrics_app, name="metrics")


+ def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+     """
+     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+     """
+     if not hasattr(config, "metrics"):
+         return []
+
+     results: list[tuple[str, str | None]] = []
+
+     for metric in config.metrics or []:
+         metric_type = getattr(metric, "type", None)
+         metric_name = getattr(metric, "name", None)
+         if metric_type:
+             results.append((metric_type, metric_name))
+
+     return results
+
+
  @metrics_app.command("list")
- def list_metrics(
-     limit: int = typer.Option(
-         20,
-         "--limit",
-         help="Number of testcases to inspect for metrics",
+ def metrics(
+     directory: Path = typer.Argument(
+         Path("./eval"),
+         exists=True,
+         file_okay=False,
+         dir_okay=True,
+         help="Directory to scan (default: ./eval)",
      ),
- ):
+     recursive: bool = typer.Option(
+         False,
+         "-r",
+         "--recursive",
+         help="Recursively search for Python files",
+     ),
+ ) -> None:
      """
-     List unique metric names found in evaluation testcases.
+     List all metric types used by loaded eval configs.
      """
-     client = DeepEvalClient()
+     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+     TARGET_DECORATOR = eval_run.__name__

-     testcases = client.list_evaluation_testcases(
-         limit=limit,
-         offset=0,
-         raise_exception=True,
-     )
+     all_metrics: set[tuple[str, str | None]] = set()
+     found_any = False

-     if not testcases:
-         typer.echo("No evaluation testcases found.")
-         return
+     for file in sorted(files):
+         functions = find_decorated_functions(file, TARGET_DECORATOR)
+         if not functions:
+             continue

-     metric_names: set[str] = set()
+         typer.echo(f"\n📄 {file}")

-     def collect_metrics(entries: Sequence[Dict] | None):
-         if not entries:
-             return
-         for entry in entries:
-             for metric in entry.get("metrics", []) or []:
-                 name = metric.get("name")
-                 if name:
-                     metric_names.add(name)
+         try:
+             module = load_module_from_path(file)
+         except Exception as e:
+             typer.echo(f" ❌ Failed to import module: {e}")
+             continue

-     for tc in testcases:
+         for fn_name in functions:
+             try:
+                 func = getattr(module, fn_name)
+                 result = func()

-         collect_metrics(tc.get("result"))
+                 metrics = extract_metric_names(result)
+                 if not metrics:
+                     continue

-     if not metric_names:
-         typer.echo("No metrics found.")
-         return
+                 found_any = True
+                 for metric_type, metric_name in metrics:
+                     all_metrics.add((metric_type, metric_name))

-     typer.echo(
-         f"📊 Found {len(metric_names)} unique metrics "
-         f"(from latest {limit} testcases)\n"
-     )
+                     if metric_name:
+                         typer.echo(f" {metric_type} (alias: {metric_name})")
+                     else:
+                         typer.echo(f" • {metric_type}")
+
+             except Exception as e:
+                 typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+     if not found_any:
+         typer.echo("\nNo metrics found.")
+         raise typer.Exit(code=0)

-     for name in sorted(metric_names):
-         typer.echo(f"- {name}")
+     typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


  @list_app.command("evals")
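Worth noting for CLI users: `metrics list` no longer queries the API for recent testcases; it scans a local directory for `@eval_run`-decorated functions, calls them, and reads metric types off the returned config via `extract_metric_names`. A minimal sketch of what that helper yields, using stand-in objects (the metric type strings below are illustrative, not taken from the SDK):

    from types import SimpleNamespace

    # Stand-ins for an EvalConfig and its metric entries; only the
    # "metrics", "type", and "name" attributes matter to the helper.
    config = SimpleNamespace(
        metrics=[
            SimpleNamespace(type="answer_relevancy", name="relevancy"),
            SimpleNamespace(type="faithfulness", name=None),
        ]
    )

    extract_metric_names(config)
    # -> [("answer_relevancy", "relevancy"), ("faithfulness", None)]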
@@ -130,19 +165,22 @@ def list_runs(
      """
      client = DeepEvalClient()

-     runs = client.list_evaluation_testcases(
+     response = client.list_evaluation_testcases(
          limit=limit,
          offset=offset,
          raise_exception=True,
      )

-     if not runs:
+     items = response.get("items", [])
+     total = response.get("total", 0)
+
+     if not items:
          typer.echo("No evaluation runs found.")
          return

-     typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
-     # pretty CLI output
-     for run in runs:
+     typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+     for run in items:
          run_id = run.get("id")
          label = run.get("label") or "-"
          uid = run.get("tag") or "-"
@@ -156,8 +194,12 @@
          except ValueError:
              pass

-         typer.echo(
-             f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
+         typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+     shown = offset + len(items)
+     if shown < total:
+         typer.echo()
+         typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


  @app.command()
@@ -165,12 +207,12 @@ def show(
      id: Optional[int] = typer.Option(
          None,
          "--id",
-         help="Numeric evaluation testcase ID",
+         help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         help="Evaluation testcase tag",
+         help="Run tag",
      ),
      raw: bool = typer.Option(
          False,
@@ -377,8 +419,7 @@ def _print_and_save(
          return

      if out.exists() and not overwrite:
-         typer.echo(
-             f"❌ File already exists: {out} (use --overwrite to replace)")
+         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
          raise typer.Exit(code=1)

      out.parent.mkdir(parents=True, exist_ok=True)
@@ -389,20 +430,124 @@
      typer.echo(f"💾 Result saved to {out}")


+ def pct_change(a: float | None, b: float | None) -> str | None:
+     if a is None or b is None or a == 0:
+         return None
+     return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: float | None) -> str:
+     if delta is None:
+         return "unchanged"
+     if delta > 0:
+         return "improved"
+     if delta < 0:
+         return "regressed"
+     return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff):
+     secho(f"\nMetric: {diff.metric}", bold=True)
+
+     if diff.status == "added":
+         secho(f"+ score: {diff.score_b}", fg="green")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+         secho(f"+ success: {diff.success_b}", fg="green")
+         return
+
+     if diff.status == "removed":
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"- success: {diff.success_a}", fg="red")
+         return
+
+     # unchanged / changed
+     if diff.score_a != diff.score_b:
+         direction = metric_direction(diff.delta)
+         color = "green" if direction == "improved" else "red"
+         pct = pct_change(diff.score_a, diff.score_b)
+
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(
+             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+             fg=color,
+         )
+     else:
+         secho(f" score: {diff.score_a}", dim=True)
+
+     if diff.threshold_a != diff.threshold_b:
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+     else:
+         secho(f" threshold: {diff.threshold_a}", dim=True)
+
+     if diff.success_a != diff.success_b:
+         secho(f"- success: {diff.success_a}", fg="red")
+         secho(f"+ success: {diff.success_b}", fg="green")
+     else:
+         secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: Any) -> Dict[str, int]:
+     summary = {
+         "improved": 0,
+         "regressed": 0,
+         "unchanged": 0,
+         "added": 0,
+         "removed": 0,
+     }
+
+     for m in metrics:
+         if m.status in ("added", "removed"):
+             summary[m.status] += 1
+         else:
+             direction = metric_direction(m.delta)
+             summary[direction] += 1
+
+     return summary
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False):
+     if not summary_only:
+         for metric in resp.metrics:
+             print_metric_diff(metric)
+         return
+
+     print_summary(resp.metrics)
+
+
+ def print_summary(metrics: Any):
+     summary = summarize(metrics)
+
+     secho("\nSummary:", bold=True)
+     secho(f" ↑ Improved: {summary['improved']}", fg="green")
+     secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+     secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+     secho(f" + Added: {summary['added']}", fg="green")
+     secho(f" - Removed: {summary['removed']}", fg="red")
+
+
  @app.command()
  def compare(
-     testcase_a_id: int = typer.Argument(
-         ...,
-         help="ID of the first testcase",
+     tag: list[str] = typer.Option(
+         [],
+         "--tag",
+         help="Label identifying a reference testcase",
      ),
-     testcase_b_id: int = typer.Argument(
-         ...,
-         help="ID of the second testcase",
+     run: list[int] = typer.Option(
+         [],
+         "--run",
+         help="Run ID identifying an evaluation testcase",
      ),
      pretty: bool = typer.Option(
          True,
          "--pretty/--raw",
-         help="Pretty-print the response",
+         help="Pretty diff output (default) or raw JSON",
+     ),
+     summary: bool = typer.Option(
+         False,
+         "--summary",
+         help="Show summary only (no per-metric diff)",
      ),
      raise_exception: bool = typer.Option(
          False,
@@ -422,29 +567,85 @@ def compare(
      ),
  ) -> None:
      """
-     Compare two DeepEval evaluation testcases.
+     Compare two evaluation testcases (runs and/or labels).
+
+     Output:
+     - Unified diff-style view by default
+     - Raw JSON with --raw
+     - Summary of improvements / regressions
      """
+
+     targets: list[tuple[str, str | int]] = []
+
+     for r in run:
+         targets.append(("run", r))
+
+     for t in tag:
+         targets.append(("label", t))
+
+     if len(targets) != 2:
+         typer.secho(
+             "❌ Provide exactly two targets using --run and/or --tag",
+             fg="red",
+             bold=True,
+         )
+         raise typer.Exit(code=1)
+
+     (type_a, value_a), (type_b, value_b) = targets
+
+     typer.secho(
+         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+         bold=True,
+     )
+
      client = DeepEvalClient()

-     typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")
+     kwargs = {"raise_exception": raise_exception}
+
+     if type_a == "run":
+         kwargs["testcase_a_id"] = value_a
+     else:
+         kwargs["testcase_a_tag"] = value_a
+
+     if type_b == "run":
+         kwargs["testcase_b_id"] = value_b
+     else:
+         kwargs["testcase_b_tag"] = value_b

      try:
-         resp = client.compare_testcases(
-             testcase_a_id=testcase_a_id,
-             testcase_b_id=testcase_b_id,
-             raise_exception=raise_exception,
-         )
+         resp = client.compare_testcases(**kwargs)
      except Exception as e:
-         typer.echo(f"❌ Request failed: {e}")
+         typer.secho(f"❌ Request failed: {e}", fg="red")
          raise typer.Exit(code=1)

      if not resp:
-         typer.echo("⚠️ No response received")
+         typer.secho("⚠️ No response received", fg="yellow")
          raise typer.Exit(code=1)
-     _print_and_save(resp, pretty, out, overwrite)

+     if out:
+         if out.exists() and not overwrite:
+             typer.secho(
+                 f"❌ File already exists: {out} (use --overwrite)",
+                 fg="red",
+             )
+             raise typer.Exit(code=1)

- @app.command()
+         out.write_text(json.dumps(resp, indent=2))
+         typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+     if not pretty:
+         typer.echo(json.dumps(resp, indent=2))
+         return
+
+     comparison = TestCaseComparison(**resp)
+
+     pretty_print_comparison(
+         comparison,
+         summary_only=summary,
+     )
+
+
+ @app.command(hidden=True)
  def compare_label_latest(
      label_a: str = typer.Argument(
          ...,
@@ -500,7 +701,7 @@ def compare_label_latest(
      _print_and_save(resp, pretty, out, overwrite)


- @app.command()
+ @app.command(hidden=True)
  def compare_last(
      label: str = typer.Argument(
          ...,
@@ -551,44 +752,62 @@
      _print_and_save(resp, pretty, out, overwrite)


- @list_app.command("tag")
- def update_run_tag(
-     run_id: int = typer.Argument(..., help="Evaluation run ID"),
+ @app.command("tag")
+ def tag_command(
+     run_id: Optional[int] = typer.Option(
+         None,
+         "--run",
+         help="Evaluation run ID",
+     ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         "-t",
-         help="Tag to add or update",
+         help="Tag to assign to the run",
      ),
-     remove: bool = typer.Option(
-         False,
-         "--remove",
-         help="Remove tag from the run",
+     delete: Optional[str] = typer.Option(
+         None,
+         "--delete",
+         help="Delete a tag",
      ),
  ):
      """
-     Add, update, or remove a tag from an evaluation run.
+     Assign a tag to a run or delete a tag.
      """
-     if not tag and not remove:
-         typer.echo("❌ You must provide --tag or --remove")
-         raise typer.Exit(code=1)

-     if tag and remove:
-         typer.echo("❌ Use either --tag or --remove, not both")
-         raise typer.Exit(code=1)
+     # --- validation ---
+     if delete:
+         if run_id or tag:
+             typer.echo("❌ --delete cannot be used with --run or --tag")
+             raise typer.Exit(code=1)
+     else:
+         if not run_id or not tag:
+             typer.echo("❌ Use --run and --tag together, or --delete")
+             raise typer.Exit(code=1)

      client = DeepEvalClient()

+     # --- delete mode ---
+     if delete:
+         result = client.update_evaluation_testcase_tag(
+             testcase_id=run_id,
+             tag=delete,
+             raise_exception=True,
+         )
+         typer.echo("🗑️ Tag deleted successfully")
+         typer.echo(f"Tag: {delete}")
+         return
+
+     # --- assign/update mode ---
      result = client.update_evaluation_testcase_tag(
          testcase_id=run_id,
-         tag=None if remove else tag,
+         tag=tag,
          raise_exception=True,
      )

-     action = "removed" if remove else "updated"
-     typer.echo(f"✅ Tag {action} successfully")
+     typer.echo(" Tag assigned successfully")
      typer.echo(f"Run ID: {run_id}")
-     typer.echo(f"Tag: {result.get('tag') or '-'}")
+     typer.echo(f"Tag: {result.get('tag')}")
+

  def main() -> None:
      app()
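The diff-rendering helpers added above (`pct_change`, `metric_direction`, `summarize`, `print_metric_diff`) are plain module-level functions, so they can be exercised directly. A hedged sketch with illustrative values, assuming the module-level names shown in this diff are importable as-is:

    from rakam_eval_sdk.cli import metric_direction, pct_change, summarize
    from rakam_eval_sdk.schema import MetricDiff

    # Illustrative diffs, not real evaluation output.
    diffs = [
        MetricDiff(metric="faithfulness", score_a=0.72, score_b=0.81, delta=0.09,
                   success_a=True, success_b=True, threshold_a=0.7, threshold_b=0.7,
                   status="changed"),
        MetricDiff(metric="toxicity", score_a=None, score_b=0.05, delta=None,
                   success_a=None, success_b=True, threshold_a=None, threshold_b=0.1,
                   status="added"),
    ]

    metric_direction(0.09)   # "improved"
    pct_change(0.72, 0.81)   # "+12.50%"
    summarize(diffs)         # {'improved': 1, 'regressed': 0, 'unchanged': 0, 'added': 1, 'removed': 0}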
rakam_eval_sdk/client.py CHANGED
@@ -1,7 +1,6 @@
- from typing import Optional, Dict
  import os
  import random
- from typing import Any, Dict, List, Optional, Union, cast, overload
+ from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

  import requests

@@ -13,7 +12,6 @@ from .schema import (
      SchemaMetricConfig,
      TextInputItem,
  )
- from typing import Optional, Literal, cast

  HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]

@@ -42,8 +40,7 @@ class DeepEvalClient:
          )
          self.base_url = raw_url.rstrip("/")
          self.api_token = (
-             api_token or settings_token or os.getenv(
-                 "EVALFRAMEWORK_API_KEY", "")
+             api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
          )
          self.timeout = timeout

@@ -52,10 +49,10 @@ class DeepEvalClient:
          method: HTTPMethod,
          endpoint: str,
          *,
-         json: dict | None = None,
-         params: dict | None = None,
+         json: Dict | None = None,
+         params: Dict | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          url = f"{self.base_url}{endpoint}"

          headers = {
@@ -92,14 +89,17 @@ class DeepEvalClient:
              "raw": resp.text,
          }

-     def _get(self, endpoint: str, params: dict, **kw):
-         return self._request("GET", endpoint, params=params, **kw)
+     def _get(self, endpoint: str, params: Dict, *args, **kw):
+         return self._request("GET", endpoint, params=params, *args, **kw)
+
+     def _post(self, endpoint: str, payload: Dict, *args, **kw):
+         return self._request("POST", endpoint, json=payload, *args, **kw)

-     def _post(self, endpoint: str, payload: dict, **kw):
-         return self._request("POST", endpoint, json=payload, **kw)
+     def _patch(self, endpoint: str, payload: Dict, *args, **kw):
+         return self._request("PATCH", endpoint, json=payload, *args, **kw)

-     def _patch(self, endpoint: str, payload: dict, **kw):
-         return self._request("PATCH", endpoint, json=payload, **kw)
+     def _delete(self, endpoint: str, payload: Dict, *args, **kw):
+         return self._request("DELETE", endpoint, json=payload, *args, **kw)

      def update_evaluation_testcase_tag(
          self,
@@ -114,8 +114,14 @@ class DeepEvalClient:
          - tag="smoke" → add / update tag
          - tag=None → remove tag
          """
+         if testcase_id is None:
+             return self._delete(
+                 endpoint=f"/deepeval/tag/{tag}",
+                 payload={},
+                 raise_exception=raise_exception,
+             )
          return self._patch(
-             f"/evaluation-testcases/{testcase_id}/tag",
+             endpoint=f"/deepeval/{testcase_id}/tag",
              payload={"tag": tag},
              raise_exception=raise_exception,
          )
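`update_evaluation_testcase_tag` now routes to two different endpoints: `PATCH /deepeval/{testcase_id}/tag` when a run ID is given, and `DELETE /deepeval/tag/{tag}` when `testcase_id` is omitted. A minimal sketch (the run ID and tag values are illustrative):

    from rakam_eval_sdk.client import DeepEvalClient

    client = DeepEvalClient()

    # Assign or update a tag on a run → PATCH /deepeval/42/tag
    client.update_evaluation_testcase_tag(testcase_id=42, tag="smoke")

    # Delete a tag by name → DELETE /deepeval/tag/smoke
    client.update_evaluation_testcase_tag(testcase_id=None, tag="smoke")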
@@ -126,13 +132,14 @@ class DeepEvalClient:
          limit: int = 10,
          offset: int = 0,
          raise_exception: bool = False,
-     ) -> Optional[List[Dict]]:
+     ) -> Optional[Dict]:
          """
          List evaluation testcases for the current API token only.
          Sorted by created_at DESC (newest first).
+         Returns items + pagination metadata.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/token",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
              params={
                  "limit": limit,
                  "offset": offset,
@@ -150,7 +157,7 @@
          Fetch a single evaluation testcase by numeric ID.
          """
          return self._get(
-             f"/eval-framework/deepeval/id/{testcase_id}",
+             endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
              params={},
              raise_exception=raise_exception,
          )
@@ -165,7 +172,7 @@
          Fetch a single evaluation testcase by tag.
          """
          return self._get(
-             f"/eval-framework/deepeval/tag/{tag}",
+             endpoint=f"/eval-framework/deepeval/tag/{tag}",
              params={},
              raise_exception=raise_exception,
          )
@@ -190,19 +197,39 @@
      def compare_testcases(
          self,
          *,
-         testcase_a_id: int,
-         testcase_b_id: int,
+         testcase_a_id: int | None = None,
+         testcase_a_tag: str | None = None,
+         testcase_b_id: int | None = None,
+         testcase_b_tag: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
-         Compare two evaluation testcases.
+         Compare two evaluation testcases using IDs or tags.
+         Exactly one identifier (id or tag) must be provided per testcase.
          """
+
+         def validate(id_, tag, name: str):
+             if bool(id_) == bool(tag):
+                 raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
+
+         validate(testcase_a_id, testcase_a_tag, "testcase_a")
+         validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+         params: dict[str, int | str] = {}
+
+         if testcase_a_id is not None:
+             params["testcase_a_id"] = testcase_a_id
+         else:
+             params["testcase_a_tag"] = testcase_a_tag # type: ignore
+
+         if testcase_b_id is not None:
+             params["testcase_b_id"] = testcase_b_id
+         else:
+             params["testcase_b_tag"] = testcase_b_tag # type: ignore
+
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare",
-             params={
-                 "testcase_a_id": testcase_a_id,
-                 "testcase_b_id": testcase_b_id,
-             },
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+             params=params,
              raise_exception=raise_exception,
          )
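`compare_testcases` now accepts an ID or a tag per side and validates that exactly one identifier is supplied for each. A minimal usage sketch (the ID and tag values are illustrative):

    from rakam_eval_sdk.client import DeepEvalClient

    client = DeepEvalClient()

    # Run 12 on the left, whichever run carries the tag "baseline" on the right.
    resp = client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")

    # Passing both an id and a tag for the same side (or neither) raises
    # ValueError before any request is made.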
@@ -212,12 +239,12 @@
          label_a: str,
          label_b: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the latest evaluation testcases for two labels.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
              params={
                  "label_a": label_a,
                  "label_b": label_b,
@@ -230,12 +257,12 @@
          *,
          label: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the last two evaluation testcases for a given label.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
              params={
                  "label": label,
              },
@@ -248,7 +275,7 @@
          config: EvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def text_eval(
@@ -259,7 +286,7 @@
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def text_eval(
          self,
@@ -270,7 +297,7 @@
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              config = EvalConfig(
                  data=data,
@@ -280,7 +307,9 @@
              )

          return self._post(
-             "/deepeval/text-eval", config.model_dump(), raise_exception
+             endpoint="/deepeval/text-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def text_eval_background(
@@ -290,12 +319,16 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background text evaluation (async job)."""
          payload = EvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
-         return self._post("/deepeval/text-eval/background", payload, raise_exception)
+         return self._post(
+             endpoint="/deepeval/text-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
+         )

      @overload
      def schema_eval(
@@ -306,7 +339,7 @@
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def schema_eval(
@@ -314,7 +347,7 @@
          config: SchemaEvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def schema_eval(
          self,
@@ -325,7 +358,7 @@
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              if data is None or metrics is None:
                  raise ValueError(
@@ -340,9 +373,9 @@
              )

          return self._post(
-             "/deepeval/schema-eval",
-             config.model_dump(),
-             raise_exception,
+             endpoint="/deepeval/schema-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def schema_eval_background(
@@ -352,13 +385,15 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background schema evaluation (async job)."""
          payload = SchemaEvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
          return self._post(
-             "/deepeval/schema-eval/background", payload, raise_exception
+             endpoint="/deepeval/schema-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
          )

      def maybe_text_eval(
@@ -369,7 +404,7 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -392,7 +427,7 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -411,7 +446,7 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run schema_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -434,7 +469,7 @@
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
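One behavioural change that is easy to miss in the hunks above: `list_evaluation_testcases` now returns a dict with `items` and `total` instead of a bare list, which is what the reworked `list runs` command reads in cli.py. A minimal sketch of handling the new shape:

    from rakam_eval_sdk.client import DeepEvalClient

    client = DeepEvalClient()

    # "items" and "total" are the keys the CLI itself consumes.
    resp = client.list_evaluation_testcases(limit=20, offset=0) or {}
    items = resp.get("items", [])
    total = resp.get("total", 0)
    print(f"showing {len(items)} of {total} runs")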
rakam_eval_sdk/schema.py CHANGED
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
      label: Union[str, None] = None
      data: List[SchemaInputItem]
      metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+ class MetricDiff(BaseModel):
+     metric: str
+     score_a: Optional[float]
+     score_b: Optional[float]
+     delta: Optional[float]
+
+     success_a: Optional[bool]
+     success_b: Optional[bool]
+
+     threshold_a: Optional[float]
+     threshold_b: Optional[float]
+
+     status: str # "unchanged" | "changed" | "added" | "removed"
+
+
+ class TestCaseComparison(BaseModel):
+     testcase_a_id: int
+     testcase_b_id: int
+     metrics: List[MetricDiff]
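The CLI builds a `TestCaseComparison` straight from the compare endpoint's JSON, so the new models double as a schema for that response. A sketch with a hypothetical payload (field values are illustrative):

    from rakam_eval_sdk.schema import TestCaseComparison

    payload = {
        "testcase_a_id": 12,
        "testcase_b_id": 15,
        "metrics": [
            {
                "metric": "faithfulness",
                "score_a": 0.72, "score_b": 0.81, "delta": 0.09,
                "success_a": True, "success_b": True,
                "threshold_a": 0.7, "threshold_b": 0.7,
                "status": "changed",
            }
        ],
    }

    comparison = TestCaseComparison(**payload)
    comparison.metrics[0].delta   # 0.09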
{rakam_eval_sdk-0.2.2.dist-info → rakam_eval_sdk-0.2.4.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.2
+ Version: 0.2.4
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
rakam_eval_sdk-0.2.4.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ rakam_eval_sdk/cli.py,sha256=IvC45LyALCCoxZ7ZNifVCbPs6UveGtUCfjr1q9e1A_Y,21658
+ rakam_eval_sdk/client.py,sha256=7spK7P3h5uiA3Q3erOHOwf04N1_RT1pysWyt_E-cz0Q,14666
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+ rakam_eval_sdk/schema.py,sha256=YfZgHtPh41xFYdQTsbS0OSvNdI8yIv5zoh76v21hemo,3971
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+ rakam_eval_sdk-0.2.4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ rakam_eval_sdk-0.2.4.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+ rakam_eval_sdk-0.2.4.dist-info/METADATA,sha256=XOTmtgO7NiG8NuLn4B_yJujUJONSJCvwGQ9ak3zIhv4,6019
+ rakam_eval_sdk-0.2.4.dist-info/RECORD,,
rakam_eval_sdk-0.2.2.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- rakam_eval_sdk/cli.py,sha256=LQBaTJGpXGRMNPkkqw_NzK4d1dbF8p4A4_8SN_XALgI,15500
- rakam_eval_sdk/client.py,sha256=e15d7ycm8G7oJuX_Ie9cB39TK33taMjFvVwc_4kLMww,13255
- rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
- rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
- rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
- rakam_eval_sdk-0.2.2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
- rakam_eval_sdk-0.2.2.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
- rakam_eval_sdk-0.2.2.dist-info/METADATA,sha256=2KBfQDA4cIq634ZFZ0z--FtOtty_PG_T8Pgh8a9CLJk,6019
- rakam_eval_sdk-0.2.2.dist-info/RECORD,,