rakam-eval-sdk 0.2.3__py3-none-any.whl → 0.2.4rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rakam_eval_sdk/cli.py CHANGED
@@ -6,16 +6,18 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional, Sequence
+ from typing import Any, Dict, List, Optional, Tuple, Union

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
+ from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
  find_decorated_functions,
  load_module_from_path,
@@ -35,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
  app.add_typer(metrics_app, name="metrics")


- def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+ def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
  """
  Returns [(type, name)] from EvalConfig / SchemaEvalConfig
  """
  if not hasattr(config, "metrics"):
  return []

- results: list[tuple[str, str | None]] = []
+ results: list[tuple[str, Optional[str]]] = []

  for metric in config.metrics or []:
  metric_type = getattr(metric, "type", None)
@@ -75,7 +77,7 @@ def metrics(
  files = directory.rglob("*.py") if recursive else directory.glob("*.py")
  TARGET_DECORATOR = eval_run.__name__

- all_metrics: set[tuple[str, str | None]] = set()
+ all_metrics: set[tuple[str, Optional[str]]] = set()
  found_any = False

  for file in sorted(files):
@@ -157,25 +159,28 @@ def list_evals(
  def list_runs(
  limit: int = typer.Option(20, help="Max number of runs"),
  offset: int = typer.Option(0, help="Pagination offset"),
- ):
+ ) -> None:
  """
  List evaluation runs (newest first).
  """
  client = DeepEvalClient()

- runs = client.list_evaluation_testcases(
+ response = client.list_evaluation_testcases(
  limit=limit,
  offset=offset,
  raise_exception=True,
  )
+ assert response is not None
+ items = response.get("items", [])
+ total = response.get("total", 0)

- if not runs:
+ if not items:
  typer.echo("No evaluation runs found.")
  return

- typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
- # pretty CLI output
- for run in runs:
+ typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+ for run in items:
  run_id = run.get("id")
  label = run.get("label") or "-"
  uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
  except ValueError:
  pass

- typer.echo(
- f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
+ typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+ shown = offset + len(items)
+ if shown < total:
+ typer.echo()
+ typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


  @app.command()
@@ -210,15 +219,15 @@ def show(
  "--raw",
  help="Print raw JSON instead of formatted output",
  ),
- ):
+ ) -> None:
  """
  Show a single evaluation testcase by ID or tag.
  """
  if not id and not tag:
- raise typer.BadParameter("You must provide either --id or --uid")
+ raise typer.BadParameter("You must provide either --id or --tag")

  if id and tag:
- raise typer.BadParameter("Provide only one of --id or --uid")
+ raise typer.BadParameter("Provide only one of --id or --tag")

  client = DeepEvalClient()

@@ -226,6 +235,7 @@ def show(
  result = client.get_evaluation_testcase_by_id(id)
  identifier = f"id={id}"
  else:
+ assert tag is not None
  result = client.get_evaluation_testcase_by_tag(tag)
  identifier = f"tag={tag}"

@@ -397,7 +407,7 @@ def run(
  def _print_and_save(
  resp: dict,
  pretty: bool,
- out: Path | None,
+ out: Optional[Path],
  overwrite: bool,
  ) -> None:
  if pretty:
@@ -410,8 +420,7 @@ def _print_and_save(
  return

  if out.exists() and not overwrite:
- typer.echo(
- f"❌ File already exists: {out} (use --overwrite to replace)")
+ typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
  raise typer.Exit(code=1)

  out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +431,109 @@ def _print_and_save(
  typer.echo(f"💾 Result saved to {out}")


+ def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+ if a is None or b is None or a == 0:
+ return None
+ return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: Optional[float]) -> str:
+ if delta is None:
+ return "unchanged"
+ if delta > 0:
+ return "improved"
+ if delta < 0:
+ return "regressed"
+ return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff) -> None:
+ secho(f"\nMetric: {diff.metric}", bold=True)
+
+ if diff.status == "added":
+ secho(f"+ score: {diff.score_b}", fg="green")
+ secho(f"+ threshold: {diff.threshold_b}", fg="green")
+ secho(f"+ success: {diff.success_b}", fg="green")
+ return
+
+ if diff.status == "removed":
+ secho(f"- score: {diff.score_a}", fg="red")
+ secho(f"- threshold: {diff.threshold_a}", fg="red")
+ secho(f"- success: {diff.success_a}", fg="red")
+ return
+
+ # unchanged / changed
+ if diff.score_a != diff.score_b:
+ direction = metric_direction(diff.delta)
+ color = "green" if direction == "improved" else "red"
+ pct = pct_change(diff.score_a, diff.score_b)
+
+ secho(f"- score: {diff.score_a}", fg="red")
+ secho(
+ f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+ fg=color,
+ )
+ else:
+ secho(f" score: {diff.score_a}", dim=True)
+
+ if diff.threshold_a != diff.threshold_b:
+ secho(f"- threshold: {diff.threshold_a}", fg="red")
+ secho(f"+ threshold: {diff.threshold_b}", fg="green")
+ else:
+ secho(f" threshold: {diff.threshold_a}", dim=True)
+
+ if diff.success_a != diff.success_b:
+ secho(f"- success: {diff.success_a}", fg="red")
+ secho(f"+ success: {diff.success_b}", fg="green")
+ else:
+ secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: Any) -> Dict[str, int]:
+ summary = {
+ "improved": 0,
+ "regressed": 0,
+ "unchanged": 0,
+ "added": 0,
+ "removed": 0,
+ }
+
+ for m in metrics:
+ if m.status in ("added", "removed"):
+ summary[m.status] += 1
+ else:
+ direction = metric_direction(m.delta)
+ summary[direction] += 1
+
+ return summary
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+ if not summary_only:
+ for metric in resp.metrics:
+ print_metric_diff(metric)
+ return
+
+ print_summary(resp.metrics)
+
+
+ def print_summary(metrics: Any) -> None:
+ summary = summarize(metrics)
+
+ secho("\nSummary:", bold=True)
+ secho(f" ↑ Improved: {summary['improved']}", fg="green")
+ secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+ secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+ secho(f" + Added: {summary['added']}", fg="green")
+ secho(f" - Removed: {summary['removed']}", fg="red")
+
+
  @app.command()
  def compare(
  tag: list[str] = typer.Option(
  [],
  "--tag",
- help="Tag identifying a reference testcase",
+ help="Label identifying a reference testcase",
  ),
  run: list[int] = typer.Option(
  [],
@@ -437,14 +543,19 @@ def compare(
  pretty: bool = typer.Option(
  True,
  "--pretty/--raw",
- help="Pretty-print the response",
+ help="Pretty diff output (default) or raw JSON",
+ ),
+ summary: bool = typer.Option(
+ False,
+ "--summary",
+ help="Show summary only (no per-metric diff)",
  ),
  raise_exception: bool = typer.Option(
  False,
  "--raise",
  help="Raise HTTP exceptions instead of swallowing them",
  ),
- out: Path | None = typer.Option(
+ out: Optional[Path] = typer.Option(
  None,
  "-o",
  "--out",
@@ -457,32 +568,40 @@ def compare(
  ),
  ) -> None:
  """
- Compare two evaluation testcases using runs and/or labels.
+ Compare two evaluation testcases (runs and/or labels).
+
+ Output:
+ - Unified diff-style view by default
+ - Raw JSON with --raw
+ - Summary of improvements / regressions
  """

- targets = []
+ targets: List[Tuple[str, Union[str, int]]] = []

  for r in run:
  targets.append(("run", r))

- for l in tag:
- targets.append(("label", l))
+ for t in tag:
+ targets.append(("label", t))

  if len(targets) != 2:
- typer.echo(
- "❌ Provide exactly two targets using --run and/or --label"
+ typer.secho(
+ "❌ Provide exactly two targets using --run and/or --tag",
+ fg="red",
+ bold=True,
  )
  raise typer.Exit(code=1)

- client = DeepEvalClient()
-
  (type_a, value_a), (type_b, value_b) = targets

- typer.echo(
- f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+ typer.secho(
+ f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+ bold=True,
  )

- kwargs = {"raise_exception": raise_exception}
+ client = DeepEvalClient()
+
+ kwargs: Dict[str, Any] = {"raise_exception": raise_exception}

  if type_a == "run":
  kwargs["testcase_a_id"] = value_a
@@ -497,14 +616,34 @@ def compare(
  try:
  resp = client.compare_testcases(**kwargs)
  except Exception as e:
- typer.echo(f"❌ Request failed: {e}")
+ typer.secho(f"❌ Request failed: {e}", fg="red")
  raise typer.Exit(code=1)

  if not resp:
- typer.echo("⚠️ No response received")
+ typer.secho("⚠️ No response received", fg="yellow")
  raise typer.Exit(code=1)

- _print_and_save(resp, pretty, out, overwrite)
+ if out:
+ if out.exists() and not overwrite:
+ typer.secho(
+ f"❌ File already exists: {out} (use --overwrite)",
+ fg="red",
+ )
+ raise typer.Exit(code=1)
+
+ out.write_text(json.dumps(resp, indent=2))
+ typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+ if not pretty:
+ typer.echo(json.dumps(resp, indent=2))
+ return
+
+ comparison = TestCaseComparison(**resp)
+
+ pretty_print_comparison(
+ comparison,
+ summary_only=summary,
+ )


  @app.command(hidden=True)
@@ -527,7 +666,7 @@ def compare_label_latest(
  "--raise",
  help="Raise HTTP exceptions instead of swallowing them",
  ),
- out: Path | None = typer.Option(
+ out: Optional[Path] = typer.Option(
  None,
  "-o",
  "--out",
@@ -579,7 +718,7 @@ def compare_last(
  "--raise",
  help="Raise HTTP exceptions instead of swallowing them",
  ),
- out: Path | None = typer.Option(
+ out: Optional[Path] = typer.Option(
  None,
  "-o",
  "--out",
@@ -631,7 +770,7 @@ def tag_command(
  "--delete",
  help="Delete a tag",
  ),
- ):
+ ) -> None:
  """
  Assign a tag to a run or delete a tag.
  """
@@ -648,8 +787,9 @@ def tag_command(

  client = DeepEvalClient()

- # --- delete mode ---
  if delete:
+ assert run_id is not None
+
  result = client.update_evaluation_testcase_tag(
  testcase_id=run_id,
  tag=delete,
@@ -658,13 +798,14 @@ def tag_command(
  typer.echo("🗑️ Tag deleted successfully")
  typer.echo(f"Tag: {delete}")
  return
-
- # --- assign/update mode ---
+ assert run_id is not None
+ assert tag is not None
  result = client.update_evaluation_testcase_tag(
  testcase_id=run_id,
  tag=tag,
  raise_exception=True,
  )
+ assert result is not None

  typer.echo("✅ Tag assigned successfully")
  typer.echo(f"Run ID: {run_id}")
rakam_eval_sdk/client.py CHANGED
@@ -1,6 +1,6 @@
  import os
  import random
- from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload

  import requests

@@ -40,8 +40,7 @@ class DeepEvalClient:
  )
  self.base_url = raw_url.rstrip("/")
  self.api_token = (
- api_token or settings_token or os.getenv(
- "EVALFRAMEWORK_API_KEY", "")
+ api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
  )
  self.timeout = timeout

@@ -50,10 +49,10 @@ class DeepEvalClient:
  method: HTTPMethod,
  endpoint: str,
  *,
- json: dict | None = None,
- params: dict | None = None,
+ json: Optional[Dict] = None,
+ params: Optional[Dict] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  url = f"{self.base_url}{endpoint}"

  headers = {
@@ -90,17 +89,63 @@ class DeepEvalClient:
  "raw": resp.text,
  }

- def _get(self, endpoint: str, params: dict, *args, **kw):
- return self._request("GET", endpoint, params=params, *args, **kw)
+ def _get(
+ self,
+ endpoint: str,
+ params: Dict,
+ raise_exception: bool = False,
+ *args: Tuple,
+ **kw: Dict,
+ ) -> Optional[Dict]:
+ return self._request(
+ "GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
+ )

- def _post(self, endpoint: str, payload: dict, *args, **kw):
- return self._request("POST", endpoint, json=payload, *args, **kw)
+ def _post(
+ self,
+ endpoint: str,
+ payload: Dict,
+ raise_exception: bool = False,
+ *args: Tuple,
+ **kw: Dict,
+ ) -> Optional[Dict]:
+ return self._request(
+ "POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
+ )

- def _patch(self, endpoint: str, payload: dict, *args, **kw):
- return self._request("PATCH", endpoint, json=payload, *args, **kw)
+ def _patch(
+ self,
+ endpoint: str,
+ payload: Dict,
+ raise_exception: bool = False,
+ *args: Tuple,
+ **kw: Dict,
+ ) -> Optional[Dict]:
+ return self._request(
+ "PATCH",
+ endpoint,
+ json=payload,
+ raise_exception=raise_exception,
+ *args,
+ **kw,
+ )

- def _delete(self, endpoint: str, payload: dict, *args, **kw):
- return self._request("DELETE", endpoint, json=payload, *args, **kw)
+ def _delete(
+ self,
+ endpoint: str,
+ payload: Dict,
+ raise_exception: bool = False,
+ *args: Tuple,
+ **kw: Dict,
+ ) -> Optional[Dict]:
+ return self._request(
+ "DELETE",
+ endpoint,
+ json=payload,
+ raise_exception=raise_exception,
+ *args,
+ **kw,
+ )

  def update_evaluation_testcase_tag(
  self,
@@ -133,10 +178,11 @@ class DeepEvalClient:
  limit: int = 10,
  offset: int = 0,
  raise_exception: bool = False,
- ) -> Optional[List[Dict]]:
+ ) -> Optional[Dict]:
  """
  List evaluation testcases for the current API token only.
  Sorted by created_at DESC (newest first).
+ Returns items + pagination metadata.
  """
  return self._get(
  endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -197,27 +243,25 @@ class DeepEvalClient:
  def compare_testcases(
  self,
  *,
- testcase_a_id: int | None = None,
- testcase_a_tag: str | None = None,
- testcase_b_id: int | None = None,
- testcase_b_tag: str | None = None,
+ testcase_a_id: Optional[int] = None,
+ testcase_a_tag: Optional[str] = None,
+ testcase_b_id: Optional[int] = None,
+ testcase_b_tag: Optional[str] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """
  Compare two evaluation testcases using IDs or tags.
  Exactly one identifier (id or tag) must be provided per testcase.
  """

- def validate(id_, tag, name: str):
+ def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
  if bool(id_) == bool(tag):
- raise ValueError(
- f"Provide exactly one of {name}_id or {name}_tag"
- )
+ raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")

  validate(testcase_a_id, testcase_a_tag, "testcase_a")
  validate(testcase_b_id, testcase_b_tag, "testcase_b")

- params: dict[str, int | str] = {}
+ params: Dict[str, Union[int, str]] = {}

  if testcase_a_id is not None:
  params["testcase_a_id"] = testcase_a_id
@@ -241,7 +285,7 @@ class DeepEvalClient:
  label_a: str,
  label_b: str,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """
  Compare the latest evaluation testcases for two labels.
  """
@@ -259,7 +303,7 @@ class DeepEvalClient:
  *,
  label: str,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """
  Compare the last two evaluation testcases for a given label.
  """
@@ -277,7 +321,7 @@ class DeepEvalClient:
  config: EvalConfig,
  *,
  raise_exception: bool = False,
- ) -> Optional[dict]: ...
+ ) -> Optional[Dict]: ...

  @overload
  def text_eval(
@@ -286,20 +330,20 @@ class DeepEvalClient:
  data: List[TextInputItem],
  metrics: List[MetricConfig],
  component: str = "unknown",
- label: str | None = None,
+ label: Optional[str] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]: ...
+ ) -> Optional[Dict]: ...

  def text_eval(
  self,
- config: EvalConfig | None = None,
+ config: Optional[EvalConfig] = None,
  *,
- data: List[TextInputItem] | None = None,
- metrics: List[MetricConfig] | None = None,
+ data: Optional[List[TextInputItem]] = None,
+ metrics: Optional[List[MetricConfig]] = None,
  component: str = "unknown",
- label: str | None = None,
+ label: Optional[str] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  if config is None:
  config = EvalConfig(
  data=data,
@@ -309,7 +353,9 @@ class DeepEvalClient:
  )

  return self._post(
- endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
+ endpoint="/deepeval/text-eval",
+ payload=config.model_dump(),
+ raise_exception=raise_exception,
  )

  def text_eval_background(
@@ -319,12 +365,16 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Run background text evaluation (async job)."""
  payload = EvalConfig.model_construct(
  data=data, metrics=metrics, component=component, version=label
  ).model_dump()
- return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)
+ return self._post(
+ endpoint="/deepeval/text-eval/background",
+ payload=payload,
+ raise_exception=raise_exception,
+ )

  @overload
  def schema_eval(
@@ -333,9 +383,9 @@ class DeepEvalClient:
  data: List[SchemaInputItem],
  metrics: List[SchemaMetricConfig],
  component: str = "unknown",
- label: str | None = None,
+ label: Optional[str] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]: ...
+ ) -> Optional[Dict]: ...

  @overload
  def schema_eval(
@@ -343,18 +393,18 @@ class DeepEvalClient:
  config: SchemaEvalConfig,
  *,
  raise_exception: bool = False,
- ) -> Optional[dict]: ...
+ ) -> Optional[Dict]: ...

  def schema_eval(
  self,
- config: SchemaEvalConfig | None = None,
+ config: Optional[SchemaEvalConfig] = None,
  *,
- data: List[SchemaInputItem] | None = None,
- metrics: List[SchemaMetricConfig] | None = None,
+ data: Optional[List[SchemaInputItem]] = None,
+ metrics: Optional[List[SchemaMetricConfig]] = None,
  component: str = "unknown",
- label: str | None = None,
+ label: Optional[str] = None,
  raise_exception: bool = False,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  if config is None:
  if data is None or metrics is None:
  raise ValueError(
@@ -381,13 +431,15 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Run background schema evaluation (async job)."""
  payload = SchemaEvalConfig.model_construct(
  data=data, metrics=metrics, component=component, version=label
  ).model_dump()
  return self._post(
- endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
+ endpoint="/deepeval/schema-eval/background",
+ payload=payload,
+ raise_exception=raise_exception,
  )

  def maybe_text_eval(
@@ -398,7 +450,7 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Randomly run text_eval based on a probability between 0 and 1."""
  self._validate_chance(chance)
  return (
@@ -421,7 +473,7 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Randomly run text_eval_background based on a probability between 0 and 1."""
  self._validate_chance(chance)
  return (
@@ -440,7 +492,7 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Randomly run schema_eval based on a probability between 0 and 1."""
  self._validate_chance(chance)
  return (
@@ -463,7 +515,7 @@ class DeepEvalClient:
  raise_exception: bool = False,
  component: str = "unknown",
  label: Union[str, None] = None,
- ) -> Optional[dict]:
+ ) -> Optional[Dict]:
  """Randomly run text_eval_background based on a probability between 0 and 1."""
  self._validate_chance(chance)
  return (
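As a rough illustration of the retyped client API above (not part of the diff): the identifiers below are placeholders, and the client is assumed to pick up its base URL and EVALFRAMEWORK_API_KEY from the environment.

```python
# Hypothetical call against the updated DeepEvalClient signatures.
# Exactly one of id/tag must be supplied per testcase; values are placeholders.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()  # assumes base URL / EVALFRAMEWORK_API_KEY come from env
resp = client.compare_testcases(
    testcase_a_tag="baseline",   # placeholder tag
    testcase_b_id=42,            # placeholder run id
    raise_exception=True,
)
# compare_testcases returns Optional[Dict]; the CLI's `compare` command
# validates this payload into schema.TestCaseComparison before rendering it.
```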
rakam_eval_sdk/schema.py CHANGED
@@ -1,6 +1,6 @@
  # Common base class for all metric configs
  import sys
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
+ from typing import Any, Dict, List, Literal, Optional, Union

  # Base class (you can keep this abstract)
  from pydantic import BaseModel, Field
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
  label: Union[str, None] = None
  data: List[SchemaInputItem]
  metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+ class MetricDiff(BaseModel):
+ metric: str
+ score_a: Optional[float]
+ score_b: Optional[float]
+ delta: Optional[float]
+
+ success_a: Optional[bool]
+ success_b: Optional[bool]
+
+ threshold_a: Optional[float]
+ threshold_b: Optional[float]
+
+ status: str # "unchanged" | "changed" | "added" | "removed"
+
+
+ class TestCaseComparison(BaseModel):
+ testcase_a_id: int
+ testcase_b_id: int
+ metrics: List[MetricDiff]
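For reference, a minimal sketch of the payload shape the new models accept (not part of the diff): field names come from the classes above, while every value is invented for illustration.

```python
# Hypothetical payload validated the same way the CLI does (TestCaseComparison(**resp)).
from rakam_eval_sdk.schema import TestCaseComparison

payload = {
    "testcase_a_id": 10,
    "testcase_b_id": 11,
    "metrics": [
        {
            "metric": "example_metric",   # placeholder metric name
            "score_a": 0.64, "score_b": 0.58, "delta": -0.06,
            "success_a": True, "success_b": False,
            "threshold_a": 0.6, "threshold_b": 0.6,
            "status": "changed",          # "unchanged" | "changed" | "added" | "removed"
        },
    ],
}

comparison = TestCaseComparison(**payload)
print(comparison.metrics[0].delta)  # -0.06
```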
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.3
+ Version: 0.2.4rc1
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -0,0 +1,10 @@
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ rakam_eval_sdk/cli.py,sha256=HjymI0UTKRgJb-ewDuINuTtH605MkcB9nD0qNlbdFyc,21905
+ rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+ rakam_eval_sdk/schema.py,sha256=DT-uQsE3XB7AAii68QACwIslLqXRQFb1c6efkqLQysI,3960
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+ rakam_eval_sdk-0.2.4rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ rakam_eval_sdk-0.2.4rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+ rakam_eval_sdk-0.2.4rc1.dist-info/METADATA,sha256=Sy2Ghp5wcrmS7AzBkhp8C8LVb0dTS7tSNtgEAJCfbU8,6022
+ rakam_eval_sdk-0.2.4rc1.dist-info/RECORD,,
@@ -1,10 +0,0 @@
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- rakam_eval_sdk/cli.py,sha256=gV4FsAde1WB5rRPLW2mRqTO4X0SZhvepcWH10KE5IIY,17642
- rakam_eval_sdk/client.py,sha256=sEXogMqtXgKGt_1Xuwg-S2Q2zRC6Y0ZVG_Wy1djbi28,14585
- rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
- rakam_eval_sdk/schema.py,sha256=woXu5GT1wuP_d7G11idopwpGmk0nV1SrxGShgWk-RA8,3530
- rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
- rakam_eval_sdk-0.2.3.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
- rakam_eval_sdk-0.2.3.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
- rakam_eval_sdk-0.2.3.dist-info/METADATA,sha256=ybu9LsKuQw0gfhJGPW-wTO6LC4bJw7sDlVuUQJQRqZ8,6019
- rakam_eval_sdk-0.2.3.dist-info/RECORD,,