rakam-eval-sdk 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.3
+ Version: 0.2.4
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.3"
+ version = "0.2.4"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -6,16 +6,18 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional, Sequence
+ from typing import Any, Dict, Optional

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
+ from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
      find_decorated_functions,
      load_module_from_path,
@@ -163,19 +165,22 @@ def list_runs(
      """
      client = DeepEvalClient()

-     runs = client.list_evaluation_testcases(
+     response = client.list_evaluation_testcases(
          limit=limit,
          offset=offset,
          raise_exception=True,
      )

-     if not runs:
+     items = response.get("items", [])
+     total = response.get("total", 0)
+
+     if not items:
          typer.echo("No evaluation runs found.")
          return

-     typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
-     # pretty CLI output
-     for run in runs:
+     typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
+
+     for run in items:
          run_id = run.get("id")
          label = run.get("label") or "-"
          uid = run.get("tag") or "-"
@@ -189,8 +194,12 @@ def list_runs(
          except ValueError:
              pass

-         typer.echo(
-             f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
+         typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
+
+     shown = offset + len(items)
+     if shown < total:
+         typer.echo()
+         typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


  @app.command()
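
The new list_runs output above relies on a paginated response shape ({"items": [...], "total": N}). A minimal sketch of consuming that shape directly from DeepEvalClient, assuming the same item keys (id, tag, label, created_at) the CLI reads:

# Sketch only: mirrors how the reworked list_runs consumes the paginated response.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()
response = client.list_evaluation_testcases(limit=10, offset=0, raise_exception=True)

items = response.get("items", [])
total = response.get("total", 0)

for run in items:
    # Field names follow the CLI code above; anything else would be an assumption.
    print(run.get("id"), run.get("tag") or "-", run.get("label") or "-", run.get("created_at"))

print(f"Showing {len(items)} of {total} runs")
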
@@ -410,8 +419,7 @@ def _print_and_save(
          return

      if out.exists() and not overwrite:
-         typer.echo(
-             f"❌ File already exists: {out} (use --overwrite to replace)")
+         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
          raise typer.Exit(code=1)

      out.parent.mkdir(parents=True, exist_ok=True)
@@ -422,12 +430,109 @@ def _print_and_save(
      typer.echo(f"💾 Result saved to {out}")


+ def pct_change(a: float | None, b: float | None) -> str | None:
+     if a is None or b is None or a == 0:
+         return None
+     return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: float | None) -> str:
+     if delta is None:
+         return "unchanged"
+     if delta > 0:
+         return "improved"
+     if delta < 0:
+         return "regressed"
+     return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff):
+     secho(f"\nMetric: {diff.metric}", bold=True)
+
+     if diff.status == "added":
+         secho(f"+ score: {diff.score_b}", fg="green")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+         secho(f"+ success: {diff.success_b}", fg="green")
+         return
+
+     if diff.status == "removed":
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"- success: {diff.success_a}", fg="red")
+         return
+
+     # unchanged / changed
+     if diff.score_a != diff.score_b:
+         direction = metric_direction(diff.delta)
+         color = "green" if direction == "improved" else "red"
+         pct = pct_change(diff.score_a, diff.score_b)
+
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(
+             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+             fg=color,
+         )
+     else:
+         secho(f" score: {diff.score_a}", dim=True)
+
+     if diff.threshold_a != diff.threshold_b:
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+     else:
+         secho(f" threshold: {diff.threshold_a}", dim=True)
+
+     if diff.success_a != diff.success_b:
+         secho(f"- success: {diff.success_a}", fg="red")
+         secho(f"+ success: {diff.success_b}", fg="green")
+     else:
+         secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: Any) -> Dict[str, int]:
+     summary = {
+         "improved": 0,
+         "regressed": 0,
+         "unchanged": 0,
+         "added": 0,
+         "removed": 0,
+     }
+
+     for m in metrics:
+         if m.status in ("added", "removed"):
+             summary[m.status] += 1
+         else:
+             direction = metric_direction(m.delta)
+             summary[direction] += 1
+
+     return summary
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False):
+     if not summary_only:
+         for metric in resp.metrics:
+             print_metric_diff(metric)
+         return
+
+     print_summary(resp.metrics)
+
+
+ def print_summary(metrics: Any):
+     summary = summarize(metrics)
+
+     secho("\nSummary:", bold=True)
+     secho(f" ↑ Improved: {summary['improved']}", fg="green")
+     secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+     secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+     secho(f" + Added: {summary['added']}", fg="green")
+     secho(f" - Removed: {summary['removed']}", fg="red")
+
+
  @app.command()
  def compare(
      tag: list[str] = typer.Option(
          [],
          "--tag",
-         help="Tag identifying a reference testcase",
+         help="Label identifying a reference testcase",
      ),
      run: list[int] = typer.Option(
          [],
@@ -437,7 +542,12 @@ def compare(
      pretty: bool = typer.Option(
          True,
          "--pretty/--raw",
-         help="Pretty-print the response",
+         help="Pretty diff output (default) or raw JSON",
+     ),
+     summary: bool = typer.Option(
+         False,
+         "--summary",
+         help="Show summary only (no per-metric diff)",
      ),
      raise_exception: bool = typer.Option(
          False,
@@ -457,31 +567,39 @@ def compare(
      ),
  ) -> None:
      """
-     Compare two evaluation testcases using runs and/or labels.
+     Compare two evaluation testcases (runs and/or labels).
+
+     Output:
+     - Unified diff-style view by default
+     - Raw JSON with --raw
+     - Summary of improvements / regressions
      """

-     targets = []
+     targets: list[tuple[str, str | int]] = []

      for r in run:
          targets.append(("run", r))

-     for l in tag:
-         targets.append(("label", l))
+     for t in tag:
+         targets.append(("label", t))

      if len(targets) != 2:
-         typer.echo(
-             "❌ Provide exactly two targets using --run and/or --label"
+         typer.secho(
+             "❌ Provide exactly two targets using --run and/or --tag",
+             fg="red",
+             bold=True,
          )
          raise typer.Exit(code=1)

-     client = DeepEvalClient()
-
      (type_a, value_a), (type_b, value_b) = targets

-     typer.echo(
-         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+     typer.secho(
+         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
+         bold=True,
      )

+     client = DeepEvalClient()
+
      kwargs = {"raise_exception": raise_exception}

      if type_a == "run":
@@ -497,14 +615,34 @@ def compare(
      try:
          resp = client.compare_testcases(**kwargs)
      except Exception as e:
-         typer.echo(f"❌ Request failed: {e}")
+         typer.secho(f"❌ Request failed: {e}", fg="red")
          raise typer.Exit(code=1)

      if not resp:
-         typer.echo("⚠️ No response received")
+         typer.secho("⚠️ No response received", fg="yellow")
          raise typer.Exit(code=1)

-     _print_and_save(resp, pretty, out, overwrite)
+     if out:
+         if out.exists() and not overwrite:
+             typer.secho(
+                 f"❌ File already exists: {out} (use --overwrite)",
+                 fg="red",
+             )
+             raise typer.Exit(code=1)
+
+         out.write_text(json.dumps(resp, indent=2))
+         typer.secho(f"💾 Saved raw output to {out}", fg="green")
+
+     if not pretty:
+         typer.echo(json.dumps(resp, indent=2))
+         return
+
+     comparison = TestCaseComparison(**resp)
+
+     pretty_print_comparison(
+         comparison,
+         summary_only=summary,
+     )


  @app.command(hidden=True)
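
The reworked compare command now parses the raw response into the new TestCaseComparison model before printing. A minimal sketch of the equivalent programmatic flow, assuming the backend payload matches that schema; the run id and tag values are hypothetical:

# Sketch only: compare two testcases and summarize the metric deltas by hand.
from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.schema import TestCaseComparison

client = DeepEvalClient()
resp = client.compare_testcases(
    testcase_a_id=12,           # hypothetical run id
    testcase_b_tag="baseline",  # hypothetical tag
    raise_exception=True,
)

comparison = TestCaseComparison(**resp)
for m in comparison.metrics:
    if m.status in ("added", "removed"):
        print(f"{m.metric}: {m.status}")
    elif m.delta is None or m.delta == 0:
        print(f"{m.metric}: unchanged")
    else:
        print(f"{m.metric}: {'improved' if m.delta > 0 else 'regressed'} ({m.delta:+.3f})")
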
@@ -40,8 +40,7 @@ class DeepEvalClient:
          )
          self.base_url = raw_url.rstrip("/")
          self.api_token = (
-             api_token or settings_token or os.getenv(
-                 "EVALFRAMEWORK_API_KEY", "")
+             api_token or settings_token or os.getenv("EVALFRAMEWORK_API_KEY", "")
          )
          self.timeout = timeout

@@ -50,10 +49,10 @@ class DeepEvalClient:
          method: HTTPMethod,
          endpoint: str,
          *,
-         json: dict | None = None,
-         params: dict | None = None,
+         json: Dict | None = None,
+         params: Dict | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          url = f"{self.base_url}{endpoint}"

          headers = {
@@ -90,16 +89,16 @@ class DeepEvalClient:
              "raw": resp.text,
          }

-     def _get(self, endpoint: str, params: dict, *args, **kw):
+     def _get(self, endpoint: str, params: Dict, *args, **kw):
          return self._request("GET", endpoint, params=params, *args, **kw)

-     def _post(self, endpoint: str, payload: dict, *args, **kw):
+     def _post(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("POST", endpoint, json=payload, *args, **kw)

-     def _patch(self, endpoint: str, payload: dict, *args, **kw):
+     def _patch(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("PATCH", endpoint, json=payload, *args, **kw)

-     def _delete(self, endpoint: str, payload: dict, *args, **kw):
+     def _delete(self, endpoint: str, payload: Dict, *args, **kw):
          return self._request("DELETE", endpoint, json=payload, *args, **kw)

      def update_evaluation_testcase_tag(
@@ -133,10 +132,11 @@ class DeepEvalClient:
          limit: int = 10,
          offset: int = 0,
          raise_exception: bool = False,
-     ) -> Optional[List[Dict]]:
+     ) -> Optional[Dict]:
          """
          List evaluation testcases for the current API token only.
          Sorted by created_at DESC (newest first).
+         Returns items + pagination metadata.
          """
          return self._get(
              endpoint="/eval-framework/deepeval/evaluation-testcases/token",
@@ -202,7 +202,7 @@ class DeepEvalClient:
          testcase_b_id: int | None = None,
          testcase_b_tag: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare two evaluation testcases using IDs or tags.
          Exactly one identifier (id or tag) must be provided per testcase.
@@ -210,9 +210,7 @@ class DeepEvalClient:

          def validate(id_, tag, name: str):
              if bool(id_) == bool(tag):
-                 raise ValueError(
-                     f"Provide exactly one of {name}_id or {name}_tag"
-                 )
+                 raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")

          validate(testcase_a_id, testcase_a_tag, "testcase_a")
          validate(testcase_b_id, testcase_b_tag, "testcase_b")
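
The validate helper keeps the exactly-one-identifier rule: for each testcase, pass an id or a tag, never both or neither. Hypothetical call shapes, for illustration only:

# OK: one identifier per testcase.
client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")
# ValueError: both id and tag supplied for testcase_a.
client.compare_testcases(testcase_a_id=12, testcase_a_tag="v1", testcase_b_id=15)
# ValueError: neither id nor tag supplied for testcase_a.
client.compare_testcases(testcase_b_id=15)
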
@@ -241,7 +239,7 @@ class DeepEvalClient:
          label_a: str,
          label_b: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the latest evaluation testcases for two labels.
          """
@@ -259,7 +257,7 @@ class DeepEvalClient:
          *,
          label: str,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """
          Compare the last two evaluation testcases for a given label.
          """
@@ -277,7 +275,7 @@ class DeepEvalClient:
          config: EvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def text_eval(
@@ -288,7 +286,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def text_eval(
          self,
@@ -299,7 +297,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              config = EvalConfig(
                  data=data,
@@ -309,7 +307,9 @@ class DeepEvalClient:
              )

          return self._post(
-             endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
+             endpoint="/deepeval/text-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def text_eval_background(
@@ -319,12 +319,16 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background text evaluation (async job)."""
          payload = EvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
-         return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)
+         return self._post(
+             endpoint="/deepeval/text-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
+         )

      @overload
      def schema_eval(
@@ -335,7 +339,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      @overload
      def schema_eval(
@@ -343,7 +347,7 @@ class DeepEvalClient:
          config: SchemaEvalConfig,
          *,
          raise_exception: bool = False,
-     ) -> Optional[dict]: ...
+     ) -> Optional[Dict]: ...

      def schema_eval(
          self,
@@ -354,7 +358,7 @@ class DeepEvalClient:
          component: str = "unknown",
          label: str | None = None,
          raise_exception: bool = False,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          if config is None:
              if data is None or metrics is None:
                  raise ValueError(
@@ -381,13 +385,15 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Run background schema evaluation (async job)."""
          payload = SchemaEvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
          return self._post(
-             endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
+             endpoint="/deepeval/schema-eval/background",
+             payload=payload,
+             raise_exception=raise_exception,
          )

      def maybe_text_eval(
@@ -398,7 +404,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -421,7 +427,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -440,7 +446,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run schema_eval based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -463,7 +469,7 @@ class DeepEvalClient:
          raise_exception: bool = False,
          component: str = "unknown",
          label: Union[str, None] = None,
-     ) -> Optional[dict]:
+     ) -> Optional[Dict]:
          """Randomly run text_eval_background based on a probability between 0 and 1."""
          self._validate_chance(chance)
          return (
@@ -129,3 +129,24 @@ class SchemaEvalConfig(BaseModel):
      label: Union[str, None] = None
      data: List[SchemaInputItem]
      metrics: List[SchemaMetricConfig] = Field(default_factory=list)
+
+
+ class MetricDiff(BaseModel):
+     metric: str
+     score_a: Optional[float]
+     score_b: Optional[float]
+     delta: Optional[float]
+
+     success_a: Optional[bool]
+     success_b: Optional[bool]
+
+     threshold_a: Optional[float]
+     threshold_b: Optional[float]
+
+     status: str  # "unchanged" | "changed" | "added" | "removed"
+
+
+ class TestCaseComparison(BaseModel):
+     testcase_a_id: int
+     testcase_b_id: int
+     metrics: List[MetricDiff]
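
The new schema models added above back the compare output. A minimal sketch constructing them by hand; all field values are invented for illustration and follow the status vocabulary noted in the MetricDiff comment:

# Sketch only: build a TestCaseComparison manually to see the shape the CLI consumes.
from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison

comparison = TestCaseComparison(
    testcase_a_id=12,
    testcase_b_id=15,
    metrics=[
        MetricDiff(
            metric="answer_relevancy",
            score_a=0.71, score_b=0.78, delta=0.07,
            success_a=True, success_b=True,
            threshold_a=0.7, threshold_b=0.7,
            status="changed",
        ),
        MetricDiff(
            metric="faithfulness",
            score_a=None, score_b=0.9, delta=None,
            success_a=None, success_b=True,
            threshold_a=None, threshold_b=0.8,
            status="added",
        ),
    ],
)
print(comparison.model_dump())
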