rakam-eval-sdk 0.2.4rc6__tar.gz → 0.2.4rc8__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.4rc6
+ Version: 0.2.4rc8
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.4rc6"
+ version = "0.2.4rc8"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -25,6 +25,7 @@ dev = [
      "build>=1.2.2.post1",
      "pytest>=8.3.5",
      "twine>=6.1.0",
+     "pytest-cov>=5.0.0",
  ]

  [tool.isort]
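
The only dependency change in this release is pytest-cov being added to the dev extras; the new test modules further down are presumably run with it. A minimal sketch of invoking the suite with coverage (the coverage target and the tests/ path are assumptions, not taken from the package):

    # sketch: run the new test suite with coverage via pytest-cov
    # (the package target and test path below are assumed, not from the diff)
    import pytest

    raise SystemExit(
        pytest.main(
            [
                "--cov=rakam_eval_sdk",       # measure coverage for the SDK package
                "--cov-report=term-missing",  # list uncovered lines in the terminal
                "tests/",                     # assumed location of the new test files
            ]
        )
    )
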
@@ -5,23 +5,27 @@ import sys
  import uuid
  from datetime import datetime
  from pathlib import Path
- from pprint import pprint
- from typing import Any, Dict, List, Optional, Tuple, Union, Set
+ from typing import Any, Dict, List, Optional, Set, Tuple

  import typer
  from dotenv import load_dotenv
  from rich.console import Console
  from rich.panel import Panel
  from rich.pretty import Pretty
- from typer import secho

  from rakam_eval_sdk.client import DeepEvalClient
  from rakam_eval_sdk.decorators import eval_run
- from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
+ from rakam_eval_sdk.schema import TestCaseComparison
  from rakam_eval_sdk.utils.decorator_utils import (
      find_decorated_functions,
      load_module_from_path,
  )
+ from rakam_eval_sdk.utils.print import (
+     _print_and_save,
+     git_diff,
+     pretty_print_comparison,
+     serialize_for_diff,
+ )

  load_dotenv()
  app = typer.Typer(help="CLI tools for evaluation utilities")
@@ -31,10 +35,10 @@ console = Console()
  PROJECT_ROOT = os.path.abspath(".")
  if PROJECT_ROOT not in sys.path:
      sys.path.insert(0, PROJECT_ROOT)
- list_app = typer.Typer(help="List resources")
- app.add_typer(list_app, name="list")
+ list_app = typer.Typer(help="List evaluations or runs")
  metrics_app = typer.Typer(help="Metrics utilities")
- app.add_typer(metrics_app, name="metrics")
+
+ # Sub-apps are registered at the end to control command order


  def extract_metric_names(config: Any) -> List[Tuple[str, Optional[str]]]:
@@ -138,7 +142,7 @@ def list_evals(
      ),
  ) -> None:
      """
-     Find functions decorated with @track.
+     List evaluations (functions decorated with @eval_run).
      """
      TARGET_DECORATOR = eval_run.__name__
      files = directory.rglob("*.py") if recursive else directory.glob("*.py")
@@ -152,16 +156,16 @@ def list_evals(
              typer.echo(f"{file}:{fn}")

      if not found:
-         typer.echo(f"No @{TARGET_DECORATOR} functions found.")
+         typer.echo("No evaluations found.")


  @list_app.command("runs")
  def list_runs(
-     limit: int = typer.Option(20, help="Max number of runs"),
+     limit: int = typer.Option(20, "-l", "--limit", help="Max number of runs"),
      offset: int = typer.Option(0, help="Pagination offset"),
  ) -> None:
      """
-     List evaluation runs (newest first).
+     List runs (newest first).
      """
      client = DeepEvalClient()

@@ -175,7 +179,7 @@ def list_runs(
      total = response.get("total", 0)

      if not items:
-         typer.echo("No evaluation runs found.")
+         typer.echo("No runs found.")
          return

      typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
@@ -204,13 +208,15 @@ def list_runs(

  @app.command()
  def show(
-     id: Optional[int] = typer.Option(
+     run_id: Optional[int] = typer.Option(
          None,
+         "-i",
          "--id",
          help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
+         "-t",
          "--tag",
          help="Run tag",
      ),
@@ -221,19 +227,20 @@ def show(
      ),
  ) -> None:
      """
-     Show a single evaluation testcase by ID or tag.
+     Show a run by ID or tag.
      """
-     if not id and not tag:
+
+     if not run_id and not tag:
          raise typer.BadParameter("You must provide either --id or --tag")

-     if id and tag:
+     if run_id and tag:
          raise typer.BadParameter("Provide only one of --id or --tag")

      client = DeepEvalClient()

-     if id:
-         result = client.get_evaluation_testcase_by_id(id)
-         identifier = f"id={id}"
+     if run_id:
+         result = client.get_evaluation_testcase_by_id(run_id)
+         identifier = f"run_id={run_id}"
      else:
          assert tag is not None
          result = client.get_evaluation_testcase_by_tag(tag)
@@ -266,7 +273,7 @@ def show(
      console.print(
          Panel.fit(
              Pretty(result),
-             title="Evaluation TestCase",
+             title="Run",
              subtitle=identifier,
          )
      )
@@ -312,7 +319,7 @@ def run(
      save_runs: bool = typer.Option(
          False,
          "--save-runs",
-         help="Save each evaluation run result to a JSON file",
+         help="Save each run result to a JSON file",
      ),
      output_dir: Path = typer.Option(
          Path("./eval_runs"),
@@ -321,7 +328,7 @@ def run(
      ),
  ) -> None:
      """
-     Find and execute all functions decorated with @eval_run.
+     Execute evaluations (functions decorated with @eval_run).
      """
      files = directory.rglob("*.py") if recursive else directory.glob("*.py")
      TARGET_DECORATOR = eval_run.__name__
@@ -401,248 +408,141 @@ def run(
              typer.echo(f" ❌ Execution failed: {e}")

      if not executed_any and not dry_run:
-         typer.echo("\nNo @eval_run functions executed.")
-
-
- def _print_and_save(
-     resp: dict,
-     pretty: bool,
-     out: Optional[Path],
-     overwrite: bool,
- ) -> None:
-     if pretty:
-         typer.echo(typer.style("📊 Result:", bold=True))
-         pprint(resp)
-     else:
-         typer.echo(resp)
-
-     if out is None:
-         return
-
-     if out.exists() and not overwrite:
-         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
-         raise typer.Exit(code=1)
+         typer.echo("\nNo evaluations executed.")

-     out.parent.mkdir(parents=True, exist_ok=True)

-     with out.open("w", encoding="utf-8") as f:
-         json.dump(resp, f, indent=2, ensure_ascii=False)
-
-     typer.echo(f"💾 Result saved to {out}")
-
-
- def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
-     if a is None or b is None or a == 0:
-         return None
-     return f"{((b - a) / a) * 100:+.2f}%"
-
-
- def metric_direction(delta: Optional[float]) -> str:
-     if delta is None:
-         return "unchanged"
-     if delta > 0:
-         return "improved"
-     if delta < 0:
-         return "regressed"
-     return "unchanged"
-
-
- def print_metric_diff(diff: MetricDiff) -> None:
-     secho(f"\nMetric: {diff.metric}", bold=True)
-
-     if diff.status == "added":
-         secho(f"+ score: {diff.score_b}", fg="green")
-         secho(f"+ threshold: {diff.threshold_b}", fg="green")
-         secho(f"+ success: {diff.success_b}", fg="green")
-         return
-
-     if diff.status == "removed":
-         secho(f"- score: {diff.score_a}", fg="red")
-         secho(f"- threshold: {diff.threshold_a}", fg="red")
-         secho(f"- success: {diff.success_a}", fg="red")
-         return
-
-     # unchanged / changed
-     if diff.score_a != diff.score_b:
-         direction = metric_direction(diff.delta)
-         color = "green" if direction == "improved" else "red"
-         pct = pct_change(diff.score_a, diff.score_b)
-
-         secho(f"- score: {diff.score_a}", fg="red")
-         secho(
-             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
-             fg=color,
-         )
-     else:
-         secho(f" score: {diff.score_a}", dim=True)
-
-     if diff.threshold_a != diff.threshold_b:
-         secho(f"- threshold: {diff.threshold_a}", fg="red")
-         secho(f"+ threshold: {diff.threshold_b}", fg="green")
-     else:
-         secho(f" threshold: {diff.threshold_a}", dim=True)
-
-     if diff.success_a != diff.success_b:
-         secho(f"- success: {diff.success_a}", fg="red")
-         secho(f"+ success: {diff.success_b}", fg="green")
+ def fetch_run(
+     client: DeepEvalClient,
+     *,
+     run_id: Optional[int],
+     tag: Optional[str],
+ ) -> Tuple[dict, str]:
+     """
+     Fetch a single run by id or tag.
+     Returns (payload, identifier)
+     """
+     if run_id is not None:
+         result = client.get_evaluation_testcase_by_id(run_id)
+         identifier = f"run_id={run_id}"
      else:
-         secho(f" success: {diff.success_a}", dim=True)
-
-
- def summarize(metrics: Any) -> Dict[str, int]:
-     summary = {
-         "improved": 0,
-         "regressed": 0,
-         "unchanged": 0,
-         "added": 0,
-         "removed": 0,
-     }
-
-     for m in metrics:
-         if m.status in ("added", "removed"):
-             summary[m.status] += 1
-         else:
-             direction = metric_direction(m.delta)
-             summary[direction] += 1
-
-     return summary
-
-
- def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
-     if not summary_only:
-         for metric in resp.metrics:
-             print_metric_diff(metric)
-         return
-
-     print_summary(resp.metrics)
-
+         assert tag is not None
+         result = client.get_evaluation_testcase_by_tag(tag)
+         identifier = f"tag={tag}"

- def print_summary(metrics: Any) -> None:
-     summary = summarize(metrics)
+     if not result:
+         raise RuntimeError(f"No data returned for {identifier}")

-     secho("\nSummary:", bold=True)
-     secho(f" ↑ Improved: {summary['improved']}", fg="green")
-     secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
-     secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
-     secho(f" + Added: {summary['added']}", fg="green")
-     secho(f" - Removed: {summary['removed']}", fg="red")
+     return result, identifier


  @app.command()
  def compare(
      tag: List[str] = typer.Option(
          [],
+         "-t",
          "--tag",
-         help="Label identifying a reference testcase",
+         help="Run tag",
      ),
-     run: List[int] = typer.Option(
+     run_id: List[int] = typer.Option(
          [],
-         "--run",
-         help="Run ID identifying an evaluation testcase",
-     ),
-     pretty: bool = typer.Option(
-         True,
-         "--pretty/--raw",
-         help="Pretty diff output (default) or raw JSON",
+         "-i",
+         "--id",
+         help="Run ID",
      ),
      summary: bool = typer.Option(
          False,
          "--summary",
-         help="Show summary only (no per-metric diff)",
+         help="Show summary diff only",
      ),
-     raise_exception: bool = typer.Option(
-         False,
-         "--raise",
-         help="Raise HTTP exceptions instead of swallowing them",
-     ),
-     out: Optional[Path] = typer.Option(
-         None,
-         "-o",
-         "--out",
-         help="Optional file path to save the result as JSON",
-     ),
-     overwrite: bool = typer.Option(
+     side_by_side: bool = typer.Option(
          False,
-         "--overwrite",
-         help="Overwrite output file if it already exists",
+         "--side-by-side",
+         help="Show side-by-side diff (git)",
      ),
  ) -> None:
      """
-     Compare two evaluation testcases (runs and/or labels).
+     Compare two evaluation runs.

-     Output:
-     - Unified diff-style view by default
-     - Raw JSON with --raw
-     - Summary of improvements / regressions
+     Default: unified git diff
      """

-     targets: List[Tuple[str, Union[str, int]]] = []
+     if summary and side_by_side:
+         typer.secho(
+             "❌ --summary and --side-by-side cannot be used together",
+             fg="red",
+             bold=True,
+         )
+         raise typer.Exit(code=1)

-     for r in run:
-         targets.append(("run", r))
+     targets: List[Tuple[Optional[int], Optional[str]]] = []

+     for r in run_id:
+         targets.append(("run", r))
      for t in tag:
-         targets.append(("label", t))
+         targets.append(("tag", t))

      if len(targets) != 2:
          typer.secho(
-             "❌ Provide exactly two targets using --run and/or --tag",
+             "❌ Provide exactly two targets using --id and/or --tag",
              fg="red",
              bold=True,
          )
          raise typer.Exit(code=1)

-     (type_a, value_a), (type_b, value_b) = targets
-
-     typer.secho(
-         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
-         bold=True,
-     )
-
      client = DeepEvalClient()
+     # Summary mode (reduced payload)
+     (type_a, value_a), (type_b, value_b) = targets
+     if summary:
+         kwargs: Dict[str, Any] = {"raise_exception": True}
+         if type_a == "run":
+             kwargs["testcase_a_id"] = value_a
+         else:
+             kwargs["testcase_a_tag"] = value_a

-     kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
-
-     if type_a == "run":
-         kwargs["testcase_a_id"] = value_a
-     else:
-         kwargs["testcase_a_tag"] = value_a
+         if type_b == "run":
+             kwargs["testcase_b_id"] = value_b
+         else:
+             kwargs["testcase_b_tag"] = value_b
+         try:
+             resp = client.compare_testcases(**kwargs)
+         except Exception as e:
+             typer.secho(f"❌ Request failed: {e}", fg="red")
+             raise typer.Exit(code=1)

-     if type_b == "run":
-         kwargs["testcase_b_id"] = value_b
-     else:
-         kwargs["testcase_b_tag"] = value_b
+         if not resp:
+             typer.secho("⚠️ No response received", fg="yellow")
+             raise typer.Exit(code=1)
+         comparison = TestCaseComparison(**resp)
+         pretty_print_comparison(
+             comparison,
+             summary_only=summary,
+         )
+         return

      try:
-         resp = client.compare_testcases(**kwargs)
+         run_a, id_a = fetch_run(
+             client,
+             run_id=value_a if type_a == "run" else None,
+             tag=value_a if type_a == "tag" else None,
+         )
+         run_b, id_b = fetch_run(
+             client,
+             run_id=value_b if type_b == "run" else None,
+             tag=value_b if type_b == "tag" else None,
+         )
      except Exception as e:
-         typer.secho(f"❌ Request failed: {e}", fg="red")
+         typer.secho(f"❌ Fetch failed: {e}", fg="red")
          raise typer.Exit(code=1)

-     if not resp:
-         typer.secho("⚠️ No response received", fg="yellow")
-         raise typer.Exit(code=1)
+     a_text = serialize_for_diff(run_a)
+     b_text = serialize_for_diff(run_b)

-     if out:
-         if out.exists() and not overwrite:
-             typer.secho(
-                 f"❌ File already exists: {out} (use --overwrite)",
-                 fg="red",
-             )
-             raise typer.Exit(code=1)
-
-         out.write_text(json.dumps(resp, indent=2))
-         typer.secho(f"💾 Saved raw output to {out}", fg="green")
-
-     if not pretty:
-         typer.echo(json.dumps(resp, indent=2))
-         return
-
-     comparison = TestCaseComparison(**resp)
-
-     pretty_print_comparison(
-         comparison,
-         summary_only=summary,
+     git_diff(
+         a_text,
+         b_text,
+         label_a=f"{id_a}.full.json",
+         label_b=f"{id_b}.full.json",
+         side_by_side=side_by_side,
      )


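The hunk above replaces the raw-JSON/--out workflow of compare with two fetched runs rendered as a git-style diff (or a summary table with --summary). A short sketch of driving the reworked command through Typer's test runner; the run ID and tag values are invented for illustration, and both calls still go through DeepEvalClient, so a reachable evaluation backend is assumed:

    # sketch: exercising the reworked `compare` command (ID and tag values are invented)
    from typer.testing import CliRunner

    from rakam_eval_sdk.cli import app

    runner = CliRunner()

    # default: unified git diff of the two serialized runs
    result = runner.invoke(app, ["compare", "--id", "12", "--tag", "baseline"])
    print(result.output)

    # summary-only comparison of the same pair
    result = runner.invoke(app, ["compare", "--id", "12", "--tag", "baseline", "--summary"])
    print(result.output)
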
@@ -679,7 +579,7 @@ def compare_label_latest(
      ),
  ) -> None:
      """
-     Compare the latest evaluation runs for two labels.
+     Compare the latest runs for two labels.
      """
      client = DeepEvalClient()

@@ -757,11 +657,13 @@ def compare_last(
  def tag_command(
      run_id: Optional[int] = typer.Option(
          None,
-         "--run",
-         help="Evaluation run ID",
+         "-i",
+         "--id",
+         help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
+         "-t",
          "--tag",
          help="Tag to assign to the run",
      ),
@@ -778,11 +680,11 @@ def tag_command(
      # --- validation ---
      if delete:
          if run_id or tag:
-             typer.echo("❌ --delete cannot be used with --run or --tag")
+             typer.echo("❌ --delete cannot be used with --id or --tag")
              raise typer.Exit(code=1)
      else:
          if not run_id or not tag:
-             typer.echo("❌ Use --run and --tag together, or --delete")
+             typer.echo("❌ Use --id and --tag together, or --delete")
              raise typer.Exit(code=1)

      client = DeepEvalClient()
@@ -812,6 +714,11 @@ def tag_command(
      typer.echo(f"Tag: {result.get('tag')}")


+ # Register sub-apps in user journey order (after regular commands)
+ app.add_typer(list_app, name="list")
+ app.add_typer(metrics_app, name="metrics")
+
+
  def main() -> None:
      app()

@@ -0,0 +1,58 @@
+ from pathlib import Path
+ from types import SimpleNamespace
+
+ import pytest
+ import typer
+
+ from rakam_eval_sdk.cli import metrics
+
+
+ def test_metrics_no_metrics_found(
+     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+ ) -> None:
+     file = tmp_path / "eval.py"
+     file.write_text("def foo(): pass")
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.find_decorated_functions",
+         lambda *_: [],
+     )
+
+     with pytest.raises(typer.Exit) as exc:
+         metrics(directory=tmp_path, recursive=False)
+
+     assert exc.value.exit_code == 0
+
+
+ def test_metrics_finds_metrics(
+     tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+ ) -> None:
+     file = tmp_path / "eval.py"
+     file.write_text("")
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.find_decorated_functions",
+         lambda *_: ["run_eval"],
+     )
+
+     fake_module = SimpleNamespace(
+         run_eval=lambda: SimpleNamespace(
+             metrics=[
+                 SimpleNamespace(type="accuracy", name=None),
+                 SimpleNamespace(type="f1", name="f1_v2"),
+             ]
+         )
+     )
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.cli.load_module_from_path",
+         lambda _: fake_module,
+     )
+
+     metrics(directory=tmp_path, recursive=False)
+
+     out = capsys.readouterr().out
+
+     assert "accuracy" in out
+     assert "f1" in out
+     assert "unique metrics found" in out
@@ -0,0 +1,112 @@
+ import os
+ from pathlib import Path
+ from typing import List
+
+ import pytest
+
+ from rakam_eval_sdk.client import DeepEvalClient
+ from rakam_eval_sdk.decorators import eval_run
+ from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions
+
+
+ @pytest.fixture
+ def client() -> DeepEvalClient:
+     return DeepEvalClient(base_url="http://testserver", api_token="testtoken")
+
+
+ class FakeCPUTimes:
+     def __init__(self, user: float = 1.0, system: float = 1.0) -> None:
+         self.user: float = user
+         self.system: float = system
+
+
+ class FakeMemInfo:
+     def __init__(self, rss: int) -> None:
+         self.rss: int = rss
+
+
+ class FakeProcess:
+     def __init__(self) -> None:
+         self._cpu_calls: int = 0
+         self._mem_calls: int = 0
+
+     def cpu_times(self) -> FakeCPUTimes:
+         self._cpu_calls += 1
+         # simulate CPU usage increase
+         return FakeCPUTimes(
+             user=1.0 + self._cpu_calls,
+             system=1.0,
+         )
+
+     def memory_info(self) -> FakeMemInfo:
+         self._mem_calls += 1
+         return FakeMemInfo(rss=100_000_000 + (self._mem_calls * 10_000))
+
+
+ @pytest.fixture(autouse=True)
+ def patch_psutil(
+     monkeypatch: pytest.MonkeyPatch,
+ ) -> FakeProcess:
+     fake_process: FakeProcess = FakeProcess()
+
+     monkeypatch.setattr(
+         "rakam_eval_sdk.decorators.psutil.Process",
+         lambda pid: fake_process,
+     )
+     monkeypatch.setattr(os, "getpid", lambda: 123)
+     return fake_process
+
+
+ def test_eval_run_basic(capsys: pytest.CaptureFixture[str]) -> None:
+     @eval_run
+     def add(a: int, b: int) -> int:
+         return a + b
+
+     result: int = add(2, 3)
+
+     assert result == 5
+
+     out: str = capsys.readouterr().out
+     assert "[eval_run]" in out
+     assert "add" in out
+     assert "time=" in out
+     assert "cpu=" in out
+     assert "mem_delta=" in out
+
+
+ def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
+     @eval_run()
+     def mul(a: int, b: int) -> int:
+         return a * b
+
+     result: int = mul(3, 4)
+
+     assert result == 12
+     assert "[eval_run]" in capsys.readouterr().out
+
+
+ def test_find_decorated_functions(tmp_path: Path) -> None:
+     code = """
+ from rakam_eval_sdk.decorators import eval_run
+
+ @eval_run
+ def foo():
+     pass
+
+ @eval_run()
+ def bar():
+     pass
+
+ async def baz():
+     pass
+
+ @other
+ def nope():
+     pass
+ """
+     file = tmp_path / "test_mod.py"
+     file.write_text(code)
+
+     result: List[str] = find_decorated_functions(file, "eval_run")
+
+     assert set(result) == {"foo", "bar"}
@@ -0,0 +1,51 @@
+ from types import SimpleNamespace
+
+ from rakam_eval_sdk.cli import extract_metric_names
+
+
+ def test_extract_metric_names_no_metrics_attr() -> None:
+     config = SimpleNamespace()
+
+     result = extract_metric_names(config)
+
+     assert result == []
+
+
+ def test_extract_metric_names_empty_metrics() -> None:
+     config = SimpleNamespace(metrics=[])
+
+     result = extract_metric_names(config)
+
+     assert result == []
+
+
+ def test_extract_metric_names_with_types_only() -> None:
+     config = SimpleNamespace(
+         metrics=[
+             SimpleNamespace(type="accuracy"),
+             SimpleNamespace(type="latency"),
+         ]
+     )
+
+     result = extract_metric_names(config)
+
+     assert result == [
+         ("accuracy", None),
+         ("latency", None),
+     ]
+
+
+ def test_extract_metric_names_with_type_and_name() -> None:
+     config = SimpleNamespace(
+         metrics=[
+             SimpleNamespace(type="accuracy", name="acc_v1"),
+             SimpleNamespace(type="f1", name=None),
+         ]
+     )
+
+     result = extract_metric_names(config)
+
+     assert result == [
+         ("accuracy", "acc_v1"),
+         ("f1", None),
+     ]
@@ -0,0 +1,30 @@
+ from pathlib import Path
+ from types import ModuleType
+
+ import pytest
+
+ from rakam_eval_sdk.utils.decorator_utils import load_module_from_path
+
+
+ def test_load_module_from_path_success(tmp_path: Path) -> None:
+     file = tmp_path / "mod.py"
+     file.write_text(
+         """
+ x = 42
+ def hello():
+     return "world"
+ """
+     )
+
+     module: ModuleType = load_module_from_path(file)
+
+     assert module.x == 42
+     assert module.hello() == "world"
+
+
+ def test_load_module_from_path_invalid(tmp_path: Path) -> None:
+     file: Path = tmp_path / "broken.py"
+     file.write_text("def foo(") # invalid syntax
+
+     with pytest.raises(SyntaxError):
+         load_module_from_path(file)
@@ -0,0 +1,177 @@
+ from typing import Optional, List, Dict
+ import json
+ from pathlib import Path
+ from types import SimpleNamespace
+ from typing import Any
+
+ import pytest
+ import typer
+
+ from rakam_eval_sdk.utils.print import (
+     _print_and_save,
+     metric_direction,
+     pct_change,
+     pretty_print_comparison,
+     summarize,
+ )
+
+
+ @pytest.mark.parametrize(
+     "a,b,expected",
+     [
+         (10, 20, "+100.00%"),
+         (20, 10, "-50.00%"),
+         (10, 10, "+0.00%"),
+         (None, 10, None),
+         (10, None, None),
+         (0, 10, None),
+     ],
+ )
+ def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str]):
+     assert pct_change(a, b) == expected
+
+
+ # -------------------------
+ # metric_direction
+ # -------------------------
+
+
+ @pytest.mark.parametrize(
+     "delta,expected",
+     [
+         (1.0, "improved"),
+         (-1.0, "regressed"),
+         (0.0, "unchanged"),
+         (None, "unchanged"),
+     ],
+ )
+ def test_metric_direction(delta: Optional[float], expected: str):
+     assert metric_direction(delta) == expected
+
+
+ def test_print_and_save_no_output_path(capsys: pytest.CaptureFixture[str]) -> None:
+     resp: Dict[str, int] = {"a": 1}
+
+     _print_and_save(resp, pretty=False, out=None, overwrite=False)
+
+     captured = capsys.readouterr()
+     assert str(resp) in captured.out
+
+
+ def test_print_and_save_creates_file(
+     tmp_path: Path,
+     capsys: pytest.CaptureFixture[str],
+ ) -> None:
+     out: Path = tmp_path / "result.json"
+     resp: Dict[str, str] = {"hello": "world"}
+
+     _print_and_save(resp, pretty=False, out=out, overwrite=False)
+
+     assert out.exists()
+     assert json.loads(out.read_text()) == resp
+
+     captured = capsys.readouterr()
+     assert "Result saved" in captured.out
+
+
+ def test_print_and_save_refuses_overwrite(tmp_path: Path) -> None:
+     out: Path = tmp_path / "result.json"
+     out.write_text("{}")
+
+     with pytest.raises(typer.Exit):
+         _print_and_save({"x": 1}, pretty=False, out=out, overwrite=False)
+
+
+ def test_print_and_save_overwrites(tmp_path: Path) -> None:
+     out: Path = tmp_path / "result.json"
+     out.write_text("{}")
+
+     _print_and_save({"x": 2}, pretty=False, out=out, overwrite=True)
+
+     assert json.loads(out.read_text()) == {"x": 2}
+
+
+ def make_metric(
+     *,
+     metric: str,
+     status: str = "changed",
+     delta: Optional[float] = None,
+ ) -> SimpleNamespace:
+     return SimpleNamespace(
+         metric=metric,
+         status=status,
+         delta=delta,
+     )
+
+
+ def test_summarize_groups_all_states() -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="removed_metric", status="removed"),
+         make_metric(metric="improved_metric", delta=1.0),
+         make_metric(metric="regressed_metric", delta=-1.0),
+         make_metric(metric="unchanged_zero", delta=0.0),
+         make_metric(metric="unchanged_none", delta=None),
+     ]
+
+     summary: Dict[str, List[str]] = summarize(metrics=metrics)
+
+     assert summary == {
+         "improved": ["improved_metric"],
+         "regressed": ["regressed_metric"],
+         "unchanged": ["unchanged_zero", "unchanged_none"],
+         "added": ["added_metric"],
+         "removed": ["removed_metric"],
+     }
+
+
+ def test_summarize_empty_metrics() -> None:
+     assert summarize(metrics=[]) == {
+         "improved": [],
+         "regressed": [],
+         "unchanged": [],
+         "added": [],
+         "removed": [],
+     }
+
+
+ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="improved_metric", delta=1.0),
+     ]
+     resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+     called: Dict[str, Any] = {}
+
+     def fake_print_summary(arg: Any) -> None:
+         called["metrics"] = arg
+
+     import rakam_eval_sdk.utils.print as mod
+
+     monkeypatch.setattr(mod, "print_summary", fake_print_summary)
+
+     pretty_print_comparison(resp, summary_only=True)
+
+     assert called["metrics"] == metrics
+
+
+ def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
+     metrics: List[SimpleNamespace] = [
+         make_metric(metric="added_metric", status="added"),
+         make_metric(metric="improved_metric", delta=1.0),
+     ]
+     resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+     calls: List[Any] = []
+
+     def fake_print_metric_diff(metric: Any) -> None:
+         calls.append(metric)
+
+     import rakam_eval_sdk.utils.print as mod
+
+     monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
+
+     pretty_print_comparison(resp, summary_only=False)
+
+     assert calls == metrics
@@ -0,0 +1,269 @@
+ import json
+ from pathlib import Path
+ from pprint import pprint
+ from typing import Any, Dict, List, Optional
+
+ import typer
+ from typer import secho
+
+ from rakam_eval_sdk.schema import MetricDiff
+
+
+ def _print_and_save(
+     resp: dict,
+     pretty: bool,
+     out: Optional[Path],
+     overwrite: bool,
+ ) -> None:
+     if pretty:
+         typer.echo(typer.style("📊 Result:", bold=True))
+         pprint(resp)
+     else:
+         typer.echo(resp)
+
+     if out is None:
+         return
+
+     if out.exists() and not overwrite:
+         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+         raise typer.Exit(code=1)
+
+     out.parent.mkdir(parents=True, exist_ok=True)
+
+     with out.open("w", encoding="utf-8") as f:
+         json.dump(resp, f, indent=2, ensure_ascii=False)
+
+     typer.echo(f"💾 Result saved to {out}")
+
+
+ def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+     if a is None or b is None or a == 0:
+         return None
+     return f"{((b - a) / a) * 100:+.2f}%"
+
+
+ def metric_direction(delta: Optional[float]) -> str:
+     if delta is None:
+         return "unchanged"
+     if delta > 0:
+         return "improved"
+     if delta < 0:
+         return "regressed"
+     return "unchanged"
+
+
+ def print_metric_diff(diff: MetricDiff) -> None:
+     secho(f"\nMetric: {diff.metric}", bold=True)
+
+     if diff.status == "added":
+         secho(f"+ score: {diff.score_b}", fg="green")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+         secho(f"+ success: {diff.success_b}", fg="green")
+         return
+
+     if diff.status == "removed":
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"- success: {diff.success_a}", fg="red")
+         return
+
+     # unchanged / changed
+     if diff.score_a != diff.score_b:
+         direction = metric_direction(diff.delta)
+         color = "green" if direction == "improved" else "red"
+         pct = pct_change(diff.score_a, diff.score_b)
+
+         secho(f"- score: {diff.score_a}", fg="red")
+         secho(
+             f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+             fg=color,
+         )
+     else:
+         secho(f" score: {diff.score_a}", dim=True)
+
+     if diff.threshold_a != diff.threshold_b:
+         secho(f"- threshold: {diff.threshold_a}", fg="red")
+         secho(f"+ threshold: {diff.threshold_b}", fg="green")
+     else:
+         secho(f" threshold: {diff.threshold_a}", dim=True)
+
+     if diff.success_a != diff.success_b:
+         secho(f"- success: {diff.success_a}", fg="red")
+         secho(f"+ success: {diff.success_b}", fg="green")
+     else:
+         secho(f" success: {diff.success_a}", dim=True)
+
+
+ def summarize(metrics: List[MetricDiff]) -> Dict[str, List[str]]:
+     """
+     Returns metric names grouped by category.
+     """
+     summary: Dict[str, List[str]] = {
+         "improved": [],
+         "regressed": [],
+         "unchanged": [],
+         "added": [],
+         "removed": [],
+     }
+
+     for m in metrics:
+         if m.status == "added":
+             summary["added"].append(m.metric)
+             continue
+
+         if m.status == "removed":
+             summary["removed"].append(m.metric)
+             continue
+
+         direction = metric_direction(m.delta)
+         summary[direction].append(m.metric)
+
+     return summary
+
+
+ def _fmt(metrics: List[str]) -> str:
+     return ", ".join(metrics) if metrics else "-"
+
+
+ def print_summary(metrics: List[MetricDiff]) -> None:
+     summary = summarize(metrics)
+
+     rows = [
+         ("↑ Improved", "improved", "green"),
+         ("↓ Regressed", "regressed", "red"),
+         ("± Unchanged", "unchanged", None),
+         ("+ Added.", "added", "green"),
+         ("- Removed.", "removed", "red"),
+     ]
+
+     secho("\nSummary:", bold=True)
+     secho(
+         " | Status | # | Metrics |",
+         dim=True,
+     )
+     secho(
+         " |--------------|---|------------------------|",
+         dim=True,
+     )
+
+     for label, key, color in rows:
+         count = len(summary[key])
+         metrics_str = _fmt(summary[key])
+
+         line = f" | {label:<12} | {count:<1} | {metrics_str:<22} |"
+
+         if color:
+             secho(line, fg=color)
+         else:
+             secho(line, dim=True)
+
+
+ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+     if not summary_only:
+         for metric in resp.metrics:
+             print_metric_diff(metric)
+         return
+
+     print_summary(resp.metrics)
+
+
+ def serialize_for_diff(obj: dict) -> str:
+     """
+     Stable, git-friendly JSON representation
+     """
+     return (
+         json.dumps(
+             obj,
+             indent=4,
+             sort_keys=True,
+             ensure_ascii=False,
+         )
+         + "\n"
+     )
+
+
+ def git_diff(
+     a_text: str,
+     b_text: str,
+     *,
+     label_a: str,
+     label_b: str,
+     side_by_side: bool = False,
+ ) -> None:
+     """
+     Show diff between two text blobs.
+
+     - side_by_side: uses git difftool with vimdiff (interactive)
+     - fallback to git diff -U3 or difflib if git is not available
+     """
+     import shutil
+     import subprocess
+     from pathlib import Path
+     from tempfile import TemporaryDirectory
+
+     import typer
+
+     git = shutil.which("git")
+     vimdiff = shutil.which("vimdiff")
+
+     with TemporaryDirectory() as tmp:
+         a = Path(tmp) / label_a
+         b = Path(tmp) / label_b
+
+         a.write_text(a_text)
+         b.write_text(b_text)
+
+         # --- Side-by-side with vimdiff ---
+         if side_by_side:
+             if not git:
+                 typer.secho(
+                     "❌ Git is required for side-by-side diffs", fg="red", bold=True
+                 )
+                 return
+             if not vimdiff:
+                 typer.secho(
+                     "❌ Vimdiff is not installed. Please install vim or vimdiff to use side-by-side mode.",
+                     fg="red",
+                     bold=True,
+                 )
+                 return
+
+             cmd = [
+                 git,
+                 "difftool",
+                 "--no-index",
+                 "--tool=vimdiff",
+                 "--no-prompt", # skip Y/n prompt
+                 str(a),
+                 str(b),
+             ]
+
+             subprocess.run(cmd, check=False)
+             return
+
+         # (default)
+         if git:
+             cmd = [
+                 git,
+                 "diff",
+                 "--no-index",
+                 "--color=always",
+                 "-U3",
+                 str(a),
+                 str(b),
+             ]
+             subprocess.run(cmd, check=False)
+             return
+
+         # Fallback
+         import difflib
+
+         diff = difflib.unified_diff(
+             a_text.splitlines(),
+             b_text.splitlines(),
+             fromfile=label_a,
+             tofile=label_b,
+             lineterm="",
+         )
+         for line in diff:
+             typer.echo(line)
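
The helpers in this new module are usable on their own; a minimal sketch of diffing two payloads directly with serialize_for_diff and git_diff, where the sample payloads are invented:

    # sketch: diffing two arbitrary run payloads with the new helpers (sample data is invented)
    from rakam_eval_sdk.utils.print import git_diff, serialize_for_diff

    run_a = {"tag": "baseline", "metrics": {"accuracy": 0.91}}
    run_b = {"tag": "candidate", "metrics": {"accuracy": 0.94}}

    git_diff(
        serialize_for_diff(run_a),   # sorted, indented JSON keeps the diff stable
        serialize_for_diff(run_b),
        label_a="baseline.full.json",
        label_b="candidate.full.json",
        side_by_side=False,          # True opens `git difftool --tool=vimdiff` instead
    )

Because serialize_for_diff sorts keys and appends a trailing newline, the same payload always serializes to identical text, which keeps the git diff output free of spurious changes.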