rakam-eval-sdk 0.2.4rc7-py3-none-any.whl → 0.2.4rc8-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in that registry.
rakam_eval_sdk/cli.py CHANGED
@@ -5,7 +5,7 @@ import sys
5
5
  import uuid
6
6
  from datetime import datetime
7
7
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
8
+ from typing import Any, Dict, List, Optional, Set, Tuple
9
9
 
10
10
  import typer
11
11
  from dotenv import load_dotenv
@@ -20,7 +20,12 @@ from rakam_eval_sdk.utils.decorator_utils import (
20
20
  find_decorated_functions,
21
21
  load_module_from_path,
22
22
  )
23
- from rakam_eval_sdk.utils.print import _print_and_save, pretty_print_comparison
23
+ from rakam_eval_sdk.utils.print import (
24
+ _print_and_save,
25
+ git_diff,
26
+ pretty_print_comparison,
27
+ serialize_for_diff,
28
+ )
24
29
 
25
30
  load_dotenv()
26
31
  app = typer.Typer(help="CLI tools for evaluation utilities")
@@ -30,10 +35,10 @@ console = Console()
30
35
  PROJECT_ROOT = os.path.abspath(".")
31
36
  if PROJECT_ROOT not in sys.path:
32
37
  sys.path.insert(0, PROJECT_ROOT)
33
- list_app = typer.Typer(help="List resources")
34
- app.add_typer(list_app, name="list")
38
+ list_app = typer.Typer(help="List evaluations or runs")
35
39
  metrics_app = typer.Typer(help="Metrics utilities")
36
- app.add_typer(metrics_app, name="metrics")
40
+
41
+ # Sub-apps are registered at the end to control command order
37
42
 
38
43
 
39
44
  def extract_metric_names(config: Any) -> List[Tuple[str, Optional[str]]]:
@@ -137,7 +142,7 @@ def list_evals(
137
142
  ),
138
143
  ) -> None:
139
144
  """
140
- Find functions decorated with @track.
145
+ List evaluations (functions decorated with @eval_run).
141
146
  """
142
147
  TARGET_DECORATOR = eval_run.__name__
143
148
  files = directory.rglob("*.py") if recursive else directory.glob("*.py")
@@ -151,16 +156,16 @@ def list_evals(
151
156
  typer.echo(f"{file}:{fn}")
152
157
 
153
158
  if not found:
154
- typer.echo(f"No @{TARGET_DECORATOR} functions found.")
159
+ typer.echo("No evaluations found.")
155
160
 
156
161
 
157
162
  @list_app.command("runs")
158
163
  def list_runs(
159
- limit: int = typer.Option(20, help="Max number of runs"),
164
+ limit: int = typer.Option(20, "-l", "--limit", help="Max number of runs"),
160
165
  offset: int = typer.Option(0, help="Pagination offset"),
161
166
  ) -> None:
162
167
  """
163
- List evaluation runs (newest first).
168
+ List runs (newest first).
164
169
  """
165
170
  client = DeepEvalClient()
166
171
 
@@ -174,7 +179,7 @@ def list_runs(
174
179
  total = response.get("total", 0)
175
180
 
176
181
  if not items:
177
- typer.echo("No evaluation runs found.")
182
+ typer.echo("No runs found.")
178
183
  return
179
184
 
180
185
  typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
@@ -203,13 +208,15 @@ def list_runs(
203
208
 
204
209
  @app.command()
205
210
  def show(
206
- id: Optional[int] = typer.Option(
211
+ run_id: Optional[int] = typer.Option(
207
212
  None,
213
+ "-i",
208
214
  "--id",
209
215
  help="Run ID",
210
216
  ),
211
217
  tag: Optional[str] = typer.Option(
212
218
  None,
219
+ "-t",
213
220
  "--tag",
214
221
  help="Run tag",
215
222
  ),
@@ -220,19 +227,20 @@ def show(
220
227
  ),
221
228
  ) -> None:
222
229
  """
223
- Show a single evaluation testcase by ID or tag.
230
+ Show a run by ID or tag.
224
231
  """
225
- if not id and not tag:
232
+
233
+ if not run_id and not tag:
226
234
  raise typer.BadParameter("You must provide either --id or --tag")
227
235
 
228
- if id and tag:
236
+ if run_id and tag:
229
237
  raise typer.BadParameter("Provide only one of --id or --tag")
230
238
 
231
239
  client = DeepEvalClient()
232
240
 
233
- if id:
234
- result = client.get_evaluation_testcase_by_id(id)
235
- identifier = f"id={id}"
241
+ if run_id:
242
+ result = client.get_evaluation_testcase_by_id(run_id)
243
+ identifier = f"run_id={run_id}"
236
244
  else:
237
245
  assert tag is not None
238
246
  result = client.get_evaluation_testcase_by_tag(tag)
@@ -265,7 +273,7 @@ def show(
265
273
  console.print(
266
274
  Panel.fit(
267
275
  Pretty(result),
268
- title="Evaluation TestCase",
276
+ title="Run",
269
277
  subtitle=identifier,
270
278
  )
271
279
  )
@@ -311,7 +319,7 @@ def run(
311
319
  save_runs: bool = typer.Option(
312
320
  False,
313
321
  "--save-runs",
314
- help="Save each evaluation run result to a JSON file",
322
+ help="Save each run result to a JSON file",
315
323
  ),
316
324
  output_dir: Path = typer.Option(
317
325
  Path("./eval_runs"),
@@ -320,7 +328,7 @@ def run(
320
328
  ),
321
329
  ) -> None:
322
330
  """
323
- Find and execute all functions decorated with @eval_run.
331
+ Execute evaluations (functions decorated with @eval_run).
324
332
  """
325
333
  files = directory.rglob("*.py") if recursive else directory.glob("*.py")
326
334
  TARGET_DECORATOR = eval_run.__name__
@@ -400,124 +408,141 @@ def run(
400
408
  typer.echo(f" ❌ Execution failed: {e}")
401
409
 
402
410
  if not executed_any and not dry_run:
403
- typer.echo("\nNo @eval_run functions executed.")
411
+ typer.echo("\nNo evaluations executed.")
412
+
413
+
414
+ def fetch_run(
415
+ client: DeepEvalClient,
416
+ *,
417
+ run_id: Optional[int],
418
+ tag: Optional[str],
419
+ ) -> Tuple[dict, str]:
420
+ """
421
+ Fetch a single run by id or tag.
422
+ Returns (payload, identifier)
423
+ """
424
+ if run_id is not None:
425
+ result = client.get_evaluation_testcase_by_id(run_id)
426
+ identifier = f"run_id={run_id}"
427
+ else:
428
+ assert tag is not None
429
+ result = client.get_evaluation_testcase_by_tag(tag)
430
+ identifier = f"tag={tag}"
431
+
432
+ if not result:
433
+ raise RuntimeError(f"No data returned for {identifier}")
434
+
435
+ return result, identifier
404
436
 
405
437
 
406
438
  @app.command()
407
439
  def compare(
408
440
  tag: List[str] = typer.Option(
409
441
  [],
442
+ "-t",
410
443
  "--tag",
411
- help="Label identifying a reference testcase",
444
+ help="Run tag",
412
445
  ),
413
- run: List[int] = typer.Option(
446
+ run_id: List[int] = typer.Option(
414
447
  [],
415
- "--run",
416
- help="Run ID identifying an evaluation testcase",
417
- ),
418
- pretty: bool = typer.Option(
419
- True,
420
- "--pretty/--raw",
421
- help="Pretty diff output (default) or raw JSON",
448
+ "-i",
449
+ "--id",
450
+ help="Run ID",
422
451
  ),
423
452
  summary: bool = typer.Option(
424
453
  False,
425
454
  "--summary",
426
- help="Show summary only (no per-metric diff)",
455
+ help="Show summary diff only",
427
456
  ),
428
- raise_exception: bool = typer.Option(
457
+ side_by_side: bool = typer.Option(
429
458
  False,
430
- "--raise",
431
- help="Raise HTTP exceptions instead of swallowing them",
432
- ),
433
- out: Optional[Path] = typer.Option(
434
- None,
435
- "-o",
436
- "--out",
437
- help="Optional file path to save the result as JSON",
438
- ),
439
- overwrite: bool = typer.Option(
440
- False,
441
- "--overwrite",
442
- help="Overwrite output file if it already exists",
459
+ "--side-by-side",
460
+ help="Show side-by-side diff (git)",
443
461
  ),
444
462
  ) -> None:
445
463
  """
446
- Compare two evaluation testcases (runs and/or labels).
464
+ Compare two evaluation runs.
447
465
 
448
- Output:
449
- - Unified diff-style view by default
450
- - Raw JSON with --raw
451
- - Summary of improvements / regressions
466
+ Default: unified git diff
452
467
  """
453
468
 
454
- targets: List[Tuple[str, Union[str, int]]] = []
469
+ if summary and side_by_side:
470
+ typer.secho(
471
+ "❌ --summary and --side-by-side cannot be used together",
472
+ fg="red",
473
+ bold=True,
474
+ )
475
+ raise typer.Exit(code=1)
455
476
 
456
- for r in run:
457
- targets.append(("run", r))
477
+ targets: List[Tuple[Optional[int], Optional[str]]] = []
458
478
 
479
+ for r in run_id:
480
+ targets.append(("run", r))
459
481
  for t in tag:
460
- targets.append(("label", t))
482
+ targets.append(("tag", t))
461
483
 
462
484
  if len(targets) != 2:
463
485
  typer.secho(
464
- "❌ Provide exactly two targets using --run and/or --tag",
486
+ "❌ Provide exactly two targets using --id and/or --tag",
465
487
  fg="red",
466
488
  bold=True,
467
489
  )
468
490
  raise typer.Exit(code=1)
469
491
 
470
- (type_a, value_a), (type_b, value_b) = targets
471
-
472
- typer.secho(
473
- f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
474
- bold=True,
475
- )
476
-
477
492
  client = DeepEvalClient()
493
+ # Summary mode (reduced payload)
494
+ (type_a, value_a), (type_b, value_b) = targets
495
+ if summary:
496
+ kwargs: Dict[str, Any] = {"raise_exception": True}
497
+ if type_a == "run":
498
+ kwargs["testcase_a_id"] = value_a
499
+ else:
500
+ kwargs["testcase_a_tag"] = value_a
501
+
502
+ if type_b == "run":
503
+ kwargs["testcase_b_id"] = value_b
504
+ else:
505
+ kwargs["testcase_b_tag"] = value_b
506
+ try:
507
+ resp = client.compare_testcases(**kwargs)
508
+ except Exception as e:
509
+ typer.secho(f"❌ Request failed: {e}", fg="red")
510
+ raise typer.Exit(code=1)
478
511
 
479
- kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
480
-
481
- if type_a == "run":
482
- kwargs["testcase_a_id"] = value_a
483
- else:
484
- kwargs["testcase_a_tag"] = value_a
485
-
486
- if type_b == "run":
487
- kwargs["testcase_b_id"] = value_b
488
- else:
489
- kwargs["testcase_b_tag"] = value_b
512
+ if not resp:
513
+ typer.secho("⚠️ No response received", fg="yellow")
514
+ raise typer.Exit(code=1)
515
+ comparison = TestCaseComparison(**resp)
516
+ pretty_print_comparison(
517
+ comparison,
518
+ summary_only=summary,
519
+ )
520
+ return
490
521
 
491
522
  try:
492
- resp = client.compare_testcases(**kwargs)
523
+ run_a, id_a = fetch_run(
524
+ client,
525
+ run_id=value_a if type_a == "run" else None,
526
+ tag=value_a if type_a == "tag" else None,
527
+ )
528
+ run_b, id_b = fetch_run(
529
+ client,
530
+ run_id=value_b if type_b == "run" else None,
531
+ tag=value_b if type_b == "tag" else None,
532
+ )
493
533
  except Exception as e:
494
- typer.secho(f"❌ Request failed: {e}", fg="red")
495
- raise typer.Exit(code=1)
496
-
497
- if not resp:
498
- typer.secho("⚠️ No response received", fg="yellow")
534
+ typer.secho(f"❌ Fetch failed: {e}", fg="red")
499
535
  raise typer.Exit(code=1)
500
536
 
501
- if out:
502
- if out.exists() and not overwrite:
503
- typer.secho(
504
- f"❌ File already exists: {out} (use --overwrite)",
505
- fg="red",
506
- )
507
- raise typer.Exit(code=1)
508
-
509
- out.write_text(json.dumps(resp, indent=2))
510
- typer.secho(f"💾 Saved raw output to {out}", fg="green")
537
+ a_text = serialize_for_diff(run_a)
538
+ b_text = serialize_for_diff(run_b)
511
539
 
512
- if not pretty:
513
- typer.echo(json.dumps(resp, indent=2))
514
- return
515
-
516
- comparison = TestCaseComparison(**resp)
517
-
518
- pretty_print_comparison(
519
- comparison,
520
- summary_only=summary,
540
+ git_diff(
541
+ a_text,
542
+ b_text,
543
+ label_a=f"{id_a}.full.json",
544
+ label_b=f"{id_b}.full.json",
545
+ side_by_side=side_by_side,
521
546
  )
522
547
 
523
548
 
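Note on the hunk above: the reworked compare command now fetches both runs itself through the new fetch_run helper and diffs their serialized payloads, calling the comparison API only for --summary. Below is a minimal sketch of the fetch step, assuming a reachable backend and the same environment-based credentials the CLI loads via load_dotenv(); the run ID 42 and the tag "baseline" are hypothetical.

from rakam_eval_sdk.cli import fetch_run
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# Each call returns (payload, identifier) and raises RuntimeError
# if the backend returns nothing for the requested run.
run_a, id_a = fetch_run(client, run_id=42, tag=None)          # by ID  -> identifier "run_id=42"
run_b, id_b = fetch_run(client, run_id=None, tag="baseline")  # by tag -> identifier "tag=baseline"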
@@ -554,7 +579,7 @@ def compare_label_latest(
554
579
  ),
555
580
  ) -> None:
556
581
  """
557
- Compare the latest evaluation runs for two labels.
582
+ Compare the latest runs for two labels.
558
583
  """
559
584
  client = DeepEvalClient()
560
585
 
@@ -632,11 +657,13 @@ def compare_last(
632
657
  def tag_command(
633
658
  run_id: Optional[int] = typer.Option(
634
659
  None,
635
- "--run",
636
- help="Evaluation run ID",
660
+ "-i",
661
+ "--id",
662
+ help="Run ID",
637
663
  ),
638
664
  tag: Optional[str] = typer.Option(
639
665
  None,
666
+ "-t",
640
667
  "--tag",
641
668
  help="Tag to assign to the run",
642
669
  ),
@@ -653,11 +680,11 @@ def tag_command(
653
680
  # --- validation ---
654
681
  if delete:
655
682
  if run_id or tag:
656
- typer.echo("❌ --delete cannot be used with --run or --tag")
683
+ typer.echo("❌ --delete cannot be used with --id or --tag")
657
684
  raise typer.Exit(code=1)
658
685
  else:
659
686
  if not run_id or not tag:
660
- typer.echo("❌ Use --run and --tag together, or --delete")
687
+ typer.echo("❌ Use --id and --tag together, or --delete")
661
688
  raise typer.Exit(code=1)
662
689
 
663
690
  client = DeepEvalClient()
@@ -687,6 +714,11 @@ def tag_command(
687
714
  typer.echo(f"Tag: {result.get('tag')}")
688
715
 
689
716
 
717
+ # Register sub-apps in user journey order (after regular commands)
718
+ app.add_typer(list_app, name="list")
719
+ app.add_typer(metrics_app, name="metrics")
720
+
721
+
690
722
  def main() -> None:
691
723
  app()
692
724
 
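This release also renames several CLI flags (--run becomes --id, with -i, -t and -l short forms on show, compare, tag and list runs). A minimal sketch of exercising the new surface in-process with Typer's test runner; the ID and tag values are hypothetical, and the commands still need the backend environment the CLI normally uses.

from typer.testing import CliRunner

from rakam_eval_sdk.cli import app

runner = CliRunner()

# List the five most recent runs, show one by ID, then compare it to a tagged run.
print(runner.invoke(app, ["list", "runs", "--limit", "5"]).output)
print(runner.invoke(app, ["show", "--id", "42"]).output)
print(runner.invoke(app, ["compare", "--id", "42", "--tag", "baseline", "--summary"]).output)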
@@ -0,0 +1,58 @@
1
+ from pathlib import Path
2
+ from types import SimpleNamespace
3
+
4
+ import pytest
5
+ import typer
6
+
7
+ from rakam_eval_sdk.cli import metrics
8
+
9
+
10
+ def test_metrics_no_metrics_found(
11
+ tmp_path: Path, monkeypatch: pytest.MonkeyPatch
12
+ ) -> None:
13
+ file = tmp_path / "eval.py"
14
+ file.write_text("def foo(): pass")
15
+
16
+ monkeypatch.setattr(
17
+ "rakam_eval_sdk.cli.find_decorated_functions",
18
+ lambda *_: [],
19
+ )
20
+
21
+ with pytest.raises(typer.Exit) as exc:
22
+ metrics(directory=tmp_path, recursive=False)
23
+
24
+ assert exc.value.exit_code == 0
25
+
26
+
27
+ def test_metrics_finds_metrics(
28
+ tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
29
+ ) -> None:
30
+ file = tmp_path / "eval.py"
31
+ file.write_text("")
32
+
33
+ monkeypatch.setattr(
34
+ "rakam_eval_sdk.cli.find_decorated_functions",
35
+ lambda *_: ["run_eval"],
36
+ )
37
+
38
+ fake_module = SimpleNamespace(
39
+ run_eval=lambda: SimpleNamespace(
40
+ metrics=[
41
+ SimpleNamespace(type="accuracy", name=None),
42
+ SimpleNamespace(type="f1", name="f1_v2"),
43
+ ]
44
+ )
45
+ )
46
+
47
+ monkeypatch.setattr(
48
+ "rakam_eval_sdk.cli.load_module_from_path",
49
+ lambda _: fake_module,
50
+ )
51
+
52
+ metrics(directory=tmp_path, recursive=False)
53
+
54
+ out = capsys.readouterr().out
55
+
56
+ assert "accuracy" in out
57
+ assert "f1" in out
58
+ assert "unique metrics found" in out
rakam_eval_sdk/tests/test_decorator.py CHANGED
@@ -1,9 +1,12 @@
1
1
  import os
2
+ from pathlib import Path
3
+ from typing import List
2
4
 
3
5
  import pytest
4
6
 
5
7
  from rakam_eval_sdk.client import DeepEvalClient
6
8
  from rakam_eval_sdk.decorators import eval_run
9
+ from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions
7
10
 
8
11
 
9
12
  @pytest.fixture
@@ -80,3 +83,30 @@ def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
80
83
 
81
84
  assert result == 12
82
85
  assert "[eval_run]" in capsys.readouterr().out
86
+
87
+
88
+ def test_find_decorated_functions(tmp_path: Path) -> None:
89
+ code = """
90
+ from rakam_eval_sdk.decorators import eval_run
91
+
92
+ @eval_run
93
+ def foo():
94
+ pass
95
+
96
+ @eval_run()
97
+ def bar():
98
+ pass
99
+
100
+ async def baz():
101
+ pass
102
+
103
+ @other
104
+ def nope():
105
+ pass
106
+ """
107
+ file = tmp_path / "test_mod.py"
108
+ file.write_text(code)
109
+
110
+ result: List[str] = find_decorated_functions(file, "eval_run")
111
+
112
+ assert set(result) == {"foo", "bar"}
@@ -0,0 +1,51 @@
1
+ from types import SimpleNamespace
2
+
3
+ from rakam_eval_sdk.cli import extract_metric_names
4
+
5
+
6
+ def test_extract_metric_names_no_metrics_attr() -> None:
7
+ config = SimpleNamespace()
8
+
9
+ result = extract_metric_names(config)
10
+
11
+ assert result == []
12
+
13
+
14
+ def test_extract_metric_names_empty_metrics() -> None:
15
+ config = SimpleNamespace(metrics=[])
16
+
17
+ result = extract_metric_names(config)
18
+
19
+ assert result == []
20
+
21
+
22
+ def test_extract_metric_names_with_types_only() -> None:
23
+ config = SimpleNamespace(
24
+ metrics=[
25
+ SimpleNamespace(type="accuracy"),
26
+ SimpleNamespace(type="latency"),
27
+ ]
28
+ )
29
+
30
+ result = extract_metric_names(config)
31
+
32
+ assert result == [
33
+ ("accuracy", None),
34
+ ("latency", None),
35
+ ]
36
+
37
+
38
+ def test_extract_metric_names_with_type_and_name() -> None:
39
+ config = SimpleNamespace(
40
+ metrics=[
41
+ SimpleNamespace(type="accuracy", name="acc_v1"),
42
+ SimpleNamespace(type="f1", name=None),
43
+ ]
44
+ )
45
+
46
+ result = extract_metric_names(config)
47
+
48
+ assert result == [
49
+ ("accuracy", "acc_v1"),
50
+ ("f1", None),
51
+ ]
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+ from types import ModuleType
3
+
4
+ import pytest
5
+
6
+ from rakam_eval_sdk.utils.decorator_utils import load_module_from_path
7
+
8
+
9
+ def test_load_module_from_path_success(tmp_path: Path) -> None:
10
+ file = tmp_path / "mod.py"
11
+ file.write_text(
12
+ """
13
+ x = 42
14
+ def hello():
15
+ return "world"
16
+ """
17
+ )
18
+
19
+ module: ModuleType = load_module_from_path(file)
20
+
21
+ assert module.x == 42
22
+ assert module.hello() == "world"
23
+
24
+
25
+ def test_load_module_from_path_invalid(tmp_path: Path) -> None:
26
+ file: Path = tmp_path / "broken.py"
27
+ file.write_text("def foo(") # invalid syntax
28
+
29
+ with pytest.raises(SyntaxError):
30
+ load_module_from_path(file)
rakam_eval_sdk/tests/test_print.py CHANGED
@@ -1,7 +1,8 @@
1
+ from typing import Optional, List, Dict
1
2
  import json
2
3
  from pathlib import Path
3
4
  from types import SimpleNamespace
4
- from typing import Any, Dict, List, Optional
5
+ from typing import Any
5
6
 
6
7
  import pytest
7
8
  import typer
@@ -34,6 +35,7 @@ def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str])
34
35
  # metric_direction
35
36
  # -------------------------
36
37
 
38
+
37
39
  @pytest.mark.parametrize(
38
40
  "delta,expected",
39
41
  [
@@ -89,47 +91,54 @@ def test_print_and_save_overwrites(tmp_path: Path) -> None:
89
91
  assert json.loads(out.read_text()) == {"x": 2}
90
92
 
91
93
 
92
- # -------------------------
93
- # summarize
94
- # -------------------------
95
-
96
94
  def make_metric(
97
95
  *,
96
+ metric: str,
98
97
  status: str = "changed",
99
98
  delta: Optional[float] = None,
100
99
  ) -> SimpleNamespace:
101
- return SimpleNamespace(status=status, delta=delta)
100
+ return SimpleNamespace(
101
+ metric=metric,
102
+ status=status,
103
+ delta=delta,
104
+ )
102
105
 
103
106
 
104
- def test_summarize_counts_all_states() -> None:
107
+ def test_summarize_groups_all_states() -> None:
105
108
  metrics: List[SimpleNamespace] = [
106
- make_metric(status="added"),
107
- make_metric(status="removed"),
108
- make_metric(delta=1.0),
109
- make_metric(delta=-1.0),
110
- make_metric(delta=0.0),
111
- make_metric(delta=None),
109
+ make_metric(metric="added_metric", status="added"),
110
+ make_metric(metric="removed_metric", status="removed"),
111
+ make_metric(metric="improved_metric", delta=1.0),
112
+ make_metric(metric="regressed_metric", delta=-1.0),
113
+ make_metric(metric="unchanged_zero", delta=0.0),
114
+ make_metric(metric="unchanged_none", delta=None),
112
115
  ]
113
116
 
114
- summary: Dict[str, int] = summarize(metrics)
117
+ summary: Dict[str, List[str]] = summarize(metrics=metrics)
115
118
 
116
119
  assert summary == {
117
- "improved": 1,
118
- "regressed": 1,
119
- "unchanged": 2,
120
- "added": 1,
121
- "removed": 1,
120
+ "improved": ["improved_metric"],
121
+ "regressed": ["regressed_metric"],
122
+ "unchanged": ["unchanged_zero", "unchanged_none"],
123
+ "added": ["added_metric"],
124
+ "removed": ["removed_metric"],
122
125
  }
123
126
 
124
127
 
125
- # -------------------------
126
- # pretty_print_comparison
127
- # -------------------------
128
+ def test_summarize_empty_metrics() -> None:
129
+ assert summarize(metrics=[]) == {
130
+ "improved": [],
131
+ "regressed": [],
132
+ "unchanged": [],
133
+ "added": [],
134
+ "removed": [],
135
+ }
136
+
128
137
 
129
138
  def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
130
139
  metrics: List[SimpleNamespace] = [
131
- make_metric(status="added"),
132
- make_metric(delta=1.0),
140
+ make_metric(metric="added_metric", status="added"),
141
+ make_metric(metric="improved_metric", delta=1.0),
133
142
  ]
134
143
  resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
135
144
 
@@ -139,6 +148,7 @@ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -
139
148
  called["metrics"] = arg
140
149
 
141
150
  import rakam_eval_sdk.utils.print as mod
151
+
142
152
  monkeypatch.setattr(mod, "print_summary", fake_print_summary)
143
153
 
144
154
  pretty_print_comparison(resp, summary_only=True)
@@ -148,8 +158,8 @@ def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -
148
158
 
149
159
  def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
150
160
  metrics: List[SimpleNamespace] = [
151
- make_metric(status="added"),
152
- make_metric(delta=1.0),
161
+ make_metric(metric="added_metric", status="added"),
162
+ make_metric(metric="improved_metric", delta=1.0),
153
163
  ]
154
164
  resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
155
165
 
@@ -159,6 +169,7 @@ def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
159
169
  calls.append(metric)
160
170
 
161
171
  import rakam_eval_sdk.utils.print as mod
172
+
162
173
  monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
163
174
 
164
175
  pretty_print_comparison(resp, summary_only=False)
rakam_eval_sdk/utils/print.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import json
2
2
  from pathlib import Path
3
3
  from pprint import pprint
4
- from typing import Any, Dict, Optional
4
+ from typing import Any, Dict, List, Optional
5
5
 
6
6
  import typer
7
7
  from typer import secho
@@ -25,8 +25,7 @@ def _print_and_save(
25
25
  return
26
26
 
27
27
  if out.exists() and not overwrite:
28
- typer.echo(
29
- f"❌ File already exists: {out} (use --overwrite to replace)")
28
+ typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
30
29
  raise typer.Exit(code=1)
31
30
 
32
31
  out.parent.mkdir(parents=True, exist_ok=True)
@@ -95,34 +94,68 @@ def print_metric_diff(diff: MetricDiff) -> None:
95
94
  secho(f" success: {diff.success_a}", dim=True)
96
95
 
97
96
 
98
- def summarize(metrics: Any) -> Dict[str, int]:
99
- summary = {
100
- "improved": 0,
101
- "regressed": 0,
102
- "unchanged": 0,
103
- "added": 0,
104
- "removed": 0,
97
+ def summarize(metrics: List[MetricDiff]) -> Dict[str, List[str]]:
98
+ """
99
+ Returns metric names grouped by category.
100
+ """
101
+ summary: Dict[str, List[str]] = {
102
+ "improved": [],
103
+ "regressed": [],
104
+ "unchanged": [],
105
+ "added": [],
106
+ "removed": [],
105
107
  }
106
108
 
107
109
  for m in metrics:
108
- if m.status in ("added", "removed"):
109
- summary[m.status] += 1
110
- else:
111
- direction = metric_direction(m.delta)
112
- summary[direction] += 1
110
+ if m.status == "added":
111
+ summary["added"].append(m.metric)
112
+ continue
113
+
114
+ if m.status == "removed":
115
+ summary["removed"].append(m.metric)
116
+ continue
117
+
118
+ direction = metric_direction(m.delta)
119
+ summary[direction].append(m.metric)
113
120
 
114
121
  return summary
115
122
 
116
123
 
117
- def print_summary(metrics: Any) -> None:
124
+ def _fmt(metrics: List[str]) -> str:
125
+ return ", ".join(metrics) if metrics else "-"
126
+
127
+
128
+ def print_summary(metrics: List[MetricDiff]) -> None:
118
129
  summary = summarize(metrics)
119
130
 
131
+ rows = [
132
+ ("↑ Improved", "improved", "green"),
133
+ ("↓ Regressed", "regressed", "red"),
134
+ ("± Unchanged", "unchanged", None),
135
+ ("+ Added.", "added", "green"),
136
+ ("- Removed.", "removed", "red"),
137
+ ]
138
+
120
139
  secho("\nSummary:", bold=True)
121
- secho(f" ↑ Improved: {summary['improved']}", fg="green")
122
- secho(f" Regressed: {summary['regressed']}", fg="red")
123
- secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
124
- secho(f" + Added: {summary['added']}", fg="green")
125
- secho(f" - Removed: {summary['removed']}", fg="red")
140
+ secho(
141
+ " | Status | # | Metrics |",
142
+ dim=True,
143
+ )
144
+ secho(
145
+ " |--------------|---|------------------------|",
146
+ dim=True,
147
+ )
148
+
149
+ for label, key, color in rows:
150
+ count = len(summary[key])
151
+ metrics_str = _fmt(summary[key])
152
+
153
+ line = f" | {label:<12} | {count:<1} | {metrics_str:<22} |"
154
+
155
+ if color:
156
+ secho(line, fg=color)
157
+ else:
158
+ secho(line, dim=True)
126
159
 
127
160
 
128
161
  def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
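As the hunk above shows, summarize() now returns metric names grouped by category rather than bare counts, which is what the new tabular print_summary() renders. A minimal sketch of the new return shape, using SimpleNamespace stand-ins for MetricDiff the same way the package's own tests do; the metric names are illustrative.

from types import SimpleNamespace

from rakam_eval_sdk.utils.print import summarize

metrics = [
    SimpleNamespace(metric="accuracy", status="changed", delta=1.0),
    SimpleNamespace(metric="latency", status="changed", delta=-1.0),
    SimpleNamespace(metric="f1", status="added", delta=None),
]

summary = summarize(metrics=metrics)
# {"improved": ["accuracy"], "regressed": ["latency"], "unchanged": [],
#  "added": ["f1"], "removed": []}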
@@ -132,3 +165,105 @@ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
132
165
  return
133
166
 
134
167
  print_summary(resp.metrics)
168
+
169
+
170
+ def serialize_for_diff(obj: dict) -> str:
171
+ """
172
+ Stable, git-friendly JSON representation
173
+ """
174
+ return (
175
+ json.dumps(
176
+ obj,
177
+ indent=4,
178
+ sort_keys=True,
179
+ ensure_ascii=False,
180
+ )
181
+ + "\n"
182
+ )
183
+
184
+
185
+ def git_diff(
186
+ a_text: str,
187
+ b_text: str,
188
+ *,
189
+ label_a: str,
190
+ label_b: str,
191
+ side_by_side: bool = False,
192
+ ) -> None:
193
+ """
194
+ Show diff between two text blobs.
195
+
196
+ - side_by_side: uses git difftool with vimdiff (interactive)
197
+ - fallback to git diff -U3 or difflib if git is not available
198
+ """
199
+ import shutil
200
+ import subprocess
201
+ from pathlib import Path
202
+ from tempfile import TemporaryDirectory
203
+
204
+ import typer
205
+
206
+ git = shutil.which("git")
207
+ vimdiff = shutil.which("vimdiff")
208
+
209
+ with TemporaryDirectory() as tmp:
210
+ a = Path(tmp) / label_a
211
+ b = Path(tmp) / label_b
212
+
213
+ a.write_text(a_text)
214
+ b.write_text(b_text)
215
+
216
+ # --- Side-by-side with vimdiff ---
217
+ if side_by_side:
218
+ if not git:
219
+ typer.secho(
220
+ "❌ Git is required for side-by-side diffs", fg="red", bold=True
221
+ )
222
+ return
223
+ if not vimdiff:
224
+ typer.secho(
225
+ "❌ Vimdiff is not installed. Please install vim or vimdiff to use side-by-side mode.",
226
+ fg="red",
227
+ bold=True,
228
+ )
229
+ return
230
+
231
+ cmd = [
232
+ git,
233
+ "difftool",
234
+ "--no-index",
235
+ "--tool=vimdiff",
236
+ "--no-prompt", # skip Y/n prompt
237
+ str(a),
238
+ str(b),
239
+ ]
240
+
241
+ subprocess.run(cmd, check=False)
242
+ return
243
+
244
+ # (default)
245
+ if git:
246
+ cmd = [
247
+ git,
248
+ "diff",
249
+ "--no-index",
250
+ "--color=always",
251
+ "-U3",
252
+ str(a),
253
+ str(b),
254
+ ]
255
+ subprocess.run(cmd, check=False)
256
+ return
257
+
258
+ # Fallback
259
+ import difflib
260
+
261
+ diff = difflib.unified_diff(
262
+ a_text.splitlines(),
263
+ b_text.splitlines(),
264
+ fromfile=label_a,
265
+ tofile=label_b,
266
+ lineterm="",
267
+ )
268
+ for line in diff:
269
+ typer.echo(line)
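The new serialize_for_diff() and git_diff() helpers added above are generic: the first produces a stable, key-sorted JSON rendering, the second writes both blobs to temporary files and shells out to git diff --no-index when git is on PATH (or git difftool --tool=vimdiff for --side-by-side), falling back to difflib otherwise. A minimal sketch with two hypothetical payloads.

from rakam_eval_sdk.utils.print import git_diff, serialize_for_diff

before = {"metrics": {"accuracy": 0.81, "latency": 120}}  # hypothetical payloads
after = {"metrics": {"accuracy": 0.84, "latency": 110}}

git_diff(
    serialize_for_diff(before),
    serialize_for_diff(after),
    label_a="before.json",
    label_b="after.json",
)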
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.2.4rc7
3
+ Version: 0.2.4rc8
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
@@ -0,0 +1,17 @@
1
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ rakam_eval_sdk/cli.py,sha256=mxLb2tJHdoSW33_4N6AoxZrMZVHoA2nDtLe8LBuUu-c,19068
3
+ rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
4
+ rakam_eval_sdk/decorators.py,sha256=IOe4r4P8JoWM9thPtb3bJ1EXvGxkjydHbKS0W1fiYSk,1908
5
+ rakam_eval_sdk/schema.py,sha256=ouxiV1ydQXiVn5DjIAYDT4yqsXA0h95bq6c-oSkqxL8,3912
6
+ rakam_eval_sdk/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ rakam_eval_sdk/tests/test_cli.py,sha256=SkysYGjZyZuWkurgu9VmZQxpuDSRvssraYG3zqpnfBo,1366
8
+ rakam_eval_sdk/tests/test_decorator.py,sha256=TMkzWLv-WFhvrEhAmcmF1S-LJ_7ufIawNFVCIEcLQvw,2498
9
+ rakam_eval_sdk/tests/test_metrics.py,sha256=QC7yHdvB-GXIcTVOsHY8ednGBI18OI6FxWvGLUW_VHY,1118
10
+ rakam_eval_sdk/tests/test_modules_loader.py,sha256=s3jUVCUP1d-U4siDElVGjkjYVT6wL2kNbDKI3T_bca0,668
11
+ rakam_eval_sdk/tests/test_print.py,sha256=dBYYUnpv-PLU6mEPTITjCyhijOkKmbKozH6f0xU6JeI,4655
12
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
13
+ rakam_eval_sdk/utils/print.py,sha256=8MbGryh6TxgaQENWcjnfDc1bbRA0FFvnNcu8jp-KEuQ,6857
14
+ rakam_eval_sdk-0.2.4rc8.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
15
+ rakam_eval_sdk-0.2.4rc8.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
16
+ rakam_eval_sdk-0.2.4rc8.dist-info/METADATA,sha256=Icpm85_loND_wTYbO1BajqCo9wacFP1GQZf8YpyqiSM,6022
17
+ rakam_eval_sdk-0.2.4rc8.dist-info/RECORD,,
@@ -1,14 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=ZFsEXVBlQNThu8dK_JVGpirmY5ppuzE8lubw-XqE1_k,18385
3
- rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
4
- rakam_eval_sdk/decorators.py,sha256=IOe4r4P8JoWM9thPtb3bJ1EXvGxkjydHbKS0W1fiYSk,1908
5
- rakam_eval_sdk/schema.py,sha256=ouxiV1ydQXiVn5DjIAYDT4yqsXA0h95bq6c-oSkqxL8,3912
6
- rakam_eval_sdk/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- rakam_eval_sdk/tests/test_decorator.py,sha256=ppqdhHamyC-rs_RsEjCegsnkV4kLhX-qpG5UicA6D6w,1956
8
- rakam_eval_sdk/tests/test_print.py,sha256=uraR7qrZxnB3BCCUB6l9k1L62-VU7f0hJeArL3ellrk,4148
9
- rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
10
- rakam_eval_sdk/utils/print.py,sha256=PaeJrgPtLvVBr7uCXA-Q9UNJqrMyky_UMCd7JGFCQ2w,3743
11
- rakam_eval_sdk-0.2.4rc7.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
12
- rakam_eval_sdk-0.2.4rc7.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
13
- rakam_eval_sdk-0.2.4rc7.dist-info/METADATA,sha256=JkNU5IxCvwkTYOQA8f2yZcMpYYEl0QHOOybylw6--L0,6022
14
- rakam_eval_sdk-0.2.4rc7.dist-info/RECORD,,