rakam-eval-sdk 0.2.4rc5__py3-none-any.whl → 0.2.4rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +4 -131
- rakam_eval_sdk/decorators.py +1 -2
- rakam_eval_sdk/schema.py +1 -2
- rakam_eval_sdk/tests/__init__.py +0 -0
- rakam_eval_sdk/tests/test_decorator.py +82 -0
- rakam_eval_sdk/tests/test_print.py +166 -0
- rakam_eval_sdk/utils/print.py +134 -0
- {rakam_eval_sdk-0.2.4rc5.dist-info → rakam_eval_sdk-0.2.4rc7.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4rc7.dist-info/RECORD +14 -0
- rakam_eval_sdk-0.2.4rc5.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.4rc5.dist-info → rakam_eval_sdk-0.2.4rc7.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.4rc5.dist-info → rakam_eval_sdk-0.2.4rc7.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED

@@ -5,23 +5,22 @@ import sys
 import uuid
 from datetime import datetime
 from pathlib import Path
-from pprint import pprint
-from typing import Any, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
-from typer import secho

 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
-from rakam_eval_sdk.schema import MetricDiff, TestCaseComparison
+from rakam_eval_sdk.schema import TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
 )
+from rakam_eval_sdk.utils.print import _print_and_save, pretty_print_comparison

 load_dotenv()
 app = typer.Typer(help="CLI tools for evaluation utilities")

@@ -199,8 +198,7 @@ def list_runs(
     shown = offset + len(items)
     if shown < total:
         typer.echo()
-        typer.echo(
-            f"Showing {shown} of {total} runs. Use --limit to see more.")
+        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")


 @app.command()

@@ -405,131 +403,6 @@ def run(
     typer.echo("\nNo @eval_run functions executed.")


-def _print_and_save(
-    resp: dict,
-    pretty: bool,
-    out: Optional[Path],
-    overwrite: bool,
-) -> None:
-    if pretty:
-        typer.echo(typer.style("📊 Result:", bold=True))
-        pprint(resp)
-    else:
-        typer.echo(resp)
-
-    if out is None:
-        return
-
-    if out.exists() and not overwrite:
-        typer.echo(
-            f"❌ File already exists: {out} (use --overwrite to replace)")
-        raise typer.Exit(code=1)
-
-    out.parent.mkdir(parents=True, exist_ok=True)
-
-    with out.open("w", encoding="utf-8") as f:
-        json.dump(resp, f, indent=2, ensure_ascii=False)
-
-    typer.echo(f"💾 Result saved to {out}")
-
-
-def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
-    if a is None or b is None or a == 0:
-        return None
-    return f"{((b - a) / a) * 100:+.2f}%"
-
-
-def metric_direction(delta: Optional[float]) -> str:
-    if delta is None:
-        return "unchanged"
-    if delta > 0:
-        return "improved"
-    if delta < 0:
-        return "regressed"
-    return "unchanged"
-
-
-def print_metric_diff(diff: MetricDiff) -> None:
-    secho(f"\nMetric: {diff.metric}", bold=True)
-
-    if diff.status == "added":
-        secho(f"+ score: {diff.score_b}", fg="green")
-        secho(f"+ threshold: {diff.threshold_b}", fg="green")
-        secho(f"+ success: {diff.success_b}", fg="green")
-        return
-
-    if diff.status == "removed":
-        secho(f"- score: {diff.score_a}", fg="red")
-        secho(f"- threshold: {diff.threshold_a}", fg="red")
-        secho(f"- success: {diff.success_a}", fg="red")
-        return
-
-    # unchanged / changed
-    if diff.score_a != diff.score_b:
-        direction = metric_direction(diff.delta)
-        color = "green" if direction == "improved" else "red"
-        pct = pct_change(diff.score_a, diff.score_b)
-
-        secho(f"- score: {diff.score_a}", fg="red")
-        secho(
-            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
-            fg=color,
-        )
-    else:
-        secho(f" score: {diff.score_a}", dim=True)
-
-    if diff.threshold_a != diff.threshold_b:
-        secho(f"- threshold: {diff.threshold_a}", fg="red")
-        secho(f"+ threshold: {diff.threshold_b}", fg="green")
-    else:
-        secho(f" threshold: {diff.threshold_a}", dim=True)
-
-    if diff.success_a != diff.success_b:
-        secho(f"- success: {diff.success_a}", fg="red")
-        secho(f"+ success: {diff.success_b}", fg="green")
-    else:
-        secho(f" success: {diff.success_a}", dim=True)
-
-
-def summarize(metrics: Any) -> Dict[str, int]:
-    summary = {
-        "improved": 0,
-        "regressed": 0,
-        "unchanged": 0,
-        "added": 0,
-        "removed": 0,
-    }
-
-    for m in metrics:
-        if m.status in ("added", "removed"):
-            summary[m.status] += 1
-        else:
-            direction = metric_direction(m.delta)
-            summary[direction] += 1
-
-    return summary
-
-
-def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
-    if not summary_only:
-        for metric in resp.metrics:
-            print_metric_diff(metric)
-        return
-
-    print_summary(resp.metrics)
-
-
-def print_summary(metrics: Any) -> None:
-    summary = summarize(metrics)
-
-    secho("\nSummary:", bold=True)
-    secho(f" ↑ Improved: {summary['improved']}", fg="green")
-    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
-    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
-    secho(f" + Added: {summary['added']}", fg="green")
-    secho(f" - Removed: {summary['removed']}", fg="red")
-
-
 @app.command()
 def compare(
     tag: List[str] = typer.Option(
rakam_eval_sdk/decorators.py
CHANGED

@@ -6,9 +6,8 @@ from typing import Callable, Dict, Optional, TypeVar, Union, overload

 import psutil

-if sys.version_info < (3,
+if sys.version_info < (3, 10):
     from typing_extensions import ParamSpec
-
 else:
     from typing import ParamSpec
 P = ParamSpec("P")
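
The net change here tidies the version-gated import: typing.ParamSpec entered the standard library in Python 3.10, so older interpreters fall back to the typing_extensions backport. For reference, the decorator this module exports is used like this (a sketch mirroring tests/test_decorator.py below; illustration only):

    from rakam_eval_sdk.decorators import eval_run

    @eval_run            # bare decorator form
    def add(a: int, b: int) -> int:
        return a + b

    @eval_run()          # parenthesized form is also accepted
    def mul(a: int, b: int) -> int:
        return a * b

    add(2, 3)  # returns 5 and prints an "[eval_run]" line with time=, cpu=, and mem_delta= fields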
rakam_eval_sdk/schema.py
CHANGED

rakam_eval_sdk/tests/__init__.py
ADDED

File without changes
rakam_eval_sdk/tests/test_decorator.py
ADDED

@@ -0,0 +1,82 @@
+import os
+
+import pytest
+
+from rakam_eval_sdk.client import DeepEvalClient
+from rakam_eval_sdk.decorators import eval_run
+
+
+@pytest.fixture
+def client() -> DeepEvalClient:
+    return DeepEvalClient(base_url="http://testserver", api_token="testtoken")
+
+
+class FakeCPUTimes:
+    def __init__(self, user: float = 1.0, system: float = 1.0) -> None:
+        self.user: float = user
+        self.system: float = system
+
+
+class FakeMemInfo:
+    def __init__(self, rss: int) -> None:
+        self.rss: int = rss
+
+
+class FakeProcess:
+    def __init__(self) -> None:
+        self._cpu_calls: int = 0
+        self._mem_calls: int = 0
+
+    def cpu_times(self) -> FakeCPUTimes:
+        self._cpu_calls += 1
+        # simulate CPU usage increase
+        return FakeCPUTimes(
+            user=1.0 + self._cpu_calls,
+            system=1.0,
+        )
+
+    def memory_info(self) -> FakeMemInfo:
+        self._mem_calls += 1
+        return FakeMemInfo(rss=100_000_000 + (self._mem_calls * 10_000))
+
+
+@pytest.fixture(autouse=True)
+def patch_psutil(
+    monkeypatch: pytest.MonkeyPatch,
+) -> FakeProcess:
+    fake_process: FakeProcess = FakeProcess()
+
+    monkeypatch.setattr(
+        "rakam_eval_sdk.decorators.psutil.Process",
+        lambda pid: fake_process,
+    )
+    monkeypatch.setattr(os, "getpid", lambda: 123)
+    return fake_process
+
+
+def test_eval_run_basic(capsys: pytest.CaptureFixture[str]) -> None:
+    @eval_run
+    def add(a: int, b: int) -> int:
+        return a + b
+
+    result: int = add(2, 3)
+
+    assert result == 5
+
+    out: str = capsys.readouterr().out
+    assert "[eval_run]" in out
+    assert "add" in out
+    assert "time=" in out
+    assert "cpu=" in out
+    assert "mem_delta=" in out
+
+
+def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
+    @eval_run()
+    def mul(a: int, b: int) -> int:
+        return a * b
+
+    result: int = mul(3, 4)
+
+    assert result == 12
+    assert "[eval_run]" in capsys.readouterr().out
rakam_eval_sdk/tests/test_print.py
ADDED

@@ -0,0 +1,166 @@
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Dict, List, Optional
+
+import pytest
+import typer
+
+from rakam_eval_sdk.utils.print import (
+    _print_and_save,
+    metric_direction,
+    pct_change,
+    pretty_print_comparison,
+    summarize,
+)
+
+
+@pytest.mark.parametrize(
+    "a,b,expected",
+    [
+        (10, 20, "+100.00%"),
+        (20, 10, "-50.00%"),
+        (10, 10, "+0.00%"),
+        (None, 10, None),
+        (10, None, None),
+        (0, 10, None),
+    ],
+)
+def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str]):
+    assert pct_change(a, b) == expected
+
+
+# -------------------------
+# metric_direction
+# -------------------------
+
+@pytest.mark.parametrize(
+    "delta,expected",
+    [
+        (1.0, "improved"),
+        (-1.0, "regressed"),
+        (0.0, "unchanged"),
+        (None, "unchanged"),
+    ],
+)
+def test_metric_direction(delta: Optional[float], expected: str):
+    assert metric_direction(delta) == expected
+
+
+def test_print_and_save_no_output_path(capsys: pytest.CaptureFixture[str]) -> None:
+    resp: Dict[str, int] = {"a": 1}
+
+    _print_and_save(resp, pretty=False, out=None, overwrite=False)
+
+    captured = capsys.readouterr()
+    assert str(resp) in captured.out
+
+
+def test_print_and_save_creates_file(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    out: Path = tmp_path / "result.json"
+    resp: Dict[str, str] = {"hello": "world"}
+
+    _print_and_save(resp, pretty=False, out=out, overwrite=False)
+
+    assert out.exists()
+    assert json.loads(out.read_text()) == resp
+
+    captured = capsys.readouterr()
+    assert "Result saved" in captured.out
+
+
+def test_print_and_save_refuses_overwrite(tmp_path: Path) -> None:
+    out: Path = tmp_path / "result.json"
+    out.write_text("{}")
+
+    with pytest.raises(typer.Exit):
+        _print_and_save({"x": 1}, pretty=False, out=out, overwrite=False)
+
+
+def test_print_and_save_overwrites(tmp_path: Path) -> None:
+    out: Path = tmp_path / "result.json"
+    out.write_text("{}")
+
+    _print_and_save({"x": 2}, pretty=False, out=out, overwrite=True)
+
+    assert json.loads(out.read_text()) == {"x": 2}
+
+
+# -------------------------
+# summarize
+# -------------------------
+
+def make_metric(
+    *,
+    status: str = "changed",
+    delta: Optional[float] = None,
+) -> SimpleNamespace:
+    return SimpleNamespace(status=status, delta=delta)
+
+
+def test_summarize_counts_all_states() -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(status="added"),
+        make_metric(status="removed"),
+        make_metric(delta=1.0),
+        make_metric(delta=-1.0),
+        make_metric(delta=0.0),
+        make_metric(delta=None),
+    ]
+
+    summary: Dict[str, int] = summarize(metrics)
+
+    assert summary == {
+        "improved": 1,
+        "regressed": 1,
+        "unchanged": 2,
+        "added": 1,
+        "removed": 1,
+    }
+
+
+# -------------------------
+# pretty_print_comparison
+# -------------------------
+
+def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(status="added"),
+        make_metric(delta=1.0),
+    ]
+    resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+    called: Dict[str, Any] = {}
+
+    def fake_print_summary(arg: Any) -> None:
+        called["metrics"] = arg
+
+    import rakam_eval_sdk.utils.print as mod
+    monkeypatch.setattr(mod, "print_summary", fake_print_summary)
+
+    pretty_print_comparison(resp, summary_only=True)
+
+    assert called["metrics"] == metrics
+
+
+def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(status="added"),
+        make_metric(delta=1.0),
+    ]
+    resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+    calls: List[Any] = []
+
+    def fake_print_metric_diff(metric: Any) -> None:
+        calls.append(metric)
+
+    import rakam_eval_sdk.utils.print as mod
+    monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
+
+    pretty_print_comparison(resp, summary_only=False)
+
+    assert calls == metrics
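
Since the tests ship inside the wheel (note the new rakam_eval_sdk/tests/ entries in RECORD below), they can be run directly from an installed environment, assuming pytest is available: pytest --pyargs rakam_eval_sdk.tests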
rakam_eval_sdk/utils/print.py
ADDED

@@ -0,0 +1,134 @@
+import json
+from pathlib import Path
+from pprint import pprint
+from typing import Any, Dict, Optional
+
+import typer
+from typer import secho
+
+from rakam_eval_sdk.schema import MetricDiff
+
+
+def _print_and_save(
+    resp: dict,
+    pretty: bool,
+    out: Optional[Path],
+    overwrite: bool,
+) -> None:
+    if pretty:
+        typer.echo(typer.style("📊 Result:", bold=True))
+        pprint(resp)
+    else:
+        typer.echo(resp)
+
+    if out is None:
+        return
+
+    if out.exists() and not overwrite:
+        typer.echo(
+            f"❌ File already exists: {out} (use --overwrite to replace)")
+        raise typer.Exit(code=1)
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+
+    with out.open("w", encoding="utf-8") as f:
+        json.dump(resp, f, indent=2, ensure_ascii=False)
+
+    typer.echo(f"💾 Result saved to {out}")
+
+
+def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: Optional[float]) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff) -> None:
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: Any) -> Dict[str, int]:
+    summary = {
+        "improved": 0,
+        "regressed": 0,
+        "unchanged": 0,
+        "added": 0,
+        "removed": 0,
+    }
+
+    for m in metrics:
+        if m.status in ("added", "removed"):
+            summary[m.status] += 1
+        else:
+            direction = metric_direction(m.delta)
+            summary[direction] += 1
+
+    return summary
+
+
+def print_summary(metrics: Any) -> None:
+    summary = summarize(metrics)
+
+    secho("\nSummary:", bold=True)
+    secho(f" ↑ Improved: {summary['improved']}", fg="green")
+    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
+    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
+    secho(f" + Added: {summary['added']}", fg="green")
+    secho(f" - Removed: {summary['removed']}", fg="red")
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
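
Taken together with the tests above, typical use of this module looks like the following (a sketch modeled on tests/test_print.py; illustration only, not part of the diff):

    from types import SimpleNamespace
    from rakam_eval_sdk.utils.print import pretty_print_comparison, summarize

    metrics = [
        SimpleNamespace(status="added", delta=None),    # newly introduced metric
        SimpleNamespace(status="changed", delta=1.0),   # existing metric whose score rose
    ]
    resp = SimpleNamespace(metrics=metrics)

    assert summarize(metrics) == {
        "improved": 1, "regressed": 0, "unchanged": 0, "added": 1, "removed": 0,
    }
    pretty_print_comparison(resp, summary_only=True)  # prints only the colored Summary block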
rakam_eval_sdk-0.2.4rc7.dist-info/RECORD
ADDED

@@ -0,0 +1,14 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=ZFsEXVBlQNThu8dK_JVGpirmY5ppuzE8lubw-XqE1_k,18385
+rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
+rakam_eval_sdk/decorators.py,sha256=IOe4r4P8JoWM9thPtb3bJ1EXvGxkjydHbKS0W1fiYSk,1908
+rakam_eval_sdk/schema.py,sha256=ouxiV1ydQXiVn5DjIAYDT4yqsXA0h95bq6c-oSkqxL8,3912
+rakam_eval_sdk/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/tests/test_decorator.py,sha256=ppqdhHamyC-rs_RsEjCegsnkV4kLhX-qpG5UicA6D6w,1956
+rakam_eval_sdk/tests/test_print.py,sha256=uraR7qrZxnB3BCCUB6l9k1L62-VU7f0hJeArL3ellrk,4148
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk/utils/print.py,sha256=PaeJrgPtLvVBr7uCXA-Q9UNJqrMyky_UMCd7JGFCQ2w,3743
+rakam_eval_sdk-0.2.4rc7.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.4rc7.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.4rc7.dist-info/METADATA,sha256=JkNU5IxCvwkTYOQA8f2yZcMpYYEl0QHOOybylw6--L0,6022
+rakam_eval_sdk-0.2.4rc7.dist-info/RECORD,,
rakam_eval_sdk-0.2.4rc5.dist-info/RECORD
REMOVED

@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=-H5ggvyHllO3Epnvy3wcV_J0ktAdAU0ypk4ugXMO52Q,21936
-rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
-rakam_eval_sdk/decorators.py,sha256=GoaAqnKg-VeYf5eBI-mEgJj6Asqmdqw4QG_qqHu5x2s,1908
-rakam_eval_sdk/schema.py,sha256=X01IBtexmiJSJbdjAoqcMxCG_X1t1XfbIcFKGJ6-6q0,3921
-rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
-rakam_eval_sdk-0.2.4rc5.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.2.4rc5.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.2.4rc5.dist-info/METADATA,sha256=kxqmnzvDuSCTJXxtDdKt4hj95Zja9aulpCfAkTvpZRg,6022
-rakam_eval_sdk-0.2.4rc5.dist-info/RECORD,,

{rakam_eval_sdk-0.2.4rc5.dist-info → rakam_eval_sdk-0.2.4rc7.dist-info}/WHEEL
File without changes

{rakam_eval_sdk-0.2.4rc5.dist-info → rakam_eval_sdk-0.2.4rc7.dist-info}/entry_points.txt
File without changes