rakam-eval-sdk 0.2.4rc6__tar.gz → 0.2.4rc8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/PKG-INFO +1 -1
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/pyproject.toml +2 -1
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/cli.py +130 -223
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/__init__.py +0 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_cli.py +58 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_decorator.py +112 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_metrics.py +51 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_modules_loader.py +30 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_print.py +177 -0
- rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/utils/print.py +269 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/README.md +0 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/__init__.py +0 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/client.py +0 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/decorators.py +0 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/schema.py +0 -0
- {rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/utils/decorator_utils.py +0 -0
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "uv_build"
 
 [project]
 name = "rakam-eval-sdk"
-version = "0.2.4rc6"
+version = "0.2.4rc8"
 description = "Evaluation Framework SDK"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -25,6 +25,7 @@ dev = [
     "build>=1.2.2.post1",
     "pytest>=8.3.5",
     "twine>=6.1.0",
+    "pytest-cov>=5.0.0",
 ]
 
 [tool.isort]
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/cli.py

@@ -5,23 +5,27 @@ import sys
 import uuid
 from datetime import datetime
 from pathlib import Path
-from
-from typing import Any, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import typer
 from dotenv import load_dotenv
 from rich.console import Console
 from rich.panel import Panel
 from rich.pretty import Pretty
-from typer import secho
 
 from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
-from rakam_eval_sdk.schema import
+from rakam_eval_sdk.schema import TestCaseComparison
 from rakam_eval_sdk.utils.decorator_utils import (
     find_decorated_functions,
     load_module_from_path,
 )
+from rakam_eval_sdk.utils.print import (
+    _print_and_save,
+    git_diff,
+    pretty_print_comparison,
+    serialize_for_diff,
+)
 
 load_dotenv()
 app = typer.Typer(help="CLI tools for evaluation utilities")
@@ -31,10 +35,10 @@ console = Console()
 PROJECT_ROOT = os.path.abspath(".")
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
-list_app = typer.Typer(help="List
-app.add_typer(list_app, name="list")
+list_app = typer.Typer(help="List evaluations or runs")
 metrics_app = typer.Typer(help="Metrics utilities")
-
+
+# Sub-apps are registered at the end to control command order
 
 
 def extract_metric_names(config: Any) -> List[Tuple[str, Optional[str]]]:
@@ -138,7 +142,7 @@ def list_evals(
     ),
 ) -> None:
     """
-
+    List evaluations (functions decorated with @eval_run).
     """
     TARGET_DECORATOR = eval_run.__name__
     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
@@ -152,16 +156,16 @@ def list_evals(
         typer.echo(f"{file}:{fn}")
 
     if not found:
-        typer.echo(
+        typer.echo("No evaluations found.")
 
 
 @list_app.command("runs")
 def list_runs(
-    limit: int = typer.Option(20, help="Max number of runs"),
+    limit: int = typer.Option(20, "-l", "--limit", help="Max number of runs"),
     offset: int = typer.Option(0, help="Pagination offset"),
 ) -> None:
     """
-    List
+    List runs (newest first).
     """
     client = DeepEvalClient()
 
@@ -175,7 +179,7 @@ def list_runs(
     total = response.get("total", 0)
 
     if not items:
-        typer.echo("No
+        typer.echo("No runs found.")
         return
 
     typer.echo(f"[id] {'tag':<20}{'label':<20}created_at")
@@ -204,13 +208,15 @@ def list_runs(
 
 @app.command()
 def show(
-
+    run_id: Optional[int] = typer.Option(
         None,
+        "-i",
         "--id",
         help="Run ID",
     ),
     tag: Optional[str] = typer.Option(
         None,
+        "-t",
         "--tag",
         help="Run tag",
     ),
@@ -221,19 +227,20 @@ def show(
     ),
 ) -> None:
     """
-    Show a
+    Show a run by ID or tag.
     """
-
+
+    if not run_id and not tag:
         raise typer.BadParameter("You must provide either --id or --tag")
 
-    if
+    if run_id and tag:
         raise typer.BadParameter("Provide only one of --id or --tag")
 
     client = DeepEvalClient()
 
-    if
-        result = client.get_evaluation_testcase_by_id(
-        identifier = f"
+    if run_id:
+        result = client.get_evaluation_testcase_by_id(run_id)
+        identifier = f"run_id={run_id}"
     else:
         assert tag is not None
         result = client.get_evaluation_testcase_by_tag(tag)
@@ -266,7 +273,7 @@ def show(
     console.print(
         Panel.fit(
             Pretty(result),
-            title="
+            title="Run",
             subtitle=identifier,
         )
     )
@@ -312,7 +319,7 @@ def run(
     save_runs: bool = typer.Option(
         False,
         "--save-runs",
-        help="Save each
+        help="Save each run result to a JSON file",
     ),
     output_dir: Path = typer.Option(
         Path("./eval_runs"),
@@ -321,7 +328,7 @@ def run(
     ),
 ) -> None:
     """
-
+    Execute evaluations (functions decorated with @eval_run).
     """
     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
     TARGET_DECORATOR = eval_run.__name__
@@ -401,248 +408,141 @@ def run(
             typer.echo(f" ❌ Execution failed: {e}")
 
     if not executed_any and not dry_run:
-        typer.echo("\nNo
-
-
-def _print_and_save(
-    resp: dict,
-    pretty: bool,
-    out: Optional[Path],
-    overwrite: bool,
-) -> None:
-    if pretty:
-        typer.echo(typer.style("📊 Result:", bold=True))
-        pprint(resp)
-    else:
-        typer.echo(resp)
-
-    if out is None:
-        return
-
-    if out.exists() and not overwrite:
-        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
-        raise typer.Exit(code=1)
+        typer.echo("\nNo evaluations executed.")
 
-    out.parent.mkdir(parents=True, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if delta is None:
-        return "unchanged"
-    if delta > 0:
-        return "improved"
-    if delta < 0:
-        return "regressed"
-    return "unchanged"
-
-
-def print_metric_diff(diff: MetricDiff) -> None:
-    secho(f"\nMetric: {diff.metric}", bold=True)
-
-    if diff.status == "added":
-        secho(f"+ score: {diff.score_b}", fg="green")
-        secho(f"+ threshold: {diff.threshold_b}", fg="green")
-        secho(f"+ success: {diff.success_b}", fg="green")
-        return
-
-    if diff.status == "removed":
-        secho(f"- score: {diff.score_a}", fg="red")
-        secho(f"- threshold: {diff.threshold_a}", fg="red")
-        secho(f"- success: {diff.success_a}", fg="red")
-        return
-
-    # unchanged / changed
-    if diff.score_a != diff.score_b:
-        direction = metric_direction(diff.delta)
-        color = "green" if direction == "improved" else "red"
-        pct = pct_change(diff.score_a, diff.score_b)
-
-        secho(f"- score: {diff.score_a}", fg="red")
-        secho(
-            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
-            fg=color,
-        )
-    else:
-        secho(f" score: {diff.score_a}", dim=True)
-
-    if diff.threshold_a != diff.threshold_b:
-        secho(f"- threshold: {diff.threshold_a}", fg="red")
-        secho(f"+ threshold: {diff.threshold_b}", fg="green")
-    else:
-        secho(f" threshold: {diff.threshold_a}", dim=True)
-
-    if diff.success_a != diff.success_b:
-        secho(f"- success: {diff.success_a}", fg="red")
-        secho(f"+ success: {diff.success_b}", fg="green")
+def fetch_run(
+    client: DeepEvalClient,
+    *,
+    run_id: Optional[int],
+    tag: Optional[str],
+) -> Tuple[dict, str]:
+    """
+    Fetch a single run by id or tag.
+    Returns (payload, identifier)
+    """
+    if run_id is not None:
+        result = client.get_evaluation_testcase_by_id(run_id)
+        identifier = f"run_id={run_id}"
     else:
-
-
-
-def summarize(metrics: Any) -> Dict[str, int]:
-    summary = {
-        "improved": 0,
-        "regressed": 0,
-        "unchanged": 0,
-        "added": 0,
-        "removed": 0,
-    }
-
-    for m in metrics:
-        if m.status in ("added", "removed"):
-            summary[m.status] += 1
-        else:
-            direction = metric_direction(m.delta)
-            summary[direction] += 1
-
-    return summary
-
-
-def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
-    if not summary_only:
-        for metric in resp.metrics:
-            print_metric_diff(metric)
-        return
-
-    print_summary(resp.metrics)
-
+        assert tag is not None
+        result = client.get_evaluation_testcase_by_tag(tag)
+        identifier = f"tag={tag}"
 
-
-
+    if not result:
+        raise RuntimeError(f"No data returned for {identifier}")
 
-
-    secho(f" ↑ Improved: {summary['improved']}", fg="green")
-    secho(f" ↓ Regressed: {summary['regressed']}", fg="red")
-    secho(f" ± Unchanged: {summary['unchanged']}", dim=True)
-    secho(f" + Added: {summary['added']}", fg="green")
-    secho(f" - Removed: {summary['removed']}", fg="red")
+    return result, identifier
 
 
 @app.command()
 def compare(
     tag: List[str] = typer.Option(
         [],
+        "-t",
         "--tag",
-        help="
+        help="Run tag",
     ),
-
+    run_id: List[int] = typer.Option(
         [],
-        "
-
-
-    pretty: bool = typer.Option(
-        True,
-        "--pretty/--raw",
-        help="Pretty diff output (default) or raw JSON",
+        "-i",
+        "--id",
+        help="Run ID",
     ),
     summary: bool = typer.Option(
         False,
         "--summary",
-        help="Show summary only
+        help="Show summary diff only",
     ),
-
-        False,
-        "--raise",
-        help="Raise HTTP exceptions instead of swallowing them",
-    ),
-    out: Optional[Path] = typer.Option(
-        None,
-        "-o",
-        "--out",
-        help="Optional file path to save the result as JSON",
-    ),
-    overwrite: bool = typer.Option(
+    side_by_side: bool = typer.Option(
         False,
-        "--
-        help="
+        "--side-by-side",
+        help="Show side-by-side diff (git)",
     ),
 ) -> None:
     """
-    Compare two evaluation
+    Compare two evaluation runs.
 
-
-    - Unified diff-style view by default
-    - Raw JSON with --raw
-    - Summary of improvements / regressions
+    Default: unified git diff
     """
 
-
+    if summary and side_by_side:
+        typer.secho(
+            "❌ --summary and --side-by-side cannot be used together",
+            fg="red",
+            bold=True,
+        )
+        raise typer.Exit(code=1)
 
-
-        targets.append(("run", r))
+    targets: List[Tuple[Optional[int], Optional[str]]] = []
 
+    for r in run_id:
+        targets.append(("run", r))
     for t in tag:
-        targets.append(("
+        targets.append(("tag", t))
 
     if len(targets) != 2:
         typer.secho(
-            "❌ Provide exactly two targets using --
+            "❌ Provide exactly two targets using --id and/or --tag",
             fg="red",
             bold=True,
         )
         raise typer.Exit(code=1)
 
-    (type_a, value_a), (type_b, value_b) = targets
-
-    typer.secho(
-        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'",
-        bold=True,
-    )
-
     client = DeepEvalClient()
+    # Summary mode (reduced payload)
+    (type_a, value_a), (type_b, value_b) = targets
+    if summary:
+        kwargs: Dict[str, Any] = {"raise_exception": True}
+        if type_a == "run":
+            kwargs["testcase_a_id"] = value_a
+        else:
+            kwargs["testcase_a_tag"] = value_a
 
-
-
-
-
-
-
+        if type_b == "run":
+            kwargs["testcase_b_id"] = value_b
+        else:
+            kwargs["testcase_b_tag"] = value_b
+        try:
+            resp = client.compare_testcases(**kwargs)
+        except Exception as e:
+            typer.secho(f"❌ Request failed: {e}", fg="red")
+            raise typer.Exit(code=1)
 
-
-
-
-
+        if not resp:
+            typer.secho("⚠️ No response received", fg="yellow")
+            raise typer.Exit(code=1)
+        comparison = TestCaseComparison(**resp)
+        pretty_print_comparison(
+            comparison,
+            summary_only=summary,
+        )
+        return
 
     try:
-
+        run_a, id_a = fetch_run(
+            client,
+            run_id=value_a if type_a == "run" else None,
+            tag=value_a if type_a == "tag" else None,
+        )
+        run_b, id_b = fetch_run(
+            client,
+            run_id=value_b if type_b == "run" else None,
+            tag=value_b if type_b == "tag" else None,
+        )
     except Exception as e:
-        typer.secho(f"❌
+        typer.secho(f"❌ Fetch failed: {e}", fg="red")
         raise typer.Exit(code=1)
 
-
-
-        raise typer.Exit(code=1)
+    a_text = serialize_for_diff(run_a)
+    b_text = serialize_for_diff(run_b)
 
-
-
-
-
-
-
-        raise typer.Exit(code=1)
-
-        out.write_text(json.dumps(resp, indent=2))
-        typer.secho(f"💾 Saved raw output to {out}", fg="green")
-
-    if not pretty:
-        typer.echo(json.dumps(resp, indent=2))
-        return
-
-    comparison = TestCaseComparison(**resp)
-
-    pretty_print_comparison(
-        comparison,
-        summary_only=summary,
+    git_diff(
+        a_text,
+        b_text,
+        label_a=f"{id_a}.full.json",
+        label_b=f"{id_b}.full.json",
+        side_by_side=side_by_side,
     )
 
 
@@ -679,7 +579,7 @@ def compare_label_latest(
     ),
 ) -> None:
     """
-    Compare the latest
+    Compare the latest runs for two labels.
     """
     client = DeepEvalClient()
 
@@ -757,11 +657,13 @@ def compare_last(
 def tag_command(
     run_id: Optional[int] = typer.Option(
         None,
-        "
-
+        "-i",
+        "--id",
+        help="Run ID",
     ),
     tag: Optional[str] = typer.Option(
         None,
+        "-t",
         "--tag",
         help="Tag to assign to the run",
     ),
@@ -778,11 +680,11 @@ def tag_command(
     # --- validation ---
     if delete:
         if run_id or tag:
-            typer.echo("❌ --delete cannot be used with --
+            typer.echo("❌ --delete cannot be used with --id or --tag")
             raise typer.Exit(code=1)
     else:
         if not run_id or not tag:
-            typer.echo("❌ Use --
+            typer.echo("❌ Use --id and --tag together, or --delete")
             raise typer.Exit(code=1)
 
     client = DeepEvalClient()
@@ -812,6 +714,11 @@ def tag_command(
     typer.echo(f"Tag: {result.get('tag')}")
 
 
+# Register sub-apps in user journey order (after regular commands)
+app.add_typer(list_app, name="list")
+app.add_typer(metrics_app, name="metrics")
+
+
 def main() -> None:
     app()
 
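For orientation, the reworked `compare` path fetches both runs, serializes them into stable JSON, and hands the two blobs to `git_diff`. A minimal sketch of that flow, assuming a configured `DeepEvalClient` and two existing run tags (`baseline` and `candidate` are made-up names):

```python
# Sketch only: mirrors the compare code path shown above.
# "baseline" and "candidate" are hypothetical tags; the client picks up its
# base URL and API token from the environment as in the CLI.
from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.utils.print import git_diff, serialize_for_diff

client = DeepEvalClient()

# Fetch the two runs by tag (the CLI also accepts numeric IDs via --id).
run_a = client.get_evaluation_testcase_by_tag("baseline")
run_b = client.get_evaluation_testcase_by_tag("candidate")

# Stable, sorted JSON so the diff only shows real changes.
a_text = serialize_for_diff(run_a)
b_text = serialize_for_diff(run_b)

# Unified git diff by default; side_by_side=True opens vimdiff via git difftool.
git_diff(
    a_text,
    b_text,
    label_a="tag=baseline.full.json",
    label_b="tag=candidate.full.json",
    side_by_side=False,
)
```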
rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/__init__.py
File without changes
rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_cli.py

@@ -0,0 +1,58 @@
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import typer
+
+from rakam_eval_sdk.cli import metrics
+
+
+def test_metrics_no_metrics_found(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    file = tmp_path / "eval.py"
+    file.write_text("def foo(): pass")
+
+    monkeypatch.setattr(
+        "rakam_eval_sdk.cli.find_decorated_functions",
+        lambda *_: [],
+    )
+
+    with pytest.raises(typer.Exit) as exc:
+        metrics(directory=tmp_path, recursive=False)
+
+    assert exc.value.exit_code == 0
+
+
+def test_metrics_finds_metrics(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+) -> None:
+    file = tmp_path / "eval.py"
+    file.write_text("")
+
+    monkeypatch.setattr(
+        "rakam_eval_sdk.cli.find_decorated_functions",
+        lambda *_: ["run_eval"],
+    )
+
+    fake_module = SimpleNamespace(
+        run_eval=lambda: SimpleNamespace(
+            metrics=[
+                SimpleNamespace(type="accuracy", name=None),
+                SimpleNamespace(type="f1", name="f1_v2"),
+            ]
+        )
+    )
+
+    monkeypatch.setattr(
+        "rakam_eval_sdk.cli.load_module_from_path",
+        lambda _: fake_module,
+    )
+
+    metrics(directory=tmp_path, recursive=False)
+
+    out = capsys.readouterr().out
+
+    assert "accuracy" in out
+    assert "f1" in out
+    assert "unique metrics found" in out

rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_decorator.py

@@ -0,0 +1,112 @@
+import os
+from pathlib import Path
+from typing import List
+
+import pytest
+
+from rakam_eval_sdk.client import DeepEvalClient
+from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions
+
+
+@pytest.fixture
+def client() -> DeepEvalClient:
+    return DeepEvalClient(base_url="http://testserver", api_token="testtoken")
+
+
+class FakeCPUTimes:
+    def __init__(self, user: float = 1.0, system: float = 1.0) -> None:
+        self.user: float = user
+        self.system: float = system
+
+
+class FakeMemInfo:
+    def __init__(self, rss: int) -> None:
+        self.rss: int = rss
+
+
+class FakeProcess:
+    def __init__(self) -> None:
+        self._cpu_calls: int = 0
+        self._mem_calls: int = 0
+
+    def cpu_times(self) -> FakeCPUTimes:
+        self._cpu_calls += 1
+        # simulate CPU usage increase
+        return FakeCPUTimes(
+            user=1.0 + self._cpu_calls,
+            system=1.0,
+        )
+
+    def memory_info(self) -> FakeMemInfo:
+        self._mem_calls += 1
+        return FakeMemInfo(rss=100_000_000 + (self._mem_calls * 10_000))
+
+
+@pytest.fixture(autouse=True)
+def patch_psutil(
+    monkeypatch: pytest.MonkeyPatch,
+) -> FakeProcess:
+    fake_process: FakeProcess = FakeProcess()
+
+    monkeypatch.setattr(
+        "rakam_eval_sdk.decorators.psutil.Process",
+        lambda pid: fake_process,
+    )
+    monkeypatch.setattr(os, "getpid", lambda: 123)
+    return fake_process
+
+
+def test_eval_run_basic(capsys: pytest.CaptureFixture[str]) -> None:
+    @eval_run
+    def add(a: int, b: int) -> int:
+        return a + b
+
+    result: int = add(2, 3)
+
+    assert result == 5
+
+    out: str = capsys.readouterr().out
+    assert "[eval_run]" in out
+    assert "add" in out
+    assert "time=" in out
+    assert "cpu=" in out
+    assert "mem_delta=" in out
+
+
+def test_eval_run_with_parentheses(capsys: pytest.CaptureFixture[str]) -> None:
+    @eval_run()
+    def mul(a: int, b: int) -> int:
+        return a * b
+
+    result: int = mul(3, 4)
+
+    assert result == 12
+    assert "[eval_run]" in capsys.readouterr().out
+
+
+def test_find_decorated_functions(tmp_path: Path) -> None:
+    code = """
+from rakam_eval_sdk.decorators import eval_run
+
+@eval_run
+def foo():
+    pass
+
+@eval_run()
+def bar():
+    pass
+
+async def baz():
+    pass
+
+@other
+def nope():
+    pass
+"""
+    file = tmp_path / "test_mod.py"
+    file.write_text(code)
+
+    result: List[str] = find_decorated_functions(file, "eval_run")
+
+    assert set(result) == {"foo", "bar"}

rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_metrics.py

@@ -0,0 +1,51 @@
+from types import SimpleNamespace
+
+from rakam_eval_sdk.cli import extract_metric_names
+
+
+def test_extract_metric_names_no_metrics_attr() -> None:
+    config = SimpleNamespace()
+
+    result = extract_metric_names(config)
+
+    assert result == []
+
+
+def test_extract_metric_names_empty_metrics() -> None:
+    config = SimpleNamespace(metrics=[])
+
+    result = extract_metric_names(config)
+
+    assert result == []
+
+
+def test_extract_metric_names_with_types_only() -> None:
+    config = SimpleNamespace(
+        metrics=[
+            SimpleNamespace(type="accuracy"),
+            SimpleNamespace(type="latency"),
+        ]
+    )
+
+    result = extract_metric_names(config)
+
+    assert result == [
+        ("accuracy", None),
+        ("latency", None),
+    ]
+
+
+def test_extract_metric_names_with_type_and_name() -> None:
+    config = SimpleNamespace(
+        metrics=[
+            SimpleNamespace(type="accuracy", name="acc_v1"),
+            SimpleNamespace(type="f1", name=None),
+        ]
+    )
+
+    result = extract_metric_names(config)
+
+    assert result == [
+        ("accuracy", "acc_v1"),
+        ("f1", None),
+    ]

rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_modules_loader.py

@@ -0,0 +1,30 @@
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+
+from rakam_eval_sdk.utils.decorator_utils import load_module_from_path
+
+
+def test_load_module_from_path_success(tmp_path: Path) -> None:
+    file = tmp_path / "mod.py"
+    file.write_text(
+        """
+x = 42
+def hello():
+    return "world"
+"""
+    )
+
+    module: ModuleType = load_module_from_path(file)
+
+    assert module.x == 42
+    assert module.hello() == "world"
+
+
+def test_load_module_from_path_invalid(tmp_path: Path) -> None:
+    file: Path = tmp_path / "broken.py"
+    file.write_text("def foo(")  # invalid syntax
+
+    with pytest.raises(SyntaxError):
+        load_module_from_path(file)

rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/tests/test_print.py

@@ -0,0 +1,177 @@
+from typing import Optional, List, Dict
+import json
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+import typer
+
+from rakam_eval_sdk.utils.print import (
+    _print_and_save,
+    metric_direction,
+    pct_change,
+    pretty_print_comparison,
+    summarize,
+)
+
+
+@pytest.mark.parametrize(
+    "a,b,expected",
+    [
+        (10, 20, "+100.00%"),
+        (20, 10, "-50.00%"),
+        (10, 10, "+0.00%"),
+        (None, 10, None),
+        (10, None, None),
+        (0, 10, None),
+    ],
+)
+def test_pct_change(a: Optional[int], b: Optional[int], expected: Optional[str]):
+    assert pct_change(a, b) == expected
+
+
+# -------------------------
+# metric_direction
+# -------------------------
+
+
+@pytest.mark.parametrize(
+    "delta,expected",
+    [
+        (1.0, "improved"),
+        (-1.0, "regressed"),
+        (0.0, "unchanged"),
+        (None, "unchanged"),
+    ],
+)
+def test_metric_direction(delta: Optional[float], expected: str):
+    assert metric_direction(delta) == expected
+
+
+def test_print_and_save_no_output_path(capsys: pytest.CaptureFixture[str]) -> None:
+    resp: Dict[str, int] = {"a": 1}
+
+    _print_and_save(resp, pretty=False, out=None, overwrite=False)
+
+    captured = capsys.readouterr()
+    assert str(resp) in captured.out
+
+
+def test_print_and_save_creates_file(
+    tmp_path: Path,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    out: Path = tmp_path / "result.json"
+    resp: Dict[str, str] = {"hello": "world"}
+
+    _print_and_save(resp, pretty=False, out=out, overwrite=False)
+
+    assert out.exists()
+    assert json.loads(out.read_text()) == resp
+
+    captured = capsys.readouterr()
+    assert "Result saved" in captured.out
+
+
+def test_print_and_save_refuses_overwrite(tmp_path: Path) -> None:
+    out: Path = tmp_path / "result.json"
+    out.write_text("{}")
+
+    with pytest.raises(typer.Exit):
+        _print_and_save({"x": 1}, pretty=False, out=out, overwrite=False)
+
+
+def test_print_and_save_overwrites(tmp_path: Path) -> None:
+    out: Path = tmp_path / "result.json"
+    out.write_text("{}")
+
+    _print_and_save({"x": 2}, pretty=False, out=out, overwrite=True)
+
+    assert json.loads(out.read_text()) == {"x": 2}
+
+
+def make_metric(
+    *,
+    metric: str,
+    status: str = "changed",
+    delta: Optional[float] = None,
+) -> SimpleNamespace:
+    return SimpleNamespace(
+        metric=metric,
+        status=status,
+        delta=delta,
+    )
+
+
+def test_summarize_groups_all_states() -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(metric="added_metric", status="added"),
+        make_metric(metric="removed_metric", status="removed"),
+        make_metric(metric="improved_metric", delta=1.0),
+        make_metric(metric="regressed_metric", delta=-1.0),
+        make_metric(metric="unchanged_zero", delta=0.0),
+        make_metric(metric="unchanged_none", delta=None),
+    ]
+
+    summary: Dict[str, List[str]] = summarize(metrics=metrics)
+
+    assert summary == {
+        "improved": ["improved_metric"],
+        "regressed": ["regressed_metric"],
+        "unchanged": ["unchanged_zero", "unchanged_none"],
+        "added": ["added_metric"],
+        "removed": ["removed_metric"],
+    }
+
+
+def test_summarize_empty_metrics() -> None:
+    assert summarize(metrics=[]) == {
+        "improved": [],
+        "regressed": [],
+        "unchanged": [],
+        "added": [],
+        "removed": [],
+    }
+
+
+def test_pretty_print_comparison_summary_only(monkeypatch: pytest.MonkeyPatch) -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(metric="added_metric", status="added"),
+        make_metric(metric="improved_metric", delta=1.0),
+    ]
+    resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+    called: Dict[str, Any] = {}
+
+    def fake_print_summary(arg: Any) -> None:
+        called["metrics"] = arg
+
+    import rakam_eval_sdk.utils.print as mod
+
+    monkeypatch.setattr(mod, "print_summary", fake_print_summary)
+
+    pretty_print_comparison(resp, summary_only=True)
+
+    assert called["metrics"] == metrics
+
+
+def test_pretty_print_comparison_full(monkeypatch: pytest.MonkeyPatch) -> None:
+    metrics: List[SimpleNamespace] = [
+        make_metric(metric="added_metric", status="added"),
+        make_metric(metric="improved_metric", delta=1.0),
+    ]
+    resp: SimpleNamespace = SimpleNamespace(metrics=metrics)
+
+    calls: List[Any] = []
+
+    def fake_print_metric_diff(metric: Any) -> None:
+        calls.append(metric)
+
+    import rakam_eval_sdk.utils.print as mod
+
+    monkeypatch.setattr(mod, "print_metric_diff", fake_print_metric_diff)
+
+    pretty_print_comparison(resp, summary_only=False)
+
+    assert calls == metrics

rakam_eval_sdk-0.2.4rc8/src/rakam_eval_sdk/utils/print.py

@@ -0,0 +1,269 @@
+import json
+from pathlib import Path
+from pprint import pprint
+from typing import Any, Dict, List, Optional
+
+import typer
+from typer import secho
+
+from rakam_eval_sdk.schema import MetricDiff
+
+
+def _print_and_save(
+    resp: dict,
+    pretty: bool,
+    out: Optional[Path],
+    overwrite: bool,
+) -> None:
+    if pretty:
+        typer.echo(typer.style("📊 Result:", bold=True))
+        pprint(resp)
+    else:
+        typer.echo(resp)
+
+    if out is None:
+        return
+
+    if out.exists() and not overwrite:
+        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+        raise typer.Exit(code=1)
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+
+    with out.open("w", encoding="utf-8") as f:
+        json.dump(resp, f, indent=2, ensure_ascii=False)
+
+    typer.echo(f"💾 Result saved to {out}")
+
+
+def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
+    if a is None or b is None or a == 0:
+        return None
+    return f"{((b - a) / a) * 100:+.2f}%"
+
+
+def metric_direction(delta: Optional[float]) -> str:
+    if delta is None:
+        return "unchanged"
+    if delta > 0:
+        return "improved"
+    if delta < 0:
+        return "regressed"
+    return "unchanged"
+
+
+def print_metric_diff(diff: MetricDiff) -> None:
+    secho(f"\nMetric: {diff.metric}", bold=True)
+
+    if diff.status == "added":
+        secho(f"+ score: {diff.score_b}", fg="green")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+        secho(f"+ success: {diff.success_b}", fg="green")
+        return
+
+    if diff.status == "removed":
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"- success: {diff.success_a}", fg="red")
+        return
+
+    # unchanged / changed
+    if diff.score_a != diff.score_b:
+        direction = metric_direction(diff.delta)
+        color = "green" if direction == "improved" else "red"
+        pct = pct_change(diff.score_a, diff.score_b)
+
+        secho(f"- score: {diff.score_a}", fg="red")
+        secho(
+            f"+ score: {diff.score_b}" + (f" ({pct})" if pct else ""),
+            fg=color,
+        )
+    else:
+        secho(f" score: {diff.score_a}", dim=True)
+
+    if diff.threshold_a != diff.threshold_b:
+        secho(f"- threshold: {diff.threshold_a}", fg="red")
+        secho(f"+ threshold: {diff.threshold_b}", fg="green")
+    else:
+        secho(f" threshold: {diff.threshold_a}", dim=True)
+
+    if diff.success_a != diff.success_b:
+        secho(f"- success: {diff.success_a}", fg="red")
+        secho(f"+ success: {diff.success_b}", fg="green")
+    else:
+        secho(f" success: {diff.success_a}", dim=True)
+
+
+def summarize(metrics: List[MetricDiff]) -> Dict[str, List[str]]:
+    """
+    Returns metric names grouped by category.
+    """
+    summary: Dict[str, List[str]] = {
+        "improved": [],
+        "regressed": [],
+        "unchanged": [],
+        "added": [],
+        "removed": [],
+    }
+
+    for m in metrics:
+        if m.status == "added":
+            summary["added"].append(m.metric)
+            continue
+
+        if m.status == "removed":
+            summary["removed"].append(m.metric)
+            continue
+
+        direction = metric_direction(m.delta)
+        summary[direction].append(m.metric)
+
+    return summary
+
+
+def _fmt(metrics: List[str]) -> str:
+    return ", ".join(metrics) if metrics else "-"
+
+
+def print_summary(metrics: List[MetricDiff]) -> None:
+    summary = summarize(metrics)
+
+    rows = [
+        ("↑ Improved", "improved", "green"),
+        ("↓ Regressed", "regressed", "red"),
+        ("± Unchanged", "unchanged", None),
+        ("+ Added.", "added", "green"),
+        ("- Removed.", "removed", "red"),
+    ]
+
+    secho("\nSummary:", bold=True)
+    secho(
+        " | Status | # | Metrics |",
+        dim=True,
+    )
+    secho(
+        " |--------------|---|------------------------|",
+        dim=True,
+    )
+
+    for label, key, color in rows:
+        count = len(summary[key])
+        metrics_str = _fmt(summary[key])
+
+        line = f" | {label:<12} | {count:<1} | {metrics_str:<22} |"
+
+        if color:
+            secho(line, fg=color)
+        else:
+            secho(line, dim=True)
+
+
+def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
+    if not summary_only:
+        for metric in resp.metrics:
+            print_metric_diff(metric)
+        return
+
+    print_summary(resp.metrics)
+
+
+def serialize_for_diff(obj: dict) -> str:
+    """
+    Stable, git-friendly JSON representation
+    """
+    return (
+        json.dumps(
+            obj,
+            indent=4,
+            sort_keys=True,
+            ensure_ascii=False,
+        )
+        + "\n"
+    )
+
+
+def git_diff(
+    a_text: str,
+    b_text: str,
+    *,
+    label_a: str,
+    label_b: str,
+    side_by_side: bool = False,
+) -> None:
+    """
+    Show diff between two text blobs.
+
+    - side_by_side: uses git difftool with vimdiff (interactive)
+    - fallback to git diff -U3 or difflib if git is not available
+    """
+    import shutil
+    import subprocess
+    from pathlib import Path
+    from tempfile import TemporaryDirectory
+
+    import typer
+
+    git = shutil.which("git")
+    vimdiff = shutil.which("vimdiff")
+
+    with TemporaryDirectory() as tmp:
+        a = Path(tmp) / label_a
+        b = Path(tmp) / label_b
+
+        a.write_text(a_text)
+        b.write_text(b_text)
+
+        # --- Side-by-side with vimdiff ---
+        if side_by_side:
+            if not git:
+                typer.secho(
+                    "❌ Git is required for side-by-side diffs", fg="red", bold=True
+                )
+                return
+            if not vimdiff:
+                typer.secho(
+                    "❌ Vimdiff is not installed. Please install vim or vimdiff to use side-by-side mode.",
+                    fg="red",
+                    bold=True,
+                )
+                return
+
+            cmd = [
+                git,
+                "difftool",
+                "--no-index",
+                "--tool=vimdiff",
+                "--no-prompt",  # skip Y/n prompt
+                str(a),
+                str(b),
+            ]
+
+            subprocess.run(cmd, check=False)
+            return
+
+        # (default)
+        if git:
+            cmd = [
+                git,
+                "diff",
+                "--no-index",
+                "--color=always",
+                "-U3",
+                str(a),
+                str(b),
+            ]
+            subprocess.run(cmd, check=False)
+            return
+
+        # Fallback
+        import difflib
+
+        diff = difflib.unified_diff(
+            a_text.splitlines(),
+            b_text.splitlines(),
+            fromfile=label_a,
+            tofile=label_b,
+            lineterm="",
+        )
+        for line in diff:
+            typer.echo(line)
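The print helpers above are duck-typed: anything exposing `metric`, `status`, and `delta` works, which is how the tests drive them. A minimal sketch with made-up metric names, using `SimpleNamespace` as a stand-in for the real `MetricDiff` schema objects:

```python
# Sketch only: mirrors how tests/test_print.py exercises the helpers.
from types import SimpleNamespace

from rakam_eval_sdk.utils.print import pct_change, print_summary, summarize

metrics = [
    SimpleNamespace(metric="answer_relevancy", status="changed", delta=0.12),
    SimpleNamespace(metric="faithfulness", status="changed", delta=-0.05),
    SimpleNamespace(metric="toxicity", status="added", delta=None),
]

print(summarize(metrics))
# {'improved': ['answer_relevancy'], 'regressed': ['faithfulness'],
#  'unchanged': [], 'added': ['toxicity'], 'removed': []}

print(pct_change(0.70, 0.82))  # '+17.14%'

print_summary(metrics)  # renders the colored summary table via typer.secho
```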
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/README.md
File without changes
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/__init__.py
File without changes
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/client.py
File without changes
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/decorators.py
File without changes
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/schema.py
File without changes
{rakam_eval_sdk-0.2.4rc6 → rakam_eval_sdk-0.2.4rc8}/src/rakam_eval_sdk/utils/decorator_utils.py
RENAMED
File without changes