rakam-eval-sdk 0.1.16__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.1.16
3
+ Version: 0.2.0
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
7
+ Requires-Dist: dotenv>=0.9.9
8
+ Requires-Dist: psutil>=7.2.1
7
9
  Requires-Dist: pydantic>=2.10.6
8
10
  Requires-Dist: requests
9
11
  Requires-Dist: typer>=0.20.1
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "rakam-eval-sdk"
7
- version = "0.1.16"
7
+ version = "0.2.0"
8
8
  description = "Evaluation Framework SDK"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -13,6 +13,8 @@ authors = [
13
13
  { name = "Mohamed Bachar Touil" }
14
14
  ]
15
15
  dependencies = [
16
+ "dotenv>=0.9.9",
17
+ "psutil>=7.2.1",
16
18
  "pydantic>=2.10.6",
17
19
  "requests",
18
20
  "typer>=0.20.1",
@@ -24,6 +26,9 @@ dev = [
24
26
  "pytest>=8.3.5",
25
27
  "twine>=6.1.0",
26
28
  ]
29
+ [tool.pytest.ini_options]
30
+ testpaths = ["tests"]
31
+ python_files = "test_*.py"
27
32
 
28
33
  [tool.isort]
29
34
  profile = "black"
@@ -38,6 +43,10 @@ name = "testpypi"
38
43
  url = "https://test.pypi.org/simple/"
39
44
  publish-url = "https://test.pypi.org/legacy/"
40
45
  explicit = true
46
+ [tool.setuptools]
47
+ package-dir = {"" = "src"}
41
48
 
49
+ [tool.setuptools.packages.find]
50
+ where = ["src"]
42
51
  [project.scripts]
43
- mycli = "rakam_eval_sdk.cli:main"
52
+ rakam_eval = "rakam_eval_sdk.cli:main"
@@ -0,0 +1,557 @@
1
+ # cli.py
2
+ import json
3
+ import os
4
+ import sys
5
+ import uuid
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ from pprint import pprint
9
+ from typing import Any, Dict, Optional, Sequence
10
+
11
+ import typer
12
+ from dotenv import load_dotenv
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+ from rich.pretty import Pretty
16
+
17
+ from rakam_eval_sdk.client import DeepEvalClient
18
+ from rakam_eval_sdk.decorators import eval_run
19
+ from rakam_eval_sdk.utils.decorator_utils import (
20
+ find_decorated_functions,
21
+ load_module_from_path,
22
+ )
23
+
24
# Load environment variables through python-dotenv as the first module action.
load_dotenv()

# Put the absolute path of the current working directory at the front of
# sys.path (unless it is already listed) so project-root modules import.
PROJECT_ROOT = os.path.abspath(".")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Shared rich console used for panel / pretty output.
console = Console()

# Root CLI application plus its two sub-command groups ("list", "metrics").
app = typer.Typer(help="CLI tools for evaluation utilities")
list_app = typer.Typer(help="List resources")
metrics_app = typer.Typer(help="Metrics utilities")
app.add_typer(list_app, name="list")
app.add_typer(metrics_app, name="metrics")
36
+
37
+
38
@metrics_app.command("list")
def list_metrics(
    limit: int = typer.Option(
        20,
        "--limit",
        help="Number of testcases to inspect for metrics",
    ),
):
    """
    List unique metric names found in evaluation testcases.

    Fetches up to ``limit`` testcases (offset 0) through ``DeepEvalClient``,
    collects the ``name`` of every metric found under the ``metrics`` key of
    each testcase's ``result`` entries, and prints the names in sorted order.
    """
    client = DeepEvalClient()

    testcases = client.list_evaluation_testcases(
        limit=limit,
        offset=0,
        raise_exception=True,
    )

    if not testcases:
        typer.echo("No evaluation testcases found.")
        return

    metric_names = set()

    # ``Optional[...]`` instead of ``... | None``: parameter annotations are
    # evaluated when the ``def`` executes, and the ``|`` form raises
    # TypeError on Python 3.8/3.9, which the package declares as supported.
    def collect_metrics(entries: Optional[Sequence[Dict]]) -> None:
        """Add every non-empty metric name found in ``entries``."""
        if not entries:
            return
        for entry in entries:
            # ``or []`` also covers an explicit ``"metrics": None``.
            for metric in entry.get("metrics", []) or []:
                name = metric.get("name")
                if name:
                    metric_names.add(name)

    for tc in testcases:
        collect_metrics(tc.get("result"))

    if not metric_names:
        typer.echo("No metrics found.")
        return

    typer.echo(
        f"📊 Found {len(metric_names)} unique metrics "
        f"(from latest {limit} testcases)\n"
    )

    for name in sorted(metric_names):
        typer.echo(f"- {name}")
87
+
88
+
89
@list_app.command("eval")
def list(
    directory: Path = typer.Argument(
        Path("./eval"),
        exists=True,
        file_okay=False,
        dir_okay=True,
        help="Directory to scan (default: ./eval)",
    ),
    recursive: bool = typer.Option(
        False,
        "--recursive",
        "-r",
        help="Recursively search for Python files",
    ),
) -> None:
    """
    Find functions decorated with @eval_run.

    Scans the ``*.py`` files in ``directory`` (descending into
    sub-directories when ``--recursive`` is given) and prints one
    ``<file>:<function>`` line per decorated function found.
    """
    # NOTE(review): this function's name shadows the builtin ``list`` for the
    # rest of the module. The name is kept so existing imports keep working;
    # the CLI command name is set explicitly to "eval" and does not depend on it.
    TARGET_DECORATOR = eval_run.__name__
    files = directory.rglob("*.py") if recursive else directory.glob("*.py")

    found = False

    # Sorted so the output order is deterministic.
    for file in sorted(files):
        functions = find_decorated_functions(file, TARGET_DECORATOR)
        for fn in functions:
            found = True
            typer.echo(f"{file}:{fn}")

    if not found:
        typer.echo(f"No @{TARGET_DECORATOR} functions found.")
121
+
122
+
123
@list_app.command("runs")
def list_runs(
    limit: int = typer.Option(20, help="Max number of runs"),
    offset: int = typer.Option(0, help="Pagination offset"),
):
    """
    List evaluation runs (newest first).

    Prints one row per run with its id, unique_id, label and creation time.
    ``created_at`` values that parse as ISO-8601 are reformatted as
    ``YYYY-MM-DD HH:MM:SS``; anything else is printed unchanged.
    """
    client = DeepEvalClient()

    runs = client.list_evaluation_testcases(
        limit=limit,
        offset=offset,
        raise_exception=True,
    )

    if not runs:
        typer.echo("No evaluation runs found.")
        return

    # Header columns use the same "<20 + one space" layout as the data rows
    # below so the column titles line up with their values.
    typer.echo(f"[id] {'unique_id':<20} {'label':<20} created_at")

    for run in runs:
        run_id = run.get("id")
        label = run.get("label") or "-"
        uid = run.get("unique_id") or "-"
        created_at = run.get("created_at")

        if created_at:
            try:
                created_at = datetime.fromisoformat(created_at).strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
            except (TypeError, ValueError):
                # Best effort: keep the raw value when it is not an ISO string.
                pass

        typer.echo(f"[{run_id}] {uid:<20} {label:<20} {created_at}")
160
+
161
+
162
@list_app.command("show")
def show_testcase(
    id: Optional[int] = typer.Option(
        None,
        "--id",
        help="Numeric evaluation testcase ID",
    ),
    uid: Optional[str] = typer.Option(
        None,
        "--uid",
        help="Evaluation testcase unique_id",
    ),
    raw: bool = typer.Option(
        False,
        "--raw",
        help="Print raw JSON instead of formatted output",
    ),
):
    """
    Show a single evaluation testcase by ID or unique_id.

    Exactly one of ``--id`` / ``--uid`` must be given. Exits with code 1
    when the client returns nothing or a dict carrying an ``error`` entry.
    """
    # ``is not None`` rather than truthiness so that ``--id 0`` counts as given.
    has_id = id is not None
    has_uid = bool(uid)

    if not has_id and not has_uid:
        raise typer.BadParameter("You must provide either --id or --uid")

    if has_id and has_uid:
        raise typer.BadParameter("Provide only one of --id or --uid")

    client = DeepEvalClient()

    if has_id:
        result = client.get_evaluation_testcase_by_id(id)
        identifier = f"id={id}"
    else:
        result = client.get_evaluation_testcase_by_unique_id(uid)
        identifier = f"unique_id={uid}"

    if not result:
        console.print(
            Panel(
                f"No response received for {identifier}",
                title="Error",
                style="red",
            )
        )
        raise typer.Exit(code=1)

    if isinstance(result, dict) and result.get("error"):
        console.print(
            Panel(
                result["error"],
                title="Error",
                style="red",
            )
        )
        raise typer.Exit(code=1)

    if raw:
        # NOTE(review): this prints a rich ``Pretty`` rendering of the object,
        # not strict JSON, despite the --raw help text; confirm the intent.
        console.print(Pretty(result))
        raise typer.Exit()

    console.print(
        Panel.fit(
            Pretty(result),
            title="Evaluation TestCase",
            subtitle=identifier,
        )
    )
229
+
230
+
231
def validate_eval_result(result: Any, fn_name: str) -> str:
    """
    Return the ``__eval_config__`` marker of ``result`` when it is a string.

    When the attribute is missing or is not a ``str``, a message naming
    ``fn_name`` and the actual type of ``result`` is echoed and an empty
    string is returned instead.
    """
    marker = getattr(result, "__eval_config__", None)

    # Happy path first: a string marker is handed back untouched.
    if isinstance(marker, str):
        return marker

    typer.echo(
        f" ❌ Invalid return type from `{fn_name}`\n"
        f" Expected: {'EvalConfig or SchemaEvalConfig'}\n"
        f" Got: {type(result).__name__}"
    )
    return ""
246
+
247
+
248
def _to_json_safe(obj: Any) -> Any:
    """Return ``obj.model_dump()`` or ``obj.dict()`` when available, else ``obj``."""
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    if hasattr(obj, "dict"):
        return obj.dict()
    return obj


def _save_run_result(resp: Any, fn_name: str, output_dir: Path) -> Path:
    """Write ``resp`` as JSON into ``output_dir`` and return the file path."""
    # Use the response's own id when it is a dict that has one; otherwise
    # fall back to a short random suffix for the file name.
    if isinstance(resp, dict) and "id" in resp:
        run_id = resp["id"]
    else:
        run_id = uuid.uuid4().hex[:8]

    output_path = output_dir / f"run_{fn_name}_{run_id}.json"

    # Serialize before touching the file so a serialization error cannot
    # leave a truncated JSON file behind.
    payload = json.dumps(_to_json_safe(resp), indent=2, ensure_ascii=False)
    output_path.write_text(payload, encoding="utf-8")
    return output_path


@app.command()
def run(
    directory: Path = typer.Argument(
        Path("./eval"),
        exists=True,
        file_okay=False,
        dir_okay=True,
        help="Directory to scan (default: ./eval)",
    ),
    recursive: bool = typer.Option(
        False,
        "-r",
        "--recursive",
        help="Recursively search for Python files",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="Only list functions without executing them",
    ),
    save_runs: bool = typer.Option(
        False,
        "--save-runs",
        help="Save each evaluation run result to a JSON file",
    ),
    output_dir: Path = typer.Option(
        Path("./eval_runs"),
        "--output-dir",
        help="Directory where run results are saved",
    ),
) -> None:
    """
    Find and execute all functions decorated with @eval_run.

    For every ``*.py`` file in ``directory`` that contains decorated
    functions, the module is imported and each function is called. Its
    return value is submitted through ``DeepEvalClient.text_eval`` when its
    ``__eval_config__`` marker is ``"text_eval"`` and through
    ``DeepEvalClient.schema_eval`` otherwise. With ``--dry-run`` the
    functions are only listed. With ``--save-runs`` each response is written
    as JSON into ``--output-dir``. A failure in one function is reported and
    does not stop the remaining ones.
    """
    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
    TARGET_DECORATOR = eval_run.__name__

    executed_any = False

    if save_runs and not dry_run:
        output_dir.mkdir(parents=True, exist_ok=True)

    for file in sorted(files):
        functions = find_decorated_functions(file, TARGET_DECORATOR)
        if not functions:
            continue

        typer.echo(f"\n📄 {file}")

        module = None
        if not dry_run:
            try:
                module = load_module_from_path(file)
            except Exception as e:
                typer.echo(f" ❌ Failed to import module: {e}")
                continue

        for fn_name in functions:
            typer.echo(f" ▶ {fn_name}")

            if dry_run:
                continue

            # Broad catch on purpose: this is the per-function boundary, so
            # one failing evaluation must not abort the whole run.
            try:
                func = getattr(module, fn_name)
                result = func()

                eval_type = validate_eval_result(result, fn_name)
                if not eval_type:
                    continue

                client = DeepEvalClient()

                if eval_type == "text_eval":
                    resp = client.text_eval(config=result)
                else:
                    resp = client.schema_eval(config=result)

                typer.echo(f"{resp}")
                executed_any = True
                typer.echo(f" ✅ Returned {type(result).__name__}")

                if save_runs:
                    output_path = _save_run_result(resp, fn_name, output_dir)
                    typer.echo(f" 💾 Saved run → {output_path}")

            except Exception as e:
                typer.echo(f" ❌ Execution failed: {e}")

    if not executed_any and not dry_run:
        typer.echo("\nNo @eval_run functions executed.")
361
+
362
+
363
def _print_and_save(
    resp: dict,
    pretty: bool,
    out: Optional[Path],
    overwrite: bool,
) -> None:
    """
    Print ``resp`` and optionally save it as JSON.

    Args:
        resp: Response payload to display and serialize.
        pretty: Pretty-print with ``pprint`` when true, plain echo otherwise.
        out: Destination file, or ``None`` to skip saving.
        overwrite: Allow replacing ``out`` when it already exists.

    Raises:
        typer.Exit: With code 1 when ``out`` exists and ``overwrite`` is false.
    """
    # ``Optional[Path]`` instead of ``Path | None``: the ``|`` form is
    # evaluated at definition time and fails on Python 3.8/3.9.
    if pretty:
        typer.echo(typer.style("📊 Result:", bold=True))
        pprint(resp)
    else:
        typer.echo(resp)

    if out is None:
        return

    if out.exists() and not overwrite:
        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
        raise typer.Exit(code=1)

    out.parent.mkdir(parents=True, exist_ok=True)

    # Serialize first so a failure cannot truncate or clobber an existing file.
    payload = json.dumps(resp, indent=2, ensure_ascii=False)
    out.write_text(payload, encoding="utf-8")

    typer.echo(f"💾 Result saved to {out}")
388
+
389
+
390
@app.command()
def compare_testcases(
    testcase_a_id: int = typer.Argument(
        ...,
        help="ID of the first testcase",
    ),
    testcase_b_id: int = typer.Argument(
        ...,
        help="ID of the second testcase",
    ),
    pretty: bool = typer.Option(
        True,
        "--pretty/--raw",
        help="Pretty-print the response",
    ),
    raise_exception: bool = typer.Option(
        False,
        "--raise",
        help="Raise HTTP exceptions instead of swallowing them",
    ),
    # ``Optional[Path]`` rather than ``Path | None`` so the module still
    # imports on Python 3.8/3.9, matching the Optional[...] style used by
    # the other commands in this file.
    out: Optional[Path] = typer.Option(
        None,
        "-o",
        "--out",
        help="Optional file path to save the result as JSON",
    ),
    overwrite: bool = typer.Option(
        False,
        "--overwrite",
        help="Overwrite output file if it already exists",
    ),
) -> None:
    """
    Compare two DeepEval evaluation testcases.

    Exits with code 1 when the request raises or returns an empty response;
    otherwise the response is printed and, if ``--out`` is given, saved.
    """
    client = DeepEvalClient()

    typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")

    try:
        resp = client.compare_testcases(
            testcase_a_id=testcase_a_id,
            testcase_b_id=testcase_b_id,
            raise_exception=raise_exception,
        )
    except Exception as e:
        typer.echo(f"❌ Request failed: {e}")
        raise typer.Exit(code=1) from e

    if not resp:
        typer.echo("⚠️ No response received")
        raise typer.Exit(code=1)
    _print_and_save(resp, pretty, out, overwrite)
443
+
444
+
445
@app.command()
def compare_label_latest(
    label_a: str = typer.Argument(
        ...,
        help="First label (latest run will be used)",
    ),
    label_b: str = typer.Argument(
        ...,
        help="Second label (latest run will be used)",
    ),
    pretty: bool = typer.Option(
        True,
        "--pretty/--raw",
        help="Pretty-print the response",
    ),
    raise_exception: bool = typer.Option(
        False,
        "--raise",
        help="Raise HTTP exceptions instead of swallowing them",
    ),
    # ``Optional[Path]`` rather than ``Path | None`` so the module still
    # imports on Python 3.8/3.9, matching the Optional[...] style used by
    # the other commands in this file.
    out: Optional[Path] = typer.Option(
        None,
        "-o",
        "--out",
        help="Optional file path to save the result as JSON",
    ),
    overwrite: bool = typer.Option(
        False,
        "--overwrite",
        help="Overwrite output file if it already exists",
    ),
) -> None:
    """
    Compare the latest evaluation runs for two labels.

    Exits with code 1 when the request raises or returns an empty response;
    otherwise the response is printed and, if ``--out`` is given, saved.
    """
    client = DeepEvalClient()

    typer.echo(f"🔍 Comparing latest runs: '{label_a}' ↔ '{label_b}'")

    try:
        resp = client.compare_latest_by_labels(
            label_a=label_a,
            label_b=label_b,
            raise_exception=raise_exception,
        )
    except Exception as e:
        typer.echo(f"❌ Request failed: {e}")
        raise typer.Exit(code=1) from e

    if not resp:
        typer.echo("⚠️ No response received")
        raise typer.Exit(code=1)

    _print_and_save(resp, pretty, out, overwrite)
499
+
500
+
501
@app.command()
def compare_last(
    label: str = typer.Argument(
        ...,
        help="Label whose last two runs will be compared",
    ),
    pretty: bool = typer.Option(
        True,
        "--pretty/--raw",
        help="Pretty-print the response",
    ),
    raise_exception: bool = typer.Option(
        False,
        "--raise",
        help="Raise HTTP exceptions instead of swallowing them",
    ),
    # ``Optional[Path]`` rather than ``Path | None`` so the module still
    # imports on Python 3.8/3.9, matching the Optional[...] style used by
    # the other commands in this file.
    out: Optional[Path] = typer.Option(
        None,
        "-o",
        "--out",
        help="Optional file path to save the result as JSON",
    ),
    overwrite: bool = typer.Option(
        False,
        "--overwrite",
        help="Overwrite output file if it already exists",
    ),
) -> None:
    """
    Compare the last two evaluation runs of a label.

    Exits with code 1 when the request raises or returns an empty response;
    otherwise the response is printed and, if ``--out`` is given, saved.
    """
    client = DeepEvalClient()

    typer.echo(f"🔍 Comparing last two runs for label '{label}'")

    try:
        resp = client.compare_last_two_by_label(
            label=label,
            raise_exception=raise_exception,
        )
    except Exception as e:
        typer.echo(f"❌ Request failed: {e}")
        raise typer.Exit(code=1) from e

    if not resp:
        typer.echo("⚠️ No response received")
        raise typer.Exit(code=1)

    _print_and_save(resp, pretty, out, overwrite)
550
+
551
+
552
def main() -> None:
    """Console-script entry point: hand control to the ``app`` CLI object."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()