rakam-eval-sdk 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.2
+ Version: 0.2.3
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.2"
+ version = "0.2.3"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -35,55 +35,88 @@ metrics_app = typer.Typer(help="Metrics utilities")
  app.add_typer(metrics_app, name="metrics")


+ def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+     """
+     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+     """
+     if not hasattr(config, "metrics"):
+         return []
+
+     results: list[tuple[str, str | None]] = []
+
+     for metric in config.metrics or []:
+         metric_type = getattr(metric, "type", None)
+         metric_name = getattr(metric, "name", None)
+         if metric_type:
+             results.append((metric_type, metric_name))
+
+     return results
+
+
  @metrics_app.command("list")
- def list_metrics(
-     limit: int = typer.Option(
-         20,
-         "--limit",
-         help="Number of testcases to inspect for metrics",
+ def metrics(
+     directory: Path = typer.Argument(
+         Path("./eval"),
+         exists=True,
+         file_okay=False,
+         dir_okay=True,
+         help="Directory to scan (default: ./eval)",
      ),
- ):
+     recursive: bool = typer.Option(
+         False,
+         "-r",
+         "--recursive",
+         help="Recursively search for Python files",
+     ),
+ ) -> None:
      """
-     List unique metric names found in evaluation testcases.
+     List all metric types used by loaded eval configs.
      """
-     client = DeepEvalClient()
+     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+     TARGET_DECORATOR = eval_run.__name__

-     testcases = client.list_evaluation_testcases(
-         limit=limit,
-         offset=0,
-         raise_exception=True,
-     )
+     all_metrics: set[tuple[str, str | None]] = set()
+     found_any = False

-     if not testcases:
-         typer.echo("No evaluation testcases found.")
-         return
+     for file in sorted(files):
+         functions = find_decorated_functions(file, TARGET_DECORATOR)
+         if not functions:
+             continue

-     metric_names: set[str] = set()
+         typer.echo(f"\n📄 {file}")

-     def collect_metrics(entries: Sequence[Dict] | None):
-         if not entries:
-             return
-         for entry in entries:
-             for metric in entry.get("metrics", []) or []:
-                 name = metric.get("name")
-                 if name:
-                     metric_names.add(name)
+         try:
+             module = load_module_from_path(file)
+         except Exception as e:
+             typer.echo(f" ❌ Failed to import module: {e}")
+             continue

-     for tc in testcases:
+         for fn_name in functions:
+             try:
+                 func = getattr(module, fn_name)
+                 result = func()

-         collect_metrics(tc.get("result"))
+                 metrics = extract_metric_names(result)
+                 if not metrics:
+                     continue

-     if not metric_names:
-         typer.echo("No metrics found.")
-         return
+                 found_any = True
+                 for metric_type, metric_name in metrics:
+                     all_metrics.add((metric_type, metric_name))

-     typer.echo(
-         f"📊 Found {len(metric_names)} unique metrics "
-         f"(from latest {limit} testcases)\n"
-     )
+                     if metric_name:
+                         typer.echo(f" {metric_type} (alias: {metric_name})")
+                     else:
+                         typer.echo(f" • {metric_type}")
+
+             except Exception as e:
+                 typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+     if not found_any:
+         typer.echo("\nNo metrics found.")
+         raise typer.Exit(code=0)

-     for name in sorted(metric_names):
-         typer.echo(f"- {name}")
+     typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


  @list_app.command("evals")
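
The `extract_metric_names` helper introduced above relies only on duck typing (`hasattr` and `getattr`), so its behaviour can be illustrated without the SDK's real config classes. A minimal sketch, using `types.SimpleNamespace` stand-ins and made-up metric type strings; it assumes `extract_metric_names` from the CLI module above is in scope:

from types import SimpleNamespace

# Stand-in for an EvalConfig / SchemaEvalConfig returned by an @eval_run function.
# The metric type and alias values below are illustrative only.
config = SimpleNamespace(
    metrics=[
        SimpleNamespace(type="answer_relevancy", name="relevancy"),
        SimpleNamespace(type="faithfulness", name=None),
    ]
)

print(extract_metric_names(config))
# -> [('answer_relevancy', 'relevancy'), ('faithfulness', None)]
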
@@ -165,12 +198,12 @@ def show(
      id: Optional[int] = typer.Option(
          None,
          "--id",
-         help="Numeric evaluation testcase ID",
+         help="Run ID",
      ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         help="Evaluation testcase tag",
+         help="Run tag",
      ),
      raw: bool = typer.Option(
          False,
@@ -391,13 +424,15 @@ def _print_and_save(

  @app.command()
  def compare(
-     testcase_a_id: int = typer.Argument(
-         ...,
-         help="ID of the first testcase",
+     tag: list[str] = typer.Option(
+         [],
+         "--tag",
+         help="Tag identifying a reference testcase",
      ),
-     testcase_b_id: int = typer.Argument(
-         ...,
-         help="ID of the second testcase",
+     run: list[int] = typer.Option(
+         [],
+         "--run",
+         help="Run ID identifying an evaluation testcase",
      ),
      pretty: bool = typer.Option(
          True,
@@ -422,18 +457,45 @@ def compare(
      ),
  ) -> None:
      """
-     Compare two DeepEval evaluation testcases.
+     Compare two evaluation testcases using runs and/or labels.
      """
+
+     targets = []
+
+     for r in run:
+         targets.append(("run", r))
+
+     for l in tag:
+         targets.append(("label", l))
+
+     if len(targets) != 2:
+         typer.echo(
+             "❌ Provide exactly two targets using --run and/or --label"
+         )
+         raise typer.Exit(code=1)
+
      client = DeepEvalClient()

-     typer.echo(f"🔍 Comparing testcases {testcase_a_id} {testcase_b_id}")
+     (type_a, value_a), (type_b, value_b) = targets
+
+     typer.echo(
+         f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+     )
+
+     kwargs = {"raise_exception": raise_exception}
+
+     if type_a == "run":
+         kwargs["testcase_a_id"] = value_a
+     else:
+         kwargs["testcase_a_tag"] = value_a
+
+     if type_b == "run":
+         kwargs["testcase_b_id"] = value_b
+     else:
+         kwargs["testcase_b_tag"] = value_b

      try:
-         resp = client.compare_testcases(
-             testcase_a_id=testcase_a_id,
-             testcase_b_id=testcase_b_id,
-             raise_exception=raise_exception,
-         )
+         resp = client.compare_testcases(**kwargs)
      except Exception as e:
          typer.echo(f"❌ Request failed: {e}")
          raise typer.Exit(code=1)
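
The option handling above collects every `--run` and `--tag` value into a `(kind, value)` target list, requires exactly two targets, and then routes each one to the matching `compare_testcases` keyword. A self-contained sketch of that mapping with hypothetical values (run ID 12, tag "baseline"), outside of Typer:

run = [12]          # values collected from repeated --run options (hypothetical)
tag = ["baseline"]  # values collected from repeated --tag options (hypothetical)

targets = [("run", r) for r in run] + [("label", t) for t in tag]
assert len(targets) == 2, "exactly two targets are required"

(type_a, value_a), (type_b, value_b) = targets
kwargs = {"raise_exception": False}
kwargs["testcase_a_id" if type_a == "run" else "testcase_a_tag"] = value_a
kwargs["testcase_b_id" if type_b == "run" else "testcase_b_tag"] = value_b

print(kwargs)
# -> {'raise_exception': False, 'testcase_a_id': 12, 'testcase_b_tag': 'baseline'}
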
@@ -441,10 +503,11 @@ def compare(
      if not resp:
          typer.echo("⚠️ No response received")
          raise typer.Exit(code=1)
+
      _print_and_save(resp, pretty, out, overwrite)


- @app.command()
+ @app.command(hidden=True)
  def compare_label_latest(
      label_a: str = typer.Argument(
          ...,
@@ -500,7 +563,7 @@ def compare_label_latest(
      _print_and_save(resp, pretty, out, overwrite)


- @app.command()
+ @app.command(hidden=True)
  def compare_last(
      label: str = typer.Argument(
          ...,
@@ -551,44 +614,62 @@ def compare_last(
      _print_and_save(resp, pretty, out, overwrite)


- @list_app.command("tag")
- def update_run_tag(
-     run_id: int = typer.Argument(..., help="Evaluation run ID"),
+ @app.command("tag")
+ def tag_command(
+     run_id: Optional[int] = typer.Option(
+         None,
+         "--run",
+         help="Evaluation run ID",
+     ),
      tag: Optional[str] = typer.Option(
          None,
          "--tag",
-         "-t",
-         help="Tag to add or update",
+         help="Tag to assign to the run",
      ),
-     remove: bool = typer.Option(
-         False,
-         "--remove",
-         help="Remove tag from the run",
+     delete: Optional[str] = typer.Option(
+         None,
+         "--delete",
+         help="Delete a tag",
      ),
  ):
      """
-     Add, update, or remove a tag from an evaluation run.
+     Assign a tag to a run or delete a tag.
      """
-     if not tag and not remove:
-         typer.echo("❌ You must provide --tag or --remove")
-         raise typer.Exit(code=1)

-     if tag and remove:
-         typer.echo("❌ Use either --tag or --remove, not both")
-         raise typer.Exit(code=1)
+     # --- validation ---
+     if delete:
+         if run_id or tag:
+             typer.echo("❌ --delete cannot be used with --run or --tag")
+             raise typer.Exit(code=1)
+     else:
+         if not run_id or not tag:
+             typer.echo("❌ Use --run and --tag together, or --delete")
+             raise typer.Exit(code=1)

      client = DeepEvalClient()

+     # --- delete mode ---
+     if delete:
+         result = client.update_evaluation_testcase_tag(
+             testcase_id=run_id,
+             tag=delete,
+             raise_exception=True,
+         )
+         typer.echo("🗑️ Tag deleted successfully")
+         typer.echo(f"Tag: {delete}")
+         return
+
+     # --- assign/update mode ---
      result = client.update_evaluation_testcase_tag(
          testcase_id=run_id,
-         tag=None if remove else tag,
+         tag=tag,
          raise_exception=True,
      )

-     action = "removed" if remove else "updated"
-     typer.echo(f"✅ Tag {action} successfully")
+     typer.echo(" Tag assigned successfully")
      typer.echo(f"Run ID: {run_id}")
-     typer.echo(f"Tag: {result.get('tag') or '-'}")
+     typer.echo(f"Tag: {result.get('tag')}")
+

  def main() -> None:
      app()
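
Taken together, the CLI changes in this release rework the user-facing surface: `metrics list` now scans a directory of eval files instead of querying the API, `compare` takes `--run`/`--tag` options instead of positional IDs, the older compare commands are hidden, and a top-level `tag` command replaces `list tag`. A hedged sketch of exercising the new commands with Typer's test runner; the import path of `app` is an assumption, since file paths are not shown in this diff:

from typer.testing import CliRunner

from rakam_eval_sdk.cli import app  # assumed import path; not shown in this diff

runner = CliRunner()

# Assign a tag to a run (hypothetical run ID and tag value).
runner.invoke(app, ["tag", "--run", "12", "--tag", "baseline"])

# Delete a tag by name.
runner.invoke(app, ["tag", "--delete", "baseline"])

# Compare a pinned run against whatever currently carries the "baseline" tag.
runner.invoke(app, ["compare", "--run", "12", "--tag", "baseline"])
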
@@ -1,7 +1,6 @@
- from typing import Optional, Dict
  import os
  import random
- from typing import Any, Dict, List, Optional, Union, cast, overload
+ from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

  import requests

@@ -13,7 +12,6 @@ from .schema import (
      SchemaMetricConfig,
      TextInputItem,
  )
- from typing import Optional, Literal, cast

  HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]

@@ -92,14 +90,17 @@ class DeepEvalClient:
              "raw": resp.text,
          }

-     def _get(self, endpoint: str, params: dict, **kw):
-         return self._request("GET", endpoint, params=params, **kw)
+     def _get(self, endpoint: str, params: dict, *args, **kw):
+         return self._request("GET", endpoint, params=params, *args, **kw)

-     def _post(self, endpoint: str, payload: dict, **kw):
-         return self._request("POST", endpoint, json=payload, **kw)
+     def _post(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("POST", endpoint, json=payload, *args, **kw)

-     def _patch(self, endpoint: str, payload: dict, **kw):
-         return self._request("PATCH", endpoint, json=payload, **kw)
+     def _patch(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("PATCH", endpoint, json=payload, *args, **kw)
+
+     def _delete(self, endpoint: str, payload: dict, *args, **kw):
+         return self._request("DELETE", endpoint, json=payload, *args, **kw)

      def update_evaluation_testcase_tag(
          self,
@@ -114,8 +115,14 @@ class DeepEvalClient:
          - tag="smoke" → add / update tag
          - tag=None → remove tag
          """
+         if testcase_id is None:
+             return self._delete(
+                 endpoint=f"/deepeval/tag/{tag}",
+                 payload={},
+                 raise_exception=raise_exception,
+             )
          return self._patch(
-             f"/evaluation-testcases/{testcase_id}/tag",
+             endpoint=f"/deepeval/{testcase_id}/tag",
              payload={"tag": tag},
              raise_exception=raise_exception,
          )
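
As the branch above shows, `update_evaluation_testcase_tag` now dispatches on `testcase_id`: with an ID it PATCHes the run's tag via `/deepeval/{testcase_id}/tag`, and with `testcase_id=None` it sends a DELETE to `/deepeval/tag/{tag}`. A minimal sketch of both call styles, with a hypothetical run ID and tag and an assumed client import path:

from rakam_eval_sdk import DeepEvalClient  # assumed import path; not shown in this diff

client = DeepEvalClient()

# Assign or update the tag on run 12 (hypothetical values).
client.update_evaluation_testcase_tag(testcase_id=12, tag="baseline")

# Remove the tag entirely: passing testcase_id=None routes to the DELETE endpoint.
client.update_evaluation_testcase_tag(testcase_id=None, tag="baseline", raise_exception=True)
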
@@ -132,7 +139,7 @@ class DeepEvalClient:
          Sorted by created_at DESC (newest first).
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/token",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/token",
              params={
                  "limit": limit,
                  "offset": offset,
@@ -150,7 +157,7 @@ class DeepEvalClient:
          Fetch a single evaluation testcase by numeric ID.
          """
          return self._get(
-             f"/eval-framework/deepeval/id/{testcase_id}",
+             endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
              params={},
              raise_exception=raise_exception,
          )
@@ -165,7 +172,7 @@ class DeepEvalClient:
          Fetch a single evaluation testcase by tag.
          """
          return self._get(
-             f"/eval-framework/deepeval/tag/{tag}",
+             endpoint=f"/eval-framework/deepeval/tag/{tag}",
              params={},
              raise_exception=raise_exception,
          )
@@ -190,19 +197,41 @@ class DeepEvalClient:
      def compare_testcases(
          self,
          *,
-         testcase_a_id: int,
-         testcase_b_id: int,
+         testcase_a_id: int | None = None,
+         testcase_a_tag: str | None = None,
+         testcase_b_id: int | None = None,
+         testcase_b_tag: str | None = None,
          raise_exception: bool = False,
      ) -> Optional[dict]:
          """
-         Compare two evaluation testcases.
+         Compare two evaluation testcases using IDs or tags.
+         Exactly one identifier (id or tag) must be provided per testcase.
          """
+
+         def validate(id_, tag, name: str):
+             if bool(id_) == bool(tag):
+                 raise ValueError(
+                     f"Provide exactly one of {name}_id or {name}_tag"
+                 )
+
+         validate(testcase_a_id, testcase_a_tag, "testcase_a")
+         validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+         params: dict[str, int | str] = {}
+
+         if testcase_a_id is not None:
+             params["testcase_a_id"] = testcase_a_id
+         else:
+             params["testcase_a_tag"] = testcase_a_tag # type: ignore
+
+         if testcase_b_id is not None:
+             params["testcase_b_id"] = testcase_b_id
+         else:
+             params["testcase_b_tag"] = testcase_b_tag # type: ignore
+
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare",
-             params={
-                 "testcase_a_id": testcase_a_id,
-                 "testcase_b_id": testcase_b_id,
-             },
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+             params=params,
              raise_exception=raise_exception,
          )
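
With the widened signature above, each side of the comparison accepts either a numeric ID or a tag, and the inner `validate` helper raises `ValueError` unless exactly one identifier is given per side. A short sketch of the accepted call shapes, with hypothetical IDs and tag and an assumed client import path:

from rakam_eval_sdk import DeepEvalClient  # assumed import path; not shown in this diff

client = DeepEvalClient()

# ID vs ID, as before.
client.compare_testcases(testcase_a_id=12, testcase_b_id=13)

# Mixed: a pinned run against whatever currently carries the "baseline" tag.
client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")

# Rejected before any request: both an ID and a tag for side A raises ValueError.
# client.compare_testcases(testcase_a_id=12, testcase_a_tag="baseline", testcase_b_id=13)
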

@@ -217,7 +246,7 @@ class DeepEvalClient:
          Compare the latest evaluation testcases for two labels.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
              params={
                  "label_a": label_a,
                  "label_b": label_b,
@@ -235,7 +264,7 @@ class DeepEvalClient:
          Compare the last two evaluation testcases for a given label.
          """
          return self._get(
-             "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+             endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
              params={
                  "label": label,
              },
@@ -280,7 +309,7 @@ class DeepEvalClient:
          )

          return self._post(
-             "/deepeval/text-eval", config.model_dump(), raise_exception
+             endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
          )

      def text_eval_background(
@@ -295,7 +324,7 @@ class DeepEvalClient:
          payload = EvalConfig.model_construct(
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
-         return self._post("/deepeval/text-eval/background", payload, raise_exception)
+         return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)

      @overload
      def schema_eval(
@@ -340,9 +369,9 @@ class DeepEvalClient:
          )

          return self._post(
-             "/deepeval/schema-eval",
-             config.model_dump(),
-             raise_exception,
+             endpoint="/deepeval/schema-eval",
+             payload=config.model_dump(),
+             raise_exception=raise_exception,
          )

      def schema_eval_background(
@@ -358,7 +387,7 @@ class DeepEvalClient:
              data=data, metrics=metrics, component=component, version=label
          ).model_dump()
          return self._post(
-             "/deepeval/schema-eval/background", payload, raise_exception
+             endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
          )

      def maybe_text_eval(
File without changes