rakam-eval-sdk 0.2.4__py3-none-any.whl → 0.2.4rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
rakam_eval_sdk/cli.py CHANGED
@@ -6,7 +6,7 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional
+ from typing import Any, Dict, List, Optional, Tuple, Union
  
  import typer
  from dotenv import load_dotenv
@@ -37,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
  app.add_typer(metrics_app, name="metrics")
  
  
- def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+ def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
      """
      Returns [(type, name)] from EvalConfig / SchemaEvalConfig
      """
      if not hasattr(config, "metrics"):
          return []
  
-     results: list[tuple[str, str | None]] = []
+     results: list[tuple[str, Optional[str]]] = []
  
      for metric in config.metrics or []:
          metric_type = getattr(metric, "type", None)
@@ -77,7 +77,7 @@ def metrics(
      files = directory.rglob("*.py") if recursive else directory.glob("*.py")
      TARGET_DECORATOR = eval_run.__name__
  
-     all_metrics: set[tuple[str, str | None]] = set()
+     all_metrics: set[tuple[str, Optional[str]]] = set()
      found_any = False
  
      for file in sorted(files):
@@ -159,7 +159,7 @@ def list_evals(
  def list_runs(
      limit: int = typer.Option(20, help="Max number of runs"),
      offset: int = typer.Option(0, help="Pagination offset"),
- ):
+ ) -> None:
      """
      List evaluation runs (newest first).
      """
@@ -170,7 +170,7 @@ def list_runs(
          offset=offset,
          raise_exception=True,
      )
- 
+     assert response is not None
      items = response.get("items", [])
      total = response.get("total", 0)
  
@@ -199,7 +199,8 @@ def list_runs(
      shown = offset + len(items)
      if shown < total:
          typer.echo()
-         typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")
+         typer.echo(
+             f"Showing {shown} of {total} runs. Use --limit to see more.")
  
  
  @app.command()
@@ -219,15 +220,15 @@ def show(
          "--raw",
          help="Print raw JSON instead of formatted output",
      ),
- ):
+ ) -> None:
      """
      Show a single evaluation testcase by ID or tag.
      """
      if not id and not tag:
-         raise typer.BadParameter("You must provide either --id or --uid")
+         raise typer.BadParameter("You must provide either --id or --tag")
  
      if id and tag:
-         raise typer.BadParameter("Provide only one of --id or --uid")
+         raise typer.BadParameter("Provide only one of --id or --tag")
  
      client = DeepEvalClient()
  
@@ -235,6 +236,7 @@ def show(
          result = client.get_evaluation_testcase_by_id(id)
          identifier = f"id={id}"
      else:
+         assert tag is not None
          result = client.get_evaluation_testcase_by_tag(tag)
          identifier = f"tag={tag}"
  
@@ -406,7 +408,7 @@ def run(
  def _print_and_save(
      resp: dict,
      pretty: bool,
-     out: Path | None,
+     out: Optional[Path],
      overwrite: bool,
  ) -> None:
      if pretty:
@@ -419,7 +421,8 @@ def _print_and_save(
          return
  
      if out.exists() and not overwrite:
-         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+         typer.echo(
+             f"❌ File already exists: {out} (use --overwrite to replace)")
          raise typer.Exit(code=1)
  
      out.parent.mkdir(parents=True, exist_ok=True)
@@ -430,13 +433,13 @@ def _print_and_save(
      typer.echo(f"💾 Result saved to {out}")
  
  
- def pct_change(a: float | None, b: float | None) -> str | None:
+ def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
      if a is None or b is None or a == 0:
          return None
      return f"{((b - a) / a) * 100:+.2f}%"
  
  
- def metric_direction(delta: float | None) -> str:
+ def metric_direction(delta: Optional[float]) -> str:
      if delta is None:
          return "unchanged"
      if delta > 0:
@@ -446,7 +449,7 @@ def metric_direction(delta: float | None) -> str:
          return "unchanged"
  
  
- def print_metric_diff(diff: MetricDiff):
+ def print_metric_diff(diff: MetricDiff) -> None:
      secho(f"\nMetric: {diff.metric}", bold=True)
  
      if diff.status == "added":
@@ -507,7 +510,7 @@ def summarize(metrics: Any) -> Dict[str, int]:
      return summary
  
  
- def pretty_print_comparison(resp: Any, summary_only: bool = False):
+ def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
      if not summary_only:
          for metric in resp.metrics:
              print_metric_diff(metric)
@@ -516,7 +519,7 @@ def pretty_print_comparison(resp: Any, summary_only: bool = False):
          print_summary(resp.metrics)
  
  
- def print_summary(metrics: Any):
+ def print_summary(metrics: Any) -> None:
      summary = summarize(metrics)
  
      secho("\nSummary:", bold=True)
@@ -529,7 +532,7 @@ def print_summary(metrics: Any):
  
  @app.command()
  def compare(
-     tag: list[str] = typer.Option(
+     tag: List[str] = typer.Option(
          [],
          "--tag",
          help="Label identifying a reference testcase",
@@ -554,7 +557,7 @@ def compare(
          "--raise",
          help="Raise HTTP exceptions instead of swallowing them",
      ),
-     out: Path | None = typer.Option(
+     out: Optional[Path] = typer.Option(
          None,
          "-o",
          "--out",
@@ -575,7 +578,7 @@ def compare(
      - Summary of improvements / regressions
      """
  
-     targets: list[tuple[str, str | int]] = []
+     targets: List[Tuple[str, Union[str, int]]] = []
  
      for r in run:
          targets.append(("run", r))
@@ -600,7 +603,7 @@ def compare(
  
      client = DeepEvalClient()
  
-     kwargs = {"raise_exception": raise_exception}
+     kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
  
      if type_a == "run":
          kwargs["testcase_a_id"] = value_a
@@ -665,7 +668,7 @@ def compare_label_latest(
          "--raise",
          help="Raise HTTP exceptions instead of swallowing them",
      ),
-     out: Path | None = typer.Option(
+     out: Optional[Path] = typer.Option(
          None,
          "-o",
          "--out",
@@ -717,7 +720,7 @@ def compare_last(
          "--raise",
          help="Raise HTTP exceptions instead of swallowing them",
      ),
-     out: Path | None = typer.Option(
+     out: Optional[Path] = typer.Option(
          None,
          "-o",
          "--out",
@@ -769,7 +772,7 @@ def tag_command(
          "--delete",
          help="Delete a tag",
      ),
- ):
+ ) -> None:
      """
      Assign a tag to a run or delete a tag.
      """
@@ -786,8 +789,9 @@ def tag_command(
  
      client = DeepEvalClient()
  
-     # --- delete mode ---
      if delete:
+         assert run_id is not None
+ 
          result = client.update_evaluation_testcase_tag(
              testcase_id=run_id,
              tag=delete,
@@ -796,13 +800,14 @@ def tag_command(
          typer.echo("🗑️ Tag deleted successfully")
          typer.echo(f"Tag: {delete}")
          return
- 
-     # --- assign/update mode ---
+     assert run_id is not None
+     assert tag is not None
      result = client.update_evaluation_testcase_tag(
          testcase_id=run_id,
          tag=tag,
          raise_exception=True,
      )
+     assert result is not None
  
      typer.echo("✅ Tag assigned successfully")
      typer.echo(f"Run ID: {run_id}")
rakam_eval_sdk/client.py CHANGED
@@ -1,6 +1,6 @@
  import os
  import random
- from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
  
  import requests
  
@@ -49,8 +49,8 @@ class DeepEvalClient:
          method: HTTPMethod,
          endpoint: str,
          *,
-         json: Dict | None = None,
-         params: Dict | None = None,
+         json: Optional[Dict] = None,
+         params: Optional[Dict] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]:
          url = f"{self.base_url}{endpoint}"
@@ -89,17 +89,63 @@ class DeepEvalClient:
              "raw": resp.text,
          }
  
-     def _get(self, endpoint: str, params: Dict, *args, **kw):
-         return self._request("GET", endpoint, params=params, *args, **kw)
+     def _get(
+         self,
+         endpoint: str,
+         params: Dict,
+         raise_exception: bool = False,
+         *args: Tuple,
+         **kw: Dict,
+     ) -> Optional[Dict]:
+         return self._request(
+             "GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
+         )
  
-     def _post(self, endpoint: str, payload: Dict, *args, **kw):
-         return self._request("POST", endpoint, json=payload, *args, **kw)
+     def _post(
+         self,
+         endpoint: str,
+         payload: Dict,
+         raise_exception: bool = False,
+         *args: Tuple,
+         **kw: Dict,
+     ) -> Optional[Dict]:
+         return self._request(
+             "POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
+         )
  
-     def _patch(self, endpoint: str, payload: Dict, *args, **kw):
-         return self._request("PATCH", endpoint, json=payload, *args, **kw)
+     def _patch(
+         self,
+         endpoint: str,
+         payload: Dict,
+         raise_exception: bool = False,
+         *args: Tuple,
+         **kw: Dict,
+     ) -> Optional[Dict]:
+         return self._request(
+             "PATCH",
+             endpoint,
+             json=payload,
+             raise_exception=raise_exception,
+             *args,
+             **kw,
+         )
  
-     def _delete(self, endpoint: str, payload: Dict, *args, **kw):
-         return self._request("DELETE", endpoint, json=payload, *args, **kw)
+     def _delete(
+         self,
+         endpoint: str,
+         payload: Dict,
+         raise_exception: bool = False,
+         *args: Tuple,
+         **kw: Dict,
+     ) -> Optional[Dict]:
+         return self._request(
+             "DELETE",
+             endpoint,
+             json=payload,
+             raise_exception=raise_exception,
+             *args,
+             **kw,
+         )
  
      def update_evaluation_testcase_tag(
          self,
@@ -197,10 +243,10 @@ class DeepEvalClient:
      def compare_testcases(
          self,
          *,
-         testcase_a_id: int | None = None,
-         testcase_a_tag: str | None = None,
-         testcase_b_id: int | None = None,
-         testcase_b_tag: str | None = None,
+         testcase_a_id: Optional[int] = None,
+         testcase_a_tag: Optional[str] = None,
+         testcase_b_id: Optional[int] = None,
+         testcase_b_tag: Optional[str] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]:
          """
@@ -208,14 +254,14 @@ class DeepEvalClient:
          Exactly one identifier (id or tag) must be provided per testcase.
          """
  
-         def validate(id_, tag, name: str):
+         def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
              if bool(id_) == bool(tag):
                  raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
  
          validate(testcase_a_id, testcase_a_tag, "testcase_a")
          validate(testcase_b_id, testcase_b_tag, "testcase_b")
  
-         params: dict[str, int | str] = {}
+         params: Dict[str, Union[int, str]] = {}
  
          if testcase_a_id is not None:
              params["testcase_a_id"] = testcase_a_id
@@ -284,18 +330,18 @@ class DeepEvalClient:
          data: List[TextInputItem],
          metrics: List[MetricConfig],
          component: str = "unknown",
-         label: str | None = None,
+         label: Optional[str] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]: ...
  
      def text_eval(
          self,
-         config: EvalConfig | None = None,
+         config: Optional[EvalConfig] = None,
          *,
-         data: List[TextInputItem] | None = None,
-         metrics: List[MetricConfig] | None = None,
+         data: Optional[List[TextInputItem]] = None,
+         metrics: Optional[List[MetricConfig]] = None,
          component: str = "unknown",
-         label: str | None = None,
+         label: Optional[str] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]:
          if config is None:
@@ -337,7 +383,7 @@ class DeepEvalClient:
          data: List[SchemaInputItem],
          metrics: List[SchemaMetricConfig],
          component: str = "unknown",
-         label: str | None = None,
+         label: Optional[str] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]: ...
  
@@ -351,12 +397,12 @@ class DeepEvalClient:
  
      def schema_eval(
          self,
-         config: SchemaEvalConfig | None = None,
+         config: Optional[SchemaEvalConfig] = None,
          *,
-         data: List[SchemaInputItem] | None = None,
-         metrics: List[SchemaMetricConfig] | None = None,
+         data: Optional[List[SchemaInputItem]] = None,
+         metrics: Optional[List[SchemaMetricConfig]] = None,
          component: str = "unknown",
-         label: str | None = None,
+         label: Optional[str] = None,
          raise_exception: bool = False,
      ) -> Optional[Dict]:
          if config is None:
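The helper rewrite in client.py promotes `raise_exception` from an untyped `**kw` passthrough to an explicit, defaulted keyword, and annotates the `Optional[Dict]` return, so type checkers can follow the flow from `_request` out to callers. A minimal sketch of the pattern under those assumptions (hypothetical class, URL, and endpoint, not the real client):

    from typing import Dict, Optional

    import requests

    class MiniClient:
        base_url = "https://api.example.com"  # hypothetical

        def _request(
            self,
            method: str,
            endpoint: str,
            *,
            params: Optional[Dict] = None,
            raise_exception: bool = False,
        ) -> Optional[Dict]:
            try:
                resp = requests.request(method, f"{self.base_url}{endpoint}", params=params)
                resp.raise_for_status()
                return resp.json()
            except requests.RequestException:
                if raise_exception:
                    raise
                return None  # error swallowed: callers must handle Optional

        def _get(self, endpoint: str, params: Dict, raise_exception: bool = False) -> Optional[Dict]:
            # Explicit keyword + return annotation: mypy now sees that a
            # swallowed error yields None, which is what motivates the CLI's
            # new `assert response is not None` after raise_exception=True calls.
            return self._request("GET", endpoint, params=params, raise_exception=raise_exception)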
rakam_eval_sdk/schema.py CHANGED
@@ -1,6 +1,6 @@
  # Common base class for all metric configs
  import sys
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
+ from typing import Any, Dict, List, Literal, Optional, Union
  
  # Base class (you can keep this abstract)
  from pydantic import BaseModel, Field
@@ -94,7 +94,8 @@ MetricConfig = Annotated[
  ]
  
  SchemaMetricConfig = Annotated[
-     Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(discriminator="type")
+     Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
+         discriminator="type")
  ]
  
  
@@ -107,7 +108,7 @@ class InputItem(BaseModel):
  
  class TextInputItem(InputItem):
      expected_output: Optional[str] = None
-     retrieval_context: Optional[list[str]] = None
+     retrieval_context: Optional[List[str]] = None
  
  
  class SchemaInputItem(InputItem):
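For context on the reflowed `SchemaMetricConfig`: it is a pydantic discriminated union, where `Field(discriminator="type")` picks the concrete config class by its `type` literal during validation. Note also that the first hunk drops `Annotated` from the typing import while the unions below still use it; presumably it is re-imported elsewhere (the `import sys` hints at a version-conditional import), otherwise the module would fail at import time. A self-contained sketch of the discriminator mechanism (the literal values and extra field are assumptions, pydantic v2 API):

    from typing import Annotated, Literal, Union

    from pydantic import BaseModel, Field, TypeAdapter

    class JsonCorrectnessConfig(BaseModel):
        type: Literal["json_correctness"]  # assumed literal value
        threshold: float = 0.5             # hypothetical field

    class FieldsPresenceConfig(BaseModel):
        type: Literal["fields_presence"]   # assumed literal value

    SchemaMetricConfig = Annotated[
        Union[JsonCorrectnessConfig, FieldsPresenceConfig],
        Field(discriminator="type"),
    ]

    # Validation dispatches on the "type" key to the matching class:
    cfg = TypeAdapter(SchemaMetricConfig).validate_python({"type": "fields_presence"})
    print(type(cfg).__name__)  # FieldsPresenceConfig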
rakam_eval_sdk-0.2.4rc2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.4
+ Version: 0.2.4rc2
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
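Note the direction of this diff: under PEP 440, `0.2.4rc2` is a pre-release of `0.2.4`, so the newly published wheel actually sorts before the one it replaces. A quick check with the `packaging` library:

    from packaging.version import Version

    # "rc" marks a release candidate, which PEP 440 orders before the final release.
    assert Version("0.2.4rc2") < Version("0.2.4")
    print(Version("0.2.4rc2").is_prerelease)  # True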
rakam_eval_sdk-0.2.4rc2.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ rakam_eval_sdk/cli.py,sha256=yhs-TSWrq-tCe5FtO50cqbqd66tBsAa2Tlav4ubMkrw,21931
+ rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+ rakam_eval_sdk/schema.py,sha256=S16lSC1p4KuYd9qWZ5aquK6S8AzUEDCZYc4VfI58r9k,3969
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+ rakam_eval_sdk-0.2.4rc2.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ rakam_eval_sdk-0.2.4rc2.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+ rakam_eval_sdk-0.2.4rc2.dist-info/METADATA,sha256=9uLO_02mm3GT0alrkDCMkS_HsozGc__gde5VOQObUsY,6022
+ rakam_eval_sdk-0.2.4rc2.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=IvC45LyALCCoxZ7ZNifVCbPs6UveGtUCfjr1q9e1A_Y,21658
3
- rakam_eval_sdk/client.py,sha256=7spK7P3h5uiA3Q3erOHOwf04N1_RT1pysWyt_E-cz0Q,14666
4
- rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
5
- rakam_eval_sdk/schema.py,sha256=YfZgHtPh41xFYdQTsbS0OSvNdI8yIv5zoh76v21hemo,3971
6
- rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
7
- rakam_eval_sdk-0.2.4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
8
- rakam_eval_sdk-0.2.4.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
9
- rakam_eval_sdk-0.2.4.dist-info/METADATA,sha256=XOTmtgO7NiG8NuLn4B_yJujUJONSJCvwGQ9ak3zIhv4,6019
10
- rakam_eval_sdk-0.2.4.dist-info/RECORD,,