rakam-eval-sdk 0.2.0rc2__tar.gz → 0.2.2__tar.gz

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: rakam-eval-sdk
-Version: 0.2.0rc2
+Version: 0.2.2
 Summary: Evaluation Framework SDK
 Author: Mohamed Bachar Touil
 License: MIT
@@ -94,6 +94,7 @@ client = DeepEvalClient(
 )
 
 ```
+
 3. Schema Evaluation
 
 ```python
@@ -137,6 +138,7 @@ client = DeepEvalClient(
 )
 
 ```
+
 ## Configuration
 
 The client can be configured in multiple ways:
@@ -150,7 +152,7 @@ DeepEvalClient(base_url="http://api", api_token="123")
 ### Environment variables
 
 ```bash
-export EVALFRAMWORK_URL=http://api
+export EVALFRAMEWORK_URL=http://api
 export EVALFRAMWORK_API_KEY=123
 ```
 
@@ -80,6 +80,7 @@ client = DeepEvalClient(
 )
 
 ```
+
 3. Schema Evaluation
 
 ```python
@@ -123,6 +124,7 @@ client = DeepEvalClient(
 )
 
 ```
+
 ## Configuration
 
 The client can be configured in multiple ways:
@@ -136,7 +138,7 @@ DeepEvalClient(base_url="http://api", api_token="123")
 ### Environment variables
 
 ```bash
-export EVALFRAMWORK_URL=http://api
+export EVALFRAMEWORK_URL=http://api
 export EVALFRAMWORK_API_KEY=123
 ```
 
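Only the URL variable changes spelling here; the API-key line in the README keeps its existing name. A minimal configuration sketch, assuming `DeepEvalClient` is importable from a top-level `rakam_eval_sdk` module (the import path is not shown in this diff) and using placeholder values:

```python
import os

# Hypothetical import path; the diff does not show where DeepEvalClient is exported from.
from rakam_eval_sdk import DeepEvalClient

# Explicit constructor arguments, as in the README example above.
client = DeepEvalClient(base_url="http://api", api_token="123")

# Or rely on the renamed environment variable; when no URL is supplied at all,
# the constructor falls back to http://localhost:8080 (see the client diff below).
os.environ["EVALFRAMEWORK_URL"] = "http://api"
client = DeepEvalClient(api_token="123")
```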
@@ -4,7 +4,7 @@ build-backend = "uv_build"
 
 [project]
 name = "rakam-eval-sdk"
-version = "0.2.0rc2"
+version = "0.2.2"
 description = "Evaluation Framework SDK"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -6,7 +6,7 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Optional
+from typing import Any, Dict, Optional, Sequence
 
 import typer
 from dotenv import load_dotenv
@@ -31,9 +31,63 @@ if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 list_app = typer.Typer(help="List resources")
 app.add_typer(list_app, name="list")
+metrics_app = typer.Typer(help="Metrics utilities")
+app.add_typer(metrics_app, name="metrics")
 
-@list_app.command("eval")
-def list(
+
+@metrics_app.command("list")
+def list_metrics(
+    limit: int = typer.Option(
+        20,
+        "--limit",
+        help="Number of testcases to inspect for metrics",
+    ),
+):
+    """
+    List unique metric names found in evaluation testcases.
+    """
+    client = DeepEvalClient()
+
+    testcases = client.list_evaluation_testcases(
+        limit=limit,
+        offset=0,
+        raise_exception=True,
+    )
+
+    if not testcases:
+        typer.echo("No evaluation testcases found.")
+        return
+
+    metric_names: set[str] = set()
+
+    def collect_metrics(entries: Sequence[Dict] | None):
+        if not entries:
+            return
+        for entry in entries:
+            for metric in entry.get("metrics", []) or []:
+                name = metric.get("name")
+                if name:
+                    metric_names.add(name)
+
+    for tc in testcases:
+
+        collect_metrics(tc.get("result"))
+
+    if not metric_names:
+        typer.echo("No metrics found.")
+        return
+
+    typer.echo(
+        f"📊 Found {len(metric_names)} unique metrics "
+        f"(from latest {limit} testcases)\n"
+    )
+
+    for name in sorted(metric_names):
+        typer.echo(f"- {name}")
+
+
+@list_app.command("evals")
+def list_evals(
     directory: Path = typer.Argument(
         Path("./eval"),
         exists=True,
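The nested `collect_metrics` helper above reads each testcase's `result` field as a list of entries, each carrying a `metrics` list of dicts with a `name` key. A hypothetical payload illustrating only that assumed shape (all values are invented):

```python
# Hypothetical testcase dict, shaped the way collect_metrics() reads it.
testcase = {
    "result": [
        {"metrics": [{"name": "faithfulness"}, {"name": "relevancy"}]},  # names invented
        {"metrics": []},  # entries without metrics contribute nothing
    ],
}
# collect_metrics(testcase.get("result")) would record "faithfulness" and "relevancy".
```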
@@ -66,16 +120,10 @@ def list(
         typer.echo(f"No @{TARGET_DECORATOR} functions found.")
 
 
-
-
-
 @list_app.command("runs")
 def list_runs(
     limit: int = typer.Option(20, help="Max number of runs"),
     offset: int = typer.Option(0, help="Pagination offset"),
-    status: Optional[str] = typer.Option(
-        None, help="Filter by status (running, completed, failed)"
-    ),
 ):
     """
     List evaluation runs (newest first).
@@ -92,24 +140,12 @@ def list_runs(
         typer.echo("No evaluation runs found.")
         return
 
-    # optional status filtering (client-side for now)
-    if status:
-        runs = [
-            r for r in runs
-            if r.get("result", {}).get("status") == status
-        ]
-
-    typer.echo(
-        f"[id] "
-        f"{'unique_id':<20}"
-        f"{'label':<20}"
-        f"created_at"
-    )
+    typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
     # pretty CLI output
     for run in runs:
         run_id = run.get("id")
         label = run.get("label") or "-"
-        uid = run.get("unique_id") or "-"
+        uid = run.get("tag") or "-"
         created_at = run.get("created_at")
 
         if created_at:
@@ -121,24 +157,20 @@
                 pass
 
         typer.echo(
-            f"[{run_id}] "
-            f"{uid:<20} "
-            f"{label:<20} "
-            f"{created_at}"
-        )
+            f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
 
 
-@list_app.command("show")
-def show_testcase(
+@app.command()
+def show(
     id: Optional[int] = typer.Option(
         None,
         "--id",
        help="Numeric evaluation testcase ID",
     ),
-    uid: Optional[str] = typer.Option(
+    tag: Optional[str] = typer.Option(
        None,
-        "--uid",
-        help="Evaluation testcase unique_id",
+        "--tag",
+        help="Evaluation testcase tag",
     ),
     raw: bool = typer.Option(
         False,
@@ -147,12 +179,12 @@ def show_testcase(
     ),
 ):
     """
-    Show a single evaluation testcase by ID or unique_id.
+    Show a single evaluation testcase by ID or tag.
     """
-    if not id and not uid:
+    if not id and not tag:
         raise typer.BadParameter("You must provide either --id or --uid")
 
-    if id and uid:
+    if id and tag:
         raise typer.BadParameter("Provide only one of --id or --uid")
 
     client = DeepEvalClient()
@@ -161,8 +193,8 @@ def show_testcase(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
-        result = client.get_evaluation_testcase_by_unique_id(uid)
-        identifier = f"unique_id={uid}"
+        result = client.get_evaluation_testcase_by_tag(tag)
+        identifier = f"tag={tag}"
 
     if not result:
         console.print(
@@ -358,7 +390,7 @@ def _print_and_save(
 
 
 @app.command()
-def compare_testcases(
+def compare(
     testcase_a_id: int = typer.Argument(
         ...,
         help="ID of the first testcase",
@@ -519,6 +551,45 @@ def compare_last(
     _print_and_save(resp, pretty, out, overwrite)
 
 
+@list_app.command("tag")
+def update_run_tag(
+    run_id: int = typer.Argument(..., help="Evaluation run ID"),
+    tag: Optional[str] = typer.Option(
+        None,
+        "--tag",
+        "-t",
+        help="Tag to add or update",
+    ),
+    remove: bool = typer.Option(
+        False,
+        "--remove",
+        help="Remove tag from the run",
+    ),
+):
+    """
+    Add, update, or remove a tag from an evaluation run.
+    """
+    if not tag and not remove:
+        typer.echo("❌ You must provide --tag or --remove")
+        raise typer.Exit(code=1)
+
+    if tag and remove:
+        typer.echo("❌ Use either --tag or --remove, not both")
+        raise typer.Exit(code=1)
+
+    client = DeepEvalClient()
+
+    result = client.update_evaluation_testcase_tag(
+        testcase_id=run_id,
+        tag=None if remove else tag,
+        raise_exception=True,
+    )
+
+    action = "removed" if remove else "updated"
+    typer.echo(f"✅ Tag {action} successfully")
+    typer.echo(f"Run ID: {run_id}")
+    typer.echo(f"Tag: {result.get('tag') or '-'}")
+
 def main() -> None:
     app()
 
@@ -1,3 +1,4 @@
+from typing import Optional, Dict
 import os
 import random
 from typing import Any, Dict, List, Optional, Union, cast, overload
@@ -12,6 +13,9 @@ from .schema import (
     SchemaMetricConfig,
     TextInputItem,
 )
+from typing import Optional, Literal, cast
+
+HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]
 
 
 class DeepEvalClient:
@@ -27,71 +31,47 @@ class DeepEvalClient:
         settings_module: Optional[Any] = None,  # optional external settings
         timeout: int = 30,
     ):
-        settings_url = getattr(settings_module, "EVALFRAMWORK_URL", None)
+        settings_url = getattr(settings_module, "EVALFRAMEWORK_URL", None)
         settings_token = getattr(settings_module, "EVALFRAMWORK_API_KEY", None)
 
         raw_url = (
             base_url
             or settings_url
-            or os.getenv("EVALFRAMWORK_URL")
+            or os.getenv("EVALFRAMEWORK_URL")
             or "http://localhost:8080"
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
             api_token or settings_token or os.getenv(
-                "EVALFRAMWORK_API_KEY", "")
+                "EVALFRAMEWORK_API_KEY", "")
         )
         self.timeout = timeout
 
     def _request(
         self,
+        method: HTTPMethod,
         endpoint: str,
-        payload: dict,
+        *,
+        json: dict | None = None,
+        params: dict | None = None,
         raise_exception: bool = False,
     ) -> Optional[dict]:
-        """Internal helper to send POST requests with standard headers and error handling."""
         url = f"{self.base_url}{endpoint}"
-        headers = {
-            "accept": "application/json",
-            "Content-Type": "application/json",
-            "X-API-Token": self.api_token,
-        }
-
-        try:
-            resp = requests.post(
-                url, headers=headers, json=payload, timeout=self.timeout
-            )
-            if raise_exception:
-                resp.raise_for_status()
-        except requests.RequestException as e:
-            if raise_exception:
-                raise
-            return {"error": str(e)}
-
-        try:
-            return cast(dict, resp.json())
-        except ValueError:
-            if raise_exception:
-                raise
-            return {"error": "Invalid JSON response", "raw": resp.text}
 
-    def _get(
-        self,
-        endpoint: str,
-        params: dict,
-        raise_exception: bool = False,
-    ) -> Optional[dict]:
-        """Internal helper to send GET requests with standard headers and error handling."""
-        url = f"{self.base_url}{endpoint}"
         headers = {
             "accept": "application/json",
             "X-API-Token": self.api_token,
         }
 
+        if json is not None:
+            headers["Content-Type"] = "application/json"
+
         try:
-            resp = requests.get(
-                url,
+            resp = requests.request(
+                method=method,
+                url=url,
                 headers=headers,
+                json=json,
                 params=params,
                 timeout=self.timeout,
             )
@@ -107,7 +87,38 @@ class DeepEvalClient:
         except ValueError:
             if raise_exception:
                 raise
-            return {"error": "Invalid JSON response", "raw": resp.text}
+            return {
+                "error": "Invalid JSON response",
+                "raw": resp.text,
+            }
+
+    def _get(self, endpoint: str, params: dict, **kw):
+        return self._request("GET", endpoint, params=params, **kw)
+
+    def _post(self, endpoint: str, payload: dict, **kw):
+        return self._request("POST", endpoint, json=payload, **kw)
+
+    def _patch(self, endpoint: str, payload: dict, **kw):
+        return self._request("PATCH", endpoint, json=payload, **kw)
+
+    def update_evaluation_testcase_tag(
+        self,
+        *,
+        testcase_id: int,
+        tag: Optional[str],
+        raise_exception: bool = False,
+    ) -> Optional[Dict]:
+        """
+        Add, update, or remove a tag from an evaluation testcase.
+
+        - tag="smoke" → add / update tag
+        - tag=None → remove tag
+        """
+        return self._patch(
+            f"/evaluation-testcases/{testcase_id}/tag",
+            payload={"tag": tag},
+            raise_exception=raise_exception,
+        )
 
     def list_evaluation_testcases(
         self,
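A usage sketch for the new tagging API, restricted to what this diff shows (the `update_evaluation_testcase_tag` method and its PATCH route); the import path and the id/tag values are placeholders:

```python
# Hypothetical import path; the diff does not show the package's public module layout.
from rakam_eval_sdk import DeepEvalClient

client = DeepEvalClient(base_url="http://api", api_token="123")

# Add or update a tag on testcase 42 (placeholder id).
client.update_evaluation_testcase_tag(testcase_id=42, tag="smoke")

# Passing tag=None issues the same PATCH with a null tag, i.e. removes it.
client.update_evaluation_testcase_tag(testcase_id=42, tag=None, raise_exception=True)
```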
@@ -144,17 +155,17 @@ class DeepEvalClient:
             raise_exception=raise_exception,
         )
 
-    def get_evaluation_testcase_by_unique_id(
+    def get_evaluation_testcase_by_tag(
         self,
-        unique_id: str,
+        tag: str,
         *,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         """
-        Fetch a single evaluation testcase by unique_id.
+        Fetch a single evaluation testcase by tag.
         """
         return self._get(
-            f"/eval-framework/deepeval/uid/{unique_id}",
+            f"/eval-framework/deepeval/tag/{tag}",
             params={},
             raise_exception=raise_exception,
         )
@@ -163,18 +174,18 @@ class DeepEvalClient:
         self,
         *,
         id: Optional[int] = None,
-        unique_id: Optional[str] = None,
+        tag: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         if id is not None:
             return self.get_evaluation_testcase_by_id(
                 id, raise_exception=raise_exception
             )
-        if unique_id is not None:
-            return self.get_evaluation_testcase_by_unique_id(
-                unique_id, raise_exception=raise_exception
+        if tag is not None:
+            return self.get_evaluation_testcase_by_tag(
+                tag, raise_exception=raise_exception
             )
-        raise ValueError("Either id or unique_id must be provided")
+        raise ValueError("Either id or tag must be provided")
 
     def compare_testcases(
         self,
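For the renamed lookup methods, a short sketch using only the method names and routes visible in this diff; the import path and values are placeholders:

```python
from rakam_eval_sdk import DeepEvalClient  # hypothetical import path

client = DeepEvalClient(base_url="http://api", api_token="123")

# Fetch by numeric id.
tc_by_id = client.get_evaluation_testcase_by_id(42, raise_exception=True)

# Fetch by the new tag identifier (GET /eval-framework/deepeval/tag/{tag}).
tc_by_tag = client.get_evaluation_testcase_by_tag("smoke")
```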
@@ -268,7 +279,7 @@ class DeepEvalClient:
             label=label,
         )
 
-        return self._request(
+        return self._post(
             "/deepeval/text-eval", config.model_dump(), raise_exception
         )
 
@@ -284,7 +295,7 @@ class DeepEvalClient:
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._request("/deepeval/text-eval/background", payload, raise_exception)
+        return self._post("/deepeval/text-eval/background", payload, raise_exception)
 
     @overload
     def schema_eval(
@@ -328,7 +339,7 @@ class DeepEvalClient:
             label=label,
         )
 
-        return self._request(
+        return self._post(
             "/deepeval/schema-eval",
             config.model_dump(),
             raise_exception,
@@ -346,7 +357,7 @@ class DeepEvalClient:
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self._request(
+        return self._post(
             "/deepeval/schema-eval/background", payload, raise_exception
         )
 
@@ -94,8 +94,7 @@ MetricConfig = Annotated[
 ]
 
 SchemaMetricConfig = Annotated[
-    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
-        discriminator="type")
+    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(discriminator="type")
 ]
 
 
@@ -118,7 +117,6 @@ class SchemaInputItem(InputItem):
 
 class EvalConfig(BaseModel):
     __eval_config__ = "text_eval"
-    unique_id: Union[str, None] = None
     component: str = "unknown"
     label: Union[str, None] = None
     data: List[TextInputItem]
@@ -128,7 +126,6 @@ class EvalConfig(BaseModel):
 class SchemaEvalConfig(BaseModel):
     __eval_config__ = "schema_eval"
     component: str = "unknown"
-    unique_id: Union[str, None] = None
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)