rakam-eval-sdk 0.1.16rc1__py3-none-any.whl → 0.2.0rc1__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
rakam_eval_sdk/cli.py CHANGED
@@ -1,15 +1,38 @@
1
1
  # cli.py
2
+ import json
3
+ import os
4
+ import sys
5
+ import uuid
6
+ from datetime import datetime
2
7
  from pathlib import Path
8
+ from pprint import pprint
9
+ from typing import Any, Optional
3
10
 
4
11
  import typer
12
+ from dotenv import load_dotenv
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+ from rich.pretty import Pretty
5
16
 
6
- from rakam_eval_sdk.utils.decorator_utils import find_decorated_functions, load_module_from_path
17
+ from rakam_eval_sdk.client import DeepEvalClient
7
18
  from rakam_eval_sdk.decorators import eval_run
19
+ from rakam_eval_sdk.utils.decorator_utils import (
20
+ find_decorated_functions,
21
+ load_module_from_path,
22
+ )
23
+
24
+ load_dotenv()
8
25
  app = typer.Typer(help="CLI tools for evaluation utilities")
26
+ console = Console()
27
+
28
+ # add root of the project to sys.path
29
+ PROJECT_ROOT = os.path.abspath(".")
30
+ if PROJECT_ROOT not in sys.path:
31
+ sys.path.insert(0, PROJECT_ROOT)
9
32
 
10
33
 
11
34
  @app.command()
12
- def find_eval_run_by_name(
35
+ def list(
13
36
  directory: Path = typer.Argument(
14
37
  Path("./eval"),
15
38
  exists=True,
@@ -23,16 +46,12 @@ def find_eval_run_by_name(
23
46
  "-r",
24
47
  help="Recursively search for Python files",
25
48
  ),
26
- ):
49
+ ) -> None:
27
50
  """
28
51
  Find functions decorated with @track.
29
52
  """
30
53
  TARGET_DECORATOR = eval_run.__name__
31
- files = (
32
- directory.rglob("*.py")
33
- if recursive
34
- else directory.glob("*.py")
35
- )
54
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
36
55
 
37
56
  found = False
38
57
 
@@ -46,8 +65,157 @@ def find_eval_run_by_name(
46
65
  typer.echo(f"No @{TARGET_DECORATOR} functions found.")
47
66
 
48
67
 
49
- @app.command("run")
50
- def run_eval_runs(
68
+ list_app = typer.Typer(help="List resources")
69
+ app.add_typer(list_app, name="list")
70
+
71
+
72
+ @list_app.command("runs")
73
+ def list_runs(
74
+ limit: int = typer.Option(20, help="Max number of runs"),
75
+ offset: int = typer.Option(0, help="Pagination offset"),
76
+ status: Optional[str] = typer.Option(
77
+ None, help="Filter by status (running, completed, failed)"
78
+ ),
79
+ ):
80
+ """
81
+ List evaluation runs (newest first).
82
+ """
83
+ client = DeepEvalClient()
84
+
85
+ runs = client.list_evaluation_testcases(
86
+ limit=limit,
87
+ offset=offset,
88
+ raise_exception=True,
89
+ )
90
+
91
+ if not runs:
92
+ typer.echo("No evaluation runs found.")
93
+ return
94
+
95
+ # optional status filtering (client-side for now)
96
+ if status:
97
+ runs = [
98
+ r for r in runs
99
+ if r.get("result", {}).get("status") == status
100
+ ]
101
+
102
+ typer.echo(
103
+ f"[id] "
104
+ f"{'unique_id':<20}"
105
+ f"{'label':<20}"
106
+ f"created_at"
107
+ )
108
+ # pretty CLI output
109
+ for run in runs:
110
+ run_id = run.get("id")
111
+ label = run.get("label") or "-"
112
+ uid = run.get("unique_id") or "-"
113
+ created_at = run.get("created_at")
114
+
115
+ if created_at:
116
+ try:
117
+ created_at = datetime.fromisoformat(created_at).strftime(
118
+ "%Y-%m-%d %H:%M:%S"
119
+ )
120
+ except ValueError:
121
+ pass
122
+
123
+ typer.echo(
124
+ f"[{run_id}] "
125
+ f"{uid:<20} "
126
+ f"{label:<20} "
127
+ f"{created_at}"
128
+ )
129
+
130
+
131
+ @list_app.command("show")
132
+ def show_testcase(
133
+ id: Optional[int] = typer.Option(
134
+ None,
135
+ "--id",
136
+ help="Numeric evaluation testcase ID",
137
+ ),
138
+ uid: Optional[str] = typer.Option(
139
+ None,
140
+ "--uid",
141
+ help="Evaluation testcase unique_id",
142
+ ),
143
+ raw: bool = typer.Option(
144
+ False,
145
+ "--raw",
146
+ help="Print raw JSON instead of formatted output",
147
+ ),
148
+ ):
149
+ """
150
+ Show a single evaluation testcase by ID or unique_id.
151
+ """
152
+ if not id and not uid:
153
+ raise typer.BadParameter("You must provide either --id or --uid")
154
+
155
+ if id and uid:
156
+ raise typer.BadParameter("Provide only one of --id or --uid")
157
+
158
+ client = DeepEvalClient()
159
+
160
+ if id:
161
+ result = client.get_evaluation_testcase_by_id(id)
162
+ identifier = f"id={id}"
163
+ else:
164
+ result = client.get_evaluation_testcase_by_unique_id(uid)
165
+ identifier = f"unique_id={uid}"
166
+
167
+ if not result:
168
+ console.print(
169
+ Panel(
170
+ f"No response received for {identifier}",
171
+ title="Error",
172
+ style="red",
173
+ )
174
+ )
175
+ raise typer.Exit(code=1)
176
+
177
+ if isinstance(result, dict) and result.get("error"):
178
+ console.print(
179
+ Panel(
180
+ result["error"],
181
+ title="Error",
182
+ style="red",
183
+ )
184
+ )
185
+ raise typer.Exit(code=1)
186
+
187
+ if raw:
188
+ console.print(Pretty(result))
189
+ raise typer.Exit()
190
+
191
+ console.print(
192
+ Panel.fit(
193
+ Pretty(result),
194
+ title="Evaluation TestCase",
195
+ subtitle=identifier,
196
+ )
197
+ )
198
+
199
+
200
+ def validate_eval_result(result: Any, fn_name: str) -> str:
201
+ eval_config = getattr(result, "__eval_config__", None)
202
+
203
+ if not isinstance(eval_config, str):
204
+ expected = "EvalConfig or SchemaEvalConfig"
205
+ actual = type(result).__name__
206
+
207
+ typer.echo(
208
+ f" ❌ Invalid return type from `{fn_name}`\n"
209
+ f" Expected: {expected}\n"
210
+ f" Got: {actual}"
211
+ )
212
+ return ""
213
+
214
+ return eval_config
215
+
216
+
217
+ @app.command()
218
+ def run(
51
219
  directory: Path = typer.Argument(
52
220
  Path("./eval"),
53
221
  exists=True,
@@ -66,19 +234,28 @@ def run_eval_runs(
66
234
  "--dry-run",
67
235
  help="Only list functions without executing them",
68
236
  ),
69
- ):
237
+ save_runs: bool = typer.Option(
238
+ False,
239
+ "--save-runs",
240
+ help="Save each evaluation run result to a JSON file",
241
+ ),
242
+ output_dir: Path = typer.Option(
243
+ Path("./eval_runs"),
244
+ "--output-dir",
245
+ help="Directory where run results are saved",
246
+ ),
247
+ ) -> None:
70
248
  """
71
249
  Find and execute all functions decorated with @eval_run.
72
250
  """
73
- files = (
74
- directory.rglob("*.py")
75
- if recursive
76
- else directory.glob("*.py")
77
- )
251
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
78
252
  TARGET_DECORATOR = eval_run.__name__
79
253
 
80
254
  executed_any = False
81
255
 
256
+ if save_runs and not dry_run:
257
+ output_dir.mkdir(parents=True, exist_ok=True)
258
+
82
259
  for file in sorted(files):
83
260
  functions = find_decorated_functions(file, TARGET_DECORATOR)
84
261
  if not functions:
@@ -102,8 +279,49 @@ def run_eval_runs(
102
279
 
103
280
  try:
104
281
  func = getattr(module, fn_name)
105
- func() # <-- actual execution
282
+ result = func()
283
+
284
+ eval_type = validate_eval_result(result, fn_name)
285
+ if not eval_type:
286
+ continue
287
+
288
+ client = DeepEvalClient()
289
+
290
+ if eval_type == "text_eval":
291
+ resp = client.text_eval(config=result)
292
+ else:
293
+ resp = client.schema_eval(config=result)
294
+
295
+ typer.echo(f"{resp}")
106
296
  executed_any = True
297
+ typer.echo(f" ✅ Returned {type(result).__name__}")
298
+
299
+ if save_runs:
300
+ run_id = (
301
+ resp["id"]
302
+ if resp is not None and "id" in resp
303
+ else uuid.uuid4().hex[:8]
304
+ )
305
+
306
+ output_path = output_dir / f"run_{fn_name}_{run_id}.json"
307
+
308
+ def to_json_safe(obj: Any) -> Any:
309
+ if hasattr(obj, "model_dump"):
310
+ return obj.model_dump()
311
+ if hasattr(obj, "dict"):
312
+ return obj.dict()
313
+ return obj
314
+
315
+ with output_path.open("w", encoding="utf-8") as f:
316
+ json.dump(
317
+ to_json_safe(resp),
318
+ f,
319
+ indent=2,
320
+ ensure_ascii=False,
321
+ )
322
+
323
+ typer.echo(f" 💾 Saved run → {output_path}")
324
+
107
325
  except Exception as e:
108
326
  typer.echo(f" ❌ Execution failed: {e}")
109
327
 
@@ -111,7 +329,197 @@ def run_eval_runs(
111
329
  typer.echo("\nNo @eval_run functions executed.")
112
330
 
113
331
 
114
- def main():
332
+ def _print_and_save(
333
+ resp: dict,
334
+ pretty: bool,
335
+ out: Path | None,
336
+ overwrite: bool,
337
+ ) -> None:
338
+ if pretty:
339
+ typer.echo(typer.style("📊 Result:", bold=True))
340
+ pprint(resp)
341
+ else:
342
+ typer.echo(resp)
343
+
344
+ if out is None:
345
+ return
346
+
347
+ if out.exists() and not overwrite:
348
+ typer.echo(
349
+ f"❌ File already exists: {out} (use --overwrite to replace)")
350
+ raise typer.Exit(code=1)
351
+
352
+ out.parent.mkdir(parents=True, exist_ok=True)
353
+
354
+ with out.open("w", encoding="utf-8") as f:
355
+ json.dump(resp, f, indent=2, ensure_ascii=False)
356
+
357
+ typer.echo(f"💾 Result saved to {out}")
358
+
359
+
360
+ @app.command()
361
+ def compare_testcases(
362
+ testcase_a_id: int = typer.Argument(
363
+ ...,
364
+ help="ID of the first testcase",
365
+ ),
366
+ testcase_b_id: int = typer.Argument(
367
+ ...,
368
+ help="ID of the second testcase",
369
+ ),
370
+ pretty: bool = typer.Option(
371
+ True,
372
+ "--pretty/--raw",
373
+ help="Pretty-print the response",
374
+ ),
375
+ raise_exception: bool = typer.Option(
376
+ False,
377
+ "--raise",
378
+ help="Raise HTTP exceptions instead of swallowing them",
379
+ ),
380
+ out: Path | None = typer.Option(
381
+ None,
382
+ "-o",
383
+ "--out",
384
+ help="Optional file path to save the result as JSON",
385
+ ),
386
+ overwrite: bool = typer.Option(
387
+ False,
388
+ "--overwrite",
389
+ help="Overwrite output file if it already exists",
390
+ ),
391
+ ) -> None:
392
+ """
393
+ Compare two DeepEval evaluation testcases.
394
+ """
395
+ client = DeepEvalClient()
396
+
397
+ typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")
398
+
399
+ try:
400
+ resp = client.compare_testcases(
401
+ testcase_a_id=testcase_a_id,
402
+ testcase_b_id=testcase_b_id,
403
+ raise_exception=raise_exception,
404
+ )
405
+ except Exception as e:
406
+ typer.echo(f"❌ Request failed: {e}")
407
+ raise typer.Exit(code=1)
408
+
409
+ if not resp:
410
+ typer.echo("⚠️ No response received")
411
+ raise typer.Exit(code=1)
412
+ _print_and_save(resp, pretty, out, overwrite)
413
+
414
+
415
+ @app.command()
416
+ def compare_label_latest(
417
+ label_a: str = typer.Argument(
418
+ ...,
419
+ help="First label (latest run will be used)",
420
+ ),
421
+ label_b: str = typer.Argument(
422
+ ...,
423
+ help="Second label (latest run will be used)",
424
+ ),
425
+ pretty: bool = typer.Option(
426
+ True,
427
+ "--pretty/--raw",
428
+ help="Pretty-print the response",
429
+ ),
430
+ raise_exception: bool = typer.Option(
431
+ False,
432
+ "--raise",
433
+ help="Raise HTTP exceptions instead of swallowing them",
434
+ ),
435
+ out: Path | None = typer.Option(
436
+ None,
437
+ "-o",
438
+ "--out",
439
+ help="Optional file path to save the result as JSON",
440
+ ),
441
+ overwrite: bool = typer.Option(
442
+ False,
443
+ "--overwrite",
444
+ help="Overwrite output file if it already exists",
445
+ ),
446
+ ) -> None:
447
+ """
448
+ Compare the latest evaluation runs for two labels.
449
+ """
450
+ client = DeepEvalClient()
451
+
452
+ typer.echo(f"🔍 Comparing latest runs: '{label_a}' ↔ '{label_b}'")
453
+
454
+ try:
455
+ resp = client.compare_latest_by_labels(
456
+ label_a=label_a,
457
+ label_b=label_b,
458
+ raise_exception=raise_exception,
459
+ )
460
+ except Exception as e:
461
+ typer.echo(f"❌ Request failed: {e}")
462
+ raise typer.Exit(code=1)
463
+
464
+ if not resp:
465
+ typer.echo("⚠️ No response received")
466
+ raise typer.Exit(code=1)
467
+
468
+ _print_and_save(resp, pretty, out, overwrite)
469
+
470
+
471
+ @app.command()
472
+ def compare_last(
473
+ label: str = typer.Argument(
474
+ ...,
475
+ help="Label whose last two runs will be compared",
476
+ ),
477
+ pretty: bool = typer.Option(
478
+ True,
479
+ "--pretty/--raw",
480
+ help="Pretty-print the response",
481
+ ),
482
+ raise_exception: bool = typer.Option(
483
+ False,
484
+ "--raise",
485
+ help="Raise HTTP exceptions instead of swallowing them",
486
+ ),
487
+ out: Path | None = typer.Option(
488
+ None,
489
+ "-o",
490
+ "--out",
491
+ help="Optional file path to save the result as JSON",
492
+ ),
493
+ overwrite: bool = typer.Option(
494
+ False,
495
+ "--overwrite",
496
+ help="Overwrite output file if it already exists",
497
+ ),
498
+ ) -> None:
499
+ """
500
+ Compare the last two evaluation runs of a label.
501
+ """
502
+ client = DeepEvalClient()
503
+
504
+ typer.echo(f"🔍 Comparing last two runs for label '{label}'")
505
+
506
+ try:
507
+ resp = client.compare_last_two_by_label(
508
+ label=label,
509
+ raise_exception=raise_exception,
510
+ )
511
+ except Exception as e:
512
+ typer.echo(f"❌ Request failed: {e}")
513
+ raise typer.Exit(code=1)
514
+
515
+ if not resp:
516
+ typer.echo("⚠️ No response received")
517
+ raise typer.Exit(code=1)
518
+
519
+ _print_and_save(resp, pretty, out, overwrite)
520
+
521
+
522
+ def main() -> None:
115
523
  app()
116
524
 
117
525
 
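
The new CLI surface adds a run command that uploads results, a list group for browsing stored testcases, and three compare commands. The console-script name is not part of this diff (it lives in entry_points.txt), so the sketch below drives the Typer app in-process with typer.testing instead; testcase IDs, labels and paths are placeholders, and the backend URL plus API token are assumed to be configured via the environment/.env that load_dotenv() reads. The sketch sticks to the run and compare commands, since cli.py registers both a list command and a list sub-group under the same name and leaves that collision to Typer.

    from typer.testing import CliRunner

    from rakam_eval_sdk.cli import app

    runner = CliRunner()

    # Discover @eval_run functions under ./eval, execute them, submit the returned
    # configs to the backend, and save each response as JSON under ./eval_runs.
    print(runner.invoke(app, ["run", "./eval", "--save-runs"]).output)

    # Compare two stored testcases by ID and write the result to a file.
    print(
        runner.invoke(
            app,
            ["compare-testcases", "1", "2", "--out", "compare.json", "--overwrite"],
        ).output
    )

    # Compare the last two runs recorded under one label, printed as raw output.
    print(runner.invoke(app, ["compare-last", "baseline", "--raw"]).output)
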
rakam_eval_sdk/client.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import random
3
- from typing import Any, List, Optional, Union, cast
3
+ from typing import Any, Dict, List, Optional, Union, cast, overload
4
4
 
5
5
  import requests
6
6
 
@@ -38,7 +38,8 @@ class DeepEvalClient:
38
38
  )
39
39
  self.base_url = raw_url.rstrip("/")
40
40
  self.api_token = (
41
- api_token or settings_token or os.getenv("EVALFRAMWORK_API_KEY", "")
41
+ api_token or settings_token or os.getenv(
42
+ "EVALFRAMWORK_API_KEY", "")
42
43
  )
43
44
  self.timeout = timeout
44
45
 
@@ -74,19 +75,202 @@ class DeepEvalClient:
74
75
  raise
75
76
  return {"error": "Invalid JSON response", "raw": resp.text}
76
77
 
78
+ def _get(
79
+ self,
80
+ endpoint: str,
81
+ params: dict,
82
+ raise_exception: bool = False,
83
+ ) -> Optional[dict]:
84
+ """Internal helper to send GET requests with standard headers and error handling."""
85
+ url = f"{self.base_url}{endpoint}"
86
+ headers = {
87
+ "accept": "application/json",
88
+ "X-API-Token": self.api_token,
89
+ }
90
+
91
+ try:
92
+ resp = requests.get(
93
+ url,
94
+ headers=headers,
95
+ params=params,
96
+ timeout=self.timeout,
97
+ )
98
+ if raise_exception:
99
+ resp.raise_for_status()
100
+ except requests.RequestException as e:
101
+ if raise_exception:
102
+ raise
103
+ return {"error": str(e)}
104
+
105
+ try:
106
+ return cast(dict, resp.json())
107
+ except ValueError:
108
+ if raise_exception:
109
+ raise
110
+ return {"error": "Invalid JSON response", "raw": resp.text}
111
+
112
+ def list_evaluation_testcases(
113
+ self,
114
+ *,
115
+ limit: int = 10,
116
+ offset: int = 0,
117
+ raise_exception: bool = False,
118
+ ) -> Optional[List[Dict]]:
119
+ """
120
+ List evaluation testcases for the current API token only.
121
+ Sorted by created_at DESC (newest first).
122
+ """
123
+ return self._get(
124
+ "/eval-framework/deepeval/evaluation-testcases/token",
125
+ params={
126
+ "limit": limit,
127
+ "offset": offset,
128
+ },
129
+ raise_exception=raise_exception,
130
+ )
131
+
132
+ def get_evaluation_testcase_by_id(
133
+ self,
134
+ testcase_id: int,
135
+ *,
136
+ raise_exception: bool = False,
137
+ ) -> Optional[Dict]:
138
+ """
139
+ Fetch a single evaluation testcase by numeric ID.
140
+ """
141
+ return self._get(
142
+ f"/eval-framework/deepeval/id/{testcase_id}",
143
+ params={},
144
+ raise_exception=raise_exception,
145
+ )
146
+
147
+ def get_evaluation_testcase_by_unique_id(
148
+ self,
149
+ unique_id: str,
150
+ *,
151
+ raise_exception: bool = False,
152
+ ) -> Optional[Dict]:
153
+ """
154
+ Fetch a single evaluation testcase by unique_id.
155
+ """
156
+ return self._get(
157
+ f"/eval-framework/deepeval/uid/{unique_id}",
158
+ params={},
159
+ raise_exception=raise_exception,
160
+ )
161
+
162
+ def get_evaluation_testcase(
163
+ self,
164
+ *,
165
+ id: Optional[int] = None,
166
+ unique_id: Optional[str] = None,
167
+ raise_exception: bool = False,
168
+ ) -> Optional[Dict]:
169
+ if id is not None:
170
+ return self.get_evaluation_testcase_by_id(
171
+ id, raise_exception=raise_exception
172
+ )
173
+ if unique_id is not None:
174
+ return self.get_evaluation_testcase_by_unique_id(
175
+ unique_id, raise_exception=raise_exception
176
+ )
177
+ raise ValueError("Either id or unique_id must be provided")
178
+
179
+ def compare_testcases(
180
+ self,
181
+ *,
182
+ testcase_a_id: int,
183
+ testcase_b_id: int,
184
+ raise_exception: bool = False,
185
+ ) -> Optional[dict]:
186
+ """
187
+ Compare two evaluation testcases.
188
+ """
189
+ return self._get(
190
+ "/eval-framework/deepeval/evaluation-testcases/compare",
191
+ params={
192
+ "testcase_a_id": testcase_a_id,
193
+ "testcase_b_id": testcase_b_id,
194
+ },
195
+ raise_exception=raise_exception,
196
+ )
197
+
198
+ def compare_latest_by_labels(
199
+ self,
200
+ *,
201
+ label_a: str,
202
+ label_b: str,
203
+ raise_exception: bool = False,
204
+ ) -> Optional[dict]:
205
+ """
206
+ Compare the latest evaluation testcases for two labels.
207
+ """
208
+ return self._get(
209
+ "/eval-framework/deepeval/evaluation-testcases/compare-latest",
210
+ params={
211
+ "label_a": label_a,
212
+ "label_b": label_b,
213
+ },
214
+ raise_exception=raise_exception,
215
+ )
216
+
217
+ def compare_last_two_by_label(
218
+ self,
219
+ *,
220
+ label: str,
221
+ raise_exception: bool = False,
222
+ ) -> Optional[dict]:
223
+ """
224
+ Compare the last two evaluation testcases for a given label.
225
+ """
226
+ return self._get(
227
+ "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
228
+ params={
229
+ "label": label,
230
+ },
231
+ raise_exception=raise_exception,
232
+ )
233
+
234
+ @overload
235
+ def text_eval(
236
+ self,
237
+ config: EvalConfig,
238
+ *,
239
+ raise_exception: bool = False,
240
+ ) -> Optional[dict]: ...
241
+
242
+ @overload
77
243
  def text_eval(
78
244
  self,
245
+ *,
79
246
  data: List[TextInputItem],
80
247
  metrics: List[MetricConfig],
248
+ component: str = "unknown",
249
+ label: str | None = None,
81
250
  raise_exception: bool = False,
251
+ ) -> Optional[dict]: ...
252
+
253
+ def text_eval(
254
+ self,
255
+ config: EvalConfig | None = None,
256
+ *,
257
+ data: List[TextInputItem] | None = None,
258
+ metrics: List[MetricConfig] | None = None,
82
259
  component: str = "unknown",
83
- version: Union[str, None] = None,
260
+ label: str | None = None,
261
+ raise_exception: bool = False,
84
262
  ) -> Optional[dict]:
85
- """Run synchronous text evaluation."""
86
- payload = EvalConfig.model_construct(
87
- data=data, metrics=metrics, component=component, version=version
88
- ).model_dump()
89
- return self._request("/deepeval/text-eval", payload, raise_exception)
263
+ if config is None:
264
+ config = EvalConfig(
265
+ data=data,
266
+ metrics=metrics,
267
+ component=component,
268
+ label=label,
269
+ )
270
+
271
+ return self._request(
272
+ "/deepeval/text-eval", config.model_dump(), raise_exception
273
+ )
90
274
 
91
275
  def text_eval_background(
92
276
  self,
@@ -94,27 +278,61 @@ class DeepEvalClient:
94
278
  metrics: List[MetricConfig],
95
279
  raise_exception: bool = False,
96
280
  component: str = "unknown",
97
- version: Union[str, None] = None,
281
+ label: Union[str, None] = None,
98
282
  ) -> Optional[dict]:
99
283
  """Run background text evaluation (async job)."""
100
284
  payload = EvalConfig.model_construct(
101
- data=data, metrics=metrics, component=component, version=version
285
+ data=data, metrics=metrics, component=component, version=label
102
286
  ).model_dump()
103
287
  return self._request("/deepeval/text-eval/background", payload, raise_exception)
104
288
 
289
+ @overload
105
290
  def schema_eval(
106
291
  self,
292
+ *,
107
293
  data: List[SchemaInputItem],
108
294
  metrics: List[SchemaMetricConfig],
295
+ component: str = "unknown",
296
+ label: str | None = None,
297
+ raise_exception: bool = False,
298
+ ) -> Optional[dict]: ...
299
+
300
+ @overload
301
+ def schema_eval(
302
+ self,
303
+ config: SchemaEvalConfig,
304
+ *,
109
305
  raise_exception: bool = False,
306
+ ) -> Optional[dict]: ...
307
+
308
+ def schema_eval(
309
+ self,
310
+ config: SchemaEvalConfig | None = None,
311
+ *,
312
+ data: List[SchemaInputItem] | None = None,
313
+ metrics: List[SchemaMetricConfig] | None = None,
110
314
  component: str = "unknown",
111
- version: Union[str, None] = None,
315
+ label: str | None = None,
316
+ raise_exception: bool = False,
112
317
  ) -> Optional[dict]:
113
- """Run synchronous schema evaluation."""
114
- payload = SchemaEvalConfig.model_construct(
115
- data=data, metrics=metrics, component=component, version=version
116
- ).model_dump()
117
- return self._request("/deepeval/schema-eval", payload, raise_exception)
318
+ if config is None:
319
+ if data is None or metrics is None:
320
+ raise ValueError(
321
+ "Either `config` or both `data` and `metrics` must be provided"
322
+ )
323
+
324
+ config = SchemaEvalConfig(
325
+ data=data,
326
+ metrics=metrics,
327
+ component=component,
328
+ label=label,
329
+ )
330
+
331
+ return self._request(
332
+ "/deepeval/schema-eval",
333
+ config.model_dump(),
334
+ raise_exception,
335
+ )
118
336
 
119
337
  def schema_eval_background(
120
338
  self,
@@ -122,11 +340,11 @@ class DeepEvalClient:
122
340
  metrics: List[SchemaMetricConfig],
123
341
  raise_exception: bool = False,
124
342
  component: str = "unknown",
125
- version: Union[str, None] = None,
343
+ label: Union[str, None] = None,
126
344
  ) -> Optional[dict]:
127
345
  """Run background schema evaluation (async job)."""
128
346
  payload = SchemaEvalConfig.model_construct(
129
- data=data, metrics=metrics, component=component, version=version
347
+ data=data, metrics=metrics, component=component, version=label
130
348
  ).model_dump()
131
349
  return self._request(
132
350
  "/deepeval/schema-eval/background", payload, raise_exception
@@ -139,13 +357,17 @@ class DeepEvalClient:
139
357
  chance: float,
140
358
  raise_exception: bool = False,
141
359
  component: str = "unknown",
142
- version: Union[str, None] = None,
360
+ label: Union[str, None] = None,
143
361
  ) -> Optional[dict]:
144
362
  """Randomly run text_eval based on a probability between 0 and 1."""
145
363
  self._validate_chance(chance)
146
364
  return (
147
365
  self.text_eval(
148
- data, metrics, raise_exception, component=component, version=version
366
+ data=data,
367
+ metrics=metrics,
368
+ raise_exception=raise_exception,
369
+ component=component,
370
+ label=label,
149
371
  )
150
372
  if random.random() <= chance
151
373
  else None
@@ -158,13 +380,13 @@ class DeepEvalClient:
158
380
  chance: float,
159
381
  raise_exception: bool = False,
160
382
  component: str = "unknown",
161
- version: Union[str, None] = None,
383
+ label: Union[str, None] = None,
162
384
  ) -> Optional[dict]:
163
385
  """Randomly run text_eval_background based on a probability between 0 and 1."""
164
386
  self._validate_chance(chance)
165
387
  return (
166
388
  self.text_eval_background(
167
- data, metrics, raise_exception, component=component, version=version
389
+ data, metrics, raise_exception, component=component, label=label
168
390
  )
169
391
  if random.random() <= chance
170
392
  else None
@@ -177,13 +399,17 @@ class DeepEvalClient:
177
399
  chance: float,
178
400
  raise_exception: bool = False,
179
401
  component: str = "unknown",
180
- version: Union[str, None] = None,
402
+ label: Union[str, None] = None,
181
403
  ) -> Optional[dict]:
182
404
  """Randomly run schema_eval based on a probability between 0 and 1."""
183
405
  self._validate_chance(chance)
184
406
  return (
185
407
  self.schema_eval(
186
- data, metrics, raise_exception, component=component, version=version
408
+ data=data,
409
+ metrics=metrics,
410
+ raise_exception=raise_exception,
411
+ component=component,
412
+ label=label,
187
413
  )
188
414
  if random.random() <= chance
189
415
  else None
@@ -196,13 +422,13 @@ class DeepEvalClient:
196
422
  chance: float,
197
423
  raise_exception: bool = False,
198
424
  component: str = "unknown",
199
- version: Union[str, None] = None,
425
+ label: Union[str, None] = None,
200
426
  ) -> Optional[dict]:
201
427
  """Randomly run text_eval_background based on a probability between 0 and 1."""
202
428
  self._validate_chance(chance)
203
429
  return (
204
430
  self.schema_eval_background(
205
- data, metrics, raise_exception, component=component, version=version
431
+ data, metrics, raise_exception, component=component, label=label
206
432
  )
207
433
  if random.random() <= chance
208
434
  else None
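
Beyond renaming version to label across the eval helpers, DeepEvalClient gains read and comparison endpoints, and text_eval/schema_eval now also accept a prebuilt EvalConfig/SchemaEvalConfig as a positional config argument. A minimal sketch of the new read-side calls, assuming the base URL and API token are configured the way the client already expects; the IDs, unique_id and labels are placeholder values.

    from rakam_eval_sdk.client import DeepEvalClient

    client = DeepEvalClient()

    # Newest-first listing of evaluation testcases scoped to the current API token.
    runs = client.list_evaluation_testcases(limit=5, offset=0)

    # Fetch a single testcase either by numeric id or by unique_id.
    testcase = client.get_evaluation_testcase(id=42)
    testcase = client.get_evaluation_testcase(unique_id="run-abc123")

    # Compare two stored testcases, the latest runs behind two labels,
    # or the last two runs recorded under one label.
    diff = client.compare_testcases(testcase_a_id=1, testcase_b_id=2)
    diff = client.compare_latest_by_labels(label_a="baseline", label_b="candidate")
    diff = client.compare_last_two_by_label(label="baseline")
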
rakam_eval_sdk/decorators.py CHANGED
@@ -1,44 +1,70 @@
1
-
2
- import time
1
+ import functools
3
2
  import os
3
+ import time
4
+ from typing import Callable, Dict, Optional, ParamSpec, TypeVar, Union, overload
5
+
4
6
  import psutil
5
- import functools
6
7
 
8
+ P = ParamSpec("P")
9
+ R = TypeVar("R")
10
+
11
+
12
+ @overload
13
+ def eval_run(func: Callable[P, R]) -> Callable[P, R]: ...
14
+
15
+
16
+ @overload
17
+ def eval_run(
18
+ func: None = None,
19
+ **decorator_kwargs: Dict[str, object],
20
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
21
+
22
+
23
+ def eval_run(
24
+ func: Optional[Callable[P, R]] = None,
25
+ **decorator_kwargs: Dict[str, object],
26
+ ) -> Union[
27
+ Callable[P, R],
28
+ Callable[[Callable[P, R]], Callable[P, R]],
29
+ ]:
30
+ # used as @eval_run
31
+ if callable(func):
32
+ return _wrap(func)
33
+
34
+ # used as @eval_run(...)
35
+ def decorator(real_func: Callable[P, R]) -> Callable[P, R]:
36
+ return _wrap(real_func)
37
+
38
+ return decorator
39
+
40
+
41
+ def _wrap(func: Callable[P, R]) -> Callable[P, R]:
42
+ @functools.wraps(func)
43
+ def inner(*args: P.args, **kwargs: P.kwargs) -> R:
44
+ process = psutil.Process(os.getpid())
45
+
46
+ start_time = time.perf_counter()
47
+ start_cpu = process.cpu_times()
48
+ start_mem = process.memory_info().rss
49
+
50
+ try:
51
+ return func(*args, **kwargs)
52
+ finally:
53
+ end_time = time.perf_counter()
54
+ end_cpu = process.cpu_times()
55
+ end_mem = process.memory_info().rss
56
+
57
+ elapsed = end_time - start_time
58
+ cpu_used = (end_cpu.user + end_cpu.system) - (
59
+ start_cpu.user + start_cpu.system
60
+ )
61
+ mem_delta_mb = (end_mem - start_mem) / (1024 * 1024)
62
+
63
+ print(
64
+ f"[eval_run] {func.__module__}.{func.__name__} | "
65
+ f"time={elapsed:.4f}s | "
66
+ f"cpu={cpu_used:.4f}s | "
67
+ f"mem_delta={mem_delta_mb:.2f}MB"
68
+ )
7
69
 
8
- def eval_run(*dargs, **dkwargs):
9
- def wrapper(func):
10
- @functools.wraps(func)
11
- def inner(*args, **kwargs):
12
- process = psutil.Process(os.getpid())
13
-
14
- # Start metrics
15
- start_time = time.perf_counter()
16
- start_cpu = process.cpu_times()
17
- start_mem = process.memory_info().rss
18
-
19
- try:
20
- result = func(*args, **kwargs)
21
- return result
22
- finally:
23
- # End metrics
24
- end_time = time.perf_counter()
25
- end_cpu = process.cpu_times()
26
- end_mem = process.memory_info().rss
27
-
28
- elapsed = end_time - start_time
29
- cpu_used = (
30
- (end_cpu.user + end_cpu.system)
31
- - (start_cpu.user + start_cpu.system)
32
- )
33
- mem_diff_mb = (end_mem - start_mem) / (1024 * 1024)
34
-
35
- print(
36
- f"[eval_run] {func.__module__}.{func.__name__} | "
37
- f"time={elapsed:.4f}s | "
38
- f"cpu={cpu_used:.4f}s | "
39
- f"mem_delta={mem_diff_mb:.2f}MB"
40
- )
41
-
42
- return inner
43
-
44
- return wrapper
70
+ return inner
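
eval_run is now typed with ParamSpec and works both bare and with parentheses; decorator keyword arguments are accepted but not used by _wrap, which only records wall time, CPU time and RSS delta around the call. A minimal sketch of an eval module that the run command could pick up; data is left empty because the fields of TextInputItem are outside this diff.

    from rakam_eval_sdk.decorators import eval_run
    from rakam_eval_sdk.schema import EvalConfig


    @eval_run                     # bare form: wraps and times the call directly
    def smoke_test() -> EvalConfig:
        # The CLI run command routes this return value to client.text_eval
        # via the __eval_config__ marker on EvalConfig.
        return EvalConfig(data=[], metrics=[], component="demo", label="baseline")


    @eval_run()                   # parenthesized form: kwargs accepted but ignored
    def candidate_case() -> EvalConfig:
        # Timing, CPU and memory stats are printed when this function returns.
        return EvalConfig(data=[], metrics=[], component="demo", label="candidate")
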
rakam_eval_sdk/schema.py CHANGED
@@ -39,7 +39,7 @@ class CorrectnessConfig(MetricConfigBase):
39
39
  "Minor formatting differences like '$1,250.00' vs '$1250.00' are acceptable."
40
40
  ]
41
41
  )
42
- criteria: Optional[str] = (None,)
42
+ criteria: Optional[str] = None
43
43
  params: List[Literal["actual_output", "expected_output"]] = Field(
44
44
  default=["actual_output", "expected_output"]
45
45
  )
@@ -94,7 +94,8 @@ MetricConfig = Annotated[
94
94
  ]
95
95
 
96
96
  SchemaMetricConfig = Annotated[
97
- Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(discriminator="type")
97
+ Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
98
+ discriminator="type")
98
99
  ]
99
100
 
100
101
 
@@ -116,14 +117,18 @@ class SchemaInputItem(InputItem):
116
117
 
117
118
 
118
119
  class EvalConfig(BaseModel):
120
+ __eval_config__ = "text_eval"
121
+ unique_id: Union[str, None] = None
119
122
  component: str = "unknown"
120
- version: Union[str, None] = None
123
+ label: Union[str, None] = None
121
124
  data: List[TextInputItem]
122
125
  metrics: List[MetricConfig] = Field(default_factory=list)
123
126
 
124
127
 
125
128
  class SchemaEvalConfig(BaseModel):
129
+ __eval_config__ = "schema_eval"
126
130
  component: str = "unknown"
127
- version: Union[str, None] = None
131
+ unique_id: Union[str, None] = None
132
+ label: Union[str, None] = None
128
133
  data: List[SchemaInputItem]
129
134
  metrics: List[SchemaMetricConfig] = Field(default_factory=list)
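
The class-level __eval_config__ marker is what the CLI's validate_eval_result() reads to route a returned object to text_eval or schema_eval, and label replaces the old version field on both models. A quick check of the marker, with placeholder values and empty data to stay self-contained:

    from rakam_eval_sdk.schema import EvalConfig, SchemaEvalConfig

    text_cfg = EvalConfig(data=[], component="demo", label="baseline")
    schema_cfg = SchemaEvalConfig(data=[], component="demo", label="baseline")

    print(text_cfg.__eval_config__)    # "text_eval"   -> routed to client.text_eval
    print(schema_cfg.__eval_config__)  # "schema_eval" -> routed to client.schema_eval
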
rakam_eval_sdk/utils/decorator_utils.py CHANGED
@@ -1,10 +1,9 @@
1
1
  import ast
2
2
  import importlib
3
3
  import importlib.util
4
- from dataclasses import dataclass
5
4
  from pathlib import Path
6
5
  from types import ModuleType
7
- from typing import Callable, Iterable, List, Tuple
6
+ from typing import List
8
7
 
9
8
 
10
9
  class DecoratedFunctionVisitor(ast.NodeVisitor):
@@ -12,13 +11,13 @@ class DecoratedFunctionVisitor(ast.NodeVisitor):
12
11
  self.decorator_name = decorator_name
13
12
  self.results: List[str] = []
14
13
 
15
- def visit_FunctionDef(self, node: ast.FunctionDef):
14
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
16
15
  for deco in node.decorator_list:
17
16
  if self._matches(deco):
18
17
  self.results.append(node.name)
19
18
  self.generic_visit(node)
20
19
 
21
- def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
20
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
22
21
  for deco in node.decorator_list:
23
22
  if self._matches(deco):
24
23
  self.results.append(node.name)
@@ -57,13 +56,3 @@ def load_module_from_path(file_path: Path) -> ModuleType:
57
56
  module = importlib.util.module_from_spec(spec)
58
57
  spec.loader.exec_module(module)
59
58
  return module
60
-
61
-
62
-
63
-
64
-
65
- def get_function(module: ModuleType, function_name: str) -> Callable:
66
- func = getattr(module, function_name, None)
67
- if func is None:
68
- raise AttributeError(f"{function_name} not found in {module.__name__}")
69
- return func
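
With the unused get_function helper removed, the two remaining utilities are what both CLI commands build on, and they can be reused directly: find_decorated_functions scans a file's AST without importing it, and load_module_from_path imports only the files that matched. A minimal sketch, assuming an ./eval directory of modules:

    from pathlib import Path

    from rakam_eval_sdk.utils.decorator_utils import (
        find_decorated_functions,
        load_module_from_path,
    )

    for file in sorted(Path("./eval").rglob("*.py")):
        names = find_decorated_functions(file, "eval_run")  # AST scan, no import
        if not names:
            continue
        module = load_module_from_path(file)                # import matching files only
        for name in names:
            getattr(module, name)()                         # run the decorated function
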
rakam_eval_sdk-0.2.0rc1.dist-info/METADATA CHANGED
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.1.16rc1
3
+ Version: 0.2.0rc1
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
7
+ Requires-Dist: dotenv>=0.9.9
7
8
  Requires-Dist: psutil>=7.2.1
8
9
  Requires-Dist: pydantic>=2.10.6
9
10
  Requires-Dist: requests
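
The new dotenv dependency backs the load_dotenv() call added at the top of cli.py. The only environment variable visible in this diff is the API-token fallback read in client.py (spelled EVALFRAMWORK_API_KEY there); a minimal sketch of supplying it programmatically, with a placeholder value:

    import os

    # Same variable name client.py falls back to; normally supplied via a .env file.
    os.environ.setdefault("EVALFRAMWORK_API_KEY", "your-api-token")
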
rakam_eval_sdk-0.2.0rc1.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
1
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ rakam_eval_sdk/cli.py,sha256=eEfBIPcE8vrXsLc1pvah7FeQrH3KdmUcm4ndlTURlF4,13590
3
+ rakam_eval_sdk/client.py,sha256=4qUG8cLGqY8026s28uCHM3zFuGDzekLokZZDu7VRJ_8,13077
4
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
5
+ rakam_eval_sdk/schema.py,sha256=ozNC56ygzR1G6UABjnqnJVAPVcF4rJMH1pUNH0a1K4M,3617
6
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
7
+ rakam_eval_sdk-0.2.0rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
8
+ rakam_eval_sdk-0.2.0rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
9
+ rakam_eval_sdk-0.2.0rc1.dist-info/METADATA,sha256=ZPMVvCST3fb48UJSJfa1fj5qyjrLi-pQ3N_J1_4pEnA,6019
10
+ rakam_eval_sdk-0.2.0rc1.dist-info/RECORD,,
rakam_eval_sdk-0.1.16rc1.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=9BHZte3cS1LWL0_dOVEtws9xIhdw0yORW93Dm1uDxDw,2876
3
- rakam_eval_sdk/client.py,sha256=q-Y11maLVKaEnq4OSyFCqrP3JgFS1xpyp9-bZhFssIA,7123
4
- rakam_eval_sdk/decorators.py,sha256=ZEcZb2KUsPrtx-Guc7tYN9MVCMxIQ83yhiJxKE1fjdw,1262
5
- rakam_eval_sdk/schema.py,sha256=MQfF0SEHf2wzeXJNTsMs-yDbN0vZJQbN_crfpPXsTk8,3467
6
- rakam_eval_sdk/utils/decorator_utils.py,sha256=hCC4F7v3KjGSDt2NUXfDsbBTMPzlG6wMzZVdR_wWn14,2048
7
- rakam_eval_sdk-0.1.16rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
8
- rakam_eval_sdk-0.1.16rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
9
- rakam_eval_sdk-0.1.16rc1.dist-info/METADATA,sha256=DRKzVNNF426R3ipnpG8Xr5LXKLTY4Ar9WdPIxe6hjzI,5991
10
- rakam_eval_sdk-0.1.16rc1.dist-info/RECORD,,