rakam-eval-sdk 0.1.16__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rakam_eval_sdk/cli.py CHANGED
@@ -1,17 +1,557 @@
1
1
  # cli.py
2
- import typer
2
+ import json
3
+ import os
4
+ import sys
5
+ import uuid
6
+ from datetime import datetime
3
7
  from pathlib import Path
8
+ from pprint import pprint
9
+ from typing import Any, Dict, Optional, Sequence
10
+
11
+ import typer
12
+ from dotenv import load_dotenv
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+ from rich.pretty import Pretty
16
+
17
+ from rakam_eval_sdk.client import DeepEvalClient
18
+ from rakam_eval_sdk.decorators import eval_run
19
+ from rakam_eval_sdk.utils.decorator_utils import (
20
+ find_decorated_functions,
21
+ load_module_from_path,
22
+ )
23
+
24
+ load_dotenv()
25
+ app = typer.Typer(help="CLI tools for evaluation utilities")
26
+ console = Console()
27
+
28
+ # add root of the project to sys.path
29
+ PROJECT_ROOT = os.path.abspath(".")
30
+ if PROJECT_ROOT not in sys.path:
31
+ sys.path.insert(0, PROJECT_ROOT)
32
+ list_app = typer.Typer(help="List resources")
33
+ app.add_typer(list_app, name="list")
34
+ metrics_app = typer.Typer(help="Metrics utilities")
35
+ app.add_typer(metrics_app, name="metrics")
36
+
37
+
38
+ @metrics_app.command("list")
39
+ def list_metrics(
40
+ limit: int = typer.Option(
41
+ 20,
42
+ "--limit",
43
+ help="Number of testcases to inspect for metrics",
44
+ ),
45
+ ):
46
+ """
47
+ List unique metric names found in evaluation testcases.
48
+ """
49
+ client = DeepEvalClient()
50
+
51
+ testcases = client.list_evaluation_testcases(
52
+ limit=limit,
53
+ offset=0,
54
+ raise_exception=True,
55
+ )
56
+
57
+ if not testcases:
58
+ typer.echo("No evaluation testcases found.")
59
+ return
60
+
61
+ metric_names: set[str] = set()
62
+
63
+ def collect_metrics(entries: Sequence[Dict] | None):
64
+ if not entries:
65
+ return
66
+ for entry in entries:
67
+ for metric in entry.get("metrics", []) or []:
68
+ name = metric.get("name")
69
+ if name:
70
+ metric_names.add(name)
71
+
72
+ for tc in testcases:
73
+
74
+ collect_metrics(tc.get("result"))
75
+
76
+ if not metric_names:
77
+ typer.echo("No metrics found.")
78
+ return
79
+
80
+ typer.echo(
81
+ f"📊 Found {len(metric_names)} unique metrics "
82
+ f"(from latest {limit} testcases)\n"
83
+ )
84
+
85
+ for name in sorted(metric_names):
86
+ typer.echo(f"- {name}")
87
+
88
+
89
+ @list_app.command("eval")
90
+ def list(
91
+ directory: Path = typer.Argument(
92
+ Path("./eval"),
93
+ exists=True,
94
+ file_okay=False,
95
+ dir_okay=True,
96
+ help="Directory to scan (default: ./eval)",
97
+ ),
98
+ recursive: bool = typer.Option(
99
+ False,
100
+ "--recursive",
101
+ "-r",
102
+ help="Recursively search for Python files",
103
+ ),
104
+ ) -> None:
105
+ """
106
+ Find functions decorated with @eval_run.
107
+ """
108
+ TARGET_DECORATOR = eval_run.__name__
109
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
110
+
111
+ found = False
112
+
113
+ for file in sorted(files):
114
+ functions = find_decorated_functions(file, TARGET_DECORATOR)
115
+ for fn in functions:
116
+ found = True
117
+ typer.echo(f"{file}:{fn}")
118
+
119
+ if not found:
120
+ typer.echo(f"No @{TARGET_DECORATOR} functions found.")
121
+
122
+
123
+ @list_app.command("runs")
124
+ def list_runs(
125
+ limit: int = typer.Option(20, help="Max number of runs"),
126
+ offset: int = typer.Option(0, help="Pagination offset"),
127
+ ):
128
+ """
129
+ List evaluation runs (newest first).
130
+ """
131
+ client = DeepEvalClient()
132
+
133
+ runs = client.list_evaluation_testcases(
134
+ limit=limit,
135
+ offset=offset,
136
+ raise_exception=True,
137
+ )
138
+
139
+ if not runs:
140
+ typer.echo("No evaluation runs found.")
141
+ return
142
+
143
+ typer.echo(f"[id] " f"{'unique_id':<20}" f"{'label':<20}" f"created_at")
144
+ # pretty CLI output
145
+ for run in runs:
146
+ run_id = run.get("id")
147
+ label = run.get("label") or "-"
148
+ uid = run.get("unique_id") or "-"
149
+ created_at = run.get("created_at")
150
+
151
+ if created_at:
152
+ try:
153
+ created_at = datetime.fromisoformat(created_at).strftime(
154
+ "%Y-%m-%d %H:%M:%S"
155
+ )
156
+ except ValueError:
157
+ pass
158
+
159
+ typer.echo(f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
160
+
161
+
162
+ @list_app.command("show")
163
+ def show_testcase(
164
+ id: Optional[int] = typer.Option(
165
+ None,
166
+ "--id",
167
+ help="Numeric evaluation testcase ID",
168
+ ),
169
+ uid: Optional[str] = typer.Option(
170
+ None,
171
+ "--uid",
172
+ help="Evaluation testcase unique_id",
173
+ ),
174
+ raw: bool = typer.Option(
175
+ False,
176
+ "--raw",
177
+ help="Print raw JSON instead of formatted output",
178
+ ),
179
+ ):
180
+ """
181
+ Show a single evaluation testcase by ID or unique_id.
182
+ """
183
+ if not id and not uid:
184
+ raise typer.BadParameter("You must provide either --id or --uid")
185
+
186
+ if id and uid:
187
+ raise typer.BadParameter("Provide only one of --id or --uid")
188
+
189
+ client = DeepEvalClient()
190
+
191
+ if id:
192
+ result = client.get_evaluation_testcase_by_id(id)
193
+ identifier = f"id={id}"
194
+ else:
195
+ result = client.get_evaluation_testcase_by_unique_id(uid)
196
+ identifier = f"unique_id={uid}"
197
+
198
+ if not result:
199
+ console.print(
200
+ Panel(
201
+ f"No response received for {identifier}",
202
+ title="Error",
203
+ style="red",
204
+ )
205
+ )
206
+ raise typer.Exit(code=1)
207
+
208
+ if isinstance(result, dict) and result.get("error"):
209
+ console.print(
210
+ Panel(
211
+ result["error"],
212
+ title="Error",
213
+ style="red",
214
+ )
215
+ )
216
+ raise typer.Exit(code=1)
217
+
218
+ if raw:
219
+ console.print(Pretty(result))
220
+ raise typer.Exit()
221
+
222
+ console.print(
223
+ Panel.fit(
224
+ Pretty(result),
225
+ title="Evaluation TestCase",
226
+ subtitle=identifier,
227
+ )
228
+ )
229
+
230
+
231
+ def validate_eval_result(result: Any, fn_name: str) -> str:
232
+ eval_config = getattr(result, "__eval_config__", None)
233
+
234
+ if not isinstance(eval_config, str):
235
+ expected = "EvalConfig or SchemaEvalConfig"
236
+ actual = type(result).__name__
237
+
238
+ typer.echo(
239
+ f" ❌ Invalid return type from `{fn_name}`\n"
240
+ f" Expected: {expected}\n"
241
+ f" Got: {actual}"
242
+ )
243
+ return ""
244
+
245
+ return eval_config
246
+
247
+
248
+ @app.command()
249
+ def run(
250
+ directory: Path = typer.Argument(
251
+ Path("./eval"),
252
+ exists=True,
253
+ file_okay=False,
254
+ dir_okay=True,
255
+ help="Directory to scan (default: ./eval)",
256
+ ),
257
+ recursive: bool = typer.Option(
258
+ False,
259
+ "-r",
260
+ "--recursive",
261
+ help="Recursively search for Python files",
262
+ ),
263
+ dry_run: bool = typer.Option(
264
+ False,
265
+ "--dry-run",
266
+ help="Only list functions without executing them",
267
+ ),
268
+ save_runs: bool = typer.Option(
269
+ False,
270
+ "--save-runs",
271
+ help="Save each evaluation run result to a JSON file",
272
+ ),
273
+ output_dir: Path = typer.Option(
274
+ Path("./eval_runs"),
275
+ "--output-dir",
276
+ help="Directory where run results are saved",
277
+ ),
278
+ ) -> None:
279
+ """
280
+ Find and execute all functions decorated with @eval_run.
281
+ """
282
+ files = directory.rglob("*.py") if recursive else directory.glob("*.py")
283
+ TARGET_DECORATOR = eval_run.__name__
284
+
285
+ executed_any = False
286
+
287
+ if save_runs and not dry_run:
288
+ output_dir.mkdir(parents=True, exist_ok=True)
289
+
290
+ for file in sorted(files):
291
+ functions = find_decorated_functions(file, TARGET_DECORATOR)
292
+ if not functions:
293
+ continue
294
+
295
+ typer.echo(f"\n📄 {file}")
296
+
297
+ module = None
298
+ if not dry_run:
299
+ try:
300
+ module = load_module_from_path(file)
301
+ except Exception as e:
302
+ typer.echo(f" ❌ Failed to import module: {e}")
303
+ continue
304
+
305
+ for fn_name in functions:
306
+ typer.echo(f" ▶ {fn_name}")
307
+
308
+ if dry_run:
309
+ continue
310
+
311
+ try:
312
+ func = getattr(module, fn_name)
313
+ result = func()
314
+
315
+ eval_type = validate_eval_result(result, fn_name)
316
+ if not eval_type:
317
+ continue
318
+
319
+ client = DeepEvalClient()
320
+
321
+ if eval_type == "text_eval":
322
+ resp = client.text_eval(config=result)
323
+ else:
324
+ resp = client.schema_eval(config=result)
325
+
326
+ typer.echo(f"{resp}")
327
+ executed_any = True
328
+ typer.echo(f" ✅ Returned {type(result).__name__}")
329
+
330
+ if save_runs:
331
+ run_id = (
332
+ resp["id"]
333
+ if resp is not None and "id" in resp
334
+ else uuid.uuid4().hex[:8]
335
+ )
336
+
337
+ output_path = output_dir / f"run_{fn_name}_{run_id}.json"
338
+
339
+ def to_json_safe(obj: Any) -> Any:
340
+ if hasattr(obj, "model_dump"):
341
+ return obj.model_dump()
342
+ if hasattr(obj, "dict"):
343
+ return obj.dict()
344
+ return obj
345
+
346
+ with output_path.open("w", encoding="utf-8") as f:
347
+ json.dump(
348
+ to_json_safe(resp),
349
+ f,
350
+ indent=2,
351
+ ensure_ascii=False,
352
+ )
353
+
354
+ typer.echo(f" 💾 Saved run → {output_path}")
355
+
356
+ except Exception as e:
357
+ typer.echo(f" ❌ Execution failed: {e}")
358
+
359
+ if not executed_any and not dry_run:
360
+ typer.echo("\nNo @eval_run functions executed.")
361
+
362
+
363
+ def _print_and_save(
364
+ resp: dict,
365
+ pretty: bool,
366
+ out: Path | None,
367
+ overwrite: bool,
368
+ ) -> None:
369
+ if pretty:
370
+ typer.echo(typer.style("📊 Result:", bold=True))
371
+ pprint(resp)
372
+ else:
373
+ typer.echo(resp)
374
+
375
+ if out is None:
376
+ return
4
377
 
5
- app = typer.Typer()
378
+ if out.exists() and not overwrite:
379
+ typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
380
+ raise typer.Exit(code=1)
381
+
382
+ out.parent.mkdir(parents=True, exist_ok=True)
383
+
384
+ with out.open("w", encoding="utf-8") as f:
385
+ json.dump(resp, f, indent=2, ensure_ascii=False)
386
+
387
+ typer.echo(f"💾 Result saved to {out}")
388
+
389
+
390
+ @app.command()
391
+ def compare_testcases(
392
+ testcase_a_id: int = typer.Argument(
393
+ ...,
394
+ help="ID of the first testcase",
395
+ ),
396
+ testcase_b_id: int = typer.Argument(
397
+ ...,
398
+ help="ID of the second testcase",
399
+ ),
400
+ pretty: bool = typer.Option(
401
+ True,
402
+ "--pretty/--raw",
403
+ help="Pretty-print the response",
404
+ ),
405
+ raise_exception: bool = typer.Option(
406
+ False,
407
+ "--raise",
408
+ help="Raise HTTP exceptions instead of swallowing them",
409
+ ),
410
+ out: Path | None = typer.Option(
411
+ None,
412
+ "-o",
413
+ "--out",
414
+ help="Optional file path to save the result as JSON",
415
+ ),
416
+ overwrite: bool = typer.Option(
417
+ False,
418
+ "--overwrite",
419
+ help="Overwrite output file if it already exists",
420
+ ),
421
+ ) -> None:
422
+ """
423
+ Compare two DeepEval evaluation testcases.
424
+ """
425
+ client = DeepEvalClient()
426
+
427
+ typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")
428
+
429
+ try:
430
+ resp = client.compare_testcases(
431
+ testcase_a_id=testcase_a_id,
432
+ testcase_b_id=testcase_b_id,
433
+ raise_exception=raise_exception,
434
+ )
435
+ except Exception as e:
436
+ typer.echo(f"❌ Request failed: {e}")
437
+ raise typer.Exit(code=1)
438
+
439
+ if not resp:
440
+ typer.echo("⚠️ No response received")
441
+ raise typer.Exit(code=1)
442
+ _print_and_save(resp, pretty, out, overwrite)
443
+
444
+
445
+ @app.command()
446
+ def compare_label_latest(
447
+ label_a: str = typer.Argument(
448
+ ...,
449
+ help="First label (latest run will be used)",
450
+ ),
451
+ label_b: str = typer.Argument(
452
+ ...,
453
+ help="Second label (latest run will be used)",
454
+ ),
455
+ pretty: bool = typer.Option(
456
+ True,
457
+ "--pretty/--raw",
458
+ help="Pretty-print the response",
459
+ ),
460
+ raise_exception: bool = typer.Option(
461
+ False,
462
+ "--raise",
463
+ help="Raise HTTP exceptions instead of swallowing them",
464
+ ),
465
+ out: Path | None = typer.Option(
466
+ None,
467
+ "-o",
468
+ "--out",
469
+ help="Optional file path to save the result as JSON",
470
+ ),
471
+ overwrite: bool = typer.Option(
472
+ False,
473
+ "--overwrite",
474
+ help="Overwrite output file if it already exists",
475
+ ),
476
+ ) -> None:
477
+ """
478
+ Compare the latest evaluation runs for two labels.
479
+ """
480
+ client = DeepEvalClient()
481
+
482
+ typer.echo(f"🔍 Comparing latest runs: '{label_a}' ↔ '{label_b}'")
483
+
484
+ try:
485
+ resp = client.compare_latest_by_labels(
486
+ label_a=label_a,
487
+ label_b=label_b,
488
+ raise_exception=raise_exception,
489
+ )
490
+ except Exception as e:
491
+ typer.echo(f"❌ Request failed: {e}")
492
+ raise typer.Exit(code=1)
493
+
494
+ if not resp:
495
+ typer.echo("⚠️ No response received")
496
+ raise typer.Exit(code=1)
497
+
498
+ _print_and_save(resp, pretty, out, overwrite)
6
499
 
7
500
 
8
501
  @app.command()
9
- def read(file: Path):
10
- """Read a Python file"""
11
- if file.suffix != ".py":
12
- raise typer.BadParameter("Must be a .py file")
13
- typer.echo(file.read_text())
502
+ def compare_last(
503
+ label: str = typer.Argument(
504
+ ...,
505
+ help="Label whose last two runs will be compared",
506
+ ),
507
+ pretty: bool = typer.Option(
508
+ True,
509
+ "--pretty/--raw",
510
+ help="Pretty-print the response",
511
+ ),
512
+ raise_exception: bool = typer.Option(
513
+ False,
514
+ "--raise",
515
+ help="Raise HTTP exceptions instead of swallowing them",
516
+ ),
517
+ out: Path | None = typer.Option(
518
+ None,
519
+ "-o",
520
+ "--out",
521
+ help="Optional file path to save the result as JSON",
522
+ ),
523
+ overwrite: bool = typer.Option(
524
+ False,
525
+ "--overwrite",
526
+ help="Overwrite output file if it already exists",
527
+ ),
528
+ ) -> None:
529
+ """
530
+ Compare the last two evaluation runs of a label.
531
+ """
532
+ client = DeepEvalClient()
533
+
534
+ typer.echo(f"🔍 Comparing last two runs for label '{label}'")
14
535
 
536
+ try:
537
+ resp = client.compare_last_two_by_label(
538
+ label=label,
539
+ raise_exception=raise_exception,
540
+ )
541
+ except Exception as e:
542
+ typer.echo(f"❌ Request failed: {e}")
543
+ raise typer.Exit(code=1)
15
544
 
16
- def main():
545
+ if not resp:
546
+ typer.echo("⚠️ No response received")
547
+ raise typer.Exit(code=1)
548
+
549
+ _print_and_save(resp, pretty, out, overwrite)
550
+
551
+
552
+ def main() -> None:
17
553
  app()
554
+
555
+
556
+ if __name__ == "__main__":
557
+ main()
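
For orientation, the new `run` command scans `./eval`, imports each module, calls every function decorated with `@eval_run`, and dispatches the returned config through `DeepEvalClient.text_eval` or `DeepEvalClient.schema_eval` depending on its `__eval_config__` marker. A minimal sketch of a module it could discover follows; the empty `data`/`metrics` lists are placeholders only, since the `TextInputItem` and `MetricConfig` shapes are not part of this diff.

# eval/example_eval.py -- hypothetical module picked up by `rakam_eval run`
from rakam_eval_sdk.decorators import eval_run
from rakam_eval_sdk.schema import EvalConfig


@eval_run
def baseline_text_eval() -> EvalConfig:
    # EvalConfig.__eval_config__ == "text_eval", so the CLI routes this
    # result to DeepEvalClient.text_eval(config=...).
    return EvalConfig(
        component="qa-bot",   # illustrative component name
        label="baseline",     # replaces the old `version` field
        data=[],              # would hold TextInputItem entries in a real run
        metrics=[],           # would hold MetricConfig entries in a real run
    )
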
rakam_eval_sdk/client.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  import random
3
- from typing import Any, List, Optional, Union, cast
3
+ from typing import Any, Dict, List, Optional, Union, cast, overload
4
4
 
5
5
  import requests
6
6
 
@@ -74,19 +74,202 @@ class DeepEvalClient:
74
74
  raise
75
75
  return {"error": "Invalid JSON response", "raw": resp.text}
76
76
 
77
+ def _get(
78
+ self,
79
+ endpoint: str,
80
+ params: dict,
81
+ raise_exception: bool = False,
82
+ ) -> Optional[dict]:
83
+ """Internal helper to send GET requests with standard headers and error handling."""
84
+ url = f"{self.base_url}{endpoint}"
85
+ headers = {
86
+ "accept": "application/json",
87
+ "X-API-Token": self.api_token,
88
+ }
89
+
90
+ try:
91
+ resp = requests.get(
92
+ url,
93
+ headers=headers,
94
+ params=params,
95
+ timeout=self.timeout,
96
+ )
97
+ if raise_exception:
98
+ resp.raise_for_status()
99
+ except requests.RequestException as e:
100
+ if raise_exception:
101
+ raise
102
+ return {"error": str(e)}
103
+
104
+ try:
105
+ return cast(dict, resp.json())
106
+ except ValueError:
107
+ if raise_exception:
108
+ raise
109
+ return {"error": "Invalid JSON response", "raw": resp.text}
110
+
111
+ def list_evaluation_testcases(
112
+ self,
113
+ *,
114
+ limit: int = 10,
115
+ offset: int = 0,
116
+ raise_exception: bool = False,
117
+ ) -> Optional[List[Dict]]:
118
+ """
119
+ List evaluation testcases for the current API token only.
120
+ Sorted by created_at DESC (newest first).
121
+ """
122
+ return self._get(
123
+ "/eval-framework/deepeval/evaluation-testcases/token",
124
+ params={
125
+ "limit": limit,
126
+ "offset": offset,
127
+ },
128
+ raise_exception=raise_exception,
129
+ )
130
+
131
+ def get_evaluation_testcase_by_id(
132
+ self,
133
+ testcase_id: int,
134
+ *,
135
+ raise_exception: bool = False,
136
+ ) -> Optional[Dict]:
137
+ """
138
+ Fetch a single evaluation testcase by numeric ID.
139
+ """
140
+ return self._get(
141
+ f"/eval-framework/deepeval/id/{testcase_id}",
142
+ params={},
143
+ raise_exception=raise_exception,
144
+ )
145
+
146
+ def get_evaluation_testcase_by_unique_id(
147
+ self,
148
+ unique_id: str,
149
+ *,
150
+ raise_exception: bool = False,
151
+ ) -> Optional[Dict]:
152
+ """
153
+ Fetch a single evaluation testcase by unique_id.
154
+ """
155
+ return self._get(
156
+ f"/eval-framework/deepeval/uid/{unique_id}",
157
+ params={},
158
+ raise_exception=raise_exception,
159
+ )
160
+
161
+ def get_evaluation_testcase(
162
+ self,
163
+ *,
164
+ id: Optional[int] = None,
165
+ unique_id: Optional[str] = None,
166
+ raise_exception: bool = False,
167
+ ) -> Optional[Dict]:
168
+ if id is not None:
169
+ return self.get_evaluation_testcase_by_id(
170
+ id, raise_exception=raise_exception
171
+ )
172
+ if unique_id is not None:
173
+ return self.get_evaluation_testcase_by_unique_id(
174
+ unique_id, raise_exception=raise_exception
175
+ )
176
+ raise ValueError("Either id or unique_id must be provided")
177
+
178
+ def compare_testcases(
179
+ self,
180
+ *,
181
+ testcase_a_id: int,
182
+ testcase_b_id: int,
183
+ raise_exception: bool = False,
184
+ ) -> Optional[dict]:
185
+ """
186
+ Compare two evaluation testcases.
187
+ """
188
+ return self._get(
189
+ "/eval-framework/deepeval/evaluation-testcases/compare",
190
+ params={
191
+ "testcase_a_id": testcase_a_id,
192
+ "testcase_b_id": testcase_b_id,
193
+ },
194
+ raise_exception=raise_exception,
195
+ )
196
+
197
+ def compare_latest_by_labels(
198
+ self,
199
+ *,
200
+ label_a: str,
201
+ label_b: str,
202
+ raise_exception: bool = False,
203
+ ) -> Optional[dict]:
204
+ """
205
+ Compare the latest evaluation testcases for two labels.
206
+ """
207
+ return self._get(
208
+ "/eval-framework/deepeval/evaluation-testcases/compare-latest",
209
+ params={
210
+ "label_a": label_a,
211
+ "label_b": label_b,
212
+ },
213
+ raise_exception=raise_exception,
214
+ )
215
+
216
+ def compare_last_two_by_label(
217
+ self,
218
+ *,
219
+ label: str,
220
+ raise_exception: bool = False,
221
+ ) -> Optional[dict]:
222
+ """
223
+ Compare the last two evaluation testcases for a given label.
224
+ """
225
+ return self._get(
226
+ "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
227
+ params={
228
+ "label": label,
229
+ },
230
+ raise_exception=raise_exception,
231
+ )
232
+
233
+ @overload
234
+ def text_eval(
235
+ self,
236
+ config: EvalConfig,
237
+ *,
238
+ raise_exception: bool = False,
239
+ ) -> Optional[dict]: ...
240
+
241
+ @overload
77
242
  def text_eval(
78
243
  self,
244
+ *,
79
245
  data: List[TextInputItem],
80
246
  metrics: List[MetricConfig],
247
+ component: str = "unknown",
248
+ label: str | None = None,
81
249
  raise_exception: bool = False,
250
+ ) -> Optional[dict]: ...
251
+
252
+ def text_eval(
253
+ self,
254
+ config: EvalConfig | None = None,
255
+ *,
256
+ data: List[TextInputItem] | None = None,
257
+ metrics: List[MetricConfig] | None = None,
82
258
  component: str = "unknown",
83
- version: Union[str, None] = None,
259
+ label: str | None = None,
260
+ raise_exception: bool = False,
84
261
  ) -> Optional[dict]:
85
- """Run synchronous text evaluation."""
86
- payload = EvalConfig.model_construct(
87
- data=data, metrics=metrics, component=component, version=version
88
- ).model_dump()
89
- return self._request("/deepeval/text-eval", payload, raise_exception)
262
+ if config is None:
263
+ config = EvalConfig(
264
+ data=data,
265
+ metrics=metrics,
266
+ component=component,
267
+ label=label,
268
+ )
269
+
270
+ return self._request(
271
+ "/deepeval/text-eval", config.model_dump(), raise_exception
272
+ )
90
273
 
91
274
  def text_eval_background(
92
275
  self,
@@ -94,27 +277,61 @@ class DeepEvalClient:
94
277
  metrics: List[MetricConfig],
95
278
  raise_exception: bool = False,
96
279
  component: str = "unknown",
97
- version: Union[str, None] = None,
280
+ label: Union[str, None] = None,
98
281
  ) -> Optional[dict]:
99
282
  """Run background text evaluation (async job)."""
100
283
  payload = EvalConfig.model_construct(
101
- data=data, metrics=metrics, component=component, version=version
284
+ data=data, metrics=metrics, component=component, label=label
102
285
  ).model_dump()
103
286
  return self._request("/deepeval/text-eval/background", payload, raise_exception)
104
287
 
288
+ @overload
105
289
  def schema_eval(
106
290
  self,
291
+ *,
107
292
  data: List[SchemaInputItem],
108
293
  metrics: List[SchemaMetricConfig],
294
+ component: str = "unknown",
295
+ label: str | None = None,
296
+ raise_exception: bool = False,
297
+ ) -> Optional[dict]: ...
298
+
299
+ @overload
300
+ def schema_eval(
301
+ self,
302
+ config: SchemaEvalConfig,
303
+ *,
109
304
  raise_exception: bool = False,
305
+ ) -> Optional[dict]: ...
306
+
307
+ def schema_eval(
308
+ self,
309
+ config: SchemaEvalConfig | None = None,
310
+ *,
311
+ data: List[SchemaInputItem] | None = None,
312
+ metrics: List[SchemaMetricConfig] | None = None,
110
313
  component: str = "unknown",
111
- version: Union[str, None] = None,
314
+ label: str | None = None,
315
+ raise_exception: bool = False,
112
316
  ) -> Optional[dict]:
113
- """Run synchronous schema evaluation."""
114
- payload = SchemaEvalConfig.model_construct(
115
- data=data, metrics=metrics, component=component, version=version
116
- ).model_dump()
117
- return self._request("/deepeval/schema-eval", payload, raise_exception)
317
+ if config is None:
318
+ if data is None or metrics is None:
319
+ raise ValueError(
320
+ "Either `config` or both `data` and `metrics` must be provided"
321
+ )
322
+
323
+ config = SchemaEvalConfig(
324
+ data=data,
325
+ metrics=metrics,
326
+ component=component,
327
+ label=label,
328
+ )
329
+
330
+ return self._request(
331
+ "/deepeval/schema-eval",
332
+ config.model_dump(),
333
+ raise_exception,
334
+ )
118
335
 
119
336
  def schema_eval_background(
120
337
  self,
@@ -122,11 +339,11 @@ class DeepEvalClient:
122
339
  metrics: List[SchemaMetricConfig],
123
340
  raise_exception: bool = False,
124
341
  component: str = "unknown",
125
- version: Union[str, None] = None,
342
+ label: Union[str, None] = None,
126
343
  ) -> Optional[dict]:
127
344
  """Run background schema evaluation (async job)."""
128
345
  payload = SchemaEvalConfig.model_construct(
129
- data=data, metrics=metrics, component=component, version=version
346
+ data=data, metrics=metrics, component=component, label=label
130
347
  ).model_dump()
131
348
  return self._request(
132
349
  "/deepeval/schema-eval/background", payload, raise_exception
@@ -139,13 +356,17 @@ class DeepEvalClient:
139
356
  chance: float,
140
357
  raise_exception: bool = False,
141
358
  component: str = "unknown",
142
- version: Union[str, None] = None,
359
+ label: Union[str, None] = None,
143
360
  ) -> Optional[dict]:
144
361
  """Randomly run text_eval based on a probability between 0 and 1."""
145
362
  self._validate_chance(chance)
146
363
  return (
147
364
  self.text_eval(
148
- data, metrics, raise_exception, component=component, version=version
365
+ data=data,
366
+ metrics=metrics,
367
+ raise_exception=raise_exception,
368
+ component=component,
369
+ label=label,
149
370
  )
150
371
  if random.random() <= chance
151
372
  else None
@@ -158,13 +379,13 @@ class DeepEvalClient:
158
379
  chance: float,
159
380
  raise_exception: bool = False,
160
381
  component: str = "unknown",
161
- version: Union[str, None] = None,
382
+ label: Union[str, None] = None,
162
383
  ) -> Optional[dict]:
163
384
  """Randomly run text_eval_background based on a probability between 0 and 1."""
164
385
  self._validate_chance(chance)
165
386
  return (
166
387
  self.text_eval_background(
167
- data, metrics, raise_exception, component=component, version=version
388
+ data, metrics, raise_exception, component=component, label=label
168
389
  )
169
390
  if random.random() <= chance
170
391
  else None
@@ -177,13 +398,17 @@ class DeepEvalClient:
177
398
  chance: float,
178
399
  raise_exception: bool = False,
179
400
  component: str = "unknown",
180
- version: Union[str, None] = None,
401
+ label: Union[str, None] = None,
181
402
  ) -> Optional[dict]:
182
403
  """Randomly run schema_eval based on a probability between 0 and 1."""
183
404
  self._validate_chance(chance)
184
405
  return (
185
406
  self.schema_eval(
186
- data, metrics, raise_exception, component=component, version=version
407
+ data=data,
408
+ metrics=metrics,
409
+ raise_exception=raise_exception,
410
+ component=component,
411
+ label=label,
187
412
  )
188
413
  if random.random() <= chance
189
414
  else None
@@ -196,13 +421,13 @@ class DeepEvalClient:
196
421
  chance: float,
197
422
  raise_exception: bool = False,
198
423
  component: str = "unknown",
199
- version: Union[str, None] = None,
424
+ label: Union[str, None] = None,
200
425
  ) -> Optional[dict]:
201
426
  """Randomly run text_eval_background based on a probability between 0 and 1."""
202
427
  self._validate_chance(chance)
203
428
  return (
204
429
  self.schema_eval_background(
205
- data, metrics, raise_exception, component=component, version=version
430
+ data, metrics, raise_exception, component=component, label=label
206
431
  )
207
432
  if random.random() <= chance
208
433
  else None
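
As a quick illustration of the reworked client surface (a sketch based only on the signatures above, not on package documentation): `text_eval` and `schema_eval` now accept either a prebuilt config object or keyword-only `data`/`metrics`, and three comparison helpers wrap the new GET endpoints. The constructor arguments of `DeepEvalClient` are not shown in this diff; the CLI instantiates it with no arguments, so the sketch does the same.

from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.schema import EvalConfig

client = DeepEvalClient()  # base URL / API token come from the client's defaults (not shown here)

# Config-object form, as used by the CLI `run` command.
resp = client.text_eval(
    config=EvalConfig(component="qa-bot", label="baseline", data=[], metrics=[])
)

# Keyword form; `data` and `metrics` are keyword-only in the new overloads.
resp = client.text_eval(data=[], metrics=[], component="qa-bot", label="baseline")

# Comparison helpers added in 0.2.0 (all keyword-only).
by_id = client.compare_testcases(testcase_a_id=1, testcase_b_id=2)
latest = client.compare_latest_by_labels(label_a="baseline", label_b="candidate")
last_two = client.compare_last_two_by_label(label="baseline")
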
rakam_eval_sdk/decorators.py ADDED
@@ -0,0 +1,70 @@
1
+ import functools
2
+ import os
3
+ import time
4
+ from typing import Callable, Dict, Optional, ParamSpec, TypeVar, Union, overload
5
+
6
+ import psutil
7
+
8
+ P = ParamSpec("P")
9
+ R = TypeVar("R")
10
+
11
+
12
+ @overload
13
+ def eval_run(func: Callable[P, R]) -> Callable[P, R]: ...
14
+
15
+
16
+ @overload
17
+ def eval_run(
18
+ func: None = None,
19
+ **decorator_kwargs: Dict[str, object],
20
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
21
+
22
+
23
+ def eval_run(
24
+ func: Optional[Callable[P, R]] = None,
25
+ **decorator_kwargs: Dict[str, object],
26
+ ) -> Union[
27
+ Callable[P, R],
28
+ Callable[[Callable[P, R]], Callable[P, R]],
29
+ ]:
30
+ # used as @eval_run
31
+ if callable(func):
32
+ return _wrap(func)
33
+
34
+ # used as @eval_run(...)
35
+ def decorator(real_func: Callable[P, R]) -> Callable[P, R]:
36
+ return _wrap(real_func)
37
+
38
+ return decorator
39
+
40
+
41
+ def _wrap(func: Callable[P, R]) -> Callable[P, R]:
42
+ @functools.wraps(func)
43
+ def inner(*args: P.args, **kwargs: P.kwargs) -> R:
44
+ process = psutil.Process(os.getpid())
45
+
46
+ start_time = time.perf_counter()
47
+ start_cpu = process.cpu_times()
48
+ start_mem = process.memory_info().rss
49
+
50
+ try:
51
+ return func(*args, **kwargs)
52
+ finally:
53
+ end_time = time.perf_counter()
54
+ end_cpu = process.cpu_times()
55
+ end_mem = process.memory_info().rss
56
+
57
+ elapsed = end_time - start_time
58
+ cpu_used = (end_cpu.user + end_cpu.system) - (
59
+ start_cpu.user + start_cpu.system
60
+ )
61
+ mem_delta_mb = (end_mem - start_mem) / (1024 * 1024)
62
+
63
+ print(
64
+ f"[eval_run] {func.__module__}.{func.__name__} | "
65
+ f"time={elapsed:.4f}s | "
66
+ f"cpu={cpu_used:.4f}s | "
67
+ f"mem_delta={mem_delta_mb:.2f}MB"
68
+ )
69
+
70
+ return inner
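
The decorator only measures and prints resource usage around the wrapped call; it does not alter the return value. A small sketch of what that looks like (the printed numbers are illustrative, the format comes from `_wrap` above):

import time

from rakam_eval_sdk.decorators import eval_run


@eval_run
def slow_step() -> str:
    time.sleep(0.2)
    return "done"


slow_step()
# Prints something along the lines of:
# [eval_run] __main__.slow_step | time=0.2003s | cpu=0.0001s | mem_delta=0.01MB
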
rakam_eval_sdk/schema.py CHANGED
@@ -39,7 +39,7 @@ class CorrectnessConfig(MetricConfigBase):
39
39
  "Minor formatting differences like '$1,250.00' vs '$1250.00' are acceptable."
40
40
  ]
41
41
  )
42
- criteria: Optional[str] = (None,)
42
+ criteria: Optional[str] = None
43
43
  params: List[Literal["actual_output", "expected_output"]] = Field(
44
44
  default=["actual_output", "expected_output"]
45
45
  )
@@ -116,14 +116,18 @@ class SchemaInputItem(InputItem):
116
116
 
117
117
 
118
118
  class EvalConfig(BaseModel):
119
+ __eval_config__ = "text_eval"
120
+ unique_id: Union[str, None] = None
119
121
  component: str = "unknown"
120
- version: Union[str, None] = None
122
+ label: Union[str, None] = None
121
123
  data: List[TextInputItem]
122
124
  metrics: List[MetricConfig] = Field(default_factory=list)
123
125
 
124
126
 
125
127
  class SchemaEvalConfig(BaseModel):
128
+ __eval_config__ = "schema_eval"
126
129
  component: str = "unknown"
127
- version: Union[str, None] = None
130
+ unique_id: Union[str, None] = None
131
+ label: Union[str, None] = None
128
132
  data: List[SchemaInputItem]
129
133
  metrics: List[SchemaMetricConfig] = Field(default_factory=list)
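
The practical effect of the schema change is that runs are now tagged with `label` (plus an optional `unique_id`) instead of `version`, and each config class carries an `__eval_config__` marker the CLI uses for routing. A before/after sketch with illustrative values:

from rakam_eval_sdk.schema import EvalConfig

# 0.1.x (no longer valid in 0.2.0):
#   EvalConfig(component="qa-bot", version="v3", data=[...], metrics=[...])

# 0.2.0:
cfg = EvalConfig(component="qa-bot", label="v3", data=[], metrics=[])
assert cfg.__eval_config__ == "text_eval"  # checked by the CLI before calling text_eval()
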
rakam_eval_sdk/utils/decorator_utils.py ADDED
@@ -0,0 +1,58 @@
1
+ import ast
2
+ import importlib
3
+ import importlib.util
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+ from typing import List
7
+
8
+
9
+ class DecoratedFunctionVisitor(ast.NodeVisitor):
10
+ def __init__(self, decorator_name: str):
11
+ self.decorator_name = decorator_name
12
+ self.results: List[str] = []
13
+
14
+ def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
15
+ for deco in node.decorator_list:
16
+ if self._matches(deco):
17
+ self.results.append(node.name)
18
+ self.generic_visit(node)
19
+
20
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
21
+ for deco in node.decorator_list:
22
+ if self._matches(deco):
23
+ self.results.append(node.name)
24
+ self.generic_visit(node)
25
+
26
+ def _matches(self, deco: ast.expr) -> bool:
27
+ # @deco
28
+ if isinstance(deco, ast.Name):
29
+ return deco.id == self.decorator_name
30
+
31
+ # @module.deco
32
+ if isinstance(deco, ast.Attribute):
33
+ return deco.attr == self.decorator_name
34
+
35
+ # @deco(...)
36
+ if isinstance(deco, ast.Call):
37
+ return self._matches(deco.func)
38
+
39
+ return False
40
+
41
+
42
+ def find_decorated_functions(
43
+ file_path: Path,
44
+ decorator_name: str,
45
+ ) -> List[str]:
46
+ tree = ast.parse(file_path.read_text(encoding="utf-8"))
47
+ visitor = DecoratedFunctionVisitor(decorator_name)
48
+ visitor.visit(tree)
49
+ return visitor.results
50
+
51
+
52
+ def load_module_from_path(file_path: Path) -> ModuleType:
53
+ spec = importlib.util.spec_from_file_location(file_path.stem, file_path)
54
+ if spec is None or spec.loader is None:
55
+ raise ImportError(f"Cannot import {file_path}")
56
+ module = importlib.util.module_from_spec(spec)
57
+ spec.loader.exec_module(module)
58
+ return module
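
These two helpers split discovery from execution: `find_decorated_functions` walks the AST without importing anything, and `load_module_from_path` performs the actual import. A usage sketch (the file path is hypothetical):

from pathlib import Path

from rakam_eval_sdk.utils.decorator_utils import (
    find_decorated_functions,
    load_module_from_path,
)

path = Path("eval/example_eval.py")

# Static discovery: parses the source with `ast`, no import side effects.
names = find_decorated_functions(path, "eval_run")

# Execution: import the module and call each discovered function,
# which is essentially what the `run` command does.
module = load_module_from_path(path)
for name in names:
    getattr(module, name)()
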
rakam_eval_sdk-0.1.16.dist-info/METADATA → rakam_eval_sdk-0.2.0.dist-info/METADATA CHANGED
@@ -1,9 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: rakam-eval-sdk
3
- Version: 0.1.16
3
+ Version: 0.2.0
4
4
  Summary: Evaluation Framework SDK
5
5
  Author: Mohamed Bachar Touil
6
6
  License: MIT
7
+ Requires-Dist: dotenv>=0.9.9
8
+ Requires-Dist: psutil>=7.2.1
7
9
  Requires-Dist: pydantic>=2.10.6
8
10
  Requires-Dist: requests
9
11
  Requires-Dist: typer>=0.20.1
rakam_eval_sdk-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
1
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ rakam_eval_sdk/cli.py,sha256=0Ym7C83xirGEf0YCh4-agwfjYPJ3yieeS02nXJ_iE-Y,14486
3
+ rakam_eval_sdk/client.py,sha256=JQ-vCJmMLqXql7nNGYBy5dlkZsCq05gOOQhucwwexC8,13060
4
+ rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
5
+ rakam_eval_sdk/schema.py,sha256=P4LlnaInXWTq-ve6qPTTxPyzmj3j_1gcqV9i7CYRYec,3608
6
+ rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
7
+ rakam_eval_sdk-0.2.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
8
+ rakam_eval_sdk-0.2.0.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
9
+ rakam_eval_sdk-0.2.0.dist-info/METADATA,sha256=1l6TbYR49zIKpDyNjgqJFLQ_b1mAvBHKizAWF085-9M,6016
10
+ rakam_eval_sdk-0.2.0.dist-info/RECORD,,
rakam_eval_sdk-0.2.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ rakam_eval = rakam_eval_sdk.cli:main
3
+
@@ -1,8 +0,0 @@
1
- rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- rakam_eval_sdk/cli.py,sha256=dn1KXh-_VLpIvLFHnpHPfAL33ICoAJ9Y2iCOikkJcxY,277
3
- rakam_eval_sdk/client.py,sha256=q-Y11maLVKaEnq4OSyFCqrP3JgFS1xpyp9-bZhFssIA,7123
4
- rakam_eval_sdk/schema.py,sha256=MQfF0SEHf2wzeXJNTsMs-yDbN0vZJQbN_crfpPXsTk8,3467
5
- rakam_eval_sdk-0.1.16.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
6
- rakam_eval_sdk-0.1.16.dist-info/entry_points.txt,sha256=NzE2wDRB4Kt-TblkjSD37abcfP4B5STBOyygEhGTLdU,51
7
- rakam_eval_sdk-0.1.16.dist-info/METADATA,sha256=s7N_RsRR87-6aQhNmCnuDeMDrL6ZOm0vr7iR2cS5FwU,5959
8
- rakam_eval_sdk-0.1.16.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- mycli = rakam_eval_sdk.cli:main
3
-