rakam-eval-sdk 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/PKG-INFO +1 -1
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/pyproject.toml +1 -1
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/cli.py +191 -69
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/client.py +106 -66
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/schema.py +0 -2
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/README.md +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/__init__.py +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/decorators.py +0 -0
- {rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/utils/decorator_utils.py +0 -0
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/cli.py

@@ -35,59 +35,92 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")


+def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+    """
+    Returns [(type, name)] from EvalConfig / SchemaEvalConfig
+    """
+    if not hasattr(config, "metrics"):
+        return []
+
+    results: list[tuple[str, str | None]] = []
+
+    for metric in config.metrics or []:
+        metric_type = getattr(metric, "type", None)
+        metric_name = getattr(metric, "name", None)
+        if metric_type:
+            results.append((metric_type, metric_name))
+
+    return results
+
+
 @metrics_app.command("list")
-def
+def metrics(
+    directory: Path = typer.Argument(
+        Path("./eval"),
+        exists=True,
+        file_okay=False,
+        dir_okay=True,
+        help="Directory to scan (default: ./eval)",
     ),
+    recursive: bool = typer.Option(
+        False,
+        "-r",
+        "--recursive",
+        help="Recursively search for Python files",
+    ),
+) -> None:
     """
-    List
+    List all metric types used by loaded eval configs.
     """
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
+    TARGET_DECORATOR = eval_run.__name__

-        offset=0,
-        raise_exception=True,
-    )
+    all_metrics: set[tuple[str, str | None]] = set()
+    found_any = False

+    for file in sorted(files):
+        functions = find_decorated_functions(file, TARGET_DECORATOR)
+        if not functions:
+            continue

+        typer.echo(f"\n📄 {file}")

-        name = metric.get("name")
-        if name:
-            metric_names.add(name)
+        try:
+            module = load_module_from_path(file)
+        except Exception as e:
+            typer.echo(f" ❌ Failed to import module: {e}")
+            continue

+        for fn_name in functions:
+            try:
+                func = getattr(module, fn_name)
+                result = func()

+                metrics = extract_metric_names(result)
+                if not metrics:
+                    continue

+                found_any = True
+                for metric_type, metric_name in metrics:
+                    all_metrics.add((metric_type, metric_name))

+                    if metric_name:
+                        typer.echo(f" • {metric_type} (alias: {metric_name})")
+                    else:
+                        typer.echo(f" • {metric_type}")

+            except Exception as e:
+                typer.echo(f" ❌ Failed to inspect {fn_name}: {e}")
+
+    if not found_any:
+        typer.echo("\nNo metrics found.")
+        raise typer.Exit(code=0)
+
+    typer.echo(f"\n✅ {len(all_metrics)} unique metrics found")


-@list_app.command("
-def
+@list_app.command("evals")
+def list_evals(
     directory: Path = typer.Argument(
         Path("./eval"),
         exists=True,
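As a point of reference (not part of the diff): `extract_metric_names` above only relies on a `metrics` attribute whose items expose `type` and `name`, so a plain namespace object is enough to sketch its behaviour. The metric names below are illustrative, and the sketch assumes the function from the new cli.py is in scope.

    # Illustrative sketch; metric names are made up for the example.
    from types import SimpleNamespace

    config = SimpleNamespace(metrics=[
        SimpleNamespace(type="answer_relevancy", name="relevancy"),
        SimpleNamespace(type="faithfulness", name=None),
    ])

    assert extract_metric_names(config) == [
        ("answer_relevancy", "relevancy"),
        ("faithfulness", None),
    ]
    assert extract_metric_names(object()) == []  # no `metrics` attribute at all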
@@ -140,12 +173,12 @@ def list_runs(
         typer.echo("No evaluation runs found.")
         return

-    typer.echo(f"[id] " f"{'
+    typer.echo(f"[id] " f"{'tag':<20}" f"{'label':<20}" f"created_at")
     # pretty CLI output
     for run in runs:
         run_id = run.get("id")
         label = run.get("label") or "-"
-        uid = run.get("
+        uid = run.get("tag") or "-"
         created_at = run.get("created_at")

         if created_at:
@@ -156,20 +189,21 @@ def list_runs(
             except ValueError:
                 pass

-        typer.echo(
+        typer.echo(
+            f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")


-@
-def
+@app.command()
+def show(
     id: Optional[int] = typer.Option(
         None,
         "--id",
-        help="
+        help="Run ID",
     ),
-
+    tag: Optional[str] = typer.Option(
         None,
-        "--
-        help="
+        "--tag",
+        help="Run tag",
     ),
     raw: bool = typer.Option(
         False,
@@ -178,12 +212,12 @@ def show_testcase(
     ),
 ):
     """
-    Show a single evaluation testcase by ID or
+    Show a single evaluation testcase by ID or tag.
     """
-    if not id and not
+    if not id and not tag:
         raise typer.BadParameter("You must provide either --id or --uid")

-    if id and
+    if id and tag:
         raise typer.BadParameter("Provide only one of --id or --uid")

     client = DeepEvalClient()
@@ -192,8 +226,8 @@ def show_testcase(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
-        result = client.
-        identifier = f"
+        result = client.get_evaluation_testcase_by_tag(tag)
+        identifier = f"tag={tag}"

     if not result:
         console.print(
@@ -376,7 +410,8 @@ def _print_and_save(
         return

     if out.exists() and not overwrite:
-        typer.echo(
+        typer.echo(
+            f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)

     out.parent.mkdir(parents=True, exist_ok=True)
@@ -388,14 +423,16 @@ def _print_and_save(


 @app.command()
-def
+def compare(
+    tag: list[str] = typer.Option(
+        [],
+        "--tag",
+        help="Tag identifying a reference testcase",
     ),
+    run: list[int] = typer.Option(
+        [],
+        "--run",
+        help="Run ID identifying an evaluation testcase",
     ),
     pretty: bool = typer.Option(
         True,
@@ -420,18 +457,45 @@ def compare_testcases(
     ),
 ) -> None:
     """
-    Compare two
+    Compare two evaluation testcases using runs and/or labels.
     """
+
+    targets = []
+
+    for r in run:
+        targets.append(("run", r))
+
+    for l in tag:
+        targets.append(("label", l))
+
+    if len(targets) != 2:
+        typer.echo(
+            "❌ Provide exactly two targets using --run and/or --label"
+        )
+        raise typer.Exit(code=1)
+
     client = DeepEvalClient()

+    (type_a, value_a), (type_b, value_b) = targets
+
+    typer.echo(
+        f"🔍 Comparing {type_a} '{value_a}' ↔ {type_b} '{value_b}'"
+    )
+
+    kwargs = {"raise_exception": raise_exception}
+
+    if type_a == "run":
+        kwargs["testcase_a_id"] = value_a
+    else:
+        kwargs["testcase_a_tag"] = value_a
+
+    if type_b == "run":
+        kwargs["testcase_b_id"] = value_b
+    else:
+        kwargs["testcase_b_tag"] = value_b

     try:
-        resp = client.compare_testcases(
-            testcase_a_id=testcase_a_id,
-            testcase_b_id=testcase_b_id,
-            raise_exception=raise_exception,
-        )
+        resp = client.compare_testcases(**kwargs)
     except Exception as e:
         typer.echo(f"❌ Request failed: {e}")
         raise typer.Exit(code=1)
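A hedged sketch (not part of the diff) of how the new `compare` options map onto the client call; the flag values are illustrative and `raise_exception` comes from an option outside this hunk.

    # e.g. `compare --run 12 --tag baseline`
    # run = [12], tag = ["baseline"]
    # targets -> [("run", 12), ("label", "baseline")], so the call reduces to:
    client.compare_testcases(
        testcase_a_id=12,
        testcase_b_tag="baseline",
        raise_exception=raise_exception,
    )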
@@ -439,10 +503,11 @@ def compare_testcases(
     if not resp:
         typer.echo("⚠️ No response received")
         raise typer.Exit(code=1)
+
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_label_latest(
     label_a: str = typer.Argument(
         ...,
@@ -498,7 +563,7 @@ def compare_label_latest(
     _print_and_save(resp, pretty, out, overwrite)


-@app.command()
+@app.command(hidden=True)
 def compare_last(
     label: str = typer.Argument(
         ...,
@@ -549,6 +614,63 @@ def compare_last(
     _print_and_save(resp, pretty, out, overwrite)


+@app.command("tag")
+def tag_command(
+    run_id: Optional[int] = typer.Option(
+        None,
+        "--run",
+        help="Evaluation run ID",
+    ),
+    tag: Optional[str] = typer.Option(
+        None,
+        "--tag",
+        help="Tag to assign to the run",
+    ),
+    delete: Optional[str] = typer.Option(
+        None,
+        "--delete",
+        help="Delete a tag",
+    ),
+):
+    """
+    Assign a tag to a run or delete a tag.
+    """
+
+    # --- validation ---
+    if delete:
+        if run_id or tag:
+            typer.echo("❌ --delete cannot be used with --run or --tag")
+            raise typer.Exit(code=1)
+    else:
+        if not run_id or not tag:
+            typer.echo("❌ Use --run and --tag together, or --delete")
+            raise typer.Exit(code=1)
+
+    client = DeepEvalClient()
+
+    # --- delete mode ---
+    if delete:
+        result = client.update_evaluation_testcase_tag(
+            testcase_id=run_id,
+            tag=delete,
+            raise_exception=True,
+        )
+        typer.echo("🗑️ Tag deleted successfully")
+        typer.echo(f"Tag: {delete}")
+        return
+
+    # --- assign/update mode ---
+    result = client.update_evaluation_testcase_tag(
+        testcase_id=run_id,
+        tag=tag,
+        raise_exception=True,
+    )
+
+    typer.echo("✅ Tag assigned successfully")
+    typer.echo(f"Run ID: {run_id}")
+    typer.echo(f"Tag: {result.get('tag')}")
+
+
 def main() -> None:
     app()
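Sketching the two modes of the new `tag` command (values illustrative, not from the diff): assignment always passes the run ID, while delete mode passes `run_id=None`, which `update_evaluation_testcase_tag` in the client changes below routes to the tag DELETE endpoint.

    # `tag --run 42 --tag smoke` -> update_evaluation_testcase_tag(testcase_id=42, tag="smoke", raise_exception=True)
    # `tag --delete smoke`       -> update_evaluation_testcase_tag(testcase_id=None, tag="smoke", raise_exception=True)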
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/client.py

@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, Dict, List, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Union, cast, overload

 import requests

@@ -13,6 +13,8 @@ from .schema import (
     TextInputItem,
 )

+HTTPMethod = Literal["GET", "POST", "PATCH", "PUT", "DELETE"]
+

 class DeepEvalClient:
     """
@@ -45,21 +47,31 @@ class DeepEvalClient:

     def _request(
         self,
+        method: HTTPMethod,
         endpoint: str,
+        *,
+        json: dict | None = None,
+        params: dict | None = None,
         raise_exception: bool = False,
     ) -> Optional[dict]:
-        """Internal helper to send POST requests with standard headers and error handling."""
         url = f"{self.base_url}{endpoint}"
+
         headers = {
             "accept": "application/json",
-            "Content-Type": "application/json",
             "X-API-Token": self.api_token,
         }

+        if json is not None:
+            headers["Content-Type"] = "application/json"
+
         try:
-            resp = requests.
+            resp = requests.request(
+                method=method,
+                url=url,
+                headers=headers,
+                json=json,
+                params=params,
+                timeout=self.timeout,
             )
             if raise_exception:
                 resp.raise_for_status()
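A minimal sketch (not part of the diff) of the two call shapes the generalized `_request` now supports; the endpoints echo ones used elsewhere in this client and the payload is illustrative. `Content-Type` is only attached when a JSON body is present.

    # GET: query parameters only, no body, no Content-Type header
    self._request("GET", "/eval-framework/deepeval/id/1", params={})

    # POST: JSON body, Content-Type: application/json added automatically
    self._request("POST", "/deepeval/text-eval", json={"component": "unknown"})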
@@ -73,41 +85,47 @@ class DeepEvalClient:
         except ValueError:
             if raise_exception:
                 raise
-            return {
+            return {
+                "error": "Invalid JSON response",
+                "raw": resp.text,
+            }

-    def _get(
+    def _get(self, endpoint: str, params: dict, *args, **kw):
+        return self._request("GET", endpoint, params=params, *args, **kw)
+
+    def _post(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("POST", endpoint, json=payload, *args, **kw)
+
+    def _patch(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("PATCH", endpoint, json=payload, *args, **kw)
+
+    def _delete(self, endpoint: str, payload: dict, *args, **kw):
+        return self._request("DELETE", endpoint, json=payload, *args, **kw)
+
+    def update_evaluation_testcase_tag(
         self,
+        *,
+        testcase_id: int,
+        tag: Optional[str],
         raise_exception: bool = False,
-    ) -> Optional[
-        """
-        headers = {
-            "accept": "application/json",
-            "X-API-Token": self.api_token,
-        }
+    ) -> Optional[Dict]:
+        """
+        Add, update, or remove a tag from an evaluation testcase.

+        - tag="smoke" → add / update tag
+        - tag=None → remove tag
+        """
+        if testcase_id is None:
+            return self._delete(
+                endpoint=f"/deepeval/tag/{tag}",
+                payload={},
+                raise_exception=raise_exception,
             )
-            return {"error": str(e)}
-
-        try:
-            return cast(dict, resp.json())
-        except ValueError:
-            if raise_exception:
-                raise
-            return {"error": "Invalid JSON response", "raw": resp.text}
+        return self._patch(
+            endpoint=f"/deepeval/{testcase_id}/tag",
+            payload={"tag": tag},
+            raise_exception=raise_exception,
+        )

     def list_evaluation_testcases(
         self,
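A hedged usage sketch of the tag API above (IDs and tag names illustrative, not from the diff). Routing is driven by `testcase_id`: passing `None` hits the DELETE endpoint even though the parameter is annotated as `int`.

    client = DeepEvalClient()

    # add or update: PATCH /deepeval/42/tag with {"tag": "smoke"}
    client.update_evaluation_testcase_tag(testcase_id=42, tag="smoke")

    # remove: DELETE /deepeval/tag/smoke
    client.update_evaluation_testcase_tag(testcase_id=None, tag="smoke")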
@@ -121,7 +139,7 @@ class DeepEvalClient:
         Sorted by created_at DESC (newest first).
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/token",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/token",
             params={
                 "limit": limit,
                 "offset": offset,
@@ -139,22 +157,22 @@ class DeepEvalClient:
         Fetch a single evaluation testcase by numeric ID.
         """
         return self._get(
-            f"/eval-framework/deepeval/id/{testcase_id}",
+            endpoint=f"/eval-framework/deepeval/id/{testcase_id}",
             params={},
             raise_exception=raise_exception,
         )

-    def
+    def get_evaluation_testcase_by_tag(
         self,
+        tag: str,
         *,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         """
-        Fetch a single evaluation testcase by
+        Fetch a single evaluation testcase by tag.
         """
         return self._get(
-            f"/eval-framework/deepeval/
+            endpoint=f"/eval-framework/deepeval/tag/{tag}",
             params={},
             raise_exception=raise_exception,
         )
@@ -163,35 +181,57 @@ class DeepEvalClient:
         self,
         *,
         id: Optional[int] = None,
+        tag: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         if id is not None:
             return self.get_evaluation_testcase_by_id(
                 id, raise_exception=raise_exception
             )
-        if
-            return self.
+        if tag is not None:
+            return self.get_evaluation_testcase_by_tag(
+                tag, raise_exception=raise_exception
             )
-        raise ValueError("Either id or
+        raise ValueError("Either id or tag must be provided")

     def compare_testcases(
         self,
         *,
-        testcase_a_id: int,
+        testcase_a_id: int | None = None,
+        testcase_a_tag: str | None = None,
+        testcase_b_id: int | None = None,
+        testcase_b_tag: str | None = None,
         raise_exception: bool = False,
     ) -> Optional[dict]:
         """
-        Compare two evaluation testcases.
+        Compare two evaluation testcases using IDs or tags.
+        Exactly one identifier (id or tag) must be provided per testcase.
         """
+
+        def validate(id_, tag, name: str):
+            if bool(id_) == bool(tag):
+                raise ValueError(
+                    f"Provide exactly one of {name}_id or {name}_tag"
+                )
+
+        validate(testcase_a_id, testcase_a_tag, "testcase_a")
+        validate(testcase_b_id, testcase_b_tag, "testcase_b")
+
+        params: dict[str, int | str] = {}
+
+        if testcase_a_id is not None:
+            params["testcase_a_id"] = testcase_a_id
+        else:
+            params["testcase_a_tag"] = testcase_a_tag  # type: ignore
+
+        if testcase_b_id is not None:
+            params["testcase_b_id"] = testcase_b_id
+        else:
+            params["testcase_b_tag"] = testcase_b_tag  # type: ignore
+
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare",
-            params=
-                "testcase_a_id": testcase_a_id,
-                "testcase_b_id": testcase_b_id,
-            },
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare",
+            params=params,
             raise_exception=raise_exception,
         )
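A hedged sketch (values illustrative) of the new mixed-identifier comparison: each side takes exactly one of the `_id`/`_tag` pair, and whichever identifiers are chosen become query parameters on the compare endpoint.

    # Compare run 12 against whichever testcase carries the "baseline" tag:
    resp = client.compare_testcases(testcase_a_id=12, testcase_b_tag="baseline")
    # -> GET /eval-framework/deepeval/evaluation-testcases/compare
    #        ?testcase_a_id=12&testcase_b_tag=baseline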
@@ -206,7 +246,7 @@ class DeepEvalClient:
         Compare the latest evaluation testcases for two labels.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-latest",
             params={
                 "label_a": label_a,
                 "label_b": label_b,
@@ -224,7 +264,7 @@ class DeepEvalClient:
         Compare the last two evaluation testcases for a given label.
         """
         return self._get(
-            "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+            endpoint="/eval-framework/deepeval/evaluation-testcases/compare-last-two",
             params={
                 "label": label,
             },
@@ -268,8 +308,8 @@ class DeepEvalClient:
             label=label,
         )

-        return self.
-            "/deepeval/text-eval", config.model_dump(), raise_exception
+        return self._post(
+            endpoint="/deepeval/text-eval", payload=config.model_dump(), raise_exception=raise_exception
         )

     def text_eval_background(
@@ -284,7 +324,7 @@ class DeepEvalClient:
         payload = EvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self.
+        return self._post(endpoint="/deepeval/text-eval/background", payload=payload, raise_exception=raise_exception)

     @overload
     def schema_eval(
@@ -328,10 +368,10 @@ class DeepEvalClient:
             label=label,
         )

-        return self.
-            "/deepeval/schema-eval",
-            config.model_dump(),
-            raise_exception,
+        return self._post(
+            endpoint="/deepeval/schema-eval",
+            payload=config.model_dump(),
+            raise_exception=raise_exception,
         )

     def schema_eval_background(
@@ -346,8 +386,8 @@ class DeepEvalClient:
         payload = SchemaEvalConfig.model_construct(
             data=data, metrics=metrics, component=component, version=label
         ).model_dump()
-        return self.
-            "/deepeval/schema-eval/background", payload, raise_exception
+        return self._post(
+            endpoint="/deepeval/schema-eval/background", payload=payload, raise_exception=raise_exception
         )

     def maybe_text_eval(
{rakam_eval_sdk-0.2.1 → rakam_eval_sdk-0.2.3}/src/rakam_eval_sdk/schema.py

@@ -117,7 +117,6 @@ class SchemaInputItem(InputItem):

 class EvalConfig(BaseModel):
     __eval_config__ = "text_eval"
-    unique_id: Union[str, None] = None
     component: str = "unknown"
     label: Union[str, None] = None
     data: List[TextInputItem]
@@ -127,7 +126,6 @@ class EvalConfig(BaseModel):
 class SchemaEvalConfig(BaseModel):
     __eval_config__ = "schema_eval"
     component: str = "unknown"
-    unique_id: Union[str, None] = None
     label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
The remaining files (README.md, src/rakam_eval_sdk/__init__.py, src/rakam_eval_sdk/decorators.py, src/rakam_eval_sdk/utils/decorator_utils.py) are unchanged between 0.2.1 and 0.2.3.