rakam-eval-sdk 0.2.4__tar.gz → 0.2.4rc2__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/PKG-INFO +1 -1
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/pyproject.toml +1 -1
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/cli.py +32 -27
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/client.py +73 -27
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/schema.py +4 -3
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/README.md +0 -0
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/__init__.py +0 -0
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/decorators.py +0 -0
- {rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/utils/decorator_utils.py +0 -0
{rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/cli.py

```diff
@@ -6,7 +6,7 @@ import uuid
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import typer
 from dotenv import load_dotenv
@@ -37,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
 app.add_typer(metrics_app, name="metrics")
 
 
-def extract_metric_names(config: Any) -> list[tuple[str, str | None]]:
+def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
     """
     Returns [(type, name)] from EvalConfig / SchemaEvalConfig
     """
     if not hasattr(config, "metrics"):
         return []
 
-    results: list[tuple[str, str | None]] = []
+    results: list[tuple[str, Optional[str]]] = []
 
     for metric in config.metrics or []:
         metric_type = getattr(metric, "type", None)
@@ -77,7 +77,7 @@ def metrics(
     files = directory.rglob("*.py") if recursive else directory.glob("*.py")
     TARGET_DECORATOR = eval_run.__name__
 
-    all_metrics: set[tuple[str, str | None]] = set()
+    all_metrics: set[tuple[str, Optional[str]]] = set()
     found_any = False
 
     for file in sorted(files):
@@ -159,7 +159,7 @@ def list_evals(
 def list_runs(
     limit: int = typer.Option(20, help="Max number of runs"),
     offset: int = typer.Option(0, help="Pagination offset"),
-):
+) -> None:
     """
     List evaluation runs (newest first).
     """
@@ -170,7 +170,7 @@ def list_runs(
         offset=offset,
         raise_exception=True,
     )
-
+    assert response is not None
     items = response.get("items", [])
     total = response.get("total", 0)
 
@@ -199,7 +199,8 @@ def list_runs(
     shown = offset + len(items)
     if shown < total:
         typer.echo()
-        typer.echo(f"Showing {shown} of {total} runs. Use --limit to see more.")
+        typer.echo(
+            f"Showing {shown} of {total} runs. Use --limit to see more.")
 
 
 @app.command()
@@ -219,15 +220,15 @@ def show(
         "--raw",
         help="Print raw JSON instead of formatted output",
     ),
-):
+) -> None:
     """
     Show a single evaluation testcase by ID or tag.
     """
     if not id and not tag:
-        raise typer.BadParameter("You must provide either --id or --
+        raise typer.BadParameter("You must provide either --id or --tag")
 
     if id and tag:
-        raise typer.BadParameter("Provide only one of --id or --
+        raise typer.BadParameter("Provide only one of --id or --tag")
 
     client = DeepEvalClient()
 
@@ -235,6 +236,7 @@ def show(
         result = client.get_evaluation_testcase_by_id(id)
         identifier = f"id={id}"
     else:
+        assert tag is not None
         result = client.get_evaluation_testcase_by_tag(tag)
         identifier = f"tag={tag}"
 
@@ -406,7 +408,7 @@ def run(
 def _print_and_save(
     resp: dict,
     pretty: bool,
-    out: Path | None,
+    out: Optional[Path],
     overwrite: bool,
 ) -> None:
     if pretty:
@@ -419,7 +421,8 @@ def _print_and_save(
         return
 
     if out.exists() and not overwrite:
-        typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+        typer.echo(
+            f"❌ File already exists: {out} (use --overwrite to replace)")
         raise typer.Exit(code=1)
 
     out.parent.mkdir(parents=True, exist_ok=True)
@@ -430,13 +433,13 @@ def _print_and_save(
     typer.echo(f"💾 Result saved to {out}")
 
 
-def pct_change(a: float | None, b: float | None) -> str | None:
+def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
     if a is None or b is None or a == 0:
         return None
     return f"{((b - a) / a) * 100:+.2f}%"
 
 
-def metric_direction(delta: float | None) -> str:
+def metric_direction(delta: Optional[float]) -> str:
     if delta is None:
         return "unchanged"
     if delta > 0:
@@ -446,7 +449,7 @@ def metric_direction(delta: float | None) -> str:
     return "unchanged"
 
 
-def print_metric_diff(diff: MetricDiff):
+def print_metric_diff(diff: MetricDiff) -> None:
     secho(f"\nMetric: {diff.metric}", bold=True)
 
     if diff.status == "added":
@@ -507,7 +510,7 @@ def summarize(metrics: Any) -> Dict[str, int]:
     return summary
 
 
-def pretty_print_comparison(resp: Any, summary_only: bool = False):
+def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
     if not summary_only:
         for metric in resp.metrics:
             print_metric_diff(metric)
@@ -516,7 +519,7 @@ def pretty_print_comparison(resp: Any, summary_only: bool = False):
         print_summary(resp.metrics)
 
 
-def print_summary(metrics: Any):
+def print_summary(metrics: Any) -> None:
     summary = summarize(metrics)
 
     secho("\nSummary:", bold=True)
@@ -529,7 +532,7 @@ def print_summary(metrics: Any):
 
 @app.command()
 def compare(
-    tag:
+    tag: List[str] = typer.Option(
         [],
         "--tag",
         help="Label identifying a reference testcase",
@@ -554,7 +557,7 @@ def compare(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path | None = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -575,7 +578,7 @@ def compare(
     - Summary of improvements / regressions
     """
 
-    targets:
+    targets: List[Tuple[str, Union[str, int]]] = []
 
     for r in run:
         targets.append(("run", r))
@@ -600,7 +603,7 @@ def compare(
 
     client = DeepEvalClient()
 
-    kwargs = {"raise_exception": raise_exception}
+    kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
 
     if type_a == "run":
         kwargs["testcase_a_id"] = value_a
@@ -665,7 +668,7 @@ def compare_label_latest(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path | None = typer.Option(
+    out: Optional[Path] = typer.Option(
        None,
         "-o",
         "--out",
@@ -717,7 +720,7 @@ def compare_last(
         "--raise",
         help="Raise HTTP exceptions instead of swallowing them",
     ),
-    out: Path | None = typer.Option(
+    out: Optional[Path] = typer.Option(
         None,
         "-o",
         "--out",
@@ -769,7 +772,7 @@ def tag_command(
         "--delete",
         help="Delete a tag",
     ),
-):
+) -> None:
     """
     Assign a tag to a run or delete a tag.
     """
@@ -786,8 +789,9 @@ def tag_command(
 
     client = DeepEvalClient()
 
-    # --- delete mode ---
     if delete:
+        assert run_id is not None
+
         result = client.update_evaluation_testcase_tag(
             testcase_id=run_id,
             tag=delete,
@@ -796,13 +800,14 @@ def tag_command(
         typer.echo("🗑️ Tag deleted successfully")
         typer.echo(f"Tag: {delete}")
         return
-
-
+    assert run_id is not None
+    assert tag is not None
     result = client.update_evaluation_testcase_tag(
         testcase_id=run_id,
         tag=tag,
         raise_exception=True,
    )
+    assert result is not None
 
     typer.echo("✅ Tag assigned successfully")
     typer.echo(f"Run ID: {run_id}")
```
{rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/client.py

```diff
@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
 
 import requests
 
@@ -49,8 +49,8 @@ class DeepEvalClient:
         method: HTTPMethod,
         endpoint: str,
         *,
-        json: Dict
-        params: Dict
+        json: Optional[Dict] = None,
+        params: Optional[Dict] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         url = f"{self.base_url}{endpoint}"
@@ -89,17 +89,63 @@ class DeepEvalClient:
                 "raw": resp.text,
             }
 
-    def _get(
-
+    def _get(
+        self,
+        endpoint: str,
+        params: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _post(
-
+    def _post(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
+        )
 
-    def _patch(
-
+    def _patch(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "PATCH",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
-    def _delete(
-
+    def _delete(
+        self,
+        endpoint: str,
+        payload: Dict,
+        raise_exception: bool = False,
+        *args: Tuple,
+        **kw: Dict,
+    ) -> Optional[Dict]:
+        return self._request(
+            "DELETE",
+            endpoint,
+            json=payload,
+            raise_exception=raise_exception,
+            *args,
+            **kw,
+        )
 
     def update_evaluation_testcase_tag(
         self,
@@ -197,10 +243,10 @@ class DeepEvalClient:
     def compare_testcases(
         self,
         *,
-        testcase_a_id: int
-        testcase_a_tag: str
-        testcase_b_id: int
-        testcase_b_tag: str
+        testcase_a_id: Optional[int] = None,
+        testcase_a_tag: Optional[str] = None,
+        testcase_b_id: Optional[int] = None,
+        testcase_b_tag: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         """
@@ -208,14 +254,14 @@ class DeepEvalClient:
         Exactly one identifier (id or tag) must be provided per testcase.
         """
 
-        def validate(id_, tag, name: str):
+        def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
             if bool(id_) == bool(tag):
                 raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
 
         validate(testcase_a_id, testcase_a_tag, "testcase_a")
         validate(testcase_b_id, testcase_b_tag, "testcase_b")
 
-        params:
+        params: Dict[str, Union[int, str]] = {}
 
         if testcase_a_id is not None:
             params["testcase_a_id"] = testcase_a_id
@@ -284,18 +330,18 @@ class DeepEvalClient:
         data: List[TextInputItem],
         metrics: List[MetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]: ...
 
     def text_eval(
         self,
-        config: EvalConfig
+        config: Optional[EvalConfig] = None,
         *,
-        data: List[TextInputItem]
-        metrics: List[MetricConfig]
+        data: Optional[List[TextInputItem]] = None,
+        metrics: Optional[List[MetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         if config is None:
@@ -337,7 +383,7 @@ class DeepEvalClient:
         data: List[SchemaInputItem],
         metrics: List[SchemaMetricConfig],
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]: ...
 
@@ -351,12 +397,12 @@ class DeepEvalClient:
 
     def schema_eval(
         self,
-        config: SchemaEvalConfig
+        config: Optional[SchemaEvalConfig] = None,
         *,
-        data: List[SchemaInputItem]
-        metrics: List[SchemaMetricConfig]
+        data: Optional[List[SchemaInputItem]] = None,
+        metrics: Optional[List[SchemaMetricConfig]] = None,
         component: str = "unknown",
-        label: str
+        label: Optional[str] = None,
         raise_exception: bool = False,
     ) -> Optional[Dict]:
         if config is None:
```
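Note: the widened `compare_testcases` signature above accepts either an id or a tag for each side, and the inner `validate` helper rejects calls that pass both or neither. A hedged usage sketch; the identifier values and any client configuration are illustrative assumptions, not taken from this diff:

```python
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()  # base URL / credentials come from the client's own configuration

# Exactly one identifier (id or tag) must be supplied per testcase;
# passing both or neither raises ValueError before any request is made.
result = client.compare_testcases(
    testcase_a_tag="baseline",   # hypothetical tag
    testcase_b_id=42,            # hypothetical testcase id
    raise_exception=True,
)

if result is not None:
    print(result)
```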
{rakam_eval_sdk-0.2.4 → rakam_eval_sdk-0.2.4rc2}/src/rakam_eval_sdk/schema.py

```diff
@@ -1,6 +1,6 @@
 # Common base class for all metric configs
 import sys
-from typing import
+from typing import Any, Dict, List, Literal, Optional, Union
 
 # Base class (you can keep this abstract)
 from pydantic import BaseModel, Field
@@ -94,7 +94,8 @@ MetricConfig = Annotated[
 ]
 
 SchemaMetricConfig = Annotated[
-    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(discriminator="type")
+    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
+        discriminator="type")
 ]
 
 
@@ -107,7 +108,7 @@ class InputItem(BaseModel):
 
 class TextInputItem(InputItem):
     expected_output: Optional[str] = None
-    retrieval_context: Optional[
+    retrieval_context: Optional[List[str]] = None
 
 
 class SchemaInputItem(InputItem):
```
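Note: `SchemaMetricConfig` above is a Pydantic discriminated union, so the `type` field selects which concrete config model is validated. A self-contained sketch of the same pattern, assuming Pydantic v2 and hypothetical stand-in fields (the real `JsonCorrectnessConfig` / `FieldsPresenceConfig` definitions are not shown in this diff):

```python
from typing import Annotated, List, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class JsonCorrectnessConfig(BaseModel):
    # Hypothetical fields; only the Literal "type" discriminator mirrors the SDK.
    type: Literal["json_correctness"] = "json_correctness"
    threshold: float = 0.5


class FieldsPresenceConfig(BaseModel):
    type: Literal["fields_presence"] = "fields_presence"
    required_fields: List[str] = []


SchemaMetricConfig = Annotated[
    Union[JsonCorrectnessConfig, FieldsPresenceConfig],
    Field(discriminator="type"),
]

# The "type" value picks the concrete model during validation.
metric = TypeAdapter(SchemaMetricConfig).validate_python(
    {"type": "fields_presence", "required_fields": ["title", "body"]}
)
print(type(metric).__name__)  # FieldsPresenceConfig
```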
The remaining files are unchanged between the two versions: README.md, src/rakam_eval_sdk/__init__.py, src/rakam_eval_sdk/decorators.py, and src/rakam_eval_sdk/utils/decorator_utils.py (listed as renamed, with no content changes).