rakam-eval-sdk 0.2.4__py3-none-any.whl → 0.2.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +27 -24
- rakam_eval_sdk/client.py +73 -27
- rakam_eval_sdk/schema.py +1 -1
- {rakam_eval_sdk-0.2.4.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/METADATA +1 -1
- rakam_eval_sdk-0.2.4rc1.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.2.4.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.2.4.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.2.4.dist-info → rakam_eval_sdk-0.2.4rc1.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
|
@@ -6,7 +6,7 @@ import uuid
|
|
|
6
6
|
from datetime import datetime
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from pprint import pprint
|
|
9
|
-
from typing import Any, Dict, Optional
|
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
11
|
import typer
|
|
12
12
|
from dotenv import load_dotenv
|
|
@@ -37,14 +37,14 @@ metrics_app = typer.Typer(help="Metrics utilities")
|
|
|
37
37
|
app.add_typer(metrics_app, name="metrics")
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
def extract_metric_names(config: Any) -> list[tuple[str, str
|
|
40
|
+
def extract_metric_names(config: Any) -> list[tuple[str, Optional[str]]]:
|
|
41
41
|
"""
|
|
42
42
|
Returns [(type, name)] from EvalConfig / SchemaEvalConfig
|
|
43
43
|
"""
|
|
44
44
|
if not hasattr(config, "metrics"):
|
|
45
45
|
return []
|
|
46
46
|
|
|
47
|
-
results: list[tuple[str, str
|
|
47
|
+
results: list[tuple[str, Optional[str]]] = []
|
|
48
48
|
|
|
49
49
|
for metric in config.metrics or []:
|
|
50
50
|
metric_type = getattr(metric, "type", None)
|
|
@@ -77,7 +77,7 @@ def metrics(
|
|
|
77
77
|
files = directory.rglob("*.py") if recursive else directory.glob("*.py")
|
|
78
78
|
TARGET_DECORATOR = eval_run.__name__
|
|
79
79
|
|
|
80
|
-
all_metrics: set[tuple[str, str
|
|
80
|
+
all_metrics: set[tuple[str, Optional[str]]] = set()
|
|
81
81
|
found_any = False
|
|
82
82
|
|
|
83
83
|
for file in sorted(files):
|
|
@@ -159,7 +159,7 @@ def list_evals(
|
|
|
159
159
|
def list_runs(
|
|
160
160
|
limit: int = typer.Option(20, help="Max number of runs"),
|
|
161
161
|
offset: int = typer.Option(0, help="Pagination offset"),
|
|
162
|
-
):
|
|
162
|
+
) -> None:
|
|
163
163
|
"""
|
|
164
164
|
List evaluation runs (newest first).
|
|
165
165
|
"""
|
|
@@ -170,7 +170,7 @@ def list_runs(
|
|
|
170
170
|
offset=offset,
|
|
171
171
|
raise_exception=True,
|
|
172
172
|
)
|
|
173
|
-
|
|
173
|
+
assert response is not None
|
|
174
174
|
items = response.get("items", [])
|
|
175
175
|
total = response.get("total", 0)
|
|
176
176
|
|
|
@@ -219,15 +219,15 @@ def show(
|
|
|
219
219
|
"--raw",
|
|
220
220
|
help="Print raw JSON instead of formatted output",
|
|
221
221
|
),
|
|
222
|
-
):
|
|
222
|
+
) -> None:
|
|
223
223
|
"""
|
|
224
224
|
Show a single evaluation testcase by ID or tag.
|
|
225
225
|
"""
|
|
226
226
|
if not id and not tag:
|
|
227
|
-
raise typer.BadParameter("You must provide either --id or --
|
|
227
|
+
raise typer.BadParameter("You must provide either --id or --tag")
|
|
228
228
|
|
|
229
229
|
if id and tag:
|
|
230
|
-
raise typer.BadParameter("Provide only one of --id or --
|
|
230
|
+
raise typer.BadParameter("Provide only one of --id or --tag")
|
|
231
231
|
|
|
232
232
|
client = DeepEvalClient()
|
|
233
233
|
|
|
@@ -235,6 +235,7 @@ def show(
|
|
|
235
235
|
result = client.get_evaluation_testcase_by_id(id)
|
|
236
236
|
identifier = f"id={id}"
|
|
237
237
|
else:
|
|
238
|
+
assert tag is not None
|
|
238
239
|
result = client.get_evaluation_testcase_by_tag(tag)
|
|
239
240
|
identifier = f"tag={tag}"
|
|
240
241
|
|
|
@@ -406,7 +407,7 @@ def run(
|
|
|
406
407
|
def _print_and_save(
|
|
407
408
|
resp: dict,
|
|
408
409
|
pretty: bool,
|
|
409
|
-
out: Path
|
|
410
|
+
out: Optional[Path],
|
|
410
411
|
overwrite: bool,
|
|
411
412
|
) -> None:
|
|
412
413
|
if pretty:
|
|
@@ -430,13 +431,13 @@ def _print_and_save(
|
|
|
430
431
|
typer.echo(f"💾 Result saved to {out}")
|
|
431
432
|
|
|
432
433
|
|
|
433
|
-
def pct_change(a: float
|
|
434
|
+
def pct_change(a: Optional[float], b: Optional[float]) -> Optional[str]:
|
|
434
435
|
if a is None or b is None or a == 0:
|
|
435
436
|
return None
|
|
436
437
|
return f"{((b - a) / a) * 100:+.2f}%"
|
|
437
438
|
|
|
438
439
|
|
|
439
|
-
def metric_direction(delta: float | None) -> str:
|
|
440
|
+
def metric_direction(delta: Optional[float]) -> str:
|
|
440
441
|
if delta is None:
|
|
441
442
|
return "unchanged"
|
|
442
443
|
if delta > 0:
|
|
@@ -446,7 +447,7 @@ def metric_direction(delta: float | None) -> str:
|
|
|
446
447
|
return "unchanged"
|
|
447
448
|
|
|
448
449
|
|
|
449
|
-
def print_metric_diff(diff: MetricDiff):
|
|
450
|
+
def print_metric_diff(diff: MetricDiff) -> None:
|
|
450
451
|
secho(f"\nMetric: {diff.metric}", bold=True)
|
|
451
452
|
|
|
452
453
|
if diff.status == "added":
|
|
@@ -507,7 +508,7 @@ def summarize(metrics: Any) -> Dict[str, int]:
|
|
|
507
508
|
return summary
|
|
508
509
|
|
|
509
510
|
|
|
510
|
-
def pretty_print_comparison(resp: Any, summary_only: bool = False):
|
|
511
|
+
def pretty_print_comparison(resp: Any, summary_only: bool = False) -> None:
|
|
511
512
|
if not summary_only:
|
|
512
513
|
for metric in resp.metrics:
|
|
513
514
|
print_metric_diff(metric)
|
|
@@ -516,7 +517,7 @@ def pretty_print_comparison(resp: Any, summary_only: bool = False):
|
|
|
516
517
|
print_summary(resp.metrics)
|
|
517
518
|
|
|
518
519
|
|
|
519
|
-
def print_summary(metrics: Any):
|
|
520
|
+
def print_summary(metrics: Any) -> None:
|
|
520
521
|
summary = summarize(metrics)
|
|
521
522
|
|
|
522
523
|
secho("\nSummary:", bold=True)
|
|
@@ -554,7 +555,7 @@ def compare(
|
|
|
554
555
|
"--raise",
|
|
555
556
|
help="Raise HTTP exceptions instead of swallowing them",
|
|
556
557
|
),
|
|
557
|
-
out: Path
|
|
558
|
+
out: Optional[Path] = typer.Option(
|
|
558
559
|
None,
|
|
559
560
|
"-o",
|
|
560
561
|
"--out",
|
|
@@ -575,7 +576,7 @@ def compare(
|
|
|
575
576
|
- Summary of improvements / regressions
|
|
576
577
|
"""
|
|
577
578
|
|
|
578
|
-
targets:
|
|
579
|
+
targets: List[Tuple[str, Union[str, int]]] = []
|
|
579
580
|
|
|
580
581
|
for r in run:
|
|
581
582
|
targets.append(("run", r))
|
|
@@ -600,7 +601,7 @@ def compare(
|
|
|
600
601
|
|
|
601
602
|
client = DeepEvalClient()
|
|
602
603
|
|
|
603
|
-
kwargs = {"raise_exception": raise_exception}
|
|
604
|
+
kwargs: Dict[str, Any] = {"raise_exception": raise_exception}
|
|
604
605
|
|
|
605
606
|
if type_a == "run":
|
|
606
607
|
kwargs["testcase_a_id"] = value_a
|
|
@@ -665,7 +666,7 @@ def compare_label_latest(
|
|
|
665
666
|
"--raise",
|
|
666
667
|
help="Raise HTTP exceptions instead of swallowing them",
|
|
667
668
|
),
|
|
668
|
-
out: Path
|
|
669
|
+
out: Optional[Path] = typer.Option(
|
|
669
670
|
None,
|
|
670
671
|
"-o",
|
|
671
672
|
"--out",
|
|
@@ -717,7 +718,7 @@ def compare_last(
|
|
|
717
718
|
"--raise",
|
|
718
719
|
help="Raise HTTP exceptions instead of swallowing them",
|
|
719
720
|
),
|
|
720
|
-
out: Path
|
|
721
|
+
out: Optional[Path] = typer.Option(
|
|
721
722
|
None,
|
|
722
723
|
"-o",
|
|
723
724
|
"--out",
|
|
@@ -769,7 +770,7 @@ def tag_command(
|
|
|
769
770
|
"--delete",
|
|
770
771
|
help="Delete a tag",
|
|
771
772
|
),
|
|
772
|
-
):
|
|
773
|
+
) -> None:
|
|
773
774
|
"""
|
|
774
775
|
Assign a tag to a run or delete a tag.
|
|
775
776
|
"""
|
|
@@ -786,8 +787,9 @@ def tag_command(
|
|
|
786
787
|
|
|
787
788
|
client = DeepEvalClient()
|
|
788
789
|
|
|
789
|
-
# --- delete mode ---
|
|
790
790
|
if delete:
|
|
791
|
+
assert run_id is not None
|
|
792
|
+
|
|
791
793
|
result = client.update_evaluation_testcase_tag(
|
|
792
794
|
testcase_id=run_id,
|
|
793
795
|
tag=delete,
|
|
@@ -796,13 +798,14 @@ def tag_command(
|
|
|
796
798
|
typer.echo("🗑️ Tag deleted successfully")
|
|
797
799
|
typer.echo(f"Tag: {delete}")
|
|
798
800
|
return
|
|
799
|
-
|
|
800
|
-
|
|
801
|
+
assert run_id is not None
|
|
802
|
+
assert tag is not None
|
|
801
803
|
result = client.update_evaluation_testcase_tag(
|
|
802
804
|
testcase_id=run_id,
|
|
803
805
|
tag=tag,
|
|
804
806
|
raise_exception=True,
|
|
805
807
|
)
|
|
808
|
+
assert result is not None
|
|
806
809
|
|
|
807
810
|
typer.echo("✅ Tag assigned successfully")
|
|
808
811
|
typer.echo(f"Run ID: {run_id}")
|
rakam_eval_sdk/client.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import random
|
|
3
|
-
from typing import Any, Dict, List, Literal, Optional, Union, cast, overload
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
6
|
|
|
@@ -49,8 +49,8 @@ class DeepEvalClient:
|
|
|
49
49
|
method: HTTPMethod,
|
|
50
50
|
endpoint: str,
|
|
51
51
|
*,
|
|
52
|
-
json: Dict
|
|
53
|
-
params: Dict
|
|
52
|
+
json: Optional[Dict] = None,
|
|
53
|
+
params: Optional[Dict] = None,
|
|
54
54
|
raise_exception: bool = False,
|
|
55
55
|
) -> Optional[Dict]:
|
|
56
56
|
url = f"{self.base_url}{endpoint}"
|
|
@@ -89,17 +89,63 @@ class DeepEvalClient:
|
|
|
89
89
|
"raw": resp.text,
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
-
def _get(
|
|
93
|
-
|
|
92
|
+
def _get(
|
|
93
|
+
self,
|
|
94
|
+
endpoint: str,
|
|
95
|
+
params: Dict,
|
|
96
|
+
raise_exception: bool = False,
|
|
97
|
+
*args: Tuple,
|
|
98
|
+
**kw: Dict,
|
|
99
|
+
) -> Optional[Dict]:
|
|
100
|
+
return self._request(
|
|
101
|
+
"GET", endpoint, params=params, raise_exception=raise_exception, *args, **kw
|
|
102
|
+
)
|
|
94
103
|
|
|
95
|
-
def _post(
|
|
96
|
-
|
|
104
|
+
def _post(
|
|
105
|
+
self,
|
|
106
|
+
endpoint: str,
|
|
107
|
+
payload: Dict,
|
|
108
|
+
raise_exception: bool = False,
|
|
109
|
+
*args: Tuple,
|
|
110
|
+
**kw: Dict,
|
|
111
|
+
) -> Optional[Dict]:
|
|
112
|
+
return self._request(
|
|
113
|
+
"POST", endpoint, json=payload, raise_exception=raise_exception, *args, **kw
|
|
114
|
+
)
|
|
97
115
|
|
|
98
|
-
def _patch(
|
|
99
|
-
|
|
116
|
+
def _patch(
|
|
117
|
+
self,
|
|
118
|
+
endpoint: str,
|
|
119
|
+
payload: Dict,
|
|
120
|
+
raise_exception: bool = False,
|
|
121
|
+
*args: Tuple,
|
|
122
|
+
**kw: Dict,
|
|
123
|
+
) -> Optional[Dict]:
|
|
124
|
+
return self._request(
|
|
125
|
+
"PATCH",
|
|
126
|
+
endpoint,
|
|
127
|
+
json=payload,
|
|
128
|
+
raise_exception=raise_exception,
|
|
129
|
+
*args,
|
|
130
|
+
**kw,
|
|
131
|
+
)
|
|
100
132
|
|
|
101
|
-
def _delete(
|
|
102
|
-
|
|
133
|
+
def _delete(
|
|
134
|
+
self,
|
|
135
|
+
endpoint: str,
|
|
136
|
+
payload: Dict,
|
|
137
|
+
raise_exception: bool = False,
|
|
138
|
+
*args: Tuple,
|
|
139
|
+
**kw: Dict,
|
|
140
|
+
) -> Optional[Dict]:
|
|
141
|
+
return self._request(
|
|
142
|
+
"DELETE",
|
|
143
|
+
endpoint,
|
|
144
|
+
json=payload,
|
|
145
|
+
raise_exception=raise_exception,
|
|
146
|
+
*args,
|
|
147
|
+
**kw,
|
|
148
|
+
)
|
|
103
149
|
|
|
104
150
|
def update_evaluation_testcase_tag(
|
|
105
151
|
self,
|
|
@@ -197,10 +243,10 @@ class DeepEvalClient:
|
|
|
197
243
|
def compare_testcases(
|
|
198
244
|
self,
|
|
199
245
|
*,
|
|
200
|
-
testcase_a_id: int
|
|
201
|
-
testcase_a_tag: str
|
|
202
|
-
testcase_b_id: int
|
|
203
|
-
testcase_b_tag: str
|
|
246
|
+
testcase_a_id: Optional[int] = None,
|
|
247
|
+
testcase_a_tag: Optional[str] = None,
|
|
248
|
+
testcase_b_id: Optional[int] = None,
|
|
249
|
+
testcase_b_tag: Optional[str] = None,
|
|
204
250
|
raise_exception: bool = False,
|
|
205
251
|
) -> Optional[Dict]:
|
|
206
252
|
"""
|
|
@@ -208,14 +254,14 @@ class DeepEvalClient:
|
|
|
208
254
|
Exactly one identifier (id or tag) must be provided per testcase.
|
|
209
255
|
"""
|
|
210
256
|
|
|
211
|
-
def validate(id_, tag, name: str):
|
|
257
|
+
def validate(id_: Optional[int], tag: Optional[str], name: str) -> None:
|
|
212
258
|
if bool(id_) == bool(tag):
|
|
213
259
|
raise ValueError(f"Provide exactly one of {name}_id or {name}_tag")
|
|
214
260
|
|
|
215
261
|
validate(testcase_a_id, testcase_a_tag, "testcase_a")
|
|
216
262
|
validate(testcase_b_id, testcase_b_tag, "testcase_b")
|
|
217
263
|
|
|
218
|
-
params:
|
|
264
|
+
params: Dict[str, Union[int, str]] = {}
|
|
219
265
|
|
|
220
266
|
if testcase_a_id is not None:
|
|
221
267
|
params["testcase_a_id"] = testcase_a_id
|
|
@@ -284,18 +330,18 @@ class DeepEvalClient:
|
|
|
284
330
|
data: List[TextInputItem],
|
|
285
331
|
metrics: List[MetricConfig],
|
|
286
332
|
component: str = "unknown",
|
|
287
|
-
label: str
|
|
333
|
+
label: Optional[str] = None,
|
|
288
334
|
raise_exception: bool = False,
|
|
289
335
|
) -> Optional[Dict]: ...
|
|
290
336
|
|
|
291
337
|
def text_eval(
|
|
292
338
|
self,
|
|
293
|
-
config: EvalConfig
|
|
339
|
+
config: Optional[EvalConfig] = None,
|
|
294
340
|
*,
|
|
295
|
-
data: List[TextInputItem]
|
|
296
|
-
metrics: List[MetricConfig]
|
|
341
|
+
data: Optional[List[TextInputItem]] = None,
|
|
342
|
+
metrics: Optional[List[MetricConfig]] = None,
|
|
297
343
|
component: str = "unknown",
|
|
298
|
-
label: str
|
|
344
|
+
label: Optional[str] = None,
|
|
299
345
|
raise_exception: bool = False,
|
|
300
346
|
) -> Optional[Dict]:
|
|
301
347
|
if config is None:
|
|
@@ -337,7 +383,7 @@ class DeepEvalClient:
|
|
|
337
383
|
data: List[SchemaInputItem],
|
|
338
384
|
metrics: List[SchemaMetricConfig],
|
|
339
385
|
component: str = "unknown",
|
|
340
|
-
label: str
|
|
386
|
+
label: Optional[str] = None,
|
|
341
387
|
raise_exception: bool = False,
|
|
342
388
|
) -> Optional[Dict]: ...
|
|
343
389
|
|
|
@@ -351,12 +397,12 @@ class DeepEvalClient:
|
|
|
351
397
|
|
|
352
398
|
def schema_eval(
|
|
353
399
|
self,
|
|
354
|
-
config: SchemaEvalConfig
|
|
400
|
+
config: Optional[SchemaEvalConfig] = None,
|
|
355
401
|
*,
|
|
356
|
-
data: List[SchemaInputItem]
|
|
357
|
-
metrics: List[SchemaMetricConfig]
|
|
402
|
+
data: Optional[List[SchemaInputItem]] = None,
|
|
403
|
+
metrics: Optional[List[SchemaMetricConfig]] = None,
|
|
358
404
|
component: str = "unknown",
|
|
359
|
-
label: str
|
|
405
|
+
label: Optional[str] = None,
|
|
360
406
|
raise_exception: bool = False,
|
|
361
407
|
) -> Optional[Dict]:
|
|
362
408
|
if config is None:
|
rakam_eval_sdk/schema.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Common base class for all metric configs
|
|
2
2
|
import sys
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
4
4
|
|
|
5
5
|
# Base class (you can keep this abstract)
|
|
6
6
|
from pydantic import BaseModel, Field
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
rakam_eval_sdk/cli.py,sha256=HjymI0UTKRgJb-ewDuINuTtH605MkcB9nD0qNlbdFyc,21905
|
|
3
|
+
rakam_eval_sdk/client.py,sha256=8KwpNt7WHgMfcXAQtrAlnUihai9TiADRieiuWeIxg6E,15575
|
|
4
|
+
rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
|
|
5
|
+
rakam_eval_sdk/schema.py,sha256=DT-uQsE3XB7AAii68QACwIslLqXRQFb1c6efkqLQysI,3960
|
|
6
|
+
rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
|
|
7
|
+
rakam_eval_sdk-0.2.4rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
8
|
+
rakam_eval_sdk-0.2.4rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
|
|
9
|
+
rakam_eval_sdk-0.2.4rc1.dist-info/METADATA,sha256=Sy2Ghp5wcrmS7AzBkhp8C8LVb0dTS7tSNtgEAJCfbU8,6022
|
|
10
|
+
rakam_eval_sdk-0.2.4rc1.dist-info/RECORD,,
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
rakam_eval_sdk/cli.py,sha256=IvC45LyALCCoxZ7ZNifVCbPs6UveGtUCfjr1q9e1A_Y,21658
|
|
3
|
-
rakam_eval_sdk/client.py,sha256=7spK7P3h5uiA3Q3erOHOwf04N1_RT1pysWyt_E-cz0Q,14666
|
|
4
|
-
rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
|
|
5
|
-
rakam_eval_sdk/schema.py,sha256=YfZgHtPh41xFYdQTsbS0OSvNdI8yIv5zoh76v21hemo,3971
|
|
6
|
-
rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
|
|
7
|
-
rakam_eval_sdk-0.2.4.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
8
|
-
rakam_eval_sdk-0.2.4.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
|
|
9
|
-
rakam_eval_sdk-0.2.4.dist-info/METADATA,sha256=XOTmtgO7NiG8NuLn4B_yJujUJONSJCvwGQ9ak3zIhv4,6019
|
|
10
|
-
rakam_eval_sdk-0.2.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|