rakam-eval-sdk 0.1.16rc1__py3-none-any.whl → 0.2.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_eval_sdk/cli.py +426 -18
- rakam_eval_sdk/client.py +252 -26
- rakam_eval_sdk/decorators.py +66 -40
- rakam_eval_sdk/schema.py +9 -4
- rakam_eval_sdk/utils/decorator_utils.py +3 -14
- {rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/METADATA +2 -1
- rakam_eval_sdk-0.2.0rc1.dist-info/RECORD +10 -0
- rakam_eval_sdk-0.1.16rc1.dist-info/RECORD +0 -10
- {rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/WHEEL +0 -0
- {rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/entry_points.txt +0 -0
rakam_eval_sdk/cli.py
CHANGED
@@ -1,15 +1,38 @@
 # cli.py
+import json
+import os
+import sys
+import uuid
+from datetime import datetime
 from pathlib import Path
+from pprint import pprint
+from typing import Any, Optional

 import typer
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.panel import Panel
+from rich.pretty import Pretty

-from rakam_eval_sdk.
+from rakam_eval_sdk.client import DeepEvalClient
 from rakam_eval_sdk.decorators import eval_run
+from rakam_eval_sdk.utils.decorator_utils import (
+    find_decorated_functions,
+    load_module_from_path,
+)
+
+load_dotenv()
 app = typer.Typer(help="CLI tools for evaluation utilities")
+console = Console()
+
+# add root of the project to sys.path
+PROJECT_ROOT = os.path.abspath(".")
+if PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, PROJECT_ROOT)


 @app.command()
-def 
+def list(
     directory: Path = typer.Argument(
         Path("./eval"),
         exists=True,
@@ -23,16 +46,12 @@ def find_eval_run_by_name(
         "-r",
         help="Recursively search for Python files",
     ),
-):
+) -> None:
     """
     Find functions decorated with @track.
     """
     TARGET_DECORATOR = eval_run.__name__
-    files = (
-        directory.rglob("*.py")
-        if recursive
-        else directory.glob("*.py")
-    )
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")

     found = False

@@ -46,8 +65,157 @@ def find_eval_run_by_name(
         typer.echo(f"No @{TARGET_DECORATOR} functions found.")


-
-
+list_app = typer.Typer(help="List resources")
+app.add_typer(list_app, name="list")
+
+
+@list_app.command("runs")
+def list_runs(
+    limit: int = typer.Option(20, help="Max number of runs"),
+    offset: int = typer.Option(0, help="Pagination offset"),
+    status: Optional[str] = typer.Option(
+        None, help="Filter by status (running, completed, failed)"
+    ),
+):
+    """
+    List evaluation runs (newest first).
+    """
+    client = DeepEvalClient()
+
+    runs = client.list_evaluation_testcases(
+        limit=limit,
+        offset=offset,
+        raise_exception=True,
+    )
+
+    if not runs:
+        typer.echo("No evaluation runs found.")
+        return
+
+    # optional status filtering (client-side for now)
+    if status:
+        runs = [
+            r for r in runs
+            if r.get("result", {}).get("status") == status
+        ]
+
+    typer.echo(
+        f"[id] "
+        f"{'unique_id':<20}"
+        f"{'label':<20}"
+        f"created_at"
+    )
+    # pretty CLI output
+    for run in runs:
+        run_id = run.get("id")
+        label = run.get("label") or "-"
+        uid = run.get("unique_id") or "-"
+        created_at = run.get("created_at")
+
+        if created_at:
+            try:
+                created_at = datetime.fromisoformat(created_at).strftime(
+                    "%Y-%m-%d %H:%M:%S"
+                )
+            except ValueError:
+                pass
+
+        typer.echo(
+            f"[{run_id}] "
+            f"{uid:<20} "
+            f"{label:<20} "
+            f"{created_at}"
+        )
+
+
+@list_app.command("show")
+def show_testcase(
+    id: Optional[int] = typer.Option(
+        None,
+        "--id",
+        help="Numeric evaluation testcase ID",
+    ),
+    uid: Optional[str] = typer.Option(
+        None,
+        "--uid",
+        help="Evaluation testcase unique_id",
+    ),
+    raw: bool = typer.Option(
+        False,
+        "--raw",
+        help="Print raw JSON instead of formatted output",
+    ),
+):
+    """
+    Show a single evaluation testcase by ID or unique_id.
+    """
+    if not id and not uid:
+        raise typer.BadParameter("You must provide either --id or --uid")
+
+    if id and uid:
+        raise typer.BadParameter("Provide only one of --id or --uid")
+
+    client = DeepEvalClient()
+
+    if id:
+        result = client.get_evaluation_testcase_by_id(id)
+        identifier = f"id={id}"
+    else:
+        result = client.get_evaluation_testcase_by_unique_id(uid)
+        identifier = f"unique_id={uid}"
+
+    if not result:
+        console.print(
+            Panel(
+                f"No response received for {identifier}",
+                title="Error",
+                style="red",
+            )
+        )
+        raise typer.Exit(code=1)
+
+    if isinstance(result, dict) and result.get("error"):
+        console.print(
+            Panel(
+                result["error"],
+                title="Error",
+                style="red",
+            )
+        )
+        raise typer.Exit(code=1)
+
+    if raw:
+        console.print(Pretty(result))
+        raise typer.Exit()
+
+    console.print(
+        Panel.fit(
+            Pretty(result),
+            title="Evaluation TestCase",
+            subtitle=identifier,
+        )
+    )
+
+
+def validate_eval_result(result: Any, fn_name: str) -> str:
+    eval_config = getattr(result, "__eval_config__", None)
+
+    if not isinstance(eval_config, str):
+        expected = "EvalConfig or SchemaEvalConfig"
+        actual = type(result).__name__
+
+        typer.echo(
+            f" ❌ Invalid return type from `{fn_name}`\n"
+            f" Expected: {expected}\n"
+            f" Got: {actual}"
+        )
+        return ""
+
+    return eval_config
+
+
+@app.command()
+def run(
     directory: Path = typer.Argument(
         Path("./eval"),
         exists=True,
@@ -66,19 +234,28 @@ def run_eval_runs(
         "--dry-run",
         help="Only list functions without executing them",
     ),
-
+    save_runs: bool = typer.Option(
+        False,
+        "--save-runs",
+        help="Save each evaluation run result to a JSON file",
+    ),
+    output_dir: Path = typer.Option(
+        Path("./eval_runs"),
+        "--output-dir",
+        help="Directory where run results are saved",
+    ),
+) -> None:
     """
     Find and execute all functions decorated with @eval_run.
     """
-    files = (
-        directory.rglob("*.py")
-        if recursive
-        else directory.glob("*.py")
-    )
+    files = directory.rglob("*.py") if recursive else directory.glob("*.py")
     TARGET_DECORATOR = eval_run.__name__

     executed_any = False
+    if save_runs and not dry_run:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
     for file in sorted(files):
         functions = find_decorated_functions(file, TARGET_DECORATOR)
         if not functions:
@@ -102,8 +279,49 @@ def run_eval_runs(

         try:
             func = getattr(module, fn_name)
-            func()
+            result = func()
+
+            eval_type = validate_eval_result(result, fn_name)
+            if not eval_type:
+                continue
+
+            client = DeepEvalClient()
+
+            if eval_type == "text_eval":
+                resp = client.text_eval(config=result)
+            else:
+                resp = client.schema_eval(config=result)
+
+            typer.echo(f"{resp}")
             executed_any = True
+            typer.echo(f" ✅ Returned {type(result).__name__}")
+
+            if save_runs:
+                run_id = (
+                    resp["id"]
+                    if resp is not None and "id" in resp
+                    else uuid.uuid4().hex[:8]
+                )
+
+                output_path = output_dir / f"run_{fn_name}_{run_id}.json"
+
+                def to_json_safe(obj: Any) -> Any:
+                    if hasattr(obj, "model_dump"):
+                        return obj.model_dump()
+                    if hasattr(obj, "dict"):
+                        return obj.dict()
+                    return obj
+
+                with output_path.open("w", encoding="utf-8") as f:
+                    json.dump(
+                        to_json_safe(resp),
+                        f,
+                        indent=2,
+                        ensure_ascii=False,
+                    )
+
+                typer.echo(f" 💾 Saved run → {output_path}")
+
         except Exception as e:
             typer.echo(f" ❌ Execution failed: {e}")

@@ -111,7 +329,197 @@ def run_eval_runs(
         typer.echo("\nNo @eval_run functions executed.")


-def 
+def _print_and_save(
+    resp: dict,
+    pretty: bool,
+    out: Path | None,
+    overwrite: bool,
+) -> None:
+    if pretty:
+        typer.echo(typer.style("📊 Result:", bold=True))
+        pprint(resp)
+    else:
+        typer.echo(resp)
+
+    if out is None:
+        return
+
+    if out.exists() and not overwrite:
+        typer.echo(
+            f"❌ File already exists: {out} (use --overwrite to replace)")
+        raise typer.Exit(code=1)
+
+    out.parent.mkdir(parents=True, exist_ok=True)
+
+    with out.open("w", encoding="utf-8") as f:
+        json.dump(resp, f, indent=2, ensure_ascii=False)
+
+    typer.echo(f"💾 Result saved to {out}")
+
+
+@app.command()
+def compare_testcases(
+    testcase_a_id: int = typer.Argument(
+        ...,
+        help="ID of the first testcase",
+    ),
+    testcase_b_id: int = typer.Argument(
+        ...,
+        help="ID of the second testcase",
+    ),
+    pretty: bool = typer.Option(
+        True,
+        "--pretty/--raw",
+        help="Pretty-print the response",
+    ),
+    raise_exception: bool = typer.Option(
+        False,
+        "--raise",
+        help="Raise HTTP exceptions instead of swallowing them",
+    ),
+    out: Path | None = typer.Option(
+        None,
+        "-o",
+        "--out",
+        help="Optional file path to save the result as JSON",
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        help="Overwrite output file if it already exists",
+    ),
+) -> None:
+    """
+    Compare two DeepEval evaluation testcases.
+    """
+    client = DeepEvalClient()
+
+    typer.echo(f"🔍 Comparing testcases {testcase_a_id} ↔ {testcase_b_id}")
+
+    try:
+        resp = client.compare_testcases(
+            testcase_a_id=testcase_a_id,
+            testcase_b_id=testcase_b_id,
+            raise_exception=raise_exception,
+        )
+    except Exception as e:
+        typer.echo(f"❌ Request failed: {e}")
+        raise typer.Exit(code=1)
+
+    if not resp:
+        typer.echo("⚠️ No response received")
+        raise typer.Exit(code=1)
+    _print_and_save(resp, pretty, out, overwrite)
+
+
+@app.command()
+def compare_label_latest(
+    label_a: str = typer.Argument(
+        ...,
+        help="First label (latest run will be used)",
+    ),
+    label_b: str = typer.Argument(
+        ...,
+        help="Second label (latest run will be used)",
+    ),
+    pretty: bool = typer.Option(
+        True,
+        "--pretty/--raw",
+        help="Pretty-print the response",
+    ),
+    raise_exception: bool = typer.Option(
+        False,
+        "--raise",
+        help="Raise HTTP exceptions instead of swallowing them",
+    ),
+    out: Path | None = typer.Option(
+        None,
+        "-o",
+        "--out",
+        help="Optional file path to save the result as JSON",
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        help="Overwrite output file if it already exists",
+    ),
+) -> None:
+    """
+    Compare the latest evaluation runs for two labels.
+    """
+    client = DeepEvalClient()
+
+    typer.echo(f"🔍 Comparing latest runs: '{label_a}' ↔ '{label_b}'")
+
+    try:
+        resp = client.compare_latest_by_labels(
+            label_a=label_a,
+            label_b=label_b,
+            raise_exception=raise_exception,
+        )
+    except Exception as e:
+        typer.echo(f"❌ Request failed: {e}")
+        raise typer.Exit(code=1)
+
+    if not resp:
+        typer.echo("⚠️ No response received")
+        raise typer.Exit(code=1)
+
+    _print_and_save(resp, pretty, out, overwrite)
+
+
+@app.command()
+def compare_last(
+    label: str = typer.Argument(
+        ...,
+        help="Label whose last two runs will be compared",
+    ),
+    pretty: bool = typer.Option(
+        True,
+        "--pretty/--raw",
+        help="Pretty-print the response",
+    ),
+    raise_exception: bool = typer.Option(
+        False,
+        "--raise",
+        help="Raise HTTP exceptions instead of swallowing them",
+    ),
+    out: Path | None = typer.Option(
+        None,
+        "-o",
+        "--out",
+        help="Optional file path to save the result as JSON",
+    ),
+    overwrite: bool = typer.Option(
+        False,
+        "--overwrite",
+        help="Overwrite output file if it already exists",
+    ),
+) -> None:
+    """
+    Compare the last two evaluation runs of a label.
+    """
+    client = DeepEvalClient()
+
+    typer.echo(f"🔍 Comparing last two runs for label '{label}'")
+
+    try:
+        resp = client.compare_last_two_by_label(
+            label=label,
+            raise_exception=raise_exception,
+        )
+    except Exception as e:
+        typer.echo(f"❌ Request failed: {e}")
+        raise typer.Exit(code=1)
+
+    if not resp:
+        typer.echo("⚠️ No response received")
+        raise typer.Exit(code=1)
+
+    _print_and_save(resp, pretty, out, overwrite)
+
+
+def main() -> None:
     app()

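For orientation, a minimal sketch of exercising the reworked command surface in-process with Typer's test runner. It is not part of the package: it assumes a reachable eval backend and an API token in the environment (the client falls back to EVALFRAMWORK_API_KEY, spelled as in the source), and the testcase id and label used here are placeholders.

```python
# Sketch only: drive the new CLI commands without installing a console script.
from typer.testing import CliRunner

from rakam_eval_sdk.cli import app

runner = CliRunner()

# Execute every @eval_run function under ./eval and persist each backend response.
print(runner.invoke(app, ["run", "./eval", "--save-runs", "--output-dir", "./eval_runs"]).output)

# Newest-first runs for the current token, then one testcase in detail.
print(runner.invoke(app, ["list", "runs", "--limit", "5"]).output)
print(runner.invoke(app, ["list", "show", "--id", "42"]).output)  # hypothetical id

# Compare the last two runs that share a label.
print(runner.invoke(app, ["compare-last", "nightly"]).output)  # hypothetical label
```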
rakam_eval_sdk/client.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 import random
-from typing import Any, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast, overload

 import requests

@@ -38,7 +38,8 @@ class DeepEvalClient:
         )
         self.base_url = raw_url.rstrip("/")
         self.api_token = (
-            api_token or settings_token or os.getenv(
+            api_token or settings_token or os.getenv(
+                "EVALFRAMWORK_API_KEY", "")
         )
         self.timeout = timeout

@@ -74,19 +75,202 @@ class DeepEvalClient:
                 raise
             return {"error": "Invalid JSON response", "raw": resp.text}

+    def _get(
+        self,
+        endpoint: str,
+        params: dict,
+        raise_exception: bool = False,
+    ) -> Optional[dict]:
+        """Internal helper to send GET requests with standard headers and error handling."""
+        url = f"{self.base_url}{endpoint}"
+        headers = {
+            "accept": "application/json",
+            "X-API-Token": self.api_token,
+        }
+
+        try:
+            resp = requests.get(
+                url,
+                headers=headers,
+                params=params,
+                timeout=self.timeout,
+            )
+            if raise_exception:
+                resp.raise_for_status()
+        except requests.RequestException as e:
+            if raise_exception:
+                raise
+            return {"error": str(e)}
+
+        try:
+            return cast(dict, resp.json())
+        except ValueError:
+            if raise_exception:
+                raise
+            return {"error": "Invalid JSON response", "raw": resp.text}
+
+    def list_evaluation_testcases(
+        self,
+        *,
+        limit: int = 10,
+        offset: int = 0,
+        raise_exception: bool = False,
+    ) -> Optional[List[Dict]]:
+        """
+        List evaluation testcases for the current API token only.
+        Sorted by created_at DESC (newest first).
+        """
+        return self._get(
+            "/eval-framework/deepeval/evaluation-testcases/token",
+            params={
+                "limit": limit,
+                "offset": offset,
+            },
+            raise_exception=raise_exception,
+        )
+
+    def get_evaluation_testcase_by_id(
+        self,
+        testcase_id: int,
+        *,
+        raise_exception: bool = False,
+    ) -> Optional[Dict]:
+        """
+        Fetch a single evaluation testcase by numeric ID.
+        """
+        return self._get(
+            f"/eval-framework/deepeval/id/{testcase_id}",
+            params={},
+            raise_exception=raise_exception,
+        )
+
+    def get_evaluation_testcase_by_unique_id(
+        self,
+        unique_id: str,
+        *,
+        raise_exception: bool = False,
+    ) -> Optional[Dict]:
+        """
+        Fetch a single evaluation testcase by unique_id.
+        """
+        return self._get(
+            f"/eval-framework/deepeval/uid/{unique_id}",
+            params={},
+            raise_exception=raise_exception,
+        )
+
+    def get_evaluation_testcase(
+        self,
+        *,
+        id: Optional[int] = None,
+        unique_id: Optional[str] = None,
+        raise_exception: bool = False,
+    ) -> Optional[Dict]:
+        if id is not None:
+            return self.get_evaluation_testcase_by_id(
+                id, raise_exception=raise_exception
+            )
+        if unique_id is not None:
+            return self.get_evaluation_testcase_by_unique_id(
+                unique_id, raise_exception=raise_exception
+            )
+        raise ValueError("Either id or unique_id must be provided")
+
+    def compare_testcases(
+        self,
+        *,
+        testcase_a_id: int,
+        testcase_b_id: int,
+        raise_exception: bool = False,
+    ) -> Optional[dict]:
+        """
+        Compare two evaluation testcases.
+        """
+        return self._get(
+            "/eval-framework/deepeval/evaluation-testcases/compare",
+            params={
+                "testcase_a_id": testcase_a_id,
+                "testcase_b_id": testcase_b_id,
+            },
+            raise_exception=raise_exception,
+        )
+
+    def compare_latest_by_labels(
+        self,
+        *,
+        label_a: str,
+        label_b: str,
+        raise_exception: bool = False,
+    ) -> Optional[dict]:
+        """
+        Compare the latest evaluation testcases for two labels.
+        """
+        return self._get(
+            "/eval-framework/deepeval/evaluation-testcases/compare-latest",
+            params={
+                "label_a": label_a,
+                "label_b": label_b,
+            },
+            raise_exception=raise_exception,
+        )
+
+    def compare_last_two_by_label(
+        self,
+        *,
+        label: str,
+        raise_exception: bool = False,
+    ) -> Optional[dict]:
+        """
+        Compare the last two evaluation testcases for a given label.
+        """
+        return self._get(
+            "/eval-framework/deepeval/evaluation-testcases/compare-last-two",
+            params={
+                "label": label,
+            },
+            raise_exception=raise_exception,
+        )
+
+    @overload
+    def text_eval(
+        self,
+        config: EvalConfig,
+        *,
+        raise_exception: bool = False,
+    ) -> Optional[dict]: ...
+
+    @overload
     def text_eval(
         self,
+        *,
         data: List[TextInputItem],
         metrics: List[MetricConfig],
+        component: str = "unknown",
+        label: str | None = None,
         raise_exception: bool = False,
+    ) -> Optional[dict]: ...
+
+    def text_eval(
+        self,
+        config: EvalConfig | None = None,
+        *,
+        data: List[TextInputItem] | None = None,
+        metrics: List[MetricConfig] | None = None,
         component: str = "unknown",
-
+        label: str | None = None,
+        raise_exception: bool = False,
     ) -> Optional[dict]:
-
-
-
-
-
+        if config is None:
+            config = EvalConfig(
+                data=data,
+                metrics=metrics,
+                component=component,
+                label=label,
+            )
+
+        return self._request(
+            "/deepeval/text-eval", config.model_dump(), raise_exception
+        )

     def text_eval_background(
         self,
@@ -94,27 +278,61 @@ class DeepEvalClient:
         metrics: List[MetricConfig],
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
     ) -> Optional[dict]:
         """Run background text evaluation (async job)."""
         payload = EvalConfig.model_construct(
-            data=data, metrics=metrics, component=component, version=
+            data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._request("/deepeval/text-eval/background", payload, raise_exception)

+    @overload
     def schema_eval(
         self,
+        *,
         data: List[SchemaInputItem],
         metrics: List[SchemaMetricConfig],
+        component: str = "unknown",
+        label: str | None = None,
+        raise_exception: bool = False,
+    ) -> Optional[dict]: ...
+
+    @overload
+    def schema_eval(
+        self,
+        config: SchemaEvalConfig,
+        *,
         raise_exception: bool = False,
+    ) -> Optional[dict]: ...
+
+    def schema_eval(
+        self,
+        config: SchemaEvalConfig | None = None,
+        *,
+        data: List[SchemaInputItem] | None = None,
+        metrics: List[SchemaMetricConfig] | None = None,
         component: str = "unknown",
-
+        label: str | None = None,
+        raise_exception: bool = False,
     ) -> Optional[dict]:
-
-
-
-
-
+        if config is None:
+            if data is None or metrics is None:
+                raise ValueError(
+                    "Either `config` or both `data` and `metrics` must be provided"
+                )
+
+            config = SchemaEvalConfig(
+                data=data,
+                metrics=metrics,
+                component=component,
+                label=label,
+            )
+
+        return self._request(
+            "/deepeval/schema-eval",
+            config.model_dump(),
+            raise_exception,
+        )

     def schema_eval_background(
         self,
@@ -122,11 +340,11 @@ class DeepEvalClient:
         metrics: List[SchemaMetricConfig],
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
     ) -> Optional[dict]:
         """Run background schema evaluation (async job)."""
         payload = SchemaEvalConfig.model_construct(
-            data=data, metrics=metrics, component=component, version=
+            data=data, metrics=metrics, component=component, version=label
         ).model_dump()
         return self._request(
             "/deepeval/schema-eval/background", payload, raise_exception
@@ -139,13 +357,17 @@ class DeepEvalClient:
         chance: float,
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
     ) -> Optional[dict]:
         """Randomly run text_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
             self.text_eval(
-                data
+                data=data,
+                metrics=metrics,
+                raise_exception=raise_exception,
+                component=component,
+                label=label,
             )
             if random.random() <= chance
             else None
@@ -158,13 +380,13 @@ class DeepEvalClient:
         chance: float,
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
     ) -> Optional[dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
             self.text_eval_background(
-                data, metrics, raise_exception, component=component,
+                data, metrics, raise_exception, component=component, label=label
             )
             if random.random() <= chance
             else None
@@ -177,13 +399,17 @@ class DeepEvalClient:
         chance: float,
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
     ) -> Optional[dict]:
         """Randomly run schema_eval based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
             self.schema_eval(
-                data
+                data=data,
+                metrics=metrics,
+                raise_exception=raise_exception,
+                component=component,
+                label=label,
             )
             if random.random() <= chance
             else None
@@ -196,13 +422,13 @@ class DeepEvalClient:
         chance: float,
         raise_exception: bool = False,
         component: str = "unknown",
-
+        label: Union[str, None] = None,
    ) -> Optional[dict]:
         """Randomly run text_eval_background based on a probability between 0 and 1."""
         self._validate_chance(chance)
         return (
             self.schema_eval_background(
-                data, metrics, raise_exception, component=component,
+                data, metrics, raise_exception, component=component, label=label
             )
             if random.random() <= chance
             else None
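A short sketch of the new read/compare surface on DeepEvalClient. Constructing the client with no arguments mirrors what cli.py does; base-URL and token resolution happen inside the client (the token falls back to the EVALFRAMWORK_API_KEY environment variable). The ids, uid and label below are placeholders, not values from the package.

```python
# Sketch only: assumes a configured backend; ids/uid/label are placeholders.
from rakam_eval_sdk.client import DeepEvalClient

client = DeepEvalClient()

# Newest-first testcases for the current API token, paginated server-side.
runs = client.list_evaluation_testcases(limit=5, offset=0)

# Fetch one testcase by numeric id or by unique_id via the combined helper.
testcase = client.get_evaluation_testcase(unique_id="demo-uid")

# Compare two explicit testcases, or the last two runs sharing a label.
pair_diff = client.compare_testcases(testcase_a_id=1, testcase_b_id=2)
label_diff = client.compare_last_two_by_label(label="nightly", raise_exception=True)

print(runs, testcase, pair_diff, label_diff, sep="\n")
```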
rakam_eval_sdk/decorators.py
CHANGED
@@ -1,44 +1,70 @@
-
-import time
+import functools
 import os
+import time
+from typing import Callable, Dict, Optional, ParamSpec, TypeVar, Union, overload
+
 import psutil
-import functools

+P = ParamSpec("P")
+R = TypeVar("R")
+
+
+@overload
+def eval_run(func: Callable[P, R]) -> Callable[P, R]: ...
+
+
+@overload
+def eval_run(
+    func: None = None,
+    **decorator_kwargs: Dict[str, object],
+) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
+
+
+def eval_run(
+    func: Optional[Callable[P, R]] = None,
+    **decorator_kwargs: Dict[str, object],
+) -> Union[
+    Callable[P, R],
+    Callable[[Callable[P, R]], Callable[P, R]],
+]:
+    # used as @eval_run
+    if callable(func):
+        return _wrap(func)
+
+    # used as @eval_run(...)
+    def decorator(real_func: Callable[P, R]) -> Callable[P, R]:
+        return _wrap(real_func)
+
+    return decorator
+
+
+def _wrap(func: Callable[P, R]) -> Callable[P, R]:
+    @functools.wraps(func)
+    def inner(*args: P.args, **kwargs: P.kwargs) -> R:
+        process = psutil.Process(os.getpid())
+
+        start_time = time.perf_counter()
+        start_cpu = process.cpu_times()
+        start_mem = process.memory_info().rss
+
+        try:
+            return func(*args, **kwargs)
+        finally:
+            end_time = time.perf_counter()
+            end_cpu = process.cpu_times()
+            end_mem = process.memory_info().rss
+
+            elapsed = end_time - start_time
+            cpu_used = (end_cpu.user + end_cpu.system) - (
+                start_cpu.user + start_cpu.system
+            )
+            mem_delta_mb = (end_mem - start_mem) / (1024 * 1024)
+
+            print(
+                f"[eval_run] {func.__module__}.{func.__name__} | "
+                f"time={elapsed:.4f}s | "
+                f"cpu={cpu_used:.4f}s | "
+                f"mem_delta={mem_delta_mb:.2f}MB"
+            )

-
-    def wrapper(func):
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            process = psutil.Process(os.getpid())
-
-            # Start metrics
-            start_time = time.perf_counter()
-            start_cpu = process.cpu_times()
-            start_mem = process.memory_info().rss
-
-            try:
-                result = func(*args, **kwargs)
-                return result
-            finally:
-                # End metrics
-                end_time = time.perf_counter()
-                end_cpu = process.cpu_times()
-                end_mem = process.memory_info().rss
-
-                elapsed = end_time - start_time
-                cpu_used = (
-                    (end_cpu.user + end_cpu.system)
-                    - (start_cpu.user + start_cpu.system)
-                )
-                mem_diff_mb = (end_mem - start_mem) / (1024 * 1024)
-
-                print(
-                    f"[eval_run] {func.__module__}.{func.__name__} | "
-                    f"time={elapsed:.4f}s | "
-                    f"cpu={cpu_used:.4f}s | "
-                    f"mem_delta={mem_diff_mb:.2f}MB"
-                )
-
-        return inner
-
-    return wrapper
+    return inner
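To show how the reshaped decorator and the CLI's run command fit together, a hedged sketch of an eval module: @eval_run now accepts both the bare and the parenthesized form, wraps the call with the timing/CPU/memory report, and the run command expects the function to return an EvalConfig or SchemaEvalConfig (it routes on __eval_config__). The component and label values are made up, and data/metrics are left empty for brevity.

```python
# Sketch of a hypothetical eval/example_eval.py; names and values are illustrative only.
from rakam_eval_sdk.decorators import eval_run
from rakam_eval_sdk.schema import EvalConfig


@eval_run  # bare form; @eval_run() now behaves the same way
def answer_quality_eval() -> EvalConfig:
    return EvalConfig(
        component="rag-pipeline",  # hypothetical component name
        label="nightly",           # label is new on EvalConfig in this release
        data=[],                   # fill with TextInputItem entries in real use
        metrics=[],                # fill with metric configs (e.g. correctness)
    )
```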
rakam_eval_sdk/schema.py
CHANGED
@@ -39,7 +39,7 @@ class CorrectnessConfig(MetricConfigBase):
             "Minor formatting differences like '$1,250.00' vs '$1250.00' are acceptable."
         ]
     )
-    criteria: Optional[str] = 
+    criteria: Optional[str] = None
     params: List[Literal["actual_output", "expected_output"]] = Field(
         default=["actual_output", "expected_output"]
     )
@@ -94,7 +94,8 @@ MetricConfig = Annotated[
 ]

 SchemaMetricConfig = Annotated[
-    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
+    Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
+        discriminator="type")
 ]


@@ -116,14 +117,18 @@ class SchemaInputItem(InputItem):


 class EvalConfig(BaseModel):
+    __eval_config__ = "text_eval"
+    unique_id: Union[str, None] = None
     component: str = "unknown"
-
+    label: Union[str, None] = None
     data: List[TextInputItem]
     metrics: List[MetricConfig] = Field(default_factory=list)


 class SchemaEvalConfig(BaseModel):
+    __eval_config__ = "schema_eval"
     component: str = "unknown"
-
+    unique_id: Union[str, None] = None
+    label: Union[str, None] = None
     data: List[SchemaInputItem]
     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
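A small sketch of the routing marker added here: both config models now carry __eval_config__, which validate_eval_result() in cli.py reads to decide between text_eval and schema_eval. The component and label values below are placeholders.

```python
# Sketch only: the marker the CLI uses to route a returned config.
from rakam_eval_sdk.schema import EvalConfig, SchemaEvalConfig

text_cfg = EvalConfig(component="qa-bot", label="v2-prompts", data=[], metrics=[])
schema_cfg = SchemaEvalConfig(component="extractor", label="v2-prompts", data=[], metrics=[])

print(getattr(text_cfg, "__eval_config__"))    # "text_eval"
print(getattr(schema_cfg, "__eval_config__"))  # "schema_eval"
```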
rakam_eval_sdk/utils/decorator_utils.py
CHANGED
@@ -1,10 +1,9 @@
 import ast
 import importlib
 import importlib.util
-from dataclasses import dataclass
 from pathlib import Path
 from types import ModuleType
-from typing import 
+from typing import List


 class DecoratedFunctionVisitor(ast.NodeVisitor):
@@ -12,13 +11,13 @@ class DecoratedFunctionVisitor(ast.NodeVisitor):
         self.decorator_name = decorator_name
         self.results: List[str] = []

-    def visit_FunctionDef(self, node: ast.FunctionDef):
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
         for deco in node.decorator_list:
             if self._matches(deco):
                 self.results.append(node.name)
         self.generic_visit(node)

-    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
+    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
         for deco in node.decorator_list:
             if self._matches(deco):
                 self.results.append(node.name)
@@ -57,13 +56,3 @@ def load_module_from_path(file_path: Path) -> ModuleType:
     module = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(module)
     return module
-
-
-
-
-
-def get_function(module: ModuleType, function_name: str) -> Callable:
-    func = getattr(module, function_name, None)
-    if func is None:
-        raise AttributeError(f"{function_name} not found in {module.__name__}")
-    return func
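A brief sketch of the two helpers that remain after this cleanup (the unused get_function was dropped): find_decorated_functions scans a file's AST for functions carrying a given decorator name, and load_module_from_path imports the file so those functions can be called, which is how cli.py's run command uses them. The file path below is hypothetical.

```python
# Sketch only: hypothetical eval file path.
from pathlib import Path

from rakam_eval_sdk.utils.decorator_utils import (
    find_decorated_functions,
    load_module_from_path,
)

eval_file = Path("eval/example_eval.py")
names = find_decorated_functions(eval_file, "eval_run")  # decorator name as a string
module = load_module_from_path(eval_file)

for name in names:
    getattr(module, name)()  # call each @eval_run-decorated function
```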
{rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/METADATA
CHANGED
@@ -1,9 +1,10 @@
 Metadata-Version: 2.3
 Name: rakam-eval-sdk
-Version: 0.
+Version: 0.2.0rc1
 Summary: Evaluation Framework SDK
 Author: Mohamed Bachar Touil
 License: MIT
+Requires-Dist: dotenv>=0.9.9
 Requires-Dist: psutil>=7.2.1
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: requests
rakam_eval_sdk-0.2.0rc1.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+rakam_eval_sdk/cli.py,sha256=eEfBIPcE8vrXsLc1pvah7FeQrH3KdmUcm4ndlTURlF4,13590
+rakam_eval_sdk/client.py,sha256=4qUG8cLGqY8026s28uCHM3zFuGDzekLokZZDu7VRJ_8,13077
+rakam_eval_sdk/decorators.py,sha256=_9VFQmoYWd6cqnNryZJWEwYHQRxY7vIOam4z45zBk3c,1794
+rakam_eval_sdk/schema.py,sha256=ozNC56ygzR1G6UABjnqnJVAPVcF4rJMH1pUNH0a1K4M,3617
+rakam_eval_sdk/utils/decorator_utils.py,sha256=g0TjXtG9o4hwhUAFP8GJsXAkjhZhzeseTAg-YBFjj2g,1763
+rakam_eval_sdk-0.2.0rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+rakam_eval_sdk-0.2.0rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
+rakam_eval_sdk-0.2.0rc1.dist-info/METADATA,sha256=ZPMVvCST3fb48UJSJfa1fj5qyjrLi-pQ3N_J1_4pEnA,6019
+rakam_eval_sdk-0.2.0rc1.dist-info/RECORD,,
rakam_eval_sdk-0.1.16rc1.dist-info/RECORD
REMOVED
@@ -1,10 +0,0 @@
-rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-rakam_eval_sdk/cli.py,sha256=9BHZte3cS1LWL0_dOVEtws9xIhdw0yORW93Dm1uDxDw,2876
-rakam_eval_sdk/client.py,sha256=q-Y11maLVKaEnq4OSyFCqrP3JgFS1xpyp9-bZhFssIA,7123
-rakam_eval_sdk/decorators.py,sha256=ZEcZb2KUsPrtx-Guc7tYN9MVCMxIQ83yhiJxKE1fjdw,1262
-rakam_eval_sdk/schema.py,sha256=MQfF0SEHf2wzeXJNTsMs-yDbN0vZJQbN_crfpPXsTk8,3467
-rakam_eval_sdk/utils/decorator_utils.py,sha256=hCC4F7v3KjGSDt2NUXfDsbBTMPzlG6wMzZVdR_wWn14,2048
-rakam_eval_sdk-0.1.16rc1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
-rakam_eval_sdk-0.1.16rc1.dist-info/entry_points.txt,sha256=tNhwmM_UGELb3h0zOfgCrtTheUkP-k8jGv0rTOfRSps,56
-rakam_eval_sdk-0.1.16rc1.dist-info/METADATA,sha256=DRKzVNNF426R3ipnpG8Xr5LXKLTY4Ar9WdPIxe6hjzI,5991
-rakam_eval_sdk-0.1.16rc1.dist-info/RECORD,,

{rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/WHEEL
File without changes

{rakam_eval_sdk-0.1.16rc1.dist-info → rakam_eval_sdk-0.2.0rc1.dist-info}/entry_points.txt
File without changes