rakam-eval-sdk 0.2.0.tar.gz → 0.2.0rc1.tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: rakam-eval-sdk
- Version: 0.2.0
+ Version: 0.2.0rc1
  Summary: Evaluation Framework SDK
  Author: Mohamed Bachar Touil
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "uv_build"

  [project]
  name = "rakam-eval-sdk"
- version = "0.2.0"
+ version = "0.2.0rc1"
  description = "Evaluation Framework SDK"
  readme = "README.md"
  requires-python = ">=3.8"
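Note that pip skips pre-releases by default, so this candidate is only picked up with an explicit pin (pip install rakam-eval-sdk==0.2.0rc1) or with the --pre flag.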
@@ -6,7 +6,7 @@ import uuid
  from datetime import datetime
  from pathlib import Path
  from pprint import pprint
- from typing import Any, Dict, Optional, Sequence
+ from typing import Any, Optional

  import typer
  from dotenv import load_dotenv
@@ -29,64 +29,9 @@ console = Console()
  PROJECT_ROOT = os.path.abspath(".")
  if PROJECT_ROOT not in sys.path:
      sys.path.insert(0, PROJECT_ROOT)
- list_app = typer.Typer(help="List resources")
- app.add_typer(list_app, name="list")
- metrics_app = typer.Typer(help="Metrics utilities")
- app.add_typer(metrics_app, name="metrics")
-

- @metrics_app.command("list")
- def list_metrics(
-     limit: int = typer.Option(
-         20,
-         "--limit",
-         help="Number of testcases to inspect for metrics",
-     ),
- ):
-     """
-     List unique metric names found in evaluation testcases.
-     """
-     client = DeepEvalClient()

-     testcases = client.list_evaluation_testcases(
-         limit=limit,
-         offset=0,
-         raise_exception=True,
-     )
-
-     if not testcases:
-         typer.echo("No evaluation testcases found.")
-         return
-
-     metric_names: set[str] = set()
-
-     def collect_metrics(entries: Sequence[Dict] | None):
-         if not entries:
-             return
-         for entry in entries:
-             for metric in entry.get("metrics", []) or []:
-                 name = metric.get("name")
-                 if name:
-                     metric_names.add(name)
-
-     for tc in testcases:
-
-         collect_metrics(tc.get("result"))
-
-     if not metric_names:
-         typer.echo("No metrics found.")
-         return
-
-     typer.echo(
-         f"📊 Found {len(metric_names)} unique metrics "
-         f"(from latest {limit} testcases)\n"
-     )
-
-     for name in sorted(metric_names):
-         typer.echo(f"- {name}")
-
-
- @list_app.command("eval")
+ @app.command()
  def list(
      directory: Path = typer.Argument(
          Path("./eval"),
@@ -120,10 +65,17 @@ def list(
          typer.echo(f"No @{TARGET_DECORATOR} functions found.")


+ list_app = typer.Typer(help="List resources")
+ app.add_typer(list_app, name="list")
+
+
  @list_app.command("runs")
  def list_runs(
      limit: int = typer.Option(20, help="Max number of runs"),
      offset: int = typer.Option(0, help="Pagination offset"),
+     status: Optional[str] = typer.Option(
+         None, help="Filter by status (running, completed, failed)"
+     ),
  ):
      """
      List evaluation runs (newest first).
@@ -140,7 +92,19 @@ def list_runs(
          typer.echo("No evaluation runs found.")
          return

-     typer.echo(f"[id] " f"{'unique_id':<20}" f"{'label':<20}" f"created_at")
+     # optional status filtering (client-side for now)
+     if status:
+         runs = [
+             r for r in runs
+             if r.get("result", {}).get("status") == status
+         ]
+
+     typer.echo(
+         f"[id] "
+         f"{'unique_id':<20}"
+         f"{'label':<20}"
+         f"created_at"
+     )
      # pretty CLI output
      for run in runs:
          run_id = run.get("id")
@@ -156,7 +120,12 @@ def list_runs(
          except ValueError:
              pass

-         typer.echo(f"[{run_id}] " f"{uid:<20} " f"{label:<20} " f"{created_at}")
+         typer.echo(
+             f"[{run_id}] "
+             f"{uid:<20} "
+             f"{label:<20} "
+             f"{created_at}"
+         )


  @list_app.command("show")
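A minimal invocation sketch for the new status filter, using Typer's test runner; the import path of the CLI app is not shown in this diff, so the module name below is a placeholder:

    from typer.testing import CliRunner
    from rakam_eval_sdk.cli import app  # placeholder import path, not shown in this diff

    runner = CliRunner()
    # Only runs whose result.status matches the filter are printed.
    result = runner.invoke(app, ["list", "runs", "--limit", "5", "--status", "completed"])
    print(result.output)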
@@ -376,7 +345,8 @@ def _print_and_save(
          return

      if out.exists() and not overwrite:
-         typer.echo(f"❌ File already exists: {out} (use --overwrite to replace)")
+         typer.echo(
+             f"❌ File already exists: {out} (use --overwrite to replace)")
          raise typer.Exit(code=1)

      out.parent.mkdir(parents=True, exist_ok=True)
@@ -38,7 +38,8 @@ class DeepEvalClient:
          )
          self.base_url = raw_url.rstrip("/")
          self.api_token = (
-             api_token or settings_token or os.getenv("EVALFRAMWORK_API_KEY", "")
+             api_token or settings_token or os.getenv(
+                 "EVALFRAMWORK_API_KEY", "")
          )
          self.timeout = timeout

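The hunk above only re-wraps the token lookup; the resolution order is unchanged. A minimal sketch of that order as a standalone helper (the helper itself is hypothetical, not part of the SDK): an explicit argument wins, then the settings value, then the EVALFRAMWORK_API_KEY environment variable, spelled as in the source.

    import os
    from typing import Optional

    def resolve_api_token(api_token: Optional[str] = None,
                          settings_token: Optional[str] = None) -> str:
        # First non-empty value wins; falls back to an empty string.
        return api_token or settings_token or os.getenv("EVALFRAMWORK_API_KEY", "")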
@@ -94,7 +94,8 @@ MetricConfig = Annotated[
  ]

  SchemaMetricConfig = Annotated[
-     Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(discriminator="type")
+     Union[JsonCorrectnessConfig, FieldsPresenceConfig], Field(
+         discriminator="type")
  ]

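SchemaMetricConfig is a Pydantic discriminated union: the "type" field selects which config model validates a payload. A minimal sketch of the pattern, assuming Pydantic v2; the field names and literal values below are illustrative, only the discriminator comes from the hunk above:

    from typing import Annotated, List, Literal, Union
    from pydantic import BaseModel, Field, TypeAdapter

    class JsonCorrectnessConfig(BaseModel):
        type: Literal["json_correctness"]  # illustrative literal value
        threshold: float = 0.5             # illustrative field

    class FieldsPresenceConfig(BaseModel):
        type: Literal["fields_presence"]   # illustrative literal value
        required_fields: List[str] = []    # illustrative field

    SchemaMetricConfig = Annotated[
        Union[JsonCorrectnessConfig, FieldsPresenceConfig],
        Field(discriminator="type"),
    ]

    # Validation dispatches on "type" instead of trying each union member in turn.
    cfg = TypeAdapter(SchemaMetricConfig).validate_python(
        {"type": "fields_presence", "required_fields": ["id"]}
    )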