ldv-cli 0.9.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/PKG-INFO +1 -1
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/pyproject.toml +1 -1
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/__init__.py +1 -1
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/evals.py +57 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/instructions.py +15 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/.gitignore +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/README.md +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/examples/agent-traces.jsonl +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/package-lock.json +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/_group.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/_opts.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/api.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/cli.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/__init__.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/annotations.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/auth.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/buckets.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/datasets.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/edits.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/highlights.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/issues.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/preview.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/reports.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/skills.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/spec.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/tui.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/update.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/commands/workspaces.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/config.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/output.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/sessions.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/src/ldv/util.py +0 -0
- {ldv_cli-0.9.0 → ldv_cli-0.10.0}/uv.lock +0 -0
|
@@ -5,6 +5,7 @@ import sys
|
|
|
5
5
|
from typing import Annotated, List, Optional
|
|
6
6
|
|
|
7
7
|
import typer
|
|
8
|
+
from rich.console import Console
|
|
8
9
|
|
|
9
10
|
from .._group import AliasGroup
|
|
10
11
|
|
|
@@ -178,6 +179,62 @@ def correctness(
|
|
|
178
179
|
)
|
|
179
180
|
|
|
180
181
|
|
|
182
|
+
def _bar(pct: float, width: int = 20) -> str:
|
|
183
|
+
filled = round(pct * width)
|
|
184
|
+
filled = max(0, min(width, filled))
|
|
185
|
+
return "█" * filled + "░" * (width - filled)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@app.command("failures")
def failures(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Quality analysis: clean vs. dirty rate + failure mode breakdown."""
    # NOTE: the docstring above is left as a single line on the assumption
    # that Typer surfaces it as the command's --help text.
    #
    # Flow:
    #   1. GET /v1/datasets/<id>/eval-failure-analysis.
    #   2. With json_out set, dump the raw payload and stop.
    #   3. If the payload carries a truthy "skip_reason", print a notice and stop.
    #   4. Otherwise print the clean/dirty bars, then one row per entry in
    #      "mode_distribution" (each entry is read via "mode", "count", "rate").
    client = ApiClient(profile=profile, api_url=api_url)
    data = client.get(f"/v1/datasets/{q(id)}/eval-failure-analysis").json()
    if json_out:
        print_json(data)
        return
    if data.get("skip_reason"):
        sys.stdout.write("No failure_analysis column found in this dataset.\n")
        return

    # Missing or null counters are treated as zero.
    total = data.get("total") or 0
    clean = data.get("clean") or 0
    dirty = data.get("dirty") or 0

    clean_rate = data.get("clean_rate")
    if clean_rate is None:
        # Rate not supplied: derive it from the counts when there is a total.
        clean_rate = clean / total if total else 0.0
    # Only report a dirty share when there is something to measure. Otherwise
    # an empty dataset would render "Issues 100.0%" right before the
    # "No issues detected." message below.
    dirty_rate = 1.0 - clean_rate if (total or dirty) else 0.0

    console = Console()

    console.print(f"\n[bold]Quality analysis: {total:,} samples[/bold]\n")
    console.print(f" [green]Quality rate[/green] {_bar(clean_rate)} {clean_rate * 100:.1f}%")
    console.print(f" [red]Issues[/red] {_bar(dirty_rate)} {dirty_rate * 100:.1f}%")

    modes = data.get("mode_distribution") or []
    if not modes:
        if dirty == 0:
            sys.stdout.write("\nNo issues detected.\n")
        else:
            sys.stdout.write(f"\n{dirty:,} samples with issues (no mode breakdown available).\n")
        return

    sys.stdout.write(f"\nFailure modes ({dirty:,} samples with issues):\n")

    # Normalise each entry once so the width pass and the print pass agree.
    rows = [
        (
            str(m.get("mode") or "").replace("_", " "),
            m.get("count") or 0,
            m.get("rate") or 0.0,
        )
        for m in modes
    ]
    # Minimum column widths keep short tables from looking cramped.
    name_width = max([10, *(len(name) for name, _, _ in rows)])
    count_width = max([5, *(len(str(count)) for _, count, _ in rows)])
    for name, count, rate in rows:
        bar = _bar(rate)
        sys.stdout.write(f" {name:<{name_width}} {count:>{count_width}} {bar} {rate * 100:.1f}%\n")
|
|
236
|
+
|
|
237
|
+
|
|
181
238
|
@app.command("samples")
|
|
182
239
|
def samples(
|
|
183
240
|
id: Annotated[str, typer.Argument(help="Dataset ID")],
|
|
@@ -149,6 +149,17 @@ primitives for error analysis — YOU do the reasoning over what they return.
|
|
|
149
149
|
ldv eval stats <id> # Accuracy + correctness counts + error-type
|
|
150
150
|
# distribution + token stats (the distribution view)
|
|
151
151
|
ldv eval correctness <id> # Fast accuracy + correct/incorrect/missing counts
|
|
152
|
+
ldv eval failures <id> # Quality analysis: clean-vs-dirty rate + failure mode
|
|
153
|
+
# breakdown from the failure_analysis column.
|
|
154
|
+
# Example output:
|
|
155
|
+
# Quality analysis: 1,000 samples
|
|
156
|
+
# Quality rate ████████████████░░░░ 80.0%
|
|
157
|
+
# Issues ████░░░░░░░░░░░░░░░░ 20.0%
|
|
158
|
+
# Failure modes (200 samples with issues):
|
|
159
|
+
# truncated response 100 ██████████░░░░░░░░░░ 50.0%
|
|
160
|
+
# missing think tags 80 ████████░░░░░░░░░░░░ 40.0%
|
|
161
|
+
# If no failure_analysis column exists, prints a clear
|
|
162
|
+
# message and exits 0. Use --json for the raw API response.
|
|
152
163
|
ldv eval samples <id> [--filter correct|incorrect|missing|all] [--search <text>]
|
|
153
164
|
[--error-type <value>] [--columns a,b] [--limit N] [--offset N]
|
|
154
165
|
# Slice the dataset for error analysis. Filters AND
|
|
@@ -160,6 +171,8 @@ Notes:
|
|
|
160
171
|
- --search matches a substring on the prompt OR response column (either one matching is a hit).
|
|
161
172
|
- --error-type values come from the `error_field` / `error_distribution` in `eval stats`.
|
|
162
173
|
- Use the 'index' from `eval samples` directly as `eval sample --row <index>`.
|
|
174
|
+
- `eval failures` reads the `failure_analysis` column; if absent, skip_reason is set and a
|
|
175
|
+
clear message is printed. Use --json to get the raw counts for programmatic consumption.
|
|
163
176
|
|
|
164
177
|
## Row Edits
|
|
165
178
|
|
|
@@ -278,6 +291,8 @@ never goes stale.
|
|
|
278
291
|
|
|
279
292
|
### Analyze an eval's failure modes
|
|
280
293
|
ldv eval list --json # find the eval dataset
|
|
294
|
+
ldv eval failures <id> --json # clean rate + failure mode breakdown
|
|
295
|
+
# (mode_distribution: name/count/rate per mode)
|
|
281
296
|
ldv eval stats <id> --json # accuracy + error_distribution_incorrect
|
|
282
297
|
# = the common errors AMONG the misses
|
|
283
298
|
ldv eval samples <id> --filter incorrect --json # pull the misses
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|