evalvault-1.72.0-py3-none-any.whl → evalvault-1.73.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/routers/pipeline.py +6 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +40 -1
- evalvault/adapters/inbound/cli/commands/pipeline.py +100 -0
- evalvault/adapters/inbound/cli/commands/regress.py +96 -0
- evalvault/adapters/inbound/cli/commands/stage.py +217 -24
- evalvault/adapters/outbound/analysis/__init__.py +4 -0
- evalvault/adapters/outbound/analysis/dataset_feature_analyzer_module.py +458 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +1 -0
- evalvault/adapters/outbound/analysis/statistical_adapter.py +12 -6
- evalvault/adapters/outbound/improvement/pattern_detector.py +4 -0
- evalvault/adapters/outbound/storage/base_sql.py +160 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +132 -8
- evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
- evalvault/adapters/outbound/storage/schema.sql +18 -1
- evalvault/adapters/outbound/storage/sqlite_adapter.py +115 -1
- evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +23 -1
- evalvault/config/settings.py +2 -1
- evalvault/domain/entities/analysis.py +1 -0
- evalvault/domain/entities/analysis_pipeline.py +1 -0
- evalvault/domain/entities/stage.py +13 -0
- evalvault/domain/services/intent_classifier.py +13 -0
- evalvault/domain/services/pipeline_template_registry.py +22 -0
- evalvault/ports/outbound/storage_port.py +32 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/METADATA +2 -1
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/RECORD +28 -27
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/WHEEL +0 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
import logging
|
|
6
7
|
from collections import defaultdict
|
|
7
8
|
from collections.abc import Iterable
|
|
9
|
+
from dataclasses import dataclass, field
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
|
|
10
12
|
import typer
|
|
@@ -23,6 +25,62 @@ from evalvault.domain.services.stage_summary_service import StageSummaryService
|
|
|
23
25
|
|
|
24
26
|
from ..utils.options import db_option
|
|
25
27
|
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _resolve_db_path(db_path: Path | None) -> Path:
|
|
32
|
+
resolved = db_path or Settings().evalvault_db_path
|
|
33
|
+
if resolved is None:
|
|
34
|
+
raise typer.BadParameter("Database path is not configured.")
|
|
35
|
+
return resolved
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
|
|
39
|
+
class ValidationStats:
|
|
40
|
+
"""Tracks StageEvent validation failures by error type."""
|
|
41
|
+
|
|
42
|
+
total_processed: int = 0
|
|
43
|
+
valid_count: int = 0
|
|
44
|
+
error_counts: dict[str, int] = field(default_factory=lambda: defaultdict(int))
|
|
45
|
+
|
|
46
|
+
def record_success(self) -> None:
|
|
47
|
+
self.total_processed += 1
|
|
48
|
+
self.valid_count += 1
|
|
49
|
+
|
|
50
|
+
def record_failure(self, error_message: str) -> None:
|
|
51
|
+
self.total_processed += 1
|
|
52
|
+
error_type = self._classify_error(error_message)
|
|
53
|
+
self.error_counts[error_type] += 1
|
|
54
|
+
|
|
55
|
+
def _classify_error(self, message: str) -> str:
|
|
56
|
+
"""Classify error messages into aggregate types."""
|
|
57
|
+
lower_msg = message.lower()
|
|
58
|
+
if "run_id" in lower_msg:
|
|
59
|
+
return "missing_run_id"
|
|
60
|
+
if "stage_type" in lower_msg:
|
|
61
|
+
return "invalid_stage_type"
|
|
62
|
+
if "attributes" in lower_msg:
|
|
63
|
+
return "invalid_attributes"
|
|
64
|
+
if "metadata" in lower_msg:
|
|
65
|
+
return "invalid_metadata"
|
|
66
|
+
if "attempt" in lower_msg:
|
|
67
|
+
return "invalid_attempt"
|
|
68
|
+
if "duration" in lower_msg:
|
|
69
|
+
return "invalid_duration"
|
|
70
|
+
if "datetime" in lower_msg or "started_at" in lower_msg or "finished_at" in lower_msg:
|
|
71
|
+
return "invalid_datetime"
|
|
72
|
+
if "payload" in lower_msg or "ref" in lower_msg:
|
|
73
|
+
return "invalid_payload_ref"
|
|
74
|
+
return "other"
|
|
75
|
+
|
|
76
|
+
@property
|
|
77
|
+
def failed_count(self) -> int:
|
|
78
|
+
return self.total_processed - self.valid_count
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def has_failures(self) -> bool:
|
|
82
|
+
return self.failed_count > 0
|
|
83
|
+
|
|
26
84
|
|
|
27
85
|
def create_stage_app(console: Console) -> typer.Typer:
|
|
28
86
|
"""Create the stage Typer sub-application."""
|
|
@@ -32,15 +90,40 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
32
90
|
@stage_app.command("ingest")
|
|
33
91
|
def ingest(
|
|
34
92
|
file: Path = typer.Argument(..., help="Stage events JSON/JSONL file."),
|
|
35
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
93
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
94
|
+
failed_output: Path | None = typer.Option(
|
|
95
|
+
None,
|
|
96
|
+
"--failed-output",
|
|
97
|
+
help="Write invalid samples to JSONL for inspection.",
|
|
98
|
+
),
|
|
99
|
+
skip_invalid: bool = typer.Option(
|
|
100
|
+
False,
|
|
101
|
+
"--skip-invalid",
|
|
102
|
+
help="Continue processing after validation failures, logging aggregate counts.",
|
|
103
|
+
),
|
|
36
104
|
) -> None:
|
|
37
105
|
"""Ingest stage events from JSON or JSONL."""
|
|
38
|
-
events =
|
|
106
|
+
events, stats = _load_stage_events_with_stats(
|
|
107
|
+
file,
|
|
108
|
+
skip_invalid=skip_invalid,
|
|
109
|
+
failed_output=failed_output,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
if stats.has_failures:
|
|
113
|
+
_print_validation_stats(console, stats)
|
|
114
|
+
logger.warning(
|
|
115
|
+
"StageEvent validation failures: %d/%d (types: %s)",
|
|
116
|
+
stats.failed_count,
|
|
117
|
+
stats.total_processed,
|
|
118
|
+
dict(stats.error_counts),
|
|
119
|
+
)
|
|
120
|
+
|
|
39
121
|
if not events:
|
|
40
|
-
console.print("[yellow]No stage events found in the input file.[/yellow]")
|
|
122
|
+
console.print("[yellow]No valid stage events found in the input file.[/yellow]")
|
|
41
123
|
raise typer.Exit(1)
|
|
42
124
|
|
|
43
|
-
|
|
125
|
+
resolved_db_path = _resolve_db_path(db_path)
|
|
126
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
44
127
|
stored = storage.save_stage_events(events)
|
|
45
128
|
|
|
46
129
|
console.print(f"[green]Stored {stored} stage event(s).[/green]")
|
|
@@ -61,10 +144,11 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
61
144
|
"-n",
|
|
62
145
|
help="Maximum number of rows to display.",
|
|
63
146
|
),
|
|
64
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
147
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
65
148
|
) -> None:
|
|
66
149
|
"""List stage events for a run."""
|
|
67
|
-
|
|
150
|
+
resolved_db_path = _resolve_db_path(db_path)
|
|
151
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
68
152
|
events = storage.list_stage_events(run_id, stage_type=stage_type)
|
|
69
153
|
|
|
70
154
|
if not events:
|
|
@@ -97,10 +181,11 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
97
181
|
@stage_app.command("summary")
|
|
98
182
|
def summary(
|
|
99
183
|
run_id: str = typer.Argument(..., help="Run ID to summarize."),
|
|
100
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
184
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
101
185
|
) -> None:
|
|
102
186
|
"""Show summary stats for stage events."""
|
|
103
|
-
|
|
187
|
+
resolved_db_path = _resolve_db_path(db_path)
|
|
188
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
104
189
|
events = storage.list_stage_events(run_id)
|
|
105
190
|
if not events:
|
|
106
191
|
console.print("[yellow]No stage events found.[/yellow]")
|
|
@@ -130,10 +215,11 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
130
215
|
"--thresholds-profile",
|
|
131
216
|
help="Profile key for thresholds JSON (defaults to Settings profile).",
|
|
132
217
|
),
|
|
133
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
218
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
134
219
|
) -> None:
|
|
135
220
|
"""Compute stage metrics from stored events."""
|
|
136
|
-
|
|
221
|
+
resolved_db_path = _resolve_db_path(db_path)
|
|
222
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
137
223
|
events = storage.list_stage_events(run_id)
|
|
138
224
|
if not events:
|
|
139
225
|
console.print("[yellow]No stage events found.[/yellow]")
|
|
@@ -187,10 +273,11 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
187
273
|
"--save-metrics/--no-save-metrics",
|
|
188
274
|
help="Store computed stage metrics in the database.",
|
|
189
275
|
),
|
|
190
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
276
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
191
277
|
) -> None:
|
|
192
278
|
"""Report stage summary, metrics, and improvement guides."""
|
|
193
|
-
|
|
279
|
+
resolved_db_path = _resolve_db_path(db_path)
|
|
280
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
194
281
|
events = storage.list_stage_events(run_id)
|
|
195
282
|
if not events:
|
|
196
283
|
console.print("[yellow]No stage events found.[/yellow]")
|
|
@@ -227,17 +314,36 @@ def create_stage_app(console: Console) -> typer.Typer:
|
|
|
227
314
|
return stage_app
|
|
228
315
|
|
|
229
316
|
|
|
230
|
-
def
|
|
317
|
+
def _load_stage_events_with_stats(
|
|
318
|
+
file_path: Path,
|
|
319
|
+
*,
|
|
320
|
+
skip_invalid: bool = False,
|
|
321
|
+
failed_output: Path | None = None,
|
|
322
|
+
) -> tuple[list[StageEvent], ValidationStats]:
|
|
231
323
|
suffix = file_path.suffix.lower()
|
|
232
324
|
if suffix == ".jsonl":
|
|
233
|
-
return _load_jsonl(file_path)
|
|
325
|
+
return _load_jsonl_with_stats(
|
|
326
|
+
file_path,
|
|
327
|
+
skip_invalid=skip_invalid,
|
|
328
|
+
failed_output=failed_output,
|
|
329
|
+
)
|
|
234
330
|
if suffix == ".json":
|
|
235
|
-
return _load_json(file_path)
|
|
331
|
+
return _load_json_with_stats(
|
|
332
|
+
file_path,
|
|
333
|
+
skip_invalid=skip_invalid,
|
|
334
|
+
failed_output=failed_output,
|
|
335
|
+
)
|
|
236
336
|
raise typer.BadParameter("Unsupported file format. Use .json or .jsonl")
|
|
237
337
|
|
|
238
338
|
|
|
239
|
-
def _load_jsonl(file_path: Path) -> list[StageEvent]:
|
|
339
|
+
def _load_jsonl_with_stats(
|
|
340
|
+
file_path: Path,
|
|
341
|
+
*,
|
|
342
|
+
skip_invalid: bool = False,
|
|
343
|
+
failed_output: Path | None = None,
|
|
344
|
+
) -> tuple[list[StageEvent], ValidationStats]:
|
|
240
345
|
events: list[StageEvent] = []
|
|
346
|
+
stats = ValidationStats()
|
|
241
347
|
with file_path.open(encoding="utf-8") as handle:
|
|
242
348
|
for idx, line in enumerate(handle, start=1):
|
|
243
349
|
raw = line.strip()
|
|
@@ -246,15 +352,43 @@ def _load_jsonl(file_path: Path) -> list[StageEvent]:
|
|
|
246
352
|
try:
|
|
247
353
|
payload = json.loads(raw)
|
|
248
354
|
except json.JSONDecodeError as exc:
|
|
355
|
+
if failed_output:
|
|
356
|
+
_record_failed_sample(
|
|
357
|
+
failed_output,
|
|
358
|
+
{"raw": raw},
|
|
359
|
+
error=f"JSON parse error at line {idx}: {exc}",
|
|
360
|
+
line=idx,
|
|
361
|
+
)
|
|
362
|
+
if skip_invalid:
|
|
363
|
+
stats.record_failure(f"JSON parse error at line {idx}")
|
|
364
|
+
logger.debug("Skipped invalid JSON at line %d: %s", idx, exc)
|
|
365
|
+
continue
|
|
249
366
|
raise typer.BadParameter(f"Invalid JSON at line {idx}") from exc
|
|
250
367
|
try:
|
|
251
368
|
events.append(StageEvent.from_dict(payload))
|
|
369
|
+
stats.record_success()
|
|
252
370
|
except ValueError as exc:
|
|
371
|
+
if failed_output:
|
|
372
|
+
_record_failed_sample(
|
|
373
|
+
failed_output,
|
|
374
|
+
payload,
|
|
375
|
+
error=f"Invalid stage event at line {idx}: {exc}",
|
|
376
|
+
line=idx,
|
|
377
|
+
)
|
|
378
|
+
if skip_invalid:
|
|
379
|
+
stats.record_failure(str(exc))
|
|
380
|
+
logger.debug("Skipped invalid stage event at line %d: %s", idx, exc)
|
|
381
|
+
continue
|
|
253
382
|
raise typer.BadParameter(f"Invalid stage event at line {idx}: {exc}") from exc
|
|
254
|
-
return events
|
|
383
|
+
return events, stats
|
|
255
384
|
|
|
256
385
|
|
|
257
|
-
def _load_json(file_path: Path) -> list[StageEvent]:
|
|
386
|
+
def _load_json_with_stats(
|
|
387
|
+
file_path: Path,
|
|
388
|
+
*,
|
|
389
|
+
skip_invalid: bool = False,
|
|
390
|
+
failed_output: Path | None = None,
|
|
391
|
+
) -> tuple[list[StageEvent], ValidationStats]:
|
|
258
392
|
with file_path.open(encoding="utf-8") as handle:
|
|
259
393
|
payload = json.load(handle)
|
|
260
394
|
|
|
@@ -267,13 +401,61 @@ def _load_json(file_path: Path) -> list[StageEvent]:
|
|
|
267
401
|
else:
|
|
268
402
|
raise typer.BadParameter("Unsupported JSON structure for stage events")
|
|
269
403
|
|
|
270
|
-
events = []
|
|
404
|
+
events: list[StageEvent] = []
|
|
405
|
+
stats = ValidationStats()
|
|
271
406
|
for idx, item in enumerate(raw_events, start=1):
|
|
272
407
|
try:
|
|
273
408
|
events.append(StageEvent.from_dict(item))
|
|
409
|
+
stats.record_success()
|
|
274
410
|
except ValueError as exc:
|
|
411
|
+
if failed_output:
|
|
412
|
+
_record_failed_sample(
|
|
413
|
+
failed_output,
|
|
414
|
+
item,
|
|
415
|
+
error=f"Invalid stage event at index {idx}: {exc}",
|
|
416
|
+
index=idx,
|
|
417
|
+
)
|
|
418
|
+
if skip_invalid:
|
|
419
|
+
stats.record_failure(str(exc))
|
|
420
|
+
logger.debug("Skipped invalid stage event at index %d: %s", idx, exc)
|
|
421
|
+
continue
|
|
275
422
|
raise typer.BadParameter(f"Invalid stage event at index {idx}: {exc}") from exc
|
|
276
|
-
return events
|
|
423
|
+
return events, stats
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _record_failed_sample(
|
|
427
|
+
output_path: Path | None,
|
|
428
|
+
payload: object,
|
|
429
|
+
*,
|
|
430
|
+
error: str,
|
|
431
|
+
index: int | None = None,
|
|
432
|
+
line: int | None = None,
|
|
433
|
+
) -> None:
|
|
434
|
+
if output_path is None:
|
|
435
|
+
return
|
|
436
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
437
|
+
record = {
|
|
438
|
+
"error": error,
|
|
439
|
+
"index": index,
|
|
440
|
+
"line": line,
|
|
441
|
+
"payload": payload,
|
|
442
|
+
}
|
|
443
|
+
with output_path.open("a", encoding="utf-8") as handle:
|
|
444
|
+
handle.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def _print_validation_stats(console: Console, stats: ValidationStats) -> None:
|
|
448
|
+
console.print(
|
|
449
|
+
f"[yellow]Validation: {stats.valid_count}/{stats.total_processed} valid, "
|
|
450
|
+
f"{stats.failed_count} failed[/yellow]"
|
|
451
|
+
)
|
|
452
|
+
if stats.error_counts:
|
|
453
|
+
table = Table(show_header=True, header_style="bold yellow")
|
|
454
|
+
table.add_column("Error Type")
|
|
455
|
+
table.add_column("Count", justify="right")
|
|
456
|
+
for error_type, count in sorted(stats.error_counts.items(), key=lambda x: -x[1]):
|
|
457
|
+
table.add_row(error_type, str(count))
|
|
458
|
+
console.print(table)
|
|
277
459
|
|
|
278
460
|
|
|
279
461
|
def _print_ingest_summary(console: Console, events: Iterable[StageEvent]) -> None:
|
|
@@ -324,7 +506,10 @@ def _load_thresholds_map(file_path: Path, *, profile: str | None = None) -> dict
|
|
|
324
506
|
|
|
325
507
|
thresholds: dict[str, float] = {}
|
|
326
508
|
for key, value in thresholds_payload.items():
|
|
327
|
-
|
|
509
|
+
if isinstance(value, (int, float, str)):
|
|
510
|
+
thresholds[str(key)] = float(value)
|
|
511
|
+
continue
|
|
512
|
+
raise typer.BadParameter(f"Invalid threshold value for '{key}': {value}")
|
|
328
513
|
|
|
329
514
|
return thresholds
|
|
330
515
|
|
|
@@ -362,6 +547,13 @@ def _load_default_profile() -> str | None:
|
|
|
362
547
|
return None
|
|
363
548
|
|
|
364
549
|
|
|
550
|
+
def _resolve_db_path(db_path: Path | None) -> Path:
|
|
551
|
+
resolved = db_path or Settings().evalvault_db_path
|
|
552
|
+
if resolved is None:
|
|
553
|
+
raise typer.BadParameter("Database path is not configured.")
|
|
554
|
+
return resolved
|
|
555
|
+
|
|
556
|
+
|
|
365
557
|
def _print_stage_summary(console: Console, summary_data) -> None:
|
|
366
558
|
table = Table(show_header=True, header_style="bold cyan")
|
|
367
559
|
table.add_column("Stage Type")
|
|
@@ -397,24 +589,25 @@ def _print_metric_summary(console: Console, metrics: list[StageMetric]) -> None:
|
|
|
397
589
|
table.add_column("Pass Rate", justify="right")
|
|
398
590
|
|
|
399
591
|
for metric_name, stats in sorted(aggregates.items()):
|
|
592
|
+
pass_rate = str(stats.get("pass_rate", "-"))
|
|
400
593
|
table.add_row(
|
|
401
594
|
metric_name,
|
|
402
595
|
str(stats["count"]),
|
|
403
596
|
f"{stats['avg']:.4f}",
|
|
404
597
|
f"{stats['min']:.4f}",
|
|
405
598
|
f"{stats['max']:.4f}",
|
|
406
|
-
|
|
599
|
+
pass_rate,
|
|
407
600
|
)
|
|
408
601
|
|
|
409
602
|
console.print(table)
|
|
410
603
|
|
|
411
604
|
|
|
412
|
-
def _aggregate_metrics(metrics: list[StageMetric]) -> dict[str, dict[str, float]]:
|
|
605
|
+
def _aggregate_metrics(metrics: list[StageMetric]) -> dict[str, dict[str, float | str]]:
|
|
413
606
|
grouped: dict[str, list[StageMetric]] = defaultdict(list)
|
|
414
607
|
for metric in metrics:
|
|
415
608
|
grouped[metric.metric_name].append(metric)
|
|
416
609
|
|
|
417
|
-
aggregates: dict[str, dict[str, float]] = {}
|
|
610
|
+
aggregates: dict[str, dict[str, float | str]] = {}
|
|
418
611
|
for name, items in grouped.items():
|
|
419
612
|
scores = [item.score for item in items]
|
|
420
613
|
pass_values = [item.passed for item in items if item.passed is not None]
|
|
@@ -23,6 +23,9 @@ from evalvault.adapters.outbound.analysis.comparison_report_module import (
|
|
|
23
23
|
ComparisonReportModule,
|
|
24
24
|
)
|
|
25
25
|
from evalvault.adapters.outbound.analysis.data_loader_module import DataLoaderModule
|
|
26
|
+
from evalvault.adapters.outbound.analysis.dataset_feature_analyzer_module import (
|
|
27
|
+
DatasetFeatureAnalyzerModule,
|
|
28
|
+
)
|
|
26
29
|
from evalvault.adapters.outbound.analysis.detailed_report_module import (
|
|
27
30
|
DetailedReportModule,
|
|
28
31
|
)
|
|
@@ -136,6 +139,7 @@ __all__ = [
|
|
|
136
139
|
"CausalAnalyzerModule",
|
|
137
140
|
"ComparisonReportModule",
|
|
138
141
|
"DataLoaderModule",
|
|
142
|
+
"DatasetFeatureAnalyzerModule",
|
|
139
143
|
"DetailedReportModule",
|
|
140
144
|
"DiagnosticPlaybookModule",
|
|
141
145
|
"EmbeddingAnalyzerModule",
|