evalvault-1.72.0-py3-none-any.whl → evalvault-1.73.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/api/routers/pipeline.py +6 -0
  2. evalvault/adapters/inbound/cli/commands/analyze.py +40 -1
  3. evalvault/adapters/inbound/cli/commands/pipeline.py +100 -0
  4. evalvault/adapters/inbound/cli/commands/regress.py +96 -0
  5. evalvault/adapters/inbound/cli/commands/stage.py +217 -24
  6. evalvault/adapters/outbound/analysis/__init__.py +4 -0
  7. evalvault/adapters/outbound/analysis/dataset_feature_analyzer_module.py +458 -0
  8. evalvault/adapters/outbound/analysis/pipeline_factory.py +1 -0
  9. evalvault/adapters/outbound/analysis/statistical_adapter.py +12 -6
  10. evalvault/adapters/outbound/improvement/pattern_detector.py +4 -0
  11. evalvault/adapters/outbound/storage/base_sql.py +160 -0
  12. evalvault/adapters/outbound/storage/postgres_adapter.py +132 -8
  13. evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
  14. evalvault/adapters/outbound/storage/schema.sql +18 -1
  15. evalvault/adapters/outbound/storage/sqlite_adapter.py +115 -1
  16. evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +23 -1
  17. evalvault/config/settings.py +2 -1
  18. evalvault/domain/entities/analysis.py +1 -0
  19. evalvault/domain/entities/analysis_pipeline.py +1 -0
  20. evalvault/domain/entities/stage.py +13 -0
  21. evalvault/domain/services/intent_classifier.py +13 -0
  22. evalvault/domain/services/pipeline_template_registry.py +22 -0
  23. evalvault/ports/outbound/storage_port.py +32 -0
  24. {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/METADATA +2 -1
  25. {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/RECORD +28 -27
  26. {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.72.0.dist-info → evalvault-1.73.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -3,8 +3,10 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import logging
6
7
  from collections import defaultdict
7
8
  from collections.abc import Iterable
9
+ from dataclasses import dataclass, field
8
10
  from pathlib import Path
9
11
 
10
12
  import typer
@@ -23,6 +25,62 @@ from evalvault.domain.services.stage_summary_service import StageSummaryService
23
25
 
24
26
  from ..utils.options import db_option
25
27
 
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def _resolve_db_path(db_path: Path | None) -> Path:
32
+ resolved = db_path or Settings().evalvault_db_path
33
+ if resolved is None:
34
+ raise typer.BadParameter("Database path is not configured.")
35
+ return resolved
36
+
37
+
38
+ @dataclass
39
+ class ValidationStats:
40
+ """Tracks StageEvent validation failures by error type."""
41
+
42
+ total_processed: int = 0
43
+ valid_count: int = 0
44
+ error_counts: dict[str, int] = field(default_factory=lambda: defaultdict(int))
45
+
46
+ def record_success(self) -> None:
47
+ self.total_processed += 1
48
+ self.valid_count += 1
49
+
50
+ def record_failure(self, error_message: str) -> None:
51
+ self.total_processed += 1
52
+ error_type = self._classify_error(error_message)
53
+ self.error_counts[error_type] += 1
54
+
55
+ def _classify_error(self, message: str) -> str:
56
+ """Classify error messages into aggregate types."""
57
+ lower_msg = message.lower()
58
+ if "run_id" in lower_msg:
59
+ return "missing_run_id"
60
+ if "stage_type" in lower_msg:
61
+ return "invalid_stage_type"
62
+ if "attributes" in lower_msg:
63
+ return "invalid_attributes"
64
+ if "metadata" in lower_msg:
65
+ return "invalid_metadata"
66
+ if "attempt" in lower_msg:
67
+ return "invalid_attempt"
68
+ if "duration" in lower_msg:
69
+ return "invalid_duration"
70
+ if "datetime" in lower_msg or "started_at" in lower_msg or "finished_at" in lower_msg:
71
+ return "invalid_datetime"
72
+ if "payload" in lower_msg or "ref" in lower_msg:
73
+ return "invalid_payload_ref"
74
+ return "other"
75
+
76
+ @property
77
+ def failed_count(self) -> int:
78
+ return self.total_processed - self.valid_count
79
+
80
+ @property
81
+ def has_failures(self) -> bool:
82
+ return self.failed_count > 0
83
+
26
84
 
27
85
  def create_stage_app(console: Console) -> typer.Typer:
28
86
  """Create the stage Typer sub-application."""
@@ -32,15 +90,40 @@ def create_stage_app(console: Console) -> typer.Typer:
32
90
  @stage_app.command("ingest")
33
91
  def ingest(
34
92
  file: Path = typer.Argument(..., help="Stage events JSON/JSONL file."),
35
- db_path: Path = db_option(help_text="Path to database file."),
93
+ db_path: Path | None = db_option(help_text="Path to database file."),
94
+ failed_output: Path | None = typer.Option(
95
+ None,
96
+ "--failed-output",
97
+ help="Write invalid samples to JSONL for inspection.",
98
+ ),
99
+ skip_invalid: bool = typer.Option(
100
+ False,
101
+ "--skip-invalid",
102
+ help="Continue processing after validation failures, logging aggregate counts.",
103
+ ),
36
104
  ) -> None:
37
105
  """Ingest stage events from JSON or JSONL."""
38
- events = _load_stage_events(file)
106
+ events, stats = _load_stage_events_with_stats(
107
+ file,
108
+ skip_invalid=skip_invalid,
109
+ failed_output=failed_output,
110
+ )
111
+
112
+ if stats.has_failures:
113
+ _print_validation_stats(console, stats)
114
+ logger.warning(
115
+ "StageEvent validation failures: %d/%d (types: %s)",
116
+ stats.failed_count,
117
+ stats.total_processed,
118
+ dict(stats.error_counts),
119
+ )
120
+
39
121
  if not events:
40
- console.print("[yellow]No stage events found in the input file.[/yellow]")
122
+ console.print("[yellow]No valid stage events found in the input file.[/yellow]")
41
123
  raise typer.Exit(1)
42
124
 
43
- storage = SQLiteStorageAdapter(db_path=db_path)
125
+ resolved_db_path = _resolve_db_path(db_path)
126
+ storage = SQLiteStorageAdapter(db_path=resolved_db_path)
44
127
  stored = storage.save_stage_events(events)
45
128
 
46
129
  console.print(f"[green]Stored {stored} stage event(s).[/green]")
@@ -61,10 +144,11 @@ def create_stage_app(console: Console) -> typer.Typer:
61
144
  "-n",
62
145
  help="Maximum number of rows to display.",
63
146
  ),
64
- db_path: Path = db_option(help_text="Path to database file."),
147
+ db_path: Path | None = db_option(help_text="Path to database file."),
65
148
  ) -> None:
66
149
  """List stage events for a run."""
67
- storage = SQLiteStorageAdapter(db_path=db_path)
150
+ resolved_db_path = _resolve_db_path(db_path)
151
+ storage = SQLiteStorageAdapter(db_path=resolved_db_path)
68
152
  events = storage.list_stage_events(run_id, stage_type=stage_type)
69
153
 
70
154
  if not events:
@@ -97,10 +181,11 @@ def create_stage_app(console: Console) -> typer.Typer:
97
181
  @stage_app.command("summary")
98
182
  def summary(
99
183
  run_id: str = typer.Argument(..., help="Run ID to summarize."),
100
- db_path: Path = db_option(help_text="Path to database file."),
184
+ db_path: Path | None = db_option(help_text="Path to database file."),
101
185
  ) -> None:
102
186
  """Show summary stats for stage events."""
103
- storage = SQLiteStorageAdapter(db_path=db_path)
187
+ resolved_db_path = _resolve_db_path(db_path)
188
+ storage = SQLiteStorageAdapter(db_path=resolved_db_path)
104
189
  events = storage.list_stage_events(run_id)
105
190
  if not events:
106
191
  console.print("[yellow]No stage events found.[/yellow]")
@@ -130,10 +215,11 @@ def create_stage_app(console: Console) -> typer.Typer:
130
215
  "--thresholds-profile",
131
216
  help="Profile key for thresholds JSON (defaults to Settings profile).",
132
217
  ),
133
- db_path: Path = db_option(help_text="Path to database file."),
218
+ db_path: Path | None = db_option(help_text="Path to database file."),
134
219
  ) -> None:
135
220
  """Compute stage metrics from stored events."""
136
- storage = SQLiteStorageAdapter(db_path=db_path)
221
+ resolved_db_path = _resolve_db_path(db_path)
222
+ storage = SQLiteStorageAdapter(db_path=resolved_db_path)
137
223
  events = storage.list_stage_events(run_id)
138
224
  if not events:
139
225
  console.print("[yellow]No stage events found.[/yellow]")
@@ -187,10 +273,11 @@ def create_stage_app(console: Console) -> typer.Typer:
187
273
  "--save-metrics/--no-save-metrics",
188
274
  help="Store computed stage metrics in the database.",
189
275
  ),
190
- db_path: Path = db_option(help_text="Path to database file."),
276
+ db_path: Path | None = db_option(help_text="Path to database file."),
191
277
  ) -> None:
192
278
  """Report stage summary, metrics, and improvement guides."""
193
- storage = SQLiteStorageAdapter(db_path=db_path)
279
+ resolved_db_path = _resolve_db_path(db_path)
280
+ storage = SQLiteStorageAdapter(db_path=resolved_db_path)
194
281
  events = storage.list_stage_events(run_id)
195
282
  if not events:
196
283
  console.print("[yellow]No stage events found.[/yellow]")
@@ -227,17 +314,36 @@ def create_stage_app(console: Console) -> typer.Typer:
227
314
  return stage_app
228
315
 
229
316
 
230
- def _load_stage_events(file_path: Path) -> list[StageEvent]:
317
+ def _load_stage_events_with_stats(
318
+ file_path: Path,
319
+ *,
320
+ skip_invalid: bool = False,
321
+ failed_output: Path | None = None,
322
+ ) -> tuple[list[StageEvent], ValidationStats]:
231
323
  suffix = file_path.suffix.lower()
232
324
  if suffix == ".jsonl":
233
- return _load_jsonl(file_path)
325
+ return _load_jsonl_with_stats(
326
+ file_path,
327
+ skip_invalid=skip_invalid,
328
+ failed_output=failed_output,
329
+ )
234
330
  if suffix == ".json":
235
- return _load_json(file_path)
331
+ return _load_json_with_stats(
332
+ file_path,
333
+ skip_invalid=skip_invalid,
334
+ failed_output=failed_output,
335
+ )
236
336
  raise typer.BadParameter("Unsupported file format. Use .json or .jsonl")
237
337
 
238
338
 
239
- def _load_jsonl(file_path: Path) -> list[StageEvent]:
339
+ def _load_jsonl_with_stats(
340
+ file_path: Path,
341
+ *,
342
+ skip_invalid: bool = False,
343
+ failed_output: Path | None = None,
344
+ ) -> tuple[list[StageEvent], ValidationStats]:
240
345
  events: list[StageEvent] = []
346
+ stats = ValidationStats()
241
347
  with file_path.open(encoding="utf-8") as handle:
242
348
  for idx, line in enumerate(handle, start=1):
243
349
  raw = line.strip()
@@ -246,15 +352,43 @@ def _load_jsonl(file_path: Path) -> list[StageEvent]:
246
352
  try:
247
353
  payload = json.loads(raw)
248
354
  except json.JSONDecodeError as exc:
355
+ if failed_output:
356
+ _record_failed_sample(
357
+ failed_output,
358
+ {"raw": raw},
359
+ error=f"JSON parse error at line {idx}: {exc}",
360
+ line=idx,
361
+ )
362
+ if skip_invalid:
363
+ stats.record_failure(f"JSON parse error at line {idx}")
364
+ logger.debug("Skipped invalid JSON at line %d: %s", idx, exc)
365
+ continue
249
366
  raise typer.BadParameter(f"Invalid JSON at line {idx}") from exc
250
367
  try:
251
368
  events.append(StageEvent.from_dict(payload))
369
+ stats.record_success()
252
370
  except ValueError as exc:
371
+ if failed_output:
372
+ _record_failed_sample(
373
+ failed_output,
374
+ payload,
375
+ error=f"Invalid stage event at line {idx}: {exc}",
376
+ line=idx,
377
+ )
378
+ if skip_invalid:
379
+ stats.record_failure(str(exc))
380
+ logger.debug("Skipped invalid stage event at line %d: %s", idx, exc)
381
+ continue
253
382
  raise typer.BadParameter(f"Invalid stage event at line {idx}: {exc}") from exc
254
- return events
383
+ return events, stats
255
384
 
256
385
 
257
- def _load_json(file_path: Path) -> list[StageEvent]:
386
+ def _load_json_with_stats(
387
+ file_path: Path,
388
+ *,
389
+ skip_invalid: bool = False,
390
+ failed_output: Path | None = None,
391
+ ) -> tuple[list[StageEvent], ValidationStats]:
258
392
  with file_path.open(encoding="utf-8") as handle:
259
393
  payload = json.load(handle)
260
394
 
@@ -267,13 +401,61 @@ def _load_json(file_path: Path) -> list[StageEvent]:
267
401
  else:
268
402
  raise typer.BadParameter("Unsupported JSON structure for stage events")
269
403
 
270
- events = []
404
+ events: list[StageEvent] = []
405
+ stats = ValidationStats()
271
406
  for idx, item in enumerate(raw_events, start=1):
272
407
  try:
273
408
  events.append(StageEvent.from_dict(item))
409
+ stats.record_success()
274
410
  except ValueError as exc:
411
+ if failed_output:
412
+ _record_failed_sample(
413
+ failed_output,
414
+ item,
415
+ error=f"Invalid stage event at index {idx}: {exc}",
416
+ index=idx,
417
+ )
418
+ if skip_invalid:
419
+ stats.record_failure(str(exc))
420
+ logger.debug("Skipped invalid stage event at index %d: %s", idx, exc)
421
+ continue
275
422
  raise typer.BadParameter(f"Invalid stage event at index {idx}: {exc}") from exc
276
- return events
423
+ return events, stats
424
+
425
+
426
+ def _record_failed_sample(
427
+ output_path: Path | None,
428
+ payload: object,
429
+ *,
430
+ error: str,
431
+ index: int | None = None,
432
+ line: int | None = None,
433
+ ) -> None:
434
+ if output_path is None:
435
+ return
436
+ output_path.parent.mkdir(parents=True, exist_ok=True)
437
+ record = {
438
+ "error": error,
439
+ "index": index,
440
+ "line": line,
441
+ "payload": payload,
442
+ }
443
+ with output_path.open("a", encoding="utf-8") as handle:
444
+ handle.write(json.dumps(record, ensure_ascii=False) + "\n")
445
+
446
+
447
+ def _print_validation_stats(console: Console, stats: ValidationStats) -> None:
448
+ console.print(
449
+ f"[yellow]Validation: {stats.valid_count}/{stats.total_processed} valid, "
450
+ f"{stats.failed_count} failed[/yellow]"
451
+ )
452
+ if stats.error_counts:
453
+ table = Table(show_header=True, header_style="bold yellow")
454
+ table.add_column("Error Type")
455
+ table.add_column("Count", justify="right")
456
+ for error_type, count in sorted(stats.error_counts.items(), key=lambda x: -x[1]):
457
+ table.add_row(error_type, str(count))
458
+ console.print(table)
277
459
 
278
460
 
279
461
  def _print_ingest_summary(console: Console, events: Iterable[StageEvent]) -> None:
@@ -324,7 +506,10 @@ def _load_thresholds_map(file_path: Path, *, profile: str | None = None) -> dict
324
506
 
325
507
  thresholds: dict[str, float] = {}
326
508
  for key, value in thresholds_payload.items():
327
- thresholds[str(key)] = float(value)
509
+ if isinstance(value, (int, float, str)):
510
+ thresholds[str(key)] = float(value)
511
+ continue
512
+ raise typer.BadParameter(f"Invalid threshold value for '{key}': {value}")
328
513
 
329
514
  return thresholds
330
515
 
@@ -362,6 +547,13 @@ def _load_default_profile() -> str | None:
362
547
  return None
363
548
 
364
549
 
550
+ def _resolve_db_path(db_path: Path | None) -> Path:
551
+ resolved = db_path or Settings().evalvault_db_path
552
+ if resolved is None:
553
+ raise typer.BadParameter("Database path is not configured.")
554
+ return resolved
555
+
556
+
365
557
  def _print_stage_summary(console: Console, summary_data) -> None:
366
558
  table = Table(show_header=True, header_style="bold cyan")
367
559
  table.add_column("Stage Type")
@@ -397,24 +589,25 @@ def _print_metric_summary(console: Console, metrics: list[StageMetric]) -> None:
397
589
  table.add_column("Pass Rate", justify="right")
398
590
 
399
591
  for metric_name, stats in sorted(aggregates.items()):
592
+ pass_rate = str(stats.get("pass_rate", "-"))
400
593
  table.add_row(
401
594
  metric_name,
402
595
  str(stats["count"]),
403
596
  f"{stats['avg']:.4f}",
404
597
  f"{stats['min']:.4f}",
405
598
  f"{stats['max']:.4f}",
406
- stats.get("pass_rate", "-"),
599
+ pass_rate,
407
600
  )
408
601
 
409
602
  console.print(table)
410
603
 
411
604
 
412
- def _aggregate_metrics(metrics: list[StageMetric]) -> dict[str, dict[str, float]]:
605
+ def _aggregate_metrics(metrics: list[StageMetric]) -> dict[str, dict[str, float | str]]:
413
606
  grouped: dict[str, list[StageMetric]] = defaultdict(list)
414
607
  for metric in metrics:
415
608
  grouped[metric.metric_name].append(metric)
416
609
 
417
- aggregates: dict[str, dict[str, float]] = {}
610
+ aggregates: dict[str, dict[str, float | str]] = {}
418
611
  for name, items in grouped.items():
419
612
  scores = [item.score for item in items]
420
613
  pass_values = [item.passed for item in items if item.passed is not None]
@@ -23,6 +23,9 @@ from evalvault.adapters.outbound.analysis.comparison_report_module import (
23
23
  ComparisonReportModule,
24
24
  )
25
25
  from evalvault.adapters.outbound.analysis.data_loader_module import DataLoaderModule
26
+ from evalvault.adapters.outbound.analysis.dataset_feature_analyzer_module import (
27
+ DatasetFeatureAnalyzerModule,
28
+ )
26
29
  from evalvault.adapters.outbound.analysis.detailed_report_module import (
27
30
  DetailedReportModule,
28
31
  )
@@ -136,6 +139,7 @@ __all__ = [
136
139
  "CausalAnalyzerModule",
137
140
  "ComparisonReportModule",
138
141
  "DataLoaderModule",
142
+ "DatasetFeatureAnalyzerModule",
139
143
  "DetailedReportModule",
140
144
  "DiagnosticPlaybookModule",
141
145
  "EmbeddingAnalyzerModule",