evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. evalvault/adapters/inbound/api/main.py +147 -9
  2. evalvault/adapters/inbound/api/routers/config.py +6 -1
  3. evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  13. evalvault/adapters/outbound/artifact_fs.py +16 -0
  14. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  15. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  16. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  17. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  18. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  19. evalvault/adapters/outbound/methods/external_command.py +22 -1
  20. evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
  21. evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
  22. evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
  23. evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
  24. evalvault/config/secret_manager.py +118 -0
  25. evalvault/config/settings.py +141 -1
  26. evalvault/domain/entities/__init__.py +10 -0
  27. evalvault/domain/entities/judge_calibration.py +50 -0
  28. evalvault/domain/entities/stage.py +11 -3
  29. evalvault/domain/services/artifact_lint_service.py +268 -0
  30. evalvault/domain/services/benchmark_runner.py +1 -6
  31. evalvault/domain/services/dataset_preprocessor.py +26 -0
  32. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  33. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  34. evalvault/domain/services/evaluator.py +2 -0
  35. evalvault/domain/services/judge_calibration_service.py +495 -0
  36. evalvault/domain/services/ops_snapshot_service.py +159 -0
  37. evalvault/domain/services/regression_gate_service.py +199 -0
  38. evalvault/domain/services/run_comparison_service.py +159 -0
  39. evalvault/domain/services/stage_event_builder.py +6 -1
  40. evalvault/domain/services/stage_metric_service.py +83 -18
  41. evalvault/ports/outbound/__init__.py +4 -0
  42. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  43. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  44. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  45. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  46. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  47. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
  48. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
  49. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  50. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  51. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import asyncio
+
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+
+
+class ComparisonPipelineAdapter(ComparisonPipelinePort):
+    def __init__(self, service: AnalysisPipelineService) -> None:
+        self._service = service
+
+    def run_comparison(
+        self,
+        *,
+        run_ids: list[str],
+        compare_metrics: list[str] | None,
+        test_type: str,
+        parallel: bool,
+        concurrency: int | None,
+        report_type: str,
+        use_llm_report: bool,
+    ) -> PipelineResult:
+        params = {
+            "run_ids": run_ids,
+            "compare_metrics": compare_metrics,
+            "test_type": test_type,
+            "report_type": report_type,
+            "use_llm_report": use_llm_report,
+        }
+        if parallel:
+            if concurrency is not None:
+                params["max_concurrency"] = concurrency
+            return asyncio.run(
+                self._service.analyze_intent_async(
+                    AnalysisIntent.GENERATE_COMPARISON,
+                    run_id=run_ids[0] if run_ids else None,
+                    **params,
+                )
+            )
+        return self._service.analyze_intent(
+            AnalysisIntent.GENERATE_COMPARISON,
+            run_id=run_ids[0] if run_ids else None,
+            **params,
+        )
+
+
+__all__ = ["ComparisonPipelineAdapter"]
evalvault/adapters/outbound/artifact_fs.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+
+class LocalArtifactFileSystemAdapter(ArtifactFileSystemPort):
+    def exists(self, path: Path) -> bool:
+        return path.exists()
+
+    def is_dir(self, path: Path) -> bool:
+        return path.is_dir()
+
+    def read_text(self, path: Path) -> str:
+        return path.read_text(encoding="utf-8")
evalvault/adapters/outbound/filesystem/__init__.py
@@ -0,0 +1,3 @@
+from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter
+
+__all__ = ["OpsSnapshotWriter"]
evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileWriter(DifficultyProfileWriterPort):
+    def write_profile(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+        breakdown_path = artifacts_dir / "difficulty_breakdown.json"
+        cases_path = artifacts_dir / "difficulty_cases.json"
+        breakdown_payload = artifacts.get("breakdown")
+        cases_payload = artifacts.get("cases")
+        write_json(
+            breakdown_path,
+            breakdown_payload if isinstance(breakdown_payload, dict) else {},
+        )
+        write_json(
+            cases_path,
+            {"cases": cases_payload} if isinstance(cases_payload, list) else {"cases": []},
+        )
+
+        index_payload = {
+            "files": {
+                "breakdown": str(breakdown_path),
+                "cases": str(cases_path),
+            }
+        }
+        index_path = artifacts_dir / "index.json"
+        write_json(index_path, index_payload)
+
+        artifacts_index = {
+            "dir": str(artifacts_dir),
+            "index": str(index_path),
+        }
+        envelope["artifacts"] = artifacts_index
+        write_json(output_path, envelope)
+
+        return artifacts_index
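
As a rough sketch of how the writer lays files out on disk (paths and artifact keys follow the code above; the envelope and breakdown payloads are hypothetical):

from pathlib import Path

from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import DifficultyProfileWriter

writer = DifficultyProfileWriter()
index = writer.write_profile(
    output_path=Path("reports/difficulty_profile.json"),    # receives the envelope plus an "artifacts" index
    artifacts_dir=Path("reports/difficulty_artifacts"),
    envelope={"run_id": "run-a"},                            # hypothetical envelope payload
    artifacts={"breakdown": {"easy": 10, "hard": 3}, "cases": []},
)
# index == {"dir": "reports/difficulty_artifacts", "index": "reports/difficulty_artifacts/index.json"}
# The artifacts directory also gains difficulty_breakdown.json and difficulty_cases.json.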
evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+
+
+class OpsSnapshotWriter(OpsSnapshotWriterPort):
+    def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(path, payload)
evalvault/adapters/outbound/judge_calibration_adapter.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+
+class JudgeCalibrationAdapter(JudgeCalibrationPort):
+    def __init__(self) -> None:
+        self._service = JudgeCalibrationService()
+
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        return self._service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
evalvault/adapters/outbound/judge_calibration_reporter.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+class JudgeCalibrationReporter:
+    def render_json(self, result: JudgeCalibrationResult) -> dict[str, Any]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def write_artifacts(
+        self,
+        *,
+        result: JudgeCalibrationResult,
+        artifacts_dir: Path,
+    ) -> dict[str, str]:
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+        index_path = artifacts_dir / "index.json"
+        payload = {
+            "run_id": result.summary.run_id,
+            "metrics": [metric.metric for metric in result.metrics],
+            "cases": {},
+        }
+        for metric, cases in result.case_results.items():
+            case_path = artifacts_dir / f"{metric}.json"
+            case_payload = [
+                {
+                    "test_case_id": case.test_case_id,
+                    "raw_score": case.raw_score,
+                    "calibrated_score": case.calibrated_score,
+                    "label": case.label,
+                    "label_source": case.label_source,
+                }
+                for case in cases
+            ]
+            case_path.write_text(
+                json.dumps(case_payload, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            payload["cases"][metric] = str(case_path)
+        index_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        return {"dir": str(artifacts_dir), "index": str(index_path)}
evalvault/adapters/outbound/methods/external_command.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import json
 import os
 import subprocess
+import warnings
 from collections.abc import Sequence
 from pathlib import Path
 from typing import Any
@@ -18,7 +19,9 @@ class ExternalCommandMethod(RagMethodPort):
 
     name = "external_command"
     version = "0.1.0"
-    description = "Execute a method in a separate process."
+    description = (
+        "Execute a method in a separate process (shell=True requires a trusted command string)."
+    )
     tags = ("external", "isolation")
 
     def __init__(
@@ -67,6 +70,7 @@ class ExternalCommandMethod(RagMethodPort):
         )
 
         command = self._build_command(runtime)
+        self._validate_shell_usage(command)
         result = subprocess.run(  # noqa: S603 - user-controlled command by design
             command,
             cwd=self._workdir,
@@ -104,6 +108,23 @@ class ExternalCommandMethod(RagMethodPort):
         except KeyError as exc:
             raise ValueError(f"Unknown command placeholder: {exc}") from exc
 
+    def _validate_shell_usage(self, command: list[str] | str) -> None:
+        if not self._shell:
+            return
+        if not isinstance(command, str):
+            raise ValueError(
+                "shell=True requires a single command string; list arguments are rejected."
+            )
+        if not command.strip():
+            raise ValueError("shell=True requires a non-empty command string.")
+        if "\n" in command or "\r" in command:
+            raise ValueError("shell=True command must not contain newlines.")
+        warnings.warn(
+            "shell=True executes through the system shell. Use only trusted commands.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+
     @staticmethod
     def _load_payload(path: Path) -> Any:
         if not path.exists():
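
A behavior sketch of the new guard; `method` stands for an ExternalCommandMethod configured with shell=True (its constructor arguments are not shown in these hunks):

method._validate_shell_usage("python run_method.py input.json")
# -> emits a RuntimeWarning noting the string runs through the system shell, then returns

method._validate_shell_usage(["python", "run_method.py"])
# -> ValueError: shell=True requires a single command string; list arguments are rejected.

method._validate_shell_usage("echo ok\nrm -rf /tmp/x")
# -> ValueError: shell=True command must not contain newlines.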
evalvault/adapters/outbound/tracker/langfuse_adapter.py
@@ -4,6 +4,13 @@ from typing import Any
 
 from langfuse import Langfuse
 
+from evalvault.adapters.outbound.tracker.log_sanitizer import (
+    MAX_CONTEXT_CHARS,
+    MAX_LOG_CHARS,
+    sanitize_payload,
+    sanitize_text,
+    sanitize_text_list,
+)
 from evalvault.config.phoenix_support import extract_phoenix_links
 from evalvault.domain.entities import EvaluationRun
 from evalvault.ports.outbound.tracker_port import TrackerPort
@@ -56,13 +63,15 @@ class LangfuseAdapter(TrackerPort):
             span.update_trace(name=name, metadata=metadata)
             self._traces[trace_id] = span
         else:
-            # Langfuse 2.x: use trace method
-            trace = self._client.trace(
+            trace_fn: Any = getattr(self._client, "trace", None)
+            if trace_fn is None:
+                raise RuntimeError("Langfuse client does not expose trace API")
+            trace_obj = trace_fn(
                 name=name,
                 metadata=metadata,
             )
-            trace_id = trace.id
-            self._traces[trace_id] = trace
+            trace_id = trace_obj.id
+            self._traces[trace_id] = trace_obj
         return trace_id
 
     def add_span(
@@ -88,21 +97,31 @@ class LangfuseAdapter(TrackerPort):
             raise ValueError(f"Trace not found: {trace_id}")
 
         trace_or_span = self._traces[trace_id]
+        safe_input = (
+            sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
+            if input_data is not None
+            else None
+        )
+        safe_output = (
+            sanitize_payload(output_data, max_chars=MAX_LOG_CHARS)
+            if output_data is not None
+            else None
+        )
         # Support both old and new Langfuse API
         if hasattr(trace_or_span, "start_span"):
             # Langfuse 3.x: create nested span
             child_span = trace_or_span.start_span(
                 name=name,
-                input=input_data,
-                output=output_data,
+                input=safe_input,
+                output=safe_output,
             )
             child_span.end()
         else:
             # Langfuse 2.x: use span method on trace
             trace_or_span.span(
                 name=name,
-                input=input_data,
-                output=output_data,
+                input=safe_input,
+                output=safe_output,
             )
 
     def log_score(
@@ -223,7 +242,7 @@ class LangfuseAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -377,10 +396,13 @@ class LangfuseAdapter(TrackerPort):
         # Span input: test case data (question, answer, contexts, ground_truth)
         span_input = {
             "test_case_id": result.test_case_id,
-            "question": result.question,
-            "answer": result.answer,
-            "contexts": result.contexts,
-            "ground_truth": result.ground_truth,
+            "question": sanitize_text(result.question, max_chars=MAX_LOG_CHARS),
+            "answer": sanitize_text(result.answer, max_chars=MAX_LOG_CHARS),
+            "contexts": sanitize_text_list(
+                result.contexts,
+                max_chars=MAX_CONTEXT_CHARS,
+            ),
+            "ground_truth": sanitize_text(result.ground_truth, max_chars=MAX_LOG_CHARS),
         }
 
         # Span output: evaluation results
@@ -401,12 +423,15 @@ class LangfuseAdapter(TrackerPort):
         }
 
         # Span metadata: additional info
-        span_metadata = {
+        span_metadata: dict[str, float | int] = {
             "tokens_used": result.tokens_used,
             "latency_ms": result.latency_ms,
         }
         if result.cost_usd:
-            span_metadata["cost_usd"] = result.cost_usd
+            span_metadata = {
+                **span_metadata,
+                "cost_usd": float(result.cost_usd),
+            }
 
         if hasattr(root_span, "start_span"):
             child_span = root_span.start_span(
evalvault/adapters/outbound/tracker/log_sanitizer.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+MASK_TOKEN = "[REDACTED]"
+MAX_LOG_CHARS = 1000
+MAX_CONTEXT_CHARS = 500
+MAX_LIST_ITEMS = 20
+MAX_PAYLOAD_DEPTH = 2
+
+_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+_PHONE_PATTERN = re.compile(
+    r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}\b"
+)
+_SSN_PATTERN = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
+_CARD_PATTERN = re.compile(r"\b(?:\d[ -]*?){13,16}\b")
+
+
+def _mask_pii(text: str) -> str:
+    text = _EMAIL_PATTERN.sub(MASK_TOKEN, text)
+    text = _PHONE_PATTERN.sub(MASK_TOKEN, text)
+    text = _SSN_PATTERN.sub(MASK_TOKEN, text)
+    text = _CARD_PATTERN.sub(MASK_TOKEN, text)
+    return text
+
+
+def _truncate(text: str, max_chars: int) -> str:
+    if max_chars <= 0:
+        return ""
+    if len(text) <= max_chars:
+        return text
+    if max_chars <= 3:
+        return text[:max_chars]
+    return f"{text[: max_chars - 3]}..."
+
+
+def sanitize_text(value: str | None, *, max_chars: int = MAX_LOG_CHARS) -> str | None:
+    if value is None:
+        return None
+    if not isinstance(value, str):
+        value = str(value)
+    return _truncate(_mask_pii(value), max_chars)
+
+
+def sanitize_text_list(
+    values: list[str] | tuple[str, ...] | None,
+    *,
+    max_items: int = MAX_LIST_ITEMS,
+    max_chars: int = MAX_CONTEXT_CHARS,
+) -> list[str]:
+    if not values:
+        return []
+    trimmed = list(values)[:max_items]
+    return [sanitize_text(item, max_chars=max_chars) or "" for item in trimmed]
+
+
+def sanitize_payload(
+    value: Any,
+    *,
+    max_chars: int = MAX_LOG_CHARS,
+    max_items: int = MAX_LIST_ITEMS,
+    max_depth: int = MAX_PAYLOAD_DEPTH,
+) -> Any:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return sanitize_text(value, max_chars=max_chars)
+    if isinstance(value, bool | int | float):
+        return value
+    if max_depth <= 0:
+        return sanitize_text(str(value), max_chars=max_chars)
+    if isinstance(value, dict):
+        return {
+            key: sanitize_payload(
+                item,
+                max_chars=max_chars,
+                max_items=max_items,
+                max_depth=max_depth - 1,
+            )
+            for key, item in list(value.items())[:max_items]
+        }
+    if isinstance(value, list | tuple | set):
+        return [
+            sanitize_payload(
+                item,
+                max_chars=max_chars,
+                max_items=max_items,
+                max_depth=max_depth - 1,
+            )
+            for item in list(value)[:max_items]
+        ]
+    return sanitize_text(str(value), max_chars=max_chars)
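
A short sketch of how the sanitizer helpers behave (the example strings are illustrative; the masking and truncation follow the regexes and constants above):

from evalvault.adapters.outbound.tracker.log_sanitizer import sanitize_payload, sanitize_text

sanitize_text("Contact me at jane.doe@example.com")
# -> "Contact me at [REDACTED]"

sanitize_text("x" * 2000)
# -> 997 "x" characters followed by "..." (default MAX_LOG_CHARS = 1000)

sanitize_payload({"question": "Call 010-1234-5678", "nested": {"deep": {"deeper": "value"}}})
# -> the phone number is masked; nesting beyond MAX_PAYLOAD_DEPTH (2) is flattened to a sanitized string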
evalvault/adapters/outbound/tracker/mlflow_adapter.py
@@ -4,6 +4,7 @@ import json
 import tempfile
 from typing import Any
 
+from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
 from evalvault.domain.entities import EvaluationRun
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
@@ -85,8 +86,8 @@ class MLflowAdapter(TrackerPort):
         # Store span data as JSON artifact
         span_data = {
             "name": name,
-            "input": input_data,
-            "output": output_data,
+            "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
+            "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
 
         with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: