evalvault-1.63.1-py3-none-any.whl → evalvault-1.65.0-py3-none-any.whl
- evalvault/adapters/inbound/api/main.py +147 -9
- evalvault/adapters/inbound/api/routers/config.py +6 -1
- evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/methods/external_command.py +22 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
- evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
- evalvault/config/secret_manager.py +118 -0
- evalvault/config/settings.py +141 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
--- /dev/null
+++ b/evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import asyncio
+
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+
+
+class ComparisonPipelineAdapter(ComparisonPipelinePort):
+    def __init__(self, service: AnalysisPipelineService) -> None:
+        self._service = service
+
+    def run_comparison(
+        self,
+        *,
+        run_ids: list[str],
+        compare_metrics: list[str] | None,
+        test_type: str,
+        parallel: bool,
+        concurrency: int | None,
+        report_type: str,
+        use_llm_report: bool,
+    ) -> PipelineResult:
+        params = {
+            "run_ids": run_ids,
+            "compare_metrics": compare_metrics,
+            "test_type": test_type,
+            "report_type": report_type,
+            "use_llm_report": use_llm_report,
+        }
+        if parallel:
+            if concurrency is not None:
+                params["max_concurrency"] = concurrency
+            return asyncio.run(
+                self._service.analyze_intent_async(
+                    AnalysisIntent.GENERATE_COMPARISON,
+                    run_id=run_ids[0] if run_ids else None,
+                    **params,
+                )
+            )
+        return self._service.analyze_intent(
+            AnalysisIntent.GENERATE_COMPARISON,
+            run_id=run_ids[0] if run_ids else None,
+            **params,
+        )
+
+
+__all__ = ["ComparisonPipelineAdapter"]
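
Usage sketch (not part of the diff): how the new adapter might be wired. The AnalysisPipelineService constructor is not shown in this diff, so `service` and all argument values below are assumptions for illustration.

    # Hypothetical wiring; `service` is an already-constructed AnalysisPipelineService.
    from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import (
        ComparisonPipelineAdapter,
    )

    adapter = ComparisonPipelineAdapter(service)
    result = adapter.run_comparison(
        run_ids=["run-a", "run-b"],  # hypothetical run IDs
        compare_metrics=None,
        test_type="t-test",          # assumed value, not shown in the diff
        parallel=True,               # routes through asyncio.run(...analyze_intent_async(...))
        concurrency=4,               # forwarded as params["max_concurrency"]
        report_type="markdown",      # assumed value
        use_llm_report=False,
    )

Note that the parallel path calls asyncio.run(), so run_comparison must be invoked from synchronous code; inside an already-running event loop it would raise RuntimeError.
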
--- /dev/null
+++ b/evalvault/adapters/outbound/artifact_fs.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+
+class LocalArtifactFileSystemAdapter(ArtifactFileSystemPort):
+    def exists(self, path: Path) -> bool:
+        return path.exists()
+
+    def is_dir(self, path: Path) -> bool:
+        return path.is_dir()
+
+    def read_text(self, path: Path) -> str:
+        return path.read_text(encoding="utf-8")
--- /dev/null
+++ b/evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileWriter(DifficultyProfileWriterPort):
+    def write_profile(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+        breakdown_path = artifacts_dir / "difficulty_breakdown.json"
+        cases_path = artifacts_dir / "difficulty_cases.json"
+        breakdown_payload = artifacts.get("breakdown")
+        cases_payload = artifacts.get("cases")
+        write_json(
+            breakdown_path,
+            breakdown_payload if isinstance(breakdown_payload, dict) else {},
+        )
+        write_json(
+            cases_path,
+            {"cases": cases_payload} if isinstance(cases_payload, list) else {"cases": []},
+        )
+
+        index_payload = {
+            "files": {
+                "breakdown": str(breakdown_path),
+                "cases": str(cases_path),
+            }
+        }
+        index_path = artifacts_dir / "index.json"
+        write_json(index_path, index_payload)
+
+        artifacts_index = {
+            "dir": str(artifacts_dir),
+            "index": str(index_path),
+        }
+        envelope["artifacts"] = artifacts_index
+        write_json(output_path, envelope)
+
+        return artifacts_index
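
Usage sketch (not part of the diff): the on-disk layout produced by write_profile. The envelope and artifact payloads below are invented for illustration; write_json is the CLI helper imported by the adapter above.

    from pathlib import Path

    from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import (
        DifficultyProfileWriter,
    )

    writer = DifficultyProfileWriter()
    index = writer.write_profile(
        output_path=Path("out/profile.json"),
        artifacts_dir=Path("out/artifacts"),
        envelope={"run_id": "run-a"},                       # hypothetical envelope
        artifacts={"breakdown": {"easy": 3}, "cases": []},  # hypothetical artifacts
    )
    # index == {"dir": "out/artifacts", "index": "out/artifacts/index.json"} (POSIX paths)
    # Writes difficulty_breakdown.json, difficulty_cases.json, and index.json under
    # out/artifacts, then out/profile.json with envelope["artifacts"] = index.
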
--- /dev/null
+++ b/evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+
+
+class OpsSnapshotWriter(OpsSnapshotWriterPort):
+    def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(path, payload)
--- /dev/null
+++ b/evalvault/adapters/outbound/judge_calibration_adapter.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+
+class JudgeCalibrationAdapter(JudgeCalibrationPort):
+    def __init__(self) -> None:
+        self._service = JudgeCalibrationService()
+
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        return self._service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
--- /dev/null
+++ b/evalvault/adapters/outbound/judge_calibration_reporter.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+class JudgeCalibrationReporter:
+    def render_json(self, result: JudgeCalibrationResult) -> dict[str, Any]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def write_artifacts(
+        self,
+        *,
+        result: JudgeCalibrationResult,
+        artifacts_dir: Path,
+    ) -> dict[str, str]:
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+        index_path = artifacts_dir / "index.json"
+        payload = {
+            "run_id": result.summary.run_id,
+            "metrics": [metric.metric for metric in result.metrics],
+            "cases": {},
+        }
+        for metric, cases in result.case_results.items():
+            case_path = artifacts_dir / f"{metric}.json"
+            case_payload = [
+                {
+                    "test_case_id": case.test_case_id,
+                    "raw_score": case.raw_score,
+                    "calibrated_score": case.calibrated_score,
+                    "label": case.label,
+                    "label_source": case.label_source,
+                }
+                for case in cases
+            ]
+            case_path.write_text(
+                json.dumps(case_payload, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            payload["cases"][metric] = str(case_path)
+        index_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        return {"dir": str(artifacts_dir), "index": str(index_path)}
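
Usage sketch (not part of the diff): computing a calibration result via the new adapter and writing its artifacts. `run` and `feedbacks` are assumed inputs, and the string argument values are hypothetical; the diff does not show which labels_source/method values the service accepts.

    from pathlib import Path

    from evalvault.adapters.outbound.judge_calibration_adapter import JudgeCalibrationAdapter
    from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter

    result = JudgeCalibrationAdapter().calibrate(
        run,                        # EvaluationRun (assumed to exist)
        feedbacks,                  # list[SatisfactionFeedback] (assumed to exist)
        labels_source="feedback",   # assumed value
        method="platt",             # assumed value
        metrics=["faithfulness"],   # hypothetical metric name
        holdout_ratio=0.2,
        seed=42,
    )
    reporter = JudgeCalibrationReporter()
    paths = reporter.write_artifacts(result=result, artifacts_dir=Path("out/calibration"))
    # paths == {"dir": "out/calibration", "index": "out/calibration/index.json"};
    # each metric additionally gets out/calibration/<metric>.json with per-case
    # raw/calibrated scores and label provenance.
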
--- a/evalvault/adapters/outbound/methods/external_command.py
+++ b/evalvault/adapters/outbound/methods/external_command.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 import json
 import os
 import subprocess
+import warnings
 from collections.abc import Sequence
 from pathlib import Path
 from typing import Any
@@ -18,7 +19,9 @@ class ExternalCommandMethod(RagMethodPort):
 
     name = "external_command"
     version = "0.1.0"
-    description =
+    description = (
+        "Execute a method in a separate process (shell=True requires a trusted command string)."
+    )
     tags = ("external", "isolation")
 
     def __init__(
@@ -67,6 +70,7 @@ class ExternalCommandMethod(RagMethodPort):
         )
 
         command = self._build_command(runtime)
+        self._validate_shell_usage(command)
         result = subprocess.run(  # noqa: S603 - user-controlled command by design
             command,
             cwd=self._workdir,
@@ -104,6 +108,23 @@ class ExternalCommandMethod(RagMethodPort):
         except KeyError as exc:
             raise ValueError(f"Unknown command placeholder: {exc}") from exc
 
+    def _validate_shell_usage(self, command: list[str] | str) -> None:
+        if not self._shell:
+            return
+        if not isinstance(command, str):
+            raise ValueError(
+                "shell=True requires a single command string; list arguments are rejected."
+            )
+        if not command.strip():
+            raise ValueError("shell=True requires a non-empty command string.")
+        if "\n" in command or "\r" in command:
+            raise ValueError("shell=True command must not contain newlines.")
+        warnings.warn(
+            "shell=True executes through the system shell. Use only trusted commands.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+
     @staticmethod
     def _load_payload(path: Path) -> Any:
         if not path.exists():
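
Behavior sketch (not part of the diff): a standalone mirror of the new guard, showing what the shell=True validation accepts and rejects. The real check lives on ExternalCommandMethod._validate_shell_usage and consults the instance's self._shell flag.

    import warnings

    def validate_shell_usage(command: list[str] | str, *, shell: bool) -> None:
        # Mirrors the rules added above: with shell=True, only a trusted,
        # non-empty, single-line command string is allowed.
        if not shell:
            return
        if not isinstance(command, str):
            raise ValueError(
                "shell=True requires a single command string; list arguments are rejected."
            )
        if not command.strip():
            raise ValueError("shell=True requires a non-empty command string.")
        if "\n" in command or "\r" in command:
            raise ValueError("shell=True command must not contain newlines.")
        warnings.warn(
            "shell=True executes through the system shell. Use only trusted commands.",
            RuntimeWarning,
            stacklevel=2,
        )

    validate_shell_usage("echo ok", shell=True)         # emits RuntimeWarning, then returns
    # validate_shell_usage(["echo", "ok"], shell=True)  # would raise ValueError
    # validate_shell_usage("a\nb", shell=True)          # would raise ValueError
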
--- a/evalvault/adapters/outbound/tracker/langfuse_adapter.py
+++ b/evalvault/adapters/outbound/tracker/langfuse_adapter.py
@@ -4,6 +4,13 @@ from typing import Any
 
 from langfuse import Langfuse
 
+from evalvault.adapters.outbound.tracker.log_sanitizer import (
+    MAX_CONTEXT_CHARS,
+    MAX_LOG_CHARS,
+    sanitize_payload,
+    sanitize_text,
+    sanitize_text_list,
+)
 from evalvault.config.phoenix_support import extract_phoenix_links
 from evalvault.domain.entities import EvaluationRun
 from evalvault.ports.outbound.tracker_port import TrackerPort
@@ -56,13 +63,15 @@ class LangfuseAdapter(TrackerPort):
             span.update_trace(name=name, metadata=metadata)
             self._traces[trace_id] = span
         else:
-
-
+            trace_fn: Any = getattr(self._client, "trace", None)
+            if trace_fn is None:
+                raise RuntimeError("Langfuse client does not expose trace API")
+            trace_obj = trace_fn(
                 name=name,
                 metadata=metadata,
             )
-            trace_id =
-            self._traces[trace_id] =
+            trace_id = trace_obj.id
+            self._traces[trace_id] = trace_obj
         return trace_id
 
     def add_span(
@@ -88,21 +97,31 @@ class LangfuseAdapter(TrackerPort):
             raise ValueError(f"Trace not found: {trace_id}")
 
         trace_or_span = self._traces[trace_id]
+        safe_input = (
+            sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
+            if input_data is not None
+            else None
+        )
+        safe_output = (
+            sanitize_payload(output_data, max_chars=MAX_LOG_CHARS)
+            if output_data is not None
+            else None
+        )
         # Support both old and new Langfuse API
         if hasattr(trace_or_span, "start_span"):
             # Langfuse 3.x: create nested span
             child_span = trace_or_span.start_span(
                 name=name,
-                input=
-                output=
+                input=safe_input,
+                output=safe_output,
             )
             child_span.end()
         else:
             # Langfuse 2.x: use span method on trace
             trace_or_span.span(
                 name=name,
-                input=
-                output=
+                input=safe_input,
+                output=safe_output,
             )
 
     def log_score(
@@ -223,7 +242,7 @@ class LangfuseAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -377,10 +396,13 @@ class LangfuseAdapter(TrackerPort):
         # Span input: test case data (question, answer, contexts, ground_truth)
         span_input = {
             "test_case_id": result.test_case_id,
-            "question": result.question,
-            "answer": result.answer,
-            "contexts":
-
+            "question": sanitize_text(result.question, max_chars=MAX_LOG_CHARS),
+            "answer": sanitize_text(result.answer, max_chars=MAX_LOG_CHARS),
+            "contexts": sanitize_text_list(
+                result.contexts,
+                max_chars=MAX_CONTEXT_CHARS,
+            ),
+            "ground_truth": sanitize_text(result.ground_truth, max_chars=MAX_LOG_CHARS),
         }
 
         # Span output: evaluation results
@@ -401,12 +423,15 @@ class LangfuseAdapter(TrackerPort):
         }
 
         # Span metadata: additional info
-        span_metadata = {
+        span_metadata: dict[str, float | int] = {
             "tokens_used": result.tokens_used,
             "latency_ms": result.latency_ms,
        }
         if result.cost_usd:
-            span_metadata
+            span_metadata = {
+                **span_metadata,
+                "cost_usd": float(result.cost_usd),
+            }
 
         if hasattr(root_span, "start_span"):
             child_span = root_span.start_span(
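
Note on the passed_count change: the rewritten condition binds the metric once with a walrus expression and requires `passed is True`, so cases whose metric is missing or whose passed flag is None are no longer counted. A self-contained illustration with hypothetical stand-in types (the real result/metric entities live elsewhere in evalvault):

    from dataclasses import dataclass

    @dataclass
    class FakeMetric:
        passed: bool | None  # None means "not evaluated"

    @dataclass
    class FakeResult:
        metric: FakeMetric | None

        def get_metric(self, name: str) -> FakeMetric | None:
            return self.metric

    results = [FakeResult(FakeMetric(True)), FakeResult(FakeMetric(None)), FakeResult(None)]
    passed_count = sum(
        1
        for r in results
        if (metric := r.get_metric("faithfulness")) and metric.passed is True
    )
    assert passed_count == 1  # passed=None and missing metrics are excluded
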
--- /dev/null
+++ b/evalvault/adapters/outbound/tracker/log_sanitizer.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+MASK_TOKEN = "[REDACTED]"
+MAX_LOG_CHARS = 1000
+MAX_CONTEXT_CHARS = 500
+MAX_LIST_ITEMS = 20
+MAX_PAYLOAD_DEPTH = 2
+
+_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+_PHONE_PATTERN = re.compile(
+    r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}\b"
+)
+_SSN_PATTERN = re.compile(r"\b\d{3}-\d{2}-\d{4}\b")
+_CARD_PATTERN = re.compile(r"\b(?:\d[ -]*?){13,16}\b")
+
+
+def _mask_pii(text: str) -> str:
+    text = _EMAIL_PATTERN.sub(MASK_TOKEN, text)
+    text = _PHONE_PATTERN.sub(MASK_TOKEN, text)
+    text = _SSN_PATTERN.sub(MASK_TOKEN, text)
+    text = _CARD_PATTERN.sub(MASK_TOKEN, text)
+    return text
+
+
+def _truncate(text: str, max_chars: int) -> str:
+    if max_chars <= 0:
+        return ""
+    if len(text) <= max_chars:
+        return text
+    if max_chars <= 3:
+        return text[:max_chars]
+    return f"{text[: max_chars - 3]}..."
+
+
+def sanitize_text(value: str | None, *, max_chars: int = MAX_LOG_CHARS) -> str | None:
+    if value is None:
+        return None
+    if not isinstance(value, str):
+        value = str(value)
+    return _truncate(_mask_pii(value), max_chars)
+
+
+def sanitize_text_list(
+    values: list[str] | tuple[str, ...] | None,
+    *,
+    max_items: int = MAX_LIST_ITEMS,
+    max_chars: int = MAX_CONTEXT_CHARS,
+) -> list[str]:
+    if not values:
+        return []
+    trimmed = list(values)[:max_items]
+    return [sanitize_text(item, max_chars=max_chars) or "" for item in trimmed]
+
+
+def sanitize_payload(
+    value: Any,
+    *,
+    max_chars: int = MAX_LOG_CHARS,
+    max_items: int = MAX_LIST_ITEMS,
+    max_depth: int = MAX_PAYLOAD_DEPTH,
+) -> Any:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return sanitize_text(value, max_chars=max_chars)
+    if isinstance(value, bool | int | float):
+        return value
+    if max_depth <= 0:
+        return sanitize_text(str(value), max_chars=max_chars)
+    if isinstance(value, dict):
+        return {
+            key: sanitize_payload(
+                item,
+                max_chars=max_chars,
+                max_items=max_items,
+                max_depth=max_depth - 1,
+            )
+            for key, item in list(value.items())[:max_items]
+        }
+    if isinstance(value, list | tuple | set):
+        return [
+            sanitize_payload(
+                item,
+                max_chars=max_chars,
+                max_items=max_items,
+                max_depth=max_depth - 1,
+            )
+            for item in list(value)[:max_items]
+        ]
+    return sanitize_text(str(value), max_chars=max_chars)
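
Usage sketch (not part of the diff), derived directly from the module above:

    from evalvault.adapters.outbound.tracker.log_sanitizer import (
        sanitize_payload,
        sanitize_text,
    )

    # PII masking: emails, phone-like numbers, SSNs, and card-like digit runs
    # are replaced with "[REDACTED]".
    sanitize_text("contact alice@example.com")
    # -> "contact [REDACTED]"

    # Truncation keeps max_chars as the hard cap, spending 3 of it on "...".
    sanitize_text("x" * 50, max_chars=10)
    # -> "xxxxxxx..."

    # Nested payloads are masked recursively; beyond MAX_PAYLOAD_DEPTH (2) the
    # remainder is stringified and sanitized as plain text.
    sanitize_payload({"a": {"b": {"c": "deep@example.com"}}})
    # -> {"a": {"b": "{'c': '[REDACTED]'}"}}

This is the module both tracker adapters now route span input/output through, so trace backends never receive raw questions, answers, or contexts.
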
--- a/evalvault/adapters/outbound/tracker/mlflow_adapter.py
+++ b/evalvault/adapters/outbound/tracker/mlflow_adapter.py
@@ -4,6 +4,7 @@ import json
 import tempfile
 from typing import Any
 
+from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
 from evalvault.domain.entities import EvaluationRun
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
@@ -85,8 +86,8 @@ class MLflowAdapter(TrackerPort):
         # Store span data as JSON artifact
         span_data = {
             "name": name,
-            "input": input_data,
-            "output": output_data,
+            "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
+            "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
 
         with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: