evalvault 1.63.1__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/main.py +147 -9
- evalvault/adapters/inbound/api/routers/config.py +6 -1
- evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/methods/external_command.py +22 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
- evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
- evalvault/config/secret_manager.py +118 -0
- evalvault/config/settings.py +141 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -9,6 +9,13 @@ from datetime import datetime
|
|
|
9
9
|
from typing import TYPE_CHECKING, Any
|
|
10
10
|
|
|
11
11
|
from evalvault.adapters.outbound.tracer.open_rag_trace_helpers import serialize_json
|
|
12
|
+
from evalvault.adapters.outbound.tracker.log_sanitizer import (
|
|
13
|
+
MAX_CONTEXT_CHARS,
|
|
14
|
+
MAX_LOG_CHARS,
|
|
15
|
+
sanitize_payload,
|
|
16
|
+
sanitize_text,
|
|
17
|
+
sanitize_text_list,
|
|
18
|
+
)
|
|
12
19
|
from evalvault.domain.entities import (
|
|
13
20
|
EvaluationRun,
|
|
14
21
|
GenerationData,
|
|
@@ -19,8 +26,7 @@ from evalvault.domain.entities import (
|
|
|
19
26
|
from evalvault.ports.outbound.tracker_port import TrackerPort
|
|
20
27
|
|
|
21
28
|
if TYPE_CHECKING:
|
|
22
|
-
from opentelemetry.sdk.trace import
|
|
23
|
-
from opentelemetry.trace import Tracer
|
|
29
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
24
30
|
|
|
25
31
|
|
|
26
32
|
class PhoenixAdapter(TrackerPort):
|
|
@@ -55,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
|
|
|
55
61
|
"""
|
|
56
62
|
self._endpoint = endpoint
|
|
57
63
|
self._service_name = service_name
|
|
58
|
-
self._tracer:
|
|
64
|
+
self._tracer: Any | None = None
|
|
59
65
|
self._tracer_provider: TracerProvider | None = None
|
|
60
|
-
self._active_spans: dict[str,
|
|
66
|
+
self._active_spans: dict[str, Any] = {}
|
|
67
|
+
self._tracer_any: Any | None = None
|
|
61
68
|
self._initialized = False
|
|
62
69
|
|
|
63
70
|
def _ensure_initialized(self) -> None:
|
|
@@ -83,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
|
|
|
83
90
|
provider = get_tracer_provider()
|
|
84
91
|
if provider:
|
|
85
92
|
self._tracer_provider = provider
|
|
86
|
-
self.
|
|
93
|
+
self._tracer_any = trace.get_tracer(__name__)
|
|
94
|
+
self._tracer = self._tracer_any
|
|
87
95
|
self._initialized = True
|
|
88
96
|
return
|
|
89
97
|
|
|
@@ -102,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
|
|
|
102
110
|
trace.set_tracer_provider(self._tracer_provider)
|
|
103
111
|
|
|
104
112
|
# Get tracer
|
|
105
|
-
self.
|
|
113
|
+
self._tracer_any = trace.get_tracer(__name__)
|
|
114
|
+
self._tracer = self._tracer_any
|
|
106
115
|
self._initialized = True
|
|
107
116
|
|
|
108
117
|
except ImportError as e:
|
|
@@ -127,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
|
|
|
127
136
|
self._ensure_initialized()
|
|
128
137
|
|
|
129
138
|
# Start a new span as root
|
|
130
|
-
|
|
139
|
+
tracer = self._tracer_any
|
|
140
|
+
if tracer is None:
|
|
141
|
+
tracer = self._tracer
|
|
142
|
+
if tracer is None:
|
|
143
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
144
|
+
span = tracer.start_span(name)
|
|
131
145
|
trace_id = str(uuid.uuid4())
|
|
132
146
|
|
|
133
147
|
# Set metadata as span attributes
|
|
@@ -166,14 +180,21 @@ class PhoenixAdapter(TrackerPort):
|
|
|
166
180
|
|
|
167
181
|
from opentelemetry import trace
|
|
168
182
|
|
|
183
|
+
tracer = self._tracer_any
|
|
184
|
+
if tracer is None:
|
|
185
|
+
tracer = self._tracer
|
|
186
|
+
if tracer is None:
|
|
187
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
169
188
|
parent_span = self._active_spans[trace_id]
|
|
170
189
|
context = trace.set_span_in_context(parent_span)
|
|
171
190
|
|
|
172
|
-
with
|
|
191
|
+
with tracer.start_span(name, context=context) as span:
|
|
173
192
|
if input_data is not None:
|
|
174
|
-
|
|
193
|
+
safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
|
|
194
|
+
span.set_attribute("input", json.dumps(safe_input, default=str))
|
|
175
195
|
if output_data is not None:
|
|
176
|
-
|
|
196
|
+
safe_output = sanitize_payload(output_data, max_chars=MAX_LOG_CHARS)
|
|
197
|
+
span.set_attribute("output", json.dumps(safe_output, default=str))
|
|
177
198
|
|
|
178
199
|
def log_score(
|
|
179
200
|
self,
|
|
@@ -270,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
|
|
|
270
291
|
passed_count = sum(
|
|
271
292
|
1
|
|
272
293
|
for r in run.results
|
|
273
|
-
if r.get_metric(metric_name) and
|
|
294
|
+
if (metric := r.get_metric(metric_name)) and metric.passed is True
|
|
274
295
|
)
|
|
275
296
|
avg_score = run.get_avg_score(metric_name)
|
|
276
297
|
threshold = run.thresholds.get(metric_name, 0.7)
|
|
@@ -360,20 +381,33 @@ class PhoenixAdapter(TrackerPort):
|
|
|
360
381
|
"""
|
|
361
382
|
from opentelemetry import trace
|
|
362
383
|
|
|
384
|
+
tracer = self._tracer_any
|
|
385
|
+
if tracer is None:
|
|
386
|
+
tracer = self._tracer
|
|
387
|
+
if tracer is None:
|
|
388
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
363
389
|
parent_span = self._active_spans[trace_id]
|
|
364
390
|
context = trace.set_span_in_context(parent_span)
|
|
365
391
|
|
|
366
|
-
with
|
|
392
|
+
with tracer.start_span(
|
|
367
393
|
f"test-case-{result.test_case_id}",
|
|
368
394
|
context=context,
|
|
369
395
|
) as span:
|
|
370
396
|
# Input data
|
|
371
|
-
|
|
372
|
-
|
|
397
|
+
safe_question = sanitize_text(result.question, max_chars=MAX_LOG_CHARS) or ""
|
|
398
|
+
safe_answer = sanitize_text(result.answer, max_chars=MAX_LOG_CHARS) or ""
|
|
399
|
+
span.set_attribute("input.question", safe_question)
|
|
400
|
+
span.set_attribute("input.answer", safe_answer)
|
|
373
401
|
if result.contexts:
|
|
374
|
-
|
|
402
|
+
safe_contexts = sanitize_text_list(
|
|
403
|
+
result.contexts,
|
|
404
|
+
max_chars=MAX_CONTEXT_CHARS,
|
|
405
|
+
)
|
|
406
|
+
span.set_attribute("input.contexts", json.dumps(safe_contexts))
|
|
375
407
|
if result.ground_truth:
|
|
376
|
-
|
|
408
|
+
safe_ground_truth = sanitize_text(result.ground_truth, max_chars=MAX_LOG_CHARS)
|
|
409
|
+
if safe_ground_truth:
|
|
410
|
+
span.set_attribute("input.ground_truth", safe_ground_truth)
|
|
377
411
|
|
|
378
412
|
# Metrics
|
|
379
413
|
span.set_attribute("output.all_passed", result.all_passed)
|
|
@@ -461,15 +495,22 @@ class PhoenixAdapter(TrackerPort):
|
|
|
461
495
|
parent_span = self._active_spans[trace_id]
|
|
462
496
|
context = trace.set_span_in_context(parent_span)
|
|
463
497
|
|
|
464
|
-
|
|
498
|
+
tracer = self._tracer_any
|
|
499
|
+
if tracer is None:
|
|
500
|
+
tracer = self._tracer
|
|
501
|
+
if tracer is None:
|
|
502
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
503
|
+
with tracer.start_span("retrieval", context=context) as span:
|
|
465
504
|
# Set retrieval attributes
|
|
466
505
|
for key, value in data.to_span_attributes().items():
|
|
467
506
|
span.set_attribute(key, value)
|
|
468
507
|
|
|
469
508
|
# Set query
|
|
470
509
|
if data.query:
|
|
471
|
-
|
|
472
|
-
|
|
510
|
+
safe_query = sanitize_text(data.query, max_chars=MAX_LOG_CHARS)
|
|
511
|
+
if safe_query:
|
|
512
|
+
span.set_attribute("retrieval.query", safe_query)
|
|
513
|
+
span.set_attribute("input.value", safe_query)
|
|
473
514
|
|
|
474
515
|
span.set_attribute("spec.version", "0.1")
|
|
475
516
|
span.set_attribute("rag.module", "retrieve")
|
|
@@ -495,11 +536,14 @@ class PhoenixAdapter(TrackerPort):
|
|
|
495
536
|
event_attrs["doc.rerank_rank"] = doc.rerank_rank
|
|
496
537
|
if doc.chunk_id:
|
|
497
538
|
event_attrs["doc.chunk_id"] = doc.chunk_id
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
539
|
+
safe_preview = (
|
|
540
|
+
sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS) if doc.content else ""
|
|
541
|
+
)
|
|
542
|
+
if safe_preview:
|
|
543
|
+
event_attrs["doc.preview"] = safe_preview
|
|
501
544
|
if doc.metadata:
|
|
502
|
-
|
|
545
|
+
safe_metadata = sanitize_payload(doc.metadata, max_chars=MAX_LOG_CHARS)
|
|
546
|
+
event_attrs["doc.metadata"] = json.dumps(safe_metadata, default=str)
|
|
503
547
|
span.add_event(f"retrieved_doc_{i}", attributes=event_attrs)
|
|
504
548
|
|
|
505
549
|
def log_generation(
|
|
@@ -538,15 +582,19 @@ class PhoenixAdapter(TrackerPort):
|
|
|
538
582
|
parent_span = self._active_spans[trace_id]
|
|
539
583
|
context = trace.set_span_in_context(parent_span)
|
|
540
584
|
|
|
541
|
-
|
|
585
|
+
tracer = self._tracer_any
|
|
586
|
+
if tracer is None:
|
|
587
|
+
tracer = self._tracer
|
|
588
|
+
if tracer is None:
|
|
589
|
+
raise RuntimeError("Phoenix tracer is not initialized")
|
|
590
|
+
with tracer.start_span("generation", context=context) as span:
|
|
542
591
|
# Set generation attributes
|
|
543
592
|
for key, value in data.to_span_attributes().items():
|
|
544
593
|
span.set_attribute(key, value)
|
|
545
594
|
|
|
546
595
|
# Set prompt/response (truncate if too long)
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
response = data.response[:max_len] if data.response else ""
|
|
596
|
+
prompt = sanitize_text(data.prompt, max_chars=MAX_LOG_CHARS) or ""
|
|
597
|
+
response = sanitize_text(data.response, max_chars=MAX_LOG_CHARS) or ""
|
|
550
598
|
if prompt:
|
|
551
599
|
span.set_attribute("generation.prompt", prompt)
|
|
552
600
|
span.set_attribute("input.value", prompt)
|
|
@@ -559,24 +607,28 @@ class PhoenixAdapter(TrackerPort):
|
|
|
559
607
|
|
|
560
608
|
# Set prompt template if available
|
|
561
609
|
if data.prompt_template:
|
|
562
|
-
|
|
610
|
+
safe_template = sanitize_text(data.prompt_template, max_chars=MAX_LOG_CHARS)
|
|
611
|
+
if safe_template:
|
|
612
|
+
span.set_attribute("generation.prompt_template", safe_template)
|
|
563
613
|
|
|
564
614
|
def log_rag_trace(self, data: RAGTraceData) -> str:
|
|
565
615
|
"""Log a full RAG trace (retrieval + generation) to Phoenix."""
|
|
566
616
|
|
|
567
617
|
self._ensure_initialized()
|
|
568
618
|
metadata = {"event_type": "rag_trace", "total_time_ms": data.total_time_ms}
|
|
569
|
-
|
|
570
|
-
|
|
619
|
+
safe_query = sanitize_text(data.query, max_chars=MAX_LOG_CHARS)
|
|
620
|
+
if safe_query:
|
|
621
|
+
metadata["query"] = safe_query
|
|
571
622
|
if data.metadata:
|
|
572
|
-
|
|
623
|
+
safe_metadata = sanitize_payload(data.metadata, max_chars=MAX_LOG_CHARS)
|
|
624
|
+
metadata.update(safe_metadata)
|
|
573
625
|
|
|
574
626
|
should_end = False
|
|
575
627
|
trace_id = data.trace_id
|
|
576
628
|
if trace_id and trace_id in self._active_spans:
|
|
577
629
|
span = self._active_spans[trace_id]
|
|
578
630
|
else:
|
|
579
|
-
trace_name = f"rag-trace-{(
|
|
631
|
+
trace_name = f"rag-trace-{(safe_query or 'run')[:12]}"
|
|
580
632
|
trace_id = self.start_trace(trace_name, metadata=metadata)
|
|
581
633
|
span = self._active_spans[trace_id]
|
|
582
634
|
should_end = True
|
|
@@ -589,12 +641,13 @@ class PhoenixAdapter(TrackerPort):
|
|
|
589
641
|
if data.generation:
|
|
590
642
|
self.log_generation(trace_id, data.generation)
|
|
591
643
|
if data.final_answer:
|
|
592
|
-
preview = data.final_answer
|
|
593
|
-
|
|
594
|
-
|
|
644
|
+
preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
|
|
645
|
+
if preview:
|
|
646
|
+
span.set_attribute("rag.final_answer", preview)
|
|
647
|
+
span.set_attribute("output.value", preview)
|
|
595
648
|
|
|
596
|
-
if
|
|
597
|
-
span.set_attribute("input.value",
|
|
649
|
+
if safe_query:
|
|
650
|
+
span.set_attribute("input.value", safe_query)
|
|
598
651
|
|
|
599
652
|
span.set_attribute("spec.version", "0.1")
|
|
600
653
|
span.set_attribute("rag.module", "custom.pipeline")
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
SECRET_REF_PREFIX = "secret://"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SecretProvider(Protocol):
|
|
12
|
+
def get_secret(self, name: str) -> str: ...
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SecretProviderError(RuntimeError):
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class EnvSecretProvider:
|
|
21
|
+
def get_secret(self, name: str) -> str:
|
|
22
|
+
value = os.environ.get(name)
|
|
23
|
+
if value is None:
|
|
24
|
+
raise SecretProviderError(f"Missing secret in environment: {name}")
|
|
25
|
+
return value
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
|
|
29
|
+
class AwsSecretsManagerProvider:
|
|
30
|
+
region_name: str | None = None
|
|
31
|
+
|
|
32
|
+
def get_secret(self, name: str) -> str:
|
|
33
|
+
try:
|
|
34
|
+
import boto3 # type: ignore
|
|
35
|
+
except ImportError as exc:
|
|
36
|
+
raise SecretProviderError("boto3 is required for AWS Secrets Manager") from exc
|
|
37
|
+
client = boto3.client("secretsmanager", region_name=self.region_name)
|
|
38
|
+
response = client.get_secret_value(SecretId=name)
|
|
39
|
+
if "SecretString" in response and response["SecretString"] is not None:
|
|
40
|
+
return response["SecretString"]
|
|
41
|
+
secret_binary = response.get("SecretBinary")
|
|
42
|
+
if secret_binary is None:
|
|
43
|
+
raise SecretProviderError("Empty secret value returned from AWS Secrets Manager")
|
|
44
|
+
return base64.b64decode(secret_binary).decode("utf-8")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
|
|
48
|
+
class GcpSecretManagerProvider:
|
|
49
|
+
def get_secret(self, name: str) -> str:
|
|
50
|
+
try:
|
|
51
|
+
from google.cloud import secretmanager # type: ignore
|
|
52
|
+
except ImportError as exc:
|
|
53
|
+
raise SecretProviderError(
|
|
54
|
+
"google-cloud-secret-manager is required for GCP Secret Manager"
|
|
55
|
+
) from exc
|
|
56
|
+
client = secretmanager.SecretManagerServiceClient()
|
|
57
|
+
response = client.access_secret_version(request={"name": name})
|
|
58
|
+
return response.payload.data.decode("utf-8")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
|
|
62
|
+
class VaultSecretProvider:
|
|
63
|
+
def get_secret(self, name: str) -> str:
|
|
64
|
+
try:
|
|
65
|
+
import hvac # type: ignore
|
|
66
|
+
except ImportError as exc:
|
|
67
|
+
raise SecretProviderError("hvac is required for Vault secret access") from exc
|
|
68
|
+
client = hvac.Client()
|
|
69
|
+
if not client.is_authenticated():
|
|
70
|
+
raise SecretProviderError("Vault client authentication failed")
|
|
71
|
+
response = client.secrets.kv.v2.read_secret_version(path=name)
|
|
72
|
+
data = response.get("data", {}).get("data", {})
|
|
73
|
+
if not data:
|
|
74
|
+
raise SecretProviderError("Vault secret payload is empty")
|
|
75
|
+
if "value" in data:
|
|
76
|
+
return str(data["value"])
|
|
77
|
+
if len(data) == 1:
|
|
78
|
+
return str(next(iter(data.values())))
|
|
79
|
+
raise SecretProviderError("Vault secret has multiple keys; specify 'value' key")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def is_secret_reference(value: str | None) -> bool:
|
|
83
|
+
return bool(value) and value.startswith(SECRET_REF_PREFIX)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def parse_secret_reference(value: str) -> str:
|
|
87
|
+
return value.removeprefix(SECRET_REF_PREFIX).strip()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def build_secret_provider(provider_name: str | None) -> SecretProvider:
|
|
91
|
+
provider = (provider_name or "").strip().lower()
|
|
92
|
+
if not provider:
|
|
93
|
+
raise SecretProviderError("Secret provider is not configured.")
|
|
94
|
+
if provider == "env":
|
|
95
|
+
return EnvSecretProvider()
|
|
96
|
+
if provider in {"aws", "aws-secrets-manager", "secretsmanager"}:
|
|
97
|
+
return AwsSecretsManagerProvider(region_name=os.environ.get("AWS_REGION"))
|
|
98
|
+
if provider in {"gcp", "gcp-secret-manager", "secretmanager"}:
|
|
99
|
+
return GcpSecretManagerProvider()
|
|
100
|
+
if provider in {"vault", "hashicorp-vault"}:
|
|
101
|
+
return VaultSecretProvider()
|
|
102
|
+
raise SecretProviderError(f"Unknown secret provider: {provider_name}")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def resolve_secret_reference(
|
|
106
|
+
value: str,
|
|
107
|
+
provider: SecretProvider,
|
|
108
|
+
cache: dict[str, str] | None = None,
|
|
109
|
+
) -> str:
|
|
110
|
+
secret_name = parse_secret_reference(value)
|
|
111
|
+
if not secret_name:
|
|
112
|
+
raise SecretProviderError("Secret reference must include a name.")
|
|
113
|
+
if cache is not None and secret_name in cache:
|
|
114
|
+
return cache[secret_name]
|
|
115
|
+
secret_value = provider.get_secret(secret_name)
|
|
116
|
+
if cache is not None:
|
|
117
|
+
cache[secret_name] = secret_value
|
|
118
|
+
return secret_value
|
evalvault/config/settings.py
CHANGED
|
@@ -3,9 +3,16 @@
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any
|
|
5
5
|
|
|
6
|
-
from pydantic import Field
|
|
6
|
+
from pydantic import Field, PrivateAttr
|
|
7
7
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
8
8
|
|
|
9
|
+
from evalvault.config.secret_manager import (
|
|
10
|
+
SecretProviderError,
|
|
11
|
+
build_secret_provider,
|
|
12
|
+
is_secret_reference,
|
|
13
|
+
resolve_secret_reference,
|
|
14
|
+
)
|
|
15
|
+
|
|
9
16
|
|
|
10
17
|
def _detect_repo_root(start: Path, max_depth: int = 6) -> Path | None:
|
|
11
18
|
current = start
|
|
@@ -38,6 +45,75 @@ def _ensure_http_scheme(url_value: str) -> str:
|
|
|
38
45
|
return f"http://{value}"
|
|
39
46
|
|
|
40
47
|
|
|
48
|
+
def is_production_profile(profile_name: str | None) -> bool:
|
|
49
|
+
return (profile_name or "").strip().lower() == "prod"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _parse_cors_origins(cors_origins: str | None) -> list[str]:
|
|
53
|
+
if not cors_origins:
|
|
54
|
+
return []
|
|
55
|
+
return [origin.strip() for origin in cors_origins.split(",") if origin.strip()]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
SECRET_REFERENCE_FIELDS = (
|
|
59
|
+
"api_auth_tokens",
|
|
60
|
+
"knowledge_read_tokens",
|
|
61
|
+
"knowledge_write_tokens",
|
|
62
|
+
"openai_api_key",
|
|
63
|
+
"anthropic_api_key",
|
|
64
|
+
"azure_api_key",
|
|
65
|
+
"vllm_api_key",
|
|
66
|
+
"langfuse_public_key",
|
|
67
|
+
"langfuse_secret_key",
|
|
68
|
+
"phoenix_api_token",
|
|
69
|
+
"postgres_password",
|
|
70
|
+
"postgres_connection_string",
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _validate_production_settings(settings: "Settings") -> None:
|
|
75
|
+
if not is_production_profile(settings.evalvault_profile):
|
|
76
|
+
return
|
|
77
|
+
|
|
78
|
+
missing: list[str] = []
|
|
79
|
+
|
|
80
|
+
if not settings.api_auth_tokens:
|
|
81
|
+
missing.append("API_AUTH_TOKENS")
|
|
82
|
+
|
|
83
|
+
if settings.llm_provider == "openai" and not settings.openai_api_key:
|
|
84
|
+
missing.append("OPENAI_API_KEY")
|
|
85
|
+
|
|
86
|
+
if settings.tracker_provider == "langfuse":
|
|
87
|
+
if not settings.langfuse_public_key:
|
|
88
|
+
missing.append("LANGFUSE_PUBLIC_KEY")
|
|
89
|
+
if not settings.langfuse_secret_key:
|
|
90
|
+
missing.append("LANGFUSE_SECRET_KEY")
|
|
91
|
+
|
|
92
|
+
if settings.tracker_provider == "mlflow" and not settings.mlflow_tracking_uri:
|
|
93
|
+
missing.append("MLFLOW_TRACKING_URI")
|
|
94
|
+
|
|
95
|
+
if (
|
|
96
|
+
settings.postgres_connection_string is None
|
|
97
|
+
and settings.postgres_host
|
|
98
|
+
and not settings.postgres_password
|
|
99
|
+
):
|
|
100
|
+
missing.append("POSTGRES_PASSWORD")
|
|
101
|
+
|
|
102
|
+
cors_origins = _parse_cors_origins(settings.cors_origins)
|
|
103
|
+
if not cors_origins:
|
|
104
|
+
missing.append("CORS_ORIGINS")
|
|
105
|
+
else:
|
|
106
|
+
localhost_origins = {"localhost", "127.0.0.1"}
|
|
107
|
+
for origin in cors_origins:
|
|
108
|
+
if any(host in origin for host in localhost_origins):
|
|
109
|
+
raise ValueError("Production profile forbids localhost in CORS_ORIGINS.")
|
|
110
|
+
|
|
111
|
+
if missing:
|
|
112
|
+
raise ValueError(
|
|
113
|
+
"Missing required settings for prod profile: " + ", ".join(sorted(set(missing)))
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
|
|
41
117
|
class Settings(BaseSettings):
|
|
42
118
|
"""Application configuration settings."""
|
|
43
119
|
|
|
@@ -48,6 +124,8 @@ class Settings(BaseSettings):
|
|
|
48
124
|
extra="ignore",
|
|
49
125
|
)
|
|
50
126
|
|
|
127
|
+
_secret_cache: dict[str, str] = PrivateAttr(default_factory=dict)
|
|
128
|
+
|
|
51
129
|
# Profile Configuration (YAML 기반 모델 프로필)
|
|
52
130
|
evalvault_profile: str | None = Field(
|
|
53
131
|
default=None,
|
|
@@ -58,6 +136,45 @@ class Settings(BaseSettings):
|
|
|
58
136
|
default="http://localhost:5173,http://127.0.0.1:5173",
|
|
59
137
|
description="Comma-separated list of allowed CORS origins.",
|
|
60
138
|
)
|
|
139
|
+
secret_provider: str | None = Field(
|
|
140
|
+
default=None,
|
|
141
|
+
description="Secret provider name for secret:// references (env/aws/gcp/vault).",
|
|
142
|
+
)
|
|
143
|
+
secret_cache_enabled: bool = Field(
|
|
144
|
+
default=True,
|
|
145
|
+
description="Cache resolved secret references in memory.",
|
|
146
|
+
)
|
|
147
|
+
api_auth_tokens: str | None = Field(
|
|
148
|
+
default=None,
|
|
149
|
+
description=(
|
|
150
|
+
"Comma-separated list of API bearer tokens for FastAPI auth. "
|
|
151
|
+
"Leave empty to disable authentication."
|
|
152
|
+
),
|
|
153
|
+
)
|
|
154
|
+
knowledge_read_tokens: str | None = Field(
|
|
155
|
+
default=None,
|
|
156
|
+
description="Comma-separated read tokens for knowledge endpoints.",
|
|
157
|
+
)
|
|
158
|
+
knowledge_write_tokens: str | None = Field(
|
|
159
|
+
default=None,
|
|
160
|
+
description="Comma-separated write tokens for knowledge endpoints.",
|
|
161
|
+
)
|
|
162
|
+
rate_limit_enabled: bool = Field(
|
|
163
|
+
default=False,
|
|
164
|
+
description="Enable API rate limiting for /api routes.",
|
|
165
|
+
)
|
|
166
|
+
rate_limit_requests: int = Field(
|
|
167
|
+
default=120,
|
|
168
|
+
description="Max requests allowed within rate_limit_window_seconds.",
|
|
169
|
+
)
|
|
170
|
+
rate_limit_window_seconds: int = Field(
|
|
171
|
+
default=60,
|
|
172
|
+
description="Window size for rate limit checks in seconds.",
|
|
173
|
+
)
|
|
174
|
+
rate_limit_block_threshold: int = Field(
|
|
175
|
+
default=10,
|
|
176
|
+
description="Log suspicious activity after this many rate limit blocks.",
|
|
177
|
+
)
|
|
61
178
|
evalvault_db_path: str = Field(
|
|
62
179
|
default="data/db/evalvault.db",
|
|
63
180
|
description="SQLite database path for API/CLI storage.",
|
|
@@ -71,6 +188,26 @@ class Settings(BaseSettings):
|
|
|
71
188
|
self.evalvault_db_path = _resolve_storage_path(self.evalvault_db_path)
|
|
72
189
|
self.evalvault_memory_db_path = _resolve_storage_path(self.evalvault_memory_db_path)
|
|
73
190
|
self.ollama_base_url = _ensure_http_scheme(self.ollama_base_url)
|
|
191
|
+
self._resolve_secret_references()
|
|
192
|
+
|
|
193
|
+
def _resolve_secret_references(self) -> None:
|
|
194
|
+
secret_values = [
|
|
195
|
+
value
|
|
196
|
+
for value in (getattr(self, field, None) for field in SECRET_REFERENCE_FIELDS)
|
|
197
|
+
if isinstance(value, str)
|
|
198
|
+
]
|
|
199
|
+
if not any(is_secret_reference(value) for value in secret_values):
|
|
200
|
+
return
|
|
201
|
+
try:
|
|
202
|
+
provider = build_secret_provider(self.secret_provider)
|
|
203
|
+
except SecretProviderError as exc:
|
|
204
|
+
raise ValueError(str(exc)) from exc
|
|
205
|
+
cache = self._secret_cache if self.secret_cache_enabled else None
|
|
206
|
+
for field in SECRET_REFERENCE_FIELDS:
|
|
207
|
+
value = getattr(self, field, None)
|
|
208
|
+
if isinstance(value, str) and is_secret_reference(value):
|
|
209
|
+
resolved = resolve_secret_reference(value, provider, cache)
|
|
210
|
+
setattr(self, field, resolved)
|
|
74
211
|
|
|
75
212
|
# LLM Provider Selection
|
|
76
213
|
llm_provider: str = Field(
|
|
@@ -314,6 +451,8 @@ def get_settings() -> Settings:
|
|
|
314
451
|
if _settings.evalvault_profile:
|
|
315
452
|
_settings = apply_profile(_settings, _settings.evalvault_profile)
|
|
316
453
|
|
|
454
|
+
_validate_production_settings(_settings)
|
|
455
|
+
|
|
317
456
|
return _settings
|
|
318
457
|
|
|
319
458
|
|
|
@@ -346,6 +485,7 @@ def apply_runtime_overrides(overrides: dict[str, object]) -> Settings:
|
|
|
346
485
|
updated = Settings.model_validate(payload)
|
|
347
486
|
if updated.evalvault_profile:
|
|
348
487
|
updated = apply_profile(updated, updated.evalvault_profile)
|
|
488
|
+
_validate_production_settings(updated)
|
|
349
489
|
for key, value in updated.model_dump().items():
|
|
350
490
|
setattr(settings, key, value)
|
|
351
491
|
|
|
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
|
|
|
34
34
|
RAGComponent,
|
|
35
35
|
RAGImprovementGuide,
|
|
36
36
|
)
|
|
37
|
+
from evalvault.domain.entities.judge_calibration import (
|
|
38
|
+
JudgeCalibrationCase,
|
|
39
|
+
JudgeCalibrationMetric,
|
|
40
|
+
JudgeCalibrationResult,
|
|
41
|
+
JudgeCalibrationSummary,
|
|
42
|
+
)
|
|
37
43
|
from evalvault.domain.entities.kg import EntityModel, RelationModel
|
|
38
44
|
from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
|
|
39
45
|
from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
|
|
@@ -104,6 +110,10 @@ __all__ = [
|
|
|
104
110
|
"PatternType",
|
|
105
111
|
"RAGComponent",
|
|
106
112
|
"RAGImprovementGuide",
|
|
113
|
+
"JudgeCalibrationCase",
|
|
114
|
+
"JudgeCalibrationMetric",
|
|
115
|
+
"JudgeCalibrationResult",
|
|
116
|
+
"JudgeCalibrationSummary",
|
|
107
117
|
# KG
|
|
108
118
|
"EntityModel",
|
|
109
119
|
"RelationModel",
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
|
|
7
|
+
class JudgeCalibrationCase:
|
|
8
|
+
test_case_id: str
|
|
9
|
+
raw_score: float
|
|
10
|
+
calibrated_score: float
|
|
11
|
+
label: float | None = None
|
|
12
|
+
label_source: str | None = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class JudgeCalibrationMetric:
|
|
17
|
+
metric: str
|
|
18
|
+
method: str
|
|
19
|
+
sample_count: int
|
|
20
|
+
label_count: int
|
|
21
|
+
mae: float | None
|
|
22
|
+
pearson: float | None
|
|
23
|
+
spearman: float | None
|
|
24
|
+
temperature: float | None = None
|
|
25
|
+
parameters: dict[str, float | None] = field(default_factory=dict)
|
|
26
|
+
gate_passed: bool | None = None
|
|
27
|
+
warning: str | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
|
|
31
|
+
class JudgeCalibrationSummary:
|
|
32
|
+
run_id: str
|
|
33
|
+
labels_source: str
|
|
34
|
+
method: str
|
|
35
|
+
metrics: list[str]
|
|
36
|
+
holdout_ratio: float
|
|
37
|
+
seed: int
|
|
38
|
+
total_labels: int
|
|
39
|
+
total_samples: int
|
|
40
|
+
gate_passed: bool
|
|
41
|
+
gate_threshold: float | None = None
|
|
42
|
+
notes: list[str] = field(default_factory=list)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
|
|
46
|
+
class JudgeCalibrationResult:
|
|
47
|
+
summary: JudgeCalibrationSummary
|
|
48
|
+
metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
|
|
49
|
+
case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
|
|
50
|
+
warnings: list[str] = field(default_factory=list)
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from datetime import datetime
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, Literal, overload
|
|
8
8
|
from uuid import uuid4
|
|
9
9
|
|
|
10
10
|
REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
|
|
@@ -82,8 +82,8 @@ class StageEvent:
|
|
|
82
82
|
duration_ms=_optional_float(payload.get("duration_ms")),
|
|
83
83
|
input_ref=input_ref,
|
|
84
84
|
output_ref=output_ref,
|
|
85
|
-
attributes=_ensure_dict(payload.get("attributes")),
|
|
86
|
-
metadata=_ensure_dict(payload.get("metadata")),
|
|
85
|
+
attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
|
|
86
|
+
metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
|
|
87
87
|
trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
|
|
88
88
|
span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
|
|
89
89
|
)
|
|
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
|
|
|
187
187
|
raise ValueError("Invalid datetime value")
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
@overload
|
|
191
|
+
def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@overload
|
|
195
|
+
def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
|
|
196
|
+
|
|
197
|
+
|
|
190
198
|
def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
|
|
191
199
|
if value is None:
|
|
192
200
|
return None if allow_none else {}
|