evalvault 1.63.1-py3-none-any.whl → 1.65.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. evalvault/adapters/inbound/api/main.py +147 -9
  2. evalvault/adapters/inbound/api/routers/config.py +6 -1
  3. evalvault/adapters/inbound/api/routers/knowledge.py +62 -6
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  13. evalvault/adapters/outbound/artifact_fs.py +16 -0
  14. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  15. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  16. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  17. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  18. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  19. evalvault/adapters/outbound/methods/external_command.py +22 -1
  20. evalvault/adapters/outbound/tracker/langfuse_adapter.py +40 -15
  21. evalvault/adapters/outbound/tracker/log_sanitizer.py +93 -0
  22. evalvault/adapters/outbound/tracker/mlflow_adapter.py +3 -2
  23. evalvault/adapters/outbound/tracker/phoenix_adapter.py +90 -37
  24. evalvault/config/secret_manager.py +118 -0
  25. evalvault/config/settings.py +141 -1
  26. evalvault/domain/entities/__init__.py +10 -0
  27. evalvault/domain/entities/judge_calibration.py +50 -0
  28. evalvault/domain/entities/stage.py +11 -3
  29. evalvault/domain/services/artifact_lint_service.py +268 -0
  30. evalvault/domain/services/benchmark_runner.py +1 -6
  31. evalvault/domain/services/dataset_preprocessor.py +26 -0
  32. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  33. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  34. evalvault/domain/services/evaluator.py +2 -0
  35. evalvault/domain/services/judge_calibration_service.py +495 -0
  36. evalvault/domain/services/ops_snapshot_service.py +159 -0
  37. evalvault/domain/services/regression_gate_service.py +199 -0
  38. evalvault/domain/services/run_comparison_service.py +159 -0
  39. evalvault/domain/services/stage_event_builder.py +6 -1
  40. evalvault/domain/services/stage_metric_service.py +83 -18
  41. evalvault/ports/outbound/__init__.py +4 -0
  42. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  43. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  44. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  45. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  46. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  47. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/METADATA +8 -1
  48. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/RECORD +51 -23
  49. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  50. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  51. {evalvault-1.63.1.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/tracker/phoenix_adapter.py
@@ -9,6 +9,13 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Any
 
 from evalvault.adapters.outbound.tracer.open_rag_trace_helpers import serialize_json
+from evalvault.adapters.outbound.tracker.log_sanitizer import (
+    MAX_CONTEXT_CHARS,
+    MAX_LOG_CHARS,
+    sanitize_payload,
+    sanitize_text,
+    sanitize_text_list,
+)
 from evalvault.domain.entities import (
     EvaluationRun,
     GenerationData,
@@ -19,8 +26,7 @@ from evalvault.domain.entities import (
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import Span, TracerProvider
-    from opentelemetry.trace import Tracer
+    from opentelemetry.sdk.trace import TracerProvider
 
 
 class PhoenixAdapter(TrackerPort):
@@ -55,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
-        self._tracer: Tracer | None = None
+        self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
-        self._active_spans: dict[str, Span] = {}
+        self._active_spans: dict[str, Any] = {}
+        self._tracer_any: Any | None = None
         self._initialized = False
 
     def _ensure_initialized(self) -> None:
@@ -83,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
             provider = get_tracer_provider()
             if provider:
                 self._tracer_provider = provider
-                self._tracer = trace.get_tracer(__name__)
+                self._tracer_any = trace.get_tracer(__name__)
+                self._tracer = self._tracer_any
                 self._initialized = True
                 return
 
@@ -102,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
             trace.set_tracer_provider(self._tracer_provider)
 
             # Get tracer
-            self._tracer = trace.get_tracer(__name__)
+            self._tracer_any = trace.get_tracer(__name__)
+            self._tracer = self._tracer_any
             self._initialized = True
 
         except ImportError as e:
@@ -127,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
         self._ensure_initialized()
 
         # Start a new span as root
-        span = self._tracer.start_span(name)
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        span = tracer.start_span(name)
         trace_id = str(uuid.uuid4())
 
         # Set metadata as span attributes
@@ -166,14 +180,21 @@ class PhoenixAdapter(TrackerPort):
 
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span(name, context=context) as span:
+        with tracer.start_span(name, context=context) as span:
             if input_data is not None:
-                span.set_attribute("input", json.dumps(input_data, default=str))
+                safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
+                span.set_attribute("input", json.dumps(safe_input, default=str))
             if output_data is not None:
-                span.set_attribute("output", json.dumps(output_data, default=str))
+                safe_output = sanitize_payload(output_data, max_chars=MAX_LOG_CHARS)
+                span.set_attribute("output", json.dumps(safe_output, default=str))
 
     def log_score(
         self,
@@ -270,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
            passed_count = sum(
                1
                for r in run.results
-                if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+                if (metric := r.get_metric(metric_name)) and metric.passed is True
            )
            avg_score = run.get_avg_score(metric_name)
            threshold = run.thresholds.get(metric_name, 0.7)
@@ -360,20 +381,33 @@ class PhoenixAdapter(TrackerPort):
         """
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span(
+        with tracer.start_span(
             f"test-case-{result.test_case_id}",
             context=context,
         ) as span:
             # Input data
-            span.set_attribute("input.question", result.question or "")
-            span.set_attribute("input.answer", result.answer or "")
+            safe_question = sanitize_text(result.question, max_chars=MAX_LOG_CHARS) or ""
+            safe_answer = sanitize_text(result.answer, max_chars=MAX_LOG_CHARS) or ""
+            span.set_attribute("input.question", safe_question)
+            span.set_attribute("input.answer", safe_answer)
             if result.contexts:
-                span.set_attribute("input.contexts", json.dumps(result.contexts))
+                safe_contexts = sanitize_text_list(
+                    result.contexts,
+                    max_chars=MAX_CONTEXT_CHARS,
+                )
+                span.set_attribute("input.contexts", json.dumps(safe_contexts))
             if result.ground_truth:
-                span.set_attribute("input.ground_truth", result.ground_truth)
+                safe_ground_truth = sanitize_text(result.ground_truth, max_chars=MAX_LOG_CHARS)
+                if safe_ground_truth:
+                    span.set_attribute("input.ground_truth", safe_ground_truth)
 
             # Metrics
             span.set_attribute("output.all_passed", result.all_passed)
@@ -461,15 +495,22 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span("retrieval", context=context) as span:
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("retrieval", context=context) as span:
             # Set retrieval attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
 
             # Set query
             if data.query:
-                span.set_attribute("retrieval.query", data.query)
-                span.set_attribute("input.value", data.query)
+                safe_query = sanitize_text(data.query, max_chars=MAX_LOG_CHARS)
+                if safe_query:
+                    span.set_attribute("retrieval.query", safe_query)
+                    span.set_attribute("input.value", safe_query)
 
             span.set_attribute("spec.version", "0.1")
             span.set_attribute("rag.module", "retrieve")
@@ -495,11 +536,14 @@ class PhoenixAdapter(TrackerPort):
                     event_attrs["doc.rerank_rank"] = doc.rerank_rank
                 if doc.chunk_id:
                     event_attrs["doc.chunk_id"] = doc.chunk_id
-                preview = doc.content[:200] if doc.content else ""
-                if preview:
-                    event_attrs["doc.preview"] = preview
+                safe_preview = (
+                    sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS) if doc.content else ""
+                )
+                if safe_preview:
+                    event_attrs["doc.preview"] = safe_preview
                 if doc.metadata:
-                    event_attrs["doc.metadata"] = json.dumps(doc.metadata, default=str)
+                    safe_metadata = sanitize_payload(doc.metadata, max_chars=MAX_LOG_CHARS)
+                    event_attrs["doc.metadata"] = json.dumps(safe_metadata, default=str)
                 span.add_event(f"retrieved_doc_{i}", attributes=event_attrs)
 
     def log_generation(
@@ -538,15 +582,19 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span("generation", context=context) as span:
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("generation", context=context) as span:
             # Set generation attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
 
             # Set prompt/response (truncate if too long)
-            max_len = 10000
-            prompt = data.prompt[:max_len] if data.prompt else ""
-            response = data.response[:max_len] if data.response else ""
+            prompt = sanitize_text(data.prompt, max_chars=MAX_LOG_CHARS) or ""
+            response = sanitize_text(data.response, max_chars=MAX_LOG_CHARS) or ""
             if prompt:
                 span.set_attribute("generation.prompt", prompt)
                 span.set_attribute("input.value", prompt)
@@ -559,24 +607,28 @@ class PhoenixAdapter(TrackerPort):
 
             # Set prompt template if available
             if data.prompt_template:
-                span.set_attribute("generation.prompt_template", data.prompt_template[:max_len])
+                safe_template = sanitize_text(data.prompt_template, max_chars=MAX_LOG_CHARS)
+                if safe_template:
+                    span.set_attribute("generation.prompt_template", safe_template)
 
     def log_rag_trace(self, data: RAGTraceData) -> str:
         """Log a full RAG trace (retrieval + generation) to Phoenix."""
 
         self._ensure_initialized()
         metadata = {"event_type": "rag_trace", "total_time_ms": data.total_time_ms}
-        if data.query:
-            metadata["query"] = data.query
+        safe_query = sanitize_text(data.query, max_chars=MAX_LOG_CHARS)
+        if safe_query:
+            metadata["query"] = safe_query
         if data.metadata:
-            metadata.update(data.metadata)
+            safe_metadata = sanitize_payload(data.metadata, max_chars=MAX_LOG_CHARS)
+            metadata.update(safe_metadata)
 
         should_end = False
         trace_id = data.trace_id
         if trace_id and trace_id in self._active_spans:
             span = self._active_spans[trace_id]
         else:
-            trace_name = f"rag-trace-{(data.query or 'run')[:12]}"
+            trace_name = f"rag-trace-{(safe_query or 'run')[:12]}"
             trace_id = self.start_trace(trace_name, metadata=metadata)
             span = self._active_spans[trace_id]
             should_end = True
@@ -589,12 +641,13 @@ class PhoenixAdapter(TrackerPort):
         if data.generation:
             self.log_generation(trace_id, data.generation)
         if data.final_answer:
-            preview = data.final_answer[:1000]
-            span.set_attribute("rag.final_answer", preview)
-            span.set_attribute("output.value", preview)
+            preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
+            if preview:
+                span.set_attribute("rag.final_answer", preview)
+                span.set_attribute("output.value", preview)
 
-        if data.query:
-            span.set_attribute("input.value", data.query)
+        if safe_query:
+            span.set_attribute("input.value", safe_query)
 
         span.set_attribute("spec.version", "0.1")
         span.set_attribute("rag.module", "custom.pipeline")
evalvault/config/secret_manager.py (new file)
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import base64
+import os
+from dataclasses import dataclass
+from typing import Protocol
+
+SECRET_REF_PREFIX = "secret://"
+
+
+class SecretProvider(Protocol):
+    def get_secret(self, name: str) -> str: ...
+
+
+class SecretProviderError(RuntimeError):
+    pass
+
+
+@dataclass
+class EnvSecretProvider:
+    def get_secret(self, name: str) -> str:
+        value = os.environ.get(name)
+        if value is None:
+            raise SecretProviderError(f"Missing secret in environment: {name}")
+        return value
+
+
+@dataclass
+class AwsSecretsManagerProvider:
+    region_name: str | None = None
+
+    def get_secret(self, name: str) -> str:
+        try:
+            import boto3  # type: ignore
+        except ImportError as exc:
+            raise SecretProviderError("boto3 is required for AWS Secrets Manager") from exc
+        client = boto3.client("secretsmanager", region_name=self.region_name)
+        response = client.get_secret_value(SecretId=name)
+        if "SecretString" in response and response["SecretString"] is not None:
+            return response["SecretString"]
+        secret_binary = response.get("SecretBinary")
+        if secret_binary is None:
+            raise SecretProviderError("Empty secret value returned from AWS Secrets Manager")
+        return base64.b64decode(secret_binary).decode("utf-8")
+
+
+@dataclass
+class GcpSecretManagerProvider:
+    def get_secret(self, name: str) -> str:
+        try:
+            from google.cloud import secretmanager  # type: ignore
+        except ImportError as exc:
+            raise SecretProviderError(
+                "google-cloud-secret-manager is required for GCP Secret Manager"
+            ) from exc
+        client = secretmanager.SecretManagerServiceClient()
+        response = client.access_secret_version(request={"name": name})
+        return response.payload.data.decode("utf-8")
+
+
+@dataclass
+class VaultSecretProvider:
+    def get_secret(self, name: str) -> str:
+        try:
+            import hvac  # type: ignore
+        except ImportError as exc:
+            raise SecretProviderError("hvac is required for Vault secret access") from exc
+        client = hvac.Client()
+        if not client.is_authenticated():
+            raise SecretProviderError("Vault client authentication failed")
+        response = client.secrets.kv.v2.read_secret_version(path=name)
+        data = response.get("data", {}).get("data", {})
+        if not data:
+            raise SecretProviderError("Vault secret payload is empty")
+        if "value" in data:
+            return str(data["value"])
+        if len(data) == 1:
+            return str(next(iter(data.values())))
+        raise SecretProviderError("Vault secret has multiple keys; specify 'value' key")
+
+
+def is_secret_reference(value: str | None) -> bool:
+    return bool(value) and value.startswith(SECRET_REF_PREFIX)
+
+
+def parse_secret_reference(value: str) -> str:
+    return value.removeprefix(SECRET_REF_PREFIX).strip()
+
+
+def build_secret_provider(provider_name: str | None) -> SecretProvider:
+    provider = (provider_name or "").strip().lower()
+    if not provider:
+        raise SecretProviderError("Secret provider is not configured.")
+    if provider == "env":
+        return EnvSecretProvider()
+    if provider in {"aws", "aws-secrets-manager", "secretsmanager"}:
+        return AwsSecretsManagerProvider(region_name=os.environ.get("AWS_REGION"))
+    if provider in {"gcp", "gcp-secret-manager", "secretmanager"}:
+        return GcpSecretManagerProvider()
+    if provider in {"vault", "hashicorp-vault"}:
+        return VaultSecretProvider()
+    raise SecretProviderError(f"Unknown secret provider: {provider_name}")
+
+
+def resolve_secret_reference(
+    value: str,
+    provider: SecretProvider,
+    cache: dict[str, str] | None = None,
+) -> str:
+    secret_name = parse_secret_reference(value)
+    if not secret_name:
+        raise SecretProviderError("Secret reference must include a name.")
+    if cache is not None and secret_name in cache:
+        return cache[secret_name]
+    secret_value = provider.get_secret(secret_name)
+    if cache is not None:
+        cache[secret_name] = secret_value
+    return secret_value
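Usage of the new config/secret_manager.py module, based only on the code above (the secret name and value are placeholders):

import os

from evalvault.config.secret_manager import (
    build_secret_provider,
    is_secret_reference,
    resolve_secret_reference,
)

os.environ["PROD_OPENAI_API_KEY"] = "sk-placeholder"  # illustrative value

provider = build_secret_provider("env")  # returns EnvSecretProvider
cache: dict[str, str] = {}

ref = "secret://PROD_OPENAI_API_KEY"
if is_secret_reference(ref):
    # First call hits the provider; later calls with the same cache dict are memoized.
    api_key = resolve_secret_reference(ref, provider, cache)
    assert api_key == "sk-placeholder"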
evalvault/config/settings.py
@@ -3,9 +3,16 @@
 from pathlib import Path
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, PrivateAttr
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from evalvault.config.secret_manager import (
+    SecretProviderError,
+    build_secret_provider,
+    is_secret_reference,
+    resolve_secret_reference,
+)
+
 
 def _detect_repo_root(start: Path, max_depth: int = 6) -> Path | None:
     current = start
@@ -38,6 +45,75 @@ def _ensure_http_scheme(url_value: str) -> str:
     return f"http://{value}"
 
 
+def is_production_profile(profile_name: str | None) -> bool:
+    return (profile_name or "").strip().lower() == "prod"
+
+
+def _parse_cors_origins(cors_origins: str | None) -> list[str]:
+    if not cors_origins:
+        return []
+    return [origin.strip() for origin in cors_origins.split(",") if origin.strip()]
+
+
+SECRET_REFERENCE_FIELDS = (
+    "api_auth_tokens",
+    "knowledge_read_tokens",
+    "knowledge_write_tokens",
+    "openai_api_key",
+    "anthropic_api_key",
+    "azure_api_key",
+    "vllm_api_key",
+    "langfuse_public_key",
+    "langfuse_secret_key",
+    "phoenix_api_token",
+    "postgres_password",
+    "postgres_connection_string",
+)
+
+
+def _validate_production_settings(settings: "Settings") -> None:
+    if not is_production_profile(settings.evalvault_profile):
+        return
+
+    missing: list[str] = []
+
+    if not settings.api_auth_tokens:
+        missing.append("API_AUTH_TOKENS")
+
+    if settings.llm_provider == "openai" and not settings.openai_api_key:
+        missing.append("OPENAI_API_KEY")
+
+    if settings.tracker_provider == "langfuse":
+        if not settings.langfuse_public_key:
+            missing.append("LANGFUSE_PUBLIC_KEY")
+        if not settings.langfuse_secret_key:
+            missing.append("LANGFUSE_SECRET_KEY")
+
+    if settings.tracker_provider == "mlflow" and not settings.mlflow_tracking_uri:
+        missing.append("MLFLOW_TRACKING_URI")
+
+    if (
+        settings.postgres_connection_string is None
+        and settings.postgres_host
+        and not settings.postgres_password
+    ):
+        missing.append("POSTGRES_PASSWORD")
+
+    cors_origins = _parse_cors_origins(settings.cors_origins)
+    if not cors_origins:
+        missing.append("CORS_ORIGINS")
+    else:
+        localhost_origins = {"localhost", "127.0.0.1"}
+        for origin in cors_origins:
+            if any(host in origin for host in localhost_origins):
+                raise ValueError("Production profile forbids localhost in CORS_ORIGINS.")
+
+    if missing:
+        raise ValueError(
+            "Missing required settings for prod profile: " + ", ".join(sorted(set(missing)))
+        )
+
+
 class Settings(BaseSettings):
     """Application configuration settings."""
 
@@ -48,6 +124,8 @@ class Settings(BaseSettings):
         extra="ignore",
     )
 
+    _secret_cache: dict[str, str] = PrivateAttr(default_factory=dict)
+
     # Profile Configuration (YAML-based model profiles)
     evalvault_profile: str | None = Field(
         default=None,
@@ -58,6 +136,45 @@ class Settings(BaseSettings):
         default="http://localhost:5173,http://127.0.0.1:5173",
         description="Comma-separated list of allowed CORS origins.",
     )
+    secret_provider: str | None = Field(
+        default=None,
+        description="Secret provider name for secret:// references (env/aws/gcp/vault).",
+    )
+    secret_cache_enabled: bool = Field(
+        default=True,
+        description="Cache resolved secret references in memory.",
+    )
+    api_auth_tokens: str | None = Field(
+        default=None,
+        description=(
+            "Comma-separated list of API bearer tokens for FastAPI auth. "
+            "Leave empty to disable authentication."
+        ),
+    )
+    knowledge_read_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated read tokens for knowledge endpoints.",
+    )
+    knowledge_write_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated write tokens for knowledge endpoints.",
+    )
+    rate_limit_enabled: bool = Field(
+        default=False,
+        description="Enable API rate limiting for /api routes.",
+    )
+    rate_limit_requests: int = Field(
+        default=120,
+        description="Max requests allowed within rate_limit_window_seconds.",
+    )
+    rate_limit_window_seconds: int = Field(
+        default=60,
+        description="Window size for rate limit checks in seconds.",
+    )
+    rate_limit_block_threshold: int = Field(
+        default=10,
+        description="Log suspicious activity after this many rate limit blocks.",
+    )
     evalvault_db_path: str = Field(
         default="data/db/evalvault.db",
         description="SQLite database path for API/CLI storage.",
@@ -71,6 +188,26 @@ class Settings(BaseSettings):
         self.evalvault_db_path = _resolve_storage_path(self.evalvault_db_path)
         self.evalvault_memory_db_path = _resolve_storage_path(self.evalvault_memory_db_path)
         self.ollama_base_url = _ensure_http_scheme(self.ollama_base_url)
+        self._resolve_secret_references()
+
+    def _resolve_secret_references(self) -> None:
+        secret_values = [
+            value
+            for value in (getattr(self, field, None) for field in SECRET_REFERENCE_FIELDS)
+            if isinstance(value, str)
+        ]
+        if not any(is_secret_reference(value) for value in secret_values):
+            return
+        try:
+            provider = build_secret_provider(self.secret_provider)
+        except SecretProviderError as exc:
+            raise ValueError(str(exc)) from exc
+        cache = self._secret_cache if self.secret_cache_enabled else None
+        for field in SECRET_REFERENCE_FIELDS:
+            value = getattr(self, field, None)
+            if isinstance(value, str) and is_secret_reference(value):
+                resolved = resolve_secret_reference(value, provider, cache)
+                setattr(self, field, resolved)
 
     # LLM Provider Selection
     llm_provider: str = Field(
@@ -314,6 +451,8 @@ def get_settings() -> Settings:
     if _settings.evalvault_profile:
         _settings = apply_profile(_settings, _settings.evalvault_profile)
 
+    _validate_production_settings(_settings)
+
     return _settings
 
 
@@ -346,6 +485,7 @@ def apply_runtime_overrides(overrides: dict[str, object]) -> Settings:
     updated = Settings.model_validate(payload)
     if updated.evalvault_profile:
         updated = apply_profile(updated, updated.evalvault_profile)
+    _validate_production_settings(updated)
     for key, value in updated.model_dump().items():
        setattr(settings, key, value)
 
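The settings changes wire those references into the configuration layer: any field listed in SECRET_REFERENCE_FIELDS whose value starts with secret:// is resolved after initialization, and get_settings() / apply_runtime_overrides() now reject an incomplete prod profile. A hedged sketch of the failure path, calling the module-private validator directly for illustration (in normal use it runs inside get_settings()); the exact error raised depends on defaults and any .env or environment overrides not shown in this diff:

from evalvault.config.settings import Settings, _validate_production_settings

# With the shipped defaults (no API_AUTH_TOKENS, localhost CORS origins) a prod
# profile is rejected; this assumes no overriding environment variables are set.
settings = Settings(evalvault_profile="prod")

try:
    _validate_production_settings(settings)
except ValueError as exc:
    print(exc)  # e.g. "Production profile forbids localhost in CORS_ORIGINS."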
evalvault/domain/entities/__init__.py
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
     RAGComponent,
     RAGImprovementGuide,
 )
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
 from evalvault.domain.entities.kg import EntityModel, RelationModel
 from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
@@ -104,6 +110,10 @@ __all__ = [
     "PatternType",
     "RAGComponent",
     "RAGImprovementGuide",
+    "JudgeCalibrationCase",
+    "JudgeCalibrationMetric",
+    "JudgeCalibrationResult",
+    "JudgeCalibrationSummary",
     # KG
     "EntityModel",
     "RelationModel",
evalvault/domain/entities/judge_calibration.py (new file)
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class JudgeCalibrationCase:
+    test_case_id: str
+    raw_score: float
+    calibrated_score: float
+    label: float | None = None
+    label_source: str | None = None
+
+
+@dataclass
+class JudgeCalibrationMetric:
+    metric: str
+    method: str
+    sample_count: int
+    label_count: int
+    mae: float | None
+    pearson: float | None
+    spearman: float | None
+    temperature: float | None = None
+    parameters: dict[str, float | None] = field(default_factory=dict)
+    gate_passed: bool | None = None
+    warning: str | None = None
+
+
+@dataclass
+class JudgeCalibrationSummary:
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    notes: list[str] = field(default_factory=list)
+
+
+@dataclass
+class JudgeCalibrationResult:
+    summary: JudgeCalibrationSummary
+    metrics: list[JudgeCalibrationMetric] = field(default_factory=list)
+    case_results: dict[str, list[JudgeCalibrationCase]] = field(default_factory=dict)
+    warnings: list[str] = field(default_factory=list)
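The calibration entities are plain dataclasses and are re-exported from evalvault.domain.entities (see the __init__.py hunk above), so the new judge_calibration_service.py and the calibrate-judge CLI command can assemble results without any framework coupling. A construction example with illustrative values:

from evalvault.domain.entities import (
    JudgeCalibrationCase,
    JudgeCalibrationMetric,
    JudgeCalibrationResult,
    JudgeCalibrationSummary,
)

summary = JudgeCalibrationSummary(
    run_id="run-123",
    labels_source="human",
    method="temperature",          # illustrative method name
    metrics=["faithfulness"],
    holdout_ratio=0.2,
    seed=42,
    total_labels=40,
    total_samples=200,
    gate_passed=True,
)

result = JudgeCalibrationResult(
    summary=summary,
    metrics=[
        JudgeCalibrationMetric(
            metric="faithfulness",
            method="temperature",
            sample_count=200,
            label_count=40,
            mae=0.08,
            pearson=0.91,
            spearman=0.88,
            temperature=1.4,
        )
    ],
    case_results={
        "faithfulness": [
            JudgeCalibrationCase(
                test_case_id="tc-001",
                raw_score=0.72,
                calibrated_score=0.68,
                label=0.7,
                label_source="human",
            )
        ]
    },
)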
evalvault/domain/entities/stage.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal, overload
 from uuid import uuid4
 
 REQUIRED_STAGE_TYPES: tuple[str, ...] = ("system_prompt", "input", "retrieval", "output")
@@ -82,8 +82,8 @@ class StageEvent:
             duration_ms=_optional_float(payload.get("duration_ms")),
             input_ref=input_ref,
             output_ref=output_ref,
-            attributes=_ensure_dict(payload.get("attributes")),
-            metadata=_ensure_dict(payload.get("metadata")),
+            attributes=_ensure_dict(payload.get("attributes"), allow_none=False),
+            metadata=_ensure_dict(payload.get("metadata"), allow_none=False),
             trace_id=_optional_str(payload.get("trace_id") or trace_payload.get("trace_id")),
             span_id=_optional_str(payload.get("span_id") or trace_payload.get("span_id")),
         )
@@ -187,6 +187,14 @@ def _parse_datetime(value: Any) -> datetime | None:
     raise ValueError("Invalid datetime value")
 
 
+@overload
+def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
+
+
+@overload
+def _ensure_dict(value: Any, *, allow_none: Literal[False] = False) -> dict[str, Any]: ...
+
+
 def _ensure_dict(value: Any, *, allow_none: bool = False) -> dict[str, Any] | None:
     if value is None:
         return None if allow_none else {}
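The added overloads let static checkers narrow _ensure_dict: with allow_none=False (as now passed for attributes and metadata in StageEvent) the return type is always dict[str, Any], while the None / allow_none=True combination stays None. A small type-checking illustration, importing the module-private helper only for demonstration:

from typing import Any

from evalvault.domain.entities.stage import _ensure_dict

attrs: dict[str, Any] = _ensure_dict(None, allow_none=False)  # {} at runtime, dict for the checker
maybe: None = _ensure_dict(None, allow_none=True)             # None at runtime and for the checker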