evalvault 1.74.0__py3-none-any.whl → 1.76.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. evalvault/adapters/inbound/api/adapter.py +127 -80
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +303 -17
  4. evalvault/adapters/inbound/api/routers/config.py +3 -1
  5. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  6. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  7. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  8. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  9. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  10. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  13. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  14. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  15. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  16. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  17. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  18. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  19. evalvault/adapters/inbound/cli/commands/method.py +3 -4
  20. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  22. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  23. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  24. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  25. evalvault/adapters/inbound/cli/commands/run.py +188 -59
  26. evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
  27. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  28. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  29. evalvault/adapters/inbound/mcp/tools.py +11 -8
  30. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  31. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  32. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  33. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  35. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  36. evalvault/adapters/outbound/llm/factory.py +1 -1
  37. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  38. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  39. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  40. evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
  41. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  42. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  43. evalvault/adapters/outbound/storage/factory.py +53 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
  45. evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
  46. evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
  47. evalvault/config/instrumentation.py +8 -6
  48. evalvault/config/phoenix_support.py +5 -0
  49. evalvault/config/settings.py +71 -11
  50. evalvault/domain/services/domain_learning_hook.py +2 -1
  51. evalvault/domain/services/evaluator.py +2 -0
  52. evalvault/ports/inbound/web_port.py +3 -1
  53. evalvault/ports/outbound/storage_port.py +2 -0
  54. evalvault-1.76.0.dist-info/METADATA +221 -0
  55. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
  56. evalvault-1.74.0.dist-info/METADATA +0 -585
  57. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
  58. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
  59. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,177 @@
+-- EvalVault Domain Memory Schema (PostgreSQL)
+-- Based on "Memory in the Age of AI Agents: A Survey" framework
+-- Forms: Flat (Phase 1), Planar/Hierarchical (Phase 2-3)
+-- Functions: Factual, Experiential, Working layers
+-- Dynamics: Formation, Evolution, Retrieval strategies
+
+-- =========================================================================
+-- Factual Layer - verified domain facts (SPO triples)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS factual_facts (
+    fact_id TEXT PRIMARY KEY,
+    subject TEXT NOT NULL,                  -- entity name
+    predicate TEXT NOT NULL,                -- relation type
+    object TEXT NOT NULL,                   -- target entity
+    language TEXT DEFAULT 'ko',             -- language code (ko, en)
+    domain TEXT DEFAULT 'default',          -- domain (insurance, legal, medical)
+    fact_type TEXT DEFAULT 'verified',      -- verified, inferred, contradictory
+    verification_score REAL DEFAULT 1.0,    -- 0.0-1.0
+    verification_count INTEGER DEFAULT 1,
+    source_document_ids TEXT,               -- JSON array of document IDs
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    last_verified TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    abstraction_level INTEGER DEFAULT 0
+);
+
+-- Indexes for search optimization
+CREATE INDEX IF NOT EXISTS idx_facts_domain_lang ON factual_facts(domain, language);
+CREATE INDEX IF NOT EXISTS idx_facts_subject ON factual_facts(subject);
+CREATE INDEX IF NOT EXISTS idx_facts_predicate ON factual_facts(predicate);
+CREATE INDEX IF NOT EXISTS idx_facts_object ON factual_facts(object);
+CREATE INDEX IF NOT EXISTS idx_facts_triple ON factual_facts(subject, predicate, object);
+CREATE INDEX IF NOT EXISTS idx_facts_verification_score ON factual_facts(verification_score DESC);
+CREATE INDEX IF NOT EXISTS idx_facts_last_verified ON factual_facts(last_verified DESC);
+
+-- GIN indexes for full-text search
+CREATE INDEX IF NOT EXISTS idx_facts_subject_gin ON factual_facts USING GIN (to_tsvector('english', subject));
+CREATE INDEX IF NOT EXISTS idx_facts_predicate_gin ON factual_facts USING GIN (to_tsvector('english', predicate));
+CREATE INDEX IF NOT EXISTS idx_facts_object_gin ON factual_facts USING GIN (to_tsvector('english', object));
+
+-- =========================================================================
+-- Experiential Layer - patterns learned from evaluations
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS learning_memories (
+    learning_id TEXT PRIMARY KEY,
+    run_id TEXT NOT NULL,                   -- source evaluation run ID
+    domain TEXT DEFAULT 'default',
+    language TEXT DEFAULT 'ko',
+    entity_type_reliability TEXT,           -- JSON: {entity_type: reliability_score}
+    relation_type_reliability TEXT,         -- JSON: {relation_type: reliability_score}
+    failed_patterns TEXT,                   -- JSON array of failed patterns
+    successful_patterns TEXT,               -- JSON array of successful patterns
+    faithfulness_by_entity_type TEXT,       -- JSON: {entity_type: faithfulness_score}
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_learnings_domain_lang ON learning_memories(domain, language);
+CREATE INDEX IF NOT EXISTS idx_learnings_run_id ON learning_memories(run_id);
+CREATE INDEX IF NOT EXISTS idx_learnings_timestamp ON learning_memories(timestamp DESC);
+
+-- =========================================================================
+-- Behavior Layer - Metacognitive Reuse (reusable behaviors)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS behavior_entries (
+    behavior_id TEXT PRIMARY KEY,
+    description TEXT NOT NULL,
+    trigger_pattern TEXT,                   -- trigger condition (regex or keywords)
+    action_sequence TEXT,                   -- JSON array of action steps
+    success_rate REAL DEFAULT 0.0,          -- historical success rate
+    token_savings INTEGER DEFAULT 0,        -- number of tokens saved
+    applicable_languages TEXT DEFAULT '["ko", "en"]',  -- JSON array
+    domain TEXT DEFAULT 'default',
+    last_used TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    use_count INTEGER DEFAULT 0,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_behaviors_domain ON behavior_entries(domain);
+CREATE INDEX IF NOT EXISTS idx_behaviors_success_rate ON behavior_entries(success_rate DESC);
+CREATE INDEX IF NOT EXISTS idx_behaviors_use_count ON behavior_entries(use_count DESC);
+CREATE INDEX IF NOT EXISTS idx_behaviors_last_used ON behavior_entries(last_used DESC);
+
+-- GIN indexes for behavior search
+CREATE INDEX IF NOT EXISTS idx_behaviors_description_gin ON behavior_entries USING GIN (to_tsvector('english', description));
+CREATE INDEX IF NOT EXISTS idx_behaviors_trigger_gin ON behavior_entries USING GIN (to_tsvector('english', trigger_pattern));
+
+-- =========================================================================
+-- Working Layer - active context for the current session
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS memory_contexts (
+    session_id TEXT PRIMARY KEY,
+    domain TEXT DEFAULT 'default',
+    language TEXT DEFAULT 'ko',
+    active_entities TEXT,                   -- JSON array of entity names
+    entity_type_distribution TEXT,          -- JSON: {entity_type: count}
+    current_quality_metrics TEXT,           -- JSON: {metric_name: value}
+    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_contexts_domain ON memory_contexts(domain);
+CREATE INDEX IF NOT EXISTS idx_contexts_updated_at ON memory_contexts(updated_at DESC);
+
+-- =========================================================================
+-- Fact Sources - links between facts and documents (Phase 2)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS fact_sources (
+    id SERIAL PRIMARY KEY,
+    fact_id TEXT NOT NULL,
+    document_id TEXT NOT NULL,
+    extraction_confidence REAL DEFAULT 1.0,
+    extracted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(fact_id, document_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_fact_sources_fact_id ON fact_sources(fact_id);
+CREATE INDEX IF NOT EXISTS idx_fact_sources_document_id ON fact_sources(document_id);
+
+-- =========================================================================
+-- Memory Evolution Log - tracks memory changes (Phase 2)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS memory_evolution_log (
+    id SERIAL PRIMARY KEY,
+    operation TEXT NOT NULL,                -- consolidate, update, forget, decay
+    target_type TEXT NOT NULL,              -- fact, learning, behavior
+    target_id TEXT NOT NULL,
+    details TEXT,                           -- JSON: operation-specific details
+    performed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_evolution_log_operation ON memory_evolution_log(operation);
+CREATE INDEX IF NOT EXISTS idx_evolution_log_target ON memory_evolution_log(target_type, target_id);
+CREATE INDEX IF NOT EXISTS idx_evolution_log_performed_at ON memory_evolution_log(performed_at DESC);
+
+-- =========================================================================
+-- Phase 5: Planar Form - KG Integration
+-- =========================================================================
+
+-- KG entity binding table for explicit KG links
+CREATE TABLE IF NOT EXISTS fact_kg_bindings (
+    id SERIAL PRIMARY KEY,
+    fact_id TEXT NOT NULL,
+    kg_entity_id TEXT NOT NULL,             -- KG entity name/ID
+    kg_relation_type TEXT,                  -- KG relation type
+    binding_confidence REAL DEFAULT 1.0,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(fact_id, kg_entity_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_fact_id ON fact_kg_bindings(fact_id);
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_kg_entity ON fact_kg_bindings(kg_entity_id);
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_relation_type ON fact_kg_bindings(kg_relation_type);
+
+-- =========================================================================
+-- Phase 5: Hierarchical Form - Summary Layers
+-- =========================================================================
+
+-- Fact hierarchy table for parent-child relationships
+CREATE TABLE IF NOT EXISTS fact_hierarchy (
+    id SERIAL PRIMARY KEY,
+    parent_fact_id TEXT NOT NULL,
+    child_fact_id TEXT NOT NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (parent_fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    FOREIGN KEY (child_fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(parent_fact_id, child_fact_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_hierarchy_parent ON fact_hierarchy(parent_fact_id);
+CREATE INDEX IF NOT EXISTS idx_hierarchy_child ON fact_hierarchy(child_fact_id);
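As an illustrative sketch only (not part of the package), the Factual Layer above can be exercised from Python with psycopg; table and column names come from the schema, while the connection string and sample values are hypothetical.

# Sketch: assumes a PostgreSQL instance with the domain memory schema applied.
import json

import psycopg

conn_string = "host=localhost port=5432 dbname=evalvault user=postgres"  # hypothetical
with psycopg.connect(conn_string) as conn:
    conn.execute(
        """
        INSERT INTO factual_facts (fact_id, subject, predicate, object, domain, source_document_ids)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (fact_id) DO NOTHING
        """,
        ("fact-001", "CancerCare Plan", "covers", "diagnosis benefit", "insurance", json.dumps(["doc-42"])),
    )
    facts = conn.execute(
        "SELECT subject, predicate, object FROM factual_facts "
        "WHERE domain = %s ORDER BY verification_score DESC LIMIT 10",
        ("insurance",),
    ).fetchall()
    conn.commit()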
@@ -97,7 +97,7 @@ def _resolve_faithfulness_fallback_config(

 def _default_faithfulness_fallback_model(provider: str) -> str | None:
     if provider == "ollama":
-        return "gpt-oss-safeguard:20b"
+        return "qwen3:8b"
     if provider == "vllm":
         return "gpt-oss-120b"
     return None
@@ -64,6 +64,29 @@ class VLLMAdapter(BaseLLMAdapter):
         """Get the embedding model name being used."""
         return self._embedding_model_name

+    def embed_sync(
+        self,
+        *,
+        texts: list[str],
+        model: str | None = None,
+        dimension: int | None = None,
+    ) -> list[list[float]]:
+        """Synchronous embedding call using OpenAI-compatible API."""
+        embed_base_url = self._settings.vllm_embedding_base_url or self._settings.vllm_base_url
+        client = OpenAI(
+            base_url=embed_base_url,
+            api_key=self._settings.vllm_api_key or "local",
+            timeout=self._settings.vllm_timeout,
+        )
+        payload: dict[str, Any] = {
+            "model": model or self._embedding_model_name,
+            "input": texts,
+        }
+        if dimension is not None:
+            payload["dimensions"] = dimension
+        response = client.embeddings.create(**payload)
+        return [item.embedding for item in response.data]
+
     async def agenerate_text(
         self,
         prompt: str,
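A minimal usage sketch for the new embed_sync call, assuming a vLLM server reachable through the Settings-configured base URL (the settings values and input texts here are hypothetical; VLLMAdapter(settings) construction mirrors its use in toolkit.py below).

# Sketch: synchronous embeddings via the OpenAI-compatible vLLM endpoint.
from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
from evalvault.config.settings import Settings

adapter = VLLMAdapter(Settings())
vectors = adapter.embed_sync(texts=["insurance claim process", "policy exclusions"], dimension=256)
print(len(vectors), len(vectors[0]))  # number of inputs, embedding dimension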
@@ -141,8 +141,8 @@ class KoreanDenseRetriever:
         },
     }

-    # Default model: dragonkue/BGE-m3-ko (ranked #1 on the AutoRAG benchmark)
-    DEFAULT_MODEL = "dragonkue/BGE-m3-ko"
+    # Default model: BAAI/bge-m3 (multilingual default)
+    DEFAULT_MODEL = "BAAI/bge-m3"

    def __init__(
        self,
@@ -175,7 +175,7 @@ class KoreanDenseRetriever:
            device: Device (auto, cpu, cuda, mps)
            batch_size: Encoding batch size
                - Values of 0 or below are determined automatically by a simple heuristic
-            ollama_adapter: Ollama LLM adapter (required when using Qwen3-Embedding)
+            ollama_adapter: OpenAI-compatible embedding adapter (Ollama/vLLM)
            matryoshka_dim: Matryoshka dimension (Qwen3-Embedding only)
                - None: use the model's recommended dimension
                - 256: for development (speed-first)
@@ -237,12 +237,12 @@ class KoreanDenseRetriever:
        self._query_cache_size = max(query_cache_size, 0)
        self._search_cache_size = max(search_cache_size, 0)

-        # Validate Ollama adapter for Ollama models
+        # Validate embedding adapter for OpenAI-compatible embedding models
        model_info = self.SUPPORTED_MODELS.get(self._model_name)
        if model_info and model_info.get("type") == "ollama" and not self._ollama_adapter:
            raise ValueError(
-                f"ollama_adapter is required for Ollama model '{self._model_name}'. "
-                "Create one with: OllamaAdapter(settings)"
+                f"embedding adapter is required for model '{self._model_name}'. "
+                "Create one with: OllamaAdapter(settings) or VLLMAdapter(settings)"
            )

        # Auto-select matryoshka dimension if not specified
@@ -362,7 +362,10 @@ class KoreanDenseRetriever:
            return

        model_info = self.SUPPORTED_MODELS.get(self._model_name)
-        model_type = model_info["type"] if model_info else "sentence-transformers"
+        if model_info is None and self._ollama_adapter is not None:
+            model_type = "ollama"
+        else:
+            model_type = model_info["type"] if model_info else "sentence-transformers"

        # Ollama models use adapter directly - no model loading needed
        if model_type == "ollama":
@@ -76,10 +76,21 @@ class KoreanNLPToolkit(KoreanNLPToolkitPort):

        embedding_func = None
        try:
-            dense_retriever = KoreanDenseRetriever(
-                profile=embedding_profile,
-                ollama_adapter=ollama_adapter,
-            )
+            if embedding_profile == "vllm":
+                from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
+                from evalvault.config.settings import Settings
+
+                settings = Settings()
+                adapter = ollama_adapter or VLLMAdapter(settings)
+                dense_retriever = KoreanDenseRetriever(
+                    model_name=settings.vllm_embedding_model,
+                    ollama_adapter=adapter,
+                )
+            else:
+                dense_retriever = KoreanDenseRetriever(
+                    profile=embedding_profile,
+                    ollama_adapter=ollama_adapter,
+                )
            embedding_func = dense_retriever.get_embedding_func()
            if verbose:
                logger.info(
@@ -104,6 +104,22 @@ class PhoenixSyncService:
                dataset_description=description,
            )
        except Exception as exc:  # pragma: no cover - HTTP/serialization errors
+            message = str(exc)
+            if "already exists" in message:
+                existing = self._find_dataset_by_name(dataset_name)
+                if existing:
+                    dataset_obj = self._client.datasets.get_dataset(dataset=existing["id"])
+                    dataset_url = self._client.experiments.get_dataset_experiments_url(
+                        dataset_obj.id
+                    )
+                    return PhoenixDatasetInfo(
+                        dataset_id=dataset_obj.id,
+                        dataset_name=dataset_obj.name,
+                        dataset_version_id=dataset_obj.version_id,
+                        url=dataset_url,
+                        description=description,
+                        example_count=getattr(dataset_obj, "examples", None),
+                    )
            raise PhoenixSyncError(f"Dataset upload failed: {exc}") from exc

        dataset_url = self._client.experiments.get_dataset_experiments_url(phoenix_dataset.id)
@@ -173,6 +189,74 @@ class PhoenixSyncService:
            )
        return examples

+    def _find_dataset_by_name(self, dataset_name: str) -> dict[str, Any] | None:
+        try:
+            datasets = self._client.datasets.list()
+        except Exception:
+            return None
+        for entry in datasets:
+            if entry.get("name") == dataset_name:
+                return entry
+        return None
+
+    def sync_prompts(
+        self,
+        *,
+        prompt_entries: list[dict[str, Any]],
+        model_name: str,
+        model_provider: str,
+        prompt_set_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Create prompt versions in Phoenix Prompt Management."""
+
+        if not prompt_entries:
+            return []
+
+        try:
+            from phoenix.client.resources.prompts import PromptVersion
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise PhoenixSyncError("Phoenix prompt client unavailable") from exc
+
+        synced: list[dict[str, Any]] = []
+        for index, entry in enumerate(prompt_entries, start=1):
+            name = entry.get("name") or entry.get("role") or f"prompt_{index}"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            prompt_version = PromptVersion(
+                [{"role": "system", "content": content}],
+                model_name=model_name,
+                model_provider=model_provider,
+                template_format="NONE",
+            )
+            prompt_metadata = {
+                "kind": entry.get("kind"),
+                "role": entry.get("role"),
+                "checksum": entry.get("checksum"),
+                "status": entry.get("status"),
+                "source": entry.get("source") or entry.get("path"),
+                "order": index,
+            }
+            if prompt_set_name:
+                prompt_metadata["prompt_set"] = prompt_set_name
+            try:
+                version = self._client.prompts.create(
+                    version=prompt_version,
+                    name=name,
+                    prompt_description=entry.get("notes"),
+                    prompt_metadata=_as_serializable(prompt_metadata),
+                )
+                synced.append(
+                    {
+                        **entry,
+                        "phoenix_prompt_version_id": getattr(version, "id", None),
+                    }
+                )
+            except Exception as exc:  # pragma: no cover - HTTP errors
+                raise PhoenixSyncError(f"Prompt sync failed: {exc}") from exc
+
+        return synced
+
    def _build_input_payload(self, test_case: TestCase) -> dict[str, Any]:
        return {
            "question": test_case.question,
@@ -258,6 +342,21 @@ def build_experiment_metadata(
        "total_test_cases": run.total_test_cases,
        "metrics": metrics,
    }
+    if run.results:
+        latencies = [r.latency_ms for r in run.results if r.latency_ms]
+        tokens = [r.tokens_used for r in run.results if r.tokens_used]
+        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+        if latencies:
+            payload["avg_latency_ms"] = round(sum(latencies) / len(latencies), 2)
+        if tokens:
+            payload["avg_tokens"] = round(sum(tokens) / len(tokens), 2)
+        if costs:
+            payload["avg_cost_usd"] = round(sum(costs) / len(costs), 6)
+        if run.total_tokens:
+            payload["total_tokens"] = run.total_tokens
+        if run.total_cost_usd is not None:
+            payload["total_cost_usd"] = run.total_cost_usd
+        payload["error_rate"] = round(1 - run.pass_rate, 4)
    if reliability_snapshot:
        payload["reliability_snapshot"] = reliability_snapshot
    if dataset.metadata:
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+import psycopg
+from psycopg.rows import dict_row
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class PgvectorResult:
+    doc_id: int
+    content: str
+    score: float
+
+
+class PgvectorStore:
+    def __init__(
+        self,
+        conn_string: str,
+        *,
+        distance: str = "cosine",
+        index_type: str = "hnsw",
+        index_lists: int = 100,
+        hnsw_m: int = 16,
+        hnsw_ef_construction: int = 64,
+    ) -> None:
+        self._conn_string = conn_string
+        self._distance = distance
+        self._index_type = index_type
+        self._index_lists = index_lists
+        self._hnsw_m = hnsw_m
+        self._hnsw_ef_construction = hnsw_ef_construction
+
+    def _vector_ops(self) -> str:
+        if self._distance == "ip":
+            return "vector_ip_ops"
+        if self._distance == "l2":
+            return "vector_l2_ops"
+        return "vector_cosine_ops"
+
+    def _connect(self) -> psycopg.Connection:
+        conn = psycopg.connect(self._conn_string, row_factory=dict_row)
+        try:
+            from pgvector.psycopg import register_vector
+
+            register_vector(conn)
+        except Exception as exc:  # pragma: no cover - runtime dependency
+            logger.warning("Failed to register pgvector type: %s", exc)
+        return conn
+
+    def ensure_schema(self, *, dimension: int) -> None:
+        sql = f"""
+        CREATE EXTENSION IF NOT EXISTS vector;
+
+        CREATE TABLE IF NOT EXISTS rag_documents (
+            id BIGSERIAL PRIMARY KEY,
+            source TEXT NOT NULL,
+            source_hash TEXT NOT NULL,
+            doc_id INTEGER NOT NULL,
+            content TEXT NOT NULL,
+            embedding VECTOR({dimension}),
+            metadata JSONB,
+            created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+        );
+
+        CREATE INDEX IF NOT EXISTS idx_rag_documents_source ON rag_documents(source);
+        CREATE INDEX IF NOT EXISTS idx_rag_documents_doc_id ON rag_documents(doc_id);
+        CREATE UNIQUE INDEX IF NOT EXISTS idx_rag_documents_source_doc_id
+            ON rag_documents(source, doc_id);
+        """
+        with self._connect() as conn:
+            conn.execute(sql)
+            if self._index_type != "none":
+                opclass = self._vector_ops()
+                if self._index_type == "ivfflat":
+                    index_sql = (
+                        "CREATE INDEX IF NOT EXISTS idx_rag_documents_embedding "
+                        f"ON rag_documents USING ivfflat (embedding {opclass}) "
+                        f"WITH (lists = {self._index_lists});"
+                    )
+                    try:
+                        conn.execute(index_sql)
+                    except Exception as exc:  # pragma: no cover - runtime dependency
+                        logger.warning("Failed to create ivfflat index: %s", exc)
+                elif self._index_type == "hnsw":
+                    index_sql = (
+                        "CREATE INDEX IF NOT EXISTS idx_rag_documents_embedding "
+                        f"ON rag_documents USING hnsw (embedding {opclass}) "
+                        f"WITH (m = {self._hnsw_m}, ef_construction = {self._hnsw_ef_construction});"
+                    )
+                    try:
+                        conn.execute(index_sql)
+                    except Exception as exc:  # pragma: no cover - runtime dependency
+                        logger.warning("Failed to create hnsw index: %s", exc)
+            conn.commit()
+
+    def get_source_state(self, *, source: str) -> tuple[str | None, int]:
+        with self._connect() as conn:
+            row = conn.execute(
+                """
+                SELECT source_hash, COUNT(*) AS total
+                FROM rag_documents
+                WHERE source = %s
+                GROUP BY source_hash
+                ORDER BY total DESC
+                LIMIT 1
+                """,
+                (source,),
+            ).fetchone()
+            if not row:
+                return None, 0
+            return row["source_hash"], int(row["total"])
+
+    def replace_documents(
+        self,
+        *,
+        source: str,
+        source_hash: str,
+        documents: Iterable[str],
+        embeddings: Iterable[list[float]],
+    ) -> None:
+        rows = list(zip(documents, embeddings, strict=True))
+        with self._connect() as conn:
+            conn.execute("DELETE FROM rag_documents WHERE source = %s", (source,))
+            with conn.cursor() as cursor:
+                cursor.executemany(
+                    """
+                    INSERT INTO rag_documents (source, source_hash, doc_id, content, embedding)
+                    VALUES (%s, %s, %s, %s, %s)
+                    """,
+                    [
+                        (source, source_hash, index, content, embedding)
+                        for index, (content, embedding) in enumerate(rows)
+                    ],
+                )
+            conn.commit()
+
+    def search(
+        self, *, source: str, query_embedding: list[float], top_k: int
+    ) -> list[PgvectorResult]:
+        if self._distance == "ip":
+            operator = "<#>"
+        elif self._distance == "l2":
+            operator = "<->"
+        else:
+            operator = "<=>"
+
+        sql = (
+            f"SELECT doc_id, content, embedding {operator} %s::vector AS score "
+            f"FROM rag_documents WHERE source = %s ORDER BY embedding {operator} %s::vector LIMIT %s"
+        )
+
+        with self._connect() as conn:
+            rows = conn.execute(sql, (query_embedding, source, query_embedding, top_k)).fetchall()
+
+        return [
+            PgvectorResult(
+                doc_id=int(row["doc_id"]), content=row["content"], score=float(row["score"])
+            )
+            for row in rows
+        ]
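A hedged end-to-end sketch of the new PgvectorStore: the method names and signatures are taken from the code above, while the connection string, dimension, and embedding values are placeholders (real embeddings would come from an embedding adapter such as embed_sync).

# Sketch only: requires a reachable PostgreSQL with the pgvector extension available.
from evalvault.adapters.outbound.retriever.pgvector_store import PgvectorStore

store = PgvectorStore("host=localhost dbname=evalvault user=postgres", distance="cosine")
store.ensure_schema(dimension=4)
store.replace_documents(
    source="demo",
    source_hash="abc123",
    documents=["first chunk", "second chunk"],
    embeddings=[[0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1]],
)
for result in store.search(source="demo", query_embedding=[0.1, 0.2, 0.3, 0.4], top_k=2):
    print(result.doc_id, round(result.score, 4), result.content)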
@@ -247,7 +247,7 @@ class SQLQueries:
        return "SELECT run_id FROM evaluation_runs WHERE 1=1"

    def list_runs_ordering(self) -> str:
-        return f" ORDER BY started_at DESC LIMIT {self.placeholder}"
+        return f" ORDER BY started_at DESC LIMIT {self.placeholder} OFFSET {self.placeholder}"

    def upsert_regression_baseline(self) -> str:
        raise NotImplementedError("Override in subclass")
@@ -394,6 +394,7 @@ class BaseSQLStorageAdapter(ABC):
    def list_runs(
        self,
        limit: int = 100,
+        offset: int = 0,
        dataset_name: str | None = None,
        model_name: str | None = None,
    ) -> list[EvaluationRun]:
@@ -410,7 +411,7 @@ class BaseSQLStorageAdapter(ABC):
            params.append(model_name)

        query += self.queries.list_runs_ordering()
-        params.append(limit)
+        params.extend([limit, offset])

        cursor = self._execute(conn, query, params)
        run_ids = [row["run_id"] for row in cursor.fetchall()]
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.ports.outbound.storage_port import StoragePort
+
+logger = logging.getLogger(__name__)
+
+
+def build_storage_adapter(
+    *,
+    settings: Settings | None = None,
+    db_path: Path | None = None,
+    fallback_to_sqlite: bool = True,
+) -> StoragePort:
+    resolved_settings = settings or Settings()
+
+    if db_path is not None:
+        return SQLiteStorageAdapter(db_path=db_path)
+
+    backend = getattr(resolved_settings, "db_backend", "postgres")
+    if backend == "sqlite":
+        resolved_db_path = resolved_settings.evalvault_db_path
+        if resolved_db_path is None:
+            raise RuntimeError("SQLite backend selected but evalvault_db_path is not set.")
+        return SQLiteStorageAdapter(db_path=resolved_db_path)
+
+    conn_string = resolved_settings.postgres_connection_string
+    if not conn_string:
+        host = resolved_settings.postgres_host or "localhost"
+        port = resolved_settings.postgres_port
+        database = resolved_settings.postgres_database
+        user = resolved_settings.postgres_user or "postgres"
+        password = resolved_settings.postgres_password or ""
+        conn_string = f"host={host} port={port} dbname={database} user={user} password={password}"
+
+    try:
+        return PostgreSQLStorageAdapter(connection_string=conn_string)
+    except Exception as exc:
+        if not fallback_to_sqlite:
+            raise
+        logger.warning("PostgreSQL adapter failed (%s). Falling back to SQLite.", exc)
+        resolved_db_path = resolved_settings.evalvault_db_path
+        if resolved_db_path is None:
+            raise
+        return SQLiteStorageAdapter(db_path=resolved_db_path)
+
+
+__all__ = ["build_storage_adapter"]
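A hedged sketch combining the new storage factory with the paginated list_runs change shown earlier; it assumes Settings resolves to a reachable backend and that the storage port exposes the same offset parameter added to BaseSQLStorageAdapter.

# Sketch: build a storage adapter from Settings, then page through stored runs.
from evalvault.adapters.outbound.storage.factory import build_storage_adapter

storage = build_storage_adapter(fallback_to_sqlite=True)
page_size = 20
offset = 0
while True:
    runs = storage.list_runs(limit=page_size, offset=offset)
    if not runs:
        break
    for run in runs:
        print(run.run_id)
    offset += page_size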
@@ -1,6 +1,8 @@
 -- EvalVault PostgreSQL Database Schema
 -- Stores evaluation runs, test case results, and metric scores

+CREATE EXTENSION IF NOT EXISTS vector;
+
 -- Main evaluation runs table
 CREATE TABLE IF NOT EXISTS evaluation_runs (
     run_id UUID PRIMARY KEY,