evalvault 1.73.2__py3-none-any.whl → 1.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +66 -17
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +604 -37
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +1 -2
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +42 -31
- evalvault/adapters/inbound/cli/commands/run_helpers.py +24 -15
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/ops/__init__.py +5 -0
- evalvault/adapters/outbound/ops/report_renderer.py +159 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +90 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
- evalvault/adapters/outbound/storage/schema.sql +14 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +77 -0
- evalvault/config/settings.py +31 -7
- evalvault/domain/entities/ops_report.py +40 -0
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/ops_report_service.py +192 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.75.0.dist-info/METADATA +221 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/RECORD +57 -48
- evalvault-1.73.2.dist-info/METADATA +0 -585
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/WHEEL +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql (new file)
@@ -0,0 +1,177 @@
+-- EvalVault Domain Memory Schema (PostgreSQL)
+-- Based on "Memory in the Age of AI Agents: A Survey" framework
+-- Forms: Flat (Phase 1), Planar/Hierarchical (Phase 2-3)
+-- Functions: Factual, Experiential, Working layers
+-- Dynamics: Formation, Evolution, Retrieval strategies
+
+-- =========================================================================
+-- Factual Layer - verified domain facts (SPO triples)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS factual_facts (
+    fact_id TEXT PRIMARY KEY,
+    subject TEXT NOT NULL,                 -- entity name
+    predicate TEXT NOT NULL,               -- relation type
+    object TEXT NOT NULL,                  -- target entity
+    language TEXT DEFAULT 'ko',            -- language code (ko, en)
+    domain TEXT DEFAULT 'default',         -- domain (insurance, legal, medical)
+    fact_type TEXT DEFAULT 'verified',     -- verified, inferred, contradictory
+    verification_score REAL DEFAULT 1.0,   -- 0.0-1.0
+    verification_count INTEGER DEFAULT 1,
+    source_document_ids TEXT,              -- JSON array of document IDs
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    last_verified TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    abstraction_level INTEGER DEFAULT 0
+);
+
+-- Indexes for search optimization
+CREATE INDEX IF NOT EXISTS idx_facts_domain_lang ON factual_facts(domain, language);
+CREATE INDEX IF NOT EXISTS idx_facts_subject ON factual_facts(subject);
+CREATE INDEX IF NOT EXISTS idx_facts_predicate ON factual_facts(predicate);
+CREATE INDEX IF NOT EXISTS idx_facts_object ON factual_facts(object);
+CREATE INDEX IF NOT EXISTS idx_facts_triple ON factual_facts(subject, predicate, object);
+CREATE INDEX IF NOT EXISTS idx_facts_verification_score ON factual_facts(verification_score DESC);
+CREATE INDEX IF NOT EXISTS idx_facts_last_verified ON factual_facts(last_verified DESC);
+
+-- GIN indexes for text search (ILIKE-based)
+CREATE INDEX IF NOT EXISTS idx_facts_subject_gin ON factual_facts USING GIN (to_tsvector('english', subject));
+CREATE INDEX IF NOT EXISTS idx_facts_predicate_gin ON factual_facts USING GIN (to_tsvector('english', predicate));
+CREATE INDEX IF NOT EXISTS idx_facts_object_gin ON factual_facts USING GIN (to_tsvector('english', object));
+
+-- =========================================================================
+-- Experiential Layer - patterns learned from evaluations
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS learning_memories (
+    learning_id TEXT PRIMARY KEY,
+    run_id TEXT NOT NULL,                  -- originating evaluation run ID
+    domain TEXT DEFAULT 'default',
+    language TEXT DEFAULT 'ko',
+    entity_type_reliability TEXT,          -- JSON: {entity_type: reliability_score}
+    relation_type_reliability TEXT,        -- JSON: {relation_type: reliability_score}
+    failed_patterns TEXT,                  -- JSON array of failed patterns
+    successful_patterns TEXT,              -- JSON array of successful patterns
+    faithfulness_by_entity_type TEXT,      -- JSON: {entity_type: faithfulness_score}
+    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_learnings_domain_lang ON learning_memories(domain, language);
+CREATE INDEX IF NOT EXISTS idx_learnings_run_id ON learning_memories(run_id);
+CREATE INDEX IF NOT EXISTS idx_learnings_timestamp ON learning_memories(timestamp DESC);
+
+-- =========================================================================
+-- Behavior Layer - Metacognitive Reuse (reusable behaviors)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS behavior_entries (
+    behavior_id TEXT PRIMARY KEY,
+    description TEXT NOT NULL,
+    trigger_pattern TEXT,                  -- trigger condition (regex or keyword)
+    action_sequence TEXT,                  -- JSON array of action steps
+    success_rate REAL DEFAULT 0.0,         -- historical success rate
+    token_savings INTEGER DEFAULT 0,       -- number of tokens saved
+    applicable_languages TEXT DEFAULT '["ko", "en"]',  -- JSON array
+    domain TEXT DEFAULT 'default',
+    last_used TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    use_count INTEGER DEFAULT 0,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_behaviors_domain ON behavior_entries(domain);
+CREATE INDEX IF NOT EXISTS idx_behaviors_success_rate ON behavior_entries(success_rate DESC);
+CREATE INDEX IF NOT EXISTS idx_behaviors_use_count ON behavior_entries(use_count DESC);
+CREATE INDEX IF NOT EXISTS idx_behaviors_last_used ON behavior_entries(last_used DESC);
+
+-- GIN indexes for behavior search
+CREATE INDEX IF NOT EXISTS idx_behaviors_description_gin ON behavior_entries USING GIN (to_tsvector('english', description));
+CREATE INDEX IF NOT EXISTS idx_behaviors_trigger_gin ON behavior_entries USING GIN (to_tsvector('english', trigger_pattern));
+
+-- =========================================================================
+-- Working Layer - active context for the current session
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS memory_contexts (
+    session_id TEXT PRIMARY KEY,
+    domain TEXT DEFAULT 'default',
+    language TEXT DEFAULT 'ko',
+    active_entities TEXT,                  -- JSON array of entity names
+    entity_type_distribution TEXT,         -- JSON: {entity_type: count}
+    current_quality_metrics TEXT,          -- JSON: {metric_name: value}
+    started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_contexts_domain ON memory_contexts(domain);
+CREATE INDEX IF NOT EXISTS idx_contexts_updated_at ON memory_contexts(updated_at DESC);
+
+-- =========================================================================
+-- Fact Sources - relationships between facts and documents (Phase 2)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS fact_sources (
+    id SERIAL PRIMARY KEY,
+    fact_id TEXT NOT NULL,
+    document_id TEXT NOT NULL,
+    extraction_confidence REAL DEFAULT 1.0,
+    extracted_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(fact_id, document_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_fact_sources_fact_id ON fact_sources(fact_id);
+CREATE INDEX IF NOT EXISTS idx_fact_sources_document_id ON fact_sources(document_id);
+
+-- =========================================================================
+-- Memory Evolution Log - tracking memory changes (Phase 2)
+-- =========================================================================
+
+CREATE TABLE IF NOT EXISTS memory_evolution_log (
+    id SERIAL PRIMARY KEY,
+    operation TEXT NOT NULL,               -- consolidate, update, forget, decay
+    target_type TEXT NOT NULL,             -- fact, learning, behavior
+    target_id TEXT NOT NULL,
+    details TEXT,                          -- JSON: operation-specific details
+    performed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_evolution_log_operation ON memory_evolution_log(operation);
+CREATE INDEX IF NOT EXISTS idx_evolution_log_target ON memory_evolution_log(target_type, target_id);
+CREATE INDEX IF NOT EXISTS idx_evolution_log_performed_at ON memory_evolution_log(performed_at DESC);
+
+-- =========================================================================
+-- Phase 5: Planar Form - KG Integration
+-- =========================================================================
+
+-- KG Entity binding table for explicit KG links
+CREATE TABLE IF NOT EXISTS fact_kg_bindings (
+    id SERIAL PRIMARY KEY,
+    fact_id TEXT NOT NULL,
+    kg_entity_id TEXT NOT NULL,            -- KG entity name/ID
+    kg_relation_type TEXT,                 -- KG relation type
+    binding_confidence REAL DEFAULT 1.0,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(fact_id, kg_entity_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_fact_id ON fact_kg_bindings(fact_id);
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_kg_entity ON fact_kg_bindings(kg_entity_id);
+CREATE INDEX IF NOT EXISTS idx_kg_bindings_relation_type ON fact_kg_bindings(kg_relation_type);
+
+-- =========================================================================
+-- Phase 5: Hierarchical Form - Summary Layers
+-- =========================================================================
+
+-- Fact hierarchy table for parent-child relationships
+CREATE TABLE IF NOT EXISTS fact_hierarchy (
+    id SERIAL PRIMARY KEY,
+    parent_fact_id TEXT NOT NULL,
+    child_fact_id TEXT NOT NULL,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (parent_fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    FOREIGN KEY (child_fact_id) REFERENCES factual_facts(fact_id) ON DELETE CASCADE,
+    UNIQUE(parent_fact_id, child_fact_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_hierarchy_parent ON fact_hierarchy(parent_fact_id);
+CREATE INDEX IF NOT EXISTS idx_hierarchy_child ON fact_hierarchy(child_fact_id);
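To make the factual layer concrete, here is a minimal sketch (not part of the package) that writes and reads one SPO triple against this schema with psycopg; the DSN and the fact values are placeholders, and the shipped postgres_adapter.py presumably wraps the same operations behind its own API.

import json
import psycopg

# Placeholder DSN; the real adapter reads its connection settings from EvalVault config.
with psycopg.connect("postgresql://localhost/evalvault") as conn:
    conn.execute(
        """
        INSERT INTO factual_facts (fact_id, subject, predicate, object, domain, source_document_ids)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (fact_id) DO UPDATE SET
            verification_count = factual_facts.verification_count + 1,
            last_verified = CURRENT_TIMESTAMP
        """,
        ("fact-001", "암보험", "covers", "진단비", "insurance", json.dumps(["doc-17"])),
    )
    # Retrieval strategy hinted at by the indexes: filter by domain, rank by verification score.
    facts = conn.execute(
        "SELECT subject, predicate, object, verification_score "
        "FROM factual_facts WHERE domain = %s ORDER BY verification_score DESC",
        ("insurance",),
    ).fetchall()
    conn.commit()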
evalvault/adapters/outbound/llm/vllm_adapter.py
@@ -64,6 +64,29 @@ class VLLMAdapter(BaseLLMAdapter):
         """Get the embedding model name being used."""
         return self._embedding_model_name
 
+    def embed_sync(
+        self,
+        *,
+        texts: list[str],
+        model: str | None = None,
+        dimension: int | None = None,
+    ) -> list[list[float]]:
+        """Synchronous embedding call using OpenAI-compatible API."""
+        embed_base_url = self._settings.vllm_embedding_base_url or self._settings.vllm_base_url
+        client = OpenAI(
+            base_url=embed_base_url,
+            api_key=self._settings.vllm_api_key or "local",
+            timeout=self._settings.vllm_timeout,
+        )
+        payload: dict[str, Any] = {
+            "model": model or self._embedding_model_name,
+            "input": texts,
+        }
+        if dimension is not None:
+            payload["dimensions"] = dimension
+        response = client.embeddings.create(**payload)
+        return [item.embedding for item in response.data]
+
     async def agenerate_text(
         self,
         prompt: str,
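A rough usage sketch for the new synchronous entry point, assuming a reachable vLLM endpoint configured through Settings (the example texts are placeholders); dimension is optional and is forwarded as the OpenAI-compatible "dimensions" field.

from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
from evalvault.config.settings import Settings

settings = Settings()  # assumes vllm_base_url / vllm_embedding_model etc. are set in the environment
adapter = VLLMAdapter(settings)

# Keyword-only API; returns one embedding vector per input text.
vectors = adapter.embed_sync(texts=["보험금 청구 절차", "claim procedure"], dimension=256)
print(len(vectors), len(vectors[0]))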
evalvault/adapters/outbound/nlp/korean/dense_retriever.py
@@ -141,8 +141,8 @@ class KoreanDenseRetriever:
         },
     }
 
-    # Default model:
-    DEFAULT_MODEL = "
+    # Default model: BAAI/bge-m3 (multilingual default)
+    DEFAULT_MODEL = "BAAI/bge-m3"
 
     def __init__(
         self,
@@ -175,7 +175,7 @@ class KoreanDenseRetriever:
             device: Device (auto, cpu, cuda, mps)
             batch_size: Encoding batch size
                 - If set to 0 or below, chosen automatically by a simple heuristic
-
+            ollama_adapter: OpenAI-compatible embedding adapter (Ollama/vLLM)
             matryoshka_dim: Matryoshka dimension (Qwen3-Embedding only)
                 - None: use the model's recommended dimension
                 - 256: for development (speed first)
@@ -237,12 +237,12 @@ class KoreanDenseRetriever:
         self._query_cache_size = max(query_cache_size, 0)
         self._search_cache_size = max(search_cache_size, 0)
 
-        # Validate
+        # Validate embedding adapter for OpenAI-compatible embedding models
         model_info = self.SUPPORTED_MODELS.get(self._model_name)
         if model_info and model_info.get("type") == "ollama" and not self._ollama_adapter:
             raise ValueError(
-                f"
-                "Create one with: OllamaAdapter(settings)"
+                f"embedding adapter is required for model '{self._model_name}'. "
+                "Create one with: OllamaAdapter(settings) or VLLMAdapter(settings)"
             )
 
         # Auto-select matryoshka dimension if not specified
@@ -362,7 +362,10 @@ class KoreanDenseRetriever:
             return
 
         model_info = self.SUPPORTED_MODELS.get(self._model_name)
-
+        if model_info is None and self._ollama_adapter is not None:
+            model_type = "ollama"
+        else:
+            model_type = model_info["type"] if model_info else "sentence-transformers"
 
         # Ollama models use adapter directly - no model loading needed
         if model_type == "ollama":
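A sketch of what the new fallback enables (not package code, and it assumes a reachable vLLM endpoint): a model name that is absent from SUPPORTED_MODELS but arrives with an OpenAI-compatible adapter is now routed down the adapter-backed ("ollama") path instead of the sentence-transformers path.

from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
from evalvault.config.settings import Settings

settings = Settings()
adapter = VLLMAdapter(settings)

# Unlisted model name + adapter -> resolved as an adapter-backed ("ollama") model.
retriever = KoreanDenseRetriever(
    model_name=settings.vllm_embedding_model,
    ollama_adapter=adapter,
)
embed_fn = retriever.get_embedding_func()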
evalvault/adapters/outbound/nlp/korean/toolkit.py
@@ -76,10 +76,21 @@ class KoreanNLPToolkit(KoreanNLPToolkitPort):
 
         embedding_func = None
         try:
-
-
-
-
+            if embedding_profile == "vllm":
+                from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
+                from evalvault.config.settings import Settings
+
+                settings = Settings()
+                adapter = ollama_adapter or VLLMAdapter(settings)
+                dense_retriever = KoreanDenseRetriever(
+                    model_name=settings.vllm_embedding_model,
+                    ollama_adapter=adapter,
+                )
+            else:
+                dense_retriever = KoreanDenseRetriever(
+                    profile=embedding_profile,
+                    ollama_adapter=ollama_adapter,
+                )
             embedding_func = dense_retriever.get_embedding_func()
             if verbose:
                 logger.info(
evalvault/adapters/outbound/ops/report_renderer.py (new file)
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import json
+
+from evalvault.domain.entities.ops_report import OpsReport
+from evalvault.domain.entities.stage import StageMetric, StageSummary
+
+
+def render_markdown(report: OpsReport) -> str:
+    lines: list[str] = []
+    lines.append("# Ops Report")
+    lines.append("")
+    lines.extend(_render_run_summary(report.run_summary, report.metadata))
+    lines.append("")
+    lines.extend(_render_ops_kpis(report.ops_kpis))
+    lines.append("")
+    lines.extend(_render_stage_summary(report.stage_summary))
+    lines.append("")
+    lines.extend(_render_bottlenecks(report.bottlenecks))
+    lines.append("")
+    lines.extend(_render_recommendations(report.recommendations))
+    lines.append("")
+    lines.extend(_render_failing_metrics(report.stage_metrics))
+    return "\n".join(lines).strip()
+
+
+def render_json(report: OpsReport) -> str:
+    payload = report.to_dict()
+    return json.dumps(payload, ensure_ascii=True, indent=2)
+
+
+def _render_run_summary(summary: dict[str, object], metadata: dict[str, object]) -> list[str]:
+    run_id = summary.get("run_id", "-")
+    dataset = summary.get("dataset_name", "-")
+    version = summary.get("dataset_version", "-")
+    model = summary.get("model_name", "-")
+    started = summary.get("started_at", "-")
+    finished = summary.get("finished_at", "-")
+    duration = summary.get("duration_seconds", "-")
+    total_cases = summary.get("total_test_cases", "-")
+    pass_rate = summary.get("pass_rate", "-")
+    total_tokens = summary.get("total_tokens", "-")
+    total_cost = summary.get("total_cost_usd", "-")
+
+    lines = [
+        "## Run Summary",
+        f"- run_id: {run_id}",
+        f"- dataset: {dataset} ({version})",
+        f"- model: {model}",
+        f"- started_at: {started}",
+        f"- finished_at: {finished}",
+        f"- duration_seconds: {duration}",
+        f"- total_test_cases: {total_cases}",
+        f"- pass_rate: {pass_rate}",
+        f"- total_tokens: {total_tokens}",
+        f"- total_cost_usd: {total_cost}",
+    ]
+    trace_links: list[str] = []
+    if metadata.get("langfuse_trace_url"):
+        trace_links.append(f"langfuse_trace_url={metadata['langfuse_trace_url']}")
+    if metadata.get("phoenix_trace_url"):
+        trace_links.append(f"phoenix_trace_url={metadata['phoenix_trace_url']}")
+    if trace_links:
+        lines.append(f"- trace_links: {', '.join(trace_links)}")
+    return lines
+
+
+def _render_stage_summary(summary: StageSummary | None) -> list[str]:
+    lines = ["## Stage Summary"]
+    if summary is None:
+        lines.append("- no stage events found")
+        return lines
+    lines.append(f"- total_events: {summary.total_events}")
+    if summary.missing_required_stage_types:
+        missing = ", ".join(summary.missing_required_stage_types)
+        lines.append(f"- missing_required_stage_types: {missing}")
+    if summary.stage_type_counts:
+        lines.append("- stage_type_counts:")
+        for stage_type, count in summary.stage_type_counts.items():
+            lines.append(f"  - {stage_type}: {count}")
+    if summary.stage_type_avg_durations:
+        lines.append("- stage_type_avg_durations_ms:")
+        for stage_type, duration in summary.stage_type_avg_durations.items():
+            lines.append(f"  - {stage_type}: {duration:.3f}")
+    return lines
+
+
+def _render_ops_kpis(kpis: dict[str, object]) -> list[str]:
+    lines = ["## Ops KPIs"]
+    lines.append(f"- total_test_cases: {kpis.get('total_test_cases', '-')}")
+    lines.append(f"- pass_rate: {kpis.get('pass_rate', '-')}")
+    lines.append(f"- failure_rate: {kpis.get('failure_rate', '-')}")
+    lines.append(f"- stage_error_rate: {kpis.get('stage_error_rate', '-')}")
+    lines.append(f"- stage_error_severity: {kpis.get('stage_error_severity', '-')}")
+    lines.append(f"- duration_seconds: {kpis.get('duration_seconds', '-')}")
+    lines.append(f"- total_tokens: {kpis.get('total_tokens', '-')}")
+    lines.append(f"- total_cost_usd: {kpis.get('total_cost_usd', '-')}")
+    lines.append(f"- avg_latency_ms: {kpis.get('avg_latency_ms', '-')}")
+    lines.append(f"- p95_latency_ms: {kpis.get('p95_latency_ms', '-')}")
+    lines.append(f"- avg_tokens_per_case: {kpis.get('avg_tokens_per_case', '-')}")
+    lines.append(f"- avg_cost_per_case_usd: {kpis.get('avg_cost_per_case_usd', '-')}")
+    return lines
+
+
+def _render_bottlenecks(bottlenecks: list[dict[str, object]]) -> list[str]:
+    lines = ["## Ops Signals"]
+    if not bottlenecks:
+        lines.append("- none")
+        return lines
+    for entry in bottlenecks:
+        entry_type = entry.get("type", "unknown")
+        if entry_type == "latency":
+            stage_type = entry.get("stage_type", "-")
+            duration = entry.get("avg_duration_ms", "-")
+            lines.append(f"- latency: {stage_type} avg_duration_ms={duration}")
+        elif entry_type == "missing_stage":
+            stage_type = entry.get("stage_type", "-")
+            lines.append(f"- missing_stage: {stage_type}")
+        else:
+            lines.append(f"- {entry_type}: {entry}")
+    return lines
+
+
+def _render_recommendations(recommendations: list[str]) -> list[str]:
+    lines = ["## Recommendations"]
+    if not recommendations:
+        lines.append("- none")
+        return lines
+    for item in recommendations:
+        lines.append(f"- {item}")
+    return lines
+
+
+def _render_failing_metrics(metrics: list[StageMetric]) -> list[str]:
+    lines = ["## Failing Stage Metrics"]
+    failing = [metric for metric in metrics if metric.passed is False]
+    if not failing:
+        lines.append("- none")
+        return lines
+
+    failing_sorted = sorted(failing, key=_metric_severity, reverse=True)[:20]
+    for metric in failing_sorted:
+        threshold = metric.threshold if metric.threshold is not None else "-"
+        lines.append(
+            f"- {metric.metric_name}: score={metric.score} threshold={threshold} "
+            f"stage_id={metric.stage_id}"
+        )
+    return lines
+
+
+def _metric_severity(metric: StageMetric) -> float:
+    if metric.threshold is None:
+        return 0.0
+    comparison = None
+    if isinstance(metric.evidence, dict):
+        comparison = metric.evidence.get("comparison")
+    if isinstance(comparison, str) and comparison.lower() in {"max", "<=", "le"}:
+        return max(metric.score - metric.threshold, 0.0)
+    return max(metric.threshold - metric.score, 0.0)
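A hypothetical usage sketch only: the OpsReport field names below are inferred from how the renderer reads the report, and the real constructor (defined in ops_report.py, not shown in this diff) may differ.

from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
from evalvault.domain.entities.ops_report import OpsReport

# Field names inferred from the renderer; treat this construction as illustrative, not canonical.
report = OpsReport(
    run_summary={"run_id": "run-123", "dataset_name": "ko-insurance", "dataset_version": "v2",
                 "model_name": "local-llm", "pass_rate": 0.92},
    ops_kpis={"total_test_cases": 50, "pass_rate": 0.92, "p95_latency_ms": 840},
    stage_summary=None,  # rendered as "- no stage events found"
    bottlenecks=[{"type": "latency", "stage_type": "retrieval", "avg_duration_ms": 412.5}],
    recommendations=["Cache retrieval results for repeated queries"],
    stage_metrics=[],
    metadata={},
)
print(render_markdown(report))
print(render_json(report))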
evalvault/adapters/outbound/retriever/pgvector_store.py (new file)
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+import psycopg
+from psycopg.rows import dict_row
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class PgvectorResult:
+    doc_id: int
+    content: str
+    score: float
+
+
+class PgvectorStore:
+    def __init__(
+        self,
+        conn_string: str,
+        *,
+        distance: str = "cosine",
+        index_type: str = "hnsw",
+        index_lists: int = 100,
+        hnsw_m: int = 16,
+        hnsw_ef_construction: int = 64,
+    ) -> None:
+        self._conn_string = conn_string
+        self._distance = distance
+        self._index_type = index_type
+        self._index_lists = index_lists
+        self._hnsw_m = hnsw_m
+        self._hnsw_ef_construction = hnsw_ef_construction
+
+    def _vector_ops(self) -> str:
+        if self._distance == "ip":
+            return "vector_ip_ops"
+        if self._distance == "l2":
+            return "vector_l2_ops"
+        return "vector_cosine_ops"
+
+    def _connect(self) -> psycopg.Connection:
+        conn = psycopg.connect(self._conn_string, row_factory=dict_row)
+        try:
+            from pgvector.psycopg import register_vector
+
+            register_vector(conn)
+        except Exception as exc:  # pragma: no cover - runtime dependency
+            logger.warning("Failed to register pgvector type: %s", exc)
+        return conn
+
+    def ensure_schema(self, *, dimension: int) -> None:
+        sql = f"""
+        CREATE EXTENSION IF NOT EXISTS vector;
+
+        CREATE TABLE IF NOT EXISTS rag_documents (
+            id BIGSERIAL PRIMARY KEY,
+            source TEXT NOT NULL,
+            source_hash TEXT NOT NULL,
+            doc_id INTEGER NOT NULL,
+            content TEXT NOT NULL,
+            embedding VECTOR({dimension}),
+            metadata JSONB,
+            created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+        );
+
+        CREATE INDEX IF NOT EXISTS idx_rag_documents_source ON rag_documents(source);
+        CREATE INDEX IF NOT EXISTS idx_rag_documents_doc_id ON rag_documents(doc_id);
+        CREATE UNIQUE INDEX IF NOT EXISTS idx_rag_documents_source_doc_id
+            ON rag_documents(source, doc_id);
+        """
+        with self._connect() as conn:
+            conn.execute(sql)
+            if self._index_type != "none":
+                opclass = self._vector_ops()
+                if self._index_type == "ivfflat":
+                    index_sql = (
+                        "CREATE INDEX IF NOT EXISTS idx_rag_documents_embedding "
+                        f"ON rag_documents USING ivfflat (embedding {opclass}) "
+                        f"WITH (lists = {self._index_lists});"
+                    )
+                    try:
+                        conn.execute(index_sql)
+                    except Exception as exc:  # pragma: no cover - runtime dependency
+                        logger.warning("Failed to create ivfflat index: %s", exc)
+                elif self._index_type == "hnsw":
+                    index_sql = (
+                        "CREATE INDEX IF NOT EXISTS idx_rag_documents_embedding "
+                        f"ON rag_documents USING hnsw (embedding {opclass}) "
+                        f"WITH (m = {self._hnsw_m}, ef_construction = {self._hnsw_ef_construction});"
+                    )
+                    try:
+                        conn.execute(index_sql)
+                    except Exception as exc:  # pragma: no cover - runtime dependency
+                        logger.warning("Failed to create hnsw index: %s", exc)
+            conn.commit()
+
+    def get_source_state(self, *, source: str) -> tuple[str | None, int]:
+        with self._connect() as conn:
+            row = conn.execute(
+                """
+                SELECT source_hash, COUNT(*) AS total
+                FROM rag_documents
+                WHERE source = %s
+                GROUP BY source_hash
+                ORDER BY total DESC
+                LIMIT 1
+                """,
+                (source,),
+            ).fetchone()
+        if not row:
+            return None, 0
+        return row["source_hash"], int(row["total"])
+
+    def replace_documents(
+        self,
+        *,
+        source: str,
+        source_hash: str,
+        documents: Iterable[str],
+        embeddings: Iterable[list[float]],
+    ) -> None:
+        rows = list(zip(documents, embeddings, strict=True))
+        with self._connect() as conn:
+            conn.execute("DELETE FROM rag_documents WHERE source = %s", (source,))
+            with conn.cursor() as cursor:
+                cursor.executemany(
+                    """
+                    INSERT INTO rag_documents (source, source_hash, doc_id, content, embedding)
+                    VALUES (%s, %s, %s, %s, %s)
+                    """,
+                    [
+                        (source, source_hash, index, content, embedding)
+                        for index, (content, embedding) in enumerate(rows)
+                    ],
+                )
+            conn.commit()
+
+    def search(
+        self, *, source: str, query_embedding: list[float], top_k: int
+    ) -> list[PgvectorResult]:
+        if self._distance == "ip":
+            operator = "<#>"
+        elif self._distance == "l2":
+            operator = "<->"
+        else:
+            operator = "<=>"
+
+        sql = (
+            f"SELECT doc_id, content, embedding {operator} %s::vector AS score "
+            f"FROM rag_documents WHERE source = %s ORDER BY embedding {operator} %s::vector LIMIT %s"
+        )
+
+        with self._connect() as conn:
+            rows = conn.execute(sql, (query_embedding, source, query_embedding, top_k)).fetchall()
+
+        return [
+            PgvectorResult(
+                doc_id=int(row["doc_id"]), content=row["content"], score=float(row["score"])
+            )
+            for row in rows
+        ]
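A minimal end-to-end sketch of the new store, using a placeholder DSN and toy 3-dimensional vectors; in the package the embeddings would come from an embedding adapter (e.g. embed_sync above) rather than hard-coded lists.

from evalvault.adapters.outbound.retriever.pgvector_store import PgvectorStore

store = PgvectorStore("postgresql://localhost/evalvault", distance="cosine", index_type="hnsw")
store.ensure_schema(dimension=3)  # toy dimension; use the embedding model's real size

docs = ["암보험은 진단비를 보장한다", "실손보험은 치료비를 보장한다"]
vecs = [[0.1, 0.2, 0.3], [0.2, 0.1, 0.0]]
store.replace_documents(source="policy.md", source_hash="abc123", documents=docs, embeddings=vecs)

# Cosine distance: smaller score means a closer match.
hits = store.search(source="policy.md", query_embedding=[0.1, 0.2, 0.25], top_k=1)
for hit in hits:
    print(hit.doc_id, round(hit.score, 3), hit.content)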
evalvault/adapters/outbound/storage/base_sql.py
@@ -247,7 +247,7 @@ class SQLQueries:
         return "SELECT run_id FROM evaluation_runs WHERE 1=1"
 
     def list_runs_ordering(self) -> str:
-        return f" ORDER BY started_at DESC LIMIT {self.placeholder}"
+        return f" ORDER BY started_at DESC LIMIT {self.placeholder} OFFSET {self.placeholder}"
 
     def upsert_regression_baseline(self) -> str:
         raise NotImplementedError("Override in subclass")
@@ -394,6 +394,7 @@ class BaseSQLStorageAdapter(ABC):
     def list_runs(
         self,
         limit: int = 100,
+        offset: int = 0,
        dataset_name: str | None = None,
        model_name: str | None = None,
     ) -> list[EvaluationRun]:
@@ -410,7 +411,7 @@
            params.append(model_name)
 
        query += self.queries.list_runs_ordering()
-        params.
+        params.extend([limit, offset])
 
        cursor = self._execute(conn, query, params)
        run_ids = [row["run_id"] for row in cursor.fetchall()]
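Together these changes give list_runs offset-based pagination. A small sketch, assuming "storage" is any concrete adapter built on BaseSQLStorageAdapter (SQLite or Postgres; its construction is not shown in this diff):

page_size = 20
# Second page of runs for one dataset, newest first (ORDER BY started_at DESC).
runs = storage.list_runs(limit=page_size, offset=page_size, dataset_name="ko-insurance")
for run in runs:
    print(run.run_id)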