evalvault 1.66.0__py3-none-any.whl → 1.68.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/main.py +6 -0
- evalvault/adapters/inbound/api/routers/chat.py +143 -69
- evalvault/adapters/inbound/api/routers/mcp.py +144 -0
- evalvault/adapters/outbound/dataset/base.py +40 -0
- evalvault/adapters/outbound/dataset/csv_loader.py +16 -0
- evalvault/adapters/outbound/dataset/excel_loader.py +16 -0
- evalvault/config/settings.py +15 -4
- evalvault/domain/entities/stage.py +22 -6
- {evalvault-1.66.0.dist-info → evalvault-1.68.0.dist-info}/METADATA +1 -1
- {evalvault-1.66.0.dist-info → evalvault-1.68.0.dist-info}/RECORD +13 -12
- {evalvault-1.66.0.dist-info → evalvault-1.68.0.dist-info}/WHEEL +0 -0
- {evalvault-1.66.0.dist-info → evalvault-1.68.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.66.0.dist-info → evalvault-1.68.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -63,6 +63,12 @@ async def lifespan(app: FastAPI):
|
|
|
63
63
|
# Startup: Initialize adapter
|
|
64
64
|
adapter = create_adapter()
|
|
65
65
|
app.state.adapter = adapter
|
|
66
|
+
try:
|
|
67
|
+
from evalvault.adapters.inbound.api.routers.chat import warm_rag_index
|
|
68
|
+
|
|
69
|
+
await warm_rag_index()
|
|
70
|
+
except Exception as exc:
|
|
71
|
+
logger.warning("RAG preload failed: %s", exc)
|
|
66
72
|
yield
|
|
67
73
|
# Shutdown: Cleanup if necessary
|
|
68
74
|
pass
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import hashlib
|
|
4
5
|
import json
|
|
6
|
+
import logging
|
|
5
7
|
import os
|
|
6
8
|
import re
|
|
7
9
|
import time
|
|
8
10
|
from collections.abc import AsyncGenerator
|
|
11
|
+
from datetime import UTC, datetime
|
|
9
12
|
from pathlib import Path
|
|
10
13
|
from typing import Any
|
|
11
14
|
|
|
@@ -16,9 +19,15 @@ from pydantic import BaseModel, Field
|
|
|
16
19
|
|
|
17
20
|
router = APIRouter(tags=["chat"])
|
|
18
21
|
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
19
24
|
MCP_URL = os.getenv("EVALVAULT_MCP_URL", "http://localhost:8000/api/v1/mcp")
|
|
20
25
|
MCP_TOKEN = os.getenv("EVALVAULT_MCP_TOKEN", "mcp-local-dev-token")
|
|
21
26
|
|
|
27
|
+
USER_GUIDE_PATH = Path(os.getenv("EVALVAULT_RAG_USER_GUIDE", "docs/guides/USER_GUIDE.md"))
|
|
28
|
+
RAG_INDEX_DIR = Path(os.getenv("EVALVAULT_RAG_INDEX_DIR", "data/rag"))
|
|
29
|
+
RAG_INDEX_PATH = RAG_INDEX_DIR / "user_guide_bm25.json"
|
|
30
|
+
|
|
22
31
|
_RAG_RETRIEVER = None
|
|
23
32
|
_RAG_DOCS_COUNT = 0
|
|
24
33
|
_RAG_TEXTS: list[str] = []
|
|
@@ -129,25 +138,98 @@ def _summarize_result(tool_name: str, payload: dict[str, Any]) -> str:
|
|
|
129
138
|
return str(payload)
|
|
130
139
|
|
|
131
140
|
|
|
132
|
-
def
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
return
|
|
141
|
+
def _load_user_guide_text() -> str | None:
    """Read the user guide located at ``USER_GUIDE_PATH``.

    Returns:
        The file content decoded as UTF-8, or ``None`` when the file does not
        exist, cannot be read, or contains only whitespace.
    """
    guide_path = USER_GUIDE_PATH
    if not guide_path.exists():
        logger.warning("USER_GUIDE.md not found at %s", guide_path)
        return None

    try:
        guide_text = guide_path.read_text(encoding="utf-8")
    except Exception as read_error:
        logger.warning("Failed to read USER_GUIDE.md: %s", read_error)
        return None

    # A whitespace-only guide is reported the same way as a missing one.
    return guide_text if guide_text.strip() else None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _hash_text(text: str) -> str:
|
|
156
|
+
return hashlib.sha256(text.encode("utf-8")).hexdigest()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _chunk_user_guide(content: str, chunk_limit: int) -> list[str]:
    """Split the user guide text into chunks.

    ``ParagraphChunker`` backed by ``KiwiTokenizer`` is tried first. If importing
    or running it raises, the text is split on blank lines instead.

    Args:
        content: Full text of the user guide.
        chunk_limit: Maximum number of chunks to return; values ``<= 0`` return
            every chunk.

    Returns:
        The chunk texts, truncated to ``chunk_limit`` when it is positive.
    """
    try:
        from evalvault.adapters.outbound.nlp.korean.document_chunker import ParagraphChunker
        from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer

        splitter = ParagraphChunker(
            tokenizer=KiwiTokenizer(),
            chunk_size=450,
            overlap_tokens=80,
        )
        pieces = [
            piece.text
            for piece in splitter.chunk_with_metadata(content, source=str(USER_GUIDE_PATH))
        ]
    except Exception as exc:
        logger.warning("Failed to chunk USER_GUIDE.md, using fallback split: %s", exc)
        # Fallback: one chunk per non-empty, blank-line-separated paragraph.
        pieces = [block.strip() for block in content.split("\n\n") if block.strip()]

    return pieces[:chunk_limit] if chunk_limit > 0 else pieces
|
|
148
179
|
|
|
149
180
|
|
|
150
|
-
|
|
181
|
+
def _build_bm25_tokens(texts: list[str]) -> list[list[str]]:
    """Tokenize each text for the BM25 index.

    ``KiwiTokenizer`` is used when it can be imported and run. A document for
    which it yields no tokens, or every document if it raises, is tokenized with
    a regex matching runs of ASCII letters, digits and Hangul syllables.

    Args:
        texts: Documents to tokenize.

    Returns:
        One token list per input document, in the same order.
    """
    word_pattern = r"[A-Za-z0-9가-힣]+"
    try:
        from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer

        kiwi = KiwiTokenizer()
        # ``or`` applies the regex fallback per document when Kiwi returns nothing.
        return [kiwi.tokenize(text) or re.findall(word_pattern, text) for text in texts]
    except Exception as exc:
        logger.warning("Failed to tokenize with Kiwi, using regex: %s", exc)
        return [re.findall(word_pattern, text) for text in texts]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _load_bm25_index() -> dict[str, Any] | None:
    """Load the persisted BM25 index from ``RAG_INDEX_PATH``.

    Returns:
        The decoded JSON object, or ``None`` when the file is missing, cannot be
        read or parsed, or its top-level value is not a JSON object.
    """
    if not RAG_INDEX_PATH.exists():
        return None

    try:
        raw_json = RAG_INDEX_PATH.read_text(encoding="utf-8")
        loaded = json.loads(raw_json)
    except Exception as exc:
        logger.warning("Failed to read BM25 index: %s", exc)
        return None

    return loaded if isinstance(loaded, dict) else None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _save_bm25_index(payload: dict[str, Any]) -> None:
    """Persist the BM25 index payload as JSON at ``RAG_INDEX_PATH``.

    The JSON is written to a temporary sibling file and then renamed over the
    target, so a crash mid-write or a concurrent reader never observes a
    partially written index.

    Args:
        payload: JSON-serialisable index data.
    """
    RAG_INDEX_DIR.mkdir(parents=True, exist_ok=True)
    # Serialise first so a failure here leaves no files behind.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    tmp_path = RAG_INDEX_PATH.with_name(f"{RAG_INDEX_PATH.name}.tmp")
    try:
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(RAG_INDEX_PATH)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        tmp_path.unlink(missing_ok=True)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _build_bm25_index(content: str, chunk_limit: int) -> dict[str, Any] | None:
    """Build the BM25 index payload for the user guide.

    Args:
        content: Full text of the user guide.
        chunk_limit: Passed through to ``_chunk_user_guide``.

    Returns:
        A dict holding the chunk texts, their tokens, the source path, the
        content hash, the chunk limit and a UTC creation timestamp; ``None``
        when chunking produced no chunks.
    """
    documents = _chunk_user_guide(content, chunk_limit)
    if not documents:
        return None

    tokenized = _build_bm25_tokens(documents)
    index: dict[str, Any] = {
        "version": 1,
        "source": str(USER_GUIDE_PATH),
        "source_hash": _hash_text(content),
        "chunk_limit": chunk_limit,
        "created_at": datetime.now(UTC).isoformat(),
    }
    index["documents"] = documents
    index["tokens"] = tokenized
    return index
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
async def _get_rag_retriever() -> tuple[Any | None, int]:
|
|
151
233
|
global _RAG_RETRIEVER
|
|
152
234
|
global _RAG_DOCS_COUNT
|
|
153
235
|
global _RAG_TEXTS
|
|
@@ -156,50 +238,51 @@ async def _get_rag_retriever():
|
|
|
156
238
|
if _RAG_RETRIEVER is not None:
|
|
157
239
|
return _RAG_RETRIEVER, _RAG_DOCS_COUNT
|
|
158
240
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
if
|
|
169
|
-
|
|
241
|
+
user_guide_limit = int(os.getenv("EVALVAULT_RAG_USER_GUIDE_LIMIT", "80"))
|
|
242
|
+
content = _load_user_guide_text()
|
|
243
|
+
if content is None:
|
|
244
|
+
return None, 0
|
|
245
|
+
source_hash = _hash_text(content)
|
|
246
|
+
|
|
247
|
+
index_payload = _load_bm25_index()
|
|
248
|
+
if index_payload is None or index_payload.get("source_hash") != source_hash:
|
|
249
|
+
index_payload = _build_bm25_index(content, user_guide_limit)
|
|
250
|
+
if index_payload is None:
|
|
251
|
+
return None, 0
|
|
252
|
+
_save_bm25_index(index_payload)
|
|
253
|
+
|
|
254
|
+
documents = index_payload.get("documents")
|
|
255
|
+
tokens = index_payload.get("tokens")
|
|
256
|
+
if not isinstance(documents, list) or not isinstance(tokens, list):
|
|
257
|
+
return None, 0
|
|
170
258
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
259
|
+
_RAG_TEXTS = documents
|
|
260
|
+
_RAG_DOCS_COUNT = len(documents)
|
|
261
|
+
_RAG_INITIALIZED = True
|
|
174
262
|
|
|
175
263
|
if not _RAG_TEXTS:
|
|
176
264
|
return None, 0
|
|
177
265
|
|
|
178
|
-
from evalvault.adapters.outbound.
|
|
179
|
-
from evalvault.adapters.outbound.nlp.korean.
|
|
180
|
-
from evalvault.config.settings import Settings
|
|
181
|
-
|
|
182
|
-
settings = Settings()
|
|
183
|
-
ollama_adapter = OllamaAdapter(settings)
|
|
184
|
-
toolkit = try_create_korean_toolkit()
|
|
185
|
-
if toolkit is None:
|
|
186
|
-
return None, 0
|
|
266
|
+
from evalvault.adapters.outbound.nlp.korean.bm25_retriever import KoreanBM25Retriever
|
|
267
|
+
from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer
|
|
187
268
|
|
|
188
|
-
|
|
189
|
-
retriever =
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
embedding_profile=os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev"),
|
|
194
|
-
verbose=False,
|
|
195
|
-
)
|
|
196
|
-
if retriever is None:
|
|
197
|
-
return None, 0
|
|
269
|
+
tokenizer = KiwiTokenizer()
|
|
270
|
+
retriever = KoreanBM25Retriever(tokenizer=tokenizer)
|
|
271
|
+
retriever.index(list(_RAG_TEXTS))
|
|
272
|
+
if tokens and len(tokens) == len(_RAG_TEXTS):
|
|
273
|
+
retriever._tokenized_docs = tokens
|
|
198
274
|
|
|
199
275
|
_RAG_RETRIEVER = retriever
|
|
200
276
|
return retriever, _RAG_DOCS_COUNT
|
|
201
277
|
|
|
202
278
|
|
|
279
|
+
async def warm_rag_index() -> None:
    """Build the RAG retriever ahead of first use.

    Any exception raised while building is logged as a warning and not
    propagated.
    """
    try:
        await _get_rag_retriever()
    except Exception as preload_error:
        logger.warning("RAG preload failed: %s", preload_error)
|
|
284
|
+
|
|
285
|
+
|
|
203
286
|
async def _direct_chat_answer(user_text: str) -> str | None:
|
|
204
287
|
payload = {
|
|
205
288
|
"model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
|
|
@@ -351,15 +434,17 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
|
|
|
351
434
|
|
|
352
435
|
|
|
353
436
|
def _extract_json_content(result: Any) -> dict[str, Any] | None:
|
|
354
|
-
if isinstance(result, dict)
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
if
|
|
360
|
-
|
|
437
|
+
if isinstance(result, dict):
|
|
438
|
+
structured = result.get("structuredContent")
|
|
439
|
+
if isinstance(structured, dict):
|
|
440
|
+
return structured
|
|
441
|
+
else:
|
|
442
|
+
if hasattr(result, "structuredContent"):
|
|
443
|
+
payload = result.structuredContent
|
|
444
|
+
if isinstance(payload, dict):
|
|
445
|
+
return payload
|
|
361
446
|
|
|
362
|
-
if hasattr(result, "content"):
|
|
447
|
+
if not isinstance(result, dict) and hasattr(result, "content"):
|
|
363
448
|
content = result.content
|
|
364
449
|
elif isinstance(result, dict):
|
|
365
450
|
content = result.get("content")
|
|
@@ -385,17 +470,6 @@ def _extract_json_content(result: Any) -> dict[str, Any] | None:
|
|
|
385
470
|
return None
|
|
386
471
|
if isinstance(parsed, dict):
|
|
387
472
|
return parsed
|
|
388
|
-
else:
|
|
389
|
-
item_type = getattr(item, "type", None)
|
|
390
|
-
if item_type == "text":
|
|
391
|
-
text = getattr(item, "text", None)
|
|
392
|
-
if isinstance(text, str):
|
|
393
|
-
try:
|
|
394
|
-
parsed = json.loads(text)
|
|
395
|
-
except Exception:
|
|
396
|
-
return None
|
|
397
|
-
if isinstance(parsed, dict):
|
|
398
|
-
return parsed
|
|
399
473
|
return None
|
|
400
474
|
|
|
401
475
|
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import asdict
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from fastapi import APIRouter, Depends, HTTPException, Request
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from evalvault.adapters.inbound.mcp import tools as mcp_tools
|
|
11
|
+
from evalvault.config.settings import Settings, get_settings
|
|
12
|
+
|
|
13
|
+
router = APIRouter(tags=["mcp"])
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class JsonRpcRequest(BaseModel):
    """Request body accepted by the MCP endpoint."""

    # Only the literal version string "2.0" passes validation.
    jsonrpc: str = Field(default="2.0", pattern=r"^2\.0$")
    # Request identifier; optional, defaults to None.
    id: int | str | None = None
    # Name of the method to invoke, e.g. "tools/call".
    method: str
    # Optional method parameters.
    params: dict[str, Any] | None = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _normalize_tokens(raw_tokens: str | None) -> set[str]:
|
|
24
|
+
if not raw_tokens:
|
|
25
|
+
return set()
|
|
26
|
+
return {token.strip() for token in raw_tokens.split(",") if token.strip()}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _require_mcp_token(
    request: Request,
    settings: Settings = Depends(get_settings),
) -> None:
    """Authorize an MCP request by its bearer token.

    The accepted tokens come from ``settings.mcp_auth_tokens``, falling back to
    ``settings.api_auth_tokens`` when the former yields none.

    Args:
        request: Incoming request whose ``Authorization`` header is inspected.
        settings: Application settings.

    Raises:
        HTTPException: 404 when MCP is disabled; 401 when no tokens are
            configured, the header is not a bearer credential, or the presented
            token matches none of the configured tokens.
    """
    import hmac

    if not settings.mcp_enabled:
        raise HTTPException(status_code=404, detail="MCP is disabled")

    tokens = _normalize_tokens(settings.mcp_auth_tokens) or _normalize_tokens(
        settings.api_auth_tokens
    )
    if not tokens:
        raise HTTPException(status_code=401, detail="MCP auth tokens are required")

    auth_header = request.headers.get("Authorization", "")
    if not auth_header.lower().startswith("bearer "):
        raise HTTPException(status_code=401, detail="Invalid or missing MCP token")

    # Compare as bytes in constant time: compare_digest rejects non-ASCII str,
    # and a plain set lookup can leak timing information about the secret.
    presented = auth_header[7:].strip().encode("utf-8")
    authorized = False
    for expected in tokens:
        # No short-circuit: every configured token is always compared.
        authorized |= hmac.compare_digest(presented, expected.encode("utf-8"))
    if not authorized:
        raise HTTPException(status_code=401, detail="Invalid or missing MCP token")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _tool_registry() -> dict[str, Any]:
    """Return the mapping from MCP tool name to its ``mcp_tools`` attribute."""
    tool_names = (
        "list_runs",
        "get_run_summary",
        "run_evaluation",
        "analyze_compare",
        "get_artifacts",
    )
    # Each tool is exposed under the same name it has in ``mcp_tools``.
    return {tool_name: getattr(mcp_tools, tool_name) for tool_name in tool_names}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _allowed_tools(settings: Settings) -> set[str]:
|
|
59
|
+
if settings.mcp_allowed_tools:
|
|
60
|
+
return {name.strip() for name in settings.mcp_allowed_tools.split(",") if name.strip()}
|
|
61
|
+
return set(_tool_registry().keys())
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _serialize_result(result: Any) -> Any:
|
|
65
|
+
if hasattr(result, "model_dump"):
|
|
66
|
+
return result.model_dump()
|
|
67
|
+
if hasattr(result, "dict"):
|
|
68
|
+
return result.dict()
|
|
69
|
+
try:
|
|
70
|
+
return asdict(result)
|
|
71
|
+
except TypeError:
|
|
72
|
+
return result
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _jsonrpc_result(rpc_id: int | str | None, payload: Any) -> dict[str, Any]:
|
|
76
|
+
return {"jsonrpc": "2.0", "id": rpc_id, "result": payload}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _jsonrpc_error(rpc_id: int | str | None, code: int, message: str) -> dict[str, Any]:
|
|
80
|
+
return {"jsonrpc": "2.0", "id": rpc_id, "error": {"code": code, "message": message}}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@router.post("")
def handle_mcp_request(
    request: JsonRpcRequest,
    settings: Settings = Depends(get_settings),
    _: None = Depends(_require_mcp_token),
) -> dict[str, Any]:
    """Dispatch one JSON-RPC request to the matching MCP handler.

    Handled methods: ``initialize``, ``initialized`` /
    ``notifications/initialized``, ``tools/list``, ``tools/call`` and ``ping``.
    Failures are reported as JSON-RPC error envelopes rather than raised.

    Args:
        request: Parsed JSON-RPC request.
        settings: Application settings (protocol/server version, allow-list).
        _: Placeholder for the authentication dependency; unused.

    Returns:
        A JSON-RPC 2.0 result or error envelope.
    """
    method = request.method
    params = request.params or {}

    if method == "initialize":
        return _jsonrpc_result(
            request.id,
            {
                "protocolVersion": settings.mcp_protocol_version,
                "serverInfo": {
                    "name": "evalvault-mcp",
                    "version": settings.mcp_server_version,
                },
                "capabilities": {"tools": {"listChanged": False}},
            },
        )

    if method in {"initialized", "notifications/initialized"}:
        return _jsonrpc_result(request.id, None)

    if method == "tools/list":
        allowed = _allowed_tools(settings)
        tools = [tool for tool in mcp_tools.get_tool_specs() if tool.get("name") in allowed]
        return _jsonrpc_result(request.id, {"tools": tools})

    if method == "tools/call":
        tool_name = params.get("name")
        if not tool_name:
            return _jsonrpc_error(request.id, -32602, "Missing tool name")
        # A list/dict name is unhashable and would crash the membership test below.
        if not isinstance(tool_name, str):
            return _jsonrpc_error(request.id, -32602, "Tool name must be a string")

        tool_args = params.get("arguments") or {}
        if not isinstance(tool_args, dict):
            return _jsonrpc_error(request.id, -32602, "Tool arguments must be an object")

        allowed = _allowed_tools(settings)
        if tool_name not in allowed:
            return _jsonrpc_error(request.id, -32601, "Tool not allowed")

        tool_fn = _tool_registry().get(tool_name)
        if tool_fn is None:
            return _jsonrpc_error(request.id, -32601, "Tool not found")

        try:
            result = tool_fn(tool_args)
        except Exception as exc:
            return _jsonrpc_error(request.id, -32000, f"Tool execution failed: {exc}")

        payload = _serialize_result(result)
        try:
            payload_text = json.dumps(payload, ensure_ascii=False)
        except (TypeError, ValueError) as exc:
            # Report an unserialisable tool result as a JSON-RPC internal error
            # instead of letting it escape as an unhandled exception.
            return _jsonrpc_error(
                request.id, -32603, f"Tool result is not JSON serializable: {exc}"
            )
        return _jsonrpc_result(
            request.id,
            {
                "content": [{"type": "text", "text": payload_text}],
                "structuredContent": payload,
            },
        )

    if method == "ping":
        return _jsonrpc_result(request.id, {"status": "ok"})

    return _jsonrpc_error(request.id, -32601, "Method not found")
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import json
|
|
4
4
|
from abc import ABC, abstractmethod
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
from evalvault.domain.entities.dataset import Dataset
|
|
8
9
|
|
|
@@ -118,6 +119,45 @@ class BaseDatasetLoader(ABC):
|
|
|
118
119
|
# Fall back to pipe-separated format
|
|
119
120
|
return [ctx.strip() for ctx in contexts_str.split("|")]
|
|
120
121
|
|
|
122
|
+
def _parse_metadata_cell(self, raw: Any) -> dict[str, Any]:
|
|
123
|
+
if raw is None or (isinstance(raw, float) and str(raw) == "nan"):
|
|
124
|
+
return {}
|
|
125
|
+
text = str(raw).strip()
|
|
126
|
+
if not text:
|
|
127
|
+
return {}
|
|
128
|
+
try:
|
|
129
|
+
parsed = json.loads(text)
|
|
130
|
+
except json.JSONDecodeError as exc:
|
|
131
|
+
raise ValueError("Invalid metadata JSON") from exc
|
|
132
|
+
if not isinstance(parsed, dict):
|
|
133
|
+
raise ValueError("metadata must be a JSON object")
|
|
134
|
+
return parsed
|
|
135
|
+
|
|
136
|
+
def _parse_summary_tags_cell(self, raw: Any) -> list[str]:
|
|
137
|
+
if raw is None or (isinstance(raw, float) and str(raw) == "nan"):
|
|
138
|
+
return []
|
|
139
|
+
if isinstance(raw, list):
|
|
140
|
+
return [str(item).strip().lower() for item in raw if str(item).strip()]
|
|
141
|
+
text = str(raw).strip()
|
|
142
|
+
if not text:
|
|
143
|
+
return []
|
|
144
|
+
if text.startswith("["):
|
|
145
|
+
try:
|
|
146
|
+
parsed = json.loads(text)
|
|
147
|
+
except json.JSONDecodeError:
|
|
148
|
+
parsed = None
|
|
149
|
+
if isinstance(parsed, list):
|
|
150
|
+
return [str(item).strip().lower() for item in parsed if str(item).strip()]
|
|
151
|
+
delimiter = "," if "," in text else "|" if "|" in text else None
|
|
152
|
+
parts = text.split(delimiter) if delimiter else [text]
|
|
153
|
+
return [part.strip().lower() for part in parts if part.strip()]
|
|
154
|
+
|
|
155
|
+
def _parse_summary_intent_cell(self, raw: Any) -> str | None:
|
|
156
|
+
if raw is None or (isinstance(raw, float) and str(raw) == "nan"):
|
|
157
|
+
return None
|
|
158
|
+
text = str(raw).strip()
|
|
159
|
+
return text or None
|
|
160
|
+
|
|
121
161
|
def _get_default_name(self, file_path: Path) -> str:
|
|
122
162
|
"""Get default dataset name from file path.
|
|
123
163
|
|
|
@@ -123,12 +123,28 @@ class CSVDatasetLoader(BaseDatasetLoader):
|
|
|
123
123
|
else None
|
|
124
124
|
)
|
|
125
125
|
|
|
126
|
+
metadata = {}
|
|
127
|
+
if "metadata" in df.columns:
|
|
128
|
+
try:
|
|
129
|
+
metadata = self._parse_metadata_cell(row["metadata"])
|
|
130
|
+
except ValueError as exc:
|
|
131
|
+
raise ValueError(f"Test case {row['id']}: {exc}") from exc
|
|
132
|
+
if "summary_tags" in df.columns:
|
|
133
|
+
tags = self._parse_summary_tags_cell(row["summary_tags"])
|
|
134
|
+
if tags:
|
|
135
|
+
metadata["summary_tags"] = tags
|
|
136
|
+
if "summary_intent" in df.columns:
|
|
137
|
+
intent = self._parse_summary_intent_cell(row["summary_intent"])
|
|
138
|
+
if intent:
|
|
139
|
+
metadata["summary_intent"] = intent
|
|
140
|
+
|
|
126
141
|
test_case = TestCase(
|
|
127
142
|
id=str(row["id"]),
|
|
128
143
|
question=str(row["question"]),
|
|
129
144
|
answer=str(row["answer"]),
|
|
130
145
|
contexts=contexts,
|
|
131
146
|
ground_truth=ground_truth,
|
|
147
|
+
metadata=metadata,
|
|
132
148
|
)
|
|
133
149
|
test_cases.append(test_case)
|
|
134
150
|
|
|
@@ -96,12 +96,28 @@ class ExcelDatasetLoader(BaseDatasetLoader):
|
|
|
96
96
|
else None
|
|
97
97
|
)
|
|
98
98
|
|
|
99
|
+
metadata = {}
|
|
100
|
+
if "metadata" in df.columns:
|
|
101
|
+
try:
|
|
102
|
+
metadata = self._parse_metadata_cell(row["metadata"])
|
|
103
|
+
except ValueError as exc:
|
|
104
|
+
raise ValueError(f"Test case {row['id']}: {exc}") from exc
|
|
105
|
+
if "summary_tags" in df.columns:
|
|
106
|
+
tags = self._parse_summary_tags_cell(row["summary_tags"])
|
|
107
|
+
if tags:
|
|
108
|
+
metadata["summary_tags"] = tags
|
|
109
|
+
if "summary_intent" in df.columns:
|
|
110
|
+
intent = self._parse_summary_intent_cell(row["summary_intent"])
|
|
111
|
+
if intent:
|
|
112
|
+
metadata["summary_intent"] = intent
|
|
113
|
+
|
|
99
114
|
test_case = TestCase(
|
|
100
115
|
id=str(row["id"]),
|
|
101
116
|
question=str(row["question"]),
|
|
102
117
|
answer=str(row["answer"]),
|
|
103
118
|
contexts=contexts,
|
|
104
119
|
ground_truth=ground_truth,
|
|
120
|
+
metadata=metadata,
|
|
105
121
|
)
|
|
106
122
|
test_cases.append(test_case)
|
|
107
123
|
|
evalvault/config/settings.py
CHANGED
|
@@ -424,9 +424,13 @@ def apply_profile(settings: Settings, profile_name: str) -> Settings:
|
|
|
424
424
|
"""
|
|
425
425
|
from evalvault.config.model_config import get_model_config
|
|
426
426
|
|
|
427
|
+
normalized = profile_name.strip() if isinstance(profile_name, str) else profile_name
|
|
428
|
+
if not normalized:
|
|
429
|
+
return settings
|
|
430
|
+
|
|
427
431
|
try:
|
|
428
432
|
model_config = get_model_config()
|
|
429
|
-
profile = model_config.get_profile(
|
|
433
|
+
profile = model_config.get_profile(normalized)
|
|
430
434
|
|
|
431
435
|
# LLM 설정 적용 (모델명과 provider만)
|
|
432
436
|
settings.llm_provider = profile.llm.provider
|
|
@@ -449,9 +453,16 @@ def apply_profile(settings: Settings, profile_name: str) -> Settings:
|
|
|
449
453
|
elif profile.embedding.provider == "vllm":
|
|
450
454
|
settings.vllm_embedding_model = profile.embedding.model
|
|
451
455
|
|
|
452
|
-
except FileNotFoundError:
|
|
453
|
-
|
|
454
|
-
|
|
456
|
+
except FileNotFoundError as exc:
|
|
457
|
+
raise ValueError(
|
|
458
|
+
"Model profile config not found. Create 'config/models.yaml' or 'evalvault.yaml' "
|
|
459
|
+
f"to use profile '{normalized}'."
|
|
460
|
+
) from exc
|
|
461
|
+
except KeyError as exc:
|
|
462
|
+
available = ", ".join(sorted(model_config.profiles.keys()))
|
|
463
|
+
raise ValueError(
|
|
464
|
+
f"Unknown profile '{normalized}'. Available profiles: {available}"
|
|
465
|
+
) from exc
|
|
455
466
|
|
|
456
467
|
return settings
|
|
457
468
|
|
|
@@ -60,18 +60,16 @@ class StageEvent:
|
|
|
60
60
|
|
|
61
61
|
@classmethod
|
|
62
62
|
def from_dict(cls, payload: dict[str, Any]) -> StageEvent:
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
if "stage_type" not in payload:
|
|
66
|
-
raise ValueError("StageEvent requires 'stage_type'")
|
|
63
|
+
run_id = _require_str(payload, "run_id")
|
|
64
|
+
stage_type = _normalize_stage_type(payload)
|
|
67
65
|
|
|
68
66
|
trace_payload = payload.get("trace") or {}
|
|
69
67
|
input_ref = _parse_payload_ref(payload.get("input_ref"))
|
|
70
68
|
output_ref = _parse_payload_ref(payload.get("output_ref"))
|
|
71
69
|
|
|
72
70
|
return cls(
|
|
73
|
-
run_id=
|
|
74
|
-
stage_type=
|
|
71
|
+
run_id=run_id,
|
|
72
|
+
stage_type=stage_type,
|
|
75
73
|
stage_id=str(payload.get("stage_id") or uuid4()),
|
|
76
74
|
stage_name=_optional_str(payload.get("stage_name")),
|
|
77
75
|
parent_stage_id=_optional_str(payload.get("parent_stage_id")),
|
|
@@ -187,6 +185,24 @@ def _parse_datetime(value: Any) -> datetime | None:
|
|
|
187
185
|
raise ValueError("Invalid datetime value")
|
|
188
186
|
|
|
189
187
|
|
|
188
|
+
def _require_str(payload: dict[str, Any], key: str) -> str:
|
|
189
|
+
if key not in payload:
|
|
190
|
+
raise ValueError(f"StageEvent requires '{key}'")
|
|
191
|
+
value = str(payload.get(key, "")).strip()
|
|
192
|
+
if not value:
|
|
193
|
+
raise ValueError(f"StageEvent requires non-empty '{key}'")
|
|
194
|
+
return value
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _normalize_stage_type(payload: dict[str, Any]) -> str:
|
|
198
|
+
if "stage_type" not in payload:
|
|
199
|
+
raise ValueError("StageEvent requires 'stage_type'")
|
|
200
|
+
value = str(payload.get("stage_type", "")).strip()
|
|
201
|
+
if not value:
|
|
202
|
+
raise ValueError("StageEvent requires non-empty 'stage_type'")
|
|
203
|
+
return value.lower()
|
|
204
|
+
|
|
205
|
+
|
|
190
206
|
@overload
|
|
191
207
|
def _ensure_dict(value: None, *, allow_none: Literal[True]) -> None: ...
|
|
192
208
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: evalvault
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.68.0
|
|
4
4
|
Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
|
|
5
5
|
Project-URL: Homepage, https://github.com/ntts9990/EvalVault
|
|
6
6
|
Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
|
|
@@ -6,13 +6,14 @@ evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
|
|
|
6
6
|
evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
|
|
7
7
|
evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
|
|
8
8
|
evalvault/adapters/inbound/api/adapter.py,sha256=HgWSYyUxvJPlaSG158WVzpPckpPCYV9Ec3CWN8rLFdI,69118
|
|
9
|
-
evalvault/adapters/inbound/api/main.py,sha256=
|
|
9
|
+
evalvault/adapters/inbound/api/main.py,sha256=RPcstctf_mFH9TPUhld6plA0104Kb6Iccb6Cu26oFR8,7271
|
|
10
10
|
evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
|
|
11
11
|
evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
|
|
12
|
-
evalvault/adapters/inbound/api/routers/chat.py,sha256=
|
|
12
|
+
evalvault/adapters/inbound/api/routers/chat.py,sha256=hCA6rWr5GT_gCqu75uCqYwy2gOEUd85mlcc5y-ruFTY,20661
|
|
13
13
|
evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
|
|
14
14
|
evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
|
|
15
15
|
evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
|
|
16
|
+
evalvault/adapters/inbound/api/routers/mcp.py,sha256=yHANV7qIXig-7YSiQgXzSTuabqFStH5yT3URyQGY2W4,4764
|
|
16
17
|
evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPee4OMwfxVdvRWHJ2_qYBF0,17175
|
|
17
18
|
evalvault/adapters/inbound/api/routers/runs.py,sha256=rydOvwWk24QIYafu3XYS3oL_VVCE_jHDmjADhA19T1s,40059
|
|
18
19
|
evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
|
|
@@ -117,9 +118,9 @@ evalvault/adapters/outbound/cache/__init__.py,sha256=LcsKzxnx1AnAwS07iSCdws11CfE
|
|
|
117
118
|
evalvault/adapters/outbound/cache/hybrid_cache.py,sha256=AVhctQVOIbQWwvn_K0kxSq3lkhucuM7tezmSkPDbCrA,12711
|
|
118
119
|
evalvault/adapters/outbound/cache/memory_cache.py,sha256=jvjIgXp7YRj08_AzBFaJ58jjXNzUlYbG_zX6fQJP4C0,3533
|
|
119
120
|
evalvault/adapters/outbound/dataset/__init__.py,sha256=SDFnjmieEgz0uH5MpdXx8pmjnIMjRLkMFmFioMxCju0,1183
|
|
120
|
-
evalvault/adapters/outbound/dataset/base.py,sha256=
|
|
121
|
-
evalvault/adapters/outbound/dataset/csv_loader.py,sha256=
|
|
122
|
-
evalvault/adapters/outbound/dataset/excel_loader.py,sha256=
|
|
121
|
+
evalvault/adapters/outbound/dataset/base.py,sha256=4rxpQgxpFty0G5XRv1SP-XJ9mpZ9YO6PAMDgp71JiJQ,5547
|
|
122
|
+
evalvault/adapters/outbound/dataset/csv_loader.py,sha256=xHg2QadMvLfHTHzeex6WxXmagLJog3LN-ui6dFxD8HY,5595
|
|
123
|
+
evalvault/adapters/outbound/dataset/excel_loader.py,sha256=MUl-63r1s1GjVVmDgdag1DpMJvIVX_agGx20NQzEZN8,4494
|
|
123
124
|
evalvault/adapters/outbound/dataset/json_loader.py,sha256=4wG7APg1LLADPxJ-wQZo2zBcvVX12sqo9VUIb-0Kww4,4923
|
|
124
125
|
evalvault/adapters/outbound/dataset/loader_factory.py,sha256=32sjGuW2Yta12lpKy4DLH4I5B4Pi-YuHTvGG1Pr4VAk,1361
|
|
125
126
|
evalvault/adapters/outbound/dataset/method_input_loader.py,sha256=d7pB4OPvvr-q-Y5DlvjX3X719jCCQ2vRDfT_ov0dUFU,3833
|
|
@@ -206,7 +207,7 @@ evalvault/config/langfuse_support.py,sha256=DEzVMfMGGf1V45W_2oUG-NCDfsYI4UUdnYJI
|
|
|
206
207
|
evalvault/config/model_config.py,sha256=KlzDbGyDLeOGE7ElekFFk5YjjT5u8i6KO2B4EyZkLnI,3542
|
|
207
208
|
evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHWyVyOH0,13604
|
|
208
209
|
evalvault/config/secret_manager.py,sha256=YjPMuNqeBrAR2BzCJvsBNUExaU4TBSFyZ8kVYZZifqA,4172
|
|
209
|
-
evalvault/config/settings.py,sha256=
|
|
210
|
+
evalvault/config/settings.py,sha256=DY170XUoMo8yQx8_CJjPt96QsGg7tyTx5wJ-ptcfdY0,18766
|
|
210
211
|
evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
|
|
211
212
|
evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
212
213
|
evalvault/domain/entities/__init__.py,sha256=wszRJ1Imdc5NJ1bQPC2udk-mAgFdlw4uZV5IPNjLpHQ,3669
|
|
@@ -227,7 +228,7 @@ evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KT
|
|
|
227
228
|
evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
|
|
228
229
|
evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
|
|
229
230
|
evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
|
|
230
|
-
evalvault/domain/entities/stage.py,sha256=
|
|
231
|
+
evalvault/domain/entities/stage.py,sha256=KyR-v3tyusPJ7pfTXtHE2_23tVvNSRU9Q1RT-R5akXg,7914
|
|
231
232
|
evalvault/domain/metrics/__init__.py,sha256=Ros3CWg5in1xlEdMa0WUSG602SBVkxw2Zbro-XUlmxU,1214
|
|
232
233
|
evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
|
|
233
234
|
evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
|
|
@@ -338,8 +339,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
|
|
|
338
339
|
evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
|
|
339
340
|
evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
|
|
340
341
|
evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
|
|
341
|
-
evalvault-1.
|
|
342
|
-
evalvault-1.
|
|
343
|
-
evalvault-1.
|
|
344
|
-
evalvault-1.
|
|
345
|
-
evalvault-1.
|
|
342
|
+
evalvault-1.68.0.dist-info/METADATA,sha256=bEWK-9BGROeWrWf3kNoGytr-GbAa2gzLCDZ1PwWBzEM,26159
|
|
343
|
+
evalvault-1.68.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
344
|
+
evalvault-1.68.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
|
|
345
|
+
evalvault-1.68.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
|
|
346
|
+
evalvault-1.68.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|