evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py

```diff
@@ -596,6 +596,14 @@ class WebUIAdapter:
         ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
         ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
             ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+            kind="ragas",
+            source="ragas",
+        )
+        custom_snapshots = tracker_meta.get("custom_prompt_snapshots")
+        custom_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            custom_snapshots if isinstance(custom_snapshots, dict) else None,
+            kind="custom",
+            source="custom_rules",
         )
         override_status: dict[str, str] = {}
         raw_override = tracker_meta.get("ragas_prompt_overrides")
@@ -618,6 +626,12 @@ class WebUIAdapter:
             if entry.role in existing_roles and override_status.get(entry.role) == "applied":
                 continue
             prompt_inputs.append(entry)
+        if custom_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
+            for entry in custom_snapshot_inputs:
+                if entry.role in existing_roles:
+                    continue
+                prompt_inputs.append(entry)
 
         prompt_bundle = None
         if prompt_inputs:
```
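The second hunk merges custom prompt snapshots into `prompt_inputs` while skipping roles that are already covered. A minimal standalone sketch of that role-deduplication pattern, using a hypothetical `PromptInput` stand-in for the real entry type (the class and function names here are illustrative, not the package's actual API):

```python
from dataclasses import dataclass


@dataclass
class PromptInput:
    # Hypothetical stand-in for the entries built by build_prompt_inputs_from_snapshots.
    role: str
    kind: str
    text: str


def merge_custom_snapshots(
    prompt_inputs: list[PromptInput],
    custom_snapshot_inputs: list[PromptInput],
) -> list[PromptInput]:
    # Roles already present among existing "custom" entries win; snapshot entries only fill gaps.
    existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
    for entry in custom_snapshot_inputs:
        if entry.role in existing_roles:
            continue
        prompt_inputs.append(entry)
    return prompt_inputs
```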
evalvault/adapters/inbound/api/main.py

```diff
@@ -152,7 +152,7 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    from .routers import benchmark, config, domain, knowledge, pipeline, runs
+    from .routers import benchmark, chat, config, domain, knowledge, mcp, pipeline, runs
 
     auth_dependencies = [Depends(require_api_token)]
 
@@ -162,6 +162,12 @@ def create_app() -> FastAPI:
         tags=["runs"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        chat.router,
+        prefix="/api/v1/chat",
+        tags=["chat"],
+        dependencies=auth_dependencies,
+    )
     app.include_router(
         benchmark.router,
         prefix="/api/v1/benchmarks",
@@ -192,6 +198,12 @@ def create_app() -> FastAPI:
         tags=["config"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        mcp.router,
+        prefix="/api/v1/mcp",
+        tags=["mcp"],
+        dependencies=auth_dependencies,
+    )
 
     @app.get("/health")
     def health_check():
@@ -209,9 +221,7 @@ def create_app() -> FastAPI:
     # Dependency to get the adapter
     def get_adapter(app: FastAPI) -> WebUIAdapter:
         """Dependency to retrieve the WebUIAdapter from app state."""
-
-        # unless we use Request. So we usually do:
-        pass
+        return app.state.adapter
 
 
     def get_web_adapter(request: Request) -> WebUIAdapter:
```
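The new `chat` router is mounted at `/api/v1/chat` behind the same `require_api_token` dependency as the other routers, and (per `routers/chat.py` below) exposes `POST /stream`, which returns newline-delimited JSON events. A rough client sketch, assuming a local server and a bearer-style token header; the base URL, token value, and header scheme are assumptions for illustration, not confirmed by this diff:

```python
import json

import httpx

# Assumed values for illustration only.
BASE_URL = "http://localhost:8000"
API_TOKEN = "your-api-token"


def stream_chat(message: str) -> None:
    """POST to /api/v1/chat/stream and print each NDJSON event as it arrives."""
    with httpx.stream(
        "POST",
        f"{BASE_URL}/api/v1/chat/stream",
        json={"message": message},
        headers={"Authorization": f"Bearer {API_TOKEN}"},
        timeout=60,
    ) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            # Events carry a "type" of status, delta, final, or error (see chat.py below).
            event = json.loads(line)
            print(event)


if __name__ == "__main__":
    stream_chat("List the recent evaluation runs")
```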
evalvault/adapters/inbound/api/routers/chat.py (new file)

```diff
@@ -0,0 +1,543 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+import time
+from collections.abc import AsyncGenerator
+from pathlib import Path
+from typing import Any
+
+import httpx
+from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
+router = APIRouter(tags=["chat"])
+
+MCP_URL = os.getenv("EVALVAULT_MCP_URL", "http://localhost:8000/api/v1/mcp")
+MCP_TOKEN = os.getenv("EVALVAULT_MCP_TOKEN", "mcp-local-dev-token")
+
+_RAG_RETRIEVER = None
+_RAG_DOCS_COUNT = 0
+_RAG_TEXTS: list[str] = []
+_RAG_INITIALIZED = False
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    message: str = Field(..., min_length=1)
+    history: list[ChatMessage] | None = None
+
+
+def _extract_run_ids(text: str) -> list[str]:
+    return re.findall(r"run_[A-Za-z0-9_-]+", text)
+
+
+def _format_tool_result(result: Any) -> str:
+    if isinstance(result, dict):
+        if "result" in result:
+            return str(result["result"])
+        if "error" in result:
+            return f"오류: {result['error']}"
+    return str(result)
+
+
+def _summarize_runs(payload: dict[str, Any]) -> str:
+    runs = payload.get("runs") or []
+    if not runs:
+        return "실행 기록이 없습니다."
+    lines = ["최근 실행 목록:"]
+    for run in runs[:10]:
+        lines.append(
+            "- {run_id} | {dataset} | {model} | pass={pass_rate:.2f}".format(
+                run_id=run.get("run_id"),
+                dataset=run.get("dataset_name"),
+                model=run.get("model_name"),
+                pass_rate=run.get("pass_rate", 0.0),
+            )
+        )
+    return "\n".join(lines)
+
+
+def _summarize_run_summary(payload: dict[str, Any]) -> str:
+    summary = payload.get("summary") or {}
+    if not summary:
+        return "요약 정보를 찾지 못했습니다."
+    return (
+        "요약: {run_id}\n"
+        "- dataset: {dataset}\n"
+        "- model: {model}\n"
+        "- pass_rate: {pass_rate:.2f}\n"
+        "- total: {total} / passed: {passed}\n"
+        "- metrics: {metrics}".format(
+            run_id=summary.get("run_id"),
+            dataset=summary.get("dataset_name"),
+            model=summary.get("model_name"),
+            pass_rate=summary.get("pass_rate", 0.0),
+            total=summary.get("total_test_cases"),
+            passed=summary.get("passed_test_cases"),
+            metrics=", ".join(summary.get("metrics_evaluated", []) or []),
+        )
+    )
+
+
+def _summarize_compare(payload: dict[str, Any]) -> str:
+    baseline = payload.get("baseline_run_id")
+    candidate = payload.get("candidate_run_id")
+    delta = payload.get("metrics_delta") or {}
+    avg = delta.get("avg") or {}
+    lines = [
+        f"비교 결과: {baseline} vs {candidate}",
+        "평균 변화:",
+    ]
+    for metric, value in avg.items():
+        lines.append(f"- {metric}: {value:+.4f}")
+    notes = delta.get("notes") or []
+    if notes:
+        lines.append("노트: " + "; ".join(notes))
+    return "\n".join(lines)
+
+
+def _summarize_artifacts(payload: dict[str, Any]) -> str:
+    artifacts = payload.get("artifacts") or {}
+    if not artifacts:
+        return "아티팩트 경로를 찾지 못했습니다."
+    return (
+        "아티팩트:\n"
+        f"- kind: {artifacts.get('kind')}\n"
+        f"- report: {artifacts.get('report_path')}\n"
+        f"- output: {artifacts.get('output_path')}\n"
+        f"- dir: {artifacts.get('artifacts_dir')}"
+    )
+
+
+def _summarize_result(tool_name: str, payload: dict[str, Any]) -> str:
+    if tool_name == "list_runs":
+        return _summarize_runs(payload)
+    if tool_name == "get_run_summary":
+        return _summarize_run_summary(payload)
+    if tool_name == "analyze_compare":
+        return _summarize_compare(payload)
+    if tool_name == "get_artifacts":
+        return _summarize_artifacts(payload)
+    return str(payload)
+
+
+def _load_text_files(root: Path, extensions: tuple[str, ...], limit: int) -> list[str]:
+    texts: list[str] = []
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        if path.suffix.lower() not in extensions:
+            continue
+        if limit and len(texts) >= limit:
+            break
+        try:
+            content = path.read_text(encoding="utf-8")
+        except Exception:
+            continue
+        if content.strip():
+            texts.append(content)
+    return texts
+
+
+async def _get_rag_retriever():
+    global _RAG_RETRIEVER
+    global _RAG_DOCS_COUNT
+    global _RAG_TEXTS
+    global _RAG_INITIALIZED
+
+    if _RAG_RETRIEVER is not None:
+        return _RAG_RETRIEVER, _RAG_DOCS_COUNT
+
+    if not _RAG_INITIALIZED:
+        docs_root = Path(os.getenv("EVALVAULT_RAG_DOCS", "docs"))
+        src_root = Path(os.getenv("EVALVAULT_RAG_SRC", "src"))
+        docs_limit = int(os.getenv("EVALVAULT_RAG_DOCS_LIMIT", "120"))
+        src_limit = int(os.getenv("EVALVAULT_RAG_SRC_LIMIT", "120"))
+
+        texts: list[str] = []
+        if docs_root.exists():
+            texts.extend(_load_text_files(docs_root, (".md", ".txt"), docs_limit))
+        if src_root.exists():
+            texts.extend(_load_text_files(src_root, (".py",), src_limit))
+
+        _RAG_TEXTS = texts
+        _RAG_DOCS_COUNT = len(texts)
+        _RAG_INITIALIZED = True
+
+    if not _RAG_TEXTS:
+        return None, 0
+
+    from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+    from evalvault.config.settings import Settings
+
+    settings = Settings()
+    ollama_adapter = OllamaAdapter(settings)
+    toolkit = try_create_korean_toolkit()
+    if toolkit is None:
+        return None, 0
+
+    use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+    retriever = toolkit.build_retriever(
+        documents=_RAG_TEXTS,
+        use_hybrid=use_hybrid,
+        ollama_adapter=ollama_adapter if use_hybrid else None,
+        embedding_profile=os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev"),
+        verbose=False,
+    )
+    if retriever is None:
+        return None, 0
+
+    _RAG_RETRIEVER = retriever
+    return retriever, _RAG_DOCS_COUNT
+
+
+async def _direct_chat_answer(user_text: str) -> str | None:
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
+    tokens = re.findall(r"[A-Za-z0-9가-힣]+", query.lower())
+    if not tokens:
+        return []
+    scored: list[tuple[int, str]] = []
+    for text in texts:
+        hay = text.lower()
+        score = sum(hay.count(token) for token in tokens)
+        if score:
+            scored.append((score, text))
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return [text for _, text in scored[:top_k]]
+
+
+async def _rag_answer(user_text: str) -> str | None:
+    retriever, _ = await _get_rag_retriever()
+    contexts: list[str] = []
+
+    if retriever is not None:
+        results = retriever.search(user_text, top_k=5)
+        for item in results:
+            context = getattr(item, "document", None)
+            if context:
+                contexts.append(context)
+
+    if not contexts and _RAG_TEXTS:
+        contexts = _simple_retrieve(_RAG_TEXTS, user_text, top_k=5)
+
+    if not contexts:
+        return None
+
+    if os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() != "true":
+        return "\n\n".join(contexts[:3])
+
+    prompt = (
+        "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
+        "컨텍스트만 근거로 사용해 한국어로 답하세요.\n\n"
+        "[컨텍스트]\n"
+        + "\n\n---\n\n".join(contexts[:3])
+        + "\n\n[질문]\n"
+        + user_text
+        + "\n\n[답변]"
+    )
+
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": prompt},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
+    headers = {
+        "Authorization": f"Bearer {MCP_TOKEN}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "jsonrpc": "2.0",
+        "id": 1,
+        "method": "tools/call",
+        "params": {"name": tool_name, "arguments": tool_args},
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(MCP_URL, json=payload, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+    return data
+
+
+async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+    ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")
+
+    system_prompt = (
+        "You are a router for EvalVault. "
+        "Return JSON only with keys: action, tool, arguments."
+        "Action must be one of: tool, rag, direct."
+        "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
+        "Rules:"
+        "- If user asks about datasets, prefer tool list_datasets."
+        "- If question is about EvalVault docs/usage, prefer rag."
+        "- If greeting or general chat, use direct."
+        "- For tool list_runs: arguments {limit:int}"
+        "- For tool get_run_summary: {run_id:string}"
+        "- For tool analyze_compare: {run_id_a:string, run_id_b:string}"
+        "- For tool run_evaluation: {dataset_path:string, metrics:[string], profile:string, auto_analyze:bool}"
+        "- For tool get_artifacts: {run_id:string, kind:'analysis'|'comparison'}"
+        "- For tool list_datasets: {limit:int}"
+    )
+
+    payload = {
+        "model": router_model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(f"{ollama_url}/api/chat", json=payload)
+        response.raise_for_status()
+        data = response.json()
+
+    content = data.get("message", {}).get("content", "").strip()
+    if not content:
+        return None
+
+    try:
+        return json.loads(content)
+    except Exception:
+        return None
+
+
+def _extract_json_content(result: Any) -> dict[str, Any] | None:
+    if isinstance(result, dict) and isinstance(result.get("structuredContent"), dict):
+        return result.get("structuredContent")
+
+    if hasattr(result, "structuredContent"):
+        payload = result.structuredContent
+        if isinstance(payload, dict):
+            return payload
+
+    if hasattr(result, "content"):
+        content = result.content
+    elif isinstance(result, dict):
+        content = result.get("content")
+    else:
+        content = None
+
+    if not isinstance(content, list):
+        return None
+
+    for item in content:
+        if isinstance(item, dict):
+            item_type = item.get("type")
+            if item_type == "json":
+                payload = item.get("json")
+                if isinstance(payload, dict):
+                    return payload
+            if item_type == "text":
+                text = item.get("text")
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+        else:
+            item_type = getattr(item, "type", None)
+            if item_type == "text":
+                text = getattr(item, "text", None)
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+    return None
+
+
+def _chunk_text(text: str, size: int = 42) -> list[str]:
+    if not text:
+        return []
+    return [text[i : i + size] for i in range(0, len(text), size)]
+
+
+def _event(payload: dict[str, Any]) -> str:
+    return json.dumps(payload, ensure_ascii=False) + "\n"
+
+
+async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
+    for chunk in _chunk_text(answer):
+        yield _event({"type": "delta", "content": chunk})
+        await asyncio.sleep(0)
+    yield _event({"type": "final", "content": answer})
+
+
+async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+    started_at = time.perf_counter()
+    if len(user_text) <= 4:
+        yield _event({"type": "final", "content": "안녕하세요! EvalVault 관련 질문을 해주세요."})
+        return
+
+    if len(user_text) <= 6:
+        yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "요청 분류 중..."})
+    try:
+        router = await asyncio.wait_for(_resolve_tool_with_llm(user_text), timeout=20)
+    except TimeoutError:
+        router = None
+    except Exception:
+        router = None
+
+    if not isinstance(router, dict):
+        router = None
+
+    if router is None:
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+            return
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+            return
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    action = router.get("action")
+    tool_name = router.get("tool")
+    tool_args = router.get("arguments", {})
+
+    if action == "direct":
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    if action == "rag":
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "관련 문서를 찾지 못했습니다."})
+        return
+
+    if action != "tool":
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    if not tool_name:
+        yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "도구 실행 중..."})
+    try:
+        result = await asyncio.wait_for(_call_mcp_tool(tool_name, tool_args), timeout=12)
+    except TimeoutError:
+        yield _event(
+            {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
+        )
+        return
+    except Exception as exc:
+        yield _event({"type": "error", "message": f"도구 호출 실패: {exc}"})
+        return
+
+    payload = _extract_json_content(result)
+    if isinstance(payload, dict):
+        summary = _summarize_result(tool_name, payload)
+        async for item in _emit_answer(summary):
+            yield item
+        return
+
+    if hasattr(result, "content"):
+        text = _format_tool_result(result.content)
+    else:
+        text = f"도구 실행 결과: {_format_tool_result(result)}"
+    async for item in _emit_answer(text):
+        yield item
+
+    elapsed_ms = (time.perf_counter() - started_at) * 1000
+    yield _event({"type": "status", "message": f"처리 완료 ({elapsed_ms:.0f}ms)"})
+
+
+@router.post("/stream")
+async def chat_stream(request: ChatRequest):
+    user_text = request.message.strip()
+    if not user_text:
+        return StreamingResponse(
+            iter([_event({"type": "error", "message": "질문을 입력해주세요."})]),
+            media_type="application/x-ndjson",
+        )
+
+    async def event_generator():
+        async for item in _chat_stream(user_text):
+            yield item
+
+    return StreamingResponse(event_generator(), media_type="application/x-ndjson")
```
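`_call_mcp_tool` drives the `/api/v1/mcp` endpoint (also registered in `main.py` above) with a plain JSON-RPC 2.0 `tools/call` request. A minimal sketch of the same call made directly with a synchronous client, using the module's default local URL and dev token; the `list_runs` arguments follow the `{limit:int}` rule from the router prompt:

```python
import httpx

MCP_URL = "http://localhost:8000/api/v1/mcp"   # default of EVALVAULT_MCP_URL
MCP_TOKEN = "mcp-local-dev-token"              # default of EVALVAULT_MCP_TOKEN

payload = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": {"name": "list_runs", "arguments": {"limit": 5}},
}

response = httpx.post(
    MCP_URL,
    json=payload,
    headers={"Authorization": f"Bearer {MCP_TOKEN}", "Content-Type": "application/json"},
    timeout=60,
)
response.raise_for_status()
# The chat router unwraps this with _extract_json_content(); a raw client sees either a
# structuredContent dict or a content list of {"type": "json" | "text", ...} items.
print(response.json())
```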
evalvault/adapters/inbound/cli/commands/__init__.py

```diff
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any
+from typing import Any
 
 import typer
 from rich.console import Console
@@ -12,8 +12,11 @@ from rich.console import Console
 from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
+from .artifacts import create_artifacts_app
 from .benchmark import create_benchmark_app
 from .calibrate import register_calibrate_commands
+from .calibrate_judge import register_calibrate_judge_commands
+from .compare import register_compare_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -25,19 +28,17 @@ from .init import register_init_command
 from .kg import create_kg_app
 from .langfuse import register_langfuse_commands
 from .method import create_method_app
+from .ops import create_ops_app
 from .phoenix import create_phoenix_app
 from .pipeline import register_pipeline_commands
+from .profile_difficulty import register_profile_difficulty_commands
 from .prompts import create_prompts_app
+from .regress import register_regress_commands
 from .run import register_run_commands
 from .stage import create_stage_app
 
 CommandFactory = Callable[[Console], typer.Typer]
-
-
-class CommandRegistrar(Protocol):
-    """Callable protocol for Typer command registrars."""
-
-    def __call__(self, app: typer.Typer, console: Console, **kwargs: Any) -> None: ...
+CommandRegistrar = Callable[..., Any]
 
 
 @dataclass(frozen=True)
@@ -61,10 +62,14 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_run_commands, needs_metrics=True),
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
+    CommandModule(register_compare_commands),
     CommandModule(register_analyze_commands),
     CommandModule(register_calibrate_commands),
+    CommandModule(register_calibrate_judge_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
+    CommandModule(register_profile_difficulty_commands, needs_metrics=True),
+    CommandModule(register_regress_commands),
     CommandModule(register_agent_commands),
     CommandModule(register_experiment_commands),
     CommandModule(register_config_commands),
@@ -78,9 +83,11 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
     SubAppModule("domain", create_domain_app),
     SubAppModule("benchmark", create_benchmark_app),
    SubAppModule("method", create_method_app),
+    SubAppModule("ops", create_ops_app),
     SubAppModule("phoenix", create_phoenix_app),
     SubAppModule("prompts", create_prompts_app),
     SubAppModule("stage", create_stage_app),
+    SubAppModule("artifacts", create_artifacts_app),
     SubAppModule("debug", create_debug_app),
 )
 
```
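`CommandRegistrar` is relaxed from a `Protocol` to a plain `Callable[..., Any]` alias, and the new registrars (`compare`, `calibrate_judge`, `profile_difficulty`, `regress`) plus the `ops` and `artifacts` sub-apps are wired in through `COMMAND_MODULES` and `SUB_APPLICATIONS`. A sketch of a registrar compatible with that registry; only the `(app, console, **kwargs)` call shape comes from the removed Protocol, while the command itself is a hypothetical example:

```python
import typer
from rich.console import Console


def register_example_commands(app: typer.Typer, console: Console, **kwargs) -> None:
    """Hypothetical registrar following the (app, console, **kwargs) shape expected by COMMAND_MODULES."""

    @app.command("example")
    def example() -> None:
        # A registrar attaches one or more Typer commands to the shared CLI app.
        console.print("registered via a CommandModule entry")
```

Such a function would be listed as `CommandModule(register_example_commands)` alongside the entries shown in the diff above.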