evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  5. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  6. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  7. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  8. evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
  9. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  10. evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
  11. evalvault/config/settings.py +21 -0
  12. evalvault/domain/entities/prompt.py +1 -1
  13. evalvault/domain/metrics/__init__.py +8 -0
  14. evalvault/domain/metrics/registry.py +39 -3
  15. evalvault/domain/metrics/summary_accuracy.py +189 -0
  16. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  17. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  18. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  19. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  20. evalvault/domain/services/evaluator.py +280 -27
  21. evalvault/domain/services/prompt_registry.py +39 -10
  22. evalvault/domain/services/threshold_profiles.py +4 -0
  23. evalvault/domain/services/visual_space_service.py +79 -4
  24. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  25. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
  26. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py
@@ -596,6 +596,14 @@ class WebUIAdapter:
         ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
         ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
             ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+            kind="ragas",
+            source="ragas",
+        )
+        custom_snapshots = tracker_meta.get("custom_prompt_snapshots")
+        custom_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            custom_snapshots if isinstance(custom_snapshots, dict) else None,
+            kind="custom",
+            source="custom_rules",
         )
         override_status: dict[str, str] = {}
         raw_override = tracker_meta.get("ragas_prompt_overrides")
@@ -618,6 +626,12 @@ class WebUIAdapter:
                 if entry.role in existing_roles and override_status.get(entry.role) == "applied":
                     continue
                 prompt_inputs.append(entry)
+        if custom_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
+            for entry in custom_snapshot_inputs:
+                if entry.role in existing_roles:
+                    continue
+                prompt_inputs.append(entry)
 
         prompt_bundle = None
         if prompt_inputs:
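Both hunks above (and the matching CLI hunk in run.py further down) apply the same merge rule: custom prompt-snapshot entries are appended only when no entry with the same role is already present for that kind. A minimal sketch of that dedup step, assuming a prompt-input entry with `role` and `kind` attributes (the concrete entry type is not part of this diff):

```python
from dataclasses import dataclass


@dataclass
class PromptInput:
    """Hypothetical stand-in for the prompt-input entries built from snapshots."""

    role: str
    kind: str  # "ragas" or "custom"
    source: str
    text: str = ""


def merge_custom_snapshot_inputs(
    prompt_inputs: list[PromptInput],
    custom_snapshot_inputs: list[PromptInput],
) -> list[PromptInput]:
    # Skip any role that already has a "custom" entry, then append the rest.
    existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
    for entry in custom_snapshot_inputs:
        if entry.role in existing_roles:
            continue
        prompt_inputs.append(entry)
    return prompt_inputs
```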
evalvault/adapters/inbound/api/main.py
@@ -152,7 +152,7 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    from .routers import benchmark, config, domain, knowledge, pipeline, runs
+    from .routers import benchmark, chat, config, domain, knowledge, mcp, pipeline, runs
 
     auth_dependencies = [Depends(require_api_token)]
 
@@ -162,6 +162,12 @@ def create_app() -> FastAPI:
         tags=["runs"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        chat.router,
+        prefix="/api/v1/chat",
+        tags=["chat"],
+        dependencies=auth_dependencies,
+    )
     app.include_router(
         benchmark.router,
         prefix="/api/v1/benchmarks",
@@ -192,6 +198,12 @@ def create_app() -> FastAPI:
         tags=["config"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        mcp.router,
+        prefix="/api/v1/mcp",
+        tags=["mcp"],
+        dependencies=auth_dependencies,
+    )
 
     @app.get("/health")
     def health_check():
@@ -209,9 +221,7 @@ def create_app() -> FastAPI:
 # Dependency to get the adapter
 def get_adapter(app: FastAPI) -> WebUIAdapter:
     """Dependency to retrieve the WebUIAdapter from app state."""
-    # When using Depends(), we can't easily access 'app' directly in standard dependency signature
-    # unless we use Request. So we usually do:
-    pass
+    return app.state.adapter
 
 
 def get_web_adapter(request: Request) -> WebUIAdapter:
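The `get_adapter` stub above now returns the adapter stored on `app.state`; the request-scoped `get_web_adapter` (whose body is outside this hunk) presumably resolves it the same way through `Request`. A minimal sketch of that dependency pattern, with a hypothetical endpoint for illustration:

```python
from fastapi import APIRouter, Depends, Request

router = APIRouter()


def get_web_adapter(request: Request):
    # Assumption: the WebUIAdapter is attached to app.state at startup,
    # mirroring get_adapter() returning app.state.adapter.
    return request.app.state.adapter


@router.get("/example")  # hypothetical route, not part of the package
def example(adapter=Depends(get_web_adapter)):
    return {"adapter": type(adapter).__name__}
```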
evalvault/adapters/inbound/api/routers/chat.py
@@ -0,0 +1,543 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+import time
+from collections.abc import AsyncGenerator
+from pathlib import Path
+from typing import Any
+
+import httpx
+from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
+router = APIRouter(tags=["chat"])
+
+MCP_URL = os.getenv("EVALVAULT_MCP_URL", "http://localhost:8000/api/v1/mcp")
+MCP_TOKEN = os.getenv("EVALVAULT_MCP_TOKEN", "mcp-local-dev-token")
+
+_RAG_RETRIEVER = None
+_RAG_DOCS_COUNT = 0
+_RAG_TEXTS: list[str] = []
+_RAG_INITIALIZED = False
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    message: str = Field(..., min_length=1)
+    history: list[ChatMessage] | None = None
+
+
+def _extract_run_ids(text: str) -> list[str]:
+    return re.findall(r"run_[A-Za-z0-9_-]+", text)
+
+
+def _format_tool_result(result: Any) -> str:
+    if isinstance(result, dict):
+        if "result" in result:
+            return str(result["result"])
+        if "error" in result:
+            return f"오류: {result['error']}"
+    return str(result)
+
+
+def _summarize_runs(payload: dict[str, Any]) -> str:
+    runs = payload.get("runs") or []
+    if not runs:
+        return "실행 기록이 없습니다."
+    lines = ["최근 실행 목록:"]
+    for run in runs[:10]:
+        lines.append(
+            "- {run_id} | {dataset} | {model} | pass={pass_rate:.2f}".format(
+                run_id=run.get("run_id"),
+                dataset=run.get("dataset_name"),
+                model=run.get("model_name"),
+                pass_rate=run.get("pass_rate", 0.0),
+            )
+        )
+    return "\n".join(lines)
+
+
+def _summarize_run_summary(payload: dict[str, Any]) -> str:
+    summary = payload.get("summary") or {}
+    if not summary:
+        return "요약 정보를 찾지 못했습니다."
+    return (
+        "요약: {run_id}\n"
+        "- dataset: {dataset}\n"
+        "- model: {model}\n"
+        "- pass_rate: {pass_rate:.2f}\n"
+        "- total: {total} / passed: {passed}\n"
+        "- metrics: {metrics}".format(
+            run_id=summary.get("run_id"),
+            dataset=summary.get("dataset_name"),
+            model=summary.get("model_name"),
+            pass_rate=summary.get("pass_rate", 0.0),
+            total=summary.get("total_test_cases"),
+            passed=summary.get("passed_test_cases"),
+            metrics=", ".join(summary.get("metrics_evaluated", []) or []),
+        )
+    )
+
+
+def _summarize_compare(payload: dict[str, Any]) -> str:
+    baseline = payload.get("baseline_run_id")
+    candidate = payload.get("candidate_run_id")
+    delta = payload.get("metrics_delta") or {}
+    avg = delta.get("avg") or {}
+    lines = [
+        f"비교 결과: {baseline} vs {candidate}",
+        "평균 변화:",
+    ]
+    for metric, value in avg.items():
+        lines.append(f"- {metric}: {value:+.4f}")
+    notes = delta.get("notes") or []
+    if notes:
+        lines.append("노트: " + "; ".join(notes))
+    return "\n".join(lines)
+
+
+def _summarize_artifacts(payload: dict[str, Any]) -> str:
+    artifacts = payload.get("artifacts") or {}
+    if not artifacts:
+        return "아티팩트 경로를 찾지 못했습니다."
+    return (
+        "아티팩트:\n"
+        f"- kind: {artifacts.get('kind')}\n"
+        f"- report: {artifacts.get('report_path')}\n"
+        f"- output: {artifacts.get('output_path')}\n"
+        f"- dir: {artifacts.get('artifacts_dir')}"
+    )
+
+
+def _summarize_result(tool_name: str, payload: dict[str, Any]) -> str:
+    if tool_name == "list_runs":
+        return _summarize_runs(payload)
+    if tool_name == "get_run_summary":
+        return _summarize_run_summary(payload)
+    if tool_name == "analyze_compare":
+        return _summarize_compare(payload)
+    if tool_name == "get_artifacts":
+        return _summarize_artifacts(payload)
+    return str(payload)
+
+
+def _load_text_files(root: Path, extensions: tuple[str, ...], limit: int) -> list[str]:
+    texts: list[str] = []
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        if path.suffix.lower() not in extensions:
+            continue
+        if limit and len(texts) >= limit:
+            break
+        try:
+            content = path.read_text(encoding="utf-8")
+        except Exception:
+            continue
+        if content.strip():
+            texts.append(content)
+    return texts
+
+
+async def _get_rag_retriever():
+    global _RAG_RETRIEVER
+    global _RAG_DOCS_COUNT
+    global _RAG_TEXTS
+    global _RAG_INITIALIZED
+
+    if _RAG_RETRIEVER is not None:
+        return _RAG_RETRIEVER, _RAG_DOCS_COUNT
+
+    if not _RAG_INITIALIZED:
+        docs_root = Path(os.getenv("EVALVAULT_RAG_DOCS", "docs"))
+        src_root = Path(os.getenv("EVALVAULT_RAG_SRC", "src"))
+        docs_limit = int(os.getenv("EVALVAULT_RAG_DOCS_LIMIT", "120"))
+        src_limit = int(os.getenv("EVALVAULT_RAG_SRC_LIMIT", "120"))
+
+        texts: list[str] = []
+        if docs_root.exists():
+            texts.extend(_load_text_files(docs_root, (".md", ".txt"), docs_limit))
+        if src_root.exists():
+            texts.extend(_load_text_files(src_root, (".py",), src_limit))
+
+        _RAG_TEXTS = texts
+        _RAG_DOCS_COUNT = len(texts)
+        _RAG_INITIALIZED = True
+
+    if not _RAG_TEXTS:
+        return None, 0
+
+    from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+    from evalvault.config.settings import Settings
+
+    settings = Settings()
+    ollama_adapter = OllamaAdapter(settings)
+    toolkit = try_create_korean_toolkit()
+    if toolkit is None:
+        return None, 0
+
+    use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+    retriever = toolkit.build_retriever(
+        documents=_RAG_TEXTS,
+        use_hybrid=use_hybrid,
+        ollama_adapter=ollama_adapter if use_hybrid else None,
+        embedding_profile=os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev"),
+        verbose=False,
+    )
+    if retriever is None:
+        return None, 0
+
+    _RAG_RETRIEVER = retriever
+    return retriever, _RAG_DOCS_COUNT
+
+
+async def _direct_chat_answer(user_text: str) -> str | None:
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
+    tokens = re.findall(r"[A-Za-z0-9가-힣]+", query.lower())
+    if not tokens:
+        return []
+    scored: list[tuple[int, str]] = []
+    for text in texts:
+        hay = text.lower()
+        score = sum(hay.count(token) for token in tokens)
+        if score:
+            scored.append((score, text))
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return [text for _, text in scored[:top_k]]
+
+
+async def _rag_answer(user_text: str) -> str | None:
+    retriever, _ = await _get_rag_retriever()
+    contexts: list[str] = []
+
+    if retriever is not None:
+        results = retriever.search(user_text, top_k=5)
+        for item in results:
+            context = getattr(item, "document", None)
+            if context:
+                contexts.append(context)
+
+    if not contexts and _RAG_TEXTS:
+        contexts = _simple_retrieve(_RAG_TEXTS, user_text, top_k=5)
+
+    if not contexts:
+        return None
+
+    if os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() != "true":
+        return "\n\n".join(contexts[:3])
+
+    prompt = (
+        "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
+        "컨텍스트만 근거로 사용해 한국어로 답하세요.\n\n"
+        "[컨텍스트]\n"
+        + "\n\n---\n\n".join(contexts[:3])
+        + "\n\n[질문]\n"
+        + user_text
+        + "\n\n[답변]"
+    )
+
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": prompt},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
+    headers = {
+        "Authorization": f"Bearer {MCP_TOKEN}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "jsonrpc": "2.0",
+        "id": 1,
+        "method": "tools/call",
+        "params": {"name": tool_name, "arguments": tool_args},
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(MCP_URL, json=payload, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+    return data
+
+
+async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+    ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")
+
+    system_prompt = (
+        "You are a router for EvalVault. "
+        "Return JSON only with keys: action, tool, arguments."
+        "Action must be one of: tool, rag, direct."
+        "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
+        "Rules:"
+        "- If user asks about datasets, prefer tool list_datasets."
+        "- If question is about EvalVault docs/usage, prefer rag."
+        "- If greeting or general chat, use direct."
+        "- For tool list_runs: arguments {limit:int}"
+        "- For tool get_run_summary: {run_id:string}"
+        "- For tool analyze_compare: {run_id_a:string, run_id_b:string}"
+        "- For tool run_evaluation: {dataset_path:string, metrics:[string], profile:string, auto_analyze:bool}"
+        "- For tool get_artifacts: {run_id:string, kind:'analysis'|'comparison'}"
+        "- For tool list_datasets: {limit:int}"
+    )
+
+    payload = {
+        "model": router_model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(f"{ollama_url}/api/chat", json=payload)
+        response.raise_for_status()
+        data = response.json()
+
+    content = data.get("message", {}).get("content", "").strip()
+    if not content:
+        return None
+
+    try:
+        return json.loads(content)
+    except Exception:
+        return None
+
+
+def _extract_json_content(result: Any) -> dict[str, Any] | None:
+    if isinstance(result, dict) and isinstance(result.get("structuredContent"), dict):
+        return result.get("structuredContent")
+
+    if hasattr(result, "structuredContent"):
+        payload = result.structuredContent
+        if isinstance(payload, dict):
+            return payload
+
+    if hasattr(result, "content"):
+        content = result.content
+    elif isinstance(result, dict):
+        content = result.get("content")
+    else:
+        content = None
+
+    if not isinstance(content, list):
+        return None
+
+    for item in content:
+        if isinstance(item, dict):
+            item_type = item.get("type")
+            if item_type == "json":
+                payload = item.get("json")
+                if isinstance(payload, dict):
+                    return payload
+            if item_type == "text":
+                text = item.get("text")
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+        else:
+            item_type = getattr(item, "type", None)
+            if item_type == "text":
+                text = getattr(item, "text", None)
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+    return None
+
+
+def _chunk_text(text: str, size: int = 42) -> list[str]:
+    if not text:
+        return []
+    return [text[i : i + size] for i in range(0, len(text), size)]
+
+
+def _event(payload: dict[str, Any]) -> str:
+    return json.dumps(payload, ensure_ascii=False) + "\n"
+
+
+async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
+    for chunk in _chunk_text(answer):
+        yield _event({"type": "delta", "content": chunk})
+        await asyncio.sleep(0)
+    yield _event({"type": "final", "content": answer})
+
+
+async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+    started_at = time.perf_counter()
+    if len(user_text) <= 4:
+        yield _event({"type": "final", "content": "안녕하세요! EvalVault 관련 질문을 해주세요."})
+        return
+
+    if len(user_text) <= 6:
+        yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "요청 분류 중..."})
+    try:
+        router = await asyncio.wait_for(_resolve_tool_with_llm(user_text), timeout=20)
+    except TimeoutError:
+        router = None
+    except Exception:
+        router = None
+
+    if not isinstance(router, dict):
+        router = None
+
+    if router is None:
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+            return
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+            return
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    action = router.get("action")
+    tool_name = router.get("tool")
+    tool_args = router.get("arguments", {})
+
+    if action == "direct":
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    if action == "rag":
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "관련 문서를 찾지 못했습니다."})
+        return
+
+    if action != "tool":
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    if not tool_name:
+        yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "도구 실행 중..."})
+    try:
+        result = await asyncio.wait_for(_call_mcp_tool(tool_name, tool_args), timeout=12)
+    except TimeoutError:
+        yield _event(
+            {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
+        )
+        return
+    except Exception as exc:
+        yield _event({"type": "error", "message": f"도구 호출 실패: {exc}"})
+        return
+
+    payload = _extract_json_content(result)
+    if isinstance(payload, dict):
+        summary = _summarize_result(tool_name, payload)
+        async for item in _emit_answer(summary):
+            yield item
+        return
+
+    if hasattr(result, "content"):
+        text = _format_tool_result(result.content)
+    else:
+        text = f"도구 실행 결과: {_format_tool_result(result)}"
+    async for item in _emit_answer(text):
+        yield item
+
+    elapsed_ms = (time.perf_counter() - started_at) * 1000
+    yield _event({"type": "status", "message": f"처리 완료 ({elapsed_ms:.0f}ms)"})
+
+
+@router.post("/stream")
+async def chat_stream(request: ChatRequest):
+    user_text = request.message.strip()
+    if not user_text:
+        return StreamingResponse(
+            iter([_event({"type": "error", "message": "질문을 입력해주세요."})]),
+            media_type="application/x-ndjson",
+        )
+
+    async def event_generator():
+        async for item in _chat_stream(user_text):
+            yield item
+
+    return StreamingResponse(event_generator(), media_type="application/x-ndjson")
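The endpoint streams newline-delimited JSON events with `type` set to `status`, `delta`, `final`, or `error`. A minimal client sketch, assuming the API is served locally on port 8000 and that `require_api_token` accepts a bearer token passed via the hypothetical `EVALVAULT_API_TOKEN` variable:

```python
import asyncio
import json
import os

import httpx


async def stream_chat(message: str) -> None:
    # Assumption: bearer-token auth; adjust to whatever require_api_token expects.
    headers = {"Authorization": f"Bearer {os.getenv('EVALVAULT_API_TOKEN', '')}"}
    async with httpx.AsyncClient(base_url="http://localhost:8000", timeout=None) as client:
        async with client.stream(
            "POST", "/api/v1/chat/stream", json={"message": message}, headers=headers
        ) as response:
            async for line in response.aiter_lines():
                if not line:
                    continue
                event = json.loads(line)  # one JSON object per NDJSON line
                if event["type"] == "delta":
                    print(event["content"], end="", flush=True)
                elif event["type"] in {"status", "error"}:
                    print(f"\n[{event['type']}] {event.get('message')}")
                elif event["type"] == "final":
                    print()  # the full answer was already streamed as deltas


asyncio.run(stream_chat("Summarize the latest evaluation run"))
```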
evalvault/adapters/inbound/cli/commands/run.py
@@ -1742,6 +1742,14 @@ def register_run_commands(
         ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
         ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
             ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+            kind="ragas",
+            source="ragas",
+        )
+        custom_snapshots = tracker_meta.get("custom_prompt_snapshots")
+        custom_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            custom_snapshots if isinstance(custom_snapshots, dict) else None,
+            kind="custom",
+            source="custom_rules",
         )
         override_status: dict[str, str] = {}
         raw_override = tracker_meta.get("ragas_prompt_overrides")
@@ -1764,6 +1772,12 @@ def register_run_commands(
                 if entry.role in existing_roles and override_status.get(entry.role) == "applied":
                     continue
                 prompt_inputs.append(entry)
+        if custom_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
+            for entry in custom_snapshot_inputs:
+                if entry.role in existing_roles:
+                    continue
+                prompt_inputs.append(entry)
         if prompt_inputs and not db_path:
             print_cli_warning(
                 console,
evalvault/adapters/inbound/cli/commands/run_helpers.py
@@ -88,7 +88,25 @@ RUN_MODE_PRESETS: dict[str, RunModePreset] = {
     ),
 }
 
-SUMMARY_METRIC_ORDER = ("summary_faithfulness", "summary_score", "entity_preservation")
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
+
+SUMMARY_METRIC_SOURCE = {
+    "summary_faithfulness": "LLM",
+    "summary_score": "LLM",
+    "entity_preservation": "Rule",
+    "summary_accuracy": "Rule",
+    "summary_risk_coverage": "Rule",
+    "summary_non_definitive": "Rule",
+    "summary_needs_followup": "Rule",
+}
 
 
 def _display_results(result, console: Console, verbose: bool = False) -> None:
@@ -180,8 +198,9 @@ def _display_summary_guidance(result, console: Console) -> None:
         if score is None:
             continue
         recommended = SUMMARY_RECOMMENDED_THRESHOLDS[metric]
+        source = SUMMARY_METRIC_SOURCE.get(metric, "Rule")
         if score < recommended:
-            warnings.append(f"- {metric}: {score:.3f} < {recommended:.2f}")
+            warnings.append(f"- {metric} ({source}): {score:.3f} < {recommended:.2f}")
 
     if warnings:
         header = "[bold red]사용자 노출 기준 미달[/bold red]"
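Taken together with the new SUMMARY_METRIC_SOURCE table, the guidance output now labels each failing metric as LLM- or rule-based. A compact sketch of the resulting warning line, using illustrative scores and threshold values (SUMMARY_RECOMMENDED_THRESHOLDS itself is not part of this diff):

```python
SUMMARY_METRIC_SOURCE = {
    "summary_faithfulness": "LLM",
    "summary_accuracy": "Rule",
}
# Hypothetical thresholds and scores, for illustration only.
SUMMARY_RECOMMENDED_THRESHOLDS = {
    "summary_faithfulness": 0.80,
    "summary_accuracy": 0.75,
}
scores = {"summary_faithfulness": 0.72, "summary_accuracy": 0.91}

warnings = []
for metric, recommended in SUMMARY_RECOMMENDED_THRESHOLDS.items():
    score = scores.get(metric)
    if score is None:
        continue
    source = SUMMARY_METRIC_SOURCE.get(metric, "Rule")
    if score < recommended:
        warnings.append(f"- {metric} ({source}): {score:.3f} < {recommended:.2f}")

print("\n".join(warnings))
# -> - summary_faithfulness (LLM): 0.720 < 0.80
```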