evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py
@@ -596,6 +596,14 @@ class WebUIAdapter:
         ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
         ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
             ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+            kind="ragas",
+            source="ragas",
+        )
+        custom_snapshots = tracker_meta.get("custom_prompt_snapshots")
+        custom_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            custom_snapshots if isinstance(custom_snapshots, dict) else None,
+            kind="custom",
+            source="custom_rules",
         )
         override_status: dict[str, str] = {}
         raw_override = tracker_meta.get("ragas_prompt_overrides")
@@ -618,6 +626,12 @@ class WebUIAdapter:
             if entry.role in existing_roles and override_status.get(entry.role) == "applied":
                 continue
             prompt_inputs.append(entry)
+        if custom_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
+            for entry in custom_snapshot_inputs:
+                if entry.role in existing_roles:
+                    continue
+                prompt_inputs.append(entry)
 
         prompt_bundle = None
         if prompt_inputs:
evalvault/adapters/inbound/api/main.py
@@ -152,7 +152,7 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    from .routers import benchmark, config, domain, knowledge, pipeline, runs
+    from .routers import benchmark, chat, config, domain, knowledge, mcp, pipeline, runs
 
     auth_dependencies = [Depends(require_api_token)]
 
@@ -162,6 +162,12 @@ def create_app() -> FastAPI:
         tags=["runs"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        chat.router,
+        prefix="/api/v1/chat",
+        tags=["chat"],
+        dependencies=auth_dependencies,
+    )
     app.include_router(
         benchmark.router,
         prefix="/api/v1/benchmarks",
@@ -192,6 +198,12 @@ def create_app() -> FastAPI:
         tags=["config"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        mcp.router,
+        prefix="/api/v1/mcp",
+        tags=["mcp"],
+        dependencies=auth_dependencies,
+    )
 
     @app.get("/health")
     def health_check():
@@ -209,9 +221,7 @@ def create_app() -> FastAPI:
 # Dependency to get the adapter
 def get_adapter(app: FastAPI) -> WebUIAdapter:
     """Dependency to retrieve the WebUIAdapter from app state."""
-    # When using Depends(), we can't easily access 'app' directly in standard dependency signature
-    # unless we use Request. So we usually do:
-    pass
+    return app.state.adapter
 
 
 def get_web_adapter(request: Request) -> WebUIAdapter:
evalvault/adapters/inbound/api/routers/chat.py (new file)
@@ -0,0 +1,543 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import re
+import time
+from collections.abc import AsyncGenerator
+from pathlib import Path
+from typing import Any
+
+import httpx
+from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
+router = APIRouter(tags=["chat"])
+
+MCP_URL = os.getenv("EVALVAULT_MCP_URL", "http://localhost:8000/api/v1/mcp")
+MCP_TOKEN = os.getenv("EVALVAULT_MCP_TOKEN", "mcp-local-dev-token")
+
+_RAG_RETRIEVER = None
+_RAG_DOCS_COUNT = 0
+_RAG_TEXTS: list[str] = []
+_RAG_INITIALIZED = False
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    message: str = Field(..., min_length=1)
+    history: list[ChatMessage] | None = None
+
+
+def _extract_run_ids(text: str) -> list[str]:
+    return re.findall(r"run_[A-Za-z0-9_-]+", text)
+
+
+def _format_tool_result(result: Any) -> str:
+    if isinstance(result, dict):
+        if "result" in result:
+            return str(result["result"])
+        if "error" in result:
+            return f"오류: {result['error']}"
+    return str(result)
+
+
+def _summarize_runs(payload: dict[str, Any]) -> str:
+    runs = payload.get("runs") or []
+    if not runs:
+        return "실행 기록이 없습니다."
+    lines = ["최근 실행 목록:"]
+    for run in runs[:10]:
+        lines.append(
+            "- {run_id} | {dataset} | {model} | pass={pass_rate:.2f}".format(
+                run_id=run.get("run_id"),
+                dataset=run.get("dataset_name"),
+                model=run.get("model_name"),
+                pass_rate=run.get("pass_rate", 0.0),
+            )
+        )
+    return "\n".join(lines)
+
+
+def _summarize_run_summary(payload: dict[str, Any]) -> str:
+    summary = payload.get("summary") or {}
+    if not summary:
+        return "요약 정보를 찾지 못했습니다."
+    return (
+        "요약: {run_id}\n"
+        "- dataset: {dataset}\n"
+        "- model: {model}\n"
+        "- pass_rate: {pass_rate:.2f}\n"
+        "- total: {total} / passed: {passed}\n"
+        "- metrics: {metrics}".format(
+            run_id=summary.get("run_id"),
+            dataset=summary.get("dataset_name"),
+            model=summary.get("model_name"),
+            pass_rate=summary.get("pass_rate", 0.0),
+            total=summary.get("total_test_cases"),
+            passed=summary.get("passed_test_cases"),
+            metrics=", ".join(summary.get("metrics_evaluated", []) or []),
+        )
+    )
+
+
+def _summarize_compare(payload: dict[str, Any]) -> str:
+    baseline = payload.get("baseline_run_id")
+    candidate = payload.get("candidate_run_id")
+    delta = payload.get("metrics_delta") or {}
+    avg = delta.get("avg") or {}
+    lines = [
+        f"비교 결과: {baseline} vs {candidate}",
+        "평균 변화:",
+    ]
+    for metric, value in avg.items():
+        lines.append(f"- {metric}: {value:+.4f}")
+    notes = delta.get("notes") or []
+    if notes:
+        lines.append("노트: " + "; ".join(notes))
+    return "\n".join(lines)
+
+
+def _summarize_artifacts(payload: dict[str, Any]) -> str:
+    artifacts = payload.get("artifacts") or {}
+    if not artifacts:
+        return "아티팩트 경로를 찾지 못했습니다."
+    return (
+        "아티팩트:\n"
+        f"- kind: {artifacts.get('kind')}\n"
+        f"- report: {artifacts.get('report_path')}\n"
+        f"- output: {artifacts.get('output_path')}\n"
+        f"- dir: {artifacts.get('artifacts_dir')}"
+    )
+
+
+def _summarize_result(tool_name: str, payload: dict[str, Any]) -> str:
+    if tool_name == "list_runs":
+        return _summarize_runs(payload)
+    if tool_name == "get_run_summary":
+        return _summarize_run_summary(payload)
+    if tool_name == "analyze_compare":
+        return _summarize_compare(payload)
+    if tool_name == "get_artifacts":
+        return _summarize_artifacts(payload)
+    return str(payload)
+
+
+def _load_text_files(root: Path, extensions: tuple[str, ...], limit: int) -> list[str]:
+    texts: list[str] = []
+    for path in root.rglob("*"):
+        if not path.is_file():
+            continue
+        if path.suffix.lower() not in extensions:
+            continue
+        if limit and len(texts) >= limit:
+            break
+        try:
+            content = path.read_text(encoding="utf-8")
+        except Exception:
+            continue
+        if content.strip():
+            texts.append(content)
+    return texts
+
+
+async def _get_rag_retriever():
+    global _RAG_RETRIEVER
+    global _RAG_DOCS_COUNT
+    global _RAG_TEXTS
+    global _RAG_INITIALIZED
+
+    if _RAG_RETRIEVER is not None:
+        return _RAG_RETRIEVER, _RAG_DOCS_COUNT
+
+    if not _RAG_INITIALIZED:
+        docs_root = Path(os.getenv("EVALVAULT_RAG_DOCS", "docs"))
+        src_root = Path(os.getenv("EVALVAULT_RAG_SRC", "src"))
+        docs_limit = int(os.getenv("EVALVAULT_RAG_DOCS_LIMIT", "120"))
+        src_limit = int(os.getenv("EVALVAULT_RAG_SRC_LIMIT", "120"))
+
+        texts: list[str] = []
+        if docs_root.exists():
+            texts.extend(_load_text_files(docs_root, (".md", ".txt"), docs_limit))
+        if src_root.exists():
+            texts.extend(_load_text_files(src_root, (".py",), src_limit))
+
+        _RAG_TEXTS = texts
+        _RAG_DOCS_COUNT = len(texts)
+        _RAG_INITIALIZED = True
+
+    if not _RAG_TEXTS:
+        return None, 0
+
+    from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+    from evalvault.config.settings import Settings
+
+    settings = Settings()
+    ollama_adapter = OllamaAdapter(settings)
+    toolkit = try_create_korean_toolkit()
+    if toolkit is None:
+        return None, 0
+
+    use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+    retriever = toolkit.build_retriever(
+        documents=_RAG_TEXTS,
+        use_hybrid=use_hybrid,
+        ollama_adapter=ollama_adapter if use_hybrid else None,
+        embedding_profile=os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev"),
+        verbose=False,
+    )
+    if retriever is None:
+        return None, 0
+
+    _RAG_RETRIEVER = retriever
+    return retriever, _RAG_DOCS_COUNT
+
+
+async def _direct_chat_answer(user_text: str) -> str | None:
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
+    tokens = re.findall(r"[A-Za-z0-9가-힣]+", query.lower())
+    if not tokens:
+        return []
+    scored: list[tuple[int, str]] = []
+    for text in texts:
+        hay = text.lower()
+        score = sum(hay.count(token) for token in tokens)
+        if score:
+            scored.append((score, text))
+    scored.sort(key=lambda item: item[0], reverse=True)
+    return [text for _, text in scored[:top_k]]
+
+
+async def _rag_answer(user_text: str) -> str | None:
+    retriever, _ = await _get_rag_retriever()
+    contexts: list[str] = []
+
+    if retriever is not None:
+        results = retriever.search(user_text, top_k=5)
+        for item in results:
+            context = getattr(item, "document", None)
+            if context:
+                contexts.append(context)
+
+    if not contexts and _RAG_TEXTS:
+        contexts = _simple_retrieve(_RAG_TEXTS, user_text, top_k=5)
+
+    if not contexts:
+        return None
+
+    if os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() != "true":
+        return "\n\n".join(contexts[:3])
+
+    prompt = (
+        "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
+        "컨텍스트만 근거로 사용해 한국어로 답하세요.\n\n"
+        "[컨텍스트]\n"
+        + "\n\n---\n\n".join(contexts[:3])
+        + "\n\n[질문]\n"
+        + user_text
+        + "\n\n[답변]"
+    )
+
+    payload = {
+        "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+            {"role": "user", "content": prompt},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(
+            f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+            json=payload,
+        )
+        response.raise_for_status()
+        data = response.json()
+
+    return data.get("message", {}).get("content", "").strip() or None
+
+
+async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
+    headers = {
+        "Authorization": f"Bearer {MCP_TOKEN}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "jsonrpc": "2.0",
+        "id": 1,
+        "method": "tools/call",
+        "params": {"name": tool_name, "arguments": tool_args},
+    }
+
+    async with httpx.AsyncClient(timeout=60) as client:
+        response = await client.post(MCP_URL, json=payload, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+    return data
+
+
+async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+    ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+    router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")
+
+    system_prompt = (
+        "You are a router for EvalVault. "
+        "Return JSON only with keys: action, tool, arguments."
+        "Action must be one of: tool, rag, direct."
+        "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
+        "Rules:"
+        "- If user asks about datasets, prefer tool list_datasets."
+        "- If question is about EvalVault docs/usage, prefer rag."
+        "- If greeting or general chat, use direct."
+        "- For tool list_runs: arguments {limit:int}"
+        "- For tool get_run_summary: {run_id:string}"
+        "- For tool analyze_compare: {run_id_a:string, run_id_b:string}"
+        "- For tool run_evaluation: {dataset_path:string, metrics:[string], profile:string, auto_analyze:bool}"
+        "- For tool get_artifacts: {run_id:string, kind:'analysis'|'comparison'}"
+        "- For tool list_datasets: {limit:int}"
+    )
+
+    payload = {
+        "model": router_model,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_text},
+        ],
+        "stream": False,
+    }
+
+    async with httpx.AsyncClient(timeout=30) as client:
+        response = await client.post(f"{ollama_url}/api/chat", json=payload)
+        response.raise_for_status()
+        data = response.json()
+
+    content = data.get("message", {}).get("content", "").strip()
+    if not content:
+        return None
+
+    try:
+        return json.loads(content)
+    except Exception:
+        return None
+
+
+def _extract_json_content(result: Any) -> dict[str, Any] | None:
+    if isinstance(result, dict) and isinstance(result.get("structuredContent"), dict):
+        return result.get("structuredContent")
+
+    if hasattr(result, "structuredContent"):
+        payload = result.structuredContent
+        if isinstance(payload, dict):
+            return payload
+
+    if hasattr(result, "content"):
+        content = result.content
+    elif isinstance(result, dict):
+        content = result.get("content")
+    else:
+        content = None
+
+    if not isinstance(content, list):
+        return None
+
+    for item in content:
+        if isinstance(item, dict):
+            item_type = item.get("type")
+            if item_type == "json":
+                payload = item.get("json")
+                if isinstance(payload, dict):
+                    return payload
+            if item_type == "text":
+                text = item.get("text")
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+        else:
+            item_type = getattr(item, "type", None)
+            if item_type == "text":
+                text = getattr(item, "text", None)
+                if isinstance(text, str):
+                    try:
+                        parsed = json.loads(text)
+                    except Exception:
+                        return None
+                    if isinstance(parsed, dict):
+                        return parsed
+    return None
+
+
+def _chunk_text(text: str, size: int = 42) -> list[str]:
+    if not text:
+        return []
+    return [text[i : i + size] for i in range(0, len(text), size)]
+
+
+def _event(payload: dict[str, Any]) -> str:
+    return json.dumps(payload, ensure_ascii=False) + "\n"
+
+
+async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
+    for chunk in _chunk_text(answer):
+        yield _event({"type": "delta", "content": chunk})
+        await asyncio.sleep(0)
+    yield _event({"type": "final", "content": answer})
+
+
+async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+    started_at = time.perf_counter()
+    if len(user_text) <= 4:
+        yield _event({"type": "final", "content": "안녕하세요! EvalVault 관련 질문을 해주세요."})
+        return
+
+    if len(user_text) <= 6:
+        yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "요청 분류 중..."})
+    try:
+        router = await asyncio.wait_for(_resolve_tool_with_llm(user_text), timeout=20)
+    except TimeoutError:
+        router = None
+    except Exception:
+        router = None
+
+    if not isinstance(router, dict):
+        router = None
+
+    if router is None:
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+            return
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+            return
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    action = router.get("action")
+    tool_name = router.get("tool")
+    tool_args = router.get("arguments", {})
+
+    if action == "direct":
+        answer = await _direct_chat_answer(user_text)
+        if answer:
+            async for item in _emit_answer(answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+        return
+
+    if action == "rag":
+        yield _event({"type": "status", "message": "문서 검색 중..."})
+        try:
+            rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+        except TimeoutError:
+            yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
+            return
+        if rag_answer:
+            async for item in _emit_answer(rag_answer):
+                yield item
+        else:
+            yield _event({"type": "final", "content": "관련 문서를 찾지 못했습니다."})
+        return
+
+    if action != "tool":
+        yield _event({"type": "final", "content": "요청을 해석하지 못했습니다. 다시 질문해주세요."})
+        return
+
+    if not tool_name:
+        yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
+        return
+
+    yield _event({"type": "status", "message": "도구 실행 중..."})
+    try:
+        result = await asyncio.wait_for(_call_mcp_tool(tool_name, tool_args), timeout=12)
+    except TimeoutError:
+        yield _event(
+            {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
+        )
+        return
+    except Exception as exc:
+        yield _event({"type": "error", "message": f"도구 호출 실패: {exc}"})
+        return
+
+    payload = _extract_json_content(result)
+    if isinstance(payload, dict):
+        summary = _summarize_result(tool_name, payload)
+        async for item in _emit_answer(summary):
+            yield item
+        return
+
+    if hasattr(result, "content"):
+        text = _format_tool_result(result.content)
+    else:
+        text = f"도구 실행 결과: {_format_tool_result(result)}"
+    async for item in _emit_answer(text):
+        yield item
+
+    elapsed_ms = (time.perf_counter() - started_at) * 1000
+    yield _event({"type": "status", "message": f"처리 완료 ({elapsed_ms:.0f}ms)"})
+
+
+@router.post("/stream")
+async def chat_stream(request: ChatRequest):
+    user_text = request.message.strip()
+    if not user_text:
+        return StreamingResponse(
+            iter([_event({"type": "error", "message": "질문을 입력해주세요."})]),
+            media_type="application/x-ndjson",
+        )
+
+    async def event_generator():
+        async for item in _chat_stream(user_text):
+            yield item
+
+    return StreamingResponse(event_generator(), media_type="application/x-ndjson")
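Read together with the main.py hunks above, this router is mounted at /api/v1/chat behind the same bearer-token dependency as the other routers, so the route added here is POST /api/v1/chat/stream. It responds with application/x-ndjson: one JSON object per line, with "type" set to "status", "delta", "final", or "error", carrying either a "message" or a "content" field. A minimal client sketch follows; the host/port, the dev-token value, and the stream_chat helper name are illustrative assumptions, not part of the package.

# Sketch: consume the NDJSON stream from the new chat endpoint.
# Host, port, and token value are assumptions for a local dev setup.
import json

import httpx


def stream_chat(message: str, base_url: str = "http://localhost:8000", token: str = "dev-token") -> str:
    final = ""
    with httpx.stream(
        "POST",
        f"{base_url}/api/v1/chat/stream",
        json={"message": message},  # matches ChatRequest(message=..., history=None)
        headers={"Authorization": f"Bearer {token}"},
        timeout=60,
    ) as response:
        response.raise_for_status()
        for line in response.iter_lines():  # one JSON event per line (application/x-ndjson)
            if not line:
                continue
            event = json.loads(line)
            if event["type"] == "delta":
                print(event["content"], end="", flush=True)
            elif event["type"] == "final":
                final = event["content"]
            elif event["type"] in ("status", "error"):
                print(f"[{event['type']}] {event.get('message', '')}")
    return final


if __name__ == "__main__":
    stream_chat("최근 실행 목록 보여줘")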
evalvault/adapters/inbound/cli/commands/__init__.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Protocol
+from typing import Any
 
 import typer
 from rich.console import Console
@@ -12,8 +12,11 @@ from rich.console import Console
 from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
+from .artifacts import create_artifacts_app
 from .benchmark import create_benchmark_app
 from .calibrate import register_calibrate_commands
+from .calibrate_judge import register_calibrate_judge_commands
+from .compare import register_compare_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -25,19 +28,17 @@ from .init import register_init_command
 from .kg import create_kg_app
 from .langfuse import register_langfuse_commands
 from .method import create_method_app
+from .ops import create_ops_app
 from .phoenix import create_phoenix_app
 from .pipeline import register_pipeline_commands
+from .profile_difficulty import register_profile_difficulty_commands
 from .prompts import create_prompts_app
+from .regress import register_regress_commands
 from .run import register_run_commands
 from .stage import create_stage_app
 
 CommandFactory = Callable[[Console], typer.Typer]
-
-
-class CommandRegistrar(Protocol):
-    """Callable protocol for Typer command registrars."""
-
-    def __call__(self, app: typer.Typer, console: Console, **kwargs: Any) -> None: ...
+CommandRegistrar = Callable[..., Any]
 
 
 @dataclass(frozen=True)
@@ -61,10 +62,14 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_run_commands, needs_metrics=True),
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
+    CommandModule(register_compare_commands),
     CommandModule(register_analyze_commands),
     CommandModule(register_calibrate_commands),
+    CommandModule(register_calibrate_judge_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
+    CommandModule(register_profile_difficulty_commands, needs_metrics=True),
+    CommandModule(register_regress_commands),
     CommandModule(register_agent_commands),
     CommandModule(register_experiment_commands),
     CommandModule(register_config_commands),
@@ -78,9 +83,11 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
     SubAppModule("domain", create_domain_app),
     SubAppModule("benchmark", create_benchmark_app),
    SubAppModule("method", create_method_app),
+    SubAppModule("ops", create_ops_app),
     SubAppModule("phoenix", create_phoenix_app),
     SubAppModule("prompts", create_prompts_app),
     SubAppModule("stage", create_stage_app),
+    SubAppModule("artifacts", create_artifacts_app),
     SubAppModule("debug", create_debug_app),
 )