evalvault 1.73.2__py3-none-any.whl → 1.75.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. evalvault/adapters/inbound/api/adapter.py +66 -17
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +604 -37
  4. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  5. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  6. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  7. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  8. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  9. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  10. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  13. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  14. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  15. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  16. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  17. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  18. evalvault/adapters/inbound/cli/commands/method.py +1 -2
  19. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  20. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  22. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  23. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  24. evalvault/adapters/inbound/cli/commands/run.py +42 -31
  25. evalvault/adapters/inbound/cli/commands/run_helpers.py +24 -15
  26. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  27. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  28. evalvault/adapters/inbound/mcp/tools.py +11 -8
  29. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  30. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  31. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  32. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  33. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  35. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  36. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  37. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  38. evalvault/adapters/outbound/ops/__init__.py +5 -0
  39. evalvault/adapters/outbound/ops/report_renderer.py +159 -0
  40. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  41. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  42. evalvault/adapters/outbound/storage/factory.py +53 -0
  43. evalvault/adapters/outbound/storage/postgres_adapter.py +90 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +15 -0
  45. evalvault/adapters/outbound/storage/schema.sql +14 -0
  46. evalvault/adapters/outbound/storage/sqlite_adapter.py +77 -0
  47. evalvault/config/settings.py +31 -7
  48. evalvault/domain/entities/ops_report.py +40 -0
  49. evalvault/domain/services/domain_learning_hook.py +2 -1
  50. evalvault/domain/services/ops_report_service.py +192 -0
  51. evalvault/ports/inbound/web_port.py +3 -1
  52. evalvault/ports/outbound/storage_port.py +2 -0
  53. evalvault-1.75.0.dist-info/METADATA +221 -0
  54. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/RECORD +57 -48
  55. evalvault-1.73.2.dist-info/METADATA +0 -585
  56. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/WHEEL +0 -0
  57. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/entry_points.txt +0 -0
  58. {evalvault-1.73.2.dist-info → evalvault-1.75.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -8,6 +8,7 @@ import os
  import re
  import time
  from collections.abc import AsyncGenerator
+ from dataclasses import dataclass
  from datetime import UTC, datetime
  from pathlib import Path
  from typing import Any
@@ -34,6 +35,13 @@ _RAG_TEXTS: list[str] = []
  _RAG_INITIALIZED = False


+ @dataclass(frozen=True)
+ class _RagHit:
+     document: str
+     score: float
+     doc_id: int
+
+
  class ChatMessage(BaseModel):
      role: str
      content: str
@@ -44,10 +52,62 @@ class ChatRequest(BaseModel):
      history: list[ChatMessage] | None = None


+ class AiChatMessage(BaseModel):
+     role: str
+     content: str | None = None
+     parts: list[dict[str, Any]] | None = None
+
+
+ class AiChatRequest(BaseModel):
+     messages: list[AiChatMessage] = Field(default_factory=list)
+     run_id: str | None = None
+     category: str | None = None
+
+
  def _extract_run_ids(text: str) -> list[str]:
      return re.findall(r"run_[A-Za-z0-9_-]+", text)


+ def _ollama_chat_options(model_name: str) -> dict[str, Any] | None:
+     lower = model_name.lower()
+     if lower.startswith("qwen3"):
+         return {
+             "temperature": 0.6,
+             "top_p": 0.95,
+             "top_k": 20,
+             "repeat_penalty": 1,
+             "stop": ["<|im_start|>", "<|im_end|>"],
+         }
+     return None
+
+
+ def _is_verb_only(text: str) -> bool:
+     if not text:
+         return False
+     compact = re.sub(r"\s+", "", text.strip())
+     if not compact:
+         return False
+     tokens = re.findall(r"[A-Za-z0-9가-힣]+", compact)
+     if len(tokens) > 2:
+         return False
+     verb_markers = ["해줘", "해주세요", "해봐", "해봐요", "해줘요", "해줘라"]
+     verb_stems = ["설명", "요약", "분석", "비교", "개선", "정리", "추천", "진단", "해석", "검증"]
+     if any(compact.endswith(marker) for marker in verb_markers):
+         return any(stem in compact for stem in verb_stems)
+     return compact in verb_stems
+
+
+ def _with_context(user_text: str, run_id: str | None, category: str | None) -> str:
+     parts = []
+     if run_id:
+         parts.append(f"선택된 run_id: {run_id}")
+     if category:
+         parts.append(f"질문 분류: {category}")
+     if not parts:
+         return user_text
+     return "\n".join(parts) + f"\n사용자 요청: {user_text}"
+
+
  def _format_tool_result(result: Any) -> str:
      if isinstance(result, dict):
          if "result" in result:
@@ -263,14 +323,121 @@ async def _get_rag_retriever() -> tuple[Any | None, int]:
      if not _RAG_TEXTS:
          return None, 0

-     from evalvault.adapters.outbound.nlp.korean.bm25_retriever import KoreanBM25Retriever
-     from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer
+     from evalvault.adapters.outbound.nlp.korean.toolkit import KoreanNLPToolkit
+
+     use_hybrid = os.getenv("EVALVAULT_RAG_USE_HYBRID", "true").lower() == "true"
+     embedding_profile = os.getenv("EVALVAULT_RAG_EMBEDDING_PROFILE", "dev")
+     vector_store = os.getenv("EVALVAULT_RAG_VECTOR_STORE", "pgvector").lower()
+     pgvector_index = os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX", "hnsw").lower()
+     pgvector_index_lists = int(os.getenv("EVALVAULT_RAG_PGVECTOR_INDEX_LISTS", "100"))
+     pgvector_hnsw_m = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_M", "16"))
+     pgvector_hnsw_ef = int(os.getenv("EVALVAULT_RAG_PGVECTOR_HNSW_EF_CONSTRUCTION", "64"))
+
+     def _build_conn_string() -> str | None:
+         try:
+             from evalvault.config.settings import Settings
+
+             settings = Settings()
+             if settings.postgres_connection_string:
+                 return settings.postgres_connection_string
+             if settings.postgres_host:
+                 return "host={host} port={port} dbname={dbname} user={user} password={password}".format(
+                     host=settings.postgres_host,
+                     port=settings.postgres_port,
+                     dbname=settings.postgres_database,
+                     user=settings.postgres_user or "postgres",
+                     password=settings.postgres_password or "",
+                 )
+         except Exception as exc:
+             logger.warning("Failed to build Postgres connection string: %s", exc)
+         return None

-     tokenizer = KiwiTokenizer()
-     retriever = KoreanBM25Retriever(tokenizer=tokenizer)
-     retriever.index(list(_RAG_TEXTS))
-     if tokens and len(tokens) == len(_RAG_TEXTS):
-         retriever._tokenized_docs = tokens
+     ollama_adapter = None
+     dense_retriever = None
+     embedding_func = None
+     if embedding_profile:
+         try:
+             from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
+             from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
+             from evalvault.config.settings import Settings
+
+             settings = Settings()
+             ollama_adapter = OllamaAdapter(settings)
+             dense_retriever = KoreanDenseRetriever(
+                 profile=embedding_profile,
+                 ollama_adapter=ollama_adapter,
+             )
+             embedding_func = dense_retriever.get_embedding_func()
+         except Exception as exc:  # pragma: no cover - runtime dependency
+             logger.warning("Failed to initialize dense retriever: %s", exc)
+
+     if vector_store == "pgvector" and embedding_func is not None:
+         conn_string = _build_conn_string()
+         if conn_string:
+             try:
+                 from evalvault.adapters.outbound.nlp.korean.bm25_retriever import (
+                     KoreanBM25Retriever,
+                 )
+                 from evalvault.adapters.outbound.nlp.korean.kiwi_tokenizer import KiwiTokenizer
+                 from evalvault.adapters.outbound.retriever.pgvector_store import PgvectorStore
+
+                 store = PgvectorStore(
+                     conn_string,
+                     index_type=pgvector_index,
+                     index_lists=pgvector_index_lists,
+                     hnsw_m=pgvector_hnsw_m,
+                     hnsw_ef_construction=pgvector_hnsw_ef,
+                 )
+                 embedding_dim = (
+                     dense_retriever.dimension if dense_retriever else len(embedding_func(["x"])[0])
+                 )
+                 store.ensure_schema(dimension=embedding_dim)
+                 source_hash = _hash_text(content)
+                 existing_hash, existing_count = store.get_source_state(source="user_guide")
+                 if existing_hash != source_hash or existing_count != len(_RAG_TEXTS):
+                     embeddings = embedding_func(list(_RAG_TEXTS))
+                     store.replace_documents(
+                         source="user_guide",
+                         source_hash=source_hash,
+                         documents=list(_RAG_TEXTS),
+                         embeddings=embeddings,
+                     )
+
+                 tokenizer = KiwiTokenizer()
+                 bm25_retriever = KoreanBM25Retriever(tokenizer=tokenizer)
+                 bm25_retriever.index(list(_RAG_TEXTS))
+                 if tokens and len(tokens) == len(_RAG_TEXTS):
+                     bm25_retriever._tokenized_docs = tokens
+
+                 if use_hybrid:
+                     retriever = _PgvectorHybridRetriever(
+                         bm25_retriever=bm25_retriever,
+                         store=store,
+                         embedding_func=embedding_func,
+                         documents=list(_RAG_TEXTS),
+                     )
+                 else:
+                     retriever = _PgvectorDenseRetriever(
+                         store=store,
+                         embedding_func=embedding_func,
+                         documents=list(_RAG_TEXTS),
+                     )
+
+                 _RAG_RETRIEVER = retriever
+                 return retriever, _RAG_DOCS_COUNT
+             except Exception as exc:
+                 logger.warning("pgvector retriever setup failed: %s", exc)
+
+     toolkit = KoreanNLPToolkit()
+     retriever = toolkit.build_retriever(
+         list(_RAG_TEXTS),
+         use_hybrid=use_hybrid,
+         ollama_adapter=ollama_adapter,
+         embedding_profile=embedding_profile,
+         verbose=True,
+     )
+     if retriever is None:
+         return None, 0

      _RAG_RETRIEVER = retriever
      return retriever, _RAG_DOCS_COUNT
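
Retriever selection is now driven entirely by environment variables read at call time, with a graceful fallback to `KoreanNLPToolkit.build_retriever` when pgvector setup fails. A sketch of a configuration that would exercise the pgvector/HNSW path (variable names come from the hunk above; the values are illustrative, and a reachable Postgres with the pgvector extension is assumed via `evalvault.config.settings.Settings`):

```python
import os

# Select the pgvector-backed store; any other value skips this branch.
os.environ["EVALVAULT_RAG_VECTOR_STORE"] = "pgvector"
# Fuse BM25 and dense results (RRF); "false" uses dense-only search.
os.environ["EVALVAULT_RAG_USE_HYBRID"] = "true"
# A non-empty profile is what enables the dense retriever at all.
os.environ["EVALVAULT_RAG_EMBEDDING_PROFILE"] = "dev"
# "hnsw" here; EVALVAULT_RAG_PGVECTOR_INDEX_LISTS presumably applies to ivfflat.
os.environ["EVALVAULT_RAG_PGVECTOR_INDEX"] = "hnsw"
os.environ["EVALVAULT_RAG_PGVECTOR_HNSW_M"] = "16"
os.environ["EVALVAULT_RAG_PGVECTOR_HNSW_EF_CONSTRUCTION"] = "64"
```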
@@ -283,15 +450,29 @@ async def warm_rag_index() -> None:
      logger.warning("RAG preload failed: %s", exc)


- async def _direct_chat_answer(user_text: str) -> str | None:
+ async def _direct_chat_answer(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> str | None:
+     user_text = _with_context(user_text, run_id, category)
+     model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+     options = _ollama_chat_options(model_name)
      payload = {
-         "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+         "model": model_name,
          "messages": [
-             {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+             {
+                 "role": "system",
+                 "content": (
+                     "You are a helpful assistant for EvalVault. "
+                     "Interpret verb-only requests as questions about the selected run/category. "
+                     "If essential details are missing, ask a concise follow-up question in Korean."
+                 ),
+             },
              {"role": "user", "content": user_text},
          ],
          "stream": False,
      }
+     if options:
+         payload["options"] = options

      async with httpx.AsyncClient(timeout=30) as client:
          response = await client.post(
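
The option table in `_ollama_chat_options` is keyed on the model-name prefix only, so every `qwen3` variant shares one sampling profile while all other models fall back to Ollama's defaults (no `options` key is added to the `/api/chat` payload). A quick trace, assuming the private helper is importable:

```python
from evalvault.adapters.inbound.api.routers.chat import _ollama_chat_options

# qwen3 models (any tag) get the fixed sampling profile plus stop tokens.
opts = _ollama_chat_options("qwen3:14b")
assert opts is not None and opts["temperature"] == 0.6
assert opts["stop"] == ["<|im_start|>", "<|im_end|>"]

# Everything else, e.g. the previous default model, yields None.
assert _ollama_chat_options("gpt-oss-safeguard:20b") is None
```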
@@ -318,9 +499,153 @@ def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
      return [text for _, text in scored[:top_k]]


- async def _rag_answer(user_text: str) -> str | None:
-     retriever, _ = await _get_rag_retriever()
+ def _rrf_fuse(
+     *,
+     bm25_results: list[Any],
+     dense_results: list[Any],
+     documents: list[str],
+     top_k: int,
+     bm25_weight: float = 0.4,
+     dense_weight: float = 0.6,
+     rrf_k: int = 60,
+ ) -> list[_RagHit]:
+     scores: dict[int, float] = {}
+
+     for rank, result in enumerate(bm25_results, 1):
+         doc_id = int(result.doc_id)
+         scores[doc_id] = scores.get(doc_id, 0.0) + (bm25_weight / (rrf_k + rank))
+
+     for rank, result in enumerate(dense_results, 1):
+         doc_id = int(result.doc_id)
+         scores[doc_id] = scores.get(doc_id, 0.0) + (dense_weight / (rrf_k + rank))
+
+     ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
+     hits: list[_RagHit] = []
+     for doc_id, score in ranked[:top_k]:
+         if 0 <= doc_id < len(documents):
+             hits.append(_RagHit(document=documents[doc_id], score=score, doc_id=doc_id))
+     return hits
+
+
+ class _PgvectorDenseRetriever:
+     def __init__(self, store: Any, embedding_func: Any, documents: list[str]) -> None:
+         self._store = store
+         self._embedding_func = embedding_func
+         self._documents = documents
+
+     def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+         query_embedding = self._embedding_func([query])[0]
+         results = self._store.search(
+             source="user_guide", query_embedding=query_embedding, top_k=top_k
+         )
+         hits: list[_RagHit] = []
+         for result in results:
+             if 0 <= result.doc_id < len(self._documents):
+                 hits.append(
+                     _RagHit(
+                         document=self._documents[result.doc_id],
+                         score=float(result.score),
+                         doc_id=result.doc_id,
+                     )
+                 )
+         return hits
+
+
+ class _PgvectorHybridRetriever:
+     def __init__(
+         self,
+         *,
+         bm25_retriever: Any,
+         store: Any,
+         embedding_func: Any,
+         documents: list[str],
+     ) -> None:
+         self._bm25 = bm25_retriever
+         self._store = store
+         self._embedding_func = embedding_func
+         self._documents = documents
+
+     def search(self, query: str, top_k: int = 5) -> list[_RagHit]:
+         bm25_results = self._bm25.search(query, top_k=len(self._documents))
+         query_embedding = self._embedding_func([query])[0]
+         dense_results = self._store.search(
+             source="user_guide", query_embedding=query_embedding, top_k=len(self._documents)
+         )
+         dense_results = sorted(dense_results, key=lambda item: item.score)
+         return _rrf_fuse(
+             bm25_results=bm25_results,
+             dense_results=dense_results,
+             documents=self._documents,
+             top_k=top_k,
+         )
+
+
+ def _read_text_limited(path: Path, limit: int = 4000) -> str | None:
+     try:
+         if not path.exists():
+             return None
+         content = path.read_text(encoding="utf-8", errors="ignore")
+     except Exception as exc:
+         logger.warning("Failed to read %s: %s", path, exc)
+         return None
+     content = content.strip()
+     if not content:
+         return None
+     if len(content) > limit:
+         return content[:limit] + "..."
+     return content
+
+
+ async def _build_run_context(run_id: str) -> list[str]:
      contexts: list[str] = []
+     try:
+         summary_result = await _call_mcp_tool("get_run_summary", {"run_id": run_id})
+         payload = _extract_json_content(summary_result)
+         if isinstance(payload, dict):
+             contexts.append("[RUN 요약]\n" + _summarize_run_summary(payload))
+     except Exception as exc:
+         logger.warning("Failed to fetch run summary: %s", exc)
+
+     try:
+         artifacts_result = await _call_mcp_tool(
+             "get_artifacts", {"run_id": run_id, "kind": "analysis"}
+         )
+         payload = _extract_json_content(artifacts_result)
+         if isinstance(payload, dict):
+             contexts.append("[RUN 아티팩트]\n" + _summarize_artifacts(payload))
+             artifacts = payload.get("artifacts") or {}
+             report_path = artifacts.get("report_path")
+             if isinstance(report_path, str) and report_path:
+                 report_text = _read_text_limited(Path(report_path))
+                 if report_text:
+                     contexts.append("[REPORT 발췌]\n" + report_text)
+     except Exception as exc:
+         logger.warning("Failed to fetch run artifacts: %s", exc)
+
+     return contexts
+
+
+ async def _rag_answer(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> str | None:
+     contexts: list[str] = []
+     rag_llm_enabled = os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() == "true"
+     run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+
+     if run_id and rag_llm_enabled and run_context_enabled:
+         contexts.extend(await _build_run_context(run_id))
+
+     if not rag_llm_enabled and contexts:
+         return "\n\n".join(contexts[:3])
+
+     if not rag_llm_enabled:
+         content = _load_user_guide_text()
+         if content:
+             chunks = [chunk.strip() for chunk in content.split("\n\n") if chunk.strip()]
+             contexts.extend(_simple_retrieve(chunks, user_text, top_k=5))
+         return "\n\n".join(contexts[:3]) if contexts else None
+
+     retriever, _ = await _get_rag_retriever()

      if retriever is not None:
          results = retriever.search(user_text, top_k=5)
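
`_rrf_fuse` is weighted reciprocal-rank fusion: only ranks matter, and each list contributes `weight / (rrf_k + rank)` per document. Note also that `_PgvectorHybridRetriever` sorts the dense results ascending by score before fusing, which suggests the store reports a distance (smaller is better). A self-contained worked example of the arithmetic, with made-up ranks:

```python
# Fusion formula from _rrf_fuse; weights and rrf_k match the defaults above.
bm25_weight, dense_weight, rrf_k = 0.4, 0.6, 60

# Suppose a document is ranked 1st by BM25 and 3rd by the dense retriever.
score = bm25_weight / (rrf_k + 1) + dense_weight / (rrf_k + 3)
print(f"{score:.6f}")  # 0.016081 = 0.4/61 + 0.6/63

# Flipping the ranks (3rd BM25, 1st dense) scores slightly higher,
# reflecting the 0.6 dense weight: 0.4/63 + 0.6/61 = 0.016185.
```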
@@ -335,37 +660,52 @@ async def _rag_answer(user_text: str) -> str | None:
      if not contexts:
          return None

-     if os.getenv("EVALVAULT_RAG_LLM_ENABLED", "true").lower() != "true":
+     if not rag_llm_enabled:
          return "\n\n".join(contexts[:3])

      prompt = (
          "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
-         "컨텍스트만 근거로 사용해 한국어로 답하세요.\n\n"
+         "컨텍스트만 근거로 사용해 한국어로 답하세요.\n"
+         "질문이 동사만 있는 경우에도 선택된 run_id/분류를 기준으로 해석하세요.\n"
+         "정보가 부족하면 먼저 필요한 정보를 질문하세요.\n\n"
          "[컨텍스트]\n"
          + "\n\n---\n\n".join(contexts[:3])
          + "\n\n[질문]\n"
-         + user_text
+         + _with_context(user_text, run_id, category)
          + "\n\n[답변]"
      )

+     model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+     options = _ollama_chat_options(model_name)
      payload = {
-         "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+         "model": model_name,
          "messages": [
              {"role": "system", "content": "You are a helpful assistant for EvalVault."},
              {"role": "user", "content": prompt},
          ],
          "stream": False,
      }
+     if options:
+         payload["options"] = options

-     async with httpx.AsyncClient(timeout=60) as client:
-         response = await client.post(
-             f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
-             json=payload,
-         )
-         response.raise_for_status()
-         data = response.json()
+     fallback = "\n\n".join(contexts[:3])
+     chat_timeout = int(os.getenv("OLLAMA_CHAT_TIMEOUT_SECONDS", "180"))
+     try:
+         async with httpx.AsyncClient(timeout=chat_timeout) as client:
+             response = await client.post(
+                 f"{os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')}/api/chat",
+                 json=payload,
+             )
+             response.raise_for_status()
+             data = response.json()
+     except httpx.ReadTimeout:
+         logger.warning("Ollama chat timed out; returning retrieved contexts")
+         return fallback or None
+     except httpx.HTTPError as exc:
+         logger.warning("Ollama chat failed: %s", exc)
+         return fallback or None

-     return data.get("message", {}).get("content", "").strip() or None
+     return data.get("message", {}).get("content", "").strip() or fallback or None


  async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
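
The error handling above establishes a degradation ladder: LLM answer → raw retrieved contexts (`fallback`) → `None`. One ordering detail worth noting: in httpx, `ReadTimeout` inherits (via `TimeoutException` and `TransportError`) from `HTTPError`, so the dedicated timeout branch must come before the generic one or it would never fire. A one-line check:

```python
import httpx

# httpx.ReadTimeout derives from httpx.HTTPError, so the except httpx.ReadTimeout
# branch in the hunk above must precede the except httpx.HTTPError branch.
assert issubclass(httpx.ReadTimeout, httpx.HTTPError)
```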
@@ -388,7 +728,9 @@ async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
      return data


- async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+ async def _resolve_tool_with_llm(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> dict[str, Any] | None:
      ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
      router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")
@@ -398,6 +740,8 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
          "Action must be one of: tool, rag, direct."
          "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
          "Rules:"
+         "- Assume verb-only requests refer to the selected run_id/category when provided."
+         "- If essential info is missing (e.g., run_id), return action direct with a follow-up question."
          "- If user asks about datasets, prefer tool list_datasets."
          "- If question is about EvalVault docs/usage, prefer rag."
          "- If greeting or general chat, use direct."
@@ -413,7 +757,7 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
          "model": router_model,
          "messages": [
              {"role": "system", "content": system_prompt},
-             {"role": "user", "content": user_text},
+             {"role": "user", "content": _with_context(user_text, run_id, category)},
          ],
          "stream": False,
      }
@@ -479,6 +823,99 @@ def _chunk_text(text: str, size: int = 42) -> list[str]:
      return [text[i : i + size] for i in range(0, len(text), size)]


+ def _extract_text_from_parts(parts: list[dict[str, Any]] | None) -> str | None:
+     if not parts:
+         return None
+     chunks: list[str] = []
+     for part in parts:
+         if not isinstance(part, dict):
+             continue
+         if part.get("type") == "text":
+             text = part.get("text")
+             if isinstance(text, str) and text:
+                 chunks.append(text)
+     if not chunks:
+         return None
+     content = "".join(chunks).strip()
+     return content or None
+
+
+ def _extract_last_user_message(messages: list[AiChatMessage]) -> str | None:
+     for message in reversed(messages):
+         if message.role != "user":
+             continue
+         if message.content and message.content.strip():
+             return message.content.strip()
+         content = _extract_text_from_parts(message.parts)
+         if content:
+             return content
+     return None
+
+
+ def _ai_sse_event(payload: dict[str, Any]) -> str:
+     return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
+
+
+ def _ai_sse_done() -> str:
+     return "data: [DONE]\n\n"
+
+
+ def _ai_sse_headers() -> dict[str, str]:
+     return {
+         "Cache-Control": "no-cache",
+         "Connection": "keep-alive",
+         "x-vercel-ai-ui-message-stream": "v1",
+     }
+
+
+ async def _ai_chat_stream(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> AsyncGenerator[str, None]:
+     message_id = f"msg_{int(time.time() * 1000)}"
+     text_id = f"text_{message_id}"
+     yield _ai_sse_event({"type": "start", "messageId": message_id})
+     yield _ai_sse_event({"type": "text-start", "id": text_id})
+
+     async for item in _chat_stream(user_text, run_id=run_id, category=category):
+         raw = item.strip()
+         if not raw:
+             continue
+         try:
+             payload = json.loads(raw)
+         except Exception:
+             continue
+
+         event_type = payload.get("type")
+         if event_type == "delta":
+             content = payload.get("content")
+             if isinstance(content, str) and content:
+                 yield _ai_sse_event({"type": "text-delta", "id": text_id, "delta": content})
+             continue
+         if event_type == "status":
+             message = payload.get("message")
+             if isinstance(message, str) and message:
+                 yield _ai_sse_event(
+                     {"type": "data-status", "data": {"message": message}, "transient": True}
+                 )
+             continue
+         if event_type == "error":
+             message = payload.get("message")
+             if not isinstance(message, str) or not message:
+                 message = "채팅 요청에 실패했습니다."
+             yield _ai_sse_event({"type": "error", "errorText": message})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+             return
+         if event_type == "final":
+             yield _ai_sse_event({"type": "text-end", "id": text_id})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+             return
+
+     yield _ai_sse_event({"type": "finish"})
+     yield _ai_sse_done()
+
+
  def _event(payload: dict[str, Any]) -> str:
      return json.dumps(payload, ensure_ascii=False) + "\n"
 
@@ -490,15 +927,45 @@ async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
      yield _event({"type": "final", "content": answer})


- async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+ async def _chat_stream(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> AsyncGenerator[str, None]:
      started_at = time.perf_counter()
-     if len(user_text) <= 4:
-         yield _event({"type": "final", "content": "안녕하세요! EvalVault 관련 질문을 해주세요."})
+     simple_mode = os.getenv("EVALVAULT_CHAT_SIMPLE_MODE", "false").lower() == "true"
+     run_context_enabled = os.getenv("EVALVAULT_CHAT_RUN_CONTEXT_ENABLED", "true").lower() == "true"
+     if simple_mode:
+         yield _event({"type": "status", "message": "간단 채팅 처리 중..."})
+         answer = await _direct_chat_answer(user_text)
+         if answer:
+             async for item in _emit_answer(answer):
+                 yield item
+         else:
+             yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
+         return
+     if category in {"result_interpretation", "improvement_direction"} and not run_id:
+         yield _event(
+             {
+                 "type": "final",
+                 "content": "선택한 분류는 run_id가 필요합니다. run_id를 선택한 뒤 다시 질문해주세요.",
+             }
+         )
          return

-     if len(user_text) <= 6:
+     if len(user_text) <= 4:
+         if run_id or category:
+             user_text = f"{user_text}"
+         else:
+             yield _event(
+                 {
+                     "type": "final",
+                     "content": "무엇을 설명할까요? run_id와 질문 분류를 선택한 뒤 다시 요청해주세요.",
+                 }
+             )
+             return
+
+     if len(user_text) <= 6 and not _is_verb_only(user_text):
          yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
-         answer = await _direct_chat_answer(user_text)
+         answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
          if answer:
              async for item in _emit_answer(answer):
                  yield item
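
`_chat_stream` emits newline-delimited JSON events (`{"type": "status" | "delta" | "error" | "final", ...}` per the `_event`/`_emit_answer` helpers), so it can be driven directly from a test harness. A minimal sketch, assuming the package is installed and a local Ollama server is reachable; `EVALVAULT_CHAT_SIMPLE_MODE` short-circuits straight to `_direct_chat_answer`:

```python
import asyncio
import json
import os

from evalvault.adapters.inbound.api.routers.chat import _chat_stream

async def main() -> None:
    # Simple mode bypasses routing, run-context lookup, and RAG entirely.
    os.environ["EVALVAULT_CHAT_SIMPLE_MODE"] = "true"
    async for line in _chat_stream("EvalVault란 무엇인가요?"):
        event = json.loads(line)  # one JSON object per newline-delimited line
        print(event["type"], event.get("content") or event.get("message", ""))

asyncio.run(main())
```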
@@ -506,9 +973,46 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
          yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
          return

+     if (
+         _is_verb_only(user_text)
+         and category in {"result_interpretation", "improvement_direction"}
+         and run_id
+         and run_context_enabled
+     ):
+         yield _event({"type": "status", "message": "선택한 run 요약 중..."})
+         try:
+             result = await asyncio.wait_for(
+                 _call_mcp_tool("get_run_summary", {"run_id": run_id}), timeout=12
+             )
+         except TimeoutError:
+             yield _event(
+                 {
+                     "type": "error",
+                     "message": "run 요약 응답이 지연됩니다. 잠시 후 다시 시도해주세요.",
+                 }
+             )
+             return
+         except Exception as exc:
+             yield _event({"type": "error", "message": f"run 요약 실패: {exc}"})
+             return
+
+         payload = _extract_json_content(result)
+         if isinstance(payload, dict):
+             summary = _summarize_result("get_run_summary", payload)
+             if category == "improvement_direction":
+                 summary += "\n\n개선 방향을 구체화하려면 목표 메트릭이나 기준을 알려주세요."
+             else:
+                 summary += "\n\n특정 메트릭/케이스가 있으면 알려주세요."
+             async for item in _emit_answer(summary):
+                 yield item
+             return
+
      yield _event({"type": "status", "message": "요청 분류 중..."})
      try:
-         router = await asyncio.wait_for(_resolve_tool_with_llm(user_text), timeout=20)
+         router = await asyncio.wait_for(
+             _resolve_tool_with_llm(user_text, run_id=run_id, category=category),
+             timeout=30,
+         )
      except TimeoutError:
          router = None
      except Exception:
@@ -520,7 +1024,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      if router is None:
          yield _event({"type": "status", "message": "문서 검색 중..."})
          try:
-             rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+             rag_answer = await asyncio.wait_for(
+                 _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+             )
          except TimeoutError:
              yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
              return
@@ -528,7 +1034,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
          async for item in _emit_answer(rag_answer):
              yield item
          return
-     answer = await _direct_chat_answer(user_text)
+     answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
      if answer:
          async for item in _emit_answer(answer):
              yield item
@@ -541,7 +1047,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      tool_args = router.get("arguments", {})

      if action == "direct":
-         answer = await _direct_chat_answer(user_text)
+         answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
          if answer:
              async for item in _emit_answer(answer):
                  yield item
@@ -552,7 +1058,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      if action == "rag":
          yield _event({"type": "status", "message": "문서 검색 중..."})
          try:
-             rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+             rag_answer = await asyncio.wait_for(
+                 _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+             )
          except TimeoutError:
              yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
              return
@@ -571,9 +1079,39 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
          yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
          return

+     if tool_name == "get_run_summary" and not (tool_args.get("run_id") or run_id):
+         yield _event({"type": "final", "content": "run_id를 선택하거나 입력해주세요."})
+         return
+     if tool_name == "get_artifacts" and not (tool_args.get("run_id") or run_id):
+         yield _event({"type": "final", "content": "아티팩트 조회를 위해 run_id가 필요합니다."})
+         return
+     if not run_context_enabled and tool_name in {"get_run_summary", "get_artifacts"}:
+         yield _event(
+             {
+                 "type": "final",
+                 "content": "run 요약/아티팩트 조회가 비활성화되어 있습니다.",
+             }
+         )
+         return
+     if tool_name == "analyze_compare" and (
+         not tool_args.get("run_id_a") or not tool_args.get("run_id_b")
+     ):
+         yield _event(
+             {
+                 "type": "final",
+                 "content": "비교 분석에는 run_id 두 개가 필요합니다. 비교할 run을 알려주세요.",
+             }
+         )
+         return
+
      yield _event({"type": "status", "message": "도구 실행 중..."})
      try:
-         result = await asyncio.wait_for(_call_mcp_tool(tool_name, tool_args), timeout=12)
+         enhanced_tool_args = dict(tool_args)
+         if run_id:
+             enhanced_tool_args["run_id"] = run_id
+         if category:
+             enhanced_tool_args["category"] = category
+         result = await asyncio.wait_for(_call_mcp_tool(tool_name, enhanced_tool_args), timeout=12)
      except TimeoutError:
          yield _event(
              {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
@@ -615,3 +1153,32 @@ async def chat_stream(request: ChatRequest):
              yield item

      return StreamingResponse(event_generator(), media_type="application/x-ndjson")
+
+
+ @router.post("/ai-stream")
+ async def ai_chat_stream(request: AiChatRequest):
+     user_text = _extract_last_user_message(request.messages)
+     run_id = request.run_id
+     category = request.category
+     if not user_text:
+
+         async def error_generator():
+             yield _ai_sse_event({"type": "error", "errorText": "질문을 입력해주세요."})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+
+         return StreamingResponse(
+             error_generator(),
+             media_type="text/event-stream",
+             headers=_ai_sse_headers(),
+         )
+
+     async def event_generator():
+         async for item in _ai_chat_stream(user_text, run_id=run_id, category=category):
+             yield item
+
+     return StreamingResponse(
+         event_generator(),
+         media_type="text/event-stream",
+         headers=_ai_sse_headers(),
+     )
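
A sketch of a client for the new endpoint. The host and mount prefix are not visible in this diff, so the URL below is an assumption (only the `/ai-stream` suffix comes from the route decorator); the request body mirrors `AiChatRequest`, and the response is parsed as SSE:

```python
import httpx

# Prefix assumed; only the "/ai-stream" suffix is confirmed by this diff.
URL = "http://localhost:8000/api/chat/ai-stream"

payload = {
    "messages": [{"role": "user", "content": "요약해줘"}],
    "run_id": "run_abc123",  # illustrative run id
    "category": "result_interpretation",
}

with httpx.stream("POST", URL, json=payload, timeout=None) as response:
    for line in response.iter_lines():
        if line.startswith("data: "):
            data = line[len("data: "):]
            if data == "[DONE]":
                break
            print(data)  # JSON events: start, text-start, text-delta, finish, ...
```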