evalvault 1.73.2__py3-none-any.whl → 1.74.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ from evalvault.adapters.outbound.analysis import (
  )
  from evalvault.adapters.outbound.cache import MemoryCacheAdapter
  from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+ from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
  from evalvault.adapters.outbound.report import MarkdownReportAdapter
  from evalvault.config.phoenix_support import PhoenixExperimentResolver
  from evalvault.config.settings import Settings
@@ -43,6 +44,7 @@ from evalvault.domain.services.analysis_service import AnalysisService
  from evalvault.domain.services.cluster_map_builder import build_cluster_map
  from evalvault.domain.services.debug_report_service import DebugReportService
  from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+ from evalvault.domain.services.ops_report_service import OpsReportService
  from evalvault.domain.services.prompt_registry import (
      PromptInput,
      build_prompt_bundle,
@@ -1329,6 +1331,42 @@ class WebUIAdapter:
              stage_storage=stage_storage,
          )

+     def generate_ops_report(
+         self,
+         run_id: str,
+         *,
+         output_format: str,
+         save: bool,
+     ) -> dict[str, Any] | str:
+         if self._storage is None:
+             raise RuntimeError("Storage not configured")
+         if not hasattr(self._storage, "list_stage_events"):
+             raise RuntimeError("Stage storage not configured")
+
+         service = OpsReportService()
+         stage_storage = cast(StageStoragePort, self._storage)
+         report = service.build_report(
+             run_id,
+             storage=self._storage,
+             stage_storage=stage_storage,
+         )
+
+         content = render_markdown(report) if output_format == "markdown" else render_json(report)
+
+         if save:
+             self._storage.save_ops_report(
+                 report_id=None,
+                 run_id=run_id,
+                 report_type="ops_report",
+                 format=output_format,
+                 content=content,
+                 metadata={"source": "api"},
+             )
+
+         if output_format == "markdown":
+             return content
+         return report.to_dict()
+
      def delete_run(self, run_id: str) -> bool:
          """평가 삭제.

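The new `WebUIAdapter.generate_ops_report` above is the API-side entry point for ops reports. A minimal sketch of the calling contract (illustrative: `web_ui` stands in for an already-wired adapter instance and "run_123" is a placeholder run id):

    # `web_ui` is an already-constructed WebUIAdapter (wiring elided); its storage
    # must also implement the stage-storage methods, or RuntimeError is raised.
    markdown_text = web_ui.generate_ops_report("run_123", output_format="markdown", save=True)
    report_dict = web_ui.generate_ops_report("run_123", output_format="json", save=False)

Note the asymmetry: the markdown path returns the rendered text, while any other format persists the rendered JSON but returns the report as a dict.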
@@ -44,10 +44,62 @@ class ChatRequest(BaseModel):
      history: list[ChatMessage] | None = None


+ class AiChatMessage(BaseModel):
+     role: str
+     content: str | None = None
+     parts: list[dict[str, Any]] | None = None
+
+
+ class AiChatRequest(BaseModel):
+     messages: list[AiChatMessage] = Field(default_factory=list)
+     run_id: str | None = None
+     category: str | None = None
+
+
  def _extract_run_ids(text: str) -> list[str]:
      return re.findall(r"run_[A-Za-z0-9_-]+", text)


+ def _ollama_chat_options(model_name: str) -> dict[str, Any] | None:
+     lower = model_name.lower()
+     if lower.startswith("qwen3"):
+         return {
+             "temperature": 0.6,
+             "top_p": 0.95,
+             "top_k": 20,
+             "repeat_penalty": 1,
+             "stop": ["<|im_start|>", "<|im_end|>"],
+         }
+     return None
+
+
+ def _is_verb_only(text: str) -> bool:
+     if not text:
+         return False
+     compact = re.sub(r"\s+", "", text.strip())
+     if not compact:
+         return False
+     tokens = re.findall(r"[A-Za-z0-9가-힣]+", compact)
+     if len(tokens) > 2:
+         return False
+     verb_markers = ["해줘", "해주세요", "해봐", "해봐요", "해줘요", "해줘라"]
+     verb_stems = ["설명", "요약", "분석", "비교", "개선", "정리", "추천", "진단", "해석", "검증"]
+     if any(compact.endswith(marker) for marker in verb_markers):
+         return any(stem in compact for stem in verb_stems)
+     return compact in verb_stems
+
+
+ def _with_context(user_text: str, run_id: str | None, category: str | None) -> str:
+     parts = []
+     if run_id:
+         parts.append(f"선택된 run_id: {run_id}")
+     if category:
+         parts.append(f"질문 분류: {category}")
+     if not parts:
+         return user_text
+     return "\n".join(parts) + f"\n사용자 요청: {user_text}"
+
+
  def _format_tool_result(result: Any) -> str:
      if isinstance(result, dict):
          if "result" in result:
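Traced by hand from the helpers above, here is how the verb-only detection and context prefixing behave. These are private functions, so importing them is illustration only, not a supported API:

    from evalvault.adapters.inbound.api.routers.chat import _is_verb_only, _with_context

    assert _is_verb_only("요약") is True              # bare verb stem
    assert _is_verb_only("설명해줘") is True           # stem "설명" + marker "해줘"
    assert _is_verb_only("EvalVault가 뭐야?") is False  # no stem, no marker
    assert _is_verb_only("") is False

    print(_with_context("설명해줘", "run_abc", "result_interpretation"))
    # 선택된 run_id: run_abc
    # 질문 분류: result_interpretation
    # 사용자 요청: 설명해줘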
@@ -283,15 +335,29 @@ async def warm_rag_index() -> None:
          logger.warning("RAG preload failed: %s", exc)


- async def _direct_chat_answer(user_text: str) -> str | None:
+ async def _direct_chat_answer(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> str | None:
+     user_text = _with_context(user_text, run_id, category)
+     model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+     options = _ollama_chat_options(model_name)
      payload = {
-         "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+         "model": model_name,
          "messages": [
-             {"role": "system", "content": "You are a helpful assistant for EvalVault."},
+             {
+                 "role": "system",
+                 "content": (
+                     "You are a helpful assistant for EvalVault. "
+                     "Interpret verb-only requests as questions about the selected run/category. "
+                     "If essential details are missing, ask a concise follow-up question in Korean."
+                 ),
+             },
              {"role": "user", "content": user_text},
          ],
          "stream": False,
      }
+     if options:
+         payload["options"] = options

      async with httpx.AsyncClient(timeout=30) as client:
          response = await client.post(
@@ -318,7 +384,9 @@ def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
      return [text for _, text in scored[:top_k]]


- async def _rag_answer(user_text: str) -> str | None:
+ async def _rag_answer(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> str | None:
      retriever, _ = await _get_rag_retriever()
      contexts: list[str] = []

@@ -340,22 +408,28 @@ async def _rag_answer(user_text: str) -> str | None:

      prompt = (
          "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
-         "컨텍스트만 근거로 사용해 한국어로 답하세요.\n\n"
+         "컨텍스트만 근거로 사용해 한국어로 답하세요.\n"
+         "질문이 동사만 있는 경우에도 선택된 run_id/분류를 기준으로 해석하세요.\n"
+         "정보가 부족하면 먼저 필요한 정보를 질문하세요.\n\n"
          "[컨텍스트]\n"
          + "\n\n---\n\n".join(contexts[:3])
          + "\n\n[질문]\n"
-         + user_text
+         + _with_context(user_text, run_id, category)
          + "\n\n[답변]"
      )

+     model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+     options = _ollama_chat_options(model_name)
      payload = {
-         "model": os.getenv("OLLAMA_CHAT_MODEL", "gpt-oss-safeguard:20b"),
+         "model": model_name,
          "messages": [
              {"role": "system", "content": "You are a helpful assistant for EvalVault."},
              {"role": "user", "content": prompt},
          ],
          "stream": False,
      }
+     if options:
+         payload["options"] = options

      async with httpx.AsyncClient(timeout=60) as client:
          response = await client.post(
@@ -388,7 +462,9 @@ async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
      return data


- async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+ async def _resolve_tool_with_llm(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> dict[str, Any] | None:
      ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
      router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")

@@ -398,6 +474,8 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
          "Action must be one of: tool, rag, direct."
          "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
          "Rules:"
+         "- Assume verb-only requests refer to the selected run_id/category when provided."
+         "- If essential info is missing (e.g., run_id), return action direct with a follow-up question."
          "- If user asks about datasets, prefer tool list_datasets."
          "- If question is about EvalVault docs/usage, prefer rag."
          "- If greeting or general chat, use direct."
@@ -413,7 +491,7 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
          "model": router_model,
          "messages": [
              {"role": "system", "content": system_prompt},
-             {"role": "user", "content": user_text},
+             {"role": "user", "content": _with_context(user_text, run_id, category)},
          ],
          "stream": False,
      }
@@ -479,6 +557,99 @@ def _chunk_text(text: str, size: int = 42) -> list[str]:
      return [text[i : i + size] for i in range(0, len(text), size)]


+ def _extract_text_from_parts(parts: list[dict[str, Any]] | None) -> str | None:
+     if not parts:
+         return None
+     chunks: list[str] = []
+     for part in parts:
+         if not isinstance(part, dict):
+             continue
+         if part.get("type") == "text":
+             text = part.get("text")
+             if isinstance(text, str) and text:
+                 chunks.append(text)
+     if not chunks:
+         return None
+     content = "".join(chunks).strip()
+     return content or None
+
+
+ def _extract_last_user_message(messages: list[AiChatMessage]) -> str | None:
+     for message in reversed(messages):
+         if message.role != "user":
+             continue
+         if message.content and message.content.strip():
+             return message.content.strip()
+         content = _extract_text_from_parts(message.parts)
+         if content:
+             return content
+     return None
+
+
+ def _ai_sse_event(payload: dict[str, Any]) -> str:
+     return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
+
+
+ def _ai_sse_done() -> str:
+     return "data: [DONE]\n\n"
+
+
+ def _ai_sse_headers() -> dict[str, str]:
+     return {
+         "Cache-Control": "no-cache",
+         "Connection": "keep-alive",
+         "x-vercel-ai-ui-message-stream": "v1",
+     }
+
+
+ async def _ai_chat_stream(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> AsyncGenerator[str, None]:
+     message_id = f"msg_{int(time.time() * 1000)}"
+     text_id = f"text_{message_id}"
+     yield _ai_sse_event({"type": "start", "messageId": message_id})
+     yield _ai_sse_event({"type": "text-start", "id": text_id})
+
+     async for item in _chat_stream(user_text, run_id=run_id, category=category):
+         raw = item.strip()
+         if not raw:
+             continue
+         try:
+             payload = json.loads(raw)
+         except Exception:
+             continue
+
+         event_type = payload.get("type")
+         if event_type == "delta":
+             content = payload.get("content")
+             if isinstance(content, str) and content:
+                 yield _ai_sse_event({"type": "text-delta", "id": text_id, "delta": content})
+             continue
+         if event_type == "status":
+             message = payload.get("message")
+             if isinstance(message, str) and message:
+                 yield _ai_sse_event(
+                     {"type": "data-status", "data": {"message": message}, "transient": True}
+                 )
+             continue
+         if event_type == "error":
+             message = payload.get("message")
+             if not isinstance(message, str) or not message:
+                 message = "채팅 요청에 실패했습니다."
+             yield _ai_sse_event({"type": "error", "errorText": message})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+             return
+         if event_type == "final":
+             yield _ai_sse_event({"type": "text-end", "id": text_id})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+             return
+
+     yield _ai_sse_event({"type": "finish"})
+     yield _ai_sse_done()
+
+
  def _event(payload: dict[str, Any]) -> str:
      return json.dumps(payload, ensure_ascii=False) + "\n"

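The `_ai_sse_*` helpers emit the UI-message-stream framing advertised by the `x-vercel-ai-ui-message-stream: v1` header. A quick look at the wire format they produce (illustrative values, importing private helpers purely for demonstration):

    from evalvault.adapters.inbound.api.routers.chat import _ai_sse_event, _ai_sse_done

    print(_ai_sse_event({"type": "start", "messageId": "msg_1"}), end="")
    # data: {"type": "start", "messageId": "msg_1"}
    print(_ai_sse_event({"type": "text-delta", "id": "text_msg_1", "delta": "안녕"}), end="")
    # data: {"type": "text-delta", "id": "text_msg_1", "delta": "안녕"}
    print(_ai_sse_done(), end="")
    # data: [DONE]

Each frame is a `data:` line followed by a blank line, as server-sent events require.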
@@ -490,15 +661,34 @@ async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
      yield _event({"type": "final", "content": answer})


- async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+ async def _chat_stream(
+     user_text: str, run_id: str | None = None, category: str | None = None
+ ) -> AsyncGenerator[str, None]:
      started_at = time.perf_counter()
-     if len(user_text) <= 4:
-         yield _event({"type": "final", "content": "안녕하세요! EvalVault 관련 질문을 해주세요."})
+     if category in {"result_interpretation", "improvement_direction"} and not run_id:
+         yield _event(
+             {
+                 "type": "final",
+                 "content": "선택한 분류는 run_id가 필요합니다. run_id를 선택한 뒤 다시 질문해주세요.",
+             }
+         )
          return

-     if len(user_text) <= 6:
+     if len(user_text) <= 4:
+         if run_id or category:
+             user_text = f"{user_text}"
+         else:
+             yield _event(
+                 {
+                     "type": "final",
+                     "content": "무엇을 설명할까요? run_id와 질문 분류를 선택한 뒤 다시 요청해주세요.",
+                 }
+             )
+             return
+
+     if len(user_text) <= 6 and not _is_verb_only(user_text):
          yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
-         answer = await _direct_chat_answer(user_text)
+         answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
          if answer:
              async for item in _emit_answer(answer):
                  yield item
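With the new gating, `_chat_stream` answers immediately (as newline-delimited JSON frames) when a run-scoped category arrives without a run_id. A small driver, again importing a private generator purely for illustration:

    import asyncio
    from evalvault.adapters.inbound.api.routers.chat import _chat_stream

    async def main() -> None:
        async for frame in _chat_stream("개선해줘", run_id=None, category="improvement_direction"):
            print(frame, end="")
            # -> {"type": "final", "content": "선택한 분류는 run_id가 필요합니다. ..."}

    asyncio.run(main())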
@@ -506,9 +696,45 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
          yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
          return

+     if (
+         _is_verb_only(user_text)
+         and category in {"result_interpretation", "improvement_direction"}
+         and run_id
+     ):
+         yield _event({"type": "status", "message": "선택한 run 요약 중..."})
+         try:
+             result = await asyncio.wait_for(
+                 _call_mcp_tool("get_run_summary", {"run_id": run_id}), timeout=12
+             )
+         except TimeoutError:
+             yield _event(
+                 {
+                     "type": "error",
+                     "message": "run 요약 응답이 지연됩니다. 잠시 후 다시 시도해주세요.",
+                 }
+             )
+             return
+         except Exception as exc:
+             yield _event({"type": "error", "message": f"run 요약 실패: {exc}"})
+             return
+
+         payload = _extract_json_content(result)
+         if isinstance(payload, dict):
+             summary = _summarize_result("get_run_summary", payload)
+             if category == "improvement_direction":
+                 summary += "\n\n개선 방향을 구체화하려면 목표 메트릭이나 기준을 알려주세요."
+             else:
+                 summary += "\n\n특정 메트릭/케이스가 있으면 알려주세요."
+             async for item in _emit_answer(summary):
+                 yield item
+             return
+
      yield _event({"type": "status", "message": "요청 분류 중..."})
      try:
-         router = await asyncio.wait_for(_resolve_tool_with_llm(user_text), timeout=20)
+         router = await asyncio.wait_for(
+             _resolve_tool_with_llm(user_text, run_id=run_id, category=category),
+             timeout=30,
+         )
      except TimeoutError:
          router = None
      except Exception:
@@ -520,7 +746,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      if router is None:
          yield _event({"type": "status", "message": "문서 검색 중..."})
          try:
-             rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+             rag_answer = await asyncio.wait_for(
+                 _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+             )
          except TimeoutError:
              yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
              return
@@ -528,7 +756,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
              async for item in _emit_answer(rag_answer):
                  yield item
              return
-         answer = await _direct_chat_answer(user_text)
+         answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
          if answer:
              async for item in _emit_answer(answer):
                  yield item
@@ -541,7 +769,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      tool_args = router.get("arguments", {})

      if action == "direct":
-         answer = await _direct_chat_answer(user_text)
+         answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
          if answer:
              async for item in _emit_answer(answer):
                  yield item
@@ -552,7 +780,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
      if action == "rag":
          yield _event({"type": "status", "message": "문서 검색 중..."})
          try:
-             rag_answer = await asyncio.wait_for(_rag_answer(user_text), timeout=30)
+             rag_answer = await asyncio.wait_for(
+                 _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+             )
          except TimeoutError:
              yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
              return
@@ -571,9 +801,31 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
          yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
          return

+     if tool_name == "get_run_summary" and not (tool_args.get("run_id") or run_id):
+         yield _event({"type": "final", "content": "run_id를 선택하거나 입력해주세요."})
+         return
+     if tool_name == "get_artifacts" and not (tool_args.get("run_id") or run_id):
+         yield _event({"type": "final", "content": "아티팩트 조회를 위해 run_id가 필요합니다."})
+         return
+     if tool_name == "analyze_compare" and (
+         not tool_args.get("run_id_a") or not tool_args.get("run_id_b")
+     ):
+         yield _event(
+             {
+                 "type": "final",
+                 "content": "비교 분석에는 run_id 두 개가 필요합니다. 비교할 run을 알려주세요.",
+             }
+         )
+         return
+
      yield _event({"type": "status", "message": "도구 실행 중..."})
      try:
-         result = await asyncio.wait_for(_call_mcp_tool(tool_name, tool_args), timeout=12)
+         enhanced_tool_args = dict(tool_args)
+         if run_id:
+             enhanced_tool_args["run_id"] = run_id
+         if category:
+             enhanced_tool_args["category"] = category
+         result = await asyncio.wait_for(_call_mcp_tool(tool_name, enhanced_tool_args), timeout=12)
      except TimeoutError:
          yield _event(
              {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
@@ -615,3 +867,32 @@ async def chat_stream(request: ChatRequest):
              yield item

      return StreamingResponse(event_generator(), media_type="application/x-ndjson")
+
+
+ @router.post("/ai-stream")
+ async def ai_chat_stream(request: AiChatRequest):
+     user_text = _extract_last_user_message(request.messages)
+     run_id = request.run_id
+     category = request.category
+     if not user_text:
+
+         async def error_generator():
+             yield _ai_sse_event({"type": "error", "errorText": "질문을 입력해주세요."})
+             yield _ai_sse_event({"type": "finish"})
+             yield _ai_sse_done()
+
+         return StreamingResponse(
+             error_generator(),
+             media_type="text/event-stream",
+             headers=_ai_sse_headers(),
+         )
+
+     async def event_generator():
+         async for item in _ai_chat_stream(user_text, run_id=run_id, category=category):
+             yield item
+
+     return StreamingResponse(
+         event_generator(),
+         media_type="text/event-stream",
+         headers=_ai_sse_headers(),
+     )
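A client-side sketch for the new endpoint. The mount prefix of this router is not shown in the diff, so the URL below is an assumption; adjust it to your deployment:

    import httpx

    payload = {
        "messages": [{"role": "user", "content": "설명해줘"}],
        "run_id": "run_123",
        "category": "result_interpretation",
    }

    with httpx.Client(base_url="http://localhost:8000") as client:  # assumed base URL
        with client.stream("POST", "/ai-stream", json=payload) as response:
            for line in response.iter_lines():
                if line.startswith("data: "):
                    print(line[len("data: "):])  # stream events, terminated by [DONE]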
@@ -0,0 +1,5 @@
+ """Ops report renderers."""
+
+ from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
+
+ __all__ = ["render_json", "render_markdown"]
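This re-export means either import path resolves to the same renderers:

    from evalvault.adapters.outbound.ops import render_json, render_markdown
    # equivalent to:
    # from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown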
@@ -0,0 +1,159 @@
+ from __future__ import annotations
+
+ import json
+
+ from evalvault.domain.entities.ops_report import OpsReport
+ from evalvault.domain.entities.stage import StageMetric, StageSummary
+
+
+ def render_markdown(report: OpsReport) -> str:
+     lines: list[str] = []
+     lines.append("# Ops Report")
+     lines.append("")
+     lines.extend(_render_run_summary(report.run_summary, report.metadata))
+     lines.append("")
+     lines.extend(_render_ops_kpis(report.ops_kpis))
+     lines.append("")
+     lines.extend(_render_stage_summary(report.stage_summary))
+     lines.append("")
+     lines.extend(_render_bottlenecks(report.bottlenecks))
+     lines.append("")
+     lines.extend(_render_recommendations(report.recommendations))
+     lines.append("")
+     lines.extend(_render_failing_metrics(report.stage_metrics))
+     return "\n".join(lines).strip()
+
+
+ def render_json(report: OpsReport) -> str:
+     payload = report.to_dict()
+     return json.dumps(payload, ensure_ascii=True, indent=2)
+
+
+ def _render_run_summary(summary: dict[str, object], metadata: dict[str, object]) -> list[str]:
+     run_id = summary.get("run_id", "-")
+     dataset = summary.get("dataset_name", "-")
+     version = summary.get("dataset_version", "-")
+     model = summary.get("model_name", "-")
+     started = summary.get("started_at", "-")
+     finished = summary.get("finished_at", "-")
+     duration = summary.get("duration_seconds", "-")
+     total_cases = summary.get("total_test_cases", "-")
+     pass_rate = summary.get("pass_rate", "-")
+     total_tokens = summary.get("total_tokens", "-")
+     total_cost = summary.get("total_cost_usd", "-")
+
+     lines = [
+         "## Run Summary",
+         f"- run_id: {run_id}",
+         f"- dataset: {dataset} ({version})",
+         f"- model: {model}",
+         f"- started_at: {started}",
+         f"- finished_at: {finished}",
+         f"- duration_seconds: {duration}",
+         f"- total_test_cases: {total_cases}",
+         f"- pass_rate: {pass_rate}",
+         f"- total_tokens: {total_tokens}",
+         f"- total_cost_usd: {total_cost}",
+     ]
+     trace_links: list[str] = []
+     if metadata.get("langfuse_trace_url"):
+         trace_links.append(f"langfuse_trace_url={metadata['langfuse_trace_url']}")
+     if metadata.get("phoenix_trace_url"):
+         trace_links.append(f"phoenix_trace_url={metadata['phoenix_trace_url']}")
+     if trace_links:
+         lines.append(f"- trace_links: {', '.join(trace_links)}")
+     return lines
+
+
+ def _render_stage_summary(summary: StageSummary | None) -> list[str]:
+     lines = ["## Stage Summary"]
+     if summary is None:
+         lines.append("- no stage events found")
+         return lines
+     lines.append(f"- total_events: {summary.total_events}")
+     if summary.missing_required_stage_types:
+         missing = ", ".join(summary.missing_required_stage_types)
+         lines.append(f"- missing_required_stage_types: {missing}")
+     if summary.stage_type_counts:
+         lines.append("- stage_type_counts:")
+         for stage_type, count in summary.stage_type_counts.items():
+             lines.append(f" - {stage_type}: {count}")
+     if summary.stage_type_avg_durations:
+         lines.append("- stage_type_avg_durations_ms:")
+         for stage_type, duration in summary.stage_type_avg_durations.items():
+             lines.append(f" - {stage_type}: {duration:.3f}")
+     return lines
+
+
+ def _render_ops_kpis(kpis: dict[str, object]) -> list[str]:
+     lines = ["## Ops KPIs"]
+     lines.append(f"- total_test_cases: {kpis.get('total_test_cases', '-')}")
+     lines.append(f"- pass_rate: {kpis.get('pass_rate', '-')}")
+     lines.append(f"- failure_rate: {kpis.get('failure_rate', '-')}")
+     lines.append(f"- stage_error_rate: {kpis.get('stage_error_rate', '-')}")
+     lines.append(f"- stage_error_severity: {kpis.get('stage_error_severity', '-')}")
+     lines.append(f"- duration_seconds: {kpis.get('duration_seconds', '-')}")
+     lines.append(f"- total_tokens: {kpis.get('total_tokens', '-')}")
+     lines.append(f"- total_cost_usd: {kpis.get('total_cost_usd', '-')}")
+     lines.append(f"- avg_latency_ms: {kpis.get('avg_latency_ms', '-')}")
+     lines.append(f"- p95_latency_ms: {kpis.get('p95_latency_ms', '-')}")
+     lines.append(f"- avg_tokens_per_case: {kpis.get('avg_tokens_per_case', '-')}")
+     lines.append(f"- avg_cost_per_case_usd: {kpis.get('avg_cost_per_case_usd', '-')}")
+     return lines
+
+
+ def _render_bottlenecks(bottlenecks: list[dict[str, object]]) -> list[str]:
+     lines = ["## Ops Signals"]
+     if not bottlenecks:
+         lines.append("- none")
+         return lines
+     for entry in bottlenecks:
+         entry_type = entry.get("type", "unknown")
+         if entry_type == "latency":
+             stage_type = entry.get("stage_type", "-")
+             duration = entry.get("avg_duration_ms", "-")
+             lines.append(f"- latency: {stage_type} avg_duration_ms={duration}")
+         elif entry_type == "missing_stage":
+             stage_type = entry.get("stage_type", "-")
+             lines.append(f"- missing_stage: {stage_type}")
+         else:
+             lines.append(f"- {entry_type}: {entry}")
+     return lines
+
+
+ def _render_recommendations(recommendations: list[str]) -> list[str]:
+     lines = ["## Recommendations"]
+     if not recommendations:
+         lines.append("- none")
+         return lines
+     for item in recommendations:
+         lines.append(f"- {item}")
+     return lines
+
+
+ def _render_failing_metrics(metrics: list[StageMetric]) -> list[str]:
+     lines = ["## Failing Stage Metrics"]
+     failing = [metric for metric in metrics if metric.passed is False]
+     if not failing:
+         lines.append("- none")
+         return lines
+
+     failing_sorted = sorted(failing, key=_metric_severity, reverse=True)[:20]
+     for metric in failing_sorted:
+         threshold = metric.threshold if metric.threshold is not None else "-"
+         lines.append(
+             f"- {metric.metric_name}: score={metric.score} threshold={threshold} "
+             f"stage_id={metric.stage_id}"
+         )
+     return lines
+
+
+ def _metric_severity(metric: StageMetric) -> float:
+     if metric.threshold is None:
+         return 0.0
+     comparison = None
+     if isinstance(metric.evidence, dict):
+         comparison = metric.evidence.get("comparison")
+     if isinstance(comparison, str) and comparison.lower() in {"max", "<=", "le"}:
+         return max(metric.score - metric.threshold, 0.0)
+     return max(metric.threshold - metric.score, 0.0)
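A minimal render, using the `OpsReport` entity introduced later in this diff; the renderer degrades gracefully when sections are empty, and the recommendation string below is illustrative:

    from evalvault.adapters.outbound.ops import render_markdown
    from evalvault.domain.entities.ops_report import OpsReport

    report = OpsReport(
        run_summary={"run_id": "run_123", "model_name": "gpt-4o-mini", "pass_rate": 0.9},
        ops_kpis={"pass_rate": 0.9, "failure_rate": 0.1},
        stage_summary=None,
        stage_metrics=[],
        bottlenecks=[],
        recommendations=["[high] retriever: raise top_k"],
        metadata={},
    )
    print(render_markdown(report))
    # "# Ops Report", then Run Summary and Ops KPIs with "-" for missing values,
    # "## Stage Summary" with "- no stage events found", "## Ops Signals" and
    # "## Failing Stage Metrics" with "- none", and the recommendation line.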
@@ -1128,6 +1128,96 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
              )
          return reports

+     def save_ops_report(
+         self,
+         *,
+         report_id: str | None,
+         run_id: str | None,
+         report_type: str,
+         format: str,
+         content: str | None,
+         metadata: dict[str, Any] | None = None,
+         created_at: str | None = None,
+     ) -> str:
+         report_id = report_id or str(uuid.uuid4())
+         if created_at is None:
+             created_at_value = datetime.now(UTC)
+         else:
+             created_at_value = (
+                 datetime.fromisoformat(created_at) if isinstance(created_at, str) else created_at
+             )
+
+         with self._get_connection() as conn:
+             conn.execute(
+                 """
+                 INSERT INTO ops_reports (
+                     report_id, run_id, report_type, format, content, metadata, created_at
+                 ) VALUES (%s, %s, %s, %s, %s, %s, %s)
+                 ON CONFLICT (report_id) DO UPDATE SET
+                     run_id = EXCLUDED.run_id,
+                     report_type = EXCLUDED.report_type,
+                     format = EXCLUDED.format,
+                     content = EXCLUDED.content,
+                     metadata = EXCLUDED.metadata,
+                     created_at = EXCLUDED.created_at
+                 """,
+                 (
+                     report_id,
+                     run_id,
+                     report_type,
+                     format,
+                     content,
+                     self._serialize_pipeline_json(metadata),
+                     created_at_value,
+                 ),
+             )
+             conn.commit()
+
+         return report_id
+
+     def list_ops_reports(
+         self,
+         *,
+         run_id: str,
+         report_type: str | None = None,
+         format: str | None = None,
+         limit: int = 20,
+     ) -> list[dict[str, Any]]:
+         clauses = ["run_id = %s"]
+         params: list[Any] = [run_id]
+         if report_type:
+             clauses.append("report_type = %s")
+             params.append(report_type)
+         if format:
+             clauses.append("format = %s")
+             params.append(format)
+         params.append(limit)
+
+         query = (
+             "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
+             "FROM ops_reports WHERE " + " AND ".join(clauses) + " ORDER BY created_at DESC LIMIT %s"
+         )
+
+         with self._get_connection() as conn:
+             rows = conn.execute(query, tuple(params)).fetchall()
+
+         reports: list[dict[str, Any]] = []
+         for row in rows:
+             reports.append(
+                 {
+                     "report_id": row["report_id"],
+                     "run_id": row["run_id"],
+                     "report_type": row["report_type"],
+                     "format": row["format"],
+                     "content": row["content"],
+                     "metadata": self._deserialize_json(row["metadata"]),
+                     "created_at": row["created_at"].isoformat()
+                     if isinstance(row["created_at"], datetime)
+                     else row["created_at"],
+                 }
+             )
+         return reports
+
      def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
          """파이프라인 분석 결과 목록을 조회합니다."""
          query = """
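Sketch of the new storage surface (the SQLite adapter further down exposes the same two methods; adapter construction is elided because its signature is not shown in this diff):

    # `storage` is an already-constructed PostgreSQLStorageAdapter (or the
    # SQLite equivalent); both now implement save_ops_report/list_ops_reports.
    report_id = storage.save_ops_report(
        report_id=None,              # None -> a fresh uuid4 is generated
        run_id="run_123",
        report_type="ops_report",
        format="markdown",
        content="# Ops Report\n...",
        metadata={"source": "api"},
    )
    recent = storage.list_ops_reports(run_id="run_123", report_type="ops_report", limit=5)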
@@ -241,6 +241,19 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
  CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
  CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);

+ -- Ops reports table
+ CREATE TABLE IF NOT EXISTS ops_reports (
+     report_id UUID PRIMARY KEY,
+     run_id UUID REFERENCES evaluation_runs(run_id) ON DELETE SET NULL,
+     report_type VARCHAR(50) NOT NULL,
+     format VARCHAR(20) NOT NULL,
+     content TEXT,
+     metadata JSONB,
+     created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
+
  -- Analysis pipeline results table
  CREATE TABLE IF NOT EXISTS pipeline_results (
      result_id UUID PRIMARY KEY,
@@ -271,6 +271,20 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
  CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
  CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);

+ -- Ops reports table
+ CREATE TABLE IF NOT EXISTS ops_reports (
+     report_id TEXT PRIMARY KEY,
+     run_id TEXT,
+     report_type TEXT NOT NULL, -- 'ops_report', 'ops_snapshot'
+     format TEXT NOT NULL, -- 'markdown', 'json'
+     content TEXT, -- Report content (markdown/json) or file path
+     metadata TEXT, -- JSON metadata
+     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+     FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE SET NULL
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
+
  -- Analysis pipeline results table
  CREATE TABLE IF NOT EXISTS pipeline_results (
      result_id TEXT PRIMARY KEY,
@@ -1211,6 +1211,83 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
              )
          return reports

+     def save_ops_report(
+         self,
+         *,
+         report_id: str | None,
+         run_id: str | None,
+         report_type: str,
+         format: str,
+         content: str | None,
+         metadata: dict[str, Any] | None = None,
+         created_at: str | None = None,
+     ) -> str:
+         report_id = report_id or str(uuid.uuid4())
+         created_at = created_at or datetime.now().isoformat()
+
+         with self._get_connection() as conn:
+             conn = cast(Any, conn)
+             conn.execute(
+                 """
+                 INSERT OR REPLACE INTO ops_reports (
+                     report_id, run_id, report_type, format, content, metadata, created_at
+                 ) VALUES (?, ?, ?, ?, ?, ?, ?)
+                 """,
+                 (
+                     report_id,
+                     run_id,
+                     report_type,
+                     format,
+                     content,
+                     self._serialize_json(metadata),
+                     created_at,
+                 ),
+             )
+             conn.commit()
+
+         return report_id
+
+     def list_ops_reports(
+         self,
+         *,
+         run_id: str,
+         report_type: str | None = None,
+         format: str | None = None,
+         limit: int = 20,
+     ) -> list[dict[str, Any]]:
+         query = (
+             "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
+             "FROM ops_reports WHERE run_id = ?"
+         )
+         params: list[Any] = [run_id]
+         if report_type:
+             query += " AND report_type = ?"
+             params.append(report_type)
+         if format:
+             query += " AND format = ?"
+             params.append(format)
+         query += " ORDER BY created_at DESC LIMIT ?"
+         params.append(limit)
+
+         with self._get_connection() as conn:
+             conn = cast(Any, conn)
+             rows = conn.execute(query, tuple(params)).fetchall()
+
+         reports: list[dict[str, Any]] = []
+         for row in rows:
+             reports.append(
+                 {
+                     "report_id": row["report_id"],
+                     "run_id": row["run_id"],
+                     "report_type": row["report_type"],
+                     "format": row["format"],
+                     "content": row["content"],
+                     "metadata": self._deserialize_json(row["metadata"]),
+                     "created_at": row["created_at"],
+                 }
+             )
+         return reports
+
      def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
          """파이프라인 분석 결과 목록을 조회합니다."""
          query = """
@@ -0,0 +1,40 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any
+
+ from evalvault.domain.entities.stage import StageMetric, StageSummary
+
+
+ @dataclass
+ class OpsReport:
+     run_summary: dict[str, Any]
+     ops_kpis: dict[str, Any]
+     stage_summary: StageSummary | None
+     stage_metrics: list[StageMetric]
+     bottlenecks: list[dict[str, Any]]
+     recommendations: list[str]
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self) -> dict[str, Any]:
+         return {
+             "run_summary": self.run_summary,
+             "ops_kpis": self.ops_kpis,
+             "stage_summary": _stage_summary_to_dict(self.stage_summary),
+             "stage_metrics": [metric.to_dict() for metric in self.stage_metrics],
+             "bottlenecks": self.bottlenecks,
+             "recommendations": self.recommendations,
+             "metadata": self.metadata,
+         }
+
+
+ def _stage_summary_to_dict(summary: StageSummary | None) -> dict[str, Any] | None:
+     if summary is None:
+         return None
+     return {
+         "run_id": summary.run_id,
+         "total_events": summary.total_events,
+         "stage_type_counts": summary.stage_type_counts,
+         "stage_type_avg_durations": summary.stage_type_avg_durations,
+         "missing_required_stage_types": summary.missing_required_stage_types,
+     }
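The serialization contract follows directly from the dataclass; for an empty report, `to_dict()` returns exactly:

    from evalvault.domain.entities.ops_report import OpsReport

    empty = OpsReport(
        run_summary={}, ops_kpis={}, stage_summary=None,
        stage_metrics=[], bottlenecks=[], recommendations=[],
    )
    assert empty.to_dict() == {
        "run_summary": {},
        "ops_kpis": {},
        "stage_summary": None,
        "stage_metrics": [],
        "bottlenecks": [],
        "recommendations": [],
        "metadata": {},
    }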
@@ -0,0 +1,192 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ from evalvault.config.langfuse_support import get_langfuse_trace_url
+ from evalvault.config.phoenix_support import get_phoenix_trace_url
+ from evalvault.domain.entities.ops_report import OpsReport
+ from evalvault.domain.entities.stage import StageEvent, StageMetric, StageSummary
+ from evalvault.domain.services.stage_metric_guide_service import StageMetricGuideService
+ from evalvault.domain.services.stage_metric_service import StageMetricService
+ from evalvault.domain.services.stage_summary_service import StageSummaryService
+ from evalvault.ports.outbound.stage_storage_port import StageStoragePort
+ from evalvault.ports.outbound.storage_port import StoragePort
+
+
+ class OpsReportService:
+     """Build an operational report for an evaluation run."""
+
+     def __init__(
+         self,
+         *,
+         metric_service: StageMetricService | None = None,
+         summary_service: StageSummaryService | None = None,
+         guide_service: StageMetricGuideService | None = None,
+     ) -> None:
+         self._metric_service = metric_service or StageMetricService()
+         self._summary_service = summary_service or StageSummaryService()
+         self._guide_service = guide_service or StageMetricGuideService()
+
+     def build_report(
+         self,
+         run_id: str,
+         *,
+         storage: StoragePort,
+         stage_storage: StageStoragePort,
+     ) -> OpsReport:
+         run = storage.get_run(run_id)
+         run_summary = run.to_summary_dict()
+         phoenix_trace_url = get_phoenix_trace_url(run.tracker_metadata)
+         langfuse_trace_url = get_langfuse_trace_url(run.tracker_metadata)
+
+         events = stage_storage.list_stage_events(run_id)
+         stage_summary = self._summarize_events(events)
+
+         stage_metrics = stage_storage.list_stage_metrics(run_id)
+         if not stage_metrics and events:
+             stage_metrics = self._metric_service.build_metrics(events)
+
+         bottlenecks = self._build_bottlenecks(stage_summary)
+         recommendations = self._build_recommendations(stage_metrics)
+
+         ops_kpis = self._build_ops_kpis(run, events)
+
+         metadata = {
+             "phoenix_trace_url": phoenix_trace_url,
+             "langfuse_trace_url": langfuse_trace_url,
+         }
+
+         return OpsReport(
+             run_summary=run_summary,
+             ops_kpis=ops_kpis,
+             stage_summary=stage_summary,
+             stage_metrics=stage_metrics,
+             bottlenecks=bottlenecks,
+             recommendations=recommendations,
+             metadata=metadata,
+         )
+
+     def _summarize_events(self, events: list[StageEvent]) -> StageSummary | None:
+         if not events:
+             return None
+         return self._summary_service.summarize(events)
+
+     def _build_bottlenecks(self, summary: StageSummary | None) -> list[dict[str, Any]]:
+         if summary is None:
+             return []
+         bottlenecks: list[dict[str, Any]] = []
+
+         for stage_type in summary.missing_required_stage_types:
+             bottlenecks.append(
+                 {
+                     "type": "missing_stage",
+                     "stage_type": stage_type,
+                     "detail": "required stage missing",
+                 }
+             )
+
+         durations = summary.stage_type_avg_durations
+         if durations:
+             top = sorted(durations.items(), key=lambda item: item[1], reverse=True)[:3]
+             for stage_type, duration in top:
+                 bottlenecks.append(
+                     {
+                         "type": "latency",
+                         "stage_type": stage_type,
+                         "avg_duration_ms": round(duration, 3),
+                     }
+                 )
+         return bottlenecks
+
+     def _build_recommendations(self, metrics: list[StageMetric]) -> list[str]:
+         if not metrics:
+             return []
+         guides = self._guide_service.build_guides(metrics)
+         recommendations: list[str] = []
+         for guide in guides:
+             top_action = guide.top_action
+             if top_action is None:
+                 continue
+             hint = top_action.implementation_hint or top_action.description
+             label = f"[{guide.priority.value}] {guide.component.value}"
+             if hint:
+                 recommendations.append(f"{label}: {top_action.title} - {hint}")
+             else:
+                 recommendations.append(f"{label}: {top_action.title}")
+         return recommendations
+
+     def _build_ops_kpis(self, run, events: list[StageEvent]) -> dict[str, Any]:
+         total_cases = run.total_test_cases
+         latencies = [r.latency_ms for r in run.results if r.latency_ms]
+         tokens_used = [r.tokens_used for r in run.results if r.tokens_used]
+         costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+
+         avg_latency = _average(latencies)
+         p95_latency = _percentile(latencies, 0.95)
+         avg_tokens = _average(tokens_used)
+         avg_cost = _average(costs)
+         pass_rate = run.pass_rate
+         failure_rate = None if pass_rate is None else max(0.0, 1.0 - pass_rate)
+
+         error_rate = _stage_error_rate(events)
+         error_severity = _stage_error_severity(error_rate)
+
+         return {
+             "total_test_cases": total_cases,
+             "pass_rate": pass_rate,
+             "failure_rate": failure_rate,
+             "stage_error_rate": error_rate,
+             "stage_error_severity": error_severity,
+             "duration_seconds": run.duration_seconds,
+             "total_tokens": run.total_tokens,
+             "total_cost_usd": run.total_cost_usd,
+             "avg_latency_ms": avg_latency,
+             "p95_latency_ms": p95_latency,
+             "avg_tokens_per_case": avg_tokens,
+             "avg_cost_per_case_usd": avg_cost,
+         }
+
+
+ def _average(values: list[float | int]) -> float | None:
+     if not values:
+         return None
+     return float(sum(values)) / len(values)
+
+
+ def _percentile(values: list[float | int], ratio: float) -> float | None:
+     if not values:
+         return None
+     if ratio <= 0:
+         return float(min(values))
+     if ratio >= 1:
+         return float(max(values))
+     sorted_values = sorted(values)
+     index = int(round((len(sorted_values) - 1) * ratio))
+     return float(sorted_values[index])
+
+
+ def _stage_error_rate(events: list[StageEvent]) -> float | None:
+     if not events:
+         return None
+     total = len(events)
+     failure_statuses = {"failed", "error", "timeout", "aborted"}
+     success_statuses = {"success", "ok", "completed", "pass"}
+     failures = 0
+     for event in events:
+         status = str(event.status or "").strip().lower()
+         if status in failure_statuses:
+             failures += 1
+             continue
+         if status and status not in success_statuses:
+             failures += 1
+     return failures / total
+
+
+ def _stage_error_severity(rate: float | None) -> str | None:
+     if rate is None:
+         return None
+     if rate >= 0.05:
+         return "critical"
+     if rate >= 0.02:
+         return "warning"
+     return "ok"
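Worked numbers for the KPI helpers above (module-private; shown only to make the math concrete):

    values = [100, 120, 130, 150, 400]            # per-case latencies in ms
    index = int(round((len(values) - 1) * 0.95))  # round(3.8) -> 4 (nearest-rank p95)
    assert sorted(values)[index] == 400

    # _stage_error_severity bands: rate >= 0.05 -> "critical",
    # rate >= 0.02 -> "warning", otherwise "ok".
    # e.g. 1 failing stage event out of 40 -> rate 0.025 -> "warning".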
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: evalvault
- Version: 1.73.2
+ Version: 1.74.0
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -5,12 +5,12 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
  evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
  evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
- evalvault/adapters/inbound/api/adapter.py,sha256=Igg2grCUxQzMuvDOAhBK08wY0nxjmnvnaGS5rLVF3i4,83388
+ evalvault/adapters/inbound/api/adapter.py,sha256=B0kLz4aEsZ-WeCQlUKSanSz6r9gedr_JmiRIaso3HoA,84659
  evalvault/adapters/inbound/api/main.py,sha256=QgLxzHEy7aycGKIFLtN12tWTjnpWLtQ2XDXKV_2FDvg,7531
  evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
  evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
  evalvault/adapters/inbound/api/routers/calibration.py,sha256=ZnJSEW8hV-94S95lU_nDmzcLyaUoH1suM3sFUpJ3w5k,4130
- evalvault/adapters/inbound/api/routers/chat.py,sha256=hCA6rWr5GT_gCqu75uCqYwy2gOEUd85mlcc5y-ruFTY,20661
+ evalvault/adapters/inbound/api/routers/chat.py,sha256=lQfJTPPcA3H3IzxNpYSTBBAy-gSxCYYmDfEr4hCFp9M,30676
  evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
  evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
  evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
@@ -181,6 +181,8 @@ evalvault/adapters/outbound/nlp/korean/korean_evaluation.py,sha256=Mxwu3zhtdm8Te
  evalvault/adapters/outbound/nlp/korean/korean_stopwords.py,sha256=UemEFCJudg2EpsHg8uU2eR-iCh34kw4ZSVCRvnEC6a4,4293
  evalvault/adapters/outbound/nlp/korean/toolkit.py,sha256=EYGpd89ilpn4Wg5t8pALYt4Qi0aDHYOfXGuYbQx7do0,4246
  evalvault/adapters/outbound/nlp/korean/toolkit_factory.py,sha256=x3v-AAkVInOabC4PtOtStsZrFnHun0IOqZDyQGaQVm8,586
+ evalvault/adapters/outbound/ops/__init__.py,sha256=_QiDVPuiYWkIwW_ELEVKD_v6dLojjyvIJWs4qVNxehw,164
+ evalvault/adapters/outbound/ops/report_renderer.py,sha256=mezVKdIsnJSNvBW6xkhpNG3MOFXHZLZspmHk5o-e8Cg,6354
  evalvault/adapters/outbound/phoenix/sync_service.py,sha256=i6gHpNiZXKQ5yzV9B2TPb-P1N45k_Ck5ruzh3oqp4d8,9122
  evalvault/adapters/outbound/report/__init__.py,sha256=8VeMrfj63mDR-xUHct-drNNBA5M-m-B7sgC1qUJF7g4,660
  evalvault/adapters/outbound/report/ci_report_formatter.py,sha256=5YD8BwtOjLnHcNbbG0HJziOifD9BDhBtZT1oItd6zJE,1233
@@ -193,10 +195,10 @@ evalvault/adapters/outbound/retriever/graph_rag_adapter.py,sha256=xTI7uMFp4WKstg
  evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
  evalvault/adapters/outbound/storage/base_sql.py,sha256=NHHlBBhoDq8zbLfIvRoOswjB97Tro1sRE3cmbR8QnV8,58380
  evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
- evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=pyIPMliC7bsIXoRQ8l6a_svQUJ7or1XFxERT-DFJuf4,56854
- evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=WQNeOUm9UYWteh6k-PLrIhgkagIv_UPGCIi4nGzP9VA,11916
- evalvault/adapters/outbound/storage/schema.sql,sha256=ERgupB1bH1xqQr56GZHyVGfcUqr_7NGfg7U6f69W29Y,13746
- evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=9Z2A4C5QZEcQiQVt4h7fJMIbPSA3lV_u2bd2nXqDdOA,60491
+ evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=SZb4Dx2ZYNu-pDqX4rVgT4bRfJ2cQ75DXcWIIEu8aws,59933
+ evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=E-HMnGjrB05n2Z-zDq1kQzYSpYlPwg2Egm1e3MSRB7E,12335
+ evalvault/adapters/outbound/storage/schema.sql,sha256=LEtrKFpa1SbIBN-igkNiQXJqVLGp-liX-6KR_sZAMwM,14283
+ evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=Gsco1g_N1RHyRzIQIwelR-PU7kx57djHOc_hKEu_gCc,62940
  evalvault/adapters/outbound/tracer/__init__.py,sha256=xrvQQuAvF_UI02mKLMV7GTrG3zn836n5zwCRrrmhq_U,1054
  evalvault/adapters/outbound/tracer/open_rag_log_handler.py,sha256=aq96FIWD-bBaSkq-bygWhQArC9LWghSwi-S03Mga0mI,2827
  evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py,sha256=X_dlqUEwck_MQRGC-euEN73L2Ikcd5eZlgtKK24nDFY,5453
@@ -235,6 +237,7 @@ evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0
  evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
  evalvault/domain/entities/method.py,sha256=a3jZi7SjcpK3HeVyVwQkUMwpnmg2RbxCnH4NqYPLCOI,1157
  evalvault/domain/entities/multiturn.py,sha256=V9ay30rix6zxNcDRXeLudMgikC1b4f3kt01Hj2ZH7wE,2012
+ evalvault/domain/entities/ops_report.py,sha256=89x1UKdFb6oxpsQb4pfED69dcCqnyrzBuOo42dcbgho,1343
  evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KTNjVwfY,2920
  evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
  evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
@@ -292,6 +295,7 @@ evalvault/domain/services/memory_aware_evaluator.py,sha256=vTiYoxiMfZ_CMjSBjqwkB
  evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SMEhPyu7fyBVz-giO2hlNifE,4499
  evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
  evalvault/domain/services/multiturn_evaluator.py,sha256=fipi5hEyidq_cnGGr0GpvoprLtjm6dHLuAkSotbT3YA,7202
+ evalvault/domain/services/ops_report_service.py,sha256=FNMN0GpaA3rnnbYs-5xY1ReXzGWaQG0t3ZN0eoeQiRc,7018
  evalvault/domain/services/ops_snapshot_service.py,sha256=1CqJN2p3tM6SgzLCZKcVEM213fd1cDGexTRPG_3e59w,5138
  evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
  evalvault/domain/services/pipeline_template_registry.py,sha256=Gg0k-Sj3MNDBjckhI2gQRCNUsqIz0nnjPuswW5kUdBw,29567
@@ -355,8 +359,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
  evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
  evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
  evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
- evalvault-1.73.2.dist-info/METADATA,sha256=j2HvoSJag0ISVSoJnZN5iQ_QPQy_djdEDgfy27kHepI,26218
- evalvault-1.73.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- evalvault-1.73.2.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
- evalvault-1.73.2.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
- evalvault-1.73.2.dist-info/RECORD,,
+ evalvault-1.74.0.dist-info/METADATA,sha256=ASZ_QeCeh2pyJ3eBUfu0zDKbIwDbvMp9JzL6KVvjWqk,26218
+ evalvault-1.74.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ evalvault-1.74.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+ evalvault-1.74.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+ evalvault-1.74.0.dist-info/RECORD,,