evalvault 1.73.2__py3-none-any.whl → 1.74.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- evalvault/adapters/inbound/api/adapter.py +38 -0
- evalvault/adapters/inbound/api/routers/chat.py +301 -20
- evalvault/adapters/outbound/ops/__init__.py +5 -0
- evalvault/adapters/outbound/ops/report_renderer.py +159 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +90 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +13 -0
- evalvault/adapters/outbound/storage/schema.sql +14 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +77 -0
- evalvault/domain/entities/ops_report.py +40 -0
- evalvault/domain/services/ops_report_service.py +192 -0
- {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/METADATA +1 -1
- {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/RECORD +15 -11
- {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/WHEEL +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py

@@ -21,6 +21,7 @@ from evalvault.adapters.outbound.analysis import (
 )
 from evalvault.adapters.outbound.cache import MemoryCacheAdapter
 from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
 from evalvault.adapters.outbound.report import MarkdownReportAdapter
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
@@ -43,6 +44,7 @@ from evalvault.domain.services.analysis_service import AnalysisService
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
 from evalvault.domain.services.debug_report_service import DebugReportService
 from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+from evalvault.domain.services.ops_report_service import OpsReportService
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
@@ -1329,6 +1331,42 @@ class WebUIAdapter:
             stage_storage=stage_storage,
         )

+    def generate_ops_report(
+        self,
+        run_id: str,
+        *,
+        output_format: str,
+        save: bool,
+    ) -> dict[str, Any] | str:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        if not hasattr(self._storage, "list_stage_events"):
+            raise RuntimeError("Stage storage not configured")
+
+        service = OpsReportService()
+        stage_storage = cast(StageStoragePort, self._storage)
+        report = service.build_report(
+            run_id,
+            storage=self._storage,
+            stage_storage=stage_storage,
+        )
+
+        content = render_markdown(report) if output_format == "markdown" else render_json(report)
+
+        if save:
+            self._storage.save_ops_report(
+                report_id=None,
+                run_id=run_id,
+                report_type="ops_report",
+                format=output_format,
+                content=content,
+                metadata={"source": "api"},
+            )
+
+        if output_format == "markdown":
+            return content
+        return report.to_dict()
+
     def delete_run(self, run_id: str) -> bool:
         """평가 삭제.

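For orientation, here is a hedged sketch of how the new method might be exercised; the adapter wiring is elided and the run id is hypothetical, so treat this as illustrative rather than documented API.

```python
# Hypothetical usage of the new WebUIAdapter.generate_ops_report; `adapter`
# is assumed to be a WebUIAdapter whose storage backend also implements
# StageStoragePort (construction elided).
def fetch_ops_report(adapter, run_id: str):
    # Markdown path: returns the rendered report text (and persists it).
    markdown_text = adapter.generate_ops_report(
        run_id, output_format="markdown", save=True
    )
    # JSON path: content is rendered as JSON for storage, but the caller
    # receives report.to_dict() rather than the serialized string.
    payload = adapter.generate_ops_report(
        run_id, output_format="json", save=False
    )
    return markdown_text, payload
```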
evalvault/adapters/inbound/api/routers/chat.py

@@ -44,10 +44,62 @@ class ChatRequest(BaseModel):
     history: list[ChatMessage] | None = None


+class AiChatMessage(BaseModel):
+    role: str
+    content: str | None = None
+    parts: list[dict[str, Any]] | None = None
+
+
+class AiChatRequest(BaseModel):
+    messages: list[AiChatMessage] = Field(default_factory=list)
+    run_id: str | None = None
+    category: str | None = None
+
+
 def _extract_run_ids(text: str) -> list[str]:
     return re.findall(r"run_[A-Za-z0-9_-]+", text)


+def _ollama_chat_options(model_name: str) -> dict[str, Any] | None:
+    lower = model_name.lower()
+    if lower.startswith("qwen3"):
+        return {
+            "temperature": 0.6,
+            "top_p": 0.95,
+            "top_k": 20,
+            "repeat_penalty": 1,
+            "stop": ["<|im_start|>", "<|im_end|>"],
+        }
+    return None
+
+
+def _is_verb_only(text: str) -> bool:
+    if not text:
+        return False
+    compact = re.sub(r"\s+", "", text.strip())
+    if not compact:
+        return False
+    tokens = re.findall(r"[A-Za-z0-9가-힣]+", compact)
+    if len(tokens) > 2:
+        return False
+    verb_markers = ["해줘", "해주세요", "해봐", "해봐요", "해줘요", "해줘라"]
+    verb_stems = ["설명", "요약", "분석", "비교", "개선", "정리", "추천", "진단", "해석", "검증"]
+    if any(compact.endswith(marker) for marker in verb_markers):
+        return any(stem in compact for stem in verb_stems)
+    return compact in verb_stems
+
+
+def _with_context(user_text: str, run_id: str | None, category: str | None) -> str:
+    parts = []
+    if run_id:
+        parts.append(f"선택된 run_id: {run_id}")
+    if category:
+        parts.append(f"질문 분류: {category}")
+    if not parts:
+        return user_text
+    return "\n".join(parts) + f"\n사용자 요청: {user_text}"
+
+
 def _format_tool_result(result: Any) -> str:
     if isinstance(result, dict):
         if "result" in result:
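To make the two new helpers concrete, here is a standalone, runnable copy of their logic (outside the package) showing how a bare Korean verb request such as "설명해줘" ("explain it") is detected and then re-grounded in the selected run and category.

```python
import re

# Verbatim logic of _is_verb_only / _with_context from the diff above,
# renamed so this sketch runs without importing the package.
def is_verb_only(text: str) -> bool:
    if not text:
        return False
    compact = re.sub(r"\s+", "", text.strip())
    if not compact:
        return False
    if len(re.findall(r"[A-Za-z0-9가-힣]+", compact)) > 2:
        return False
    verb_markers = ["해줘", "해주세요", "해봐", "해봐요", "해줘요", "해줘라"]
    verb_stems = ["설명", "요약", "분석", "비교", "개선", "정리", "추천", "진단", "해석", "검증"]
    if any(compact.endswith(marker) for marker in verb_markers):
        return any(stem in compact for stem in verb_stems)
    return compact in verb_stems

def with_context(user_text: str, run_id: str | None, category: str | None) -> str:
    parts = []
    if run_id:
        parts.append(f"선택된 run_id: {run_id}")
    if category:
        parts.append(f"질문 분류: {category}")
    if not parts:
        return user_text
    return "\n".join(parts) + f"\n사용자 요청: {user_text}"

print(is_verb_only("설명해줘"))    # True: verb marker plus an analysis stem
print(is_verb_only("안녕하세요"))  # False: greeting, no marker or stem
print(with_context("설명해줘", "run_abc", "result_interpretation"))
```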
@@ -283,15 +335,29 @@ async def warm_rag_index() -> None:
         logger.warning("RAG preload failed: %s", exc)


-async def _direct_chat_answer(
+async def _direct_chat_answer(
+    user_text: str, run_id: str | None = None, category: str | None = None
+) -> str | None:
+    user_text = _with_context(user_text, run_id, category)
+    model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+    options = _ollama_chat_options(model_name)
     payload = {
-        "model":
+        "model": model_name,
         "messages": [
-            {
+            {
+                "role": "system",
+                "content": (
+                    "You are a helpful assistant for EvalVault. "
+                    "Interpret verb-only requests as questions about the selected run/category. "
+                    "If essential details are missing, ask a concise follow-up question in Korean."
+                ),
+            },
             {"role": "user", "content": user_text},
         ],
         "stream": False,
     }
+    if options:
+        payload["options"] = options

     async with httpx.AsyncClient(timeout=30) as client:
         response = await client.post(
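For reference, this is the shape of the request body the patched `_direct_chat_answer` now sends when `OLLAMA_CHAT_MODEL` resolves to a qwen3 model; the `options` block comes from `_ollama_chat_options` and is attached only when non-None. The exact URL the client posts to is truncated in this diff, and the message contents below are placeholders.

```python
# Illustrative Ollama chat request body assembled by the patched code.
payload = {
    "model": "qwen3:14b",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant for EvalVault. ..."},
        {"role": "user", "content": "선택된 run_id: run_abc\n사용자 요청: 설명해줘"},
    ],
    "stream": False,
    # Injected only for qwen3* models, per _ollama_chat_options:
    "options": {
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "repeat_penalty": 1,
        "stop": ["<|im_start|>", "<|im_end|>"],
    },
}
```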
@@ -318,7 +384,9 @@ def _simple_retrieve(texts: list[str], query: str, top_k: int) -> list[str]:
     return [text for _, text in scored[:top_k]]


-async def _rag_answer(user_text: str) -> str | None:
+async def _rag_answer(
+    user_text: str, run_id: str | None = None, category: str | None = None
+) -> str | None:
     retriever, _ = await _get_rag_retriever()
     contexts: list[str] = []

@@ -340,22 +408,28 @@ async def _rag_answer(user_text: str) -> str | None:

     prompt = (
         "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
-        "컨텍스트만 근거로 사용해 한국어로 답하세요.\n
+        "컨텍스트만 근거로 사용해 한국어로 답하세요.\n"
+        "질문이 동사만 있는 경우에도 선택된 run_id/분류를 기준으로 해석하세요.\n"
+        "정보가 부족하면 먼저 필요한 정보를 질문하세요.\n\n"
         "[컨텍스트]\n"
         + "\n\n---\n\n".join(contexts[:3])
         + "\n\n[질문]\n"
-        + user_text
+        + _with_context(user_text, run_id, category)
         + "\n\n[답변]"
     )

+    model_name = os.getenv("OLLAMA_CHAT_MODEL", "qwen3:14b")
+    options = _ollama_chat_options(model_name)
     payload = {
-        "model":
+        "model": model_name,
         "messages": [
             {"role": "system", "content": "You are a helpful assistant for EvalVault."},
             {"role": "user", "content": prompt},
         ],
         "stream": False,
     }
+    if options:
+        payload["options"] = options

     async with httpx.AsyncClient(timeout=60) as client:
         response = await client.post(
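To see the assembled RAG prompt concretely, here is a runnable reconstruction of the string built above, with a placeholder context and the user text passed through plainly (the `_with_context` prefixing step is inlined into the question for brevity).

```python
# Verbatim reconstruction of the prompt assembly from the diff above,
# using placeholder retrieval results.
contexts = ["EvalVault stores runs in SQLite or PostgreSQL."]
user_text = "선택된 run_id: run_abc\n사용자 요청: 저장소 설명해줘"

prompt = (
    "다음은 EvalVault 코드/문서에서 검색된 컨텍스트입니다.\n"
    "컨텍스트만 근거로 사용해 한국어로 답하세요.\n"
    "질문이 동사만 있는 경우에도 선택된 run_id/분류를 기준으로 해석하세요.\n"
    "정보가 부족하면 먼저 필요한 정보를 질문하세요.\n\n"
    "[컨텍스트]\n"
    + "\n\n---\n\n".join(contexts[:3])
    + "\n\n[질문]\n"
    + user_text
    + "\n\n[답변]"
)
print(prompt)
```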
@@ -388,7 +462,9 @@ async def _call_mcp_tool(tool_name: str, tool_args: dict[str, Any]) -> Any:
     return data


-async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
+async def _resolve_tool_with_llm(
+    user_text: str, run_id: str | None = None, category: str | None = None
+) -> dict[str, Any] | None:
     ollama_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
     router_model = os.getenv("OLLAMA_ROUTER_MODEL", "gemma3:1b")

@@ -398,6 +474,8 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
         "Action must be one of: tool, rag, direct."
         "Tools: list_runs, get_run_summary, run_evaluation, analyze_compare, get_artifacts."
         "Rules:"
+        "- Assume verb-only requests refer to the selected run_id/category when provided."
+        "- If essential info is missing (e.g., run_id), return action direct with a follow-up question."
         "- If user asks about datasets, prefer tool list_datasets."
        "- If question is about EvalVault docs/usage, prefer rag."
        "- If greeting or general chat, use direct."
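A hedged sketch of the decision object the downstream `_chat_stream` code expects back from the router model: the keys `action` and `arguments` appear in the surrounding hunks, but the key carrying the tool name is not visible in this diff, so `"tool"` below is an assumption.

```python
# Hypothetical router decision consumed by _chat_stream.
decision = {
    "action": "tool",                    # one of: tool, rag, direct
    "tool": "get_run_summary",           # assumed key name for the tool
    "arguments": {"run_id": "run_abc"},  # later merged with selected run_id/category
}
```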
@@ -413,7 +491,7 @@ async def _resolve_tool_with_llm(user_text: str) -> dict[str, Any] | None:
         "model": router_model,
         "messages": [
             {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_text},
+            {"role": "user", "content": _with_context(user_text, run_id, category)},
         ],
         "stream": False,
     }
@@ -479,6 +557,99 @@ def _chunk_text(text: str, size: int = 42) -> list[str]:
     return [text[i : i + size] for i in range(0, len(text), size)]


+def _extract_text_from_parts(parts: list[dict[str, Any]] | None) -> str | None:
+    if not parts:
+        return None
+    chunks: list[str] = []
+    for part in parts:
+        if not isinstance(part, dict):
+            continue
+        if part.get("type") == "text":
+            text = part.get("text")
+            if isinstance(text, str) and text:
+                chunks.append(text)
+    if not chunks:
+        return None
+    content = "".join(chunks).strip()
+    return content or None
+
+
+def _extract_last_user_message(messages: list[AiChatMessage]) -> str | None:
+    for message in reversed(messages):
+        if message.role != "user":
+            continue
+        if message.content and message.content.strip():
+            return message.content.strip()
+        content = _extract_text_from_parts(message.parts)
+        if content:
+            return content
+    return None
+
+
+def _ai_sse_event(payload: dict[str, Any]) -> str:
+    return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
+
+
+def _ai_sse_done() -> str:
+    return "data: [DONE]\n\n"
+
+
+def _ai_sse_headers() -> dict[str, str]:
+    return {
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "x-vercel-ai-ui-message-stream": "v1",
+    }
+
+
+async def _ai_chat_stream(
+    user_text: str, run_id: str | None = None, category: str | None = None
+) -> AsyncGenerator[str, None]:
+    message_id = f"msg_{int(time.time() * 1000)}"
+    text_id = f"text_{message_id}"
+    yield _ai_sse_event({"type": "start", "messageId": message_id})
+    yield _ai_sse_event({"type": "text-start", "id": text_id})
+
+    async for item in _chat_stream(user_text, run_id=run_id, category=category):
+        raw = item.strip()
+        if not raw:
+            continue
+        try:
+            payload = json.loads(raw)
+        except Exception:
+            continue
+
+        event_type = payload.get("type")
+        if event_type == "delta":
+            content = payload.get("content")
+            if isinstance(content, str) and content:
+                yield _ai_sse_event({"type": "text-delta", "id": text_id, "delta": content})
+            continue
+        if event_type == "status":
+            message = payload.get("message")
+            if isinstance(message, str) and message:
+                yield _ai_sse_event(
+                    {"type": "data-status", "data": {"message": message}, "transient": True}
+                )
+            continue
+        if event_type == "error":
+            message = payload.get("message")
+            if not isinstance(message, str) or not message:
+                message = "채팅 요청에 실패했습니다."
+            yield _ai_sse_event({"type": "error", "errorText": message})
+            yield _ai_sse_event({"type": "finish"})
+            yield _ai_sse_done()
+            return
+        if event_type == "final":
+            yield _ai_sse_event({"type": "text-end", "id": text_id})
+            yield _ai_sse_event({"type": "finish"})
+            yield _ai_sse_done()
+            return
+
+    yield _ai_sse_event({"type": "finish"})
+    yield _ai_sse_done()
+
+
 def _event(payload: dict[str, Any]) -> str:
     return json.dumps(payload, ensure_ascii=False) + "\n"

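An illustration (not itself in the diff) of the wire format `_ai_chat_stream` emits: each frame is an SSE `data:` line carrying a JSON body with a blank-line terminator, and the stream closes with the Vercel AI-style `[DONE]` sentinel. The `messageId` value is hypothetical.

```python
# Example SSE frames as produced by the _ai_sse_event/_ai_sse_done helpers.
frames = [
    'data: {"type": "start", "messageId": "msg_1700000000000"}\n\n',
    'data: {"type": "text-start", "id": "text_msg_1700000000000"}\n\n',
    'data: {"type": "text-delta", "id": "text_msg_1700000000000", "delta": "안녕"}\n\n',
    'data: {"type": "text-end", "id": "text_msg_1700000000000"}\n\n',
    'data: {"type": "finish"}\n\n',
    "data: [DONE]\n\n",
]
```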
@@ -490,15 +661,34 @@ async def _emit_answer(answer: str) -> AsyncGenerator[str, None]:
     yield _event({"type": "final", "content": answer})


-async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
+async def _chat_stream(
+    user_text: str, run_id: str | None = None, category: str | None = None
+) -> AsyncGenerator[str, None]:
     started_at = time.perf_counter()
-    if
-        yield _event(
+    if category in {"result_interpretation", "improvement_direction"} and not run_id:
+        yield _event(
+            {
+                "type": "final",
+                "content": "선택한 분류는 run_id가 필요합니다. run_id를 선택한 뒤 다시 질문해주세요.",
+            }
+        )
         return

-    if len(user_text) <=
+    if len(user_text) <= 4:
+        if run_id or category:
+            user_text = f"{user_text}"
+        else:
+            yield _event(
+                {
+                    "type": "final",
+                    "content": "무엇을 설명할까요? run_id와 질문 분류를 선택한 뒤 다시 요청해주세요.",
+                }
+            )
+            return
+
+    if len(user_text) <= 6 and not _is_verb_only(user_text):
         yield _event({"type": "status", "message": "짧은 질문 처리 중..."})
-        answer = await _direct_chat_answer(user_text)
+        answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
         if answer:
             async for item in _emit_answer(answer):
                 yield item
@@ -506,9 +696,45 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
         yield _event({"type": "final", "content": "답변을 생성하지 못했습니다."})
         return

+    if (
+        _is_verb_only(user_text)
+        and category in {"result_interpretation", "improvement_direction"}
+        and run_id
+    ):
+        yield _event({"type": "status", "message": "선택한 run 요약 중..."})
+        try:
+            result = await asyncio.wait_for(
+                _call_mcp_tool("get_run_summary", {"run_id": run_id}), timeout=12
+            )
+        except TimeoutError:
+            yield _event(
+                {
+                    "type": "error",
+                    "message": "run 요약 응답이 지연됩니다. 잠시 후 다시 시도해주세요.",
+                }
+            )
+            return
+        except Exception as exc:
+            yield _event({"type": "error", "message": f"run 요약 실패: {exc}"})
+            return
+
+        payload = _extract_json_content(result)
+        if isinstance(payload, dict):
+            summary = _summarize_result("get_run_summary", payload)
+            if category == "improvement_direction":
+                summary += "\n\n개선 방향을 구체화하려면 목표 메트릭이나 기준을 알려주세요."
+            else:
+                summary += "\n\n특정 메트릭/케이스가 있으면 알려주세요."
+            async for item in _emit_answer(summary):
+                yield item
+            return
+
     yield _event({"type": "status", "message": "요청 분류 중..."})
     try:
-        router = await asyncio.wait_for(
+        router = await asyncio.wait_for(
+            _resolve_tool_with_llm(user_text, run_id=run_id, category=category),
+            timeout=30,
+        )
     except TimeoutError:
         router = None
     except Exception:
@@ -520,7 +746,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
     if router is None:
         yield _event({"type": "status", "message": "문서 검색 중..."})
         try:
-            rag_answer = await asyncio.wait_for(
+            rag_answer = await asyncio.wait_for(
+                _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+            )
         except TimeoutError:
             yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
             return
@@ -528,7 +756,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
         async for item in _emit_answer(rag_answer):
             yield item
         return
-    answer = await _direct_chat_answer(user_text)
+    answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
     if answer:
         async for item in _emit_answer(answer):
             yield item
@@ -541,7 +769,7 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
     tool_args = router.get("arguments", {})

     if action == "direct":
-        answer = await _direct_chat_answer(user_text)
+        answer = await _direct_chat_answer(user_text, run_id=run_id, category=category)
         if answer:
             async for item in _emit_answer(answer):
                 yield item
@@ -552,7 +780,9 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
     if action == "rag":
         yield _event({"type": "status", "message": "문서 검색 중..."})
         try:
-            rag_answer = await asyncio.wait_for(
+            rag_answer = await asyncio.wait_for(
+                _rag_answer(user_text, run_id=run_id, category=category), timeout=90
+            )
         except TimeoutError:
             yield _event({"type": "error", "message": "문서 검색이 지연됩니다. 다시 시도해주세요."})
             return
@@ -571,9 +801,31 @@ async def _chat_stream(user_text: str) -> AsyncGenerator[str, None]:
         yield _event({"type": "final", "content": "도구 이름을 찾지 못했습니다."})
         return

+    if tool_name == "get_run_summary" and not (tool_args.get("run_id") or run_id):
+        yield _event({"type": "final", "content": "run_id를 선택하거나 입력해주세요."})
+        return
+    if tool_name == "get_artifacts" and not (tool_args.get("run_id") or run_id):
+        yield _event({"type": "final", "content": "아티팩트 조회를 위해 run_id가 필요합니다."})
+        return
+    if tool_name == "analyze_compare" and (
+        not tool_args.get("run_id_a") or not tool_args.get("run_id_b")
+    ):
+        yield _event(
+            {
+                "type": "final",
+                "content": "비교 분석에는 run_id 두 개가 필요합니다. 비교할 run을 알려주세요.",
+            }
+        )
+        return
+
     yield _event({"type": "status", "message": "도구 실행 중..."})
     try:
-
+        enhanced_tool_args = dict(tool_args)
+        if run_id:
+            enhanced_tool_args["run_id"] = run_id
+        if category:
+            enhanced_tool_args["category"] = category
+        result = await asyncio.wait_for(_call_mcp_tool(tool_name, enhanced_tool_args), timeout=12)
     except TimeoutError:
         yield _event(
             {"type": "error", "message": "응답 지연(12s 초과). MCP 서버 상태를 확인해주세요."}
@@ -615,3 +867,32 @@ async def chat_stream(request: ChatRequest):
             yield item

     return StreamingResponse(event_generator(), media_type="application/x-ndjson")
+
+
+@router.post("/ai-stream")
+async def ai_chat_stream(request: AiChatRequest):
+    user_text = _extract_last_user_message(request.messages)
+    run_id = request.run_id
+    category = request.category
+    if not user_text:
+
+        async def error_generator():
+            yield _ai_sse_event({"type": "error", "errorText": "질문을 입력해주세요."})
+            yield _ai_sse_event({"type": "finish"})
+            yield _ai_sse_done()
+
+        return StreamingResponse(
+            error_generator(),
+            media_type="text/event-stream",
+            headers=_ai_sse_headers(),
+        )
+
+    async def event_generator():
+        async for item in _ai_chat_stream(user_text, run_id=run_id, category=category):
+            yield item
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers=_ai_sse_headers(),
+    )
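A hypothetical client sketch for the new `POST /ai-stream` route follows. The host and route prefix are assumptions (the router's mount point is not shown in this diff); the request body mirrors `AiChatRequest`.

```python
import httpx

# Body fields mirror AiChatRequest: messages plus optional run_id/category.
body = {
    "messages": [{"role": "user", "content": "요약해줘"}],
    "run_id": "run_abc",
    "category": "result_interpretation",
}

# URL is an assumption; adjust to wherever the chat router is mounted.
with httpx.stream(
    "POST", "http://localhost:8000/chat/ai-stream", json=body, timeout=None
) as resp:
    for line in resp.iter_lines():
        if line.startswith("data: "):
            print(line.removeprefix("data: "))
```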
evalvault/adapters/outbound/ops/report_renderer.py (new file)

@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import json
+
+from evalvault.domain.entities.ops_report import OpsReport
+from evalvault.domain.entities.stage import StageMetric, StageSummary
+
+
+def render_markdown(report: OpsReport) -> str:
+    lines: list[str] = []
+    lines.append("# Ops Report")
+    lines.append("")
+    lines.extend(_render_run_summary(report.run_summary, report.metadata))
+    lines.append("")
+    lines.extend(_render_ops_kpis(report.ops_kpis))
+    lines.append("")
+    lines.extend(_render_stage_summary(report.stage_summary))
+    lines.append("")
+    lines.extend(_render_bottlenecks(report.bottlenecks))
+    lines.append("")
+    lines.extend(_render_recommendations(report.recommendations))
+    lines.append("")
+    lines.extend(_render_failing_metrics(report.stage_metrics))
+    return "\n".join(lines).strip()
+
+
+def render_json(report: OpsReport) -> str:
+    payload = report.to_dict()
+    return json.dumps(payload, ensure_ascii=True, indent=2)
+
+
+def _render_run_summary(summary: dict[str, object], metadata: dict[str, object]) -> list[str]:
+    run_id = summary.get("run_id", "-")
+    dataset = summary.get("dataset_name", "-")
+    version = summary.get("dataset_version", "-")
+    model = summary.get("model_name", "-")
+    started = summary.get("started_at", "-")
+    finished = summary.get("finished_at", "-")
+    duration = summary.get("duration_seconds", "-")
+    total_cases = summary.get("total_test_cases", "-")
+    pass_rate = summary.get("pass_rate", "-")
+    total_tokens = summary.get("total_tokens", "-")
+    total_cost = summary.get("total_cost_usd", "-")
+
+    lines = [
+        "## Run Summary",
+        f"- run_id: {run_id}",
+        f"- dataset: {dataset} ({version})",
+        f"- model: {model}",
+        f"- started_at: {started}",
+        f"- finished_at: {finished}",
+        f"- duration_seconds: {duration}",
+        f"- total_test_cases: {total_cases}",
+        f"- pass_rate: {pass_rate}",
+        f"- total_tokens: {total_tokens}",
+        f"- total_cost_usd: {total_cost}",
+    ]
+    trace_links: list[str] = []
+    if metadata.get("langfuse_trace_url"):
+        trace_links.append(f"langfuse_trace_url={metadata['langfuse_trace_url']}")
+    if metadata.get("phoenix_trace_url"):
+        trace_links.append(f"phoenix_trace_url={metadata['phoenix_trace_url']}")
+    if trace_links:
+        lines.append(f"- trace_links: {', '.join(trace_links)}")
+    return lines
+
+
+def _render_stage_summary(summary: StageSummary | None) -> list[str]:
+    lines = ["## Stage Summary"]
+    if summary is None:
+        lines.append("- no stage events found")
+        return lines
+    lines.append(f"- total_events: {summary.total_events}")
+    if summary.missing_required_stage_types:
+        missing = ", ".join(summary.missing_required_stage_types)
+        lines.append(f"- missing_required_stage_types: {missing}")
+    if summary.stage_type_counts:
+        lines.append("- stage_type_counts:")
+        for stage_type, count in summary.stage_type_counts.items():
+            lines.append(f"  - {stage_type}: {count}")
+    if summary.stage_type_avg_durations:
+        lines.append("- stage_type_avg_durations_ms:")
+        for stage_type, duration in summary.stage_type_avg_durations.items():
+            lines.append(f"  - {stage_type}: {duration:.3f}")
+    return lines
+
+
+def _render_ops_kpis(kpis: dict[str, object]) -> list[str]:
+    lines = ["## Ops KPIs"]
+    lines.append(f"- total_test_cases: {kpis.get('total_test_cases', '-')}")
+    lines.append(f"- pass_rate: {kpis.get('pass_rate', '-')}")
+    lines.append(f"- failure_rate: {kpis.get('failure_rate', '-')}")
+    lines.append(f"- stage_error_rate: {kpis.get('stage_error_rate', '-')}")
+    lines.append(f"- stage_error_severity: {kpis.get('stage_error_severity', '-')}")
+    lines.append(f"- duration_seconds: {kpis.get('duration_seconds', '-')}")
+    lines.append(f"- total_tokens: {kpis.get('total_tokens', '-')}")
+    lines.append(f"- total_cost_usd: {kpis.get('total_cost_usd', '-')}")
+    lines.append(f"- avg_latency_ms: {kpis.get('avg_latency_ms', '-')}")
+    lines.append(f"- p95_latency_ms: {kpis.get('p95_latency_ms', '-')}")
+    lines.append(f"- avg_tokens_per_case: {kpis.get('avg_tokens_per_case', '-')}")
+    lines.append(f"- avg_cost_per_case_usd: {kpis.get('avg_cost_per_case_usd', '-')}")
+    return lines
+
+
+def _render_bottlenecks(bottlenecks: list[dict[str, object]]) -> list[str]:
+    lines = ["## Ops Signals"]
+    if not bottlenecks:
+        lines.append("- none")
+        return lines
+    for entry in bottlenecks:
+        entry_type = entry.get("type", "unknown")
+        if entry_type == "latency":
+            stage_type = entry.get("stage_type", "-")
+            duration = entry.get("avg_duration_ms", "-")
+            lines.append(f"- latency: {stage_type} avg_duration_ms={duration}")
+        elif entry_type == "missing_stage":
+            stage_type = entry.get("stage_type", "-")
+            lines.append(f"- missing_stage: {stage_type}")
+        else:
+            lines.append(f"- {entry_type}: {entry}")
+    return lines
+
+
+def _render_recommendations(recommendations: list[str]) -> list[str]:
+    lines = ["## Recommendations"]
+    if not recommendations:
+        lines.append("- none")
+        return lines
+    for item in recommendations:
+        lines.append(f"- {item}")
+    return lines
+
+
+def _render_failing_metrics(metrics: list[StageMetric]) -> list[str]:
+    lines = ["## Failing Stage Metrics"]
+    failing = [metric for metric in metrics if metric.passed is False]
+    if not failing:
+        lines.append("- none")
+        return lines
+
+    failing_sorted = sorted(failing, key=_metric_severity, reverse=True)[:20]
+    for metric in failing_sorted:
+        threshold = metric.threshold if metric.threshold is not None else "-"
+        lines.append(
+            f"- {metric.metric_name}: score={metric.score} threshold={threshold} "
+            f"stage_id={metric.stage_id}"
+        )
+    return lines
+
+
+def _metric_severity(metric: StageMetric) -> float:
+    if metric.threshold is None:
+        return 0.0
+    comparison = None
+    if isinstance(metric.evidence, dict):
+        comparison = metric.evidence.get("comparison")
+    if isinstance(comparison, str) and comparison.lower() in {"max", "<=", "le"}:
+        return max(metric.score - metric.threshold, 0.0)
+    return max(metric.threshold - metric.score, 0.0)
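A minimal render sketch using the entities from this release: only fields the renderer actually reads are populated, stage data is left empty, and the recommendation string is a hypothetical example of the guide-service output format.

```python
from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
from evalvault.domain.entities.ops_report import OpsReport

report = OpsReport(
    run_summary={"run_id": "run_abc", "model_name": "gpt-4o-mini"},
    ops_kpis={"total_test_cases": 50, "pass_rate": 0.92},
    stage_summary=None,        # rendered as "- no stage events found"
    stage_metrics=[],
    bottlenecks=[],
    recommendations=["[high] retriever: raise top_k"],  # hypothetical guide output
)

print(render_markdown(report))  # "# Ops Report" followed by the sections above
print(render_json(report))      # same payload as indented, ASCII-escaped JSON
```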
evalvault/adapters/outbound/storage/postgres_adapter.py

@@ -1128,6 +1128,96 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             )
         return reports

+    def save_ops_report(
+        self,
+        *,
+        report_id: str | None,
+        run_id: str | None,
+        report_type: str,
+        format: str,
+        content: str | None,
+        metadata: dict[str, Any] | None = None,
+        created_at: str | None = None,
+    ) -> str:
+        report_id = report_id or str(uuid.uuid4())
+        if created_at is None:
+            created_at_value = datetime.now(UTC)
+        else:
+            created_at_value = (
+                datetime.fromisoformat(created_at) if isinstance(created_at, str) else created_at
+            )
+
+        with self._get_connection() as conn:
+            conn.execute(
+                """
+                INSERT INTO ops_reports (
+                    report_id, run_id, report_type, format, content, metadata, created_at
+                ) VALUES (%s, %s, %s, %s, %s, %s, %s)
+                ON CONFLICT (report_id) DO UPDATE SET
+                    run_id = EXCLUDED.run_id,
+                    report_type = EXCLUDED.report_type,
+                    format = EXCLUDED.format,
+                    content = EXCLUDED.content,
+                    metadata = EXCLUDED.metadata,
+                    created_at = EXCLUDED.created_at
+                """,
+                (
+                    report_id,
+                    run_id,
+                    report_type,
+                    format,
+                    content,
+                    self._serialize_pipeline_json(metadata),
+                    created_at_value,
+                ),
+            )
+            conn.commit()
+
+        return report_id
+
+    def list_ops_reports(
+        self,
+        *,
+        run_id: str,
+        report_type: str | None = None,
+        format: str | None = None,
+        limit: int = 20,
+    ) -> list[dict[str, Any]]:
+        clauses = ["run_id = %s"]
+        params: list[Any] = [run_id]
+        if report_type:
+            clauses.append("report_type = %s")
+            params.append(report_type)
+        if format:
+            clauses.append("format = %s")
+            params.append(format)
+        params.append(limit)
+
+        query = (
+            "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
+            "FROM ops_reports WHERE " + " AND ".join(clauses) + " ORDER BY created_at DESC LIMIT %s"
+        )
+
+        with self._get_connection() as conn:
+            rows = conn.execute(query, tuple(params)).fetchall()
+
+        reports: list[dict[str, Any]] = []
+        for row in rows:
+            reports.append(
+                {
+                    "report_id": row["report_id"],
+                    "run_id": row["run_id"],
+                    "report_type": row["report_type"],
+                    "format": row["format"],
+                    "content": row["content"],
+                    "metadata": self._deserialize_json(row["metadata"]),
+                    "created_at": row["created_at"].isoformat()
+                    if isinstance(row["created_at"], datetime)
+                    else row["created_at"],
+                }
+            )
+        return reports
+
     def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
         """파이프라인 분석 결과 목록을 조회합니다."""
         query = """
evalvault/adapters/outbound/storage/postgres_schema.sql

@@ -241,6 +241,19 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
 CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
 CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);

+-- Ops reports table
+CREATE TABLE IF NOT EXISTS ops_reports (
+    report_id UUID PRIMARY KEY,
+    run_id UUID REFERENCES evaluation_runs(run_id) ON DELETE SET NULL,
+    report_type VARCHAR(50) NOT NULL,
+    format VARCHAR(20) NOT NULL,
+    content TEXT,
+    metadata JSONB,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
+
 -- Analysis pipeline results table
 CREATE TABLE IF NOT EXISTS pipeline_results (
     result_id UUID PRIMARY KEY,
evalvault/adapters/outbound/storage/schema.sql

@@ -271,6 +271,20 @@ CREATE TABLE IF NOT EXISTS analysis_reports (
 CREATE INDEX IF NOT EXISTS idx_reports_run_id ON analysis_reports(run_id);
 CREATE INDEX IF NOT EXISTS idx_reports_experiment_id ON analysis_reports(experiment_id);

+-- Ops reports table
+CREATE TABLE IF NOT EXISTS ops_reports (
+    report_id TEXT PRIMARY KEY,
+    run_id TEXT,
+    report_type TEXT NOT NULL, -- 'ops_report', 'ops_snapshot'
+    format TEXT NOT NULL, -- 'markdown', 'json'
+    content TEXT, -- Report content (markdown/json) or file path
+    metadata TEXT, -- JSON metadata
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE SET NULL
+);
+
+CREATE INDEX IF NOT EXISTS idx_ops_reports_run_id ON ops_reports(run_id);
+
 -- Analysis pipeline results table
 CREATE TABLE IF NOT EXISTS pipeline_results (
     result_id TEXT PRIMARY KEY,
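The SQLite DDL can be smoke-tested in isolation with the standard library; the sketch below copies the table definition verbatim except for the foreign-key clause, which is omitted because `evaluation_runs` is not created here, and demonstrates the `INSERT OR REPLACE` upsert path the adapter relies on.

```python
import sqlite3

# ops_reports DDL from the diff, minus the FK to evaluation_runs.
ddl = """
CREATE TABLE IF NOT EXISTS ops_reports (
    report_id TEXT PRIMARY KEY,
    run_id TEXT,
    report_type TEXT NOT NULL,
    format TEXT NOT NULL,
    content TEXT,
    metadata TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""

conn = sqlite3.connect(":memory:")
conn.execute(ddl)
conn.execute(
    "INSERT OR REPLACE INTO ops_reports (report_id, run_id, report_type, format, content, metadata) "
    "VALUES (?, ?, ?, ?, ?, ?)",
    ("r1", "run_abc", "ops_report", "markdown", "# Ops Report", "{}"),
)
print(conn.execute("SELECT report_id, format FROM ops_reports").fetchall())
# [('r1', 'markdown')]
```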
evalvault/adapters/outbound/storage/sqlite_adapter.py

@@ -1211,6 +1211,83 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             )
         return reports

+    def save_ops_report(
+        self,
+        *,
+        report_id: str | None,
+        run_id: str | None,
+        report_type: str,
+        format: str,
+        content: str | None,
+        metadata: dict[str, Any] | None = None,
+        created_at: str | None = None,
+    ) -> str:
+        report_id = report_id or str(uuid.uuid4())
+        created_at = created_at or datetime.now().isoformat()
+
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            conn.execute(
+                """
+                INSERT OR REPLACE INTO ops_reports (
+                    report_id, run_id, report_type, format, content, metadata, created_at
+                ) VALUES (?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    report_id,
+                    run_id,
+                    report_type,
+                    format,
+                    content,
+                    self._serialize_json(metadata),
+                    created_at,
+                ),
+            )
+            conn.commit()
+
+        return report_id
+
+    def list_ops_reports(
+        self,
+        *,
+        run_id: str,
+        report_type: str | None = None,
+        format: str | None = None,
+        limit: int = 20,
+    ) -> list[dict[str, Any]]:
+        query = (
+            "SELECT report_id, run_id, report_type, format, content, metadata, created_at "
+            "FROM ops_reports WHERE run_id = ?"
+        )
+        params: list[Any] = [run_id]
+        if report_type:
+            query += " AND report_type = ?"
+            params.append(report_type)
+        if format:
+            query += " AND format = ?"
+            params.append(format)
+        query += " ORDER BY created_at DESC LIMIT ?"
+        params.append(limit)
+
+        with self._get_connection() as conn:
+            conn = cast(Any, conn)
+            rows = conn.execute(query, tuple(params)).fetchall()
+
+        reports: list[dict[str, Any]] = []
+        for row in rows:
+            reports.append(
+                {
+                    "report_id": row["report_id"],
+                    "run_id": row["run_id"],
+                    "report_type": row["report_type"],
+                    "format": row["format"],
+                    "content": row["content"],
+                    "metadata": self._deserialize_json(row["metadata"]),
+                    "created_at": row["created_at"],
+                }
+            )
+        return reports
+
     def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
         """파이프라인 분석 결과 목록을 조회합니다."""
         query = """
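A hedged usage sketch for the new SQLite methods, assuming `storage` is an initialized `SQLiteStorageAdapter` with the `ops_reports` table migrated (constructor and wiring elided).

```python
# save_ops_report generates a UUID when report_id is None and upserts via
# INSERT OR REPLACE; list_ops_reports filters by run_id plus optional
# report_type/format, newest first.
def snapshot_ops_report(storage, run_id: str, markdown: str) -> list[dict]:
    report_id = storage.save_ops_report(
        report_id=None,  # a UUID is generated when omitted
        run_id=run_id,
        report_type="ops_report",
        format="markdown",
        content=markdown,
        metadata={"source": "example"},
    )
    assert report_id  # returned so callers can reference the stored row
    return storage.list_ops_reports(run_id=run_id, format="markdown", limit=5)
```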
evalvault/domain/entities/ops_report.py (new file)

@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from evalvault.domain.entities.stage import StageMetric, StageSummary
+
+
+@dataclass
+class OpsReport:
+    run_summary: dict[str, Any]
+    ops_kpis: dict[str, Any]
+    stage_summary: StageSummary | None
+    stage_metrics: list[StageMetric]
+    bottlenecks: list[dict[str, Any]]
+    recommendations: list[str]
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "run_summary": self.run_summary,
+            "ops_kpis": self.ops_kpis,
+            "stage_summary": _stage_summary_to_dict(self.stage_summary),
+            "stage_metrics": [metric.to_dict() for metric in self.stage_metrics],
+            "bottlenecks": self.bottlenecks,
+            "recommendations": self.recommendations,
+            "metadata": self.metadata,
+        }
+
+
+def _stage_summary_to_dict(summary: StageSummary | None) -> dict[str, Any] | None:
+    if summary is None:
+        return None
+    return {
+        "run_id": summary.run_id,
+        "total_events": summary.total_events,
+        "stage_type_counts": summary.stage_type_counts,
+        "stage_type_avg_durations": summary.stage_type_avg_durations,
+        "missing_required_stage_types": summary.missing_required_stage_types,
+    }
evalvault/domain/services/ops_report_service.py (new file)

@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+from typing import Any
+
+from evalvault.config.langfuse_support import get_langfuse_trace_url
+from evalvault.config.phoenix_support import get_phoenix_trace_url
+from evalvault.domain.entities.ops_report import OpsReport
+from evalvault.domain.entities.stage import StageEvent, StageMetric, StageSummary
+from evalvault.domain.services.stage_metric_guide_service import StageMetricGuideService
+from evalvault.domain.services.stage_metric_service import StageMetricService
+from evalvault.domain.services.stage_summary_service import StageSummaryService
+from evalvault.ports.outbound.stage_storage_port import StageStoragePort
+from evalvault.ports.outbound.storage_port import StoragePort
+
+
+class OpsReportService:
+    """Build an operational report for an evaluation run."""
+
+    def __init__(
+        self,
+        *,
+        metric_service: StageMetricService | None = None,
+        summary_service: StageSummaryService | None = None,
+        guide_service: StageMetricGuideService | None = None,
+    ) -> None:
+        self._metric_service = metric_service or StageMetricService()
+        self._summary_service = summary_service or StageSummaryService()
+        self._guide_service = guide_service or StageMetricGuideService()
+
+    def build_report(
+        self,
+        run_id: str,
+        *,
+        storage: StoragePort,
+        stage_storage: StageStoragePort,
+    ) -> OpsReport:
+        run = storage.get_run(run_id)
+        run_summary = run.to_summary_dict()
+        phoenix_trace_url = get_phoenix_trace_url(run.tracker_metadata)
+        langfuse_trace_url = get_langfuse_trace_url(run.tracker_metadata)
+
+        events = stage_storage.list_stage_events(run_id)
+        stage_summary = self._summarize_events(events)
+
+        stage_metrics = stage_storage.list_stage_metrics(run_id)
+        if not stage_metrics and events:
+            stage_metrics = self._metric_service.build_metrics(events)
+
+        bottlenecks = self._build_bottlenecks(stage_summary)
+        recommendations = self._build_recommendations(stage_metrics)
+
+        ops_kpis = self._build_ops_kpis(run, events)
+
+        metadata = {
+            "phoenix_trace_url": phoenix_trace_url,
+            "langfuse_trace_url": langfuse_trace_url,
+        }
+
+        return OpsReport(
+            run_summary=run_summary,
+            ops_kpis=ops_kpis,
+            stage_summary=stage_summary,
+            stage_metrics=stage_metrics,
+            bottlenecks=bottlenecks,
+            recommendations=recommendations,
+            metadata=metadata,
+        )
+
+    def _summarize_events(self, events: list[StageEvent]) -> StageSummary | None:
+        if not events:
+            return None
+        return self._summary_service.summarize(events)
+
+    def _build_bottlenecks(self, summary: StageSummary | None) -> list[dict[str, Any]]:
+        if summary is None:
+            return []
+        bottlenecks: list[dict[str, Any]] = []
+
+        for stage_type in summary.missing_required_stage_types:
+            bottlenecks.append(
+                {
+                    "type": "missing_stage",
+                    "stage_type": stage_type,
+                    "detail": "required stage missing",
+                }
+            )
+
+        durations = summary.stage_type_avg_durations
+        if durations:
+            top = sorted(durations.items(), key=lambda item: item[1], reverse=True)[:3]
+            for stage_type, duration in top:
+                bottlenecks.append(
+                    {
+                        "type": "latency",
+                        "stage_type": stage_type,
+                        "avg_duration_ms": round(duration, 3),
+                    }
+                )
+        return bottlenecks
+
+    def _build_recommendations(self, metrics: list[StageMetric]) -> list[str]:
+        if not metrics:
+            return []
+        guides = self._guide_service.build_guides(metrics)
+        recommendations: list[str] = []
+        for guide in guides:
+            top_action = guide.top_action
+            if top_action is None:
+                continue
+            hint = top_action.implementation_hint or top_action.description
+            label = f"[{guide.priority.value}] {guide.component.value}"
+            if hint:
+                recommendations.append(f"{label}: {top_action.title} - {hint}")
+            else:
+                recommendations.append(f"{label}: {top_action.title}")
+        return recommendations
+
+    def _build_ops_kpis(self, run, events: list[StageEvent]) -> dict[str, Any]:
+        total_cases = run.total_test_cases
+        latencies = [r.latency_ms for r in run.results if r.latency_ms]
+        tokens_used = [r.tokens_used for r in run.results if r.tokens_used]
+        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+
+        avg_latency = _average(latencies)
+        p95_latency = _percentile(latencies, 0.95)
+        avg_tokens = _average(tokens_used)
+        avg_cost = _average(costs)
+        pass_rate = run.pass_rate
+        failure_rate = None if pass_rate is None else max(0.0, 1.0 - pass_rate)
+
+        error_rate = _stage_error_rate(events)
+        error_severity = _stage_error_severity(error_rate)
+
+        return {
+            "total_test_cases": total_cases,
+            "pass_rate": pass_rate,
+            "failure_rate": failure_rate,
+            "stage_error_rate": error_rate,
+            "stage_error_severity": error_severity,
+            "duration_seconds": run.duration_seconds,
+            "total_tokens": run.total_tokens,
+            "total_cost_usd": run.total_cost_usd,
+            "avg_latency_ms": avg_latency,
+            "p95_latency_ms": p95_latency,
+            "avg_tokens_per_case": avg_tokens,
+            "avg_cost_per_case_usd": avg_cost,
+        }
+
+
+def _average(values: list[float | int]) -> float | None:
+    if not values:
+        return None
+    return float(sum(values)) / len(values)
+
+
+def _percentile(values: list[float | int], ratio: float) -> float | None:
+    if not values:
+        return None
+    if ratio <= 0:
+        return float(min(values))
+    if ratio >= 1:
+        return float(max(values))
+    sorted_values = sorted(values)
+    index = int(round((len(sorted_values) - 1) * ratio))
+    return float(sorted_values[index])
+
+
+def _stage_error_rate(events: list[StageEvent]) -> float | None:
+    if not events:
+        return None
+    total = len(events)
+    failure_statuses = {"failed", "error", "timeout", "aborted"}
+    success_statuses = {"success", "ok", "completed", "pass"}
+    failures = 0
+    for event in events:
+        status = str(event.status or "").strip().lower()
+        if status in failure_statuses:
+            failures += 1
+            continue
+        if status and status not in success_statuses:
+            failures += 1
+    return failures / total
+
+
+def _stage_error_severity(rate: float | None) -> str | None:
+    if rate is None:
+        return None
+    if rate >= 0.05:
+        return "critical"
+    if rate >= 0.02:
+        return "warning"
+    return "ok"
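The percentile helper uses nearest-rank selection by rounding the fractional index, which is worth seeing on a small sample; the standalone copy below is runnable and also shows Python's banker's rounding at the halfway point.

```python
# Standalone copy of _percentile from the service above.
def percentile(values: list[float], ratio: float) -> float | None:
    if not values:
        return None
    if ratio <= 0:
        return float(min(values))
    if ratio >= 1:
        return float(max(values))
    sorted_values = sorted(values)
    index = int(round((len(sorted_values) - 1) * ratio))
    return float(sorted_values[index])

latencies = [95, 110, 120, 130, 140, 150, 160, 180, 410, 900]  # ms
print(percentile(latencies, 0.95))  # index = round(9 * 0.95) = 9 -> 900.0
print(percentile(latencies, 0.50))  # index = round(4.5) = 4 (banker's) -> 140.0
```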
{evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.73.2
+Version: 1.74.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
{evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/RECORD

@@ -5,12 +5,12 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
 evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
 evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
-evalvault/adapters/inbound/api/adapter.py,sha256=
+evalvault/adapters/inbound/api/adapter.py,sha256=B0kLz4aEsZ-WeCQlUKSanSz6r9gedr_JmiRIaso3HoA,84659
 evalvault/adapters/inbound/api/main.py,sha256=QgLxzHEy7aycGKIFLtN12tWTjnpWLtQ2XDXKV_2FDvg,7531
 evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
 evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
 evalvault/adapters/inbound/api/routers/calibration.py,sha256=ZnJSEW8hV-94S95lU_nDmzcLyaUoH1suM3sFUpJ3w5k,4130
-evalvault/adapters/inbound/api/routers/chat.py,sha256=
+evalvault/adapters/inbound/api/routers/chat.py,sha256=lQfJTPPcA3H3IzxNpYSTBBAy-gSxCYYmDfEr4hCFp9M,30676
 evalvault/adapters/inbound/api/routers/config.py,sha256=LygN0fVMr8NFtj5zuQXnVFhoafx56Txa98vpwtPa4Jc,4104
 evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
 evalvault/adapters/inbound/api/routers/knowledge.py,sha256=yb_e7OEPtwldOAzHTGiWe7jShHw2JdpOFnzGPMceRsg,7109
@@ -181,6 +181,8 @@ evalvault/adapters/outbound/nlp/korean/korean_evaluation.py,sha256=Mxwu3zhtdm8Te
 evalvault/adapters/outbound/nlp/korean/korean_stopwords.py,sha256=UemEFCJudg2EpsHg8uU2eR-iCh34kw4ZSVCRvnEC6a4,4293
 evalvault/adapters/outbound/nlp/korean/toolkit.py,sha256=EYGpd89ilpn4Wg5t8pALYt4Qi0aDHYOfXGuYbQx7do0,4246
 evalvault/adapters/outbound/nlp/korean/toolkit_factory.py,sha256=x3v-AAkVInOabC4PtOtStsZrFnHun0IOqZDyQGaQVm8,586
+evalvault/adapters/outbound/ops/__init__.py,sha256=_QiDVPuiYWkIwW_ELEVKD_v6dLojjyvIJWs4qVNxehw,164
+evalvault/adapters/outbound/ops/report_renderer.py,sha256=mezVKdIsnJSNvBW6xkhpNG3MOFXHZLZspmHk5o-e8Cg,6354
 evalvault/adapters/outbound/phoenix/sync_service.py,sha256=i6gHpNiZXKQ5yzV9B2TPb-P1N45k_Ck5ruzh3oqp4d8,9122
 evalvault/adapters/outbound/report/__init__.py,sha256=8VeMrfj63mDR-xUHct-drNNBA5M-m-B7sgC1qUJF7g4,660
 evalvault/adapters/outbound/report/ci_report_formatter.py,sha256=5YD8BwtOjLnHcNbbG0HJziOifD9BDhBtZT1oItd6zJE,1233
@@ -193,10 +195,10 @@ evalvault/adapters/outbound/retriever/graph_rag_adapter.py,sha256=xTI7uMFp4WKstg
 evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
 evalvault/adapters/outbound/storage/base_sql.py,sha256=NHHlBBhoDq8zbLfIvRoOswjB97Tro1sRE3cmbR8QnV8,58380
 evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
-evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=
-evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=
-evalvault/adapters/outbound/storage/schema.sql,sha256=
-evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=
+evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=SZb4Dx2ZYNu-pDqX4rVgT4bRfJ2cQ75DXcWIIEu8aws,59933
+evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=E-HMnGjrB05n2Z-zDq1kQzYSpYlPwg2Egm1e3MSRB7E,12335
+evalvault/adapters/outbound/storage/schema.sql,sha256=LEtrKFpa1SbIBN-igkNiQXJqVLGp-liX-6KR_sZAMwM,14283
+evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=Gsco1g_N1RHyRzIQIwelR-PU7kx57djHOc_hKEu_gCc,62940
 evalvault/adapters/outbound/tracer/__init__.py,sha256=xrvQQuAvF_UI02mKLMV7GTrG3zn836n5zwCRrrmhq_U,1054
 evalvault/adapters/outbound/tracer/open_rag_log_handler.py,sha256=aq96FIWD-bBaSkq-bygWhQArC9LWghSwi-S03Mga0mI,2827
 evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py,sha256=X_dlqUEwck_MQRGC-euEN73L2Ikcd5eZlgtKK24nDFY,5453
@@ -235,6 +237,7 @@ evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0
 evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
 evalvault/domain/entities/method.py,sha256=a3jZi7SjcpK3HeVyVwQkUMwpnmg2RbxCnH4NqYPLCOI,1157
 evalvault/domain/entities/multiturn.py,sha256=V9ay30rix6zxNcDRXeLudMgikC1b4f3kt01Hj2ZH7wE,2012
+evalvault/domain/entities/ops_report.py,sha256=89x1UKdFb6oxpsQb4pfED69dcCqnyrzBuOo42dcbgho,1343
 evalvault/domain/entities/prompt.py,sha256=lQlRnHEKY69GWTC-cUIu0DMuPfJ9UWm6Sm4KTNjVwfY,2920
 evalvault/domain/entities/prompt_suggestion.py,sha256=Ep_XSjdYUj7pFSCMyeeZKs8yTnp74AVx05Zqr7829PE,1243
 evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDSQns35U1n4,11868
@@ -292,6 +295,7 @@ evalvault/domain/services/memory_aware_evaluator.py,sha256=vTiYoxiMfZ_CMjSBjqwkB
 evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SMEhPyu7fyBVz-giO2hlNifE,4499
 evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
 evalvault/domain/services/multiturn_evaluator.py,sha256=fipi5hEyidq_cnGGr0GpvoprLtjm6dHLuAkSotbT3YA,7202
+evalvault/domain/services/ops_report_service.py,sha256=FNMN0GpaA3rnnbYs-5xY1ReXzGWaQG0t3ZN0eoeQiRc,7018
 evalvault/domain/services/ops_snapshot_service.py,sha256=1CqJN2p3tM6SgzLCZKcVEM213fd1cDGexTRPG_3e59w,5138
 evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
 evalvault/domain/services/pipeline_template_registry.py,sha256=Gg0k-Sj3MNDBjckhI2gQRCNUsqIz0nnjPuswW5kUdBw,29567
@@ -355,8 +359,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
+evalvault-1.74.0.dist-info/METADATA,sha256=ASZ_QeCeh2pyJ3eBUfu0zDKbIwDbvMp9JzL6KVvjWqk,26218
+evalvault-1.74.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.74.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.74.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.74.0.dist-info/RECORD,,
File without changes: {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/WHEEL
File without changes: {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/entry_points.txt
File without changes: {evalvault-1.73.2.dist-info → evalvault-1.74.0.dist-info}/licenses/LICENSE.md