evalvault 1.62.1__py3-none-any.whl → 1.63.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes and reflects the changes between the two versions exactly as they appear in the public registry.
Files changed (46)
  1. evalvault/adapters/inbound/api/adapter.py +190 -19
  2. evalvault/adapters/inbound/api/routers/runs.py +66 -2
  3. evalvault/adapters/inbound/cli/commands/method.py +5 -2
  4. evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
  5. evalvault/adapters/inbound/cli/commands/run.py +43 -2
  6. evalvault/adapters/inbound/cli/commands/run_helpers.py +10 -0
  7. evalvault/adapters/inbound/mcp/tools.py +5 -2
  8. evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
  9. evalvault/adapters/outbound/llm/__init__.py +5 -43
  10. evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
  11. evalvault/adapters/outbound/llm/factory.py +103 -0
  12. evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
  13. evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
  14. evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
  15. evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
  16. evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
  17. evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
  18. evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
  19. evalvault/adapters/outbound/storage/base_sql.py +527 -21
  20. evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
  21. evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
  22. evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
  23. evalvault/debug_ragas.py +7 -1
  24. evalvault/debug_ragas_real.py +5 -1
  25. evalvault/domain/entities/__init__.py +10 -0
  26. evalvault/domain/entities/prompt_suggestion.py +50 -0
  27. evalvault/domain/services/__init__.py +6 -0
  28. evalvault/domain/services/evaluator.py +191 -103
  29. evalvault/domain/services/holdout_splitter.py +67 -0
  30. evalvault/domain/services/intent_classifier.py +73 -0
  31. evalvault/domain/services/pipeline_template_registry.py +3 -0
  32. evalvault/domain/services/prompt_candidate_service.py +117 -0
  33. evalvault/domain/services/prompt_registry.py +40 -2
  34. evalvault/domain/services/prompt_scoring_service.py +286 -0
  35. evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
  36. evalvault/domain/services/synthetic_qa_generator.py +4 -3
  37. evalvault/ports/inbound/learning_hook_port.py +4 -1
  38. evalvault/ports/outbound/__init__.py +2 -0
  39. evalvault/ports/outbound/llm_factory_port.py +13 -0
  40. evalvault/ports/outbound/llm_port.py +34 -2
  41. evalvault/ports/outbound/storage_port.py +38 -0
  42. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/METADATA +228 -4
  43. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/RECORD +46 -38
  44. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/WHEEL +0 -0
  45. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/entry_points.txt +0 -0
  46. {evalvault-1.62.1.dist-info → evalvault-1.63.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py
@@ -3,13 +3,14 @@
 from __future__ import annotations
 
 import asyncio
+import difflib
 import json
 import logging
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 from urllib.request import urlopen
 
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
@@ -19,6 +20,7 @@ from evalvault.domain.entities import (
     FeedbackSummary,
     SatisfactionFeedback,
 )
+from evalvault.domain.entities.debug import DebugReport
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
     get_metric_descriptions as registry_metric_descriptions,
@@ -28,9 +30,11 @@ from evalvault.domain.metrics.registry import (
     list_metric_specs,
 )
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
+from evalvault.domain.services.debug_report_service import DebugReportService
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
+    build_prompt_inputs_from_snapshots,
     build_prompt_summary,
 )
 from evalvault.domain.services.prompt_status import extract_prompt_entries
@@ -47,12 +51,16 @@ from evalvault.ports.inbound.web_port import (
     RunFilters,
     RunSummary,
 )
+from evalvault.ports.outbound.stage_storage_port import StageStoragePort
 
 if TYPE_CHECKING:
     from evalvault.domain.entities import EvaluationRun, RunClusterMap, RunClusterMapInfo
     from evalvault.domain.entities.improvement import ImprovementReport
     from evalvault.domain.entities.stage import StageEvent, StageMetric
+    from evalvault.domain.services.evaluator import RagasEvaluator
+    from evalvault.ports.outbound.dataset_port import DatasetPort
     from evalvault.ports.outbound.llm_port import LLMPort
+    from evalvault.ports.outbound.report_port import ReportPort
     from evalvault.ports.outbound.storage_port import StoragePort
 
 logger = logging.getLogger(__name__)
@@ -90,10 +98,10 @@ class WebUIAdapter:
     def __init__(
         self,
         storage: StoragePort | None = None,
-        evaluator: object | None = None,
-        report_generator: object | None = None,
+        evaluator: RagasEvaluator | None = None,
+        report_generator: ReportPort | None = None,
         llm_adapter: LLMPort | None = None,
-        data_loader: object | None = None,
+        data_loader: DatasetPort | None = None,
         settings: Settings | None = None,
     ):
         """어댑터 초기화.
@@ -105,12 +113,21 @@ class WebUIAdapter:
             llm_adapter: LLM 어댑터 (선택적)
             data_loader: 데이터 로더 (선택적)
         """
+        resolved_settings = settings
+        if storage is None:
+            resolved_settings = settings or Settings()
+            db_path = getattr(resolved_settings, "evalvault_db_path", None)
+            if db_path:
+                from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+
+                storage = SQLiteStorageAdapter(db_path=db_path)
+
         self._storage = storage
         self._evaluator = evaluator
         self._report_generator = report_generator
         self._llm_adapter = llm_adapter
         self._data_loader = data_loader
-        self._settings = settings
+        self._settings = resolved_settings
         self._phoenix_resolver: PhoenixExperimentResolver | None = None
         self._phoenix_resolver_checked = False
 
@@ -362,13 +379,14 @@ class WebUIAdapter:
         """
         if self._evaluator is None:
             raise RuntimeError("Evaluator not configured")
+        evaluator = self._evaluator
 
         # LLM Adapter Resolution
-        llm_adapter = self._get_llm_for_model(request.model_name)
-        if llm_adapter is None:
+        resolved_llm = self._get_llm_for_model(request.model_name)
+        if resolved_llm is None:
             if self._llm_adapter is None:
                 raise RuntimeError("LLM adapter not configured")
-            llm_adapter = self._llm_adapter
+            resolved_llm = self._llm_adapter
             logger.warning(f"Using default LLM adapter instead of requested {request.model_name}")
 
         # 1. 데이터셋 로드 (비동기 처리)
@@ -540,7 +558,7 @@
                 result = await memory_evaluator.evaluate_with_memory(
                     dataset=dataset,
                     metrics=request.metrics,
-                    llm=llm_adapter,
+                    llm=resolved_llm,
                     thresholds=resolved_thresholds,
                     parallel=request.parallel,
                     batch_size=request.batch_size,
@@ -553,10 +571,10 @@
                     on_progress=adaptor_progress,
                 )
             else:
-                result = await self._evaluator.evaluate(
+                result = await evaluator.evaluate(
                    dataset=dataset,
                    metrics=request.metrics,
-                    llm=llm_adapter,
+                    llm=resolved_llm,
                    thresholds=resolved_thresholds,
                    parallel=request.parallel,
                    batch_size=request.batch_size,
@@ -573,6 +591,34 @@
                 on_progress(EvalProgress(0, 0, "", 0.0, "failed", str(e)))
             raise e
 
+        tracker_meta = result.tracker_metadata or {}
+        result.tracker_metadata = tracker_meta
+        ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
+        ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+        )
+        override_status: dict[str, str] = {}
+        raw_override = tracker_meta.get("ragas_prompt_overrides")
+        if isinstance(raw_override, dict):
+            override_status = cast(dict[str, str], raw_override)
+        if override_status:
+            prompt_inputs = [
+                entry
+                for entry in prompt_inputs
+                if not (
+                    entry.kind == "ragas"
+                    and override_status.get(entry.role) is not None
+                    and override_status.get(entry.role) != "applied"
+                )
+            ]
+
+        if ragas_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "ragas"}
+            for entry in ragas_snapshot_inputs:
+                if entry.role in existing_roles and override_status.get(entry.role) == "applied":
+                    continue
+                prompt_inputs.append(entry)
+
         prompt_bundle = None
         if prompt_inputs:
             prompt_bundle = build_prompt_bundle(
@@ -684,7 +730,15 @@
                 prompt_bundle.prompt_set.prompt_set_id,
             )
         try:
-            self._auto_generate_cluster_map(result, llm_adapter)
+            export_settings = self._settings or Settings()
+            export_base = Path(export_settings.evalvault_db_path)
+            excel_path = export_base.parent / f"evalvault_run_{result.run_id}.xlsx"
+            if hasattr(self._storage, "export_run_to_excel"):
+                self._storage.export_run_to_excel(result.run_id, excel_path)
+        except Exception as exc:
+            logger.warning("Excel export failed for run %s: %s", result.run_id, exc)
+        try:
+            self._auto_generate_cluster_map(result, resolved_llm)
         except Exception as exc:
             logger.warning("Cluster map auto-generation failed: %s", exc)
 
@@ -957,6 +1011,116 @@
         ]
         return metrics
 
+    def compare_prompt_sets(
+        self,
+        base_run_id: str,
+        target_run_id: str,
+        *,
+        max_lines: int = 40,
+        include_diff: bool = True,
+    ) -> dict[str, Any]:
+        if self._storage is None or not hasattr(self._storage, "get_prompt_set_for_run"):
+            raise RuntimeError("Storage not configured")
+
+        base_bundle = self._storage.get_prompt_set_for_run(base_run_id)
+        target_bundle = self._storage.get_prompt_set_for_run(target_run_id)
+        if not base_bundle or not target_bundle:
+            raise KeyError("Prompt set not found")
+
+        base_roles = self._prompt_bundle_role_map(base_bundle)
+        target_roles = self._prompt_bundle_role_map(target_bundle)
+        all_roles = sorted(set(base_roles) | set(target_roles))
+
+        summary: list[dict[str, Any]] = []
+        diffs: list[dict[str, Any]] = []
+
+        for role in all_roles:
+            base = base_roles.get(role)
+            target = target_roles.get(role)
+            if not base or not target:
+                summary.append(
+                    {
+                        "role": role,
+                        "base_checksum": base["checksum"] if base else None,
+                        "target_checksum": target["checksum"] if target else None,
+                        "status": "missing",
+                        "base_name": base["name"] if base else None,
+                        "target_name": target["name"] if target else None,
+                        "base_kind": base["kind"] if base else None,
+                        "target_kind": target["kind"] if target else None,
+                    }
+                )
+                continue
+
+            status = "same" if base["checksum"] == target["checksum"] else "diff"
+            summary.append(
+                {
+                    "role": role,
+                    "base_checksum": base["checksum"],
+                    "target_checksum": target["checksum"],
+                    "status": status,
+                    "base_name": base["name"],
+                    "target_name": target["name"],
+                    "base_kind": base["kind"],
+                    "target_kind": target["kind"],
+                }
+            )
+
+            if include_diff and status == "diff":
+                diff_lines = list(
+                    difflib.unified_diff(
+                        base["content"].splitlines(),
+                        target["content"].splitlines(),
+                        fromfile=f"{base_run_id[:8]}:{role}",
+                        tofile=f"{target_run_id[:8]}:{role}",
+                        lineterm="",
+                    )
+                )
+                truncated = len(diff_lines) > max_lines
+                diffs.append(
+                    {
+                        "role": role,
+                        "lines": diff_lines[:max_lines],
+                        "truncated": truncated,
+                    }
+                )
+
+        return {
+            "base_run_id": base_run_id,
+            "target_run_id": target_run_id,
+            "summary": summary,
+            "diffs": diffs,
+        }
+
+    def _prompt_bundle_role_map(self, bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
+        prompt_map = {prompt.prompt_id: prompt for prompt in bundle.prompts}
+        roles: dict[str, dict[str, str]] = {}
+        for item in bundle.items:
+            prompt = prompt_map.get(item.prompt_id)
+            if not prompt:
+                continue
+            roles[item.role] = {
+                "checksum": prompt.checksum,
+                "content": prompt.content,
+                "name": prompt.name,
+                "kind": prompt.kind,
+            }
+        return roles
+
+    def build_debug_report(self, run_id: str) -> DebugReport:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        if not hasattr(self._storage, "list_stage_events"):
+            raise RuntimeError("Stage storage not configured")
+
+        service = DebugReportService()
+        stage_storage = cast(StageStoragePort, self._storage)
+        return service.build_report(
+            run_id,
+            storage=self._storage,
+            stage_storage=stage_storage,
+        )
+
     def delete_run(self, run_id: str) -> bool:
         """평가 삭제.
 
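The new compare_prompt_sets method resolves each prompt role to a checksum and, when the checksums differ, renders a role-level unified diff with difflib.unified_diff, truncated to max_lines. Below is a minimal standalone sketch of that diffing step; the run ids and prompt texts are invented for illustration, whereas the real method loads them from storage via get_prompt_set_for_run.

    import difflib

    # Invented inputs for one prompt role across two runs; the adapter reads these
    # from StoragePort.get_prompt_set_for_run() instead.
    base_run_id = "run-aaaa1111"
    target_run_id = "run-bbbb2222"
    role = "answer_relevancy"
    base_content = "You are a strict evaluator.\nScore relevancy from 0 to 1."
    target_content = (
        "You are a strict evaluator.\nScore relevancy from 0 to 1.\nExplain the score briefly."
    )

    max_lines = 40
    diff_lines = list(
        difflib.unified_diff(
            base_content.splitlines(),
            target_content.splitlines(),
            fromfile=f"{base_run_id[:8]}:{role}",  # same labelling scheme as the method above
            tofile=f"{target_run_id[:8]}:{role}",
            lineterm="",
        )
    )
    entry = {
        "role": role,
        "lines": diff_lines[:max_lines],  # truncated exactly like the adapter's diff entries
        "truncated": len(diff_lines) > max_lines,
    }
    print("\n".join(entry["lines"]))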
evalvault/adapters/inbound/api/adapter.py (continued)
@@ -1138,6 +1302,8 @@ class WebUIAdapter:
             raise RuntimeError("Evaluator not configured")
         if self._llm_adapter is None:
             raise RuntimeError("LLM adapter not configured. .env에 OPENAI_API_KEY를 설정하세요.")
+        evaluator = self._evaluator
+        llm_adapter = self._llm_adapter
 
         # 진행률 초기화
         if on_progress:
@@ -1156,10 +1322,10 @@ class WebUIAdapter:
         logger.info(f"Starting evaluation ({mode}) with metrics: {metrics}")
 
         async def run_async_evaluation():
-            return await self._evaluator.evaluate(
+            return await evaluator.evaluate(
                 dataset=dataset,
                 metrics=metrics,
-                llm=self._llm_adapter,
+                llm=llm_adapter,
                 thresholds=thresholds or {},
                 parallel=parallel,
                 batch_size=batch_size,
@@ -1356,6 +1522,7 @@ class WebUIAdapter:
         metrics_to_analyze: list[str] | None = None,
         thresholds: dict[str, float] | None = None,
         model_id: str | None = None,
+        language: str | None = None,
     ):
         """LLM 기반 지능형 보고서 생성.
 
@@ -1395,6 +1562,7 @@ class WebUIAdapter:
             llm_adapter=llm_adapter,
             include_research_insights=True,
             include_action_items=True,
+            language=language or "ko",
         )
 
         # 동기 방식으로 보고서 생성
@@ -1462,7 +1630,7 @@
 
         return str(file_path.absolute())
 
-    def list_models(self, provider: str | None = None) -> list[dict[str, str]]:
+    def list_models(self, provider: str | None = None) -> list[dict[str, str | bool]]:
         """사용 가능한 모델 목록 조회."""
         settings = self._settings or Settings()
         provider_key = provider.lower() if provider else None
@@ -1476,7 +1644,7 @@
         if provider_key:
             return self._list_other_models(provider_key)
 
-        models: list[dict[str, str]] = []
+        models: list[dict[str, str | bool]] = []
         models.extend(self._list_ollama_models(settings))
         models.extend(self._list_openai_models())
         models.extend(self._list_vllm_models(settings))
@@ -1571,7 +1739,7 @@
         lowered = model_name.lower()
         return any(lowered == entry or lowered.startswith(f"{entry}:") for entry in allowlist)
 
-    def _list_other_models(self, provider: str | None = None) -> list[dict[str, str]]:
+    def _list_other_models(self, provider: str | None = None) -> list[dict[str, str | bool]]:
         if provider and provider not in {"anthropic", "azure"}:
             return []
         return [
@@ -1612,7 +1780,8 @@ def create_adapter() -> WebUIAdapter:
 
     설정에 따라 적절한 저장소와 서비스를 주입합니다.
     """
-    from evalvault.adapters.outbound.llm import get_llm_adapter
+    from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
    from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
    from evalvault.config.settings import get_settings
    from evalvault.domain.services.evaluator import RagasEvaluator
@@ -1633,7 +1802,9 @@ def create_adapter() -> WebUIAdapter:
         logger.warning(f"LLM adapter initialization failed: {e}")
 
     # Evaluator 생성
-    evaluator = RagasEvaluator()
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
 
     return WebUIAdapter(
         storage=storage,
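Both create_adapter() above and the method CLI command (last file in this diff) now construct the evaluator through the same path: a SettingsLLMFactory built from the resolved settings plus an optionally available Korean NLP toolkit. A minimal sketch of that composition, assuming Settings() can be populated from the environment and that evalvault_db_path points at a usable SQLite file (neither assumption is shown in this diff):

    from evalvault.adapters.inbound.api.adapter import WebUIAdapter
    from evalvault.adapters.outbound.llm import SettingsLLMFactory
    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
    from evalvault.config.settings import Settings
    from evalvault.domain.services.evaluator import RagasEvaluator

    settings = Settings()  # assumes the environment/.env supplies the required configuration
    llm_factory = SettingsLLMFactory(settings)
    korean_toolkit = try_create_korean_toolkit()  # assumed to degrade to None when optional deps are absent
    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)

    # With storage omitted, WebUIAdapter.__init__ now falls back to a SQLiteStorageAdapter
    # created from settings.evalvault_db_path (see the __init__ hunk above).
    adapter = WebUIAdapter(evaluator=evaluator, settings=settings)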
evalvault/adapters/inbound/api/routers/runs.py
@@ -10,7 +10,7 @@ from pathlib import Path
 from typing import Any, Literal
 
 from fastapi import APIRouter, File, HTTPException, Query, UploadFile
-from fastapi.responses import Response, StreamingResponse
+from fastapi.responses import PlainTextResponse, Response, StreamingResponse
 from pydantic import BaseModel
 
 from evalvault.adapters.inbound.api.main import AdapterDep
@@ -19,6 +19,7 @@ from evalvault.adapters.outbound.dataset.templates import (
     render_dataset_template_json,
     render_dataset_template_xlsx,
 )
+from evalvault.adapters.outbound.debug.report_renderer import render_markdown
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (
@@ -81,6 +82,30 @@ class QualityGateReportResponse(BaseModel):
     regression_amount: float | None = None
 
 
+class PromptDiffSummaryItem(BaseModel):
+    role: str
+    base_checksum: str | None = None
+    target_checksum: str | None = None
+    status: Literal["same", "diff", "missing"]
+    base_name: str | None = None
+    target_name: str | None = None
+    base_kind: str | None = None
+    target_kind: str | None = None
+
+
+class PromptDiffEntry(BaseModel):
+    role: str
+    lines: list[str]
+    truncated: bool
+
+
+class PromptDiffResponse(BaseModel):
+    base_run_id: str
+    target_run_id: str
+    summary: list[PromptDiffSummaryItem]
+    diffs: list[PromptDiffEntry]
+
+
 class StartEvaluationRequest(BaseModel):
     dataset_path: str
     metrics: list[str]
@@ -1067,6 +1092,27 @@ def list_stage_metrics(
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.get("/prompt-diff", response_model=PromptDiffResponse)
+def prompt_diff(
+    adapter: AdapterDep,
+    base_run_id: str = Query(..., description="Base run id"),
+    target_run_id: str = Query(..., description="Target run id"),
+    max_lines: int = Query(40, ge=1, le=200),
+    include_diff: bool = Query(True),
+):
+    try:
+        return adapter.compare_prompt_sets(
+            base_run_id,
+            target_run_id,
+            max_lines=max_lines,
+            include_diff=include_diff,
+        )
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Prompt set not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/quality-gate", response_model=QualityGateReportResponse)
 def check_quality_gate(run_id: str, adapter: AdapterDep):
     """Check quality gate status for a run."""
@@ -1079,6 +1125,23 @@ def check_quality_gate(run_id: str, adapter: AdapterDep):
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.get("/{run_id}/debug-report", response_model=None)
+def get_debug_report(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["json", "markdown"] = Query("json", description="Report format"),
+):
+    try:
+        report = adapter.build_debug_report(run_id)
+        if format == "markdown":
+            return PlainTextResponse(render_markdown(report))
+        return report.to_dict()
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/improvement")
 def get_improvement_guide(
     run_id: str,
@@ -1101,10 +1164,11 @@ def generate_llm_report(
     run_id: str,
     adapter: AdapterDep,
     model_id: str | None = None,
+    language: str | None = Query(None, description="Report language (ko/en)"),
 ):
     """Generate LLM-based detailed report."""
     try:
-        report = adapter.generate_llm_report(run_id, model_id=model_id)
+        report = adapter.generate_llm_report(run_id, model_id=model_id, language=language)
         return report
     except KeyError:
         raise HTTPException(status_code=404, detail="Run not found")
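The two new endpoints are plain GETs, so they can be exercised with any HTTP client once the API is running. A sketch using httpx, assuming the server listens on http://localhost:8000 and this router is mounted under /runs (host, port, and prefix are assumptions not visible in this diff), with placeholder run ids:

    import httpx

    BASE = "http://localhost:8000/runs"  # assumed host and router prefix

    # Role-by-role prompt comparison between two runs.
    resp = httpx.get(
        f"{BASE}/prompt-diff",
        params={
            "base_run_id": "run-aaaa1111",   # placeholder run ids
            "target_run_id": "run-bbbb2222",
            "max_lines": 80,
            "include_diff": True,
        },
    )
    resp.raise_for_status()
    for item in resp.json()["summary"]:
        print(item["role"], item["status"])

    # Debug report for one run, rendered as markdown instead of JSON.
    report = httpx.get(f"{BASE}/run-aaaa1111/debug-report", params={"format": "markdown"})
    print(report.text)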
evalvault/adapters/inbound/cli/commands/method.py
@@ -15,8 +15,9 @@ from rich.console import Console
 from rich.table import Table
 
 from evalvault.adapters.outbound.dataset.method_input_loader import MethodInputDatasetLoader
-from evalvault.adapters.outbound.llm import get_llm_adapter
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
 from evalvault.adapters.outbound.methods import ExternalCommandMethod, MethodRegistry
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities import Dataset
 from evalvault.domain.entities.method import MethodOutput
@@ -376,7 +377,9 @@ def create_method_app(console: Console) -> typer.Typer:
             raise typer.Exit(1)
 
         llm_adapter = get_llm_adapter(settings)
-        evaluator = RagasEvaluator()
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
         resolved_thresholds = _resolve_thresholds(metric_list, method_result.dataset)
 
         with progress_spinner(console, "🤖 Evaluation in progress") as update_progress: