evalvault 1.62.1__py3-none-any.whl → 1.63.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +190 -19
- evalvault/adapters/inbound/api/routers/runs.py +66 -2
- evalvault/adapters/inbound/cli/commands/method.py +5 -2
- evalvault/adapters/inbound/cli/commands/prompts.py +613 -5
- evalvault/adapters/inbound/cli/commands/run.py +88 -5
- evalvault/adapters/inbound/cli/commands/run_helpers.py +12 -0
- evalvault/adapters/inbound/mcp/tools.py +5 -2
- evalvault/adapters/outbound/analysis/ragas_evaluator_module.py +13 -9
- evalvault/adapters/outbound/improvement/pattern_detector.py +1 -1
- evalvault/adapters/outbound/improvement/playbook_loader.py +1 -1
- evalvault/adapters/outbound/llm/__init__.py +5 -43
- evalvault/adapters/outbound/llm/anthropic_adapter.py +27 -7
- evalvault/adapters/outbound/llm/factory.py +103 -0
- evalvault/adapters/outbound/llm/llm_relation_augmenter.py +39 -14
- evalvault/adapters/outbound/llm/ollama_adapter.py +34 -10
- evalvault/adapters/outbound/llm/openai_adapter.py +41 -8
- evalvault/adapters/outbound/llm/token_aware_chat.py +21 -2
- evalvault/adapters/outbound/llm/vllm_adapter.py +39 -8
- evalvault/adapters/outbound/nlp/korean/toolkit_factory.py +20 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +90 -6
- evalvault/adapters/outbound/storage/base_sql.py +528 -21
- evalvault/adapters/outbound/storage/postgres_adapter.py +209 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +38 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +86 -5
- evalvault/debug_ragas.py +7 -1
- evalvault/debug_ragas_real.py +5 -1
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/prompt_suggestion.py +50 -0
- evalvault/domain/services/__init__.py +6 -0
- evalvault/domain/services/evaluator.py +191 -103
- evalvault/domain/services/holdout_splitter.py +67 -0
- evalvault/domain/services/intent_classifier.py +73 -0
- evalvault/domain/services/pipeline_template_registry.py +3 -0
- evalvault/domain/services/prompt_candidate_service.py +117 -0
- evalvault/domain/services/prompt_registry.py +40 -2
- evalvault/domain/services/prompt_scoring_service.py +286 -0
- evalvault/domain/services/prompt_suggestion_reporter.py +277 -0
- evalvault/domain/services/synthetic_qa_generator.py +4 -3
- evalvault/ports/inbound/learning_hook_port.py +4 -1
- evalvault/ports/outbound/__init__.py +2 -0
- evalvault/ports/outbound/llm_factory_port.py +13 -0
- evalvault/ports/outbound/llm_port.py +34 -2
- evalvault/ports/outbound/storage_port.py +38 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/METADATA +228 -4
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/RECORD +48 -40
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/WHEEL +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/entry_points.txt +0 -0
- {evalvault-1.62.1.dist-info → evalvault-1.63.1.dist-info}/licenses/LICENSE.md +0 -0

evalvault/adapters/inbound/api/adapter.py

@@ -3,13 +3,14 @@
 from __future__ import annotations

 import asyncio
+import difflib
 import json
 import logging
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, cast
 from urllib.request import urlopen

 from evalvault.config.phoenix_support import PhoenixExperimentResolver
@@ -19,6 +20,7 @@ from evalvault.domain.entities import (
     FeedbackSummary,
     SatisfactionFeedback,
 )
+from evalvault.domain.entities.debug import DebugReport
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
     get_metric_descriptions as registry_metric_descriptions,
@@ -28,9 +30,11 @@ from evalvault.domain.metrics.registry import (
     list_metric_specs,
 )
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
+from evalvault.domain.services.debug_report_service import DebugReportService
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
+    build_prompt_inputs_from_snapshots,
     build_prompt_summary,
 )
 from evalvault.domain.services.prompt_status import extract_prompt_entries
@@ -47,12 +51,16 @@ from evalvault.ports.inbound.web_port import (
     RunFilters,
     RunSummary,
 )
+from evalvault.ports.outbound.stage_storage_port import StageStoragePort

 if TYPE_CHECKING:
     from evalvault.domain.entities import EvaluationRun, RunClusterMap, RunClusterMapInfo
     from evalvault.domain.entities.improvement import ImprovementReport
     from evalvault.domain.entities.stage import StageEvent, StageMetric
+    from evalvault.domain.services.evaluator import RagasEvaluator
+    from evalvault.ports.outbound.dataset_port import DatasetPort
     from evalvault.ports.outbound.llm_port import LLMPort
+    from evalvault.ports.outbound.report_port import ReportPort
     from evalvault.ports.outbound.storage_port import StoragePort

 logger = logging.getLogger(__name__)
@@ -90,10 +98,10 @@ class WebUIAdapter:
     def __init__(
         self,
         storage: StoragePort | None = None,
-        evaluator:
-        report_generator:
+        evaluator: RagasEvaluator | None = None,
+        report_generator: ReportPort | None = None,
         llm_adapter: LLMPort | None = None,
-        data_loader:
+        data_loader: DatasetPort | None = None,
         settings: Settings | None = None,
     ):
         """어댑터 초기화.
@@ -105,12 +113,21 @@ class WebUIAdapter:
             llm_adapter: LLM 어댑터 (선택적)
             data_loader: 데이터 로더 (선택적)
         """
+        resolved_settings = settings
+        if storage is None:
+            resolved_settings = settings or Settings()
+            db_path = getattr(resolved_settings, "evalvault_db_path", None)
+            if db_path:
+                from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+
+                storage = SQLiteStorageAdapter(db_path=db_path)
+
         self._storage = storage
         self._evaluator = evaluator
         self._report_generator = report_generator
         self._llm_adapter = llm_adapter
         self._data_loader = data_loader
-        self._settings =
+        self._settings = resolved_settings
         self._phoenix_resolver: PhoenixExperimentResolver | None = None
         self._phoenix_resolver_checked = False

@@ -362,13 +379,14 @@ class WebUIAdapter:
         """
         if self._evaluator is None:
             raise RuntimeError("Evaluator not configured")
+        evaluator = self._evaluator

         # LLM Adapter Resolution
-
-        if
+        resolved_llm = self._get_llm_for_model(request.model_name)
+        if resolved_llm is None:
             if self._llm_adapter is None:
                 raise RuntimeError("LLM adapter not configured")
-
+            resolved_llm = self._llm_adapter
             logger.warning(f"Using default LLM adapter instead of requested {request.model_name}")

         # 1. 데이터셋 로드 (비동기 처리)
@@ -540,7 +558,7 @@ class WebUIAdapter:
             result = await memory_evaluator.evaluate_with_memory(
                 dataset=dataset,
                 metrics=request.metrics,
-                llm=
+                llm=resolved_llm,
                 thresholds=resolved_thresholds,
                 parallel=request.parallel,
                 batch_size=request.batch_size,
@@ -553,10 +571,10 @@ class WebUIAdapter:
                 on_progress=adaptor_progress,
             )
         else:
-            result = await
+            result = await evaluator.evaluate(
                 dataset=dataset,
                 metrics=request.metrics,
-                llm=
+                llm=resolved_llm,
                 thresholds=resolved_thresholds,
                 parallel=request.parallel,
                 batch_size=request.batch_size,
@@ -573,6 +591,34 @@ class WebUIAdapter:
             on_progress(EvalProgress(0, 0, "", 0.0, "failed", str(e)))
             raise e

+        tracker_meta = result.tracker_metadata or {}
+        result.tracker_metadata = tracker_meta
+        ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
+        ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+        )
+        override_status: dict[str, str] = {}
+        raw_override = tracker_meta.get("ragas_prompt_overrides")
+        if isinstance(raw_override, dict):
+            override_status = cast(dict[str, str], raw_override)
+        if override_status:
+            prompt_inputs = [
+                entry
+                for entry in prompt_inputs
+                if not (
+                    entry.kind == "ragas"
+                    and override_status.get(entry.role) is not None
+                    and override_status.get(entry.role) != "applied"
+                )
+            ]
+
+        if ragas_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "ragas"}
+            for entry in ragas_snapshot_inputs:
+                if entry.role in existing_roles and override_status.get(entry.role) == "applied":
+                    continue
+                prompt_inputs.append(entry)
+
         prompt_bundle = None
         if prompt_inputs:
             prompt_bundle = build_prompt_bundle(
@@ -684,7 +730,15 @@ class WebUIAdapter:
                 prompt_bundle.prompt_set.prompt_set_id,
             )
         try:
-            self.
+            export_settings = self._settings or Settings()
+            export_base = Path(export_settings.evalvault_db_path)
+            excel_path = export_base.parent / f"evalvault_run_{result.run_id}.xlsx"
+            if hasattr(self._storage, "export_run_to_excel"):
+                self._storage.export_run_to_excel(result.run_id, excel_path)
+        except Exception as exc:
+            logger.warning("Excel export failed for run %s: %s", result.run_id, exc)
+        try:
+            self._auto_generate_cluster_map(result, resolved_llm)
         except Exception as exc:
             logger.warning("Cluster map auto-generation failed: %s", exc)

@@ -957,6 +1011,116 @@ class WebUIAdapter:
         ]
         return metrics

+    def compare_prompt_sets(
+        self,
+        base_run_id: str,
+        target_run_id: str,
+        *,
+        max_lines: int = 40,
+        include_diff: bool = True,
+    ) -> dict[str, Any]:
+        if self._storage is None or not hasattr(self._storage, "get_prompt_set_for_run"):
+            raise RuntimeError("Storage not configured")
+
+        base_bundle = self._storage.get_prompt_set_for_run(base_run_id)
+        target_bundle = self._storage.get_prompt_set_for_run(target_run_id)
+        if not base_bundle or not target_bundle:
+            raise KeyError("Prompt set not found")
+
+        base_roles = self._prompt_bundle_role_map(base_bundle)
+        target_roles = self._prompt_bundle_role_map(target_bundle)
+        all_roles = sorted(set(base_roles) | set(target_roles))
+
+        summary: list[dict[str, Any]] = []
+        diffs: list[dict[str, Any]] = []
+
+        for role in all_roles:
+            base = base_roles.get(role)
+            target = target_roles.get(role)
+            if not base or not target:
+                summary.append(
+                    {
+                        "role": role,
+                        "base_checksum": base["checksum"] if base else None,
+                        "target_checksum": target["checksum"] if target else None,
+                        "status": "missing",
+                        "base_name": base["name"] if base else None,
+                        "target_name": target["name"] if target else None,
+                        "base_kind": base["kind"] if base else None,
+                        "target_kind": target["kind"] if target else None,
+                    }
+                )
+                continue
+
+            status = "same" if base["checksum"] == target["checksum"] else "diff"
+            summary.append(
+                {
+                    "role": role,
+                    "base_checksum": base["checksum"],
+                    "target_checksum": target["checksum"],
+                    "status": status,
+                    "base_name": base["name"],
+                    "target_name": target["name"],
+                    "base_kind": base["kind"],
+                    "target_kind": target["kind"],
+                }
+            )
+
+            if include_diff and status == "diff":
+                diff_lines = list(
+                    difflib.unified_diff(
+                        base["content"].splitlines(),
+                        target["content"].splitlines(),
+                        fromfile=f"{base_run_id[:8]}:{role}",
+                        tofile=f"{target_run_id[:8]}:{role}",
+                        lineterm="",
+                    )
+                )
+                truncated = len(diff_lines) > max_lines
+                diffs.append(
+                    {
+                        "role": role,
+                        "lines": diff_lines[:max_lines],
+                        "truncated": truncated,
+                    }
+                )
+
+        return {
+            "base_run_id": base_run_id,
+            "target_run_id": target_run_id,
+            "summary": summary,
+            "diffs": diffs,
+        }
+
+    def _prompt_bundle_role_map(self, bundle: PromptSetBundle) -> dict[str, dict[str, str]]:
+        prompt_map = {prompt.prompt_id: prompt for prompt in bundle.prompts}
+        roles: dict[str, dict[str, str]] = {}
+        for item in bundle.items:
+            prompt = prompt_map.get(item.prompt_id)
+            if not prompt:
+                continue
+            roles[item.role] = {
+                "checksum": prompt.checksum,
+                "content": prompt.content,
+                "name": prompt.name,
+                "kind": prompt.kind,
+            }
+        return roles
+
+    def build_debug_report(self, run_id: str) -> DebugReport:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        if not hasattr(self._storage, "list_stage_events"):
+            raise RuntimeError("Stage storage not configured")
+
+        service = DebugReportService()
+        stage_storage = cast(StageStoragePort, self._storage)
+        return service.build_report(
+            run_id,
+            storage=self._storage,
+            stage_storage=stage_storage,
+        )
+
     def delete_run(self, run_id: str) -> bool:
         """평가 삭제.

@@ -1138,6 +1302,8 @@ class WebUIAdapter:
             raise RuntimeError("Evaluator not configured")
         if self._llm_adapter is None:
             raise RuntimeError("LLM adapter not configured. .env에 OPENAI_API_KEY를 설정하세요.")
+        evaluator = self._evaluator
+        llm_adapter = self._llm_adapter

         # 진행률 초기화
         if on_progress:
@@ -1156,10 +1322,10 @@ class WebUIAdapter:
         logger.info(f"Starting evaluation ({mode}) with metrics: {metrics}")

         async def run_async_evaluation():
-            return await
+            return await evaluator.evaluate(
                 dataset=dataset,
                 metrics=metrics,
-                llm=
+                llm=llm_adapter,
                 thresholds=thresholds or {},
                 parallel=parallel,
                 batch_size=batch_size,
@@ -1356,6 +1522,7 @@ class WebUIAdapter:
         metrics_to_analyze: list[str] | None = None,
         thresholds: dict[str, float] | None = None,
         model_id: str | None = None,
+        language: str | None = None,
     ):
         """LLM 기반 지능형 보고서 생성.

@@ -1395,6 +1562,7 @@ class WebUIAdapter:
             llm_adapter=llm_adapter,
             include_research_insights=True,
             include_action_items=True,
+            language=language or "ko",
         )

         # 동기 방식으로 보고서 생성
@@ -1462,7 +1630,7 @@ class WebUIAdapter:

         return str(file_path.absolute())

-    def list_models(self, provider: str | None = None) -> list[dict[str, str]]:
+    def list_models(self, provider: str | None = None) -> list[dict[str, str | bool]]:
         """사용 가능한 모델 목록 조회."""
         settings = self._settings or Settings()
         provider_key = provider.lower() if provider else None
@@ -1476,7 +1644,7 @@ class WebUIAdapter:
         if provider_key:
             return self._list_other_models(provider_key)

-        models: list[dict[str, str]] = []
+        models: list[dict[str, str | bool]] = []
         models.extend(self._list_ollama_models(settings))
         models.extend(self._list_openai_models())
         models.extend(self._list_vllm_models(settings))
@@ -1571,7 +1739,7 @@ class WebUIAdapter:
         lowered = model_name.lower()
         return any(lowered == entry or lowered.startswith(f"{entry}:") for entry in allowlist)

-    def _list_other_models(self, provider: str | None = None) -> list[dict[str, str]]:
+    def _list_other_models(self, provider: str | None = None) -> list[dict[str, str | bool]]:
         if provider and provider not in {"anthropic", "azure"}:
             return []
         return [
@@ -1612,7 +1780,8 @@ def create_adapter() -> WebUIAdapter:

     설정에 따라 적절한 저장소와 서비스를 주입합니다.
     """
-    from evalvault.adapters.outbound.llm import get_llm_adapter
+    from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+    from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
     from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
     from evalvault.config.settings import get_settings
     from evalvault.domain.services.evaluator import RagasEvaluator
@@ -1633,7 +1802,9 @@ def create_adapter() -> WebUIAdapter:
         logger.warning(f"LLM adapter initialization failed: {e}")

     # Evaluator 생성
-
+    llm_factory = SettingsLLMFactory(settings)
+    korean_toolkit = try_create_korean_toolkit()
+    evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)

     return WebUIAdapter(
         storage=storage,
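
The largest addition above is `compare_prompt_sets`, which pairs prompts by role across two runs, compares checksums, and only runs `difflib.unified_diff` for roles whose content actually changed. The standalone sketch below reproduces just that diff-and-truncate step for a single role so the shape of a resulting `diffs` entry is easy to see; the run IDs and prompt contents are invented for illustration and are not part of the package.

import difflib

# Hypothetical inputs: one prompt role ("faithfulness") as stored for two runs.
base_run_id = "run-aaaa0000"
target_run_id = "run-bbbb1111"
base_content = "Judge whether the answer is faithful.\nReturn a score between 0 and 1."
target_content = "Judge whether the answer is faithful to the retrieved context.\nReturn a score between 0 and 1."

max_lines = 40
diff_lines = list(
    difflib.unified_diff(
        base_content.splitlines(),
        target_content.splitlines(),
        fromfile=f"{base_run_id[:8]}:faithfulness",
        tofile=f"{target_run_id[:8]}:faithfulness",
        lineterm="",
    )
)

# Same truncation rule as the adapter: cap the line count and flag the cut.
entry = {
    "role": "faithfulness",
    "lines": diff_lines[:max_lines],
    "truncated": len(diff_lines) > max_lines,
}
for line in entry["lines"]:
    print(line)

Because the checksum comparison gates the diffing, unchanged roles cost almost nothing, which keeps the new endpoint cheap even when a run carries many prompts.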

evalvault/adapters/inbound/api/routers/runs.py

@@ -10,7 +10,7 @@ from pathlib import Path
 from typing import Any, Literal

 from fastapi import APIRouter, File, HTTPException, Query, UploadFile
-from fastapi.responses import Response, StreamingResponse
+from fastapi.responses import PlainTextResponse, Response, StreamingResponse
 from pydantic import BaseModel

 from evalvault.adapters.inbound.api.main import AdapterDep
@@ -19,6 +19,7 @@ from evalvault.adapters.outbound.dataset.templates import (
     render_dataset_template_json,
     render_dataset_template_xlsx,
 )
+from evalvault.adapters.outbound.debug.report_renderer import render_markdown
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (
@@ -81,6 +82,30 @@ class QualityGateReportResponse(BaseModel):
     regression_amount: float | None = None


+class PromptDiffSummaryItem(BaseModel):
+    role: str
+    base_checksum: str | None = None
+    target_checksum: str | None = None
+    status: Literal["same", "diff", "missing"]
+    base_name: str | None = None
+    target_name: str | None = None
+    base_kind: str | None = None
+    target_kind: str | None = None
+
+
+class PromptDiffEntry(BaseModel):
+    role: str
+    lines: list[str]
+    truncated: bool
+
+
+class PromptDiffResponse(BaseModel):
+    base_run_id: str
+    target_run_id: str
+    summary: list[PromptDiffSummaryItem]
+    diffs: list[PromptDiffEntry]
+
+
 class StartEvaluationRequest(BaseModel):
     dataset_path: str
     metrics: list[str]
@@ -1067,6 +1092,27 @@ def list_stage_metrics(
         raise HTTPException(status_code=500, detail=str(e))


+@router.get("/prompt-diff", response_model=PromptDiffResponse)
+def prompt_diff(
+    adapter: AdapterDep,
+    base_run_id: str = Query(..., description="Base run id"),
+    target_run_id: str = Query(..., description="Target run id"),
+    max_lines: int = Query(40, ge=1, le=200),
+    include_diff: bool = Query(True),
+):
+    try:
+        return adapter.compare_prompt_sets(
+            base_run_id,
+            target_run_id,
+            max_lines=max_lines,
+            include_diff=include_diff,
+        )
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Prompt set not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/quality-gate", response_model=QualityGateReportResponse)
 def check_quality_gate(run_id: str, adapter: AdapterDep):
     """Check quality gate status for a run."""
@@ -1079,6 +1125,23 @@ def check_quality_gate(run_id: str, adapter: AdapterDep):
         raise HTTPException(status_code=500, detail=str(e))


+@router.get("/{run_id}/debug-report", response_model=None)
+def get_debug_report(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["json", "markdown"] = Query("json", description="Report format"),
+):
+    try:
+        report = adapter.build_debug_report(run_id)
+        if format == "markdown":
+            return PlainTextResponse(render_markdown(report))
+        return report.to_dict()
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/improvement")
 def get_improvement_guide(
     run_id: str,
@@ -1101,10 +1164,11 @@ def generate_llm_report(
     run_id: str,
     adapter: AdapterDep,
     model_id: str | None = None,
+    language: str | None = Query(None, description="Report language (ko/en)"),
 ):
     """Generate LLM-based detailed report."""
     try:
-        report = adapter.generate_llm_report(run_id, model_id=model_id)
+        report = adapter.generate_llm_report(run_id, model_id=model_id, language=language)
         return report
     except KeyError:
         raise HTTPException(status_code=404, detail="Run not found")
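
The router changes add two read-only endpoints, `GET /prompt-diff` and `GET /{run_id}/debug-report`, plus a `language` query parameter on the LLM report route. A minimal client sketch follows; the host, port, and the prefix under which this router is mounted are not visible in this diff, so the base URL and run IDs here are assumptions to adjust for a real deployment.

import json
from urllib.parse import urlencode
from urllib.request import urlopen

# Assumed base URL for the runs router; the actual mount prefix is not shown in this diff.
BASE = "http://localhost:8000/api/runs"

# Compare the prompt sets recorded for two runs (backed by compare_prompt_sets above).
params = urlencode(
    {
        "base_run_id": "run-aaaa0000",    # hypothetical run IDs
        "target_run_id": "run-bbbb1111",
        "max_lines": 40,
        "include_diff": "true",
    }
)
with urlopen(f"{BASE}/prompt-diff?{params}") as resp:
    payload = json.load(resp)
for item in payload["summary"]:
    print(item["role"], item["status"])

# Fetch a run's debug report as markdown instead of the default JSON.
with urlopen(f"{BASE}/run-aaaa0000/debug-report?format=markdown") as resp:
    print(resp.read().decode("utf-8"))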

evalvault/adapters/inbound/cli/commands/method.py

@@ -15,8 +15,9 @@ from rich.console import Console
 from rich.table import Table

 from evalvault.adapters.outbound.dataset.method_input_loader import MethodInputDatasetLoader
-from evalvault.adapters.outbound.llm import get_llm_adapter
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
 from evalvault.adapters.outbound.methods import ExternalCommandMethod, MethodRegistry
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities import Dataset
 from evalvault.domain.entities.method import MethodOutput
@@ -376,7 +377,9 @@ def create_method_app(console: Console) -> typer.Typer:
             raise typer.Exit(1)

         llm_adapter = get_llm_adapter(settings)
-
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
         resolved_thresholds = _resolve_thresholds(metric_list, method_result.dataset)

         with progress_spinner(console, "🤖 Evaluation in progress") as update_progress:
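
Both the web adapter and the CLI now build the evaluator the same way: an LLM factory derived from settings plus an optional Korean NLP toolkit. The sketch below restates that wiring in one place for anyone embedding the evaluator directly; everything beyond the calls visible in the hunks above (how `Settings` is populated, what `try_create_korean_toolkit` returns when its dependencies are missing) is an assumption rather than something this diff confirms.

# Evaluator wiring repeated across entry points in this release.
from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
from evalvault.config.settings import Settings
from evalvault.domain.services.evaluator import RagasEvaluator

settings = Settings()                         # assumes env-based defaults are sufficient
llm_adapter = get_llm_adapter(settings)       # default adapter, unchanged from 1.62.1
llm_factory = SettingsLLMFactory(settings)    # new in 1.63.1: settings-backed LLM factory
korean_toolkit = try_create_korean_toolkit()  # new: optional toolkit; "try_" suggests it may be None
evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)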
|