evalvault 1.61.0__py3-none-any.whl → 1.62.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +29 -0
- evalvault/adapters/inbound/api/routers/runs.py +129 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/calibrate.py +111 -0
- evalvault/adapters/outbound/analysis/nlp_adapter.py +46 -2
- evalvault/adapters/outbound/analysis/nlp_analyzer_module.py +1 -1
- evalvault/adapters/outbound/storage/base_sql.py +91 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +22 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +14 -0
- evalvault/adapters/outbound/storage/schema.sql +15 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +25 -0
- evalvault/domain/entities/__init__.py +12 -0
- evalvault/domain/entities/feedback.py +58 -0
- evalvault/domain/services/satisfaction_calibration_service.py +328 -0
- evalvault/ports/outbound/storage_port.py +10 -0
- {evalvault-1.61.0.dist-info → evalvault-1.62.0.dist-info}/METADATA +3 -1
- {evalvault-1.61.0.dist-info → evalvault-1.62.0.dist-info}/RECORD +20 -17
- {evalvault-1.61.0.dist-info → evalvault-1.62.0.dist-info}/WHEEL +0 -0
- {evalvault-1.61.0.dist-info → evalvault-1.62.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.61.0.dist-info → evalvault-1.62.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -14,6 +14,11 @@ from urllib.request import urlopen
|
|
|
14
14
|
|
|
15
15
|
from evalvault.config.phoenix_support import PhoenixExperimentResolver
|
|
16
16
|
from evalvault.config.settings import Settings
|
|
17
|
+
from evalvault.domain.entities import (
|
|
18
|
+
CalibrationResult,
|
|
19
|
+
FeedbackSummary,
|
|
20
|
+
SatisfactionFeedback,
|
|
21
|
+
)
|
|
17
22
|
from evalvault.domain.entities.prompt import PromptSetBundle
|
|
18
23
|
from evalvault.domain.metrics.registry import (
|
|
19
24
|
get_metric_descriptions as registry_metric_descriptions,
|
|
@@ -29,6 +34,9 @@ from evalvault.domain.services.prompt_registry import (
|
|
|
29
34
|
build_prompt_summary,
|
|
30
35
|
)
|
|
31
36
|
from evalvault.domain.services.prompt_status import extract_prompt_entries
|
|
37
|
+
from evalvault.domain.services.satisfaction_calibration_service import (
|
|
38
|
+
SatisfactionCalibrationService,
|
|
39
|
+
)
|
|
32
40
|
from evalvault.domain.services.stage_event_builder import StageEventBuilder
|
|
33
41
|
from evalvault.domain.services.stage_metric_service import StageMetricService
|
|
34
42
|
from evalvault.domain.services.threshold_profiles import apply_threshold_profile
|
|
@@ -893,6 +901,27 @@ class WebUIAdapter:
|
|
|
893
901
|
raise RuntimeError("Storage not configured")
|
|
894
902
|
return self._storage.delete_run_cluster_map(run_id, map_id)
|
|
895
903
|
|
|
904
|
+
def save_feedback(self, feedback: SatisfactionFeedback) -> str:
|
|
905
|
+
if self._storage is None or not hasattr(self._storage, "save_feedback"):
|
|
906
|
+
raise RuntimeError("Storage not configured")
|
|
907
|
+
return self._storage.save_feedback(feedback)
|
|
908
|
+
|
|
909
|
+
def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
|
|
910
|
+
if self._storage is None or not hasattr(self._storage, "list_feedback"):
|
|
911
|
+
raise RuntimeError("Storage not configured")
|
|
912
|
+
return self._storage.list_feedback(run_id)
|
|
913
|
+
|
|
914
|
+
def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
|
|
915
|
+
if self._storage is None or not hasattr(self._storage, "get_feedback_summary"):
|
|
916
|
+
raise RuntimeError("Storage not configured")
|
|
917
|
+
return self._storage.get_feedback_summary(run_id)
|
|
918
|
+
|
|
919
|
+
def build_calibration(self, run_id: str, *, model: str = "both") -> CalibrationResult:
|
|
920
|
+
run = self.get_run_details(run_id)
|
|
921
|
+
feedbacks = self.list_feedback(run_id)
|
|
922
|
+
service = SatisfactionCalibrationService()
|
|
923
|
+
return service.build_calibration(run, feedbacks, model=model)
|
|
924
|
+
|
|
896
925
|
def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
|
|
897
926
|
"""Stage 이벤트 목록 조회."""
|
|
898
927
|
if self._storage is None or not hasattr(self._storage, "list_stage_events"):
|
|
@@ -21,7 +21,11 @@ from evalvault.adapters.outbound.dataset.templates import (
|
|
|
21
21
|
)
|
|
22
22
|
from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
|
|
23
23
|
from evalvault.config.settings import get_settings
|
|
24
|
-
from evalvault.domain.entities import
|
|
24
|
+
from evalvault.domain.entities import (
|
|
25
|
+
CalibrationResult,
|
|
26
|
+
EvaluationRun,
|
|
27
|
+
SatisfactionFeedback,
|
|
28
|
+
)
|
|
25
29
|
from evalvault.domain.services.domain_learning_hook import DomainLearningHook
|
|
26
30
|
from evalvault.domain.services.ragas_prompt_overrides import (
|
|
27
31
|
PromptOverrideError,
|
|
@@ -178,6 +182,31 @@ class ClusterMapDeleteResponse(BaseModel):
|
|
|
178
182
|
deleted_count: int
|
|
179
183
|
|
|
180
184
|
|
|
185
|
+
class FeedbackSaveRequest(BaseModel):
|
|
186
|
+
test_case_id: str
|
|
187
|
+
satisfaction_score: float | None = None
|
|
188
|
+
thumb_feedback: Literal["up", "down", "none"] | None = None
|
|
189
|
+
comment: str | None = None
|
|
190
|
+
rater_id: str | None = None
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class FeedbackResponse(BaseModel):
|
|
194
|
+
feedback_id: str
|
|
195
|
+
run_id: str
|
|
196
|
+
test_case_id: str
|
|
197
|
+
satisfaction_score: float | None = None
|
|
198
|
+
thumb_feedback: str | None = None
|
|
199
|
+
comment: str | None = None
|
|
200
|
+
rater_id: str | None = None
|
|
201
|
+
created_at: str | None = None
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class FeedbackSummaryResponse(BaseModel):
|
|
205
|
+
avg_satisfaction_score: float | None = None
|
|
206
|
+
thumb_up_rate: float | None = None
|
|
207
|
+
total_feedback: int
|
|
208
|
+
|
|
209
|
+
|
|
181
210
|
class VisualSpaceRequest(BaseModel):
|
|
182
211
|
granularity: Literal["run", "case", "cluster"] = "case"
|
|
183
212
|
base_run_id: str | None = None
|
|
@@ -188,9 +217,22 @@ class VisualSpaceRequest(BaseModel):
|
|
|
188
217
|
cluster_map: dict[str, str] | None = None
|
|
189
218
|
|
|
190
219
|
|
|
191
|
-
def _serialize_run_details(
|
|
220
|
+
def _serialize_run_details(
|
|
221
|
+
run: EvaluationRun,
|
|
222
|
+
*,
|
|
223
|
+
calibration: CalibrationResult | None = None,
|
|
224
|
+
) -> dict[str, Any]:
|
|
225
|
+
summary = run.to_summary_dict()
|
|
226
|
+
if calibration is not None:
|
|
227
|
+
summary.update(
|
|
228
|
+
{
|
|
229
|
+
"avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
|
|
230
|
+
"thumb_up_rate": calibration.summary.thumb_up_rate,
|
|
231
|
+
"imputed_ratio": calibration.summary.imputed_ratio,
|
|
232
|
+
}
|
|
233
|
+
)
|
|
192
234
|
payload = {
|
|
193
|
-
"summary":
|
|
235
|
+
"summary": summary,
|
|
194
236
|
"results": [
|
|
195
237
|
{
|
|
196
238
|
"test_case_id": result.test_case_id,
|
|
@@ -207,6 +249,21 @@ def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
|
|
|
207
249
|
}
|
|
208
250
|
for metric in result.metrics
|
|
209
251
|
],
|
|
252
|
+
"calibrated_satisfaction": (
|
|
253
|
+
calibration.cases[result.test_case_id].calibrated_satisfaction
|
|
254
|
+
if calibration and result.test_case_id in calibration.cases
|
|
255
|
+
else None
|
|
256
|
+
),
|
|
257
|
+
"imputed": (
|
|
258
|
+
calibration.cases[result.test_case_id].imputed
|
|
259
|
+
if calibration and result.test_case_id in calibration.cases
|
|
260
|
+
else False
|
|
261
|
+
),
|
|
262
|
+
"imputation_source": (
|
|
263
|
+
calibration.cases[result.test_case_id].imputation_source
|
|
264
|
+
if calibration and result.test_case_id in calibration.cases
|
|
265
|
+
else None
|
|
266
|
+
),
|
|
210
267
|
}
|
|
211
268
|
for result in run.results
|
|
212
269
|
],
|
|
@@ -719,9 +776,12 @@ def compare_runs(
|
|
|
719
776
|
}
|
|
720
777
|
)
|
|
721
778
|
|
|
779
|
+
base_calibration = adapter.build_calibration(base_id)
|
|
780
|
+
target_calibration = adapter.build_calibration(target_id)
|
|
781
|
+
|
|
722
782
|
return {
|
|
723
|
-
"base": _serialize_run_details(base_run),
|
|
724
|
-
"target": _serialize_run_details(target_run),
|
|
783
|
+
"base": _serialize_run_details(base_run, calibration=base_calibration),
|
|
784
|
+
"target": _serialize_run_details(target_run, calibration=target_calibration),
|
|
725
785
|
"metric_deltas": metric_deltas,
|
|
726
786
|
"case_counts": _build_case_counts(base_run, target_run),
|
|
727
787
|
"pass_rate_delta": target_run.pass_rate - base_run.pass_rate,
|
|
@@ -898,7 +958,70 @@ def get_run_details(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
|
|
|
898
958
|
"""Get detailed information for a specific run."""
|
|
899
959
|
try:
|
|
900
960
|
run: EvaluationRun = adapter.get_run_details(run_id)
|
|
901
|
-
|
|
961
|
+
calibration = adapter.build_calibration(run_id)
|
|
962
|
+
return _serialize_run_details(run, calibration=calibration)
|
|
963
|
+
except KeyError:
|
|
964
|
+
raise HTTPException(status_code=404, detail="Run not found")
|
|
965
|
+
except Exception as e:
|
|
966
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
@router.post("/{run_id}/feedback", response_model=FeedbackResponse)
|
|
970
|
+
def save_feedback(
|
|
971
|
+
run_id: str,
|
|
972
|
+
request: FeedbackSaveRequest,
|
|
973
|
+
adapter: AdapterDep,
|
|
974
|
+
) -> dict[str, Any]:
|
|
975
|
+
try:
|
|
976
|
+
adapter.get_run_details(run_id)
|
|
977
|
+
thumb_feedback = request.thumb_feedback
|
|
978
|
+
if thumb_feedback == "none":
|
|
979
|
+
thumb_feedback = None
|
|
980
|
+
satisfaction_score = request.satisfaction_score
|
|
981
|
+
if satisfaction_score is not None:
|
|
982
|
+
satisfaction_score = max(1.0, min(5.0, satisfaction_score))
|
|
983
|
+
feedback = SatisfactionFeedback(
|
|
984
|
+
feedback_id="",
|
|
985
|
+
run_id=run_id,
|
|
986
|
+
test_case_id=request.test_case_id,
|
|
987
|
+
satisfaction_score=satisfaction_score,
|
|
988
|
+
thumb_feedback=thumb_feedback,
|
|
989
|
+
comment=request.comment,
|
|
990
|
+
rater_id=request.rater_id,
|
|
991
|
+
created_at=datetime.now(),
|
|
992
|
+
)
|
|
993
|
+
feedback_id = adapter.save_feedback(feedback)
|
|
994
|
+
saved = feedback.to_dict()
|
|
995
|
+
saved["feedback_id"] = feedback_id
|
|
996
|
+
return saved
|
|
997
|
+
except KeyError:
|
|
998
|
+
raise HTTPException(status_code=404, detail="Run not found")
|
|
999
|
+
except Exception as e:
|
|
1000
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
1001
|
+
|
|
1002
|
+
|
|
1003
|
+
@router.get("/{run_id}/feedback", response_model=list[FeedbackResponse])
|
|
1004
|
+
def list_feedback(run_id: str, adapter: AdapterDep) -> list[dict[str, Any]]:
|
|
1005
|
+
try:
|
|
1006
|
+
adapter.get_run_details(run_id)
|
|
1007
|
+
feedbacks = adapter.list_feedback(run_id)
|
|
1008
|
+
return [feedback.to_dict() for feedback in feedbacks]
|
|
1009
|
+
except KeyError:
|
|
1010
|
+
raise HTTPException(status_code=404, detail="Run not found")
|
|
1011
|
+
except Exception as e:
|
|
1012
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
1013
|
+
|
|
1014
|
+
|
|
1015
|
+
@router.get("/{run_id}/feedback/summary", response_model=FeedbackSummaryResponse)
|
|
1016
|
+
def get_feedback_summary(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
|
|
1017
|
+
try:
|
|
1018
|
+
adapter.get_run_details(run_id)
|
|
1019
|
+
summary = adapter.get_feedback_summary(run_id)
|
|
1020
|
+
return {
|
|
1021
|
+
"avg_satisfaction_score": summary.avg_satisfaction_score,
|
|
1022
|
+
"thumb_up_rate": summary.thumb_up_rate,
|
|
1023
|
+
"total_feedback": summary.total_feedback,
|
|
1024
|
+
}
|
|
902
1025
|
except KeyError:
|
|
903
1026
|
raise HTTPException(status_code=404, detail="Run not found")
|
|
904
1027
|
except Exception as e:
|
|
@@ -13,6 +13,7 @@ from .agent import register_agent_commands
|
|
|
13
13
|
from .analyze import register_analyze_commands
|
|
14
14
|
from .api import register_api_command
|
|
15
15
|
from .benchmark import create_benchmark_app
|
|
16
|
+
from .calibrate import register_calibrate_commands
|
|
16
17
|
from .config import register_config_commands
|
|
17
18
|
from .debug import create_debug_app
|
|
18
19
|
from .domain import create_domain_app
|
|
@@ -61,6 +62,7 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
|
|
|
61
62
|
CommandModule(register_pipeline_commands),
|
|
62
63
|
CommandModule(register_history_commands),
|
|
63
64
|
CommandModule(register_analyze_commands),
|
|
65
|
+
CommandModule(register_calibrate_commands),
|
|
64
66
|
CommandModule(register_generate_commands),
|
|
65
67
|
CommandModule(register_gate_commands),
|
|
66
68
|
CommandModule(register_agent_commands),
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
|
|
11
|
+
from evalvault.config.settings import Settings
|
|
12
|
+
from evalvault.domain.services.satisfaction_calibration_service import (
|
|
13
|
+
SatisfactionCalibrationService,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from ..utils.options import db_option
|
|
17
|
+
|
|
18
|
+
_console = Console()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def register_calibrate_commands(app: typer.Typer, console: Console) -> None:
|
|
22
|
+
global _console
|
|
23
|
+
_console = console
|
|
24
|
+
|
|
25
|
+
@app.command()
|
|
26
|
+
def calibrate(
|
|
27
|
+
run_id: str = typer.Argument(..., help="보정 대상 Run ID"),
|
|
28
|
+
model: str = typer.Option(
|
|
29
|
+
"both", "--model", help="모델 선택 (linear|xgb|both)", show_default=True
|
|
30
|
+
),
|
|
31
|
+
write_back: bool = typer.Option(
|
|
32
|
+
False,
|
|
33
|
+
"--write-back",
|
|
34
|
+
help="보정 결과를 메타데이터에 저장",
|
|
35
|
+
show_default=True,
|
|
36
|
+
),
|
|
37
|
+
db_path: Path | None = db_option(help_text="DB 경로"),
|
|
38
|
+
) -> None:
|
|
39
|
+
resolved_db_path = db_path or Settings().evalvault_db_path
|
|
40
|
+
if resolved_db_path is None:
|
|
41
|
+
_console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
|
|
42
|
+
raise typer.Exit(1)
|
|
43
|
+
|
|
44
|
+
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
45
|
+
try:
|
|
46
|
+
run = storage.get_run(run_id)
|
|
47
|
+
except KeyError:
|
|
48
|
+
_console.print("[red]오류: Run을 찾을 수 없습니다.[/red]")
|
|
49
|
+
raise typer.Exit(1)
|
|
50
|
+
|
|
51
|
+
normalized_model = model.lower()
|
|
52
|
+
if normalized_model not in {"linear", "xgb", "both"}:
|
|
53
|
+
_console.print("[red]오류: model은 linear|xgb|both 중 하나여야 합니다.[/red]")
|
|
54
|
+
raise typer.Exit(1)
|
|
55
|
+
|
|
56
|
+
feedbacks = storage.list_feedback(run_id)
|
|
57
|
+
service = SatisfactionCalibrationService()
|
|
58
|
+
calibration = service.build_calibration(run, feedbacks, model=normalized_model)
|
|
59
|
+
|
|
60
|
+
table = Table(title="보정 모델 성능 요약")
|
|
61
|
+
table.add_column("모델")
|
|
62
|
+
table.add_column("MAE", justify="right")
|
|
63
|
+
table.add_column("Pearson", justify="right")
|
|
64
|
+
table.add_column("Spearman", justify="right")
|
|
65
|
+
|
|
66
|
+
if calibration.summary.model_metrics:
|
|
67
|
+
for model_name, metrics in calibration.summary.model_metrics.items():
|
|
68
|
+
table.add_row(
|
|
69
|
+
model_name,
|
|
70
|
+
_format_metric(metrics.get("mae")),
|
|
71
|
+
_format_metric(metrics.get("pearson")),
|
|
72
|
+
_format_metric(metrics.get("spearman")),
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
table.add_row("N/A", "-", "-", "-")
|
|
76
|
+
|
|
77
|
+
_console.print(table)
|
|
78
|
+
_console.print(
|
|
79
|
+
f"평균 만족도: {calibration.summary.avg_satisfaction_score} | "
|
|
80
|
+
f"Thumb Up 비율: {calibration.summary.thumb_up_rate} | "
|
|
81
|
+
f"보정 비율: {calibration.summary.imputed_ratio}"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
if write_back:
|
|
85
|
+
metadata = run.tracker_metadata or {}
|
|
86
|
+
metadata["calibration"] = {
|
|
87
|
+
"updated_at": datetime.now().isoformat(),
|
|
88
|
+
"model": model,
|
|
89
|
+
"summary": {
|
|
90
|
+
"avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
|
|
91
|
+
"thumb_up_rate": calibration.summary.thumb_up_rate,
|
|
92
|
+
"imputed_ratio": calibration.summary.imputed_ratio,
|
|
93
|
+
"model_metrics": calibration.summary.model_metrics,
|
|
94
|
+
},
|
|
95
|
+
"cases": {
|
|
96
|
+
case_id: {
|
|
97
|
+
"calibrated_satisfaction": case.calibrated_satisfaction,
|
|
98
|
+
"imputed": case.imputed,
|
|
99
|
+
"imputation_source": case.imputation_source,
|
|
100
|
+
}
|
|
101
|
+
for case_id, case in calibration.cases.items()
|
|
102
|
+
},
|
|
103
|
+
}
|
|
104
|
+
storage.update_run_metadata(run_id, metadata)
|
|
105
|
+
_console.print("[green]보정 결과를 메타데이터에 저장했습니다.[/green]")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _format_metric(value: float | None) -> str:
|
|
109
|
+
if value is None:
|
|
110
|
+
return "-"
|
|
111
|
+
return f"{value:.3f}"
|
|
@@ -727,8 +727,52 @@ class NLPAnalysisAdapter(BaseAnalysisAdapter):
|
|
|
727
727
|
if values:
|
|
728
728
|
avg_scores[metric_name] = sum(values) / len(values)
|
|
729
729
|
|
|
730
|
-
|
|
731
|
-
|
|
730
|
+
representative_questions: list[str] = []
|
|
731
|
+
try:
|
|
732
|
+
cluster_idx_list = cluster_indices[cluster_id]
|
|
733
|
+
cluster_vectors = embedding_array[cluster_idx_list]
|
|
734
|
+
centroid = cluster_vectors.mean(axis=0)
|
|
735
|
+
distances = np.linalg.norm(cluster_vectors - centroid, axis=1)
|
|
736
|
+
sorted_pairs = sorted(
|
|
737
|
+
zip(cluster_idx_list, distances, strict=True), key=lambda x: x[1]
|
|
738
|
+
)
|
|
739
|
+
|
|
740
|
+
center_indices = [idx for idx, _dist in sorted_pairs[:2]]
|
|
741
|
+
edge_far = sorted_pairs[-1][0] if sorted_pairs else None
|
|
742
|
+
|
|
743
|
+
worst_idx = None
|
|
744
|
+
worst_score = None
|
|
745
|
+
for idx in cluster_idx_list:
|
|
746
|
+
q = questions[idx]
|
|
747
|
+
result = question_to_result.get(q)
|
|
748
|
+
if not result or not result.metrics:
|
|
749
|
+
continue
|
|
750
|
+
avg_score = sum(m.score for m in result.metrics) / len(result.metrics)
|
|
751
|
+
if worst_score is None or avg_score < worst_score:
|
|
752
|
+
worst_score = avg_score
|
|
753
|
+
worst_idx = idx
|
|
754
|
+
|
|
755
|
+
edge_needed = worst_idx
|
|
756
|
+
if edge_needed is None and len(sorted_pairs) > 1:
|
|
757
|
+
edge_needed = sorted_pairs[-2][0]
|
|
758
|
+
|
|
759
|
+
candidate_indices: list[int] = []
|
|
760
|
+
candidate_indices.extend(center_indices)
|
|
761
|
+
if edge_far is not None:
|
|
762
|
+
candidate_indices.append(edge_far)
|
|
763
|
+
if edge_needed is not None:
|
|
764
|
+
candidate_indices.append(edge_needed)
|
|
765
|
+
|
|
766
|
+
seen: set[int] = set()
|
|
767
|
+
for idx in candidate_indices:
|
|
768
|
+
if idx in seen:
|
|
769
|
+
continue
|
|
770
|
+
seen.add(idx)
|
|
771
|
+
representative_questions.append(questions[idx])
|
|
772
|
+
if len(representative_questions) >= 4:
|
|
773
|
+
break
|
|
774
|
+
except Exception:
|
|
775
|
+
representative_questions = cluster_qs[:4]
|
|
732
776
|
|
|
733
777
|
clusters.append(
|
|
734
778
|
TopicCluster(
|
|
@@ -216,7 +216,7 @@ class NLPAnalyzerModule(BaseAnalysisModule):
|
|
|
216
216
|
"keywords": list(cluster.keywords),
|
|
217
217
|
"document_count": cluster.document_count,
|
|
218
218
|
"avg_scores": cluster.avg_scores,
|
|
219
|
-
"representative_questions": cluster.representative_questions[:
|
|
219
|
+
"representative_questions": cluster.representative_questions[:4],
|
|
220
220
|
}
|
|
221
221
|
)
|
|
222
222
|
return serialized
|
|
@@ -11,9 +11,11 @@ from typing import Any
|
|
|
11
11
|
|
|
12
12
|
from evalvault.domain.entities import (
|
|
13
13
|
EvaluationRun,
|
|
14
|
+
FeedbackSummary,
|
|
14
15
|
MetricScore,
|
|
15
16
|
RunClusterMap,
|
|
16
17
|
RunClusterMapInfo,
|
|
18
|
+
SatisfactionFeedback,
|
|
17
19
|
TestCaseResult,
|
|
18
20
|
)
|
|
19
21
|
|
|
@@ -27,10 +29,12 @@ class SQLQueries:
|
|
|
27
29
|
placeholder: str = "?",
|
|
28
30
|
metric_name_column: str = "metric_name",
|
|
29
31
|
test_case_returning_clause: str = "",
|
|
32
|
+
feedback_returning_clause: str = "",
|
|
30
33
|
) -> None:
|
|
31
34
|
self.placeholder = placeholder
|
|
32
35
|
self.metric_name_column = metric_name_column
|
|
33
36
|
self._test_case_returning = test_case_returning_clause
|
|
37
|
+
self._feedback_returning = feedback_returning_clause
|
|
34
38
|
|
|
35
39
|
def _values(self, count: int) -> str:
|
|
36
40
|
return ", ".join([self.placeholder] * count)
|
|
@@ -75,6 +79,25 @@ class SQLQueries:
|
|
|
75
79
|
) VALUES ({values})
|
|
76
80
|
"""
|
|
77
81
|
|
|
82
|
+
def insert_feedback(self) -> str:
|
|
83
|
+
values = self._values(7)
|
|
84
|
+
query = f"""
|
|
85
|
+
INSERT INTO satisfaction_feedback (
|
|
86
|
+
run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at
|
|
87
|
+
) VALUES ({values})
|
|
88
|
+
"""
|
|
89
|
+
if self._feedback_returning:
|
|
90
|
+
query = f"{query.strip()} {self._feedback_returning}"
|
|
91
|
+
return query
|
|
92
|
+
|
|
93
|
+
def select_feedback_by_run(self) -> str:
|
|
94
|
+
return f"""
|
|
95
|
+
SELECT id, run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at
|
|
96
|
+
FROM satisfaction_feedback
|
|
97
|
+
WHERE run_id = {self.placeholder}
|
|
98
|
+
ORDER BY created_at DESC
|
|
99
|
+
"""
|
|
100
|
+
|
|
78
101
|
def select_run(self) -> str:
|
|
79
102
|
return f"""
|
|
80
103
|
SELECT run_id, dataset_name, dataset_version, model_name,
|
|
@@ -129,6 +152,13 @@ class SQLQueries:
|
|
|
129
152
|
ORDER BY created_at DESC
|
|
130
153
|
"""
|
|
131
154
|
|
|
155
|
+
def update_run_metadata(self) -> str:
|
|
156
|
+
return f"""
|
|
157
|
+
UPDATE evaluation_runs
|
|
158
|
+
SET metadata = {self.placeholder}
|
|
159
|
+
WHERE run_id = {self.placeholder}
|
|
160
|
+
"""
|
|
161
|
+
|
|
132
162
|
def delete_run(self) -> str:
|
|
133
163
|
return f"DELETE FROM evaluation_runs WHERE run_id = {self.placeholder}"
|
|
134
164
|
|
|
@@ -259,6 +289,12 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
259
289
|
conn.commit()
|
|
260
290
|
return deleted
|
|
261
291
|
|
|
292
|
+
def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None:
|
|
293
|
+
payload = self._serialize_json(metadata)
|
|
294
|
+
with self._get_connection() as conn:
|
|
295
|
+
self._execute(conn, self.queries.update_run_metadata(), (payload, run_id))
|
|
296
|
+
conn.commit()
|
|
297
|
+
|
|
262
298
|
def save_run_cluster_map(
|
|
263
299
|
self,
|
|
264
300
|
run_id: str,
|
|
@@ -365,6 +401,45 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
365
401
|
conn.commit()
|
|
366
402
|
return deleted
|
|
367
403
|
|
|
404
|
+
def save_feedback(self, feedback: SatisfactionFeedback) -> str:
|
|
405
|
+
created_at = feedback.created_at or datetime.now()
|
|
406
|
+
with self._get_connection() as conn:
|
|
407
|
+
cursor = self._execute(
|
|
408
|
+
conn,
|
|
409
|
+
self.queries.insert_feedback(),
|
|
410
|
+
(
|
|
411
|
+
feedback.run_id,
|
|
412
|
+
feedback.test_case_id,
|
|
413
|
+
feedback.satisfaction_score,
|
|
414
|
+
feedback.thumb_feedback,
|
|
415
|
+
feedback.comment,
|
|
416
|
+
feedback.rater_id,
|
|
417
|
+
self._serialize_datetime(created_at),
|
|
418
|
+
),
|
|
419
|
+
)
|
|
420
|
+
feedback_id = self._fetch_lastrowid(cursor)
|
|
421
|
+
conn.commit()
|
|
422
|
+
return str(feedback_id)
|
|
423
|
+
|
|
424
|
+
def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
|
|
425
|
+
with self._get_connection() as conn:
|
|
426
|
+
rows = self._execute(conn, self.queries.select_feedback_by_run(), (run_id,)).fetchall()
|
|
427
|
+
return [self._row_to_feedback(row) for row in rows]
|
|
428
|
+
|
|
429
|
+
def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
|
|
430
|
+
feedbacks = self.list_feedback(run_id)
|
|
431
|
+
scores = [f.satisfaction_score for f in feedbacks if f.satisfaction_score is not None]
|
|
432
|
+
thumbs = [f.thumb_feedback for f in feedbacks if f.thumb_feedback in {"up", "down"}]
|
|
433
|
+
avg_score = sum(scores) / len(scores) if scores else None
|
|
434
|
+
thumb_up_rate = None
|
|
435
|
+
if thumbs:
|
|
436
|
+
thumb_up_rate = thumbs.count("up") / len(thumbs)
|
|
437
|
+
return FeedbackSummary(
|
|
438
|
+
avg_satisfaction_score=avg_score,
|
|
439
|
+
thumb_up_rate=thumb_up_rate,
|
|
440
|
+
total_feedback=len(feedbacks),
|
|
441
|
+
)
|
|
442
|
+
|
|
368
443
|
# Serialization helpers --------------------------------------------
|
|
369
444
|
|
|
370
445
|
def _run_params(self, run: EvaluationRun) -> Sequence[Any]:
|
|
@@ -428,6 +503,22 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
428
503
|
ground_truth=row["ground_truth"],
|
|
429
504
|
)
|
|
430
505
|
|
|
506
|
+
def _row_to_feedback(self, row) -> SatisfactionFeedback:
|
|
507
|
+
feedback_id = self._row_value(row, "id")
|
|
508
|
+
run_id = self._row_value(row, "run_id")
|
|
509
|
+
test_case_id = self._row_value(row, "test_case_id")
|
|
510
|
+
created_at = self._deserialize_datetime(self._row_value(row, "created_at"))
|
|
511
|
+
return SatisfactionFeedback(
|
|
512
|
+
feedback_id=str(feedback_id or ""),
|
|
513
|
+
run_id=str(run_id or ""),
|
|
514
|
+
test_case_id=str(test_case_id or ""),
|
|
515
|
+
satisfaction_score=self._maybe_float(self._row_value(row, "satisfaction_score")),
|
|
516
|
+
thumb_feedback=self._row_value(row, "thumb_feedback"),
|
|
517
|
+
comment=self._row_value(row, "comment"),
|
|
518
|
+
rater_id=self._row_value(row, "rater_id"),
|
|
519
|
+
created_at=created_at,
|
|
520
|
+
)
|
|
521
|
+
|
|
431
522
|
def _fetch_metric_scores(self, conn, result_id: int) -> list[MetricScore]:
|
|
432
523
|
rows = self._execute(conn, self.queries.select_metric_scores(), (result_id,)).fetchall()
|
|
433
524
|
metric_column = self.queries.metric_name_column
|
|
@@ -60,6 +60,7 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
60
60
|
placeholder="%s",
|
|
61
61
|
metric_name_column="name",
|
|
62
62
|
test_case_returning_clause="RETURNING id",
|
|
63
|
+
feedback_returning_clause="RETURNING id",
|
|
63
64
|
)
|
|
64
65
|
)
|
|
65
66
|
if connection_string:
|
|
@@ -198,6 +199,27 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
198
199
|
elif cluster_columns and "metadata" not in cluster_columns:
|
|
199
200
|
conn.execute("ALTER TABLE run_cluster_maps ADD COLUMN metadata JSONB")
|
|
200
201
|
|
|
202
|
+
conn.execute(
|
|
203
|
+
"""
|
|
204
|
+
CREATE TABLE IF NOT EXISTS satisfaction_feedback (
|
|
205
|
+
id SERIAL PRIMARY KEY,
|
|
206
|
+
run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
|
|
207
|
+
test_case_id VARCHAR(255) NOT NULL,
|
|
208
|
+
satisfaction_score DECIMAL(4, 2),
|
|
209
|
+
thumb_feedback VARCHAR(10),
|
|
210
|
+
comment TEXT,
|
|
211
|
+
rater_id VARCHAR(255),
|
|
212
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
|
213
|
+
)
|
|
214
|
+
"""
|
|
215
|
+
)
|
|
216
|
+
conn.execute(
|
|
217
|
+
"CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id)"
|
|
218
|
+
)
|
|
219
|
+
conn.execute(
|
|
220
|
+
"CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
|
|
221
|
+
)
|
|
222
|
+
|
|
201
223
|
# Prompt set methods
|
|
202
224
|
|
|
203
225
|
def save_prompt_set(self, bundle: PromptSetBundle) -> None:
|
|
@@ -59,6 +59,20 @@ CREATE TABLE IF NOT EXISTS run_cluster_maps (
|
|
|
59
59
|
CREATE INDEX IF NOT EXISTS idx_cluster_maps_run_id ON run_cluster_maps(run_id);
|
|
60
60
|
CREATE INDEX IF NOT EXISTS idx_cluster_maps_map_id ON run_cluster_maps(map_id);
|
|
61
61
|
|
|
62
|
+
CREATE TABLE IF NOT EXISTS satisfaction_feedback (
|
|
63
|
+
id SERIAL PRIMARY KEY,
|
|
64
|
+
run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
|
|
65
|
+
test_case_id VARCHAR(255) NOT NULL,
|
|
66
|
+
satisfaction_score DECIMAL(4, 2),
|
|
67
|
+
thumb_feedback VARCHAR(10),
|
|
68
|
+
comment TEXT,
|
|
69
|
+
rater_id VARCHAR(255),
|
|
70
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
|
71
|
+
);
|
|
72
|
+
|
|
73
|
+
CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id);
|
|
74
|
+
CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id);
|
|
75
|
+
|
|
62
76
|
-- Metric scores table
|
|
63
77
|
CREATE TABLE IF NOT EXISTS metric_scores (
|
|
64
78
|
id SERIAL PRIMARY KEY,
|
|
@@ -61,6 +61,21 @@ CREATE TABLE IF NOT EXISTS run_cluster_maps (
|
|
|
61
61
|
CREATE INDEX IF NOT EXISTS idx_cluster_maps_run_id ON run_cluster_maps(run_id);
|
|
62
62
|
CREATE INDEX IF NOT EXISTS idx_cluster_maps_map_id ON run_cluster_maps(map_id);
|
|
63
63
|
|
|
64
|
+
CREATE TABLE IF NOT EXISTS satisfaction_feedback (
|
|
65
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
66
|
+
run_id TEXT NOT NULL,
|
|
67
|
+
test_case_id TEXT NOT NULL,
|
|
68
|
+
satisfaction_score REAL,
|
|
69
|
+
thumb_feedback TEXT,
|
|
70
|
+
comment TEXT,
|
|
71
|
+
rater_id TEXT,
|
|
72
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
73
|
+
FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
|
|
74
|
+
);
|
|
75
|
+
|
|
76
|
+
CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id);
|
|
77
|
+
CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id);
|
|
78
|
+
|
|
64
79
|
-- Metric scores table
|
|
65
80
|
CREATE TABLE IF NOT EXISTS metric_scores (
|
|
66
81
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -140,6 +140,31 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
|
|
|
140
140
|
elif cluster_columns and "metadata" not in cluster_columns:
|
|
141
141
|
conn.execute("ALTER TABLE run_cluster_maps ADD COLUMN metadata TEXT")
|
|
142
142
|
|
|
143
|
+
feedback_cursor = conn.execute("PRAGMA table_info(satisfaction_feedback)")
|
|
144
|
+
feedback_columns = {row[1] for row in feedback_cursor.fetchall()}
|
|
145
|
+
if not feedback_columns:
|
|
146
|
+
conn.execute(
|
|
147
|
+
"""
|
|
148
|
+
CREATE TABLE IF NOT EXISTS satisfaction_feedback (
|
|
149
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
150
|
+
run_id TEXT NOT NULL,
|
|
151
|
+
test_case_id TEXT NOT NULL,
|
|
152
|
+
satisfaction_score REAL,
|
|
153
|
+
thumb_feedback TEXT,
|
|
154
|
+
comment TEXT,
|
|
155
|
+
rater_id TEXT,
|
|
156
|
+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
157
|
+
FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
|
|
158
|
+
)
|
|
159
|
+
"""
|
|
160
|
+
)
|
|
161
|
+
conn.execute(
|
|
162
|
+
"CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id)"
|
|
163
|
+
)
|
|
164
|
+
conn.execute(
|
|
165
|
+
"CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
|
|
166
|
+
)
|
|
167
|
+
|
|
143
168
|
pipeline_cursor = conn.execute("PRAGMA table_info(pipeline_results)")
|
|
144
169
|
pipeline_columns = {row[1] for row in pipeline_cursor.fetchall()}
|
|
145
170
|
if pipeline_columns:
|
|
@@ -14,6 +14,13 @@ from evalvault.domain.entities.analysis import (
|
|
|
14
14
|
)
|
|
15
15
|
from evalvault.domain.entities.dataset import Dataset, TestCase
|
|
16
16
|
from evalvault.domain.entities.experiment import Experiment, ExperimentGroup
|
|
17
|
+
from evalvault.domain.entities.feedback import (
|
|
18
|
+
CalibrationCaseResult,
|
|
19
|
+
CalibrationResult,
|
|
20
|
+
CalibrationSummary,
|
|
21
|
+
FeedbackSummary,
|
|
22
|
+
SatisfactionFeedback,
|
|
23
|
+
)
|
|
17
24
|
from evalvault.domain.entities.improvement import (
|
|
18
25
|
EffortLevel,
|
|
19
26
|
EvidenceSource,
|
|
@@ -74,6 +81,11 @@ __all__ = [
|
|
|
74
81
|
# Experiment
|
|
75
82
|
"Experiment",
|
|
76
83
|
"ExperimentGroup",
|
|
84
|
+
"CalibrationCaseResult",
|
|
85
|
+
"CalibrationResult",
|
|
86
|
+
"CalibrationSummary",
|
|
87
|
+
"FeedbackSummary",
|
|
88
|
+
"SatisfactionFeedback",
|
|
77
89
|
# Improvement
|
|
78
90
|
"EffortLevel",
|
|
79
91
|
"EvidenceSource",
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class SatisfactionFeedback:
    """A single piece of human satisfaction feedback for one test case of a run.

    Only the three identifiers are required; every other field is optional
    and defaults to ``None``.
    """

    feedback_id: str
    run_id: str
    test_case_id: str
    satisfaction_score: float | None = None
    thumb_feedback: str | None = None
    comment: str | None = None
    rater_id: str | None = None
    created_at: datetime | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the feedback as a plain dictionary.

        All fields are copied verbatim except ``created_at``, which is
        rendered with ``datetime.isoformat()`` (or ``None`` when unset).
        """
        timestamp = self.created_at
        payload: dict[str, Any] = {
            key: getattr(self, key)
            for key in (
                "feedback_id",
                "run_id",
                "test_case_id",
                "satisfaction_score",
                "thumb_feedback",
                "comment",
                "rater_id",
            )
        }
        payload["created_at"] = timestamp.isoformat() if timestamp else None
        return payload
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class FeedbackSummary:
    """Aggregate statistics over the satisfaction feedback of a run.

    NOTE(review): how the values are computed lives in the storage adapters,
    which are not visible here; the field comments below are inferred from
    the names and should be confirmed against ``get_feedback_summary``.
    """

    # Presumably the mean of the recorded satisfaction scores; None when unavailable.
    avg_satisfaction_score: float | None = None
    # Presumably the share of thumb feedback that is "up"; None when unavailable.
    thumb_up_rate: float | None = None
    # Number of feedback entries counted; defaults to 0.
    total_feedback: int = 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class CalibrationCaseResult:
    """Calibrated satisfaction outcome for a single test case."""

    # Identifier of the test case this result belongs to.
    test_case_id: str
    # Calibrated satisfaction value; None when no value could be produced.
    calibrated_satisfaction: float | None = None
    # Whether the value was imputed rather than taken from an explicit score.
    imputed: bool = False
    # Free-form tag describing where the value came from; None when unset.
    imputation_source: str | None = None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class CalibrationSummary:
    """Run-level summary accompanying a calibration result."""

    # Average satisfaction score; None when no score is available.
    avg_satisfaction_score: float | None = None
    # Thumbs-up rate; None when no thumb feedback is available.
    thumb_up_rate: float | None = None
    # Ratio of imputed cases; None until it has been computed.
    imputed_ratio: float | None = None
    # Per-model metrics: outer key is a model name, inner mapping is
    # metric name -> value (None when the metric could not be computed).
    # A fresh dict is created for every instance.
    model_metrics: dict[str, dict[str, float | None]] = field(default_factory=dict)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class CalibrationResult:
    """Complete calibration output: a run-level summary plus per-case results."""

    # Run-level aggregate values.
    summary: CalibrationSummary
    # Per-case results; presumably keyed by test case id — confirm against the producer.
    # A fresh dict is created for every instance.
    cases: dict[str, CalibrationCaseResult] = field(default_factory=dict)
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from evalvault.domain.entities import (
|
|
10
|
+
CalibrationCaseResult,
|
|
11
|
+
CalibrationResult,
|
|
12
|
+
CalibrationSummary,
|
|
13
|
+
EvaluationRun,
|
|
14
|
+
SatisfactionFeedback,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class CalibrationModelResult:
    """Evaluation metrics of one calibration model.

    NOTE(review): nothing else in this module references this class; confirm
    whether external callers use it before changing or removing it.
    """

    # Name of the model the metrics belong to.
    model_name: str
    # Mean absolute error; None when not available.
    mae: float | None
    # Pearson correlation; None when not available.
    pearson: float | None
    # Spearman correlation; None when not available.
    spearman: float | None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SatisfactionCalibrationService:
    """Estimate a satisfaction score for every test case of an evaluation run.

    Human feedback (an explicit satisfaction score, or a thumb mapped to a
    score) is used as the label wherever it exists. For the remaining test
    cases a regression model trained on the labelled cases predicts the score
    from the run's evaluation metrics and simple lexical features of the
    answer. All produced scores are clipped to the range 1.0-5.0.
    """

    def __init__(self, *, thumb_mapping: dict[str, float] | None = None) -> None:
        """Create the service.

        Args:
            thumb_mapping: Maps a lower-cased thumb value to a satisfaction
                score. Defaults to ``{"up": 4.0, "down": 2.0}``.
        """
        self._thumb_mapping = thumb_mapping or {"up": 4.0, "down": 2.0}

    def build_calibration(
        self,
        run: EvaluationRun,
        feedbacks: list[SatisfactionFeedback],
        *,
        model: str = "both",
    ) -> CalibrationResult:
        """Build calibrated satisfaction scores for all results of ``run``.

        Args:
            run: Evaluation run whose ``results`` are calibrated.
            feedbacks: Feedback entries; only the latest one per test case
                is used.
            model: ``"xgb"`` or ``"both"`` additionally trains an XGBoost
                regressor (preferred when it trains successfully); any other
                value uses linear regression only.

        Returns:
            A ``CalibrationResult``. When there is no feedback at all the
            result has no cases and an ``imputed_ratio`` of ``0.0``.
        """
        feedback_index = self._build_feedback_index(feedbacks)
        if not feedback_index:
            # Nothing to learn from: skip feature extraction and training.
            empty_summary = CalibrationSummary(
                avg_satisfaction_score=None,
                thumb_up_rate=None,
                imputed_ratio=0.0,
            )
            return CalibrationResult(summary=empty_summary, cases={})

        feature_map = self._build_feature_matrix(run)
        labels, label_sources = self._build_labels(run, feedback_index)

        model_metrics: dict[str, dict[str, float | None]] = {}
        model_choice, predictors = self._train_models(
            feature_map,
            labels,
            model=model,
            model_metrics=model_metrics,
        )
        predictor = predictors.get(model_choice)

        summary = self._build_summary(run, feedback_index)
        cases: dict[str, CalibrationCaseResult] = {}
        imputed_count = 0

        for test_case_id, features in feature_map.items():
            label = labels.get(test_case_id)
            if label is not None:
                # A thumb-derived label counts as imputed; only an explicit
                # satisfaction score ("label") does not.
                calibrated = self._clip_score(label)
                imputation_source = label_sources.get(test_case_id)
                imputed = imputation_source != "label"
            else:
                calibrated = self._predict_or_fallback(predictor, features, labels)
                imputed = calibrated is not None
                if not imputed:
                    imputation_source = None
                elif predictor is not None:
                    # Same test as _predict_or_fallback, so the reported
                    # source always matches the path that produced the value.
                    imputation_source = "model"
                else:
                    imputation_source = "fallback_mean"

            if imputed:
                imputed_count += 1

            cases[test_case_id] = CalibrationCaseResult(
                test_case_id=test_case_id,
                calibrated_satisfaction=calibrated,
                imputed=imputed,
                imputation_source=imputation_source,
            )

        summary.imputed_ratio = imputed_count / len(cases) if cases else summary.imputed_ratio
        summary.model_metrics = model_metrics
        return CalibrationResult(summary=summary, cases=cases)

    @staticmethod
    def _comparable_timestamp(created_at: Any) -> Any:
        """Return a value for ``created_at`` that can be ordered safely.

        ``None`` becomes ``datetime.min`` so undated feedback sorts first.
        Timezone-aware datetimes are converted to naive UTC because Python
        refuses to compare aware and naive datetimes. Any other value is
        returned unchanged.
        """
        if created_at is None:
            return datetime.min
        if isinstance(created_at, datetime):
            offset = created_at.utcoffset()
            if offset is not None:
                return created_at.replace(tzinfo=None) - offset
        return created_at

    def _build_feedback_index(
        self, feedbacks: list[SatisfactionFeedback]
    ) -> dict[str, SatisfactionFeedback]:
        """Return the most recent feedback per test case id.

        Entries with equal timestamps are resolved in favour of the one that
        appears later in ``feedbacks``.
        """
        latest: dict[str, SatisfactionFeedback] = {}
        for feedback in feedbacks:
            current = latest.get(feedback.test_case_id)
            if current is None:
                latest[feedback.test_case_id] = feedback
                continue
            current_time = self._comparable_timestamp(current.created_at)
            feedback_time = self._comparable_timestamp(feedback.created_at)
            if feedback_time >= current_time:
                latest[feedback.test_case_id] = feedback
        return latest

    def _build_feature_matrix(self, run: EvaluationRun) -> dict[str, list[float]]:
        """Return the feature vector of every result, keyed by test case id.

        Each vector holds, in order: the faithfulness, answer_relevancy,
        context_precision and context_recall metric scores, the answer's
        token count, the keyword missing rate and the answer's type-token
        ratio.
        """
        return {
            result.test_case_id: [
                self._metric_score(result, "faithfulness"),
                self._metric_score(result, "answer_relevancy"),
                self._metric_score(result, "context_precision"),
                self._metric_score(result, "context_recall"),
                self._answer_length(result.answer),
                self._keyword_missing_rate(result.question, result.answer, result.contexts),
                self._ttr(result.answer),
            ]
            for result in run.results
        }

    def _build_labels(
        self,
        run: EvaluationRun,
        feedback_index: dict[str, SatisfactionFeedback],
    ) -> tuple[dict[str, float], dict[str, str]]:
        """Derive training labels from feedback.

        Returns:
            ``(labels, sources)``, both keyed by test case id. ``sources`` is
            ``"label"`` for an explicit satisfaction score and ``"thumb"``
            for a score mapped from thumb feedback. Test cases without usable
            feedback appear in neither mapping.
        """
        labels: dict[str, float] = {}
        sources: dict[str, str] = {}
        for result in run.results:
            feedback = feedback_index.get(result.test_case_id)
            if feedback is None:
                continue
            if feedback.satisfaction_score is not None:
                labels[result.test_case_id] = feedback.satisfaction_score
                sources[result.test_case_id] = "label"
                continue
            mapped = self._thumb_mapping.get((feedback.thumb_feedback or "").lower())
            if mapped is not None:
                labels[result.test_case_id] = mapped
                sources[result.test_case_id] = "thumb"
        return labels, sources

    def _train_models(
        self,
        feature_map: dict[str, list[float]],
        labels: dict[str, float],
        *,
        model: str,
        model_metrics: dict[str, dict[str, float | None]],
    ) -> tuple[str, dict[str, Any]]:
        """Fit the regression model(s) on the labelled test cases.

        A linear regression is always fitted. When ``model`` is ``"xgb"`` or
        ``"both"`` an XGBoost regressor is attempted as well; if that fails
        for any reason the linear model is used instead.

        Args:
            feature_map: Feature vectors keyed by test case id.
            labels: Labels keyed by test case id.
            model: Requested model selection.
            model_metrics: Output mapping that receives, per fitted model
                name, its ``mae`` / ``pearson`` / ``spearman`` metrics.

        Returns:
            ``(model_choice, predictors)``: the name of the model to use and
            the fitted estimators by name. ``predictors`` is empty when no
            labelled test case has features.
        """
        labelled_ids = [case_id for case_id in labels if case_id in feature_map]
        if not labelled_ids:
            # Checked before importing scikit-learn so that runs without any
            # training data do not need the optional dependency.
            return "linear", {}

        from sklearn.linear_model import LinearRegression
        from sklearn.metrics import mean_absolute_error
        from sklearn.model_selection import train_test_split

        features_matrix = [feature_map[case_id] for case_id in labelled_ids]
        labels_vector = [labels[case_id] for case_id in labelled_ids]

        if len(labels_vector) >= 5:
            features_train, features_test, labels_train, labels_test = train_test_split(
                features_matrix, labels_vector, test_size=0.2, random_state=42
            )
        else:
            # Too few samples for a hold-out split: metrics are computed on
            # the training data itself.
            features_train, features_test = features_matrix, features_matrix
            labels_train, labels_test = labels_vector, labels_vector

        predictors: dict[str, Any] = {}

        linear = LinearRegression()
        linear.fit(features_train, labels_train)
        linear_pred = linear.predict(features_test)
        model_metrics["linear"] = self._build_metrics(labels_test, linear_pred, mean_absolute_error)
        predictors["linear"] = linear

        if model in {"xgb", "both"}:
            try:
                import importlib

                xgb_module = importlib.import_module("xgboost")
                xgb_regressor = xgb_module.XGBRegressor

                xgb = xgb_regressor(
                    objective="reg:squarederror",
                    n_estimators=150,
                    max_depth=5,
                    learning_rate=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    reg_alpha=0.1,
                    reg_lambda=1.0,
                    n_jobs=-1,
                    random_state=42,
                )
                xgb.fit(features_train, labels_train)
                xgb_pred = xgb.predict(features_test)
                model_metrics["xgb"] = self._build_metrics(
                    labels_test, xgb_pred, mean_absolute_error
                )
                predictors["xgb"] = xgb
            except Exception:
                # Deliberately best effort: any XGBoost failure (including it
                # not being installed) falls back to the linear model. Leave
                # a debug trace so the failure is not completely invisible.
                import logging

                logging.getLogger(__name__).debug(
                    "XGBoost calibration model unavailable; using linear regression",
                    exc_info=True,
                )
                model_metrics["xgb"] = {"mae": None, "pearson": None, "spearman": None}

        # "xgb" is only ever present in predictors when it was requested.
        model_choice = "xgb" if "xgb" in predictors else "linear"
        return model_choice, predictors

    def _build_metrics(
        self,
        y_true: list[float],
        y_pred: list[float],
        mae_func,
    ) -> dict[str, float | None]:
        """Compute MAE plus Pearson and Spearman correlation of predictions.

        ``mae_func`` is called as ``mae_func(y_true, y_pred)``. Each metric is
        ``None`` when it cannot be computed.
        """
        mae = float(mae_func(y_true, y_pred)) if y_true else None
        return {
            "mae": mae,
            "pearson": self._safe_corr(y_true, y_pred, method="pearson"),
            "spearman": self._safe_corr(y_true, y_pred, method="spearman"),
        }

    def _predict_or_fallback(
        self,
        predictor: Any | None,
        features: list[float],
        labels: dict[str, float],
    ) -> float | None:
        """Predict a clipped score, or fall back to the mean of known labels.

        Returns ``None`` only when there is neither a predictor nor a label.
        """
        if predictor is not None:
            prediction = predictor.predict([features])[0]
            return self._clip_score(float(prediction))
        fallback = self._fallback_mean(labels)
        return None if fallback is None else self._clip_score(fallback)

    def _fallback_mean(self, labels: dict[str, float]) -> float | None:
        """Return the arithmetic mean of the labels, or ``None`` if empty."""
        if not labels:
            return None
        return sum(labels.values()) / len(labels)

    def _build_summary(
        self, run: EvaluationRun, feedback_index: dict[str, SatisfactionFeedback]
    ) -> CalibrationSummary:
        """Summarise the raw feedback that belongs to results of ``run``.

        Thumb values are compared case-insensitively, the same way
        ``_build_labels`` maps them. ``imputed_ratio`` is left ``None`` for
        the caller to fill in.
        """
        scores: list[float] = []
        thumbs: list[str] = []
        for result in run.results:
            feedback = feedback_index.get(result.test_case_id)
            if feedback is None:
                continue
            if feedback.satisfaction_score is not None:
                scores.append(feedback.satisfaction_score)
            thumb = (feedback.thumb_feedback or "").lower()
            if thumb in {"up", "down"}:
                thumbs.append(thumb)
        return CalibrationSummary(
            avg_satisfaction_score=sum(scores) / len(scores) if scores else None,
            thumb_up_rate=thumbs.count("up") / len(thumbs) if thumbs else None,
            imputed_ratio=None,
        )

    def _metric_score(self, result, name: str) -> float:
        """Return the score of metric ``name`` on ``result``; 0.0 if missing."""
        metric = result.get_metric(name)
        if metric and metric.score is not None:
            return float(metric.score)
        return 0.0

    def _answer_length(self, answer: str | None) -> float:
        """Return the number of tokens in ``answer`` as a float."""
        return float(len(self._tokenize(answer or "")))

    def _keyword_missing_rate(
        self,
        question: str | None,
        answer: str | None,
        contexts: list[str] | None,
    ) -> float:
        """Return the share of question tokens absent from answer and contexts.

        Returns 0.0 when the question has no tokens.
        """
        question_tokens = set(self._tokenize(question or ""))
        if not question_tokens:
            return 0.0
        combined = " ".join([answer or "", *(contexts or [])])
        missing = question_tokens - set(self._tokenize(combined))
        return len(missing) / len(question_tokens)

    def _ttr(self, answer: str | None) -> float:
        """Return the type-token ratio of ``answer``; 0.0 when it has no tokens."""
        tokens = self._tokenize(answer or "")
        if not tokens:
            return 0.0
        return len(set(tokens)) / len(tokens)

    def _tokenize(self, text: str) -> list[str]:
        """Split ``text`` into lower-cased tokens.

        A token is a run of at least two Hangul syllables, ASCII letters or
        digits.
        """
        # Plain ``re`` instead of a one-element pandas Series: same matches,
        # without allocating a Series for every call.
        import re

        return [token.lower() for token in re.findall(r"[가-힣a-zA-Z0-9]{2,}", text)]

    def _clip_score(self, score: float) -> float:
        """Clamp ``score`` to the inclusive range 1.0-5.0."""
        return max(1.0, min(5.0, score))

    def _safe_corr(self, y_true: list[float], y_pred: list[float], *, method: str) -> float | None:
        """Return the correlation of two sequences, or ``None`` if undefined.

        ``method="spearman"`` is computed as the Pearson correlation of the
        ranks; any other value yields the plain Pearson correlation.
        ``None`` is returned for fewer than two samples, when the computation
        fails, and when the result is NaN (for example constant input), so
        that callers never receive a NaN metric.
        """
        if len(y_true) < 2 or len(y_pred) < 2:
            return None
        series_a = pd.Series(y_true)
        series_b = pd.Series(y_pred)
        if method == "spearman":
            series_a = series_a.rank()
            series_b = series_b.rank()
        try:
            corr = series_a.corr(series_b)
        except Exception:
            # Metrics are diagnostic only; never let them break calibration.
            return None
        if corr is None or pd.isna(corr):
            return None
        return float(corr)
|
|
@@ -4,9 +4,11 @@ from typing import Any, Protocol
|
|
|
4
4
|
|
|
5
5
|
from evalvault.domain.entities import (
|
|
6
6
|
EvaluationRun,
|
|
7
|
+
FeedbackSummary,
|
|
7
8
|
PromptSetBundle,
|
|
8
9
|
RunClusterMap,
|
|
9
10
|
RunClusterMapInfo,
|
|
11
|
+
SatisfactionFeedback,
|
|
10
12
|
)
|
|
11
13
|
from evalvault.domain.entities.experiment import Experiment
|
|
12
14
|
|
|
@@ -76,6 +78,8 @@ class StoragePort(Protocol):
|
|
|
76
78
|
"""
|
|
77
79
|
...
|
|
78
80
|
|
|
81
|
+
def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None:
    """Update the stored metadata of the run identified by ``run_id``.

    NOTE(review): whether ``metadata`` replaces or is merged into the
    existing metadata is decided by the implementing adapter — confirm there.
    """
    ...
|
|
82
|
+
|
|
79
83
|
def save_run_cluster_map(
|
|
80
84
|
self,
|
|
81
85
|
run_id: str,
|
|
@@ -99,6 +103,12 @@ class StoragePort(Protocol):
|
|
|
99
103
|
"""런별 클러스터 맵을 삭제합니다."""
|
|
100
104
|
...
|
|
101
105
|
|
|
106
|
+
def save_feedback(self, feedback: SatisfactionFeedback) -> str:
    """Persist a satisfaction feedback entry.

    Returns a string, presumably the identifier of the stored feedback —
    confirm against the implementing adapters.
    """
    ...
|
|
107
|
+
|
|
108
|
+
def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
    """Return the satisfaction feedback entries recorded for ``run_id``.

    NOTE(review): result ordering is not specified by this protocol.
    """
    ...
|
|
109
|
+
|
|
110
|
+
def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
    """Return aggregate feedback statistics for the run ``run_id``."""
    ...
|
|
111
|
+
|
|
102
112
|
# Experiment 관련 메서드
|
|
103
113
|
|
|
104
114
|
def save_experiment(self, experiment: Experiment) -> str:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: evalvault
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.62.0
|
|
4
4
|
Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
|
|
5
5
|
Project-URL: Homepage, https://github.com/ntts9990/EvalVault
|
|
6
6
|
Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
|
|
@@ -46,6 +46,7 @@ Requires-Dist: uvicorn>=0.40.0
|
|
|
46
46
|
Requires-Dist: xlrd
|
|
47
47
|
Provides-Extra: analysis
|
|
48
48
|
Requires-Dist: scikit-learn>=1.3.0; extra == 'analysis'
|
|
49
|
+
Requires-Dist: xgboost>=2.0.0; extra == 'analysis'
|
|
49
50
|
Provides-Extra: anthropic
|
|
50
51
|
Requires-Dist: anthropic; extra == 'anthropic'
|
|
51
52
|
Requires-Dist: langchain-anthropic; extra == 'anthropic'
|
|
@@ -86,6 +87,7 @@ Requires-Dist: rank-bm25>=0.2.2; extra == 'dev'
|
|
|
86
87
|
Requires-Dist: ruff; extra == 'dev'
|
|
87
88
|
Requires-Dist: scikit-learn<1.4.0,>=1.3.0; extra == 'dev'
|
|
88
89
|
Requires-Dist: sentence-transformers>=5.2.0; extra == 'dev'
|
|
90
|
+
Requires-Dist: xgboost>=2.0.0; extra == 'dev'
|
|
89
91
|
Provides-Extra: docs
|
|
90
92
|
Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
|
|
91
93
|
Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
|
|
@@ -5,7 +5,7 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
|
|
|
5
5
|
evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
|
|
7
7
|
evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
|
|
8
|
-
evalvault/adapters/inbound/api/adapter.py,sha256=
|
|
8
|
+
evalvault/adapters/inbound/api/adapter.py,sha256=_giGdt-grmd6bkWMhRb3KdloxI_2jUMknProC76KqWY,61140
|
|
9
9
|
evalvault/adapters/inbound/api/main.py,sha256=KdlAxKn0QfGI3UuoTrBDBbUs2xCvP8lnWOY1ce3svcU,2619
|
|
10
10
|
evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
|
|
11
11
|
evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
|
|
@@ -13,14 +13,15 @@ evalvault/adapters/inbound/api/routers/config.py,sha256=CN-FH2cn0Ive-BD3WacWY6PF
|
|
|
13
13
|
evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
|
|
14
14
|
evalvault/adapters/inbound/api/routers/knowledge.py,sha256=7mgyoUM1PepFb4X8_Ntn0vd7ZZYcNbM3_9nyD10g4Aw,5307
|
|
15
15
|
evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPee4OMwfxVdvRWHJ2_qYBF0,17175
|
|
16
|
-
evalvault/adapters/inbound/api/routers/runs.py,sha256=
|
|
16
|
+
evalvault/adapters/inbound/api/routers/runs.py,sha256=KyIar-5RJemO7i3dvRLM1IeKWVF57tZXrrixKpGOg7M,38029
|
|
17
17
|
evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
|
|
18
18
|
evalvault/adapters/inbound/cli/app.py,sha256=ytNgHRg9ZTAl33AkB1wIL8RKfQ_Cf8fsy0gSsLTs7Ew,1603
|
|
19
|
-
evalvault/adapters/inbound/cli/commands/__init__.py,sha256=
|
|
19
|
+
evalvault/adapters/inbound/cli/commands/__init__.py,sha256=cNPPhsudTQWdlh_OJm9mU8LGBnJLGMswJBcIV9MAlkI,3530
|
|
20
20
|
evalvault/adapters/inbound/cli/commands/agent.py,sha256=YlOYMEzzS1aSKDKD_a7UK3St18X6GXGkdTatrzyd8Zc,7555
|
|
21
21
|
evalvault/adapters/inbound/cli/commands/analyze.py,sha256=aMi1BEDOX3yhN-ppBftDssPQLB5TdzIfpx9U7CZEgWo,48932
|
|
22
22
|
evalvault/adapters/inbound/cli/commands/api.py,sha256=YdbJ_-QEajnFcjTa7P2heLMjFKpeQ4nWP_p-HvfYkEo,1943
|
|
23
23
|
evalvault/adapters/inbound/cli/commands/benchmark.py,sha256=RZ4nRTF7d6hDZug-Pw8dGcFEyWdOKclwqkvS-gN4VWo,41097
|
|
24
|
+
evalvault/adapters/inbound/cli/commands/calibrate.py,sha256=-UnT0LQH40U5lzMLqMJ7DOTLa3mt5P_fJL2XzqIkvu4,4223
|
|
24
25
|
evalvault/adapters/inbound/cli/commands/config.py,sha256=Mv9IQHBFHZ3I2stUzHDgLDn-Znt_Awdy3j-sk5ruUmw,6069
|
|
25
26
|
evalvault/adapters/inbound/cli/commands/debug.py,sha256=KU-hL1gLhpjV2ZybDQgGMwRfm-hCynkrqY4UzETfL9k,2234
|
|
26
27
|
evalvault/adapters/inbound/cli/commands/domain.py,sha256=dL9iqBlnr5mDeS1unXW6uxE0qp6yfnxj-ls6k3EenwI,27279
|
|
@@ -74,8 +75,8 @@ evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIF
|
|
|
74
75
|
evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
|
|
75
76
|
evalvault/adapters/outbound/analysis/morpheme_quality_checker_module.py,sha256=_uRKDXdwGbfYduf_3XT77vF8X3-_zW3stHYc3HKYQTE,2216
|
|
76
77
|
evalvault/adapters/outbound/analysis/network_analyzer_module.py,sha256=ITUVnt_CI5pHy5SAESBSi004yMtiAhGFsbhC61VTezk,8475
|
|
77
|
-
evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=
|
|
78
|
-
evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=
|
|
78
|
+
evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=aLtF_fns-7IEtitwON2EYS_lweq_IdldFsRm47alN0Q,29561
|
|
79
|
+
evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=kVuG9pVMQO6OYY5zxj_w9nNQZ1-qIO0y6XcXo6lG-n0,8221
|
|
79
80
|
evalvault/adapters/outbound/analysis/pattern_detector_module.py,sha256=SyCDO_VS-r-tjGh8WrW-t1GCSC9ouxirdVk4NizFPXo,1882
|
|
80
81
|
evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=XvcCbKCN_otv1pGUzk0oE76RV19yFga8r6RngBvgEFo,3691
|
|
81
82
|
evalvault/adapters/outbound/analysis/pipeline_helpers.py,sha256=8E8IrYI5JvRrpnjxe0DS7srbPzB0XAxxXhLLYgfwsgU,5756
|
|
@@ -164,12 +165,12 @@ evalvault/adapters/outbound/report/dashboard_generator.py,sha256=Dcu18NTK4lS8XNK
|
|
|
164
165
|
evalvault/adapters/outbound/report/llm_report_generator.py,sha256=sp2YRCmPOhn08vb8Bq_ayo-ZjgyBBxRhzRFvzlaDhsA,24063
|
|
165
166
|
evalvault/adapters/outbound/report/markdown_adapter.py,sha256=5PS72h_qe4ZtYs-umhX5TqQL2k5SuDaCUc6rRw9AKRw,16761
|
|
166
167
|
evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
|
|
167
|
-
evalvault/adapters/outbound/storage/base_sql.py,sha256=
|
|
168
|
+
evalvault/adapters/outbound/storage/base_sql.py,sha256=Og-YRWHsCFQP2vnyvsgfWr4C2_ZE89ZmPXcPLiHeggU,21976
|
|
168
169
|
evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
|
|
169
|
-
evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=
|
|
170
|
-
evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=
|
|
171
|
-
evalvault/adapters/outbound/storage/schema.sql,sha256=
|
|
172
|
-
evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=
|
|
170
|
+
evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=IaijoeCIRi7JO2d5yfgfmF-ejobOnU7Izlx332mSUP8,39020
|
|
171
|
+
evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=aAfgwxWEqCBGGpn_QRD_BbzXR2Q-9cd9GMsCbFeohNY,7632
|
|
172
|
+
evalvault/adapters/outbound/storage/schema.sql,sha256=LknvBvNVLvkW7c_hHTLHrxSf4TZApzbRyAk1ctuROUc,10608
|
|
173
|
+
evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=SKZ9IZjchi7w89WNkZ6aTelAzaV0MqUC7cexrkndTNY,48555
|
|
173
174
|
evalvault/adapters/outbound/tracer/__init__.py,sha256=xrvQQuAvF_UI02mKLMV7GTrG3zn836n5zwCRrrmhq_U,1054
|
|
174
175
|
evalvault/adapters/outbound/tracer/open_rag_log_handler.py,sha256=aq96FIWD-bBaSkq-bygWhQArC9LWghSwi-S03Mga0mI,2827
|
|
175
176
|
evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py,sha256=P-4PN1UweITXu5uN3LJVCEL3wRwiExzhgs3y2GN78xM,4784
|
|
@@ -190,7 +191,7 @@ evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHW
|
|
|
190
191
|
evalvault/config/settings.py,sha256=T92GShlYKDaVinwbsbWX2DmNfm91Cvcvh8Te8pNOTsw,12875
|
|
191
192
|
evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
|
|
192
193
|
evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
193
|
-
evalvault/domain/entities/__init__.py,sha256=
|
|
194
|
+
evalvault/domain/entities/__init__.py,sha256=CZU7VfTq2vart_j2pLemOX_TtSKzmpzB151pW-jSinw,3097
|
|
194
195
|
evalvault/domain/entities/analysis.py,sha256=gcMtumC66g-AIqb2LgfMpm5BMzwJhJkjg-zuybNoJCM,15208
|
|
195
196
|
evalvault/domain/entities/analysis_pipeline.py,sha256=hD9rFHMa4rUq0InRkSKhh6HQ9ZeNYAHKADzs-kWRP04,16845
|
|
196
197
|
evalvault/domain/entities/benchmark.py,sha256=CVbz_eW7Y9eM7wG7xA_xmldTIs72csdoTmu3E0NKoMU,18475
|
|
@@ -198,6 +199,7 @@ evalvault/domain/entities/benchmark_run.py,sha256=2ZJOq5Ny_pfvRKM7E4RuIKxfxvoYK-
|
|
|
198
199
|
evalvault/domain/entities/dataset.py,sha256=WsC_5ivGluy-o2nXxLGmoC8DYl5UafVSo2hSowb3rvs,1886
|
|
199
200
|
evalvault/domain/entities/debug.py,sha256=r92lgvOpq2svw70syJIo78muRAvrSn5h1JByH_Hvz-s,1493
|
|
200
201
|
evalvault/domain/entities/experiment.py,sha256=oWjbu0IJZ6oIRcnA-8ppeJDgp57Tv8ZjQ3UOZ0X9KJ8,2576
|
|
202
|
+
evalvault/domain/entities/feedback.py,sha256=xiaZaUQhyuxyW_i2scXt8eKZshMC6tXe3981e-uukw8,1604
|
|
201
203
|
evalvault/domain/entities/improvement.py,sha256=WHI7q1jXRxkuHhBWOrpk8UdLaH0UwjZVjRIDsqVDyZo,19322
|
|
202
204
|
evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0U8,3001
|
|
203
205
|
evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
|
|
@@ -253,6 +255,7 @@ evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9Y
|
|
|
253
255
|
evalvault/domain/services/ragas_prompt_overrides.py,sha256=4BecYE2KrreUBbIM3ssP9WzHcK_wRc8jW7CE_k58QOU,1412
|
|
254
256
|
evalvault/domain/services/retrieval_metrics.py,sha256=dtrQPLMrXSyWLcgF8EGcLNFwzwA59WDzEh41JRToHAY,2980
|
|
255
257
|
evalvault/domain/services/retriever_context.py,sha256=ySQ-GuadiggS0LVAib4AxA_0JpasYz4S9hbjau0eyIA,6482
|
|
258
|
+
evalvault/domain/services/satisfaction_calibration_service.py,sha256=H7Z8opOyPHRO5qVIw-XDsNhIwdCteAS9_a3BTlfIqHg,11906
|
|
256
259
|
evalvault/domain/services/stage_event_builder.py,sha256=ScTgyeRiH7z_rnNI_2p-i9szVRIRwUxGSJvpEj3zto4,9645
|
|
257
260
|
evalvault/domain/services/stage_metric_guide_service.py,sha256=_JdRsBRWirO24qYFlh6hG-dkoWlX6_XWEYKf_uUlKIQ,8807
|
|
258
261
|
evalvault/domain/services/stage_metric_service.py,sha256=KukIWWhWVOtclrET6uyWJ17jG76LfkKiqrUrDIDJ3gw,15327
|
|
@@ -286,15 +289,15 @@ evalvault/ports/outbound/nlp_analysis_port.py,sha256=QDJHAsSpynTenuaKp78t1s--U03
|
|
|
286
289
|
evalvault/ports/outbound/relation_augmenter_port.py,sha256=cMcHQnmK111WzZr50vYr7affeHhOtpFZxPARwkg9xbk,651
|
|
287
290
|
evalvault/ports/outbound/report_port.py,sha256=wgReSYL4SupXIoALFh0QFWfX2kzPftXpWTvGLCMd2B8,1315
|
|
288
291
|
evalvault/ports/outbound/stage_storage_port.py,sha256=Nlf9upsXxgCABQB5cJdpLQYsoZNiGRAU5zE5D-Ptp2I,1201
|
|
289
|
-
evalvault/ports/outbound/storage_port.py,sha256=
|
|
292
|
+
evalvault/ports/outbound/storage_port.py,sha256=d9f8bvAtPA2aytKrHvrfrWGOmaQSepLn23Bd_52QSbI,4862
|
|
290
293
|
evalvault/ports/outbound/tracer_port.py,sha256=kTqJCUIJHnvvDzMxxGhHSfiz8_Q4CZ0WSPvIUVVOcyw,623
|
|
291
294
|
evalvault/ports/outbound/tracker_port.py,sha256=05LA3AWnuE1XmGQC16Zle9i2sEV3q69Nt8ZUye_w1_Y,2532
|
|
292
295
|
evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y,113
|
|
293
296
|
evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
|
|
294
297
|
evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
|
|
295
298
|
evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
|
|
296
|
-
evalvault-1.
|
|
297
|
-
evalvault-1.
|
|
298
|
-
evalvault-1.
|
|
299
|
-
evalvault-1.
|
|
300
|
-
evalvault-1.
|
|
299
|
+
evalvault-1.62.0.dist-info/METADATA,sha256=2Nt0heOPN0il1jF3de3EAJtq9CQawjyaa27GQAGncmk,14155
|
|
300
|
+
evalvault-1.62.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
301
|
+
evalvault-1.62.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
|
|
302
|
+
evalvault-1.62.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
|
|
303
|
+
evalvault-1.62.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|