evalvault 1.61.0__py3-none-any.whl → 1.62.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,11 @@ from urllib.request import urlopen
 
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
+from evalvault.domain.entities import (
+    CalibrationResult,
+    FeedbackSummary,
+    SatisfactionFeedback,
+)
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
     get_metric_descriptions as registry_metric_descriptions,
@@ -29,6 +34,9 @@ from evalvault.domain.services.prompt_registry import (
     build_prompt_summary,
 )
 from evalvault.domain.services.prompt_status import extract_prompt_entries
+from evalvault.domain.services.satisfaction_calibration_service import (
+    SatisfactionCalibrationService,
+)
 from evalvault.domain.services.stage_event_builder import StageEventBuilder
 from evalvault.domain.services.stage_metric_service import StageMetricService
 from evalvault.domain.services.threshold_profiles import apply_threshold_profile
@@ -893,6 +901,27 @@ class WebUIAdapter:
             raise RuntimeError("Storage not configured")
         return self._storage.delete_run_cluster_map(run_id, map_id)
 
+    def save_feedback(self, feedback: SatisfactionFeedback) -> str:
+        if self._storage is None or not hasattr(self._storage, "save_feedback"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.save_feedback(feedback)
+
+    def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
+        if self._storage is None or not hasattr(self._storage, "list_feedback"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.list_feedback(run_id)
+
+    def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
+        if self._storage is None or not hasattr(self._storage, "get_feedback_summary"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.get_feedback_summary(run_id)
+
+    def build_calibration(self, run_id: str, *, model: str = "both") -> CalibrationResult:
+        run = self.get_run_details(run_id)
+        feedbacks = self.list_feedback(run_id)
+        service = SatisfactionCalibrationService()
+        return service.build_calibration(run, feedbacks, model=model)
+
     def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
         """List stage events."""
         if self._storage is None or not hasattr(self._storage, "list_stage_events"):
@@ -21,7 +21,11 @@ from evalvault.adapters.outbound.dataset.templates import (
 )
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
 from evalvault.config.settings import get_settings
-from evalvault.domain.entities import EvaluationRun
+from evalvault.domain.entities import (
+    CalibrationResult,
+    EvaluationRun,
+    SatisfactionFeedback,
+)
 from evalvault.domain.services.domain_learning_hook import DomainLearningHook
 from evalvault.domain.services.ragas_prompt_overrides import (
     PromptOverrideError,
@@ -178,6 +182,31 @@ class ClusterMapDeleteResponse(BaseModel):
     deleted_count: int
 
 
+class FeedbackSaveRequest(BaseModel):
+    test_case_id: str
+    satisfaction_score: float | None = None
+    thumb_feedback: Literal["up", "down", "none"] | None = None
+    comment: str | None = None
+    rater_id: str | None = None
+
+
+class FeedbackResponse(BaseModel):
+    feedback_id: str
+    run_id: str
+    test_case_id: str
+    satisfaction_score: float | None = None
+    thumb_feedback: str | None = None
+    comment: str | None = None
+    rater_id: str | None = None
+    created_at: str | None = None
+
+
+class FeedbackSummaryResponse(BaseModel):
+    avg_satisfaction_score: float | None = None
+    thumb_up_rate: float | None = None
+    total_feedback: int
+
+
 class VisualSpaceRequest(BaseModel):
     granularity: Literal["run", "case", "cluster"] = "case"
     base_run_id: str | None = None
@@ -188,9 +217,22 @@ class VisualSpaceRequest(BaseModel):
     cluster_map: dict[str, str] | None = None
 
 
-def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
+def _serialize_run_details(
+    run: EvaluationRun,
+    *,
+    calibration: CalibrationResult | None = None,
+) -> dict[str, Any]:
+    summary = run.to_summary_dict()
+    if calibration is not None:
+        summary.update(
+            {
+                "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                "thumb_up_rate": calibration.summary.thumb_up_rate,
+                "imputed_ratio": calibration.summary.imputed_ratio,
+            }
+        )
     payload = {
-        "summary": run.to_summary_dict(),
+        "summary": summary,
         "results": [
             {
                 "test_case_id": result.test_case_id,
@@ -207,6 +249,21 @@ def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
                     }
                     for metric in result.metrics
                 ],
+                "calibrated_satisfaction": (
+                    calibration.cases[result.test_case_id].calibrated_satisfaction
+                    if calibration and result.test_case_id in calibration.cases
+                    else None
+                ),
+                "imputed": (
+                    calibration.cases[result.test_case_id].imputed
+                    if calibration and result.test_case_id in calibration.cases
+                    else False
+                ),
+                "imputation_source": (
+                    calibration.cases[result.test_case_id].imputation_source
+                    if calibration and result.test_case_id in calibration.cases
+                    else None
+                ),
             }
             for result in run.results
         ],
@@ -719,9 +776,12 @@ def compare_runs(
             }
         )
 
+    base_calibration = adapter.build_calibration(base_id)
+    target_calibration = adapter.build_calibration(target_id)
+
     return {
-        "base": _serialize_run_details(base_run),
-        "target": _serialize_run_details(target_run),
+        "base": _serialize_run_details(base_run, calibration=base_calibration),
+        "target": _serialize_run_details(target_run, calibration=target_calibration),
         "metric_deltas": metric_deltas,
         "case_counts": _build_case_counts(base_run, target_run),
        "pass_rate_delta": target_run.pass_rate - base_run.pass_rate,
@@ -898,7 +958,70 @@ def get_run_details(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
     """Get detailed information for a specific run."""
     try:
         run: EvaluationRun = adapter.get_run_details(run_id)
-        return _serialize_run_details(run)
+        calibration = adapter.build_calibration(run_id)
+        return _serialize_run_details(run, calibration=calibration)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/{run_id}/feedback", response_model=FeedbackResponse)
+def save_feedback(
+    run_id: str,
+    request: FeedbackSaveRequest,
+    adapter: AdapterDep,
+) -> dict[str, Any]:
+    try:
+        adapter.get_run_details(run_id)
+        thumb_feedback = request.thumb_feedback
+        if thumb_feedback == "none":
+            thumb_feedback = None
+        satisfaction_score = request.satisfaction_score
+        if satisfaction_score is not None:
+            satisfaction_score = max(1.0, min(5.0, satisfaction_score))
+        feedback = SatisfactionFeedback(
+            feedback_id="",
+            run_id=run_id,
+            test_case_id=request.test_case_id,
+            satisfaction_score=satisfaction_score,
+            thumb_feedback=thumb_feedback,
+            comment=request.comment,
+            rater_id=request.rater_id,
+            created_at=datetime.now(),
+        )
+        feedback_id = adapter.save_feedback(feedback)
+        saved = feedback.to_dict()
+        saved["feedback_id"] = feedback_id
+        return saved
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/feedback", response_model=list[FeedbackResponse])
+def list_feedback(run_id: str, adapter: AdapterDep) -> list[dict[str, Any]]:
+    try:
+        adapter.get_run_details(run_id)
+        feedbacks = adapter.list_feedback(run_id)
+        return [feedback.to_dict() for feedback in feedbacks]
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/feedback/summary", response_model=FeedbackSummaryResponse)
+def get_feedback_summary(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
+    try:
+        adapter.get_run_details(run_id)
+        summary = adapter.get_feedback_summary(run_id)
+        return {
+            "avg_satisfaction_score": summary.avg_satisfaction_score,
+            "thumb_up_rate": summary.thumb_up_rate,
+            "total_feedback": summary.total_feedback,
+        }
     except KeyError:
         raise HTTPException(status_code=404, detail="Run not found")
         raise HTTPException(status_code=500, detail=str(e))
@@ -13,6 +13,7 @@ from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
 from .benchmark import create_benchmark_app
+from .calibrate import register_calibrate_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -61,6 +62,7 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
     CommandModule(register_analyze_commands),
+    CommandModule(register_calibrate_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
     CommandModule(register_agent_commands),
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.satisfaction_calibration_service import (
+    SatisfactionCalibrationService,
+)
+
+from ..utils.options import db_option
+
+_console = Console()
+
+
+def register_calibrate_commands(app: typer.Typer, console: Console) -> None:
+    global _console
+    _console = console
+
+    @app.command()
+    def calibrate(
+        run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+        model: str = typer.Option(
+            "both", "--model", help="Model choice (linear|xgb|both)", show_default=True
+        ),
+        write_back: bool = typer.Option(
+            False,
+            "--write-back",
+            help="Store calibration results in run metadata",
+            show_default=True,
+        ),
+        db_path: Path | None = db_option(help_text="Database path"),
+    ) -> None:
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            _console.print("[red]Error: no database path is configured.[/red]")
+            raise typer.Exit(1)
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        try:
+            run = storage.get_run(run_id)
+        except KeyError:
+            _console.print("[red]Error: run not found.[/red]")
+            raise typer.Exit(1)
+
+        normalized_model = model.lower()
+        if normalized_model not in {"linear", "xgb", "both"}:
+            _console.print("[red]Error: model must be one of linear|xgb|both.[/red]")
+            raise typer.Exit(1)
+
+        feedbacks = storage.list_feedback(run_id)
+        service = SatisfactionCalibrationService()
+        calibration = service.build_calibration(run, feedbacks, model=normalized_model)
+
+        table = Table(title="Calibration model performance summary")
+        table.add_column("Model")
+        table.add_column("MAE", justify="right")
+        table.add_column("Pearson", justify="right")
+        table.add_column("Spearman", justify="right")
+
+        if calibration.summary.model_metrics:
+            for model_name, metrics in calibration.summary.model_metrics.items():
+                table.add_row(
+                    model_name,
+                    _format_metric(metrics.get("mae")),
+                    _format_metric(metrics.get("pearson")),
+                    _format_metric(metrics.get("spearman")),
+                )
+        else:
+            table.add_row("N/A", "-", "-", "-")
+
+        _console.print(table)
+        _console.print(
+            f"Average satisfaction: {calibration.summary.avg_satisfaction_score} | "
+            f"Thumb-up rate: {calibration.summary.thumb_up_rate} | "
+            f"Imputed ratio: {calibration.summary.imputed_ratio}"
+        )
+
+        if write_back:
+            metadata = run.tracker_metadata or {}
+            metadata["calibration"] = {
+                "updated_at": datetime.now().isoformat(),
+                "model": model,
+                "summary": {
+                    "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                    "thumb_up_rate": calibration.summary.thumb_up_rate,
+                    "imputed_ratio": calibration.summary.imputed_ratio,
+                    "model_metrics": calibration.summary.model_metrics,
+                },
+                "cases": {
+                    case_id: {
+                        "calibrated_satisfaction": case.calibrated_satisfaction,
+                        "imputed": case.imputed,
+                        "imputation_source": case.imputation_source,
+                    }
+                    for case_id, case in calibration.cases.items()
+                },
+            }
+            storage.update_run_metadata(run_id, metadata)
+            _console.print("[green]Calibration results stored in run metadata.[/green]")
+
+
+def _format_metric(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
@@ -727,8 +727,52 @@ class NLPAnalysisAdapter(BaseAnalysisAdapter):
                 if values:
                     avg_scores[metric_name] = sum(values) / len(values)
 
-            # Select representative questions (first 3)
-            representative_questions = cluster_qs[:3]
+            representative_questions: list[str] = []
+            try:
+                cluster_idx_list = cluster_indices[cluster_id]
+                cluster_vectors = embedding_array[cluster_idx_list]
+                centroid = cluster_vectors.mean(axis=0)
+                distances = np.linalg.norm(cluster_vectors - centroid, axis=1)
+                sorted_pairs = sorted(
+                    zip(cluster_idx_list, distances, strict=True), key=lambda x: x[1]
+                )
+
+                center_indices = [idx for idx, _dist in sorted_pairs[:2]]
+                edge_far = sorted_pairs[-1][0] if sorted_pairs else None
+
+                worst_idx = None
+                worst_score = None
+                for idx in cluster_idx_list:
+                    q = questions[idx]
+                    result = question_to_result.get(q)
+                    if not result or not result.metrics:
+                        continue
+                    avg_score = sum(m.score for m in result.metrics) / len(result.metrics)
+                    if worst_score is None or avg_score < worst_score:
+                        worst_score = avg_score
+                        worst_idx = idx
+
+                edge_needed = worst_idx
+                if edge_needed is None and len(sorted_pairs) > 1:
+                    edge_needed = sorted_pairs[-2][0]
+
+                candidate_indices: list[int] = []
+                candidate_indices.extend(center_indices)
+                if edge_far is not None:
+                    candidate_indices.append(edge_far)
+                if edge_needed is not None:
+                    candidate_indices.append(edge_needed)
+
+                seen: set[int] = set()
+                for idx in candidate_indices:
+                    if idx in seen:
+                        continue
+                    seen.add(idx)
+                    representative_questions.append(questions[idx])
+                    if len(representative_questions) >= 4:
+                        break
+            except Exception:
+                representative_questions = cluster_qs[:4]
 
             clusters.append(
                 TopicCluster(
@@ -216,7 +216,7 @@ class NLPAnalyzerModule(BaseAnalysisModule):
                     "keywords": list(cluster.keywords),
                     "document_count": cluster.document_count,
                     "avg_scores": cluster.avg_scores,
-                    "representative_questions": cluster.representative_questions[:3],
+                    "representative_questions": cluster.representative_questions[:4],
                 }
             )
         return serialized
@@ -11,9 +11,11 @@ from typing import Any
 
 from evalvault.domain.entities import (
     EvaluationRun,
+    FeedbackSummary,
     MetricScore,
     RunClusterMap,
     RunClusterMapInfo,
+    SatisfactionFeedback,
     TestCaseResult,
 )
 
@@ -27,10 +29,12 @@ class SQLQueries:
         placeholder: str = "?",
         metric_name_column: str = "metric_name",
         test_case_returning_clause: str = "",
+        feedback_returning_clause: str = "",
     ) -> None:
         self.placeholder = placeholder
         self.metric_name_column = metric_name_column
         self._test_case_returning = test_case_returning_clause
+        self._feedback_returning = feedback_returning_clause
 
     def _values(self, count: int) -> str:
         return ", ".join([self.placeholder] * count)
@@ -75,6 +79,25 @@ class SQLQueries:
             ) VALUES ({values})
         """
 
+    def insert_feedback(self) -> str:
+        values = self._values(7)
+        query = f"""
+            INSERT INTO satisfaction_feedback (
+                run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at
+            ) VALUES ({values})
+        """
+        if self._feedback_returning:
+            query = f"{query.strip()} {self._feedback_returning}"
+        return query
+
+    def select_feedback_by_run(self) -> str:
+        return f"""
+            SELECT id, run_id, test_case_id, satisfaction_score, thumb_feedback, comment, rater_id, created_at
+            FROM satisfaction_feedback
+            WHERE run_id = {self.placeholder}
+            ORDER BY created_at DESC
+        """
+
     def select_run(self) -> str:
         return f"""
             SELECT run_id, dataset_name, dataset_version, model_name,
@@ -129,6 +152,13 @@ class SQLQueries:
             ORDER BY created_at DESC
         """
 
+    def update_run_metadata(self) -> str:
+        return f"""
+            UPDATE evaluation_runs
+            SET metadata = {self.placeholder}
+            WHERE run_id = {self.placeholder}
+        """
+
     def delete_run(self) -> str:
         return f"DELETE FROM evaluation_runs WHERE run_id = {self.placeholder}"
 
@@ -259,6 +289,12 @@ class BaseSQLStorageAdapter(ABC):
             conn.commit()
         return deleted
 
+    def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None:
+        payload = self._serialize_json(metadata)
+        with self._get_connection() as conn:
+            self._execute(conn, self.queries.update_run_metadata(), (payload, run_id))
+            conn.commit()
+
     def save_run_cluster_map(
         self,
         run_id: str,
@@ -365,6 +401,45 @@ class BaseSQLStorageAdapter(ABC):
             conn.commit()
         return deleted
 
+    def save_feedback(self, feedback: SatisfactionFeedback) -> str:
+        created_at = feedback.created_at or datetime.now()
+        with self._get_connection() as conn:
+            cursor = self._execute(
+                conn,
+                self.queries.insert_feedback(),
+                (
+                    feedback.run_id,
+                    feedback.test_case_id,
+                    feedback.satisfaction_score,
+                    feedback.thumb_feedback,
+                    feedback.comment,
+                    feedback.rater_id,
+                    self._serialize_datetime(created_at),
+                ),
+            )
+            feedback_id = self._fetch_lastrowid(cursor)
+            conn.commit()
+        return str(feedback_id)
+
+    def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
+        with self._get_connection() as conn:
+            rows = self._execute(conn, self.queries.select_feedback_by_run(), (run_id,)).fetchall()
+        return [self._row_to_feedback(row) for row in rows]
+
+    def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
+        feedbacks = self.list_feedback(run_id)
+        scores = [f.satisfaction_score for f in feedbacks if f.satisfaction_score is not None]
+        thumbs = [f.thumb_feedback for f in feedbacks if f.thumb_feedback in {"up", "down"}]
+        avg_score = sum(scores) / len(scores) if scores else None
+        thumb_up_rate = None
+        if thumbs:
+            thumb_up_rate = thumbs.count("up") / len(thumbs)
+        return FeedbackSummary(
+            avg_satisfaction_score=avg_score,
+            thumb_up_rate=thumb_up_rate,
+            total_feedback=len(feedbacks),
+        )
+
     # Serialization helpers --------------------------------------------
 
     def _run_params(self, run: EvaluationRun) -> Sequence[Any]:
@@ -428,6 +503,22 @@ class BaseSQLStorageAdapter(ABC):
             ground_truth=row["ground_truth"],
         )
 
+    def _row_to_feedback(self, row) -> SatisfactionFeedback:
+        feedback_id = self._row_value(row, "id")
+        run_id = self._row_value(row, "run_id")
+        test_case_id = self._row_value(row, "test_case_id")
+        created_at = self._deserialize_datetime(self._row_value(row, "created_at"))
+        return SatisfactionFeedback(
+            feedback_id=str(feedback_id or ""),
+            run_id=str(run_id or ""),
+            test_case_id=str(test_case_id or ""),
+            satisfaction_score=self._maybe_float(self._row_value(row, "satisfaction_score")),
+            thumb_feedback=self._row_value(row, "thumb_feedback"),
+            comment=self._row_value(row, "comment"),
+            rater_id=self._row_value(row, "rater_id"),
+            created_at=created_at,
+        )
+
     def _fetch_metric_scores(self, conn, result_id: int) -> list[MetricScore]:
         rows = self._execute(conn, self.queries.select_metric_scores(), (result_id,)).fetchall()
         metric_column = self.queries.metric_name_column
@@ -60,6 +60,7 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
                 placeholder="%s",
                 metric_name_column="name",
                 test_case_returning_clause="RETURNING id",
+                feedback_returning_clause="RETURNING id",
             )
         )
         if connection_string:
@@ -198,6 +199,27 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
             elif cluster_columns and "metadata" not in cluster_columns:
                 conn.execute("ALTER TABLE run_cluster_maps ADD COLUMN metadata JSONB")
 
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS satisfaction_feedback (
+                    id SERIAL PRIMARY KEY,
+                    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+                    test_case_id VARCHAR(255) NOT NULL,
+                    satisfaction_score DECIMAL(4, 2),
+                    thumb_feedback VARCHAR(10),
+                    comment TEXT,
+                    rater_id VARCHAR(255),
+                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+                )
+                """
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id)"
+            )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
+            )
+
     # Prompt set methods
 
     def save_prompt_set(self, bundle: PromptSetBundle) -> None:
@@ -59,6 +59,20 @@ CREATE TABLE IF NOT EXISTS run_cluster_maps (
 CREATE INDEX IF NOT EXISTS idx_cluster_maps_run_id ON run_cluster_maps(run_id);
 CREATE INDEX IF NOT EXISTS idx_cluster_maps_map_id ON run_cluster_maps(map_id);
 
+CREATE TABLE IF NOT EXISTS satisfaction_feedback (
+    id SERIAL PRIMARY KEY,
+    run_id UUID NOT NULL REFERENCES evaluation_runs(run_id) ON DELETE CASCADE,
+    test_case_id VARCHAR(255) NOT NULL,
+    satisfaction_score DECIMAL(4, 2),
+    thumb_feedback VARCHAR(10),
+    comment TEXT,
+    rater_id VARCHAR(255),
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id);
+CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id);
+
 -- Metric scores table
 CREATE TABLE IF NOT EXISTS metric_scores (
     id SERIAL PRIMARY KEY,
@@ -61,6 +61,21 @@ CREATE TABLE IF NOT EXISTS run_cluster_maps (
 CREATE INDEX IF NOT EXISTS idx_cluster_maps_run_id ON run_cluster_maps(run_id);
 CREATE INDEX IF NOT EXISTS idx_cluster_maps_map_id ON run_cluster_maps(map_id);
 
+CREATE TABLE IF NOT EXISTS satisfaction_feedback (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    run_id TEXT NOT NULL,
+    test_case_id TEXT NOT NULL,
+    satisfaction_score REAL,
+    thumb_feedback TEXT,
+    comment TEXT,
+    rater_id TEXT,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
+);
+
+CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id);
+CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id);
+
 -- Metric scores table
 CREATE TABLE IF NOT EXISTS metric_scores (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -140,6 +140,31 @@ class SQLiteStorageAdapter(BaseSQLStorageAdapter):
             elif cluster_columns and "metadata" not in cluster_columns:
                 conn.execute("ALTER TABLE run_cluster_maps ADD COLUMN metadata TEXT")
 
+            feedback_cursor = conn.execute("PRAGMA table_info(satisfaction_feedback)")
+            feedback_columns = {row[1] for row in feedback_cursor.fetchall()}
+            if not feedback_columns:
+                conn.execute(
+                    """
+                    CREATE TABLE IF NOT EXISTS satisfaction_feedback (
+                        id INTEGER PRIMARY KEY AUTOINCREMENT,
+                        run_id TEXT NOT NULL,
+                        test_case_id TEXT NOT NULL,
+                        satisfaction_score REAL,
+                        thumb_feedback TEXT,
+                        comment TEXT,
+                        rater_id TEXT,
+                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                        FOREIGN KEY (run_id) REFERENCES evaluation_runs(run_id) ON DELETE CASCADE
+                    )
+                    """
+                )
+                conn.execute(
+                    "CREATE INDEX IF NOT EXISTS idx_feedback_run_id ON satisfaction_feedback(run_id)"
+                )
+                conn.execute(
+                    "CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
+                )
+
             pipeline_cursor = conn.execute("PRAGMA table_info(pipeline_results)")
             pipeline_columns = {row[1] for row in pipeline_cursor.fetchall()}
             if pipeline_columns:
@@ -14,6 +14,13 @@ from evalvault.domain.entities.analysis import (
 )
 from evalvault.domain.entities.dataset import Dataset, TestCase
 from evalvault.domain.entities.experiment import Experiment, ExperimentGroup
+from evalvault.domain.entities.feedback import (
+    CalibrationCaseResult,
+    CalibrationResult,
+    CalibrationSummary,
+    FeedbackSummary,
+    SatisfactionFeedback,
+)
 from evalvault.domain.entities.improvement import (
     EffortLevel,
     EvidenceSource,
@@ -74,6 +81,11 @@ __all__ = [
     # Experiment
     "Experiment",
     "ExperimentGroup",
+    "CalibrationCaseResult",
+    "CalibrationResult",
+    "CalibrationSummary",
+    "FeedbackSummary",
+    "SatisfactionFeedback",
     # Improvement
     "EffortLevel",
     "EvidenceSource",
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any
+
+
+@dataclass
+class SatisfactionFeedback:
+    feedback_id: str
+    run_id: str
+    test_case_id: str
+    satisfaction_score: float | None = None
+    thumb_feedback: str | None = None
+    comment: str | None = None
+    rater_id: str | None = None
+    created_at: datetime | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "feedback_id": self.feedback_id,
+            "run_id": self.run_id,
+            "test_case_id": self.test_case_id,
+            "satisfaction_score": self.satisfaction_score,
+            "thumb_feedback": self.thumb_feedback,
+            "comment": self.comment,
+            "rater_id": self.rater_id,
+            "created_at": self.created_at.isoformat() if self.created_at else None,
+        }
+
+
+@dataclass
+class FeedbackSummary:
+    avg_satisfaction_score: float | None = None
+    thumb_up_rate: float | None = None
+    total_feedback: int = 0
+
+
+@dataclass
+class CalibrationCaseResult:
+    test_case_id: str
+    calibrated_satisfaction: float | None = None
+    imputed: bool = False
+    imputation_source: str | None = None
+
+
+@dataclass
+class CalibrationSummary:
+    avg_satisfaction_score: float | None = None
+    thumb_up_rate: float | None = None
+    imputed_ratio: float | None = None
+    model_metrics: dict[str, dict[str, float | None]] = field(default_factory=dict)
+
+
+@dataclass
+class CalibrationResult:
+    summary: CalibrationSummary
+    cases: dict[str, CalibrationCaseResult] = field(default_factory=dict)
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any
+
+import pandas as pd
+
+from evalvault.domain.entities import (
+    CalibrationCaseResult,
+    CalibrationResult,
+    CalibrationSummary,
+    EvaluationRun,
+    SatisfactionFeedback,
+)
+
+
+@dataclass
+class CalibrationModelResult:
+    model_name: str
+    mae: float | None
+    pearson: float | None
+    spearman: float | None
+
+
+class SatisfactionCalibrationService:
+    def __init__(self, *, thumb_mapping: dict[str, float] | None = None) -> None:
+        self._thumb_mapping = thumb_mapping or {"up": 4.0, "down": 2.0}
+
+    def build_calibration(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        model: str = "both",
+    ) -> CalibrationResult:
+        feedback_index = self._build_feedback_index(feedbacks)
+        feature_map = self._build_feature_matrix(run)
+        labels, label_sources = self._build_labels(run, feedback_index)
+
+        if not feedback_index:
+            summary = CalibrationSummary(
+                avg_satisfaction_score=None,
+                thumb_up_rate=None,
+                imputed_ratio=0.0,
+            )
+            return CalibrationResult(summary=summary, cases={})
+
+        model_metrics: dict[str, dict[str, float | None]] = {}
+        model_choice, predictors = self._train_models(
+            feature_map,
+            labels,
+            model=model,
+            model_metrics=model_metrics,
+        )
+
+        summary = self._build_summary(run, feedback_index)
+        cases: dict[str, CalibrationCaseResult] = {}
+        imputed_count = 0
+
+        for test_case_id, features in feature_map.items():
+            label = labels.get(test_case_id)
+            source = label_sources.get(test_case_id)
+            if label is not None:
+                calibrated = self._clip_score(label)
+                imputed = source != "label"
+                imputation_source = source
+            else:
+                calibrated = self._predict_or_fallback(
+                    predictors.get(model_choice),
+                    features,
+                    labels,
+                )
+                if calibrated is None:
+                    imputed = False
+                    imputation_source = None
+                else:
+                    imputed = True
+                    imputation_source = "model" if predictors.get(model_choice) else "fallback_mean"
+
+            if imputed:
+                imputed_count += 1
+
+            cases[test_case_id] = CalibrationCaseResult(
+                test_case_id=test_case_id,
+                calibrated_satisfaction=calibrated,
+                imputed=imputed,
+                imputation_source=imputation_source,
+            )
+
+        summary.imputed_ratio = imputed_count / len(cases) if cases else summary.imputed_ratio
+        summary.model_metrics = model_metrics
+        return CalibrationResult(summary=summary, cases=cases)
+
+    def _build_feedback_index(
+        self, feedbacks: list[SatisfactionFeedback]
+    ) -> dict[str, SatisfactionFeedback]:
+        latest: dict[str, SatisfactionFeedback] = {}
+        for feedback in feedbacks:
+            current = latest.get(feedback.test_case_id)
+            if current is None:
+                latest[feedback.test_case_id] = feedback
+                continue
+            current_time = current.created_at or datetime.min
+            feedback_time = feedback.created_at or datetime.min
+            if feedback_time >= current_time:
+                latest[feedback.test_case_id] = feedback
+        return latest
+
+    def _build_feature_matrix(self, run: EvaluationRun) -> dict[str, list[float]]:
+        feature_map: dict[str, list[float]] = {}
+
+        for result in run.results:
+            features = [
+                self._metric_score(result, "faithfulness"),
+                self._metric_score(result, "answer_relevancy"),
+                self._metric_score(result, "context_precision"),
+                self._metric_score(result, "context_recall"),
+                self._answer_length(result.answer),
+                self._keyword_missing_rate(result.question, result.answer, result.contexts),
+                self._ttr(result.answer),
+            ]
+            feature_map[result.test_case_id] = features
+        return feature_map
+
+    def _build_labels(
+        self,
+        run: EvaluationRun,
+        feedback_index: dict[str, SatisfactionFeedback],
+    ) -> tuple[dict[str, float], dict[str, str]]:
+        labels: dict[str, float] = {}
+        sources: dict[str, str] = {}
+        for result in run.results:
+            feedback = feedback_index.get(result.test_case_id)
+            if feedback is None:
+                continue
+            if feedback.satisfaction_score is not None:
+                labels[result.test_case_id] = feedback.satisfaction_score
+                sources[result.test_case_id] = "label"
+                continue
+            mapped = self._thumb_mapping.get((feedback.thumb_feedback or "").lower())
+            if mapped is not None:
+                labels[result.test_case_id] = mapped
+                sources[result.test_case_id] = "thumb"
+        return labels, sources
+
+    def _train_models(
+        self,
+        feature_map: dict[str, list[float]],
+        labels: dict[str, float],
+        *,
+        model: str,
+        model_metrics: dict[str, dict[str, float | None]],
+    ) -> tuple[str, dict[str, Any]]:
+        from sklearn.linear_model import LinearRegression
+        from sklearn.metrics import mean_absolute_error
+        from sklearn.model_selection import train_test_split
+
+        if not labels:
+            return "linear", {}
+
+        features_matrix: list[list[float]] = []
+        labels_vector: list[float] = []
+        for test_case_id, label in labels.items():
+            features = feature_map.get(test_case_id)
+            if features is None:
+                continue
+            features_matrix.append(features)
+            labels_vector.append(label)
+
+        if not features_matrix:
+            return "linear", {}
+
+        if len(labels_vector) >= 5:
+            features_train, features_test, labels_train, labels_test = train_test_split(
+                features_matrix, labels_vector, test_size=0.2, random_state=42
+            )
+        else:
+            features_train, features_test, labels_train, labels_test = (
+                features_matrix,
+                features_matrix,
+                labels_vector,
+                labels_vector,
+            )
+
+        predictors: dict[str, Any] = {}
+
+        linear = LinearRegression()
+        linear.fit(features_train, labels_train)
+        linear_pred = linear.predict(features_test)
+        model_metrics["linear"] = self._build_metrics(labels_test, linear_pred, mean_absolute_error)
+        predictors["linear"] = linear
+
+        if model in {"xgb", "both"}:
+            try:
+                import importlib
+
+                xgb_module = importlib.import_module("xgboost")
+                xgb_regressor = xgb_module.XGBRegressor
+
+                xgb = xgb_regressor(
+                    objective="reg:squarederror",
+                    n_estimators=150,
+                    max_depth=5,
+                    learning_rate=0.1,
+                    subsample=0.8,
+                    colsample_bytree=0.8,
+                    reg_alpha=0.1,
+                    reg_lambda=1.0,
+                    n_jobs=-1,
+                    random_state=42,
+                )
+                xgb.fit(features_train, labels_train)
+                xgb_pred = xgb.predict(features_test)
+                model_metrics["xgb"] = self._build_metrics(
+                    labels_test, xgb_pred, mean_absolute_error
+                )
+                predictors["xgb"] = xgb
+            except Exception:
+                model_metrics["xgb"] = {"mae": None, "pearson": None, "spearman": None}
+
+        model_choice = "xgb" if model in {"xgb", "both"} and "xgb" in predictors else "linear"
+        return model_choice, predictors
+
+    def _build_metrics(
+        self,
+        y_true: list[float],
+        y_pred: list[float],
+        mae_func,
+    ) -> dict[str, float | None]:
+        mae = float(mae_func(y_true, y_pred)) if y_true else None
+        pearson = self._safe_corr(y_true, y_pred, method="pearson")
+        spearman = self._safe_corr(y_true, y_pred, method="spearman")
+        return {"mae": mae, "pearson": pearson, "spearman": spearman}
+
+    def _predict_or_fallback(
+        self,
+        predictor: Any | None,
+        features: list[float],
+        labels: dict[str, float],
+    ) -> float | None:
+        if predictor is not None:
+            prediction = predictor.predict([features])[0]
+            return self._clip_score(float(prediction))
+        fallback = self._fallback_mean(labels)
+        if fallback is None:
+            return None
+        return self._clip_score(fallback)
+
+    def _fallback_mean(self, labels: dict[str, float]) -> float | None:
+        if not labels:
+            return None
+        return sum(labels.values()) / len(labels)
+
+    def _build_summary(
+        self, run: EvaluationRun, feedback_index: dict[str, SatisfactionFeedback]
+    ) -> CalibrationSummary:
+        scores: list[float] = []
+        thumbs: list[str] = []
+        for result in run.results:
+            feedback = feedback_index.get(result.test_case_id)
+            if feedback is None:
+                continue
+            if feedback.satisfaction_score is not None:
+                scores.append(feedback.satisfaction_score)
+            if feedback.thumb_feedback in {"up", "down"}:
+                thumbs.append(feedback.thumb_feedback)
+        avg_score = sum(scores) / len(scores) if scores else None
+        thumb_up_rate = None
+        if thumbs:
+            thumb_up_rate = thumbs.count("up") / len(thumbs)
+        return CalibrationSummary(
+            avg_satisfaction_score=avg_score,
+            thumb_up_rate=thumb_up_rate,
+            imputed_ratio=None,
+        )
+
+    def _metric_score(self, result, name: str) -> float:
+        metric = result.get_metric(name)
+        if metric and metric.score is not None:
+            return float(metric.score)
+        return 0.0
+
+    def _answer_length(self, answer: str | None) -> float:
+        tokens = self._tokenize(answer or "")
+        return float(len(tokens))
+
+    def _keyword_missing_rate(
+        self,
+        question: str | None,
+        answer: str | None,
+        contexts: list[str] | None,
+    ) -> float:
+        question_tokens = set(self._tokenize(question or ""))
+        if not question_tokens:
+            return 0.0
+        combined = " ".join([answer or "", *(contexts or [])])
+        combined_tokens = set(self._tokenize(combined))
+        missing = [token for token in question_tokens if token not in combined_tokens]
+        return len(missing) / len(question_tokens)
+
+    def _ttr(self, answer: str | None) -> float:
+        tokens = self._tokenize(answer or "")
+        if not tokens:
+            return 0.0
+        return len(set(tokens)) / len(tokens)
+
+    def _tokenize(self, text: str) -> list[str]:
+        series = pd.Series([text])
+        tokens = series.str.findall(r"[가-힣a-zA-Z0-9]{2,}").iloc[0]
+        return [token.lower() for token in tokens]
+
+    def _clip_score(self, score: float) -> float:
+        return max(1.0, min(5.0, score))
+
+    def _safe_corr(self, y_true: list[float], y_pred: list[float], *, method: str) -> float | None:
+        if len(y_true) < 2 or len(y_pred) < 2:
+            return None
+        series_a = pd.Series(y_true)
+        series_b = pd.Series(y_pred)
+        if method == "spearman":
+            series_a = series_a.rank()
+            series_b = series_b.rank()
+        try:
+            corr = series_a.corr(series_b)
+            return float(corr) if corr is not None else None
+        except Exception:
+            return None
@@ -4,9 +4,11 @@ from typing import Any, Protocol
 
 from evalvault.domain.entities import (
     EvaluationRun,
+    FeedbackSummary,
     PromptSetBundle,
     RunClusterMap,
     RunClusterMapInfo,
+    SatisfactionFeedback,
 )
 from evalvault.domain.entities.experiment import Experiment
 
@@ -76,6 +78,8 @@ class StoragePort(Protocol):
         """
         ...
 
+    def update_run_metadata(self, run_id: str, metadata: dict[str, Any]) -> None: ...
+
     def save_run_cluster_map(
         self,
         run_id: str,
@@ -99,6 +103,12 @@ class StoragePort(Protocol):
         """Delete cluster maps for a run."""
         ...
 
+    def save_feedback(self, feedback: SatisfactionFeedback) -> str: ...
+
+    def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]: ...
+
+    def get_feedback_summary(self, run_id: str) -> FeedbackSummary: ...
+
     # Experiment-related methods
 
     def save_experiment(self, experiment: Experiment) -> str:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.61.0
+Version: 1.62.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -46,6 +46,7 @@ Requires-Dist: uvicorn>=0.40.0
 Requires-Dist: xlrd
 Provides-Extra: analysis
 Requires-Dist: scikit-learn>=1.3.0; extra == 'analysis'
+Requires-Dist: xgboost>=2.0.0; extra == 'analysis'
 Provides-Extra: anthropic
 Requires-Dist: anthropic; extra == 'anthropic'
 Requires-Dist: langchain-anthropic; extra == 'anthropic'
@@ -86,6 +87,7 @@ Requires-Dist: rank-bm25>=0.2.2; extra == 'dev'
 Requires-Dist: ruff; extra == 'dev'
 Requires-Dist: scikit-learn<1.4.0,>=1.3.0; extra == 'dev'
 Requires-Dist: sentence-transformers>=5.2.0; extra == 'dev'
+Requires-Dist: xgboost>=2.0.0; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs-material>=9.5.0; extra == 'docs'
 Requires-Dist: mkdocs>=1.5.0; extra == 'docs'
@@ -5,7 +5,7 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
 evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalvault/adapters/inbound/__init__.py,sha256=SG1svel1PwqetnqVpKFLSv612_WwGwLTbFpYgwk6FMw,166
 evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
-evalvault/adapters/inbound/api/adapter.py,sha256=6L95Csns-ac_9Q1rbVjYA8G7mu0wb981G5lsbvcqzcI,59820
+evalvault/adapters/inbound/api/adapter.py,sha256=_giGdt-grmd6bkWMhRb3KdloxI_2jUMknProC76KqWY,61140
 evalvault/adapters/inbound/api/main.py,sha256=KdlAxKn0QfGI3UuoTrBDBbUs2xCvP8lnWOY1ce3svcU,2619
 evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
 evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
@@ -13,14 +13,15 @@ evalvault/adapters/inbound/api/routers/config.py,sha256=CN-FH2cn0Ive-BD3WacWY6PF
 evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
 evalvault/adapters/inbound/api/routers/knowledge.py,sha256=7mgyoUM1PepFb4X8_Ntn0vd7ZZYcNbM3_9nyD10g4Aw,5307
 evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPee4OMwfxVdvRWHJ2_qYBF0,17175
-evalvault/adapters/inbound/api/routers/runs.py,sha256=Xn0Tj6sbxijdG9-x7rXFiLvKOAzdJ18QSZR0j5VEMYQ,33561
+evalvault/adapters/inbound/api/routers/runs.py,sha256=KyIar-5RJemO7i3dvRLM1IeKWVF57tZXrrixKpGOg7M,38029
 evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
 evalvault/adapters/inbound/cli/app.py,sha256=ytNgHRg9ZTAl33AkB1wIL8RKfQ_Cf8fsy0gSsLTs7Ew,1603
-evalvault/adapters/inbound/cli/commands/__init__.py,sha256=ciIHbHgP0gtasVi4l5cHjVojERrb-uipga_E0EwCrqM,3431
+evalvault/adapters/inbound/cli/commands/__init__.py,sha256=cNPPhsudTQWdlh_OJm9mU8LGBnJLGMswJBcIV9MAlkI,3530
 evalvault/adapters/inbound/cli/commands/agent.py,sha256=YlOYMEzzS1aSKDKD_a7UK3St18X6GXGkdTatrzyd8Zc,7555
 evalvault/adapters/inbound/cli/commands/analyze.py,sha256=aMi1BEDOX3yhN-ppBftDssPQLB5TdzIfpx9U7CZEgWo,48932
 evalvault/adapters/inbound/cli/commands/api.py,sha256=YdbJ_-QEajnFcjTa7P2heLMjFKpeQ4nWP_p-HvfYkEo,1943
 evalvault/adapters/inbound/cli/commands/benchmark.py,sha256=RZ4nRTF7d6hDZug-Pw8dGcFEyWdOKclwqkvS-gN4VWo,41097
+evalvault/adapters/inbound/cli/commands/calibrate.py,sha256=-UnT0LQH40U5lzMLqMJ7DOTLa3mt5P_fJL2XzqIkvu4,4223
 evalvault/adapters/inbound/cli/commands/config.py,sha256=Mv9IQHBFHZ3I2stUzHDgLDn-Znt_Awdy3j-sk5ruUmw,6069
 evalvault/adapters/inbound/cli/commands/debug.py,sha256=KU-hL1gLhpjV2ZybDQgGMwRfm-hCynkrqY4UzETfL9k,2234
 evalvault/adapters/inbound/cli/commands/domain.py,sha256=dL9iqBlnr5mDeS1unXW6uxE0qp6yfnxj-ls6k3EenwI,27279
@@ -74,8 +75,8 @@ evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIF
 evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
 evalvault/adapters/outbound/analysis/morpheme_quality_checker_module.py,sha256=_uRKDXdwGbfYduf_3XT77vF8X3-_zW3stHYc3HKYQTE,2216
 evalvault/adapters/outbound/analysis/network_analyzer_module.py,sha256=ITUVnt_CI5pHy5SAESBSi004yMtiAhGFsbhC61VTezk,8475
-evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=U7verYM4XTSPLTlb2z0b7yYzTP4kkNFl5LQ91XhXu_A,27432
-evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=KtoMJNa4NE-91iTEpEWIid-mciaezwmhU2xlKbYl4fg,8221
+evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=aLtF_fns-7IEtitwON2EYS_lweq_IdldFsRm47alN0Q,29561
+evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=kVuG9pVMQO6OYY5zxj_w9nNQZ1-qIO0y6XcXo6lG-n0,8221
 evalvault/adapters/outbound/analysis/pattern_detector_module.py,sha256=SyCDO_VS-r-tjGh8WrW-t1GCSC9ouxirdVk4NizFPXo,1882
 evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=XvcCbKCN_otv1pGUzk0oE76RV19yFga8r6RngBvgEFo,3691
 evalvault/adapters/outbound/analysis/pipeline_helpers.py,sha256=8E8IrYI5JvRrpnjxe0DS7srbPzB0XAxxXhLLYgfwsgU,5756
@@ -164,12 +165,12 @@ evalvault/adapters/outbound/report/dashboard_generator.py,sha256=Dcu18NTK4lS8XNK
 evalvault/adapters/outbound/report/llm_report_generator.py,sha256=sp2YRCmPOhn08vb8Bq_ayo-ZjgyBBxRhzRFvzlaDhsA,24063
 evalvault/adapters/outbound/report/markdown_adapter.py,sha256=5PS72h_qe4ZtYs-umhX5TqQL2k5SuDaCUc6rRw9AKRw,16761
 evalvault/adapters/outbound/storage/__init__.py,sha256=n5R6thAPTx1leSwv6od6nBWcLWFa-UYD6cOLzN89T8I,614
-evalvault/adapters/outbound/storage/base_sql.py,sha256=pPKlT2P7fCTMxCAIi0tzYr0d5rkroPTB4dHQNDDRxsI,18167
+evalvault/adapters/outbound/storage/base_sql.py,sha256=Og-YRWHsCFQP2vnyvsgfWr4C2_ZE89ZmPXcPLiHeggU,21976
 evalvault/adapters/outbound/storage/benchmark_storage_adapter.py,sha256=Qgf9xSSIkYQRpG4uLzcUdoYO9LTQDQ4tFRkkMYer-WA,9803
-evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=ro3DrE2e4l2jimoPidcmY0xBufhqi5M6_4VF-Ta0e-I,38133
-evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=BLDQ7ynzS9Aw6NxN3efALpG3eN1ZfsmBcEeWugHwF98,7075
-evalvault/adapters/outbound/storage/schema.sql,sha256=R9Y3j76qR3_UbbeX1olhHs1hbsLjS3YfiSFyUgqTM28,10057
-evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=cldMzXW9_0jdAC0YkwhFznleKX6yF4RkFNpmYc8_lZQ,47446
+evalvault/adapters/outbound/storage/postgres_adapter.py,sha256=IaijoeCIRi7JO2d5yfgfmF-ejobOnU7Izlx332mSUP8,39020
+evalvault/adapters/outbound/storage/postgres_schema.sql,sha256=aAfgwxWEqCBGGpn_QRD_BbzXR2Q-9cd9GMsCbFeohNY,7632
+evalvault/adapters/outbound/storage/schema.sql,sha256=LknvBvNVLvkW7c_hHTLHrxSf4TZApzbRyAk1ctuROUc,10608
+evalvault/adapters/outbound/storage/sqlite_adapter.py,sha256=SKZ9IZjchi7w89WNkZ6aTelAzaV0MqUC7cexrkndTNY,48555
 evalvault/adapters/outbound/tracer/__init__.py,sha256=xrvQQuAvF_UI02mKLMV7GTrG3zn836n5zwCRrrmhq_U,1054
 evalvault/adapters/outbound/tracer/open_rag_log_handler.py,sha256=aq96FIWD-bBaSkq-bygWhQArC9LWghSwi-S03Mga0mI,2827
 evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py,sha256=P-4PN1UweITXu5uN3LJVCEL3wRwiExzhgs3y2GN78xM,4784
@@ -190,7 +191,7 @@ evalvault/config/phoenix_support.py,sha256=e6RPWd6Qb7KU6Q8pLaYTpJGWULtvEEU6B0xHW
 evalvault/config/settings.py,sha256=T92GShlYKDaVinwbsbWX2DmNfm91Cvcvh8Te8pNOTsw,12875
 evalvault/config/playbooks/improvement_playbook.yaml,sha256=9F9WVVCydFfz6zUuGYzZ4PKdW1LLtcBKVF36T7xT764,26965
 evalvault/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalvault/domain/entities/__init__.py,sha256=C63BX4ytkh0FCEfNFIy0MDY2tcYxp9G430IdyBxqqAk,2794
+evalvault/domain/entities/__init__.py,sha256=CZU7VfTq2vart_j2pLemOX_TtSKzmpzB151pW-jSinw,3097
 evalvault/domain/entities/analysis.py,sha256=gcMtumC66g-AIqb2LgfMpm5BMzwJhJkjg-zuybNoJCM,15208
 evalvault/domain/entities/analysis_pipeline.py,sha256=hD9rFHMa4rUq0InRkSKhh6HQ9ZeNYAHKADzs-kWRP04,16845
 evalvault/domain/entities/benchmark.py,sha256=CVbz_eW7Y9eM7wG7xA_xmldTIs72csdoTmu3E0NKoMU,18475
@@ -198,6 +199,7 @@ evalvault/domain/entities/benchmark_run.py,sha256=2ZJOq5Ny_pfvRKM7E4RuIKxfxvoYK-
 evalvault/domain/entities/dataset.py,sha256=WsC_5ivGluy-o2nXxLGmoC8DYl5UafVSo2hSowb3rvs,1886
 evalvault/domain/entities/debug.py,sha256=r92lgvOpq2svw70syJIo78muRAvrSn5h1JByH_Hvz-s,1493
 evalvault/domain/entities/experiment.py,sha256=oWjbu0IJZ6oIRcnA-8ppeJDgp57Tv8ZjQ3UOZ0X9KJ8,2576
+evalvault/domain/entities/feedback.py,sha256=xiaZaUQhyuxyW_i2scXt8eKZshMC6tXe3981e-uukw8,1604
 evalvault/domain/entities/improvement.py,sha256=WHI7q1jXRxkuHhBWOrpk8UdLaH0UwjZVjRIDsqVDyZo,19322
 evalvault/domain/entities/kg.py,sha256=8awN1M4vxAGQZk_ZG8i2CXKTizQ8FA1VCLhUWHZq0U8,3001
 evalvault/domain/entities/memory.py,sha256=bfS75q8K8_jNrB7IYh4mjP8Lkyj-I0TVsmHCP0FuICw,8423
@@ -253,6 +255,7 @@ evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9Y
 evalvault/domain/services/ragas_prompt_overrides.py,sha256=4BecYE2KrreUBbIM3ssP9WzHcK_wRc8jW7CE_k58QOU,1412
 evalvault/domain/services/retrieval_metrics.py,sha256=dtrQPLMrXSyWLcgF8EGcLNFwzwA59WDzEh41JRToHAY,2980
 evalvault/domain/services/retriever_context.py,sha256=ySQ-GuadiggS0LVAib4AxA_0JpasYz4S9hbjau0eyIA,6482
+evalvault/domain/services/satisfaction_calibration_service.py,sha256=H7Z8opOyPHRO5qVIw-XDsNhIwdCteAS9_a3BTlfIqHg,11906
 evalvault/domain/services/stage_event_builder.py,sha256=ScTgyeRiH7z_rnNI_2p-i9szVRIRwUxGSJvpEj3zto4,9645
 evalvault/domain/services/stage_metric_guide_service.py,sha256=_JdRsBRWirO24qYFlh6hG-dkoWlX6_XWEYKf_uUlKIQ,8807
 evalvault/domain/services/stage_metric_service.py,sha256=KukIWWhWVOtclrET6uyWJ17jG76LfkKiqrUrDIDJ3gw,15327
@@ -286,15 +289,15 @@ evalvault/ports/outbound/nlp_analysis_port.py,sha256=QDJHAsSpynTenuaKp78t1s--U03
 evalvault/ports/outbound/relation_augmenter_port.py,sha256=cMcHQnmK111WzZr50vYr7affeHhOtpFZxPARwkg9xbk,651
 evalvault/ports/outbound/report_port.py,sha256=wgReSYL4SupXIoALFh0QFWfX2kzPftXpWTvGLCMd2B8,1315
 evalvault/ports/outbound/stage_storage_port.py,sha256=Nlf9upsXxgCABQB5cJdpLQYsoZNiGRAU5zE5D-Ptp2I,1201
-evalvault/ports/outbound/storage_port.py,sha256=V1ZvV_M3ztQtAHabRn2dfcCLpgRofWB_CUp9pAuocHU,4505
+evalvault/ports/outbound/storage_port.py,sha256=d9f8bvAtPA2aytKrHvrfrWGOmaQSepLn23Bd_52QSbI,4862
 evalvault/ports/outbound/tracer_port.py,sha256=kTqJCUIJHnvvDzMxxGhHSfiz8_Q4CZ0WSPvIUVVOcyw,623
 evalvault/ports/outbound/tracker_port.py,sha256=05LA3AWnuE1XmGQC16Zle9i2sEV3q69Nt8ZUye_w1_Y,2532
 evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y,113
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.61.0.dist-info/METADATA,sha256=c8UQ9kRQCONSqFbgSj17CEE70xhNbFs1GAPBg1jVH9A,14058
-evalvault-1.61.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-evalvault-1.61.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
-evalvault-1.61.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
-evalvault-1.61.0.dist-info/RECORD,,
+evalvault-1.62.0.dist-info/METADATA,sha256=2Nt0heOPN0il1jF3de3EAJtq9CQawjyaa27GQAGncmk,14155
+evalvault-1.62.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.62.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.62.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.62.0.dist-info/RECORD,,