evalvault-1.69.0-py3-none-any.whl → evalvault-1.70.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py +212 -0
- {evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/METADATA +1 -1
- {evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/RECORD +6 -5
- {evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/WHEEL +0 -0
- {evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py
@@ -0,0 +1,212 @@
+"""
+Multi-turn evaluation summary module.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any
+
+from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
+from evalvault.adapters.outbound.analysis.pipeline_helpers import get_upstream_output, safe_mean
+from evalvault.domain.entities import EvaluationRun
+
+
+class MultiTurnAnalyzerModule(BaseAnalysisModule):
+    """Aggregates results at the multi-turn (conversation) level."""
+
+    module_id = "multiturn_analyzer"
+    name = "Multi-turn analysis"
+    description = "Summarizes multi-turn performance based on conversation/turn metadata."
+    input_types = ["run"]
+    output_types = ["multiturn_summary", "multiturn_conversations", "multiturn_turns"]
+    requires = ["data_loader"]
+    tags = ["analysis", "multiturn"]
+
+    def execute(
+        self,
+        inputs: dict[str, Any],
+        params: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        loader_output = get_upstream_output(inputs, "load_data", "data_loader") or {}
+        run = loader_output.get("run")
+        if not isinstance(run, EvaluationRun):
+            return {
+                "available": False,
+                "summary": {},
+                "conversations": [],
+                "turns": [],
+                "coverage": {},
+            }
+
+        retrieval_meta = run.retrieval_metadata or {}
+        cases = run.results
+        total_cases = len(cases)
+
+        coverage = {
+            "total_cases": total_cases,
+            "has_conversation_id": 0,
+            "has_turn_index": 0,
+        }
+
+        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
+        turns: list[dict[str, Any]] = []
+
+        for result in cases:
+            case_meta = _resolve_case_metadata(retrieval_meta, result.test_case_id)
+            conversation_id = _coerce_text(case_meta.get("conversation_id"))
+            turn_index = _coerce_turn_index(case_meta.get("turn_index"))
+            turn_id = _coerce_text(case_meta.get("turn_id"))
+
+            if conversation_id:
+                coverage["has_conversation_id"] += 1
+            if turn_index is not None:
+                coverage["has_turn_index"] += 1
+
+            metrics = {
+                metric.name: metric.score for metric in result.metrics if metric.score is not None
+            }
+            avg_score = safe_mean(metrics.values()) if metrics else 0.0
+            failed_metrics = [metric.name for metric in result.metrics if not metric.passed]
+            entry = {
+                "test_case_id": result.test_case_id,
+                "conversation_id": conversation_id,
+                "turn_index": turn_index,
+                "turn_id": turn_id,
+                "avg_score": round(avg_score, 4),
+                "metrics": metrics,
+                "failed_metrics": failed_metrics,
+                "passed_all": result.all_passed,
+            }
+            turns.append(entry)
+            if conversation_id:
+                grouped[conversation_id].append(entry)
+
+        conversations: list[dict[str, Any]] = []
+        first_failure_hist: dict[str, int] = defaultdict(int)
+
+        for conversation_id, entries in grouped.items():
+            entries_sorted = _sort_turns(entries)
+            avg_scores = [item["avg_score"] for item in entries_sorted]
+            metric_scores: dict[str, list[float]] = defaultdict(list)
+            for item in entries_sorted:
+                for name, score in (item.get("metrics") or {}).items():
+                    metric_scores[name].append(float(score))
+
+            metric_means = {
+                name: round(safe_mean(values), 4) for name, values in metric_scores.items()
+            }
+            passed_all = all(item.get("passed_all") for item in entries_sorted)
+            failure_turn = _first_failure_turn(entries_sorted)
+            if failure_turn is not None:
+                first_failure_hist[str(failure_turn)] += 1
+
+            worst_turn = _select_worst_turn(entries_sorted)
+
+            conversations.append(
+                {
+                    "conversation_id": conversation_id,
+                    "turn_count": len(entries_sorted),
+                    "avg_score": round(safe_mean(avg_scores), 4),
+                    "passed_all_turns": passed_all,
+                    "first_failure_turn_index": failure_turn,
+                    "worst_turn": worst_turn,
+                    "metric_means": metric_means,
+                }
+            )
+
+        conversation_count = len(grouped)
+        turn_count = sum(len(items) for items in grouped.values())
+        summary = {
+            "conversation_count": conversation_count,
+            "turn_count": turn_count,
+            "avg_turns_per_conversation": round(
+                (turn_count / conversation_count) if conversation_count else 0.0, 3
+            ),
+            "conversation_pass_rate": round(
+                (
+                    sum(1 for item in conversations if item.get("passed_all_turns"))
+                    / conversation_count
+                )
+                if conversation_count
+                else 0.0,
+                4,
+            ),
+            "first_failure_turn_histogram": dict(first_failure_hist),
+        }
+
+        if total_cases:
+            coverage["has_conversation_id"] = round(
+                coverage["has_conversation_id"] / total_cases, 4
+            )
+            coverage["has_turn_index"] = round(coverage["has_turn_index"] / total_cases, 4)
+
+        return {
+            "available": True,
+            "summary": summary,
+            "conversations": conversations,
+            "turns": turns,
+            "coverage": coverage,
+        }
+
+
+def _resolve_case_metadata(
+    retrieval_metadata: dict[str, dict[str, Any]],
+    test_case_id: str,
+) -> dict[str, Any]:
+    meta = retrieval_metadata.get(test_case_id)
+    if isinstance(meta, dict):
+        nested = meta.get("test_case_metadata")
+        if isinstance(nested, dict):
+            merged = dict(nested)
+            merged.update({k: v for k, v in meta.items() if k != "test_case_metadata"})
+            return merged
+        return dict(meta)
+    return {}
+
+
+def _coerce_text(value: Any) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        trimmed = value.strip()
+        return trimmed or None
+    return str(value)
+
+
+def _coerce_turn_index(value: Any) -> int | None:
+    if value is None:
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float) and value.is_integer():
+        return int(value)
+    if isinstance(value, str) and value.strip().isdigit():
+        return int(value.strip())
+    return None
+
+
+def _sort_turns(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    if all(item.get("turn_index") is None for item in entries):
+        return list(entries)
+    return sorted(
+        entries, key=lambda item: (item.get("turn_index") is None, item.get("turn_index") or 0)
+    )
+
+
+def _first_failure_turn(entries: list[dict[str, Any]]) -> int | None:
+    for item in entries:
+        if not item.get("passed_all"):
+            return item.get("turn_index")
+    return None
+
+
+def _select_worst_turn(entries: list[dict[str, Any]]) -> dict[str, Any] | None:
+    if not entries:
+        return None
+    worst = min(entries, key=lambda item: item.get("avg_score", 0.0))
+    return {
+        "test_case_id": worst.get("test_case_id"),
+        "avg_score": worst.get("avg_score"),
+        "failed_metrics": worst.get("failed_metrics", []),
+    }
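
The helpers at the bottom of the new module are pure functions over plain dicts, so their behavior can be sanity-checked in isolation. Below is a minimal sketch: the entry dicts, ids, and scores are invented for illustration, and the import path is the file added in this diff.

# Minimal sketch exercising the private helpers from the new module.
# The entry dicts mirror the per-turn shape built in execute(); all
# ids and scores here are invented for illustration.
from evalvault.adapters.outbound.analysis.multiturn_analyzer_module import (
    _coerce_turn_index,
    _first_failure_turn,
    _resolve_case_metadata,
    _select_worst_turn,
    _sort_turns,
)

# Turn indices are coerced from ints, integral floats, or digit strings.
assert _coerce_turn_index("3") == 3
assert _coerce_turn_index(2.0) == 2
assert _coerce_turn_index("n/a") is None

# Top-level metadata keys win over nested test_case_metadata keys.
meta = {"tc-1": {"conversation_id": "conv-9",
                 "test_case_metadata": {"conversation_id": "conv-1", "turn_index": 0}}}
assert _resolve_case_metadata(meta, "tc-1") == {"conversation_id": "conv-9", "turn_index": 0}

entries = [
    {"test_case_id": "tc-2", "turn_index": 1, "avg_score": 0.52,
     "failed_metrics": ["faithfulness"], "passed_all": False},
    {"test_case_id": "tc-1", "turn_index": 0, "avg_score": 0.91,
     "failed_metrics": [], "passed_all": True},
]

# Turns are ordered by turn_index (entries lacking one sort last).
ordered = _sort_turns(entries)
assert [e["turn_index"] for e in ordered] == [0, 1]

# The first turn whose metrics did not all pass is turn 1.
assert _first_failure_turn(ordered) == 1

# The worst turn is the one with the lowest avg_score.
assert _select_worst_turn(ordered) == {
    "test_case_id": "tc-2",
    "avg_score": 0.52,
    "failed_metrics": ["faithfulness"],
}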

{evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.69.0
+Version: 1.70.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme

{evalvault-1.69.0.dist-info → evalvault-1.70.0.dist-info}/RECORD
@@ -86,6 +86,7 @@ evalvault/adapters/outbound/analysis/low_performer_extractor_module.py,sha256=Pt
 evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIFpLHixbbZcv6-j2QVgl3yaGN0vU1Q0gFc,2682
 evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
 evalvault/adapters/outbound/analysis/morpheme_quality_checker_module.py,sha256=_uRKDXdwGbfYduf_3XT77vF8X3-_zW3stHYc3HKYQTE,2216
+evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py,sha256=6R_lcbJyQr5CEEI_zpDJDdw6G4n3ZnkUI0ovfUPGrtU,7557
 evalvault/adapters/outbound/analysis/network_analyzer_module.py,sha256=ITUVnt_CI5pHy5SAESBSi004yMtiAhGFsbhC61VTezk,8475
 evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=aLtF_fns-7IEtitwON2EYS_lweq_IdldFsRm47alN0Q,29561
 evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=kVuG9pVMQO6OYY5zxj_w9nNQZ1-qIO0y6XcXo6lG-n0,8221

@@ -339,8 +340,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
+evalvault-1.70.0.dist-info/METADATA,sha256=Bm7z86HYTWoMfyTK9VpmxjNk-mhp2LHsIJ9Gt8s-onw,26159
+evalvault-1.70.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.70.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.70.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.70.0.dist-info/RECORD,,