evalvault 1.68.1__py3-none-any.whl → 1.70.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py +212 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +1 -0
- evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py +62 -0
- evalvault/domain/services/pipeline_template_registry.py +21 -1
- evalvault/domain/services/retriever_context.py +116 -0
- {evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/METADATA +1 -1
- {evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/RECORD +10 -9
- {evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/WHEEL +0 -0
- {evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py

@@ -0,0 +1,212 @@
+"""
+Multi-turn evaluation summary module.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from typing import Any
+
+from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
+from evalvault.adapters.outbound.analysis.pipeline_helpers import get_upstream_output, safe_mean
+from evalvault.domain.entities import EvaluationRun
+
+
+class MultiTurnAnalyzerModule(BaseAnalysisModule):
+    """Aggregates results at the multi-turn (conversation) level."""
+
+    module_id = "multiturn_analyzer"
+    name = "Multi-turn analysis"
+    description = "Summarizes multi-turn performance based on conversation/turn metadata."
+    input_types = ["run"]
+    output_types = ["multiturn_summary", "multiturn_conversations", "multiturn_turns"]
+    requires = ["data_loader"]
+    tags = ["analysis", "multiturn"]
+
+    def execute(
+        self,
+        inputs: dict[str, Any],
+        params: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        loader_output = get_upstream_output(inputs, "load_data", "data_loader") or {}
+        run = loader_output.get("run")
+        if not isinstance(run, EvaluationRun):
+            return {
+                "available": False,
+                "summary": {},
+                "conversations": [],
+                "turns": [],
+                "coverage": {},
+            }
+
+        retrieval_meta = run.retrieval_metadata or {}
+        cases = run.results
+        total_cases = len(cases)
+
+        coverage = {
+            "total_cases": total_cases,
+            "has_conversation_id": 0,
+            "has_turn_index": 0,
+        }
+
+        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
+        turns: list[dict[str, Any]] = []
+
+        for result in cases:
+            case_meta = _resolve_case_metadata(retrieval_meta, result.test_case_id)
+            conversation_id = _coerce_text(case_meta.get("conversation_id"))
+            turn_index = _coerce_turn_index(case_meta.get("turn_index"))
+            turn_id = _coerce_text(case_meta.get("turn_id"))
+
+            if conversation_id:
+                coverage["has_conversation_id"] += 1
+            if turn_index is not None:
+                coverage["has_turn_index"] += 1
+
+            metrics = {
+                metric.name: metric.score for metric in result.metrics if metric.score is not None
+            }
+            avg_score = safe_mean(metrics.values()) if metrics else 0.0
+            failed_metrics = [metric.name for metric in result.metrics if not metric.passed]
+            entry = {
+                "test_case_id": result.test_case_id,
+                "conversation_id": conversation_id,
+                "turn_index": turn_index,
+                "turn_id": turn_id,
+                "avg_score": round(avg_score, 4),
+                "metrics": metrics,
+                "failed_metrics": failed_metrics,
+                "passed_all": result.all_passed,
+            }
+            turns.append(entry)
+            if conversation_id:
+                grouped[conversation_id].append(entry)
+
+        conversations: list[dict[str, Any]] = []
+        first_failure_hist: dict[str, int] = defaultdict(int)
+
+        for conversation_id, entries in grouped.items():
+            entries_sorted = _sort_turns(entries)
+            avg_scores = [item["avg_score"] for item in entries_sorted]
+            metric_scores: dict[str, list[float]] = defaultdict(list)
+            for item in entries_sorted:
+                for name, score in (item.get("metrics") or {}).items():
+                    metric_scores[name].append(float(score))
+
+            metric_means = {
+                name: round(safe_mean(values), 4) for name, values in metric_scores.items()
+            }
+            passed_all = all(item.get("passed_all") for item in entries_sorted)
+            failure_turn = _first_failure_turn(entries_sorted)
+            if failure_turn is not None:
+                first_failure_hist[str(failure_turn)] += 1
+
+            worst_turn = _select_worst_turn(entries_sorted)
+
+            conversations.append(
+                {
+                    "conversation_id": conversation_id,
+                    "turn_count": len(entries_sorted),
+                    "avg_score": round(safe_mean(avg_scores), 4),
+                    "passed_all_turns": passed_all,
+                    "first_failure_turn_index": failure_turn,
+                    "worst_turn": worst_turn,
+                    "metric_means": metric_means,
+                }
+            )
+
+        conversation_count = len(grouped)
+        turn_count = sum(len(items) for items in grouped.values())
+        summary = {
+            "conversation_count": conversation_count,
+            "turn_count": turn_count,
+            "avg_turns_per_conversation": round(
+                (turn_count / conversation_count) if conversation_count else 0.0, 3
+            ),
+            "conversation_pass_rate": round(
+                (
+                    sum(1 for item in conversations if item.get("passed_all_turns"))
+                    / conversation_count
+                )
+                if conversation_count
+                else 0.0,
+                4,
+            ),
+            "first_failure_turn_histogram": dict(first_failure_hist),
+        }
+
+        if total_cases:
+            coverage["has_conversation_id"] = round(
+                coverage["has_conversation_id"] / total_cases, 4
+            )
+            coverage["has_turn_index"] = round(coverage["has_turn_index"] / total_cases, 4)
+
+        return {
+            "available": True,
+            "summary": summary,
+            "conversations": conversations,
+            "turns": turns,
+            "coverage": coverage,
+        }
+
+
+def _resolve_case_metadata(
+    retrieval_metadata: dict[str, dict[str, Any]],
+    test_case_id: str,
+) -> dict[str, Any]:
+    meta = retrieval_metadata.get(test_case_id)
+    if isinstance(meta, dict):
+        nested = meta.get("test_case_metadata")
+        if isinstance(nested, dict):
+            merged = dict(nested)
+            merged.update({k: v for k, v in meta.items() if k != "test_case_metadata"})
+            return merged
+        return dict(meta)
+    return {}
+
+
+def _coerce_text(value: Any) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        trimmed = value.strip()
+        return trimmed or None
+    return str(value)
+
+
+def _coerce_turn_index(value: Any) -> int | None:
+    if value is None:
+        return None
+    if isinstance(value, int):
+        return value
+    if isinstance(value, float) and value.is_integer():
+        return int(value)
+    if isinstance(value, str) and value.strip().isdigit():
+        return int(value.strip())
+    return None
+
+
+def _sort_turns(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    if all(item.get("turn_index") is None for item in entries):
+        return list(entries)
+    return sorted(
+        entries, key=lambda item: (item.get("turn_index") is None, item.get("turn_index") or 0)
+    )
+
+
+def _first_failure_turn(entries: list[dict[str, Any]]) -> int | None:
+    for item in entries:
+        if not item.get("passed_all"):
+            return item.get("turn_index")
+    return None
+
+
+def _select_worst_turn(entries: list[dict[str, Any]]) -> dict[str, Any] | None:
+    if not entries:
+        return None
+    worst = min(entries, key=lambda item: item.get("avg_score", 0.0))
+    return {
+        "test_case_id": worst.get("test_case_id"),
+        "avg_score": worst.get("avg_score"),
+        "failed_metrics": worst.get("failed_metrics", []),
+    }
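The helpers at the bottom of the new module are small pure functions, so their behavior is easy to check in isolation. A minimal sketch, assuming the 1.70.0 wheel is installed (the underscore-prefixed helpers are imported purely for illustration):

```python
# Demonstrates turn-index coercion, turn ordering, and first-failure lookup
# using the helpers defined at the bottom of the new module.
from evalvault.adapters.outbound.analysis.multiturn_analyzer_module import (
    _coerce_turn_index,
    _first_failure_turn,
    _sort_turns,
)

entries = [
    {"test_case_id": "tc-3", "turn_index": _coerce_turn_index("2"), "passed_all": False},
    {"test_case_id": "tc-1", "turn_index": _coerce_turn_index(0), "passed_all": True},
    {"test_case_id": "tc-2", "turn_index": None, "passed_all": True},
]

ordered = _sort_turns(entries)
print([e["test_case_id"] for e in ordered])  # ['tc-1', 'tc-3', 'tc-2']
print(_first_failure_turn(ordered))          # 2
```

Entries without a `turn_index` sort after indexed ones because the sort key is the tuple `(turn_index is None, turn_index or 0)`; the first-failure scan then walks the ordered list and returns the `turn_index` of the first entry whose metrics did not all pass.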
evalvault/adapters/outbound/analysis/pipeline_factory.py

@@ -58,6 +58,7 @@ def build_analysis_pipeline_service(
     service.register_module(analysis_modules.DiagnosticPlaybookModule())
     service.register_module(analysis_modules.RootCauseAnalyzerModule())
     service.register_module(analysis_modules.PatternDetectorModule())
+    service.register_module(analysis_modules.MultiTurnAnalyzerModule())
     service.register_module(analysis_modules.TimeSeriesAnalyzerModule())
     service.register_module(analysis_modules.TimeSeriesAdvancedModule())
     service.register_module(analysis_modules.TrendDetectorModule())
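Registration is the only factory change; everything else the pipeline needs is advertised by the module itself. A sketch of what the registered instance exposes, assuming the wheel is installed (no-argument construction mirrors the factory call above):

```python
# Prints the wiring metadata declared on the new module's class body.
from evalvault.adapters.outbound.analysis.multiturn_analyzer_module import (
    MultiTurnAnalyzerModule,
)

module = MultiTurnAnalyzerModule()
print(module.module_id)     # multiturn_analyzer
print(module.requires)      # ['data_loader']
print(module.output_types)  # ['multiturn_summary', 'multiturn_conversations', 'multiturn_turns']
```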
evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py

@@ -37,6 +37,8 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
 
         params = params or {}
         max_cases = int(params.get("max_cases", 150))
+        max_examples = int(params.get("max_examples", 5))
+        max_graphrag_docs = int(params.get("max_graphrag_docs", 5))
 
         context_counts: list[int] = []
         context_token_counts: list[int] = []

@@ -96,6 +98,11 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
         retrieval_meta = run.retrieval_metadata or {}
         retrieval_times: list[float] = []
         retrieval_scores: list[float] = []
+        graph_nodes: list[int] = []
+        graph_edges: list[int] = []
+        subgraph_sizes: list[int] = []
+        graphrag_cases = 0
+        graphrag_examples: list[dict[str, Any]] = []
         for item in retrieval_meta.values():
             if isinstance(item, dict):
                 if "retrieval_time_ms" in item:

@@ -103,6 +110,20 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
                 scores = item.get("scores")
                 if isinstance(scores, list) and scores:
                     retrieval_scores.append(safe_mean([float(s) for s in scores]))
+                if "graph_nodes" in item:
+                    graph_nodes.append(int(item["graph_nodes"]))
+                if "graph_edges" in item:
+                    graph_edges.append(int(item["graph_edges"]))
+                if "subgraph_size" in item:
+                    subgraph_sizes.append(int(item["subgraph_size"]))
+                if item.get("retriever") == "graphrag":
+                    graphrag_cases += 1
+                    if len(graphrag_examples) < max_examples:
+                        graphrag_details = item.get("graphrag")
+                        if isinstance(graphrag_details, dict):
+                            graphrag_examples.append(
+                                _trim_graphrag_example(graphrag_details, max_docs=max_graphrag_docs)
+                            )
 
         summary = {
             "total_cases": total_cases,

@@ -122,6 +143,15 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
             summary["avg_retrieval_time_ms"] = round(safe_mean(retrieval_times), 2)
         if retrieval_scores:
             summary["avg_retrieval_score"] = round(safe_mean(retrieval_scores), 4)
+        if graph_nodes:
+            summary["avg_graph_nodes"] = round(safe_mean(graph_nodes), 2)
+        if graph_edges:
+            summary["avg_graph_edges"] = round(safe_mean(graph_edges), 2)
+        if subgraph_sizes:
+            summary["avg_subgraph_size"] = round(safe_mean(subgraph_sizes), 2)
+        if total_cases:
+            summary["graphrag_case_rate"] = round(graphrag_cases / total_cases, 4)
+            summary["graphrag_case_count"] = graphrag_cases
 
         insights = []
         if summary["avg_contexts"] < 1:

@@ -138,6 +168,38 @@ class RetrievalAnalyzerModule(BaseAnalysisModule):
                 "context_token_counts": context_token_counts[:100],
                 "keyword_overlap_scores": keyword_overlap_scores[:100],
                 "faithfulness_scores": faithfulness_scores[:100],
+                "graph_nodes": graph_nodes[:100],
+                "graph_edges": graph_edges[:100],
+                "subgraph_sizes": subgraph_sizes[:100],
+                "graphrag_examples": graphrag_examples,
             },
             "insights": insights,
         }
+
+
+def _trim_graphrag_example(payload: dict[str, Any], *, max_docs: int) -> dict[str, Any]:
+    docs = payload.get("docs")
+    if isinstance(docs, list):
+        trimmed_docs = []
+        for entry in docs[:max_docs]:
+            if not isinstance(entry, dict):
+                continue
+            trimmed_docs.append(_trim_graphrag_doc(entry))
+        docs = trimmed_docs
+    else:
+        docs = []
+    return {
+        "docs": docs,
+        "max_docs": max_docs,
+    }
+
+
+def _trim_graphrag_doc(entry: dict[str, Any]) -> dict[str, Any]:
+    output: dict[str, Any] = {}
+    for key in ("doc_id", "rank", "score"):
+        if key in entry:
+            output[key] = entry[key]
+    sources = entry.get("sources")
+    if isinstance(sources, dict):
+        output["sources"] = sources
+    return output
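The trimming helpers bound the payload that ends up in the analysis output: each graphrag example keeps at most `max_docs` documents, and each document keeps only `doc_id`, `rank`, `score`, and a dict-valued `sources`. A quick sketch, assuming the wheel is installed (private helper imported for illustration only):

```python
# Shows which fields survive trimming and that extra keys are dropped.
from evalvault.adapters.outbound.analysis.retrieval_analyzer_module import (
    _trim_graphrag_example,
)

payload = {
    "docs": [
        {"doc_id": "d1", "rank": 1, "score": 0.91, "text": "full chunk text is dropped"},
        {"doc_id": "d2", "rank": 2, "score": 0.66, "sources": {"bm25": {"rank": 1}}},
        {"doc_id": "d3", "rank": 3, "score": 0.41},
    ],
    "max_entities": 20,  # keys other than "docs" are not carried over
}

print(_trim_graphrag_example(payload, max_docs=2))
# {'docs': [{'doc_id': 'd1', 'rank': 1, 'score': 0.91},
#           {'doc_id': 'd2', 'rank': 2, 'score': 0.66, 'sources': {'bm25': {'rank': 1}}}],
#  'max_docs': 2}
```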
evalvault/domain/services/pipeline_template_registry.py

@@ -658,6 +658,12 @@ class PipelineTemplateRegistry:
                 module="statistical_analyzer",
                 depends_on=["load_data"],
             ),
+            AnalysisNode(
+                id="retrieval_analysis",
+                name="Retrieval analysis",
+                module="retrieval_analyzer",
+                depends_on=["load_data"],
+            ),
             AnalysisNode(
                 id="priority_summary",
                 name="Priority summary",

@@ -669,7 +675,7 @@ class PipelineTemplateRegistry:
                 name="LLM summary report",
                 module="llm_report",
                 params={"report_type": "summary"},
-                depends_on=["load_data", "statistics"],
+                depends_on=["load_data", "statistics", "retrieval_analysis"],
             ),
         ]
         return AnalysisPipeline(

@@ -698,6 +704,12 @@ class PipelineTemplateRegistry:
                 module="ragas_evaluator",
                 depends_on=["load_data"],
             ),
+            AnalysisNode(
+                id="retrieval_analysis",
+                name="Retrieval analysis",
+                module="retrieval_analyzer",
+                depends_on=["load_data"],
+            ),
             AnalysisNode(
                 id="low_samples",
                 name="Low-performing case extraction",

@@ -710,6 +722,12 @@ class PipelineTemplateRegistry:
                 module="diagnostic_playbook",
                 depends_on=["load_data", "ragas_eval"],
             ),
+            AnalysisNode(
+                id="multiturn",
+                name="Multi-turn analysis",
+                module="multiturn_analyzer",
+                depends_on=["load_data", "ragas_eval"],
+            ),
             AnalysisNode(
                 id="nlp_analysis",
                 name="NLP analysis",

@@ -767,11 +785,13 @@ class PipelineTemplateRegistry:
                 "load_data",
                 "statistics",
                 "ragas_eval",
+                "retrieval_analysis",
                 "nlp_analysis",
                 "pattern_detection",
                 "causal_analysis",
                 "root_cause",
                 "priority_summary",
+                "multiturn",
                 "trend_detection",
             ],
         ),
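The practical effect of these new edges is scheduling: `llm_report` can no longer run until `retrieval_analysis` has produced output. A minimal sketch of that ordering, using only the node ids and `depends_on` edges from the hunks above and the standard library's `graphlib`:

```python
# Topologically orders the summary-template nodes touched by this diff.
from graphlib import TopologicalSorter

# Mapping of node id -> set of predecessor node ids, per the template diff.
deps = {
    "load_data": set(),
    "statistics": {"load_data"},
    "retrieval_analysis": {"load_data"},
    "llm_report": {"load_data", "statistics", "retrieval_analysis"},
}

print(list(TopologicalSorter(deps).static_order()))
# e.g. ['load_data', 'statistics', 'retrieval_analysis', 'llm_report']
```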
evalvault/domain/services/retriever_context.py

@@ -48,6 +48,14 @@ def apply_retriever_to_dataset(
         if scores:
             metadata["scores"] = scores
         metadata.update(_extract_graph_attributes(results))
+        graphrag_details = _build_graphrag_details(
+            results,
+            doc_ids=resolved_doc_ids,
+            max_docs=top_k,
+        )
+        if graphrag_details:
+            metadata["retriever"] = "graphrag"
+            metadata["graphrag"] = graphrag_details
         retrieval_metadata[test_case.id] = metadata
 
     return retrieval_metadata

@@ -164,6 +172,114 @@ def _compact_values(values: set[str]) -> str | list[str]:
     return sorted(values)
 
 
+def _build_graphrag_details(
+    results: Sequence[RetrieverResultProtocol],
+    *,
+    doc_ids: Sequence[str],
+    max_docs: int,
+    max_entities: int = 20,
+    max_relations: int = 20,
+) -> dict[str, Any] | None:
+    details: list[dict[str, Any]] = []
+    for rank, result in enumerate(results, start=1):
+        metadata = getattr(result, "metadata", None)
+        if not isinstance(metadata, dict):
+            continue
+
+        kg_meta = metadata.get("kg") if isinstance(metadata.get("kg"), dict) else None
+        bm25_meta = metadata.get("bm25") if isinstance(metadata.get("bm25"), dict) else None
+        dense_meta = metadata.get("dense") if isinstance(metadata.get("dense"), dict) else None
+        community_id = metadata.get("community_id")
+
+        if not (kg_meta or bm25_meta or dense_meta or community_id is not None):
+            continue
+
+        doc_id = _resolve_doc_id(result, doc_ids, rank)
+        entry: dict[str, Any] = {
+            "doc_id": doc_id,
+            "rank": rank,
+        }
+        score = _extract_score(result)
+        if score is not None:
+            entry["score"] = score
+
+        sources: dict[str, Any] = {}
+        if kg_meta:
+            sources["kg"] = {
+                "entity_score": _coerce_float_or_none(kg_meta.get("entity_score")),
+                "relation_score": _coerce_float_or_none(kg_meta.get("relation_score")),
+                "entities": _limit_strings(kg_meta.get("entities"), max_entities),
+                "relations": _limit_strings(kg_meta.get("relations"), max_relations),
+                "community_id": _coerce_text_or_list(kg_meta.get("community_id")),
+            }
+        if bm25_meta:
+            sources["bm25"] = _build_rank_score(bm25_meta)
+        if dense_meta:
+            sources["dense"] = _build_rank_score(dense_meta)
+        if community_id is not None:
+            sources["community_id"] = _coerce_text_or_list(community_id)
+        if sources:
+            entry["sources"] = sources
+
+        details.append(entry)
+        if len(details) >= max_docs:
+            break
+
+    if not details:
+        return None
+
+    return {
+        "docs": details,
+        "max_docs": max_docs,
+        "max_entities": max_entities,
+        "max_relations": max_relations,
+    }
+
+
+def _build_rank_score(payload: dict[str, Any]) -> dict[str, Any]:
+    out: dict[str, Any] = {}
+    rank = _coerce_int_optional(payload.get("rank"))
+    if rank is not None:
+        out["rank"] = rank
+    score = _coerce_float_or_none(payload.get("score"))
+    if score is not None:
+        out["score"] = score
+    return out
+
+
+def _coerce_float_or_none(value: Any) -> float | None:
+    try:
+        if value is None:
+            return None
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _coerce_int_optional(value: Any) -> int | None:
+    try:
+        if value is None:
+            return None
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _coerce_text_or_list(value: Any) -> str | list[str] | None:
+    if value is None:
+        return None
+    if isinstance(value, (list, tuple, set)):
+        return [str(item) for item in value]
+    return str(value)
+
+
+def _limit_strings(value: Any, limit: int) -> list[str]:
+    if not value:
+        return []
+    items = list(value) if isinstance(value, (list, tuple, set)) else [value]
+    return [str(item) for item in items[:limit]]
+
+
 def apply_versioned_retriever_to_dataset(
     *,
     dataset: Dataset,
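The coercion helpers are deliberately forgiving: values that cannot be parsed are dropped rather than raising, so one malformed retriever result cannot break metadata collection. A short sketch, assuming the wheel is installed (private helpers imported for illustration only):

```python
# Demonstrates lenient coercion of rank/score payloads and string lists.
from evalvault.domain.services.retriever_context import (
    _build_rank_score,
    _coerce_text_or_list,
    _limit_strings,
)

print(_build_rank_score({"rank": "3", "score": "0.72"}))  # {'rank': 3, 'score': 0.72}
print(_build_rank_score({"rank": None, "score": "n/a"}))  # {}  (unparseable values dropped)
print(_coerce_text_or_list(["c1", 7]))                    # ['c1', '7']
print(_limit_strings(("e1", "e2", "e3"), 2))              # ['e1', 'e2']
```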
{evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: evalvault
-Version: 1.68.1
+Version: 1.70.0
 Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
 Project-URL: Homepage, https://github.com/ntts9990/EvalVault
 Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
{evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/RECORD

@@ -86,15 +86,16 @@ evalvault/adapters/outbound/analysis/low_performer_extractor_module.py,sha256=Pt
 evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIFpLHixbbZcv6-j2QVgl3yaGN0vU1Q0gFc,2682
 evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
 evalvault/adapters/outbound/analysis/morpheme_quality_checker_module.py,sha256=_uRKDXdwGbfYduf_3XT77vF8X3-_zW3stHYc3HKYQTE,2216
+evalvault/adapters/outbound/analysis/multiturn_analyzer_module.py,sha256=6R_lcbJyQr5CEEI_zpDJDdw6G4n3ZnkUI0ovfUPGrtU,7557
 evalvault/adapters/outbound/analysis/network_analyzer_module.py,sha256=ITUVnt_CI5pHy5SAESBSi004yMtiAhGFsbhC61VTezk,8475
 evalvault/adapters/outbound/analysis/nlp_adapter.py,sha256=aLtF_fns-7IEtitwON2EYS_lweq_IdldFsRm47alN0Q,29561
 evalvault/adapters/outbound/analysis/nlp_analyzer_module.py,sha256=kVuG9pVMQO6OYY5zxj_w9nNQZ1-qIO0y6XcXo6lG-n0,8221
 evalvault/adapters/outbound/analysis/pattern_detector_module.py,sha256=SyCDO_VS-r-tjGh8WrW-t1GCSC9ouxirdVk4NizFPXo,1882
-evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=
+evalvault/adapters/outbound/analysis/pipeline_factory.py,sha256=Yk-VPagdAZXbbD08pCSOleg-URuVAzJks4oGl61mKAs,3763
 evalvault/adapters/outbound/analysis/pipeline_helpers.py,sha256=8E8IrYI5JvRrpnjxe0DS7srbPzB0XAxxXhLLYgfwsgU,5756
 evalvault/adapters/outbound/analysis/priority_summary_module.py,sha256=o8Y0rfHjYYE9WNTwKtpJulwfvLA3MNMhYjdSg15Vacc,10802
 evalvault/adapters/outbound/analysis/ragas_evaluator_module.py,sha256=Cd-spGn56zMcqOdoTLUHTYVOFqHqR17tPFyJs7rmnbw,7659
-evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py,sha256=
+evalvault/adapters/outbound/analysis/retrieval_analyzer_module.py,sha256=STRHWapVAEz0YbSxR3NzT6zV7wfwlPxjKZunuWpfTmE,8340
 evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py,sha256=_duIBlYhAsFygEpC7DuwoAqfTbVG2xgp70JjW1LJAGE,9312
 evalvault/adapters/outbound/analysis/retrieval_quality_checker_module.py,sha256=K1IJn4bvvz-BfqQmhd5Ik9oATjq_-G7V1AZSW8zKtSE,3121
 evalvault/adapters/outbound/analysis/root_cause_analyzer_module.py,sha256=UagHWb2d1vD7aCH0vLl3tSJx86gkkxNarrF-rwtEBhU,2811

@@ -280,7 +281,7 @@ evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SME
 evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
 evalvault/domain/services/ops_snapshot_service.py,sha256=1CqJN2p3tM6SgzLCZKcVEM213fd1cDGexTRPG_3e59w,5138
 evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
-evalvault/domain/services/pipeline_template_registry.py,sha256=
+evalvault/domain/services/pipeline_template_registry.py,sha256=k5Ce1BC3NgcYqCLiUZpXsl_6WwDHOXONoYDH7KzX2L4,28809
 evalvault/domain/services/prompt_candidate_service.py,sha256=Ibyb5EaWK28Ju2HnTqHHGOoiA9Q-VwY3hjxVODALwGY,3997
 evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
 evalvault/domain/services/prompt_registry.py,sha256=QyL4yIcKT93uv6L0-Q_iaNXno8QnsC19YcGekuSRMtE,5247

@@ -290,7 +291,7 @@ evalvault/domain/services/prompt_suggestion_reporter.py,sha256=Fc6sCPebUMk8SZVpj
 evalvault/domain/services/ragas_prompt_overrides.py,sha256=4BecYE2KrreUBbIM3ssP9WzHcK_wRc8jW7CE_k58QOU,1412
 evalvault/domain/services/regression_gate_service.py,sha256=qBMODgpizmEzqEL8_JX-FYSVyARiroMW7MFVzlz7gjc,6579
 evalvault/domain/services/retrieval_metrics.py,sha256=dtrQPLMrXSyWLcgF8EGcLNFwzwA59WDzEh41JRToHAY,2980
-evalvault/domain/services/retriever_context.py,sha256=
+evalvault/domain/services/retriever_context.py,sha256=TeJ9UgT4l3lXxOXcYMz_7PdVMlV7JsW2ewTXdv9dI2M,10185
 evalvault/domain/services/run_comparison_service.py,sha256=_NScltCRcY3zrvdyYDiPmssTxCDv1GyjCLdP3uAxJts,5631
 evalvault/domain/services/satisfaction_calibration_service.py,sha256=H7Z8opOyPHRO5qVIw-XDsNhIwdCteAS9_a3BTlfIqHg,11906
 evalvault/domain/services/stage_event_builder.py,sha256=FAT34Wmylvd2Yz5rDlhaTh1lqSCDhGApCXMi7Hjkib0,9748

@@ -339,8 +340,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
 evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
 evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
 evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
-evalvault-1.
+evalvault-1.70.0.dist-info/METADATA,sha256=Bm7z86HYTWoMfyTK9VpmxjNk-mhp2LHsIJ9Gt8s-onw,26159
+evalvault-1.70.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+evalvault-1.70.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
+evalvault-1.70.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
+evalvault-1.70.0.dist-info/RECORD,,
{evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/WHEEL: file without changes
{evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/entry_points.txt: file without changes
{evalvault-1.68.1.dist-info → evalvault-1.70.0.dist-info}/licenses/LICENSE.md: file without changes