eval-ai-library 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library might be problematic.
- eval_ai_library-0.1.0.dist-info/METADATA +753 -0
- eval_ai_library-0.1.0.dist-info/RECORD +34 -0
- eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
- eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
- eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
- eval_lib/__init__.py +122 -0
- eval_lib/agent_metrics/__init__.py +12 -0
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
- eval_lib/datagenerator/datagenerator.py +230 -0
- eval_lib/datagenerator/document_loader.py +510 -0
- eval_lib/datagenerator/prompts.py +192 -0
- eval_lib/evaluate.py +335 -0
- eval_lib/evaluation_schema.py +63 -0
- eval_lib/llm_client.py +286 -0
- eval_lib/metric_pattern.py +229 -0
- eval_lib/metrics/__init__.py +25 -0
- eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
- eval_lib/metrics/bias_metric/bias.py +114 -0
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
- eval_lib/metrics/custom_metric/custom_eval.py +303 -0
- eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
- eval_lib/metrics/geval/geval.py +326 -0
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
- eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
- eval_lib/price.py +37 -0
- eval_lib/py.typed +1 -0
- eval_lib/testcases_schema.py +27 -0
- eval_lib/utils.py +99 -0

eval_ai_library-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,34 @@
+eval_ai_library-0.1.0.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=vwyPEMhyQqJR8_usT-LzI1xjvFTBa8S16s53frz4JJM,2546
+eval_lib/evaluate.py,sha256=XwtNkEaVzbrB2m6rgRaeFcb9kgHeHjAMM-WxMGCNs8E,12516
+eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+eval_lib/llm_client.py,sha256=y-6blKiMtZcPwRGeQ-_XHMIcAtgI-JxH3sArGxYErbI,7620
+eval_lib/metric_pattern.py,sha256=7EL17zMqScjJ_NbgwmdwX_VBjrUq28SJnnBL8bfwO9Y,8151
+eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
+eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
+eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
+eval_lib/utils.py,sha256=KNqNtZ3YO09Pf7alN7sT2P81xEGqii0AiNnJyN5jTlA,2744
+eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
+eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=ensJZwq7N5DOm0VM4xmEjEsCkIxFytcf1GnoDJt8nBQ,8085
+eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=llH28rR2fasWqL-KoovsnPdqcnUHRy5tINm7Q_6dY5E,8922
+eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=g2I0yQ5oB1TJ33LH-WpY4xa2Igz8dN36AghU47pjOFY,12379
+eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=bXFDPAe3MBDxKGliSeiyVCZ6v2sYksGFUUmO_dpEI60,4081
+eval_lib/datagenerator/datagenerator.py,sha256=AClljFk-ZLKLkeaDrfRVE1-7pixiC60fwsoAXXVsXEY,8200
+eval_lib/datagenerator/document_loader.py,sha256=bx3yvpH8pLlgbwPZRSLQt0Kl-xT6eI2AqRApIPyMYbI,17554
+eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
+eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
+eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=9EKgp_4Kw01nSRL5lyHxbHSTI1mQUnFpEtOWUf51aA0,14909
+eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=fRSeJhOQAV7FJHaDbQQjTXABIbKRO47-WMKMgPqzIZ0,8340
+eval_lib/metrics/bias_metric/bias.py,sha256=HwabG0yyV4U_LXIsdviawKoqbM4QZxA3fcSnJNMpVeM,3788
+eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=YZSfMoMwoVgu_5ZSp_yxpazbXM_9d-bWi74obPIhQ68,3916
+eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=bZQE6AZpG_dnjrNjThhf9k9X0GtaVg3zPTvqQdXRjgQ,3783
+eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=aYle5W5NtcHuvumBE4k1O7eyREU-dILtoTl_fHAWdNs,6359
+eval_lib/metrics/custom_metric/custom_eval.py,sha256=8YzbS45LQ0XtwIwyxnzFP5ujYeKPG_NWdWD0TnTwm2E,9737
+eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=jHZMYflkXMEDsh8czENPisitTmaplI7Klr2PoPxE1Rw,5789
+eval_lib/metrics/geval/geval.py,sha256=MhWd4KpSIsTEvanscuLsIXOtjOGEtw_2-8ctrqabzTo,10490
+eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=TQpoiN1hG2KKw1ILpJsmoE2T8uMGhxHz3wswDaqcBPA,4156
+eval_lib/metrics/toxicity_metric/toxicity.py,sha256=Rh5UPSeAtkGopmCU4JGTAVLHbgFKrhVO83pl6vdNdTw,3882
+eval_ai_library-0.1.0.dist-info/METADATA,sha256=dJ2lV7VKlvVfmcIimgIAfgi900g21DBayknvs5Ng8yg,23834
+eval_ai_library-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.1.0.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.1.0.dist-info/RECORD,,
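
Each RECORD row follows the standard wheel convention: relative path, then sha256=<urlsafe-base64 digest with padding stripped>, then the file size in bytes; the RECORD file itself is listed with empty hash and size fields. Below is a minimal sketch of recomputing one of these rows from an unpacked wheel; the path passed to record_row is illustrative.

import base64
import hashlib
from pathlib import Path


def record_row(relpath: str) -> str:
    """Recompute a wheel RECORD row (path,sha256=<digest>,size) for a file on disk."""
    data = Path(relpath).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode()
    return f"{relpath},sha256={digest},{len(data)}"


# Run from the directory containing the unpacked wheel; this should reproduce
# the corresponding RECORD line above.
print(record_row("eval_lib/py.typed"))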

eval_ai_library-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Aleksandr Meshkov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

eval_ai_library-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+eval_lib

eval_lib/__init__.py
ADDED
@@ -0,0 +1,122 @@
+"""
+Eval AI Library - Comprehensive AI Model Evaluation Framework
+
+A powerful library for evaluating AI models with support for multiple LLM providers
+and a wide range of evaluation metrics for RAG systems and AI agents.
+"""
+
+__version__ = "0.1.0"
+__author__ = "Aleksandr Meskov"
+
+# Core evaluation functions
+from eval_lib.evaluate import evaluate, evaluate_conversations
+from eval_lib.utils import score_agg, softmax_agg
+
+# Test case schemas
+from eval_lib.testcases_schema import (
+    EvalTestCase,
+    ConversationalEvalTestCase,
+    ToolCall
+)
+
+# Evaluation schemas
+from eval_lib.evaluation_schema import (
+    MetricResult,
+    TestCaseResult,
+    ConversationalTestCaseResult
+)
+
+# Base patterns
+from eval_lib.metric_pattern import (
+    MetricPattern,
+    ConversationalMetricPattern
+)
+
+# LLM client
+from eval_lib.llm_client import (
+    chat_complete,
+    get_embeddings,
+    LLMDescriptor,
+    Provider
+)
+
+# RAG Metrics
+from eval_lib.metrics import (
+    AnswerRelevancyMetric,
+    AnswerPrecisionMetric,
+    FaithfulnessMetric,
+    ContextualRelevancyMetric,
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    BiasMetric,
+    ToxicityMetric,
+    RestrictedRefusalMetric,
+    GEval,
+    CustomEvalMetric
+)
+
+# Agent Metrics
+from eval_lib.agent_metrics import (
+    ToolCorrectnessMetric,
+    TaskSuccessRateMetric,
+    RoleAdherenceMetric,
+    KnowledgeRetentionMetric
+)
+
+# Data generator
+from eval_lib.datagenerator.datagenerator import DataGenerator
+from eval_lib.datagenerator.document_loader import DocumentLoader
+
+__all__ = [
+    # Version
+    "__version__",
+
+    # Core functions
+    "evaluate",
+    "evaluate_conversations",
+
+    # Schemas
+    "EvalTestCase",
+    "ConversationalEvalTestCase",
+    "ToolCall",
+    "MetricResult",
+    "TestCaseResult",
+    "ConversationalTestCaseResult",
+
+    # Patterns
+    "MetricPattern",
+    "ConversationalMetricPattern",
+
+    # LLM
+    "chat_complete",
+    "get_embeddings",
+    "LLMDescriptor",
+    "Provider",
+
+    # RAG Metrics
+    "AnswerRelevancyMetric",
+    "AnswerPrecisionMetric",
+    "FaithfulnessMetric",
+    "ContextualRelevancyMetric",
+    "ContextualPrecisionMetric",
+    "ContextualRecallMetric",
+    "BiasMetric",
+    "ToxicityMetric",
+    "RestrictedRefusalMetric",
+    "GEval",
+    "CustomEvalMetric",
+
+    # Agent Metrics
+    "ToolCorrectnessMetric",
+    "TaskSuccessRateMetric",
+    "RoleAdherenceMetric",
+    "KnowledgeRetentionMetric",
+
+    # Data Generation
+    "DataGenerator",
+    "DocumentLoader",
+
+    # Utils
+    "score_agg",
+    "extract_json_block",
+]
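
Taken together, __init__.py defines the package's public surface: the evaluate/evaluate_conversations entry points, test-case and result schemas, base metric patterns, the LLM client helpers, RAG and agent metric classes, and the data-generation utilities. Note that __all__ lists extract_json_block while the module only imports score_agg and softmax_agg from eval_lib.utils, so a star-import of eval_lib would raise AttributeError. Below is a hypothetical usage sketch built only from the exported names; the EvalTestCase fields, the RAG metric constructor arguments, and the evaluate() call shape are assumptions, since those modules are not included in this diff.

import asyncio

from eval_lib import AnswerRelevancyMetric, EvalTestCase, evaluate


async def main():
    # Hypothetical test case; field names are assumed, not confirmed by this diff.
    case = EvalTestCase(
        input="What year was Python first released?",
        actual_output="Python was first released in 1991.",
    )
    # Constructor arguments mirror the agent metrics shown below; assumed for RAG metrics.
    metric = AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7)
    # evaluate() is assumed here to be awaitable and to accept test cases plus metrics.
    results = await evaluate(test_cases=[case], metrics=[metric])
    print(results)


asyncio.run(main())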

eval_lib/agent_metrics/__init__.py
ADDED
@@ -0,0 +1,12 @@
+from eval_lib.agent_metrics.tools_correctness_metric.tool_correctness import ToolCorrectnessMetric
+from eval_lib.agent_metrics.task_success_metric.task_success_rate import TaskSuccessRateMetric
+from eval_lib.agent_metrics.role_adherence_metric.role_adherence import RoleAdherenceMetric
+from eval_lib.agent_metrics.knowledge_retention_metric.knowledge_retention import KnowledgeRetentionMetric
+
+
+__all__ = [
+    "ToolCorrectnessMetric",
+    "TaskSuccessRateMetric",
+    "RoleAdherenceMetric",
+    "KnowledgeRetentionMetric"
+]

eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py
ADDED
@@ -0,0 +1,231 @@
+# knowledge_retention.py
+"""
+Knowledge Retention Metric: Evaluates how well the assistant remembers and maintains
+context across a multi-turn conversation.
+
+Score calculation: Softmax aggregation of retention verdicts
+"""
+import json
+from typing import List, Dict, Any, Tuple
+from eval_lib.testcases_schema import ConversationalEvalTestCase
+from eval_lib.metric_pattern import ConversationalMetricPattern
+from eval_lib.llm_client import chat_complete
+from eval_lib.utils import score_agg, extract_json_block
+
+
+# Verdict weights for knowledge retention levels
+VERDICT_WEIGHTS = {
+    "fully": 1.0,    # Remembers every relevant fact
+    "mostly": 0.9,   # Slight omissions, no contradiction
+    "partial": 0.7,  # Several lapses but overall context kept
+    "minor": 0.3,    # One small lapse or omission
+    "none": 0.0      # Assistant contradicts or forgets previous facts
+}
+
+
+class KnowledgeRetentionMetric(ConversationalMetricPattern):
+    """
+    Evaluates how well an AI assistant retains and recalls information
+    shared across multiple conversation turns.
+    """
+
+    name = "knowledgeRetentionMetric"
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.7,
+        temperature: float = 0.5,
+    ):
+        """
+        Initialize Knowledge Retention metric.
+
+        Args:
+            model: LLM model name
+            threshold: Success threshold (0.0-1.0)
+            temperature: Score aggregation temperature for softmax
+        """
+        super().__init__(model=model, threshold=threshold)
+        self.temperature = temperature
+
+    # ==================== HELPER METHODS ====================
+
+    @staticmethod
+    def _render_dialogue(turns) -> str:
+        """Convert conversation turns into readable format"""
+        return "\n".join(
+            f"{i+1}. User: {t.input}\n Assistant: {t.actual_output}"
+            for i, t in enumerate(turns)
+        )
+
+    @staticmethod
+    def _prompt_label_help() -> str:
+        """Explanation of retention verdict levels"""
+        return """Rate knowledge retention (worst → best):
+
+none – assistant contradicts or forgets previous facts
+minor – one small lapse or omission
+partial– several lapses but overall context kept
+mostly – slight omissions, no contradiction
+fully – remembers every relevant fact"""
+
+    @staticmethod
+    def _prompt_few_shot() -> str:
+        """Few-shot examples for verdict generation"""
+        return """Example GOOD:
+Conversation:
+1. User: What year was Python created?
+Assistant: Python was first released in 1991.
+2. User: Remind me, who created it?
+Assistant: It was created by Guido van Rossum in 1991.
+Verdicts:
+[{"verdict":"fully","reason":"Assistant repeated year and author correctly"}]
+
+Example BAD:
+Conversation:
+1. User: I live in Spain.
+Assistant: Great! How's the weather?
+2. User: Remind me later that I live in Spain.
+Assistant: Sure, I'll remind you that you live in Italy.
+Verdicts:
+[{"verdict":"none","reason":"Assistant contradicted the country"}]"""
+
+    # ==================== CORE EVALUATION STEPS ====================
+
+    async def _generate_verdicts(
+        self,
+        dialogue: str
+    ) -> Tuple[List[Dict[str, str]], float, float]:
+        """
+        Generate retention verdicts for the conversation.
+
+        Args:
+            dialogue: Formatted conversation text
+
+        Returns:
+            Tuple of (verdicts_list, aggregated_score, llm_cost)
+        """
+        prompt = (
+            f"{self._prompt_label_help()}\n\n"
+            f"{self._prompt_few_shot()}\n\n"
+            "Now analyse the next conversation.\n\n"
+            f"Conversation:\n{dialogue}\n\n"
+            "Return ONE JSON array with 1 object in the form:\n"
+            "[{\"verdict\":\"fully|mostly|partial|minor|none\",\"reason\":\"…\"}]"
+        )
+
+        text, cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0
+        )
+
+        try:
+            raw_json = extract_json_block(text)
+            verdicts = json.loads(raw_json)
+
+            if not isinstance(verdicts, list):
+                raise ValueError("Expected JSON array of verdicts")
+
+            # Calculate aggregated score from verdicts
+            weights = [VERDICT_WEIGHTS.get(
+                v.get("verdict", "none"), 0.0) for v in verdicts]
+            score = round(score_agg(weights, temperature=self.temperature), 4)
+
+            return verdicts, score, cost or 0.0
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to parse verdicts: {e}\n{text}")
+
+    async def _summarize_verdicts(
+        self,
+        verdicts: List[Dict[str, str]]
+    ) -> Tuple[str, float]:
+        """
+        Generate concise summary of retention assessment.
+
+        Args:
+            verdicts: List of verdict objects with reasons
+
+        Returns:
+            Tuple of (summary_text, llm_cost)
+        """
+        bullets = "\n".join(f"- {v['reason']}" for v in verdicts)
+
+        prompt = (
+            "Write a concise (max 2 sentences) summary of the assistant's knowledge retention, "
+            "based on these points:\n\n"
+            f"{bullets}\n\n"
+            "Summary:"
+        )
+
+        text, cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0
+        )
+
+        return text.strip(), cost or 0.0
+
+    # ==================== MAIN EVALUATION ====================
+
+    async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+        """
+        Evaluate knowledge retention across conversation turns.
+
+        Steps:
+        1. Format dialogue into readable text
+        2. Generate retention verdicts (fully/mostly/partial/minor/none)
+        3. Aggregate verdicts into final score using softmax
+        4. Generate summary explanation
+        5. Build comprehensive evaluation log
+
+        Args:
+            test_case: Conversational test case with multiple turns
+
+        Returns:
+            Evaluation results with score, success, reason, cost, and detailed log
+        """
+        total_cost = 0.0
+
+        # Step 1: Format dialogue
+        dialogue_text = self._render_dialogue(test_case.turns)
+
+        # Step 2: Generate retention verdicts
+        verdicts, verdict_score, cost = await self._generate_verdicts(dialogue_text)
+        total_cost += cost
+
+        # Step 3: Generate summary explanation
+        summary, cost = await self._summarize_verdicts(verdicts)
+        total_cost += cost
+
+        # Step 4: Determine success
+        final_score = verdict_score
+        success = final_score >= self.threshold
+
+        # Step 5: Build evaluation log
+        evaluation_log = {
+            "dialogue": dialogue_text,
+            "comment_dialogue": "Full conversation text used for retention evaluation.",
+            "number_of_turns": len(test_case.turns),
+            "comment_number_of_turns": "Total conversation turns analyzed.",
+            "verdicts": verdicts,
+            "comment_verdicts": "LLM-generated verdicts assessing retention level (fully/mostly/partial/minor/none).",
+            "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
+            "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
+            "final_score": final_score,
+            "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+            "threshold": self.threshold,
+            "success": success,
+            "comment_success": "Whether the retention score meets the required threshold.",
+            "final_reason": summary,
+            "comment_reasoning": "Concise explanation of the assistant's knowledge retention performance."
+        }
+
+        return {
+            "score": final_score,
+            "success": success,
+            "reason": summary,
+            "evaluation_cost": round(total_cost, 6),
+            "evaluation_log": evaluation_log
+        }
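
KnowledgeRetentionMetric.evaluate is an async method that takes a ConversationalEvalTestCase and returns a dict with score, success, reason, evaluation_cost, and evaluation_log. A sketch of driving it directly is shown below; the ConversationalEvalTestCase and EvalTestCase constructors are assumptions inferred from _render_dialogue, which only confirms that turns exposes objects with input and actual_output attributes.

import asyncio

from eval_lib import ConversationalEvalTestCase, EvalTestCase, KnowledgeRetentionMetric


async def main():
    # Turn construction is an assumption; only .turns, .input and .actual_output
    # are confirmed by the metric code above.
    convo = ConversationalEvalTestCase(turns=[
        EvalTestCase(input="I live in Spain.",
                     actual_output="Great! How's the weather there?"),
        EvalTestCase(input="Where did I say I live?",
                     actual_output="You told me you live in Spain."),
    ])
    # model/threshold/temperature match the __init__ signature shown above.
    metric = KnowledgeRetentionMetric(model="gpt-4o-mini", threshold=0.7, temperature=0.5)
    result = await metric.evaluate(convo)
    print(result["score"], result["success"])
    print(result["reason"])


asyncio.run(main())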

eval_lib/agent_metrics/role_adherence_metric/role_adherence.py
ADDED
@@ -0,0 +1,251 @@
+# role_adherence.py
+"""
+Role Adherence Metric: Evaluates how well the AI assistant maintains its assigned
+role and character throughout a multi-turn conversation.
+
+Score calculation: Softmax aggregation of adherence verdicts
+"""
+import json
+from typing import List, Dict, Any, Tuple
+from eval_lib.testcases_schema import ConversationalEvalTestCase
+from eval_lib.metric_pattern import ConversationalMetricPattern
+from eval_lib.llm_client import chat_complete
+from eval_lib.utils import score_agg, extract_json_block
+
+
+# Verdict weights for role adherence levels
+VERDICT_WEIGHTS = {
+    "fully": 1.0,    # Perfectly maintains role throughout
+    "mostly": 0.9,   # Minor deviations but stays in character
+    "partial": 0.7,  # Some role breaks but generally consistent
+    "minor": 0.3,    # Frequently breaks character
+    "none": 0.0      # Completely ignores assigned role
+}
+
+
+class RoleAdherenceMetric(ConversationalMetricPattern):
+    """
+    Evaluates how consistently an AI assistant adheres to its assigned role
+    across multiple conversation turns.
+    """
+
+    name = "roleAdherenceMetric"
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.7,
+        temperature: float = 0.5,
+    ):
+        """
+        Initialize Role Adherence metric.
+
+        Args:
+            model: LLM model name
+            threshold: Success threshold (0.0-1.0)
+            temperature: Score aggregation temperature for softmax
+        """
+        super().__init__(model=model, threshold=threshold)
+        self.temperature = temperature
+
+    # ==================== HELPER METHODS ====================
+
+    @staticmethod
+    def _render_dialogue(turns) -> str:
+        """Convert conversation turns into readable format"""
+        return "\n".join(
+            f"{i+1}. User: {t.input}\n Assistant: {t.actual_output}"
+            for i, t in enumerate(turns)
+        )
+
+    @staticmethod
+    def _prompt_label_help() -> str:
+        """Explanation of role adherence verdict levels"""
+        return """Rate role adherence (worst → best):
+
+none – completely ignores assigned role
+minor – frequently breaks character
+partial – some role breaks but generally consistent
+mostly – minor deviations but stays in character
+fully – perfectly maintains role throughout"""
+
+    @staticmethod
+    def _prompt_few_shot() -> str:
+        """Few-shot examples for verdict generation"""
+        return """Example GOOD:
+Role: You are a professional medical advisor. Be formal and evidence-based.
+Conversation:
+1. User: What causes headaches?
+Assistant: Headaches can result from various factors including dehydration, stress, or underlying medical conditions. I recommend consulting a healthcare provider for persistent symptoms.
+Verdicts:
+[{"verdict":"fully","reason":"Maintained formal medical tone and evidence-based approach"}]
+
+Example BAD:
+Role: You are a professional financial advisor. Use formal language.
+Conversation:
+1. User: Should I invest in stocks?
+Assistant: Yo dude! Stocks are totally rad! Just YOLO into them lol 🚀
+Verdicts:
+[{"verdict":"none","reason":"Completely abandoned professional tone and formal language requirement"}]"""
+
+    # ==================== CORE EVALUATION STEPS ====================
+
+    async def _generate_verdicts(
+        self,
+        role_description: str,
+        dialogue_text: str
+    ) -> Tuple[List[Dict[str, str]], float, float]:
+        """
+        Generate role adherence verdicts for each conversation turn.
+
+        Args:
+            role_description: The assigned role/character description
+            dialogue_text: Formatted conversation text
+
+        Returns:
+            Tuple of (verdicts_list, aggregated_score, llm_cost)
+        """
+        prompt = f"""{self._prompt_label_help()}
+
+{self._prompt_few_shot()}
+
+Now evaluate the following conversation.
+
+ASSIGNED ROLE:
+{role_description}
+
+DIALOGUE:
+{dialogue_text}
+
+Task: Judge how well the assistant stays in character throughout the conversation.
+
+For each assistant reply, assign a verdict: "fully", "mostly", "partial", "minor", or "none".
+
+Return JSON array:
+[{{"verdict": "fully|mostly|partial|minor|none", "reason": "<explanation>"}}, ...]"""
+
+        text, cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0
+        )
+
+        try:
+            raw_json = extract_json_block(text)
+            verdicts = json.loads(raw_json)
+
+            if not isinstance(verdicts, list):
+                raise ValueError("Expected JSON array of verdicts")
+
+            # Calculate aggregated score from verdicts
+            weights = [VERDICT_WEIGHTS.get(
+                v.get("verdict", "none"), 0.0) for v in verdicts]
+            score = round(score_agg(weights, temperature=self.temperature), 4)
+
+            return verdicts, score, cost or 0.0
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to parse verdicts: {e}\n{text}")
+
+    async def _summarize_verdicts(
+        self,
+        verdicts: List[Dict[str, str]]
+    ) -> Tuple[str, float]:
+        """
+        Generate concise summary of role adherence assessment.
+
+        Args:
+            verdicts: List of verdict objects with reasons
+
+        Returns:
+            Tuple of (summary_text, llm_cost)
+        """
+        # Take up to 6 most relevant verdicts for summary
+        bullets = "\n".join(f"- {v['reason']}" for v in verdicts[:6])
+
+        prompt = (
+            "Write a concise (max 2 sentences) summary of how well the chatbot stayed in character, "
+            "based on these observations:\n\n"
+            f"{bullets}\n\n"
+            "Summary:"
+        )
+
+        text, cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0
+        )
+
+        return text.strip(), cost or 0.0
+
+    # ==================== MAIN EVALUATION ====================
+
+    async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+        """
+        Evaluate role adherence across conversation turns.
+
+        Steps:
+        1. Extract chatbot role from test case
+        2. Format dialogue into readable text
+        3. Generate adherence verdicts (fully/mostly/partial/minor/none)
+        4. Aggregate verdicts into final score using softmax
+        5. Generate summary explanation
+        6. Build comprehensive evaluation log
+
+        Args:
+            test_case: Conversational test case with multiple turns and chatbot role
+
+        Returns:
+            Evaluation results with score, success, reason, cost, and detailed log
+        """
+        total_cost = 0.0
+
+        # Step 1: Extract role
+        role_description = test_case.chatbot_role or "No role specified"
+
+        # Step 2: Format dialogue
+        dialogue_text = self._render_dialogue(test_case.turns)
+
+        # Step 3: Generate role adherence verdicts
+        verdicts, verdict_score, cost = await self._generate_verdicts(
+            role_description,
+            dialogue_text
+        )
+        total_cost += cost
+
+        # Step 4: Generate summary explanation
+        summary, cost = await self._summarize_verdicts(verdicts)
+        total_cost += cost
+
+        # Step 5: Determine success
+        final_score = verdict_score
+        success = final_score >= self.threshold
+
+        # Step 6: Build evaluation log
+        evaluation_log = {
+            "chatbot_role": role_description,
+            "comment_chatbot_role": "The assigned role/character the assistant should maintain.",
+            "dialogue": dialogue_text,
+            "comment_dialogue": "Full conversation text used for role adherence evaluation.",
+            "number_of_turns": len(test_case.turns),
+            "comment_number_of_turns": "Total conversation turns analyzed.",
+            "verdicts": verdicts,
+            "comment_verdicts": "LLM-generated verdicts assessing adherence level per turn (fully/mostly/partial/minor/none).",
+            "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
+            "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
+            "final_score": final_score,
+            "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+            "threshold": self.threshold,
+            "success": success,
+            "comment_success": "Whether the role adherence score meets the required threshold.",
+            "final_reason": summary,
+            "comment_reasoning": "Concise explanation of how well the assistant maintained its assigned role."
+        }
+
+        return {
+            "score": final_score,
+            "success": success,
+            "reason": summary,
+            "evaluation_cost": round(total_cost, 6),
+            "evaluation_log": evaluation_log
+        }
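
Both agent metrics reduce their per-verdict weights to a single score via score_agg(weights, temperature=self.temperature) from eval_lib.utils, which the docstrings describe as a softmax aggregation. score_agg itself is not part of this diff; the sketch below is one plausible softmax-weighted mean consistent with how it is called, shown only to illustrate the idea. In this sketch, lower temperatures pull the aggregate toward the worst verdicts.

import math
from typing import List


def softmax_weighted_mean(weights: List[float], temperature: float = 0.5) -> float:
    """Illustrative stand-in for score_agg; NOT the published implementation."""
    if not weights:
        return 0.0
    # exp(-w/T) puts more mass on the lowest (worst) verdict weights.
    attn = [math.exp(-w / temperature) for w in weights]
    total = sum(attn)
    return sum(w * a / total for w, a in zip(weights, attn))


# Two turns judged "fully" (1.0) and "minor" (0.3) under the weight tables above:
print(round(softmax_weighted_mean([1.0, 0.3], temperature=0.5), 4))  # ≈ 0.4385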