eval-ai-library 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of eval-ai-library might be problematic.

Files changed (34)
  1. eval_ai_library-0.1.0.dist-info/METADATA +753 -0
  2. eval_ai_library-0.1.0.dist-info/RECORD +34 -0
  3. eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
  4. eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
  6. eval_lib/__init__.py +122 -0
  7. eval_lib/agent_metrics/__init__.py +12 -0
  8. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
  9. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
  10. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
  11. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
  12. eval_lib/datagenerator/datagenerator.py +230 -0
  13. eval_lib/datagenerator/document_loader.py +510 -0
  14. eval_lib/datagenerator/prompts.py +192 -0
  15. eval_lib/evaluate.py +335 -0
  16. eval_lib/evaluation_schema.py +63 -0
  17. eval_lib/llm_client.py +286 -0
  18. eval_lib/metric_pattern.py +229 -0
  19. eval_lib/metrics/__init__.py +25 -0
  20. eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
  21. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
  22. eval_lib/metrics/bias_metric/bias.py +114 -0
  23. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
  24. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
  25. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
  26. eval_lib/metrics/custom_metric/custom_eval.py +303 -0
  27. eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
  28. eval_lib/metrics/geval/geval.py +326 -0
  29. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
  30. eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
  31. eval_lib/price.py +37 -0
  32. eval_lib/py.typed +1 -0
  33. eval_lib/testcases_schema.py +27 -0
  34. eval_lib/utils.py +99 -0
eval_ai_library-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,34 @@
+ eval_ai_library-0.1.0.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+ eval_lib/__init__.py,sha256=vwyPEMhyQqJR8_usT-LzI1xjvFTBa8S16s53frz4JJM,2546
+ eval_lib/evaluate.py,sha256=XwtNkEaVzbrB2m6rgRaeFcb9kgHeHjAMM-WxMGCNs8E,12516
+ eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+ eval_lib/llm_client.py,sha256=y-6blKiMtZcPwRGeQ-_XHMIcAtgI-JxH3sArGxYErbI,7620
+ eval_lib/metric_pattern.py,sha256=7EL17zMqScjJ_NbgwmdwX_VBjrUq28SJnnBL8bfwO9Y,8151
+ eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
+ eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
+ eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
+ eval_lib/utils.py,sha256=KNqNtZ3YO09Pf7alN7sT2P81xEGqii0AiNnJyN5jTlA,2744
+ eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
+ eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=ensJZwq7N5DOm0VM4xmEjEsCkIxFytcf1GnoDJt8nBQ,8085
+ eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=llH28rR2fasWqL-KoovsnPdqcnUHRy5tINm7Q_6dY5E,8922
+ eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=g2I0yQ5oB1TJ33LH-WpY4xa2Igz8dN36AghU47pjOFY,12379
+ eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=bXFDPAe3MBDxKGliSeiyVCZ6v2sYksGFUUmO_dpEI60,4081
+ eval_lib/datagenerator/datagenerator.py,sha256=AClljFk-ZLKLkeaDrfRVE1-7pixiC60fwsoAXXVsXEY,8200
+ eval_lib/datagenerator/document_loader.py,sha256=bx3yvpH8pLlgbwPZRSLQt0Kl-xT6eI2AqRApIPyMYbI,17554
+ eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
+ eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
+ eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=9EKgp_4Kw01nSRL5lyHxbHSTI1mQUnFpEtOWUf51aA0,14909
+ eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=fRSeJhOQAV7FJHaDbQQjTXABIbKRO47-WMKMgPqzIZ0,8340
+ eval_lib/metrics/bias_metric/bias.py,sha256=HwabG0yyV4U_LXIsdviawKoqbM4QZxA3fcSnJNMpVeM,3788
+ eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=YZSfMoMwoVgu_5ZSp_yxpazbXM_9d-bWi74obPIhQ68,3916
+ eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=bZQE6AZpG_dnjrNjThhf9k9X0GtaVg3zPTvqQdXRjgQ,3783
+ eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=aYle5W5NtcHuvumBE4k1O7eyREU-dILtoTl_fHAWdNs,6359
+ eval_lib/metrics/custom_metric/custom_eval.py,sha256=8YzbS45LQ0XtwIwyxnzFP5ujYeKPG_NWdWD0TnTwm2E,9737
+ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=jHZMYflkXMEDsh8czENPisitTmaplI7Klr2PoPxE1Rw,5789
+ eval_lib/metrics/geval/geval.py,sha256=MhWd4KpSIsTEvanscuLsIXOtjOGEtw_2-8ctrqabzTo,10490
+ eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=TQpoiN1hG2KKw1ILpJsmoE2T8uMGhxHz3wswDaqcBPA,4156
+ eval_lib/metrics/toxicity_metric/toxicity.py,sha256=Rh5UPSeAtkGopmCU4JGTAVLHbgFKrhVO83pl6vdNdTw,3882
+ eval_ai_library-0.1.0.dist-info/METADATA,sha256=dJ2lV7VKlvVfmcIimgIAfgi900g21DBayknvs5Ng8yg,23834
+ eval_ai_library-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ eval_ai_library-0.1.0.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+ eval_ai_library-0.1.0.dist-info/RECORD,,
eval_ai_library-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
eval_ai_library-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Aleksandr Meshkov
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
eval_ai_library-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ eval_lib
eval_lib/__init__.py ADDED
@@ -0,0 +1,122 @@
+ """
+ Eval AI Library - Comprehensive AI Model Evaluation Framework
+
+ A powerful library for evaluating AI models with support for multiple LLM providers
+ and a wide range of evaluation metrics for RAG systems and AI agents.
+ """
+
+ __version__ = "0.1.0"
+ __author__ = "Aleksandr Meskov"
+
+ # Core evaluation functions
+ from eval_lib.evaluate import evaluate, evaluate_conversations
+ from eval_lib.utils import score_agg, softmax_agg, extract_json_block
+
+ # Test case schemas
+ from eval_lib.testcases_schema import (
+     EvalTestCase,
+     ConversationalEvalTestCase,
+     ToolCall
+ )
+
+ # Evaluation schemas
+ from eval_lib.evaluation_schema import (
+     MetricResult,
+     TestCaseResult,
+     ConversationalTestCaseResult
+ )
+
+ # Base patterns
+ from eval_lib.metric_pattern import (
+     MetricPattern,
+     ConversationalMetricPattern
+ )
+
+ # LLM client
+ from eval_lib.llm_client import (
+     chat_complete,
+     get_embeddings,
+     LLMDescriptor,
+     Provider
+ )
+
+ # RAG Metrics
+ from eval_lib.metrics import (
+     AnswerRelevancyMetric,
+     AnswerPrecisionMetric,
+     FaithfulnessMetric,
+     ContextualRelevancyMetric,
+     ContextualPrecisionMetric,
+     ContextualRecallMetric,
+     BiasMetric,
+     ToxicityMetric,
+     RestrictedRefusalMetric,
+     GEval,
+     CustomEvalMetric
+ )
+
+ # Agent Metrics
+ from eval_lib.agent_metrics import (
+     ToolCorrectnessMetric,
+     TaskSuccessRateMetric,
+     RoleAdherenceMetric,
+     KnowledgeRetentionMetric
+ )
+
+ # Data generator
+ from eval_lib.datagenerator.datagenerator import DataGenerator
+ from eval_lib.datagenerator.document_loader import DocumentLoader
+
+ __all__ = [
+     # Version
+     "__version__",
+
+     # Core functions
+     "evaluate",
+     "evaluate_conversations",
+
+     # Schemas
+     "EvalTestCase",
+     "ConversationalEvalTestCase",
+     "ToolCall",
+     "MetricResult",
+     "TestCaseResult",
+     "ConversationalTestCaseResult",
+
+     # Patterns
+     "MetricPattern",
+     "ConversationalMetricPattern",
+
+     # LLM
+     "chat_complete",
+     "get_embeddings",
+     "LLMDescriptor",
+     "Provider",
+
+     # RAG Metrics
+     "AnswerRelevancyMetric",
+     "AnswerPrecisionMetric",
+     "FaithfulnessMetric",
+     "ContextualRelevancyMetric",
+     "ContextualPrecisionMetric",
+     "ContextualRecallMetric",
+     "BiasMetric",
+     "ToxicityMetric",
+     "RestrictedRefusalMetric",
+     "GEval",
+     "CustomEvalMetric",
+
+     # Agent Metrics
+     "ToolCorrectnessMetric",
+     "TaskSuccessRateMetric",
+     "RoleAdherenceMetric",
+     "KnowledgeRetentionMetric",
+
+     # Data Generation
+     "DataGenerator",
+     "DocumentLoader",
+
+     # Utils
+     "score_agg",
+     "extract_json_block",
+ ]
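
The top-level __init__.py re-exports the whole public API, so metrics, schemas, the LLM client helpers, and the data generator can all be imported straight from eval_lib. A minimal import sketch using only names exported above (the model identifier is illustrative; the constructor signature is the one shown for the agent metrics later in this diff):

    # Sketch: imports rely solely on the eval_lib exports listed above.
    # "gpt-4o-mini" is an illustrative model name, not one prescribed by the package.
    from eval_lib import KnowledgeRetentionMetric, RoleAdherenceMetric

    retention = KnowledgeRetentionMetric(model="gpt-4o-mini", threshold=0.7)
    adherence = RoleAdherenceMetric(model="gpt-4o-mini", threshold=0.7)
    print(retention.name, adherence.name)  # knowledgeRetentionMetric roleAdherenceMetric
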
eval_lib/agent_metrics/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from eval_lib.agent_metrics.tools_correctness_metric.tool_correctness import ToolCorrectnessMetric
+ from eval_lib.agent_metrics.task_success_metric.task_success_rate import TaskSuccessRateMetric
+ from eval_lib.agent_metrics.role_adherence_metric.role_adherence import RoleAdherenceMetric
+ from eval_lib.agent_metrics.knowledge_retention_metric.knowledge_retention import KnowledgeRetentionMetric
+
+
+ __all__ = [
+     "ToolCorrectnessMetric",
+     "TaskSuccessRateMetric",
+     "RoleAdherenceMetric",
+     "KnowledgeRetentionMetric"
+ ]
eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py ADDED
@@ -0,0 +1,231 @@
+ # knowledge_retention.py
+ """
+ Knowledge Retention Metric: Evaluates how well the assistant remembers and maintains
+ context across a multi-turn conversation.
+
+ Score calculation: Softmax aggregation of retention verdicts
+ """
+ import json
+ from typing import List, Dict, Any, Tuple
+ from eval_lib.testcases_schema import ConversationalEvalTestCase
+ from eval_lib.metric_pattern import ConversationalMetricPattern
+ from eval_lib.llm_client import chat_complete
+ from eval_lib.utils import score_agg, extract_json_block
+
+
+ # Verdict weights for knowledge retention levels
+ VERDICT_WEIGHTS = {
+     "fully": 1.0,    # Remembers every relevant fact
+     "mostly": 0.9,   # Slight omissions, no contradiction
+     "partial": 0.7,  # Several lapses but overall context kept
+     "minor": 0.3,    # One small lapse or omission
+     "none": 0.0      # Assistant contradicts or forgets previous facts
+ }
+
+
+ class KnowledgeRetentionMetric(ConversationalMetricPattern):
+     """
+     Evaluates how well an AI assistant retains and recalls information
+     shared across multiple conversation turns.
+     """
+
+     name = "knowledgeRetentionMetric"
+
+     def __init__(
+         self,
+         model: str,
+         threshold: float = 0.7,
+         temperature: float = 0.5,
+     ):
+         """
+         Initialize Knowledge Retention metric.
+
+         Args:
+             model: LLM model name
+             threshold: Success threshold (0.0-1.0)
+             temperature: Score aggregation temperature for softmax
+         """
+         super().__init__(model=model, threshold=threshold)
+         self.temperature = temperature
+
+     # ==================== HELPER METHODS ====================
+
+     @staticmethod
+     def _render_dialogue(turns) -> str:
+         """Convert conversation turns into readable format"""
+         return "\n".join(
+             f"{i+1}. User: {t.input}\n Assistant: {t.actual_output}"
+             for i, t in enumerate(turns)
+         )
+
+     @staticmethod
+     def _prompt_label_help() -> str:
+         """Explanation of retention verdict levels"""
+         return """Rate knowledge retention (worst → best):
+
+ none – assistant contradicts or forgets previous facts
+ minor – one small lapse or omission
+ partial – several lapses but overall context kept
+ mostly – slight omissions, no contradiction
+ fully – remembers every relevant fact"""
+
+     @staticmethod
+     def _prompt_few_shot() -> str:
+         """Few-shot examples for verdict generation"""
+         return """Example GOOD:
+ Conversation:
+ 1. User: What year was Python created?
+ Assistant: Python was first released in 1991.
+ 2. User: Remind me, who created it?
+ Assistant: It was created by Guido van Rossum in 1991.
+ Verdicts:
+ [{"verdict":"fully","reason":"Assistant repeated year and author correctly"}]
+
+ Example BAD:
+ Conversation:
+ 1. User: I live in Spain.
+ Assistant: Great! How's the weather?
+ 2. User: Remind me later that I live in Spain.
+ Assistant: Sure, I'll remind you that you live in Italy.
+ Verdicts:
+ [{"verdict":"none","reason":"Assistant contradicted the country"}]"""
+
+     # ==================== CORE EVALUATION STEPS ====================
+
+     async def _generate_verdicts(
+         self,
+         dialogue: str
+     ) -> Tuple[List[Dict[str, str]], float, float]:
+         """
+         Generate retention verdicts for the conversation.
+
+         Args:
+             dialogue: Formatted conversation text
+
+         Returns:
+             Tuple of (verdicts_list, aggregated_score, llm_cost)
+         """
+         prompt = (
+             f"{self._prompt_label_help()}\n\n"
+             f"{self._prompt_few_shot()}\n\n"
+             "Now analyse the next conversation.\n\n"
+             f"Conversation:\n{dialogue}\n\n"
+             "Return ONE JSON array with 1 object in the form:\n"
+             "[{\"verdict\":\"fully|mostly|partial|minor|none\",\"reason\":\"…\"}]"
+         )
+
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         try:
+             raw_json = extract_json_block(text)
+             verdicts = json.loads(raw_json)
+
+             if not isinstance(verdicts, list):
+                 raise ValueError("Expected JSON array of verdicts")
+
+             # Calculate aggregated score from verdicts
+             weights = [VERDICT_WEIGHTS.get(
+                 v.get("verdict", "none"), 0.0) for v in verdicts]
+             score = round(score_agg(weights, temperature=self.temperature), 4)
+
+             return verdicts, score, cost or 0.0
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to parse verdicts: {e}\n{text}")
+
+     async def _summarize_verdicts(
+         self,
+         verdicts: List[Dict[str, str]]
+     ) -> Tuple[str, float]:
+         """
+         Generate concise summary of retention assessment.
+
+         Args:
+             verdicts: List of verdict objects with reasons
+
+         Returns:
+             Tuple of (summary_text, llm_cost)
+         """
+         bullets = "\n".join(f"- {v['reason']}" for v in verdicts)
+
+         prompt = (
+             "Write a concise (max 2 sentences) summary of the assistant's knowledge retention, "
+             "based on these points:\n\n"
+             f"{bullets}\n\n"
+             "Summary:"
+         )
+
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         return text.strip(), cost or 0.0
+
+     # ==================== MAIN EVALUATION ====================
+
+     async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+         """
+         Evaluate knowledge retention across conversation turns.
+
+         Steps:
+         1. Format dialogue into readable text
+         2. Generate retention verdicts (fully/mostly/partial/minor/none)
+         3. Aggregate verdicts into final score using softmax
+         4. Generate summary explanation
+         5. Build comprehensive evaluation log
+
+         Args:
+             test_case: Conversational test case with multiple turns
+
+         Returns:
+             Evaluation results with score, success, reason, cost, and detailed log
+         """
+         total_cost = 0.0
+
+         # Step 1: Format dialogue
+         dialogue_text = self._render_dialogue(test_case.turns)
+
+         # Step 2: Generate retention verdicts
+         verdicts, verdict_score, cost = await self._generate_verdicts(dialogue_text)
+         total_cost += cost
+
+         # Step 3: Generate summary explanation
+         summary, cost = await self._summarize_verdicts(verdicts)
+         total_cost += cost
+
+         # Step 4: Determine success
+         final_score = verdict_score
+         success = final_score >= self.threshold
+
+         # Step 5: Build evaluation log
+         evaluation_log = {
+             "dialogue": dialogue_text,
+             "comment_dialogue": "Full conversation text used for retention evaluation.",
+             "number_of_turns": len(test_case.turns),
+             "comment_number_of_turns": "Total conversation turns analyzed.",
+             "verdicts": verdicts,
+             "comment_verdicts": "LLM-generated verdicts assessing retention level (fully/mostly/partial/minor/none).",
+             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
+             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
+             "final_score": final_score,
+             "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+             "threshold": self.threshold,
+             "success": success,
+             "comment_success": "Whether the retention score meets the required threshold.",
+             "final_reason": summary,
+             "comment_reasoning": "Concise explanation of the assistant's knowledge retention performance."
+         }
+
+         return {
+             "score": final_score,
+             "success": success,
+             "reason": summary,
+             "evaluation_cost": round(total_cost, 6),
+             "evaluation_log": evaluation_log
+         }
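
Both agent metrics in this diff reduce their verdicts to a single score via score_agg(weights, temperature=...) from eval_lib/utils.py, which is not reproduced in this excerpt. A plain softmax-weighted average that is consistent with how score_agg is called (and with the "softmax aggregation" wording in the docstrings) might look like the sketch below; the package's actual implementation may differ.

    # Sketch of a softmax-weighted aggregation, assumed from how
    # score_agg(weights, temperature=...) is invoked above; utils.py itself
    # is not part of this diff excerpt.
    import math
    from typing import List

    def softmax_weighted_score(weights: List[float], temperature: float = 0.5) -> float:
        if not weights:
            return 0.0
        # Lower temperature sharpens the distribution toward the strongest verdicts.
        exps = [math.exp(w / temperature) for w in weights]
        total = sum(exps)
        probs = [e / total for e in exps]
        # Weighted average of the verdict weights under the softmax distribution.
        return sum(p * w for p, w in zip(probs, weights))

    # Example: verdicts "fully", "mostly", "none" -> roughly 0.89
    print(round(softmax_weighted_score([1.0, 0.9, 0.0], temperature=0.5), 4))
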
eval_lib/agent_metrics/role_adherence_metric/role_adherence.py ADDED
@@ -0,0 +1,251 @@
+ # role_adherence.py
+ """
+ Role Adherence Metric: Evaluates how well the AI assistant maintains its assigned
+ role and character throughout a multi-turn conversation.
+
+ Score calculation: Softmax aggregation of adherence verdicts
+ """
+ import json
+ from typing import List, Dict, Any, Tuple
+ from eval_lib.testcases_schema import ConversationalEvalTestCase
+ from eval_lib.metric_pattern import ConversationalMetricPattern
+ from eval_lib.llm_client import chat_complete
+ from eval_lib.utils import score_agg, extract_json_block
+
+
+ # Verdict weights for role adherence levels
+ VERDICT_WEIGHTS = {
+     "fully": 1.0,    # Perfectly maintains role throughout
+     "mostly": 0.9,   # Minor deviations but stays in character
+     "partial": 0.7,  # Some role breaks but generally consistent
+     "minor": 0.3,    # Frequently breaks character
+     "none": 0.0      # Completely ignores assigned role
+ }
+
+
+ class RoleAdherenceMetric(ConversationalMetricPattern):
+     """
+     Evaluates how consistently an AI assistant adheres to its assigned role
+     across multiple conversation turns.
+     """
+
+     name = "roleAdherenceMetric"
+
+     def __init__(
+         self,
+         model: str,
+         threshold: float = 0.7,
+         temperature: float = 0.5,
+     ):
+         """
+         Initialize Role Adherence metric.
+
+         Args:
+             model: LLM model name
+             threshold: Success threshold (0.0-1.0)
+             temperature: Score aggregation temperature for softmax
+         """
+         super().__init__(model=model, threshold=threshold)
+         self.temperature = temperature
+
+     # ==================== HELPER METHODS ====================
+
+     @staticmethod
+     def _render_dialogue(turns) -> str:
+         """Convert conversation turns into readable format"""
+         return "\n".join(
+             f"{i+1}. User: {t.input}\n Assistant: {t.actual_output}"
+             for i, t in enumerate(turns)
+         )
+
+     @staticmethod
+     def _prompt_label_help() -> str:
+         """Explanation of role adherence verdict levels"""
+         return """Rate role adherence (worst → best):
+
+ none – completely ignores assigned role
+ minor – frequently breaks character
+ partial – some role breaks but generally consistent
+ mostly – minor deviations but stays in character
+ fully – perfectly maintains role throughout"""
+
+     @staticmethod
+     def _prompt_few_shot() -> str:
+         """Few-shot examples for verdict generation"""
+         return """Example GOOD:
+ Role: You are a professional medical advisor. Be formal and evidence-based.
+ Conversation:
+ 1. User: What causes headaches?
+ Assistant: Headaches can result from various factors including dehydration, stress, or underlying medical conditions. I recommend consulting a healthcare provider for persistent symptoms.
+ Verdicts:
+ [{"verdict":"fully","reason":"Maintained formal medical tone and evidence-based approach"}]
+
+ Example BAD:
+ Role: You are a professional financial advisor. Use formal language.
+ Conversation:
+ 1. User: Should I invest in stocks?
+ Assistant: Yo dude! Stocks are totally rad! Just YOLO into them lol 🚀
+ Verdicts:
+ [{"verdict":"none","reason":"Completely abandoned professional tone and formal language requirement"}]"""
+
+     # ==================== CORE EVALUATION STEPS ====================
+
+     async def _generate_verdicts(
+         self,
+         role_description: str,
+         dialogue_text: str
+     ) -> Tuple[List[Dict[str, str]], float, float]:
+         """
+         Generate role adherence verdicts for each conversation turn.
+
+         Args:
+             role_description: The assigned role/character description
+             dialogue_text: Formatted conversation text
+
+         Returns:
+             Tuple of (verdicts_list, aggregated_score, llm_cost)
+         """
+         prompt = f"""{self._prompt_label_help()}
+
+ {self._prompt_few_shot()}
+
+ Now evaluate the following conversation.
+
+ ASSIGNED ROLE:
+ {role_description}
+
+ DIALOGUE:
+ {dialogue_text}
+
+ Task: Judge how well the assistant stays in character throughout the conversation.
+
+ For each assistant reply, assign a verdict: "fully", "mostly", "partial", "minor", or "none".
+
+ Return JSON array:
+ [{{"verdict": "fully|mostly|partial|minor|none", "reason": "<explanation>"}}, ...]"""
+
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         try:
+             raw_json = extract_json_block(text)
+             verdicts = json.loads(raw_json)
+
+             if not isinstance(verdicts, list):
+                 raise ValueError("Expected JSON array of verdicts")
+
+             # Calculate aggregated score from verdicts
+             weights = [VERDICT_WEIGHTS.get(
+                 v.get("verdict", "none"), 0.0) for v in verdicts]
+             score = round(score_agg(weights, temperature=self.temperature), 4)
+
+             return verdicts, score, cost or 0.0
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to parse verdicts: {e}\n{text}")
+
+     async def _summarize_verdicts(
+         self,
+         verdicts: List[Dict[str, str]]
+     ) -> Tuple[str, float]:
+         """
+         Generate concise summary of role adherence assessment.
+
+         Args:
+             verdicts: List of verdict objects with reasons
+
+         Returns:
+             Tuple of (summary_text, llm_cost)
+         """
+         # Take up to the first 6 verdicts for the summary
+         bullets = "\n".join(f"- {v['reason']}" for v in verdicts[:6])
+
+         prompt = (
+             "Write a concise (max 2 sentences) summary of how well the chatbot stayed in character, "
+             "based on these observations:\n\n"
+             f"{bullets}\n\n"
+             "Summary:"
+         )
+
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         return text.strip(), cost or 0.0
+
+     # ==================== MAIN EVALUATION ====================
+
+     async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+         """
+         Evaluate role adherence across conversation turns.
+
+         Steps:
+         1. Extract chatbot role from test case
+         2. Format dialogue into readable text
+         3. Generate adherence verdicts (fully/mostly/partial/minor/none)
+         4. Aggregate verdicts into final score using softmax
+         5. Generate summary explanation
+         6. Build comprehensive evaluation log
+
+         Args:
+             test_case: Conversational test case with multiple turns and chatbot role
+
+         Returns:
+             Evaluation results with score, success, reason, cost, and detailed log
+         """
+         total_cost = 0.0
+
+         # Step 1: Extract role
+         role_description = test_case.chatbot_role or "No role specified"
+
+         # Step 2: Format dialogue
+         dialogue_text = self._render_dialogue(test_case.turns)
+
+         # Step 3: Generate role adherence verdicts
+         verdicts, verdict_score, cost = await self._generate_verdicts(
+             role_description,
+             dialogue_text
+         )
+         total_cost += cost
+
+         # Step 4: Generate summary explanation
+         summary, cost = await self._summarize_verdicts(verdicts)
+         total_cost += cost
+
+         # Step 5: Determine success
+         final_score = verdict_score
+         success = final_score >= self.threshold
+
+         # Step 6: Build evaluation log
+         evaluation_log = {
+             "chatbot_role": role_description,
+             "comment_chatbot_role": "The assigned role/character the assistant should maintain.",
+             "dialogue": dialogue_text,
+             "comment_dialogue": "Full conversation text used for role adherence evaluation.",
+             "number_of_turns": len(test_case.turns),
+             "comment_number_of_turns": "Total conversation turns analyzed.",
+             "verdicts": verdicts,
+             "comment_verdicts": "LLM-generated verdicts assessing adherence level per turn (fully/mostly/partial/minor/none).",
+             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
+             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
+             "final_score": final_score,
+             "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+             "threshold": self.threshold,
+             "success": success,
+             "comment_success": "Whether the role adherence score meets the required threshold.",
+             "final_reason": summary,
+             "comment_reasoning": "Concise explanation of how well the assistant maintained its assigned role."
+         }
+
+         return {
+             "score": final_score,
+             "success": success,
+             "reason": summary,
+             "evaluation_cost": round(total_cost, 6),
+             "evaluation_log": evaluation_log
+         }
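
End-to-end, RoleAdherenceMetric.evaluate() takes a ConversationalEvalTestCase and returns a dict with score, success, reason, evaluation_cost, and evaluation_log. A hedged usage sketch follows: the constructor arguments and the returned keys match the code above, but the ConversationalEvalTestCase fields (chatbot_role, turns) and the use of EvalTestCase for individual turns are assumptions inferred from attribute access, since testcases_schema.py is not included in this excerpt.

    # Hedged usage sketch; schema field names are assumed, not confirmed by this diff.
    import asyncio
    from eval_lib import ConversationalEvalTestCase, EvalTestCase, RoleAdherenceMetric

    async def main() -> None:
        test_case = ConversationalEvalTestCase(   # field names assumed
            chatbot_role="You are a professional financial advisor. Use formal language.",
            turns=[
                EvalTestCase(
                    input="Should I invest in stocks?",
                    actual_output="A diversified portfolio may be appropriate, depending on your risk tolerance.",
                ),
            ],
        )
        metric = RoleAdherenceMetric(model="gpt-4o-mini", threshold=0.7)
        result = await metric.evaluate(test_case)  # issues LLM calls via chat_complete
        print(result["score"], result["success"], result["reason"])

    asyncio.run(main())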