eval_ai_library-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eval-ai-library might be problematic.

Files changed (34)
  1. eval_ai_library-0.1.0.dist-info/METADATA +753 -0
  2. eval_ai_library-0.1.0.dist-info/RECORD +34 -0
  3. eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
  4. eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
  6. eval_lib/__init__.py +122 -0
  7. eval_lib/agent_metrics/__init__.py +12 -0
  8. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
  9. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
  10. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
  11. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
  12. eval_lib/datagenerator/datagenerator.py +230 -0
  13. eval_lib/datagenerator/document_loader.py +510 -0
  14. eval_lib/datagenerator/prompts.py +192 -0
  15. eval_lib/evaluate.py +335 -0
  16. eval_lib/evaluation_schema.py +63 -0
  17. eval_lib/llm_client.py +286 -0
  18. eval_lib/metric_pattern.py +229 -0
  19. eval_lib/metrics/__init__.py +25 -0
  20. eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
  21. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
  22. eval_lib/metrics/bias_metric/bias.py +114 -0
  23. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
  24. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
  25. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
  26. eval_lib/metrics/custom_metric/custom_eval.py +303 -0
  27. eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
  28. eval_lib/metrics/geval/geval.py +326 -0
  29. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
  30. eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
  31. eval_lib/price.py +37 -0
  32. eval_lib/py.typed +1 -0
  33. eval_lib/testcases_schema.py +27 -0
  34. eval_lib/utils.py +99 -0
@@ -0,0 +1,303 @@
+ # custom_eval.py
+ """
+ Custom Evaluation Metric with Chain-of-Thought and Probability-Weighted Scoring
+ Uses advanced techniques from G-Eval for improved accuracy
+ """
+ import json
+ import re
+ from typing import Dict, Any, List, Tuple
+ from collections import Counter
+ from eval_lib.metric_pattern import MetricPattern
+ from eval_lib.testcases_schema import EvalTestCase
+ from eval_lib.llm_client import chat_complete
+ from eval_lib.utils import extract_json_block
+
+
+ class CustomEvalMetric(MetricPattern):
+     name = "customEval"
+
+     def __init__(self, model: str, threshold: float, name: str, criteria: str):
+         super().__init__(model=model, threshold=threshold)
+         self.custom_name = name
+         self.criteria = criteria
+
+     # ==================== PROMPTS ====================
+
+     @staticmethod
+     def _prompt_generate_steps(criteria: str) -> str:
+         """Generate evaluation steps from criteria (Chain-of-Thought)"""
+         return f"""Given the evaluation criteria below, generate 3-5 detailed evaluation steps.
+
+ Evaluation Criteria:
+ {criteria}
+
+ Generate steps that are:
+ 1. Specific and actionable
+ 2. Logically ordered
+ 3. Lead to assigning a score from 0.0 to 1.0
+
+ **
+ Return ONLY JSON:
+ {{
+ "steps": ["Step 1: ...", "Step 2: ...", "Step 3: ..."]
+ }}
+ **
+
+ JSON:"""
+
+     @staticmethod
+     def _prompt_evaluate(criteria: str, evaluation_steps: List[str], test_case: EvalTestCase) -> str:
+         """Generate evaluation prompt with CoT steps"""
+         steps_text = "\n".join(
+             [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
+
+         parts = [
+             f"User Input:\n{test_case.input}",
+             f"Model Output:\n{test_case.actual_output}"
+         ]
+
+         if test_case.expected_output:
+             parts.append(f"Expected Output:\n{test_case.expected_output}")
+
+         if test_case.retrieval_context:
+             parts.append(f"Context:\n" +
+                          "\n".join(test_case.retrieval_context))
+
+         input_block = "\n\n".join(parts)
+
+         return f"""You are a strict evaluator. Use the criteria and evaluation steps below.
+
+ Evaluation Criteria:
+ {criteria}
+
+ Evaluation Steps:
+ {steps_text}
+
+ {input_block}
+
+ Based on the evaluation steps, assign a score from 0.0 to 1.0 (where 0.0 is worst and 1.0 is best).
+
+ **
+ Return ONLY JSON:
+ {{
+ "score": <float between 0.0 and 1.0>
+ }}
+ **
+
+ JSON:"""
+
+     @staticmethod
+     def _prompt_reason(
+         criteria: str,
+         evaluation_steps: List[str],
+         test_case: EvalTestCase,
+         score: float
+     ) -> str:
+         """Generate explanation for the score"""
+         steps_text = "\n".join(
+             [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
+
+         parts = [
+             f"User Input:\n{test_case.input}",
+             f"Model Output:\n{test_case.actual_output}"
+         ]
+
+         if test_case.expected_output:
+             parts.append(f"Expected Output:\n{test_case.expected_output}")
+
+         if test_case.retrieval_context:
+             parts.append(f"Context:\n" +
+                          "\n".join(test_case.retrieval_context))
+
+         input_block = "\n\n".join(parts)
+
+         return f"""You assigned a score of {score:.2f} (out of 1.0) for this evaluation.
+
+ Evaluation Criteria:
+ {criteria}
+
+ Evaluation Steps:
+ {steps_text}
+
+ {input_block}
+
+ Final Score: {score:.2f}/1.0
+
+ Explain why this score was assigned, referencing specific aspects from the evaluation steps.
+
+ **
+ Return ONLY JSON:
+ {{
+ "reason": "Your explanation..."
+ }}
+ **
+
+ JSON:"""
+
+     # ==================== HELPER METHODS ====================
+
+     def _extract_score_from_response(self, text: str) -> float:
+         """Extract float score from LLM response (0.0-1.0 range)"""
+         text = text.strip()
+
+         # Try JSON parsing first
+         try:
+             data = json.loads(extract_json_block(text))
+             if "score" in data:
+                 score = float(data["score"])
+                 if 0.0 <= score <= 1.0:
+                     return score
+         except:
+             pass
+
+         # Try regex patterns
+         patterns = [
+             r'"score"\s*:\s*(\d+\.?\d*)',
+             r'score[:\s]+(\d+\.?\d*)',
+             r'^\s*(\d+\.?\d*)\s*$',
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+             if match:
+                 score = float(match.group(1))
+                 if 0.0 <= score <= 1.0:
+                     return score
+
+         raise RuntimeError(f"Failed to extract score from response: {text}")
+
+     # ==================== CORE ALGORITHM ====================
+
+     async def _probability_weighted_scoring(
+         self,
+         prompt: str,
+         n_samples: int = 20,
+         temperature: float = 2.0
+     ) -> Tuple[float, List[float], float]:
+         """
+         Probability-weighted scoring: score = Σ p(si) × si
+         Samples multiple times to estimate probability distribution
+
+         Returns:
+             (final_score, sampled_scores, total_cost)
+         """
+         total_cost = 0.0
+         scores = []
+
+         # Sample n times with high temperature
+         for _ in range(n_samples):
+             text, cost = await chat_complete(
+                 self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=temperature
+             )
+             total_cost += cost or 0.0
+
+             try:
+                 score = self._extract_score_from_response(text)
+                 scores.append(score)
+             except:
+                 continue
+
+         if not scores:
+             raise RuntimeError(
+                 f"Failed to extract any valid scores from {n_samples} samples")
+
+         # Calculate probability-weighted score as mean
+         weighted_score = sum(scores) / len(scores)
+
+         return weighted_score, scores, total_cost
+
+     async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+         """
+         Evaluate using Chain-of-Thought and Probability-Weighted Scoring.
+
+         Algorithm:
+         1. Auto-generate evaluation steps from criteria (CoT)
+         2. Apply probability-weighted scoring (20 samples, temp=2.0)
+         3. Generate detailed explanation
+         4. Build comprehensive evaluation_log
+         """
+         total_cost = 0.0
+
+         # Step 1: Auto-generate evaluation steps (Chain-of-Thought from G-Eval)
+         steps_prompt = self._prompt_generate_steps(self.criteria)
+         steps_text, step_cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": steps_prompt}],
+             temperature=0.0
+         )
+         total_cost += step_cost or 0.0
+
+         try:
+             parsed_steps = json.loads(extract_json_block(steps_text))
+             evaluation_steps = parsed_steps["steps"]
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to parse evaluation steps: {e}\n{steps_text}")
+
+         # Step 2: Generate evaluation prompt with CoT
+         eval_prompt = self._prompt_evaluate(
+             self.criteria, evaluation_steps, test_case)
+
+         # Step 3: Probability-weighted scoring (20 samples from G-Eval)
+         final_score, sampled_scores, scoring_cost = await self._probability_weighted_scoring(
+             eval_prompt,
+             n_samples=20,
+             temperature=2.0
+         )
+         total_cost += scoring_cost
+
+         # Step 4: Generate explanation
+         reason_prompt = self._prompt_reason(
+             self.criteria, evaluation_steps, test_case, final_score)
+         reason_text, reason_cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": reason_prompt}],
+             temperature=0.0
+         )
+         total_cost += reason_cost or 0.0
+
+         # Parse reason
+         try:
+             reason_data = json.loads(extract_json_block(reason_text))
+             reason = reason_data.get("reason", reason_text)
+         except:
+             reason = reason_text.strip()
+
+         success = final_score >= self.threshold
+
+         # Step 5: Build comprehensive evaluation_log
+         evaluation_log = {
+             "input_question": test_case.input,
+             "actual_output": test_case.actual_output,
+             "expected_output": test_case.expected_output,
+             "retrieval_context": test_case.retrieval_context,
+             "criteria": self.criteria,
+             "comment_criteria": "Custom evaluation criteria provided by user.",
+             "evaluation_steps": evaluation_steps,
+             "comment_evaluation_steps": "Auto-generated evaluation steps using Chain-of-Thought (CoT) technique from G-Eval.",
+             "sampled_scores": sampled_scores,
+             "comment_sampled_scores": f"Individual scores from {len(sampled_scores)} samples with temperature=2.0.",
+             "score_distribution": {f"{s:.2f}": sampled_scores.count(s) for s in set(sampled_scores)},
+             "comment_score_distribution": "Frequency distribution of sampled scores for probability-weighted calculation.",
+             "final_score": round(final_score, 4),
+             "comment_final_score": "Probability-weighted score calculated as mean of sampled scores (G-Eval technique).",
+             "threshold": self.threshold,
+             "success": success,
+             "comment_success": "Whether the final score passes the custom threshold.",
+             "final_reason": reason,
+             "comment_reasoning": "LLM-generated explanation based on evaluation steps and criteria."
+         }
+
+         return {
+             "score": round(final_score, 4),
+             "success": success,
+             "reason": reason,
+             "evaluation_cost": round(total_cost, 6),
+             "evaluation_log": evaluation_log,
+         }
+
+     @property
+     def name(self):
+         return f"Custom: {self.custom_name}"
@@ -0,0 +1,140 @@
+ # faithfulness_metric.py
+ '''
+ Faithfulness Metric: Evaluates the factual consistency of a chatbot's answer
+ with respect to the retrieved context.
+ Score calculation: Softmax aggregation of verdicts on factual statements
+ '''
+ from typing import List, Dict, Tuple, Any
+ import json
+ import re
+ import numpy as np
+ from math import exp
+ from eval_lib.testcases_schema import EvalTestCase
+ from eval_lib.metric_pattern import MetricPattern
+ from eval_lib.llm_client import chat_complete
+ from eval_lib.utils import score_agg, extract_json_block
+
+ VERDICT_WEIGHTS = {
+     "fully": 1.0,
+     "mostly": 0.9,
+     "partial": 0.7,
+     "minor": 0.3,
+     "none": 0.0,
+ }
+
+
+ class FaithfulnessMetric(MetricPattern):
+     name = "faithfulnessMetric"
+
+     def __init__(
+         self,
+         model: str,
+         threshold: float = 0.7,
+         temperature: float = 0.5,
+     ):
+         super().__init__(model=model, threshold=threshold)
+         self.temperature = temperature
+
+     async def _generate_statements(self, answer: str) -> Tuple[List[str], float]:
+         prompt = (
+             "Extract standalone factual claims from the following answer.\n"
+             "Each statement must be a distinct, verifiable fact.\n\n"
+             f"Answer:\n{answer}\n\n"
+             "Return a JSON array of strings."
+         )
+         text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+         raw_json = extract_json_block(text)
+         statements = json.loads(raw_json)
+         assert isinstance(statements, list)
+         return statements, cost or 0.0
+
+     async def _generate_verdicts(self, context: str, statements: List[str]) -> Tuple[List[Dict[str, str]], float, float]:
+         prompt = (
+             "Evaluate how well each statement is supported by the context.\n\n"
+             "Levels:\n"
+             "- fully: directly supported word-for-word\n"
+             "- mostly: strongly supported but wording differs slightly\n"
+             "- partial: partially supported but with some gaps\n"
+             "- minor: tangentially related or ambiguous\n"
+             "- none: clearly unsupported or contradicted\n\n"
+             f"CONTEXT:\n{context}\n\n"
+             f"STATEMENTS (JSON array):\n{json.dumps(statements, ensure_ascii=False)}\n\n"
+             "Return only a JSON array of objects like:\n"
+             '[{"verdict": "fully|mostly|partial|minor|none", '
+             '"reason": "<brief>", '
+             '"support": "<exact context sentence(s)> or \'none\'"}]'
+         )
+         text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+         raw_json = extract_json_block(text)
+         verdicts: List[Dict[str, Any]] = json.loads(raw_json)
+
+         for v in verdicts:
+             supp = v.get("support", "").strip().lower()
+             if supp == "none" and v["verdict"] in ("fully", "mostly"):
+                 v["verdict"] = "partial"
+
+         scores = [VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts]
+         score = round(score_agg(scores, temperature=self.temperature), 4)
+         return verdicts, score, cost or 0.0
+
+     async def _summarize_reasons_via_llm(self, verdicts: List[Dict[str, str]]) -> Tuple[str, float]:
+         grouped: Dict[str, List[str]] = {}
+         for v in verdicts:
+             grouped.setdefault(v["verdict"], []).append(v["reason"])
+         bullets = []
+         for tag in ("fully", "mostly", "partial", "none"):
+             bullets.extend(f"- {r}" for r in grouped.get(tag, [])[:2])
+         prompt = (
+             "Summarize the following points from a factual consistency evaluation.\n"
+             "Give one short paragraph (1-2 sentences) that explains whether the answer "
+             "was well supported by the context, mentioning both strong and weak parts.\n\n"
+             f"{chr(10).join(bullets)}\n\n"
+             "Summary:"
+         )
+         text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+         return text.strip(), cost or 0.0
+
+     async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+         llm_cost = 0.0
+         answer = test_case.actual_output
+         context = "\n".join(test_case.retrieval_context or [])
+         question = test_case.input
+
+         # 1. Statements from answer
+         statements, cost = await self._generate_statements(answer)
+         llm_cost += cost
+
+         # 2. Verdicts against context
+         verdicts, verdict_score, cost = await self._generate_verdicts(context, statements)
+         llm_cost += cost
+
+         # 3. Reason summary
+         summary_reason, cost = await self._summarize_reasons_via_llm(verdicts)
+         llm_cost += cost
+
+         success = verdict_score >= self.threshold
+
+         evaluation_log = {
+             "input_question": question,
+             "retrieval_context": test_case.retrieval_context,
+             "answer": answer,
+             "statements": statements,
+             "comment_statements": "Factual assertions extracted from the answer.",
+             "verdicts": verdicts,
+             "comment_verdicts": "Each verdict shows how well a statement is supported by the context.",
+             "final_score": verdict_score,
+             "comment_final_score": "Final score based on faithfulness of statements.",
+             "threshold": self.threshold,
+             "success": success,
+             "comment_success": "Whether the score meets the required threshold.",
+             "final_reason": summary_reason,
+             "comment_reasoning": "Summary explanation based on all verdicts."
+         }
+
+         return {
+             "score": verdict_score,
+             "success": success,
+             "reason": summary_reason,
+             "evaluation_cost": round(llm_cost, 6),
+             "evaluation_log": evaluation_log
+         }
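
Note on score_agg: it is imported from eval_lib.utils, which is not part of this hunk, so only the module docstring ("Softmax aggregation of verdicts") hints at its behavior. Purely as an illustration of that idea, and not the library's actual implementation, a temperature-controlled softmax aggregation over the per-statement verdict weights could look like this:

from math import exp
from typing import List

def softmax_aggregate(scores: List[float], temperature: float = 0.5) -> float:
    """Illustrative (assumed) aggregation: weight each verdict score by
    softmax(-score / temperature) so weakly supported statements dominate."""
    if not scores:
        return 0.0
    weights = [exp(-s / temperature) for s in scores]
    return sum(w * s for w, s in zip(weights, scores)) / sum(weights)

# Hypothetical verdict weights: four supported statements, one unsupported.
print(round(softmax_aggregate([1.0, 1.0, 0.9, 1.0, 0.0], temperature=0.5), 4))

With these made-up inputs the aggregate comes out around 0.35, well below the plain mean of 0.78, which matches the metric's stated intent of penalizing any statement the context does not support; the real score_agg in eval_lib.utils may weight verdicts differently.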