eval_ai_library-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- eval_ai_library-0.1.0.dist-info/METADATA +753 -0
- eval_ai_library-0.1.0.dist-info/RECORD +34 -0
- eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
- eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
- eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
- eval_lib/__init__.py +122 -0
- eval_lib/agent_metrics/__init__.py +12 -0
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
- eval_lib/datagenerator/datagenerator.py +230 -0
- eval_lib/datagenerator/document_loader.py +510 -0
- eval_lib/datagenerator/prompts.py +192 -0
- eval_lib/evaluate.py +335 -0
- eval_lib/evaluation_schema.py +63 -0
- eval_lib/llm_client.py +286 -0
- eval_lib/metric_pattern.py +229 -0
- eval_lib/metrics/__init__.py +25 -0
- eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
- eval_lib/metrics/bias_metric/bias.py +114 -0
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
- eval_lib/metrics/custom_metric/custom_eval.py +303 -0
- eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
- eval_lib/metrics/geval/geval.py +326 -0
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
- eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
- eval_lib/price.py +37 -0
- eval_lib/py.typed +1 -0
- eval_lib/testcases_schema.py +27 -0
- eval_lib/utils.py +99 -0
eval_lib/utils.py
ADDED
@@ -0,0 +1,99 @@

"""
Utility functions for metrics evaluation
"""
import re
import json
from typing import List
from math import exp


"""
Utility functions for metrics evaluation
"""


def score_agg(
    scores: List[float],
    temperature: float = 0.5,
    penalty: float = 0.1
) -> float:
    """
    Compute a softmax-weighted aggregate of scores with penalty for low-scoring items.

    This function applies softmax weighting (higher scores get more weight) and then
    applies a penalty proportional to the number of low-scoring items.

    Args:
        scores: List of scores (0.0 to 1.0) to aggregate
        temperature: Controls strictness of aggregation
            - Lower (0.1-0.3): Strict - high scores dominate
            - Medium (0.4-0.6): Balanced - default behavior
            - Higher (0.8-1.5): Lenient - closer to arithmetic mean
        penalty: Penalty factor for low-scoring items (default 0.1)
            - Applied to scores <= 0.4

    Returns:
        Aggregated score between 0.0 and 1.0

    Example:
        >>> scores = [1.0, 0.9, 0.7, 0.3, 0.0]
        >>> score_agg(scores, temperature=0.5)
        0.73
    """
    if not scores:
        return 0.0

    # Compute softmax weights
    exp_scores = [exp(s / temperature) for s in scores]
    total = sum(exp_scores)
    softmax_score = sum(s * e / total for s, e in zip(scores, exp_scores))

    # Apply penalty if many statements have low scores (≤ 0.4)
    irrelevant = sum(1 for s in scores if s <= 0.4)
    penalty_factor = max(0.0, 1 - penalty * irrelevant)

    return round(softmax_score * penalty_factor, 4)


def extract_json_block(text: str) -> str:
    """
    Extract JSON from LLM response that may contain markdown code blocks.

    This function handles various formats:
    - Markdown JSON code blocks: ```json ... ```
    - Plain JSON objects/arrays
    - JSON embedded in text

    Args:
        text: Raw text from LLM that may contain JSON

    Returns:
        Extracted JSON string

    Raises:
        No exception - returns original text if no JSON found

    Example:
        >>> text = '```json\\n{"score": 0.8}\\n```'
        >>> extract_json_block(text)
        '{"score": 0.8}'
    """
    # Try to extract from markdown code blocks
    match = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Try to parse as direct JSON
    try:
        obj = json.loads(text)
        return json.dumps(obj, ensure_ascii=False)
    except Exception:
        pass

    # Try to find JSON object/array pattern
    json_match = re.search(r"({.*?}|\[.*?\])", text, re.DOTALL)
    if json_match:
        return json_match.group(1).strip()

    # Return as-is if nothing found
    return text.strip()