eval-ai-library 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- eval_ai_library-0.3.1.dist-info/METADATA +1042 -0
- eval_ai_library-0.3.1.dist-info/RECORD +34 -0
- eval_lib/__init__.py +19 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +9 -3
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +13 -4
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +24 -23
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
- eval_lib/datagenerator/datagenerator.py +208 -12
- eval_lib/datagenerator/document_loader.py +29 -29
- eval_lib/evaluate.py +0 -22
- eval_lib/llm_client.py +221 -78
- eval_lib/metric_pattern.py +208 -152
- eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +8 -2
- eval_lib/metrics/bias_metric/bias.py +12 -2
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +9 -2
- eval_lib/metrics/custom_metric/custom_eval.py +238 -204
- eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
- eval_lib/metrics/geval/geval.py +8 -2
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
- eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
- eval_lib/utils.py +44 -29
- eval_ai_library-0.2.2.dist-info/METADATA +0 -779
- eval_ai_library-0.2.2.dist-info/RECORD +0 -34
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/top_level.txt +0 -0
@@ -1,779 +0,0 @@
Metadata-Version: 2.4
Name: eval-ai-library
Version: 0.2.2
Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/meshkovQA/Eval-ai-library
Project-URL: Documentation, https://github.com/meshkovQA/Eval-ai-library#readme
Project-URL: Repository, https://github.com/meshkovQA/Eval-ai-library
Project-URL: Bug Tracker, https://github.com/meshkovQA/Eval-ai-library/issues
Keywords: ai,evaluation,llm,rag,metrics,testing,quality-assurance
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: openai>=1.0.0
Requires-Dist: anthropic>=0.18.0
Requires-Dist: google-genai>=0.2.0
Requires-Dist: pydantic>=2.0.0
Requires-Dist: numpy>=1.24.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: flake8>=6.0.0; extra == "dev"
Requires-Dist: mypy>=1.0.0; extra == "dev"
Requires-Dist: isort>=5.12.0; extra == "dev"
Provides-Extra: docs
Requires-Dist: sphinx>=6.0.0; extra == "docs"
Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "docs"
Provides-Extra: data-generation
Requires-Dist: langchain>=0.1.0; extra == "data-generation"
Requires-Dist: langchain-community>=0.0.10; extra == "data-generation"
Requires-Dist: langchain-core>=0.1.0; extra == "data-generation"
Requires-Dist: pypdf2>=3.0.0; extra == "data-generation"
Requires-Dist: python-docx>=0.8.11; extra == "data-generation"
Requires-Dist: openpyxl>=3.1.0; extra == "data-generation"
Requires-Dist: pillow>=10.0.0; extra == "data-generation"
Requires-Dist: pytesseract>=0.3.10; extra == "data-generation"
Provides-Extra: all
Requires-Dist: pytest>=7.0.0; extra == "all"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "all"
Requires-Dist: black>=23.0.0; extra == "all"
Requires-Dist: flake8>=6.0.0; extra == "all"
Requires-Dist: mypy>=1.0.0; extra == "all"
Requires-Dist: isort>=5.12.0; extra == "all"
Requires-Dist: sphinx>=6.0.0; extra == "all"
Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "all"
Requires-Dist: langchain>=0.1.0; extra == "all"
Requires-Dist: langchain-community>=0.0.10; extra == "all"
Requires-Dist: langchain-core>=0.1.0; extra == "all"
Requires-Dist: pypdf2>=3.0.0; extra == "all"
Requires-Dist: python-docx>=0.8.11; extra == "all"
Requires-Dist: openpyxl>=3.1.0; extra == "all"
Requires-Dist: pillow>=10.0.0; extra == "all"
Requires-Dist: pytesseract>=0.3.10; extra == "all"
Dynamic: license-file

# Eval AI Library

[](https://www.python.org/downloads/)
[](https://opensource.org/licenses/MIT)

Comprehensive AI Model Evaluation Framework with advanced techniques including **Probability-Weighted Scoring** and **Auto Chain-of-Thought**. Support for multiple LLM providers and 15+ evaluation metrics for RAG systems and AI agents.

## Features

- 🎯 **15+ Evaluation Metrics**: RAG metrics and agent-specific evaluations
- 🧠 **G-Eval Implementation**: State-of-the-art evaluation with probability-weighted scoring
- 🔗 **Chain-of-Thought**: Automatic generation of evaluation steps from criteria
- 🤖 **Multi-Provider Support**: OpenAI, Azure OpenAI, Google Gemini, Anthropic Claude, Ollama
- 📊 **RAG Metrics**: Answer relevancy, faithfulness, contextual precision/recall, and more
- 🔧 **Agent Metrics**: Tool correctness, task success rate, role adherence, knowledge retention
- 🎨 **Custom Metrics**: Advanced custom evaluation with CoT and probability weighting
- 📦 **Data Generation**: Built-in test case generator from documents
- ⚡ **Async Support**: Full async/await support for efficient evaluation
- 💰 **Cost Tracking**: Automatic cost calculation for LLM API calls
- 📝 **Detailed Logging**: Comprehensive evaluation logs for transparency

## Installation
```bash
pip install eval-ai-library
```

### Development Installation
```bash
git clone https://github.com/meshkovQA/Eval-ai-library.git
cd Eval-ai-library
pip install -e ".[dev]"
```

## Quick Start

### Basic RAG Evaluation
```python
import asyncio
from eval_lib import (
    evaluate,
    EvalTestCase,
    AnswerRelevancyMetric,
    FaithfulnessMetric
)

async def main():
    # Create test case
    test_case = EvalTestCase(
        input="What is the capital of France?",
        actual_output="The capital of France is Paris, a beautiful city known for its art and culture.",
        expected_output="Paris",
        retrieval_context=["Paris is the capital and largest city of France."]
    )

    # Define metrics
    metrics = [
        AnswerRelevancyMetric(
            model="gpt-4o-mini",
            threshold=0.7,
            temperature=0.5  # Softmax temperature for score aggregation
        ),
        FaithfulnessMetric(
            model="gpt-4o-mini",
            threshold=0.8,
            temperature=0.5
        )
    ]

    # Evaluate
    results = await evaluate(
        test_cases=[test_case],
        metrics=metrics
    )

    # Print results with detailed logs
    for _, test_results in results:
        for result in test_results:
            print(f"Success: {result.success}")
            for metric in result.metrics_data:
                print(f"{metric.name}: {metric.score:.2f}")
                print(f"Reason: {metric.reason}")
                print(f"Cost: ${metric.evaluation_cost:.6f}")
                # Access detailed evaluation log
                if hasattr(metric, 'evaluation_log'):
                    print(f"Log: {metric.evaluation_log}")

asyncio.run(main())
```

### G-Eval with Probability-Weighted Scoring

G-Eval implements the state-of-the-art evaluation method from the paper ["G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment"](https://arxiv.org/abs/2303.16634). It uses **probability-weighted scoring** (score = Σ p(sᵢ) × sᵢ) for fine-grained, continuous evaluation scores.
```python
from eval_lib import GEval, EvalTestCase

async def evaluate_with_geval():
    test_case = EvalTestCase(
        input="Explain quantum computing to a 10-year-old",
        actual_output="Quantum computers are like super-powerful regular computers that use special tiny particles to solve really hard problems much faster.",
        expected_output="A simple explanation using analogies suitable for children"
    )

    # G-Eval with auto chain-of-thought
    metric = GEval(
        model="gpt-4o",  # Works best with GPT-4
        threshold=0.7,  # Score range: 0-100
        name="Clarity & Simplicity",
        criteria="Evaluate how clear and age-appropriate the explanation is for a 10-year-old child",
        # evaluation_steps is auto-generated from criteria if not provided
        n_samples=20,  # Number of samples for probability estimation (default: 20)
        sampling_temperature=2.0  # High temperature for diverse sampling (default: 2.0)
    )

    result = await metric.evaluate(test_case)

    print(f"Score: {result['score']:.2f}/100")  # Fine-grained score like 73.45
    print(f"Success: {result['success']}")
    print(f"Reason: {result['reason']}")
    print(f"Sampled scores: {result['metadata']['sampled_scores']}")  # See all 20 samples
    print(f"Score distribution: {result['evaluation_log']['score_distribution']}")

asyncio.run(evaluate_with_geval())
```
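
For intuition, the probability-weighted score is simply the expectation of the score under the empirical distribution of the sampled judgments. The sketch below is illustrative only (the sample values and the 1-5 scale are assumed here, and this is not the library's internal code); it shows how 20 integer samples collapse into a single continuous score:
```python
from collections import Counter

# 20 hypothetical samples of the same judgment on a 1-5 scale (values assumed).
sampled_scores = [4, 5, 4, 3, 4, 4, 5, 4, 3, 4, 4, 4, 5, 4, 3, 4, 4, 5, 4, 4]

# Empirical probability of each score value: p(si) = count(si) / n_samples
n = len(sampled_scores)
probabilities = {s: c / n for s, c in Counter(sampled_scores).items()}

# Probability-weighted score: Σ p(si) × si over the distinct values,
# i.e. the mean of the samples, which is continuous rather than an integer.
weighted_score = sum(p * s for s, p in probabilities.items())

print(f"Score distribution: {probabilities}")                # {4: 0.65, 5: 0.2, 3: 0.15}
print(f"Probability-weighted score: {weighted_score:.2f}")   # 4.05
```
Averaging many sampled judgments like this is what produces fine-grained scores such as 73.45 instead of a single integer rating.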

### Custom Evaluation with Advanced Features

The CustomEvalMetric now includes **Chain-of-Thought** and **Probability-Weighted Scoring** from G-Eval for maximum accuracy:
```python
from eval_lib import CustomEvalMetric

async def custom_evaluation():
    test_case = EvalTestCase(
        input="How do I reset my password?",
        actual_output="To reset your password, click 'Forgot Password' on the login page, enter your email, and follow the link sent to your inbox.",
        expected_output="Clear step-by-step instructions"
    )

    metric = CustomEvalMetric(
        model="gpt-4o",
        threshold=0.7,
        name="HelpfulnessScore",
        criteria="Evaluate if the response provides clear, actionable steps that directly answer the user's question"
        # Auto-generates evaluation steps using CoT
        # Auto-applies probability-weighted scoring (20 samples)
    )

    result = await metric.evaluate(test_case)

    # Access detailed evaluation log
    log = result['evaluation_log']
    print(f"Auto-generated steps: {log['evaluation_steps']}")
    print(f"Sampled scores: {log['sampled_scores']}")
    print(f"Score distribution: {log['score_distribution']}")
    print(f"Final score: {log['final_score']:.2f}")

asyncio.run(custom_evaluation())
```

### Agent Evaluation
```python
from eval_lib import (
    evaluate,
    EvalTestCase,
    ToolCorrectnessMetric,
    TaskSuccessRateMetric
)

async def evaluate_agent():
    test_case = EvalTestCase(
        input="Book a flight to New York for tomorrow",
        actual_output="I've found available flights and booked your trip to New York for tomorrow.",
        tools_called=["search_flights", "book_flight"],
        expected_tools=["search_flights", "book_flight"]
    )

    metrics = [
        ToolCorrectnessMetric(model="gpt-4o-mini", threshold=0.8),
        TaskSuccessRateMetric(
            model="gpt-4o-mini",
            threshold=0.7,
            temperature=1.1  # Controls score aggregation strictness
        )
    ]

    results = await evaluate([test_case], metrics)
    return results

asyncio.run(evaluate_agent())
```

### Conversational Evaluation
```python
from eval_lib import (
    evaluate_conversations,
    ConversationalEvalTestCase,
    EvalTestCase,
    RoleAdherenceMetric,
    KnowledgeRetentionMetric
)

async def evaluate_conversation():
    conversation = ConversationalEvalTestCase(
        chatbot_role="You are a helpful customer support assistant. Be professional and empathetic.",
        turns=[
            EvalTestCase(
                input="I need help with my order",
                actual_output="I'd be happy to help you with your order. Could you please provide your order number?"
            ),
            EvalTestCase(
                input="It's #12345",
                actual_output="Thank you! Let me look up order #12345 for you."
            )
        ]
    )

    metrics = [
        RoleAdherenceMetric(
            model="gpt-4o-mini",
            threshold=0.8,
            temperature=0.5  # Softmax temperature for verdict aggregation
        ),
        KnowledgeRetentionMetric(
            model="gpt-4o-mini",
            threshold=0.7,
            temperature=0.5
        )
    ]

    # Set chatbot role for role adherence
    metrics[0].chatbot_role = conversation.chatbot_role

    results = await evaluate_conversations([conversation], metrics)

    # Access detailed logs
    for result in results:
        print(f"Dialogue: {result.evaluation_log['dialogue']}")
        print(f"Verdicts: {result.evaluation_log['verdicts']}")
        print(f"Score: {result.score}")

    return results

asyncio.run(evaluate_conversation())
```

## Available Metrics

### RAG Metrics

#### AnswerRelevancyMetric
Measures how relevant the answer is to the question using multi-step evaluation:
1. Infers user intent
2. Extracts atomic statements from the answer
3. Generates verdicts (fully/mostly/partial/minor/none) for each statement
4. Aggregates using softmax
```python
metric = AnswerRelevancyMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    temperature=0.5  # Controls aggregation strictness
)
```

#### FaithfulnessMetric
Checks if the answer is faithful to the provided context:
1. Extracts factual claims from the answer
2. Verifies each claim against the context (fully/mostly/partial/minor/none)
3. Aggregates the faithfulness score
```python
metric = FaithfulnessMetric(
    model="gpt-4o-mini",
    threshold=0.8,
    temperature=0.5
)
```

#### ContextualRelevancyMetric
Evaluates relevance of retrieved context to the question.
```python
metric = ContextualRelevancyMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    temperature=0.5
)
```

#### ContextualPrecisionMetric
Measures precision of context retrieval - are the retrieved chunks relevant?
```python
metric = ContextualPrecisionMetric(
    model="gpt-4o-mini",
    threshold=0.7
)
```

#### ContextualRecallMetric
Measures recall of relevant context - was all relevant information retrieved?
```python
metric = ContextualRecallMetric(
    model="gpt-4o-mini",
    threshold=0.7
)
```

#### BiasMetric
Detects bias and prejudice in AI-generated output. Score range: 0 (strong bias) to 100 (no bias).
```python
metric = BiasMetric(
    model="gpt-4o-mini",
    threshold=0.7  # Score range: 0-100
)
```

#### ToxicityMetric
Identifies toxic content in responses. Score range: 0 (highly toxic) to 100 (no toxicity).
```python
metric = ToxicityMetric(
    model="gpt-4o-mini",
    threshold=0.7  # Score range: 0-100
)
```

#### RestrictedRefusalMetric
Checks if the AI appropriately refuses harmful or out-of-scope requests.
```python
metric = RestrictedRefusalMetric(
    model="gpt-4o-mini",
    threshold=0.7
)
```

### Agent Metrics

#### ToolCorrectnessMetric
Validates that the agent calls the correct tools in the right sequence.
```python
metric = ToolCorrectnessMetric(
    model="gpt-4o-mini",
    threshold=0.8
)
```

#### TaskSuccessRateMetric
Measures task completion success across the conversation:
1. Infers the user's goal
2. Generates success criteria
3. Evaluates each criterion (fully/mostly/partial/minor/none)
4. Aggregates into a final score
```python
metric = TaskSuccessRateMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    temperature=1.1  # Higher = more lenient aggregation
)
```

#### RoleAdherenceMetric
Evaluates how well the agent maintains its assigned role:
1. Compares each response against the role description
2. Generates adherence verdicts (fully/mostly/partial/minor/none)
3. Aggregates across all turns
```python
metric = RoleAdherenceMetric(
    model="gpt-4o-mini",
    threshold=0.8,
    temperature=0.5
)
# Don't forget to set: metric.chatbot_role = "Your role description"
```

#### KnowledgeRetentionMetric
Checks if the agent remembers and recalls information from earlier in the conversation:
1. Analyzes the conversation for retention quality
2. Generates retention verdicts (fully/mostly/partial/minor/none)
3. Aggregates into a retention score
```python
metric = KnowledgeRetentionMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    temperature=0.5
)
```

### Custom & Advanced Metrics

#### GEval
State-of-the-art evaluation using probability-weighted scoring from the [G-Eval paper](https://arxiv.org/abs/2303.16634):
- **Auto Chain-of-Thought**: Automatically generates evaluation steps from criteria
- **Probability-Weighted Scoring**: score = Σ p(sᵢ) × sᵢ using 20 samples
- **Fine-Grained Scores**: Continuous scores (e.g., 73.45) instead of integers
```python
metric = GEval(
    model="gpt-4o",  # Best with GPT-4 for probability estimation
    threshold=0.7,
    name="Coherence",
    criteria="Evaluate logical flow and structure of the response",
    evaluation_steps=None,  # Auto-generated if not provided
    n_samples=20,  # Number of samples for probability estimation
    sampling_temperature=2.0  # High temperature for diverse sampling
)
```

#### CustomEvalMetric
Enhanced custom evaluation with CoT and probability-weighted scoring:
```python
metric = CustomEvalMetric(
    model="gpt-4o",
    threshold=0.7,
    name="QualityScore",
    criteria="Your custom evaluation criteria"
    # Automatically uses:
    # - Chain-of-Thought (generates evaluation steps)
    # - Probability-Weighted Scoring (20 samples, temp=2.0)
)
```

## Understanding Evaluation Results

### Score Ranges

- **RAG Metrics** (Answer Relevancy, Faithfulness, etc.): 0.0 - 1.0
- **Safety Metrics** (Bias, Toxicity): 0.0 - 1.0
- **G-Eval & Custom Metrics**: 0.0 - 1.0
- **Agent Metrics** (Task Success, Role Adherence, etc.): 0.0 - 1.0

## Temperature Parameter

Many metrics use a **temperature** parameter for score aggregation (via softmax):

- **Lower (0.1-0.3)**: **Strict** - low scores dominate, so any weak verdict is penalized heavily
- **Medium (0.4-0.6)**: **Balanced** - default behavior
- **Higher (0.8-1.5)**: **Lenient** - closer to the arithmetic mean, more forgiving
```python
# Strict evaluation - one bad verdict significantly lowers the score
metric = AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7, temperature=0.3)

# Lenient evaluation - focuses on the overall trend
metric = TaskSuccessRateMetric(model="gpt-4o-mini", threshold=0.7, temperature=1.2)
```
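
The exact aggregation formula is internal to the library, but the strict/lenient behaviour described above can be pictured as a temperature-controlled weighting of verdict scores. In the sketch below, the verdict-to-score mapping and the softmin-style weighting (weak verdicts gain weight as temperature drops) are assumptions for illustration, not the library's implementation:
```python
import math

# Hypothetical verdict-to-score mapping (assumed for illustration).
VERDICT_SCORES = {"fully": 1.0, "mostly": 0.75, "partial": 0.5, "minor": 0.25, "none": 0.0}

def aggregate(verdicts: list[str], temperature: float) -> float:
    """Temperature-controlled weighted mean of verdict scores.

    Low temperature concentrates weight on the weakest verdicts (strict);
    high temperature flattens the weights toward a plain arithmetic mean (lenient).
    """
    scores = [VERDICT_SCORES[v] for v in verdicts]
    weights = [math.exp(-s / temperature) for s in scores]  # softmin-style weighting
    return sum(w * s for w, s in zip(weights, scores)) / sum(weights)

verdicts = ["fully", "fully", "mostly", "none"]
print(f"strict   (T=0.3): {aggregate(verdicts, 0.3):.2f}")  # ~0.12
print(f"balanced (T=0.5): {aggregate(verdicts, 0.5):.2f}")  # ~0.29
print(f"lenient  (T=1.2): {aggregate(verdicts, 1.2):.2f}")  # ~0.53, near the plain mean of 0.69
```
With one "none" verdict in the mix, the strict setting collapses the aggregate while the lenient setting stays close to the arithmetic mean, which matches the guidance above.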

## LLM Provider Configuration

### OpenAI
```python
import os
os.environ["OPENAI_API_KEY"] = "your-api-key"

from eval_lib import chat_complete

response, cost = await chat_complete(
    "gpt-4o-mini",  # or "openai:gpt-4o-mini"
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Azure OpenAI
```python
os.environ["AZURE_OPENAI_API_KEY"] = "your-api-key"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://your-endpoint.openai.azure.com/"
os.environ["AZURE_OPENAI_DEPLOYMENT"] = "your-deployment-name"

response, cost = await chat_complete(
    "azure:gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Google Gemini
```python
os.environ["GOOGLE_API_KEY"] = "your-api-key"

response, cost = await chat_complete(
    "google:gemini-2.0-flash",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Anthropic Claude
```python
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"

response, cost = await chat_complete(
    "anthropic:claude-sonnet-4-0",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Ollama (Local)
```python
os.environ["OLLAMA_API_KEY"] = "ollama"  # Can be any value
os.environ["OLLAMA_API_BASE_URL"] = "http://localhost:11434/v1"

response, cost = await chat_complete(
    "ollama:llama2",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

## Test Data Generation

The library includes a powerful test data generator that can create realistic test cases either from scratch or based on your documents.

### Supported Document Formats

- **Documents**: PDF, DOCX, DOC, TXT, RTF, ODT
- **Structured Data**: CSV, TSV, XLSX, JSON, YAML, XML
- **Web**: HTML, Markdown
- **Presentations**: PPTX
- **Images**: PNG, JPG, JPEG (with OCR support)

### Generate from Scratch
```python
from eval_lib.datagenerator.datagenerator import DatasetGenerator

generator = DatasetGenerator(
    model="gpt-4o-mini",
    agent_description="A customer support chatbot",
    input_format="User question or request",
    expected_output_format="Helpful response",
    test_types=["functionality", "edge_cases"],
    max_rows=20,
    question_length="mixed",  # "short", "long", or "mixed"
    question_openness="mixed",  # "open", "closed", or "mixed"
    trap_density=0.1,  # 10% trap questions
    language="en"
)

dataset = await generator.generate_from_scratch()
```

### Generate from Documents
```python
generator = DatasetGenerator(
    model="gpt-4o-mini",
    agent_description="Technical support agent",
    input_format="Technical question",
    expected_output_format="Detailed answer with references",
    test_types=["retrieval", "accuracy"],
    max_rows=50,
    chunk_size=1024,
    chunk_overlap=100,
    max_chunks=30
)

file_paths = ["docs/user_guide.pdf", "docs/faq.md"]
dataset = await generator.generate_from_documents(file_paths)

# Convert to test cases
from eval_lib import EvalTestCase
test_cases = [
    EvalTestCase(
        input=item["input"],
        expected_output=item["expected_output"],
        retrieval_context=[item.get("context", "")]
    )
    for item in dataset
]
```
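
How many chunks the generator works with depends on `chunk_size`, `chunk_overlap`, and the `max_chunks` cap. The back-of-the-envelope estimate below assumes a simple sliding-window splitter that advances by `chunk_size - chunk_overlap` characters per chunk; the actual document loader may split differently:
```python
import math

def estimated_chunks(num_characters: int, chunk_size: int = 1024,
                     chunk_overlap: int = 100, max_chunks: int = 30) -> int:
    """Rough estimate of how many chunks one document yields, assuming a
    sliding window that advances by (chunk_size - chunk_overlap) characters."""
    if num_characters <= chunk_size:
        return 1
    step = chunk_size - chunk_overlap
    return min(max_chunks, 1 + math.ceil((num_characters - chunk_size) / step))

# A ~40,000-character document with the settings from the example above
# yields ~44 raw chunks, which the max_chunks cap trims to 30.
print(estimated_chunks(40_000))  # 30
```
Raising `chunk_overlap` increases redundancy between neighbouring chunks, while `max_chunks` caps the total number of chunks used.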

## Best Practices

### 1. Choose the Right Model

- **G-Eval**: Use GPT-4 for best results with probability-weighted scoring
- **Other Metrics**: GPT-4o-mini is cost-effective and sufficient
- **Custom Eval**: Use GPT-4 for complex criteria, GPT-4o-mini for simple ones

### 2. Set Appropriate Thresholds
```python
# Safety metrics - high bar
BiasMetric(threshold=80.0)
ToxicityMetric(threshold=85.0)

# Quality metrics - moderate bar
AnswerRelevancyMetric(threshold=0.7)
FaithfulnessMetric(threshold=0.75)

# Agent metrics - context-dependent
TaskSuccessRateMetric(threshold=0.7)  # Most tasks
RoleAdherenceMetric(threshold=0.9)  # Strict role requirements
```

### 3. Use Temperature Wisely
```python
# Strict evaluation - critical applications
metric = FaithfulnessMetric(temperature=0.3)

# Balanced - general use (default)
metric = AnswerRelevancyMetric(temperature=0.5)

# Lenient - exploratory evaluation
metric = TaskSuccessRateMetric(temperature=1.2)
```

### 4. Leverage Evaluation Logs
```python
result = await metric.evaluate(test_case)

# Always check the log for insights
log = result['evaluation_log']

# For debugging failures:
if not result['success']:
    print(f"Failed because: {log['final_reason']}")
    print(f"Verdicts: {log.get('verdicts', [])}")
    print(f"Steps taken: {log.get('evaluation_steps', [])}")
```

### 5. Batch Evaluation for Efficiency
```python
# Evaluate multiple test cases at once
results = await evaluate(
    test_cases=[test_case1, test_case2, test_case3],
    metrics=[metric1, metric2, metric3]
)

# Calculate aggregate statistics
total_cost = sum(
    metric.evaluation_cost or 0
    for _, test_results in results
    for result in test_results
    for metric in result.metrics_data
)

success_rate = sum(
    1 for _, test_results in results
    for result in test_results
    if result.success
) / len(results)

print(f"Total cost: ${total_cost:.4f}")
print(f"Success rate: {success_rate:.2%}")
```

## Cost Tracking

All evaluations automatically track API costs:
```python
results = await evaluate(test_cases, metrics)

for _, test_results in results:
    for result in test_results:
        for metric in result.metrics_data:
            print(f"{metric.name}: ${metric.evaluation_cost:.6f}")
```

**Cost Estimates** (as of 2025):
- **G-Eval with GPT-4**: ~$0.10-0.15 per evaluation (20 samples)
- **Custom Eval with GPT-4**: ~$0.10-0.15 per evaluation (20 samples + CoT)
- **Standard metrics with GPT-4o-mini**: ~$0.001-0.005 per evaluation
- **Faithfulness/Answer Relevancy**: ~$0.003-0.010 per evaluation (multiple LLM calls)

## Environment Variables

| Variable | Description | Required |
|----------|-------------|----------|
| `OPENAI_API_KEY` | OpenAI API key | For OpenAI |
| `AZURE_OPENAI_API_KEY` | Azure OpenAI API key | For Azure |
| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | For Azure |
| `AZURE_OPENAI_DEPLOYMENT` | Azure deployment name | For Azure |
| `GOOGLE_API_KEY` | Google API key | For Google |
| `ANTHROPIC_API_KEY` | Anthropic API key | For Anthropic |
| `OLLAMA_API_KEY` | Ollama API key | For Ollama |
| `OLLAMA_API_BASE_URL` | Ollama base URL | For Ollama |

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

1. Fork the repository
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Citation

If you use this library in your research, please cite:
```bibtex
@software{eval_ai_library,
  author = {Meshkov, Aleksandr},
  title = {Eval AI Library: Comprehensive AI Model Evaluation Framework},
  year = {2025},
  url = {https://github.com/meshkovQA/Eval-ai-library.git}
}
```

### References

This library implements techniques from:
```bibtex
@inproceedings{liu2023geval,
  title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment},
  author={Liu, Yang and Iter, Dan and Xu, Yichong and Wang, Shuohang and Xu, Ruochen and Zhu, Chenguang},
  booktitle={Proceedings of EMNLP},
  year={2023}
}
```

## Support

- 📧 Email: alekslynx90@gmail.com
- 🐛 Issues: [GitHub Issues](https://github.com/meshkovQA/Eval-ai-library/issues)
- 📖 Documentation: [Full Documentation](https://github.com/meshkovQA/Eval-ai-library#readme)

## Acknowledgments

This library was developed to provide a comprehensive solution for evaluating AI models across different use cases and providers, with state-of-the-art techniques including G-Eval's probability-weighted scoring and automatic chain-of-thought generation.