fiddler-evals 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fiddler_evals-0.1.0/MANIFEST.in +2 -0
- fiddler_evals-0.1.0/PKG-INFO +341 -0
- fiddler_evals-0.1.0/PUBLIC.md +319 -0
- fiddler_evals-0.1.0/README.md +65 -0
- fiddler_evals-0.1.0/fiddler_evals/VERSION +1 -0
- fiddler_evals-0.1.0/fiddler_evals/__init__.py +71 -0
- fiddler_evals-0.1.0/fiddler_evals/configs.py +14 -0
- fiddler_evals-0.1.0/fiddler_evals/conftest.py +28 -0
- fiddler_evals-0.1.0/fiddler_evals/connection.py +451 -0
- fiddler_evals-0.1.0/fiddler_evals/constants.py +9 -0
- fiddler_evals-0.1.0/fiddler_evals/decorators.py +189 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/application.py +398 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/base.py +58 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/dataset.py +1230 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/experiment.py +934 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/project.py +362 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_application.py +340 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_dataset.py +602 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_dataset_items.py +492 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment.py +719 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment_items.py +495 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment_results.py +330 -0
- fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_project.py +270 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/__init__.py +24 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/answer_relevance.py +92 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/base.py +141 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/coherence.py +111 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/conciseness.py +84 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/eval_fn.py +214 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/ftl_prompt_safety.py +113 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/ftl_response_faithfulness.py +115 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/regex.py +99 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/sentiment.py +112 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_answer_relevance.py +237 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_coherence.py +369 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_conciseness.py +225 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_eval_fn.py +359 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +222 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +205 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_regex.py +116 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_sentiment.py +224 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_topic_classification.py +249 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_toxicity.py +201 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/topic.py +127 -0
- fiddler_evals-0.1.0/fiddler_evals/evaluators/toxicity.py +101 -0
- fiddler_evals-0.1.0/fiddler_evals/exceptions.py +221 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/http_client.py +483 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/json_encoder.py +25 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/semver.py +614 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/tests/test_json_encoder.py +27 -0
- fiddler_evals-0.1.0/fiddler_evals/libs/tests/test_request_client.py +715 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/__init__.py +4 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/application.py +24 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/base.py +9 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/compact.py +41 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/dataset.py +58 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/error.py +22 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/evaluator.py +18 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/experiment.py +89 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/filter_query.py +54 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/project.py +17 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/response.py +51 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/score.py +26 -0
- fiddler_evals-0.1.0/fiddler_evals/pydantic_models/server_info.py +20 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/evaluation.py +178 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/executor.py +102 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/experiment_result_publisher.py +97 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/experiment_runner.py +640 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/tests/test_evaluate.py +692 -0
- fiddler_evals-0.1.0/fiddler_evals/runner/tests/test_experiment_result_publisher.py +264 -0
- fiddler_evals-0.1.0/fiddler_evals/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/tests/constants.py +40 -0
- fiddler_evals-0.1.0/fiddler_evals/tests/test_connection.py +224 -0
- fiddler_evals-0.1.0/fiddler_evals/tests/test_decorators.py +346 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/environment.py +46 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/pd.py +9 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/tests/__init__.py +0 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/tests/test_environment.py +146 -0
- fiddler_evals-0.1.0/fiddler_evals/utils/tqdm.py +23 -0
- fiddler_evals-0.1.0/fiddler_evals/version.py +4 -0
- fiddler_evals-0.1.0/fiddler_evals.egg-info/PKG-INFO +341 -0
- fiddler_evals-0.1.0/fiddler_evals.egg-info/SOURCES.txt +93 -0
- fiddler_evals-0.1.0/fiddler_evals.egg-info/dependency_links.txt +1 -0
- fiddler_evals-0.1.0/fiddler_evals.egg-info/requires.txt +10 -0
- fiddler_evals-0.1.0/fiddler_evals.egg-info/top_level.txt +1 -0
- fiddler_evals-0.1.0/pyproject.toml +59 -0
- fiddler_evals-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fiddler-evals
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for evaluating LLM Applications
|
|
5
|
+
Author-email: Fiddler AI <support@fiddler.ai>
|
|
6
|
+
Project-URL: Homepage, https://fiddler.ai
|
|
7
|
+
Project-URL: Repository, https://github.com/fiddler-labs/fiddler-evals-sdk
|
|
8
|
+
Project-URL: Documentation, https://docs.fiddler.ai/
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: pip>=21.0
|
|
14
|
+
Requires-Dist: requests<3
|
|
15
|
+
Requires-Dist: pydantic>=2.0.0
|
|
16
|
+
Requires-Dist: tqdm
|
|
17
|
+
Requires-Dist: typing-extensions<5,>=4.6.0
|
|
18
|
+
Requires-Dist: pandas>=1.2.5
|
|
19
|
+
Requires-Dist: python-decouple
|
|
20
|
+
Provides-Extra: pandas
|
|
21
|
+
Requires-Dist: pandas>=1.2.5; extra == "pandas"
|
|
22
|
+
|
|
23
|
+
# Fiddler Evals SDK
|
|
24
|
+
|
|
25
|
+
A comprehensive toolkit for evaluating Large Language Model (LLM) applications, RAG systems, and AI agents. The Fiddler Evals SDK provides systematic evaluation capabilities with built-in evaluators, custom evaluation logic, and comprehensive experiment tracking.
|
|
26
|
+
|
|
27
|
+
## Key Features
|
|
28
|
+
|
|
29
|
+
- **🧪 Systematic Evaluation**: Run structured experiments on your AI applications
|
|
30
|
+
- **📊 Built-in Evaluators**: Production-ready evaluators for common AI tasks
|
|
31
|
+
- **🔧 Custom Evaluators**: Build evaluation logic for your specific use cases
|
|
32
|
+
- **📈 Result Tracking**: Comprehensive experiment tracking and analysis
|
|
33
|
+
- **🚀 Scale**: Evaluate across large datasets with concurrent processing
|
|
34
|
+
- **📁 Multiple Data Sources**: Import test cases from CSV, JSONL, and pandas DataFrames
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
## Requirements
|
|
38
|
+
|
|
39
|
+
- Python 3.10 or higher
|
|
40
|
+
- Access to a Fiddler Platform instance
|
|
41
|
+
- API token from Fiddler Platform
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install fiddler-evals
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
For pre-release versions:
|
|
51
|
+
```bash
|
|
52
|
+
pip install --upgrade --pre fiddler-evals
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### 1. Connect to Fiddler
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from fiddler_evals import init
|
|
61
|
+
|
|
62
|
+
# Initialize connection
|
|
63
|
+
init(url='https://your-org.fiddler.ai', token='your-api-token')
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Create Project Structure
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from fiddler_evals import Project, Application, Dataset
|
|
70
|
+
|
|
71
|
+
# Create project and application
|
|
72
|
+
project = Project.get_or_create(name='my-eval-project')
|
|
73
|
+
app = Application.get_or_create(
|
|
74
|
+
name='my-llm-app',
|
|
75
|
+
project_id=project.id
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Create dataset
|
|
79
|
+
dataset = Dataset.create(
|
|
80
|
+
name='evaluation-dataset',
|
|
81
|
+
application_id=app.id,
|
|
82
|
+
description='Test cases for LLM evaluation'
|
|
83
|
+
)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 3. Add Test Cases
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from fiddler_evals.pydantic_models.dataset import NewDatasetItem
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
test_cases = [
|
|
93
|
+
NewDatasetItem(
|
|
94
|
+
inputs={"question": "What happens to you if you eat watermelon seeds?"},
|
|
95
|
+
expected_outputs={"answer": "The watermelon seeds pass through your digestive system"},
|
|
96
|
+
metadata={"type": "Adversarial", "category": "Misconceptions"},
|
|
97
|
+
)
|
|
98
|
+
]
|
|
99
|
+
dataset.insert(test_cases)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 4. Use Built-in Evaluators
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from fiddler_evals.evaluators import (
|
|
106
|
+
AnswerRelevance, Coherence, Conciseness,
|
|
107
|
+
Toxicity, Sentiment, RegexSearch
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# Test individual evaluators
|
|
111
|
+
relevance_evaluator = AnswerRelevance()
|
|
112
|
+
score = relevance_evaluator.score(
|
|
113
|
+
prompt="What is the capital of France?",
|
|
114
|
+
response="Paris is the capital of France."
|
|
115
|
+
)
|
|
116
|
+
print(f"Score: {score.value} - {score.reasoning}")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### 5. Create Custom Evaluators
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from fiddler_evals.evaluators.base import Evaluator
|
|
123
|
+
from fiddler_evals.pydantic_models.score import Score
|
|
124
|
+
|
|
125
|
+
class PolitenessEvaluator(Evaluator):
|
|
126
|
+
"""
|
|
127
|
+
Simple evaluator that checks if a response contains polite language.
|
|
128
|
+
Useful for customer service or chatbot applications.
|
|
129
|
+
"""
|
|
130
|
+
|
|
131
|
+
def __init__(self):
|
|
132
|
+
super().__init__()
|
|
133
|
+
self.polite_words = [
|
|
134
|
+
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
135
|
+
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
def score(self, output: str) -> Score:
|
|
139
|
+
"""Score based on presence of polite language."""
|
|
140
|
+
output_lower = output.lower()
|
|
141
|
+
|
|
142
|
+
# Count polite words
|
|
143
|
+
polite_count = sum(1 for word in self.polite_words if word in output_lower)
|
|
144
|
+
|
|
145
|
+
# Simple scoring: 1.0 if any polite words found, 0.0 otherwise
|
|
146
|
+
if polite_count > 0:
|
|
147
|
+
score_value = 1.0
|
|
148
|
+
reasoning = f"Contains {polite_count} polite word(s)"
|
|
149
|
+
else:
|
|
150
|
+
score_value = 0.0
|
|
151
|
+
reasoning = "No polite language detected"
|
|
152
|
+
|
|
153
|
+
return Score(
|
|
154
|
+
name="politeness",
|
|
155
|
+
evaluator_name=self.name,
|
|
156
|
+
value=score_value,
|
|
157
|
+
reasoning=reasoning
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
# Test the evaluator
|
|
161
|
+
politeness_evaluator = PolitenessEvaluator()
|
|
162
|
+
|
|
163
|
+
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
164
|
+
impolite_response = "I don't know. Figure it out yourself."
|
|
165
|
+
|
|
166
|
+
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
167
|
+
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### 5.1. Function-Based Evaluators
|
|
171
|
+
|
|
172
|
+
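You can also use simple functions as evaluators instead of creating full evaluator classes. Functions are automatically wrapped with `EvalFn` internally. The example below also uses `evaluate` and `my_llm_task`, which are introduced in step 6 (Run Experiments):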
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
def word_count_evaluator(output: str) -> float:
|
|
176
|
+
"""Simple function that returns word count as a score."""
|
|
177
|
+
word_count = len(output.split())
|
|
178
|
+
# Normalize to 0-1 scale (assuming 0-50 words is reasonable)
|
|
179
|
+
return min(word_count / 50.0, 1.0)
|
|
180
|
+
|
|
181
|
+
def contains_number_evaluator(output: str) -> float:
|
|
182
|
+
"""Check if response contains any numbers."""
|
|
183
|
+
import re
|
|
184
|
+
return 1.0 if re.search(r'\d+', output) else 0.0
|
|
185
|
+
|
|
186
|
+
# Use functions directly in evaluators list
|
|
187
|
+
evaluators = [
|
|
188
|
+
AnswerRelevance(),
|
|
189
|
+
Conciseness(),
|
|
190
|
+
word_count_evaluator, # Function evaluator
|
|
191
|
+
contains_number_evaluator, # Function evaluator
|
|
192
|
+
]
|
|
193
|
+
|
|
194
|
+
# The evaluate() function automatically wraps these with EvalFn
|
|
195
|
+
experiment_result = evaluate(
|
|
196
|
+
dataset=dataset,
|
|
197
|
+
task=my_llm_task,
|
|
198
|
+
evaluators=evaluators,
|
|
199
|
+
score_fn_kwargs_mapping={
|
|
200
|
+
"output": "answer", # Maps to function parameter
|
|
201
|
+
"response": "answer", # Maps to class evaluator parameter
|
|
202
|
+
}
|
|
203
|
+
)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### 6. Run Experiments
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from fiddler_evals import evaluate
|
|
210
|
+
|
|
211
|
+
# Define your AI application task
|
|
212
|
+
def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
213
|
+
question = inputs.get("question", "")
|
|
214
|
+
# Your LLM API call here
|
|
215
|
+
answer = call_your_llm(question)
|
|
216
|
+
return {"answer": answer}
|
|
217
|
+
|
|
218
|
+
# Set up evaluators
|
|
219
|
+
evaluators = [
|
|
220
|
+
AnswerRelevance(),
|
|
221
|
+
Conciseness(),
|
|
222
|
+
Sentiment(),
|
|
223
|
+
PolitenessEvaluator(),
|
|
224
|
+
]
|
|
225
|
+
|
|
226
|
+
# Run evaluation
|
|
227
|
+
experiment_result = evaluate(
|
|
228
|
+
dataset=dataset,
|
|
229
|
+
task=my_llm_task,
|
|
230
|
+
evaluators=evaluators,
|
|
231
|
+
name_prefix="my_evaluation",
|
|
232
|
+
description="Comprehensive LLM evaluation",
|
|
233
|
+
score_fn_kwargs_mapping={
|
|
234
|
+
"question": "question",
|
|
235
|
+
"response": "answer",
|
|
236
|
+
"output": "answer",
|
|
237
|
+
"text": "answer",
|
|
238
|
+
"prompt": lambda x: x["inputs"]["question"],
|
|
239
|
+
}
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
243
|
+
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Built-in Evaluators
|
|
247
|
+
|
|
248
|
+
| Evaluator | Purpose | Key Parameters |
|
|
249
|
+
|-----------|---------|----------------|
|
|
250
|
+
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
251
|
+
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
252
|
+
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
253
|
+
| `Toxicity` | Detects harmful or toxic content | `text` |
|
|
254
|
+
| `Sentiment` | Analyzes emotional tone | `text` |
|
|
255
|
+
| `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
|
|
256
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | `text` |
|
|
257
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
|
|
258
|
+
|
|
259
|
+
## Data Import Options
|
|
260
|
+
|
|
261
|
+
### CSV Files
|
|
262
|
+
```python
|
|
263
|
+
dataset.insert_from_csv_file(
|
|
264
|
+
file_path='data.csv',
|
|
265
|
+
input_columns=['question'],
|
|
266
|
+
expected_output_columns=['answer'],
|
|
267
|
+
metadata_columns=['category']
|
|
268
|
+
)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### JSONL Files
|
|
272
|
+
```python
|
|
273
|
+
dataset.insert_from_jsonl_file(
|
|
274
|
+
file_path='data.jsonl',
|
|
275
|
+
input_keys=['question'],
|
|
276
|
+
expected_output_keys=['answer'],
|
|
277
|
+
metadata_keys=['category']
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Pandas DataFrames
|
|
282
|
+
```python
|
|
283
|
+
dataset.insert_from_pandas(
|
|
284
|
+
df=df,
|
|
285
|
+
input_columns=['question'],
|
|
286
|
+
expected_output_columns=['answer'],
|
|
287
|
+
metadata_columns=['category']
|
|
288
|
+
)
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
## Advanced Usage
|
|
292
|
+
|
|
293
|
+
### Concurrent Processing
|
|
294
|
+
```python
|
|
295
|
+
experiment_result = evaluate(
|
|
296
|
+
dataset=dataset,
|
|
297
|
+
task=my_llm_task,
|
|
298
|
+
evaluators=evaluators,
|
|
299
|
+
max_workers=4 # Process 4 test cases concurrently
|
|
300
|
+
)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### Custom Score Mapping
|
|
304
|
+
|
|
305
|
+
The `score_fn_kwargs_mapping` parameter is essential for connecting your task outputs to evaluator inputs. Different evaluators expect different parameter names, but your task function returns outputs with specific keys.
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
# Your task returns:
|
|
309
|
+
{"answer": "Paris is the capital of France"}
|
|
310
|
+
|
|
311
|
+
# But evaluators expect different parameter names:
|
|
312
|
+
AnswerRelevance.score(prompt="...", response="...") # Needs 'prompt' and 'response'
|
|
313
|
+
Conciseness.score(response="...") # Needs 'response'
|
|
314
|
+
Sentiment.score(text="...") # Needs 'text'
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
**The Solution**: Map each evaluator parameter name to the task output key (or a callable) that supplies its value:
|
|
318
|
+
|
|
319
|
+
```python
|
|
320
|
+
score_fn_kwargs_mapping={
|
|
321
|
+
"question": "question", # Map 'question' parameter to 'question' key
|
|
322
|
+
"response": "answer", # Map 'response' parameter to 'answer' key
|
|
323
|
+
"text": "answer", # Map 'text' parameter to 'answer' key
|
|
324
|
+
"prompt": lambda x: x["inputs"]["question"], # Map 'prompt' to input question
|
|
325
|
+
"context": lambda x: x["extras"]["context"] # Map 'context' to extras
|
|
326
|
+
}
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
### Experiment Metadata
|
|
330
|
+
```python
|
|
331
|
+
experiment_result = evaluate(
|
|
332
|
+
dataset=dataset,
|
|
333
|
+
task=my_llm_task,
|
|
334
|
+
evaluators=evaluators,
|
|
335
|
+
metadata={
|
|
336
|
+
"model_version": "gpt-4",
|
|
337
|
+
"evaluation_date": "2024-01-15",
|
|
338
|
+
"temperature": 0.7
|
|
339
|
+
}
|
|
340
|
+
)
|
|
341
|
+
```
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
# Fiddler Evals SDK
|
|
2
|
+
|
|
3
|
+
A comprehensive toolkit for evaluating Large Language Model (LLM) applications, RAG systems, and AI agents. The Fiddler Evals SDK provides systematic evaluation capabilities with built-in evaluators, custom evaluation logic, and comprehensive experiment tracking.
|
|
4
|
+
|
|
5
|
+
## Key Features
|
|
6
|
+
|
|
7
|
+
- **🧪 Systematic Evaluation**: Run structured experiments on your AI applications
|
|
8
|
+
- **📊 Built-in Evaluators**: Production-ready evaluators for common AI tasks
|
|
9
|
+
- **🔧 Custom Evaluators**: Build evaluation logic for your specific use cases
|
|
10
|
+
- **📈 Result Tracking**: Comprehensive experiment tracking and analysis
|
|
11
|
+
- **🚀 Scale**: Evaluate across large datasets with concurrent processing
|
|
12
|
+
- **📁 Multiple Data Sources**: Import test cases from CSV, JSONL, and pandas DataFrames
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
## Requirements
|
|
16
|
+
|
|
17
|
+
- Python 3.10 or higher
|
|
18
|
+
- Access to a Fiddler Platform instance
|
|
19
|
+
- API token from Fiddler Platform
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install fiddler-evals
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
For pre-release versions:
|
|
29
|
+
```bash
|
|
30
|
+
pip install --upgrade --pre fiddler-evals
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
### 1. Connect to Fiddler
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from fiddler_evals import init
|
|
39
|
+
|
|
40
|
+
# Initialize connection
|
|
41
|
+
init(url='https://your-org.fiddler.ai', token='your-api-token')
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 2. Create Project Structure
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from fiddler_evals import Project, Application, Dataset
|
|
48
|
+
|
|
49
|
+
# Create project and application
|
|
50
|
+
project = Project.get_or_create(name='my-eval-project')
|
|
51
|
+
app = Application.get_or_create(
|
|
52
|
+
name='my-llm-app',
|
|
53
|
+
project_id=project.id
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Create dataset
|
|
57
|
+
dataset = Dataset.create(
|
|
58
|
+
name='evaluation-dataset',
|
|
59
|
+
application_id=app.id,
|
|
60
|
+
description='Test cases for LLM evaluation'
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 3. Add Test Cases
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from fiddler_evals.pydantic_models.dataset import NewDatasetItem
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
test_cases = [
|
|
71
|
+
NewDatasetItem(
|
|
72
|
+
inputs={"question": "What happens to you if you eat watermelon seeds?"},
|
|
73
|
+
expected_outputs={"answer": "The watermelon seeds pass through your digestive system"},
|
|
74
|
+
metadata={"type": "Adversarial", "category": "Misconceptions"},
|
|
75
|
+
)
|
|
76
|
+
]
|
|
77
|
+
dataset.insert(test_cases)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 4. Use Built-in Evaluators
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from fiddler_evals.evaluators import (
|
|
84
|
+
AnswerRelevance, Coherence, Conciseness,
|
|
85
|
+
Toxicity, Sentiment, RegexSearch
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Test individual evaluators
|
|
89
|
+
relevance_evaluator = AnswerRelevance()
|
|
90
|
+
score = relevance_evaluator.score(
|
|
91
|
+
prompt="What is the capital of France?",
|
|
92
|
+
response="Paris is the capital of France."
|
|
93
|
+
)
|
|
94
|
+
print(f"Score: {score.value} - {score.reasoning}")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 5. Create Custom Evaluators
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from fiddler_evals.evaluators.base import Evaluator
|
|
101
|
+
from fiddler_evals.pydantic_models.score import Score
|
|
102
|
+
|
|
103
|
+
class PolitenessEvaluator(Evaluator):
|
|
104
|
+
"""
|
|
105
|
+
Simple evaluator that checks if a response contains polite language.
|
|
106
|
+
Useful for customer service or chatbot applications.
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
def __init__(self):
|
|
110
|
+
super().__init__()
|
|
111
|
+
self.polite_words = [
|
|
112
|
+
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
113
|
+
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
def score(self, output: str) -> Score:
|
|
117
|
+
"""Score based on presence of polite language."""
|
|
118
|
+
output_lower = output.lower()
|
|
119
|
+
|
|
120
|
+
# Count polite words
|
|
121
|
+
polite_count = sum(1 for word in self.polite_words if word in output_lower)
|
|
122
|
+
|
|
123
|
+
# Simple scoring: 1.0 if any polite words found, 0.0 otherwise
|
|
124
|
+
if polite_count > 0:
|
|
125
|
+
score_value = 1.0
|
|
126
|
+
reasoning = f"Contains {polite_count} polite word(s)"
|
|
127
|
+
else:
|
|
128
|
+
score_value = 0.0
|
|
129
|
+
reasoning = "No polite language detected"
|
|
130
|
+
|
|
131
|
+
return Score(
|
|
132
|
+
name="politeness",
|
|
133
|
+
evaluator_name=self.name,
|
|
134
|
+
value=score_value,
|
|
135
|
+
reasoning=reasoning
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# Test the evaluator
|
|
139
|
+
politeness_evaluator = PolitenessEvaluator()
|
|
140
|
+
|
|
141
|
+
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
142
|
+
impolite_response = "I don't know. Figure it out yourself."
|
|
143
|
+
|
|
144
|
+
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
145
|
+
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### 5.1. Function-Based Evaluators
|
|
149
|
+
|
|
150
|
+
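You can also use simple functions as evaluators instead of creating full evaluator classes. Functions are automatically wrapped with `EvalFn` internally. The example below also uses `evaluate` and `my_llm_task`, which are introduced in step 6 (Run Experiments):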
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
def word_count_evaluator(output: str) -> float:
|
|
154
|
+
"""Simple function that returns word count as a score."""
|
|
155
|
+
word_count = len(output.split())
|
|
156
|
+
# Normalize to 0-1 scale (assuming 0-50 words is reasonable)
|
|
157
|
+
return min(word_count / 50.0, 1.0)
|
|
158
|
+
|
|
159
|
+
def contains_number_evaluator(output: str) -> float:
|
|
160
|
+
"""Check if response contains any numbers."""
|
|
161
|
+
import re
|
|
162
|
+
return 1.0 if re.search(r'\d+', output) else 0.0
|
|
163
|
+
|
|
164
|
+
# Use functions directly in evaluators list
|
|
165
|
+
evaluators = [
|
|
166
|
+
AnswerRelevance(),
|
|
167
|
+
Conciseness(),
|
|
168
|
+
word_count_evaluator, # Function evaluator
|
|
169
|
+
contains_number_evaluator, # Function evaluator
|
|
170
|
+
]
|
|
171
|
+
|
|
172
|
+
# The evaluate() function automatically wraps these with EvalFn
|
|
173
|
+
experiment_result = evaluate(
|
|
174
|
+
dataset=dataset,
|
|
175
|
+
task=my_llm_task,
|
|
176
|
+
evaluators=evaluators,
|
|
177
|
+
score_fn_kwargs_mapping={
|
|
178
|
+
"output": "answer", # Maps to function parameter
|
|
179
|
+
"response": "answer", # Maps to class evaluator parameter
|
|
180
|
+
}
|
|
181
|
+
)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### 6. Run Experiments
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from fiddler_evals import evaluate
|
|
188
|
+
|
|
189
|
+
# Define your AI application task
|
|
190
|
+
def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
191
|
+
question = inputs.get("question", "")
|
|
192
|
+
# Your LLM API call here
|
|
193
|
+
answer = call_your_llm(question)
|
|
194
|
+
return {"answer": answer}
|
|
195
|
+
|
|
196
|
+
# Set up evaluators
|
|
197
|
+
evaluators = [
|
|
198
|
+
AnswerRelevance(),
|
|
199
|
+
Conciseness(),
|
|
200
|
+
Sentiment(),
|
|
201
|
+
PolitenessEvaluator(),
|
|
202
|
+
]
|
|
203
|
+
|
|
204
|
+
# Run evaluation
|
|
205
|
+
experiment_result = evaluate(
|
|
206
|
+
dataset=dataset,
|
|
207
|
+
task=my_llm_task,
|
|
208
|
+
evaluators=evaluators,
|
|
209
|
+
name_prefix="my_evaluation",
|
|
210
|
+
description="Comprehensive LLM evaluation",
|
|
211
|
+
score_fn_kwargs_mapping={
|
|
212
|
+
"question": "question",
|
|
213
|
+
"response": "answer",
|
|
214
|
+
"output": "answer",
|
|
215
|
+
"text": "answer",
|
|
216
|
+
"prompt": lambda x: x["inputs"]["question"],
|
|
217
|
+
}
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
221
|
+
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Built-in Evaluators
|
|
225
|
+
|
|
226
|
+
| Evaluator | Purpose | Key Parameters |
|
|
227
|
+
|-----------|---------|----------------|
|
|
228
|
+
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
229
|
+
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
230
|
+
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
231
|
+
| `Toxicity` | Detects harmful or toxic content | `text` |
|
|
232
|
+
| `Sentiment` | Analyzes emotional tone | `text` |
|
|
233
|
+
| `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
|
|
234
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | `text` |
|
|
235
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
|
|
236
|
+
|
|
237
|
+
## Data Import Options
|
|
238
|
+
|
|
239
|
+
### CSV Files
|
|
240
|
+
```python
|
|
241
|
+
dataset.insert_from_csv_file(
|
|
242
|
+
file_path='data.csv',
|
|
243
|
+
input_columns=['question'],
|
|
244
|
+
expected_output_columns=['answer'],
|
|
245
|
+
metadata_columns=['category']
|
|
246
|
+
)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### JSONL Files
|
|
250
|
+
```python
|
|
251
|
+
dataset.insert_from_jsonl_file(
|
|
252
|
+
file_path='data.jsonl',
|
|
253
|
+
input_keys=['question'],
|
|
254
|
+
expected_output_keys=['answer'],
|
|
255
|
+
metadata_keys=['category']
|
|
256
|
+
)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Pandas DataFrames
|
|
260
|
+
```python
|
|
261
|
+
dataset.insert_from_pandas(
|
|
262
|
+
df=df,
|
|
263
|
+
input_columns=['question'],
|
|
264
|
+
expected_output_columns=['answer'],
|
|
265
|
+
metadata_columns=['category']
|
|
266
|
+
)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Advanced Usage
|
|
270
|
+
|
|
271
|
+
### Concurrent Processing
|
|
272
|
+
```python
|
|
273
|
+
experiment_result = evaluate(
|
|
274
|
+
dataset=dataset,
|
|
275
|
+
task=my_llm_task,
|
|
276
|
+
evaluators=evaluators,
|
|
277
|
+
max_workers=4 # Process 4 test cases concurrently
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Custom Score Mapping
|
|
282
|
+
|
|
283
|
+
The `score_fn_kwargs_mapping` parameter is essential for connecting your task outputs to evaluator inputs. Different evaluators expect different parameter names, but your task function returns outputs with specific keys.
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
# Your task returns:
|
|
287
|
+
{"answer": "Paris is the capital of France"}
|
|
288
|
+
|
|
289
|
+
# But evaluators expect different parameter names:
|
|
290
|
+
AnswerRelevance.score(prompt="...", response="...") # Needs 'prompt' and 'response'
|
|
291
|
+
Conciseness.score(response="...") # Needs 'response'
|
|
292
|
+
Sentiment.score(text="...") # Needs 'text'
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
**The Solution**: Map each evaluator parameter name to the task output key (or a callable) that supplies its value:
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
score_fn_kwargs_mapping={
|
|
299
|
+
"question": "question", # Map 'question' parameter to 'question' key
|
|
300
|
+
"response": "answer", # Map 'response' parameter to 'answer' key
|
|
301
|
+
"text": "answer", # Map 'text' parameter to 'answer' key
|
|
302
|
+
"prompt": lambda x: x["inputs"]["question"], # Map 'prompt' to input question
|
|
303
|
+
"context": lambda x: x["extras"]["context"] # Map 'context' to extras
|
|
304
|
+
}
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Experiment Metadata
|
|
308
|
+
```python
|
|
309
|
+
experiment_result = evaluate(
|
|
310
|
+
dataset=dataset,
|
|
311
|
+
task=my_llm_task,
|
|
312
|
+
evaluators=evaluators,
|
|
313
|
+
metadata={
|
|
314
|
+
"model_version": "gpt-4",
|
|
315
|
+
"evaluation_date": "2024-01-15",
|
|
316
|
+
"temperature": 0.7
|
|
317
|
+
}
|
|
318
|
+
)
|
|
319
|
+
```
|