fiddler-evals 0.1.1.dev13__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fiddler_evals-0.1.1.dev13/fiddler_evals.egg-info → fiddler_evals-0.2.0}/PKG-INFO +162 -29
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/PUBLIC.md +161 -27
- fiddler_evals-0.2.0/fiddler_evals/VERSION +1 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/connection.py +58 -95
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/application.py +82 -77
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/dataset.py +382 -331
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/experiment.py +292 -285
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/project.py +67 -60
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset_items.py +196 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_results.py +48 -13
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/answer_relevance.py +30 -22
- fiddler_evals-0.2.0/fiddler_evals/evaluators/base.py +270 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/coherence.py +15 -9
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/conciseness.py +7 -4
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/eval_fn.py +19 -9
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_prompt_safety.py +1 -1
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_response_faithfulness.py +1 -1
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/regex.py +11 -14
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/sentiment.py +1 -1
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_answer_relevance.py +65 -16
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_coherence.py +58 -76
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_conciseness.py +62 -16
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_eval_fn.py +52 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +6 -6
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +1 -1
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_regex.py +39 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_sentiment.py +6 -6
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_topic_classification.py +6 -6
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/topic.py +5 -3
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/exceptions.py +44 -62
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/experiment.py +2 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/evaluation.py +80 -43
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_runner.py +71 -55
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_evaluate.py +419 -34
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +19 -2
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/constants.py +3 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0/fiddler_evals.egg-info}/PKG-INFO +162 -29
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/SOURCES.txt +0 -2
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/requires.txt +0 -1
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/pyproject.toml +1 -2
- fiddler_evals-0.1.1.dev13/fiddler_evals/VERSION +0 -1
- fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/base.py +0 -141
- fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/tests/test_toxicity.py +0 -201
- fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/toxicity.py +0 -101
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/MANIFEST.in +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/README.md +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/configs.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/conftest.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/constants.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/decorators.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/base.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_application.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_project.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/http_client.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/semver.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_request_client.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/application.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/base.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/compact.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/dataset.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/error.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/evaluator.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/filter_query.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/project.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/response.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/score.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/server_info.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/executor.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_connection.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_decorators.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/environment.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/pd.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/test_environment.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tqdm.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/version.py +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/dependency_links.txt +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/top_level.txt +0 -0
- {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fiddler-evals
|
|
3
|
-
Version: 0.1.1.dev13
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Python SDK for evaluating LLM Applications
|
|
5
5
|
Author-email: Fiddler AI <support@fiddler.ai>
|
|
6
6
|
Maintainer-email: Fiddler AI <support@fiddler.ai>
|
|
@@ -15,7 +15,6 @@ Requires-Dist: requests<3
|
|
|
15
15
|
Requires-Dist: pydantic>=2.0.0
|
|
16
16
|
Requires-Dist: tqdm
|
|
17
17
|
Requires-Dist: typing-extensions<5,>=4.6.0
|
|
18
|
-
Requires-Dist: pandas>=1.2.5
|
|
19
18
|
Requires-Dist: python-decouple
|
|
20
19
|
Provides-Extra: pandas
|
|
21
20
|
Requires-Dist: pandas>=1.2.5; extra == "pandas"
|
|
@@ -60,7 +59,7 @@ pip install --upgrade --pre fiddler-evals
|
|
|
60
59
|
from fiddler_evals import init
|
|
61
60
|
|
|
62
61
|
# Initialize connection
|
|
63
|
-
init(url='https://your-
|
|
62
|
+
init(url='https://your-instance.fiddler.ai', token='your-api-token')
|
|
64
63
|
```
|
|
65
64
|
|
|
66
65
|
### 2. Create Project Structure
|
|
@@ -101,19 +100,32 @@ dataset.insert(test_cases)
|
|
|
101
100
|
|
|
102
101
|
### 4. Use Built-in Evaluators
|
|
103
102
|
|
|
103
|
+
**Configure LLM Gateway provider:**
|
|
104
|
+
|
|
105
|
+
Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
106
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
107
|
+
|
|
104
108
|
```python
|
|
105
109
|
from fiddler_evals.evaluators import (
|
|
106
110
|
AnswerRelevance, Coherence, Conciseness,
|
|
107
|
-
|
|
111
|
+
Sentiment, RegexSearch
|
|
108
112
|
)
|
|
109
113
|
|
|
110
|
-
# Test
|
|
111
|
-
relevance_evaluator = AnswerRelevance(
|
|
114
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
115
|
+
relevance_evaluator = AnswerRelevance(
|
|
116
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
117
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
118
|
+
)
|
|
112
119
|
score = relevance_evaluator.score(
|
|
113
120
|
prompt="What is the capital of France?",
|
|
114
121
|
response="Paris is the capital of France."
|
|
115
122
|
)
|
|
116
123
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
124
|
+
|
|
125
|
+
# Test other evaluators (no model parameter needed)
|
|
126
|
+
sentiment_evaluator = Sentiment()
|
|
127
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
128
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
117
129
|
```
|
|
118
130
|
|
|
119
131
|
### 5. Create Custom Evaluators
|
|
@@ -128,8 +140,11 @@ class PolitenessEvaluator(Evaluator):
|
|
|
128
140
|
Useful for customer service or chatbot applications.
|
|
129
141
|
"""
|
|
130
142
|
|
|
131
|
-
def __init__(self):
|
|
132
|
-
super().__init__(
|
|
143
|
+
def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
|
|
144
|
+
super().__init__(
|
|
145
|
+
score_name_prefix=score_name_prefix,
|
|
146
|
+
score_fn_kwargs_mapping=score_fn_kwargs_mapping
|
|
147
|
+
)
|
|
133
148
|
self.polite_words = [
|
|
134
149
|
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
135
150
|
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
@@ -151,13 +166,13 @@ class PolitenessEvaluator(Evaluator):
|
|
|
151
166
|
reasoning = "No polite language detected"
|
|
152
167
|
|
|
153
168
|
return Score(
|
|
154
|
-
name="politeness",
|
|
169
|
+
name=f"{self.score_name_prefix}politeness",
|
|
155
170
|
evaluator_name=self.name,
|
|
156
171
|
value=score_value,
|
|
157
172
|
reasoning=reasoning
|
|
158
173
|
)
|
|
159
174
|
|
|
160
|
-
# Test the evaluator
|
|
175
|
+
# Test the evaluator with different configurations
|
|
161
176
|
politeness_evaluator = PolitenessEvaluator()
|
|
162
177
|
|
|
163
178
|
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
@@ -165,6 +180,17 @@ impolite_response = "I don't know. Figure it out yourself."
|
|
|
165
180
|
|
|
166
181
|
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
167
182
|
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
183
|
+
|
|
184
|
+
# Use with different configurations
|
|
185
|
+
customer_service_evaluator = PolitenessEvaluator(
|
|
186
|
+
score_name_prefix="customer_service",
|
|
187
|
+
score_fn_kwargs_mapping={"output": "response"}
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
support_evaluator = PolitenessEvaluator(
|
|
191
|
+
score_name_prefix="support",
|
|
192
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
193
|
+
)
|
|
168
194
|
```
|
|
169
195
|
|
|
170
196
|
### 5.1. Function-Based Evaluators
|
|
@@ -185,8 +211,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
185
211
|
|
|
186
212
|
# Use functions directly in evaluators list
|
|
187
213
|
evaluators = [
|
|
188
|
-
AnswerRelevance(),
|
|
189
|
-
Conciseness(),
|
|
214
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
215
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
190
216
|
word_count_evaluator, # Function evaluator
|
|
191
217
|
contains_number_evaluator, # Function evaluator
|
|
192
218
|
]
|
|
@@ -215,12 +241,42 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
215
241
|
answer = call_your_llm(question)
|
|
216
242
|
return {"answer": answer}
|
|
217
243
|
|
|
218
|
-
# Set up evaluators
|
|
244
|
+
# Set up evaluators with different configurations
|
|
219
245
|
evaluators = [
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
246
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
247
|
+
AnswerRelevance(
|
|
248
|
+
model="openai/gpt-4o",
|
|
249
|
+
credential="my-openai-cred",
|
|
250
|
+
score_name_prefix="primary"
|
|
251
|
+
),
|
|
252
|
+
Conciseness(
|
|
253
|
+
model="openai/gpt-4o",
|
|
254
|
+
credential="my-openai-cred",
|
|
255
|
+
score_name_prefix="primary"
|
|
256
|
+
),
|
|
257
|
+
|
|
258
|
+
# Other evaluators
|
|
259
|
+
Sentiment(score_name_prefix="primary"),
|
|
260
|
+
|
|
261
|
+
# Custom evaluators with specific mappings
|
|
262
|
+
PolitenessEvaluator(
|
|
263
|
+
score_name_prefix="quality",
|
|
264
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
265
|
+
),
|
|
266
|
+
|
|
267
|
+
# Multiple instances of same evaluator for different fields
|
|
268
|
+
RegexSearch(
|
|
269
|
+
pattern=r"\d+",
|
|
270
|
+
score_name_prefix="question",
|
|
271
|
+
score_name="has_number",
|
|
272
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
273
|
+
),
|
|
274
|
+
RegexSearch(
|
|
275
|
+
pattern=r"\d+",
|
|
276
|
+
score_name_prefix="answer",
|
|
277
|
+
score_name="has_number",
|
|
278
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
279
|
+
),
|
|
224
280
|
]
|
|
225
281
|
|
|
226
282
|
# Run evaluation
|
|
@@ -231,9 +287,8 @@ experiment_result = evaluate(
|
|
|
231
287
|
name_prefix="my_evaluation",
|
|
232
288
|
description="Comprehensive LLM evaluation",
|
|
233
289
|
score_fn_kwargs_mapping={
|
|
234
|
-
"question": "question",
|
|
290
|
+
"question": lambda x: x["inputs"]["question"],
|
|
235
291
|
"response": "answer",
|
|
236
|
-
"output": "answer",
|
|
237
292
|
"text": "answer",
|
|
238
293
|
"prompt": lambda x: x["inputs"]["question"],
|
|
239
294
|
}
|
|
@@ -241,20 +296,25 @@ experiment_result = evaluate(
|
|
|
241
296
|
|
|
242
297
|
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
243
298
|
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
299
|
+
|
|
300
|
+
# Results in organized score names:
|
|
301
|
+
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
302
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
244
303
|
```
|
|
245
304
|
|
|
246
305
|
## Built-in Evaluators
|
|
247
306
|
|
|
248
|
-
| Evaluator | Purpose |
|
|
249
|
-
|
|
250
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
251
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
252
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
253
|
-
| `
|
|
254
|
-
| `
|
|
255
|
-
| `
|
|
256
|
-
| `
|
|
257
|
-
|
|
307
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
308
|
+
|-----------|---------|------------------------|------------------|
|
|
309
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
310
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
|
|
311
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
312
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
313
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
314
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
315
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
316
|
+
|
|
317
|
+
**Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
|
|
258
318
|
|
|
259
319
|
## Data Import Options
|
|
260
320
|
|
|
@@ -326,6 +386,79 @@ score_fn_kwargs_mapping={
|
|
|
326
386
|
}
|
|
327
387
|
```
|
|
328
388
|
|
|
389
|
+
### Multiple Evaluator Instances with Different Mappings
|
|
390
|
+
|
|
391
|
+
You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
|
|
392
|
+
|
|
393
|
+
```python
|
|
394
|
+
from fiddler_evals.evaluators import RegexSearch
|
|
395
|
+
|
|
396
|
+
# Create multiple RegexSearch evaluators for different fields
|
|
397
|
+
evaluators = [
|
|
398
|
+
# Check for numbers in the question
|
|
399
|
+
RegexSearch(
|
|
400
|
+
pattern=r"\d+",
|
|
401
|
+
score_name_prefix="question",
|
|
402
|
+
score_name="has_number",
|
|
403
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
404
|
+
),
|
|
405
|
+
# Check for numbers in the answer
|
|
406
|
+
RegexSearch(
|
|
407
|
+
pattern=r"\d+",
|
|
408
|
+
score_name_prefix="answer",
|
|
409
|
+
score_name="has_number",
|
|
410
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
411
|
+
),
|
|
412
|
+
# Check for capital letters in the answer
|
|
413
|
+
RegexSearch(
|
|
414
|
+
pattern=r"[A-Z]",
|
|
415
|
+
score_name_prefix="answer",
|
|
416
|
+
score_name="has_caps",
|
|
417
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
418
|
+
)
|
|
419
|
+
]
|
|
420
|
+
|
|
421
|
+
# Run evaluation
|
|
422
|
+
experiment_result = evaluate(
|
|
423
|
+
dataset=dataset,
|
|
424
|
+
task=my_llm_task,
|
|
425
|
+
evaluators=evaluators,
|
|
426
|
+
score_fn_kwargs_mapping={
|
|
427
|
+
"question": lambda x: x["inputs"]["question"]
|
|
428
|
+
}
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
# Results in scores named:
|
|
432
|
+
# "question_has_number", "answer_has_number", "answer_has_caps"
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
### Parameter Mapping Priority
|
|
436
|
+
|
|
437
|
+
When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
|
|
438
|
+
|
|
439
|
+
```python
|
|
440
|
+
# Evaluator-level mapping (higher priority)
|
|
441
|
+
evaluator = RegexSearch(
|
|
442
|
+
pattern=r"\d+",
|
|
443
|
+
score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
# Evaluation-level mapping (lower priority)
|
|
447
|
+
experiment_result = evaluate(
|
|
448
|
+
dataset=dataset,
|
|
449
|
+
task=my_llm_task,
|
|
450
|
+
evaluators=[evaluator],
|
|
451
|
+
score_fn_kwargs_mapping={
|
|
452
|
+
"output": "question" # This is ignored due to evaluator-level mapping
|
|
453
|
+
}
|
|
454
|
+
)
|
|
455
|
+
```
|
|
456
|
+
|
|
457
|
+
**Mapping Priority (highest to lowest):**
|
|
458
|
+
1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
|
|
459
|
+
2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
|
|
460
|
+
3. Default parameter resolution
|
|
461
|
+
|
|
329
462
|
### Experiment Metadata
|
|
330
463
|
```python
|
|
331
464
|
experiment_result = evaluate(
|
|
@@ -38,7 +38,7 @@ pip install --upgrade --pre fiddler-evals
|
|
|
38
38
|
from fiddler_evals import init
|
|
39
39
|
|
|
40
40
|
# Initialize connection
|
|
41
|
-
init(url='https://your-
|
|
41
|
+
init(url='https://your-instance.fiddler.ai', token='your-api-token')
|
|
42
42
|
```
|
|
43
43
|
|
|
44
44
|
### 2. Create Project Structure
|
|
@@ -79,19 +79,32 @@ dataset.insert(test_cases)
|
|
|
79
79
|
|
|
80
80
|
### 4. Use Built-in Evaluators
|
|
81
81
|
|
|
82
|
+
**Configure LLM Gateway provider:**
|
|
83
|
+
|
|
84
|
+
Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
85
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
86
|
+
|
|
82
87
|
```python
|
|
83
88
|
from fiddler_evals.evaluators import (
|
|
84
89
|
AnswerRelevance, Coherence, Conciseness,
|
|
85
|
-
|
|
90
|
+
Sentiment, RegexSearch
|
|
86
91
|
)
|
|
87
92
|
|
|
88
|
-
# Test
|
|
89
|
-
relevance_evaluator = AnswerRelevance(
|
|
93
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
94
|
+
relevance_evaluator = AnswerRelevance(
|
|
95
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
96
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
97
|
+
)
|
|
90
98
|
score = relevance_evaluator.score(
|
|
91
99
|
prompt="What is the capital of France?",
|
|
92
100
|
response="Paris is the capital of France."
|
|
93
101
|
)
|
|
94
102
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
103
|
+
|
|
104
|
+
# Test other evaluators (no model parameter needed)
|
|
105
|
+
sentiment_evaluator = Sentiment()
|
|
106
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
107
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
95
108
|
```
|
|
96
109
|
|
|
97
110
|
### 5. Create Custom Evaluators
|
|
@@ -106,8 +119,11 @@ class PolitenessEvaluator(Evaluator):
|
|
|
106
119
|
Useful for customer service or chatbot applications.
|
|
107
120
|
"""
|
|
108
121
|
|
|
109
|
-
def __init__(self):
|
|
110
|
-
super().__init__(
|
|
122
|
+
def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
|
|
123
|
+
super().__init__(
|
|
124
|
+
score_name_prefix=score_name_prefix,
|
|
125
|
+
score_fn_kwargs_mapping=score_fn_kwargs_mapping
|
|
126
|
+
)
|
|
111
127
|
self.polite_words = [
|
|
112
128
|
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
113
129
|
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
@@ -129,13 +145,13 @@ class PolitenessEvaluator(Evaluator):
|
|
|
129
145
|
reasoning = "No polite language detected"
|
|
130
146
|
|
|
131
147
|
return Score(
|
|
132
|
-
name="politeness",
|
|
148
|
+
name=f"{self.score_name_prefix}politeness",
|
|
133
149
|
evaluator_name=self.name,
|
|
134
150
|
value=score_value,
|
|
135
151
|
reasoning=reasoning
|
|
136
152
|
)
|
|
137
153
|
|
|
138
|
-
# Test the evaluator
|
|
154
|
+
# Test the evaluator with different configurations
|
|
139
155
|
politeness_evaluator = PolitenessEvaluator()
|
|
140
156
|
|
|
141
157
|
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
@@ -143,6 +159,17 @@ impolite_response = "I don't know. Figure it out yourself."
|
|
|
143
159
|
|
|
144
160
|
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
145
161
|
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
162
|
+
|
|
163
|
+
# Use with different configurations
|
|
164
|
+
customer_service_evaluator = PolitenessEvaluator(
|
|
165
|
+
score_name_prefix="customer_service",
|
|
166
|
+
score_fn_kwargs_mapping={"output": "response"}
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
support_evaluator = PolitenessEvaluator(
|
|
170
|
+
score_name_prefix="support",
|
|
171
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
172
|
+
)
|
|
146
173
|
```
|
|
147
174
|
|
|
148
175
|
### 5.1. Function-Based Evaluators
|
|
@@ -163,8 +190,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
163
190
|
|
|
164
191
|
# Use functions directly in evaluators list
|
|
165
192
|
evaluators = [
|
|
166
|
-
AnswerRelevance(),
|
|
167
|
-
Conciseness(),
|
|
193
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
194
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
168
195
|
word_count_evaluator, # Function evaluator
|
|
169
196
|
contains_number_evaluator, # Function evaluator
|
|
170
197
|
]
|
|
@@ -193,12 +220,42 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
193
220
|
answer = call_your_llm(question)
|
|
194
221
|
return {"answer": answer}
|
|
195
222
|
|
|
196
|
-
# Set up evaluators
|
|
223
|
+
# Set up evaluators with different configurations
|
|
197
224
|
evaluators = [
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
225
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
226
|
+
AnswerRelevance(
|
|
227
|
+
model="openai/gpt-4o",
|
|
228
|
+
credential="my-openai-cred",
|
|
229
|
+
score_name_prefix="primary"
|
|
230
|
+
),
|
|
231
|
+
Conciseness(
|
|
232
|
+
model="openai/gpt-4o",
|
|
233
|
+
credential="my-openai-cred",
|
|
234
|
+
score_name_prefix="primary"
|
|
235
|
+
),
|
|
236
|
+
|
|
237
|
+
# Other evaluators
|
|
238
|
+
Sentiment(score_name_prefix="primary"),
|
|
239
|
+
|
|
240
|
+
# Custom evaluators with specific mappings
|
|
241
|
+
PolitenessEvaluator(
|
|
242
|
+
score_name_prefix="quality",
|
|
243
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
244
|
+
),
|
|
245
|
+
|
|
246
|
+
# Multiple instances of same evaluator for different fields
|
|
247
|
+
RegexSearch(
|
|
248
|
+
pattern=r"\d+",
|
|
249
|
+
score_name_prefix="question",
|
|
250
|
+
score_name="has_number",
|
|
251
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
252
|
+
),
|
|
253
|
+
RegexSearch(
|
|
254
|
+
pattern=r"\d+",
|
|
255
|
+
score_name_prefix="answer",
|
|
256
|
+
score_name="has_number",
|
|
257
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
258
|
+
),
|
|
202
259
|
]
|
|
203
260
|
|
|
204
261
|
# Run evaluation
|
|
@@ -209,9 +266,8 @@ experiment_result = evaluate(
|
|
|
209
266
|
name_prefix="my_evaluation",
|
|
210
267
|
description="Comprehensive LLM evaluation",
|
|
211
268
|
score_fn_kwargs_mapping={
|
|
212
|
-
"question": "question",
|
|
269
|
+
"question": lambda x: x["inputs"]["question"],
|
|
213
270
|
"response": "answer",
|
|
214
|
-
"output": "answer",
|
|
215
271
|
"text": "answer",
|
|
216
272
|
"prompt": lambda x: x["inputs"]["question"],
|
|
217
273
|
}
|
|
@@ -219,20 +275,25 @@ experiment_result = evaluate(
|
|
|
219
275
|
|
|
220
276
|
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
221
277
|
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
278
|
+
|
|
279
|
+
# Results in organized score names:
|
|
280
|
+
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
281
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
222
282
|
```
|
|
223
283
|
|
|
224
284
|
## Built-in Evaluators
|
|
225
285
|
|
|
226
|
-
| Evaluator | Purpose |
|
|
227
|
-
|
|
228
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
229
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
230
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
231
|
-
| `
|
|
232
|
-
| `
|
|
233
|
-
| `
|
|
234
|
-
| `
|
|
235
|
-
|
|
286
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
287
|
+
|-----------|---------|------------------------|------------------|
|
|
288
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
289
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
|
|
290
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
291
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
292
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
293
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
294
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
295
|
+
|
|
296
|
+
**Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
|
|
236
297
|
|
|
237
298
|
## Data Import Options
|
|
238
299
|
|
|
@@ -304,6 +365,79 @@ score_fn_kwargs_mapping={
|
|
|
304
365
|
}
|
|
305
366
|
```
|
|
306
367
|
|
|
368
|
+
### Multiple Evaluator Instances with Different Mappings
|
|
369
|
+
|
|
370
|
+
You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
|
|
371
|
+
|
|
372
|
+
```python
|
|
373
|
+
from fiddler_evals.evaluators import RegexSearch
|
|
374
|
+
|
|
375
|
+
# Create multiple RegexSearch evaluators for different fields
|
|
376
|
+
evaluators = [
|
|
377
|
+
# Check for numbers in the question
|
|
378
|
+
RegexSearch(
|
|
379
|
+
pattern=r"\d+",
|
|
380
|
+
score_name_prefix="question",
|
|
381
|
+
score_name="has_number",
|
|
382
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
383
|
+
),
|
|
384
|
+
# Check for numbers in the answer
|
|
385
|
+
RegexSearch(
|
|
386
|
+
pattern=r"\d+",
|
|
387
|
+
score_name_prefix="answer",
|
|
388
|
+
score_name="has_number",
|
|
389
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
390
|
+
),
|
|
391
|
+
# Check for capital letters in the answer
|
|
392
|
+
RegexSearch(
|
|
393
|
+
pattern=r"[A-Z]",
|
|
394
|
+
score_name_prefix="answer",
|
|
395
|
+
score_name="has_caps",
|
|
396
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
397
|
+
)
|
|
398
|
+
]
|
|
399
|
+
|
|
400
|
+
# Run evaluation
|
|
401
|
+
experiment_result = evaluate(
|
|
402
|
+
dataset=dataset,
|
|
403
|
+
task=my_llm_task,
|
|
404
|
+
evaluators=evaluators,
|
|
405
|
+
score_fn_kwargs_mapping={
|
|
406
|
+
"question": lambda x: x["inputs"]["question"]
|
|
407
|
+
}
|
|
408
|
+
)
|
|
409
|
+
|
|
410
|
+
# Results in scores named:
|
|
411
|
+
# "question_has_number", "answer_has_number", "answer_has_caps"
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
### Parameter Mapping Priority
|
|
415
|
+
|
|
416
|
+
When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
# Evaluator-level mapping (higher priority)
|
|
420
|
+
evaluator = RegexSearch(
|
|
421
|
+
pattern=r"\d+",
|
|
422
|
+
score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
# Evaluation-level mapping (lower priority)
|
|
426
|
+
experiment_result = evaluate(
|
|
427
|
+
dataset=dataset,
|
|
428
|
+
task=my_llm_task,
|
|
429
|
+
evaluators=[evaluator],
|
|
430
|
+
score_fn_kwargs_mapping={
|
|
431
|
+
"output": "question" # This is ignored due to evaluator-level mapping
|
|
432
|
+
}
|
|
433
|
+
)
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
**Mapping Priority (highest to lowest):**
|
|
437
|
+
1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
|
|
438
|
+
2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
|
|
439
|
+
3. Default parameter resolution
|
|
440
|
+
|
|
307
441
|
### Experiment Metadata
|
|
308
442
|
```python
|
|
309
443
|
experiment_result = evaluate(
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.0
|
|
@@ -20,7 +20,6 @@ from fiddler_evals.evaluators import (
|
|
|
20
20
|
RegexSearch,
|
|
21
21
|
Sentiment,
|
|
22
22
|
TopicClassification,
|
|
23
|
-
Toxicity,
|
|
24
23
|
)
|
|
25
24
|
from fiddler_evals.evaluators.base import Evaluator
|
|
26
25
|
from fiddler_evals.evaluators.eval_fn import EvalFn
|
|
@@ -55,7 +54,6 @@ __all__ = [
|
|
|
55
54
|
"AnswerRelevance",
|
|
56
55
|
"Coherence",
|
|
57
56
|
"Conciseness",
|
|
58
|
-
"Toxicity",
|
|
59
57
|
"Sentiment",
|
|
60
58
|
"RegexSearch",
|
|
61
59
|
"RegexMatch",
|