fiddler-evals 0.1.1.dev14__tar.gz → 0.2.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fiddler_evals-0.1.1.dev14/fiddler_evals.egg-info → fiddler_evals-0.2.0rc1}/PKG-INFO +46 -23
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/PUBLIC.md +45 -21
- fiddler_evals-0.2.0rc1/fiddler_evals/VERSION +1 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/answer_relevance.py +10 -4
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/base.py +23 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/coherence.py +15 -9
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/conciseness.py +6 -3
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_answer_relevance.py +30 -15
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_coherence.py +58 -76
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_conciseness.py +61 -15
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/experiment_runner.py +3 -5
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/test_evaluate.py +156 -34
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/constants.py +3 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1/fiddler_evals.egg-info}/PKG-INFO +46 -23
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/SOURCES.txt +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/requires.txt +0 -1
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/pyproject.toml +1 -2
- fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION +0 -1
- fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/tests/test_toxicity.py +0 -201
- fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/toxicity.py +0 -101
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/MANIFEST.in +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/README.md +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/configs.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/conftest.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/connection.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/constants.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/decorators.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/application.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/base.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/dataset.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/experiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/project.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_application.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_dataset.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_dataset_items.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment_results.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_project.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/eval_fn.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/ftl_prompt_safety.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/ftl_response_faithfulness.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/regex.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/sentiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_eval_fn.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_regex.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_sentiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_topic_classification.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/topic.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/exceptions.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/http_client.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/semver.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/test_request_client.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/application.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/base.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/compact.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/dataset.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/error.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/evaluator.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/experiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/filter_query.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/project.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/response.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/score.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/server_info.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/evaluation.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/executor.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/test_connection.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/test_decorators.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/environment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/pd.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tests/test_environment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tqdm.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/version.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/dependency_links.txt +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/top_level.txt +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fiddler-evals
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0rc1
|
|
4
4
|
Summary: Python SDK for evaluating LLM Applications
|
|
5
5
|
Author-email: Fiddler AI <support@fiddler.ai>
|
|
6
6
|
Maintainer-email: Fiddler AI <support@fiddler.ai>
|
|
@@ -15,7 +15,6 @@ Requires-Dist: requests<3
|
|
|
15
15
|
Requires-Dist: pydantic>=2.0.0
|
|
16
16
|
Requires-Dist: tqdm
|
|
17
17
|
Requires-Dist: typing-extensions<5,>=4.6.0
|
|
18
|
-
Requires-Dist: pandas>=1.2.5
|
|
19
18
|
Requires-Dist: python-decouple
|
|
20
19
|
Provides-Extra: pandas
|
|
21
20
|
Requires-Dist: pandas>=1.2.5; extra == "pandas"
|
|
@@ -101,19 +100,32 @@ dataset.insert(test_cases)
|
|
|
101
100
|
|
|
102
101
|
### 4. Use Built-in Evaluators
|
|
103
102
|
|
|
103
|
+
**Configure LLM Gateway provider:**
|
|
104
|
+
|
|
105
|
+
Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
106
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
107
|
+
|
|
104
108
|
```python
|
|
105
109
|
from fiddler_evals.evaluators import (
|
|
106
110
|
AnswerRelevance, Coherence, Conciseness,
|
|
107
|
-
|
|
111
|
+
Sentiment, RegexSearch
|
|
108
112
|
)
|
|
109
113
|
|
|
110
|
-
# Test
|
|
111
|
-
relevance_evaluator = AnswerRelevance(
|
|
114
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
115
|
+
relevance_evaluator = AnswerRelevance(
|
|
116
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
117
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
118
|
+
)
|
|
112
119
|
score = relevance_evaluator.score(
|
|
113
120
|
prompt="What is the capital of France?",
|
|
114
121
|
response="Paris is the capital of France."
|
|
115
122
|
)
|
|
116
123
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
124
|
+
|
|
125
|
+
# Test other evaluators (no model parameter needed)
|
|
126
|
+
sentiment_evaluator = Sentiment()
|
|
127
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
128
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
117
129
|
```
|
|
118
130
|
|
|
119
131
|
### 5. Create Custom Evaluators
|
|
@@ -199,8 +211,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
199
211
|
|
|
200
212
|
# Use functions directly in evaluators list
|
|
201
213
|
evaluators = [
|
|
202
|
-
AnswerRelevance(),
|
|
203
|
-
Conciseness(),
|
|
214
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
215
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
204
216
|
word_count_evaluator, # Function evaluator
|
|
205
217
|
contains_number_evaluator, # Function evaluator
|
|
206
218
|
]
|
|
@@ -231,9 +243,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
231
243
|
|
|
232
244
|
# Set up evaluators with different configurations
|
|
233
245
|
evaluators = [
|
|
234
|
-
#
|
|
235
|
-
AnswerRelevance(
|
|
236
|
-
|
|
246
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
247
|
+
AnswerRelevance(
|
|
248
|
+
model="openai/gpt-4o",
|
|
249
|
+
credential="my-openai-cred",
|
|
250
|
+
score_name_prefix="primary"
|
|
251
|
+
),
|
|
252
|
+
Conciseness(
|
|
253
|
+
model="openai/gpt-4o",
|
|
254
|
+
credential="my-openai-cred",
|
|
255
|
+
score_name_prefix="primary"
|
|
256
|
+
),
|
|
257
|
+
|
|
258
|
+
# Other evaluators
|
|
237
259
|
Sentiment(score_name_prefix="primary"),
|
|
238
260
|
|
|
239
261
|
# Custom evaluators with specific mappings
|
|
@@ -245,13 +267,13 @@ evaluators = [
|
|
|
245
267
|
# Multiple instances of same evaluator for different fields
|
|
246
268
|
RegexSearch(
|
|
247
269
|
pattern=r"\d+",
|
|
248
|
-
score_name_prefix="
|
|
270
|
+
score_name_prefix="question",
|
|
249
271
|
score_name="has_number",
|
|
250
272
|
score_fn_kwargs_mapping={"output": "question"}
|
|
251
273
|
),
|
|
252
274
|
RegexSearch(
|
|
253
275
|
pattern=r"\d+",
|
|
254
|
-
score_name_prefix="
|
|
276
|
+
score_name_prefix="answer",
|
|
255
277
|
score_name="has_number",
|
|
256
278
|
score_fn_kwargs_mapping={"output": "answer"}
|
|
257
279
|
),
|
|
@@ -277,21 +299,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
|
|
|
277
299
|
|
|
278
300
|
# Results in organized score names:
|
|
279
301
|
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
280
|
-
# "quality_politeness", "
|
|
302
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
281
303
|
```
|
|
282
304
|
|
|
283
305
|
## Built-in Evaluators
|
|
284
306
|
|
|
285
|
-
| Evaluator | Purpose |
|
|
286
|
-
|
|
287
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
288
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
289
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
290
|
-
| `
|
|
291
|
-
| `
|
|
292
|
-
| `
|
|
293
|
-
| `
|
|
294
|
-
|
|
307
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
308
|
+
|-----------|---------|------------------------|------------------|
|
|
309
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
310
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` |
|
|
311
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
312
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
313
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
314
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
315
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
316
|
+
|
|
317
|
+
**Note:** Evaluators that take `model` and `credential` constructor parameters are LLM-as-a-Judge evaluators and require an LLM Gateway model. The `model` parameter must be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The optional `credential` parameter is the name of the LLM Gateway credential used for authentication.
|
|
295
318
|
|
|
296
319
|
## Data Import Options
|
|
297
320
|
|
|
@@ -79,19 +79,32 @@ dataset.insert(test_cases)
|
|
|
79
79
|
|
|
80
80
|
### 4. Use Built-in Evaluators
|
|
81
81
|
|
|
82
|
+
**Configure LLM Gateway provider:**
|
|
83
|
+
|
|
84
|
+
Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
85
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
86
|
+
|
|
82
87
|
```python
|
|
83
88
|
from fiddler_evals.evaluators import (
|
|
84
89
|
AnswerRelevance, Coherence, Conciseness,
|
|
85
|
-
|
|
90
|
+
Sentiment, RegexSearch
|
|
86
91
|
)
|
|
87
92
|
|
|
88
|
-
# Test
|
|
89
|
-
relevance_evaluator = AnswerRelevance(
|
|
93
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
94
|
+
relevance_evaluator = AnswerRelevance(
|
|
95
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
96
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
97
|
+
)
|
|
90
98
|
score = relevance_evaluator.score(
|
|
91
99
|
prompt="What is the capital of France?",
|
|
92
100
|
response="Paris is the capital of France."
|
|
93
101
|
)
|
|
94
102
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
103
|
+
|
|
104
|
+
# Test other evaluators (no model parameter needed)
|
|
105
|
+
sentiment_evaluator = Sentiment()
|
|
106
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
107
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
95
108
|
```
|
|
96
109
|
|
|
97
110
|
### 5. Create Custom Evaluators
|
|
@@ -177,8 +190,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
177
190
|
|
|
178
191
|
# Use functions directly in evaluators list
|
|
179
192
|
evaluators = [
|
|
180
|
-
AnswerRelevance(),
|
|
181
|
-
Conciseness(),
|
|
193
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
194
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
182
195
|
word_count_evaluator, # Function evaluator
|
|
183
196
|
contains_number_evaluator, # Function evaluator
|
|
184
197
|
]
|
|
@@ -209,9 +222,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
209
222
|
|
|
210
223
|
# Set up evaluators with different configurations
|
|
211
224
|
evaluators = [
|
|
212
|
-
#
|
|
213
|
-
AnswerRelevance(
|
|
214
|
-
|
|
225
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
226
|
+
AnswerRelevance(
|
|
227
|
+
model="openai/gpt-4o",
|
|
228
|
+
credential="my-openai-cred",
|
|
229
|
+
score_name_prefix="primary"
|
|
230
|
+
),
|
|
231
|
+
Conciseness(
|
|
232
|
+
model="openai/gpt-4o",
|
|
233
|
+
credential="my-openai-cred",
|
|
234
|
+
score_name_prefix="primary"
|
|
235
|
+
),
|
|
236
|
+
|
|
237
|
+
# Other evaluators
|
|
215
238
|
Sentiment(score_name_prefix="primary"),
|
|
216
239
|
|
|
217
240
|
# Custom evaluators with specific mappings
|
|
@@ -223,13 +246,13 @@ evaluators = [
|
|
|
223
246
|
# Multiple instances of same evaluator for different fields
|
|
224
247
|
RegexSearch(
|
|
225
248
|
pattern=r"\d+",
|
|
226
|
-
score_name_prefix="
|
|
249
|
+
score_name_prefix="question",
|
|
227
250
|
score_name="has_number",
|
|
228
251
|
score_fn_kwargs_mapping={"output": "question"}
|
|
229
252
|
),
|
|
230
253
|
RegexSearch(
|
|
231
254
|
pattern=r"\d+",
|
|
232
|
-
score_name_prefix="
|
|
255
|
+
score_name_prefix="answer",
|
|
233
256
|
score_name="has_number",
|
|
234
257
|
score_fn_kwargs_mapping={"output": "answer"}
|
|
235
258
|
),
|
|
@@ -255,21 +278,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
|
|
|
255
278
|
|
|
256
279
|
# Results in organized score names:
|
|
257
280
|
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
258
|
-
# "quality_politeness", "
|
|
281
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
259
282
|
```
|
|
260
283
|
|
|
261
284
|
## Built-in Evaluators
|
|
262
285
|
|
|
263
|
-
| Evaluator | Purpose |
|
|
264
|
-
|
|
265
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
266
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
267
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
268
|
-
| `
|
|
269
|
-
| `
|
|
270
|
-
| `
|
|
271
|
-
| `
|
|
272
|
-
|
|
286
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
287
|
+
|-----------|---------|------------------------|------------------|
|
|
288
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
289
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` |
|
|
290
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
291
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
292
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
293
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
294
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
295
|
+
|
|
296
|
+
**Note:** Evaluators that take `model` and `credential` constructor parameters are LLM-as-a-Judge evaluators and require an LLM Gateway model. The `model` parameter must be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The optional `credential` parameter is the name of the LLM Gateway credential used for authentication.
|
|
273
297
|
|
|
274
298
|
## Data Import Options
|
|
275
299
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.0rc1
|
|
@@ -20,7 +20,6 @@ from fiddler_evals.evaluators import (
|
|
|
20
20
|
RegexSearch,
|
|
21
21
|
Sentiment,
|
|
22
22
|
TopicClassification,
|
|
23
|
-
Toxicity,
|
|
24
23
|
)
|
|
25
24
|
from fiddler_evals.evaluators.base import Evaluator
|
|
26
25
|
from fiddler_evals.evaluators.eval_fn import EvalFn
|
|
@@ -55,7 +54,6 @@ __all__ = [
|
|
|
55
54
|
"AnswerRelevance",
|
|
56
55
|
"Coherence",
|
|
57
56
|
"Conciseness",
|
|
58
|
-
"Toxicity",
|
|
59
57
|
"Sentiment",
|
|
60
58
|
"RegexSearch",
|
|
61
59
|
"RegexMatch",
|
|
@@ -7,7 +7,6 @@ from fiddler_evals.evaluators.ftl_response_faithfulness import FTLResponseFaithf
|
|
|
7
7
|
from fiddler_evals.evaluators.regex import RegexMatch, RegexSearch
|
|
8
8
|
from fiddler_evals.evaluators.sentiment import Sentiment
|
|
9
9
|
from fiddler_evals.evaluators.topic import TopicClassification
|
|
10
|
-
from fiddler_evals.evaluators.toxicity import Toxicity
|
|
11
10
|
|
|
12
11
|
__all__ = [
|
|
13
12
|
"RegexSearch",
|
|
@@ -17,7 +16,6 @@ __all__ = [
|
|
|
17
16
|
"Conciseness",
|
|
18
17
|
"FTLPromptSafety",
|
|
19
18
|
"FTLResponseFaithfulness",
|
|
20
|
-
"Toxicity",
|
|
21
19
|
"Sentiment",
|
|
22
20
|
"TopicClassification",
|
|
23
21
|
"EvalFn",
|
{fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/answer_relevance.py
RENAMED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
from fiddler_evals.evaluators.base import
|
|
1
|
+
from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
|
|
2
2
|
from fiddler_evals.pydantic_models.score import Score
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
class AnswerRelevance(
|
|
5
|
+
class AnswerRelevance(FiddlerLLMAAJEvaluator):
|
|
6
6
|
"""Evaluator to assess how well an answer addresses a given question.
|
|
7
7
|
|
|
8
8
|
The AnswerRelevance evaluator measures whether an LLM's answer is relevant
|
|
@@ -85,8 +85,14 @@ class AnswerRelevance(FiddlerEvaluator):
|
|
|
85
85
|
|
|
86
86
|
payload = {
|
|
87
87
|
"evaluator_name": self.name,
|
|
88
|
-
"parameters": {
|
|
89
|
-
|
|
88
|
+
"parameters": {
|
|
89
|
+
"model": self.model,
|
|
90
|
+
"credential": self.credential,
|
|
91
|
+
},
|
|
92
|
+
"inputs": {
|
|
93
|
+
"prompt": prompt,
|
|
94
|
+
"response": response,
|
|
95
|
+
},
|
|
90
96
|
}
|
|
91
97
|
|
|
92
98
|
return self._parse_scores(data=self.make_call(payload))[0]
|
|
@@ -243,3 +243,26 @@ class FiddlerEvaluator(Evaluator, ABC):
|
|
|
243
243
|
scores.append(score)
|
|
244
244
|
|
|
245
245
|
return scores
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
class FiddlerLLMAAJEvaluator(FiddlerEvaluator, ABC):
|
|
249
|
+
"""Base class for LLMAAJ evaluators that use Fiddler's evaluator API."""
|
|
250
|
+
|
|
251
|
+
def __init__(
|
|
252
|
+
self, model: str, credential: str | None = None, **kwargs: Any
|
|
253
|
+
) -> None:
|
|
254
|
+
"""Initialize the LLMAAJ evaluator with model and credential.
|
|
255
|
+
|
|
256
|
+
Args:
|
|
257
|
+
model (str): LLM Gateway model name in `{provider}/{model}` format.
|
|
258
|
+
E.g., `openai/gpt-4o`
|
|
259
|
+
credential (str): Name of the LLM Gateway credential for the above provider.
|
|
260
|
+
**kwargs: Additional keyword arguments
|
|
261
|
+
"""
|
|
262
|
+
super().__init__(**kwargs)
|
|
263
|
+
|
|
264
|
+
if not model:
|
|
265
|
+
raise ValueError("model is required for LLMAAJ based evaluators")
|
|
266
|
+
|
|
267
|
+
self.model = model
|
|
268
|
+
self.credential = credential
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from fiddler_evals.evaluators.base import
|
|
3
|
+
from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
|
|
4
4
|
from fiddler_evals.pydantic_models.score import Score
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class Coherence(
|
|
7
|
+
class Coherence(FiddlerLLMAAJEvaluator):
|
|
8
8
|
"""Evaluator to assess the coherence and logical flow of a response.
|
|
9
9
|
|
|
10
10
|
The Coherence evaluator measures whether a response is well-structured, logically
|
|
@@ -58,7 +58,8 @@ class Coherence(FiddlerEvaluator):
|
|
|
58
58
|
|
|
59
59
|
# Incoherent response
|
|
60
60
|
incoherent_score = evaluator.score(
|
|
61
|
-
|
|
61
|
+
prompt="Explain the process of making coffee",
|
|
62
|
+
response="The sky is blue. I like pizza. Quantum physics is complex. Let's go shopping.",
|
|
62
63
|
)
|
|
63
64
|
print(f"Coherence: {incoherent_score.value}") # 0.0
|
|
64
65
|
|
|
@@ -83,28 +84,33 @@ class Coherence(FiddlerEvaluator):
|
|
|
83
84
|
|
|
84
85
|
name = "coherence"
|
|
85
86
|
|
|
86
|
-
def score(self,
|
|
87
|
+
def score(self, prompt: str, response: str) -> Score: # pylint: disable=arguments-differ
|
|
87
88
|
"""Score the coherence of a response.
|
|
88
89
|
|
|
89
90
|
Args:
|
|
91
|
+
prompt (str): The original prompt that generated the response.
|
|
90
92
|
response (str): The response to evaluate for coherence.
|
|
91
|
-
prompt (str, optional): The original prompt that generated the response.
|
|
92
93
|
|
|
93
94
|
Returns:
|
|
94
95
|
Score: A Score object for coherence assessment.
|
|
95
96
|
"""
|
|
96
|
-
response = response.strip() if response else ""
|
|
97
97
|
prompt = prompt.strip() if prompt else ""
|
|
98
|
+
response = response.strip() if response else ""
|
|
98
99
|
|
|
99
|
-
if not response:
|
|
100
|
-
raise ValueError(
|
|
100
|
+
if not prompt or not response:
|
|
101
|
+
raise ValueError(
|
|
102
|
+
"prompt and response are required for coherence evaluation"
|
|
103
|
+
)
|
|
101
104
|
|
|
102
105
|
# Build inputs dictionary
|
|
103
106
|
inputs = {"response": response, "prompt": prompt}
|
|
104
107
|
|
|
105
108
|
payload = {
|
|
106
109
|
"evaluator_name": self.name,
|
|
107
|
-
"parameters": {
|
|
110
|
+
"parameters": {
|
|
111
|
+
"model": self.model,
|
|
112
|
+
"credential": self.credential,
|
|
113
|
+
},
|
|
108
114
|
"inputs": inputs,
|
|
109
115
|
}
|
|
110
116
|
|
{fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/conciseness.py
RENAMED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
from fiddler_evals.evaluators.base import
|
|
1
|
+
from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
|
|
2
2
|
from fiddler_evals.pydantic_models.score import Score
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
class Conciseness(
|
|
5
|
+
class Conciseness(FiddlerLLMAAJEvaluator):
|
|
6
6
|
"""Evaluator to assess how concise and to-the-point an answer is.
|
|
7
7
|
|
|
8
8
|
The Conciseness evaluator measures whether an LLM's answer is appropriately
|
|
@@ -77,7 +77,10 @@ class Conciseness(FiddlerEvaluator):
|
|
|
77
77
|
|
|
78
78
|
payload = {
|
|
79
79
|
"evaluator_name": self.name,
|
|
80
|
-
"parameters": {
|
|
80
|
+
"parameters": {
|
|
81
|
+
"model": self.model,
|
|
82
|
+
"credential": self.credential,
|
|
83
|
+
},
|
|
81
84
|
"inputs": {"response": response},
|
|
82
85
|
}
|
|
83
86
|
|
|
@@ -6,15 +6,20 @@ import responses
|
|
|
6
6
|
from fiddler_evals.constants import CONTENT_TYPE_HEADER_KEY, JSON_CONTENT_TYPE
|
|
7
7
|
from fiddler_evals.evaluators.answer_relevance import AnswerRelevance
|
|
8
8
|
from fiddler_evals.pydantic_models.score import Score, ScoreStatus
|
|
9
|
-
from fiddler_evals.tests.constants import URL
|
|
9
|
+
from fiddler_evals.tests.constants import LLM_GATEWAY_CREDENTIAL, LLM_GATEWAY_MODEL, URL
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.fixture()
|
|
13
|
+
def evaluator() -> AnswerRelevance:
|
|
14
|
+
"""Create an AnswerRelevance evaluator."""
|
|
15
|
+
return AnswerRelevance(model=LLM_GATEWAY_MODEL, credential=LLM_GATEWAY_CREDENTIAL)
|
|
10
16
|
|
|
11
17
|
|
|
12
18
|
@responses.activate
|
|
13
|
-
def test_answer_relevance_relevant_answer() -> None:
|
|
19
|
+
def test_answer_relevance_relevant_answer(evaluator: AnswerRelevance) -> None:
|
|
14
20
|
"""When evaluating a relevant answer
|
|
15
21
|
Then it should return score 1.0
|
|
16
22
|
And should include proper reasoning."""
|
|
17
|
-
evaluator = AnswerRelevance()
|
|
18
23
|
|
|
19
24
|
# Mock the API response
|
|
20
25
|
mock_response = {
|
|
@@ -63,17 +68,19 @@ def test_answer_relevance_relevant_answer() -> None:
|
|
|
63
68
|
# Verify request body
|
|
64
69
|
request_body = json.loads(request.body)
|
|
65
70
|
assert request_body["evaluator_name"] == "answer_relevance"
|
|
66
|
-
assert request_body["parameters"] == {
|
|
71
|
+
assert request_body["parameters"] == {
|
|
72
|
+
"credential": LLM_GATEWAY_CREDENTIAL,
|
|
73
|
+
"model": LLM_GATEWAY_MODEL,
|
|
74
|
+
}
|
|
67
75
|
assert request_body["inputs"]["prompt"] == "What is the capital of France?"
|
|
68
76
|
assert request_body["inputs"]["response"] == "The capital of France is Paris."
|
|
69
77
|
|
|
70
78
|
|
|
71
79
|
@responses.activate
|
|
72
|
-
def test_answer_relevance_irrelevant_answer() -> None:
|
|
80
|
+
def test_answer_relevance_irrelevant_answer(evaluator: AnswerRelevance) -> None:
|
|
73
81
|
"""When evaluating an irrelevant answer
|
|
74
82
|
Then it should return score 0.0
|
|
75
83
|
And should include proper reasoning."""
|
|
76
|
-
evaluator = AnswerRelevance()
|
|
77
84
|
|
|
78
85
|
# Mock the API response
|
|
79
86
|
mock_response = {
|
|
@@ -122,17 +129,19 @@ def test_answer_relevance_irrelevant_answer() -> None:
|
|
|
122
129
|
# Verify request body
|
|
123
130
|
request_body = json.loads(request.body)
|
|
124
131
|
assert request_body["evaluator_name"] == "answer_relevance"
|
|
125
|
-
assert request_body["parameters"] == {
|
|
132
|
+
assert request_body["parameters"] == {
|
|
133
|
+
"credential": LLM_GATEWAY_CREDENTIAL,
|
|
134
|
+
"model": LLM_GATEWAY_MODEL,
|
|
135
|
+
}
|
|
126
136
|
assert request_body["inputs"]["prompt"] == "What is the capital of France?"
|
|
127
137
|
assert request_body["inputs"]["response"] == "I like pizza and Italian food."
|
|
128
138
|
|
|
129
139
|
|
|
130
140
|
@responses.activate
|
|
131
|
-
def test_answer_relevance_missing_reasoning() -> None:
|
|
141
|
+
def test_answer_relevance_missing_reasoning(evaluator: AnswerRelevance) -> None:
|
|
132
142
|
"""When API response has no reasoning
|
|
133
143
|
Then it should return score with None reasoning
|
|
134
144
|
And should handle missing fields gracefully."""
|
|
135
|
-
evaluator = AnswerRelevance()
|
|
136
145
|
|
|
137
146
|
# Mock the API response without reasoning
|
|
138
147
|
mock_response = {
|
|
@@ -173,17 +182,19 @@ def test_answer_relevance_missing_reasoning() -> None:
|
|
|
173
182
|
# Verify request body
|
|
174
183
|
request_body = json.loads(request.body)
|
|
175
184
|
assert request_body["evaluator_name"] == "answer_relevance"
|
|
176
|
-
assert request_body["parameters"] == {
|
|
185
|
+
assert request_body["parameters"] == {
|
|
186
|
+
"credential": LLM_GATEWAY_CREDENTIAL,
|
|
187
|
+
"model": LLM_GATEWAY_MODEL,
|
|
188
|
+
}
|
|
177
189
|
assert request_body["inputs"]["prompt"] == "What is the capital of France?"
|
|
178
190
|
assert request_body["inputs"]["response"] == "The capital of France is Paris."
|
|
179
191
|
|
|
180
192
|
|
|
181
193
|
@responses.activate
|
|
182
|
-
def test_answer_relevance_api_error_handling() -> None:
|
|
194
|
+
def test_answer_relevance_api_error_handling(evaluator: AnswerRelevance) -> None:
|
|
183
195
|
"""When API call raises an exception
|
|
184
196
|
Then it should propagate the exception
|
|
185
197
|
And should not return a score."""
|
|
186
|
-
evaluator = AnswerRelevance()
|
|
187
198
|
|
|
188
199
|
# Mock API error response
|
|
189
200
|
responses.post(
|
|
@@ -206,7 +217,10 @@ def test_answer_relevance_api_error_handling() -> None:
|
|
|
206
217
|
# Verify request body
|
|
207
218
|
request_body = json.loads(request.body)
|
|
208
219
|
assert request_body["evaluator_name"] == "answer_relevance"
|
|
209
|
-
assert request_body["parameters"] == {
|
|
220
|
+
assert request_body["parameters"] == {
|
|
221
|
+
"credential": LLM_GATEWAY_CREDENTIAL,
|
|
222
|
+
"model": LLM_GATEWAY_MODEL,
|
|
223
|
+
}
|
|
210
224
|
assert request_body["inputs"]["prompt"] == "What is the capital of France?"
|
|
211
225
|
assert request_body["inputs"]["response"] == "The capital of France is Paris."
|
|
212
226
|
|
|
@@ -224,11 +238,12 @@ def test_answer_relevance_api_error_handling() -> None:
|
|
|
224
238
|
("What is the capital of France?", " \t\n "),
|
|
225
239
|
],
|
|
226
240
|
)
|
|
227
|
-
def test_answer_relevance_validation_errors(
|
|
241
|
+
def test_answer_relevance_validation_errors(
|
|
242
|
+
evaluator: AnswerRelevance, prompt: str, response: str
|
|
243
|
+
) -> None:
|
|
228
244
|
"""When providing invalid prompt or response
|
|
229
245
|
Then it should raise appropriate ValueError
|
|
230
246
|
And should not make API call."""
|
|
231
|
-
evaluator = AnswerRelevance()
|
|
232
247
|
|
|
233
248
|
with pytest.raises(ValueError, match="prompt and response are required"):
|
|
234
249
|
evaluator.score(prompt=prompt, response=response)
|