fiddler-evals 0.1.1.dev14__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fiddler_evals-0.1.1.dev14/fiddler_evals.egg-info → fiddler_evals-0.2.0}/PKG-INFO +47 -24
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/PUBLIC.md +46 -22
- fiddler_evals-0.2.0/fiddler_evals/VERSION +1 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/connection.py +58 -95
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/application.py +82 -77
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/dataset.py +347 -331
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/experiment.py +291 -284
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/project.py +67 -60
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/__init__.py +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/answer_relevance.py +29 -21
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/base.py +25 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/coherence.py +15 -9
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/conciseness.py +6 -3
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/regex.py +4 -4
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_answer_relevance.py +64 -15
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_coherence.py +58 -76
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_conciseness.py +61 -15
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/exceptions.py +44 -62
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/evaluation.py +64 -62
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_runner.py +3 -5
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_evaluate.py +156 -34
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/tests/constants.py +3 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0/fiddler_evals.egg-info}/PKG-INFO +47 -24
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/SOURCES.txt +0 -2
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/requires.txt +0 -1
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/pyproject.toml +1 -2
- fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION +0 -1
- fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/tests/test_toxicity.py +0 -201
- fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/toxicity.py +0 -101
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/MANIFEST.in +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/README.md +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/configs.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/conftest.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/constants.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/decorators.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/base.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_application.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset_items.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_results.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_project.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/eval_fn.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_prompt_safety.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_response_faithfulness.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/sentiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_eval_fn.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_regex.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_sentiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_topic_classification.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/topic.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/http_client.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/semver.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_request_client.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/application.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/base.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/compact.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/dataset.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/error.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/evaluator.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/experiment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/filter_query.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/project.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/response.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/score.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/server_info.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/executor.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_connection.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_decorators.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/environment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/pd.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/test_environment.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/utils/tqdm.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals/version.py +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/dependency_links.txt +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/top_level.txt +0 -0
- {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fiddler-evals
|
|
3
|
-
Version: 0.1.1.dev14
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Python SDK for evaluating LLM Applications
|
|
5
5
|
Author-email: Fiddler AI <support@fiddler.ai>
|
|
6
6
|
Maintainer-email: Fiddler AI <support@fiddler.ai>
|
|
@@ -15,7 +15,6 @@ Requires-Dist: requests<3
|
|
|
15
15
|
Requires-Dist: pydantic>=2.0.0
|
|
16
16
|
Requires-Dist: tqdm
|
|
17
17
|
Requires-Dist: typing-extensions<5,>=4.6.0
|
|
18
|
-
Requires-Dist: pandas>=1.2.5
|
|
19
18
|
Requires-Dist: python-decouple
|
|
20
19
|
Provides-Extra: pandas
|
|
21
20
|
Requires-Dist: pandas>=1.2.5; extra == "pandas"
|
|
@@ -60,7 +59,7 @@ pip install --upgrade --pre fiddler-evals
|
|
|
60
59
|
from fiddler_evals import init
|
|
61
60
|
|
|
62
61
|
# Initialize connection
|
|
63
|
-
init(url='https://your-
|
|
62
|
+
init(url='https://your-instance.fiddler.ai', token='your-api-token')
|
|
64
63
|
```
|
|
65
64
|
|
|
66
65
|
### 2. Create Project Structure
|
|
@@ -101,19 +100,32 @@ dataset.insert(test_cases)
|
|
|
101
100
|
|
|
102
101
|
### 4. Use Built-in Evaluators
|
|
103
102
|
|
|
103
|
+
**Configure LLM Gateway provider:**
|
|
104
|
+
|
|
105
|
+
Add an LLM provider via UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
106
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
107
|
+
|
|
104
108
|
```python
|
|
105
109
|
from fiddler_evals.evaluators import (
|
|
106
110
|
AnswerRelevance, Coherence, Conciseness,
|
|
107
|
-
|
|
111
|
+
Sentiment, RegexSearch
|
|
108
112
|
)
|
|
109
113
|
|
|
110
|
-
# Test
|
|
111
|
-
relevance_evaluator = AnswerRelevance(
|
|
114
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
115
|
+
relevance_evaluator = AnswerRelevance(
|
|
116
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
117
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
118
|
+
)
|
|
112
119
|
score = relevance_evaluator.score(
|
|
113
120
|
prompt="What is the capital of France?",
|
|
114
121
|
response="Paris is the capital of France."
|
|
115
122
|
)
|
|
116
123
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
124
|
+
|
|
125
|
+
# Test other evaluators (no model parameter needed)
|
|
126
|
+
sentiment_evaluator = Sentiment()
|
|
127
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
128
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
117
129
|
```
|
|
118
130
|
|
|
119
131
|
### 5. Create Custom Evaluators
|
|
@@ -199,8 +211,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
199
211
|
|
|
200
212
|
# Use functions directly in evaluators list
|
|
201
213
|
evaluators = [
|
|
202
|
-
AnswerRelevance(),
|
|
203
|
-
Conciseness(),
|
|
214
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
215
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
204
216
|
word_count_evaluator, # Function evaluator
|
|
205
217
|
contains_number_evaluator, # Function evaluator
|
|
206
218
|
]
|
|
@@ -231,9 +243,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
231
243
|
|
|
232
244
|
# Set up evaluators with different configurations
|
|
233
245
|
evaluators = [
|
|
234
|
-
#
|
|
235
|
-
AnswerRelevance(
|
|
236
|
-
|
|
246
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
247
|
+
AnswerRelevance(
|
|
248
|
+
model="openai/gpt-4o",
|
|
249
|
+
credential="my-openai-cred",
|
|
250
|
+
score_name_prefix="primary"
|
|
251
|
+
),
|
|
252
|
+
Conciseness(
|
|
253
|
+
model="openai/gpt-4o",
|
|
254
|
+
credential="my-openai-cred",
|
|
255
|
+
score_name_prefix="primary"
|
|
256
|
+
),
|
|
257
|
+
|
|
258
|
+
# Other evaluators
|
|
237
259
|
Sentiment(score_name_prefix="primary"),
|
|
238
260
|
|
|
239
261
|
# Custom evaluators with specific mappings
|
|
@@ -245,13 +267,13 @@ evaluators = [
|
|
|
245
267
|
# Multiple instances of same evaluator for different fields
|
|
246
268
|
RegexSearch(
|
|
247
269
|
pattern=r"\d+",
|
|
248
|
-
score_name_prefix="
|
|
270
|
+
score_name_prefix="question",
|
|
249
271
|
score_name="has_number",
|
|
250
272
|
score_fn_kwargs_mapping={"output": "question"}
|
|
251
273
|
),
|
|
252
274
|
RegexSearch(
|
|
253
275
|
pattern=r"\d+",
|
|
254
|
-
score_name_prefix="
|
|
276
|
+
score_name_prefix="answer",
|
|
255
277
|
score_name="has_number",
|
|
256
278
|
score_fn_kwargs_mapping={"output": "answer"}
|
|
257
279
|
),
|
|
@@ -277,21 +299,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
|
|
|
277
299
|
|
|
278
300
|
# Results in organized score names:
|
|
279
301
|
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
280
|
-
# "quality_politeness", "
|
|
302
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
281
303
|
```
|
|
282
304
|
|
|
283
305
|
## Built-in Evaluators
|
|
284
306
|
|
|
285
|
-
| Evaluator | Purpose |
|
|
286
|
-
|
|
287
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
288
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
289
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
290
|
-
| `
|
|
291
|
-
| `
|
|
292
|
-
| `
|
|
293
|
-
| `
|
|
294
|
-
|
|
307
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
308
|
+
|-----------|---------|------------------------|------------------|
|
|
309
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
310
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
|
|
311
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
312
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
313
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
314
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
315
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
316
|
+
|
|
317
|
+
**Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
|
|
295
318
|
|
|
296
319
|
## Data Import Options
|
|
297
320
|
|
|
@@ -38,7 +38,7 @@ pip install --upgrade --pre fiddler-evals
|
|
|
38
38
|
from fiddler_evals import init
|
|
39
39
|
|
|
40
40
|
# Initialize connection
|
|
41
|
-
init(url='https://your-
|
|
41
|
+
init(url='https://your-instance.fiddler.ai', token='your-api-token')
|
|
42
42
|
```
|
|
43
43
|
|
|
44
44
|
### 2. Create Project Structure
|
|
@@ -79,19 +79,32 @@ dataset.insert(test_cases)
|
|
|
79
79
|
|
|
80
80
|
### 4. Use Built-in Evaluators
|
|
81
81
|
|
|
82
|
+
**Configure LLM Gateway provider:**
|
|
83
|
+
|
|
84
|
+
Add an LLM provider via UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
|
|
85
|
+
LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
|
|
86
|
+
|
|
82
87
|
```python
|
|
83
88
|
from fiddler_evals.evaluators import (
|
|
84
89
|
AnswerRelevance, Coherence, Conciseness,
|
|
85
|
-
|
|
90
|
+
Sentiment, RegexSearch
|
|
86
91
|
)
|
|
87
92
|
|
|
88
|
-
# Test
|
|
89
|
-
relevance_evaluator = AnswerRelevance(
|
|
93
|
+
# Test LLM-as-a-Judge evaluators (require model parameter)
|
|
94
|
+
relevance_evaluator = AnswerRelevance(
|
|
95
|
+
model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
|
|
96
|
+
credential="my-openai-cred" # Optional: LLM Gateway credential name
|
|
97
|
+
)
|
|
90
98
|
score = relevance_evaluator.score(
|
|
91
99
|
prompt="What is the capital of France?",
|
|
92
100
|
response="Paris is the capital of France."
|
|
93
101
|
)
|
|
94
102
|
print(f"Score: {score.value} - {score.reasoning}")
|
|
103
|
+
|
|
104
|
+
# Test other evaluators (no model parameter needed)
|
|
105
|
+
sentiment_evaluator = Sentiment()
|
|
106
|
+
scores = sentiment_evaluator.score(text="This is a helpful response.")
|
|
107
|
+
print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
|
|
95
108
|
```
|
|
96
109
|
|
|
97
110
|
### 5. Create Custom Evaluators
|
|
@@ -177,8 +190,8 @@ def contains_number_evaluator(output: str) -> float:
|
|
|
177
190
|
|
|
178
191
|
# Use functions directly in evaluators list
|
|
179
192
|
evaluators = [
|
|
180
|
-
AnswerRelevance(),
|
|
181
|
-
Conciseness(),
|
|
193
|
+
AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
194
|
+
Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
|
|
182
195
|
word_count_evaluator, # Function evaluator
|
|
183
196
|
contains_number_evaluator, # Function evaluator
|
|
184
197
|
]
|
|
@@ -209,9 +222,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
209
222
|
|
|
210
223
|
# Set up evaluators with different configurations
|
|
211
224
|
evaluators = [
|
|
212
|
-
#
|
|
213
|
-
AnswerRelevance(
|
|
214
|
-
|
|
225
|
+
# LLM-as-a-Judge evaluators (require model parameter)
|
|
226
|
+
AnswerRelevance(
|
|
227
|
+
model="openai/gpt-4o",
|
|
228
|
+
credential="my-openai-cred",
|
|
229
|
+
score_name_prefix="primary"
|
|
230
|
+
),
|
|
231
|
+
Conciseness(
|
|
232
|
+
model="openai/gpt-4o",
|
|
233
|
+
credential="my-openai-cred",
|
|
234
|
+
score_name_prefix="primary"
|
|
235
|
+
),
|
|
236
|
+
|
|
237
|
+
# Other evaluators
|
|
215
238
|
Sentiment(score_name_prefix="primary"),
|
|
216
239
|
|
|
217
240
|
# Custom evaluators with specific mappings
|
|
@@ -223,13 +246,13 @@ evaluators = [
|
|
|
223
246
|
# Multiple instances of same evaluator for different fields
|
|
224
247
|
RegexSearch(
|
|
225
248
|
pattern=r"\d+",
|
|
226
|
-
score_name_prefix="
|
|
249
|
+
score_name_prefix="question",
|
|
227
250
|
score_name="has_number",
|
|
228
251
|
score_fn_kwargs_mapping={"output": "question"}
|
|
229
252
|
),
|
|
230
253
|
RegexSearch(
|
|
231
254
|
pattern=r"\d+",
|
|
232
|
-
score_name_prefix="
|
|
255
|
+
score_name_prefix="answer",
|
|
233
256
|
score_name="has_number",
|
|
234
257
|
score_fn_kwargs_mapping={"output": "answer"}
|
|
235
258
|
),
|
|
@@ -255,21 +278,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
|
|
|
255
278
|
|
|
256
279
|
# Results in organized score names:
|
|
257
280
|
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
258
|
-
# "quality_politeness", "
|
|
281
|
+
# "quality_politeness", "question_has_number", "answer_has_number"
|
|
259
282
|
```
|
|
260
283
|
|
|
261
284
|
## Built-in Evaluators
|
|
262
285
|
|
|
263
|
-
| Evaluator | Purpose |
|
|
264
|
-
|
|
265
|
-
| `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
|
|
266
|
-
| `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
|
|
267
|
-
| `Conciseness` | Measures response brevity and clarity | `response` |
|
|
268
|
-
| `
|
|
269
|
-
| `
|
|
270
|
-
| `
|
|
271
|
-
| `
|
|
272
|
-
|
|
286
|
+
| Evaluator | Purpose | Constructor Parameters | Score Parameters |
|
|
287
|
+
|-----------|---------|------------------------|------------------|
|
|
288
|
+
| `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
|
|
289
|
+
| `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
|
|
290
|
+
| `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
|
|
291
|
+
| `Sentiment` | Analyzes emotional tone | - | `text` |
|
|
292
|
+
| `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
|
|
293
|
+
| `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
|
|
294
|
+
| `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
|
|
295
|
+
|
|
296
|
+
**Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
|
|
273
297
|
|
|
274
298
|
## Data Import Options
|
|
275
299
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.2.0
|
|
@@ -20,7 +20,6 @@ from fiddler_evals.evaluators import (
|
|
|
20
20
|
RegexSearch,
|
|
21
21
|
Sentiment,
|
|
22
22
|
TopicClassification,
|
|
23
|
-
Toxicity,
|
|
24
23
|
)
|
|
25
24
|
from fiddler_evals.evaluators.base import Evaluator
|
|
26
25
|
from fiddler_evals.evaluators.eval_fn import EvalFn
|
|
@@ -55,7 +54,6 @@ __all__ = [
|
|
|
55
54
|
"AnswerRelevance",
|
|
56
55
|
"Coherence",
|
|
57
56
|
"Conciseness",
|
|
58
|
-
"Toxicity",
|
|
59
57
|
"Sentiment",
|
|
60
58
|
"RegexSearch",
|
|
61
59
|
"RegexMatch",
|
|
@@ -29,63 +29,30 @@ class Connection:
|
|
|
29
29
|
managing connection parameters, authentication tokens, and ensuring proper
|
|
30
30
|
communication protocols are established.
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
Examples
|
|
58
|
-
--------
|
|
59
|
-
Creating a basic connection:
|
|
60
|
-
|
|
61
|
-
.. code-block:: python
|
|
62
|
-
|
|
63
|
-
connection = Connection(
|
|
64
|
-
url="https://your-fiddler-instance.com",
|
|
65
|
-
token="your-auth-token"
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
Creating a connection with custom timeout and proxy:
|
|
69
|
-
|
|
70
|
-
.. code-block:: python
|
|
71
|
-
|
|
72
|
-
connection = Connection(
|
|
73
|
-
url="https://your-fiddler-instance.com",
|
|
74
|
-
token="your-auth-token",
|
|
75
|
-
timeout=(5.0, 30.0), # (connect_timeout, read_timeout)
|
|
76
|
-
proxies={"https": "https://proxy.company.com:8080"}
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
Creating a connection without SSL verification:
|
|
80
|
-
|
|
81
|
-
.. code-block:: python
|
|
82
|
-
|
|
83
|
-
connection = Connection(
|
|
84
|
-
url="https://your-fiddler-instance.com",
|
|
85
|
-
token="your-auth-token",
|
|
86
|
-
verify=False, # Not recommended for production
|
|
87
|
-
validate=False # Skip version compatibility check
|
|
88
|
-
)
|
|
32
|
+
Example:
|
|
33
|
+
.. code-block:: python
|
|
34
|
+
|
|
35
|
+
# Creating a basic connection
|
|
36
|
+
connection = Connection(
|
|
37
|
+
url="https://your-instance.fiddler.ai",
|
|
38
|
+
token="your-auth-token"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Creating a connection with custom timeout and proxy
|
|
42
|
+
connection = Connection(
|
|
43
|
+
url="https://your-instance.fiddler.ai",
|
|
44
|
+
token="your-auth-token",
|
|
45
|
+
timeout=(5.0, 30.0), # (connect_timeout, read_timeout)
|
|
46
|
+
proxies={"https": "https://proxy.company.com:8080"}
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Creating a connection without SSL verification
|
|
50
|
+
connection = Connection(
|
|
51
|
+
url="https://your-instance.fiddler.ai",
|
|
52
|
+
token="your-auth-token",
|
|
53
|
+
verify=False, # Not recommended for production
|
|
54
|
+
validate=False # Skip version compatibility check
|
|
55
|
+
)
|
|
89
56
|
"""
|
|
90
57
|
|
|
91
58
|
def __init__( # pylint: disable=too-many-arguments
|
|
@@ -99,27 +66,17 @@ class Connection:
|
|
|
99
66
|
) -> None:
|
|
100
67
|
"""Initialize a connection to the Fiddler platform.
|
|
101
68
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
Whether to verify server's TLS certificate
|
|
114
|
-
validate : bool, default True
|
|
115
|
-
Whether to validate server/client version compatibility
|
|
116
|
-
|
|
117
|
-
Raises
|
|
118
|
-
------
|
|
119
|
-
ValueError
|
|
120
|
-
If url or token parameters are empty
|
|
121
|
-
IncompatibleClient
|
|
122
|
-
If server version is incompatible with client version
|
|
69
|
+
Args:
|
|
70
|
+
url: The base URL to your Fiddler platform instance
|
|
71
|
+
token: Authentication token obtained from the Fiddler UI
|
|
72
|
+
proxies: Dictionary mapping protocol to proxy URL for HTTP requests
|
|
73
|
+
timeout: HTTP request timeout settings (float or tuple of connect/read timeouts)
|
|
74
|
+
verify: Whether to verify server's TLS certificate (default: True)
|
|
75
|
+
validate: Whether to validate server/client version compatibility (default: True)
|
|
76
|
+
|
|
77
|
+
Raises:
|
|
78
|
+
ValueError: If url or token parameters are empty
|
|
79
|
+
IncompatibleClient: If server version is incompatible with client version
|
|
123
80
|
"""
|
|
124
81
|
|
|
125
82
|
self.url = url
|
|
@@ -363,30 +320,36 @@ def init( # pylint: disable=too-many-arguments
|
|
|
363
320
|
Examples:
|
|
364
321
|
Basic initialization:
|
|
365
322
|
|
|
366
|
-
|
|
323
|
+
.. code-block:: python
|
|
367
324
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
325
|
+
import fiddler_evals as fdl
|
|
326
|
+
|
|
327
|
+
fdl.init(
|
|
328
|
+
url="https://your-instance.fiddler.ai",
|
|
329
|
+
token="your-auth-token"
|
|
330
|
+
)
|
|
372
331
|
|
|
373
332
|
Initialization with custom timeout and proxy:
|
|
374
333
|
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
334
|
+
.. code-block:: python
|
|
335
|
+
|
|
336
|
+
fdl.init(
|
|
337
|
+
url="https://your-instance.fiddler.ai",
|
|
338
|
+
token="your-auth-token",
|
|
339
|
+
timeout=(10.0, 60.0), # 10s connect, 60s read timeout
|
|
340
|
+
proxies={"https": "https://proxy.company.com:8080"}
|
|
341
|
+
)
|
|
381
342
|
|
|
382
343
|
Initialization for development with relaxed settings:
|
|
383
344
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
345
|
+
.. code-block:: python
|
|
346
|
+
|
|
347
|
+
fdl.init(
|
|
348
|
+
url="https://your-instance.fiddler.ai",
|
|
349
|
+
token="dev-token",
|
|
350
|
+
verify=False, # Skip SSL verification
|
|
351
|
+
validate=False, # Skip version compatibility check
|
|
352
|
+
)
|
|
390
353
|
|
|
391
354
|
|
|
392
355
|
|