fiddler-evals 0.1.1.dev14__tar.gz → 0.2.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. {fiddler_evals-0.1.1.dev14/fiddler_evals.egg-info → fiddler_evals-0.2.0rc1}/PKG-INFO +46 -23
  2. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/PUBLIC.md +45 -21
  3. fiddler_evals-0.2.0rc1/fiddler_evals/VERSION +1 -0
  4. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/__init__.py +0 -2
  5. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/__init__.py +0 -2
  6. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/answer_relevance.py +10 -4
  7. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/base.py +23 -0
  8. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/coherence.py +15 -9
  9. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/conciseness.py +6 -3
  10. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_answer_relevance.py +30 -15
  11. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_coherence.py +58 -76
  12. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_conciseness.py +61 -15
  13. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/experiment_runner.py +3 -5
  14. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/test_evaluate.py +156 -34
  15. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/constants.py +3 -0
  16. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1/fiddler_evals.egg-info}/PKG-INFO +46 -23
  17. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/SOURCES.txt +0 -2
  18. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/requires.txt +0 -1
  19. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/pyproject.toml +1 -2
  20. fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION +0 -1
  21. fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/tests/test_toxicity.py +0 -201
  22. fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/toxicity.py +0 -101
  23. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/MANIFEST.in +0 -0
  24. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/README.md +0 -0
  25. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/configs.py +0 -0
  26. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/conftest.py +0 -0
  27. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/connection.py +0 -0
  28. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/constants.py +0 -0
  29. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/decorators.py +0 -0
  30. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/__init__.py +0 -0
  31. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/application.py +0 -0
  32. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/base.py +0 -0
  33. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/dataset.py +0 -0
  34. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/experiment.py +0 -0
  35. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/project.py +0 -0
  36. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/__init__.py +0 -0
  37. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_application.py +0 -0
  38. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_dataset.py +0 -0
  39. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_dataset_items.py +0 -0
  40. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment.py +0 -0
  41. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
  42. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_experiment_results.py +0 -0
  43. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/entities/tests/test_project.py +0 -0
  44. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/eval_fn.py +0 -0
  45. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/ftl_prompt_safety.py +0 -0
  46. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/ftl_response_faithfulness.py +0 -0
  47. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/regex.py +0 -0
  48. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/sentiment.py +0 -0
  49. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/__init__.py +0 -0
  50. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_eval_fn.py +0 -0
  51. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +0 -0
  52. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +0 -0
  53. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_regex.py +0 -0
  54. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_sentiment.py +0 -0
  55. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/tests/test_topic_classification.py +0 -0
  56. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/evaluators/topic.py +0 -0
  57. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/exceptions.py +0 -0
  58. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/__init__.py +0 -0
  59. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/http_client.py +0 -0
  60. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/json_encoder.py +0 -0
  61. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/semver.py +0 -0
  62. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/__init__.py +0 -0
  63. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
  64. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/libs/tests/test_request_client.py +0 -0
  65. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/__init__.py +0 -0
  66. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/application.py +0 -0
  67. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/base.py +0 -0
  68. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/compact.py +0 -0
  69. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/dataset.py +0 -0
  70. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/error.py +0 -0
  71. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/evaluator.py +0 -0
  72. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/experiment.py +0 -0
  73. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/filter_query.py +0 -0
  74. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/project.py +0 -0
  75. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/response.py +0 -0
  76. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/score.py +0 -0
  77. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/pydantic_models/server_info.py +0 -0
  78. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/__init__.py +0 -0
  79. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/evaluation.py +0 -0
  80. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/executor.py +0 -0
  81. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
  82. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/__init__.py +0 -0
  83. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +0 -0
  84. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/__init__.py +0 -0
  85. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/test_connection.py +0 -0
  86. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/tests/test_decorators.py +0 -0
  87. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/__init__.py +0 -0
  88. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/environment.py +0 -0
  89. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/pd.py +0 -0
  90. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tests/__init__.py +0 -0
  91. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tests/test_environment.py +0 -0
  92. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/utils/tqdm.py +0 -0
  93. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals/version.py +0 -0
  94. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/dependency_links.txt +0 -0
  95. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/fiddler_evals.egg-info/top_level.txt +0 -0
  96. {fiddler_evals-0.1.1.dev14 → fiddler_evals-0.2.0rc1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fiddler-evals
3
- Version: 0.1.1.dev14
3
+ Version: 0.2.0rc1
4
4
  Summary: Python SDK for evaluating LLM Applications
5
5
  Author-email: Fiddler AI <support@fiddler.ai>
6
6
  Maintainer-email: Fiddler AI <support@fiddler.ai>
@@ -15,7 +15,6 @@ Requires-Dist: requests<3
15
15
  Requires-Dist: pydantic>=2.0.0
16
16
  Requires-Dist: tqdm
17
17
  Requires-Dist: typing-extensions<5,>=4.6.0
18
- Requires-Dist: pandas>=1.2.5
19
18
  Requires-Dist: python-decouple
20
19
  Provides-Extra: pandas
21
20
  Requires-Dist: pandas>=1.2.5; extra == "pandas"
@@ -101,19 +100,32 @@ dataset.insert(test_cases)
101
100
 
102
101
  ### 4. Use Built-in Evaluators
103
102
 
103
+ **Configure LLM Gateway provider:**
104
+
105
+ Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
106
+ LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and accept an optional `credential` parameter for LLM Gateway authentication.
107
+
104
108
  ```python
105
109
  from fiddler_evals.evaluators import (
106
110
  AnswerRelevance, Coherence, Conciseness,
107
- Toxicity, Sentiment, RegexSearch
111
+ Sentiment, RegexSearch
108
112
  )
109
113
 
110
- # Test individual evaluators
111
- relevance_evaluator = AnswerRelevance()
114
+ # Test LLM-as-a-Judge evaluators (require model parameter)
115
+ relevance_evaluator = AnswerRelevance(
116
+ model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
117
+ credential="my-openai-cred" # Optional: LLM Gateway credential name
118
+ )
112
119
  score = relevance_evaluator.score(
113
120
  prompt="What is the capital of France?",
114
121
  response="Paris is the capital of France."
115
122
  )
116
123
  print(f"Score: {score.value} - {score.reasoning}")
124
+
125
+ # Test other evaluators (no model parameter needed)
126
+ sentiment_evaluator = Sentiment()
127
+ scores = sentiment_evaluator.score(text="This is a helpful response.")
128
+ print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
117
129
  ```
118
130
 
119
131
  ### 5. Create Custom Evaluators
@@ -199,8 +211,8 @@ def contains_number_evaluator(output: str) -> float:
199
211
 
200
212
  # Use functions directly in evaluators list
201
213
  evaluators = [
202
- AnswerRelevance(),
203
- Conciseness(),
214
+ AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
215
+ Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
204
216
  word_count_evaluator, # Function evaluator
205
217
  contains_number_evaluator, # Function evaluator
206
218
  ]
@@ -231,9 +243,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
231
243
 
232
244
  # Set up evaluators with different configurations
233
245
  evaluators = [
234
- # Primary evaluation metrics
235
- AnswerRelevance(score_name_prefix="primary"),
236
- Conciseness(score_name_prefix="primary"),
246
+ # LLM-as-a-Judge evaluators (require model parameter)
247
+ AnswerRelevance(
248
+ model="openai/gpt-4o",
249
+ credential="my-openai-cred",
250
+ score_name_prefix="primary"
251
+ ),
252
+ Conciseness(
253
+ model="openai/gpt-4o",
254
+ credential="my-openai-cred",
255
+ score_name_prefix="primary"
256
+ ),
257
+
258
+ # Other evaluators
237
259
  Sentiment(score_name_prefix="primary"),
238
260
 
239
261
  # Custom evaluators with specific mappings
@@ -245,13 +267,13 @@ evaluators = [
245
267
  # Multiple instances of same evaluator for different fields
246
268
  RegexSearch(
247
269
  pattern=r"\d+",
248
- score_name_prefix="validation",
270
+ score_name_prefix="question",
249
271
  score_name="has_number",
250
272
  score_fn_kwargs_mapping={"output": "question"}
251
273
  ),
252
274
  RegexSearch(
253
275
  pattern=r"\d+",
254
- score_name_prefix="validation",
276
+ score_name_prefix="answer",
255
277
  score_name="has_number",
256
278
  score_fn_kwargs_mapping={"output": "answer"}
257
279
  ),
@@ -277,21 +299,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
277
299
 
278
300
  # Results in organized score names:
279
301
  # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
280
- # "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
302
+ # "quality_politeness", "question_has_number", "answer_has_number"
281
303
  ```
282
304
 
283
305
  ## Built-in Evaluators
284
306
 
285
- | Evaluator | Purpose | Key Parameters |
286
- |-----------|---------|----------------|
287
- | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
288
- | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
289
- | `Conciseness` | Measures response brevity and clarity | `response` |
290
- | `Toxicity` | Detects harmful or toxic content | `text` |
291
- | `Sentiment` | Analyzes emotional tone | `text` |
292
- | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
293
- | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
294
- | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
307
+ | Evaluator | Purpose | Constructor Parameters | Score Parameters |
308
+ |-----------|---------|------------------------|------------------|
309
+ | `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
310
+ | `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `prompt`, `response` |
311
+ | `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
312
+ | `Sentiment` | Analyzes emotional tone | - | `text` |
313
+ | `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
314
+ | `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
315
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
316
+
317
+ **Note:** Evaluators that take `model` and `credential` constructor parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter must be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The optional `credential` parameter is the name of the LLM Gateway credential used for authentication.
295
318
 
296
319
  ## Data Import Options
297
320
 
@@ -79,19 +79,32 @@ dataset.insert(test_cases)
79
79
 
80
80
  ### 4. Use Built-in Evaluators
81
81
 
82
+ **Configure LLM Gateway provider:**
83
+
84
+ Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
85
+ LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and accept an optional `credential` parameter for LLM Gateway authentication.
86
+
82
87
  ```python
83
88
  from fiddler_evals.evaluators import (
84
89
  AnswerRelevance, Coherence, Conciseness,
85
- Toxicity, Sentiment, RegexSearch
90
+ Sentiment, RegexSearch
86
91
  )
87
92
 
88
- # Test individual evaluators
89
- relevance_evaluator = AnswerRelevance()
93
+ # Test LLM-as-a-Judge evaluators (require model parameter)
94
+ relevance_evaluator = AnswerRelevance(
95
+ model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
96
+ credential="my-openai-cred" # Optional: LLM Gateway credential name
97
+ )
90
98
  score = relevance_evaluator.score(
91
99
  prompt="What is the capital of France?",
92
100
  response="Paris is the capital of France."
93
101
  )
94
102
  print(f"Score: {score.value} - {score.reasoning}")
103
+
104
+ # Test other evaluators (no model parameter needed)
105
+ sentiment_evaluator = Sentiment()
106
+ scores = sentiment_evaluator.score(text="This is a helpful response.")
107
+ print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
95
108
  ```
96
109
 
97
110
  ### 5. Create Custom Evaluators
@@ -177,8 +190,8 @@ def contains_number_evaluator(output: str) -> float:
177
190
 
178
191
  # Use functions directly in evaluators list
179
192
  evaluators = [
180
- AnswerRelevance(),
181
- Conciseness(),
193
+ AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
194
+ Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
182
195
  word_count_evaluator, # Function evaluator
183
196
  contains_number_evaluator, # Function evaluator
184
197
  ]
@@ -209,9 +222,19 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
209
222
 
210
223
  # Set up evaluators with different configurations
211
224
  evaluators = [
212
- # Primary evaluation metrics
213
- AnswerRelevance(score_name_prefix="primary"),
214
- Conciseness(score_name_prefix="primary"),
225
+ # LLM-as-a-Judge evaluators (require model parameter)
226
+ AnswerRelevance(
227
+ model="openai/gpt-4o",
228
+ credential="my-openai-cred",
229
+ score_name_prefix="primary"
230
+ ),
231
+ Conciseness(
232
+ model="openai/gpt-4o",
233
+ credential="my-openai-cred",
234
+ score_name_prefix="primary"
235
+ ),
236
+
237
+ # Other evaluators
215
238
  Sentiment(score_name_prefix="primary"),
216
239
 
217
240
  # Custom evaluators with specific mappings
@@ -223,13 +246,13 @@ evaluators = [
223
246
  # Multiple instances of same evaluator for different fields
224
247
  RegexSearch(
225
248
  pattern=r"\d+",
226
- score_name_prefix="validation",
249
+ score_name_prefix="question",
227
250
  score_name="has_number",
228
251
  score_fn_kwargs_mapping={"output": "question"}
229
252
  ),
230
253
  RegexSearch(
231
254
  pattern=r"\d+",
232
- score_name_prefix="validation",
255
+ score_name_prefix="answer",
233
256
  score_name="has_number",
234
257
  score_fn_kwargs_mapping={"output": "answer"}
235
258
  ),
@@ -255,21 +278,22 @@ print(f"Generated {sum(len(result.scores) for result in experiment_result.result
255
278
 
256
279
  # Results in organized score names:
257
280
  # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
258
- # "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
281
+ # "quality_politeness", "question_has_number", "answer_has_number"
259
282
  ```
260
283
 
261
284
  ## Built-in Evaluators
262
285
 
263
- | Evaluator | Purpose | Key Parameters |
264
- |-----------|---------|----------------|
265
- | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
266
- | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
267
- | `Conciseness` | Measures response brevity and clarity | `response` |
268
- | `Toxicity` | Detects harmful or toxic content | `text` |
269
- | `Sentiment` | Analyzes emotional tone | `text` |
270
- | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
271
- | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
272
- | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
286
+ | Evaluator | Purpose | Constructor Parameters | Score Parameters |
287
+ |-----------|---------|------------------------|------------------|
288
+ | `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
289
+ | `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `prompt`, `response` |
290
+ | `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
291
+ | `Sentiment` | Analyzes emotional tone | - | `text` |
292
+ | `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
293
+ | `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
294
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
295
+
296
+ **Note:** Evaluators that take `model` and `credential` constructor parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter must be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The optional `credential` parameter is the name of the LLM Gateway credential used for authentication.
273
297
 
274
298
  ## Data Import Options
275
299
 
@@ -0,0 +1 @@
1
+ 0.2.0rc1
@@ -20,7 +20,6 @@ from fiddler_evals.evaluators import (
20
20
  RegexSearch,
21
21
  Sentiment,
22
22
  TopicClassification,
23
- Toxicity,
24
23
  )
25
24
  from fiddler_evals.evaluators.base import Evaluator
26
25
  from fiddler_evals.evaluators.eval_fn import EvalFn
@@ -55,7 +54,6 @@ __all__ = [
55
54
  "AnswerRelevance",
56
55
  "Coherence",
57
56
  "Conciseness",
58
- "Toxicity",
59
57
  "Sentiment",
60
58
  "RegexSearch",
61
59
  "RegexMatch",
@@ -7,7 +7,6 @@ from fiddler_evals.evaluators.ftl_response_faithfulness import FTLResponseFaithf
7
7
  from fiddler_evals.evaluators.regex import RegexMatch, RegexSearch
8
8
  from fiddler_evals.evaluators.sentiment import Sentiment
9
9
  from fiddler_evals.evaluators.topic import TopicClassification
10
- from fiddler_evals.evaluators.toxicity import Toxicity
11
10
 
12
11
  __all__ = [
13
12
  "RegexSearch",
@@ -17,7 +16,6 @@ __all__ = [
17
16
  "Conciseness",
18
17
  "FTLPromptSafety",
19
18
  "FTLResponseFaithfulness",
20
- "Toxicity",
21
19
  "Sentiment",
22
20
  "TopicClassification",
23
21
  "EvalFn",
@@ -1,8 +1,8 @@
1
- from fiddler_evals.evaluators.base import FiddlerEvaluator
1
+ from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
2
2
  from fiddler_evals.pydantic_models.score import Score
3
3
 
4
4
 
5
- class AnswerRelevance(FiddlerEvaluator):
5
+ class AnswerRelevance(FiddlerLLMAAJEvaluator):
6
6
  """Evaluator to assess how well an answer addresses a given question.
7
7
 
8
8
  The AnswerRelevance evaluator measures whether an LLM's answer is relevant
@@ -85,8 +85,14 @@ class AnswerRelevance(FiddlerEvaluator):
85
85
 
86
86
  payload = {
87
87
  "evaluator_name": self.name,
88
- "parameters": {},
89
- "inputs": {"prompt": prompt, "response": response},
88
+ "parameters": {
89
+ "model": self.model,
90
+ "credential": self.credential,
91
+ },
92
+ "inputs": {
93
+ "prompt": prompt,
94
+ "response": response,
95
+ },
90
96
  }
91
97
 
92
98
  return self._parse_scores(data=self.make_call(payload))[0]
@@ -243,3 +243,26 @@ class FiddlerEvaluator(Evaluator, ABC):
243
243
  scores.append(score)
244
244
 
245
245
  return scores
246
+
247
+
248
+ class FiddlerLLMAAJEvaluator(FiddlerEvaluator, ABC):
249
+ """Base class for LLMAAJ evaluators that use Fiddler's evaluator API."""
250
+
251
+ def __init__(
252
+ self, model: str, credential: str | None = None, **kwargs: Any
253
+ ) -> None:
254
+ """Initialize the LLMAAJ evaluator with model and credential.
255
+
256
+ Args:
257
+ model (str): LLM Gateway model name in `{provider}/{model}` format.
258
+ E.g., `openai/gpt-4o`
259
+ credential (str | None): Optional name of the LLM Gateway credential for the above provider.
260
+ **kwargs: Additional keyword arguments
261
+ """
262
+ super().__init__(**kwargs)
263
+
264
+ if not model:
265
+ raise ValueError("model is required for LLMAAJ based evaluators")
266
+
267
+ self.model = model
268
+ self.credential = credential
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
- from fiddler_evals.evaluators.base import FiddlerEvaluator
3
+ from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
4
4
  from fiddler_evals.pydantic_models.score import Score
5
5
 
6
6
 
7
- class Coherence(FiddlerEvaluator):
7
+ class Coherence(FiddlerLLMAAJEvaluator):
8
8
  """Evaluator to assess the coherence and logical flow of a response.
9
9
 
10
10
  The Coherence evaluator measures whether a response is well-structured, logically
@@ -58,7 +58,8 @@ class Coherence(FiddlerEvaluator):
58
58
 
59
59
  # Incoherent response
60
60
  incoherent_score = evaluator.score(
61
- response="The sky is blue. I like pizza. Quantum physics is complex. Let's go shopping."
61
+ prompt="Explain the process of making coffee",
62
+ response="The sky is blue. I like pizza. Quantum physics is complex. Let's go shopping.",
62
63
  )
63
64
  print(f"Coherence: {incoherent_score.value}") # 0.0
64
65
 
@@ -83,28 +84,33 @@ class Coherence(FiddlerEvaluator):
83
84
 
84
85
  name = "coherence"
85
86
 
86
- def score(self, response: str, prompt: str | None = None) -> Score: # pylint: disable=arguments-differ
87
+ def score(self, prompt: str, response: str) -> Score: # pylint: disable=arguments-differ
87
88
  """Score the coherence of a response.
88
89
 
89
90
  Args:
91
+ prompt (str): The original prompt that generated the response.
90
92
  response (str): The response to evaluate for coherence.
91
- prompt (str, optional): The original prompt that generated the response.
92
93
 
93
94
  Returns:
94
95
  Score: A Score object for coherence assessment.
95
96
  """
96
- response = response.strip() if response else ""
97
97
  prompt = prompt.strip() if prompt else ""
98
+ response = response.strip() if response else ""
98
99
 
99
- if not response:
100
- raise ValueError("response is required for coherence evaluation")
100
+ if not prompt or not response:
101
+ raise ValueError(
102
+ "prompt and response are required for coherence evaluation"
103
+ )
101
104
 
102
105
  # Build inputs dictionary
103
106
  inputs = {"response": response, "prompt": prompt}
104
107
 
105
108
  payload = {
106
109
  "evaluator_name": self.name,
107
- "parameters": {},
110
+ "parameters": {
111
+ "model": self.model,
112
+ "credential": self.credential,
113
+ },
108
114
  "inputs": inputs,
109
115
  }
110
116
 
@@ -1,8 +1,8 @@
1
- from fiddler_evals.evaluators.base import FiddlerEvaluator
1
+ from fiddler_evals.evaluators.base import FiddlerLLMAAJEvaluator
2
2
  from fiddler_evals.pydantic_models.score import Score
3
3
 
4
4
 
5
- class Conciseness(FiddlerEvaluator):
5
+ class Conciseness(FiddlerLLMAAJEvaluator):
6
6
  """Evaluator to assess how concise and to-the-point an answer is.
7
7
 
8
8
  The Conciseness evaluator measures whether an LLM's answer is appropriately
@@ -77,7 +77,10 @@ class Conciseness(FiddlerEvaluator):
77
77
 
78
78
  payload = {
79
79
  "evaluator_name": self.name,
80
- "parameters": {},
80
+ "parameters": {
81
+ "model": self.model,
82
+ "credential": self.credential,
83
+ },
81
84
  "inputs": {"response": response},
82
85
  }
83
86
 
@@ -6,15 +6,20 @@ import responses
6
6
  from fiddler_evals.constants import CONTENT_TYPE_HEADER_KEY, JSON_CONTENT_TYPE
7
7
  from fiddler_evals.evaluators.answer_relevance import AnswerRelevance
8
8
  from fiddler_evals.pydantic_models.score import Score, ScoreStatus
9
- from fiddler_evals.tests.constants import URL
9
+ from fiddler_evals.tests.constants import LLM_GATEWAY_CREDENTIAL, LLM_GATEWAY_MODEL, URL
10
+
11
+
12
+ @pytest.fixture()
13
+ def evaluator() -> AnswerRelevance:
14
+ """Create an AnswerRelevance evaluator."""
15
+ return AnswerRelevance(model=LLM_GATEWAY_MODEL, credential=LLM_GATEWAY_CREDENTIAL)
10
16
 
11
17
 
12
18
  @responses.activate
13
- def test_answer_relevance_relevant_answer() -> None:
19
+ def test_answer_relevance_relevant_answer(evaluator: AnswerRelevance) -> None:
14
20
  """When evaluating a relevant answer
15
21
  Then it should return score 1.0
16
22
  And should include proper reasoning."""
17
- evaluator = AnswerRelevance()
18
23
 
19
24
  # Mock the API response
20
25
  mock_response = {
@@ -63,17 +68,19 @@ def test_answer_relevance_relevant_answer() -> None:
63
68
  # Verify request body
64
69
  request_body = json.loads(request.body)
65
70
  assert request_body["evaluator_name"] == "answer_relevance"
66
- assert request_body["parameters"] == {}
71
+ assert request_body["parameters"] == {
72
+ "credential": LLM_GATEWAY_CREDENTIAL,
73
+ "model": LLM_GATEWAY_MODEL,
74
+ }
67
75
  assert request_body["inputs"]["prompt"] == "What is the capital of France?"
68
76
  assert request_body["inputs"]["response"] == "The capital of France is Paris."
69
77
 
70
78
 
71
79
  @responses.activate
72
- def test_answer_relevance_irrelevant_answer() -> None:
80
+ def test_answer_relevance_irrelevant_answer(evaluator: AnswerRelevance) -> None:
73
81
  """When evaluating an irrelevant answer
74
82
  Then it should return score 0.0
75
83
  And should include proper reasoning."""
76
- evaluator = AnswerRelevance()
77
84
 
78
85
  # Mock the API response
79
86
  mock_response = {
@@ -122,17 +129,19 @@ def test_answer_relevance_irrelevant_answer() -> None:
122
129
  # Verify request body
123
130
  request_body = json.loads(request.body)
124
131
  assert request_body["evaluator_name"] == "answer_relevance"
125
- assert request_body["parameters"] == {}
132
+ assert request_body["parameters"] == {
133
+ "credential": LLM_GATEWAY_CREDENTIAL,
134
+ "model": LLM_GATEWAY_MODEL,
135
+ }
126
136
  assert request_body["inputs"]["prompt"] == "What is the capital of France?"
127
137
  assert request_body["inputs"]["response"] == "I like pizza and Italian food."
128
138
 
129
139
 
130
140
  @responses.activate
131
- def test_answer_relevance_missing_reasoning() -> None:
141
+ def test_answer_relevance_missing_reasoning(evaluator: AnswerRelevance) -> None:
132
142
  """When API response has no reasoning
133
143
  Then it should return score with None reasoning
134
144
  And should handle missing fields gracefully."""
135
- evaluator = AnswerRelevance()
136
145
 
137
146
  # Mock the API response without reasoning
138
147
  mock_response = {
@@ -173,17 +182,19 @@ def test_answer_relevance_missing_reasoning() -> None:
173
182
  # Verify request body
174
183
  request_body = json.loads(request.body)
175
184
  assert request_body["evaluator_name"] == "answer_relevance"
176
- assert request_body["parameters"] == {}
185
+ assert request_body["parameters"] == {
186
+ "credential": LLM_GATEWAY_CREDENTIAL,
187
+ "model": LLM_GATEWAY_MODEL,
188
+ }
177
189
  assert request_body["inputs"]["prompt"] == "What is the capital of France?"
178
190
  assert request_body["inputs"]["response"] == "The capital of France is Paris."
179
191
 
180
192
 
181
193
  @responses.activate
182
- def test_answer_relevance_api_error_handling() -> None:
194
+ def test_answer_relevance_api_error_handling(evaluator: AnswerRelevance) -> None:
183
195
  """When API call raises an exception
184
196
  Then it should propagate the exception
185
197
  And should not return a score."""
186
- evaluator = AnswerRelevance()
187
198
 
188
199
  # Mock API error response
189
200
  responses.post(
@@ -206,7 +217,10 @@ def test_answer_relevance_api_error_handling() -> None:
206
217
  # Verify request body
207
218
  request_body = json.loads(request.body)
208
219
  assert request_body["evaluator_name"] == "answer_relevance"
209
- assert request_body["parameters"] == {}
220
+ assert request_body["parameters"] == {
221
+ "credential": LLM_GATEWAY_CREDENTIAL,
222
+ "model": LLM_GATEWAY_MODEL,
223
+ }
210
224
  assert request_body["inputs"]["prompt"] == "What is the capital of France?"
211
225
  assert request_body["inputs"]["response"] == "The capital of France is Paris."
212
226
 
@@ -224,11 +238,12 @@ def test_answer_relevance_api_error_handling() -> None:
224
238
  ("What is the capital of France?", " \t\n "),
225
239
  ],
226
240
  )
227
- def test_answer_relevance_validation_errors(prompt, response) -> None:
241
+ def test_answer_relevance_validation_errors(
242
+ evaluator: AnswerRelevance, prompt: str, response: str
243
+ ) -> None:
228
244
  """When providing invalid prompt or response
229
245
  Then it should raise appropriate ValueError
230
246
  And should not make API call."""
231
- evaluator = AnswerRelevance()
232
247
 
233
248
  with pytest.raises(ValueError, match="prompt and response are required"):
234
249
  evaluator.score(prompt=prompt, response=response)