fiddler-evals 0.1.1.dev13__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. {fiddler_evals-0.1.1.dev13/fiddler_evals.egg-info → fiddler_evals-0.2.0}/PKG-INFO +162 -29
  2. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/PUBLIC.md +161 -27
  3. fiddler_evals-0.2.0/fiddler_evals/VERSION +1 -0
  4. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/__init__.py +0 -2
  5. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/connection.py +58 -95
  6. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/application.py +82 -77
  7. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/dataset.py +382 -331
  8. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/experiment.py +292 -285
  9. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/project.py +67 -60
  10. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset_items.py +196 -0
  11. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_results.py +48 -13
  12. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/__init__.py +0 -2
  13. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/answer_relevance.py +30 -22
  14. fiddler_evals-0.2.0/fiddler_evals/evaluators/base.py +270 -0
  15. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/coherence.py +15 -9
  16. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/conciseness.py +7 -4
  17. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/eval_fn.py +19 -9
  18. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_prompt_safety.py +1 -1
  19. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/ftl_response_faithfulness.py +1 -1
  20. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/regex.py +11 -14
  21. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/sentiment.py +1 -1
  22. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_answer_relevance.py +65 -16
  23. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_coherence.py +58 -76
  24. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_conciseness.py +62 -16
  25. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_eval_fn.py +52 -0
  26. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +6 -6
  27. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +1 -1
  28. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_regex.py +39 -0
  29. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_sentiment.py +6 -6
  30. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/test_topic_classification.py +6 -6
  31. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/topic.py +5 -3
  32. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/exceptions.py +44 -62
  33. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/experiment.py +2 -0
  34. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/evaluation.py +80 -43
  35. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_runner.py +71 -55
  36. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_evaluate.py +419 -34
  37. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +19 -2
  38. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/constants.py +3 -0
  39. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0/fiddler_evals.egg-info}/PKG-INFO +162 -29
  40. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/SOURCES.txt +0 -2
  41. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/requires.txt +0 -1
  42. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/pyproject.toml +1 -2
  43. fiddler_evals-0.1.1.dev13/fiddler_evals/VERSION +0 -1
  44. fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/base.py +0 -141
  45. fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/tests/test_toxicity.py +0 -201
  46. fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/toxicity.py +0 -101
  47. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/MANIFEST.in +0 -0
  48. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/README.md +0 -0
  49. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/configs.py +0 -0
  50. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/conftest.py +0 -0
  51. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/constants.py +0 -0
  52. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/decorators.py +0 -0
  53. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/__init__.py +0 -0
  54. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/base.py +0 -0
  55. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/__init__.py +0 -0
  56. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_application.py +0 -0
  57. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_dataset.py +0 -0
  58. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment.py +0 -0
  59. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
  60. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/entities/tests/test_project.py +0 -0
  61. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/evaluators/tests/__init__.py +0 -0
  62. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/__init__.py +0 -0
  63. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/http_client.py +0 -0
  64. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/json_encoder.py +0 -0
  65. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/semver.py +0 -0
  66. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/__init__.py +0 -0
  67. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
  68. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/libs/tests/test_request_client.py +0 -0
  69. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/__init__.py +0 -0
  70. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/application.py +0 -0
  71. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/base.py +0 -0
  72. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/compact.py +0 -0
  73. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/dataset.py +0 -0
  74. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/error.py +0 -0
  75. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/evaluator.py +0 -0
  76. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/filter_query.py +0 -0
  77. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/project.py +0 -0
  78. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/response.py +0 -0
  79. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/score.py +0 -0
  80. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/pydantic_models/server_info.py +0 -0
  81. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/__init__.py +0 -0
  82. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/executor.py +0 -0
  83. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
  84. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/runner/tests/__init__.py +0 -0
  85. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/__init__.py +0 -0
  86. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_connection.py +0 -0
  87. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/tests/test_decorators.py +0 -0
  88. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/__init__.py +0 -0
  89. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/environment.py +0 -0
  90. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/pd.py +0 -0
  91. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/__init__.py +0 -0
  92. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tests/test_environment.py +0 -0
  93. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/utils/tqdm.py +0 -0
  94. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals/version.py +0 -0
  95. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/dependency_links.txt +0 -0
  96. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/fiddler_evals.egg-info/top_level.txt +0 -0
  97. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fiddler-evals
3
- Version: 0.1.1.dev13
3
+ Version: 0.2.0
4
4
  Summary: Python SDK for evaluating LLM Applications
5
5
  Author-email: Fiddler AI <support@fiddler.ai>
6
6
  Maintainer-email: Fiddler AI <support@fiddler.ai>
@@ -15,7 +15,6 @@ Requires-Dist: requests<3
15
15
  Requires-Dist: pydantic>=2.0.0
16
16
  Requires-Dist: tqdm
17
17
  Requires-Dist: typing-extensions<5,>=4.6.0
18
- Requires-Dist: pandas>=1.2.5
19
18
  Requires-Dist: python-decouple
20
19
  Provides-Extra: pandas
21
20
  Requires-Dist: pandas>=1.2.5; extra == "pandas"
@@ -60,7 +59,7 @@ pip install --upgrade --pre fiddler-evals
60
59
  from fiddler_evals import init
61
60
 
62
61
  # Initialize connection
63
- init(url='https://your-org.fiddler.ai', token='your-api-token')
62
+ init(url='https://your-instance.fiddler.ai', token='your-api-token')
64
63
  ```
65
64
 
66
65
  ### 2. Create Project Structure
@@ -101,19 +100,32 @@ dataset.insert(test_cases)
101
100
 
102
101
  ### 4. Use Built-in Evaluators
103
102
 
103
+ **Configure LLM Gateway provider:**
104
+
105
+ Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
106
+ LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
107
+
104
108
  ```python
105
109
  from fiddler_evals.evaluators import (
106
110
  AnswerRelevance, Coherence, Conciseness,
107
- Toxicity, Sentiment, RegexSearch
111
+ Sentiment, RegexSearch
108
112
  )
109
113
 
110
- # Test individual evaluators
111
- relevance_evaluator = AnswerRelevance()
114
+ # Test LLM-as-a-Judge evaluators (require model parameter)
115
+ relevance_evaluator = AnswerRelevance(
116
+ model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
117
+ credential="my-openai-cred" # Optional: LLM Gateway credential name
118
+ )
112
119
  score = relevance_evaluator.score(
113
120
  prompt="What is the capital of France?",
114
121
  response="Paris is the capital of France."
115
122
  )
116
123
  print(f"Score: {score.value} - {score.reasoning}")
124
+
125
+ # Test other evaluators (no model parameter needed)
126
+ sentiment_evaluator = Sentiment()
127
+ scores = sentiment_evaluator.score(text="This is a helpful response.")
128
+ print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
117
129
  ```
118
130
 
119
131
  ### 5. Create Custom Evaluators
@@ -128,8 +140,11 @@ class PolitenessEvaluator(Evaluator):
128
140
  Useful for customer service or chatbot applications.
129
141
  """
130
142
 
131
- def __init__(self):
132
- super().__init__()
143
+ def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
144
+ super().__init__(
145
+ score_name_prefix=score_name_prefix,
146
+ score_fn_kwargs_mapping=score_fn_kwargs_mapping
147
+ )
133
148
  self.polite_words = [
134
149
  'please', 'thank you', 'thanks', 'sorry', 'apologize',
135
150
  'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -151,13 +166,13 @@ class PolitenessEvaluator(Evaluator):
151
166
  reasoning = "No polite language detected"
152
167
 
153
168
  return Score(
154
- name="politeness",
169
+ name=f"{self.score_name_prefix}politeness",
155
170
  evaluator_name=self.name,
156
171
  value=score_value,
157
172
  reasoning=reasoning
158
173
  )
159
174
 
160
- # Test the evaluator
175
+ # Test the evaluator with different configurations
161
176
  politeness_evaluator = PolitenessEvaluator()
162
177
 
163
178
  polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -165,6 +180,17 @@ impolite_response = "I don't know. Figure it out yourself."
165
180
 
166
181
  print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
167
182
  print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
183
+
184
+ # Use with different configurations
185
+ customer_service_evaluator = PolitenessEvaluator(
186
+ score_name_prefix="customer_service",
187
+ score_fn_kwargs_mapping={"output": "response"}
188
+ )
189
+
190
+ support_evaluator = PolitenessEvaluator(
191
+ score_name_prefix="support",
192
+ score_fn_kwargs_mapping={"output": "answer"}
193
+ )
168
194
  ```
169
195
 
170
196
  ### 5.1. Function-Based Evaluators
@@ -185,8 +211,8 @@ def contains_number_evaluator(output: str) -> float:
185
211
 
186
212
  # Use functions directly in evaluators list
187
213
  evaluators = [
188
- AnswerRelevance(),
189
- Conciseness(),
214
+ AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
215
+ Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
190
216
  word_count_evaluator, # Function evaluator
191
217
  contains_number_evaluator, # Function evaluator
192
218
  ]
@@ -215,12 +241,42 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
215
241
  answer = call_your_llm(question)
216
242
  return {"answer": answer}
217
243
 
218
- # Set up evaluators
244
+ # Set up evaluators with different configurations
219
245
  evaluators = [
220
- AnswerRelevance(),
221
- Conciseness(),
222
- Sentiment(),
223
- PolitenessEvaluator(),
246
+ # LLM-as-a-Judge evaluators (require model parameter)
247
+ AnswerRelevance(
248
+ model="openai/gpt-4o",
249
+ credential="my-openai-cred",
250
+ score_name_prefix="primary"
251
+ ),
252
+ Conciseness(
253
+ model="openai/gpt-4o",
254
+ credential="my-openai-cred",
255
+ score_name_prefix="primary"
256
+ ),
257
+
258
+ # Other evaluators
259
+ Sentiment(score_name_prefix="primary"),
260
+
261
+ # Custom evaluators with specific mappings
262
+ PolitenessEvaluator(
263
+ score_name_prefix="quality",
264
+ score_fn_kwargs_mapping={"output": "answer"}
265
+ ),
266
+
267
+ # Multiple instances of same evaluator for different fields
268
+ RegexSearch(
269
+ pattern=r"\d+",
270
+ score_name_prefix="question",
271
+ score_name="has_number",
272
+ score_fn_kwargs_mapping={"output": "question"}
273
+ ),
274
+ RegexSearch(
275
+ pattern=r"\d+",
276
+ score_name_prefix="answer",
277
+ score_name="has_number",
278
+ score_fn_kwargs_mapping={"output": "answer"}
279
+ ),
224
280
  ]
225
281
 
226
282
  # Run evaluation
@@ -231,9 +287,8 @@ experiment_result = evaluate(
231
287
  name_prefix="my_evaluation",
232
288
  description="Comprehensive LLM evaluation",
233
289
  score_fn_kwargs_mapping={
234
- "question": "question",
290
+ "question": lambda x: x["inputs"]["question"],
235
291
  "response": "answer",
236
- "output": "answer",
237
292
  "text": "answer",
238
293
  "prompt": lambda x: x["inputs"]["question"],
239
294
  }
@@ -241,20 +296,25 @@ experiment_result = evaluate(
241
296
 
242
297
  print(f"Evaluated {len(experiment_result.results)} test cases")
243
298
  print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
299
+
300
+ # Results in organized score names:
301
+ # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
302
+ # "quality_politeness", "question_has_number", "answer_has_number"
244
303
  ```
245
304
 
246
305
  ## Built-in Evaluators
247
306
 
248
- | Evaluator | Purpose | Key Parameters |
249
- |-----------|---------|----------------|
250
- | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
251
- | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
252
- | `Conciseness` | Measures response brevity and clarity | `response` |
253
- | `Toxicity` | Detects harmful or toxic content | `text` |
254
- | `Sentiment` | Analyzes emotional tone | `text` |
255
- | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
256
- | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
257
- | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
307
+ | Evaluator | Purpose | Constructor Parameters | Score Parameters |
308
+ |-----------|---------|------------------------|------------------|
309
+ | `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
310
+ | `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
311
+ | `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
312
+ | `Sentiment` | Analyzes emotional tone | - | `text` |
313
+ | `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
314
+ | `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
315
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
316
+
317
+ **Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
258
318
 
259
319
  ## Data Import Options
260
320
 
@@ -326,6 +386,79 @@ score_fn_kwargs_mapping={
326
386
  }
327
387
  ```
328
388
 
389
+ ### Multiple Evaluator Instances with Different Mappings
390
+
391
+ You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
392
+
393
+ ```python
394
+ from fiddler_evals.evaluators import RegexSearch
395
+
396
+ # Create multiple RegexSearch evaluators for different fields
397
+ evaluators = [
398
+ # Check for numbers in the question
399
+ RegexSearch(
400
+ pattern=r"\d+",
401
+ score_name_prefix="question",
402
+ score_name="has_number",
403
+ score_fn_kwargs_mapping={"output": "question"}
404
+ ),
405
+ # Check for numbers in the answer
406
+ RegexSearch(
407
+ pattern=r"\d+",
408
+ score_name_prefix="answer",
409
+ score_name="has_number",
410
+ score_fn_kwargs_mapping={"output": "answer"}
411
+ ),
412
+ # Check for capital letters in the answer
413
+ RegexSearch(
414
+ pattern=r"[A-Z]",
415
+ score_name_prefix="answer",
416
+ score_name="has_caps",
417
+ score_fn_kwargs_mapping={"output": "answer"}
418
+ )
419
+ ]
420
+
421
+ # Run evaluation
422
+ experiment_result = evaluate(
423
+ dataset=dataset,
424
+ task=my_llm_task,
425
+ evaluators=evaluators,
426
+ score_fn_kwargs_mapping={
427
+ "question": lambda x: x["inputs"]["question"]
428
+ }
429
+ )
430
+
431
+ # Results in scores named:
432
+ # "question_has_number", "answer_has_number", "answer_has_caps"
433
+ ```
434
+
435
+ ### Parameter Mapping Priority
436
+
437
+ When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
438
+
439
+ ```python
440
+ # Evaluator-level mapping (higher priority)
441
+ evaluator = RegexSearch(
442
+ pattern=r"\d+",
443
+ score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
444
+ )
445
+
446
+ # Evaluation-level mapping (lower priority)
447
+ experiment_result = evaluate(
448
+ dataset=dataset,
449
+ task=my_llm_task,
450
+ evaluators=[evaluator],
451
+ score_fn_kwargs_mapping={
452
+ "output": "question" # This is ignored due to evaluator-level mapping
453
+ }
454
+ )
455
+ ```
456
+
457
+ **Mapping Priority (highest to lowest):**
458
+ 1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
459
+ 2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
460
+ 3. Default parameter resolution
461
+
329
462
  ### Experiment Metadata
330
463
  ```python
331
464
  experiment_result = evaluate(
@@ -38,7 +38,7 @@ pip install --upgrade --pre fiddler-evals
38
38
  from fiddler_evals import init
39
39
 
40
40
  # Initialize connection
41
- init(url='https://your-org.fiddler.ai', token='your-api-token')
41
+ init(url='https://your-instance.fiddler.ai', token='your-api-token')
42
42
  ```
43
43
 
44
44
  ### 2. Create Project Structure
@@ -79,19 +79,32 @@ dataset.insert(test_cases)
79
79
 
80
80
  ### 4. Use Built-in Evaluators
81
81
 
82
+ **Configure LLM Gateway provider:**
83
+
84
+ Add an LLM provider via the UI (**Settings > LLM Gateway**) to use Fiddler's pre-built LLM-as-a-Judge evaluators.
85
+ LLM-as-a-Judge evaluators require a `model` parameter in the format `{provider}/{model}` (e.g., `openai/gpt-4o`) and an optional `credential` parameter for LLM Gateway authentication.
86
+
82
87
  ```python
83
88
  from fiddler_evals.evaluators import (
84
89
  AnswerRelevance, Coherence, Conciseness,
85
- Toxicity, Sentiment, RegexSearch
90
+ Sentiment, RegexSearch
86
91
  )
87
92
 
88
- # Test individual evaluators
89
- relevance_evaluator = AnswerRelevance()
93
+ # Test LLM-as-a-Judge evaluators (require model parameter)
94
+ relevance_evaluator = AnswerRelevance(
95
+ model="openai/gpt-4o", # Required: LLM Gateway model in {provider}/{model} format
96
+ credential="my-openai-cred" # Optional: LLM Gateway credential name
97
+ )
90
98
  score = relevance_evaluator.score(
91
99
  prompt="What is the capital of France?",
92
100
  response="Paris is the capital of France."
93
101
  )
94
102
  print(f"Score: {score.value} - {score.reasoning}")
103
+
104
+ # Test other evaluators (no model parameter needed)
105
+ sentiment_evaluator = Sentiment()
106
+ scores = sentiment_evaluator.score(text="This is a helpful response.")
107
+ print("Sentiments:", [f'{score.name}: {score.value}' for score in scores])
95
108
  ```
96
109
 
97
110
  ### 5. Create Custom Evaluators
@@ -106,8 +119,11 @@ class PolitenessEvaluator(Evaluator):
106
119
  Useful for customer service or chatbot applications.
107
120
  """
108
121
 
109
- def __init__(self):
110
- super().__init__()
122
+ def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
123
+ super().__init__(
124
+ score_name_prefix=score_name_prefix,
125
+ score_fn_kwargs_mapping=score_fn_kwargs_mapping
126
+ )
111
127
  self.polite_words = [
112
128
  'please', 'thank you', 'thanks', 'sorry', 'apologize',
113
129
  'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -129,13 +145,13 @@ class PolitenessEvaluator(Evaluator):
129
145
  reasoning = "No polite language detected"
130
146
 
131
147
  return Score(
132
- name="politeness",
148
+ name=f"{self.score_name_prefix}politeness",
133
149
  evaluator_name=self.name,
134
150
  value=score_value,
135
151
  reasoning=reasoning
136
152
  )
137
153
 
138
- # Test the evaluator
154
+ # Test the evaluator with different configurations
139
155
  politeness_evaluator = PolitenessEvaluator()
140
156
 
141
157
  polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -143,6 +159,17 @@ impolite_response = "I don't know. Figure it out yourself."
143
159
 
144
160
  print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
145
161
  print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
162
+
163
+ # Use with different configurations
164
+ customer_service_evaluator = PolitenessEvaluator(
165
+ score_name_prefix="customer_service",
166
+ score_fn_kwargs_mapping={"output": "response"}
167
+ )
168
+
169
+ support_evaluator = PolitenessEvaluator(
170
+ score_name_prefix="support",
171
+ score_fn_kwargs_mapping={"output": "answer"}
172
+ )
146
173
  ```
147
174
 
148
175
  ### 5.1. Function-Based Evaluators
@@ -163,8 +190,8 @@ def contains_number_evaluator(output: str) -> float:
163
190
 
164
191
  # Use functions directly in evaluators list
165
192
  evaluators = [
166
- AnswerRelevance(),
167
- Conciseness(),
193
+ AnswerRelevance(model="openai/gpt-4o", credential="my-openai-cred"),
194
+ Conciseness(model="openai/gpt-4o", credential="my-openai-cred"),
168
195
  word_count_evaluator, # Function evaluator
169
196
  contains_number_evaluator, # Function evaluator
170
197
  ]
@@ -193,12 +220,42 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
193
220
  answer = call_your_llm(question)
194
221
  return {"answer": answer}
195
222
 
196
- # Set up evaluators
223
+ # Set up evaluators with different configurations
197
224
  evaluators = [
198
- AnswerRelevance(),
199
- Conciseness(),
200
- Sentiment(),
201
- PolitenessEvaluator(),
225
+ # LLM-as-a-Judge evaluators (require model parameter)
226
+ AnswerRelevance(
227
+ model="openai/gpt-4o",
228
+ credential="my-openai-cred",
229
+ score_name_prefix="primary"
230
+ ),
231
+ Conciseness(
232
+ model="openai/gpt-4o",
233
+ credential="my-openai-cred",
234
+ score_name_prefix="primary"
235
+ ),
236
+
237
+ # Other evaluators
238
+ Sentiment(score_name_prefix="primary"),
239
+
240
+ # Custom evaluators with specific mappings
241
+ PolitenessEvaluator(
242
+ score_name_prefix="quality",
243
+ score_fn_kwargs_mapping={"output": "answer"}
244
+ ),
245
+
246
+ # Multiple instances of same evaluator for different fields
247
+ RegexSearch(
248
+ pattern=r"\d+",
249
+ score_name_prefix="question",
250
+ score_name="has_number",
251
+ score_fn_kwargs_mapping={"output": "question"}
252
+ ),
253
+ RegexSearch(
254
+ pattern=r"\d+",
255
+ score_name_prefix="answer",
256
+ score_name="has_number",
257
+ score_fn_kwargs_mapping={"output": "answer"}
258
+ ),
202
259
  ]
203
260
 
204
261
  # Run evaluation
@@ -209,9 +266,8 @@ experiment_result = evaluate(
209
266
  name_prefix="my_evaluation",
210
267
  description="Comprehensive LLM evaluation",
211
268
  score_fn_kwargs_mapping={
212
- "question": "question",
269
+ "question": lambda x: x["inputs"]["question"],
213
270
  "response": "answer",
214
- "output": "answer",
215
271
  "text": "answer",
216
272
  "prompt": lambda x: x["inputs"]["question"],
217
273
  }
@@ -219,20 +275,25 @@ experiment_result = evaluate(
219
275
 
220
276
  print(f"Evaluated {len(experiment_result.results)} test cases")
221
277
  print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
278
+
279
+ # Results in organized score names:
280
+ # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
281
+ # "quality_politeness", "question_has_number", "answer_has_number"
222
282
  ```
223
283
 
224
284
  ## Built-in Evaluators
225
285
 
226
- | Evaluator | Purpose | Key Parameters |
227
- |-----------|---------|----------------|
228
- | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
229
- | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
230
- | `Conciseness` | Measures response brevity and clarity | `response` |
231
- | `Toxicity` | Detects harmful or toxic content | `text` |
232
- | `Sentiment` | Analyzes emotional tone | `text` |
233
- | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
234
- | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
235
- | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
286
+ | Evaluator | Purpose | Constructor Parameters | Score Parameters |
287
+ |-----------|---------|------------------------|------------------|
288
+ | `AnswerRelevance` | Checks if response addresses the question | `model` (required), `credential` (optional) | `prompt`, `response` |
289
+ | `Coherence` | Evaluates logical flow and consistency | `model` (required), `credential` (optional) | `response`, `prompt` (optional) |
290
+ | `Conciseness` | Measures response brevity and clarity | `model` (required), `credential` (optional) | `response` |
291
+ | `Sentiment` | Analyzes emotional tone | - | `text` |
292
+ | `RegexSearch` | Pattern matching for specific formats | `pattern` (required) | `output` |
293
+ | `FTLPromptSafety` | Compute safety scores for prompts | - | `text` |
294
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | - | `response`, `context` |
295
+
296
+ **Note:** Evaluators marked with `model` and `credential` parameters are LLM-as-a-Judge evaluators that require an LLM Gateway model. The `model` parameter should be in `{provider}/{model}` format (e.g., `openai/gpt-4o`). The `credential` parameter is the name of the LLM Gateway credential for authentication.
236
297
 
237
298
  ## Data Import Options
238
299
 
@@ -304,6 +365,79 @@ score_fn_kwargs_mapping={
304
365
  }
305
366
  ```
306
367
 
368
+ ### Multiple Evaluator Instances with Different Mappings
369
+
370
+ You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
371
+
372
+ ```python
373
+ from fiddler_evals.evaluators import RegexSearch
374
+
375
+ # Create multiple RegexSearch evaluators for different fields
376
+ evaluators = [
377
+ # Check for numbers in the question
378
+ RegexSearch(
379
+ pattern=r"\d+",
380
+ score_name_prefix="question",
381
+ score_name="has_number",
382
+ score_fn_kwargs_mapping={"output": "question"}
383
+ ),
384
+ # Check for numbers in the answer
385
+ RegexSearch(
386
+ pattern=r"\d+",
387
+ score_name_prefix="answer",
388
+ score_name="has_number",
389
+ score_fn_kwargs_mapping={"output": "answer"}
390
+ ),
391
+ # Check for capital letters in the answer
392
+ RegexSearch(
393
+ pattern=r"[A-Z]",
394
+ score_name_prefix="answer",
395
+ score_name="has_caps",
396
+ score_fn_kwargs_mapping={"output": "answer"}
397
+ )
398
+ ]
399
+
400
+ # Run evaluation
401
+ experiment_result = evaluate(
402
+ dataset=dataset,
403
+ task=my_llm_task,
404
+ evaluators=evaluators,
405
+ score_fn_kwargs_mapping={
406
+ "question": lambda x: x["inputs"]["question"]
407
+ }
408
+ )
409
+
410
+ # Results in scores named:
411
+ # "question_has_number", "answer_has_number", "answer_has_caps"
412
+ ```
413
+
414
+ ### Parameter Mapping Priority
415
+
416
+ When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
417
+
418
+ ```python
419
+ # Evaluator-level mapping (higher priority)
420
+ evaluator = RegexSearch(
421
+ pattern=r"\d+",
422
+ score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
423
+ )
424
+
425
+ # Evaluation-level mapping (lower priority)
426
+ experiment_result = evaluate(
427
+ dataset=dataset,
428
+ task=my_llm_task,
429
+ evaluators=[evaluator],
430
+ score_fn_kwargs_mapping={
431
+ "output": "question" # This is ignored due to evaluator-level mapping
432
+ }
433
+ )
434
+ ```
435
+
436
+ **Mapping Priority (highest to lowest):**
437
+ 1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
438
+ 2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
439
+ 3. Default parameter resolution
440
+
307
441
  ### Experiment Metadata
308
442
  ```python
309
443
  experiment_result = evaluate(
@@ -0,0 +1 @@
1
+ 0.2.0
@@ -20,7 +20,6 @@ from fiddler_evals.evaluators import (
20
20
  RegexSearch,
21
21
  Sentiment,
22
22
  TopicClassification,
23
- Toxicity,
24
23
  )
25
24
  from fiddler_evals.evaluators.base import Evaluator
26
25
  from fiddler_evals.evaluators.eval_fn import EvalFn
@@ -55,7 +54,6 @@ __all__ = [
55
54
  "AnswerRelevance",
56
55
  "Coherence",
57
56
  "Conciseness",
58
- "Toxicity",
59
57
  "Sentiment",
60
58
  "RegexSearch",
61
59
  "RegexMatch",