fiddler-evals 0.1.1.dev13__tar.gz → 0.1.1.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {fiddler_evals-0.1.1.dev13/fiddler_evals.egg-info → fiddler_evals-0.1.1.dev14}/PKG-INFO +122 -12
  2. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/PUBLIC.md +121 -11
  3. fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION +1 -0
  4. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/dataset.py +35 -0
  5. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/experiment.py +1 -1
  6. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_dataset_items.py +196 -0
  7. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment_results.py +48 -13
  8. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/answer_relevance.py +1 -1
  9. fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/base.py +245 -0
  10. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/coherence.py +1 -1
  11. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/conciseness.py +1 -1
  12. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/eval_fn.py +19 -9
  13. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/ftl_prompt_safety.py +1 -1
  14. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/ftl_response_faithfulness.py +1 -1
  15. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/regex.py +7 -10
  16. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/sentiment.py +1 -1
  17. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_answer_relevance.py +1 -1
  18. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_coherence.py +1 -1
  19. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_conciseness.py +1 -1
  20. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_eval_fn.py +52 -0
  21. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +6 -6
  22. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +1 -1
  23. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_regex.py +39 -0
  24. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_sentiment.py +6 -6
  25. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_topic_classification.py +6 -6
  26. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_toxicity.py +6 -6
  27. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/topic.py +5 -3
  28. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/toxicity.py +1 -1
  29. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/experiment.py +2 -0
  30. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/evaluation.py +43 -8
  31. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/experiment_runner.py +68 -50
  32. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/test_evaluate.py +264 -1
  33. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +19 -2
  34. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14/fiddler_evals.egg-info}/PKG-INFO +122 -12
  35. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/pyproject.toml +1 -1
  36. fiddler_evals-0.1.1.dev13/fiddler_evals/VERSION +0 -1
  37. fiddler_evals-0.1.1.dev13/fiddler_evals/evaluators/base.py +0 -141
  38. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/MANIFEST.in +0 -0
  39. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/README.md +0 -0
  40. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/__init__.py +0 -0
  41. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/configs.py +0 -0
  42. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/conftest.py +0 -0
  43. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/connection.py +0 -0
  44. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/constants.py +0 -0
  45. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/decorators.py +0 -0
  46. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/__init__.py +0 -0
  47. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/application.py +0 -0
  48. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/base.py +0 -0
  49. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/project.py +0 -0
  50. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/__init__.py +0 -0
  51. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_application.py +0 -0
  52. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_dataset.py +0 -0
  53. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment.py +0 -0
  54. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
  55. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_project.py +0 -0
  56. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/__init__.py +0 -0
  57. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/__init__.py +0 -0
  58. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/exceptions.py +0 -0
  59. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/__init__.py +0 -0
  60. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/http_client.py +0 -0
  61. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/json_encoder.py +0 -0
  62. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/semver.py +0 -0
  63. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/__init__.py +0 -0
  64. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
  65. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/test_request_client.py +0 -0
  66. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/__init__.py +0 -0
  67. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/application.py +0 -0
  68. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/base.py +0 -0
  69. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/compact.py +0 -0
  70. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/dataset.py +0 -0
  71. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/error.py +0 -0
  72. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/evaluator.py +0 -0
  73. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/filter_query.py +0 -0
  74. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/project.py +0 -0
  75. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/response.py +0 -0
  76. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/score.py +0 -0
  77. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/server_info.py +0 -0
  78. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/__init__.py +0 -0
  79. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/executor.py +0 -0
  80. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/experiment_result_publisher.py +0 -0
  81. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/__init__.py +0 -0
  82. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/__init__.py +0 -0
  83. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/constants.py +0 -0
  84. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/test_connection.py +0 -0
  85. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/test_decorators.py +0 -0
  86. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/__init__.py +0 -0
  87. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/environment.py +0 -0
  88. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/pd.py +0 -0
  89. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tests/__init__.py +0 -0
  90. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tests/test_environment.py +0 -0
  91. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tqdm.py +0 -0
  92. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/version.py +0 -0
  93. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/SOURCES.txt +0 -0
  94. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/dependency_links.txt +0 -0
  95. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/requires.txt +0 -0
  96. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/top_level.txt +0 -0
  97. {fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fiddler-evals
3
- Version: 0.1.1.dev13
3
+ Version: 0.1.1.dev14
4
4
  Summary: Python SDK for evaluating LLM Applications
5
5
  Author-email: Fiddler AI <support@fiddler.ai>
6
6
  Maintainer-email: Fiddler AI <support@fiddler.ai>
@@ -128,8 +128,11 @@ class PolitenessEvaluator(Evaluator):
128
128
  Useful for customer service or chatbot applications.
129
129
  """
130
130
 
131
- def __init__(self):
132
- super().__init__()
131
+ def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
132
+ super().__init__(
133
+ score_name_prefix=score_name_prefix,
134
+ score_fn_kwargs_mapping=score_fn_kwargs_mapping
135
+ )
133
136
  self.polite_words = [
134
137
  'please', 'thank you', 'thanks', 'sorry', 'apologize',
135
138
  'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -151,13 +154,13 @@ class PolitenessEvaluator(Evaluator):
151
154
  reasoning = "No polite language detected"
152
155
 
153
156
  return Score(
154
- name="politeness",
157
+ name=f"{self.score_name_prefix}politeness",
155
158
  evaluator_name=self.name,
156
159
  value=score_value,
157
160
  reasoning=reasoning
158
161
  )
159
162
 
160
- # Test the evaluator
163
+ # Test the evaluator with different configurations
161
164
  politeness_evaluator = PolitenessEvaluator()
162
165
 
163
166
  polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -165,6 +168,17 @@ impolite_response = "I don't know. Figure it out yourself."
165
168
 
166
169
  print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
167
170
  print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
171
+
172
+ # Use with different configurations
173
+ customer_service_evaluator = PolitenessEvaluator(
174
+ score_name_prefix="customer_service",
175
+ score_fn_kwargs_mapping={"output": "response"}
176
+ )
177
+
178
+ support_evaluator = PolitenessEvaluator(
179
+ score_name_prefix="support",
180
+ score_fn_kwargs_mapping={"output": "answer"}
181
+ )
168
182
  ```
169
183
 
170
184
  ### 5.1. Function-Based Evaluators
@@ -215,12 +229,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
215
229
  answer = call_your_llm(question)
216
230
  return {"answer": answer}
217
231
 
218
- # Set up evaluators
232
+ # Set up evaluators with different configurations
219
233
  evaluators = [
220
- AnswerRelevance(),
221
- Conciseness(),
222
- Sentiment(),
223
- PolitenessEvaluator(),
234
+ # Primary evaluation metrics
235
+ AnswerRelevance(score_name_prefix="primary"),
236
+ Conciseness(score_name_prefix="primary"),
237
+ Sentiment(score_name_prefix="primary"),
238
+
239
+ # Custom evaluators with specific mappings
240
+ PolitenessEvaluator(
241
+ score_name_prefix="quality",
242
+ score_fn_kwargs_mapping={"output": "answer"}
243
+ ),
244
+
245
+ # Multiple instances of same evaluator for different fields
246
+ RegexSearch(
247
+ pattern=r"\d+",
248
+ score_name_prefix="validation",
249
+ score_name="has_number",
250
+ score_fn_kwargs_mapping={"output": "question"}
251
+ ),
252
+ RegexSearch(
253
+ pattern=r"\d+",
254
+ score_name_prefix="validation",
255
+ score_name="has_number",
256
+ score_fn_kwargs_mapping={"output": "answer"}
257
+ ),
224
258
  ]
225
259
 
226
260
  # Run evaluation
@@ -231,9 +265,8 @@ experiment_result = evaluate(
231
265
  name_prefix="my_evaluation",
232
266
  description="Comprehensive LLM evaluation",
233
267
  score_fn_kwargs_mapping={
234
- "question": "question",
268
+ "question": lambda x: x["inputs"]["question"],
235
269
  "response": "answer",
236
- "output": "answer",
237
270
  "text": "answer",
238
271
  "prompt": lambda x: x["inputs"]["question"],
239
272
  }
@@ -241,6 +274,10 @@ experiment_result = evaluate(
241
274
 
242
275
  print(f"Evaluated {len(experiment_result.results)} test cases")
243
276
  print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
277
+
278
+ # Results in organized score names:
279
+ # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
280
+ # "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
244
281
  ```
245
282
 
246
283
  ## Built-in Evaluators
@@ -326,6 +363,79 @@ score_fn_kwargs_mapping={
326
363
  }
327
364
  ```
328
365
 
366
+ ### Multiple Evaluator Instances with Different Mappings
367
+
368
+ You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
369
+
370
+ ```python
371
+ from fiddler_evals.evaluators import RegexSearch
372
+
373
+ # Create multiple RegexSearch evaluators for different fields
374
+ evaluators = [
375
+ # Check for numbers in the question
376
+ RegexSearch(
377
+ pattern=r"\d+",
378
+ score_name_prefix="question",
379
+ score_name="has_number",
380
+ score_fn_kwargs_mapping={"output": "question"}
381
+ ),
382
+ # Check for numbers in the answer
383
+ RegexSearch(
384
+ pattern=r"\d+",
385
+ score_name_prefix="answer",
386
+ score_name="has_number",
387
+ score_fn_kwargs_mapping={"output": "answer"}
388
+ ),
389
+ # Check for capital letters in the answer
390
+ RegexSearch(
391
+ pattern=r"[A-Z]",
392
+ score_name_prefix="answer",
393
+ score_name="has_caps",
394
+ score_fn_kwargs_mapping={"output": "answer"}
395
+ )
396
+ ]
397
+
398
+ # Run evaluation
399
+ experiment_result = evaluate(
400
+ dataset=dataset,
401
+ task=my_llm_task,
402
+ evaluators=evaluators,
403
+ score_fn_kwargs_mapping={
404
+ "question": lambda x: x["inputs"]["question"]
405
+ }
406
+ )
407
+
408
+ # Results in scores named:
409
+ # "question_has_number", "answer_has_number", "answer_has_caps"
410
+ ```
411
+
412
+ ### Parameter Mapping Priority
413
+
414
+ When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
415
+
416
+ ```python
417
+ # Evaluator-level mapping (higher priority)
418
+ evaluator = RegexSearch(
419
+ pattern=r"\d+",
420
+ score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
421
+ )
422
+
423
+ # Evaluation-level mapping (lower priority)
424
+ experiment_result = evaluate(
425
+ dataset=dataset,
426
+ task=my_llm_task,
427
+ evaluators=[evaluator],
428
+ score_fn_kwargs_mapping={
429
+ "output": "question" # This is ignored due to evaluator-level mapping
430
+ }
431
+ )
432
+ ```
433
+
434
+ **Mapping Priority (highest to lowest):**
435
+ 1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
436
+ 2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
437
+ 3. Default parameter resolution
438
+
329
439
  ### Experiment Metadata
330
440
  ```python
331
441
  experiment_result = evaluate(
@@ -106,8 +106,11 @@ class PolitenessEvaluator(Evaluator):
106
106
  Useful for customer service or chatbot applications.
107
107
  """
108
108
 
109
- def __init__(self):
110
- super().__init__()
109
+ def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
110
+ super().__init__(
111
+ score_name_prefix=score_name_prefix,
112
+ score_fn_kwargs_mapping=score_fn_kwargs_mapping
113
+ )
111
114
  self.polite_words = [
112
115
  'please', 'thank you', 'thanks', 'sorry', 'apologize',
113
116
  'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -129,13 +132,13 @@ class PolitenessEvaluator(Evaluator):
129
132
  reasoning = "No polite language detected"
130
133
 
131
134
  return Score(
132
- name="politeness",
135
+ name=f"{self.score_name_prefix}politeness",
133
136
  evaluator_name=self.name,
134
137
  value=score_value,
135
138
  reasoning=reasoning
136
139
  )
137
140
 
138
- # Test the evaluator
141
+ # Test the evaluator with different configurations
139
142
  politeness_evaluator = PolitenessEvaluator()
140
143
 
141
144
  polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -143,6 +146,17 @@ impolite_response = "I don't know. Figure it out yourself."
143
146
 
144
147
  print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
145
148
  print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
149
+
150
+ # Use with different configurations
151
+ customer_service_evaluator = PolitenessEvaluator(
152
+ score_name_prefix="customer_service",
153
+ score_fn_kwargs_mapping={"output": "response"}
154
+ )
155
+
156
+ support_evaluator = PolitenessEvaluator(
157
+ score_name_prefix="support",
158
+ score_fn_kwargs_mapping={"output": "answer"}
159
+ )
146
160
  ```
147
161
 
148
162
  ### 5.1. Function-Based Evaluators
@@ -193,12 +207,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
193
207
  answer = call_your_llm(question)
194
208
  return {"answer": answer}
195
209
 
196
- # Set up evaluators
210
+ # Set up evaluators with different configurations
197
211
  evaluators = [
198
- AnswerRelevance(),
199
- Conciseness(),
200
- Sentiment(),
201
- PolitenessEvaluator(),
212
+ # Primary evaluation metrics
213
+ AnswerRelevance(score_name_prefix="primary"),
214
+ Conciseness(score_name_prefix="primary"),
215
+ Sentiment(score_name_prefix="primary"),
216
+
217
+ # Custom evaluators with specific mappings
218
+ PolitenessEvaluator(
219
+ score_name_prefix="quality",
220
+ score_fn_kwargs_mapping={"output": "answer"}
221
+ ),
222
+
223
+ # Multiple instances of same evaluator for different fields
224
+ RegexSearch(
225
+ pattern=r"\d+",
226
+ score_name_prefix="validation",
227
+ score_name="has_number",
228
+ score_fn_kwargs_mapping={"output": "question"}
229
+ ),
230
+ RegexSearch(
231
+ pattern=r"\d+",
232
+ score_name_prefix="validation",
233
+ score_name="has_number",
234
+ score_fn_kwargs_mapping={"output": "answer"}
235
+ ),
202
236
  ]
203
237
 
204
238
  # Run evaluation
@@ -209,9 +243,8 @@ experiment_result = evaluate(
209
243
  name_prefix="my_evaluation",
210
244
  description="Comprehensive LLM evaluation",
211
245
  score_fn_kwargs_mapping={
212
- "question": "question",
246
+ "question": lambda x: x["inputs"]["question"],
213
247
  "response": "answer",
214
- "output": "answer",
215
248
  "text": "answer",
216
249
  "prompt": lambda x: x["inputs"]["question"],
217
250
  }
@@ -219,6 +252,10 @@ experiment_result = evaluate(
219
252
 
220
253
  print(f"Evaluated {len(experiment_result.results)} test cases")
221
254
  print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
255
+
256
+ # Results in organized score names:
257
+ # "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
258
+ # "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
222
259
  ```
223
260
 
224
261
  ## Built-in Evaluators
@@ -304,6 +341,79 @@ score_fn_kwargs_mapping={
304
341
  }
305
342
  ```
306
343
 
344
+ ### Multiple Evaluator Instances with Different Mappings
345
+
346
+ You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
347
+
348
+ ```python
349
+ from fiddler_evals.evaluators import RegexSearch
350
+
351
+ # Create multiple RegexSearch evaluators for different fields
352
+ evaluators = [
353
+ # Check for numbers in the question
354
+ RegexSearch(
355
+ pattern=r"\d+",
356
+ score_name_prefix="question",
357
+ score_name="has_number",
358
+ score_fn_kwargs_mapping={"output": "question"}
359
+ ),
360
+ # Check for numbers in the answer
361
+ RegexSearch(
362
+ pattern=r"\d+",
363
+ score_name_prefix="answer",
364
+ score_name="has_number",
365
+ score_fn_kwargs_mapping={"output": "answer"}
366
+ ),
367
+ # Check for capital letters in the answer
368
+ RegexSearch(
369
+ pattern=r"[A-Z]",
370
+ score_name_prefix="answer",
371
+ score_name="has_caps",
372
+ score_fn_kwargs_mapping={"output": "answer"}
373
+ )
374
+ ]
375
+
376
+ # Run evaluation
377
+ experiment_result = evaluate(
378
+ dataset=dataset,
379
+ task=my_llm_task,
380
+ evaluators=evaluators,
381
+ score_fn_kwargs_mapping={
382
+ "question": lambda x: x["inputs"]["question"]
383
+ }
384
+ )
385
+
386
+ # Results in scores named:
387
+ # "question_has_number", "answer_has_number", "answer_has_caps"
388
+ ```
389
+
390
+ ### Parameter Mapping Priority
391
+
392
+ When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
393
+
394
+ ```python
395
+ # Evaluator-level mapping (higher priority)
396
+ evaluator = RegexSearch(
397
+ pattern=r"\d+",
398
+ score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
399
+ )
400
+
401
+ # Evaluation-level mapping (lower priority)
402
+ experiment_result = evaluate(
403
+ dataset=dataset,
404
+ task=my_llm_task,
405
+ evaluators=[evaluator],
406
+ score_fn_kwargs_mapping={
407
+ "output": "question" # This is ignored due to evaluator-level mapping
408
+ }
409
+ )
410
+ ```
411
+
412
+ **Mapping Priority (highest to lowest):**
413
+ 1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
414
+ 2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
415
+ 3. Default parameter resolution
416
+
307
417
  ### Experiment Metadata
308
418
  ```python
309
419
  experiment_result = evaluate(
@@ -0,0 +1 @@
1
+ 0.1.1.dev14
@@ -873,6 +873,35 @@ class Dataset(BaseEntity):
873
873
  if df.empty:
874
874
  raise ValueError("DataFrame cannot be empty")
875
875
 
876
+ if input_columns and (
877
+ missing_input_columns := set(input_columns) - set(df_columns)
878
+ ):
879
+ raise ValueError(
880
+ f"Input column(s) {missing_input_columns} not found in DataFrame"
881
+ )
882
+
883
+ if expected_output_columns and (
884
+ missing_expected_output_columns := set(expected_output_columns)
885
+ - set(df_columns)
886
+ ):
887
+ raise ValueError(
888
+ f"Expected output column(s) {missing_expected_output_columns} not found in DataFrame"
889
+ )
890
+
891
+ if metadata_columns and (
892
+ missing_metadata_columns := set(metadata_columns) - set(df_columns)
893
+ ):
894
+ raise ValueError(
895
+ f"Metadata column(s) {missing_metadata_columns} not found in DataFrame"
896
+ )
897
+
898
+ if extras_columns and (
899
+ missing_extras_columns := set(extras_columns) - set(df_columns)
900
+ ):
901
+ raise ValueError(
902
+ f"Extras column(s) {missing_extras_columns} not found in DataFrame"
903
+ )
904
+
876
905
  expected_output_columns = expected_output_columns or []
877
906
  metadata_columns = metadata_columns or []
878
907
  extras_columns = extras_columns or []
@@ -1185,6 +1214,9 @@ class Dataset(BaseEntity):
1185
1214
  if not rows:
1186
1215
  raise ValueError("JSONL file cannot be empty")
1187
1216
 
1217
+ if not input_keys:
1218
+ raise ValueError("Input keys cannot be empty")
1219
+
1188
1220
  expected_output_keys = expected_output_keys or []
1189
1221
  metadata_keys = metadata_keys or []
1190
1222
  extras_keys = extras_keys or []
@@ -1211,6 +1243,9 @@ class Dataset(BaseEntity):
1211
1243
  source_name = str(source_name) if source_name else None
1212
1244
  source_id = str(source_id) if source_id else None
1213
1245
 
1246
+ if all(value is None for value in inputs.values()):
1247
+ raise ValueError("All inputs cannot be empty or empty strings")
1248
+
1214
1249
  items.append(
1215
1250
  NewDatasetItem(
1216
1251
  id=dataset_id,
@@ -923,7 +923,7 @@ class Experiment(BaseEntity):
923
923
  if not items:
924
924
  raise ValueError("Items cannot be empty")
925
925
 
926
- serialized_items = [item.model_dump() for item in items]
926
+ serialized_items = [item.model_dump(exclude={"dataset_item"}) for item in items]
927
927
 
928
928
  self._client().post(
929
929
  url=f"{self._get_url(self.id)}/results",
@@ -404,6 +404,165 @@ def test_insert_items_with_empty_dataframe() -> None:
404
404
  )
405
405
 
406
406
 
407
+ @responses.activate
408
+ def test_insert_from_pandas_validation_missing_input_columns() -> None:
409
+ """Test validation when input columns are not found in DataFrame."""
410
+
411
+ df = pd.DataFrame(
412
+ {
413
+ "question": ["What is 2+2?", "What is 3+3?"],
414
+ "answer": ["4", "6"],
415
+ "difficulty": ["easy", "easy"],
416
+ }
417
+ )
418
+
419
+ with pytest.raises(
420
+ ValueError,
421
+ match=r"Input column\(s\) \{'missing_column'\} not found in DataFrame",
422
+ ):
423
+ dataset.insert_from_pandas(
424
+ df=df,
425
+ input_columns=["question", "missing_column"],
426
+ expected_output_columns=["answer"],
427
+ metadata_columns=["difficulty"],
428
+ )
429
+
430
+
431
+ @responses.activate
432
+ def test_insert_from_pandas_validation_missing_expected_output_columns() -> None:
433
+ """Test validation when expected output columns are not found in DataFrame."""
434
+
435
+ df = pd.DataFrame(
436
+ {
437
+ "question": ["What is 2+2?", "What is 3+3?"],
438
+ "answer": ["4", "6"],
439
+ "difficulty": ["easy", "easy"],
440
+ }
441
+ )
442
+
443
+ with pytest.raises(
444
+ ValueError,
445
+ match=r"Expected output column\(s\) \{'missing_output'\} not found in DataFrame",
446
+ ):
447
+ dataset.insert_from_pandas(
448
+ df=df,
449
+ input_columns=["question"],
450
+ expected_output_columns=["answer", "missing_output"],
451
+ metadata_columns=["difficulty"],
452
+ )
453
+
454
+
455
+ @responses.activate
456
+ def test_insert_from_pandas_validation_missing_metadata_columns() -> None:
457
+ """Test validation when metadata columns are not found in DataFrame."""
458
+
459
+ df = pd.DataFrame(
460
+ {
461
+ "question": ["What is 2+2?", "What is 3+3?"],
462
+ "answer": ["4", "6"],
463
+ "difficulty": ["easy", "easy"],
464
+ }
465
+ )
466
+
467
+ with pytest.raises(
468
+ ValueError,
469
+ match=r"Metadata column\(s\) \{'missing_metadata'\} not found in DataFrame",
470
+ ):
471
+ dataset.insert_from_pandas(
472
+ df=df,
473
+ input_columns=["question"],
474
+ expected_output_columns=["answer"],
475
+ metadata_columns=["difficulty", "missing_metadata"],
476
+ )
477
+
478
+
479
+ @responses.activate
480
+ def test_insert_from_pandas_validation_missing_extras_columns() -> None:
481
+ """Test validation when extras columns are not found in DataFrame."""
482
+
483
+ df = pd.DataFrame(
484
+ {
485
+ "question": ["What is 2+2?", "What is 3+3?"],
486
+ "answer": ["4", "6"],
487
+ "difficulty": ["easy", "easy"],
488
+ }
489
+ )
490
+
491
+ with pytest.raises(
492
+ ValueError,
493
+ match=r"Extras column\(s\) \{'missing_extras'\} not found in DataFrame",
494
+ ):
495
+ dataset.insert_from_pandas(
496
+ df=df,
497
+ input_columns=["question"],
498
+ expected_output_columns=["answer"],
499
+ metadata_columns=["difficulty"],
500
+ extras_columns=["missing_extras"],
501
+ )
502
+
503
+
504
+ @responses.activate
505
+ def test_insert_from_pandas_validation_no_columns_specified() -> None:
506
+ """Test that validation passes when no specific columns are specified (auto-mapping)."""
507
+
508
+ df = pd.DataFrame(
509
+ {
510
+ "question": ["What is 2+2?", "What is 3+3?"],
511
+ "answer": ["4", "6"],
512
+ "difficulty": ["easy", "easy"],
513
+ "source_name": ["test", "test"],
514
+ "source_id": ["1", "2"],
515
+ }
516
+ )
517
+
518
+ # Mock item insertion
519
+ insert_response = INSERT_RESPONSE_SUCCESS.copy()
520
+ insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
521
+ responses.post(
522
+ url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
523
+ json=insert_response,
524
+ )
525
+
526
+ # Should not raise any validation errors when no specific columns are specified
527
+ item_ids = dataset.insert_from_pandas(df=df)
528
+
529
+ # Verify response
530
+ assert len(item_ids) == 2
531
+
532
+
533
+ @responses.activate
534
+ def test_insert_from_pandas_validation_empty_column_lists() -> None:
535
+ """Test that validation passes when empty column lists are provided."""
536
+
537
+ df = pd.DataFrame(
538
+ {
539
+ "question": ["What is 2+2?", "What is 3+3?"],
540
+ "answer": ["4", "6"],
541
+ "difficulty": ["easy", "easy"],
542
+ }
543
+ )
544
+
545
+ # Mock item insertion
546
+ insert_response = INSERT_RESPONSE_SUCCESS.copy()
547
+ insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
548
+ responses.post(
549
+ url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
550
+ json=insert_response,
551
+ )
552
+
553
+ # Should not raise any validation errors when empty lists are provided
554
+ item_ids = dataset.insert_from_pandas(
555
+ df=df,
556
+ input_columns=["question"],
557
+ expected_output_columns=[], # Empty list
558
+ metadata_columns=[], # Empty list
559
+ extras_columns=[], # Empty list
560
+ )
561
+
562
+ # Verify response
563
+ assert len(item_ids) == 2
564
+
565
+
407
566
  @responses.activate
408
567
  def test_insert_items_success_with_csv_file() -> None:
409
568
  """When inserting items from a csv file, the items are inserted successfully."""
@@ -490,3 +649,40 @@ def test_insert_items_with_empty_jsonl_file(tmp_path: Path) -> None:
490
649
  file_path=temp_file,
491
650
  input_keys=["Question"],
492
651
  )
652
+
653
+
654
+ @responses.activate
655
+ def test_insert_from_jsonl_file_validation_empty_input_keys(tmp_path: Path) -> None:
656
+ """Test validation when input_keys is empty."""
657
+
658
+ temp_file = tmp_path / "test.jsonl"
659
+ temp_file.write_text('{"question": "What is 2+2?"}\n')
660
+
661
+ with pytest.raises(ValueError, match="Input keys cannot be empty"):
662
+ dataset.insert_from_jsonl_file(
663
+ file_path=temp_file,
664
+ input_keys=[], # Empty input keys
665
+ )
666
+
667
+
668
+ @pytest.mark.parametrize(
669
+ "test_data,input_keys",
670
+ [
671
+ ({"question": None}, ["question"]),
672
+ ({"question": None, "context": None}, ["question", "context"]),
673
+ ({"difficulty": "easy"}, ["question"]),
674
+ ],
675
+ )
676
+ def test_insert_from_jsonl_file_validation(
677
+ tmp_path: Path, test_data: dict, input_keys: list
678
+ ) -> None:
679
+ """Test comprehensive validation for insert_from_jsonl_file."""
680
+
681
+ temp_file = tmp_path / "test.jsonl"
682
+ temp_file.write_text(json.dumps(test_data) + "\n")
683
+
684
+ with pytest.raises(ValueError, match="All inputs cannot be empty or empty strings"):
685
+ dataset.insert_from_jsonl_file(
686
+ file_path=temp_file,
687
+ input_keys=input_keys,
688
+ )