fiddler-evals 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. fiddler_evals-0.1.0/MANIFEST.in +2 -0
  2. fiddler_evals-0.1.0/PKG-INFO +341 -0
  3. fiddler_evals-0.1.0/PUBLIC.md +319 -0
  4. fiddler_evals-0.1.0/README.md +65 -0
  5. fiddler_evals-0.1.0/fiddler_evals/VERSION +1 -0
  6. fiddler_evals-0.1.0/fiddler_evals/__init__.py +71 -0
  7. fiddler_evals-0.1.0/fiddler_evals/configs.py +14 -0
  8. fiddler_evals-0.1.0/fiddler_evals/conftest.py +28 -0
  9. fiddler_evals-0.1.0/fiddler_evals/connection.py +451 -0
  10. fiddler_evals-0.1.0/fiddler_evals/constants.py +9 -0
  11. fiddler_evals-0.1.0/fiddler_evals/decorators.py +189 -0
  12. fiddler_evals-0.1.0/fiddler_evals/entities/__init__.py +0 -0
  13. fiddler_evals-0.1.0/fiddler_evals/entities/application.py +398 -0
  14. fiddler_evals-0.1.0/fiddler_evals/entities/base.py +58 -0
  15. fiddler_evals-0.1.0/fiddler_evals/entities/dataset.py +1230 -0
  16. fiddler_evals-0.1.0/fiddler_evals/entities/experiment.py +934 -0
  17. fiddler_evals-0.1.0/fiddler_evals/entities/project.py +362 -0
  18. fiddler_evals-0.1.0/fiddler_evals/entities/tests/__init__.py +0 -0
  19. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_application.py +340 -0
  20. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_dataset.py +602 -0
  21. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_dataset_items.py +492 -0
  22. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment.py +719 -0
  23. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment_items.py +495 -0
  24. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_experiment_results.py +330 -0
  25. fiddler_evals-0.1.0/fiddler_evals/entities/tests/test_project.py +270 -0
  26. fiddler_evals-0.1.0/fiddler_evals/evaluators/__init__.py +24 -0
  27. fiddler_evals-0.1.0/fiddler_evals/evaluators/answer_relevance.py +92 -0
  28. fiddler_evals-0.1.0/fiddler_evals/evaluators/base.py +141 -0
  29. fiddler_evals-0.1.0/fiddler_evals/evaluators/coherence.py +111 -0
  30. fiddler_evals-0.1.0/fiddler_evals/evaluators/conciseness.py +84 -0
  31. fiddler_evals-0.1.0/fiddler_evals/evaluators/eval_fn.py +214 -0
  32. fiddler_evals-0.1.0/fiddler_evals/evaluators/ftl_prompt_safety.py +113 -0
  33. fiddler_evals-0.1.0/fiddler_evals/evaluators/ftl_response_faithfulness.py +115 -0
  34. fiddler_evals-0.1.0/fiddler_evals/evaluators/regex.py +99 -0
  35. fiddler_evals-0.1.0/fiddler_evals/evaluators/sentiment.py +112 -0
  36. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/__init__.py +0 -0
  37. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_answer_relevance.py +237 -0
  38. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_coherence.py +369 -0
  39. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_conciseness.py +225 -0
  40. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_eval_fn.py +359 -0
  41. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +222 -0
  42. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +205 -0
  43. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_regex.py +116 -0
  44. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_sentiment.py +224 -0
  45. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_topic_classification.py +249 -0
  46. fiddler_evals-0.1.0/fiddler_evals/evaluators/tests/test_toxicity.py +201 -0
  47. fiddler_evals-0.1.0/fiddler_evals/evaluators/topic.py +127 -0
  48. fiddler_evals-0.1.0/fiddler_evals/evaluators/toxicity.py +101 -0
  49. fiddler_evals-0.1.0/fiddler_evals/exceptions.py +221 -0
  50. fiddler_evals-0.1.0/fiddler_evals/libs/__init__.py +0 -0
  51. fiddler_evals-0.1.0/fiddler_evals/libs/http_client.py +483 -0
  52. fiddler_evals-0.1.0/fiddler_evals/libs/json_encoder.py +25 -0
  53. fiddler_evals-0.1.0/fiddler_evals/libs/semver.py +614 -0
  54. fiddler_evals-0.1.0/fiddler_evals/libs/tests/__init__.py +0 -0
  55. fiddler_evals-0.1.0/fiddler_evals/libs/tests/test_json_encoder.py +27 -0
  56. fiddler_evals-0.1.0/fiddler_evals/libs/tests/test_request_client.py +715 -0
  57. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/__init__.py +4 -0
  58. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/application.py +24 -0
  59. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/base.py +9 -0
  60. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/compact.py +41 -0
  61. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/dataset.py +58 -0
  62. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/error.py +22 -0
  63. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/evaluator.py +18 -0
  64. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/experiment.py +89 -0
  65. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/filter_query.py +54 -0
  66. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/project.py +17 -0
  67. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/response.py +51 -0
  68. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/score.py +26 -0
  69. fiddler_evals-0.1.0/fiddler_evals/pydantic_models/server_info.py +20 -0
  70. fiddler_evals-0.1.0/fiddler_evals/runner/__init__.py +0 -0
  71. fiddler_evals-0.1.0/fiddler_evals/runner/evaluation.py +178 -0
  72. fiddler_evals-0.1.0/fiddler_evals/runner/executor.py +102 -0
  73. fiddler_evals-0.1.0/fiddler_evals/runner/experiment_result_publisher.py +97 -0
  74. fiddler_evals-0.1.0/fiddler_evals/runner/experiment_runner.py +640 -0
  75. fiddler_evals-0.1.0/fiddler_evals/runner/tests/__init__.py +0 -0
  76. fiddler_evals-0.1.0/fiddler_evals/runner/tests/test_evaluate.py +692 -0
  77. fiddler_evals-0.1.0/fiddler_evals/runner/tests/test_experiment_result_publisher.py +264 -0
  78. fiddler_evals-0.1.0/fiddler_evals/tests/__init__.py +0 -0
  79. fiddler_evals-0.1.0/fiddler_evals/tests/constants.py +40 -0
  80. fiddler_evals-0.1.0/fiddler_evals/tests/test_connection.py +224 -0
  81. fiddler_evals-0.1.0/fiddler_evals/tests/test_decorators.py +346 -0
  82. fiddler_evals-0.1.0/fiddler_evals/utils/__init__.py +0 -0
  83. fiddler_evals-0.1.0/fiddler_evals/utils/environment.py +46 -0
  84. fiddler_evals-0.1.0/fiddler_evals/utils/pd.py +9 -0
  85. fiddler_evals-0.1.0/fiddler_evals/utils/tests/__init__.py +0 -0
  86. fiddler_evals-0.1.0/fiddler_evals/utils/tests/test_environment.py +146 -0
  87. fiddler_evals-0.1.0/fiddler_evals/utils/tqdm.py +23 -0
  88. fiddler_evals-0.1.0/fiddler_evals/version.py +4 -0
  89. fiddler_evals-0.1.0/fiddler_evals.egg-info/PKG-INFO +341 -0
  90. fiddler_evals-0.1.0/fiddler_evals.egg-info/SOURCES.txt +93 -0
  91. fiddler_evals-0.1.0/fiddler_evals.egg-info/dependency_links.txt +1 -0
  92. fiddler_evals-0.1.0/fiddler_evals.egg-info/requires.txt +10 -0
  93. fiddler_evals-0.1.0/fiddler_evals.egg-info/top_level.txt +1 -0
  94. fiddler_evals-0.1.0/pyproject.toml +59 -0
  95. fiddler_evals-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,2 @@
1
+ include PUBLIC.md
2
+ include fiddler_evals/VERSION
@@ -0,0 +1,341 @@
1
+ Metadata-Version: 2.4
2
+ Name: fiddler-evals
3
+ Version: 0.1.0
4
+ Summary: Python SDK for evaluating LLM Applications
5
+ Author-email: Fiddler AI <support@fiddler.ai>
6
+ Project-URL: Homepage, https://fiddler.ai
7
+ Project-URL: Repository, https://github.com/fiddler-labs/fiddler-evals-sdk
8
+ Project-URL: Documentation, https://docs.fiddler.ai/
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: pip>=21.0
14
+ Requires-Dist: requests<3
15
+ Requires-Dist: pydantic>=2.0.0
16
+ Requires-Dist: tqdm
17
+ Requires-Dist: typing-extensions<5,>=4.6.0
18
+ Requires-Dist: pandas>=1.2.5
19
+ Requires-Dist: python-decouple
20
+ Provides-Extra: pandas
21
+ Requires-Dist: pandas>=1.2.5; extra == "pandas"
22
+
23
+ # Fiddler Evals SDK
24
+
25
+ A comprehensive toolkit for evaluating Large Language Model (LLM) applications, RAG systems, and AI agents. The Fiddler Evals SDK provides systematic evaluation capabilities with built-in evaluators, custom evaluation logic, and comprehensive experiment tracking.
26
+
27
+ ## Key Features
28
+
29
+ - **🧪 Systematic Evaluation**: Run structured experiments on your AI applications
30
+ - **📊 Built-in Evaluators**: Production-ready evaluators for common AI tasks
31
+ - **🔧 Custom Evaluators**: Build evaluation logic for your specific use cases
32
+ - **📈 Result Tracking**: Comprehensive experiment tracking and analysis
33
+ - **🚀 Scale**: Evaluate across large datasets with concurrent processing
34
+ - **📁 Multiple Data Sources**: Import test cases from CSV, JSONL, and pandas DataFrames
35
+
36
+
37
+ ## Requirements
38
+
39
+ - Python 3.10 or higher
40
+ - Access to a Fiddler Platform instance
41
+ - API token from Fiddler Platform
42
+
43
+
44
+ ## Installation
45
+
46
+ ```bash
47
+ pip install fiddler-evals
48
+ ```
49
+
50
+ For pre-release versions:
51
+ ```bash
52
+ pip install --upgrade --pre fiddler-evals
53
+ ```
54
+
55
+ ## Quick Start
56
+
57
+ ### 1. Connect to Fiddler
58
+
59
+ ```python
60
+ from fiddler_evals import init
61
+
62
+ # Initialize connection
63
+ init(url='https://your-org.fiddler.ai', token='your-api-token')
64
+ ```
65
+
66
+ ### 2. Create Project Structure
67
+
68
+ ```python
69
+ from fiddler_evals import Project, Application, Dataset
70
+
71
+ # Create project and application
72
+ project = Project.get_or_create(name='my-eval-project')
73
+ app = Application.get_or_create(
74
+ name='my-llm-app',
75
+ project_id=project.id
76
+ )
77
+
78
+ # Create dataset
79
+ dataset = Dataset.create(
80
+ name='evaluation-dataset',
81
+ application_id=app.id,
82
+ description='Test cases for LLM evaluation'
83
+ )
84
+ ```
85
+
86
+ ### 3. Add Test Cases
87
+
88
+ ```python
89
+ from fiddler_evals.pydantic_models.dataset import NewDatasetItem
90
+
91
+
92
+ test_cases = [
93
+ NewDatasetItem(
94
+ inputs={"question": "What happens to you if you eat watermelon seeds?"},
95
+ expected_outputs={"answer": "The watermelon seeds pass through your digestive system"},
96
+ metadata={"type": "Adversarial", "category": "Misconceptions"},
97
+ )
98
+ ]
99
+ dataset.insert(test_cases)
100
+ ```
101
+
102
+ ### 4. Use Built-in Evaluators
103
+
104
+ ```python
105
+ from fiddler_evals.evaluators import (
106
+ AnswerRelevance, Coherence, Conciseness,
107
+ Toxicity, Sentiment, RegexSearch
108
+ )
109
+
110
+ # Test individual evaluators
111
+ relevance_evaluator = AnswerRelevance()
112
+ score = relevance_evaluator.score(
113
+ prompt="What is the capital of France?",
114
+ response="Paris is the capital of France."
115
+ )
116
+ print(f"Score: {score.value} - {score.reasoning}")
117
+ ```
118
+
119
+ ### 5. Create Custom Evaluators
120
+
121
+ ```python
122
+ from fiddler_evals.evaluators.base import Evaluator
123
+ from fiddler_evals.pydantic_models.score import Score
124
+
125
+ class PolitenessEvaluator(Evaluator):
126
+ """
127
+ Simple evaluator that checks if a response contains polite language.
128
+ Useful for customer service or chatbot applications.
129
+ """
130
+
131
+ def __init__(self):
132
+ super().__init__()
133
+ self.polite_words = [
134
+ 'please', 'thank you', 'thanks', 'sorry', 'apologize',
135
+ 'appreciate', 'welcome', 'help', 'assist', 'glad'
136
+ ]
137
+
138
+ def score(self, output: str) -> Score:
139
+ """Score based on presence of polite language."""
140
+ output_lower = output.lower()
141
+
142
+ # Count polite words
143
+ polite_count = sum(1 for word in self.polite_words if word in output_lower)
144
+
145
+ # Simple scoring: 1.0 if any polite words found, 0.0 otherwise
146
+ if polite_count > 0:
147
+ score_value = 1.0
148
+ reasoning = f"Contains {polite_count} polite word(s)"
149
+ else:
150
+ score_value = 0.0
151
+ reasoning = "No polite language detected"
152
+
153
+ return Score(
154
+ name="politeness",
155
+ evaluator_name=self.name,
156
+ value=score_value,
157
+ reasoning=reasoning
158
+ )
159
+
160
+ # Test the evaluator
161
+ politeness_evaluator = PolitenessEvaluator()
162
+
163
+ polite_response = "Thank you for your question! I'd be happy to help you with that."
164
+ impolite_response = "I don't know. Figure it out yourself."
165
+
166
+ print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
167
+ print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
168
+ ```
169
+
170
+ ### 5.1. Function-Based Evaluators
171
+
172
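+ You can also use simple functions as evaluators instead of creating full evaluator classes. Functions are automatically wrapped with `EvalFn` internally. The example below relies on `evaluate` and `my_llm_task`, which are introduced in step 6 (Run Experiments):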
173
+
174
+ ```python
175
+ def word_count_evaluator(output: str) -> float:
176
+ """Simple function that returns word count as a score."""
177
+ word_count = len(output.split())
178
+ # Normalize to 0-1 scale (assuming 0-50 words is reasonable)
179
+ return min(word_count / 50.0, 1.0)
180
+
181
+ def contains_number_evaluator(output: str) -> float:
182
+ """Check if response contains any numbers."""
183
+ import re
184
+ return 1.0 if re.search(r'\d+', output) else 0.0
185
+
186
+ # Use functions directly in evaluators list
187
+ evaluators = [
188
+ AnswerRelevance(),
189
+ Conciseness(),
190
+ word_count_evaluator, # Function evaluator
191
+ contains_number_evaluator, # Function evaluator
192
+ ]
193
+
194
+ # The evaluate() function automatically wraps these with EvalFn
195
+ experiment_result = evaluate(
196
+ dataset=dataset,
197
+ task=my_llm_task,
198
+ evaluators=evaluators,
199
+ score_fn_kwargs_mapping={
200
+ "output": "answer", # Maps to function parameter
201
+ "response": "answer", # Maps to class evaluator parameter
202
+ }
203
+ )
204
+ ```
205
+
206
+ ### 6. Run Experiments
207
+
208
+ ```python
209
+ from fiddler_evals import evaluate
210
+
211
+ # Define your AI application task
212
+ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
213
+ question = inputs.get("question", "")
214
+ # Your LLM API call here
215
+ answer = call_your_llm(question)
216
+ return {"answer": answer}
217
+
218
+ # Set up evaluators
219
+ evaluators = [
220
+ AnswerRelevance(),
221
+ Conciseness(),
222
+ Sentiment(),
223
+ PolitenessEvaluator(),
224
+ ]
225
+
226
+ # Run evaluation
227
+ experiment_result = evaluate(
228
+ dataset=dataset,
229
+ task=my_llm_task,
230
+ evaluators=evaluators,
231
+ name_prefix="my_evaluation",
232
+ description="Comprehensive LLM evaluation",
233
+ score_fn_kwargs_mapping={
234
+ "question": "question",
235
+ "response": "answer",
236
+ "output": "answer",
237
+ "text": "answer",
238
+ "prompt": lambda x: x["inputs"]["question"],
239
+ }
240
+ )
241
+
242
+ print(f"Evaluated {len(experiment_result.results)} test cases")
243
+ print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
244
+ ```
245
+
246
+ ## Built-in Evaluators
247
+
248
+ | Evaluator | Purpose | Key Parameters |
249
+ |-----------|---------|----------------|
250
+ | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
251
+ | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
252
+ | `Conciseness` | Measures response brevity and clarity | `response` |
253
+ | `Toxicity` | Detects harmful or toxic content | `text` |
254
+ | `Sentiment` | Analyzes emotional tone | `text` |
255
+ | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
256
+ | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
257
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
258
+
259
+ ## Data Import Options
260
+
261
+ ### CSV Files
262
+ ```python
263
+ dataset.insert_from_csv_file(
264
+ file_path='data.csv',
265
+ input_columns=['question'],
266
+ expected_output_columns=['answer'],
267
+ metadata_columns=['category']
268
+ )
269
+ ```
270
+
271
+ ### JSONL Files
272
+ ```python
273
+ dataset.insert_from_jsonl_file(
274
+ file_path='data.jsonl',
275
+ input_keys=['question'],
276
+ expected_output_keys=['answer'],
277
+ metadata_keys=['category']
278
+ )
279
+ ```
280
+
281
+ ### Pandas DataFrames
282
+ ```python
283
+ dataset.insert_from_pandas(
284
+ df=df,
285
+ input_columns=['question'],
286
+ expected_output_columns=['answer'],
287
+ metadata_columns=['category']
288
+ )
289
+ ```
290
+
291
+ ## Advanced Usage
292
+
293
+ ### Concurrent Processing
294
+ ```python
295
+ experiment_result = evaluate(
296
+ dataset=dataset,
297
+ task=my_llm_task,
298
+ evaluators=evaluators,
299
+ max_workers=4 # Process 4 test cases concurrently
300
+ )
301
+ ```
302
+
303
+ ### Custom Score Mapping
304
+
305
+ The `score_fn_kwargs_mapping` parameter is essential for connecting your task outputs to evaluator inputs. Different evaluators expect different parameter names, but your task function returns outputs with specific keys.
306
+
307
+ ```python
308
+ # Your task returns:
309
+ {"answer": "Paris is the capital of France"}
310
+
311
+ # But evaluators expect different parameter names:
312
+ AnswerRelevance.score(prompt="...", response="...") # Needs 'prompt' and 'response'
313
+ Conciseness.score(response="...") # Needs 'response'
314
+ Sentiment.score(text="...") # Needs 'text'
315
+ ```
316
+
317
+ **The Solution**: Map each evaluator parameter name to the task-output key that supplies its value, or to a callable that extracts the value:
318
+
319
+ ```python
320
+ score_fn_kwargs_mapping={
321
+ "question": "question", # Map 'question' parameter to 'question' key
322
+ "response": "answer", # Map 'response' parameter to 'answer' key
323
+ "text": "answer", # Map 'text' parameter to 'answer' key
324
+ "prompt": lambda x: x["inputs"]["question"], # Map 'prompt' to input question
325
+ "context": lambda x: x["extras"]["context"] # Map 'context' to extras
326
+ }
327
+ ```
328
+
329
+ ### Experiment Metadata
330
+ ```python
331
+ experiment_result = evaluate(
332
+ dataset=dataset,
333
+ task=my_llm_task,
334
+ evaluators=evaluators,
335
+ metadata={
336
+ "model_version": "gpt-4",
337
+ "evaluation_date": "2024-01-15",
338
+ "temperature": 0.7
339
+ }
340
+ )
341
+ ```
@@ -0,0 +1,319 @@
1
+ # Fiddler Evals SDK
2
+
3
+ A comprehensive toolkit for evaluating Large Language Model (LLM) applications, RAG systems, and AI agents. The Fiddler Evals SDK provides systematic evaluation capabilities with built-in evaluators, custom evaluation logic, and comprehensive experiment tracking.
4
+
5
+ ## Key Features
6
+
7
+ - **🧪 Systematic Evaluation**: Run structured experiments on your AI applications
8
+ - **📊 Built-in Evaluators**: Production-ready evaluators for common AI tasks
9
+ - **🔧 Custom Evaluators**: Build evaluation logic for your specific use cases
10
+ - **📈 Result Tracking**: Comprehensive experiment tracking and analysis
11
+ - **🚀 Scale**: Evaluate across large datasets with concurrent processing
12
+ - **📁 Multiple Data Sources**: Import test cases from CSV, JSONL, and pandas DataFrames
13
+
14
+
15
+ ## Requirements
16
+
17
+ - Python 3.10 or higher
18
+ - Access to a Fiddler Platform instance
19
+ - API token from Fiddler Platform
20
+
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install fiddler-evals
26
+ ```
27
+
28
+ For pre-release versions:
29
+ ```bash
30
+ pip install --upgrade --pre fiddler-evals
31
+ ```
32
+
33
+ ## Quick Start
34
+
35
+ ### 1. Connect to Fiddler
36
+
37
+ ```python
38
+ from fiddler_evals import init
39
+
40
+ # Initialize connection
41
+ init(url='https://your-org.fiddler.ai', token='your-api-token')
42
+ ```
43
+
44
+ ### 2. Create Project Structure
45
+
46
+ ```python
47
+ from fiddler_evals import Project, Application, Dataset
48
+
49
+ # Create project and application
50
+ project = Project.get_or_create(name='my-eval-project')
51
+ app = Application.get_or_create(
52
+ name='my-llm-app',
53
+ project_id=project.id
54
+ )
55
+
56
+ # Create dataset
57
+ dataset = Dataset.create(
58
+ name='evaluation-dataset',
59
+ application_id=app.id,
60
+ description='Test cases for LLM evaluation'
61
+ )
62
+ ```
63
+
64
+ ### 3. Add Test Cases
65
+
66
+ ```python
67
+ from fiddler_evals.pydantic_models.dataset import NewDatasetItem
68
+
69
+
70
+ test_cases = [
71
+ NewDatasetItem(
72
+ inputs={"question": "What happens to you if you eat watermelon seeds?"},
73
+ expected_outputs={"answer": "The watermelon seeds pass through your digestive system"},
74
+ metadata={"type": "Adversarial", "category": "Misconceptions"},
75
+ )
76
+ ]
77
+ dataset.insert(test_cases)
78
+ ```
79
+
80
+ ### 4. Use Built-in Evaluators
81
+
82
+ ```python
83
+ from fiddler_evals.evaluators import (
84
+ AnswerRelevance, Coherence, Conciseness,
85
+ Toxicity, Sentiment, RegexSearch
86
+ )
87
+
88
+ # Test individual evaluators
89
+ relevance_evaluator = AnswerRelevance()
90
+ score = relevance_evaluator.score(
91
+ prompt="What is the capital of France?",
92
+ response="Paris is the capital of France."
93
+ )
94
+ print(f"Score: {score.value} - {score.reasoning}")
95
+ ```
96
+
97
+ ### 5. Create Custom Evaluators
98
+
99
+ ```python
100
+ from fiddler_evals.evaluators.base import Evaluator
101
+ from fiddler_evals.pydantic_models.score import Score
102
+
103
+ class PolitenessEvaluator(Evaluator):
104
+ """
105
+ Simple evaluator that checks if a response contains polite language.
106
+ Useful for customer service or chatbot applications.
107
+ """
108
+
109
+ def __init__(self):
110
+ super().__init__()
111
+ self.polite_words = [
112
+ 'please', 'thank you', 'thanks', 'sorry', 'apologize',
113
+ 'appreciate', 'welcome', 'help', 'assist', 'glad'
114
+ ]
115
+
116
+ def score(self, output: str) -> Score:
117
+ """Score based on presence of polite language."""
118
+ output_lower = output.lower()
119
+
120
+ # Count polite words
121
+ polite_count = sum(1 for word in self.polite_words if word in output_lower)
122
+
123
+ # Simple scoring: 1.0 if any polite words found, 0.0 otherwise
124
+ if polite_count > 0:
125
+ score_value = 1.0
126
+ reasoning = f"Contains {polite_count} polite word(s)"
127
+ else:
128
+ score_value = 0.0
129
+ reasoning = "No polite language detected"
130
+
131
+ return Score(
132
+ name="politeness",
133
+ evaluator_name=self.name,
134
+ value=score_value,
135
+ reasoning=reasoning
136
+ )
137
+
138
+ # Test the evaluator
139
+ politeness_evaluator = PolitenessEvaluator()
140
+
141
+ polite_response = "Thank you for your question! I'd be happy to help you with that."
142
+ impolite_response = "I don't know. Figure it out yourself."
143
+
144
+ print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
145
+ print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
146
+ ```
147
+
148
+ ### 5.1. Function-Based Evaluators
149
+
150
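+ You can also use simple functions as evaluators instead of creating full evaluator classes. Functions are automatically wrapped with `EvalFn` internally. The example below relies on `evaluate` and `my_llm_task`, which are introduced in step 6 (Run Experiments):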
151
+
152
+ ```python
153
+ def word_count_evaluator(output: str) -> float:
154
+ """Simple function that returns word count as a score."""
155
+ word_count = len(output.split())
156
+ # Normalize to 0-1 scale (assuming 0-50 words is reasonable)
157
+ return min(word_count / 50.0, 1.0)
158
+
159
+ def contains_number_evaluator(output: str) -> float:
160
+ """Check if response contains any numbers."""
161
+ import re
162
+ return 1.0 if re.search(r'\d+', output) else 0.0
163
+
164
+ # Use functions directly in evaluators list
165
+ evaluators = [
166
+ AnswerRelevance(),
167
+ Conciseness(),
168
+ word_count_evaluator, # Function evaluator
169
+ contains_number_evaluator, # Function evaluator
170
+ ]
171
+
172
+ # The evaluate() function automatically wraps these with EvalFn
173
+ experiment_result = evaluate(
174
+ dataset=dataset,
175
+ task=my_llm_task,
176
+ evaluators=evaluators,
177
+ score_fn_kwargs_mapping={
178
+ "output": "answer", # Maps to function parameter
179
+ "response": "answer", # Maps to class evaluator parameter
180
+ }
181
+ )
182
+ ```
183
+
184
+ ### 6. Run Experiments
185
+
186
+ ```python
187
+ from fiddler_evals import evaluate
188
+
189
+ # Define your AI application task
190
+ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
191
+ question = inputs.get("question", "")
192
+ # Your LLM API call here
193
+ answer = call_your_llm(question)
194
+ return {"answer": answer}
195
+
196
+ # Set up evaluators
197
+ evaluators = [
198
+ AnswerRelevance(),
199
+ Conciseness(),
200
+ Sentiment(),
201
+ PolitenessEvaluator(),
202
+ ]
203
+
204
+ # Run evaluation
205
+ experiment_result = evaluate(
206
+ dataset=dataset,
207
+ task=my_llm_task,
208
+ evaluators=evaluators,
209
+ name_prefix="my_evaluation",
210
+ description="Comprehensive LLM evaluation",
211
+ score_fn_kwargs_mapping={
212
+ "question": "question",
213
+ "response": "answer",
214
+ "output": "answer",
215
+ "text": "answer",
216
+ "prompt": lambda x: x["inputs"]["question"],
217
+ }
218
+ )
219
+
220
+ print(f"Evaluated {len(experiment_result.results)} test cases")
221
+ print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
222
+ ```
223
+
224
+ ## Built-in Evaluators
225
+
226
+ | Evaluator | Purpose | Key Parameters |
227
+ |-----------|---------|----------------|
228
+ | `AnswerRelevance` | Checks if response addresses the question | `prompt`, `response` |
229
+ | `Coherence` | Evaluates logical flow and consistency | `response`, `prompt` |
230
+ | `Conciseness` | Measures response brevity and clarity | `response` |
231
+ | `Toxicity` | Detects harmful or toxic content | `text` |
232
+ | `Sentiment` | Analyzes emotional tone | `text` |
233
+ | `RegexSearch` | Pattern matching for specific formats | `output`, `pattern` |
234
+ | `FTLPromptSafety` | Compute safety scores for prompts | `text` |
235
+ | `FTLResponseFaithfulness` | Evaluate faithfulness of LLM responses | `response`, `context` |
236
+
237
+ ## Data Import Options
238
+
239
+ ### CSV Files
240
+ ```python
241
+ dataset.insert_from_csv_file(
242
+ file_path='data.csv',
243
+ input_columns=['question'],
244
+ expected_output_columns=['answer'],
245
+ metadata_columns=['category']
246
+ )
247
+ ```
248
+
249
+ ### JSONL Files
250
+ ```python
251
+ dataset.insert_from_jsonl_file(
252
+ file_path='data.jsonl',
253
+ input_keys=['question'],
254
+ expected_output_keys=['answer'],
255
+ metadata_keys=['category']
256
+ )
257
+ ```
258
+
259
+ ### Pandas DataFrames
260
+ ```python
261
+ dataset.insert_from_pandas(
262
+ df=df,
263
+ input_columns=['question'],
264
+ expected_output_columns=['answer'],
265
+ metadata_columns=['category']
266
+ )
267
+ ```
268
+
269
+ ## Advanced Usage
270
+
271
+ ### Concurrent Processing
272
+ ```python
273
+ experiment_result = evaluate(
274
+ dataset=dataset,
275
+ task=my_llm_task,
276
+ evaluators=evaluators,
277
+ max_workers=4 # Process 4 test cases concurrently
278
+ )
279
+ ```
280
+
281
+ ### Custom Score Mapping
282
+
283
+ The `score_fn_kwargs_mapping` parameter is essential for connecting your task outputs to evaluator inputs. Different evaluators expect different parameter names, but your task function returns outputs with specific keys.
284
+
285
+ ```python
286
+ # Your task returns:
287
+ {"answer": "Paris is the capital of France"}
288
+
289
+ # But evaluators expect different parameter names:
290
+ AnswerRelevance.score(prompt="...", response="...") # Needs 'prompt' and 'response'
291
+ Conciseness.score(response="...") # Needs 'response'
292
+ Sentiment.score(text="...") # Needs 'text'
293
+ ```
294
+
295
+ **The Solution**: Map each evaluator parameter name to the task-output key that supplies its value, or to a callable that extracts the value:
296
+
297
+ ```python
298
+ score_fn_kwargs_mapping={
299
+ "question": "question", # Map 'question' parameter to 'question' key
300
+ "response": "answer", # Map 'response' parameter to 'answer' key
301
+ "text": "answer", # Map 'text' parameter to 'answer' key
302
+ "prompt": lambda x: x["inputs"]["question"], # Map 'prompt' to input question
303
+ "context": lambda x: x["extras"]["context"] # Map 'context' to extras
304
+ }
305
+ ```
306
+
307
+ ### Experiment Metadata
308
+ ```python
309
+ experiment_result = evaluate(
310
+ dataset=dataset,
311
+ task=my_llm_task,
312
+ evaluators=evaluators,
313
+ metadata={
314
+ "model_version": "gpt-4",
315
+ "evaluation_date": "2024-01-15",
316
+ "temperature": 0.7
317
+ }
318
+ )
319
+ ```