fiddler-evals 0.1.1.dev12__tar.gz → 0.1.1.dev14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fiddler_evals-0.1.1.dev12/fiddler_evals.egg-info → fiddler_evals-0.1.1.dev14}/PKG-INFO +122 -12
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/PUBLIC.md +121 -11
- fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION +1 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/dataset.py +39 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/experiment.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_dataset_items.py +196 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment_results.py +48 -13
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/answer_relevance.py +1 -1
- fiddler_evals-0.1.1.dev14/fiddler_evals/evaluators/base.py +245 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/coherence.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/conciseness.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/eval_fn.py +19 -9
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/ftl_prompt_safety.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/ftl_response_faithfulness.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/regex.py +7 -10
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/sentiment.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_answer_relevance.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_coherence.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_conciseness.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_eval_fn.py +52 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_ftl_prompt_safety.py +6 -6
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_ftl_response_faithfulness.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_regex.py +39 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_sentiment.py +6 -6
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_topic_classification.py +6 -6
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/test_toxicity.py +6 -6
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/topic.py +5 -3
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/toxicity.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/experiment.py +2 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/evaluation.py +43 -8
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/experiment_result_publisher.py +1 -1
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/experiment_runner.py +73 -50
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/test_evaluate.py +282 -2
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/test_experiment_result_publisher.py +19 -2
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14/fiddler_evals.egg-info}/PKG-INFO +122 -12
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/pyproject.toml +1 -1
- fiddler_evals-0.1.1.dev12/fiddler_evals/VERSION +0 -1
- fiddler_evals-0.1.1.dev12/fiddler_evals/evaluators/base.py +0 -141
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/MANIFEST.in +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/README.md +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/configs.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/conftest.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/connection.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/constants.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/decorators.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/application.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/base.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/project.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_application.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_dataset.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_experiment_items.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_project.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/evaluators/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/exceptions.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/http_client.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/semver.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/test_json_encoder.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/libs/tests/test_request_client.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/application.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/base.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/compact.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/dataset.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/error.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/evaluator.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/filter_query.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/project.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/response.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/score.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/pydantic_models/server_info.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/executor.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/runner/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/constants.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/test_connection.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/tests/test_decorators.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/environment.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/pd.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tests/__init__.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tests/test_environment.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/utils/tqdm.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/version.py +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/SOURCES.txt +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/dependency_links.txt +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/requires.txt +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals.egg-info/top_level.txt +0 -0
- {fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fiddler-evals
|
|
3
|
-
Version: 0.1.1.dev12
|
|
3
|
+
Version: 0.1.1.dev14
|
|
4
4
|
Summary: Python SDK for evaluating LLM Applications
|
|
5
5
|
Author-email: Fiddler AI <support@fiddler.ai>
|
|
6
6
|
Maintainer-email: Fiddler AI <support@fiddler.ai>
|
|
@@ -128,8 +128,11 @@ class PolitenessEvaluator(Evaluator):
|
|
|
128
128
|
Useful for customer service or chatbot applications.
|
|
129
129
|
"""
|
|
130
130
|
|
|
131
|
-
def __init__(self):
|
|
132
|
-
super().__init__(
|
|
131
|
+
def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
|
|
132
|
+
super().__init__(
|
|
133
|
+
score_name_prefix=score_name_prefix,
|
|
134
|
+
score_fn_kwargs_mapping=score_fn_kwargs_mapping
|
|
135
|
+
)
|
|
133
136
|
self.polite_words = [
|
|
134
137
|
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
135
138
|
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
@@ -151,13 +154,13 @@ class PolitenessEvaluator(Evaluator):
|
|
|
151
154
|
reasoning = "No polite language detected"
|
|
152
155
|
|
|
153
156
|
return Score(
|
|
154
|
-
name="politeness",
|
|
157
|
+
name=f"{self.score_name_prefix}politeness",
|
|
155
158
|
evaluator_name=self.name,
|
|
156
159
|
value=score_value,
|
|
157
160
|
reasoning=reasoning
|
|
158
161
|
)
|
|
159
162
|
|
|
160
|
-
# Test the evaluator
|
|
163
|
+
# Test the evaluator with different configurations
|
|
161
164
|
politeness_evaluator = PolitenessEvaluator()
|
|
162
165
|
|
|
163
166
|
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
@@ -165,6 +168,17 @@ impolite_response = "I don't know. Figure it out yourself."
|
|
|
165
168
|
|
|
166
169
|
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
167
170
|
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
171
|
+
|
|
172
|
+
# Use with different configurations
|
|
173
|
+
customer_service_evaluator = PolitenessEvaluator(
|
|
174
|
+
score_name_prefix="customer_service",
|
|
175
|
+
score_fn_kwargs_mapping={"output": "response"}
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
support_evaluator = PolitenessEvaluator(
|
|
179
|
+
score_name_prefix="support",
|
|
180
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
181
|
+
)
|
|
168
182
|
```
|
|
169
183
|
|
|
170
184
|
### 5.1. Function-Based Evaluators
|
|
@@ -215,12 +229,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
215
229
|
answer = call_your_llm(question)
|
|
216
230
|
return {"answer": answer}
|
|
217
231
|
|
|
218
|
-
# Set up evaluators
|
|
232
|
+
# Set up evaluators with different configurations
|
|
219
233
|
evaluators = [
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
234
|
+
# Primary evaluation metrics
|
|
235
|
+
AnswerRelevance(score_name_prefix="primary"),
|
|
236
|
+
Conciseness(score_name_prefix="primary"),
|
|
237
|
+
Sentiment(score_name_prefix="primary"),
|
|
238
|
+
|
|
239
|
+
# Custom evaluators with specific mappings
|
|
240
|
+
PolitenessEvaluator(
|
|
241
|
+
score_name_prefix="quality",
|
|
242
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
243
|
+
),
|
|
244
|
+
|
|
245
|
+
# Multiple instances of same evaluator for different fields
|
|
246
|
+
RegexSearch(
|
|
247
|
+
pattern=r"\d+",
|
|
248
|
+
score_name_prefix="validation",
|
|
249
|
+
score_name="has_number",
|
|
250
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
251
|
+
),
|
|
252
|
+
RegexSearch(
|
|
253
|
+
pattern=r"\d+",
|
|
254
|
+
score_name_prefix="validation",
|
|
255
|
+
score_name="has_number",
|
|
256
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
257
|
+
),
|
|
224
258
|
]
|
|
225
259
|
|
|
226
260
|
# Run evaluation
|
|
@@ -231,9 +265,8 @@ experiment_result = evaluate(
|
|
|
231
265
|
name_prefix="my_evaluation",
|
|
232
266
|
description="Comprehensive LLM evaluation",
|
|
233
267
|
score_fn_kwargs_mapping={
|
|
234
|
-
"question": "question",
|
|
268
|
+
"question": lambda x: x["inputs"]["question"],
|
|
235
269
|
"response": "answer",
|
|
236
|
-
"output": "answer",
|
|
237
270
|
"text": "answer",
|
|
238
271
|
"prompt": lambda x: x["inputs"]["question"],
|
|
239
272
|
}
|
|
@@ -241,6 +274,10 @@ experiment_result = evaluate(
|
|
|
241
274
|
|
|
242
275
|
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
243
276
|
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
277
|
+
|
|
278
|
+
# Results in organized score names:
|
|
279
|
+
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
280
|
+
# "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
|
|
244
281
|
```
|
|
245
282
|
|
|
246
283
|
## Built-in Evaluators
|
|
@@ -326,6 +363,79 @@ score_fn_kwargs_mapping={
|
|
|
326
363
|
}
|
|
327
364
|
```
|
|
328
365
|
|
|
366
|
+
### Multiple Evaluator Instances with Different Mappings
|
|
367
|
+
|
|
368
|
+
You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
|
|
369
|
+
|
|
370
|
+
```python
|
|
371
|
+
from fiddler_evals.evaluators import RegexSearch
|
|
372
|
+
|
|
373
|
+
# Create multiple RegexSearch evaluators for different fields
|
|
374
|
+
evaluators = [
|
|
375
|
+
# Check for numbers in the question
|
|
376
|
+
RegexSearch(
|
|
377
|
+
pattern=r"\d+",
|
|
378
|
+
score_name_prefix="question",
|
|
379
|
+
score_name="has_number",
|
|
380
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
381
|
+
),
|
|
382
|
+
# Check for numbers in the answer
|
|
383
|
+
RegexSearch(
|
|
384
|
+
pattern=r"\d+",
|
|
385
|
+
score_name_prefix="answer",
|
|
386
|
+
score_name="has_number",
|
|
387
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
388
|
+
),
|
|
389
|
+
# Check for capital letters in the answer
|
|
390
|
+
RegexSearch(
|
|
391
|
+
pattern=r"[A-Z]",
|
|
392
|
+
score_name_prefix="answer",
|
|
393
|
+
score_name="has_caps",
|
|
394
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
395
|
+
)
|
|
396
|
+
]
|
|
397
|
+
|
|
398
|
+
# Run evaluation
|
|
399
|
+
experiment_result = evaluate(
|
|
400
|
+
dataset=dataset,
|
|
401
|
+
task=my_llm_task,
|
|
402
|
+
evaluators=evaluators,
|
|
403
|
+
score_fn_kwargs_mapping={
|
|
404
|
+
"question": lambda x: x["inputs"]["question"]
|
|
405
|
+
}
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
# Results in scores named:
|
|
409
|
+
# "question_has_number", "answer_has_number", "answer_has_caps"
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
### Parameter Mapping Priority
|
|
413
|
+
|
|
414
|
+
When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
|
|
415
|
+
|
|
416
|
+
```python
|
|
417
|
+
# Evaluator-level mapping (higher priority)
|
|
418
|
+
evaluator = RegexSearch(
|
|
419
|
+
pattern=r"\d+",
|
|
420
|
+
score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
|
|
421
|
+
)
|
|
422
|
+
|
|
423
|
+
# Evaluation-level mapping (lower priority)
|
|
424
|
+
experiment_result = evaluate(
|
|
425
|
+
dataset=dataset,
|
|
426
|
+
task=my_llm_task,
|
|
427
|
+
evaluators=[evaluator],
|
|
428
|
+
score_fn_kwargs_mapping={
|
|
429
|
+
"output": "question" # This is ignored due to evaluator-level mapping
|
|
430
|
+
}
|
|
431
|
+
)
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
**Mapping Priority (highest to lowest):**
|
|
435
|
+
1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
|
|
436
|
+
2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
|
|
437
|
+
3. Default parameter resolution
|
|
438
|
+
|
|
329
439
|
### Experiment Metadata
|
|
330
440
|
```python
|
|
331
441
|
experiment_result = evaluate(
|
|
@@ -106,8 +106,11 @@ class PolitenessEvaluator(Evaluator):
|
|
|
106
106
|
Useful for customer service or chatbot applications.
|
|
107
107
|
"""
|
|
108
108
|
|
|
109
|
-
def __init__(self):
|
|
110
|
-
super().__init__(
|
|
109
|
+
def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
|
|
110
|
+
super().__init__(
|
|
111
|
+
score_name_prefix=score_name_prefix,
|
|
112
|
+
score_fn_kwargs_mapping=score_fn_kwargs_mapping
|
|
113
|
+
)
|
|
111
114
|
self.polite_words = [
|
|
112
115
|
'please', 'thank you', 'thanks', 'sorry', 'apologize',
|
|
113
116
|
'appreciate', 'welcome', 'help', 'assist', 'glad'
|
|
@@ -129,13 +132,13 @@ class PolitenessEvaluator(Evaluator):
|
|
|
129
132
|
reasoning = "No polite language detected"
|
|
130
133
|
|
|
131
134
|
return Score(
|
|
132
|
-
name="politeness",
|
|
135
|
+
name=f"{self.score_name_prefix}politeness",
|
|
133
136
|
evaluator_name=self.name,
|
|
134
137
|
value=score_value,
|
|
135
138
|
reasoning=reasoning
|
|
136
139
|
)
|
|
137
140
|
|
|
138
|
-
# Test the evaluator
|
|
141
|
+
# Test the evaluator with different configurations
|
|
139
142
|
politeness_evaluator = PolitenessEvaluator()
|
|
140
143
|
|
|
141
144
|
polite_response = "Thank you for your question! I'd be happy to help you with that."
|
|
@@ -143,6 +146,17 @@ impolite_response = "I don't know. Figure it out yourself."
|
|
|
143
146
|
|
|
144
147
|
print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
|
|
145
148
|
print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
|
|
149
|
+
|
|
150
|
+
# Use with different configurations
|
|
151
|
+
customer_service_evaluator = PolitenessEvaluator(
|
|
152
|
+
score_name_prefix="customer_service",
|
|
153
|
+
score_fn_kwargs_mapping={"output": "response"}
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
support_evaluator = PolitenessEvaluator(
|
|
157
|
+
score_name_prefix="support",
|
|
158
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
159
|
+
)
|
|
146
160
|
```
|
|
147
161
|
|
|
148
162
|
### 5.1. Function-Based Evaluators
|
|
@@ -193,12 +207,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
|
|
|
193
207
|
answer = call_your_llm(question)
|
|
194
208
|
return {"answer": answer}
|
|
195
209
|
|
|
196
|
-
# Set up evaluators
|
|
210
|
+
# Set up evaluators with different configurations
|
|
197
211
|
evaluators = [
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
212
|
+
# Primary evaluation metrics
|
|
213
|
+
AnswerRelevance(score_name_prefix="primary"),
|
|
214
|
+
Conciseness(score_name_prefix="primary"),
|
|
215
|
+
Sentiment(score_name_prefix="primary"),
|
|
216
|
+
|
|
217
|
+
# Custom evaluators with specific mappings
|
|
218
|
+
PolitenessEvaluator(
|
|
219
|
+
score_name_prefix="quality",
|
|
220
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
221
|
+
),
|
|
222
|
+
|
|
223
|
+
# Multiple instances of same evaluator for different fields
|
|
224
|
+
RegexSearch(
|
|
225
|
+
pattern=r"\d+",
|
|
226
|
+
score_name_prefix="validation",
|
|
227
|
+
score_name="has_number",
|
|
228
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
229
|
+
),
|
|
230
|
+
RegexSearch(
|
|
231
|
+
pattern=r"\d+",
|
|
232
|
+
score_name_prefix="validation",
|
|
233
|
+
score_name="has_number",
|
|
234
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
235
|
+
),
|
|
202
236
|
]
|
|
203
237
|
|
|
204
238
|
# Run evaluation
|
|
@@ -209,9 +243,8 @@ experiment_result = evaluate(
|
|
|
209
243
|
name_prefix="my_evaluation",
|
|
210
244
|
description="Comprehensive LLM evaluation",
|
|
211
245
|
score_fn_kwargs_mapping={
|
|
212
|
-
"question": "question",
|
|
246
|
+
"question": lambda x: x["inputs"]["question"],
|
|
213
247
|
"response": "answer",
|
|
214
|
-
"output": "answer",
|
|
215
248
|
"text": "answer",
|
|
216
249
|
"prompt": lambda x: x["inputs"]["question"],
|
|
217
250
|
}
|
|
@@ -219,6 +252,10 @@ experiment_result = evaluate(
|
|
|
219
252
|
|
|
220
253
|
print(f"Evaluated {len(experiment_result.results)} test cases")
|
|
221
254
|
print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
|
|
255
|
+
|
|
256
|
+
# Results in organized score names:
|
|
257
|
+
# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
|
|
258
|
+
# "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
|
|
222
259
|
```
|
|
223
260
|
|
|
224
261
|
## Built-in Evaluators
|
|
@@ -304,6 +341,79 @@ score_fn_kwargs_mapping={
|
|
|
304
341
|
}
|
|
305
342
|
```
|
|
306
343
|
|
|
344
|
+
### Multiple Evaluator Instances with Different Mappings
|
|
345
|
+
|
|
346
|
+
You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
|
|
347
|
+
|
|
348
|
+
```python
|
|
349
|
+
from fiddler_evals.evaluators import RegexSearch
|
|
350
|
+
|
|
351
|
+
# Create multiple RegexSearch evaluators for different fields
|
|
352
|
+
evaluators = [
|
|
353
|
+
# Check for numbers in the question
|
|
354
|
+
RegexSearch(
|
|
355
|
+
pattern=r"\d+",
|
|
356
|
+
score_name_prefix="question",
|
|
357
|
+
score_name="has_number",
|
|
358
|
+
score_fn_kwargs_mapping={"output": "question"}
|
|
359
|
+
),
|
|
360
|
+
# Check for numbers in the answer
|
|
361
|
+
RegexSearch(
|
|
362
|
+
pattern=r"\d+",
|
|
363
|
+
score_name_prefix="answer",
|
|
364
|
+
score_name="has_number",
|
|
365
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
366
|
+
),
|
|
367
|
+
# Check for capital letters in the answer
|
|
368
|
+
RegexSearch(
|
|
369
|
+
pattern=r"[A-Z]",
|
|
370
|
+
score_name_prefix="answer",
|
|
371
|
+
score_name="has_caps",
|
|
372
|
+
score_fn_kwargs_mapping={"output": "answer"}
|
|
373
|
+
)
|
|
374
|
+
]
|
|
375
|
+
|
|
376
|
+
# Run evaluation
|
|
377
|
+
experiment_result = evaluate(
|
|
378
|
+
dataset=dataset,
|
|
379
|
+
task=my_llm_task,
|
|
380
|
+
evaluators=evaluators,
|
|
381
|
+
score_fn_kwargs_mapping={
|
|
382
|
+
"question": lambda x: x["inputs"]["question"]
|
|
383
|
+
}
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
# Results in scores named:
|
|
387
|
+
# "question_has_number", "answer_has_number", "answer_has_caps"
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
### Parameter Mapping Priority
|
|
391
|
+
|
|
392
|
+
When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
|
|
393
|
+
|
|
394
|
+
```python
|
|
395
|
+
# Evaluator-level mapping (higher priority)
|
|
396
|
+
evaluator = RegexSearch(
|
|
397
|
+
pattern=r"\d+",
|
|
398
|
+
score_fn_kwargs_mapping={"output": "answer"} # This takes precedence
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
# Evaluation-level mapping (lower priority)
|
|
402
|
+
experiment_result = evaluate(
|
|
403
|
+
dataset=dataset,
|
|
404
|
+
task=my_llm_task,
|
|
405
|
+
evaluators=[evaluator],
|
|
406
|
+
score_fn_kwargs_mapping={
|
|
407
|
+
"output": "question" # This is ignored due to evaluator-level mapping
|
|
408
|
+
}
|
|
409
|
+
)
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
**Mapping Priority (highest to lowest):**
|
|
413
|
+
1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
|
|
414
|
+
2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
|
|
415
|
+
3. Default parameter resolution
|
|
416
|
+
|
|
307
417
|
### Experiment Metadata
|
|
308
418
|
```python
|
|
309
419
|
experiment_result = evaluate(
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.1.1.dev14
|
|
@@ -732,7 +732,10 @@ class Dataset(BaseEntity):
|
|
|
732
732
|
with list(dataset.get_items()) if you need to iterate multiple times or get
|
|
733
733
|
the total count. The iterator fetches items lazily from the API.
|
|
734
734
|
"""
|
|
735
|
-
|
|
735
|
+
# Read up to 1K dataset items per call to reduce network calls and latency
|
|
736
|
+
for item in self._paginate(
|
|
737
|
+
url=f"{self._get_url(self.id)}/items", page_size=1000
|
|
738
|
+
):
|
|
736
739
|
yield DatasetItem(**item)
|
|
737
740
|
|
|
738
741
|
@handle_api_error
|
|
@@ -870,6 +873,35 @@ class Dataset(BaseEntity):
|
|
|
870
873
|
if df.empty:
|
|
871
874
|
raise ValueError("DataFrame cannot be empty")
|
|
872
875
|
|
|
876
|
+
if input_columns and (
|
|
877
|
+
missing_input_columns := set(input_columns) - set(df_columns)
|
|
878
|
+
):
|
|
879
|
+
raise ValueError(
|
|
880
|
+
f"Input column(s) {missing_input_columns} not found in DataFrame"
|
|
881
|
+
)
|
|
882
|
+
|
|
883
|
+
if expected_output_columns and (
|
|
884
|
+
missing_expected_output_columns := set(expected_output_columns)
|
|
885
|
+
- set(df_columns)
|
|
886
|
+
):
|
|
887
|
+
raise ValueError(
|
|
888
|
+
f"Expected output column(s) {missing_expected_output_columns} not found in DataFrame"
|
|
889
|
+
)
|
|
890
|
+
|
|
891
|
+
if metadata_columns and (
|
|
892
|
+
missing_metadata_columns := set(metadata_columns) - set(df_columns)
|
|
893
|
+
):
|
|
894
|
+
raise ValueError(
|
|
895
|
+
f"Metadata column(s) {missing_metadata_columns} not found in DataFrame"
|
|
896
|
+
)
|
|
897
|
+
|
|
898
|
+
if extras_columns and (
|
|
899
|
+
missing_extras_columns := set(extras_columns) - set(df_columns)
|
|
900
|
+
):
|
|
901
|
+
raise ValueError(
|
|
902
|
+
f"Extras column(s) {missing_extras_columns} not found in DataFrame"
|
|
903
|
+
)
|
|
904
|
+
|
|
873
905
|
expected_output_columns = expected_output_columns or []
|
|
874
906
|
metadata_columns = metadata_columns or []
|
|
875
907
|
extras_columns = extras_columns or []
|
|
@@ -1182,6 +1214,9 @@ class Dataset(BaseEntity):
|
|
|
1182
1214
|
if not rows:
|
|
1183
1215
|
raise ValueError("JSONL file cannot be empty")
|
|
1184
1216
|
|
|
1217
|
+
if not input_keys:
|
|
1218
|
+
raise ValueError("Input keys cannot be empty")
|
|
1219
|
+
|
|
1185
1220
|
expected_output_keys = expected_output_keys or []
|
|
1186
1221
|
metadata_keys = metadata_keys or []
|
|
1187
1222
|
extras_keys = extras_keys or []
|
|
@@ -1208,6 +1243,9 @@ class Dataset(BaseEntity):
|
|
|
1208
1243
|
source_name = str(source_name) if source_name else None
|
|
1209
1244
|
source_id = str(source_id) if source_id else None
|
|
1210
1245
|
|
|
1246
|
+
if all(value is None for value in inputs.values()):
|
|
1247
|
+
raise ValueError("All inputs cannot be empty or empty strings")
|
|
1248
|
+
|
|
1211
1249
|
items.append(
|
|
1212
1250
|
NewDatasetItem(
|
|
1213
1251
|
id=dataset_id,
|
{fiddler_evals-0.1.1.dev12 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/experiment.py
RENAMED
|
@@ -923,7 +923,7 @@ class Experiment(BaseEntity):
|
|
|
923
923
|
if not items:
|
|
924
924
|
raise ValueError("Items cannot be empty")
|
|
925
925
|
|
|
926
|
-
serialized_items = [item.model_dump() for item in items]
|
|
926
|
+
serialized_items = [item.model_dump(exclude={"dataset_item"}) for item in items]
|
|
927
927
|
|
|
928
928
|
self._client().post(
|
|
929
929
|
url=f"{self._get_url(self.id)}/results",
|
|
@@ -404,6 +404,165 @@ def test_insert_items_with_empty_dataframe() -> None:
|
|
|
404
404
|
)
|
|
405
405
|
|
|
406
406
|
|
|
407
|
+
@responses.activate
|
|
408
|
+
def test_insert_from_pandas_validation_missing_input_columns() -> None:
|
|
409
|
+
"""Test validation when input columns are not found in DataFrame."""
|
|
410
|
+
|
|
411
|
+
df = pd.DataFrame(
|
|
412
|
+
{
|
|
413
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
414
|
+
"answer": ["4", "6"],
|
|
415
|
+
"difficulty": ["easy", "easy"],
|
|
416
|
+
}
|
|
417
|
+
)
|
|
418
|
+
|
|
419
|
+
with pytest.raises(
|
|
420
|
+
ValueError,
|
|
421
|
+
match=r"Input column\(s\) \{'missing_column'\} not found in DataFrame",
|
|
422
|
+
):
|
|
423
|
+
dataset.insert_from_pandas(
|
|
424
|
+
df=df,
|
|
425
|
+
input_columns=["question", "missing_column"],
|
|
426
|
+
expected_output_columns=["answer"],
|
|
427
|
+
metadata_columns=["difficulty"],
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
@responses.activate
|
|
432
|
+
def test_insert_from_pandas_validation_missing_expected_output_columns() -> None:
|
|
433
|
+
"""Test validation when expected output columns are not found in DataFrame."""
|
|
434
|
+
|
|
435
|
+
df = pd.DataFrame(
|
|
436
|
+
{
|
|
437
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
438
|
+
"answer": ["4", "6"],
|
|
439
|
+
"difficulty": ["easy", "easy"],
|
|
440
|
+
}
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
with pytest.raises(
|
|
444
|
+
ValueError,
|
|
445
|
+
match=r"Expected output column\(s\) \{'missing_output'\} not found in DataFrame",
|
|
446
|
+
):
|
|
447
|
+
dataset.insert_from_pandas(
|
|
448
|
+
df=df,
|
|
449
|
+
input_columns=["question"],
|
|
450
|
+
expected_output_columns=["answer", "missing_output"],
|
|
451
|
+
metadata_columns=["difficulty"],
|
|
452
|
+
)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
@responses.activate
|
|
456
|
+
def test_insert_from_pandas_validation_missing_metadata_columns() -> None:
|
|
457
|
+
"""Test validation when metadata columns are not found in DataFrame."""
|
|
458
|
+
|
|
459
|
+
df = pd.DataFrame(
|
|
460
|
+
{
|
|
461
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
462
|
+
"answer": ["4", "6"],
|
|
463
|
+
"difficulty": ["easy", "easy"],
|
|
464
|
+
}
|
|
465
|
+
)
|
|
466
|
+
|
|
467
|
+
with pytest.raises(
|
|
468
|
+
ValueError,
|
|
469
|
+
match=r"Metadata column\(s\) \{'missing_metadata'\} not found in DataFrame",
|
|
470
|
+
):
|
|
471
|
+
dataset.insert_from_pandas(
|
|
472
|
+
df=df,
|
|
473
|
+
input_columns=["question"],
|
|
474
|
+
expected_output_columns=["answer"],
|
|
475
|
+
metadata_columns=["difficulty", "missing_metadata"],
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
@responses.activate
|
|
480
|
+
def test_insert_from_pandas_validation_missing_extras_columns() -> None:
|
|
481
|
+
"""Test validation when extras columns are not found in DataFrame."""
|
|
482
|
+
|
|
483
|
+
df = pd.DataFrame(
|
|
484
|
+
{
|
|
485
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
486
|
+
"answer": ["4", "6"],
|
|
487
|
+
"difficulty": ["easy", "easy"],
|
|
488
|
+
}
|
|
489
|
+
)
|
|
490
|
+
|
|
491
|
+
with pytest.raises(
|
|
492
|
+
ValueError,
|
|
493
|
+
match=r"Extras column\(s\) \{'missing_extras'\} not found in DataFrame",
|
|
494
|
+
):
|
|
495
|
+
dataset.insert_from_pandas(
|
|
496
|
+
df=df,
|
|
497
|
+
input_columns=["question"],
|
|
498
|
+
expected_output_columns=["answer"],
|
|
499
|
+
metadata_columns=["difficulty"],
|
|
500
|
+
extras_columns=["missing_extras"],
|
|
501
|
+
)
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
@responses.activate
|
|
505
|
+
def test_insert_from_pandas_validation_no_columns_specified() -> None:
|
|
506
|
+
"""Test that validation passes when no specific columns are specified (auto-mapping)."""
|
|
507
|
+
|
|
508
|
+
df = pd.DataFrame(
|
|
509
|
+
{
|
|
510
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
511
|
+
"answer": ["4", "6"],
|
|
512
|
+
"difficulty": ["easy", "easy"],
|
|
513
|
+
"source_name": ["test", "test"],
|
|
514
|
+
"source_id": ["1", "2"],
|
|
515
|
+
}
|
|
516
|
+
)
|
|
517
|
+
|
|
518
|
+
# Mock item insertion
|
|
519
|
+
insert_response = INSERT_RESPONSE_SUCCESS.copy()
|
|
520
|
+
insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
|
|
521
|
+
responses.post(
|
|
522
|
+
url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
|
|
523
|
+
json=insert_response,
|
|
524
|
+
)
|
|
525
|
+
|
|
526
|
+
# Should not raise any validation errors when no specific columns are specified
|
|
527
|
+
item_ids = dataset.insert_from_pandas(df=df)
|
|
528
|
+
|
|
529
|
+
# Verify response
|
|
530
|
+
assert len(item_ids) == 2
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
@responses.activate
|
|
534
|
+
def test_insert_from_pandas_validation_empty_column_lists() -> None:
|
|
535
|
+
"""Test that validation passes when empty column lists are provided."""
|
|
536
|
+
|
|
537
|
+
df = pd.DataFrame(
|
|
538
|
+
{
|
|
539
|
+
"question": ["What is 2+2?", "What is 3+3?"],
|
|
540
|
+
"answer": ["4", "6"],
|
|
541
|
+
"difficulty": ["easy", "easy"],
|
|
542
|
+
}
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
# Mock item insertion
|
|
546
|
+
insert_response = INSERT_RESPONSE_SUCCESS.copy()
|
|
547
|
+
insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
|
|
548
|
+
responses.post(
|
|
549
|
+
url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
|
|
550
|
+
json=insert_response,
|
|
551
|
+
)
|
|
552
|
+
|
|
553
|
+
# Should not raise any validation errors when empty lists are provided
|
|
554
|
+
item_ids = dataset.insert_from_pandas(
|
|
555
|
+
df=df,
|
|
556
|
+
input_columns=["question"],
|
|
557
|
+
expected_output_columns=[], # Empty list
|
|
558
|
+
metadata_columns=[], # Empty list
|
|
559
|
+
extras_columns=[], # Empty list
|
|
560
|
+
)
|
|
561
|
+
|
|
562
|
+
# Verify response
|
|
563
|
+
assert len(item_ids) == 2
|
|
564
|
+
|
|
565
|
+
|
|
407
566
|
@responses.activate
|
|
408
567
|
def test_insert_items_success_with_csv_file() -> None:
|
|
409
568
|
"""When inserting items from a csv file, the items are inserted successfully."""
|
|
@@ -490,3 +649,40 @@ def test_insert_items_with_empty_jsonl_file(tmp_path: Path) -> None:
|
|
|
490
649
|
file_path=temp_file,
|
|
491
650
|
input_keys=["Question"],
|
|
492
651
|
)
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
@responses.activate
|
|
655
|
+
def test_insert_from_jsonl_file_validation_empty_input_keys(tmp_path: Path) -> None:
|
|
656
|
+
"""Test validation when input_keys is empty."""
|
|
657
|
+
|
|
658
|
+
temp_file = tmp_path / "test.jsonl"
|
|
659
|
+
temp_file.write_text('{"question": "What is 2+2?"}\n')
|
|
660
|
+
|
|
661
|
+
with pytest.raises(ValueError, match="Input keys cannot be empty"):
|
|
662
|
+
dataset.insert_from_jsonl_file(
|
|
663
|
+
file_path=temp_file,
|
|
664
|
+
input_keys=[], # Empty input keys
|
|
665
|
+
)
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
@pytest.mark.parametrize(
|
|
669
|
+
"test_data,input_keys",
|
|
670
|
+
[
|
|
671
|
+
({"question": None}, ["question"]),
|
|
672
|
+
({"question": None, "context": None}, ["question", "context"]),
|
|
673
|
+
({"difficulty": "easy"}, ["question"]),
|
|
674
|
+
],
|
|
675
|
+
)
|
|
676
|
+
def test_insert_from_jsonl_file_validation(
|
|
677
|
+
tmp_path: Path, test_data: dict, input_keys: list
|
|
678
|
+
) -> None:
|
|
679
|
+
"""Test comprehensive validation for insert_from_jsonl_file."""
|
|
680
|
+
|
|
681
|
+
temp_file = tmp_path / "test.jsonl"
|
|
682
|
+
temp_file.write_text(json.dumps(test_data) + "\n")
|
|
683
|
+
|
|
684
|
+
with pytest.raises(ValueError, match="All inputs cannot be empty or empty strings"):
|
|
685
|
+
dataset.insert_from_jsonl_file(
|
|
686
|
+
file_path=temp_file,
|
|
687
|
+
input_keys=input_keys,
|
|
688
|
+
)
|