PyPI - fiddler-evals - Versions diffs - 0.1.1.dev13__tar.gz → 0.1.1.dev14__tar.gz - Mend

fiddler-evals 0.1.1.dev13__tar.gz → 0.1.1.dev14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97) hide show

{fiddler_evals-0.1.1.dev13/fiddler_evals.egg-info → fiddler_evals-0.1.1.dev14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fiddler-evals
-Version: 0.1.1.dev13
+Version: 0.1.1.dev14
 Summary: Python SDK for evaluating LLM Applications
 Author-email: Fiddler AI <support@fiddler.ai>
 Maintainer-email: Fiddler AI <support@fiddler.ai>
@@ -128,8 +128,11 @@ class PolitenessEvaluator(Evaluator):
     Useful for customer service or chatbot applications.
     """
-    def __init__(self):
-        super().__init__()
+    def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
+        super().__init__(
+            score_name_prefix=score_name_prefix,
+            score_fn_kwargs_mapping=score_fn_kwargs_mapping
+        )
         self.polite_words = [
             'please', 'thank you', 'thanks', 'sorry', 'apologize',
             'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -151,13 +154,13 @@ class PolitenessEvaluator(Evaluator):
             reasoning = "No polite language detected"
         return Score(
-            name="politeness",
+            name=f"{self.score_name_prefix}politeness",
             evaluator_name=self.name,
             value=score_value,
             reasoning=reasoning
         )
-# Test the evaluator
+# Test the evaluator with different configurations
 politeness_evaluator = PolitenessEvaluator()
 polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -165,6 +168,17 @@ impolite_response = "I don't know. Figure it out yourself."
 print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
 print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
+# Use with different configurations
+customer_service_evaluator = PolitenessEvaluator(
+    score_name_prefix="customer_service",
+    score_fn_kwargs_mapping={"output": "response"}
+)
+support_evaluator = PolitenessEvaluator(
+    score_name_prefix="support",
+    score_fn_kwargs_mapping={"output": "answer"}
+)
 ```
 ### 5.1. Function-Based Evaluators
@@ -215,12 +229,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
     answer = call_your_llm(question)
     return {"answer": answer}
-# Set up evaluators
+# Set up evaluators with different configurations
 evaluators = [
-    AnswerRelevance(),
-    Conciseness(),
-    Sentiment(),
-    PolitenessEvaluator(),
+    # Primary evaluation metrics
+    AnswerRelevance(score_name_prefix="primary"),
+    Conciseness(score_name_prefix="primary"),
+    Sentiment(score_name_prefix="primary"),
+    # Custom evaluators with specific mappings
+    PolitenessEvaluator(
+        score_name_prefix="quality",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
+    # Multiple instances of same evaluator for different fields
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="validation",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "question"}
+    ),
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="validation",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
 ]
 # Run evaluation
@@ -231,9 +265,8 @@ experiment_result = evaluate(
     name_prefix="my_evaluation",
     description="Comprehensive LLM evaluation",
     score_fn_kwargs_mapping={
-        "question": "question",
+        "question": lambda x: x["inputs"]["question"],
         "response": "answer",
-        "output": "answer",
         "text": "answer",
         "prompt": lambda x: x["inputs"]["question"],
     }
@@ -241,6 +274,10 @@ experiment_result = evaluate(
 print(f"Evaluated {len(experiment_result.results)} test cases")
 print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
+# Results in organized score names:
+# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
+# "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
 ```
 ## Built-in Evaluators
@@ -326,6 +363,79 @@ score_fn_kwargs_mapping={
 }
 ```
+### Multiple Evaluator Instances with Different Mappings
+You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
+```python
+from fiddler_evals.evaluators import RegexSearch
+# Create multiple RegexSearch evaluators for different fields
+evaluators = [
+    # Check for numbers in the question
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="question",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "question"}
+    ),
+    # Check for numbers in the answer
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="answer",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
+    # Check for capital letters in the answer
+    RegexSearch(
+        pattern=r"[A-Z]",
+        score_name_prefix="answer",
+        score_name="has_caps",
+        score_fn_kwargs_mapping={"output": "answer"}
+    )
+]
+# Run evaluation
+experiment_result = evaluate(
+    dataset=dataset,
+    task=my_llm_task,
+    evaluators=evaluators,
+    score_fn_kwargs_mapping={
+        "question": lambda x: x["inputs"]["question"]
+    }
+)
+# Results in scores named:
+# "question_has_number", "answer_has_number", "answer_has_caps"
+```
+### Parameter Mapping Priority
+When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
+```python
+# Evaluator-level mapping (higher priority)
+evaluator = RegexSearch(
+    pattern=r"\d+",
+    score_fn_kwargs_mapping={"output": "answer"}  # This takes precedence
+)
+# Evaluation-level mapping (lower priority)
+experiment_result = evaluate(
+    dataset=dataset,
+    task=my_llm_task,
+    evaluators=[evaluator],
+    score_fn_kwargs_mapping={
+        "output": "question"  # This is ignored due to evaluator-level mapping
+    }
+)
+```
+**Mapping Priority (highest to lowest):**
+1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
+2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
+3. Default parameter resolution
 ### Experiment Metadata
 ```python
 experiment_result = evaluate(

{fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/PUBLIC.md RENAMED Viewed

@@ -106,8 +106,11 @@ class PolitenessEvaluator(Evaluator):
     Useful for customer service or chatbot applications.
     """
-    def __init__(self):
-        super().__init__()
+    def __init__(self, score_name_prefix: str = None, score_fn_kwargs_mapping: dict = None):
+        super().__init__(
+            score_name_prefix=score_name_prefix,
+            score_fn_kwargs_mapping=score_fn_kwargs_mapping
+        )
         self.polite_words = [
             'please', 'thank you', 'thanks', 'sorry', 'apologize',
             'appreciate', 'welcome', 'help', 'assist', 'glad'
@@ -129,13 +132,13 @@ class PolitenessEvaluator(Evaluator):
             reasoning = "No polite language detected"
         return Score(
-            name="politeness",
+            name=f"{self.score_name_prefix}politeness",
             evaluator_name=self.name,
             value=score_value,
             reasoning=reasoning
         )
-# Test the evaluator
+# Test the evaluator with different configurations
 politeness_evaluator = PolitenessEvaluator()
 polite_response = "Thank you for your question! I'd be happy to help you with that."
@@ -143,6 +146,17 @@ impolite_response = "I don't know. Figure it out yourself."
 print(f"Polite response score: {politeness_evaluator.score(polite_response).value}")
 print(f"Impolite response score: {politeness_evaluator.score(impolite_response).value}")
+# Use with different configurations
+customer_service_evaluator = PolitenessEvaluator(
+    score_name_prefix="customer_service",
+    score_fn_kwargs_mapping={"output": "response"}
+)
+support_evaluator = PolitenessEvaluator(
+    score_name_prefix="support",
+    score_fn_kwargs_mapping={"output": "answer"}
+)
 ```
 ### 5.1. Function-Based Evaluators
@@ -193,12 +207,32 @@ def my_llm_task(inputs: dict, extras: dict, metadata: dict) -> dict:
     answer = call_your_llm(question)
     return {"answer": answer}
-# Set up evaluators
+# Set up evaluators with different configurations
 evaluators = [
-    AnswerRelevance(),
-    Conciseness(),
-    Sentiment(),
-    PolitenessEvaluator(),
+    # Primary evaluation metrics
+    AnswerRelevance(score_name_prefix="primary"),
+    Conciseness(score_name_prefix="primary"),
+    Sentiment(score_name_prefix="primary"),
+    # Custom evaluators with specific mappings
+    PolitenessEvaluator(
+        score_name_prefix="quality",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
+    # Multiple instances of same evaluator for different fields
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="validation",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "question"}
+    ),
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="validation",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
 ]
 # Run evaluation
@@ -209,9 +243,8 @@ experiment_result = evaluate(
     name_prefix="my_evaluation",
     description="Comprehensive LLM evaluation",
     score_fn_kwargs_mapping={
-        "question": "question",
+        "question": lambda x: x["inputs"]["question"],
         "response": "answer",
-        "output": "answer",
         "text": "answer",
         "prompt": lambda x: x["inputs"]["question"],
     }
@@ -219,6 +252,10 @@ experiment_result = evaluate(
 print(f"Evaluated {len(experiment_result.results)} test cases")
 print(f"Generated {sum(len(result.scores) for result in experiment_result.results)} scores")
+# Results in organized score names:
+# "primary_answer_relevance", "primary_conciseness", "primary_sentiment",
+# "quality_politeness", "validation_has_number" (for question), "validation_has_number" (for answer)
 ```
 ## Built-in Evaluators
@@ -304,6 +341,79 @@ score_fn_kwargs_mapping={
 }
 ```
+### Multiple Evaluator Instances with Different Mappings
+You can create multiple instances of the same evaluator with different parameter mappings and score name prefixes to evaluate different aspects of your outputs. Use `score_name_prefix` to organize and distinguish scores when using multiple evaluator instances:
+```python
+from fiddler_evals.evaluators import RegexSearch
+# Create multiple RegexSearch evaluators for different fields
+evaluators = [
+    # Check for numbers in the question
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="question",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "question"}
+    ),
+    # Check for numbers in the answer
+    RegexSearch(
+        pattern=r"\d+",
+        score_name_prefix="answer",
+        score_name="has_number",
+        score_fn_kwargs_mapping={"output": "answer"}
+    ),
+    # Check for capital letters in the answer
+    RegexSearch(
+        pattern=r"[A-Z]",
+        score_name_prefix="answer",
+        score_name="has_caps",
+        score_fn_kwargs_mapping={"output": "answer"}
+    )
+]
+# Run evaluation
+experiment_result = evaluate(
+    dataset=dataset,
+    task=my_llm_task,
+    evaluators=evaluators,
+    score_fn_kwargs_mapping={
+        "question": lambda x: x["inputs"]["question"]
+    }
+)
+# Results in scores named:
+# "question_has_number", "answer_has_number", "answer_has_caps"
+```
+### Parameter Mapping Priority
+When both evaluator-level and evaluation-level mappings are present, evaluator-level mappings take precedence:
+```python
+# Evaluator-level mapping (higher priority)
+evaluator = RegexSearch(
+    pattern=r"\d+",
+    score_fn_kwargs_mapping={"output": "answer"}  # This takes precedence
+)
+# Evaluation-level mapping (lower priority)
+experiment_result = evaluate(
+    dataset=dataset,
+    task=my_llm_task,
+    evaluators=[evaluator],
+    score_fn_kwargs_mapping={
+        "output": "question"  # This is ignored due to evaluator-level mapping
+    }
+)
+```
+**Mapping Priority (highest to lowest):**
+1. Evaluator-level `score_fn_kwargs_mapping` (set in evaluator constructor)
+2. Evaluation-level `score_fn_kwargs_mapping` (passed to evaluate function)
+3. Default parameter resolution
 ### Experiment Metadata
 ```python
 experiment_result = evaluate(

fiddler_evals-0.1.1.dev14/fiddler_evals/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.1.dev14

{fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/dataset.py RENAMED Viewed

@@ -873,6 +873,35 @@ class Dataset(BaseEntity):
         if df.empty:
             raise ValueError("DataFrame cannot be empty")
+        if input_columns and (
+            missing_input_columns := set(input_columns) - set(df_columns)
+        ):
+            raise ValueError(
+                f"Input column(s) {missing_input_columns} not found in DataFrame"
+            )
+        if expected_output_columns and (
+            missing_expected_output_columns := set(expected_output_columns)
+            - set(df_columns)
+        ):
+            raise ValueError(
+                f"Expected output column(s) {missing_expected_output_columns} not found in DataFrame"
+            )
+        if metadata_columns and (
+            missing_metadata_columns := set(metadata_columns) - set(df_columns)
+        ):
+            raise ValueError(
+                f"Metadata column(s) {missing_metadata_columns} not found in DataFrame"
+            )
+        if extras_columns and (
+            missing_extras_columns := set(extras_columns) - set(df_columns)
+        ):
+            raise ValueError(
+                f"Extras column(s) {missing_extras_columns} not found in DataFrame"
+            )
         expected_output_columns = expected_output_columns or []
         metadata_columns = metadata_columns or []
         extras_columns = extras_columns or []
@@ -1185,6 +1214,9 @@ class Dataset(BaseEntity):
         if not rows:
             raise ValueError("JSONL file cannot be empty")
+        if not input_keys:
+            raise ValueError("Input keys cannot be empty")
         expected_output_keys = expected_output_keys or []
         metadata_keys = metadata_keys or []
         extras_keys = extras_keys or []
@@ -1211,6 +1243,9 @@ class Dataset(BaseEntity):
             source_name = str(source_name) if source_name else None
             source_id = str(source_id) if source_id else None
+            if all(value is None for value in inputs.values()):
+                raise ValueError("All inputs cannot be empty or empty strings")
             items.append(
                 NewDatasetItem(
                     id=dataset_id,

{fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/experiment.py RENAMED Viewed

@@ -923,7 +923,7 @@ class Experiment(BaseEntity):
         if not items:
             raise ValueError("Items cannot be empty")
-        serialized_items = [item.model_dump() for item in items]
+        serialized_items = [item.model_dump(exclude={"dataset_item"}) for item in items]
         self._client().post(
             url=f"{self._get_url(self.id)}/results",

{fiddler_evals-0.1.1.dev13 → fiddler_evals-0.1.1.dev14}/fiddler_evals/entities/tests/test_dataset_items.py RENAMED Viewed

@@ -404,6 +404,165 @@ def test_insert_items_with_empty_dataframe() -> None:
         )
+@responses.activate
+def test_insert_from_pandas_validation_missing_input_columns() -> None:
+    """Test validation when input columns are not found in DataFrame."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+        }
+    )
+    with pytest.raises(
+        ValueError,
+        match=r"Input column\(s\) \{'missing_column'\} not found in DataFrame",
+    ):
+        dataset.insert_from_pandas(
+            df=df,
+            input_columns=["question", "missing_column"],
+            expected_output_columns=["answer"],
+            metadata_columns=["difficulty"],
+        )
+@responses.activate
+def test_insert_from_pandas_validation_missing_expected_output_columns() -> None:
+    """Test validation when expected output columns are not found in DataFrame."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+        }
+    )
+    with pytest.raises(
+        ValueError,
+        match=r"Expected output column\(s\) \{'missing_output'\} not found in DataFrame",
+    ):
+        dataset.insert_from_pandas(
+            df=df,
+            input_columns=["question"],
+            expected_output_columns=["answer", "missing_output"],
+            metadata_columns=["difficulty"],
+        )
+@responses.activate
+def test_insert_from_pandas_validation_missing_metadata_columns() -> None:
+    """Test validation when metadata columns are not found in DataFrame."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+        }
+    )
+    with pytest.raises(
+        ValueError,
+        match=r"Metadata column\(s\) \{'missing_metadata'\} not found in DataFrame",
+    ):
+        dataset.insert_from_pandas(
+            df=df,
+            input_columns=["question"],
+            expected_output_columns=["answer"],
+            metadata_columns=["difficulty", "missing_metadata"],
+        )
+@responses.activate
+def test_insert_from_pandas_validation_missing_extras_columns() -> None:
+    """Test validation when extras columns are not found in DataFrame."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+        }
+    )
+    with pytest.raises(
+        ValueError,
+        match=r"Extras column\(s\) \{'missing_extras'\} not found in DataFrame",
+    ):
+        dataset.insert_from_pandas(
+            df=df,
+            input_columns=["question"],
+            expected_output_columns=["answer"],
+            metadata_columns=["difficulty"],
+            extras_columns=["missing_extras"],
+        )
+@responses.activate
+def test_insert_from_pandas_validation_no_columns_specified() -> None:
+    """Test that validation passes when no specific columns are specified (auto-mapping)."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+            "source_name": ["test", "test"],
+            "source_id": ["1", "2"],
+        }
+    )
+    # Mock item insertion
+    insert_response = INSERT_RESPONSE_SUCCESS.copy()
+    insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
+    responses.post(
+        url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
+        json=insert_response,
+    )
+    # Should not raise any validation errors when no specific columns are specified
+    item_ids = dataset.insert_from_pandas(df=df)
+    # Verify response
+    assert len(item_ids) == 2
+@responses.activate
+def test_insert_from_pandas_validation_empty_column_lists() -> None:
+    """Test that validation passes when empty column lists are provided."""
+    df = pd.DataFrame(
+        {
+            "question": ["What is 2+2?", "What is 3+3?"],
+            "answer": ["4", "6"],
+            "difficulty": ["easy", "easy"],
+        }
+    )
+    # Mock item insertion
+    insert_response = INSERT_RESPONSE_SUCCESS.copy()
+    insert_response["data"]["ids"] = [str(uuid4()) for _ in range(len(df))]
+    responses.post(
+        url=f"{URL}/v3/evals/datasets/{DATASET_ID}/items",
+        json=insert_response,
+    )
+    # Should not raise any validation errors when empty lists are provided
+    item_ids = dataset.insert_from_pandas(
+        df=df,
+        input_columns=["question"],
+        expected_output_columns=[],  # Empty list
+        metadata_columns=[],  # Empty list
+        extras_columns=[],  # Empty list
+    )
+    # Verify response
+    assert len(item_ids) == 2
 @responses.activate
 def test_insert_items_success_with_csv_file() -> None:
     """When inserting items from a csv file, the items are inserted successfully."""
@@ -490,3 +649,40 @@ def test_insert_items_with_empty_jsonl_file(tmp_path: Path) -> None:
             file_path=temp_file,
             input_keys=["Question"],
         )
+@responses.activate
+def test_insert_from_jsonl_file_validation_empty_input_keys(tmp_path: Path) -> None:
+    """Test validation when input_keys is empty."""
+    temp_file = tmp_path / "test.jsonl"
+    temp_file.write_text('{"question": "What is 2+2?"}\n')
+    with pytest.raises(ValueError, match="Input keys cannot be empty"):
+        dataset.insert_from_jsonl_file(
+            file_path=temp_file,
+            input_keys=[],  # Empty input keys
+        )
+@pytest.mark.parametrize(
+    "test_data,input_keys",
+    [
+        ({"question": None}, ["question"]),
+        ({"question": None, "context": None}, ["question", "context"]),
+        ({"difficulty": "easy"}, ["question"]),
+    ],
+)
+def test_insert_from_jsonl_file_validation(
+    tmp_path: Path, test_data: dict, input_keys: list
+) -> None:
+    """Test comprehensive validation for insert_from_jsonl_file."""
+    temp_file = tmp_path / "test.jsonl"
+    temp_file.write_text(json.dumps(test_data) + "\n")
+    with pytest.raises(ValueError, match="All inputs cannot be empty or empty strings"):
+        dataset.insert_from_jsonl_file(
+            file_path=temp_file,
+            input_keys=input_keys,
+        )

fiddler-evals 0.1.1.dev13__tar.gz → 0.1.1.dev14__tar.gz

fiddler-evals 0.1.1.dev13__tar.gz → 0.1.1.dev14__tar.gz