PyPI - llmvalidate - Versions diffs - 0.4.2__tar.gz → 0.4.3__tar.gz - Mend

llmvalidate 0.4.2tar.gz → 0.4.3tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{llmvalidate-0.4.2 → llmvalidate-0.4.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmvalidate
-Version: 0.4.2
+Version: 0.4.3
 Summary: Oncoshot LLM validation framework
 License: MIT
 Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: pandas>=3.0.0
+Requires-Dist: numpy>=2.4.1
+Requires-Dist: pydantic>=2.12.5
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: python-dateutil>=2.9.0
+Requires-Dist: colorama>=0.4.6
 Dynamic: license-file
 # LLM Validation Framework
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py   # End-to-end comparisons
 ```
 llm-validation-framework/
 ├── src/
-│   ├── validation.py     # Main validation pipeline and metrics calculation
-│   ├── structured.py     # Pydantic data models for LLM results
-│   ├── utils.py         # Utility functions (list conversion, flattening)
-│   └── standardize.py   # Data standardization helpers
+│   └── llmvalidate/
+│       ├── validation.py     # Main validation pipeline and metrics calculation
+│       ├── structured.py     # Pydantic data models for LLM results
+│       ├── utils.py         # Utility functions (list conversion, flattening)
+│       └── standardize.py   # Data standardization helpers
 ├── tests/               # Comprehensive test suite
 ├── validation_results/  # Output directory (auto-created)
 ├── samples.csv         # Demo dataset with all validation scenarios

{llmvalidate-0.4.2 → llmvalidate-0.4.3}/pyproject.toml RENAMED Viewed

@@ -4,10 +4,18 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "llmvalidate"
-version = "0.4.2"
+version = "0.4.3"
 description = "Oncoshot LLM validation framework"
 readme = "readme.md"
 requires-python = ">=3.11"
+dependencies = [
+    "pandas>=3.0.0",
+    "numpy>=2.4.1",
+    "pydantic>=2.12.5",
+    "tqdm>=4.67.1",
+    "python-dateutil>=2.9.0",
+    "colorama>=0.4.6"
+]
 license = { text = "MIT" }
 classifiers = [
@@ -25,22 +33,27 @@ classifiers = [
 "Repository" = "https://github.com/Oncoshot/llm-validation-framework"
 "Bug Tracker" = "https://github.com/Oncoshot/llm-validation-framework/issues"
+[tool.setuptools]
+package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
-where = ["."]
+where = ["src"]
+include = ["llmvalidate*"]
+exclude = ["tests*", "docs*", "scripts*"]
 [tool.pytest.ini_options]
 pythonpath = [
-  ".",
   "src",
 ]
 [tool.semantic_release]
-version_variable = ["src/__init__.py:__version__"]
+version_variable = ["src/llmvalidate/__init__.py:__version__"]
 version_toml = ["pyproject.toml:project.version"]
 branch = "master"
 allow_zero_version = true
 build_command = "pip install build && python -m build"
 upload_to_pypi = true
 upload_to_release = true
 commit_version_number = true

{llmvalidate-0.4.2 → llmvalidate-0.4.3}/readme.md RENAMED Viewed

@@ -313,10 +313,11 @@ pytest tests/compare_results_all_test.py   # End-to-end comparisons
 ```
 llm-validation-framework/
 ├── src/
-│   ├── validation.py     # Main validation pipeline and metrics calculation
-│   ├── structured.py     # Pydantic data models for LLM results
-│   ├── utils.py         # Utility functions (list conversion, flattening)
-│   └── standardize.py   # Data standardization helpers
+│   └── llmvalidate/
+│       ├── validation.py     # Main validation pipeline and metrics calculation
+│       ├── structured.py     # Pydantic data models for LLM results
+│       ├── utils.py         # Utility functions (list conversion, flattening)
+│       └── standardize.py   # Data standardization helpers
 ├── tests/               # Comprehensive test suite
 ├── validation_results/  # Output directory (auto-created)
 ├── samples.csv         # Demo dataset with all validation scenarios

llmvalidate-0.4.3/src/llmvalidate/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+__version__ = "0.0.0"
+from .validation import validate, bootstrap_CI
+from .structured import StructuredResult, StructuredGroup, StructuredField
+__all__ = [
+    "validate",
+    "bootstrap_CI",
+    "StructuredResult",
+    "StructuredGroup",
+    "StructuredField"
+]

{llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/structured.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from pydantic import BaseModel
+from pydantic import BaseModel
 from typing import Any, List, Dict, Optional, Union
 # For all LLM Extracted Value

{llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/utils.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from typing import Any, List, Dict, Optional, Union
 from ast import literal_eval
-from src.structured import StructuredResult
+from .structured import StructuredResult
 import pandas as pd
 import re
 import json

{llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/validation.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import datetime
 import math
 from ast import literal_eval
 import string
@@ -8,7 +8,7 @@ import time
 import os
 import concurrent.futures as cf
 from tqdm import tqdm
-from src.utils import convert_lists, infer_fields
+from .utils import convert_lists, infer_fields
 def compare_results_binary(expected, actual):
     """Compares boolean labels and returns confusion matrix counts."""

{llmvalidate-0.4.2 → llmvalidate-0.4.3/src}/llmvalidate.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmvalidate
-Version: 0.4.2
+Version: 0.4.3
 Summary: Oncoshot LLM validation framework
 License: MIT
 Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: pandas>=3.0.0
+Requires-Dist: numpy>=2.4.1
+Requires-Dist: pydantic>=2.12.5
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: python-dateutil>=2.9.0
+Requires-Dist: colorama>=0.4.6
 Dynamic: license-file
 # LLM Validation Framework
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py   # End-to-end comparisons
 ```
 llm-validation-framework/
 ├── src/
-│   ├── validation.py     # Main validation pipeline and metrics calculation
-│   ├── structured.py     # Pydantic data models for LLM results
-│   ├── utils.py         # Utility functions (list conversion, flattening)
-│   └── standardize.py   # Data standardization helpers
+│   └── llmvalidate/
+│       ├── validation.py     # Main validation pipeline and metrics calculation
+│       ├── structured.py     # Pydantic data models for LLM results
+│       ├── utils.py         # Utility functions (list conversion, flattening)
+│       └── standardize.py   # Data standardization helpers
 ├── tests/               # Comprehensive test suite
 ├── validation_results/  # Output directory (auto-created)
 ├── samples.csv         # Demo dataset with all validation scenarios

llmvalidate-0.4.3/src/llmvalidate.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+LICENSE
+pyproject.toml
+readme.md
+src/llmvalidate/__init__.py
+src/llmvalidate/standardize.py
+src/llmvalidate/structured.py
+src/llmvalidate/utils.py
+src/llmvalidate/validation.py
+src/llmvalidate.egg-info/PKG-INFO
+src/llmvalidate.egg-info/SOURCES.txt
+src/llmvalidate.egg-info/dependency_links.txt
+src/llmvalidate.egg-info/requires.txt
+src/llmvalidate.egg-info/top_level.txt

llmvalidate-0.4.3/src/llmvalidate.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,6 @@
+pandas>=3.0.0
+numpy>=2.4.1
+pydantic>=2.12.5
+tqdm>=4.67.1
+python-dateutil>=2.9.0
+colorama>=0.4.6

llmvalidate-0.4.3/src/llmvalidate.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+llmvalidate

llmvalidate-0.4.2/llmvalidate.egg-info/SOURCES.txt DELETED Viewed

@@ -1,16 +0,0 @@
-LICENSE
-pyproject.toml
-readme.md
-llmvalidate.egg-info/PKG-INFO
-llmvalidate.egg-info/SOURCES.txt
-llmvalidate.egg-info/dependency_links.txt
-llmvalidate.egg-info/top_level.txt
-src/__init__.py
-src/standardize.py
-src/structured.py
-src/utils.py
-src/validation.py
-tests/bootstrap_CI_test.py
-tests/compare_results_all_test.py
-tests/compare_results_test.py
-tests/validate_test.py

llmvalidate-0.4.2/llmvalidate.egg-info/top_level.txt DELETED Viewed

@@ -1,4 +0,0 @@
-dist
-src
-tests
-validation_results

llmvalidate-0.4.2/src/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-__version__ = "0.0.0"

llmvalidate-0.4.2/tests/bootstrap_CI_test.py DELETED Viewed

@@ -1,229 +0,0 @@
-import pandas as pd
-import pytest
-import numpy as np
-import src.validation as v
-def test_bootstrap_CI_basic():
-    """Test basic functionality of bootstrap_CI"""
-    # Create test data with comparison results
-    res_df = pd.DataFrame({
-        'field1': ['A', 'B', 'A', 'B', 'A'] * 20,  # 100 rows total
-        'field2': ['X', 'Y', 'X', 'Y', 'X'] * 20,
-        'Cor: field1': [1, 0, 1, 1, 0] * 20,
-        'Inc: field1': [0, 1, 0, 0, 1] * 20,
-        'Mis: field1': [0, 0, 0, 0, 0] * 20,
-        'Spu: field1': [0, 1, 0, 0, 1] * 20,
-        'Par: field1': [0, 0, 0, 0, 0] * 20,
-        'Cor: field2': [1, 1, 0, 1, 1] * 20,
-        'Inc: field2': [0, 0, 1, 0, 0] * 20,
-        'Mis: field2': [0, 0, 1, 0, 0] * 20,
-        'Spu: field2': [0, 0, 0, 0, 0] * 20,
-        'Par: field2': [0, 0, 0, 0, 0] * 20,
-    })
-    fields = ['field1', 'field2']
-    result = v.bootstrap_CI(res_df, fields, n_bootstrap=100, random_state=42)
-    # Check output format
-    assert 'field' in result.columns
-    assert len(result) == 4  # Two fields + exceptions field + N/CI info row
-    expected_fields = {'field1', 'field2', 'exceptions', 'N=100; CI=95%'}
-    assert set(result['field']) == expected_fields
-    # Check that confidence interval columns are present for metrics that exist in our data
-    # We know these will be present because we've included them in our test data
-    core_metrics = ['field-present cases', 'cor', 'inc', 'mis', 'spu', 'par']
-    # 'labeled cases' is handled specially - just appears as 'labeled cases'
-    assert 'labeled cases' in result.columns
-    for metric in core_metrics:
-        assert f'{metric}: mean' in result.columns
-        assert f'{metric}: lower' in result.columns
-        assert f'{metric}: upper' in result.columns
-    # Check that means are reasonable (between lower and upper bounds) for non-exception fields
-    for _, row in result.iterrows():
-        if row['field'] in ['exceptions', 'N=100; CI=95%']:  # Skip exceptions and info row
-            for metric in core_metrics:
-                mean_col = f'{metric}: mean'
-                lower_col = f'{metric}: lower'
-                upper_col = f'{metric}: upper'
-                if pd.notna(row[mean_col]) and pd.notna(row[lower_col]) and pd.notna(row[upper_col]):
-                    assert row[lower_col] <= row[mean_col] <= row[upper_col], \
-                        f"Mean not between bounds for {row['field']} {metric}"
-def test_bootstrap_CI_error_conditions():
-    """Test error conditions for bootstrap_CI"""
-    # Test with ci outside valid range
-    res_df = pd.DataFrame({
-        'field1': [1, 2, 3],
-        'Cor: field1': [1, 0, 1]
-    })
-    with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
-        v.bootstrap_CI(res_df, ['field1'], ci=1.5)
-    with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
-        v.bootstrap_CI(res_df, ['field1'], ci=0)
-    # Test with too few rows
-    single_row_df = pd.DataFrame({
-        'field1': [1],
-        'Cor: field1': [1]
-    })
-    with pytest.raises(ValueError, match="Need at least 2 rows"):
-        v.bootstrap_CI(single_row_df, ['field1'])
-    # Test with missing labels (NaN values)
-    res_df_with_nan = pd.DataFrame({
-        'field1': [1, np.nan, 3],
-        'field2': [1, 2, 3],
-        'Cor: field1': [1, 0, 1],
-        'Cor: field2': [0, 1, 0]
-    })
-    with pytest.raises(ValueError, match="Missing labels \\(NaN\\) found in the following fields: \\['field1'\\]"):
-        v.bootstrap_CI(res_df_with_nan, ['field1', 'field2'])
-def test_bootstrap_CI_binary_field():
-    """Test bootstrap_CI with binary field metrics"""
-    # Create test data with binary field results
-    res_df = pd.DataFrame({
-        'binary_field': [True, False, True, False] * 25,  # 100 rows
-        'TP: binary_field': [1, 0, 1, 0] * 25,
-        'FP: binary_field': [0, 1, 0, 1] * 25,
-        'FN: binary_field': [0, 0, 0, 0] * 25,
-        'TN: binary_field': [0, 1, 0, 1] * 25,
-        'Precision: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
-        'Recall: binary_field': [1.0, np.nan, 1.0, np.nan] * 25,
-        'F1 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
-        'F2 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
-    })
-    fields = ['binary_field']
-    result = v.bootstrap_CI(res_df, fields, n_bootstrap=50, random_state=42)
-    # Check that binary metrics are included
-    binary_metrics = ['TP', 'FP', 'FN', 'TN', 'precision (micro)', 'recall (micro)',
-                     'F1 score (micro)', 'F2 score (micro)', 'accuracy (micro)', 'specificity (micro)']
-    for metric in binary_metrics:
-        assert f'{metric}: mean' in result.columns
-        assert f'{metric}: lower' in result.columns
-        assert f'{metric}: upper' in result.columns
-    # Check that N/CI info row is present
-    info_rows = result[result['field'].str.startswith('N=')]
-    assert len(info_rows) == 1
-def test_bootstrap_CI_output_format():
-    """Test that output format matches specification"""
-    res_df = pd.DataFrame({
-        'test_field': ['A', 'B'] * 50,  # 100 rows
-        'Cor: test_field': [1, 0] * 50,
-        'Inc: test_field': [0, 1] * 50,
-        'Mis: test_field': [0, 0] * 50,
-        'Spu: test_field': [0, 1] * 50,
-        'Par: test_field': [0, 0] * 50,
-    })
-    result = v.bootstrap_CI(res_df, ['test_field'], n_bootstrap=10, random_state=42)
-    # Check that result has the correct format
-    assert len(result) == 3  # One field + exceptions + N/CI info row
-    test_field_row = result[result['field'] == 'test_field'].iloc[0]
-    assert test_field_row['field'] == 'test_field'
-    # Check that columns follow the expected pattern
-    metric_columns = [col for col in result.columns if col not in ['field', 'labeled cases']]
-    for col in metric_columns:
-        assert ': ' in col, f"Column {col} doesn't follow expected format"
-        metric_name, stat_type = col.split(': ', 1)
-        assert stat_type in ['mean', 'lower', 'upper'], f"Unexpected stat type in {col}"
-def test_bootstrap_CI_confidence_intervals():
-    """Test that confidence intervals make sense"""
-    # Create deterministic test case
-    res_df = pd.DataFrame({
-        'field1': [1] * 100,
-        'Cor: field1': [5] * 100,  # Constant values for predictable CI
-        'Inc: field1': [0] * 100,
-        'Mis: field1': [0] * 100,
-        'Spu: field1': [0] * 100,
-        'Par: field1': [0] * 100,
-    })
-    result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=100, ci=0.95, random_state=42)
-    # For constant values, mean should equal the sum (get_metrics sums the values)
-    row = result[result['field'] == 'field1'].iloc[0]
-    # Check that mean is close to expected value (5 * 100 = 500)
-    assert abs(row['cor: mean'] - 500.0) < 10.0
-    # Check that CI bounds are reasonable (close to mean for constant data)
-    assert abs(row['cor: lower'] - row['cor: mean']) < 50.0
-    assert abs(row['cor: upper'] - row['cor: mean']) < 50.0
-def test_bootstrap_CI_with_different_ci_levels():
-    """Test bootstrap_CI with different confidence interval levels"""
-    res_df = pd.DataFrame({
-        'field1': [1, 2, 3] * 34,  # ~100 rows
-        'Cor: field1': [1, 2, 1] * 34,
-        'Inc: field1': [0, 1, 0] * 34,
-        'Mis: field1': [0, 0, 1] * 34,
-        'Spu: field1': [1, 0, 0] * 34,
-        'Par: field1': [0, 0, 0] * 34,
-    })
-    # Test 90% CI
-    result_90 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.90, random_state=42)
-    # Test 99% CI
-    result_99 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.99, random_state=42)
-    # 99% CI should be wider than 90% CI
-    row_90 = result_90[result_90['field'] == 'field1'].iloc[0]
-    row_99 = result_99[result_99['field'] == 'field1'].iloc[0]
-    width_90 = row_90['cor: upper'] - row_90['cor: lower']
-    width_99 = row_99['cor: upper'] - row_99['cor: lower']
-    assert width_99 >= width_90, "99% CI should be wider than 90% CI"
-def test_bootstrap_CI_empty_metrics():
-    """Test bootstrap_CI handles missing values correctly"""
-    # Create simpler test data that focuses on core functionality
-    res_df = pd.DataFrame({
-        'field1': [1, 2, 3] * 34,
-        'Cor: field1': [0, 1, 2] * 34,  # Valid values
-        'Inc: field1': [0, 0, 0] * 34,
-        'Mis: field1': [0, 0, 0] * 34,
-        'Spu: field1': [0, 0, 0] * 34,
-        'Par: field1': [0, 0, 0] * 34,
-    })
-    result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=10, random_state=42)
-    # Check that core metrics appear in output
-    core_cols = [col for col in result.columns if 'cor:' in col]
-    assert len(core_cols) == 3, f"Expected 3 cor metrics (mean, lower, upper), got {len(core_cols)}: {core_cols}"
-    # Check that the function completes without errors for this simpler case
-    assert 'field' in result.columns
-    assert len(result) >= 2  # At least exceptions + field1 + N/CI info row
-    # Check that N/CI info row is present
-    info_rows = result[result['field'].str.startswith('N=')]
-    assert len(info_rows) == 1

llmvalidate-0.4.2/tests/compare_results_all_test.py DELETED Viewed

@@ -1,165 +0,0 @@
-import math
-import pandas as pd
-import pytest
-import src.validation as v
-pd.options.display.width = 0
-def get_test_df(addconfidence):
-    flag =        [True,  True,  False, False, True,  False, True, True]
-    res_flag =    [True,  False, True,  False, False, False, True, False]
-    fruits = [
-        ['apple', 'banana'],
-        ['apple'],
-        '-',
-        ['cherry'],
-        [],
-        ['apple'],
-        ['apple', 'banana'],
-        None
-    ]
-    res_fruits = [
-        ['apple', 'cherry'],
-        ['apple', 'banana'],
-        [],
-        ['cherry'],
-        ['apple'],
-        '-',
-        ['banana'],
-        ['apple']
-    ]
-    res_fruits_confidence = [
-        'High',
-        'Low',
-        'High',
-        None,
-        'NA',
-        'Low',
-        'High',
-        'High'
-    ]
-    color = [
-        'red',
-        'blue',
-        '-',
-        'green',
-        '-',
-        '-',
-        '4',
-        None
-    ]
-    res_color = [
-        'red',
-        'green',
-        'yellow',
-        '-',
-        '',
-        '-',
-        4,
-        'red'
-    ]
-    res_color_confidence = [
-        'High',
-        'Low',
-        '-',
-        'High',
-        '',
-        'High',
-        'Low',
-        'High'
-    ]
-    orphan = ['x','y','z','w','-','',None, None]
-    df = pd.DataFrame({
-        'flag': flag,
-        'Res: flag': res_flag,
-        'fruits': fruits,
-        'Res: fruits': res_fruits,
-        'orphan': orphan,
-        'color': color,
-        'Res: color': res_color
-    })
-    if addconfidence:
-        # Insert after 'Res: fruits'
-        pos_fruits = df.columns.get_loc('Res: fruits')
-        df.insert(pos_fruits + 1, 'Res: fruits confidence', res_fruits_confidence)
-        # Insert after 'Res: color'
-        pos_color = df.columns.get_loc('Res: color')  # recompute after previous insert
-        df.insert(pos_color + 1, 'Res: color confidence', res_color_confidence)
-    return df
-def _is_none_or_nan(x):
-    return x is None or (isinstance(x, float) and math.isnan(x))
-def test_compare_results_all_mixed_fields():
-    df = get_test_df(False)
-    res_df = v.compare_results_all(df, ['flag', 'fruits', 'color'])
-    # ---- Binary field assertions (flag) ----
-    # Row 0: TP
-    assert res_df.loc[0, 'TP: flag'] == 1
-    # Row 1: FN
-    assert res_df.loc[1, 'FN: flag'] == 1
-    assert res_df.loc[2, 'FP: flag'] == 1
-    assert res_df.loc[3, 'TN: flag'] == 1
-    # ---- List field assertions (fruits) ----
-    # Row 0 mixed
-    assert res_df.loc[0, 'Cor: fruits'] == 1
-    assert res_df.loc[0, 'Mis: fruits'] == 1
-    assert res_df.loc[0, 'Spu: fruits'] == 1
-    assert res_df.loc[0, 'Precision: fruits'] == pytest.approx(0.5)
-    assert res_df.loc[0, 'Recall: fruits'] == pytest.approx(0.5)
-    assert res_df.loc[0, 'F1 score: fruits'] == pytest.approx(0.5)
-    # Row 1: one correct + one spurious
-    assert res_df.loc[1, 'Cor: fruits'] == 1
-    assert res_df.loc[1, 'Spu: fruits'] == 1
-    assert res_df.loc[1, 'Precision: fruits'] == pytest.approx(0.5)
-    assert res_df.loc[1, 'Recall: fruits'] == pytest.approx(1.0)
-    # Row 2: expected '-' vs [] => zeros, metrics NaN
-    assert res_df.loc[2, 'Cor: fruits'] == 0
-    assert math.isnan(res_df.loc[2, 'Precision: fruits'])
-    # Row 3: perfect
-    assert res_df.loc[3, 'Cor: fruits'] == 1
-    assert res_df.loc[3, 'Precision: fruits'] == pytest.approx(1.0)
-    # Row 4: expected empty list, actual has item -> spurious
-    assert res_df.loc[4, 'Spu: fruits'] == 1
-    assert res_df.loc[4, 'Mis: fruits'] == 0
-    # Row 5: expected ['apple'], actual '-' (empty) -> missing
-    assert res_df.loc[5, 'Mis: fruits'] == 1
-    assert res_df.loc[5, 'Spu: fruits'] == 0
-    # ---- Scalar non-binary field assertions (color) ----
-    assert res_df.loc[0, 'Cor: color'] == 1          # correct
-    assert res_df.loc[1, 'Inc: color'] == 1          # incorrect
-    assert res_df.loc[2, 'Spu: color'] == 1          # spurious
-    assert res_df.loc[3, 'Mis: color'] == 1          # missing
-    # Rows 4 & 5: both sides empty label cases ('-' and ''), treated as labeled empty -> zeros + NaN metrics
-    assert res_df.loc[4, 'Cor: color'] == 0
-    assert res_df.loc[5, 'Cor: color'] == 0
-    # Row 6: numeric string vs number -> match
-    assert res_df.loc[6, 'Cor: color'] == 1
-    assert res_df.loc[6, 'Inc: color'] == 0
-    # Ensure orphan column passed through unchanged
-    assert 'orphan' in res_df.columns
-    expected_columns = [
-        'TP: flag','TN: flag','FP: flag','FN: flag',
-        'Cor: fruits','Mis: fruits','Spu: fruits',
-        'Precision: fruits','Recall: fruits','F1 score: fruits',
-        'Cor: color','Inc: color','Mis: color','Spu: color'
-    ]
-    for col in expected_columns:
-        assert col in res_df.columns, f"Missing column {col} in compare_results_all output"

llmvalidate 0.4.2__tar.gz → 0.4.3__tar.gz

llmvalidate 0.4.2tar.gz → 0.4.3tar.gz