llmvalidate 0.4.2__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmvalidate
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Oncoshot LLM validation framework
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
16
16
  Requires-Python: >=3.11
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
+ Requires-Dist: pandas>=3.0.0
20
+ Requires-Dist: numpy>=2.4.1
21
+ Requires-Dist: pydantic>=2.12.5
22
+ Requires-Dist: tqdm>=4.67.1
23
+ Requires-Dist: python-dateutil>=2.9.0
24
+ Requires-Dist: colorama>=0.4.6
19
25
  Dynamic: license-file
20
26
 
21
27
  # LLM Validation Framework
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
333
339
  ```
334
340
  llm-validation-framework/
335
341
  ├── src/
336
- ├── validation.py # Main validation pipeline and metrics calculation
337
- ├── structured.py # Pydantic data models for LLM results
338
- ├── utils.py # Utility functions (list conversion, flattening)
339
- └── standardize.py # Data standardization helpers
342
+ └── llmvalidate/
343
+ ├── validation.py # Main validation pipeline and metrics calculation
344
+ ├── structured.py # Pydantic data models for LLM results
345
+ ├── utils.py # Utility functions (list conversion, flattening)
346
+ └── standardize.py # Data standardization helpers
340
347
  ├── tests/ # Comprehensive test suite
341
348
  ├── validation_results/ # Output directory (auto-created)
342
349
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -4,10 +4,18 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "llmvalidate"
7
- version = "0.4.2"
7
+ version = "0.4.3"
8
8
  description = "Oncoshot LLM validation framework"
9
9
  readme = "readme.md"
10
10
  requires-python = ">=3.11"
11
+ dependencies = [
12
+ "pandas>=3.0.0",
13
+ "numpy>=2.4.1",
14
+ "pydantic>=2.12.5",
15
+ "tqdm>=4.67.1",
16
+ "python-dateutil>=2.9.0",
17
+ "colorama>=0.4.6"
18
+ ]
11
19
  license = { text = "MIT" }
12
20
 
13
21
  classifiers = [
@@ -25,22 +33,27 @@ classifiers = [
25
33
  "Repository" = "https://github.com/Oncoshot/llm-validation-framework"
26
34
  "Bug Tracker" = "https://github.com/Oncoshot/llm-validation-framework/issues"
27
35
 
36
+ [tool.setuptools]
37
+ package-dir = {"" = "src"}
38
+
28
39
  [tool.setuptools.packages.find]
29
- where = ["."]
40
+ where = ["src"]
41
+ include = ["llmvalidate*"]
42
+ exclude = ["tests*", "docs*", "scripts*"]
30
43
 
31
44
  [tool.pytest.ini_options]
32
45
  pythonpath = [
33
- ".",
34
46
  "src",
35
47
  ]
36
48
 
37
49
  [tool.semantic_release]
38
- version_variable = ["src/__init__.py:__version__"]
50
+ version_variable = ["src/llmvalidate/__init__.py:__version__"]
39
51
  version_toml = ["pyproject.toml:project.version"]
40
52
  branch = "master"
41
53
  allow_zero_version = true
42
54
  build_command = "pip install build && python -m build"
43
55
  upload_to_pypi = true
44
56
  upload_to_release = true
45
-
46
57
  commit_version_number = true
58
+
59
+
@@ -313,10 +313,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
313
313
  ```
314
314
  llm-validation-framework/
315
315
  ├── src/
316
- ├── validation.py # Main validation pipeline and metrics calculation
317
- ├── structured.py # Pydantic data models for LLM results
318
- ├── utils.py # Utility functions (list conversion, flattening)
319
- └── standardize.py # Data standardization helpers
316
+ └── llmvalidate/
317
+ ├── validation.py # Main validation pipeline and metrics calculation
318
+ ├── structured.py # Pydantic data models for LLM results
319
+ ├── utils.py # Utility functions (list conversion, flattening)
320
+ └── standardize.py # Data standardization helpers
320
321
  ├── tests/ # Comprehensive test suite
321
322
  ├── validation_results/ # Output directory (auto-created)
322
323
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -0,0 +1,12 @@
1
+ __version__ = "0.0.0"
2
+
3
+ from .validation import validate, bootstrap_CI
4
+ from .structured import StructuredResult, StructuredGroup, StructuredField
5
+
6
+ __all__ = [
7
+ "validate",
8
+ "bootstrap_CI",
9
+ "StructuredResult",
10
+ "StructuredGroup",
11
+ "StructuredField"
12
+ ]
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel
1
+ from pydantic import BaseModel
2
2
  from typing import Any, List, Dict, Optional, Union
3
3
 
4
4
  # For all LLM Extracted Value
@@ -1,6 +1,6 @@
1
1
  from typing import Any, List, Dict, Optional, Union
2
2
  from ast import literal_eval
3
- from src.structured import StructuredResult
3
+ from .structured import StructuredResult
4
4
  import pandas as pd
5
5
  import re
6
6
  import json
@@ -1,4 +1,4 @@
1
- from datetime import datetime
1
+ from datetime import datetime
2
2
  import math
3
3
  from ast import literal_eval
4
4
  import string
@@ -8,7 +8,7 @@ import time
8
8
  import os
9
9
  import concurrent.futures as cf
10
10
  from tqdm import tqdm
11
- from src.utils import convert_lists, infer_fields
11
+ from .utils import convert_lists, infer_fields
12
12
 
13
13
  def compare_results_binary(expected, actual):
14
14
  """Compares boolean labels and returns confusion matrix counts."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmvalidate
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: Oncoshot LLM validation framework
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
16
16
  Requires-Python: >=3.11
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
+ Requires-Dist: pandas>=3.0.0
20
+ Requires-Dist: numpy>=2.4.1
21
+ Requires-Dist: pydantic>=2.12.5
22
+ Requires-Dist: tqdm>=4.67.1
23
+ Requires-Dist: python-dateutil>=2.9.0
24
+ Requires-Dist: colorama>=0.4.6
19
25
  Dynamic: license-file
20
26
 
21
27
  # LLM Validation Framework
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
333
339
  ```
334
340
  llm-validation-framework/
335
341
  ├── src/
336
- ├── validation.py # Main validation pipeline and metrics calculation
337
- ├── structured.py # Pydantic data models for LLM results
338
- ├── utils.py # Utility functions (list conversion, flattening)
339
- └── standardize.py # Data standardization helpers
342
+ └── llmvalidate/
343
+ ├── validation.py # Main validation pipeline and metrics calculation
344
+ ├── structured.py # Pydantic data models for LLM results
345
+ ├── utils.py # Utility functions (list conversion, flattening)
346
+ └── standardize.py # Data standardization helpers
340
347
  ├── tests/ # Comprehensive test suite
341
348
  ├── validation_results/ # Output directory (auto-created)
342
349
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -0,0 +1,13 @@
1
+ LICENSE
2
+ pyproject.toml
3
+ readme.md
4
+ src/llmvalidate/__init__.py
5
+ src/llmvalidate/standardize.py
6
+ src/llmvalidate/structured.py
7
+ src/llmvalidate/utils.py
8
+ src/llmvalidate/validation.py
9
+ src/llmvalidate.egg-info/PKG-INFO
10
+ src/llmvalidate.egg-info/SOURCES.txt
11
+ src/llmvalidate.egg-info/dependency_links.txt
12
+ src/llmvalidate.egg-info/requires.txt
13
+ src/llmvalidate.egg-info/top_level.txt
@@ -0,0 +1,6 @@
1
+ pandas>=3.0.0
2
+ numpy>=2.4.1
3
+ pydantic>=2.12.5
4
+ tqdm>=4.67.1
5
+ python-dateutil>=2.9.0
6
+ colorama>=0.4.6
@@ -0,0 +1 @@
1
+ llmvalidate
@@ -1,16 +0,0 @@
1
- LICENSE
2
- pyproject.toml
3
- readme.md
4
- llmvalidate.egg-info/PKG-INFO
5
- llmvalidate.egg-info/SOURCES.txt
6
- llmvalidate.egg-info/dependency_links.txt
7
- llmvalidate.egg-info/top_level.txt
8
- src/__init__.py
9
- src/standardize.py
10
- src/structured.py
11
- src/utils.py
12
- src/validation.py
13
- tests/bootstrap_CI_test.py
14
- tests/compare_results_all_test.py
15
- tests/compare_results_test.py
16
- tests/validate_test.py
@@ -1,4 +0,0 @@
1
- dist
2
- src
3
- tests
4
- validation_results
@@ -1 +0,0 @@
1
- __version__ = "0.0.0"
@@ -1,229 +0,0 @@
1
- import pandas as pd
2
- import pytest
3
- import numpy as np
4
- import src.validation as v
5
-
6
-
7
- def test_bootstrap_CI_basic():
8
- """Test basic functionality of bootstrap_CI"""
9
- # Create test data with comparison results
10
- res_df = pd.DataFrame({
11
- 'field1': ['A', 'B', 'A', 'B', 'A'] * 20, # 100 rows total
12
- 'field2': ['X', 'Y', 'X', 'Y', 'X'] * 20,
13
- 'Cor: field1': [1, 0, 1, 1, 0] * 20,
14
- 'Inc: field1': [0, 1, 0, 0, 1] * 20,
15
- 'Mis: field1': [0, 0, 0, 0, 0] * 20,
16
- 'Spu: field1': [0, 1, 0, 0, 1] * 20,
17
- 'Par: field1': [0, 0, 0, 0, 0] * 20,
18
- 'Cor: field2': [1, 1, 0, 1, 1] * 20,
19
- 'Inc: field2': [0, 0, 1, 0, 0] * 20,
20
- 'Mis: field2': [0, 0, 1, 0, 0] * 20,
21
- 'Spu: field2': [0, 0, 0, 0, 0] * 20,
22
- 'Par: field2': [0, 0, 0, 0, 0] * 20,
23
- })
24
-
25
- fields = ['field1', 'field2']
26
- result = v.bootstrap_CI(res_df, fields, n_bootstrap=100, random_state=42)
27
-
28
- # Check output format
29
- assert 'field' in result.columns
30
- assert len(result) == 4 # Two fields + exceptions field + N/CI info row
31
- expected_fields = {'field1', 'field2', 'exceptions', 'N=100; CI=95%'}
32
- assert set(result['field']) == expected_fields
33
-
34
- # Check that confidence interval columns are present for metrics that exist in our data
35
- # We know these will be present because we've included them in our test data
36
- core_metrics = ['field-present cases', 'cor', 'inc', 'mis', 'spu', 'par']
37
-
38
- # 'labeled cases' is handled specially - just appears as 'labeled cases'
39
- assert 'labeled cases' in result.columns
40
-
41
- for metric in core_metrics:
42
- assert f'{metric}: mean' in result.columns
43
- assert f'{metric}: lower' in result.columns
44
- assert f'{metric}: upper' in result.columns
45
-
46
- # Check that means are reasonable (between lower and upper bounds) for non-exception fields
47
- for _, row in result.iterrows():
48
- if row['field'] in ['exceptions', 'N=100; CI=95%']: # Skip exceptions and info row
49
- for metric in core_metrics:
50
- mean_col = f'{metric}: mean'
51
- lower_col = f'{metric}: lower'
52
- upper_col = f'{metric}: upper'
53
-
54
- if pd.notna(row[mean_col]) and pd.notna(row[lower_col]) and pd.notna(row[upper_col]):
55
- assert row[lower_col] <= row[mean_col] <= row[upper_col], \
56
- f"Mean not between bounds for {row['field']} {metric}"
57
-
58
-
59
- def test_bootstrap_CI_error_conditions():
60
- """Test error conditions for bootstrap_CI"""
61
-
62
- # Test with ci outside valid range
63
- res_df = pd.DataFrame({
64
- 'field1': [1, 2, 3],
65
- 'Cor: field1': [1, 0, 1]
66
- })
67
-
68
- with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
69
- v.bootstrap_CI(res_df, ['field1'], ci=1.5)
70
-
71
- with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
72
- v.bootstrap_CI(res_df, ['field1'], ci=0)
73
-
74
- # Test with too few rows
75
- single_row_df = pd.DataFrame({
76
- 'field1': [1],
77
- 'Cor: field1': [1]
78
- })
79
-
80
- with pytest.raises(ValueError, match="Need at least 2 rows"):
81
- v.bootstrap_CI(single_row_df, ['field1'])
82
-
83
- # Test with missing labels (NaN values)
84
- res_df_with_nan = pd.DataFrame({
85
- 'field1': [1, np.nan, 3],
86
- 'field2': [1, 2, 3],
87
- 'Cor: field1': [1, 0, 1],
88
- 'Cor: field2': [0, 1, 0]
89
- })
90
-
91
- with pytest.raises(ValueError, match="Missing labels \\(NaN\\) found in the following fields: \\['field1'\\]"):
92
- v.bootstrap_CI(res_df_with_nan, ['field1', 'field2'])
93
-
94
-
95
- def test_bootstrap_CI_binary_field():
96
- """Test bootstrap_CI with binary field metrics"""
97
- # Create test data with binary field results
98
- res_df = pd.DataFrame({
99
- 'binary_field': [True, False, True, False] * 25, # 100 rows
100
- 'TP: binary_field': [1, 0, 1, 0] * 25,
101
- 'FP: binary_field': [0, 1, 0, 1] * 25,
102
- 'FN: binary_field': [0, 0, 0, 0] * 25,
103
- 'TN: binary_field': [0, 1, 0, 1] * 25,
104
- 'Precision: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
105
- 'Recall: binary_field': [1.0, np.nan, 1.0, np.nan] * 25,
106
- 'F1 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
107
- 'F2 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
108
- })
109
-
110
- fields = ['binary_field']
111
- result = v.bootstrap_CI(res_df, fields, n_bootstrap=50, random_state=42)
112
-
113
- # Check that binary metrics are included
114
- binary_metrics = ['TP', 'FP', 'FN', 'TN', 'precision (micro)', 'recall (micro)',
115
- 'F1 score (micro)', 'F2 score (micro)', 'accuracy (micro)', 'specificity (micro)']
116
-
117
- for metric in binary_metrics:
118
- assert f'{metric}: mean' in result.columns
119
- assert f'{metric}: lower' in result.columns
120
- assert f'{metric}: upper' in result.columns
121
-
122
- # Check that N/CI info row is present
123
- info_rows = result[result['field'].str.startswith('N=')]
124
- assert len(info_rows) == 1
125
-
126
-
127
- def test_bootstrap_CI_output_format():
128
- """Test that output format matches specification"""
129
- res_df = pd.DataFrame({
130
- 'test_field': ['A', 'B'] * 50, # 100 rows
131
- 'Cor: test_field': [1, 0] * 50,
132
- 'Inc: test_field': [0, 1] * 50,
133
- 'Mis: test_field': [0, 0] * 50,
134
- 'Spu: test_field': [0, 1] * 50,
135
- 'Par: test_field': [0, 0] * 50,
136
- })
137
-
138
- result = v.bootstrap_CI(res_df, ['test_field'], n_bootstrap=10, random_state=42)
139
-
140
- # Check that result has the correct format
141
- assert len(result) == 3 # One field + exceptions + N/CI info row
142
- test_field_row = result[result['field'] == 'test_field'].iloc[0]
143
- assert test_field_row['field'] == 'test_field'
144
-
145
- # Check that columns follow the expected pattern
146
- metric_columns = [col for col in result.columns if col not in ['field', 'labeled cases']]
147
- for col in metric_columns:
148
- assert ': ' in col, f"Column {col} doesn't follow expected format"
149
- metric_name, stat_type = col.split(': ', 1)
150
- assert stat_type in ['mean', 'lower', 'upper'], f"Unexpected stat type in {col}"
151
-
152
-
153
- def test_bootstrap_CI_confidence_intervals():
154
- """Test that confidence intervals make sense"""
155
- # Create deterministic test case
156
- res_df = pd.DataFrame({
157
- 'field1': [1] * 100,
158
- 'Cor: field1': [5] * 100, # Constant values for predictable CI
159
- 'Inc: field1': [0] * 100,
160
- 'Mis: field1': [0] * 100,
161
- 'Spu: field1': [0] * 100,
162
- 'Par: field1': [0] * 100,
163
- })
164
-
165
- result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=100, ci=0.95, random_state=42)
166
-
167
- # For constant values, mean should equal the sum (get_metrics sums the values)
168
- row = result[result['field'] == 'field1'].iloc[0]
169
-
170
- # Check that mean is close to expected value (5 * 100 = 500)
171
- assert abs(row['cor: mean'] - 500.0) < 10.0
172
-
173
- # Check that CI bounds are reasonable (close to mean for constant data)
174
- assert abs(row['cor: lower'] - row['cor: mean']) < 50.0
175
- assert abs(row['cor: upper'] - row['cor: mean']) < 50.0
176
-
177
-
178
- def test_bootstrap_CI_with_different_ci_levels():
179
- """Test bootstrap_CI with different confidence interval levels"""
180
- res_df = pd.DataFrame({
181
- 'field1': [1, 2, 3] * 34, # ~100 rows
182
- 'Cor: field1': [1, 2, 1] * 34,
183
- 'Inc: field1': [0, 1, 0] * 34,
184
- 'Mis: field1': [0, 0, 1] * 34,
185
- 'Spu: field1': [1, 0, 0] * 34,
186
- 'Par: field1': [0, 0, 0] * 34,
187
- })
188
-
189
- # Test 90% CI
190
- result_90 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.90, random_state=42)
191
-
192
- # Test 99% CI
193
- result_99 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.99, random_state=42)
194
-
195
- # 99% CI should be wider than 90% CI
196
- row_90 = result_90[result_90['field'] == 'field1'].iloc[0]
197
- row_99 = result_99[result_99['field'] == 'field1'].iloc[0]
198
-
199
- width_90 = row_90['cor: upper'] - row_90['cor: lower']
200
- width_99 = row_99['cor: upper'] - row_99['cor: lower']
201
-
202
- assert width_99 >= width_90, "99% CI should be wider than 90% CI"
203
-
204
-
205
- def test_bootstrap_CI_empty_metrics():
206
- """Test bootstrap_CI handles missing values correctly"""
207
- # Create simpler test data that focuses on core functionality
208
- res_df = pd.DataFrame({
209
- 'field1': [1, 2, 3] * 34,
210
- 'Cor: field1': [0, 1, 2] * 34, # Valid values
211
- 'Inc: field1': [0, 0, 0] * 34,
212
- 'Mis: field1': [0, 0, 0] * 34,
213
- 'Spu: field1': [0, 0, 0] * 34,
214
- 'Par: field1': [0, 0, 0] * 34,
215
- })
216
-
217
- result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=10, random_state=42)
218
-
219
- # Check that core metrics appear in output
220
- core_cols = [col for col in result.columns if 'cor:' in col]
221
- assert len(core_cols) == 3, f"Expected 3 cor metrics (mean, lower, upper), got {len(core_cols)}: {core_cols}"
222
-
223
- # Check that the function completes without errors for this simpler case
224
- assert 'field' in result.columns
225
- assert len(result) >= 2 # At least exceptions + field1 + N/CI info row
226
-
227
- # Check that N/CI info row is present
228
- info_rows = result[result['field'].str.startswith('N=')]
229
- assert len(info_rows) == 1
@@ -1,165 +0,0 @@
1
- import math
2
- import pandas as pd
3
- import pytest
4
- import src.validation as v
5
- pd.options.display.width = 0
6
-
7
- def get_test_df(addconfidence):
8
- flag = [True, True, False, False, True, False, True, True]
9
- res_flag = [True, False, True, False, False, False, True, False]
10
-
11
- fruits = [
12
- ['apple', 'banana'],
13
- ['apple'],
14
- '-',
15
- ['cherry'],
16
- [],
17
- ['apple'],
18
- ['apple', 'banana'],
19
- None
20
- ]
21
- res_fruits = [
22
- ['apple', 'cherry'],
23
- ['apple', 'banana'],
24
- [],
25
- ['cherry'],
26
- ['apple'],
27
- '-',
28
- ['banana'],
29
- ['apple']
30
- ]
31
- res_fruits_confidence = [
32
- 'High',
33
- 'Low',
34
- 'High',
35
- None,
36
- 'NA',
37
- 'Low',
38
- 'High',
39
- 'High'
40
- ]
41
-
42
- color = [
43
- 'red',
44
- 'blue',
45
- '-',
46
- 'green',
47
- '-',
48
- '-',
49
- '4',
50
- None
51
- ]
52
- res_color = [
53
- 'red',
54
- 'green',
55
- 'yellow',
56
- '-',
57
- '',
58
- '-',
59
- 4,
60
- 'red'
61
- ]
62
- res_color_confidence = [
63
- 'High',
64
- 'Low',
65
- '-',
66
- 'High',
67
- '',
68
- 'High',
69
- 'Low',
70
- 'High'
71
- ]
72
-
73
- orphan = ['x','y','z','w','-','',None, None]
74
-
75
- df = pd.DataFrame({
76
- 'flag': flag,
77
- 'Res: flag': res_flag,
78
- 'fruits': fruits,
79
- 'Res: fruits': res_fruits,
80
- 'orphan': orphan,
81
- 'color': color,
82
- 'Res: color': res_color
83
- })
84
-
85
- if addconfidence:
86
- # Insert after 'Res: fruits'
87
- pos_fruits = df.columns.get_loc('Res: fruits')
88
- df.insert(pos_fruits + 1, 'Res: fruits confidence', res_fruits_confidence)
89
- # Insert after 'Res: color'
90
- pos_color = df.columns.get_loc('Res: color') # recompute after previous insert
91
- df.insert(pos_color + 1, 'Res: color confidence', res_color_confidence)
92
-
93
- return df
94
-
95
- def _is_none_or_nan(x):
96
- return x is None or (isinstance(x, float) and math.isnan(x))
97
-
98
- def test_compare_results_all_mixed_fields():
99
- df = get_test_df(False)
100
-
101
- res_df = v.compare_results_all(df, ['flag', 'fruits', 'color'])
102
-
103
- # ---- Binary field assertions (flag) ----
104
- # Row 0: TP
105
- assert res_df.loc[0, 'TP: flag'] == 1
106
- # Row 1: FN
107
- assert res_df.loc[1, 'FN: flag'] == 1
108
- assert res_df.loc[2, 'FP: flag'] == 1
109
- assert res_df.loc[3, 'TN: flag'] == 1
110
-
111
- # ---- List field assertions (fruits) ----
112
- # Row 0 mixed
113
- assert res_df.loc[0, 'Cor: fruits'] == 1
114
- assert res_df.loc[0, 'Mis: fruits'] == 1
115
- assert res_df.loc[0, 'Spu: fruits'] == 1
116
- assert res_df.loc[0, 'Precision: fruits'] == pytest.approx(0.5)
117
- assert res_df.loc[0, 'Recall: fruits'] == pytest.approx(0.5)
118
- assert res_df.loc[0, 'F1 score: fruits'] == pytest.approx(0.5)
119
-
120
- # Row 1: one correct + one spurious
121
- assert res_df.loc[1, 'Cor: fruits'] == 1
122
- assert res_df.loc[1, 'Spu: fruits'] == 1
123
- assert res_df.loc[1, 'Precision: fruits'] == pytest.approx(0.5)
124
- assert res_df.loc[1, 'Recall: fruits'] == pytest.approx(1.0)
125
-
126
- # Row 2: expected '-' vs [] => zeros, metrics NaN
127
- assert res_df.loc[2, 'Cor: fruits'] == 0
128
- assert math.isnan(res_df.loc[2, 'Precision: fruits'])
129
-
130
- # Row 3: perfect
131
- assert res_df.loc[3, 'Cor: fruits'] == 1
132
- assert res_df.loc[3, 'Precision: fruits'] == pytest.approx(1.0)
133
-
134
- # Row 4: expected empty list, actual has item -> spurious
135
- assert res_df.loc[4, 'Spu: fruits'] == 1
136
- assert res_df.loc[4, 'Mis: fruits'] == 0
137
-
138
- # Row 5: expected ['apple'], actual '-' (empty) -> missing
139
- assert res_df.loc[5, 'Mis: fruits'] == 1
140
- assert res_df.loc[5, 'Spu: fruits'] == 0
141
-
142
- # ---- Scalar non-binary field assertions (color) ----
143
- assert res_df.loc[0, 'Cor: color'] == 1 # correct
144
- assert res_df.loc[1, 'Inc: color'] == 1 # incorrect
145
- assert res_df.loc[2, 'Spu: color'] == 1 # spurious
146
- assert res_df.loc[3, 'Mis: color'] == 1 # missing
147
- # Rows 4 & 5: both sides empty label cases ('-' and ''), treated as labeled empty -> zeros + NaN metrics
148
- assert res_df.loc[4, 'Cor: color'] == 0
149
- assert res_df.loc[5, 'Cor: color'] == 0
150
- # Row 6: numeric string vs number -> match
151
- assert res_df.loc[6, 'Cor: color'] == 1
152
- assert res_df.loc[6, 'Inc: color'] == 0
153
-
154
- # Ensure orphan column passed through unchanged
155
- assert 'orphan' in res_df.columns
156
-
157
- expected_columns = [
158
- 'TP: flag','TN: flag','FP: flag','FN: flag',
159
- 'Cor: fruits','Mis: fruits','Spu: fruits',
160
- 'Precision: fruits','Recall: fruits','F1 score: fruits',
161
- 'Cor: color','Inc: color','Mis: color','Spu: color'
162
- ]
163
- for col in expected_columns:
164
- assert col in res_df.columns, f"Missing column {col} in compare_results_all output"
165
-