llmvalidate 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmvalidate-0.4.2 → llmvalidate-0.4.3}/PKG-INFO +12 -5
- {llmvalidate-0.4.2 → llmvalidate-0.4.3}/pyproject.toml +18 -5
- {llmvalidate-0.4.2 → llmvalidate-0.4.3}/readme.md +5 -4
- llmvalidate-0.4.3/src/llmvalidate/__init__.py +12 -0
- {llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/structured.py +1 -1
- {llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/utils.py +1 -1
- {llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/validation.py +2 -2
- {llmvalidate-0.4.2 → llmvalidate-0.4.3/src}/llmvalidate.egg-info/PKG-INFO +12 -5
- llmvalidate-0.4.3/src/llmvalidate.egg-info/SOURCES.txt +13 -0
- llmvalidate-0.4.3/src/llmvalidate.egg-info/requires.txt +6 -0
- llmvalidate-0.4.3/src/llmvalidate.egg-info/top_level.txt +1 -0
- llmvalidate-0.4.2/llmvalidate.egg-info/SOURCES.txt +0 -16
- llmvalidate-0.4.2/llmvalidate.egg-info/top_level.txt +0 -4
- llmvalidate-0.4.2/src/__init__.py +0 -1
- llmvalidate-0.4.2/tests/bootstrap_CI_test.py +0 -229
- llmvalidate-0.4.2/tests/compare_results_all_test.py +0 -165
- llmvalidate-0.4.2/tests/compare_results_test.py +0 -96
- llmvalidate-0.4.2/tests/validate_test.py +0 -758
- {llmvalidate-0.4.2 → llmvalidate-0.4.3}/LICENSE +0 -0
- {llmvalidate-0.4.2 → llmvalidate-0.4.3}/setup.cfg +0 -0
- {llmvalidate-0.4.2/src → llmvalidate-0.4.3/src/llmvalidate}/standardize.py +0 -0
- {llmvalidate-0.4.2 → llmvalidate-0.4.3/src}/llmvalidate.egg-info/dependency_links.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llmvalidate
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Oncoshot LLM validation framework
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
|
|
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
|
|
|
16
16
|
Requires-Python: >=3.11
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=3.0.0
|
|
20
|
+
Requires-Dist: numpy>=2.4.1
|
|
21
|
+
Requires-Dist: pydantic>=2.12.5
|
|
22
|
+
Requires-Dist: tqdm>=4.67.1
|
|
23
|
+
Requires-Dist: python-dateutil>=2.9.0
|
|
24
|
+
Requires-Dist: colorama>=0.4.6
|
|
19
25
|
Dynamic: license-file
|
|
20
26
|
|
|
21
27
|
# LLM Validation Framework
|
|
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
333
339
|
```
|
|
334
340
|
llm-validation-framework/
|
|
335
341
|
├── src/
|
|
336
|
-
│
|
|
337
|
-
│
|
|
338
|
-
│
|
|
339
|
-
│
|
|
342
|
+
│ └── llmvalidate/
|
|
343
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
344
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
345
|
+
│ ├── utils.py # Utility functions (list conversion, flattening)
|
|
346
|
+
│ └── standardize.py # Data standardization helpers
|
|
340
347
|
├── tests/ # Comprehensive test suite
|
|
341
348
|
├── validation_results/ # Output directory (auto-created)
|
|
342
349
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -4,10 +4,18 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "llmvalidate"
|
|
7
|
-
version = "0.4.2"
|
|
7
|
+
version = "0.4.3"
|
|
8
8
|
description = "Oncoshot LLM validation framework"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas>=3.0.0",
|
|
13
|
+
"numpy>=2.4.1",
|
|
14
|
+
"pydantic>=2.12.5",
|
|
15
|
+
"tqdm>=4.67.1",
|
|
16
|
+
"python-dateutil>=2.9.0",
|
|
17
|
+
"colorama>=0.4.6"
|
|
18
|
+
]
|
|
11
19
|
license = { text = "MIT" }
|
|
12
20
|
|
|
13
21
|
classifiers = [
|
|
@@ -25,22 +33,27 @@ classifiers = [
|
|
|
25
33
|
"Repository" = "https://github.com/Oncoshot/llm-validation-framework"
|
|
26
34
|
"Bug Tracker" = "https://github.com/Oncoshot/llm-validation-framework/issues"
|
|
27
35
|
|
|
36
|
+
[tool.setuptools]
|
|
37
|
+
package-dir = {"" = "src"}
|
|
38
|
+
|
|
28
39
|
[tool.setuptools.packages.find]
|
|
29
|
-
where = ["
|
|
40
|
+
where = ["src"]
|
|
41
|
+
include = ["llmvalidate*"]
|
|
42
|
+
exclude = ["tests*", "docs*", "scripts*"]
|
|
30
43
|
|
|
31
44
|
[tool.pytest.ini_options]
|
|
32
45
|
pythonpath = [
|
|
33
|
-
".",
|
|
34
46
|
"src",
|
|
35
47
|
]
|
|
36
48
|
|
|
37
49
|
[tool.semantic_release]
|
|
38
|
-
version_variable = ["src/__init__.py:__version__"]
|
|
50
|
+
version_variable = ["src/llmvalidate/__init__.py:__version__"]
|
|
39
51
|
version_toml = ["pyproject.toml:project.version"]
|
|
40
52
|
branch = "master"
|
|
41
53
|
allow_zero_version = true
|
|
42
54
|
build_command = "pip install build && python -m build"
|
|
43
55
|
upload_to_pypi = true
|
|
44
56
|
upload_to_release = true
|
|
45
|
-
|
|
46
57
|
commit_version_number = true
|
|
58
|
+
|
|
59
|
+
|
|
@@ -313,10 +313,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
313
313
|
```
|
|
314
314
|
llm-validation-framework/
|
|
315
315
|
├── src/
|
|
316
|
-
│
|
|
317
|
-
│
|
|
318
|
-
│
|
|
319
|
-
│
|
|
316
|
+
│ └── llmvalidate/
|
|
317
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
318
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
319
|
+
│ ├── utils.py # Utility functions (list conversion, flattening)
|
|
320
|
+
│ └── standardize.py # Data standardization helpers
|
|
320
321
|
├── tests/ # Comprehensive test suite
|
|
321
322
|
├── validation_results/ # Output directory (auto-created)
|
|
322
323
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
__version__ = "0.0.0"
|
|
2
|
+
|
|
3
|
+
from .validation import validate, bootstrap_CI
|
|
4
|
+
from .structured import StructuredResult, StructuredGroup, StructuredField
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"validate",
|
|
8
|
+
"bootstrap_CI",
|
|
9
|
+
"StructuredResult",
|
|
10
|
+
"StructuredGroup",
|
|
11
|
+
"StructuredField"
|
|
12
|
+
]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
from datetime import datetime
|
|
2
2
|
import math
|
|
3
3
|
from ast import literal_eval
|
|
4
4
|
import string
|
|
@@ -8,7 +8,7 @@ import time
|
|
|
8
8
|
import os
|
|
9
9
|
import concurrent.futures as cf
|
|
10
10
|
from tqdm import tqdm
|
|
11
|
-
from
|
|
11
|
+
from .utils import convert_lists, infer_fields
|
|
12
12
|
|
|
13
13
|
def compare_results_binary(expected, actual):
|
|
14
14
|
"""Compares boolean labels and returns confusion matrix counts."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llmvalidate
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Oncoshot LLM validation framework
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
|
|
@@ -16,6 +16,12 @@ Classifier: Intended Audience :: Developers
|
|
|
16
16
|
Requires-Python: >=3.11
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=3.0.0
|
|
20
|
+
Requires-Dist: numpy>=2.4.1
|
|
21
|
+
Requires-Dist: pydantic>=2.12.5
|
|
22
|
+
Requires-Dist: tqdm>=4.67.1
|
|
23
|
+
Requires-Dist: python-dateutil>=2.9.0
|
|
24
|
+
Requires-Dist: colorama>=0.4.6
|
|
19
25
|
Dynamic: license-file
|
|
20
26
|
|
|
21
27
|
# LLM Validation Framework
|
|
@@ -333,10 +339,11 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
333
339
|
```
|
|
334
340
|
llm-validation-framework/
|
|
335
341
|
├── src/
|
|
336
|
-
│
|
|
337
|
-
│
|
|
338
|
-
│
|
|
339
|
-
│
|
|
342
|
+
│ └── llmvalidate/
|
|
343
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
344
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
345
|
+
│ ├── utils.py # Utility functions (list conversion, flattening)
|
|
346
|
+
│ └── standardize.py # Data standardization helpers
|
|
340
347
|
├── tests/ # Comprehensive test suite
|
|
341
348
|
├── validation_results/ # Output directory (auto-created)
|
|
342
349
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
pyproject.toml
|
|
3
|
+
readme.md
|
|
4
|
+
src/llmvalidate/__init__.py
|
|
5
|
+
src/llmvalidate/standardize.py
|
|
6
|
+
src/llmvalidate/structured.py
|
|
7
|
+
src/llmvalidate/utils.py
|
|
8
|
+
src/llmvalidate/validation.py
|
|
9
|
+
src/llmvalidate.egg-info/PKG-INFO
|
|
10
|
+
src/llmvalidate.egg-info/SOURCES.txt
|
|
11
|
+
src/llmvalidate.egg-info/dependency_links.txt
|
|
12
|
+
src/llmvalidate.egg-info/requires.txt
|
|
13
|
+
src/llmvalidate.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llmvalidate
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
pyproject.toml
|
|
3
|
-
readme.md
|
|
4
|
-
llmvalidate.egg-info/PKG-INFO
|
|
5
|
-
llmvalidate.egg-info/SOURCES.txt
|
|
6
|
-
llmvalidate.egg-info/dependency_links.txt
|
|
7
|
-
llmvalidate.egg-info/top_level.txt
|
|
8
|
-
src/__init__.py
|
|
9
|
-
src/standardize.py
|
|
10
|
-
src/structured.py
|
|
11
|
-
src/utils.py
|
|
12
|
-
src/validation.py
|
|
13
|
-
tests/bootstrap_CI_test.py
|
|
14
|
-
tests/compare_results_all_test.py
|
|
15
|
-
tests/compare_results_test.py
|
|
16
|
-
tests/validate_test.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.0"
|
|
@@ -1,229 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import pytest
|
|
3
|
-
import numpy as np
|
|
4
|
-
import src.validation as v
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def test_bootstrap_CI_basic():
|
|
8
|
-
"""Test basic functionality of bootstrap_CI"""
|
|
9
|
-
# Create test data with comparison results
|
|
10
|
-
res_df = pd.DataFrame({
|
|
11
|
-
'field1': ['A', 'B', 'A', 'B', 'A'] * 20, # 100 rows total
|
|
12
|
-
'field2': ['X', 'Y', 'X', 'Y', 'X'] * 20,
|
|
13
|
-
'Cor: field1': [1, 0, 1, 1, 0] * 20,
|
|
14
|
-
'Inc: field1': [0, 1, 0, 0, 1] * 20,
|
|
15
|
-
'Mis: field1': [0, 0, 0, 0, 0] * 20,
|
|
16
|
-
'Spu: field1': [0, 1, 0, 0, 1] * 20,
|
|
17
|
-
'Par: field1': [0, 0, 0, 0, 0] * 20,
|
|
18
|
-
'Cor: field2': [1, 1, 0, 1, 1] * 20,
|
|
19
|
-
'Inc: field2': [0, 0, 1, 0, 0] * 20,
|
|
20
|
-
'Mis: field2': [0, 0, 1, 0, 0] * 20,
|
|
21
|
-
'Spu: field2': [0, 0, 0, 0, 0] * 20,
|
|
22
|
-
'Par: field2': [0, 0, 0, 0, 0] * 20,
|
|
23
|
-
})
|
|
24
|
-
|
|
25
|
-
fields = ['field1', 'field2']
|
|
26
|
-
result = v.bootstrap_CI(res_df, fields, n_bootstrap=100, random_state=42)
|
|
27
|
-
|
|
28
|
-
# Check output format
|
|
29
|
-
assert 'field' in result.columns
|
|
30
|
-
assert len(result) == 4 # Two fields + exceptions field + N/CI info row
|
|
31
|
-
expected_fields = {'field1', 'field2', 'exceptions', 'N=100; CI=95%'}
|
|
32
|
-
assert set(result['field']) == expected_fields
|
|
33
|
-
|
|
34
|
-
# Check that confidence interval columns are present for metrics that exist in our data
|
|
35
|
-
# We know these will be present because we've included them in our test data
|
|
36
|
-
core_metrics = ['field-present cases', 'cor', 'inc', 'mis', 'spu', 'par']
|
|
37
|
-
|
|
38
|
-
# 'labeled cases' is handled specially - just appears as 'labeled cases'
|
|
39
|
-
assert 'labeled cases' in result.columns
|
|
40
|
-
|
|
41
|
-
for metric in core_metrics:
|
|
42
|
-
assert f'{metric}: mean' in result.columns
|
|
43
|
-
assert f'{metric}: lower' in result.columns
|
|
44
|
-
assert f'{metric}: upper' in result.columns
|
|
45
|
-
|
|
46
|
-
# Check that means are reasonable (between lower and upper bounds) for non-exception fields
|
|
47
|
-
for _, row in result.iterrows():
|
|
48
|
-
if row['field'] in ['exceptions', 'N=100; CI=95%']: # Skip exceptions and info row
|
|
49
|
-
for metric in core_metrics:
|
|
50
|
-
mean_col = f'{metric}: mean'
|
|
51
|
-
lower_col = f'{metric}: lower'
|
|
52
|
-
upper_col = f'{metric}: upper'
|
|
53
|
-
|
|
54
|
-
if pd.notna(row[mean_col]) and pd.notna(row[lower_col]) and pd.notna(row[upper_col]):
|
|
55
|
-
assert row[lower_col] <= row[mean_col] <= row[upper_col], \
|
|
56
|
-
f"Mean not between bounds for {row['field']} {metric}"
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def test_bootstrap_CI_error_conditions():
|
|
60
|
-
"""Test error conditions for bootstrap_CI"""
|
|
61
|
-
|
|
62
|
-
# Test with ci outside valid range
|
|
63
|
-
res_df = pd.DataFrame({
|
|
64
|
-
'field1': [1, 2, 3],
|
|
65
|
-
'Cor: field1': [1, 0, 1]
|
|
66
|
-
})
|
|
67
|
-
|
|
68
|
-
with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
|
|
69
|
-
v.bootstrap_CI(res_df, ['field1'], ci=1.5)
|
|
70
|
-
|
|
71
|
-
with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
|
|
72
|
-
v.bootstrap_CI(res_df, ['field1'], ci=0)
|
|
73
|
-
|
|
74
|
-
# Test with too few rows
|
|
75
|
-
single_row_df = pd.DataFrame({
|
|
76
|
-
'field1': [1],
|
|
77
|
-
'Cor: field1': [1]
|
|
78
|
-
})
|
|
79
|
-
|
|
80
|
-
with pytest.raises(ValueError, match="Need at least 2 rows"):
|
|
81
|
-
v.bootstrap_CI(single_row_df, ['field1'])
|
|
82
|
-
|
|
83
|
-
# Test with missing labels (NaN values)
|
|
84
|
-
res_df_with_nan = pd.DataFrame({
|
|
85
|
-
'field1': [1, np.nan, 3],
|
|
86
|
-
'field2': [1, 2, 3],
|
|
87
|
-
'Cor: field1': [1, 0, 1],
|
|
88
|
-
'Cor: field2': [0, 1, 0]
|
|
89
|
-
})
|
|
90
|
-
|
|
91
|
-
with pytest.raises(ValueError, match="Missing labels \\(NaN\\) found in the following fields: \\['field1'\\]"):
|
|
92
|
-
v.bootstrap_CI(res_df_with_nan, ['field1', 'field2'])
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def test_bootstrap_CI_binary_field():
|
|
96
|
-
"""Test bootstrap_CI with binary field metrics"""
|
|
97
|
-
# Create test data with binary field results
|
|
98
|
-
res_df = pd.DataFrame({
|
|
99
|
-
'binary_field': [True, False, True, False] * 25, # 100 rows
|
|
100
|
-
'TP: binary_field': [1, 0, 1, 0] * 25,
|
|
101
|
-
'FP: binary_field': [0, 1, 0, 1] * 25,
|
|
102
|
-
'FN: binary_field': [0, 0, 0, 0] * 25,
|
|
103
|
-
'TN: binary_field': [0, 1, 0, 1] * 25,
|
|
104
|
-
'Precision: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
105
|
-
'Recall: binary_field': [1.0, np.nan, 1.0, np.nan] * 25,
|
|
106
|
-
'F1 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
107
|
-
'F2 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
108
|
-
})
|
|
109
|
-
|
|
110
|
-
fields = ['binary_field']
|
|
111
|
-
result = v.bootstrap_CI(res_df, fields, n_bootstrap=50, random_state=42)
|
|
112
|
-
|
|
113
|
-
# Check that binary metrics are included
|
|
114
|
-
binary_metrics = ['TP', 'FP', 'FN', 'TN', 'precision (micro)', 'recall (micro)',
|
|
115
|
-
'F1 score (micro)', 'F2 score (micro)', 'accuracy (micro)', 'specificity (micro)']
|
|
116
|
-
|
|
117
|
-
for metric in binary_metrics:
|
|
118
|
-
assert f'{metric}: mean' in result.columns
|
|
119
|
-
assert f'{metric}: lower' in result.columns
|
|
120
|
-
assert f'{metric}: upper' in result.columns
|
|
121
|
-
|
|
122
|
-
# Check that N/CI info row is present
|
|
123
|
-
info_rows = result[result['field'].str.startswith('N=')]
|
|
124
|
-
assert len(info_rows) == 1
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
def test_bootstrap_CI_output_format():
|
|
128
|
-
"""Test that output format matches specification"""
|
|
129
|
-
res_df = pd.DataFrame({
|
|
130
|
-
'test_field': ['A', 'B'] * 50, # 100 rows
|
|
131
|
-
'Cor: test_field': [1, 0] * 50,
|
|
132
|
-
'Inc: test_field': [0, 1] * 50,
|
|
133
|
-
'Mis: test_field': [0, 0] * 50,
|
|
134
|
-
'Spu: test_field': [0, 1] * 50,
|
|
135
|
-
'Par: test_field': [0, 0] * 50,
|
|
136
|
-
})
|
|
137
|
-
|
|
138
|
-
result = v.bootstrap_CI(res_df, ['test_field'], n_bootstrap=10, random_state=42)
|
|
139
|
-
|
|
140
|
-
# Check that result has the correct format
|
|
141
|
-
assert len(result) == 3 # One field + exceptions + N/CI info row
|
|
142
|
-
test_field_row = result[result['field'] == 'test_field'].iloc[0]
|
|
143
|
-
assert test_field_row['field'] == 'test_field'
|
|
144
|
-
|
|
145
|
-
# Check that columns follow the expected pattern
|
|
146
|
-
metric_columns = [col for col in result.columns if col not in ['field', 'labeled cases']]
|
|
147
|
-
for col in metric_columns:
|
|
148
|
-
assert ': ' in col, f"Column {col} doesn't follow expected format"
|
|
149
|
-
metric_name, stat_type = col.split(': ', 1)
|
|
150
|
-
assert stat_type in ['mean', 'lower', 'upper'], f"Unexpected stat type in {col}"
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def test_bootstrap_CI_confidence_intervals():
|
|
154
|
-
"""Test that confidence intervals make sense"""
|
|
155
|
-
# Create deterministic test case
|
|
156
|
-
res_df = pd.DataFrame({
|
|
157
|
-
'field1': [1] * 100,
|
|
158
|
-
'Cor: field1': [5] * 100, # Constant values for predictable CI
|
|
159
|
-
'Inc: field1': [0] * 100,
|
|
160
|
-
'Mis: field1': [0] * 100,
|
|
161
|
-
'Spu: field1': [0] * 100,
|
|
162
|
-
'Par: field1': [0] * 100,
|
|
163
|
-
})
|
|
164
|
-
|
|
165
|
-
result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=100, ci=0.95, random_state=42)
|
|
166
|
-
|
|
167
|
-
# For constant values, mean should equal the sum (get_metrics sums the values)
|
|
168
|
-
row = result[result['field'] == 'field1'].iloc[0]
|
|
169
|
-
|
|
170
|
-
# Check that mean is close to expected value (5 * 100 = 500)
|
|
171
|
-
assert abs(row['cor: mean'] - 500.0) < 10.0
|
|
172
|
-
|
|
173
|
-
# Check that CI bounds are reasonable (close to mean for constant data)
|
|
174
|
-
assert abs(row['cor: lower'] - row['cor: mean']) < 50.0
|
|
175
|
-
assert abs(row['cor: upper'] - row['cor: mean']) < 50.0
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
def test_bootstrap_CI_with_different_ci_levels():
|
|
179
|
-
"""Test bootstrap_CI with different confidence interval levels"""
|
|
180
|
-
res_df = pd.DataFrame({
|
|
181
|
-
'field1': [1, 2, 3] * 34, # ~100 rows
|
|
182
|
-
'Cor: field1': [1, 2, 1] * 34,
|
|
183
|
-
'Inc: field1': [0, 1, 0] * 34,
|
|
184
|
-
'Mis: field1': [0, 0, 1] * 34,
|
|
185
|
-
'Spu: field1': [1, 0, 0] * 34,
|
|
186
|
-
'Par: field1': [0, 0, 0] * 34,
|
|
187
|
-
})
|
|
188
|
-
|
|
189
|
-
# Test 90% CI
|
|
190
|
-
result_90 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.90, random_state=42)
|
|
191
|
-
|
|
192
|
-
# Test 99% CI
|
|
193
|
-
result_99 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.99, random_state=42)
|
|
194
|
-
|
|
195
|
-
# 99% CI should be wider than 90% CI
|
|
196
|
-
row_90 = result_90[result_90['field'] == 'field1'].iloc[0]
|
|
197
|
-
row_99 = result_99[result_99['field'] == 'field1'].iloc[0]
|
|
198
|
-
|
|
199
|
-
width_90 = row_90['cor: upper'] - row_90['cor: lower']
|
|
200
|
-
width_99 = row_99['cor: upper'] - row_99['cor: lower']
|
|
201
|
-
|
|
202
|
-
assert width_99 >= width_90, "99% CI should be wider than 90% CI"
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
def test_bootstrap_CI_empty_metrics():
|
|
206
|
-
"""Test bootstrap_CI handles missing values correctly"""
|
|
207
|
-
# Create simpler test data that focuses on core functionality
|
|
208
|
-
res_df = pd.DataFrame({
|
|
209
|
-
'field1': [1, 2, 3] * 34,
|
|
210
|
-
'Cor: field1': [0, 1, 2] * 34, # Valid values
|
|
211
|
-
'Inc: field1': [0, 0, 0] * 34,
|
|
212
|
-
'Mis: field1': [0, 0, 0] * 34,
|
|
213
|
-
'Spu: field1': [0, 0, 0] * 34,
|
|
214
|
-
'Par: field1': [0, 0, 0] * 34,
|
|
215
|
-
})
|
|
216
|
-
|
|
217
|
-
result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=10, random_state=42)
|
|
218
|
-
|
|
219
|
-
# Check that core metrics appear in output
|
|
220
|
-
core_cols = [col for col in result.columns if 'cor:' in col]
|
|
221
|
-
assert len(core_cols) == 3, f"Expected 3 cor metrics (mean, lower, upper), got {len(core_cols)}: {core_cols}"
|
|
222
|
-
|
|
223
|
-
# Check that the function completes without errors for this simpler case
|
|
224
|
-
assert 'field' in result.columns
|
|
225
|
-
assert len(result) >= 2 # At least exceptions + field1 + N/CI info row
|
|
226
|
-
|
|
227
|
-
# Check that N/CI info row is present
|
|
228
|
-
info_rows = result[result['field'].str.startswith('N=')]
|
|
229
|
-
assert len(info_rows) == 1
|
|
@@ -1,165 +0,0 @@
|
|
|
1
|
-
import math
|
|
2
|
-
import pandas as pd
|
|
3
|
-
import pytest
|
|
4
|
-
import src.validation as v
|
|
5
|
-
pd.options.display.width = 0
|
|
6
|
-
|
|
7
|
-
def get_test_df(addconfidence):
|
|
8
|
-
flag = [True, True, False, False, True, False, True, True]
|
|
9
|
-
res_flag = [True, False, True, False, False, False, True, False]
|
|
10
|
-
|
|
11
|
-
fruits = [
|
|
12
|
-
['apple', 'banana'],
|
|
13
|
-
['apple'],
|
|
14
|
-
'-',
|
|
15
|
-
['cherry'],
|
|
16
|
-
[],
|
|
17
|
-
['apple'],
|
|
18
|
-
['apple', 'banana'],
|
|
19
|
-
None
|
|
20
|
-
]
|
|
21
|
-
res_fruits = [
|
|
22
|
-
['apple', 'cherry'],
|
|
23
|
-
['apple', 'banana'],
|
|
24
|
-
[],
|
|
25
|
-
['cherry'],
|
|
26
|
-
['apple'],
|
|
27
|
-
'-',
|
|
28
|
-
['banana'],
|
|
29
|
-
['apple']
|
|
30
|
-
]
|
|
31
|
-
res_fruits_confidence = [
|
|
32
|
-
'High',
|
|
33
|
-
'Low',
|
|
34
|
-
'High',
|
|
35
|
-
None,
|
|
36
|
-
'NA',
|
|
37
|
-
'Low',
|
|
38
|
-
'High',
|
|
39
|
-
'High'
|
|
40
|
-
]
|
|
41
|
-
|
|
42
|
-
color = [
|
|
43
|
-
'red',
|
|
44
|
-
'blue',
|
|
45
|
-
'-',
|
|
46
|
-
'green',
|
|
47
|
-
'-',
|
|
48
|
-
'-',
|
|
49
|
-
'4',
|
|
50
|
-
None
|
|
51
|
-
]
|
|
52
|
-
res_color = [
|
|
53
|
-
'red',
|
|
54
|
-
'green',
|
|
55
|
-
'yellow',
|
|
56
|
-
'-',
|
|
57
|
-
'',
|
|
58
|
-
'-',
|
|
59
|
-
4,
|
|
60
|
-
'red'
|
|
61
|
-
]
|
|
62
|
-
res_color_confidence = [
|
|
63
|
-
'High',
|
|
64
|
-
'Low',
|
|
65
|
-
'-',
|
|
66
|
-
'High',
|
|
67
|
-
'',
|
|
68
|
-
'High',
|
|
69
|
-
'Low',
|
|
70
|
-
'High'
|
|
71
|
-
]
|
|
72
|
-
|
|
73
|
-
orphan = ['x','y','z','w','-','',None, None]
|
|
74
|
-
|
|
75
|
-
df = pd.DataFrame({
|
|
76
|
-
'flag': flag,
|
|
77
|
-
'Res: flag': res_flag,
|
|
78
|
-
'fruits': fruits,
|
|
79
|
-
'Res: fruits': res_fruits,
|
|
80
|
-
'orphan': orphan,
|
|
81
|
-
'color': color,
|
|
82
|
-
'Res: color': res_color
|
|
83
|
-
})
|
|
84
|
-
|
|
85
|
-
if addconfidence:
|
|
86
|
-
# Insert after 'Res: fruits'
|
|
87
|
-
pos_fruits = df.columns.get_loc('Res: fruits')
|
|
88
|
-
df.insert(pos_fruits + 1, 'Res: fruits confidence', res_fruits_confidence)
|
|
89
|
-
# Insert after 'Res: color'
|
|
90
|
-
pos_color = df.columns.get_loc('Res: color') # recompute after previous insert
|
|
91
|
-
df.insert(pos_color + 1, 'Res: color confidence', res_color_confidence)
|
|
92
|
-
|
|
93
|
-
return df
|
|
94
|
-
|
|
95
|
-
def _is_none_or_nan(x):
|
|
96
|
-
return x is None or (isinstance(x, float) and math.isnan(x))
|
|
97
|
-
|
|
98
|
-
def test_compare_results_all_mixed_fields():
|
|
99
|
-
df = get_test_df(False)
|
|
100
|
-
|
|
101
|
-
res_df = v.compare_results_all(df, ['flag', 'fruits', 'color'])
|
|
102
|
-
|
|
103
|
-
# ---- Binary field assertions (flag) ----
|
|
104
|
-
# Row 0: TP
|
|
105
|
-
assert res_df.loc[0, 'TP: flag'] == 1
|
|
106
|
-
# Row 1: FN
|
|
107
|
-
assert res_df.loc[1, 'FN: flag'] == 1
|
|
108
|
-
assert res_df.loc[2, 'FP: flag'] == 1
|
|
109
|
-
assert res_df.loc[3, 'TN: flag'] == 1
|
|
110
|
-
|
|
111
|
-
# ---- List field assertions (fruits) ----
|
|
112
|
-
# Row 0 mixed
|
|
113
|
-
assert res_df.loc[0, 'Cor: fruits'] == 1
|
|
114
|
-
assert res_df.loc[0, 'Mis: fruits'] == 1
|
|
115
|
-
assert res_df.loc[0, 'Spu: fruits'] == 1
|
|
116
|
-
assert res_df.loc[0, 'Precision: fruits'] == pytest.approx(0.5)
|
|
117
|
-
assert res_df.loc[0, 'Recall: fruits'] == pytest.approx(0.5)
|
|
118
|
-
assert res_df.loc[0, 'F1 score: fruits'] == pytest.approx(0.5)
|
|
119
|
-
|
|
120
|
-
# Row 1: one correct + one spurious
|
|
121
|
-
assert res_df.loc[1, 'Cor: fruits'] == 1
|
|
122
|
-
assert res_df.loc[1, 'Spu: fruits'] == 1
|
|
123
|
-
assert res_df.loc[1, 'Precision: fruits'] == pytest.approx(0.5)
|
|
124
|
-
assert res_df.loc[1, 'Recall: fruits'] == pytest.approx(1.0)
|
|
125
|
-
|
|
126
|
-
# Row 2: expected '-' vs [] => zeros, metrics NaN
|
|
127
|
-
assert res_df.loc[2, 'Cor: fruits'] == 0
|
|
128
|
-
assert math.isnan(res_df.loc[2, 'Precision: fruits'])
|
|
129
|
-
|
|
130
|
-
# Row 3: perfect
|
|
131
|
-
assert res_df.loc[3, 'Cor: fruits'] == 1
|
|
132
|
-
assert res_df.loc[3, 'Precision: fruits'] == pytest.approx(1.0)
|
|
133
|
-
|
|
134
|
-
# Row 4: expected empty list, actual has item -> spurious
|
|
135
|
-
assert res_df.loc[4, 'Spu: fruits'] == 1
|
|
136
|
-
assert res_df.loc[4, 'Mis: fruits'] == 0
|
|
137
|
-
|
|
138
|
-
# Row 5: expected ['apple'], actual '-' (empty) -> missing
|
|
139
|
-
assert res_df.loc[5, 'Mis: fruits'] == 1
|
|
140
|
-
assert res_df.loc[5, 'Spu: fruits'] == 0
|
|
141
|
-
|
|
142
|
-
# ---- Scalar non-binary field assertions (color) ----
|
|
143
|
-
assert res_df.loc[0, 'Cor: color'] == 1 # correct
|
|
144
|
-
assert res_df.loc[1, 'Inc: color'] == 1 # incorrect
|
|
145
|
-
assert res_df.loc[2, 'Spu: color'] == 1 # spurious
|
|
146
|
-
assert res_df.loc[3, 'Mis: color'] == 1 # missing
|
|
147
|
-
# Rows 4 & 5: both sides empty label cases ('-' and ''), treated as labeled empty -> zeros + NaN metrics
|
|
148
|
-
assert res_df.loc[4, 'Cor: color'] == 0
|
|
149
|
-
assert res_df.loc[5, 'Cor: color'] == 0
|
|
150
|
-
# Row 6: numeric string vs number -> match
|
|
151
|
-
assert res_df.loc[6, 'Cor: color'] == 1
|
|
152
|
-
assert res_df.loc[6, 'Inc: color'] == 0
|
|
153
|
-
|
|
154
|
-
# Ensure orphan column passed through unchanged
|
|
155
|
-
assert 'orphan' in res_df.columns
|
|
156
|
-
|
|
157
|
-
expected_columns = [
|
|
158
|
-
'TP: flag','TN: flag','FP: flag','FN: flag',
|
|
159
|
-
'Cor: fruits','Mis: fruits','Spu: fruits',
|
|
160
|
-
'Precision: fruits','Recall: fruits','F1 score: fruits',
|
|
161
|
-
'Cor: color','Inc: color','Mis: color','Spu: color'
|
|
162
|
-
]
|
|
163
|
-
for col in expected_columns:
|
|
164
|
-
assert col in res_df.columns, f"Missing column {col} in compare_results_all output"
|
|
165
|
-
|