llmvalidate 0.4.2__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmvalidate-0.4.2 → llmvalidate-0.4.4}/PKG-INFO +9 -5
- {llmvalidate-0.4.2 → llmvalidate-0.4.4}/pyproject.toml +16 -5
- {llmvalidate-0.4.2 → llmvalidate-0.4.4}/readme.md +4 -4
- llmvalidate-0.4.4/src/llmvalidate/__init__.py +12 -0
- {llmvalidate-0.4.2/src → llmvalidate-0.4.4/src/llmvalidate}/structured.py +1 -1
- {llmvalidate-0.4.2/src → llmvalidate-0.4.4/src/llmvalidate}/utils.py +1 -1
- {llmvalidate-0.4.2/src → llmvalidate-0.4.4/src/llmvalidate}/validation.py +2 -2
- {llmvalidate-0.4.2 → llmvalidate-0.4.4/src}/llmvalidate.egg-info/PKG-INFO +9 -5
- llmvalidate-0.4.4/src/llmvalidate.egg-info/SOURCES.txt +12 -0
- llmvalidate-0.4.4/src/llmvalidate.egg-info/requires.txt +4 -0
- llmvalidate-0.4.4/src/llmvalidate.egg-info/top_level.txt +1 -0
- llmvalidate-0.4.2/llmvalidate.egg-info/SOURCES.txt +0 -16
- llmvalidate-0.4.2/llmvalidate.egg-info/top_level.txt +0 -4
- llmvalidate-0.4.2/src/__init__.py +0 -1
- llmvalidate-0.4.2/src/standardize.py +0 -86
- llmvalidate-0.4.2/tests/bootstrap_CI_test.py +0 -229
- llmvalidate-0.4.2/tests/compare_results_all_test.py +0 -165
- llmvalidate-0.4.2/tests/compare_results_test.py +0 -96
- llmvalidate-0.4.2/tests/validate_test.py +0 -758
- {llmvalidate-0.4.2 → llmvalidate-0.4.4}/LICENSE +0 -0
- {llmvalidate-0.4.2 → llmvalidate-0.4.4}/setup.cfg +0 -0
- {llmvalidate-0.4.2 → llmvalidate-0.4.4/src}/llmvalidate.egg-info/dependency_links.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llmvalidate
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Oncoshot LLM validation framework
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
|
|
@@ -16,6 +16,10 @@ Classifier: Intended Audience :: Developers
|
|
|
16
16
|
Requires-Python: >=3.11
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=2.2
|
|
20
|
+
Requires-Dist: numpy>=1.26
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: tqdm>=4.0
|
|
19
23
|
Dynamic: license-file
|
|
20
24
|
|
|
21
25
|
# LLM Validation Framework
|
|
@@ -333,10 +337,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
333
337
|
```
|
|
334
338
|
llm-validation-framework/
|
|
335
339
|
├── src/
|
|
336
|
-
│
|
|
337
|
-
│
|
|
338
|
-
│
|
|
339
|
-
│
|
|
340
|
+
│ └── llmvalidate/
|
|
341
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
342
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
343
|
+
│ └── utils.py # Utility functions (list conversion, flattening)
|
|
340
344
|
├── tests/ # Comprehensive test suite
|
|
341
345
|
├── validation_results/ # Output directory (auto-created)
|
|
342
346
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -4,10 +4,16 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "llmvalidate"
|
|
7
|
-
version = "0.4.2"
|
|
7
|
+
version = "0.4.4"
|
|
8
8
|
description = "Oncoshot LLM validation framework"
|
|
9
9
|
readme = "readme.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas>=2.2",
|
|
13
|
+
"numpy>=1.26",
|
|
14
|
+
"pydantic>=2.0",
|
|
15
|
+
"tqdm>=4.0",
|
|
16
|
+
]
|
|
11
17
|
license = { text = "MIT" }
|
|
12
18
|
|
|
13
19
|
classifiers = [
|
|
@@ -25,22 +31,27 @@ classifiers = [
|
|
|
25
31
|
"Repository" = "https://github.com/Oncoshot/llm-validation-framework"
|
|
26
32
|
"Bug Tracker" = "https://github.com/Oncoshot/llm-validation-framework/issues"
|
|
27
33
|
|
|
34
|
+
[tool.setuptools]
|
|
35
|
+
package-dir = {"" = "src"}
|
|
36
|
+
|
|
28
37
|
[tool.setuptools.packages.find]
|
|
29
|
-
where = ["
|
|
38
|
+
where = ["src"]
|
|
39
|
+
include = ["llmvalidate*"]
|
|
40
|
+
exclude = ["tests*", "docs*", "scripts*"]
|
|
30
41
|
|
|
31
42
|
[tool.pytest.ini_options]
|
|
32
43
|
pythonpath = [
|
|
33
|
-
".",
|
|
34
44
|
"src",
|
|
35
45
|
]
|
|
36
46
|
|
|
37
47
|
[tool.semantic_release]
|
|
38
|
-
version_variable = ["src/__init__.py:__version__"]
|
|
48
|
+
version_variable = ["src/llmvalidate/__init__.py:__version__"]
|
|
39
49
|
version_toml = ["pyproject.toml:project.version"]
|
|
40
50
|
branch = "master"
|
|
41
51
|
allow_zero_version = true
|
|
42
52
|
build_command = "pip install build && python -m build"
|
|
43
53
|
upload_to_pypi = true
|
|
44
54
|
upload_to_release = true
|
|
45
|
-
|
|
46
55
|
commit_version_number = true
|
|
56
|
+
|
|
57
|
+
|
|
@@ -313,10 +313,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
313
313
|
```
|
|
314
314
|
llm-validation-framework/
|
|
315
315
|
├── src/
|
|
316
|
-
│
|
|
317
|
-
│
|
|
318
|
-
│
|
|
319
|
-
│
|
|
316
|
+
│ └── llmvalidate/
|
|
317
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
318
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
319
|
+
│ └── utils.py # Utility functions (list conversion, flattening)
|
|
320
320
|
├── tests/ # Comprehensive test suite
|
|
321
321
|
├── validation_results/ # Output directory (auto-created)
|
|
322
322
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
__version__ = "0.0.0"
|
|
2
|
+
|
|
3
|
+
from .validation import validate, bootstrap_CI
|
|
4
|
+
from .structured import StructuredResult, StructuredGroup, StructuredField
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"validate",
|
|
8
|
+
"bootstrap_CI",
|
|
9
|
+
"StructuredResult",
|
|
10
|
+
"StructuredGroup",
|
|
11
|
+
"StructuredField"
|
|
12
|
+
]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
from datetime import datetime
|
|
2
2
|
import math
|
|
3
3
|
from ast import literal_eval
|
|
4
4
|
import string
|
|
@@ -8,7 +8,7 @@ import time
|
|
|
8
8
|
import os
|
|
9
9
|
import concurrent.futures as cf
|
|
10
10
|
from tqdm import tqdm
|
|
11
|
-
from
|
|
11
|
+
from .utils import convert_lists, infer_fields
|
|
12
12
|
|
|
13
13
|
def compare_results_binary(expected, actual):
|
|
14
14
|
"""Compares boolean labels and returns confusion matrix counts."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llmvalidate
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.4
|
|
4
4
|
Summary: Oncoshot LLM validation framework
|
|
5
5
|
License: MIT
|
|
6
6
|
Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
|
|
@@ -16,6 +16,10 @@ Classifier: Intended Audience :: Developers
|
|
|
16
16
|
Requires-Python: >=3.11
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=2.2
|
|
20
|
+
Requires-Dist: numpy>=1.26
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: tqdm>=4.0
|
|
19
23
|
Dynamic: license-file
|
|
20
24
|
|
|
21
25
|
# LLM Validation Framework
|
|
@@ -333,10 +337,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
|
|
|
333
337
|
```
|
|
334
338
|
llm-validation-framework/
|
|
335
339
|
├── src/
|
|
336
|
-
│
|
|
337
|
-
│
|
|
338
|
-
│
|
|
339
|
-
│
|
|
340
|
+
│ └── llmvalidate/
|
|
341
|
+
│ ├── validation.py # Main validation pipeline and metrics calculation
|
|
342
|
+
│ ├── structured.py # Pydantic data models for LLM results
|
|
343
|
+
│ └── utils.py # Utility functions (list conversion, flattening)
|
|
340
344
|
├── tests/ # Comprehensive test suite
|
|
341
345
|
├── validation_results/ # Output directory (auto-created)
|
|
342
346
|
├── samples.csv # Demo dataset with all validation scenarios
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
pyproject.toml
|
|
3
|
+
readme.md
|
|
4
|
+
src/llmvalidate/__init__.py
|
|
5
|
+
src/llmvalidate/structured.py
|
|
6
|
+
src/llmvalidate/utils.py
|
|
7
|
+
src/llmvalidate/validation.py
|
|
8
|
+
src/llmvalidate.egg-info/PKG-INFO
|
|
9
|
+
src/llmvalidate.egg-info/SOURCES.txt
|
|
10
|
+
src/llmvalidate.egg-info/dependency_links.txt
|
|
11
|
+
src/llmvalidate.egg-info/requires.txt
|
|
12
|
+
src/llmvalidate.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llmvalidate
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
pyproject.toml
|
|
3
|
-
readme.md
|
|
4
|
-
llmvalidate.egg-info/PKG-INFO
|
|
5
|
-
llmvalidate.egg-info/SOURCES.txt
|
|
6
|
-
llmvalidate.egg-info/dependency_links.txt
|
|
7
|
-
llmvalidate.egg-info/top_level.txt
|
|
8
|
-
src/__init__.py
|
|
9
|
-
src/standardize.py
|
|
10
|
-
src/structured.py
|
|
11
|
-
src/utils.py
|
|
12
|
-
src/validation.py
|
|
13
|
-
tests/bootstrap_CI_test.py
|
|
14
|
-
tests/compare_results_all_test.py
|
|
15
|
-
tests/compare_results_test.py
|
|
16
|
-
tests/validate_test.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.0"
|
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import os
|
|
3
|
-
|
|
4
|
-
from pandas import DataFrame
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def read_parents():
|
|
8
|
-
datalists = ['Diagnoses_Cancer','Diagnoses_NonCancer',
|
|
9
|
-
'CurrentMedicationTreatment_Cancer','CurrentMedicationTreatment_NonCancer',
|
|
10
|
-
'Biomarkers']
|
|
11
|
-
# Initialize an empty DataFrame
|
|
12
|
-
combined_df = pd.DataFrame()
|
|
13
|
-
|
|
14
|
-
# Loop through each file name in the list
|
|
15
|
-
for datalist in datalists:
|
|
16
|
-
# Construct the file path
|
|
17
|
-
base_dir = os.path.dirname(os.path.abspath(__file__))
|
|
18
|
-
file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
|
|
19
|
-
|
|
20
|
-
# Load the CSV file into a DataFrame
|
|
21
|
-
df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
|
|
22
|
-
|
|
23
|
-
# Add the 'domain_datalist' column
|
|
24
|
-
df['domain_datalist'] = datalist
|
|
25
|
-
|
|
26
|
-
# Append the DataFrame to the combined DataFrame
|
|
27
|
-
combined_df = pd.concat([combined_df, df], ignore_index=True)
|
|
28
|
-
#print(combined_df)
|
|
29
|
-
|
|
30
|
-
# Self-join on df.pid = df.id
|
|
31
|
-
joined_df = pd.merge(combined_df, combined_df, left_on=['pid', 'domain_datalist'], right_on=['id', 'domain_datalist'],
|
|
32
|
-
suffixes=('_child', '_parent'))
|
|
33
|
-
|
|
34
|
-
# Create dictionary:
|
|
35
|
-
parents = dict(zip(joined_df['value_child'], joined_df['value_parent']))
|
|
36
|
-
|
|
37
|
-
return parents
|
|
38
|
-
|
|
39
|
-
def get_datalist(datalist:str) -> DataFrame:
|
|
40
|
-
# Construct the file path
|
|
41
|
-
base_dir = os.path.dirname(os.path.abspath(__file__))
|
|
42
|
-
file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
|
|
43
|
-
|
|
44
|
-
# Load the CSV file into a DataFrame
|
|
45
|
-
df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
|
|
46
|
-
|
|
47
|
-
return df
|
|
48
|
-
|
|
49
|
-
def get_parents(datalist:str):
|
|
50
|
-
# Construct the file path
|
|
51
|
-
base_dir = os.path.dirname(os.path.abspath(__file__))
|
|
52
|
-
file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
|
|
53
|
-
|
|
54
|
-
# Load the CSV file into a DataFrame
|
|
55
|
-
df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
|
|
56
|
-
|
|
57
|
-
# extract only parent items
|
|
58
|
-
parents = df[df['pid'].isnull()][['value']]
|
|
59
|
-
|
|
60
|
-
return parents
|
|
61
|
-
|
|
62
|
-
# Get children with specific parent from data list
|
|
63
|
-
def get_children(datalist:str, parent:str):
|
|
64
|
-
# Construct the file path
|
|
65
|
-
base_dir = os.path.dirname(os.path.abspath(__file__))
|
|
66
|
-
file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
|
|
67
|
-
|
|
68
|
-
# Load the CSV file into a DataFrame
|
|
69
|
-
df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
|
|
70
|
-
|
|
71
|
-
parent_id = df[df['value'] == parent].iloc[0]['id']
|
|
72
|
-
|
|
73
|
-
# extract only items with particular parent
|
|
74
|
-
children = df[df['pid'] == parent_id][['value']]
|
|
75
|
-
|
|
76
|
-
return children
|
|
77
|
-
|
|
78
|
-
# check if there is exact case insensitive match among all datalist items
|
|
79
|
-
# or items without parent (if childOnly=True)
|
|
80
|
-
def datalist_contains_value(datalist: DataFrame, diagnosis:str, childOnly=False) -> bool:
|
|
81
|
-
items = datalist[datalist['value'].str.lower() == diagnosis.lower()]
|
|
82
|
-
|
|
83
|
-
if childOnly:
|
|
84
|
-
items = items[items['pid'].notna()]
|
|
85
|
-
|
|
86
|
-
return items.shape[0] > 0
|
|
@@ -1,229 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import pytest
|
|
3
|
-
import numpy as np
|
|
4
|
-
import src.validation as v
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def test_bootstrap_CI_basic():
|
|
8
|
-
"""Test basic functionality of bootstrap_CI"""
|
|
9
|
-
# Create test data with comparison results
|
|
10
|
-
res_df = pd.DataFrame({
|
|
11
|
-
'field1': ['A', 'B', 'A', 'B', 'A'] * 20, # 100 rows total
|
|
12
|
-
'field2': ['X', 'Y', 'X', 'Y', 'X'] * 20,
|
|
13
|
-
'Cor: field1': [1, 0, 1, 1, 0] * 20,
|
|
14
|
-
'Inc: field1': [0, 1, 0, 0, 1] * 20,
|
|
15
|
-
'Mis: field1': [0, 0, 0, 0, 0] * 20,
|
|
16
|
-
'Spu: field1': [0, 1, 0, 0, 1] * 20,
|
|
17
|
-
'Par: field1': [0, 0, 0, 0, 0] * 20,
|
|
18
|
-
'Cor: field2': [1, 1, 0, 1, 1] * 20,
|
|
19
|
-
'Inc: field2': [0, 0, 1, 0, 0] * 20,
|
|
20
|
-
'Mis: field2': [0, 0, 1, 0, 0] * 20,
|
|
21
|
-
'Spu: field2': [0, 0, 0, 0, 0] * 20,
|
|
22
|
-
'Par: field2': [0, 0, 0, 0, 0] * 20,
|
|
23
|
-
})
|
|
24
|
-
|
|
25
|
-
fields = ['field1', 'field2']
|
|
26
|
-
result = v.bootstrap_CI(res_df, fields, n_bootstrap=100, random_state=42)
|
|
27
|
-
|
|
28
|
-
# Check output format
|
|
29
|
-
assert 'field' in result.columns
|
|
30
|
-
assert len(result) == 4 # Two fields + exceptions field + N/CI info row
|
|
31
|
-
expected_fields = {'field1', 'field2', 'exceptions', 'N=100; CI=95%'}
|
|
32
|
-
assert set(result['field']) == expected_fields
|
|
33
|
-
|
|
34
|
-
# Check that confidence interval columns are present for metrics that exist in our data
|
|
35
|
-
# We know these will be present because we've included them in our test data
|
|
36
|
-
core_metrics = ['field-present cases', 'cor', 'inc', 'mis', 'spu', 'par']
|
|
37
|
-
|
|
38
|
-
# 'labeled cases' is handled specially - just appears as 'labeled cases'
|
|
39
|
-
assert 'labeled cases' in result.columns
|
|
40
|
-
|
|
41
|
-
for metric in core_metrics:
|
|
42
|
-
assert f'{metric}: mean' in result.columns
|
|
43
|
-
assert f'{metric}: lower' in result.columns
|
|
44
|
-
assert f'{metric}: upper' in result.columns
|
|
45
|
-
|
|
46
|
-
# Check that means are reasonable (between lower and upper bounds) for non-exception fields
|
|
47
|
-
for _, row in result.iterrows():
|
|
48
|
-
if row['field'] in ['exceptions', 'N=100; CI=95%']: # Skip exceptions and info row
|
|
49
|
-
for metric in core_metrics:
|
|
50
|
-
mean_col = f'{metric}: mean'
|
|
51
|
-
lower_col = f'{metric}: lower'
|
|
52
|
-
upper_col = f'{metric}: upper'
|
|
53
|
-
|
|
54
|
-
if pd.notna(row[mean_col]) and pd.notna(row[lower_col]) and pd.notna(row[upper_col]):
|
|
55
|
-
assert row[lower_col] <= row[mean_col] <= row[upper_col], \
|
|
56
|
-
f"Mean not between bounds for {row['field']} {metric}"
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def test_bootstrap_CI_error_conditions():
|
|
60
|
-
"""Test error conditions for bootstrap_CI"""
|
|
61
|
-
|
|
62
|
-
# Test with ci outside valid range
|
|
63
|
-
res_df = pd.DataFrame({
|
|
64
|
-
'field1': [1, 2, 3],
|
|
65
|
-
'Cor: field1': [1, 0, 1]
|
|
66
|
-
})
|
|
67
|
-
|
|
68
|
-
with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
|
|
69
|
-
v.bootstrap_CI(res_df, ['field1'], ci=1.5)
|
|
70
|
-
|
|
71
|
-
with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
|
|
72
|
-
v.bootstrap_CI(res_df, ['field1'], ci=0)
|
|
73
|
-
|
|
74
|
-
# Test with too few rows
|
|
75
|
-
single_row_df = pd.DataFrame({
|
|
76
|
-
'field1': [1],
|
|
77
|
-
'Cor: field1': [1]
|
|
78
|
-
})
|
|
79
|
-
|
|
80
|
-
with pytest.raises(ValueError, match="Need at least 2 rows"):
|
|
81
|
-
v.bootstrap_CI(single_row_df, ['field1'])
|
|
82
|
-
|
|
83
|
-
# Test with missing labels (NaN values)
|
|
84
|
-
res_df_with_nan = pd.DataFrame({
|
|
85
|
-
'field1': [1, np.nan, 3],
|
|
86
|
-
'field2': [1, 2, 3],
|
|
87
|
-
'Cor: field1': [1, 0, 1],
|
|
88
|
-
'Cor: field2': [0, 1, 0]
|
|
89
|
-
})
|
|
90
|
-
|
|
91
|
-
with pytest.raises(ValueError, match="Missing labels \\(NaN\\) found in the following fields: \\['field1'\\]"):
|
|
92
|
-
v.bootstrap_CI(res_df_with_nan, ['field1', 'field2'])
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def test_bootstrap_CI_binary_field():
|
|
96
|
-
"""Test bootstrap_CI with binary field metrics"""
|
|
97
|
-
# Create test data with binary field results
|
|
98
|
-
res_df = pd.DataFrame({
|
|
99
|
-
'binary_field': [True, False, True, False] * 25, # 100 rows
|
|
100
|
-
'TP: binary_field': [1, 0, 1, 0] * 25,
|
|
101
|
-
'FP: binary_field': [0, 1, 0, 1] * 25,
|
|
102
|
-
'FN: binary_field': [0, 0, 0, 0] * 25,
|
|
103
|
-
'TN: binary_field': [0, 1, 0, 1] * 25,
|
|
104
|
-
'Precision: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
105
|
-
'Recall: binary_field': [1.0, np.nan, 1.0, np.nan] * 25,
|
|
106
|
-
'F1 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
107
|
-
'F2 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
|
|
108
|
-
})
|
|
109
|
-
|
|
110
|
-
fields = ['binary_field']
|
|
111
|
-
result = v.bootstrap_CI(res_df, fields, n_bootstrap=50, random_state=42)
|
|
112
|
-
|
|
113
|
-
# Check that binary metrics are included
|
|
114
|
-
binary_metrics = ['TP', 'FP', 'FN', 'TN', 'precision (micro)', 'recall (micro)',
|
|
115
|
-
'F1 score (micro)', 'F2 score (micro)', 'accuracy (micro)', 'specificity (micro)']
|
|
116
|
-
|
|
117
|
-
for metric in binary_metrics:
|
|
118
|
-
assert f'{metric}: mean' in result.columns
|
|
119
|
-
assert f'{metric}: lower' in result.columns
|
|
120
|
-
assert f'{metric}: upper' in result.columns
|
|
121
|
-
|
|
122
|
-
# Check that N/CI info row is present
|
|
123
|
-
info_rows = result[result['field'].str.startswith('N=')]
|
|
124
|
-
assert len(info_rows) == 1
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
def test_bootstrap_CI_output_format():
|
|
128
|
-
"""Test that output format matches specification"""
|
|
129
|
-
res_df = pd.DataFrame({
|
|
130
|
-
'test_field': ['A', 'B'] * 50, # 100 rows
|
|
131
|
-
'Cor: test_field': [1, 0] * 50,
|
|
132
|
-
'Inc: test_field': [0, 1] * 50,
|
|
133
|
-
'Mis: test_field': [0, 0] * 50,
|
|
134
|
-
'Spu: test_field': [0, 1] * 50,
|
|
135
|
-
'Par: test_field': [0, 0] * 50,
|
|
136
|
-
})
|
|
137
|
-
|
|
138
|
-
result = v.bootstrap_CI(res_df, ['test_field'], n_bootstrap=10, random_state=42)
|
|
139
|
-
|
|
140
|
-
# Check that result has the correct format
|
|
141
|
-
assert len(result) == 3 # One field + exceptions + N/CI info row
|
|
142
|
-
test_field_row = result[result['field'] == 'test_field'].iloc[0]
|
|
143
|
-
assert test_field_row['field'] == 'test_field'
|
|
144
|
-
|
|
145
|
-
# Check that columns follow the expected pattern
|
|
146
|
-
metric_columns = [col for col in result.columns if col not in ['field', 'labeled cases']]
|
|
147
|
-
for col in metric_columns:
|
|
148
|
-
assert ': ' in col, f"Column {col} doesn't follow expected format"
|
|
149
|
-
metric_name, stat_type = col.split(': ', 1)
|
|
150
|
-
assert stat_type in ['mean', 'lower', 'upper'], f"Unexpected stat type in {col}"
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def test_bootstrap_CI_confidence_intervals():
|
|
154
|
-
"""Test that confidence intervals make sense"""
|
|
155
|
-
# Create deterministic test case
|
|
156
|
-
res_df = pd.DataFrame({
|
|
157
|
-
'field1': [1] * 100,
|
|
158
|
-
'Cor: field1': [5] * 100, # Constant values for predictable CI
|
|
159
|
-
'Inc: field1': [0] * 100,
|
|
160
|
-
'Mis: field1': [0] * 100,
|
|
161
|
-
'Spu: field1': [0] * 100,
|
|
162
|
-
'Par: field1': [0] * 100,
|
|
163
|
-
})
|
|
164
|
-
|
|
165
|
-
result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=100, ci=0.95, random_state=42)
|
|
166
|
-
|
|
167
|
-
# For constant values, mean should equal the sum (get_metrics sums the values)
|
|
168
|
-
row = result[result['field'] == 'field1'].iloc[0]
|
|
169
|
-
|
|
170
|
-
# Check that mean is close to expected value (5 * 100 = 500)
|
|
171
|
-
assert abs(row['cor: mean'] - 500.0) < 10.0
|
|
172
|
-
|
|
173
|
-
# Check that CI bounds are reasonable (close to mean for constant data)
|
|
174
|
-
assert abs(row['cor: lower'] - row['cor: mean']) < 50.0
|
|
175
|
-
assert abs(row['cor: upper'] - row['cor: mean']) < 50.0
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
def test_bootstrap_CI_with_different_ci_levels():
|
|
179
|
-
"""Test bootstrap_CI with different confidence interval levels"""
|
|
180
|
-
res_df = pd.DataFrame({
|
|
181
|
-
'field1': [1, 2, 3] * 34, # ~100 rows
|
|
182
|
-
'Cor: field1': [1, 2, 1] * 34,
|
|
183
|
-
'Inc: field1': [0, 1, 0] * 34,
|
|
184
|
-
'Mis: field1': [0, 0, 1] * 34,
|
|
185
|
-
'Spu: field1': [1, 0, 0] * 34,
|
|
186
|
-
'Par: field1': [0, 0, 0] * 34,
|
|
187
|
-
})
|
|
188
|
-
|
|
189
|
-
# Test 90% CI
|
|
190
|
-
result_90 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.90, random_state=42)
|
|
191
|
-
|
|
192
|
-
# Test 99% CI
|
|
193
|
-
result_99 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.99, random_state=42)
|
|
194
|
-
|
|
195
|
-
# 99% CI should be wider than 90% CI
|
|
196
|
-
row_90 = result_90[result_90['field'] == 'field1'].iloc[0]
|
|
197
|
-
row_99 = result_99[result_99['field'] == 'field1'].iloc[0]
|
|
198
|
-
|
|
199
|
-
width_90 = row_90['cor: upper'] - row_90['cor: lower']
|
|
200
|
-
width_99 = row_99['cor: upper'] - row_99['cor: lower']
|
|
201
|
-
|
|
202
|
-
assert width_99 >= width_90, "99% CI should be wider than 90% CI"
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
def test_bootstrap_CI_empty_metrics():
|
|
206
|
-
"""Test bootstrap_CI handles missing values correctly"""
|
|
207
|
-
# Create simpler test data that focuses on core functionality
|
|
208
|
-
res_df = pd.DataFrame({
|
|
209
|
-
'field1': [1, 2, 3] * 34,
|
|
210
|
-
'Cor: field1': [0, 1, 2] * 34, # Valid values
|
|
211
|
-
'Inc: field1': [0, 0, 0] * 34,
|
|
212
|
-
'Mis: field1': [0, 0, 0] * 34,
|
|
213
|
-
'Spu: field1': [0, 0, 0] * 34,
|
|
214
|
-
'Par: field1': [0, 0, 0] * 34,
|
|
215
|
-
})
|
|
216
|
-
|
|
217
|
-
result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=10, random_state=42)
|
|
218
|
-
|
|
219
|
-
# Check that core metrics appear in output
|
|
220
|
-
core_cols = [col for col in result.columns if 'cor:' in col]
|
|
221
|
-
assert len(core_cols) == 3, f"Expected 3 cor metrics (mean, lower, upper), got {len(core_cols)}: {core_cols}"
|
|
222
|
-
|
|
223
|
-
# Check that the function completes without errors for this simpler case
|
|
224
|
-
assert 'field' in result.columns
|
|
225
|
-
assert len(result) >= 2 # At least exceptions + field1 + N/CI info row
|
|
226
|
-
|
|
227
|
-
# Check that N/CI info row is present
|
|
228
|
-
info_rows = result[result['field'].str.startswith('N=')]
|
|
229
|
-
assert len(info_rows) == 1
|