llmvalidate 0.4.2__tar.gz → 0.4.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmvalidate
3
- Version: 0.4.2
3
+ Version: 0.4.4
4
4
  Summary: Oncoshot LLM validation framework
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,10 @@ Classifier: Intended Audience :: Developers
16
16
  Requires-Python: >=3.11
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
+ Requires-Dist: pandas>=2.2
20
+ Requires-Dist: numpy>=1.26
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: tqdm>=4.0
19
23
  Dynamic: license-file
20
24
 
21
25
  # LLM Validation Framework
@@ -333,10 +337,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
333
337
  ```
334
338
  llm-validation-framework/
335
339
  ├── src/
336
- ├── validation.py # Main validation pipeline and metrics calculation
337
- ├── structured.py # Pydantic data models for LLM results
338
- ├── utils.py # Utility functions (list conversion, flattening)
339
- └── standardize.py # Data standardization helpers
340
+ └── llmvalidate/
341
+ ├── validation.py # Main validation pipeline and metrics calculation
342
+ ├── structured.py # Pydantic data models for LLM results
343
+ └── utils.py # Utility functions (list conversion, flattening)
340
344
  ├── tests/ # Comprehensive test suite
341
345
  ├── validation_results/ # Output directory (auto-created)
342
346
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -4,10 +4,16 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "llmvalidate"
7
- version = "0.4.2"
7
+ version = "0.4.4"
8
8
  description = "Oncoshot LLM validation framework"
9
9
  readme = "readme.md"
10
10
  requires-python = ">=3.11"
11
+ dependencies = [
12
+ "pandas>=2.2",
13
+ "numpy>=1.26",
14
+ "pydantic>=2.0",
15
+ "tqdm>=4.0",
16
+ ]
11
17
  license = { text = "MIT" }
12
18
 
13
19
  classifiers = [
@@ -25,22 +31,27 @@ classifiers = [
25
31
  "Repository" = "https://github.com/Oncoshot/llm-validation-framework"
26
32
  "Bug Tracker" = "https://github.com/Oncoshot/llm-validation-framework/issues"
27
33
 
34
+ [tool.setuptools]
35
+ package-dir = {"" = "src"}
36
+
28
37
  [tool.setuptools.packages.find]
29
- where = ["."]
38
+ where = ["src"]
39
+ include = ["llmvalidate*"]
40
+ exclude = ["tests*", "docs*", "scripts*"]
30
41
 
31
42
  [tool.pytest.ini_options]
32
43
  pythonpath = [
33
- ".",
34
44
  "src",
35
45
  ]
36
46
 
37
47
  [tool.semantic_release]
38
- version_variable = ["src/__init__.py:__version__"]
48
+ version_variable = ["src/llmvalidate/__init__.py:__version__"]
39
49
  version_toml = ["pyproject.toml:project.version"]
40
50
  branch = "master"
41
51
  allow_zero_version = true
42
52
  build_command = "pip install build && python -m build"
43
53
  upload_to_pypi = true
44
54
  upload_to_release = true
45
-
46
55
  commit_version_number = true
56
+
57
+
@@ -313,10 +313,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
313
313
  ```
314
314
  llm-validation-framework/
315
315
  ├── src/
316
- ├── validation.py # Main validation pipeline and metrics calculation
317
- ├── structured.py # Pydantic data models for LLM results
318
- ├── utils.py # Utility functions (list conversion, flattening)
319
- └── standardize.py # Data standardization helpers
316
+ └── llmvalidate/
317
+ ├── validation.py # Main validation pipeline and metrics calculation
318
+ ├── structured.py # Pydantic data models for LLM results
319
+ └── utils.py # Utility functions (list conversion, flattening)
320
320
  ├── tests/ # Comprehensive test suite
321
321
  ├── validation_results/ # Output directory (auto-created)
322
322
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -0,0 +1,12 @@
1
+ __version__ = "0.0.0"
2
+
3
+ from .validation import validate, bootstrap_CI
4
+ from .structured import StructuredResult, StructuredGroup, StructuredField
5
+
6
+ __all__ = [
7
+ "validate",
8
+ "bootstrap_CI",
9
+ "StructuredResult",
10
+ "StructuredGroup",
11
+ "StructuredField"
12
+ ]
@@ -1,4 +1,4 @@
1
- from pydantic import BaseModel
1
+ from pydantic import BaseModel
2
2
  from typing import Any, List, Dict, Optional, Union
3
3
 
4
4
  # For all LLM Extracted Value
@@ -1,6 +1,6 @@
1
1
  from typing import Any, List, Dict, Optional, Union
2
2
  from ast import literal_eval
3
- from src.structured import StructuredResult
3
+ from .structured import StructuredResult
4
4
  import pandas as pd
5
5
  import re
6
6
  import json
@@ -1,4 +1,4 @@
1
- from datetime import datetime
1
+ from datetime import datetime
2
2
  import math
3
3
  from ast import literal_eval
4
4
  import string
@@ -8,7 +8,7 @@ import time
8
8
  import os
9
9
  import concurrent.futures as cf
10
10
  from tqdm import tqdm
11
- from src.utils import convert_lists, infer_fields
11
+ from .utils import convert_lists, infer_fields
12
12
 
13
13
  def compare_results_binary(expected, actual):
14
14
  """Compares boolean labels and returns confusion matrix counts."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llmvalidate
3
- Version: 0.4.2
3
+ Version: 0.4.4
4
4
  Summary: Oncoshot LLM validation framework
5
5
  License: MIT
6
6
  Project-URL: Homepage, https://github.com/Oncoshot/llm-validation-framework
@@ -16,6 +16,10 @@ Classifier: Intended Audience :: Developers
16
16
  Requires-Python: >=3.11
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
+ Requires-Dist: pandas>=2.2
20
+ Requires-Dist: numpy>=1.26
21
+ Requires-Dist: pydantic>=2.0
22
+ Requires-Dist: tqdm>=4.0
19
23
  Dynamic: license-file
20
24
 
21
25
  # LLM Validation Framework
@@ -333,10 +337,10 @@ pytest tests/compare_results_all_test.py # End-to-end comparisons
333
337
  ```
334
338
  llm-validation-framework/
335
339
  ├── src/
336
- ├── validation.py # Main validation pipeline and metrics calculation
337
- ├── structured.py # Pydantic data models for LLM results
338
- ├── utils.py # Utility functions (list conversion, flattening)
339
- └── standardize.py # Data standardization helpers
340
+ └── llmvalidate/
341
+ ├── validation.py # Main validation pipeline and metrics calculation
342
+ ├── structured.py # Pydantic data models for LLM results
343
+ └── utils.py # Utility functions (list conversion, flattening)
340
344
  ├── tests/ # Comprehensive test suite
341
345
  ├── validation_results/ # Output directory (auto-created)
342
346
  ├── samples.csv # Demo dataset with all validation scenarios
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ pyproject.toml
3
+ readme.md
4
+ src/llmvalidate/__init__.py
5
+ src/llmvalidate/structured.py
6
+ src/llmvalidate/utils.py
7
+ src/llmvalidate/validation.py
8
+ src/llmvalidate.egg-info/PKG-INFO
9
+ src/llmvalidate.egg-info/SOURCES.txt
10
+ src/llmvalidate.egg-info/dependency_links.txt
11
+ src/llmvalidate.egg-info/requires.txt
12
+ src/llmvalidate.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ pandas>=2.2
2
+ numpy>=1.26
3
+ pydantic>=2.0
4
+ tqdm>=4.0
@@ -0,0 +1 @@
1
+ llmvalidate
@@ -1,16 +0,0 @@
1
- LICENSE
2
- pyproject.toml
3
- readme.md
4
- llmvalidate.egg-info/PKG-INFO
5
- llmvalidate.egg-info/SOURCES.txt
6
- llmvalidate.egg-info/dependency_links.txt
7
- llmvalidate.egg-info/top_level.txt
8
- src/__init__.py
9
- src/standardize.py
10
- src/structured.py
11
- src/utils.py
12
- src/validation.py
13
- tests/bootstrap_CI_test.py
14
- tests/compare_results_all_test.py
15
- tests/compare_results_test.py
16
- tests/validate_test.py
@@ -1,4 +0,0 @@
1
- dist
2
- src
3
- tests
4
- validation_results
@@ -1 +0,0 @@
1
- __version__ = "0.0.0"
@@ -1,86 +0,0 @@
1
- import pandas as pd
2
- import os
3
-
4
- from pandas import DataFrame
5
-
6
-
7
- def read_parents():
8
- datalists = ['Diagnoses_Cancer','Diagnoses_NonCancer',
9
- 'CurrentMedicationTreatment_Cancer','CurrentMedicationTreatment_NonCancer',
10
- 'Biomarkers']
11
- # Initialize an empty DataFrame
12
- combined_df = pd.DataFrame()
13
-
14
- # Loop through each file name in the list
15
- for datalist in datalists:
16
- # Construct the file path
17
- base_dir = os.path.dirname(os.path.abspath(__file__))
18
- file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
19
-
20
- # Load the CSV file into a DataFrame
21
- df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
22
-
23
- # Add the 'domain_datalist' column
24
- df['domain_datalist'] = datalist
25
-
26
- # Append the DataFrame to the combined DataFrame
27
- combined_df = pd.concat([combined_df, df], ignore_index=True)
28
- #print(combined_df)
29
-
30
- # Self-join on df.pid = df.id
31
- joined_df = pd.merge(combined_df, combined_df, left_on=['pid', 'domain_datalist'], right_on=['id', 'domain_datalist'],
32
- suffixes=('_child', '_parent'))
33
-
34
- # Create dictionary:
35
- parents = dict(zip(joined_df['value_child'], joined_df['value_parent']))
36
-
37
- return parents
38
-
39
- def get_datalist(datalist:str) -> DataFrame:
40
- # Construct the file path
41
- base_dir = os.path.dirname(os.path.abspath(__file__))
42
- file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
43
-
44
- # Load the CSV file into a DataFrame
45
- df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
46
-
47
- return df
48
-
49
- def get_parents(datalist:str):
50
- # Construct the file path
51
- base_dir = os.path.dirname(os.path.abspath(__file__))
52
- file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
53
-
54
- # Load the CSV file into a DataFrame
55
- df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
56
-
57
- # extract only parent items
58
- parents = df[df['pid'].isnull()][['value']]
59
-
60
- return parents
61
-
62
- # Get children with specific parent from data list
63
- def get_children(datalist:str, parent:str):
64
- # Construct the file path
65
- base_dir = os.path.dirname(os.path.abspath(__file__))
66
- file_path = os.path.join(base_dir, 'data', f'{datalist}.csv')
67
-
68
- # Load the CSV file into a DataFrame
69
- df = pd.read_csv(file_path, usecols=['id', 'pid', 'value'])
70
-
71
- parent_id = df[df['value'] == parent].iloc[0]['id']
72
-
73
- # extract only items with particular parent
74
- children = df[df['pid'] == parent_id][['value']]
75
-
76
- return children
77
-
78
- # check if there is exact case insensitive match among all datalist items
79
- # or items without parent (if childOnly=True)
80
- def datalist_contains_value(datalist: DataFrame, diagnosis:str, childOnly=False) -> bool:
81
- items = datalist[datalist['value'].str.lower() == diagnosis.lower()]
82
-
83
- if childOnly:
84
- items = items[items['pid'].notna()]
85
-
86
- return items.shape[0] > 0
@@ -1,229 +0,0 @@
1
- import pandas as pd
2
- import pytest
3
- import numpy as np
4
- import src.validation as v
5
-
6
-
7
- def test_bootstrap_CI_basic():
8
- """Test basic functionality of bootstrap_CI"""
9
- # Create test data with comparison results
10
- res_df = pd.DataFrame({
11
- 'field1': ['A', 'B', 'A', 'B', 'A'] * 20, # 100 rows total
12
- 'field2': ['X', 'Y', 'X', 'Y', 'X'] * 20,
13
- 'Cor: field1': [1, 0, 1, 1, 0] * 20,
14
- 'Inc: field1': [0, 1, 0, 0, 1] * 20,
15
- 'Mis: field1': [0, 0, 0, 0, 0] * 20,
16
- 'Spu: field1': [0, 1, 0, 0, 1] * 20,
17
- 'Par: field1': [0, 0, 0, 0, 0] * 20,
18
- 'Cor: field2': [1, 1, 0, 1, 1] * 20,
19
- 'Inc: field2': [0, 0, 1, 0, 0] * 20,
20
- 'Mis: field2': [0, 0, 1, 0, 0] * 20,
21
- 'Spu: field2': [0, 0, 0, 0, 0] * 20,
22
- 'Par: field2': [0, 0, 0, 0, 0] * 20,
23
- })
24
-
25
- fields = ['field1', 'field2']
26
- result = v.bootstrap_CI(res_df, fields, n_bootstrap=100, random_state=42)
27
-
28
- # Check output format
29
- assert 'field' in result.columns
30
- assert len(result) == 4 # Two fields + exceptions field + N/CI info row
31
- expected_fields = {'field1', 'field2', 'exceptions', 'N=100; CI=95%'}
32
- assert set(result['field']) == expected_fields
33
-
34
- # Check that confidence interval columns are present for metrics that exist in our data
35
- # We know these will be present because we've included them in our test data
36
- core_metrics = ['field-present cases', 'cor', 'inc', 'mis', 'spu', 'par']
37
-
38
- # 'labeled cases' is handled specially - just appears as 'labeled cases'
39
- assert 'labeled cases' in result.columns
40
-
41
- for metric in core_metrics:
42
- assert f'{metric}: mean' in result.columns
43
- assert f'{metric}: lower' in result.columns
44
- assert f'{metric}: upper' in result.columns
45
-
46
- # Check that means are reasonable (between lower and upper bounds) for non-exception fields
47
- for _, row in result.iterrows():
48
- if row['field'] in ['exceptions', 'N=100; CI=95%']: # Skip exceptions and info row
49
- for metric in core_metrics:
50
- mean_col = f'{metric}: mean'
51
- lower_col = f'{metric}: lower'
52
- upper_col = f'{metric}: upper'
53
-
54
- if pd.notna(row[mean_col]) and pd.notna(row[lower_col]) and pd.notna(row[upper_col]):
55
- assert row[lower_col] <= row[mean_col] <= row[upper_col], \
56
- f"Mean not between bounds for {row['field']} {metric}"
57
-
58
-
59
- def test_bootstrap_CI_error_conditions():
60
- """Test error conditions for bootstrap_CI"""
61
-
62
- # Test with ci outside valid range
63
- res_df = pd.DataFrame({
64
- 'field1': [1, 2, 3],
65
- 'Cor: field1': [1, 0, 1]
66
- })
67
-
68
- with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
69
- v.bootstrap_CI(res_df, ['field1'], ci=1.5)
70
-
71
- with pytest.raises(ValueError, match="ci must be in \\(0, 1\\)"):
72
- v.bootstrap_CI(res_df, ['field1'], ci=0)
73
-
74
- # Test with too few rows
75
- single_row_df = pd.DataFrame({
76
- 'field1': [1],
77
- 'Cor: field1': [1]
78
- })
79
-
80
- with pytest.raises(ValueError, match="Need at least 2 rows"):
81
- v.bootstrap_CI(single_row_df, ['field1'])
82
-
83
- # Test with missing labels (NaN values)
84
- res_df_with_nan = pd.DataFrame({
85
- 'field1': [1, np.nan, 3],
86
- 'field2': [1, 2, 3],
87
- 'Cor: field1': [1, 0, 1],
88
- 'Cor: field2': [0, 1, 0]
89
- })
90
-
91
- with pytest.raises(ValueError, match="Missing labels \\(NaN\\) found in the following fields: \\['field1'\\]"):
92
- v.bootstrap_CI(res_df_with_nan, ['field1', 'field2'])
93
-
94
-
95
- def test_bootstrap_CI_binary_field():
96
- """Test bootstrap_CI with binary field metrics"""
97
- # Create test data with binary field results
98
- res_df = pd.DataFrame({
99
- 'binary_field': [True, False, True, False] * 25, # 100 rows
100
- 'TP: binary_field': [1, 0, 1, 0] * 25,
101
- 'FP: binary_field': [0, 1, 0, 1] * 25,
102
- 'FN: binary_field': [0, 0, 0, 0] * 25,
103
- 'TN: binary_field': [0, 1, 0, 1] * 25,
104
- 'Precision: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
105
- 'Recall: binary_field': [1.0, np.nan, 1.0, np.nan] * 25,
106
- 'F1 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
107
- 'F2 score: binary_field': [1.0, 0.0, 1.0, 0.0] * 25,
108
- })
109
-
110
- fields = ['binary_field']
111
- result = v.bootstrap_CI(res_df, fields, n_bootstrap=50, random_state=42)
112
-
113
- # Check that binary metrics are included
114
- binary_metrics = ['TP', 'FP', 'FN', 'TN', 'precision (micro)', 'recall (micro)',
115
- 'F1 score (micro)', 'F2 score (micro)', 'accuracy (micro)', 'specificity (micro)']
116
-
117
- for metric in binary_metrics:
118
- assert f'{metric}: mean' in result.columns
119
- assert f'{metric}: lower' in result.columns
120
- assert f'{metric}: upper' in result.columns
121
-
122
- # Check that N/CI info row is present
123
- info_rows = result[result['field'].str.startswith('N=')]
124
- assert len(info_rows) == 1
125
-
126
-
127
- def test_bootstrap_CI_output_format():
128
- """Test that output format matches specification"""
129
- res_df = pd.DataFrame({
130
- 'test_field': ['A', 'B'] * 50, # 100 rows
131
- 'Cor: test_field': [1, 0] * 50,
132
- 'Inc: test_field': [0, 1] * 50,
133
- 'Mis: test_field': [0, 0] * 50,
134
- 'Spu: test_field': [0, 1] * 50,
135
- 'Par: test_field': [0, 0] * 50,
136
- })
137
-
138
- result = v.bootstrap_CI(res_df, ['test_field'], n_bootstrap=10, random_state=42)
139
-
140
- # Check that result has the correct format
141
- assert len(result) == 3 # One field + exceptions + N/CI info row
142
- test_field_row = result[result['field'] == 'test_field'].iloc[0]
143
- assert test_field_row['field'] == 'test_field'
144
-
145
- # Check that columns follow the expected pattern
146
- metric_columns = [col for col in result.columns if col not in ['field', 'labeled cases']]
147
- for col in metric_columns:
148
- assert ': ' in col, f"Column {col} doesn't follow expected format"
149
- metric_name, stat_type = col.split(': ', 1)
150
- assert stat_type in ['mean', 'lower', 'upper'], f"Unexpected stat type in {col}"
151
-
152
-
153
- def test_bootstrap_CI_confidence_intervals():
154
- """Test that confidence intervals make sense"""
155
- # Create deterministic test case
156
- res_df = pd.DataFrame({
157
- 'field1': [1] * 100,
158
- 'Cor: field1': [5] * 100, # Constant values for predictable CI
159
- 'Inc: field1': [0] * 100,
160
- 'Mis: field1': [0] * 100,
161
- 'Spu: field1': [0] * 100,
162
- 'Par: field1': [0] * 100,
163
- })
164
-
165
- result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=100, ci=0.95, random_state=42)
166
-
167
- # For constant values, mean should equal the sum (get_metrics sums the values)
168
- row = result[result['field'] == 'field1'].iloc[0]
169
-
170
- # Check that mean is close to expected value (5 * 100 = 500)
171
- assert abs(row['cor: mean'] - 500.0) < 10.0
172
-
173
- # Check that CI bounds are reasonable (close to mean for constant data)
174
- assert abs(row['cor: lower'] - row['cor: mean']) < 50.0
175
- assert abs(row['cor: upper'] - row['cor: mean']) < 50.0
176
-
177
-
178
- def test_bootstrap_CI_with_different_ci_levels():
179
- """Test bootstrap_CI with different confidence interval levels"""
180
- res_df = pd.DataFrame({
181
- 'field1': [1, 2, 3] * 34, # ~100 rows
182
- 'Cor: field1': [1, 2, 1] * 34,
183
- 'Inc: field1': [0, 1, 0] * 34,
184
- 'Mis: field1': [0, 0, 1] * 34,
185
- 'Spu: field1': [1, 0, 0] * 34,
186
- 'Par: field1': [0, 0, 0] * 34,
187
- })
188
-
189
- # Test 90% CI
190
- result_90 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.90, random_state=42)
191
-
192
- # Test 99% CI
193
- result_99 = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=50, ci=0.99, random_state=42)
194
-
195
- # 99% CI should be wider than 90% CI
196
- row_90 = result_90[result_90['field'] == 'field1'].iloc[0]
197
- row_99 = result_99[result_99['field'] == 'field1'].iloc[0]
198
-
199
- width_90 = row_90['cor: upper'] - row_90['cor: lower']
200
- width_99 = row_99['cor: upper'] - row_99['cor: lower']
201
-
202
- assert width_99 >= width_90, "99% CI should be wider than 90% CI"
203
-
204
-
205
- def test_bootstrap_CI_empty_metrics():
206
- """Test bootstrap_CI handles missing values correctly"""
207
- # Create simpler test data that focuses on core functionality
208
- res_df = pd.DataFrame({
209
- 'field1': [1, 2, 3] * 34,
210
- 'Cor: field1': [0, 1, 2] * 34, # Valid values
211
- 'Inc: field1': [0, 0, 0] * 34,
212
- 'Mis: field1': [0, 0, 0] * 34,
213
- 'Spu: field1': [0, 0, 0] * 34,
214
- 'Par: field1': [0, 0, 0] * 34,
215
- })
216
-
217
- result = v.bootstrap_CI(res_df, ['field1'], n_bootstrap=10, random_state=42)
218
-
219
- # Check that core metrics appear in output
220
- core_cols = [col for col in result.columns if 'cor:' in col]
221
- assert len(core_cols) == 3, f"Expected 3 cor metrics (mean, lower, upper), got {len(core_cols)}: {core_cols}"
222
-
223
- # Check that the function completes without errors for this simpler case
224
- assert 'field' in result.columns
225
- assert len(result) >= 2 # At least exceptions + field1 + N/CI info row
226
-
227
- # Check that N/CI info row is present
228
- info_rows = result[result['field'].str.startswith('N=')]
229
- assert len(info_rows) == 1