voihla 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
voihla-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Alyssa Paynter
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
voihla-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,177 @@
1
+ Metadata-Version: 2.4
2
+ Name: voihla
3
+ Version: 0.1.0
4
+ Summary: HLA Imputation Validation Package
5
+ Author: Alyssa Paynter
6
+ Author-email: Alyssa Paynter <apaynter@tulane.edu>
7
+ License-Expression: MIT
8
+ Requires-Python: >=3.8
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: pandas>=1.3.0
12
+ Requires-Dist: numpy>=1.20.0
13
+ Requires-Dist: scikit-learn>=1.0.0
14
+ Requires-Dist: matplotlib>=3.3.0
15
+ Requires-Dist: requests>=2.25.0
16
+ Requires-Dist: py-ard>=1.0.0
17
+ Dynamic: author
18
+ Dynamic: license-file
19
+ Dynamic: requires-python
20
+
21
+ # VOIHLA - Validation of Imputed HLA
22
+
23
+ Compute high resolution HLA imputation validation metrics using the `voihla` Python package and scikit-learn model evaluation statistics.
24
+
25
+ ## Overview
26
+
27
+ The `voihla` package provides tools to preprocess, analyze, and visualize HLA imputation results. It supports single-locus, multilocus, and eplet-level analyses using standard metrics and calibration plots.
28
+
29
+ ## Installation
30
+
31
+ Clone the GitHub repository and navigate to the project directory:
32
+ ``` bash
33
+ git clone git@github.com:lgragert/imputation-validation.git
34
+ ```
35
+
36
+ Install dependencies using pip:
37
+ ```bash
38
+ pip install .
39
+ ```
40
+
41
+ ## Example Input Files
42
+
43
+ - `imputation.csv`: Imputation output with predicted HLA haplotype pairs and probabilities.
44
+ - `truth_table.csv`: High resolution genotype truth table in GLString format.
45
+
46
+ Example contents of files that are input for the package:
47
+
48
+ imputation.csv
49
+ ```
50
+ ID,Rank,Hap1,Hap2,HapPair_Prob
51
+ D3505,1,A*30:02~B*14:02,A*32:01~B*39:10,0.3150459288416418
52
+ D3505,2,A*30:02~B*14:02,A*32:01~B*39:01,0.2673517305598033
53
+ D3505,3,A*30:02~B*39:10,A*32:01~B*14:02,0.09971243338882652
54
+ D3505,4,A*30:02~B*14:02,A*32:01~B*39:06,0.09552497014201682
55
+ D3505,5,A*30:01~B*14:02,A*32:01~B*39:10,0.0787155933156964
56
+ D3505,6,A*30:01~B*14:02,A*32:01~B*39:01,0.06679899077690125
57
+ D3505,7,A*30:01~B*39:10,A*32:01~B*14:02,0.02920649369703246
58
+ D3505,8,A*30:01~B*14:02,A*32:01~B*39:06,0.02386732857916924
59
+ D3505,9,A*30:02~B*14:02,A*32:01~B*39:24,0.009453181190469357
60
+ D3505,10,A*30:01~B*14:02,A*32:01~B*39:24,0.002361918368107546
61
+ D3505,11,A*30:02~B*39:01,A*32:01~B*14:02,0.002257484294942207
62
+ D13880,1,A*30:02~B*07:02,A*34:02~B*53:01,0.40269177048888066
63
+ D13880,2,A*30:01~B*07:02,A*34:02~B*53:01,0.20308144129546576
64
+ D13880,3,A*30:02~B*53:01,A*34:02~B*07:02,0.13918038201193
65
+ D13880,4,A*30:01~B*53:01,A*34:02~B*07:02,0.11366198792610206
66
+ D13880,5,A*30:02~B*07:05,A*34:02~B*53:01,0.04839353857099623
67
+ D13880,6,A*30:01~B*07:05,A*34:02~B*53:01,0.04136109507629823
68
+ D13880,7,A*30:02~B*53:01,A*34:02~B*07:09,0.011395011799743957
69
+ D13880,8,A*30:02~B*53:01,A*34:02~B*07:05,0.010829180412477053
70
+ D13880,9,A*30:01~B*53:01,A*34:02~B*07:09,0.009305763318635456
71
+ D13880,10,A*30:01~B*53:01,A*34:02~B*07:05,0.008843675778868338
72
+ D13880,11,A*30:04~B*53:01,A*34:02~B*07:02,0.0027297420425591353
73
+ ```
74
+
75
+ `truth_table.csv`
76
+ ```
77
+ ID,GLString
78
+ D3505,A*30:02+A*32:01^B*14:02+B*39:01
79
+ D13880,A*30:02+A*34:02^B*07:05+B*53:01
80
+ ```
81
+
82
+ ## Usage
83
+
84
+ All main modules are in the `voihla` folder.
85
+
86
+ ### Preprocessing
87
+
88
+ Convert raw imputation files to analysis-ready format:
89
+
90
+ ```python
91
+ from voihla.preprocessing import ImputationPreprocessor
92
+
93
+ preprocessor = ImputationPreprocessor()
94
+ top_impute = preprocessor.process_files(['imputation.csv']) # Can pass multiple files in a list
95
+ top_impute.to_csv('lowres_topprob_impute.csv', index=False)
96
+ ```
97
+
98
+ This will create a variable that will have every GLString in the imputation file ready for SLUG and MUG analyses depending on how many loci are available in your file.
99
+
100
+ ### Single-Locus Analysis
101
+ ```Python
102
+ import pandas as pd
103
+ from voihla.analysis import SingleLocusAnalysis
104
+ from voihla.preprocessing import ImputationPreprocessor
105
+
106
+ preprocessor = ImputationPreprocessor()
107
+ impute_df = preprocessor.process_files(['imputation.csv'])
108
+ truth_df = pd.read_csv('truth_table.csv') # If your truth table is in a clean format then you just need to create a DataFrame
109
+ analysis = SingleLocusAnalysis(truth_df, impute_df)
110
+ results = analysis.get_results_df()
111
+ print(results)
112
+ ```
113
+
114
+ The `results` DataFrame will contain the variables required for Calibration plots.
115
+
116
+ y_true = if the imputation matches the truth table then 1, otherwise 0.
117
+
118
+ y_pred = 1 if the imputation is predicted to be correct (its probability passes the threshold), otherwise 0 (threshold is 0.5 and can be changed).
119
+
120
+ y_prob = the actual probability of the imputation.
121
+
122
+ ### Multilocus Analysis
123
+ ```Python
124
+ from voihla import ImputationPreprocessor, MultiLocusAnalysis
125
+ import pandas as pd
126
+
127
+ preprocessor = ImputationPreprocessor()
128
+ impute_df = preprocessor.process_files(['imputation.csv'])
129
+ truth_df = pd.read_csv('truth_table.csv')
130
+ analysis = MultiLocusAnalysis(truth_df, impute_df)
131
+ results = analysis.get_results_df()
132
+ print(results)
133
+ ```
134
+ The `results` DataFrame will contain the variables required for Calibration plots.
135
+
136
+ y_true = if the imputation matches the truth table then 1, otherwise 0.
137
+
138
+ y_pred = 1 if the imputation is predicted to be correct (its probability passes the threshold), otherwise 0 (threshold is 0.5 and can be changed).
139
+
140
+ y_prob = the actual probability of the imputation.
141
+
142
+
143
+ ### Calibration Plots
144
+
145
+ Calibration plots can be generated using the `CalibrationPlotter` class from the `voihla.plotting` module.
146
+
147
+ Can take either SingleLocusAnalysis or MultiLocusAnalysis results DataFrame as input.
148
+
149
+ ``` Python
150
+ from voihla.plotting import CalibrationPlotter
151
+
152
+ plotter = CalibrationPlotter(n_bins=4)
153
+ locus = 'A'
154
+ df = analysis.get_results_df()[locus]
155
+ fig = plotter.calibration_plot(analysis_results=df, title=f'Calibration {locus}', save_path=f'Calibration_{locus}.png')
156
+ ```
157
+
158
+ ### Eplet-Level Analysis
159
+ Still a work in progress, but you can use the following code to get simulated pairs for now.
160
+ ```python
161
+ from voihla.eplet import MonteCarloEpletAnalysis
162
+
163
+ eplet_analysis = MonteCarloEpletAnalysis(api_key='YOUR_API_KEY')
164
+ pairs_df = eplet_analysis.create_random_pairs('truth_table.csv', n_pairs=100)
165
+ results_df = eplet_analysis.analyze_eplet_mismatches(pairs_df)
166
+ results_df.to_csv('DRDQ_eplet_lowres_impute100.csv', index=False)
167
+ ```
168
+
169
+ ### Output
170
+ - Calibration plots saved as PNG files
171
+ - ROC curves saved as PNG files
172
+ - Classification reports
173
+ - Summary CSV files
174
+
175
+ ### API Reference
176
+ Please go to the Eplet Registry for an API key.
177
+
voihla-0.1.0/README.md ADDED
@@ -0,0 +1,157 @@
1
+ # VOIHLA - Validation of Imputed HLA
2
+
3
+ Compute high resolution HLA imputation validation metrics using the `voihla` Python package and scikit-learn model evaluation statistics.
4
+
5
+ ## Overview
6
+
7
+ The `voihla` package provides tools to preprocess, analyze, and visualize HLA imputation results. It supports single-locus, multilocus, and eplet-level analyses using standard metrics and calibration plots.
8
+
9
+ ## Installation
10
+
11
+ Clone the GitHub repository and navigate to the project directory:
12
+ ``` bash
13
+ git clone git@github.com:lgragert/imputation-validation.git
14
+ ```
15
+
16
+ Install dependencies using pip:
17
+ ```bash
18
+ pip install .
19
+ ```
20
+
21
+ ## Example Input Files
22
+
23
+ - `imputation.csv`: Imputation output with predicted HLA haplotype pairs and probabilities.
24
+ - `truth_table.csv`: High resolution genotype truth table in GLString format.
25
+
26
+ Example contents of files that are input for the package:
27
+
28
+ imputation.csv
29
+ ```
30
+ ID,Rank,Hap1,Hap2,HapPair_Prob
31
+ D3505,1,A*30:02~B*14:02,A*32:01~B*39:10,0.3150459288416418
32
+ D3505,2,A*30:02~B*14:02,A*32:01~B*39:01,0.2673517305598033
33
+ D3505,3,A*30:02~B*39:10,A*32:01~B*14:02,0.09971243338882652
34
+ D3505,4,A*30:02~B*14:02,A*32:01~B*39:06,0.09552497014201682
35
+ D3505,5,A*30:01~B*14:02,A*32:01~B*39:10,0.0787155933156964
36
+ D3505,6,A*30:01~B*14:02,A*32:01~B*39:01,0.06679899077690125
37
+ D3505,7,A*30:01~B*39:10,A*32:01~B*14:02,0.02920649369703246
38
+ D3505,8,A*30:01~B*14:02,A*32:01~B*39:06,0.02386732857916924
39
+ D3505,9,A*30:02~B*14:02,A*32:01~B*39:24,0.009453181190469357
40
+ D3505,10,A*30:01~B*14:02,A*32:01~B*39:24,0.002361918368107546
41
+ D3505,11,A*30:02~B*39:01,A*32:01~B*14:02,0.002257484294942207
42
+ D13880,1,A*30:02~B*07:02,A*34:02~B*53:01,0.40269177048888066
43
+ D13880,2,A*30:01~B*07:02,A*34:02~B*53:01,0.20308144129546576
44
+ D13880,3,A*30:02~B*53:01,A*34:02~B*07:02,0.13918038201193
45
+ D13880,4,A*30:01~B*53:01,A*34:02~B*07:02,0.11366198792610206
46
+ D13880,5,A*30:02~B*07:05,A*34:02~B*53:01,0.04839353857099623
47
+ D13880,6,A*30:01~B*07:05,A*34:02~B*53:01,0.04136109507629823
48
+ D13880,7,A*30:02~B*53:01,A*34:02~B*07:09,0.011395011799743957
49
+ D13880,8,A*30:02~B*53:01,A*34:02~B*07:05,0.010829180412477053
50
+ D13880,9,A*30:01~B*53:01,A*34:02~B*07:09,0.009305763318635456
51
+ D13880,10,A*30:01~B*53:01,A*34:02~B*07:05,0.008843675778868338
52
+ D13880,11,A*30:04~B*53:01,A*34:02~B*07:02,0.0027297420425591353
53
+ ```
54
+
55
+ `truth_table.csv`
56
+ ```
57
+ ID,GLString
58
+ D3505,A*30:02+A*32:01^B*14:02+B*39:01
59
+ D13880,A*30:02+A*34:02^B*07:05+B*53:01
60
+ ```
61
+
62
+ ## Usage
63
+
64
+ All main modules are in the `voihla` folder.
65
+
66
+ ### Preprocessing
67
+
68
+ Convert raw imputation files to analysis-ready format:
69
+
70
+ ```python
71
+ from voihla.preprocessing import ImputationPreprocessor
72
+
73
+ preprocessor = ImputationPreprocessor()
74
+ top_impute = preprocessor.process_files(['imputation.csv']) # Can pass multiple files in a list
75
+ top_impute.to_csv('lowres_topprob_impute.csv', index=False)
76
+ ```
77
+
78
+ This will create a variable that will have every GLString in the imputation file ready for SLUG and MUG analyses depending on how many loci are available in your file.
79
+
80
+ ### Single-Locus Analysis
81
+ ```Python
82
+ import pandas as pd
83
+ from voihla.analysis import SingleLocusAnalysis
84
+ from voihla.preprocessing import ImputationPreprocessor
85
+
86
+ preprocessor = ImputationPreprocessor()
87
+ impute_df = preprocessor.process_files(['imputation.csv'])
88
+ truth_df = pd.read_csv('truth_table.csv') # If your truth table is in a clean format then you just need to create a DataFrame
89
+ analysis = SingleLocusAnalysis(truth_df, impute_df)
90
+ results = analysis.get_results_df()
91
+ print(results)
92
+ ```
93
+
94
+ The `results` DataFrame will contain the variables required for Calibration plots.
95
+
96
+ y_true = if the imputation matches the truth table then 1, otherwise 0.
97
+
98
+ y_pred = 1 if the imputation is predicted to be correct (its probability passes the threshold), otherwise 0 (threshold is 0.5 and can be changed).
99
+
100
+ y_prob = the actual probability of the imputation.
101
+
102
+ ### Multilocus Analysis
103
+ ```Python
104
+ from voihla import ImputationPreprocessor, MultiLocusAnalysis
105
+ import pandas as pd
106
+
107
+ preprocessor = ImputationPreprocessor()
108
+ impute_df = preprocessor.process_files(['imputation.csv'])
109
+ truth_df = pd.read_csv('truth_table.csv')
110
+ analysis = MultiLocusAnalysis(truth_df, impute_df)
111
+ results = analysis.get_results_df()
112
+ print(results)
113
+ ```
114
+ The `results` DataFrame will contain the variables required for Calibration plots.
115
+
116
+ y_true = if the imputation matches the truth table then 1, otherwise 0.
117
+
118
+ y_pred = 1 if the imputation is predicted to be correct (its probability passes the threshold), otherwise 0 (threshold is 0.5 and can be changed).
119
+
120
+ y_prob = the actual probability of the imputation.
121
+
122
+
123
+ ### Calibration Plots
124
+
125
+ Calibration plots can be generated using the `CalibrationPlotter` class from the `voihla.plotting` module.
126
+
127
+ Can take either SingleLocusAnalysis or MultiLocusAnalysis results DataFrame as input.
128
+
129
+ ``` Python
130
+ from voihla.plotting import CalibrationPlotter
131
+
132
+ plotter = CalibrationPlotter(n_bins=4)
133
+ locus = 'A'
134
+ df = analysis.get_results_df()[locus]
135
+ fig = plotter.calibration_plot(analysis_results=df, title=f'Calibration {locus}', save_path=f'Calibration_{locus}.png')
136
+ ```
137
+
138
+ ### Eplet-Level Analysis
139
+ Still a work in progress, but you can use the following code to get simulated pairs for now.
140
+ ```python
141
+ from voihla.eplet import MonteCarloEpletAnalysis
142
+
143
+ eplet_analysis = MonteCarloEpletAnalysis(api_key='YOUR_API_KEY')
144
+ pairs_df = eplet_analysis.create_random_pairs('truth_table.csv', n_pairs=100)
145
+ results_df = eplet_analysis.analyze_eplet_mismatches(pairs_df)
146
+ results_df.to_csv('DRDQ_eplet_lowres_impute100.csv', index=False)
147
+ ```
148
+
149
+ ### Output
150
+ - Calibration plots saved as PNG files
151
+ - ROC curves saved as PNG files
152
+ - Classification reports
153
+ - Summary CSV files
154
+
155
+ ### API Reference
156
+ Please go to the Eplet Registry for an API key.
157
+
@@ -0,0 +1,18 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "voihla"
7
+ version = "0.1.0"
8
+ description = "HLA Imputation Validation Package"
9
+ authors = [{name = "Alyssa Paynter", email = "apaynter@tulane.edu"}]
10
+ dependencies = ["pandas>=1.3.0",
11
+ "numpy>=1.20.0",
12
+ "scikit-learn>=1.0.0",
13
+ "matplotlib>=3.3.0",
14
+ "requests>=2.25.0",
15
+ "py-ard>=1.0.0"]
16
+ readme = "README.md"
17
+ license = "MIT"
18
+ requires-python = ">=3.8"
voihla-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
voihla-0.1.0/setup.py ADDED
@@ -0,0 +1,28 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name="voihla",
5
+ version="0.1.0",
6
+ author="Alyssa Paynter",
7
+ description="Python package for HLA imputation validation metrics using scikit-learn",
8
+ long_description=open("README.md").read(),
9
+ long_description_content_type="text/markdown",
10
+ packages=find_packages(),
11
+ install_requires=[
12
+ "pandas>=1.3.0",
13
+ "numpy>=1.20.0",
14
+ "scikit-learn>=1.0.0",
15
+ "matplotlib>=3.3.0",
16
+ "requests>=2.25.0",
17
+ "pyard>=2.0.0"
18
+ ],
19
+ python_requires=">=3.8",
20
+ classifiers=[
21
+ "Development Status :: 3 - Alpha",
22
+ "Intended Audience :: Science/Research",
23
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
24
+ "Programming Language :: Python :: 3.8",
25
+ "Programming Language :: Python :: 3.9",
26
+ "Programming Language :: Python :: 3.10",
27
+ ],
28
+ )
@@ -0,0 +1,17 @@
1
+ """HLA Imputation Validation Package."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+ from .analysis import SingleLocusAnalysis, MultiLocusAnalysis
6
+ from .plotting import CalibrationPlotter
7
+ from .preprocessing import ImputationPreprocessor
8
+ from .eplet import MonteCarloEpletAnalysis, EpletAnalysis
9
+
10
+ __all__ = [
11
+ "SingleLocusAnalysis",
12
+ "MultiLocusAnalysis",
13
+ "CalibrationPlotter",
14
+ "ImputationPreprocessor",
15
+ "MonteCarloEpletAnalysis",
16
+ "EpletAnalysis"
17
+ ]