sortscore 0.1.0b2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. sortscore-0.1.0b2/LICENSE +21 -0
  2. sortscore-0.1.0b2/PKG-INFO +67 -0
  3. sortscore-0.1.0b2/README.md +39 -0
  4. sortscore-0.1.0b2/pyproject.toml +43 -0
  5. sortscore-0.1.0b2/setup.cfg +4 -0
  6. sortscore-0.1.0b2/sortscore/__init__.py +19 -0
  7. sortscore-0.1.0b2/sortscore/__main__.py +7 -0
  8. sortscore-0.1.0b2/sortscore/analysis/__init__.py +7 -0
  9. sortscore-0.1.0b2/sortscore/analysis/aa_scores.py +221 -0
  10. sortscore-0.1.0b2/sortscore/analysis/annotation.py +191 -0
  11. sortscore-0.1.0b2/sortscore/analysis/batch_config.py +170 -0
  12. sortscore-0.1.0b2/sortscore/analysis/batch_normalization.py +928 -0
  13. sortscore-0.1.0b2/sortscore/analysis/batch_workflow.py +76 -0
  14. sortscore-0.1.0b2/sortscore/analysis/filtering.py +53 -0
  15. sortscore-0.1.0b2/sortscore/analysis/normalize_read_depth.py +99 -0
  16. sortscore-0.1.0b2/sortscore/analysis/score.py +243 -0
  17. sortscore-0.1.0b2/sortscore/analysis/statistics.py +201 -0
  18. sortscore-0.1.0b2/sortscore/analysis/summary_stats.py +182 -0
  19. sortscore-0.1.0b2/sortscore/analysis/variant_aggregation.py +94 -0
  20. sortscore-0.1.0b2/sortscore/analysis/workflows.py +237 -0
  21. sortscore-0.1.0b2/sortscore/cli.py +48 -0
  22. sortscore-0.1.0b2/sortscore/run_analysis.py +235 -0
  23. sortscore-0.1.0b2/sortscore/run_batch_analysis.py +84 -0
  24. sortscore-0.1.0b2/sortscore/utils/analysis_logger.py +277 -0
  25. sortscore-0.1.0b2/sortscore/utils/console_utils.py +208 -0
  26. sortscore-0.1.0b2/sortscore/utils/experiment_setup.py +325 -0
  27. sortscore-0.1.0b2/sortscore/utils/file_utils.py +158 -0
  28. sortscore-0.1.0b2/sortscore/utils/load_experiment.py +667 -0
  29. sortscore-0.1.0b2/sortscore/utils/sequence_parsing.py +316 -0
  30. sortscore-0.1.0b2/sortscore/utils/tile_configs.py +115 -0
  31. sortscore-0.1.0b2/sortscore/utils/variant_detection.py +329 -0
  32. sortscore-0.1.0b2/sortscore/utils/variant_parsing.py +68 -0
  33. sortscore-0.1.0b2/sortscore/visualization/__init__.py +5 -0
  34. sortscore-0.1.0b2/sortscore/visualization/correlations.py +358 -0
  35. sortscore-0.1.0b2/sortscore/visualization/heatmap_matrix.py +180 -0
  36. sortscore-0.1.0b2/sortscore/visualization/heatmap_workflow.py +218 -0
  37. sortscore-0.1.0b2/sortscore/visualization/heatmaps.py +737 -0
  38. sortscore-0.1.0b2/sortscore/visualization/plots.py +358 -0
  39. sortscore-0.1.0b2/sortscore.egg-info/PKG-INFO +67 -0
  40. sortscore-0.1.0b2/sortscore.egg-info/SOURCES.txt +42 -0
  41. sortscore-0.1.0b2/sortscore.egg-info/dependency_links.txt +1 -0
  42. sortscore-0.1.0b2/sortscore.egg-info/entry_points.txt +2 -0
  43. sortscore-0.1.0b2/sortscore.egg-info/requires.txt +7 -0
  44. sortscore-0.1.0b2/sortscore.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 dbaldridge-lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,67 @@
1
+ Metadata-Version: 2.4
2
+ Name: sortscore
3
+ Version: 0.1.0b2
4
+ Summary: A modular Python package for Sort-seq variant analysis
5
+ Author-email: Caitlyn Chitwood <c.chitwood@wustl.edu>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/dbaldridge-lab/sortscore
8
+ Keywords: bioinformatics,sequencing,variant-analysis,sort-seq
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pandas>=2.0.0
21
+ Requires-Dist: numpy>=1.24.0
22
+ Requires-Dist: scipy>=1.10.0
23
+ Requires-Dist: matplotlib>=3.6.0
24
+ Requires-Dist: seaborn>=0.12.0
25
+ Requires-Dist: biopython>=1.81
26
+ Requires-Dist: mavehgvs>=0.7.0
27
+ Dynamic: license-file
28
+
29
+ # sortscore
30
+
31
+ `sortscore` is a Python package for Sort-seq variant analysis, including scoring, normalization, and visualization.
32
+
33
+ ## Quick Start
34
+
35
+ ```bash
36
+ git clone https://github.com/dbaldridge-lab/sortscore
37
+ cd sortscore
38
+ python -m venv .venv
39
+ source .venv/bin/activate
40
+ pip install -e .
41
+ ```
42
+ Run a standard variant scoring analysis:
43
+
44
+ ```bash
45
+ sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
46
+ ```
47
+ If you did not install the CLI entry point, run:
48
+
49
+ ```bash
50
+ python -m sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
51
+ ```
52
+
53
+ ## Documentation
54
+ - [Installation](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/installation.md)
55
+ - [Usage](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/usage.md)
56
+ - [CLI Arguments](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/cli_arguments.md)
57
+ - [Visualization](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/visualization.md)
58
+ - [Batch Processing](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/batch_processing.md)
59
+ - [Troubleshooting](https://github.com/dbaldridge-lab/sortscore/blob/main/TROUBLESHOOTING.md)
60
+ - [Contributing](https://github.com/dbaldridge-lab/sortscore/blob/main/CONTRIBUTING.md)
61
+
62
+ ## Demo
63
+
64
+ - [Single Experiment Scoring Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/single_experiment_demo.ipynb)
65
+ - [Batch Normalization Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/tiled_demo.ipynb)
66
+ - [Example Config](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/config.json)
67
+ - [Example Experiment Setup](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/experiment_setup.csv)
@@ -0,0 +1,39 @@
1
+ # sortscore
2
+
3
+ `sortscore` is a Python package for Sort-seq variant analysis, including scoring, normalization, and visualization.
4
+
5
+ ## Quick Start
6
+
7
+ ```bash
8
+ git clone https://github.com/dbaldridge-lab/sortscore
9
+ cd sortscore
10
+ python -m venv .venv
11
+ source .venv/bin/activate
12
+ pip install -e .
13
+ ```
14
+ Run a standard variant scoring analysis:
15
+
16
+ ```bash
17
+ sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
18
+ ```
19
+ If you did not install the CLI entry point, run:
20
+
21
+ ```bash
22
+ python -m sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
23
+ ```
24
+
25
+ ## Documentation
26
+ - [Installation](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/installation.md)
27
+ - [Usage](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/usage.md)
28
+ - [CLI Arguments](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/cli_arguments.md)
29
+ - [Visualization](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/visualization.md)
30
+ - [Batch Processing](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/batch_processing.md)
31
+ - [Troubleshooting](https://github.com/dbaldridge-lab/sortscore/blob/main/TROUBLESHOOTING.md)
32
+ - [Contributing](https://github.com/dbaldridge-lab/sortscore/blob/main/CONTRIBUTING.md)
33
+
34
+ ## Demo
35
+
36
+ - [Single Experiment Scoring Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/single_experiment_demo.ipynb)
37
+ - [Batch Normalization Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/tiled_demo.ipynb)
38
+ - [Example Config](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/config.json)
39
+ - [Example Experiment Setup](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/experiment_setup.csv)
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sortscore"
7
+ version = "0.1.0b2"
8
+ description = "A modular Python package for Sort-seq variant analysis"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Caitlyn Chitwood", email = "c.chitwood@wustl.edu" }
14
+ ]
15
+ keywords = ["bioinformatics", "sequencing", "variant-analysis", "sort-seq"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Science/Research",
19
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12"
25
+ ]
26
+ dependencies = [
27
+ "pandas>=2.0.0",
28
+ "numpy>=1.24.0",
29
+ "scipy>=1.10.0",
30
+ "matplotlib>=3.6.0",
31
+ "seaborn>=0.12.0",
32
+ "biopython>=1.81",
33
+ "mavehgvs>=0.7.0"
34
+ ]
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/dbaldridge-lab/sortscore"
38
+
39
+ [project.scripts]
40
+ sortscore = "sortscore.cli:main"
41
+
42
+ [tool.setuptools.packages.find]
43
+ include = ["sortscore", "sortscore.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,19 @@
1
+ """
2
+ sortscore: A modular Python package for Sort-seq variant analysis.
3
+
4
+ This package provides tools for analyzing Sort-seq experimental data,
5
+ calculating activity scores, and generating visualizations.
6
+ """
7
+
8
+ __version__ = "0.1.0b2"
9
+ __author__ = "Caitlyn Chitwood"
10
+ __email__ = "c.chitwood@wustl.edu"
11
+
12
+ # Import main classes for convenience
13
+ from .utils.load_experiment import ExperimentConfig
14
+ from .analysis.score import calculate_full_activity_scores
15
+
16
+ __all__ = [
17
+ "ExperimentConfig",
18
+ "calculate_full_activity_scores"
19
+ ]
@@ -0,0 +1,7 @@
1
+ """
2
+ Allow running the sortscore package as a module with: python -m sortscore
3
+ """
4
+ from sortscore.cli import main
5
+
6
+ if __name__ == "__main__":
7
+ main()
@@ -0,0 +1,7 @@
1
+ """
2
+ Analysis subpackage for Sort-seq variant analysis workflows.
3
+
4
+ This subpackage contains modules for scoring, normalization, filtering, statistics, variant aggregation, and analysis workflows.
5
+ """
6
+
7
+ from .score import calculate_activity_scores
@@ -0,0 +1,221 @@
1
+ """
2
+ Amino acid scores processing and export for Sort-seq analysis.
3
+
4
+ This module provides functions for processing amino acid scores from DNA variant data,
5
+ including aggregation of synonymous codons, statistical analysis, and file export.
6
+ """
7
+ import os
8
+ import logging
9
+ import pandas as pd
10
+ import numpy as np
11
+ from scipy import stats as scipy_stats
12
+ from typing import Tuple, List
13
+ from sortscore.analysis.statistics import calculate_codon_and_replicate_variance
14
+
15
+
16
def process_and_save_aa_scores(scores_df: pd.DataFrame, experiment, scores_dir: str,
                               output_suffix: str, analysis_logger) -> None:
    """
    Build the amino-acid-level score table and write it to a CSV file.

    The function returns immediately when ``scores_df`` has no ``aa_seq_diff``
    column. Otherwise it:

    - selects the averaged-score column matching ``experiment.avg_method``
    - drops rows whose averaged score is NaN
    - collects the replicate score columns (``Rep*...score``)
    - delegates aggregation/statistics to ``_check_codon_num``
    - rounds score columns via ``_round_score_columns``
    - writes the table to ``<experiment_name>_aa_scores_<output_suffix>.csv``
    - reports the written file through ``logging`` and ``analysis_logger``

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame containing variant scores and annotations
    experiment : ExperimentConfig
        Experiment configuration; ``avg_method`` and ``experiment_name`` are read
    scores_dir : str
        Directory to save scores file
    output_suffix : str
        Suffix for output filename
    analysis_logger : AnalysisLogger
        Logger instance; its ``log_output_file`` method is called once

    Examples
    --------
    >>> process_and_save_aa_scores(scores_df, experiment, 'output/scores', 'suffix', logger)
    """
    # Without AA-level differences there is nothing to export.
    if 'aa_seq_diff' not in scores_df.columns:
        return

    # 'simple-avg' maps to the bare column; other methods get a suffixed column.
    if experiment.avg_method != 'simple-avg':
        method_suffix = experiment.avg_method.replace('-', '_')
        score_col = f'avgscore_{method_suffix}'
    else:
        score_col = 'avgscore'

    scored_rows = scores_df.dropna(subset=[score_col])

    replicate_cols = [
        name for name in scored_rows.columns
        if name.startswith('Rep') and name.endswith('.score')
    ]

    aa_table = _check_codon_num(scored_rows, score_col, replicate_cols)
    aa_table = _round_score_columns(aa_table)

    # The same file name is used for the path and for the logger entry.
    csv_name = f"{experiment.experiment_name}_aa_scores_{output_suffix}.csv"
    csv_path = os.path.join(scores_dir, csv_name)
    aa_table.to_csv(csv_path, index=False)
    logging.info(f"Saved AA scores to {csv_path} ({len(aa_table)} unique AA variants)")

    analysis_logger.log_output_file(
        'aa_scores',
        csv_name,
        csv_path,
        variant_count=len(aa_table),
    )
81
+
82
+
83
def _check_codon_num(scores_df_drop_nan: pd.DataFrame, score_col: str,
                     rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Choose the AA-score processing path based on rows per AA variant.

    Rows are grouped by ``('aa_seq_diff', 'annotate_aa')``. If any group holds
    more than one row, the DNA->AA aggregation path (with codon variance) is
    used; otherwise the AA-only path (replicate variance only) is used.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        DataFrame with NaN values already filtered out
    score_col : str
        Name of the score column to use
    rep_score_columns : List[str]
        List of replicate score column names

    Returns
    -------
    pd.DataFrame
        Processed AA scores with appropriate statistics
    """
    rows_per_variant = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa']).size()

    # A single multi-row group is enough to require aggregation.
    if (rows_per_variant > 1).any():
        return _process_dna_to_aa_aggregation(scores_df_drop_nan, score_col, rep_score_columns)
    return _process_aa_only_scores(scores_df_drop_nan, rep_score_columns)
114
+
115
+
116
def _process_dna_to_aa_aggregation(scores_df_drop_nan: pd.DataFrame, score_col: str,
                                   rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Collapse rows sharing an AA variant into one row with codon statistics.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        DataFrame with variant scores (NaN filtered)
    score_col : str
        Name of the score column whose per-group spread is reported
    rep_score_columns : List[str]
        List of replicate score column names

    Returns
    -------
    pd.DataFrame
        One row per ``(aa_seq_diff, annotate_aa)`` group holding mean scores,
        ``SD_codon`` / ``n_codons`` of ``score_col``, and whatever
        ``calculate_codon_and_replicate_variance`` adds.
    """
    group_keys = ['aa_seq_diff', 'annotate_aa']
    by_aa_variant = scores_df_drop_nan.groupby(group_keys)

    # Spread and count of the row-level scores inside each AA group.
    codon_spread = by_aa_variant[score_col].agg(['std', 'count']).reset_index()
    codon_spread.columns = group_keys + ['SD_codon', 'n_codons']

    # Group means of the averaged scores and of every replicate score.
    mean_columns = ['avgscore', 'avgscore_rep_weighted'] + rep_score_columns
    aa_means = by_aa_variant[mean_columns].mean().reset_index()

    combined = aa_means.merge(codon_spread, on=group_keys, how='left')

    return calculate_codon_and_replicate_variance(combined, rep_score_columns)
152
+
153
+
154
def _process_aa_only_scores(scores_df_drop_nan: pd.DataFrame, rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Build the AA score table when no aggregation across rows is needed.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        DataFrame with variant scores (NaN filtered)
    rep_score_columns : List[str]
        List of replicate score column names

    Returns
    -------
    pd.DataFrame
        Copy of the identifier, averaged-score and replicate columns. When at
        least two replicate columns are given, ``SD_rep``, ``CV_rep``,
        ``n_measurements``, ``SEM``, ``CI_lower`` and ``CI_upper`` are appended.
    """
    kept_columns = ['aa_seq_diff', 'annotate_aa', 'avgscore', 'avgscore_rep_weighted'] + rep_score_columns
    aa_scores = scores_df_drop_nan[kept_columns].copy()

    # Replicate statistics need at least two replicate columns.
    if len(rep_score_columns) < 2:
        return aa_scores

    replicates = aa_scores[rep_score_columns]
    rep_mean = replicates.mean(axis=1)
    rep_sd = replicates.std(axis=1, ddof=1)

    # Number of replicates that actually have a value in each row.
    n_obs = replicates.notna().sum(axis=1)

    # Standard error from replicate variance only.
    std_error = rep_sd / np.sqrt(n_obs)

    # Two-sided 95% interval from the t-distribution with n-1 degrees of freedom.
    half_width = scipy_stats.t.ppf(0.975, n_obs - 1) * std_error

    derived = {
        'SD_rep': rep_sd.round().astype('Int64'),
        'CV_rep': (rep_sd / rep_mean).round(3),
        'n_measurements': n_obs.astype('Int64'),
        'SEM': std_error.round().astype('Int64'),
        'CI_lower': (rep_mean - half_width).round().astype('Int64'),
        'CI_upper': (rep_mean + half_width).round().astype('Int64'),
    }
    for column_name, values in derived.items():
        aa_scores[column_name] = values

    return aa_scores
199
+
200
+
201
def _round_score_columns(aa_scores: pd.DataFrame) -> pd.DataFrame:
    """
    Round float score columns to nullable integers for cleaner output.

    A column is treated as a score column when its name contains ``score``
    (case-insensitive). Only ``float64`` / ``float32`` columns are converted;
    the DataFrame is modified in place and also returned.

    Parameters
    ----------
    aa_scores : pd.DataFrame
        DataFrame containing score columns

    Returns
    -------
    pd.DataFrame
        The same DataFrame with float score columns rounded to ``Int64``
    """
    float_kinds = ('float64', 'float32')

    # Resolve the candidate names up front, before touching any column.
    candidates = tuple(name for name in aa_scores.columns if 'score' in name.lower())

    for name in candidates:
        if aa_scores[name].dtype not in float_kinds:
            continue
        aa_scores[name] = aa_scores[name].round().astype('Int64')

    return aa_scores
@@ -0,0 +1,191 @@
1
+ """
2
+ Sequence annotation utilities for Sort-seq variant analysis.
3
+
4
+ This module provides functions for annotating variant DataFrames with sequence differences,
5
+ translations, and other derived sequence information.
6
+
7
+ Examples
8
+ --------
9
+ >>> from sortscore.analysis.annotation import annotate_scores_dataframe
10
+ >>> annotated_df = annotate_scores_dataframe(scores_df, experiment)
11
+ """
12
+ import pandas as pd
13
+ from sortscore.utils.sequence_parsing import compare_to_reference, compare_codon_lists, translate_dna
14
+
15
+ # TODO: #37 redundant, see if we can remove
16
def annotate_scores_dataframe(
    scores_df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Add sequence annotation columns to a scores DataFrame.

    For ``'codon'`` and ``'snv'`` libraries, ``codon_diff`` and ``dna_seq_diff``
    are computed from ``variant_seq``; ``aa_seq`` and ``aa_seq_diff`` are
    computed too unless ``aa_seq_diff`` is already present. For ``'aa'``
    libraries only ``aa_seq_diff`` is computed (again unless already present).
    Any other ``mutagenesis_type`` computes no difference columns. Finally,
    stop-codon spellings in ``aa_seq_diff`` are normalised to ``*`` and variant
    categories are added via ``add_variant_categories``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame with variant sequences and scores. It is not modified.
    wt_dna_seq : str
        Wild-type DNA reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    annotated_df : pd.DataFrame
        Copy of ``scores_df`` with the annotation columns added.

    Examples
    --------
    >>> annotated_df = annotate_scores_dataframe(scores_df, wt_seq, 'codon')
    """
    df = scores_df.copy()

    # Check if aa_seq_diff already exists (from pre-annotated data)
    has_pre_annotated_aa = 'aa_seq_diff' in df.columns

    if mutagenesis_type in {'codon', 'snv'}:
        # Codon-level differences against the wild type
        df['codon_diff'] = df['variant_seq'].apply(
            lambda x: compare_codon_lists(wt_dna_seq, x)
        )
        df['codon_diff'] = df['codon_diff'].fillna('')

        # Nucleotide-level differences against the wild type
        df['dna_seq_diff'] = df['variant_seq'].apply(
            lambda x: compare_to_reference(wt_dna_seq, x)
        )
        df['dna_seq_diff'] = df['dna_seq_diff'].fillna('')

        # Add AA sequence annotations only if not pre-annotated
        if not has_pre_annotated_aa:
            wt_aa_seq = translate_dna(wt_dna_seq)
            df['aa_seq'] = df['variant_seq'].apply(translate_dna)
            df['aa_seq_diff'] = df['aa_seq'].apply(
                lambda x: compare_to_reference(wt_aa_seq, x)
            )
            df['aa_seq_diff'] = df['aa_seq_diff'].fillna('')

    elif mutagenesis_type == 'aa':
        # For AA variants, add sequence differences only if not pre-annotated
        if not has_pre_annotated_aa:
            # The reference is translated only when its length is a multiple of
            # three; otherwise it is used as given.
            wt_aa_seq = translate_dna(wt_dna_seq) if len(wt_dna_seq) % 3 == 0 else wt_dna_seq
            df['aa_seq_diff'] = df['variant_seq'].apply(
                lambda x: compare_to_reference(wt_aa_seq, x)
            )
            df['aa_seq_diff'] = df['aa_seq_diff'].fillna('')

    if 'aa_seq_diff' in df.columns:
        # A pre-annotated column skips the fillna('') applied to computed
        # columns above. Treat its missing values as "no difference" as well,
        # otherwise NaN reaches the classifier, whose `'*' in aa_diff` test
        # raises TypeError on a float.
        df['aa_seq_diff'] = df['aa_seq_diff'].fillna('')

        # Map stop codon representations to * for standard notation
        df['aa_seq_diff'] = df['aa_seq_diff'].str.replace('X', '*', regex=False)
        df['aa_seq_diff'] = df['aa_seq_diff'].str.replace('Ter', '*', regex=False)

    # Add functional annotations
    df = add_variant_categories(df)

    return df
88
+
89
+ # TODO: #37 isn't this redundant with similar functions
90
def add_sequence_differences(
    df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with sequence difference columns added.

    For ``'codon'`` / ``'snv'`` the columns ``dna_seq_diff``, ``aa_seq`` and
    ``aa_seq_diff`` are added; for ``'aa'`` only ``aa_seq_diff`` is added. Any
    other ``mutagenesis_type`` returns an unchanged copy.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with variant sequences in ``variant_seq``.
    wt_dna_seq : str
        Wild-type DNA sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    df : pd.DataFrame
        DataFrame with sequence difference columns added.
    """
    df = df.copy()
    variants = df['variant_seq'] if mutagenesis_type in {'codon', 'snv', 'aa'} else None

    if mutagenesis_type in {'codon', 'snv'}:
        # Nucleotide-level differences; missing results become empty strings.
        dna_changes = variants.apply(lambda seq: compare_to_reference(wt_dna_seq, seq))
        df['dna_seq_diff'] = dna_changes.fillna('')

        # Translate the reference and every variant, then compare at AA level.
        wt_protein = translate_dna(wt_dna_seq)
        df['aa_seq'] = variants.apply(translate_dna)
        aa_changes = df['aa_seq'].apply(lambda seq: compare_to_reference(wt_protein, seq))
        df['aa_seq_diff'] = aa_changes.fillna('')

    elif mutagenesis_type == 'aa':
        # Variant sequences are compared directly; the reference is translated
        # only when its length is a multiple of three.
        if len(wt_dna_seq) % 3 == 0:
            wt_protein = translate_dna(wt_dna_seq)
        else:
            wt_protein = wt_dna_seq
        aa_changes = variants.apply(lambda seq: compare_to_reference(wt_protein, seq))
        df['aa_seq_diff'] = aa_changes.fillna('')

    return df
138
+
139
def _is_blank_diff(diff) -> bool:
    """Return True when ``diff`` means "no difference": empty, None, NaN or pd.NA."""
    if diff is None or diff is pd.NA:
        return True
    # NaN is the only float that is not equal to itself.
    if isinstance(diff, float) and diff != diff:
        return True
    return not diff


def classify_aa_variant(aa_diff, dna_diff=None):
    """
    Classify a variant from its amino-acid difference string.

    Parameters
    ----------
    aa_diff : str or missing
        Amino-acid difference annotation; empty/None/NaN means no AA change.
    dna_diff : str or missing, optional
        DNA difference annotation. ``None`` means "not available".

    Returns
    -------
    str
        ``'wt_dna'`` when there is no AA change and ``dna_diff`` was supplied
        but is blank; ``'synonymous'`` when there is no AA change otherwise;
        ``'nonsense'`` when ``aa_diff`` contains ``*``; else ``'missense_aa'``.
    """
    if _is_blank_diff(aa_diff):
        # True WT needs a supplied-but-blank DNA diff; an absent (None) DNA
        # diff cannot prove the DNA is unchanged, so it stays 'synonymous'.
        if dna_diff is not None and _is_blank_diff(dna_diff):
            return 'wt_dna'
        return 'synonymous'
    if '*' in aa_diff:
        return 'nonsense'
    return 'missense_aa'


def classify_dna_variant(dna_diff, aa_diff):
    """
    Classify a variant from its DNA and amino-acid difference strings.

    Parameters
    ----------
    dna_diff : str or missing
        DNA difference annotation; empty/None/NaN means no DNA change.
    aa_diff : str or missing
        Amino-acid difference annotation; empty/None/NaN means no AA change.

    Returns
    -------
    str
        ``'wt_dna'`` when there is no DNA change, ``'synonymous'`` when the
        DNA changed but the AA sequence did not, else ``'missense_dna'``.
    """
    if _is_blank_diff(dna_diff):
        return 'wt_dna'
    if _is_blank_diff(aa_diff):
        return 'synonymous'
    return 'missense_dna'
158
+
159
+
160
def add_variant_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add variant category annotations based on existing sequence difference columns.

    ``annotate_dna`` is added when ``dna_seq_diff`` is present and
    ``annotate_aa`` when ``aa_seq_diff`` is present. When both difference
    columns exist, each classifier receives both values for every row.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with sequence difference columns (aa_seq_diff, dna_seq_diff).
        It is not modified.

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with variant category columns added.
    """
    df = df.copy()

    has_dna_diff = 'dna_seq_diff' in df.columns
    has_aa_diff = 'aa_seq_diff' in df.columns

    # Rows are paired positionally with zip instead of DataFrame.apply(axis=1):
    # on an empty frame the row-wise apply returns a DataFrame rather than a
    # Series, which cannot be assigned to a single column.

    # Classify DNA variants
    if has_dna_diff:
        if has_aa_diff:
            df['annotate_dna'] = [
                classify_dna_variant(dna_diff, aa_diff)
                for dna_diff, aa_diff in zip(df['dna_seq_diff'], df['aa_seq_diff'])
            ]
        else:
            df['annotate_dna'] = df['dna_seq_diff'].apply(lambda x: 'missense_dna' if x else 'wt_dna')

    # Classify variants based on AA changes
    if has_aa_diff:
        if has_dna_diff:
            df['annotate_aa'] = [
                classify_aa_variant(aa_diff, dna_diff)
                for aa_diff, dna_diff in zip(df['aa_seq_diff'], df['dna_seq_diff'])
            ]
        else:
            df['annotate_aa'] = df['aa_seq_diff'].apply(classify_aa_variant)

    return df