sortscore 0.1.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sortscore-0.1.0b2/LICENSE +21 -0
- sortscore-0.1.0b2/PKG-INFO +67 -0
- sortscore-0.1.0b2/README.md +39 -0
- sortscore-0.1.0b2/pyproject.toml +43 -0
- sortscore-0.1.0b2/setup.cfg +4 -0
- sortscore-0.1.0b2/sortscore/__init__.py +19 -0
- sortscore-0.1.0b2/sortscore/__main__.py +7 -0
- sortscore-0.1.0b2/sortscore/analysis/__init__.py +7 -0
- sortscore-0.1.0b2/sortscore/analysis/aa_scores.py +221 -0
- sortscore-0.1.0b2/sortscore/analysis/annotation.py +191 -0
- sortscore-0.1.0b2/sortscore/analysis/batch_config.py +170 -0
- sortscore-0.1.0b2/sortscore/analysis/batch_normalization.py +928 -0
- sortscore-0.1.0b2/sortscore/analysis/batch_workflow.py +76 -0
- sortscore-0.1.0b2/sortscore/analysis/filtering.py +53 -0
- sortscore-0.1.0b2/sortscore/analysis/normalize_read_depth.py +99 -0
- sortscore-0.1.0b2/sortscore/analysis/score.py +243 -0
- sortscore-0.1.0b2/sortscore/analysis/statistics.py +201 -0
- sortscore-0.1.0b2/sortscore/analysis/summary_stats.py +182 -0
- sortscore-0.1.0b2/sortscore/analysis/variant_aggregation.py +94 -0
- sortscore-0.1.0b2/sortscore/analysis/workflows.py +237 -0
- sortscore-0.1.0b2/sortscore/cli.py +48 -0
- sortscore-0.1.0b2/sortscore/run_analysis.py +235 -0
- sortscore-0.1.0b2/sortscore/run_batch_analysis.py +84 -0
- sortscore-0.1.0b2/sortscore/utils/analysis_logger.py +277 -0
- sortscore-0.1.0b2/sortscore/utils/console_utils.py +208 -0
- sortscore-0.1.0b2/sortscore/utils/experiment_setup.py +325 -0
- sortscore-0.1.0b2/sortscore/utils/file_utils.py +158 -0
- sortscore-0.1.0b2/sortscore/utils/load_experiment.py +667 -0
- sortscore-0.1.0b2/sortscore/utils/sequence_parsing.py +316 -0
- sortscore-0.1.0b2/sortscore/utils/tile_configs.py +115 -0
- sortscore-0.1.0b2/sortscore/utils/variant_detection.py +329 -0
- sortscore-0.1.0b2/sortscore/utils/variant_parsing.py +68 -0
- sortscore-0.1.0b2/sortscore/visualization/__init__.py +5 -0
- sortscore-0.1.0b2/sortscore/visualization/correlations.py +358 -0
- sortscore-0.1.0b2/sortscore/visualization/heatmap_matrix.py +180 -0
- sortscore-0.1.0b2/sortscore/visualization/heatmap_workflow.py +218 -0
- sortscore-0.1.0b2/sortscore/visualization/heatmaps.py +737 -0
- sortscore-0.1.0b2/sortscore/visualization/plots.py +358 -0
- sortscore-0.1.0b2/sortscore.egg-info/PKG-INFO +67 -0
- sortscore-0.1.0b2/sortscore.egg-info/SOURCES.txt +42 -0
- sortscore-0.1.0b2/sortscore.egg-info/dependency_links.txt +1 -0
- sortscore-0.1.0b2/sortscore.egg-info/entry_points.txt +2 -0
- sortscore-0.1.0b2/sortscore.egg-info/requires.txt +7 -0
- sortscore-0.1.0b2/sortscore.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 dbaldridge-lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sortscore
|
|
3
|
+
Version: 0.1.0b2
|
|
4
|
+
Summary: A modular Python package for Sort-seq variant analysis
|
|
5
|
+
Author-email: Caitlyn Chitwood <c.chitwood@wustl.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dbaldridge-lab/sortscore
|
|
8
|
+
Keywords: bioinformatics,sequencing,variant-analysis,sort-seq
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pandas>=2.0.0
|
|
21
|
+
Requires-Dist: numpy>=1.24.0
|
|
22
|
+
Requires-Dist: scipy>=1.10.0
|
|
23
|
+
Requires-Dist: matplotlib>=3.6.0
|
|
24
|
+
Requires-Dist: seaborn>=0.12.0
|
|
25
|
+
Requires-Dist: biopython>=1.81
|
|
26
|
+
Requires-Dist: mavehgvs>=0.7.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# sortscore
|
|
30
|
+
|
|
31
|
+
`sortscore` is a Python package for Sort-seq variant analysis, including scoring, normalization, and visualization.
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/dbaldridge-lab/sortscore
|
|
37
|
+
cd sortscore
|
|
38
|
+
python -m venv .venv
|
|
39
|
+
source .venv/bin/activate
|
|
40
|
+
pip install -e .
|
|
41
|
+
```
|
|
42
|
+
Run a standard variant scoring analysis:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
|
|
46
|
+
```
|
|
47
|
+
If you did not install the CLI entry point, run:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Documentation
|
|
54
|
+
- [Installation](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/installation.md)
|
|
55
|
+
- [Usage](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/usage.md)
|
|
56
|
+
- [CLI Arguments](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/cli_arguments.md)
|
|
57
|
+
- [Visualization](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/visualization.md)
|
|
58
|
+
- [Batch Processing](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/batch_processing.md)
|
|
59
|
+
- [Troubleshooting](https://github.com/dbaldridge-lab/sortscore/blob/main/TROUBLESHOOTING.md)
|
|
60
|
+
- [Contributing](https://github.com/dbaldridge-lab/sortscore/blob/main/CONTRIBUTING.md)
|
|
61
|
+
|
|
62
|
+
## Demo
|
|
63
|
+
|
|
64
|
+
- [Single Experiment Scoring Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/single_experiment_demo.ipynb)
|
|
65
|
+
- [Batch Normalization Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/tiled_demo.ipynb)
|
|
66
|
+
- [Example Config](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/config.json)
|
|
67
|
+
- [Example Experiment Setup](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/experiment_setup.csv)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# sortscore
|
|
2
|
+
|
|
3
|
+
`sortscore` is a Python package for Sort-seq variant analysis, including scoring, normalization, and visualization.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/dbaldridge-lab/sortscore
|
|
9
|
+
cd sortscore
|
|
10
|
+
python -m venv .venv
|
|
11
|
+
source .venv/bin/activate
|
|
12
|
+
pip install -e .
|
|
13
|
+
```
|
|
14
|
+
Run a standard variant scoring analysis:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
|
|
18
|
+
```
|
|
19
|
+
If you did not install the CLI entry point, run:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python -m sortscore -n EXPERIMENT_NAME -e path/to/experiment_setup.csv -c path/to/config.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Documentation
|
|
26
|
+
- [Installation](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/installation.md)
|
|
27
|
+
- [Usage](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/usage.md)
|
|
28
|
+
- [CLI Arguments](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/cli_arguments.md)
|
|
29
|
+
- [Visualization](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/visualization.md)
|
|
30
|
+
- [Batch Processing](https://github.com/dbaldridge-lab/sortscore/blob/main/docs/batch_processing.md)
|
|
31
|
+
- [Troubleshooting](https://github.com/dbaldridge-lab/sortscore/blob/main/TROUBLESHOOTING.md)
|
|
32
|
+
- [Contributing](https://github.com/dbaldridge-lab/sortscore/blob/main/CONTRIBUTING.md)
|
|
33
|
+
|
|
34
|
+
## Demo
|
|
35
|
+
|
|
36
|
+
- [Single Experiment Scoring Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/single_experiment_demo.ipynb)
|
|
37
|
+
- [Batch Normalization Notebook Demo](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/tiled_demo.ipynb)
|
|
38
|
+
- [Example Config](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/config.json)
|
|
39
|
+
- [Example Experiment Setup](https://github.com/dbaldridge-lab/sortscore/blob/main/demo_data/GLI2_oPool5b/experiment_setup.csv)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sortscore"
|
|
7
|
+
version = "0.1.0b2"
|
|
8
|
+
description = "A modular Python package for Sort-seq variant analysis"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Caitlyn Chitwood", email = "c.chitwood@wustl.edu" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["bioinformatics", "sequencing", "variant-analysis", "sort-seq"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12"
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pandas>=2.0.0",
|
|
28
|
+
"numpy>=1.24.0",
|
|
29
|
+
"scipy>=1.10.0",
|
|
30
|
+
"matplotlib>=3.6.0",
|
|
31
|
+
"seaborn>=0.12.0",
|
|
32
|
+
"biopython>=1.81",
|
|
33
|
+
"mavehgvs>=0.7.0"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/dbaldridge-lab/sortscore"
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
sortscore = "sortscore.cli:main"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
include = ["sortscore", "sortscore.*"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sortscore: A modular Python package for Sort-seq variant analysis.
|
|
3
|
+
|
|
4
|
+
This package provides tools for analyzing Sort-seq experimental data,
|
|
5
|
+
calculating activity scores, and generating visualizations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0b2"
|
|
9
|
+
__author__ = "Caitlyn Chitwood"
|
|
10
|
+
__email__ = "c.chitwood@wustl.edu"
|
|
11
|
+
|
|
12
|
+
# Import main classes for convenience
|
|
13
|
+
from .utils.load_experiment import ExperimentConfig
|
|
14
|
+
from .analysis.score import calculate_full_activity_scores
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"ExperimentConfig",
|
|
18
|
+
"calculate_full_activity_scores"
|
|
19
|
+
]
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Amino acid scores processing and export for Sort-seq analysis.
|
|
3
|
+
|
|
4
|
+
This module provides functions for processing amino acid scores from DNA variant data,
|
|
5
|
+
including aggregation of synonymous codons, statistical analysis, and file export.
|
|
6
|
+
"""
|
|
7
|
+
import os
|
|
8
|
+
import logging
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
from scipy import stats as scipy_stats
|
|
12
|
+
from typing import Tuple, List
|
|
13
|
+
from sortscore.analysis.statistics import calculate_codon_and_replicate_variance
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def process_and_save_aa_scores(scores_df: pd.DataFrame, experiment, scores_dir: str,
                               output_suffix: str, analysis_logger) -> None:
    """
    Build the amino-acid score table for an experiment and write it to CSV.

    Nothing is done (and nothing is written) when ``scores_df`` has no
    ``aa_seq_diff`` column. Otherwise rows lacking a value in the active
    score column are dropped, the remaining rows are summarised per AA
    variant, score columns are rounded to integers, and the result is saved
    as ``<experiment_name>_aa_scores_<output_suffix>.csv`` inside
    ``scores_dir``. The written file is reported both through ``logging``
    and through ``analysis_logger.log_output_file``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        Variant-level scores and annotations.
    experiment : ExperimentConfig
        Supplies ``avg_method`` (selects the score column) and
        ``experiment_name`` (used in the output filename).
    scores_dir : str
        Directory the CSV is written into.
    output_suffix : str
        Suffix appended to the output filename.
    analysis_logger : AnalysisLogger
        Object whose ``log_output_file`` method records the output.

    Examples
    --------
    >>> process_and_save_aa_scores(scores_df, experiment, 'output/scores', 'suffix', logger)
    """
    if 'aa_seq_diff' not in scores_df.columns:
        return

    # 'simple-avg' maps to the plain 'avgscore' column; any other method maps
    # to 'avgscore_<method>' with dashes turned into underscores.
    method = experiment.avg_method
    score_col = 'avgscore' if method == 'simple-avg' else f"avgscore_{method.replace('-', '_')}"

    # Only variants that actually have a score take part in the summary.
    scored = scores_df.dropna(subset=[score_col])

    # Per-replicate score columns are named 'Rep....score'.
    rep_score_columns = []
    for name in scored.columns:
        if name.startswith('Rep') and name.endswith('.score'):
            rep_score_columns.append(name)

    aa_scores = _round_score_columns(_check_codon_num(scored, score_col, rep_score_columns))

    file_name = f"{experiment.experiment_name}_aa_scores_{output_suffix}.csv"
    aa_scores_file = os.path.join(scores_dir, file_name)
    aa_scores.to_csv(aa_scores_file, index=False)

    n_variants = len(aa_scores)
    logging.info(f"Saved AA scores to {aa_scores_file} ({n_variants} unique AA variants)")

    analysis_logger.log_output_file(
        'aa_scores',
        file_name,
        aa_scores_file,
        variant_count=n_variants,
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _check_codon_num(scores_df_drop_nan: pd.DataFrame, score_col: str,
                     rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Pick the AA-score summarisation path based on rows per AA variant.

    Rows are grouped by ``('aa_seq_diff', 'annotate_aa')``. If any group
    holds more than one row, the codon-aggregating path is used; otherwise
    each row already is one AA variant and only replicate statistics are
    computed.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Variant scores with unscored rows already removed.
    score_col : str
        Score column forwarded to the aggregating path.
    rep_score_columns : List[str]
        Names of the per-replicate score columns.

    Returns
    -------
    pd.DataFrame
        Output of whichever summarisation helper was selected.
    """
    rows_per_variant = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa']).size()

    if (rows_per_variant > 1).any():
        # At least one AA variant is represented by several rows.
        return _process_dna_to_aa_aggregation(scores_df_drop_nan, score_col, rep_score_columns)

    return _process_aa_only_scores(scores_df_drop_nan, rep_score_columns)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _process_dna_to_aa_aggregation(scores_df_drop_nan: pd.DataFrame, score_col: str,
                                   rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Collapse rows sharing an AA variant into one row of mean scores.

    Rows are grouped by ``('aa_seq_diff', 'annotate_aa')``. For each group
    the mean of ``avgscore``, ``avgscore_rep_weighted`` and every replicate
    score column is taken, and the standard deviation and count of
    ``score_col`` within the group are attached as ``SD_codon`` and
    ``n_codons``. The merged table is then passed to
    ``calculate_codon_and_replicate_variance``.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Variant scores with unscored rows already removed.
    score_col : str
        Column whose within-group spread becomes ``SD_codon`` / ``n_codons``.
    rep_score_columns : List[str]
        Names of the per-replicate score columns.

    Returns
    -------
    pd.DataFrame
        Result of ``calculate_codon_and_replicate_variance`` on the
        aggregated table.
    """
    group_keys = ['aa_seq_diff', 'annotate_aa']
    mean_columns = ['avgscore', 'avgscore_rep_weighted'] + rep_score_columns

    # Spread and number of rows behind each AA variant, taken before averaging.
    codon_stats = scores_df_drop_nan.groupby(group_keys)[score_col].agg(['std', 'count']).reset_index()
    codon_stats.columns = group_keys + ['SD_codon', 'n_codons']

    # One row of mean scores per AA variant.
    per_variant_means = scores_df_drop_nan.groupby(group_keys)[mean_columns].mean().reset_index()

    merged = per_variant_means.merge(codon_stats, on=group_keys, how='left')

    return calculate_codon_and_replicate_variance(merged, rep_score_columns)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _process_aa_only_scores(scores_df_drop_nan: pd.DataFrame, rep_score_columns: List[str]) -> pd.DataFrame:
|
|
155
|
+
"""
|
|
156
|
+
Process AA-only case with simple replicate statistics.
|
|
157
|
+
|
|
158
|
+
Parameters
|
|
159
|
+
----------
|
|
160
|
+
scores_df_drop_nan : pd.DataFrame
|
|
161
|
+
DataFrame with variant scores (NaN filtered)
|
|
162
|
+
rep_score_columns : List[str]
|
|
163
|
+
List of replicate score column names
|
|
164
|
+
|
|
165
|
+
Returns
|
|
166
|
+
-------
|
|
167
|
+
pd.DataFrame
|
|
168
|
+
AA scores with replicate statistics only
|
|
169
|
+
"""
|
|
170
|
+
# AA-only case: no aggregation needed, just copy the data
|
|
171
|
+
columns_to_include = ['aa_seq_diff', 'annotate_aa', 'avgscore', 'avgscore_rep_weighted'] + rep_score_columns
|
|
172
|
+
|
|
173
|
+
aa_scores = scores_df_drop_nan[columns_to_include].copy()
|
|
174
|
+
|
|
175
|
+
# Calculate simple replicate statistics (no codon variance)
|
|
176
|
+
if len(rep_score_columns) >= 2:
|
|
177
|
+
aa_rep_mean = aa_scores[rep_score_columns].mean(axis=1)
|
|
178
|
+
aa_rep_std = aa_scores[rep_score_columns].std(axis=1, ddof=1)
|
|
179
|
+
|
|
180
|
+
# Calculate n_measurements (just number of non-empty replicates)
|
|
181
|
+
n_measurements = aa_scores[rep_score_columns].notna().sum(axis=1)
|
|
182
|
+
|
|
183
|
+
# Calculate SEM using only replicate variance
|
|
184
|
+
sem = aa_rep_std / np.sqrt(n_measurements)
|
|
185
|
+
|
|
186
|
+
# Calculate 95% CI using t-distribution
|
|
187
|
+
df_actual = n_measurements - 1
|
|
188
|
+
t_critical = scipy_stats.t.ppf(0.975, df_actual)
|
|
189
|
+
aa_margin_of_error = t_critical * sem
|
|
190
|
+
|
|
191
|
+
aa_scores['SD_rep'] = aa_rep_std.round().astype('Int64')
|
|
192
|
+
aa_scores['CV_rep'] = (aa_rep_std / aa_rep_mean).round(3)
|
|
193
|
+
aa_scores['n_measurements'] = n_measurements.astype('Int64')
|
|
194
|
+
aa_scores['SEM'] = sem.round().astype('Int64')
|
|
195
|
+
aa_scores['CI_lower'] = (aa_rep_mean - aa_margin_of_error).round().astype('Int64')
|
|
196
|
+
aa_scores['CI_upper'] = (aa_rep_mean + aa_margin_of_error).round().astype('Int64')
|
|
197
|
+
|
|
198
|
+
return aa_scores
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _round_score_columns(aa_scores: pd.DataFrame) -> pd.DataFrame:
|
|
202
|
+
"""
|
|
203
|
+
Round score columns to integers for cleaner output.
|
|
204
|
+
|
|
205
|
+
Parameters
|
|
206
|
+
----------
|
|
207
|
+
aa_scores : pd.DataFrame
|
|
208
|
+
DataFrame containing score columns
|
|
209
|
+
|
|
210
|
+
Returns
|
|
211
|
+
-------
|
|
212
|
+
pd.DataFrame
|
|
213
|
+
DataFrame with score columns rounded to integers
|
|
214
|
+
"""
|
|
215
|
+
# Round score columns to integers
|
|
216
|
+
score_columns = [col for col in aa_scores.columns if 'score' in col.lower()]
|
|
217
|
+
for col in score_columns:
|
|
218
|
+
if aa_scores[col].dtype in ['float64', 'float32']:
|
|
219
|
+
aa_scores[col] = aa_scores[col].round().astype('Int64')
|
|
220
|
+
|
|
221
|
+
return aa_scores
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sequence annotation utilities for Sort-seq variant analysis.
|
|
3
|
+
|
|
4
|
+
This module provides functions for annotating variant DataFrames with sequence differences,
|
|
5
|
+
translations, and other derived sequence information.
|
|
6
|
+
|
|
7
|
+
Examples
|
|
8
|
+
--------
|
|
9
|
+
>>> from sortscore.analysis.annotation import annotate_scores_dataframe
|
|
10
|
+
>>> annotated_df = annotate_scores_dataframe(scores_df, wt_dna_seq, 'codon')
|
|
11
|
+
"""
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from sortscore.utils.sequence_parsing import compare_to_reference, compare_codon_lists, translate_dna
|
|
14
|
+
|
|
15
|
+
# TODO: #37 redundant, see if we can remove
|
|
16
|
+
def annotate_scores_dataframe(
    scores_df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``scores_df`` with sequence annotation columns added.

    For ``'codon'`` and ``'snv'`` the ``variant_seq`` column is compared to
    the wild-type DNA to produce ``codon_diff`` and ``dna_seq_diff``; if the
    input has no ``aa_seq_diff`` column, ``aa_seq`` and ``aa_seq_diff`` are
    derived by translation as well. For ``'aa'`` only ``aa_seq_diff`` is
    added, and only when it is not already present. Any other value adds no
    sequence-difference columns. An existing ``aa_seq_diff`` column is never
    recomputed.

    Afterwards ``X`` and ``Ter`` inside ``aa_seq_diff`` are rewritten to
    ``*`` and the variant category columns are added via
    ``add_variant_categories``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame with variant sequences and scores.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    annotated_df : pd.DataFrame
        Copy of the input with annotation columns added.

    Examples
    --------
    >>> annotated_df = annotate_scores_dataframe(scores_df, wt_seq, 'codon')
    """
    df = scores_df.copy()

    # Pre-annotated inputs already carry aa_seq_diff; leave it as supplied.
    needs_aa_annotation = 'aa_seq_diff' not in df.columns

    if mutagenesis_type in {'codon', 'snv'}:
        # Codon-level differences against the wild type.
        df['codon_diff'] = df['variant_seq'].apply(
            lambda seq: compare_codon_lists(wt_dna_seq, seq)
        ).fillna('')

        # Nucleotide-level differences against the wild type.
        df['dna_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_dna_seq, seq)
        ).fillna('')

        if needs_aa_annotation:
            wt_aa_seq = translate_dna(wt_dna_seq)
            df['aa_seq'] = df['variant_seq'].apply(translate_dna)
            df['aa_seq_diff'] = df['aa_seq'].apply(
                lambda seq: compare_to_reference(wt_aa_seq, seq)
            ).fillna('')

    elif mutagenesis_type == 'aa' and needs_aa_annotation:
        # The reference is translated only when its length is a multiple of
        # three; otherwise it is used unchanged.
        if len(wt_dna_seq) % 3 == 0:
            wt_aa_seq = translate_dna(wt_dna_seq)
        else:
            wt_aa_seq = wt_dna_seq
        df['aa_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    # Normalise stop representations ('X', 'Ter') to '*'.
    if 'aa_seq_diff' in df.columns:
        df['aa_seq_diff'] = (
            df['aa_seq_diff']
            .str.replace('X', '*', regex=False)
            .str.replace('Ter', '*', regex=False)
        )

    # Add functional annotations.
    return add_variant_categories(df)
|
|
88
|
+
|
|
89
|
+
# TODO: #37 isn't this redundant with similar functions
|
|
90
|
+
def add_sequence_differences(
    df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with sequence-difference columns added.

    For ``'codon'`` and ``'snv'`` this adds ``dna_seq_diff``, ``aa_seq``
    and ``aa_seq_diff``, all derived from ``variant_seq``. For ``'aa'`` it
    adds ``aa_seq_diff`` only. Any other value returns an unchanged copy.
    Missing difference values are stored as empty strings.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with variant sequences.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with sequence difference columns added.
    """
    df = df.copy()

    if mutagenesis_type in {'codon', 'snv'}:
        # Nucleotide-level differences against the wild type.
        df['dna_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_dna_seq, seq)
        ).fillna('')

        # Translate both sides, then compare at the amino-acid level.
        wt_aa_seq = translate_dna(wt_dna_seq)
        df['aa_seq'] = df['variant_seq'].apply(translate_dna)
        df['aa_seq_diff'] = df['aa_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    elif mutagenesis_type == 'aa':
        # The reference is translated only when its length is a multiple of
        # three; otherwise it is used unchanged.
        if len(wt_dna_seq) % 3 == 0:
            wt_aa_seq = translate_dna(wt_dna_seq)
        else:
            wt_aa_seq = wt_dna_seq
        df['aa_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    return df
|
|
138
|
+
|
|
139
|
+
def classify_aa_variant(aa_diff, dna_diff=None):
    """
    Classify a variant from its amino-acid difference string.

    Parameters
    ----------
    aa_diff : str or missing
        Amino-acid difference annotation. Empty string, ``None``, NaN and
        ``pd.NA`` all mean "no amino-acid change".
    dna_diff : str or missing, optional
        DNA difference annotation. ``None`` (the default) means no DNA
        information was supplied; an empty string, NaN or ``pd.NA`` means
        the DNA is known to be unchanged.

    Returns
    -------
    str
        ``'wt_dna'`` when there is no AA change and the DNA is known to be
        unchanged, ``'synonymous'`` when there is no AA change otherwise,
        ``'nonsense'`` when the AA difference contains ``*``, and
        ``'missense_aa'`` for any other AA difference.
    """
    def _is_blank(value):
        # NaN / pd.NA show up when a diff column is loaded from file with
        # empty cells; they are truthy (or un-boolable) and not iterable, so
        # they must be recognised explicitly before the checks below.
        if value is None or value is pd.NA:
            return True
        if isinstance(value, float) and value != value:
            return True
        return not value

    if _is_blank(aa_diff):
        # No AA change: true wild type only if DNA info was supplied and is
        # itself blank; otherwise the change is (or is assumed) synonymous.
        if dna_diff is not None and _is_blank(dna_diff):
            return 'wt_dna'
        return 'synonymous'

    if '*' in aa_diff:
        return 'nonsense'

    return 'missense_aa'
|
|
150
|
+
|
|
151
|
+
def classify_dna_variant(dna_diff, aa_diff):
    """
    Classify a variant from its DNA and amino-acid difference strings.

    Parameters
    ----------
    dna_diff : str or missing
        DNA difference annotation. Empty string, ``None``, NaN and
        ``pd.NA`` all mean "no DNA change".
    aa_diff : str or missing
        Amino-acid difference annotation, with the same convention for
        missing values.

    Returns
    -------
    str
        ``'wt_dna'`` when there is no DNA change, ``'synonymous'`` when the
        DNA changed but the amino acids did not, and ``'missense_dna'``
        otherwise.
    """
    def _is_blank(value):
        # NaN / pd.NA are truthy (or un-boolable), so without this check a
        # row with missing diffs would fall through to 'missense_dna'.
        if value is None or value is pd.NA:
            return True
        if isinstance(value, float) and value != value:
            return True
        return not value

    if _is_blank(dna_diff):
        return 'wt_dna'
    if _is_blank(aa_diff):
        return 'synonymous'
    return 'missense_dna'
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def add_variant_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with variant category columns added.

    ``annotate_dna`` is added when a ``dna_seq_diff`` column exists and
    ``annotate_aa`` when an ``aa_seq_diff`` column exists. When both
    difference columns are present each classification uses both of them;
    when only one is present the classification uses that column alone.
    If neither column exists the copy is returned unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with sequence difference columns (aa_seq_diff, dna_seq_diff).

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with variant category columns added.
    """
    df = df.copy()

    has_dna = 'dna_seq_diff' in df.columns
    has_aa = 'aa_seq_diff' in df.columns

    def _dna_category(row):
        return classify_dna_variant(row['dna_seq_diff'], row['aa_seq_diff'])

    def _aa_category(row):
        return classify_aa_variant(row['aa_seq_diff'], row['dna_seq_diff'])

    if has_dna and has_aa:
        # Both views available: classify each using the other for context.
        df['annotate_dna'] = df.apply(lambda row: _dna_category(row), axis=1)
        df['annotate_aa'] = df.apply(lambda row: _aa_category(row), axis=1)
    elif has_dna:
        # DNA only: any non-empty difference counts as a change.
        df['annotate_dna'] = df['dna_seq_diff'].apply(lambda x: 'missense_dna' if x else 'wt_dna')
    elif has_aa:
        # AA only: classify from the amino-acid difference alone.
        df['annotate_aa'] = df['aa_seq_diff'].apply(classify_aa_variant)

    return df
|