sortscore 0.1.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sortscore/__init__.py +19 -0
- sortscore/__main__.py +7 -0
- sortscore/analysis/__init__.py +7 -0
- sortscore/analysis/aa_scores.py +221 -0
- sortscore/analysis/annotation.py +191 -0
- sortscore/analysis/batch_config.py +170 -0
- sortscore/analysis/batch_normalization.py +928 -0
- sortscore/analysis/batch_workflow.py +76 -0
- sortscore/analysis/filtering.py +53 -0
- sortscore/analysis/normalize_read_depth.py +99 -0
- sortscore/analysis/score.py +243 -0
- sortscore/analysis/statistics.py +201 -0
- sortscore/analysis/summary_stats.py +182 -0
- sortscore/analysis/variant_aggregation.py +94 -0
- sortscore/analysis/workflows.py +237 -0
- sortscore/cli.py +48 -0
- sortscore/run_analysis.py +235 -0
- sortscore/run_batch_analysis.py +84 -0
- sortscore/utils/analysis_logger.py +277 -0
- sortscore/utils/console_utils.py +208 -0
- sortscore/utils/experiment_setup.py +325 -0
- sortscore/utils/file_utils.py +158 -0
- sortscore/utils/load_experiment.py +667 -0
- sortscore/utils/sequence_parsing.py +316 -0
- sortscore/utils/tile_configs.py +115 -0
- sortscore/utils/variant_detection.py +329 -0
- sortscore/utils/variant_parsing.py +68 -0
- sortscore/visualization/__init__.py +5 -0
- sortscore/visualization/correlations.py +358 -0
- sortscore/visualization/heatmap_matrix.py +180 -0
- sortscore/visualization/heatmap_workflow.py +218 -0
- sortscore/visualization/heatmaps.py +737 -0
- sortscore/visualization/plots.py +358 -0
- sortscore-0.1.0b2.dist-info/METADATA +67 -0
- sortscore-0.1.0b2.dist-info/RECORD +39 -0
- sortscore-0.1.0b2.dist-info/WHEEL +5 -0
- sortscore-0.1.0b2.dist-info/entry_points.txt +2 -0
- sortscore-0.1.0b2.dist-info/licenses/LICENSE +21 -0
- sortscore-0.1.0b2.dist-info/top_level.txt +1 -0
sortscore/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
sortscore: A modular Python package for Sort-seq variant analysis.
|
|
3
|
+
|
|
4
|
+
This package provides tools for analyzing Sort-seq experimental data,
|
|
5
|
+
calculating activity scores, and generating visualizations.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0b2"
|
|
9
|
+
__author__ = "Caitlyn Chitwood"
|
|
10
|
+
__email__ = "c.chitwood@wustl.edu"
|
|
11
|
+
|
|
12
|
+
# Import main classes for convenience
|
|
13
|
+
from .utils.load_experiment import ExperimentConfig
|
|
14
|
+
from .analysis.score import calculate_full_activity_scores
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"ExperimentConfig",
|
|
18
|
+
"calculate_full_activity_scores"
|
|
19
|
+
]
|
sortscore/__main__.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Amino acid scores processing and export for Sort-seq analysis.
|
|
3
|
+
|
|
4
|
+
This module provides functions for processing amino acid scores from DNA variant data,
|
|
5
|
+
including aggregation of synonymous codons, statistical analysis, and file export.
|
|
6
|
+
"""
|
|
7
|
+
import os
|
|
8
|
+
import logging
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
from scipy import stats as scipy_stats
|
|
12
|
+
from typing import Tuple, List
|
|
13
|
+
from sortscore.analysis.statistics import calculate_codon_and_replicate_variance
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def process_and_save_aa_scores(scores_df: pd.DataFrame, experiment, scores_dir: str,
                               output_suffix: str, analysis_logger) -> None:
    """
    Build the amino-acid score table and write it to a CSV file.

    The function does nothing when ``scores_df`` has no ``aa_seq_diff``
    column. Otherwise it:

    - picks the score column from ``experiment.avg_method``
    - drops rows whose score in that column is NaN
    - collects the replicate columns (``Rep*`` ... ``.score``)
    - hands the data to ``_check_codon_num`` to build the AA-level table
    - rounds the score columns with ``_round_score_columns``
    - writes the CSV and records it with ``analysis_logger``

    Parameters
    ----------
    scores_df : pd.DataFrame
        Variant scores and annotations.
    experiment : ExperimentConfig
        Object providing ``avg_method`` and ``experiment_name``.
    scores_dir : str
        Directory the CSV file is written to.
    output_suffix : str
        Suffix appended to the output file name.
    analysis_logger : AnalysisLogger
        Object whose ``log_output_file`` method records the written file.

    Examples
    --------
    >>> process_and_save_aa_scores(scores_df, experiment, 'output/scores', 'suffix', logger)
    """
    if 'aa_seq_diff' not in scores_df.columns:
        return

    # 'simple-avg' maps to the plain column; any other method name is turned
    # into a column suffix with dashes replaced by underscores.
    avg_method = experiment.avg_method
    if avg_method == 'simple-avg':
        score_col = 'avgscore'
    else:
        score_col = 'avgscore_' + avg_method.replace('-', '_')

    scored_rows = scores_df.dropna(subset=[score_col])
    replicate_cols = [
        name for name in scored_rows.columns
        if name.startswith('Rep') and name.endswith('.score')
    ]

    aa_table = _round_score_columns(
        _check_codon_num(scored_rows, score_col, replicate_cols)
    )

    csv_name = f"{experiment.experiment_name}_aa_scores_{output_suffix}.csv"
    csv_path = os.path.join(scores_dir, csv_name)
    aa_table.to_csv(csv_path, index=False)

    n_variants = len(aa_table)
    logging.info(f"Saved AA scores to {csv_path} ({n_variants} unique AA variants)")

    analysis_logger.log_output_file(
        'aa_scores',
        csv_name,
        csv_path,
        variant_count=n_variants
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _check_codon_num(scores_df_drop_nan: pd.DataFrame, score_col: str,
                     rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Choose the AA-score processing path based on rows per AA variant.

    Rows are grouped by (``aa_seq_diff``, ``annotate_aa``). If any group holds
    more than one row the data goes through
    ``_process_dna_to_aa_aggregation``; otherwise it goes through
    ``_process_aa_only_scores``.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Variant scores with NaN scores already removed.
    score_col : str
        Name of the score column (used only by the aggregation path).
    rep_score_columns : List[str]
        Names of the replicate score columns.

    Returns
    -------
    pd.DataFrame
        Result of whichever processing path was selected.
    """
    rows_per_variant = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa']).size()

    if not (rows_per_variant > 1).any():
        # Every AA variant appears once: nothing to aggregate.
        return _process_aa_only_scores(scores_df_drop_nan, rep_score_columns)

    return _process_dna_to_aa_aggregation(scores_df_drop_nan, score_col, rep_score_columns)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _process_dna_to_aa_aggregation(scores_df_drop_nan: pd.DataFrame, score_col: str,
                                   rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Collapse rows sharing an AA variant into one row of mean scores.

    Rows are grouped by (``aa_seq_diff``, ``annotate_aa``). For each group the
    mean of ``avgscore``, ``avgscore_rep_weighted`` and every replicate column
    is taken, and the standard deviation and count of ``score_col`` are added
    as ``SD_codon`` and ``n_codons``. The merged table is then passed to
    ``calculate_codon_and_replicate_variance``.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Variant scores with NaN scores already removed.
    score_col : str
        Column whose per-group std and count are reported.
    rep_score_columns : List[str]
        Names of the replicate score columns.

    Returns
    -------
    pd.DataFrame
        Output of ``calculate_codon_and_replicate_variance`` for the
        aggregated table.
    """
    group_keys = ['aa_seq_diff', 'annotate_aa']
    mean_columns = ['avgscore', 'avgscore_rep_weighted'] + rep_score_columns

    by_variant = scores_df_drop_nan.groupby(group_keys)

    # Spread and number of rows per AA variant, measured on score_col
    codon_spread = by_variant[score_col].agg(['std', 'count']).reset_index()
    codon_spread.columns = group_keys + ['SD_codon', 'n_codons']

    # Per-variant means of the score columns
    variant_means = by_variant[mean_columns].mean().reset_index()

    combined = variant_means.merge(codon_spread, on=group_keys, how='left')

    return calculate_codon_and_replicate_variance(combined, rep_score_columns)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _process_aa_only_scores(scores_df_drop_nan: pd.DataFrame, rep_score_columns: List[str]) -> pd.DataFrame:
|
|
155
|
+
"""
|
|
156
|
+
Process AA-only case with simple replicate statistics.
|
|
157
|
+
|
|
158
|
+
Parameters
|
|
159
|
+
----------
|
|
160
|
+
scores_df_drop_nan : pd.DataFrame
|
|
161
|
+
DataFrame with variant scores (NaN filtered)
|
|
162
|
+
rep_score_columns : List[str]
|
|
163
|
+
List of replicate score column names
|
|
164
|
+
|
|
165
|
+
Returns
|
|
166
|
+
-------
|
|
167
|
+
pd.DataFrame
|
|
168
|
+
AA scores with replicate statistics only
|
|
169
|
+
"""
|
|
170
|
+
# AA-only case: no aggregation needed, just copy the data
|
|
171
|
+
columns_to_include = ['aa_seq_diff', 'annotate_aa', 'avgscore', 'avgscore_rep_weighted'] + rep_score_columns
|
|
172
|
+
|
|
173
|
+
aa_scores = scores_df_drop_nan[columns_to_include].copy()
|
|
174
|
+
|
|
175
|
+
# Calculate simple replicate statistics (no codon variance)
|
|
176
|
+
if len(rep_score_columns) >= 2:
|
|
177
|
+
aa_rep_mean = aa_scores[rep_score_columns].mean(axis=1)
|
|
178
|
+
aa_rep_std = aa_scores[rep_score_columns].std(axis=1, ddof=1)
|
|
179
|
+
|
|
180
|
+
# Calculate n_measurements (just number of non-empty replicates)
|
|
181
|
+
n_measurements = aa_scores[rep_score_columns].notna().sum(axis=1)
|
|
182
|
+
|
|
183
|
+
# Calculate SEM using only replicate variance
|
|
184
|
+
sem = aa_rep_std / np.sqrt(n_measurements)
|
|
185
|
+
|
|
186
|
+
# Calculate 95% CI using t-distribution
|
|
187
|
+
df_actual = n_measurements - 1
|
|
188
|
+
t_critical = scipy_stats.t.ppf(0.975, df_actual)
|
|
189
|
+
aa_margin_of_error = t_critical * sem
|
|
190
|
+
|
|
191
|
+
aa_scores['SD_rep'] = aa_rep_std.round().astype('Int64')
|
|
192
|
+
aa_scores['CV_rep'] = (aa_rep_std / aa_rep_mean).round(3)
|
|
193
|
+
aa_scores['n_measurements'] = n_measurements.astype('Int64')
|
|
194
|
+
aa_scores['SEM'] = sem.round().astype('Int64')
|
|
195
|
+
aa_scores['CI_lower'] = (aa_rep_mean - aa_margin_of_error).round().astype('Int64')
|
|
196
|
+
aa_scores['CI_upper'] = (aa_rep_mean + aa_margin_of_error).round().astype('Int64')
|
|
197
|
+
|
|
198
|
+
return aa_scores
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _round_score_columns(aa_scores: pd.DataFrame) -> pd.DataFrame:
|
|
202
|
+
"""
|
|
203
|
+
Round score columns to integers for cleaner output.
|
|
204
|
+
|
|
205
|
+
Parameters
|
|
206
|
+
----------
|
|
207
|
+
aa_scores : pd.DataFrame
|
|
208
|
+
DataFrame containing score columns
|
|
209
|
+
|
|
210
|
+
Returns
|
|
211
|
+
-------
|
|
212
|
+
pd.DataFrame
|
|
213
|
+
DataFrame with score columns rounded to integers
|
|
214
|
+
"""
|
|
215
|
+
# Round score columns to integers
|
|
216
|
+
score_columns = [col for col in aa_scores.columns if 'score' in col.lower()]
|
|
217
|
+
for col in score_columns:
|
|
218
|
+
if aa_scores[col].dtype in ['float64', 'float32']:
|
|
219
|
+
aa_scores[col] = aa_scores[col].round().astype('Int64')
|
|
220
|
+
|
|
221
|
+
return aa_scores
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sequence annotation utilities for Sort-seq variant analysis.
|
|
3
|
+
|
|
4
|
+
This module provides functions for annotating variant DataFrames with sequence differences,
|
|
5
|
+
translations, and other derived sequence information.
|
|
6
|
+
|
|
7
|
+
Examples
|
|
8
|
+
--------
|
|
9
|
+
>>> from sortscore.analysis.annotation import annotate_scores_dataframe
|
|
10
|
+
>>> annotated_df = annotate_scores_dataframe(scores_df, experiment)
|
|
11
|
+
"""
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from sortscore.utils.sequence_parsing import compare_to_reference, compare_codon_lists, translate_dna
|
|
14
|
+
|
|
15
|
+
# TODO: #37 redundant, see if we can remove
|
|
16
|
+
def annotate_scores_dataframe(
    scores_df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``scores_df`` with sequence annotation columns added.

    For ``'codon'`` and ``'snv'`` the columns ``codon_diff`` and
    ``dna_seq_diff`` are computed from ``variant_seq``; ``aa_seq`` and
    ``aa_seq_diff`` are added too unless ``aa_seq_diff`` is already present.
    For ``'aa'`` only ``aa_seq_diff`` is computed, again only when it is not
    already present. Any other value adds no sequence columns. In every case
    ``X`` and ``Ter`` in ``aa_seq_diff`` are rewritten to ``*`` and the frame
    is passed through ``add_variant_categories``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        Table with a ``variant_seq`` column and scores.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    annotated_df : pd.DataFrame
        Annotated copy; the input is left untouched.

    Examples
    --------
    >>> annotated_df = annotate_scores_dataframe(scores_df, wt_seq, 'codon')
    """
    annotated = scores_df.copy()

    # A pre-existing aa_seq_diff column is trusted and never recomputed.
    aa_diff_supplied = 'aa_seq_diff' in annotated.columns

    def _diff_against(reference, sequences):
        # Per-sequence difference string; missing results become ''.
        return sequences.apply(lambda seq: compare_to_reference(reference, seq)).fillna('')

    if mutagenesis_type in {'codon', 'snv'}:
        annotated['codon_diff'] = annotated['variant_seq'].apply(
            lambda seq: compare_codon_lists(wt_dna_seq, seq)
        ).fillna('')
        annotated['dna_seq_diff'] = _diff_against(wt_dna_seq, annotated['variant_seq'])

        if not aa_diff_supplied:
            reference_aa = translate_dna(wt_dna_seq)
            annotated['aa_seq'] = annotated['variant_seq'].apply(translate_dna)
            annotated['aa_seq_diff'] = _diff_against(reference_aa, annotated['aa_seq'])

    elif mutagenesis_type == 'aa' and not aa_diff_supplied:
        # The reference is translated only when its length is a multiple of
        # three; otherwise it is used as given.
        # NOTE(review): presumably the fallback covers a reference that is
        # already an amino-acid sequence - confirm with callers.
        if len(wt_dna_seq) % 3 == 0:
            reference_aa = translate_dna(wt_dna_seq)
        else:
            reference_aa = wt_dna_seq
        annotated['aa_seq_diff'] = _diff_against(reference_aa, annotated['variant_seq'])

    # Normalise stop representations to '*'
    if 'aa_seq_diff' in annotated.columns:
        annotated['aa_seq_diff'] = (
            annotated['aa_seq_diff']
            .str.replace('X', '*', regex=False)
            .str.replace('Ter', '*', regex=False)
        )

    return add_variant_categories(annotated)
|
|
88
|
+
|
|
89
|
+
# TODO: #37 isn't this redundant with similar functions
|
|
90
|
+
def add_sequence_differences(
    df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with sequence difference columns added.

    For ``'codon'`` and ``'snv'`` the columns ``dna_seq_diff``, ``aa_seq`` and
    ``aa_seq_diff`` are computed from ``variant_seq``. For ``'aa'`` only
    ``aa_seq_diff`` is computed. Any other value returns an unchanged copy.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``variant_seq`` column.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with the difference columns added.
    """
    result = df.copy()

    def _diff_against(reference, sequences):
        # Per-sequence difference string; missing results become ''.
        return sequences.apply(lambda seq: compare_to_reference(reference, seq)).fillna('')

    if mutagenesis_type in {'codon', 'snv'}:
        result['dna_seq_diff'] = _diff_against(wt_dna_seq, result['variant_seq'])

        reference_aa = translate_dna(wt_dna_seq)
        result['aa_seq'] = result['variant_seq'].apply(translate_dna)
        result['aa_seq_diff'] = _diff_against(reference_aa, result['aa_seq'])
        return result

    if mutagenesis_type == 'aa':
        # The reference is translated only when its length is a multiple of
        # three; otherwise it is used as given.
        if len(wt_dna_seq) % 3 == 0:
            reference_aa = translate_dna(wt_dna_seq)
        else:
            reference_aa = wt_dna_seq
        result['aa_seq_diff'] = _diff_against(reference_aa, result['variant_seq'])

    return result
|
|
138
|
+
|
|
139
|
+
def classify_aa_variant(aa_diff, dna_diff=None):
    """
    Classify a variant by its amino-acid difference string.

    Parameters
    ----------
    aa_diff : str or missing
        Amino-acid difference string. Empty, ``None`` or NaN means the
        protein sequence is unchanged.
    dna_diff : str or missing, optional
        DNA difference string. ``None`` (the default) means no DNA
        information is available.

    Returns
    -------
    str
        - ``'wt_dna'``: no AA change and ``dna_diff`` is given but empty/NaN
        - ``'synonymous'``: no AA change otherwise
        - ``'nonsense'``: ``aa_diff`` contains ``*``
        - ``'missense_aa'``: any other AA change
    """
    # NaN is truthy and not iterable, so missing values (e.g. empty CSV
    # cells read back by pandas) must be detected explicitly; otherwise the
    # "'*' in aa_diff" test below raises TypeError.
    aa_unchanged = pd.isna(aa_diff) or not aa_diff

    if aa_unchanged:
        if dna_diff is None:
            # No DNA information supplied at all
            return 'synonymous'
        dna_unchanged = pd.isna(dna_diff) or not dna_diff
        return 'wt_dna' if dna_unchanged else 'synonymous'

    if '*' in aa_diff:
        return 'nonsense'

    return 'missense_aa'
|
|
150
|
+
|
|
151
|
+
def classify_dna_variant(dna_diff, aa_diff):
    """
    Classify a variant by its DNA and amino-acid difference strings.

    Parameters
    ----------
    dna_diff : str or missing
        DNA difference string. Empty, ``None`` or NaN means no DNA change.
    aa_diff : str or missing
        Amino-acid difference string. Empty, ``None`` or NaN means no AA
        change.

    Returns
    -------
    str
        - ``'wt_dna'``: no DNA change
        - ``'synonymous'``: DNA change without AA change
        - ``'missense_dna'``: DNA change with any AA change
    """
    # NaN is truthy, so a plain "not value" test would treat a missing
    # difference (e.g. an empty CSV cell read back by pandas) as a change.
    if pd.isna(dna_diff) or not dna_diff:
        return 'wt_dna'

    if pd.isna(aa_diff) or not aa_diff:
        return 'synonymous'

    return 'missense_dna'
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def add_variant_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add variant category annotations based on existing sequence difference columns.

    ``annotate_dna`` is added when ``dna_seq_diff`` is present and
    ``annotate_aa`` when ``aa_seq_diff`` is present. When both difference
    columns exist, each classifier receives both values for the row.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with sequence difference columns (aa_seq_diff, dna_seq_diff).

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with variant category columns added.
    """
    df = df.copy()

    has_dna = 'dna_seq_diff' in df.columns
    has_aa = 'aa_seq_diff' in df.columns

    def _labels(values):
        # Positional labels carried on the frame's own index. Pairs are
        # walked element-wise instead of using DataFrame.apply(axis=1),
        # which builds a Series per row and special-cases empty frames
        # (it may return a DataFrame that cannot be assigned to one column).
        return pd.Series(values, index=df.index, dtype=object)

    # Classify DNA variants
    if has_dna:
        if has_aa:
            df['annotate_dna'] = _labels([
                classify_dna_variant(dna, aa)
                for dna, aa in zip(df['dna_seq_diff'], df['aa_seq_diff'])
            ])
        else:
            # Without AA information any non-empty DNA difference is
            # labelled 'missense_dna'.
            df['annotate_dna'] = df['dna_seq_diff'].apply(lambda x: 'missense_dna' if x else 'wt_dna')

    # Classify variants based on AA changes
    if has_aa:
        if has_dna:
            df['annotate_aa'] = _labels([
                classify_aa_variant(aa, dna)
                for aa, dna in zip(df['aa_seq_diff'], df['dna_seq_diff'])
            ])
        else:
            df['annotate_aa'] = df['aa_seq_diff'].apply(classify_aa_variant)

    return df
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch configuration module for Sort-seq variant analysis.
|
|
3
|
+
|
|
4
|
+
This module provides framework for combining and normalizing multiple
|
|
5
|
+
Sort-seq experiments in batch processing workflows.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Dict, List, Optional, Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class BatchConfig:
    """
    Dataclass for batch processing configuration.

    This class manages configuration for combining and normalizing multiple
    Sort-seq experiments. ``validate_config`` accepts three normalization
    method identifiers:

    1. ``'2pole'`` - 2-pole normalization: uses synonymous and pathogenic
       variants as reference points
    2. ``'zscore_2pole'`` (default) - Z-score scaled three-step normalization:
       creates a standardized scale where synonymous variants center around 0
       with unit variance, making cross-experiment comparisons meaningful
    3. ``'zscore_center'`` - accepted by validation; its behaviour is defined
       by the normalization code, not by this class

    Attributes
    ----------
    experiments : List[Dict[str, Any]]
        One entry per experiment. ``validate_config`` requires the keys
        ``tile``, ``output_dir``, ``wt_seq``, ``min_pos`` and ``max_pos``.
    batch_normalization_method : str, optional
        Normalization method to use ('zscore_2pole', '2pole', or
        'zscore_center'), default 'zscore_2pole'
    pathogenic_control_type : str, optional
        Type of pathogenic control ('nonsense' or 'custom'), default 'nonsense'
    pathogenic_variants : Optional[List[str]], optional
        Custom pathogenic variants if using 'custom' pathogenic_control_type
    combined_output_dir : str, optional
        Directory for final combined results, default './normalized'
    """
    experiments: List[Dict[str, Any]]
    batch_normalization_method: str = 'zscore_2pole'
    pathogenic_control_type: str = 'nonsense'
    pathogenic_variants: Optional[List[str]] = None
    combined_output_dir: str = './normalized'

    @staticmethod
    def _resolve_against(base_dir: Path, raw_path: Any) -> str:
        """Return ``raw_path`` as an absolute path, resolved relative to ``base_dir``."""
        # An absolute raw_path replaces base_dir entirely when joined.
        return str((base_dir / Path(str(raw_path)).expanduser()).resolve())

    @classmethod
    def from_json(cls, json_path: str) -> 'BatchConfig':
        """
        Load batch configuration from a JSON file.

        ``output_dir`` of every experiment and ``combined_output_dir`` (when
        present in the file and not null) are resolved relative to the
        directory containing the JSON file. Optional fields missing from the
        file keep their dataclass defaults. Keys other than the dataclass
        fields are ignored.

        Parameters
        ----------
        json_path : str
            Path to the JSON batch config file

        Returns
        -------
        config : BatchConfig
            Loaded batch configuration (an instance of the class the method
            is called on)

        Raises
        ------
        ValueError
            If 'experiments' is missing, is not a list, or contains an entry
            that cannot be converted to a dict
        """
        json_path_obj = Path(json_path).expanduser().resolve()
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path_obj, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, dict) or 'experiments' not in data:
            raise ValueError("Batch configuration requires 'experiments'")

        raw_experiments = data['experiments']
        if not isinstance(raw_experiments, list):
            raise ValueError("Batch configuration 'experiments' must be a list")

        config_dir = json_path_obj.parent

        experiments = []
        for idx, entry in enumerate(raw_experiments):
            try:
                # Copy so the parsed JSON structure is not modified.
                entry_cfg = dict(entry)
            except (TypeError, ValueError) as e:
                raise ValueError(f"experiments[{idx}] must be a JSON object") from e
            if entry_cfg.get('output_dir') is not None:
                entry_cfg['output_dir'] = cls._resolve_against(config_dir, entry_cfg['output_dir'])
            experiments.append(entry_cfg)

        args: Dict[str, Any] = {'experiments': experiments}

        # Optional fields (only add if present to preserve defaults)
        optional_fields = (
            'batch_normalization_method', 'pathogenic_control_type',
            'pathogenic_variants', 'combined_output_dir',
        )
        for field_name in optional_fields:
            if field_name in data:
                args[field_name] = data[field_name]

        if args.get('combined_output_dir') is not None:
            args['combined_output_dir'] = cls._resolve_against(config_dir, args['combined_output_dir'])

        return cls(**args)

    def validate_config(self) -> None:
        """
        Validate batch configuration parameters.

        Raises
        ------
        ValueError
            If configuration parameters are invalid
        FileNotFoundError
            If an experiment's output_dir does not exist
        """
        if not self.experiments:
            raise ValueError("experiments must contain at least one entry")

        required_keys = ('tile', 'output_dir', 'wt_seq', 'min_pos', 'max_pos')
        int_errors = (TypeError, ValueError, OverflowError)

        seen_tiles = set()
        for idx, cfg in enumerate(self.experiments):
            for key in required_keys:
                if key not in cfg:
                    raise ValueError(f"experiments[{idx}] missing '{key}'")

            try:
                tile = int(cfg['tile'])
            except int_errors as e:
                raise ValueError(f"experiments[{idx}].tile must be int-like") from e
            try:
                min_pos = int(cfg['min_pos'])
                max_pos = int(cfg['max_pos'])
            except int_errors as e:
                raise ValueError(f"experiments[{idx}].min_pos/max_pos must be int-like") from e

            if min_pos >= max_pos:
                raise ValueError(f"experiments[{idx}] min_pos must be less than max_pos")
            if tile in seen_tiles:
                raise ValueError(f"Duplicate tile value in experiments: {tile}")
            seen_tiles.add(tile)

            # Relative paths are resolved against the current working
            # directory here; from_json has already made them absolute.
            out_dir = Path(str(cfg['output_dir'])).expanduser().resolve()
            if not out_dir.exists():
                raise FileNotFoundError(f"experiments[{idx}] output_dir does not exist: {out_dir}")

        # Validate normalization method
        valid_methods = ['zscore_2pole', '2pole', 'zscore_center']
        if self.batch_normalization_method not in valid_methods:
            raise ValueError(f"Invalid normalization method: {self.batch_normalization_method}. "
                             f"Must be one of: {valid_methods}")

        # Validate pathogenic control type
        valid_control_types = ['nonsense', 'custom']
        if self.pathogenic_control_type not in valid_control_types:
            raise ValueError(f"Invalid pathogenic control type: {self.pathogenic_control_type}. "
                             f"Must be one of: {valid_control_types}")

        # If using custom pathogenic controls, ensure variants are specified
        if self.pathogenic_control_type == 'custom' and not self.pathogenic_variants:
            raise ValueError("pathogenic_variants must be specified when using 'custom' pathogenic_control_type")

    def get_batch_config_dict(self) -> Dict[str, Any]:
        """
        Convert batch configuration to dictionary format for processing.

        Returns
        -------
        Dict[str, Any]
            Mapping of every field name to its current value (values are
            shared with this object, not copied)
        """
        return {
            'experiments': self.experiments,
            'batch_normalization_method': self.batch_normalization_method,
            'pathogenic_control_type': self.pathogenic_control_type,
            'pathogenic_variants': self.pathogenic_variants,
            'combined_output_dir': self.combined_output_dir
        }
|