sortscore 0.1.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. sortscore/__init__.py +19 -0
  2. sortscore/__main__.py +7 -0
  3. sortscore/analysis/__init__.py +7 -0
  4. sortscore/analysis/aa_scores.py +221 -0
  5. sortscore/analysis/annotation.py +191 -0
  6. sortscore/analysis/batch_config.py +170 -0
  7. sortscore/analysis/batch_normalization.py +928 -0
  8. sortscore/analysis/batch_workflow.py +76 -0
  9. sortscore/analysis/filtering.py +53 -0
  10. sortscore/analysis/normalize_read_depth.py +99 -0
  11. sortscore/analysis/score.py +243 -0
  12. sortscore/analysis/statistics.py +201 -0
  13. sortscore/analysis/summary_stats.py +182 -0
  14. sortscore/analysis/variant_aggregation.py +94 -0
  15. sortscore/analysis/workflows.py +237 -0
  16. sortscore/cli.py +48 -0
  17. sortscore/run_analysis.py +235 -0
  18. sortscore/run_batch_analysis.py +84 -0
  19. sortscore/utils/analysis_logger.py +277 -0
  20. sortscore/utils/console_utils.py +208 -0
  21. sortscore/utils/experiment_setup.py +325 -0
  22. sortscore/utils/file_utils.py +158 -0
  23. sortscore/utils/load_experiment.py +667 -0
  24. sortscore/utils/sequence_parsing.py +316 -0
  25. sortscore/utils/tile_configs.py +115 -0
  26. sortscore/utils/variant_detection.py +329 -0
  27. sortscore/utils/variant_parsing.py +68 -0
  28. sortscore/visualization/__init__.py +5 -0
  29. sortscore/visualization/correlations.py +358 -0
  30. sortscore/visualization/heatmap_matrix.py +180 -0
  31. sortscore/visualization/heatmap_workflow.py +218 -0
  32. sortscore/visualization/heatmaps.py +737 -0
  33. sortscore/visualization/plots.py +358 -0
  34. sortscore-0.1.0b2.dist-info/METADATA +67 -0
  35. sortscore-0.1.0b2.dist-info/RECORD +39 -0
  36. sortscore-0.1.0b2.dist-info/WHEEL +5 -0
  37. sortscore-0.1.0b2.dist-info/entry_points.txt +2 -0
  38. sortscore-0.1.0b2.dist-info/licenses/LICENSE +21 -0
  39. sortscore-0.1.0b2.dist-info/top_level.txt +1 -0
sortscore/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ sortscore: A modular Python package for Sort-seq variant analysis.
3
+
4
+ This package provides tools for analyzing Sort-seq experimental data,
5
+ calculating activity scores, and generating visualizations.
6
+ """
7
+
8
+ __version__ = "0.1.0b2"
9
+ __author__ = "Caitlyn Chitwood"
10
+ __email__ = "c.chitwood@wustl.edu"
11
+
12
+ # Import main classes for convenience
13
+ from .utils.load_experiment import ExperimentConfig
14
+ from .analysis.score import calculate_full_activity_scores
15
+
16
+ __all__ = [
17
+ "ExperimentConfig",
18
+ "calculate_full_activity_scores"
19
+ ]
sortscore/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Allow running the sortscore package as a module with: python -m sortscore
3
+ """
4
+ from sortscore.cli import main
5
+
6
+ if __name__ == "__main__":
7
+ main()
@@ -0,0 +1,7 @@
1
+ """
2
+ Analysis subpackage for Sort-seq variant analysis workflows.
3
+
4
+ This subpackage contains modules for scoring, filtering, normalization, statistics, annotation, and batch workflows.
5
+ """
6
+
7
+ from .score import calculate_activity_scores
@@ -0,0 +1,221 @@
1
+ """
2
+ Amino acid scores processing and export for Sort-seq analysis.
3
+
4
+ This module provides functions for processing amino acid scores from DNA variant data,
5
+ including aggregation of synonymous codons, statistical analysis, and file export.
6
+ """
7
+ import os
8
+ import logging
9
+ import pandas as pd
10
+ import numpy as np
11
+ from scipy import stats as scipy_stats
12
+ from typing import Tuple, List
13
+ from sortscore.analysis.statistics import calculate_codon_and_replicate_variance
14
+
15
+
16
def process_and_save_aa_scores(scores_df: pd.DataFrame, experiment, scores_dir: str,
                               output_suffix: str, analysis_logger) -> None:
    """
    Build the amino-acid level score table and write it to a CSV file.

    The function returns immediately when ``scores_df`` has no ``aa_seq_diff``
    column. Otherwise it drops rows whose average score is missing, collects
    the per-replicate score columns (``Rep*.score``), delegates the statistics
    to ``_check_codon_num``, rounds the score columns, writes the result to
    ``<scores_dir>/<experiment_name>_aa_scores_<output_suffix>.csv`` and
    registers the file with ``analysis_logger``.

    Parameters
    ----------
    scores_df : pd.DataFrame
        Variant scores and annotations.
    experiment : ExperimentConfig
        Provides ``avg_method`` (selects the score column) and
        ``experiment_name`` (used in the output file name).
    scores_dir : str
        Directory the CSV file is written to.
    output_suffix : str
        Suffix appended to the output file name.
    analysis_logger : AnalysisLogger
        Object whose ``log_output_file`` method records the written file.

    Examples
    --------
    >>> process_and_save_aa_scores(scores_df, experiment, 'output/scores', 'suffix', logger)
    """
    # Nothing to do when the table carries no amino-acid annotation.
    if 'aa_seq_diff' not in scores_df.columns:
        return

    # 'simple-avg' maps to the plain column; any other method becomes a suffix.
    method = experiment.avg_method
    score_col = 'avgscore' if method == 'simple-avg' else f"avgscore_{method.replace('-', '_')}"

    scored = scores_df.dropna(subset=[score_col])
    rep_score_columns = [
        name for name in scored.columns
        if name.startswith('Rep') and name.endswith('.score')
    ]

    # Aggregate (if needed), then round the score columns for output.
    aa_scores = _round_score_columns(
        _check_codon_num(scored, score_col, rep_score_columns)
    )

    file_name = f"{experiment.experiment_name}_aa_scores_{output_suffix}.csv"
    aa_scores_file = os.path.join(scores_dir, file_name)
    aa_scores.to_csv(aa_scores_file, index=False)

    n_variants = len(aa_scores)
    logging.info(f"Saved AA scores to {aa_scores_file} ({n_variants} unique AA variants)")

    analysis_logger.log_output_file(
        'aa_scores',
        file_name,
        aa_scores_file,
        variant_count=n_variants,
    )
81
+
82
+
83
def _check_codon_num(scores_df_drop_nan: pd.DataFrame, score_col: str,
                     rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Dispatch to the aggregation path that matches the data.

    Rows are grouped by ``('aa_seq_diff', 'annotate_aa')``. If any group holds
    more than one row, the rows are aggregated with
    ``_process_dna_to_aa_aggregation``; otherwise each row is passed through
    ``_process_aa_only_scores`` unchanged apart from added replicate statistics.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Scores table with rows lacking an average score already removed.
    score_col : str
        Name of the average-score column.
    rep_score_columns : List[str]
        Names of the per-replicate score columns.

    Returns
    -------
    pd.DataFrame
        Amino-acid level scores produced by the selected path.
    """
    rows_per_aa_variant = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa']).size()

    # More than one row for the same AA variant means several entries must be merged.
    if (rows_per_aa_variant > 1).any():
        return _process_dna_to_aa_aggregation(scores_df_drop_nan, score_col, rep_score_columns)

    return _process_aa_only_scores(scores_df_drop_nan, rep_score_columns)
114
+
115
+
116
def _process_dna_to_aa_aggregation(scores_df_drop_nan: pd.DataFrame, score_col: str,
                                   rep_score_columns: List[str]) -> pd.DataFrame:
    """
    Collapse rows sharing an amino-acid variant into one row per variant.

    For every ``('aa_seq_diff', 'annotate_aa')`` group the function computes
    the mean of ``avgscore``, ``avgscore_rep_weighted`` and each replicate
    score column, plus the standard deviation (``SD_codon``) and row count
    (``n_codons``) of ``score_col``. The merged table is then handed to
    ``calculate_codon_and_replicate_variance``.

    Parameters
    ----------
    scores_df_drop_nan : pd.DataFrame
        Scores table with rows lacking an average score already removed.
    score_col : str
        Column whose within-group spread is reported.
    rep_score_columns : List[str]
        Names of the per-replicate score columns.

    Returns
    -------
    pd.DataFrame
        One row per amino-acid variant, as returned by
        ``calculate_codon_and_replicate_variance``.
    """
    group_keys = ['aa_seq_diff', 'annotate_aa']
    mean_columns = ['avgscore', 'avgscore_rep_weighted'] + rep_score_columns

    # Spread and size of each group, taken before the rows are averaged away.
    codon_spread = (
        scores_df_drop_nan.groupby(group_keys)[score_col]
        .agg(['std', 'count'])
        .reset_index()
    )
    codon_spread.columns = ['aa_seq_diff', 'annotate_aa', 'SD_codon', 'n_codons']

    # Per-group means of the average and replicate scores.
    aa_means = scores_df_drop_nan.groupby(group_keys)[mean_columns].mean().reset_index()

    merged = aa_means.merge(codon_spread, on=group_keys, how='left')

    return calculate_codon_and_replicate_variance(merged, rep_score_columns)
152
+
153
+
154
+ def _process_aa_only_scores(scores_df_drop_nan: pd.DataFrame, rep_score_columns: List[str]) -> pd.DataFrame:
155
+ """
156
+ Process AA-only case with simple replicate statistics.
157
+
158
+ Parameters
159
+ ----------
160
+ scores_df_drop_nan : pd.DataFrame
161
+ DataFrame with variant scores (NaN filtered)
162
+ rep_score_columns : List[str]
163
+ List of replicate score column names
164
+
165
+ Returns
166
+ -------
167
+ pd.DataFrame
168
+ AA scores with replicate statistics only
169
+ """
170
+ # AA-only case: no aggregation needed, just copy the data
171
+ columns_to_include = ['aa_seq_diff', 'annotate_aa', 'avgscore', 'avgscore_rep_weighted'] + rep_score_columns
172
+
173
+ aa_scores = scores_df_drop_nan[columns_to_include].copy()
174
+
175
+ # Calculate simple replicate statistics (no codon variance)
176
+ if len(rep_score_columns) >= 2:
177
+ aa_rep_mean = aa_scores[rep_score_columns].mean(axis=1)
178
+ aa_rep_std = aa_scores[rep_score_columns].std(axis=1, ddof=1)
179
+
180
+ # Calculate n_measurements (just number of non-empty replicates)
181
+ n_measurements = aa_scores[rep_score_columns].notna().sum(axis=1)
182
+
183
+ # Calculate SEM using only replicate variance
184
+ sem = aa_rep_std / np.sqrt(n_measurements)
185
+
186
+ # Calculate 95% CI using t-distribution
187
+ df_actual = n_measurements - 1
188
+ t_critical = scipy_stats.t.ppf(0.975, df_actual)
189
+ aa_margin_of_error = t_critical * sem
190
+
191
+ aa_scores['SD_rep'] = aa_rep_std.round().astype('Int64')
192
+ aa_scores['CV_rep'] = (aa_rep_std / aa_rep_mean).round(3)
193
+ aa_scores['n_measurements'] = n_measurements.astype('Int64')
194
+ aa_scores['SEM'] = sem.round().astype('Int64')
195
+ aa_scores['CI_lower'] = (aa_rep_mean - aa_margin_of_error).round().astype('Int64')
196
+ aa_scores['CI_upper'] = (aa_rep_mean + aa_margin_of_error).round().astype('Int64')
197
+
198
+ return aa_scores
199
+
200
+
201
+ def _round_score_columns(aa_scores: pd.DataFrame) -> pd.DataFrame:
202
+ """
203
+ Round score columns to integers for cleaner output.
204
+
205
+ Parameters
206
+ ----------
207
+ aa_scores : pd.DataFrame
208
+ DataFrame containing score columns
209
+
210
+ Returns
211
+ -------
212
+ pd.DataFrame
213
+ DataFrame with score columns rounded to integers
214
+ """
215
+ # Round score columns to integers
216
+ score_columns = [col for col in aa_scores.columns if 'score' in col.lower()]
217
+ for col in score_columns:
218
+ if aa_scores[col].dtype in ['float64', 'float32']:
219
+ aa_scores[col] = aa_scores[col].round().astype('Int64')
220
+
221
+ return aa_scores
@@ -0,0 +1,191 @@
1
+ """
2
+ Sequence annotation utilities for Sort-seq variant analysis.
3
+
4
+ This module provides functions for annotating variant DataFrames with sequence differences,
5
+ translations, and other derived sequence information.
6
+
7
+ Examples
8
+ --------
9
+ >>> from sortscore.analysis.annotation import annotate_scores_dataframe
10
+ >>> annotated_df = annotate_scores_dataframe(scores_df, wt_dna_seq, mutagenesis_type='codon')
11
+ """
12
+ import pandas as pd
13
+ from sortscore.utils.sequence_parsing import compare_to_reference, compare_codon_lists, translate_dna
14
+
15
+ # TODO: #37 redundant, see if we can remove
16
def annotate_scores_dataframe(
    scores_df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``scores_df`` with sequence-difference and category columns.

    For ``'codon'`` and ``'snv'`` the ``variant_seq`` column is compared to
    ``wt_dna_seq`` to fill ``codon_diff`` and ``dna_seq_diff``; ``aa_seq`` and
    ``aa_seq_diff`` are derived by translation unless ``aa_seq_diff`` is
    already present. For ``'aa'`` only ``aa_seq_diff`` is derived, again only
    when it is not already present. Any other ``mutagenesis_type`` adds no
    difference columns. Finally ``X`` and ``Ter`` in ``aa_seq_diff`` are
    replaced by ``*`` and ``add_variant_categories`` is applied.

    Parameters
    ----------
    scores_df : pd.DataFrame
        DataFrame with a ``variant_seq`` column and scores.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    annotated_df : pd.DataFrame
        Annotated copy; the input frame is not modified.

    Examples
    --------
    >>> annotated_df = annotate_scores_dataframe(scores_df, wt_seq, 'codon')
    """
    df = scores_df.copy()

    # Pre-annotated tables already carry aa_seq_diff; it is then left as supplied.
    needs_aa_annotation = 'aa_seq_diff' not in df.columns

    if mutagenesis_type in {'codon', 'snv'}:
        # Codon-level differences against the wild type.
        df['codon_diff'] = df['variant_seq'].apply(
            lambda seq: compare_codon_lists(wt_dna_seq, seq)
        ).fillna('')

        # Nucleotide-level differences against the wild type.
        df['dna_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_dna_seq, seq)
        ).fillna('')

        if needs_aa_annotation:
            wt_aa_seq = translate_dna(wt_dna_seq)
            df['aa_seq'] = df['variant_seq'].apply(translate_dna)
            df['aa_seq_diff'] = df['aa_seq'].apply(
                lambda seq: compare_to_reference(wt_aa_seq, seq)
            ).fillna('')

    elif mutagenesis_type == 'aa' and needs_aa_annotation:
        # The reference is translated only when its length is a multiple of three;
        # otherwise it is used as given.
        wt_aa_seq = translate_dna(wt_dna_seq) if len(wt_dna_seq) % 3 == 0 else wt_dna_seq
        df['aa_seq_diff'] = df['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    # Normalise stop representations ('X', 'Ter') to '*'.
    if 'aa_seq_diff' in df.columns:
        df['aa_seq_diff'] = (
            df['aa_seq_diff']
            .str.replace('X', '*', regex=False)
            .str.replace('Ter', '*', regex=False)
        )

    return add_variant_categories(df)
88
+
89
+ # TODO: #37 isn't this redundant with similar functions
90
def add_sequence_differences(
    df: pd.DataFrame,
    wt_dna_seq: str,
    mutagenesis_type: str = 'aa',
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with sequence-difference columns added.

    For ``'codon'`` and ``'snv'`` the function fills ``dna_seq_diff``,
    ``aa_seq`` and ``aa_seq_diff``; for ``'aa'`` it fills ``aa_seq_diff``
    only. Existing columns of the same name are overwritten. Any other
    ``mutagenesis_type`` returns an unchanged copy.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with a ``variant_seq`` column.
    wt_dna_seq : str
        Wild-type reference sequence.
    mutagenesis_type : str, default 'aa'
        Type of mutagenesis ('codon', 'snv', 'aa').

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with the difference columns added.
    """
    result = df.copy()

    if mutagenesis_type in {'codon', 'snv'}:
        # Nucleotide-level differences against the wild type.
        result['dna_seq_diff'] = result['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_dna_seq, seq)
        ).fillna('')

        # Translate both sides, then compare at the amino-acid level.
        wt_aa_seq = translate_dna(wt_dna_seq)
        result['aa_seq'] = result['variant_seq'].apply(translate_dna)
        result['aa_seq_diff'] = result['aa_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    elif mutagenesis_type == 'aa':
        # The reference is translated only when its length is a multiple of three;
        # otherwise it is used as given.
        wt_aa_seq = translate_dna(wt_dna_seq) if len(wt_dna_seq) % 3 == 0 else wt_dna_seq
        result['aa_seq_diff'] = result['variant_seq'].apply(
            lambda seq: compare_to_reference(wt_aa_seq, seq)
        ).fillna('')

    return result
138
+
139
def classify_aa_variant(aa_diff, dna_diff=None):
    """
    Classify a variant from its amino-acid difference string.

    Parameters
    ----------
    aa_diff : str or missing
        Amino-acid difference relative to wild type. An empty string, ``None``
        or a missing value (NaN / ``pd.NA``) means "no amino-acid change".
    dna_diff : str or missing, optional
        DNA difference relative to wild type. ``None`` (the default) means the
        DNA difference is unknown.

    Returns
    -------
    str
        ``'wt_dna'`` when there is no amino-acid change and ``dna_diff`` is
        known to be empty; ``'synonymous'`` when there is no amino-acid change
        otherwise; ``'nonsense'`` when ``aa_diff`` contains ``'*'``;
        ``'missense_aa'`` for everything else.
    """
    def _is_blank(value):
        # Missing markers count as "no difference". Without this, a NaN read
        # from a pre-annotated table would reach the "'*' in aa_diff" test and
        # raise TypeError.
        if value is None:
            return True
        if isinstance(value, str):
            return value == ''
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return True
        return not value

    if _is_blank(aa_diff):
        # No amino-acid change: wild type only if the DNA is known to be unchanged.
        if dna_diff is not None and _is_blank(dna_diff):
            return 'wt_dna'
        return 'synonymous'

    if '*' in aa_diff:
        return 'nonsense'

    return 'missense_aa'
150
+
151
def classify_dna_variant(dna_diff, aa_diff):
    """
    Classify a variant from its DNA and amino-acid difference strings.

    Parameters
    ----------
    dna_diff : str or missing
        DNA difference relative to wild type. An empty string, ``None`` or a
        missing value (NaN / ``pd.NA``) means "no DNA change".
    aa_diff : str or missing
        Amino-acid difference relative to wild type, with the same convention
        for empty and missing values.

    Returns
    -------
    str
        ``'wt_dna'`` when there is no DNA change, ``'synonymous'`` when the DNA
        changed but the amino-acid sequence did not, ``'missense_dna'``
        otherwise.
    """
    def _is_blank(value):
        # NaN is truthy in Python, so a missing aa_diff would otherwise be
        # treated as a real amino-acid change and labelled 'missense_dna'.
        if value is None:
            return True
        if isinstance(value, str):
            return value == ''
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return True
        return not value

    if _is_blank(dna_diff):
        return 'wt_dna'
    if _is_blank(aa_diff):
        return 'synonymous'
    return 'missense_dna'
158
+
159
+
160
def add_variant_categories(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add variant category annotations based on existing sequence difference columns.

    ``annotate_dna`` is added when ``dna_seq_diff`` is present and
    ``annotate_aa`` is added when ``aa_seq_diff`` is present. When both
    difference columns exist, each classification uses both of them via
    ``classify_dna_variant`` / ``classify_aa_variant``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with sequence difference columns (aa_seq_diff, dna_seq_diff).

    Returns
    -------
    df : pd.DataFrame
        Copy of the input with variant category columns added; the input frame
        is not modified.
    """
    df = df.copy()

    has_dna = 'dna_seq_diff' in df.columns
    has_aa = 'aa_seq_diff' in df.columns

    # The two-column cases iterate the columns in lockstep instead of using
    # DataFrame.apply(axis=1): that avoids building a Series per row and always
    # yields a plain list, including for an empty frame.
    if has_dna:
        if has_aa:
            df['annotate_dna'] = [
                classify_dna_variant(dna_diff, aa_diff)
                for dna_diff, aa_diff in zip(df['dna_seq_diff'], df['aa_seq_diff'])
            ]
        else:
            # Without amino-acid information any DNA change is reported as missense.
            df['annotate_dna'] = df['dna_seq_diff'].apply(lambda x: 'missense_dna' if x else 'wt_dna')

    if has_aa:
        if has_dna:
            df['annotate_aa'] = [
                classify_aa_variant(aa_diff, dna_diff)
                for aa_diff, dna_diff in zip(df['aa_seq_diff'], df['dna_seq_diff'])
            ]
        else:
            df['annotate_aa'] = df['aa_seq_diff'].apply(classify_aa_variant)

    return df
@@ -0,0 +1,170 @@
1
+ """
2
+ Batch configuration module for Sort-seq variant analysis.
3
+
4
+ This module provides framework for combining and normalizing multiple
5
+ Sort-seq experiments in batch processing workflows.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Optional, Any
12
+
13
+
14
@dataclass
class BatchConfig:
    """
    Dataclass for batch processing configuration.

    This class manages configuration for combining and normalizing multiple Sort-seq experiments.
    It supports two documented normalization methods:

    1. **2-pole normalization**: Uses synonymous and pathogenic variants as reference points
    2. **Z-score scaled three-step normalization** (default): Creates standardized scale where synonymous
       variants center around 0 with unit variance, making cross-experiment comparisons meaningful

    ``validate_config`` additionally accepts the method name ``'zscore_center'``.

    Attributes
    ----------
    experiments : List[Dict[str, Any]]
        One entry per experiment. ``validate_config`` requires each entry to
        provide 'tile', 'output_dir', 'wt_seq', 'min_pos' and 'max_pos'.
    batch_normalization_method : str, optional
        Normalization method to use ('zscore_2pole', '2pole', or 'zscore_center'), default 'zscore_2pole'
    pathogenic_control_type : str, optional
        Type of pathogenic control ('nonsense' or 'custom'), default 'nonsense'
    pathogenic_variants : Optional[List[str]], optional
        Custom pathogenic variants if using 'custom' pathogenic_control_type
    combined_output_dir : str, optional
        Directory for final combined results, default './normalized'
    """
    experiments: List[Dict[str, Any]]
    batch_normalization_method: str = 'zscore_2pole'
    pathogenic_control_type: str = 'nonsense'
    pathogenic_variants: Optional[List[str]] = None
    combined_output_dir: str = './normalized'

    @classmethod
    def from_json(cls, json_path: str) -> 'BatchConfig':
        """
        Load batch configuration from a JSON file.

        Relative ``output_dir`` values of the experiment entries and a
        ``combined_output_dir`` given in the file are resolved against the
        directory that contains the config file. Optional fields missing from
        the file keep their dataclass defaults.

        Parameters
        ----------
        json_path : str
            Path to the JSON batch config file

        Returns
        -------
        config : BatchConfig
            Loaded batch configuration

        Raises
        ------
        ValueError
            If the JSON root is not an object or has no 'experiments' key
        """
        json_path_obj = Path(json_path).expanduser().resolve()
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(json_path_obj, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict) or 'experiments' not in data:
            raise ValueError("Batch configuration requires 'experiments'")
        args: Dict[str, Any] = {'experiments': data['experiments']}

        # Optional fields (only add if present to preserve defaults)
        optional_fields = (
            'batch_normalization_method', 'pathogenic_control_type',
            'pathogenic_variants', 'combined_output_dir',
        )
        for key in optional_fields:
            if key in data:
                args[key] = data[key]

        # Resolve output paths relative to the config file directory.
        config_dir = json_path_obj.parent

        def _resolve(raw_path: Any) -> str:
            return str((config_dir / Path(str(raw_path)).expanduser()).resolve())

        experiments = []
        for entry in args['experiments']:
            entry_cfg = dict(entry)  # copy so the parsed JSON is left untouched
            if entry_cfg.get('output_dir') is not None:
                entry_cfg['output_dir'] = _resolve(entry_cfg['output_dir'])
            experiments.append(entry_cfg)
        args['experiments'] = experiments

        if args.get('combined_output_dir') is not None:
            args['combined_output_dir'] = _resolve(args['combined_output_dir'])

        return cls(**args)

    def validate_config(self) -> None:
        """
        Validate batch configuration parameters.

        Raises
        ------
        ValueError
            If configuration parameters are invalid
        FileNotFoundError
            If an experiment's output_dir does not exist
        """
        if not self.experiments:
            raise ValueError("experiments must contain at least one entry")

        required_keys = ('tile', 'output_dir', 'wt_seq', 'min_pos', 'max_pos')
        # int() signals bad input with these three exception types only.
        int_errors = (TypeError, ValueError, OverflowError)

        seen_tiles = set()
        for idx, cfg in enumerate(self.experiments):
            for key in required_keys:
                if key not in cfg:
                    raise ValueError(f"experiments[{idx}] missing '{key}'")
            try:
                tile = int(cfg['tile'])
            except int_errors as e:
                raise ValueError(f"experiments[{idx}].tile must be int-like") from e
            try:
                min_pos = int(cfg['min_pos'])
                max_pos = int(cfg['max_pos'])
            except int_errors as e:
                raise ValueError(f"experiments[{idx}].min_pos/max_pos must be int-like") from e
            if min_pos >= max_pos:
                raise ValueError(f"experiments[{idx}] min_pos must be less than max_pos")
            if tile in seen_tiles:
                raise ValueError(f"Duplicate tile value in experiments: {tile}")
            seen_tiles.add(tile)

            out_dir = Path(str(cfg['output_dir'])).expanduser().resolve()
            if not out_dir.exists():
                raise FileNotFoundError(f"experiments[{idx}] output_dir does not exist: {out_dir}")

        # Validate normalization method
        valid_methods = ['zscore_2pole', '2pole', 'zscore_center']
        if self.batch_normalization_method not in valid_methods:
            raise ValueError(f"Invalid normalization method: {self.batch_normalization_method}. "
                             f"Must be one of: {valid_methods}")

        # Validate pathogenic control type
        valid_control_types = ['nonsense', 'custom']
        if self.pathogenic_control_type not in valid_control_types:
            raise ValueError(f"Invalid pathogenic control type: {self.pathogenic_control_type}. "
                             f"Must be one of: {valid_control_types}")

        # If using custom pathogenic controls, ensure variants are specified
        if self.pathogenic_control_type == 'custom' and not self.pathogenic_variants:
            raise ValueError("pathogenic_variants must be specified when using 'custom' pathogenic_control_type")

    def get_batch_config_dict(self) -> Dict[str, Any]:
        """
        Convert batch configuration to dictionary format for processing.

        Returns
        -------
        Dict[str, Any]
            Configuration dictionary for batch processing functions
        """
        return {
            'experiments': self.experiments,
            'batch_normalization_method': self.batch_normalization_method,
            'pathogenic_control_type': self.pathogenic_control_type,
            'pathogenic_variants': self.pathogenic_variants,
            'combined_output_dir': self.combined_output_dir
        }