sortscore 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. sortscore/__init__.py +19 -0
  2. sortscore/__main__.py +7 -0
  3. sortscore/analysis/__init__.py +7 -0
  4. sortscore/analysis/aa_scores.py +214 -0
  5. sortscore/analysis/annotation.py +188 -0
  6. sortscore/analysis/batch_config.py +170 -0
  7. sortscore/analysis/batch_normalization.py +978 -0
  8. sortscore/analysis/batch_workflow.py +70 -0
  9. sortscore/analysis/filtering.py +53 -0
  10. sortscore/analysis/normalize_read_depth.py +99 -0
  11. sortscore/analysis/score.py +243 -0
  12. sortscore/analysis/statistics.py +197 -0
  13. sortscore/analysis/summary_stats.py +161 -0
  14. sortscore/analysis/variant_aggregation.py +94 -0
  15. sortscore/analysis/workflows.py +231 -0
  16. sortscore/cli.py +48 -0
  17. sortscore/run_analysis.py +221 -0
  18. sortscore/run_batch_analysis.py +81 -0
  19. sortscore/utils/analysis_logger.py +268 -0
  20. sortscore/utils/console_utils.py +203 -0
  21. sortscore/utils/experiment_setup.py +325 -0
  22. sortscore/utils/file_utils.py +158 -0
  23. sortscore/utils/load_experiment.py +675 -0
  24. sortscore/utils/sequence_parsing.py +318 -0
  25. sortscore/utils/tile_configs.py +115 -0
  26. sortscore/utils/variant_detection.py +329 -0
  27. sortscore/utils/variant_parsing.py +68 -0
  28. sortscore/visualization/__init__.py +5 -0
  29. sortscore/visualization/correlations.py +358 -0
  30. sortscore/visualization/heatmap_matrix.py +180 -0
  31. sortscore/visualization/heatmap_workflow.py +213 -0
  32. sortscore/visualization/heatmaps.py +728 -0
  33. sortscore/visualization/plots.py +358 -0
  34. sortscore-0.1.0.dist-info/METADATA +75 -0
  35. sortscore-0.1.0.dist-info/RECORD +39 -0
  36. sortscore-0.1.0.dist-info/WHEEL +5 -0
  37. sortscore-0.1.0.dist-info/entry_points.txt +2 -0
  38. sortscore-0.1.0.dist-info/licenses/LICENSE +21 -0
  39. sortscore-0.1.0.dist-info/top_level.txt +1 -0
sortscore/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ sortscore: A modular Python package for Sort-seq variant analysis.
3
+
4
+ This package provides tools for analyzing Sort-seq experimental data,
5
+ calculating activity scores, and generating visualizations.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Caitlyn Chitwood"
10
+ __email__ = "c.chitwood@wustl.edu"
11
+
12
+ # Import main classes for convenience
13
+ from .utils.load_experiment import ExperimentConfig
14
+ from .analysis.score import calculate_full_activity_scores
15
+
16
+ __all__ = [
17
+ "ExperimentConfig",
18
+ "calculate_full_activity_scores"
19
+ ]
sortscore/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """
2
+ Allow running the sortscore package as a module with: python -m sortscore
3
+ """
4
+ from sortscore.cli import main
5
+
6
+ if __name__ == "__main__":
7
+ main()
@@ -0,0 +1,7 @@
1
+ """
2
+ Analysis subpackage for Sort-seq variant analysis workflows.
3
+
4
+ This subpackage contains modules for configuration, parameters, I/O, and utilities.
5
+ """
6
+
7
+ from .score import calculate_activity_scores
@@ -0,0 +1,214 @@
1
+ """
2
+ Amino acid scores processing and export for Sort-seq analysis.
3
+
4
+ This module provides functions for processing amino acid scores from DNA variant data,
5
+ including aggregation of synonymous codons, statistical analysis, and file export.
6
+ """
7
+ import os
8
+ import logging
9
+ import pandas as pd
10
+ import numpy as np
11
+ from scipy import stats as scipy_stats
12
+ from typing import List
13
+ from sortscore.analysis.statistics import calculate_codon_and_replicate_variance
14
+
15
+
16
+ def _get_score_column_from_avg_method(avg_method: str) -> str:
17
+ """Return the score column name for a given averaging method."""
18
+ if avg_method == 'simple-avg':
19
+ return 'avgscore'
20
+ return f"avgscore_{avg_method.replace('-', '_')}"
21
+
22
+
23
+ def process_and_save_aa_scores(scores_df: pd.DataFrame, experiment, scores_dir: str, analysis_logger) -> None:
24
+ """
25
+ Process and save amino acid scores from variant data.
26
+
27
+ This function handles the complete AA scores workflow including:
28
+ - Filtering out NaN values
29
+ - Checking if codon aggregation is needed
30
+ - Calculating appropriate statistics (with/without codon variance)
31
+ - Rounding score columns
32
+ - Saving to CSV file
33
+ - Logging output
34
+
35
+ Parameters
36
+ ----------
37
+ scores_df : pd.DataFrame
38
+ DataFrame containing variant scores and annotations
39
+ experiment : ExperimentConfig
40
+ Experiment configuration containing metadata
41
+ scores_dir : str
42
+ Directory to save scores file
43
+ analysis_logger : AnalysisLogger
44
+ Logger instance for recording outputs
45
+
46
+ Examples
47
+ --------
48
+ >>> process_and_save_aa_scores(scores_df, experiment, 'output/scores', 'suffix', logger)
49
+ """
50
+ if 'aa_seq_diff' not in scores_df.columns:
51
+ return
52
+
53
+ score_col = _get_score_column_from_avg_method(experiment.avg_method)
54
+
55
+ aa_scores = build_aa_scores_table(scores_df, score_col)
56
+
57
+ # Save to file
58
+ aa_scores_file = os.path.join(scores_dir, f"{experiment.experiment_name}_aa_scores.csv")
59
+ aa_scores.to_csv(aa_scores_file, index=False)
60
+ logging.info(f"Saved AA scores to {aa_scores_file} ({len(aa_scores)} unique AA variants)")
61
+
62
+ # Log file output
63
+ analysis_logger.log_output_file(
64
+ 'aa_scores',
65
+ f"{experiment.experiment_name}_aa_scores.csv",
66
+ aa_scores_file,
67
+ variant_count=len(aa_scores)
68
+ )
69
+
70
+
71
+ def build_aa_scores_table(scores_df: pd.DataFrame, score_col: str) -> pd.DataFrame:
72
+ """
73
+ Build the AA-level score table from a scored variant table.
74
+
75
+ Parameters
76
+ ----------
77
+ scores_df : pd.DataFrame
78
+ DataFrame containing variant scores and annotations.
79
+ score_col : str
80
+ Score column used to determine which rows are retained and how codon
81
+ variance is calculated.
82
+
83
+ Returns
84
+ -------
85
+ pd.DataFrame
86
+ AA-level score table with synonymous codons aggregated when needed.
87
+ """
88
+ if 'aa_seq_diff' not in scores_df.columns:
89
+ raise ValueError("scores_df must contain 'aa_seq_diff' column for AA score output")
90
+
91
+ scores_df_drop_nan = scores_df.dropna(subset=[score_col])
92
+ rep_score_columns = [
93
+ col for col in scores_df_drop_nan.columns
94
+ if col.startswith('Rep') and col.endswith('.score')
95
+ ]
96
+ return _check_codon_num(scores_df_drop_nan, score_col, rep_score_columns)
97
+
98
+
99
+ def _check_codon_num(scores_df_drop_nan: pd.DataFrame, score_col: str,
100
+ rep_score_columns: List[str]) -> pd.DataFrame:
101
+ """
102
+ Check if codon aggregation is needed and process AA scores accordingly.
103
+
104
+ This function checks if there are multiple codons per AA variant and processes
105
+ the data using either DNA->AA aggregation (with codon variance) or AA-only
106
+ statistics (replicate variance only).
107
+
108
+ Parameters
109
+ ----------
110
+ scores_df_drop_nan : pd.DataFrame
111
+ DataFrame with NaN values already filtered out
112
+ score_col : str
113
+ Name of the score column to use
114
+ rep_score_columns : List[str]
115
+ List of replicate score column names
116
+
117
+ Returns
118
+ -------
119
+ pd.DataFrame
120
+ Processed AA scores with appropriate statistics
121
+ """
122
+ # Check if there are multiple codons per AA variant (DNA->AA case)
123
+ aa_variant_counts = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa']).size()
124
+ needs_aggregation = (aa_variant_counts > 1).any()
125
+
126
+ if needs_aggregation:
127
+ return _process_dna_to_aa_aggregation(scores_df_drop_nan, score_col, rep_score_columns)
128
+ else:
129
+ return _process_aa_only_scores(scores_df_drop_nan, rep_score_columns)
130
+
131
+
132
+ def _process_dna_to_aa_aggregation(scores_df_drop_nan: pd.DataFrame, score_col: str,
133
+ rep_score_columns: List[str]) -> pd.DataFrame:
134
+ """
135
+ Process DNA->AA aggregation case with codon variance decomposition.
136
+
137
+ Parameters
138
+ ----------
139
+ scores_df_drop_nan : pd.DataFrame
140
+ DataFrame with variant scores (NaN filtered)
141
+ score_col : str
142
+ Name of the score column to use
143
+ rep_score_columns : List[str]
144
+ List of replicate score column names
145
+
146
+ Returns
147
+ -------
148
+ pd.DataFrame
149
+ Aggregated AA scores with codon and replicate statistics
150
+ """
151
+ # DNA->AA aggregation case: aggregate synonymous variants
152
+ columns_to_average = ['avgscore', 'avgscore_rep_weighted'] + rep_score_columns
153
+
154
+ # Calculate standard deviation and count of codon-level scores before AA aggregation
155
+ aa_scores_std = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa'])[score_col].agg(['std', 'count']).reset_index()
156
+ aa_scores_std.columns = ['aa_seq_diff', 'annotate_aa', 'SD_codon', 'n_codons']
157
+
158
+ # Calculate mean scores for aggregation
159
+ aa_scores = scores_df_drop_nan.groupby(['aa_seq_diff', 'annotate_aa'])[columns_to_average].mean().reset_index()
160
+
161
+ # Merge the standard deviation and count of codon scores
162
+ aa_scores = aa_scores.merge(aa_scores_std, on=['aa_seq_diff', 'annotate_aa'], how='left')
163
+
164
+ # Calculate statistics with codon and replicate variance decomposition
165
+ aa_scores = calculate_codon_and_replicate_variance(aa_scores, rep_score_columns)
166
+
167
+ return aa_scores
168
+
169
+
170
+ def _process_aa_only_scores(scores_df_drop_nan: pd.DataFrame, rep_score_columns: List[str]) -> pd.DataFrame:
171
+ """
172
+ Process AA-only case with simple replicate statistics.
173
+
174
+ Parameters
175
+ ----------
176
+ scores_df_drop_nan : pd.DataFrame
177
+ DataFrame with variant scores (NaN filtered)
178
+ rep_score_columns : List[str]
179
+ List of replicate score column names
180
+
181
+ Returns
182
+ -------
183
+ pd.DataFrame
184
+ AA scores with replicate statistics only
185
+ """
186
+ # AA-only case: no aggregation needed, just copy the data
187
+ columns_to_include = ['aa_seq_diff', 'annotate_aa', 'avgscore', 'avgscore_rep_weighted'] + rep_score_columns
188
+
189
+ aa_scores = scores_df_drop_nan[columns_to_include].copy()
190
+
191
+ # Calculate simple replicate statistics (no codon variance)
192
+ if len(rep_score_columns) >= 2:
193
+ aa_rep_mean = aa_scores[rep_score_columns].mean(axis=1)
194
+ aa_rep_std = aa_scores[rep_score_columns].std(axis=1, ddof=1)
195
+
196
+ # Calculate n_measurements (just number of non-empty replicates)
197
+ n_measurements = aa_scores[rep_score_columns].notna().sum(axis=1)
198
+
199
+ # Calculate SEM using only replicate variance
200
+ sem = aa_rep_std / np.sqrt(n_measurements)
201
+
202
+ # Calculate 95% CI using t-distribution
203
+ df_actual = n_measurements - 1
204
+ t_critical = scipy_stats.t.ppf(0.975, df_actual)
205
+ aa_margin_of_error = t_critical * sem
206
+
207
+ aa_scores['SD_rep'] = aa_rep_std
208
+ aa_scores['CV_rep'] = (aa_rep_std / aa_rep_mean).round(3)
209
+ aa_scores['n_measurements'] = n_measurements.astype('Int64')
210
+ aa_scores['SEM'] = sem
211
+ aa_scores['CI_lower'] = aa_rep_mean - aa_margin_of_error
212
+ aa_scores['CI_upper'] = aa_rep_mean + aa_margin_of_error
213
+
214
+ return aa_scores
@@ -0,0 +1,188 @@
1
+ """
2
+ Sequence annotation utilities for Sort-seq variant analysis.
3
+
4
+ This module provides functions for annotating variant DataFrames with sequence differences,
5
+ translations, and other derived sequence information.
6
+
7
+ Examples
8
+ --------
9
+ >>> from sortscore.analysis.annotation import annotate_scores_dataframe
10
+ >>> annotated_df = annotate_scores_dataframe(scores_df, experiment)
11
+ """
12
+ import pandas as pd
13
+ from sortscore.utils.sequence_parsing import compare_to_reference, compare_codon_lists, translate_dna
14
+
15
+ NO_DIFF_MARKER = '='
16
+
17
+
18
+ # TODO: #37 redundant, see if we can remove
19
+ def annotate_scores_dataframe(
20
+ scores_df: pd.DataFrame,
21
+ wt_dna_seq: str,
22
+ mutagenesis_type: str = 'aa',
23
+ ) -> pd.DataFrame:
24
+ """
25
+ Add sequence annotation columns to a scores DataFrame.
26
+
27
+ Parameters
28
+ ----------
29
+ scores_df : pd.DataFrame
30
+ DataFrame with variant sequences and scores.
31
+ wt_dna_seq : str
32
+ Wild-type DNA reference sequence.
33
+ mutagenesis_type : str, default 'aa'
34
+ Type of mutagenesis ('codon', 'snv', 'aa').
35
+
36
+ Returns
37
+ -------
38
+ annotated_df : pd.DataFrame
39
+ DataFrame with added annotation columns.
40
+
41
+ Examples
42
+ --------
43
+ >>> annotated_df = annotate_scores_dataframe(scores_df, wt_seq, 'dna')
44
+ """
45
+ df = scores_df.copy()
46
+
47
+ # Check if aa_seq_diff already exists (from pre-annotated data)
48
+ has_pre_annotated_aa = 'aa_seq_diff' in df.columns
49
+
50
+ # Treat 'dna' as a DNA-sequence variant type (full-length DNA sequences)
51
+ if mutagenesis_type in {'codon', 'snv'}:
52
+ # Add codon differences
53
+ df['codon_diff'] = df['variant_seq'].apply(
54
+ lambda x: compare_codon_lists(wt_dna_seq, x)
55
+ )
56
+ df['codon_diff'] = df['codon_diff'].fillna('')
57
+
58
+ # Add DNA sequence differences
59
+ df['dna_seq_diff'] = df['variant_seq'].apply(
60
+ lambda x: compare_to_reference(wt_dna_seq, x, no_change_marker=NO_DIFF_MARKER)
61
+ )
62
+
63
+ # Add AA sequence annotations only if not pre-annotated
64
+ if not has_pre_annotated_aa:
65
+ wt_aa_seq = translate_dna(wt_dna_seq)
66
+ df['aa_seq'] = df['variant_seq'].apply(translate_dna)
67
+ df['aa_seq_diff'] = df['aa_seq'].apply(
68
+ lambda x: compare_to_reference(wt_aa_seq, x, no_change_marker=NO_DIFF_MARKER)
69
+ )
70
+
71
+ elif mutagenesis_type == 'aa':
72
+ # For AA variants, add sequence differences only if not pre-annotated
73
+ if not has_pre_annotated_aa:
74
+ wt_aa_seq = translate_dna(wt_dna_seq) if len(wt_dna_seq) % 3 == 0 else wt_dna_seq
75
+ df['aa_seq_diff'] = df['variant_seq'].apply(
76
+ lambda x: compare_to_reference(wt_aa_seq, x, no_change_marker=NO_DIFF_MARKER)
77
+ )
78
+
79
+ # Map stop codon representations to * for standard notation in aa_seq_diff column
80
+ if 'aa_seq_diff' in df.columns:
81
+ df['aa_seq_diff'] = df['aa_seq_diff'].str.replace('X', '*', regex=False)
82
+ df['aa_seq_diff'] = df['aa_seq_diff'].str.replace('Ter', '*', regex=False)
83
+
84
+ # Add functional annotations
85
+ df = add_variant_categories(df)
86
+
87
+ return df
88
+
89
+ # TODO: #37 isn't this redundant with similar functions
90
+ def add_sequence_differences(
91
+ df: pd.DataFrame,
92
+ wt_dna_seq: str,
93
+ mutagenesis_type: str = 'aa',
94
+ ) -> pd.DataFrame:
95
+ """
96
+ Add sequence difference columns to a DataFrame.
97
+
98
+ Parameters
99
+ ----------
100
+ df : pd.DataFrame
101
+ DataFrame with variant sequences.
102
+ wt_dna_seq : str
103
+ Wild-type DNA sequence.
104
+ mutagenesis_type : str, default 'aa'
105
+ Type of mutagenesis ('codon', 'snv', 'aa').
106
+
107
+ Returns
108
+ -------
109
+ df : pd.DataFrame
110
+ DataFrame with sequence difference columns added.
111
+ """
112
+ df = df.copy()
113
+
114
+ if mutagenesis_type in {'codon', 'snv'}:
115
+ # Add DNA sequence differences
116
+ df['dna_seq_diff'] = df['variant_seq'].apply(
117
+ lambda x: compare_to_reference(wt_dna_seq, x, no_change_marker=NO_DIFF_MARKER)
118
+ )
119
+
120
+ # Add AA sequence differences
121
+ wt_aa_seq = translate_dna(wt_dna_seq)
122
+ df['aa_seq'] = df['variant_seq'].apply(translate_dna)
123
+ df['aa_seq_diff'] = df['aa_seq'].apply(
124
+ lambda x: compare_to_reference(wt_aa_seq, x, no_change_marker=NO_DIFF_MARKER)
125
+ )
126
+
127
+ elif mutagenesis_type == 'aa':
128
+ # For AA variants, sequences are already amino acids
129
+ wt_aa_seq = translate_dna(wt_dna_seq) if len(wt_dna_seq) % 3 == 0 else wt_dna_seq
130
+ df['aa_seq_diff'] = df['variant_seq'].apply(
131
+ lambda x: compare_to_reference(wt_aa_seq, x, no_change_marker=NO_DIFF_MARKER)
132
+ )
133
+
134
+ return df
135
+
136
+ def classify_aa_variant(aa_diff, dna_diff=None):
137
+ if aa_diff == NO_DIFF_MARKER:
138
+ # Check if this is true WT (no DNA changes) or synonymous (DNA changes but same AA)
139
+ if dna_diff == NO_DIFF_MARKER:
140
+ return 'wt_dna'
141
+ else:
142
+ return 'synonymous'
143
+ elif '*' in aa_diff:
144
+ return 'nonsense'
145
+ else:
146
+ return 'missense_aa'
147
+
148
+ def classify_dna_variant(dna_diff, aa_diff):
149
+ if dna_diff == NO_DIFF_MARKER:
150
+ return 'wt_dna'
151
+ elif aa_diff == NO_DIFF_MARKER:
152
+ return 'synonymous'
153
+ else:
154
+ return 'missense_dna'
155
+
156
+
157
+ def add_variant_categories(df: pd.DataFrame) -> pd.DataFrame:
158
+ """
159
+ Add variant category annotations based on existing sequence difference columns.
160
+
161
+ Parameters
162
+ ----------
163
+ df : pd.DataFrame
164
+ DataFrame with sequence difference columns (aa_seq_diff, dna_seq_diff).
165
+
166
+ Returns
167
+ -------
168
+ df : pd.DataFrame
169
+ DataFrame with variant category columns added.
170
+ """
171
+ df = df.copy()
172
+
173
+ # Classify DNA variants
174
+ if 'dna_seq_diff' in df.columns:
175
+ if 'aa_seq_diff' in df.columns:
176
+ df['annotate_dna'] = df.apply(lambda row: classify_dna_variant(row['dna_seq_diff'], row['aa_seq_diff']), axis=1)
177
+ else:
178
+ df['annotate_dna'] = df['dna_seq_diff'].apply(lambda x: 'missense_dna' if x else 'wt_dna')
179
+
180
+
181
+ # Classify variants based on AA changes
182
+ if 'aa_seq_diff' in df.columns:
183
+ if 'dna_seq_diff' in df.columns:
184
+ df['annotate_aa'] = df.apply(lambda row: classify_aa_variant(row['aa_seq_diff'], row['dna_seq_diff']), axis=1)
185
+ else:
186
+ df['annotate_aa'] = df['aa_seq_diff'].apply(classify_aa_variant)
187
+
188
+ return df
@@ -0,0 +1,170 @@
1
+ """
2
+ Batch configuration module for Sort-seq variant analysis.
3
+
4
+ This module provides framework for combining and normalizing multiple
5
+ Sort-seq experiments in batch processing workflows.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from dataclasses import dataclass
11
+ from typing import Dict, List, Optional, Any
12
+
13
+
14
+ @dataclass
15
+ class BatchConfig:
16
+ """
17
+ Dataclass for batch processing configuration.
18
+
19
+ This class manages configuration for combining and normalizing multiple Sort-seq experiments.
20
+ It supports two normalization methods:
21
+
22
+ 1. **2-pole normalization**: Uses synonymous and pathogenic variants as reference points
23
+ 2. **Z-score scaled three-step normalization** (default): Creates standardized scale where synonymous
24
+ variants center around 0 with unit variance, making cross-experiment comparisons meaningful
25
+
26
+ Attributes
27
+ ----------
28
+ batch_normalization_method : str, optional
29
+ Normalization method to use ('zscore_2pole', '2pole', or 'zscore_center'), default 'zscore_2pole'
30
+ pathogenic_control_type : str, optional
31
+ Type of pathogenic control ('nonsense' or 'custom'), default 'nonsense'
32
+ pathogenic_variants : Optional[List[str]], optional
33
+ Custom pathogenic variants if using 'custom' pathogenic_control_type
34
+ combined_output_dir : str, optional
35
+ Directory for final combined results, default current directory
36
+ """
37
+ experiments: List[Dict[str, Any]]
38
+ batch_normalization_method: str = 'zscore_2pole'
39
+ pathogenic_control_type: str = 'nonsense'
40
+ pathogenic_variants: Optional[List[str]] = None
41
+ combined_output_dir: str = './normalized'
42
+
43
+ @staticmethod
44
+ def from_json(json_path: str) -> 'BatchConfig':
45
+ """
46
+ Load batch configuration from a JSON file.
47
+
48
+ Parameters
49
+ ----------
50
+ json_path : str
51
+ Path to the JSON batch config file
52
+
53
+ Returns
54
+ -------
55
+ config : BatchConfig
56
+ Loaded batch configuration
57
+ """
58
+ json_path_obj = Path(json_path).expanduser().resolve()
59
+ with open(json_path_obj, 'r') as f:
60
+ data = json.load(f)
61
+ if 'experiments' not in data:
62
+ raise ValueError("Batch configuration requires 'experiments'")
63
+ args: Dict[str, Any] = {'experiments': data['experiments']}
64
+
65
+ # Optional fields (only add if present to preserve defaults)
66
+ optional_fields = [
67
+ 'batch_normalization_method', 'pathogenic_control_type',
68
+ 'pathogenic_variants', 'combined_output_dir'
69
+ ]
70
+
71
+ for field in optional_fields:
72
+ if field in data:
73
+ args[field] = data[field]
74
+
75
+ # Resolve output paths relative to the config file directory.
76
+ config_dir = json_path_obj.parent
77
+ experiments = []
78
+ for entry in args['experiments']:
79
+ entry_cfg = dict(entry)
80
+ if 'output_dir' in entry_cfg and entry_cfg['output_dir'] is not None:
81
+ entry_cfg['output_dir'] = str(
82
+ (config_dir / Path(str(entry_cfg['output_dir'])).expanduser()).resolve()
83
+ )
84
+ experiments.append(entry_cfg)
85
+ args['experiments'] = experiments
86
+
87
+ if 'combined_output_dir' in args and args['combined_output_dir'] is not None:
88
+ args['combined_output_dir'] = str(
89
+ (config_dir / Path(str(args['combined_output_dir'])).expanduser()).resolve()
90
+ )
91
+
92
+ return BatchConfig(**args)
93
+
94
+ def validate_config(self) -> None:
95
+ """
96
+ Validate batch configuration parameters.
97
+
98
+ Raises
99
+ ------
100
+ ValueError
101
+ If configuration parameters are invalid
102
+ FileNotFoundError
103
+ If experiment config files don't exist
104
+ """
105
+ if not self.experiments:
106
+ raise ValueError("experiments must contain at least one entry")
107
+
108
+ seen_tiles = set()
109
+ for idx, cfg in enumerate(self.experiments):
110
+ if 'tile' not in cfg:
111
+ raise ValueError(f"experiments[{idx}] missing 'tile'")
112
+ if 'output_dir' not in cfg:
113
+ raise ValueError(f"experiments[{idx}] missing 'output_dir'")
114
+ if 'wt_seq' not in cfg:
115
+ raise ValueError(f"experiments[{idx}] missing 'wt_seq'")
116
+ if 'min_pos' not in cfg:
117
+ raise ValueError(f"experiments[{idx}] missing 'min_pos'")
118
+ if 'max_pos' not in cfg:
119
+ raise ValueError(f"experiments[{idx}] missing 'max_pos'")
120
+ try:
121
+ tile = int(cfg['tile'])
122
+ except Exception as e:
123
+ raise ValueError(f"experiments[{idx}].tile must be int-like") from e
124
+ try:
125
+ min_pos = int(cfg['min_pos'])
126
+ max_pos = int(cfg['max_pos'])
127
+ except Exception as e:
128
+ raise ValueError(f"experiments[{idx}].min_pos/max_pos must be int-like") from e
129
+ if min_pos >= max_pos:
130
+ raise ValueError(f"experiments[{idx}] min_pos must be less than max_pos")
131
+ if tile in seen_tiles:
132
+ raise ValueError(f"Duplicate tile value in experiments: {tile}")
133
+ seen_tiles.add(tile)
134
+
135
+ out_dir = Path(str(cfg['output_dir'])).expanduser().resolve()
136
+ if not out_dir.exists():
137
+ raise FileNotFoundError(f"experiments[{idx}] output_dir does not exist: {out_dir}")
138
+
139
+ # Validate normalization method
140
+ valid_methods = ['zscore_2pole', '2pole', 'zscore_center']
141
+ if self.batch_normalization_method not in valid_methods:
142
+ raise ValueError(f"Invalid normalization method: {self.batch_normalization_method}. "
143
+ f"Must be one of: {valid_methods}")
144
+
145
+ # Validate pathogenic control type
146
+ valid_control_types = ['nonsense', 'custom']
147
+ if self.pathogenic_control_type not in valid_control_types:
148
+ raise ValueError(f"Invalid pathogenic control type: {self.pathogenic_control_type}. "
149
+ f"Must be one of: {valid_control_types}")
150
+
151
+ # If using custom pathogenic controls, ensure variants are specified
152
+ if self.pathogenic_control_type == 'custom' and not self.pathogenic_variants:
153
+ raise ValueError("pathogenic_variants must be specified when using 'custom' pathogenic_control_type")
154
+
155
+ def get_batch_config_dict(self) -> Dict[str, Any]:
156
+ """
157
+ Convert batch configuration to dictionary format for processing.
158
+
159
+ Returns
160
+ -------
161
+ Dict[str, Any]
162
+ Configuration dictionary for batch processing functions
163
+ """
164
+ return {
165
+ 'experiments': self.experiments,
166
+ 'batch_normalization_method': self.batch_normalization_method,
167
+ 'pathogenic_control_type': self.pathogenic_control_type,
168
+ 'pathogenic_variants': self.pathogenic_variants,
169
+ 'combined_output_dir': self.combined_output_dir
170
+ }