smftools-0.2.3-py3-none-any.whl → smftools-0.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +168 -145
- smftools/cli/load_adata.py +155 -95
- smftools/cli/preprocess_adata.py +222 -130
- smftools/cli/spatial_adata.py +441 -308
- smftools/cli_entry.py +4 -5
- smftools/config/conversion.yaml +12 -5
- smftools/config/deaminase.yaml +11 -9
- smftools/config/default.yaml +123 -19
- smftools/config/direct.yaml +3 -0
- smftools/config/experiment_config.py +120 -19
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/bam_functions.py +28 -29
- smftools/informatics/h5ad_functions.py +1 -1
- smftools/plotting/general_plotting.py +97 -51
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +2 -4
- smftools/preprocessing/append_base_context.py +34 -25
- smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
- smftools/preprocessing/binarize_on_Youden.py +10 -8
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +41 -25
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +94 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py
CHANGED
|
@@ -4,12 +4,11 @@ from pathlib import Path
|
|
|
4
4
|
from typing import Dict, Optional, Sequence
|
|
5
5
|
|
|
6
6
|
from .cli.load_adata import load_adata
|
|
7
|
-
from .cli.cli_flows import flow_I
|
|
8
7
|
from .cli.preprocess_adata import preprocess_adata
|
|
9
8
|
from .cli.spatial_adata import spatial_adata
|
|
10
9
|
from .cli.hmm_adata import hmm_adata
|
|
11
10
|
|
|
12
|
-
from .readwrite import
|
|
11
|
+
from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
|
|
13
12
|
|
|
14
13
|
@click.group()
|
|
15
14
|
def cli():
|
|
@@ -244,9 +243,9 @@ def concatenate_cmd(
|
|
|
244
243
|
|
|
245
244
|
Two modes:
|
|
246
245
|
|
|
247
|
-
smftools concatenate out.h5ad --input-dir ./dir
|
|
246
|
+
smftools concatenate out.h5ad.gz --input-dir ./dir
|
|
248
247
|
|
|
249
|
-
smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
|
|
248
|
+
smftools concatenate out.h5ad.gz --csv-path paths.csv --csv-column h5ad_path
|
|
250
249
|
|
|
251
250
|
TXT input also works (one file path per line).
|
|
252
251
|
|
|
@@ -266,7 +265,7 @@ def concatenate_cmd(
|
|
|
266
265
|
delete_inputs=delete,
|
|
267
266
|
restore_backups=restore,
|
|
268
267
|
)
|
|
269
|
-
click.echo(f"
|
|
268
|
+
click.echo(f"Concatenated file written to: {out}")
|
|
270
269
|
|
|
271
270
|
except Exception as e:
|
|
272
271
|
raise click.ClickException(str(e)) from e
|
smftools/config/conversion.yaml
CHANGED
|
@@ -9,6 +9,13 @@ conversion_types:
|
|
|
9
9
|
# Read QC Params
|
|
10
10
|
read_mod_filtering_use_other_c_as_background: True
|
|
11
11
|
|
|
12
|
+
# Spatial Analysis - Clustermap params
|
|
13
|
+
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
14
|
+
clustermap_cmap_c: "coolwarm"
|
|
15
|
+
clustermap_cmap_gpc: "coolwarm"
|
|
16
|
+
clustermap_cmap_cpg: "viridis"
|
|
17
|
+
clustermap_cmap_a: "coolwarm"
|
|
18
|
+
|
|
12
19
|
######## smftools hmm params #########
|
|
13
20
|
# HMM
|
|
14
21
|
cpg: True # whether to use the default HMM endogenous CpG patch params
|
|
@@ -18,17 +25,17 @@ hmm_feature_sets:
|
|
|
18
25
|
footprint:
|
|
19
26
|
state: "Non-Modified"
|
|
20
27
|
features:
|
|
21
|
-
small_bound_stretch: [
|
|
22
|
-
medium_bound_stretch: [
|
|
23
|
-
putative_nucleosome: [
|
|
28
|
+
small_bound_stretch: [6, 40]
|
|
29
|
+
medium_bound_stretch: [40, 100]
|
|
30
|
+
putative_nucleosome: [100, 200]
|
|
24
31
|
large_bound_stretch: [200, inf]
|
|
25
32
|
accessible:
|
|
26
33
|
state: "Modified"
|
|
27
34
|
features:
|
|
28
35
|
small_accessible_patch: [3, 20]
|
|
29
36
|
mid_accessible_patch: [20, 40]
|
|
30
|
-
|
|
31
|
-
|
|
37
|
+
large_accessible_patch: [40, 110]
|
|
38
|
+
nucleosome_depleted_region: [110, inf]
|
|
32
39
|
cpg:
|
|
33
40
|
state: "Modified"
|
|
34
41
|
features:
|
smftools/config/deaminase.yaml
CHANGED
|
@@ -7,6 +7,8 @@ conversion_types:
|
|
|
7
7
|
|
|
8
8
|
mod_target_bases:
|
|
9
9
|
- "C"
|
|
10
|
+
enzyme_target_bases:
|
|
11
|
+
- "C"
|
|
10
12
|
|
|
11
13
|
######## smftools preprocess params #########
|
|
12
14
|
read_mod_filtering_gpc_thresholds:
|
|
@@ -15,7 +17,7 @@ read_mod_filtering_gpc_thresholds:
|
|
|
15
17
|
read_mod_filtering_cpg_thresholds:
|
|
16
18
|
- null
|
|
17
19
|
- null
|
|
18
|
-
|
|
20
|
+
read_mod_filtering_c_thresholds:
|
|
19
21
|
- 0.01
|
|
20
22
|
- 0.99
|
|
21
23
|
read_mod_filtering_a_thresholds:
|
|
@@ -26,16 +28,16 @@ read_mod_filtering_use_other_c_as_background: False
|
|
|
26
28
|
|
|
27
29
|
# Duplicate Detection Params
|
|
28
30
|
duplicate_detection_site_types:
|
|
29
|
-
- "
|
|
31
|
+
- "C"
|
|
30
32
|
|
|
31
33
|
######## smftools analyze params #########
|
|
32
34
|
# Autocorrelation params
|
|
33
35
|
autocorr_site_types:
|
|
34
|
-
- "
|
|
36
|
+
- "C"
|
|
35
37
|
|
|
36
38
|
# Correlation matrix params
|
|
37
39
|
correlation_matrix_site_types:
|
|
38
|
-
- "
|
|
40
|
+
- "C_site"
|
|
39
41
|
|
|
40
42
|
# ######## smftools hmm params #########
|
|
41
43
|
cpg: False # whether to use the default HMM endogenous CpG patch params
|
|
@@ -45,17 +47,17 @@ hmm_feature_sets:
|
|
|
45
47
|
footprint:
|
|
46
48
|
state: "Non-Modified"
|
|
47
49
|
features:
|
|
48
|
-
small_bound_stretch: [
|
|
49
|
-
medium_bound_stretch: [
|
|
50
|
-
putative_nucleosome: [
|
|
50
|
+
small_bound_stretch: [6, 40]
|
|
51
|
+
medium_bound_stretch: [40, 100]
|
|
52
|
+
putative_nucleosome: [100, 200]
|
|
51
53
|
large_bound_stretch: [200, inf]
|
|
52
54
|
accessible:
|
|
53
55
|
state: "Modified"
|
|
54
56
|
features:
|
|
55
57
|
small_accessible_patch: [3, 20]
|
|
56
58
|
mid_accessible_patch: [20, 40]
|
|
57
|
-
|
|
58
|
-
|
|
59
|
+
large_accessible_patch: [40, 110]
|
|
60
|
+
nucleosome_depleted_region: [110, inf]
|
|
59
61
|
|
|
60
62
|
hmm_merge_layer_features:
|
|
61
63
|
- ["C_all_accessible_features", 80]
|
smftools/config/default.yaml
CHANGED
|
@@ -21,7 +21,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
|
|
|
21
21
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
22
22
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
23
23
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
24
|
-
delete_intermediate_bams:
|
|
24
|
+
delete_intermediate_bams: False # Whether to delete intermediate BAM files.
|
|
25
25
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
26
26
|
|
|
27
27
|
# Sequencing modality and general experiment params
|
|
@@ -40,7 +40,8 @@ model: "hac" # needed for dorado basecaller
|
|
|
40
40
|
filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
|
|
41
41
|
|
|
42
42
|
# Alignment params
|
|
43
|
-
aligner: "
|
|
43
|
+
aligner: "dorado" # Aligner to use: dorado, minimap2
|
|
44
|
+
align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
|
|
44
45
|
aligner_args:
|
|
45
46
|
minimap2:
|
|
46
47
|
ont:
|
|
@@ -116,7 +117,7 @@ read_mod_filtering_gpc_thresholds:
|
|
|
116
117
|
read_mod_filtering_cpg_thresholds:
|
|
117
118
|
- 0.0
|
|
118
119
|
- 1.0
|
|
119
|
-
|
|
120
|
+
read_mod_filtering_c_thresholds:
|
|
120
121
|
- 0.025
|
|
121
122
|
- 0.975
|
|
122
123
|
read_mod_filtering_a_thresholds:
|
|
@@ -125,6 +126,16 @@ read_mod_filtering_a_thresholds:
|
|
|
125
126
|
read_mod_filtering_use_other_c_as_background: False
|
|
126
127
|
min_valid_fraction_positions_in_read_vs_ref: 0.5
|
|
127
128
|
|
|
129
|
+
# Plotting params for read length histograms
|
|
130
|
+
obs_to_plot_pp_qc:
|
|
131
|
+
- read_length
|
|
132
|
+
- mapped_length
|
|
133
|
+
- read_quality
|
|
134
|
+
- mapping_quality
|
|
135
|
+
- mapped_length_to_reference_length_ratio
|
|
136
|
+
- mapped_length_to_read_length_ratio
|
|
137
|
+
- Raw_modification_signal
|
|
138
|
+
|
|
128
139
|
# Duplicate detection params
|
|
129
140
|
duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
|
|
130
141
|
- "GpC"
|
|
@@ -132,7 +143,7 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
|
|
|
132
143
|
- "ambiguous_GpC_CpG"
|
|
133
144
|
duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
|
|
134
145
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
135
|
-
-
|
|
146
|
+
- Fraction_C_site_modified
|
|
136
147
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
|
|
137
148
|
duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
|
|
138
149
|
duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
|
|
@@ -143,29 +154,40 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
|
|
|
143
154
|
# Position QC params
|
|
144
155
|
position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
|
|
145
156
|
|
|
146
|
-
######## smftools
|
|
147
|
-
#
|
|
157
|
+
######## smftools spatial params #########
|
|
158
|
+
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
159
|
+
# Reindexing params
|
|
160
|
+
reindexing_offsets:
|
|
161
|
+
null : null
|
|
162
|
+
reindexed_var_suffix: "reindexed"
|
|
163
|
+
|
|
164
|
+
# Spatial Analysis - QC Plotting params
|
|
148
165
|
rows_per_qc_histogram_grid: 12
|
|
149
166
|
|
|
150
|
-
#
|
|
167
|
+
# Spatial Analysis - Clustermap params
|
|
151
168
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
169
|
+
clustermap_cmap_c: "coolwarm"
|
|
170
|
+
clustermap_cmap_gpc: "coolwarm"
|
|
171
|
+
clustermap_cmap_cpg: "coolwarm"
|
|
172
|
+
clustermap_cmap_a: "coolwarm"
|
|
173
|
+
spatial_clustermap_sortby: "gpc"
|
|
152
174
|
|
|
153
|
-
#
|
|
175
|
+
# Spatial Analysis - UMAP/Leiden params
|
|
154
176
|
layer_for_umap_plotting: 'nan_half'
|
|
155
177
|
umap_layers_to_plot:
|
|
156
178
|
- "mapped_length"
|
|
157
179
|
- "Raw_modification_signal"
|
|
158
180
|
|
|
159
|
-
#
|
|
181
|
+
# Spatial Analysis - Spatial Autocorrelation params
|
|
160
182
|
rows_per_qc_autocorr_grid: 6
|
|
161
183
|
autocorr_rolling_window_size: 25
|
|
162
184
|
autocorr_max_lag: 800
|
|
163
185
|
autocorr_site_types:
|
|
164
186
|
- "GpC"
|
|
165
187
|
- "CpG"
|
|
166
|
-
- "
|
|
188
|
+
- "C"
|
|
167
189
|
|
|
168
|
-
#
|
|
190
|
+
# Spatial Analysis - Correlation Matrix params
|
|
169
191
|
correlation_matrix_types:
|
|
170
192
|
- "pearson"
|
|
171
193
|
- "binary_covariance"
|
|
@@ -204,19 +226,102 @@ hmm_feature_sets:
|
|
|
204
226
|
footprint:
|
|
205
227
|
state: "Non-Modified"
|
|
206
228
|
features:
|
|
207
|
-
small_bound_stretch: [
|
|
208
|
-
medium_bound_stretch: [40,
|
|
209
|
-
putative_nucleosome: [
|
|
229
|
+
small_bound_stretch: [6, 40]
|
|
230
|
+
medium_bound_stretch: [40, 100]
|
|
231
|
+
putative_nucleosome: [100, 200]
|
|
210
232
|
large_bound_stretch: [200, inf]
|
|
211
233
|
accessible:
|
|
212
234
|
state: "Modified"
|
|
213
235
|
features:
|
|
214
236
|
small_accessible_patch: [3, 20]
|
|
215
237
|
mid_accessible_patch: [20, 40]
|
|
216
|
-
|
|
217
|
-
|
|
238
|
+
large_accessible_patch: [40, 110]
|
|
239
|
+
nucleosome_depleted_region: [110, inf]
|
|
218
240
|
hmm_merge_layer_features:
|
|
219
241
|
- [null, 80]
|
|
242
|
+
clustermap_cmap_hmm: "coolwarm"
|
|
243
|
+
hmm_clustermap_feature_layers:
|
|
244
|
+
- all_accessible_features
|
|
245
|
+
- all_accessible_features_merged
|
|
246
|
+
- small_accessible_patch
|
|
247
|
+
- mid_accessible_patch
|
|
248
|
+
- large_accessible_patch
|
|
249
|
+
- nucleosome_depleted_region
|
|
250
|
+
- small_bound_stretch
|
|
251
|
+
- medium_bound_stretch
|
|
252
|
+
- putative_nucleosome
|
|
253
|
+
- large_bound_stretch
|
|
254
|
+
hmm_clustermap_sortby: "hmm"
|
|
255
|
+
hmm_peak_feature_configs:
|
|
256
|
+
all_accessible_features:
|
|
257
|
+
min_distance: 200 # The minimum distance in between called peaks
|
|
258
|
+
peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
|
|
259
|
+
peak_prominence: 0.1 # The minimum prominence to call a peak
|
|
260
|
+
peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
|
|
261
|
+
rolling_window: 50 # Window size for the rolling average smoothing before peak calling
|
|
262
|
+
|
|
263
|
+
all_accessible_features_merged:
|
|
264
|
+
min_distance: 250
|
|
265
|
+
peak_width: 250
|
|
266
|
+
peak_prominence: 0.05
|
|
267
|
+
peak_threshold: 0.80
|
|
268
|
+
rolling_window: 50
|
|
269
|
+
|
|
270
|
+
small_accessible_patch:
|
|
271
|
+
min_distance: 40
|
|
272
|
+
peak_width: 30
|
|
273
|
+
peak_prominence: 0.1
|
|
274
|
+
peak_threshold: 0.8
|
|
275
|
+
rolling_window: 40
|
|
276
|
+
|
|
277
|
+
mid_accessible_patch:
|
|
278
|
+
min_distance: 100
|
|
279
|
+
peak_width: 60
|
|
280
|
+
peak_prominence: 0.025
|
|
281
|
+
peak_threshold: 0.80
|
|
282
|
+
rolling_window: 50
|
|
283
|
+
|
|
284
|
+
large_accessible_patch:
|
|
285
|
+
min_distance: 100
|
|
286
|
+
peak_width: 100
|
|
287
|
+
peak_prominence: 0.025
|
|
288
|
+
peak_threshold: 0.80
|
|
289
|
+
rolling_window: 50
|
|
290
|
+
|
|
291
|
+
nucleosome_depleted_region:
|
|
292
|
+
min_distance: 200
|
|
293
|
+
peak_width: 200
|
|
294
|
+
peak_prominence: 0.025
|
|
295
|
+
peak_threshold: 0.80
|
|
296
|
+
rolling_window: 50
|
|
297
|
+
|
|
298
|
+
small_bound_stretch:
|
|
299
|
+
min_distance: 20
|
|
300
|
+
peak_width: 20
|
|
301
|
+
peak_prominence: 0.01
|
|
302
|
+
peak_threshold: 0.50
|
|
303
|
+
rolling_window: 10
|
|
304
|
+
|
|
305
|
+
medium_bound_stretch:
|
|
306
|
+
min_distance: 40
|
|
307
|
+
peak_width: 40
|
|
308
|
+
peak_prominence: 0.01
|
|
309
|
+
peak_threshold: 0.50
|
|
310
|
+
rolling_window: 20
|
|
311
|
+
|
|
312
|
+
putative_nucleosome:
|
|
313
|
+
min_distance: 160
|
|
314
|
+
peak_width: 147 # canonical nucleosome footprint
|
|
315
|
+
peak_prominence: 0.025
|
|
316
|
+
peak_threshold: 0.60
|
|
317
|
+
rolling_window: 20
|
|
318
|
+
|
|
319
|
+
large_bound_stretch:
|
|
320
|
+
min_distance: 250
|
|
321
|
+
peak_width: 300
|
|
322
|
+
peak_prominence: 0.20
|
|
323
|
+
peak_threshold: 0.80
|
|
324
|
+
rolling_window: 50
|
|
220
325
|
|
|
221
326
|
# Pipeline control flow - load adata
|
|
222
327
|
force_redo_load_adata: False # Whether to perform load adata command from start
|
|
@@ -230,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
|
|
|
230
335
|
force_redo_clean_nan: False # Whether to redo NaN cleaning
|
|
231
336
|
bypass_append_base_context: False # Whether to skip adding per reference base context additions.
|
|
232
337
|
force_redo_append_base_context: False # Whether to redo per reference base context additions.
|
|
233
|
-
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
234
338
|
bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
|
|
235
339
|
force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
|
|
236
340
|
bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
|
|
@@ -242,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
|
|
|
242
346
|
bypass_complexity_analysis: False # Whether to skip complexity analysis
|
|
243
347
|
force_redo_complexity_analysis: False # Whether to redo complexity analysis
|
|
244
348
|
|
|
245
|
-
# Pipeline control flow -
|
|
246
|
-
|
|
349
|
+
# Pipeline control flow - Spatial Analyses
|
|
350
|
+
force_redo_spatial_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
|
|
247
351
|
bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
|
|
248
352
|
force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
|
|
249
353
|
bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
|
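smftools/config/direct.yaml
CHANGED
(The +3 -0 hunk for direct.yaml is not present in this excerpt; the hunks that follow are Python and belong to smftools/config/experiment_config.py.)
smftools/config/experiment_config.py
CHANGED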
|
@@ -214,7 +214,7 @@ def resolve_aligner_args(
|
|
|
214
214
|
return list(default_by_aligner.get(key_align, []))
|
|
215
215
|
|
|
216
216
|
|
|
217
|
-
# HMM default params and
|
|
217
|
+
# HMM default params and helper functions
|
|
218
218
|
def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
219
219
|
"""
|
|
220
220
|
Normalize user-provided `hmm_feature_sets` into canonical structure:
|
|
@@ -275,6 +275,58 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
|
|
|
275
275
|
canonical[grp] = {"features": feats, "state": state}
|
|
276
276
|
return canonical
|
|
277
277
|
|
|
278
|
+
def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
    """
    Normalize user-provided `hmm_peak_feature_configs` into:
      {
        layer_name: {
          "min_distance": int,
          "peak_width": int,
          "peak_prominence": float,
          "peak_threshold": float,
          "rolling_window": int,
        },
        ...
      }

    Accepts dict, JSON/string, None. Returns {} for empty input or for any
    input that does not resolve to a dict.

    Per-layer values may be:
      - a dict with any subset of the keys above (missing keys get defaults),
      - None (all defaults),
      - any other scalar, taken as shorthand for `peak_width`.

    A key explicitly set to None (e.g. YAML `peak_width: null`) falls back to
    its default instead of failing in the int()/float() cast.

    Raises:
        ValueError / TypeError: if a supplied non-None value cannot be cast
        to the expected numeric type.
    """
    if raw is None:
        return {}

    # String input is handed to the module's parser helper; anything that
    # does not end up as a dict is treated as "no configuration".
    parsed = _try_json_or_literal(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, dict):
        return {}

    # key -> (default value, cast applied to the final value)
    spec = {
        "min_distance": (200, int),
        "peak_width": (200, int),
        "peak_prominence": (0.2, float),
        "peak_threshold": (0.8, float),
        "rolling_window": (1, int),
    }

    out: Dict[str, dict] = {}
    for layer, conf in parsed.items():
        if conf is None:
            conf = {}
        elif not isinstance(conf, dict):
            # allow shorthand like 300 -> interpreted as peak_width
            conf = {"peak_width": conf}

        normalized = {}
        for key, (default, cast) in spec.items():
            value = conf.get(key)
            # None means "not set": keep the default rather than cast None.
            normalized[key] = cast(default if value is None else value)
        out[str(layer)] = normalized
    return out
|
|
329
|
+
|
|
278
330
|
|
|
279
331
|
# -------------------------
|
|
280
332
|
# LoadExperimentConfig
|
|
@@ -612,7 +664,7 @@ class ExperimentConfig:
|
|
|
612
664
|
fastq_auto_pairing: bool = True
|
|
613
665
|
|
|
614
666
|
# Remove intermediate file options
|
|
615
|
-
delete_intermediate_bams: bool =
|
|
667
|
+
delete_intermediate_bams: bool = False
|
|
616
668
|
delete_intermediate_tsvs: bool = True
|
|
617
669
|
|
|
618
670
|
# Conversion/Deamination file handling
|
|
@@ -647,11 +699,13 @@ class ExperimentConfig:
|
|
|
647
699
|
m5C_threshold: float = 0.7
|
|
648
700
|
hm5C_threshold: float = 0.7
|
|
649
701
|
thresholds: List[float] = field(default_factory=list)
|
|
650
|
-
mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
|
|
702
|
+
mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
|
|
703
|
+
mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
|
|
651
704
|
|
|
652
705
|
# Alignment params
|
|
653
706
|
mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
|
|
654
|
-
|
|
707
|
+
align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
|
|
708
|
+
aligner: str = "dorado"
|
|
655
709
|
aligner_args: Optional[List[str]] = None
|
|
656
710
|
make_bigwigs: bool = False
|
|
657
711
|
make_beds: bool = False
|
|
@@ -671,6 +725,10 @@ class ExperimentConfig:
|
|
|
671
725
|
read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
|
|
672
726
|
read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
673
727
|
|
|
728
|
+
# Preprocessing - Optional reindexing params
|
|
729
|
+
reindexing_offsets: Dict[str, int] = field(default_factory=dict)
|
|
730
|
+
reindexed_var_suffix: Optional[str] = "reindexed"
|
|
731
|
+
|
|
674
732
|
# Preprocessing - Direct mod detection binarization params
|
|
675
733
|
fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
|
|
676
734
|
binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
@@ -684,15 +742,18 @@ class ExperimentConfig:
|
|
|
684
742
|
# Preprocessing - Read modification filter params
|
|
685
743
|
read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
686
744
|
read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
|
|
687
|
-
|
|
745
|
+
read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
688
746
|
read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
|
|
689
747
|
read_mod_filtering_use_other_c_as_background: bool = True
|
|
690
748
|
min_valid_fraction_positions_in_read_vs_ref: float = 0.2
|
|
691
749
|
|
|
750
|
+
# Preprocessing - plotting params
|
|
751
|
+
obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
|
|
752
|
+
|
|
692
753
|
# Preprocessing - Duplicate detection params
|
|
693
754
|
duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
|
|
694
755
|
duplicate_detection_distance_threshold: float = 0.07
|
|
695
|
-
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['
|
|
756
|
+
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
|
|
696
757
|
duplicate_detection_keep_best_metric: str ='read_quality'
|
|
697
758
|
duplicate_detection_window_size_for_hamming_neighbors: int = 50
|
|
698
759
|
duplicate_detection_min_overlapping_positions: int = 20
|
|
@@ -703,20 +764,25 @@ class ExperimentConfig:
|
|
|
703
764
|
# Preprocessing - Position QC
|
|
704
765
|
position_max_nan_threshold: float = 0.1
|
|
705
766
|
|
|
706
|
-
#
|
|
767
|
+
# Spatial Analysis - Clustermap params
|
|
707
768
|
layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
|
|
769
|
+
clustermap_cmap_c: Optional[str] = 'coolwarm'
|
|
770
|
+
clustermap_cmap_gpc: Optional[str] = 'coolwarm'
|
|
771
|
+
clustermap_cmap_cpg: Optional[str] = 'coolwarm'
|
|
772
|
+
clustermap_cmap_a: Optional[str] = 'coolwarm'
|
|
773
|
+
spatial_clustermap_sortby: Optional[str] = 'gpc'
|
|
708
774
|
|
|
709
|
-
#
|
|
775
|
+
# Spatial Analysis - UMAP/Leiden params
|
|
710
776
|
layer_for_umap_plotting: Optional[str] = 'nan_half'
|
|
711
777
|
umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
|
|
712
778
|
|
|
713
|
-
#
|
|
779
|
+
# Spatial Analysis - Spatial Autocorrelation params
|
|
714
780
|
rows_per_qc_autocorr_grid: int = 12
|
|
715
781
|
autocorr_rolling_window_size: int = 25
|
|
716
782
|
autocorr_max_lag: int = 800
|
|
717
|
-
autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', '
|
|
783
|
+
autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
|
|
718
784
|
|
|
719
|
-
#
|
|
785
|
+
# Spatial Analysis - Correlation Matrix params
|
|
720
786
|
correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
|
|
721
787
|
correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
|
|
722
788
|
correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
|
|
@@ -738,6 +804,10 @@ class ExperimentConfig:
|
|
|
738
804
|
cpg: Optional[bool] = False
|
|
739
805
|
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
740
806
|
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
|
|
807
|
+
clustermap_cmap_hmm: Optional[str] = 'coolwarm'
|
|
808
|
+
hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
|
|
809
|
+
hmm_clustermap_sortby: Optional[str] = 'hmm'
|
|
810
|
+
hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
|
|
741
811
|
|
|
742
812
|
# Pipeline control flow - load adata
|
|
743
813
|
force_redo_load_adata: bool = False
|
|
@@ -763,8 +833,8 @@ class ExperimentConfig:
|
|
|
763
833
|
bypass_complexity_analysis: bool = False
|
|
764
834
|
force_redo_complexity_analysis: bool = False
|
|
765
835
|
|
|
766
|
-
# Pipeline control flow -
|
|
767
|
-
|
|
836
|
+
# Pipeline control flow - Spatial Analyses
|
|
837
|
+
force_redo_spatial_analyses: bool = False
|
|
768
838
|
bypass_basic_clustermaps: bool = False
|
|
769
839
|
force_redo_basic_clustermaps: bool = False
|
|
770
840
|
bypass_basic_umap: bool = False
|
|
@@ -930,7 +1000,14 @@ class ExperimentConfig:
|
|
|
930
1000
|
input_type = "h5ad"
|
|
931
1001
|
input_files = found["h5ad_paths"]
|
|
932
1002
|
|
|
933
|
-
print(
|
|
1003
|
+
print(
|
|
1004
|
+
f"Found {found['all_files_searched']} files; "
|
|
1005
|
+
f"fastq={len(found['fastq_paths'])}, "
|
|
1006
|
+
f"bam={len(found['bam_paths'])}, "
|
|
1007
|
+
f"pod5={len(found['pod5_paths'])}, "
|
|
1008
|
+
f"fast5={len(found['fast5_paths'])}, "
|
|
1009
|
+
f"h5ad={len(found['h5ad_paths'])}"
|
|
1010
|
+
)
|
|
934
1011
|
|
|
935
1012
|
# summary file output path
|
|
936
1013
|
output_dir = Path(merged['output_directory'])
|
|
@@ -981,6 +1058,9 @@ class ExperimentConfig:
|
|
|
981
1058
|
if "mod_list" in merged:
|
|
982
1059
|
merged["mod_list"] = _parse_list(merged.get("mod_list"))
|
|
983
1060
|
|
|
1061
|
+
# Preprocessing args
|
|
1062
|
+
obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
|
|
1063
|
+
|
|
984
1064
|
# HMM feature set handling
|
|
985
1065
|
if "hmm_feature_sets" in merged:
|
|
986
1066
|
merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
|
|
@@ -1016,6 +1096,13 @@ class ExperimentConfig:
|
|
|
1016
1096
|
hmm_methbases = ['C']
|
|
1017
1097
|
hmm_methbases = list(hmm_methbases)
|
|
1018
1098
|
hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
|
|
1099
|
+
hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
|
|
1100
|
+
|
|
1101
|
+
# HMM peak feature configs (for call_hmm_peaks)
|
|
1102
|
+
merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
|
|
1103
|
+
merged.get("hmm_peak_feature_configs", {})
|
|
1104
|
+
)
|
|
1105
|
+
hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
|
|
1019
1106
|
|
|
1020
1107
|
# instantiate dataclass
|
|
1021
1108
|
instance = cls(
|
|
@@ -1047,8 +1134,9 @@ class ExperimentConfig:
|
|
|
1047
1134
|
threads = merged.get("threads"),
|
|
1048
1135
|
sample_sheet_path = merged.get("sample_sheet_path"),
|
|
1049
1136
|
sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
|
|
1050
|
-
delete_intermediate_bams = merged.get("delete_intermediate_bams",
|
|
1137
|
+
delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
|
|
1051
1138
|
delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
|
|
1139
|
+
align_from_bam = merged.get("align_from_bam", False),
|
|
1052
1140
|
aligner = merged.get("aligner", "minimap2"),
|
|
1053
1141
|
aligner_args = merged.get("aligner_args", None),
|
|
1054
1142
|
device = merged.get("device", "auto"),
|
|
@@ -1070,6 +1158,7 @@ class ExperimentConfig:
|
|
|
1070
1158
|
reference_column = merged.get("reference_column", 'Reference_strand'),
|
|
1071
1159
|
sample_column = merged.get("sample_column", 'Barcode'),
|
|
1072
1160
|
sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
|
|
1161
|
+
obs_to_plot_pp_qc = obs_to_plot_pp_qc,
|
|
1073
1162
|
fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
|
|
1074
1163
|
binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
|
|
1075
1164
|
positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
|
|
@@ -1078,14 +1167,21 @@ class ExperimentConfig:
|
|
|
1078
1167
|
inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
|
|
1079
1168
|
fit_j_threshold = merged.get("fit_j_threshold", 0.5),
|
|
1080
1169
|
output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
|
|
1170
|
+
reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
|
|
1171
|
+
reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
|
|
1081
1172
|
layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
|
|
1173
|
+
clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
|
|
1174
|
+
clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
|
|
1175
|
+
clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
|
|
1176
|
+
clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
|
|
1177
|
+
spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
|
|
1082
1178
|
layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
|
|
1083
1179
|
umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
|
|
1084
1180
|
rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
|
|
1085
1181
|
rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
|
|
1086
1182
|
autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
|
|
1087
1183
|
autocorr_max_lag = merged.get("autocorr_max_lag", 800),
|
|
1088
|
-
autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', '
|
|
1184
|
+
autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
|
|
1089
1185
|
hmm_n_states = merged.get("hmm_n_states", 2),
|
|
1090
1186
|
hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
|
|
1091
1187
|
hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
|
|
@@ -1099,6 +1195,10 @@ class ExperimentConfig:
|
|
|
1099
1195
|
hmm_methbases = hmm_methbases,
|
|
1100
1196
|
hmm_device = hmm_device,
|
|
1101
1197
|
hmm_merge_layer_features = hmm_merge_layer_features,
|
|
1198
|
+
clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
|
|
1199
|
+
hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
|
|
1200
|
+
hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
|
|
1201
|
+
hmm_peak_feature_configs = hmm_peak_feature_configs,
|
|
1102
1202
|
footprints = merged.get("footprints", None),
|
|
1103
1203
|
accessible_patches = merged.get("accessible_patches", None),
|
|
1104
1204
|
cpg = merged.get("cpg", None),
|
|
@@ -1109,7 +1209,7 @@ class ExperimentConfig:
|
|
|
1109
1209
|
read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
|
|
1110
1210
|
read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
|
|
1111
1211
|
read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
|
|
1112
|
-
|
|
1212
|
+
read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
|
|
1113
1213
|
read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
|
|
1114
1214
|
read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
|
|
1115
1215
|
min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
|
|
@@ -1125,7 +1225,7 @@ class ExperimentConfig:
|
|
|
1125
1225
|
correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
|
|
1126
1226
|
correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
|
|
1127
1227
|
correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
|
|
1128
|
-
hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['
|
|
1228
|
+
hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
|
|
1129
1229
|
force_redo_load_adata = merged.get("force_redo_load_adata", False),
|
|
1130
1230
|
force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
|
|
1131
1231
|
force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
|
|
@@ -1146,7 +1246,7 @@ class ExperimentConfig:
|
|
|
1146
1246
|
force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
|
|
1147
1247
|
bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
|
|
1148
1248
|
force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
|
|
1149
|
-
|
|
1249
|
+
force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
|
|
1150
1250
|
bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
|
|
1151
1251
|
force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
|
|
1152
1252
|
bypass_basic_umap = merged.get("bypass_basic_umap", False),
|
|
@@ -1198,6 +1298,7 @@ class ExperimentConfig:
|
|
|
1198
1298
|
# -------------------------
|
|
1199
1299
|
# validation & serialization
|
|
1200
1300
|
# -------------------------
|
|
1301
|
+
@staticmethod
|
|
1201
1302
|
def _validate_hmm_features_structure(hfs: dict) -> List[str]:
|
|
1202
1303
|
errs = []
|
|
1203
1304
|
if not isinstance(hfs, dict):
|