smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +54 -0
- smftools/cli/hmm_adata.py +937 -256
- smftools/cli/load_adata.py +448 -268
- smftools/cli/preprocess_adata.py +469 -263
- smftools/cli/spatial_adata.py +536 -319
- smftools/cli_entry.py +97 -182
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +17 -6
- smftools/config/deaminase.yaml +12 -10
- smftools/config/default.yaml +142 -33
- smftools/config/direct.yaml +11 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +594 -264
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2128 -1418
- smftools/hmm/__init__.py +2 -9
- smftools/hmm/archived/call_hmm_peaks.py +121 -0
- smftools/hmm/call_hmm_peaks.py +299 -91
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +397 -175
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +196 -30
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +422 -197
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +147 -87
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +10 -12
- smftools/preprocessing/append_base_context.py +115 -80
- smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
- smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +129 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +50 -25
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +118 -54
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +689 -272
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +103 -0
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +331 -82
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.3.dist-info/RECORD +0 -173
- /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/config/default.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# General
|
|
2
2
|
sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
|
|
3
|
-
sample_sheet_mapping_column: '
|
|
4
|
-
sample_name_col_for_plotting: '
|
|
3
|
+
sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
|
|
4
|
+
sample_name_col_for_plotting: 'Experiment_name_and_barcode'
|
|
5
5
|
|
|
6
6
|
# Compute params
|
|
7
7
|
threads: 4
|
|
@@ -9,9 +9,7 @@ device: "auto"
|
|
|
9
9
|
|
|
10
10
|
######## smftools load params #########
|
|
11
11
|
# Generic i/o
|
|
12
|
-
bam_suffix: ".bam"
|
|
13
12
|
recursive_input_search: True
|
|
14
|
-
split_dir: "demultiplexed_BAMs"
|
|
15
13
|
strands:
|
|
16
14
|
- bottom
|
|
17
15
|
- top
|
|
@@ -40,7 +38,8 @@ model: "hac" # needed for dorado basecaller
|
|
|
40
38
|
filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
|
|
41
39
|
|
|
42
40
|
# Alignment params
|
|
43
|
-
aligner: "
|
|
41
|
+
aligner: "dorado" # Aligner to use: dorado, minimap2
|
|
42
|
+
align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
|
|
44
43
|
aligner_args:
|
|
45
44
|
minimap2:
|
|
46
45
|
ont:
|
|
@@ -52,7 +51,6 @@ aligner_args:
|
|
|
52
51
|
- '-y'
|
|
53
52
|
- '-N'
|
|
54
53
|
- '5'
|
|
55
|
-
- '--secondary=no'
|
|
56
54
|
pacbio:
|
|
57
55
|
- '-a'
|
|
58
56
|
- '-x'
|
|
@@ -62,7 +60,6 @@ aligner_args:
|
|
|
62
60
|
- '-y'
|
|
63
61
|
- '-N'
|
|
64
62
|
- '5'
|
|
65
|
-
- '--secondary=no'
|
|
66
63
|
illumina:
|
|
67
64
|
- '-a'
|
|
68
65
|
- '-x'
|
|
@@ -72,7 +69,6 @@ aligner_args:
|
|
|
72
69
|
- '-y'
|
|
73
70
|
- '-N'
|
|
74
71
|
- '5'
|
|
75
|
-
- '--secondary=no'
|
|
76
72
|
dorado:
|
|
77
73
|
ont:
|
|
78
74
|
- "--mm2-opts"
|
|
@@ -87,9 +83,9 @@ barcode_both_ends: False # dorado demultiplexing
|
|
|
87
83
|
trim: False # dorado adapter and barcode removal during demultiplexing
|
|
88
84
|
|
|
89
85
|
# Anndata structure
|
|
90
|
-
mapping_threshold: 0.
|
|
86
|
+
mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
|
|
91
87
|
reference_column: 'Reference_strand'
|
|
92
|
-
sample_column: '
|
|
88
|
+
sample_column: 'Experiment_name_and_barcode'
|
|
93
89
|
|
|
94
90
|
######## smftools preprocess params #########
|
|
95
91
|
# Read length, quality, and mapping filtering params
|
|
@@ -100,7 +96,7 @@ read_len_filter_thresholds:
|
|
|
100
96
|
- 100
|
|
101
97
|
- null
|
|
102
98
|
read_len_to_ref_ratio_filter_thresholds:
|
|
103
|
-
-
|
|
99
|
+
- null
|
|
104
100
|
- null
|
|
105
101
|
read_quality_filter_thresholds:
|
|
106
102
|
- 15
|
|
@@ -116,7 +112,7 @@ read_mod_filtering_gpc_thresholds:
|
|
|
116
112
|
read_mod_filtering_cpg_thresholds:
|
|
117
113
|
- 0.0
|
|
118
114
|
- 1.0
|
|
119
|
-
|
|
115
|
+
read_mod_filtering_c_thresholds:
|
|
120
116
|
- 0.025
|
|
121
117
|
- 0.975
|
|
122
118
|
read_mod_filtering_a_thresholds:
|
|
@@ -125,6 +121,16 @@ read_mod_filtering_a_thresholds:
|
|
|
125
121
|
read_mod_filtering_use_other_c_as_background: False
|
|
126
122
|
min_valid_fraction_positions_in_read_vs_ref: 0.5
|
|
127
123
|
|
|
124
|
+
# Plotting params for read length histograms
|
|
125
|
+
obs_to_plot_pp_qc:
|
|
126
|
+
- read_length
|
|
127
|
+
- mapped_length
|
|
128
|
+
- read_quality
|
|
129
|
+
- mapping_quality
|
|
130
|
+
- mapped_length_to_reference_length_ratio
|
|
131
|
+
- mapped_length_to_read_length_ratio
|
|
132
|
+
- Raw_modification_signal
|
|
133
|
+
|
|
128
134
|
# Duplicate detection params
|
|
129
135
|
duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
|
|
130
136
|
- "GpC"
|
|
@@ -132,7 +138,7 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
|
|
|
132
138
|
- "ambiguous_GpC_CpG"
|
|
133
139
|
duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
|
|
134
140
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
135
|
-
-
|
|
141
|
+
- Fraction_C_site_modified
|
|
136
142
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
|
|
137
143
|
duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
|
|
138
144
|
duplicate_detection_min_overlapping_positions: 20 # The minimum number of valid overlapping positions required for duplicate detection to work
|
|
@@ -143,29 +149,39 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
|
|
|
143
149
|
# Position QC params
|
|
144
150
|
position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
|
|
145
151
|
|
|
146
|
-
######## smftools
|
|
147
|
-
#
|
|
152
|
+
######## smftools spatial params #########
|
|
153
|
+
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
154
|
+
# Reindexing params
|
|
155
|
+
reindexing_offsets:
|
|
156
|
+
null : null
|
|
157
|
+
reindexed_var_suffix: "reindexed"
|
|
158
|
+
|
|
159
|
+
# Spatial Analysis - QC Plotting params
|
|
148
160
|
rows_per_qc_histogram_grid: 12
|
|
149
161
|
|
|
150
|
-
#
|
|
162
|
+
# Spatial Analysis - Clustermap params
|
|
151
163
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
164
|
+
clustermap_cmap_c: "coolwarm"
|
|
165
|
+
clustermap_cmap_gpc: "coolwarm"
|
|
166
|
+
clustermap_cmap_cpg: "coolwarm"
|
|
167
|
+
clustermap_cmap_a: "coolwarm"
|
|
168
|
+
spatial_clustermap_sortby: "gpc"
|
|
152
169
|
|
|
153
|
-
#
|
|
170
|
+
# Spatial Analysis - UMAP/Leiden params
|
|
154
171
|
layer_for_umap_plotting: 'nan_half'
|
|
155
172
|
umap_layers_to_plot:
|
|
156
173
|
- "mapped_length"
|
|
157
174
|
- "Raw_modification_signal"
|
|
158
175
|
|
|
159
|
-
#
|
|
176
|
+
# Spatial Analysis - Spatial Autocorrelation params
|
|
177
|
+
autocorr_normalization_method: "pearson" # options are pearson or sum
|
|
160
178
|
rows_per_qc_autocorr_grid: 6
|
|
161
179
|
autocorr_rolling_window_size: 25
|
|
162
180
|
autocorr_max_lag: 800
|
|
163
181
|
autocorr_site_types:
|
|
164
182
|
- "GpC"
|
|
165
|
-
- "CpG"
|
|
166
|
-
- "any_C"
|
|
167
183
|
|
|
168
|
-
#
|
|
184
|
+
# Spatial Analysis - Correlation Matrix params
|
|
169
185
|
correlation_matrix_types:
|
|
170
186
|
- "pearson"
|
|
171
187
|
- "binary_covariance"
|
|
@@ -188,10 +204,19 @@ hmm_init_start_probs:
|
|
|
188
204
|
- 0.5
|
|
189
205
|
- 0.5
|
|
190
206
|
hmm_eps: 1e-8
|
|
207
|
+
# Fitting strategy
|
|
208
|
+
hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
|
|
209
|
+
hmm_shared_scope: ["reference", "methbase"]
|
|
210
|
+
hmm_groupby: ["sample", "reference", "methbase"]
|
|
211
|
+
# If hmm_fit_strategy == shared_transitions
|
|
212
|
+
hmm_adapt_emissions: true
|
|
213
|
+
hmm_adapt_startprobs: true
|
|
214
|
+
hmm_emission_adapt_iters: 5
|
|
215
|
+
hmm_emission_adapt_tol: 1.0e-4
|
|
191
216
|
hmm_dtype: "float64"
|
|
192
|
-
hmm_annotation_threshold: 0.5
|
|
193
|
-
hmm_batch_size: 1024
|
|
194
|
-
hmm_use_viterbi: False
|
|
217
|
+
hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
|
|
218
|
+
hmm_batch_size: 1024 # hmm batch size
|
|
219
|
+
hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
|
|
195
220
|
footprints: True # whether to use the default HMM footprint params
|
|
196
221
|
accessible_patches: True # whether to use the default HMM accessible patch params
|
|
197
222
|
cpg: False # whether to use the default HMM endogenous CpG patch params
|
|
@@ -204,19 +229,104 @@ hmm_feature_sets:
|
|
|
204
229
|
footprint:
|
|
205
230
|
state: "Non-Modified"
|
|
206
231
|
features:
|
|
207
|
-
small_bound_stretch: [
|
|
208
|
-
medium_bound_stretch: [40,
|
|
209
|
-
putative_nucleosome: [
|
|
232
|
+
small_bound_stretch: [6, 40]
|
|
233
|
+
medium_bound_stretch: [40, 100]
|
|
234
|
+
putative_nucleosome: [100, 200]
|
|
210
235
|
large_bound_stretch: [200, inf]
|
|
211
236
|
accessible:
|
|
212
237
|
state: "Modified"
|
|
213
238
|
features:
|
|
214
239
|
small_accessible_patch: [3, 20]
|
|
215
240
|
mid_accessible_patch: [20, 40]
|
|
216
|
-
|
|
217
|
-
|
|
241
|
+
large_accessible_patch: [40, 110]
|
|
242
|
+
nucleosome_depleted_region: [110, inf]
|
|
218
243
|
hmm_merge_layer_features:
|
|
219
|
-
- [
|
|
244
|
+
- ["all_accessible_features", 60]
|
|
245
|
+
clustermap_cmap_hmm: "coolwarm"
|
|
246
|
+
hmm_clustermap_feature_layers:
|
|
247
|
+
- all_accessible_features
|
|
248
|
+
- all_accessible_features_merged
|
|
249
|
+
- small_accessible_patch
|
|
250
|
+
- mid_accessible_patch
|
|
251
|
+
- large_accessible_patch
|
|
252
|
+
- large_accessible_patch_merged
|
|
253
|
+
- nucleosome_depleted_region
|
|
254
|
+
- nucleosome_depleted_region_merged
|
|
255
|
+
- small_bound_stretch
|
|
256
|
+
- medium_bound_stretch
|
|
257
|
+
- putative_nucleosome
|
|
258
|
+
- large_bound_stretch
|
|
259
|
+
hmm_clustermap_sortby: "hmm"
|
|
260
|
+
hmm_peak_feature_configs:
|
|
261
|
+
all_accessible_features:
|
|
262
|
+
min_distance: 200 # The minimum distance in between called peaks
|
|
263
|
+
peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
|
|
264
|
+
peak_prominence: 0.1 # The minimum prominence to call a peak
|
|
265
|
+
peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
|
|
266
|
+
rolling_window: 50 # Window size for the rolling average smoothing before peak calling
|
|
267
|
+
|
|
268
|
+
all_accessible_features_merged:
|
|
269
|
+
min_distance: 250
|
|
270
|
+
peak_width: 250
|
|
271
|
+
peak_prominence: 0.05
|
|
272
|
+
peak_threshold: 0.80
|
|
273
|
+
rolling_window: 50
|
|
274
|
+
|
|
275
|
+
small_accessible_patch:
|
|
276
|
+
min_distance: 40
|
|
277
|
+
peak_width: 30
|
|
278
|
+
peak_prominence: 0.1
|
|
279
|
+
peak_threshold: 0.8
|
|
280
|
+
rolling_window: 40
|
|
281
|
+
|
|
282
|
+
mid_accessible_patch:
|
|
283
|
+
min_distance: 100
|
|
284
|
+
peak_width: 60
|
|
285
|
+
peak_prominence: 0.025
|
|
286
|
+
peak_threshold: 0.80
|
|
287
|
+
rolling_window: 50
|
|
288
|
+
|
|
289
|
+
large_accessible_patch:
|
|
290
|
+
min_distance: 100
|
|
291
|
+
peak_width: 100
|
|
292
|
+
peak_prominence: 0.025
|
|
293
|
+
peak_threshold: 0.80
|
|
294
|
+
rolling_window: 50
|
|
295
|
+
|
|
296
|
+
nucleosome_depleted_region:
|
|
297
|
+
min_distance: 200
|
|
298
|
+
peak_width: 200
|
|
299
|
+
peak_prominence: 0.025
|
|
300
|
+
peak_threshold: 0.80
|
|
301
|
+
rolling_window: 50
|
|
302
|
+
|
|
303
|
+
small_bound_stretch:
|
|
304
|
+
min_distance: 20
|
|
305
|
+
peak_width: 20
|
|
306
|
+
peak_prominence: 0.01
|
|
307
|
+
peak_threshold: 0.50
|
|
308
|
+
rolling_window: 10
|
|
309
|
+
|
|
310
|
+
medium_bound_stretch:
|
|
311
|
+
min_distance: 40
|
|
312
|
+
peak_width: 40
|
|
313
|
+
peak_prominence: 0.01
|
|
314
|
+
peak_threshold: 0.50
|
|
315
|
+
rolling_window: 20
|
|
316
|
+
|
|
317
|
+
putative_nucleosome:
|
|
318
|
+
min_distance: 160
|
|
319
|
+
peak_width: 147 # canonical nucleosome footprint
|
|
320
|
+
peak_prominence: 0.025
|
|
321
|
+
peak_threshold: 0.60
|
|
322
|
+
rolling_window: 20
|
|
323
|
+
|
|
324
|
+
large_bound_stretch:
|
|
325
|
+
min_distance: 250
|
|
326
|
+
peak_width: 300
|
|
327
|
+
peak_prominence: 0.20
|
|
328
|
+
peak_threshold: 0.80
|
|
329
|
+
rolling_window: 50
|
|
220
330
|
|
|
221
331
|
# Pipeline control flow - load adata
|
|
222
332
|
force_redo_load_adata: False # Whether to perform load adata command from start
|
|
@@ -230,7 +340,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
|
|
|
230
340
|
force_redo_clean_nan: False # Whether to redo NaN cleaning
|
|
231
341
|
bypass_append_base_context: False # Whether to skip adding per reference base context additions.
|
|
232
342
|
force_redo_append_base_context: False # Whether to redo per reference base context additions.
|
|
233
|
-
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
234
343
|
bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
|
|
235
344
|
force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
|
|
236
345
|
bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
|
|
@@ -242,8 +351,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
|
|
|
242
351
|
bypass_complexity_analysis: False # Whether to skip complexity analysis
|
|
243
352
|
force_redo_complexity_analysis: False # Whether to redo complexity analysis
|
|
244
353
|
|
|
245
|
-
# Pipeline control flow -
|
|
246
|
-
|
|
354
|
+
# Pipeline control flow - Spatial Analyses
|
|
355
|
+
force_redo_spatial_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
|
|
247
356
|
bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
|
|
248
357
|
force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
|
|
249
358
|
bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
|
smftools/config/direct.yaml
CHANGED
|
@@ -14,6 +14,9 @@ thresholds:
|
|
|
14
14
|
mod_list:
|
|
15
15
|
- '5mC_5hmC'
|
|
16
16
|
- '6mA' # mods to detect
|
|
17
|
+
mod_map:
|
|
18
|
+
5mC_5hmC: 5mC
|
|
19
|
+
6mA: 6mA
|
|
17
20
|
mod_target_bases:
|
|
18
21
|
- "A"
|
|
19
22
|
enzyme_target_bases:
|
|
@@ -24,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
|
|
|
24
27
|
|
|
25
28
|
######## smftools preprocess params ########
|
|
26
29
|
fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position-by-position thresholds for modification binarization.
|
|
27
|
-
binarize_on_fixed_methlyation_threshold: 0.
|
|
30
|
+
binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
28
31
|
positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
|
|
29
32
|
negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
|
|
30
|
-
infer_on_percentile_sample_methylation_fitting:
|
|
33
|
+
infer_on_percentile_sample_methylation_fitting: 5 # If positive/negative controls are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
31
34
|
inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
32
35
|
fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
|
|
33
36
|
output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
|
|
@@ -36,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
|
|
|
36
39
|
autocorr_site_types:
|
|
37
40
|
- "A"
|
|
38
41
|
|
|
42
|
+
spatial_clustermap_sortby: "a"
|
|
43
|
+
|
|
39
44
|
######## smftools hmm params #########
|
|
40
45
|
hmm_methbases:
|
|
41
|
-
- "A"
|
|
46
|
+
- "A"
|
|
47
|
+
|
|
48
|
+
hmm_merge_layer_features:
|
|
49
|
+
- ["A_all_accessible_features", 60]
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Dict, List,
|
|
4
|
+
from typing import Any, Dict, List, Union
|
|
5
|
+
|
|
6
|
+
from smftools.constants import BAM_SUFFIX
|
|
7
|
+
|
|
5
8
|
|
|
6
9
|
def discover_input_files(
|
|
7
10
|
input_data_path: Union[str, Path],
|
|
8
|
-
bam_suffix: str =
|
|
11
|
+
bam_suffix: str = BAM_SUFFIX,
|
|
9
12
|
recursive: bool = False,
|
|
10
13
|
follow_symlinks: bool = False,
|
|
11
14
|
) -> Dict[str, Any]:
|
|
@@ -30,10 +33,21 @@ def discover_input_files(
|
|
|
30
33
|
bam_suffix = bam_suffix.lower()
|
|
31
34
|
|
|
32
35
|
# Sets of canonical extension keys we’ll compare against
|
|
33
|
-
pod5_exts
|
|
36
|
+
pod5_exts = {".pod5", ".p5"}
|
|
34
37
|
fast5_exts = {".fast5", ".f5"}
|
|
35
|
-
fastq_exts = {
|
|
36
|
-
|
|
38
|
+
fastq_exts = {
|
|
39
|
+
".fastq",
|
|
40
|
+
".fq",
|
|
41
|
+
".fastq.gz",
|
|
42
|
+
".fq.gz",
|
|
43
|
+
".fastq.bz2",
|
|
44
|
+
".fq.bz2",
|
|
45
|
+
".fastq.xz",
|
|
46
|
+
".fq.xz",
|
|
47
|
+
".fastq.zst",
|
|
48
|
+
".fq.zst",
|
|
49
|
+
}
|
|
50
|
+
h5ad_exts = {".h5ad", ".h5"}
|
|
37
51
|
compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
|
|
38
52
|
|
|
39
53
|
def ext_key(pp: Path) -> str:
|