smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/config/default.yaml
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
# General
|
|
2
|
+
sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
|
|
3
|
+
sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
|
|
4
|
+
sample_name_col_for_plotting: 'Barcode'
|
|
5
|
+
|
|
6
|
+
# Compute params
|
|
7
|
+
threads: 4
|
|
8
|
+
device: "auto"
|
|
9
|
+
|
|
10
|
+
######## smftools load params #########
|
|
1
11
|
# Generic i/o
|
|
2
12
|
bam_suffix: ".bam"
|
|
3
13
|
recursive_input_search: True
|
|
@@ -7,16 +17,12 @@ strands:
|
|
|
7
17
|
- top
|
|
8
18
|
conversions:
|
|
9
19
|
- unconverted
|
|
10
|
-
sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
|
|
11
|
-
sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
|
|
12
20
|
fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
|
|
13
21
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
14
22
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
15
23
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
16
|
-
|
|
17
|
-
#
|
|
18
|
-
threads: 4
|
|
19
|
-
device: "auto"
|
|
24
|
+
delete_intermediate_bams: False # Whether to delete intermediate BAM files.
|
|
25
|
+
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
20
26
|
|
|
21
27
|
# Sequencing modality and general experiment params
|
|
22
28
|
smf_modality: 'conversion' # conversion, deaminase, direct
|
|
@@ -34,7 +40,8 @@ model: "hac" # needed for dorado basecaller
|
|
|
34
40
|
filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
|
|
35
41
|
|
|
36
42
|
# Alignment params
|
|
37
|
-
aligner: "
|
|
43
|
+
aligner: "dorado" # Aligner to use: dorado, minimap2
|
|
44
|
+
align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
|
|
38
45
|
aligner_args:
|
|
39
46
|
minimap2:
|
|
40
47
|
ont:
|
|
@@ -70,11 +77,11 @@ aligner_args:
|
|
|
70
77
|
dorado:
|
|
71
78
|
ont:
|
|
72
79
|
- "--mm2-opts"
|
|
73
|
-
- "-N"
|
|
74
|
-
- "5"
|
|
80
|
+
- "-N 5"
|
|
75
81
|
|
|
76
82
|
# Sorted BAM and BED specific handling
|
|
77
83
|
make_bigwigs: False # Whether to make coverage bigwigs
|
|
84
|
+
make_beds: False # Whether to make beds from the aligned bams
|
|
78
85
|
|
|
79
86
|
# Nanopore specific demultiplexing
|
|
80
87
|
barcode_both_ends: False # dorado demultiplexing
|
|
@@ -85,47 +92,58 @@ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall w
|
|
|
85
92
|
reference_column: 'Reference_strand'
|
|
86
93
|
sample_column: 'Barcode'
|
|
87
94
|
|
|
88
|
-
|
|
95
|
+
######## smftools preprocess params #########
|
|
96
|
+
# Read length, quality, and mapping filtering params
|
|
89
97
|
read_coord_filter:
|
|
90
98
|
- null
|
|
91
99
|
- null
|
|
92
100
|
read_len_filter_thresholds:
|
|
93
|
-
-
|
|
101
|
+
- 100
|
|
94
102
|
- null
|
|
95
103
|
read_len_to_ref_ratio_filter_thresholds:
|
|
96
|
-
- 0.
|
|
104
|
+
- 0.5
|
|
97
105
|
- null
|
|
98
106
|
read_quality_filter_thresholds:
|
|
99
|
-
-
|
|
107
|
+
- 15
|
|
100
108
|
- null
|
|
101
109
|
read_mapping_quality_filter_thresholds:
|
|
102
110
|
- null
|
|
103
111
|
- null
|
|
104
112
|
|
|
105
|
-
#
|
|
113
|
+
# Read modification filtering params
|
|
106
114
|
read_mod_filtering_gpc_thresholds:
|
|
107
115
|
- 0.025
|
|
108
116
|
- 0.975
|
|
109
117
|
read_mod_filtering_cpg_thresholds:
|
|
110
118
|
- 0.0
|
|
111
119
|
- 1.0
|
|
112
|
-
|
|
120
|
+
read_mod_filtering_c_thresholds:
|
|
113
121
|
- 0.025
|
|
114
122
|
- 0.975
|
|
115
123
|
read_mod_filtering_a_thresholds:
|
|
116
124
|
- 0.025
|
|
117
125
|
- 0.975
|
|
118
126
|
read_mod_filtering_use_other_c_as_background: False
|
|
119
|
-
min_valid_fraction_positions_in_read_vs_ref: 0.
|
|
127
|
+
min_valid_fraction_positions_in_read_vs_ref: 0.5
|
|
120
128
|
|
|
121
|
-
#
|
|
129
|
+
# Plotting params for read length histograms
|
|
130
|
+
obs_to_plot_pp_qc:
|
|
131
|
+
- read_length
|
|
132
|
+
- mapped_length
|
|
133
|
+
- read_quality
|
|
134
|
+
- mapping_quality
|
|
135
|
+
- mapped_length_to_reference_length_ratio
|
|
136
|
+
- mapped_length_to_read_length_ratio
|
|
137
|
+
- Raw_modification_signal
|
|
138
|
+
|
|
139
|
+
# Duplicate detection params
|
|
122
140
|
duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
|
|
123
141
|
- "GpC"
|
|
124
142
|
- "CpG"
|
|
125
143
|
- "ambiguous_GpC_CpG"
|
|
126
144
|
duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
|
|
127
145
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
128
|
-
-
|
|
146
|
+
- Fraction_C_site_modified
|
|
129
147
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
|
|
130
148
|
duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
|
|
131
149
|
duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
|
|
@@ -133,33 +151,43 @@ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicog
|
|
|
133
151
|
duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
|
|
134
152
|
duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
|
|
135
153
|
|
|
136
|
-
#
|
|
154
|
+
# Position QC params
|
|
155
|
+
position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
|
|
137
156
|
|
|
138
|
-
|
|
139
|
-
|
|
157
|
+
######## smftools spatial params #########
|
|
158
|
+
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
159
|
+
# Reindexing params
|
|
160
|
+
reindexing_offsets:
|
|
161
|
+
null : null
|
|
162
|
+
reindexed_var_suffix: "reindexed"
|
|
140
163
|
|
|
141
|
-
#
|
|
164
|
+
# Spatial Analysis - QC Plotting params
|
|
142
165
|
rows_per_qc_histogram_grid: 12
|
|
143
166
|
|
|
144
|
-
#
|
|
167
|
+
# Spatial Analysis - Clustermap params
|
|
145
168
|
layer_for_clustermap_plotting: 'nan0_0minus1'
|
|
169
|
+
clustermap_cmap_c: "coolwarm"
|
|
170
|
+
clustermap_cmap_gpc: "coolwarm"
|
|
171
|
+
clustermap_cmap_cpg: "coolwarm"
|
|
172
|
+
clustermap_cmap_a: "coolwarm"
|
|
173
|
+
spatial_clustermap_sortby: "gpc"
|
|
146
174
|
|
|
147
|
-
#
|
|
175
|
+
# Spatial Analysis - UMAP/Leiden params
|
|
148
176
|
layer_for_umap_plotting: 'nan_half'
|
|
149
177
|
umap_layers_to_plot:
|
|
150
178
|
- "mapped_length"
|
|
151
179
|
- "Raw_modification_signal"
|
|
152
180
|
|
|
153
|
-
#
|
|
181
|
+
# Spatial Analysis - Spatial Autocorrelation params
|
|
154
182
|
rows_per_qc_autocorr_grid: 6
|
|
155
183
|
autocorr_rolling_window_size: 25
|
|
156
184
|
autocorr_max_lag: 800
|
|
157
185
|
autocorr_site_types:
|
|
158
186
|
- "GpC"
|
|
159
187
|
- "CpG"
|
|
160
|
-
- "
|
|
188
|
+
- "C"
|
|
161
189
|
|
|
162
|
-
#
|
|
190
|
+
# Spatial Analysis - Correlation Matrix params
|
|
163
191
|
correlation_matrix_types:
|
|
164
192
|
- "pearson"
|
|
165
193
|
- "binary_covariance"
|
|
@@ -169,6 +197,7 @@ correlation_matrix_cmaps:
|
|
|
169
197
|
correlation_matrix_site_types:
|
|
170
198
|
- "GpC_site"
|
|
171
199
|
|
|
200
|
+
######## smftools hmm params #########
|
|
172
201
|
# HMM params
|
|
173
202
|
hmm_n_states: 2 # Number of HMM states
|
|
174
203
|
hmm_init_emission_probs:
|
|
@@ -197,18 +226,105 @@ hmm_feature_sets:
|
|
|
197
226
|
footprint:
|
|
198
227
|
state: "Non-Modified"
|
|
199
228
|
features:
|
|
200
|
-
small_bound_stretch: [
|
|
201
|
-
medium_bound_stretch: [
|
|
202
|
-
putative_nucleosome: [
|
|
229
|
+
small_bound_stretch: [6, 40]
|
|
230
|
+
medium_bound_stretch: [40, 100]
|
|
231
|
+
putative_nucleosome: [100, 200]
|
|
203
232
|
large_bound_stretch: [200, inf]
|
|
204
233
|
accessible:
|
|
205
234
|
state: "Modified"
|
|
206
235
|
features:
|
|
207
|
-
small_accessible_patch: [
|
|
208
|
-
mid_accessible_patch: [20,
|
|
209
|
-
large_accessible_patch: [
|
|
236
|
+
small_accessible_patch: [3, 20]
|
|
237
|
+
mid_accessible_patch: [20, 40]
|
|
238
|
+
large_accessible_patch: [40, 110]
|
|
239
|
+
nucleosome_depleted_region: [110, inf]
|
|
210
240
|
hmm_merge_layer_features:
|
|
211
241
|
- [null, 80]
|
|
242
|
+
clustermap_cmap_hmm: "coolwarm"
|
|
243
|
+
hmm_clustermap_feature_layers:
|
|
244
|
+
- all_accessible_features
|
|
245
|
+
- all_accessible_features_merged
|
|
246
|
+
- small_accessible_patch
|
|
247
|
+
- mid_accessible_patch
|
|
248
|
+
- large_accessible_patch
|
|
249
|
+
- nucleosome_depleted_region
|
|
250
|
+
- small_bound_stretch
|
|
251
|
+
- medium_bound_stretch
|
|
252
|
+
- putative_nucleosome
|
|
253
|
+
- large_bound_stretch
|
|
254
|
+
hmm_clustermap_sortby: "hmm"
|
|
255
|
+
hmm_peak_feature_configs:
|
|
256
|
+
all_accessible_features:
|
|
257
|
+
min_distance: 200 # The minimum distance in between called peaks
|
|
258
|
+
peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
|
|
259
|
+
peak_prominence: 0.1 # The minimum prominence to call a peak
|
|
260
|
+
peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
|
|
261
|
+
rolling_window: 50 # Window size for the rolling average smoothing before peak calling
|
|
262
|
+
|
|
263
|
+
all_accessible_features_merged:
|
|
264
|
+
min_distance: 250
|
|
265
|
+
peak_width: 250
|
|
266
|
+
peak_prominence: 0.05
|
|
267
|
+
peak_threshold: 0.80
|
|
268
|
+
rolling_window: 50
|
|
269
|
+
|
|
270
|
+
small_accessible_patch:
|
|
271
|
+
min_distance: 40
|
|
272
|
+
peak_width: 30
|
|
273
|
+
peak_prominence: 0.1
|
|
274
|
+
peak_threshold: 0.8
|
|
275
|
+
rolling_window: 40
|
|
276
|
+
|
|
277
|
+
mid_accessible_patch:
|
|
278
|
+
min_distance: 100
|
|
279
|
+
peak_width: 60
|
|
280
|
+
peak_prominence: 0.025
|
|
281
|
+
peak_threshold: 0.80
|
|
282
|
+
rolling_window: 50
|
|
283
|
+
|
|
284
|
+
large_accessible_patch:
|
|
285
|
+
min_distance: 100
|
|
286
|
+
peak_width: 100
|
|
287
|
+
peak_prominence: 0.025
|
|
288
|
+
peak_threshold: 0.80
|
|
289
|
+
rolling_window: 50
|
|
290
|
+
|
|
291
|
+
nucleosome_depleted_region:
|
|
292
|
+
min_distance: 200
|
|
293
|
+
peak_width: 200
|
|
294
|
+
peak_prominence: 0.025
|
|
295
|
+
peak_threshold: 0.80
|
|
296
|
+
rolling_window: 50
|
|
297
|
+
|
|
298
|
+
small_bound_stretch:
|
|
299
|
+
min_distance: 20
|
|
300
|
+
peak_width: 20
|
|
301
|
+
peak_prominence: 0.01
|
|
302
|
+
peak_threshold: 0.50
|
|
303
|
+
rolling_window: 10
|
|
304
|
+
|
|
305
|
+
medium_bound_stretch:
|
|
306
|
+
min_distance: 40
|
|
307
|
+
peak_width: 40
|
|
308
|
+
peak_prominence: 0.01
|
|
309
|
+
peak_threshold: 0.50
|
|
310
|
+
rolling_window: 20
|
|
311
|
+
|
|
312
|
+
putative_nucleosome:
|
|
313
|
+
min_distance: 160
|
|
314
|
+
peak_width: 147 # canonical nucleosome footprint
|
|
315
|
+
peak_prominence: 0.025
|
|
316
|
+
peak_threshold: 0.60
|
|
317
|
+
rolling_window: 20
|
|
318
|
+
|
|
319
|
+
large_bound_stretch:
|
|
320
|
+
min_distance: 250
|
|
321
|
+
peak_width: 300
|
|
322
|
+
peak_prominence: 0.20
|
|
323
|
+
peak_threshold: 0.80
|
|
324
|
+
rolling_window: 50
|
|
325
|
+
|
|
326
|
+
# Pipeline control flow - load adata
|
|
327
|
+
force_redo_load_adata: False # Whether to perform load adata command from start
|
|
212
328
|
|
|
213
329
|
# Pipeline control flow - Preprocessing and QC
|
|
214
330
|
force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
|
|
@@ -219,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
|
|
|
219
335
|
force_redo_clean_nan: False # Whether to redo NaN cleaning
|
|
220
336
|
bypass_append_base_context: False # Whether to skip adding per reference base context additions.
|
|
221
337
|
force_redo_append_base_context: False # Whether to redo per reference base context additions.
|
|
222
|
-
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
223
338
|
bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
|
|
224
339
|
force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
|
|
225
340
|
bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
|
|
@@ -231,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
|
|
|
231
346
|
bypass_complexity_analysis: False # Whether to skip complexity analysis
|
|
232
347
|
force_redo_complexity_analysis: False # Whether to redo complexity analysis
|
|
233
348
|
|
|
234
|
-
# Pipeline control flow -
|
|
235
|
-
|
|
349
|
+
# Pipeline control flow - Spatial Analyses
|
|
350
|
+
force_redo_spatial_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
|
|
236
351
|
bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
|
|
237
352
|
force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
|
|
238
353
|
bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
|
smftools/config/direct.yaml
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Direct (Nanopore modified base calling) footprinting defaults
|
|
2
2
|
extends: default
|
|
3
|
+
|
|
4
|
+
######## smftools load params #########
|
|
3
5
|
filter_threshold: 0.8 # min threshold to call a canonical base
|
|
4
6
|
m6A_threshold: 0.7 # min threshold to call a modified m6a base
|
|
5
7
|
m5C_threshold: 0.7 # min threshold to call a modified 5mC base
|
|
@@ -12,6 +14,31 @@ thresholds:
|
|
|
12
14
|
mod_list:
|
|
13
15
|
- '5mC_5hmC'
|
|
14
16
|
- '6mA' # mods to detect
|
|
17
|
+
mod_map:
|
|
18
|
+
5mC_5hmC: 5mC
|
|
19
|
+
6mA: 6mA
|
|
20
|
+
mod_target_bases:
|
|
21
|
+
- "A"
|
|
22
|
+
enzyme_target_bases:
|
|
23
|
+
- "A"
|
|
15
24
|
batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
|
|
16
25
|
skip_unclassified: True # Whether to skip unclassified barcodes
|
|
17
|
-
delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
|
|
26
|
+
delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
|
|
27
|
+
|
|
28
|
+
######## smftools preprocess params ########
|
|
29
|
+
fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position-by-position thresholds for modification binarization.
|
|
30
|
+
binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
|
|
31
|
+
positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
|
|
32
|
+
negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
|
|
33
|
+
infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
|
|
34
|
+
inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
|
|
35
|
+
fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
|
|
36
|
+
output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
|
|
37
|
+
|
|
38
|
+
######## smftools spatial params #########
|
|
39
|
+
autocorr_site_types:
|
|
40
|
+
- "A"
|
|
41
|
+
|
|
42
|
+
######## smftools hmm params #########
|
|
43
|
+
hmm_methbases:
|
|
44
|
+
- "A"
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, List, Any, Iterable, Union
|
|
5
|
+
|
|
6
|
+
def discover_input_files(
    input_data_path: Union[str, Path],
    bam_suffix: str = ".bam",
    recursive: bool = False,
    follow_symlinks: bool = False,
) -> Dict[str, Any]:
    """
    Discover input files under `input_data_path` and sort them by file type.

    Parameters:
        input_data_path: A single file, or a directory to scan.
        bam_suffix: Extension that marks a BAM file. A missing leading dot is
            added and the comparison is case-insensitive.
        recursive: If False, only the immediate children of a directory are
            scanned; if True, the entire tree is scanned.
        follow_symlinks: Only used when `recursive` is True. Passed to
            `Path.rglob` as `recurse_symlinks`, which exists on Python 3.13+.
            On older interpreters `rglob` has no such switch, so the flag
            cannot be honored and the default traversal is used instead.

    Returns a dict with:
        - pod5_paths, fast5_paths, fastq_paths, bam_paths, h5ad_paths,
          other_paths (sorted lists of Path)
        - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam,
          input_is_h5ad (bools, True when the matching list is non-empty)
        - all_files_searched (int, number of regular files examined)

    Behavior:
        - If `input_data_path` is a file, returns that single file categorized.
        - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
        - Known extensions (pod5/fast5/fastq/h5ad) are checked before
          `bam_suffix`, so they win if `bam_suffix` collides with one of them.

    Raises:
        FileNotFoundError: If `input_data_path` does not exist.
    """
    p = Path(input_data_path)

    # Normalize the BAM suffix: leading dot, lower case.
    if not bam_suffix.startswith("."):
        bam_suffix = "." + bam_suffix
    bam_suffix = bam_suffix.lower()

    # Canonical (lower-case) extension keys that ext_key() output is compared against.
    pod5_exts = {".pod5", ".p5"}
    fast5_exts = {".fast5", ".f5"}
    fastq_exts = {
        ".fastq", ".fq",
        ".fastq.gz", ".fq.gz",
        ".fastq.bz2", ".fq.bz2",
        ".fastq.xz", ".fq.xz",
        ".fastq.zst", ".fq.zst",
    }
    h5ad_exts = {".h5ad", ".h5"}
    compressed_exts = {".gz", ".bz2", ".xz", ".zst"}

    def ext_key(pp: Path) -> str:
        """
        Return the lower-cased extension key of `pp`: the last suffix, or the
        last two joined when the final one is a compressor (.gz/.bz2/.xz/.zst).

        Examples:
            a.fastq.gz -> ".fastq.gz"
            a.fq.xz    -> ".fq.xz"
            a.bam      -> ".bam"
            a          -> ""
        """
        suff = [s.lower() for s in pp.suffixes]
        if not suff:
            return ""
        if suff[-1] in compressed_exts and len(suff) >= 2:
            return suff[-2] + suff[-1]
        return suff[-1]

    pod5_paths: List[Path] = []
    fast5_paths: List[Path] = []
    fastq_paths: List[Path] = []
    bam_paths: List[Path] = []
    h5ad_paths: List[Path] = []
    other_paths: List[Path] = []

    def categorize_file(fp: Path) -> None:
        """Append `fp` to the list matching its extension key."""
        key = ext_key(fp)
        if key in pod5_exts:
            pod5_paths.append(fp)
        elif key in fast5_exts:
            fast5_paths.append(fp)
        elif key in fastq_exts:
            fastq_paths.append(fp)
        elif key in h5ad_exts:
            h5ad_paths.append(fp)
        elif key == bam_suffix:
            bam_paths.append(fp)
        else:
            other_paths.append(fp)

    if not p.exists():
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    total_searched = 0

    if p.is_file():
        total_searched = 1
        categorize_file(p)
    else:
        # Directory scan
        if recursive:
            # Path.rglob never accepted a `follow_symlinks` keyword; the
            # switch is named `recurse_symlinks` and was added in Python 3.13.
            # Older versions reject the keyword with TypeError at call time,
            # in which case we fall back to the plain traversal.
            try:
                iterator = p.rglob("*", recurse_symlinks=follow_symlinks)  # type: ignore[call-arg]
            except TypeError:
                iterator = p.rglob("*")  # recurse_symlinks not supported
        else:
            iterator = p.iterdir()

        for fp in iterator:
            # Skip directories and anything else that is not a regular file.
            if not fp.is_file():
                continue
            total_searched += 1
            categorize_file(fp)

    return {
        "pod5_paths": sorted(pod5_paths),
        "fast5_paths": sorted(fast5_paths),
        "fastq_paths": sorted(fastq_paths),
        "bam_paths": sorted(bam_paths),
        "h5ad_paths": sorted(h5ad_paths),
        "other_paths": sorted(other_paths),
        "input_is_pod5": len(pod5_paths) > 0,
        "input_is_fast5": len(fast5_paths) > 0,
        "input_is_fastq": len(fastq_paths) > 0,
        "input_is_bam": len(bam_paths) > 0,
        "input_is_h5ad": len(h5ad_paths) > 0,
        "all_files_searched": total_searched,
    }
|