smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,264 @@
1
+ # General
2
+ sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
3
+ sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
+ sample_name_col_for_plotting: 'Barcode'
5
+
6
+ # Compute params
7
+ threads: 4
8
+ device: "auto"
9
+
10
+ ######## smftools load params #########
11
+ # Generic i/o
12
+ bam_suffix: ".bam"
13
+ recursive_input_search: True
14
+ split_dir: "demultiplexed_BAMs"
15
+ strands:
16
+ - bottom
17
+ - top
18
+ conversions:
19
+ - unconverted
20
+ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
21
+ fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
22
+ input_already_demuxed: False # If the input files are already demultiplexed.
23
+ delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
24
+ delete_intermediate_bams: True # Whether to delete intermediate BAM files.
25
+ delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
26
+
27
+ # Sequencing modality and general experiment params
28
+ smf_modality: 'conversion' # conversion, deaminase, direct
29
+ sequencer: 'ont' # ont, pacbio, illumina
30
+ barcode_kit: 'SQK-RBK114-96' # SQK-RBK114-96, SQK-NBD114-24, etc
31
+ mod_target_bases:
32
+ - "GpC"
33
+ - "CpG"
34
+ enzyme_target_bases:
35
+ - "GpC"
36
+
37
+ # Nanopore specific basecalling params
38
+ model_dir: null # Directory where dorado basecalling models are stored.
39
+ model: "hac" # needed for dorado basecaller
40
+ filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
41
+
42
+ # Alignment params
43
+ aligner: "minimap2" # Aligner to use: dorado, minimap2
44
+ aligner_args:
45
+ minimap2:
46
+ ont:
47
+ - '-a'
48
+ - '-x'
49
+ - 'map-ont'
50
+ - '--MD'
51
+ - '-Y'
52
+ - '-y'
53
+ - '-N'
54
+ - '5'
55
+ - '--secondary=no'
56
+ pacbio:
57
+ - '-a'
58
+ - '-x'
59
+ - 'map-hifi'
60
+ - '--MD'
61
+ - '-Y'
62
+ - '-y'
63
+ - '-N'
64
+ - '5'
65
+ - '--secondary=no'
66
+ illumina:
67
+ - '-a'
68
+ - '-x'
69
+ - 'sr'
70
+ - '--MD'
71
+ - '-Y'
72
+ - '-y'
73
+ - '-N'
74
+ - '5'
75
+ - '--secondary=no'
76
+ dorado:
77
+ ont:
78
+ - "--mm2-opts"
79
+ - "-N 5"
80
+
81
+ # Sorted BAM and BED specific handling
82
+ make_bigwigs: False # Whether to make coverage bigwigs
83
+ make_beds: False # Whether to make beds from the aligned bams
84
+
85
+ # Nanopore specific demultiplexing
86
+ barcode_both_ends: False # dorado demultiplexing
87
+ trim: False # dorado adapter and barcode removal during demultiplexing
88
+
89
+ # Anndata structure
90
+ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
91
+ reference_column: 'Reference_strand'
92
+ sample_column: 'Barcode'
93
+
94
+ ######## smftools preprocess params #########
95
+ # Read length, quality, and mapping filtering params
96
+ read_coord_filter:
97
+ - null
98
+ - null
99
+ read_len_filter_thresholds:
100
+ - 100
101
+ - null
102
+ read_len_to_ref_ratio_filter_thresholds:
103
+ - 0.5
104
+ - null
105
+ read_quality_filter_thresholds:
106
+ - 15
107
+ - null
108
+ read_mapping_quality_filter_thresholds:
109
+ - null
110
+ - null
111
+
112
+ # Read modification filtering params
113
+ read_mod_filtering_gpc_thresholds:
114
+ - 0.025
115
+ - 0.975
116
+ read_mod_filtering_cpg_thresholds:
117
+ - 0.0
118
+ - 1.0
119
+ read_mod_filtering_any_c_thresholds:
120
+ - 0.025
121
+ - 0.975
122
+ read_mod_filtering_a_thresholds:
123
+ - 0.025
124
+ - 0.975
125
+ read_mod_filtering_use_other_c_as_background: False
126
+ min_valid_fraction_positions_in_read_vs_ref: 0.5
127
+
128
+ # Duplicate detection params
129
+ duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
130
+ - "GpC"
131
+ - "CpG"
132
+ - "ambiguous_GpC_CpG"
133
+ duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
134
+ hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
135
+ - Fraction_any_C_site_modified
136
+ duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
137
+ duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
138
+ duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
139
+ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicographic duplicate detection with a hierarchical clustering based method
140
+ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
141
+ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
142
+
143
+ # Position QC params
144
+ position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
145
+
146
+ ######## smftools analyze params #########
147
+ # Basic Analysis - QC Plotting params
148
+ rows_per_qc_histogram_grid: 12
149
+
150
+ # Basic Analysis - Clustermap params
151
+ layer_for_clustermap_plotting: 'nan0_0minus1'
152
+
153
+ # Basic Analysis - UMAP/Leiden params
154
+ layer_for_umap_plotting: 'nan_half'
155
+ umap_layers_to_plot:
156
+ - "mapped_length"
157
+ - "Raw_modification_signal"
158
+
159
+ # Basic Analysis - Spatial Autocorrelation params
160
+ rows_per_qc_autocorr_grid: 6
161
+ autocorr_rolling_window_size: 25
162
+ autocorr_max_lag: 800
163
+ autocorr_site_types:
164
+ - "GpC"
165
+ - "CpG"
166
+ - "any_C"
167
+
168
+ # Basic Analysis - Correlation Matrix params
169
+ correlation_matrix_types:
170
+ - "pearson"
171
+ - "binary_covariance"
172
+ correlation_matrix_cmaps:
173
+ - "seismic"
174
+ - "viridis"
175
+ correlation_matrix_site_types:
176
+ - "GpC_site"
177
+
178
+ ######## smftools hmm params #########
179
+ # HMM params
180
+ hmm_n_states: 2 # Number of HMM states
181
+ hmm_init_emission_probs:
182
+ - [0.8, 0.2]
183
+ - [0.2, 0.8]
184
+ hmm_init_transition_probs:
185
+ - [0.9, 0.1]
186
+ - [0.1, 0.9]
187
+ hmm_init_start_probs:
188
+ - 0.5
189
+ - 0.5
190
+ hmm_eps: 1e-8
191
+ hmm_dtype: "float64"
192
+ hmm_annotation_threshold: 0.5
193
+ hmm_batch_size: 1024
194
+ hmm_use_viterbi: False
195
+ footprints: True # whether to use the default HMM footprint params
196
+ accessible_patches: True # whether to use the default HMM accessible patch params
197
+ cpg: False # whether to use the default HMM endogenous CpG patch params
198
+ hmm_methbases:
199
+ - "GpC"
200
+ - "CpG"
201
+ - "C"
202
+ - "A"
203
+ hmm_feature_sets:
204
+ footprint:
205
+ state: "Non-Modified"
206
+ features:
207
+ small_bound_stretch: [10, 40]
208
+ medium_bound_stretch: [40, 110]
209
+ putative_nucleosome: [110, 200]
210
+ large_bound_stretch: [200, inf]
211
+ accessible:
212
+ state: "Modified"
213
+ features:
214
+ small_accessible_patch: [3, 20]
215
+ mid_accessible_patch: [20, 40]
216
+ mid_large_accessible_patch: [40, 110]
217
+ large_accessible_patch: [110, inf]
218
+ hmm_merge_layer_features:
219
+ - [null, 80]
220
+
221
+ # Pipeline control flow - load adata
222
+ force_redo_load_adata: False # Whether to perform load adata command from start
223
+
224
+ # Pipeline control flow - Preprocessing and QC
225
+ force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
226
+ force_reload_sample_sheet: True # Whether to force redo sample sheet loading
227
+ bypass_add_read_length_and_mapping_qc: False # Whether to skip read length, quality, and mapping qc.
228
+ force_redo_add_read_length_and_mapping_qc: False # Whether to force redo read length, quality, and mapping qc.
229
+ bypass_clean_nan: False # Whether to skip NaN cleaning
230
+ force_redo_clean_nan: False # Whether to redo NaN cleaning
231
+ bypass_append_base_context: False # Whether to skip adding per reference base context additions.
232
+ force_redo_append_base_context: False # Whether to redo per reference base context additions.
233
+ invert_adata: False # Whether to invert the AnnData along the positions axis.
234
+ bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
235
+ force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
236
+ bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
237
+ force_redo_calculate_read_modification_stats: False # Whether to force redo adding read level modification statistics.
238
+ bypass_filter_reads_on_modification_thresholds: False # Whether to skip filtering reads based on read level modification statistics.
239
+ force_redo_filter_reads_on_modification_thresholds: False # Whether to redo filtering reads based on read level modification statistics.
240
+ bypass_flag_duplicate_reads: False # Whether to skip flagging duplicate reads based on modification similarity.
241
+ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate reads based on modification similarity.
242
+ bypass_complexity_analysis: False # Whether to skip complexity analysis
243
+ force_redo_complexity_analysis: False # Whether to redo complexity analysis
244
+
245
+ # Pipeline control flow - Basic Analyses
246
+ force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
247
+ bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
248
+ force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
249
+ bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
250
+ force_redo_basic_umap: False # Whether to redo basic UMAP calculation/plotting
251
+ bypass_spatial_autocorr_calculations: False # Whether to skip basic spatial autocorrelation calculation
252
+ force_redo_spatial_autocorr_calculations: False # Whether to redo basic spatial autocorrelation calculation
253
+ bypass_spatial_autocorr_plotting: False # Whether to skip basic spatial autocorrelation plotting
254
+ force_redo_spatial_autocorr_plotting: False # Whether to redo basic spatial autocorrelation plotting
255
+ bypass_matrix_corr_calculations: False # Whether to skip basic correlation matrix calculation
256
+ force_redo_matrix_corr_calculations: False # Whether to force redo basic correlation matrix calculation
257
+ bypass_matrix_corr_plotting: False # Whether to skip basic correlation matrix plotting
258
+ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation matrix plotting
259
+
260
+ # Pipeline control flow - HMMs
261
+ bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
262
+ force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
263
+ bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
264
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
@@ -0,0 +1,41 @@
1
+ # Direct (Nanopore modified base calling) footprinting defaults
2
+ extends: default
3
+
4
+ ######## smftools load params #########
5
+ filter_threshold: 0.8 # min threshold to call a canonical base
6
+ m6A_threshold: 0.7 # min threshold to call a modified m6a base
7
+ m5C_threshold: 0.7 # min threshold to call a modified 5mC base
8
+ hm5C_threshold: 0.7 # min threshold to call a modified 5hmC base
9
+ thresholds:
10
+ - filter_threshold
11
+ - m6A_threshold
12
+ - m5C_threshold
13
+ - hm5C_threshold
14
+ mod_list:
15
+ - '5mC_5hmC'
16
+ - '6mA' # mods to detect
17
+ mod_target_bases:
18
+ - "A"
19
+ enzyme_target_bases:
20
+ - "A"
21
+ batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
22
+ skip_unclassified: True # Whether to skip unclassified barcodes
23
+ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
24
+
25
+ ######## smftools preprocess params ########
26
+ fit_position_methylation_thresholds: False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
27
+ binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
28
+ positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
29
+ negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
30
+ infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
31
+ inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
32
+ fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
33
+ output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
34
+
35
+ ######## smftools spatial params #########
36
+ autocorr_site_types:
37
+ - "A"
38
+
39
+ ######## smftools hmm params #########
40
+ hmm_methbases:
41
+ - "A"
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Iterable, Union
5
+
6
def discover_input_files(
    input_data_path: Union[str, Path],
    bam_suffix: str = ".bam",
    recursive: bool = False,
    follow_symlinks: bool = False,
) -> Dict[str, Any]:
    """
    Discover and categorize input files under `input_data_path`.

    Parameters:
        input_data_path: A single file, or a directory to scan.
        bam_suffix: Extension that identifies BAM files. Compared
            case-insensitively; a leading dot is added if missing.
        recursive: If True, scan the entire directory tree; otherwise only
            the immediate children of the directory.
        follow_symlinks: Only relevant when `recursive` is True. If True,
            symlinked directories are descended into as well. Links that
            point back to one of their own ancestor directories are skipped
            so a cyclic link cannot cause unbounded traversal.

    Returns a dict with:
        - pod5_paths, fast5_paths, fastq_paths, bam_paths, h5ad_paths,
          other_paths (sorted lists of Path)
        - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam,
          input_is_h5ad (bools)
        - all_files_searched (int)

    Raises:
        FileNotFoundError: If `input_data_path` does not exist.

    Behavior:
        - If `input_data_path` is a file, returns that single file categorized.
        - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
        - Known formats take precedence over `bam_suffix` when both match.
    """
    # Local import so this function does not depend on the module's import block.
    import os

    p = Path(input_data_path)

    # Normalize the BAM suffix: leading dot, lower case.
    if not bam_suffix.startswith("."):
        bam_suffix = "." + bam_suffix
    bam_suffix = bam_suffix.lower()

    # Canonical extension keys we compare against.
    pod5_exts = {".pod5", ".p5"}
    fast5_exts = {".fast5", ".f5"}
    fastq_exts = {
        ".fastq", ".fq",
        ".fastq.gz", ".fq.gz",
        ".fastq.bz2", ".fq.bz2",
        ".fastq.xz", ".fq.xz",
        ".fastq.zst", ".fq.zst",
    }
    h5ad_exts = {".h5ad", ".h5"}
    compressed_exts = {".gz", ".bz2", ".xz", ".zst"}

    def ext_key(pp: Path) -> str:
        """
        Return the lower-cased last suffix, or the last two suffixes joined
        when the final one is a compressor (.gz/.bz2/.xz/.zst).

        Examples:
            a.fastq.gz -> ".fastq.gz"
            a.fq.xz    -> ".fq.xz"
            a.bam      -> ".bam"
            a          -> ""
        """
        suff = [s.lower() for s in pp.suffixes]
        if not suff:
            return ""
        if suff[-1] in compressed_exts and len(suff) >= 2:
            return suff[-2] + suff[-1]
        return suff[-1]

    pod5_paths: List[Path] = []
    fast5_paths: List[Path] = []
    fastq_paths: List[Path] = []
    bam_paths: List[Path] = []
    h5ad_paths: List[Path] = []
    other_paths: List[Path] = []

    def categorize_file(fp: Path) -> None:
        """Append `fp` to the bucket that matches its extension key."""
        key = ext_key(fp)
        if key in pod5_exts:
            pod5_paths.append(fp)
        elif key in fast5_exts:
            fast5_paths.append(fp)
        elif key in fastq_exts:
            fastq_paths.append(fp)
        elif key in h5ad_exts:
            h5ad_paths.append(fp)
        elif key == bam_suffix:
            bam_paths.append(fp)
        else:
            other_paths.append(fp)

    def links_back_to_ancestor(candidate: str, current_real: str) -> bool:
        """True if `candidate` is a symlink resolving to `current_real` or one of its ancestors."""
        if not os.path.islink(candidate):
            return False
        target = os.path.realpath(candidate)
        return current_real == target or current_real.startswith(target.rstrip(os.sep) + os.sep)

    def walk_files(root: Path) -> Iterable[Path]:
        """Yield every non-directory entry below `root`, honoring `follow_symlinks`."""
        # Path.rglob has no `follow_symlinks` keyword on any Python version
        # (3.13 added `recurse_symlinks` instead), so os.walk is used: its
        # `followlinks` flag behaves the same on every supported version.
        for dirpath, dirnames, filenames in os.walk(root, followlinks=follow_symlinks):
            if follow_symlinks:
                current_real = os.path.realpath(dirpath)
                # Prune in place so os.walk does not descend into cyclic links.
                dirnames[:] = [
                    d for d in dirnames
                    if not links_back_to_ancestor(os.path.join(dirpath, d), current_real)
                ]
            for name in filenames:
                yield Path(dirpath) / name

    if not p.exists():
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    total_searched = 0

    if p.is_file():
        total_searched = 1
        categorize_file(p)
    else:
        # Directory scan
        iterator = walk_files(p) if recursive else p.iterdir()

        for fp in iterator:
            # Skips directories and broken symlinks.
            if not fp.is_file():
                continue
            total_searched += 1
            categorize_file(fp)

    return {
        "pod5_paths": sorted(pod5_paths),
        "fast5_paths": sorted(fast5_paths),
        "fastq_paths": sorted(fastq_paths),
        "bam_paths": sorted(bam_paths),
        "h5ad_paths": sorted(h5ad_paths),
        "other_paths": sorted(other_paths),
        "input_is_pod5": len(pod5_paths) > 0,
        "input_is_fast5": len(fast5_paths) > 0,
        "input_is_fastq": len(fastq_paths) > 0,
        "input_is_bam": len(bam_paths) > 0,
        "input_is_h5ad": len(h5ad_paths) > 0,
        "all_files_searched": total_searched,
    }