smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,7 @@
1
1
  # General
2
2
  sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
3
- sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
- sample_name_col_for_plotting: 'Barcode'
3
+ sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
+ sample_name_col_for_plotting: 'Experiment_name_and_barcode'
5
5
 
6
6
  # Compute params
7
7
  threads: 4
@@ -9,9 +9,7 @@ device: "auto"
9
9
 
10
10
  ######## smftools load params #########
11
11
  # Generic i/o
12
- bam_suffix: ".bam"
13
12
  recursive_input_search: True
14
- split_dir: "demultiplexed_BAMs"
15
13
  strands:
16
14
  - bottom
17
15
  - top
@@ -40,7 +38,8 @@ model: "hac" # needed for dorado basecaller
40
38
  filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
41
39
 
42
40
  # Alignment params
43
- aligner: "minimap2" # Aligner to use: dorado, minimap2
41
+ aligner: "dorado" # Aligner to use: dorado, minimap2
42
+ align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
44
43
  aligner_args:
45
44
  minimap2:
46
45
  ont:
@@ -52,7 +51,6 @@ aligner_args:
52
51
  - '-y'
53
52
  - '-N'
54
53
  - '5'
55
- - '--secondary=no'
56
54
  pacbio:
57
55
  - '-a'
58
56
  - '-x'
@@ -62,7 +60,6 @@ aligner_args:
62
60
  - '-y'
63
61
  - '-N'
64
62
  - '5'
65
- - '--secondary=no'
66
63
  illumina:
67
64
  - '-a'
68
65
  - '-x'
@@ -72,7 +69,6 @@ aligner_args:
72
69
  - '-y'
73
70
  - '-N'
74
71
  - '5'
75
- - '--secondary=no'
76
72
  dorado:
77
73
  ont:
78
74
  - "--mm2-opts"
@@ -87,9 +83,9 @@ barcode_both_ends: False # dorado demultiplexing
87
83
  trim: False # dorado adapter and barcode removal during demultiplexing
88
84
 
89
85
  # Anndata structure
90
- mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
86
+ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
91
87
  reference_column: 'Reference_strand'
92
- sample_column: 'Barcode'
88
+ sample_column: 'Experiment_name_and_barcode'
93
89
 
94
90
  ######## smftools preprocess params #########
95
91
  # Read length, quality, and mapping filtering params
@@ -100,7 +96,7 @@ read_len_filter_thresholds:
100
96
  - 100
101
97
  - null
102
98
  read_len_to_ref_ratio_filter_thresholds:
103
- - 0.5
99
+ - null
104
100
  - null
105
101
  read_quality_filter_thresholds:
106
102
  - 15
@@ -116,7 +112,7 @@ read_mod_filtering_gpc_thresholds:
116
112
  read_mod_filtering_cpg_thresholds:
117
113
  - 0.0
118
114
  - 1.0
119
- read_mod_filtering_any_c_thresholds:
115
+ read_mod_filtering_c_thresholds:
120
116
  - 0.025
121
117
  - 0.975
122
118
  read_mod_filtering_a_thresholds:
@@ -125,6 +121,16 @@ read_mod_filtering_a_thresholds:
125
121
  read_mod_filtering_use_other_c_as_background: False
126
122
  min_valid_fraction_positions_in_read_vs_ref: 0.5
127
123
 
124
+ # Plotting params for preprocessing QC histograms (obs columns to plot)
125
+ obs_to_plot_pp_qc:
126
+ - read_length
127
+ - mapped_length
128
+ - read_quality
129
+ - mapping_quality
130
+ - mapped_length_to_reference_length_ratio
131
+ - mapped_length_to_read_length_ratio
132
+ - Raw_modification_signal
133
+
128
134
  # Duplicate detection params
129
135
  duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
130
136
  - "GpC"
@@ -132,7 +138,7 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
132
138
  - "ambiguous_GpC_CpG"
133
139
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
134
140
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
135
- - Fraction_any_C_site_modified
141
+ - Fraction_C_site_modified
136
142
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
137
143
  duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
138
144
  duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
@@ -143,29 +149,39 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
143
149
  # Position QC params
144
150
  position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
145
151
 
146
- ######## smftools analyze params #########
147
- # Basic Analysis - QC Plotting params
152
+ ######## smftools spatial params #########
153
+ invert_adata: False # Whether to invert the AnnData along the positions axis.
154
+ # Reindexing params
155
+ reindexing_offsets:
156
+ null : null
157
+ reindexed_var_suffix: "reindexed"
158
+
159
+ # Spatial Analysis - QC Plotting params
148
160
  rows_per_qc_histogram_grid: 12
149
161
 
150
- # Basic Analysis - Clustermap params
162
+ # Spatial Analysis - Clustermap params
151
163
  layer_for_clustermap_plotting: 'nan0_0minus1'
164
+ clustermap_cmap_c: "coolwarm"
165
+ clustermap_cmap_gpc: "coolwarm"
166
+ clustermap_cmap_cpg: "coolwarm"
167
+ clustermap_cmap_a: "coolwarm"
168
+ spatial_clustermap_sortby: "gpc"
152
169
 
153
- # Basic Analysis - UMAP/Leiden params
170
+ # Spatial Analysis - UMAP/Leiden params
154
171
  layer_for_umap_plotting: 'nan_half'
155
172
  umap_layers_to_plot:
156
173
  - "mapped_length"
157
174
  - "Raw_modification_signal"
158
175
 
159
- # Basic Analysis - Spatial Autocorrelation params
176
+ # Spatial Analysis - Spatial Autocorrelation params
177
+ autocorr_normalization_method: "pearson" # options are pearson or sum
160
178
  rows_per_qc_autocorr_grid: 6
161
179
  autocorr_rolling_window_size: 25
162
180
  autocorr_max_lag: 800
163
181
  autocorr_site_types:
164
182
  - "GpC"
165
- - "CpG"
166
- - "any_C"
167
183
 
168
- # Basic Analysis - Correlation Matrix params
184
+ # Spatial Analysis - Correlation Matrix params
169
185
  correlation_matrix_types:
170
186
  - "pearson"
171
187
  - "binary_covariance"
@@ -188,10 +204,19 @@ hmm_init_start_probs:
188
204
  - 0.5
189
205
  - 0.5
190
206
  hmm_eps: 1e-8
207
+ # Fitting strategy
208
+ hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
209
+ hmm_shared_scope: ["reference", "methbase"]
210
+ hmm_groupby: ["sample", "reference", "methbase"]
211
+ # If hmm_fit_strategy == shared_transitions
212
+ hmm_adapt_emissions: true
213
+ hmm_adapt_startprobs: true
214
+ hmm_emission_adapt_iters: 5
215
+ hmm_emission_adapt_tol: 1.0e-4
191
216
  hmm_dtype: "float64"
192
- hmm_annotation_threshold: 0.5
193
- hmm_batch_size: 1024
194
- hmm_use_viterbi: False
217
+ hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
218
+ hmm_batch_size: 1024 # hmm batch size
219
+ hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
195
220
  footprints: True # whether to use the default HMM footprint params
196
221
  accessible_patches: True # whether to use the default HMM accessible patch params
197
222
  cpg: False # whether to use the default HMM endogenous CpG patch params
@@ -204,19 +229,104 @@ hmm_feature_sets:
204
229
  footprint:
205
230
  state: "Non-Modified"
206
231
  features:
207
- small_bound_stretch: [10, 40]
208
- medium_bound_stretch: [40, 110]
209
- putative_nucleosome: [110, 200]
232
+ small_bound_stretch: [6, 40]
233
+ medium_bound_stretch: [40, 100]
234
+ putative_nucleosome: [100, 200]
210
235
  large_bound_stretch: [200, inf]
211
236
  accessible:
212
237
  state: "Modified"
213
238
  features:
214
239
  small_accessible_patch: [3, 20]
215
240
  mid_accessible_patch: [20, 40]
216
- mid_large_accessible_patch: [40, 110]
217
- large_accessible_patch: [110, inf]
241
+ large_accessible_patch: [40, 110]
242
+ nucleosome_depleted_region: [110, inf]
218
243
  hmm_merge_layer_features:
219
- - [null, 80]
244
+ - ["all_accessible_features", 60]
245
+ clustermap_cmap_hmm: "coolwarm"
246
+ hmm_clustermap_feature_layers:
247
+ - all_accessible_features
248
+ - all_accessible_features_merged
249
+ - small_accessible_patch
250
+ - mid_accessible_patch
251
+ - large_accessible_patch
252
+ - large_accessible_patch_merged
253
+ - nucleosome_depleted_region
254
+ - nucleosome_depleted_region_merged
255
+ - small_bound_stretch
256
+ - medium_bound_stretch
257
+ - putative_nucleosome
258
+ - large_bound_stretch
259
+ hmm_clustermap_sortby: "hmm"
260
+ hmm_peak_feature_configs:
261
+ all_accessible_features:
262
+ min_distance: 200 # The minimum distance in between called peaks
263
+ peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
264
+ peak_prominence: 0.1 # The minimum prominence to call a peak
265
+ peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
266
+ rolling_window: 50 # Window size for the rolling average smoothing before peak calling
267
+
268
+ all_accessible_features_merged:
269
+ min_distance: 250
270
+ peak_width: 250
271
+ peak_prominence: 0.05
272
+ peak_threshold: 0.80
273
+ rolling_window: 50
274
+
275
+ small_accessible_patch:
276
+ min_distance: 40
277
+ peak_width: 30
278
+ peak_prominence: 0.1
279
+ peak_threshold: 0.8
280
+ rolling_window: 40
281
+
282
+ mid_accessible_patch:
283
+ min_distance: 100
284
+ peak_width: 60
285
+ peak_prominence: 0.025
286
+ peak_threshold: 0.80
287
+ rolling_window: 50
288
+
289
+ large_accessible_patch:
290
+ min_distance: 100
291
+ peak_width: 100
292
+ peak_prominence: 0.025
293
+ peak_threshold: 0.80
294
+ rolling_window: 50
295
+
296
+ nucleosome_depleted_region:
297
+ min_distance: 200
298
+ peak_width: 200
299
+ peak_prominence: 0.025
300
+ peak_threshold: 0.80
301
+ rolling_window: 50
302
+
303
+ small_bound_stretch:
304
+ min_distance: 20
305
+ peak_width: 20
306
+ peak_prominence: 0.01
307
+ peak_threshold: 0.50
308
+ rolling_window: 10
309
+
310
+ medium_bound_stretch:
311
+ min_distance: 40
312
+ peak_width: 40
313
+ peak_prominence: 0.01
314
+ peak_threshold: 0.50
315
+ rolling_window: 20
316
+
317
+ putative_nucleosome:
318
+ min_distance: 160
319
+ peak_width: 147 # canonical nucleosome footprint
320
+ peak_prominence: 0.025
321
+ peak_threshold: 0.60
322
+ rolling_window: 20
323
+
324
+ large_bound_stretch:
325
+ min_distance: 250
326
+ peak_width: 300
327
+ peak_prominence: 0.20
328
+ peak_threshold: 0.80
329
+ rolling_window: 50
220
330
 
221
331
  # Pipeline control flow - load adata
222
332
  force_redo_load_adata: False # Whether to perform load adata command from start
@@ -230,7 +340,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
230
340
  force_redo_clean_nan: False # Whether to redo NaN cleaning
231
341
  bypass_append_base_context: False # Whether to skip adding per reference base context additions.
232
342
  force_redo_append_base_context: False # Whether to redo per reference base context additions.
233
- invert_adata: False # Whether to invert the AnnData along the positions axis.
234
343
  bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
235
344
  force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
236
345
  bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
@@ -242,8 +351,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
242
351
  bypass_complexity_analysis: False # Whether to skip complexity analysis
243
352
  force_redo_complexity_analysis: False # Whether to redo complexity analysis
244
353
 
245
- # Pipeline control flow - Basic Analyses
246
- force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
354
+ # Pipeline control flow - Spatial Analyses
355
+ force_redo_spatial_analyses: False # Whether to force redo the entire spatial analysis pipeline from the AnnData
247
356
  bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
248
357
  force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
249
358
  bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
@@ -14,6 +14,9 @@ thresholds:
14
14
  mod_list:
15
15
  - '5mC_5hmC'
16
16
  - '6mA' # mods to detect
17
+ mod_map:
18
+ 5mC_5hmC: 5mC
19
+ 6mA: 6mA
17
20
  mod_target_bases:
18
21
  - "A"
19
22
  enzyme_target_bases:
@@ -24,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
24
27
 
25
28
  ######## smftools preprocess params ########
26
29
  fit_position_methylation_thresholds: False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
27
- binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
30
+ binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
28
31
  positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
29
32
  negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
30
- infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
33
+ infer_on_percentile_sample_methylation_fitting: 5 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
31
34
  inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
32
35
  fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
33
36
  output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
@@ -36,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
36
39
  autocorr_site_types:
37
40
  - "A"
38
41
 
42
+ spatial_clustermap_sortby: "a"
43
+
39
44
  ######## smftools hmm params #########
40
45
  hmm_methbases:
41
- - "A"
46
+ - "A"
47
+
48
+ hmm_merge_layer_features:
49
+ - ["A_all_accessible_features", 60]
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import Dict, List, Any, Iterable, Union
4
+ from typing import Any, Dict, List, Union
5
+
6
+ from smftools.constants import BAM_SUFFIX
7
+
5
8
 
6
9
  def discover_input_files(
7
10
  input_data_path: Union[str, Path],
8
- bam_suffix: str = ".bam",
11
+ bam_suffix: str = BAM_SUFFIX,
9
12
  recursive: bool = False,
10
13
  follow_symlinks: bool = False,
11
14
  ) -> Dict[str, Any]:
@@ -30,10 +33,21 @@ def discover_input_files(
30
33
  bam_suffix = bam_suffix.lower()
31
34
 
32
35
  # Sets of canonical extension keys we’ll compare against
33
- pod5_exts = {".pod5", ".p5"}
36
+ pod5_exts = {".pod5", ".p5"}
34
37
  fast5_exts = {".fast5", ".f5"}
35
- fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
36
- h5ad_exts = {".h5ad", ".h5"}
38
+ fastq_exts = {
39
+ ".fastq",
40
+ ".fq",
41
+ ".fastq.gz",
42
+ ".fq.gz",
43
+ ".fastq.bz2",
44
+ ".fq.bz2",
45
+ ".fastq.xz",
46
+ ".fq.xz",
47
+ ".fastq.zst",
48
+ ".fq.zst",
49
+ }
50
+ h5ad_exts = {".h5ad", ".h5"}
37
51
  compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
38
52
 
39
53
  def ext_key(pp: Path) -> str: