smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,3 +1,13 @@
1
+ # General
2
+ sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
3
+ sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
+ sample_name_col_for_plotting: 'Barcode'
5
+
6
+ # Compute params
7
+ threads: 4
8
+ device: "auto"
9
+
10
+ ######## smftools load params #########
1
11
  # Generic i/o
2
12
  bam_suffix: ".bam"
3
13
  recursive_input_search: True
@@ -7,16 +17,12 @@ strands:
7
17
  - top
8
18
  conversions:
9
19
  - unconverted
10
- sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
11
- sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
12
20
  fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
13
21
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
14
22
  input_already_demuxed: False # If the input files are already demultiplexed.
15
23
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
16
-
17
- # Compute params
18
- threads: 4
19
- device: "auto"
24
+ delete_intermediate_bams: False # Whether to delete intermediate BAM files.
25
+ delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
20
26
 
21
27
  # Sequencing modality and general experiment params
22
28
  smf_modality: 'conversion' # conversion, deaminase, direct
@@ -34,7 +40,8 @@ model: "hac" # needed for dorado basecaller
34
40
  filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
35
41
 
36
42
  # Alignment params
37
- aligner: "minimap2" # Aligner to use: dorado, minimap2
43
+ aligner: "dorado" # Aligner to use: dorado, minimap2
44
+ align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
38
45
  aligner_args:
39
46
  minimap2:
40
47
  ont:
@@ -70,11 +77,11 @@ aligner_args:
70
77
  dorado:
71
78
  ont:
72
79
  - "--mm2-opts"
73
- - "-N"
74
- - "5"
80
+ - "-N 5"
75
81
 
76
82
  # Sorted BAM and BED specific handling
77
83
  make_bigwigs: False # Whether to make coverage bigwigs
84
+ make_beds: False # Whether to make beds from the aligned bams
78
85
 
79
86
  # Nanopore specific demultiplexing
80
87
  barcode_both_ends: False # dorado demultiplexing
@@ -85,47 +92,58 @@ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall w
85
92
  reference_column: 'Reference_strand'
86
93
  sample_column: 'Barcode'
87
94
 
88
- # Preprocessing - Read length, quality, and mapping filtering params
95
+ ######## smftools preprocess params #########
96
+ # Read length, quality, and mapping filtering params
89
97
  read_coord_filter:
90
98
  - null
91
99
  - null
92
100
  read_len_filter_thresholds:
93
- - 200
101
+ - 100
94
102
  - null
95
103
  read_len_to_ref_ratio_filter_thresholds:
96
- - 0.8
104
+ - 0.5
97
105
  - null
98
106
  read_quality_filter_thresholds:
99
- - 20
107
+ - 15
100
108
  - null
101
109
  read_mapping_quality_filter_thresholds:
102
110
  - null
103
111
  - null
104
112
 
105
- # Preprocessing - Read modification filtering params
113
+ # Read modification filtering params
106
114
  read_mod_filtering_gpc_thresholds:
107
115
  - 0.025
108
116
  - 0.975
109
117
  read_mod_filtering_cpg_thresholds:
110
118
  - 0.0
111
119
  - 1.0
112
- read_mod_filtering_any_c_thresholds:
120
+ read_mod_filtering_c_thresholds:
113
121
  - 0.025
114
122
  - 0.975
115
123
  read_mod_filtering_a_thresholds:
116
124
  - 0.025
117
125
  - 0.975
118
126
  read_mod_filtering_use_other_c_as_background: False
119
- min_valid_fraction_positions_in_read_vs_ref: 0.8
127
+ min_valid_fraction_positions_in_read_vs_ref: 0.5
120
128
 
121
- # Preprocessing - Duplicate detection params
129
+ # Plotting params for read length histograms
130
+ obs_to_plot_pp_qc:
131
+ - read_length
132
+ - mapped_length
133
+ - read_quality
134
+ - mapping_quality
135
+ - mapped_length_to_reference_length_ratio
136
+ - mapped_length_to_read_length_ratio
137
+ - Raw_modification_signal
138
+
139
+ # Duplicate detection params
122
140
  duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
123
141
  - "GpC"
124
142
  - "CpG"
125
143
  - "ambiguous_GpC_CpG"
126
144
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
127
145
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
128
- - Fraction_any_C_site_modified
146
+ - Fraction_C_site_modified
129
147
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
130
148
  duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
131
149
  duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
@@ -133,33 +151,43 @@ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicog
133
151
  duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
134
152
  duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
135
153
 
136
- # Preprocessing - Complexity analysis params
154
+ # Position QC params
155
+ position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
137
156
 
138
- # General Plotting params
139
- sample_name_col_for_plotting: 'Barcode'
157
+ ######## smftools spatial params #########
158
+ invert_adata: False # Whether to invert the AnnData along the positions axis.
159
+ # Reindexing params
160
+ reindexing_offsets:
161
+ null : null
162
+ reindexed_var_suffix: "reindexed"
140
163
 
141
- # Basic Analysis - QC Plotting params
164
+ # Spatial Analysis - QC Plotting params
142
165
  rows_per_qc_histogram_grid: 12
143
166
 
144
- # Basic Analysis - Clustermap params
167
+ # Spatial Analysis - Clustermap params
145
168
  layer_for_clustermap_plotting: 'nan0_0minus1'
169
+ clustermap_cmap_c: "coolwarm"
170
+ clustermap_cmap_gpc: "coolwarm"
171
+ clustermap_cmap_cpg: "coolwarm"
172
+ clustermap_cmap_a: "coolwarm"
173
+ spatial_clustermap_sortby: "gpc"
146
174
 
147
- # Basic Analysis - UMAP/Leiden params
175
+ # Spatial Analysis - UMAP/Leiden params
148
176
  layer_for_umap_plotting: 'nan_half'
149
177
  umap_layers_to_plot:
150
178
  - "mapped_length"
151
179
  - "Raw_modification_signal"
152
180
 
153
- # Basic Analysis - Spatial Autocorrelation params
181
+ # Spatial Analysis - Spatial Autocorrelation params
154
182
  rows_per_qc_autocorr_grid: 6
155
183
  autocorr_rolling_window_size: 25
156
184
  autocorr_max_lag: 800
157
185
  autocorr_site_types:
158
186
  - "GpC"
159
187
  - "CpG"
160
- - "any_C"
188
+ - "C"
161
189
 
162
- # Basic Analysis - Correlation Matrix params
190
+ # Spatial Analysis - Correlation Matrix params
163
191
  correlation_matrix_types:
164
192
  - "pearson"
165
193
  - "binary_covariance"
@@ -169,6 +197,7 @@ correlation_matrix_cmaps:
169
197
  correlation_matrix_site_types:
170
198
  - "GpC_site"
171
199
 
200
+ ######## smftools hmm params #########
172
201
  # HMM params
173
202
  hmm_n_states: 2 # Number of HMM states
174
203
  hmm_init_emission_probs:
@@ -197,18 +226,105 @@ hmm_feature_sets:
197
226
  footprint:
198
227
  state: "Non-Modified"
199
228
  features:
200
- small_bound_stretch: [0, 25]
201
- medium_bound_stretch: [25, 80]
202
- putative_nucleosome: [80, 200]
229
+ small_bound_stretch: [6, 40]
230
+ medium_bound_stretch: [40, 100]
231
+ putative_nucleosome: [100, 200]
203
232
  large_bound_stretch: [200, inf]
204
233
  accessible:
205
234
  state: "Modified"
206
235
  features:
207
- small_accessible_patch: [0, 20]
208
- mid_accessible_patch: [20, 100]
209
- large_accessible_patch: [100, inf]
236
+ small_accessible_patch: [3, 20]
237
+ mid_accessible_patch: [20, 40]
238
+ large_accessible_patch: [40, 110]
239
+ nucleosome_depleted_region: [110, inf]
210
240
  hmm_merge_layer_features:
211
241
  - [null, 80]
242
+ clustermap_cmap_hmm: "coolwarm"
243
+ hmm_clustermap_feature_layers:
244
+ - all_accessible_features
245
+ - all_accessible_features_merged
246
+ - small_accessible_patch
247
+ - mid_accessible_patch
248
+ - large_accessible_patch
249
+ - nucleosome_depleted_region
250
+ - small_bound_stretch
251
+ - medium_bound_stretch
252
+ - putative_nucleosome
253
+ - large_bound_stretch
254
+ hmm_clustermap_sortby: "hmm"
255
+ hmm_peak_feature_configs:
256
+ all_accessible_features:
257
+ min_distance: 200 # The minimum distance in between called peaks
258
+ peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
259
+ peak_prominence: 0.1 # The minimum prominence to call a peak
260
+ peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
261
+ rolling_window: 50 # Window size for the rolling average smoothing before peak calling
262
+
263
+ all_accessible_features_merged:
264
+ min_distance: 250
265
+ peak_width: 250
266
+ peak_prominence: 0.05
267
+ peak_threshold: 0.80
268
+ rolling_window: 50
269
+
270
+ small_accessible_patch:
271
+ min_distance: 40
272
+ peak_width: 30
273
+ peak_prominence: 0.1
274
+ peak_threshold: 0.8
275
+ rolling_window: 40
276
+
277
+ mid_accessible_patch:
278
+ min_distance: 100
279
+ peak_width: 60
280
+ peak_prominence: 0.025
281
+ peak_threshold: 0.80
282
+ rolling_window: 50
283
+
284
+ large_accessible_patch:
285
+ min_distance: 100
286
+ peak_width: 100
287
+ peak_prominence: 0.025
288
+ peak_threshold: 0.80
289
+ rolling_window: 50
290
+
291
+ nucleosome_depleted_region:
292
+ min_distance: 200
293
+ peak_width: 200
294
+ peak_prominence: 0.025
295
+ peak_threshold: 0.80
296
+ rolling_window: 50
297
+
298
+ small_bound_stretch:
299
+ min_distance: 20
300
+ peak_width: 20
301
+ peak_prominence: 0.01
302
+ peak_threshold: 0.50
303
+ rolling_window: 10
304
+
305
+ medium_bound_stretch:
306
+ min_distance: 40
307
+ peak_width: 40
308
+ peak_prominence: 0.01
309
+ peak_threshold: 0.50
310
+ rolling_window: 20
311
+
312
+ putative_nucleosome:
313
+ min_distance: 160
314
+ peak_width: 147 # canonical nucleosome footprint
315
+ peak_prominence: 0.025
316
+ peak_threshold: 0.60
317
+ rolling_window: 20
318
+
319
+ large_bound_stretch:
320
+ min_distance: 250
321
+ peak_width: 300
322
+ peak_prominence: 0.20
323
+ peak_threshold: 0.80
324
+ rolling_window: 50
325
+
326
+ # Pipeline control flow - load adata
327
+ force_redo_load_adata: False # Whether to perform load adata command from start
212
328
 
213
329
  # Pipeline control flow - Preprocessing and QC
214
330
  force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
@@ -219,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
219
335
  force_redo_clean_nan: False # Whether to redo NaN cleaning
220
336
  bypass_append_base_context: False # Whether to skip adding per reference base context additions.
221
337
  force_redo_append_base_context: False # Whether to redo per reference base context additions.
222
- invert_adata: False # Whether to invert the AnnData along the positions axis.
223
338
  bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
224
339
  force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
225
340
  bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
@@ -231,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
231
346
  bypass_complexity_analysis: False # Whether to skip complexity analysis
232
347
  force_redo_complexity_analysis: False # Whether to redo complexity analysis
233
348
 
234
- # Pipeline control flow - Basic Analyses
235
- force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
349
+ # Pipeline control flow - Spatial Analyses
350
+ force_redo_spatial_analyses: False # Whether to force redo the entire spatial analysis pipeline from the AnnData
236
351
  bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
237
352
  force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
238
353
  bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
@@ -1,5 +1,7 @@
1
1
  # Direct (Nanopore modified base calling) footprinting defaults
2
2
  extends: default
3
+
4
+ ######## smftools load params #########
3
5
  filter_threshold: 0.8 # min threshold to call a canonical base
4
6
  m6A_threshold: 0.7 # min threshold to call a modified m6a base
5
7
  m5C_threshold: 0.7 # min threshold to call a modified 5mC base
@@ -12,6 +14,31 @@ thresholds:
12
14
  mod_list:
13
15
  - '5mC_5hmC'
14
16
  - '6mA' # mods to detect
17
+ mod_map:
18
+ 5mC_5hmC: 5mC
19
+ 6mA: 6mA
20
+ mod_target_bases:
21
+ - "A"
22
+ enzyme_target_bases:
23
+ - "A"
15
24
  batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
16
25
  skip_unclassified: True # Whether to skip unclassified barcodes
17
- delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
26
+ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata
27
+
28
+ ######## smftools preprocess params ########
29
+ fit_position_methylation_thresholds: False # Whether to use the Youden J-statistic to determine position-by-position thresholds for modification binarization.
30
+ binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
31
+ positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
32
+ negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
33
+ infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
34
+ inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
35
+ fit_j_threshold: 0.5 # The J-statistic threshold to use for determining which positions pass qc for mod detection thresholding
36
+ output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
37
+
38
+ ######## smftools spatial params #########
39
+ autocorr_site_types:
40
+ - "A"
41
+
42
+ ######## smftools hmm params #########
43
+ hmm_methbases:
44
+ - "A"
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, List, Any, Iterable, Union
5
+
6
def discover_input_files(
    input_data_path: Union[str, Path],
    bam_suffix: str = ".bam",
    recursive: bool = False,
    follow_symlinks: bool = False,
) -> Dict[str, Any]:
    """
    Discover input files under `input_data_path` and group them by type.

    Parameters:
        input_data_path: A single file, or a directory to scan.
        bam_suffix: Extension identifying BAM files. A missing leading dot is
            added and the comparison is case-insensitive.
        recursive: If False, only the immediate children of a directory are
            inspected; if True, the whole tree below it is walked.
        follow_symlinks: Only used when `recursive` is True. If True, symlinked
            directories are descended into; if False they are not.
            NOTE: per the `os.walk` documentation, following links can recurse
            without end when a link points at one of its own ancestors.

    Returns a dict with:
        - pod5_paths, fast5_paths, fastq_paths, bam_paths, h5ad_paths,
          other_paths (sorted lists of Path)
        - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam,
          input_is_h5ad (bools, True when the matching list is non-empty)
        - all_files_searched (int, number of regular files categorized)

    Raises:
        FileNotFoundError: if `input_data_path` does not exist.

    Behavior:
        - If `input_data_path` is a file, returns that single file categorized.
        - If a directory, scans immediate children (recursive=False) or the
          entire tree (recursive=True).
        - Handles multi-suffix files like .fastq.gz, .fq.xz, etc.
    """
    # Local import so this block does not depend on the module's import header.
    import os

    p = Path(input_data_path)

    # Normalize the BAM suffix: leading dot, lower case.
    if not bam_suffix.startswith("."):
        bam_suffix = "." + bam_suffix
    bam_suffix = bam_suffix.lower()

    # Canonical (lower-cased) extension keys for each category.
    pod5_exts = {".pod5", ".p5"}
    fast5_exts = {".fast5", ".f5"}
    fastq_exts = {
        ".fastq", ".fq",
        ".fastq.gz", ".fq.gz",
        ".fastq.bz2", ".fq.bz2",
        ".fastq.xz", ".fq.xz",
        ".fastq.zst", ".fq.zst",
    }
    h5ad_exts = {".h5ad", ".h5"}
    compressed_exts = {".gz", ".bz2", ".xz", ".zst"}

    def ext_key(pp: Path) -> str:
        """
        Return the extension key for `pp`: the last suffix, or the last two
        when the final one is a compressor (.gz/.bz2/.xz/.zst).

        Examples:
            a.fastq.gz -> ".fastq.gz"
            a.fq.xz    -> ".fq.xz"
            a.bam      -> ".bam"
            a          -> ""
        """
        suff = [s.lower() for s in pp.suffixes]
        if not suff:
            return ""
        if suff[-1] in compressed_exts and len(suff) >= 2:
            return suff[-2] + suff[-1]
        return suff[-1]

    pod5_paths: List[Path] = []
    fast5_paths: List[Path] = []
    fastq_paths: List[Path] = []
    bam_paths: List[Path] = []
    h5ad_paths: List[Path] = []
    other_paths: List[Path] = []

    def categorize_file(fp: Path) -> None:
        """Append `fp` to the list matching its extension key."""
        key = ext_key(fp)
        if key in pod5_exts:
            pod5_paths.append(fp)
        elif key in fast5_exts:
            fast5_paths.append(fp)
        elif key in fastq_exts:
            fastq_paths.append(fp)
        elif key in h5ad_exts:
            h5ad_paths.append(fp)
        elif key == bam_suffix:
            bam_paths.append(fp)
        else:
            other_paths.append(fp)

    if not p.exists():
        raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")

    total_searched = 0

    if p.is_file():
        total_searched = 1
        categorize_file(p)
    else:
        if recursive:
            # Path.rglob() has no `follow_symlinks` keyword (3.13 added
            # `recurse_symlinks` instead), so passing it always raised
            # TypeError and the flag was silently dropped. os.walk honours the
            # flag on every supported Python version.
            iterator = (
                Path(dirpath) / filename
                for dirpath, _dirnames, filenames in os.walk(p, followlinks=follow_symlinks)
                for filename in filenames
            )
        else:
            iterator = p.iterdir()

        for fp in iterator:
            # Skip directories, broken symlinks and other non-regular entries.
            if not fp.is_file():
                continue
            total_searched += 1
            categorize_file(fp)

    return {
        "pod5_paths": sorted(pod5_paths),
        "fast5_paths": sorted(fast5_paths),
        "fastq_paths": sorted(fastq_paths),
        "bam_paths": sorted(bam_paths),
        "h5ad_paths": sorted(h5ad_paths),
        "other_paths": sorted(other_paths),
        "input_is_pod5": len(pod5_paths) > 0,
        "input_is_fast5": len(fast5_paths) > 0,
        "input_is_fastq": len(fastq_paths) > 0,
        "input_is_bam": len(bam_paths) > 0,
        "input_is_h5ad": len(h5ad_paths) > 0,
        "all_files_searched": total_searched,
    }