smftools 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +48 -0
  3. smftools/cli/hmm_adata.py +168 -145
  4. smftools/cli/load_adata.py +155 -95
  5. smftools/cli/preprocess_adata.py +222 -130
  6. smftools/cli/spatial_adata.py +441 -308
  7. smftools/cli_entry.py +4 -5
  8. smftools/config/conversion.yaml +12 -5
  9. smftools/config/deaminase.yaml +11 -9
  10. smftools/config/default.yaml +123 -19
  11. smftools/config/direct.yaml +3 -0
  12. smftools/config/experiment_config.py +120 -19
  13. smftools/hmm/HMM.py +12 -1
  14. smftools/hmm/__init__.py +0 -6
  15. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  16. smftools/hmm/call_hmm_peaks.py +318 -90
  17. smftools/informatics/bam_functions.py +28 -29
  18. smftools/informatics/h5ad_functions.py +1 -1
  19. smftools/plotting/general_plotting.py +97 -51
  20. smftools/plotting/position_stats.py +3 -3
  21. smftools/preprocessing/__init__.py +2 -4
  22. smftools/preprocessing/append_base_context.py +34 -25
  23. smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
  24. smftools/preprocessing/binarize_on_Youden.py +10 -8
  25. smftools/preprocessing/calculate_complexity_II.py +1 -1
  26. smftools/preprocessing/calculate_coverage.py +16 -13
  27. smftools/preprocessing/calculate_position_Youden.py +41 -25
  28. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  29. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  30. smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
  31. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  32. smftools/preprocessing/invert_adata.py +1 -1
  33. smftools/preprocessing/load_sample_sheet.py +1 -1
  34. smftools/preprocessing/reindex_references_adata.py +37 -0
  35. smftools/readwrite.py +94 -0
  36. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/METADATA +18 -12
  37. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/RECORD +46 -43
  38. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  39. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  40. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  41. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  42. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  43. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  44. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  45. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/entry_points.txt +0 -0
  46. {smftools-0.2.3.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py CHANGED
@@ -4,12 +4,11 @@ from pathlib import Path
4
4
  from typing import Dict, Optional, Sequence
5
5
 
6
6
  from .cli.load_adata import load_adata
7
- from .cli.cli_flows import flow_I
8
7
  from .cli.preprocess_adata import preprocess_adata
9
8
  from .cli.spatial_adata import spatial_adata
10
9
  from .cli.hmm_adata import hmm_adata
11
10
 
12
- from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
11
+ from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
13
12
 
14
13
  @click.group()
15
14
  def cli():
@@ -244,9 +243,9 @@ def concatenate_cmd(
244
243
 
245
244
  Two modes:
246
245
 
247
- smftools concatenate out.h5ad --input-dir ./dir
246
+ smftools concatenate out.h5ad.gz --input-dir ./dir
248
247
 
249
- smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path
248
+ smftools concatenate out.h5ad.gz --csv-path paths.csv --csv-column h5ad_path
250
249
 
251
250
  TXT input also works (one file path per line).
252
251
 
@@ -266,7 +265,7 @@ def concatenate_cmd(
266
265
  delete_inputs=delete,
267
266
  restore_backups=restore,
268
267
  )
269
- click.echo(f"Concatenated file written to: {out}")
268
+ click.echo(f"Concatenated file written to: {out}")
270
269
 
271
270
  except Exception as e:
272
271
  raise click.ClickException(str(e)) from e
@@ -9,6 +9,13 @@ conversion_types:
9
9
  # Read QC Params
10
10
  read_mod_filtering_use_other_c_as_background: True
11
11
 
12
+ # Spatial Analysis - Clustermap params
13
+ layer_for_clustermap_plotting: 'nan0_0minus1'
14
+ clustermap_cmap_c: "coolwarm"
15
+ clustermap_cmap_gpc: "coolwarm"
16
+ clustermap_cmap_cpg: "viridis"
17
+ clustermap_cmap_a: "coolwarm"
18
+
12
19
  ######## smftools hmm params #########
13
20
  # HMM
14
21
  cpg: True # whether to use the default HMM endogenous CpG patch params
@@ -18,17 +25,17 @@ hmm_feature_sets:
18
25
  footprint:
19
26
  state: "Non-Modified"
20
27
  features:
21
- small_bound_stretch: [10, 30]
22
- medium_bound_stretch: [30, 110]
23
- putative_nucleosome: [110, 200]
28
+ small_bound_stretch: [6, 40]
29
+ medium_bound_stretch: [40, 100]
30
+ putative_nucleosome: [100, 200]
24
31
  large_bound_stretch: [200, inf]
25
32
  accessible:
26
33
  state: "Modified"
27
34
  features:
28
35
  small_accessible_patch: [3, 20]
29
36
  mid_accessible_patch: [20, 40]
30
- mid_large_accessible_patch: [40, 130]
31
- large_accessible_patch: [130, inf]
37
+ large_accessible_patch: [40, 110]
38
+ nucleosome_depleted_region: [110, inf]
32
39
  cpg:
33
40
  state: "Modified"
34
41
  features:
@@ -7,6 +7,8 @@ conversion_types:
7
7
 
8
8
  mod_target_bases:
9
9
  - "C"
10
+ enzyme_target_bases:
11
+ - "C"
10
12
 
11
13
  ######## smftools preprocess params #########
12
14
  read_mod_filtering_gpc_thresholds:
@@ -15,7 +17,7 @@ read_mod_filtering_gpc_thresholds:
15
17
  read_mod_filtering_cpg_thresholds:
16
18
  - null
17
19
  - null
18
- read_mod_filtering_any_c_thresholds:
20
+ read_mod_filtering_c_thresholds:
19
21
  - 0.01
20
22
  - 0.99
21
23
  read_mod_filtering_a_thresholds:
@@ -26,16 +28,16 @@ read_mod_filtering_use_other_c_as_background: False
26
28
 
27
29
  # Duplicate Detection Params
28
30
  duplicate_detection_site_types:
29
- - "any_C"
31
+ - "C"
30
32
 
31
33
  ######## smftools analyze params #########
32
34
  # Autocorrelation params
33
35
  autocorr_site_types:
34
- - "any_C"
36
+ - "C"
35
37
 
36
38
  # Correlation matrix params
37
39
  correlation_matrix_site_types:
38
- - "any_C_site"
40
+ - "C_site"
39
41
 
40
42
  # ######## smftools hmm params #########
41
43
  cpg: False # whether to use the default HMM endogenous CpG patch params
@@ -45,17 +47,17 @@ hmm_feature_sets:
45
47
  footprint:
46
48
  state: "Non-Modified"
47
49
  features:
48
- small_bound_stretch: [10, 30]
49
- medium_bound_stretch: [30, 110]
50
- putative_nucleosome: [110, 200]
50
+ small_bound_stretch: [6, 40]
51
+ medium_bound_stretch: [40, 100]
52
+ putative_nucleosome: [100, 200]
51
53
  large_bound_stretch: [200, inf]
52
54
  accessible:
53
55
  state: "Modified"
54
56
  features:
55
57
  small_accessible_patch: [3, 20]
56
58
  mid_accessible_patch: [20, 40]
57
- mid_large_accessible_patch: [40, 130]
58
- large_accessible_patch: [130, inf]
59
+ large_accessible_patch: [40, 110]
60
+ nucleosome_depleted_region: [110, inf]
59
61
 
60
62
  hmm_merge_layer_features:
61
63
  - ["C_all_accessible_features", 80]
@@ -21,7 +21,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
21
21
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
22
22
  input_already_demuxed: False # If the input files are already demultiplexed.
23
23
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
24
- delete_intermediate_bams: True # Whether to delete intermediate BAM files.
24
+ delete_intermediate_bams: False # Whether to delete intermediate BAM files.
25
25
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
26
26
 
27
27
  # Sequencing modality and general experiment params
@@ -40,7 +40,8 @@ model: "hac" # needed for dorado basecaller
40
40
  filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
41
41
 
42
42
  # Alignment params
43
- aligner: "minimap2" # Aligner to use: dorado, minimap2
43
+ aligner: "dorado" # Aligner to use: dorado, minimap2
44
+ align_from_bam: False # Whether to run alignment from a bam file for minimap2. If False, runs alignment from a FASTQ file.
44
45
  aligner_args:
45
46
  minimap2:
46
47
  ont:
@@ -116,7 +117,7 @@ read_mod_filtering_gpc_thresholds:
116
117
  read_mod_filtering_cpg_thresholds:
117
118
  - 0.0
118
119
  - 1.0
119
- read_mod_filtering_any_c_thresholds:
120
+ read_mod_filtering_c_thresholds:
120
121
  - 0.025
121
122
  - 0.975
122
123
  read_mod_filtering_a_thresholds:
@@ -125,6 +126,16 @@ read_mod_filtering_a_thresholds:
125
126
  read_mod_filtering_use_other_c_as_background: False
126
127
  min_valid_fraction_positions_in_read_vs_ref: 0.5
127
128
 
129
+ # Plotting params for read length histograms
130
+ obs_to_plot_pp_qc:
131
+ - read_length
132
+ - mapped_length
133
+ - read_quality
134
+ - mapping_quality
135
+ - mapped_length_to_reference_length_ratio
136
+ - mapped_length_to_read_length_ratio
137
+ - Raw_modification_signal
138
+
128
139
  # Duplicate detection params
129
140
  duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
130
141
  - "GpC"
@@ -132,7 +143,7 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
132
143
  - "ambiguous_GpC_CpG"
133
144
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
134
145
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
135
- - Fraction_any_C_site_modified
146
+ - Fraction_C_site_modified
136
147
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
137
148
  duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
138
149
  duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
@@ -143,29 +154,40 @@ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkag
143
154
  # Position QC params
144
155
  position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
145
156
 
146
- ######## smftools analyze params #########
147
- # Basic Analysis - QC Plotting params
157
+ ######## smftools spatial params #########
158
+ invert_adata: False # Whether to invert the AnnData along the positions axis.
159
+ # Reindexing params
160
+ reindexing_offsets:
161
+ null : null
162
+ reindexed_var_suffix: "reindexed"
163
+
164
+ # Spatial Analysis - QC Plotting params
148
165
  rows_per_qc_histogram_grid: 12
149
166
 
150
- # Basic Analysis - Clustermap params
167
+ # Spatial Analysis - Clustermap params
151
168
  layer_for_clustermap_plotting: 'nan0_0minus1'
169
+ clustermap_cmap_c: "coolwarm"
170
+ clustermap_cmap_gpc: "coolwarm"
171
+ clustermap_cmap_cpg: "coolwarm"
172
+ clustermap_cmap_a: "coolwarm"
173
+ spatial_clustermap_sortby: "gpc"
152
174
 
153
- # Basic Analysis - UMAP/Leiden params
175
+ # Spatial Analysis - UMAP/Leiden params
154
176
  layer_for_umap_plotting: 'nan_half'
155
177
  umap_layers_to_plot:
156
178
  - "mapped_length"
157
179
  - "Raw_modification_signal"
158
180
 
159
- # Basic Analysis - Spatial Autocorrelation params
181
+ # Spatial Analysis - Spatial Autocorrelation params
160
182
  rows_per_qc_autocorr_grid: 6
161
183
  autocorr_rolling_window_size: 25
162
184
  autocorr_max_lag: 800
163
185
  autocorr_site_types:
164
186
  - "GpC"
165
187
  - "CpG"
166
- - "any_C"
188
+ - "C"
167
189
 
168
- # Basic Analysis - Correlation Matrix params
190
+ # Spatial Analysis - Correlation Matrix params
169
191
  correlation_matrix_types:
170
192
  - "pearson"
171
193
  - "binary_covariance"
@@ -204,19 +226,102 @@ hmm_feature_sets:
204
226
  footprint:
205
227
  state: "Non-Modified"
206
228
  features:
207
- small_bound_stretch: [10, 40]
208
- medium_bound_stretch: [40, 110]
209
- putative_nucleosome: [110, 200]
229
+ small_bound_stretch: [6, 40]
230
+ medium_bound_stretch: [40, 100]
231
+ putative_nucleosome: [100, 200]
210
232
  large_bound_stretch: [200, inf]
211
233
  accessible:
212
234
  state: "Modified"
213
235
  features:
214
236
  small_accessible_patch: [3, 20]
215
237
  mid_accessible_patch: [20, 40]
216
- mid_large_accessible_patch: [40, 110]
217
- large_accessible_patch: [110, inf]
238
+ large_accessible_patch: [40, 110]
239
+ nucleosome_depleted_region: [110, inf]
218
240
  hmm_merge_layer_features:
219
241
  - [null, 80]
242
+ clustermap_cmap_hmm: "coolwarm"
243
+ hmm_clustermap_feature_layers:
244
+ - all_accessible_features
245
+ - all_accessible_features_merged
246
+ - small_accessible_patch
247
+ - mid_accessible_patch
248
+ - large_accessible_patch
249
+ - nucleosome_depleted_region
250
+ - small_bound_stretch
251
+ - medium_bound_stretch
252
+ - putative_nucleosome
253
+ - large_bound_stretch
254
+ hmm_clustermap_sortby: "hmm"
255
+ hmm_peak_feature_configs:
256
+ all_accessible_features:
257
+ min_distance: 200 # The minimum distance in between called peaks
258
+ peak_width: 200 # The window width to calculate sum/mean hmm signal per read centered at the peak center.
259
+ peak_prominence: 0.1 # The minimum prominence to call a peak
260
+ peak_threshold: 0.80 # The minimum mean hmm signal in each molecule within the peak window to mark the molecule as positive for the feature.
261
+ rolling_window: 50 # Window size for the rolling average smoothing before peak calling
262
+
263
+ all_accessible_features_merged:
264
+ min_distance: 250
265
+ peak_width: 250
266
+ peak_prominence: 0.05
267
+ peak_threshold: 0.80
268
+ rolling_window: 50
269
+
270
+ small_accessible_patch:
271
+ min_distance: 40
272
+ peak_width: 30
273
+ peak_prominence: 0.1
274
+ peak_threshold: 0.8
275
+ rolling_window: 40
276
+
277
+ mid_accessible_patch:
278
+ min_distance: 100
279
+ peak_width: 60
280
+ peak_prominence: 0.025
281
+ peak_threshold: 0.80
282
+ rolling_window: 50
283
+
284
+ large_accessible_patch:
285
+ min_distance: 100
286
+ peak_width: 100
287
+ peak_prominence: 0.025
288
+ peak_threshold: 0.80
289
+ rolling_window: 50
290
+
291
+ nucleosome_depleted_region:
292
+ min_distance: 200
293
+ peak_width: 200
294
+ peak_prominence: 0.025
295
+ peak_threshold: 0.80
296
+ rolling_window: 50
297
+
298
+ small_bound_stretch:
299
+ min_distance: 20
300
+ peak_width: 20
301
+ peak_prominence: 0.01
302
+ peak_threshold: 0.50
303
+ rolling_window: 10
304
+
305
+ medium_bound_stretch:
306
+ min_distance: 40
307
+ peak_width: 40
308
+ peak_prominence: 0.01
309
+ peak_threshold: 0.50
310
+ rolling_window: 20
311
+
312
+ putative_nucleosome:
313
+ min_distance: 160
314
+ peak_width: 147 # canonical nucleosome footprint
315
+ peak_prominence: 0.025
316
+ peak_threshold: 0.60
317
+ rolling_window: 20
318
+
319
+ large_bound_stretch:
320
+ min_distance: 250
321
+ peak_width: 300
322
+ peak_prominence: 0.20
323
+ peak_threshold: 0.80
324
+ rolling_window: 50
220
325
 
221
326
  # Pipeline control flow - load adata
222
327
  force_redo_load_adata: False # Whether to perform load adata command from start
@@ -230,7 +335,6 @@ bypass_clean_nan: False # Whether to skip NaN cleaning
230
335
  force_redo_clean_nan: False # Whether to redo NaN cleaning
231
336
  bypass_append_base_context: False # Whether to skip adding per reference base context additions.
232
337
  force_redo_append_base_context: False # Whether to redo per reference base context additions.
233
- invert_adata: False # Whether to invert the AnnData along the positions axis.
234
338
  bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
235
339
  force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
236
340
  bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
@@ -242,8 +346,8 @@ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate read
242
346
  bypass_complexity_analysis: False # Whether to skip complexity analysis
243
347
  force_redo_complexity_analysis: False # Whether to redo complexity analysis
244
348
 
245
- # Pipeline control flow - Basic Analyses
246
- force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
349
+ # Pipeline control flow - Spatial Analyses
350
+ force_redo_spatial_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
247
351
  bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
248
352
  force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
249
353
  bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
@@ -14,6 +14,9 @@ thresholds:
14
14
  mod_list:
15
15
  - '5mC_5hmC'
16
16
  - '6mA' # mods to detect
17
+ mod_map:
18
+ 5mC_5hmC: 5mC
19
+ 6mA: 6mA
17
20
  mod_target_bases:
18
21
  - "A"
19
22
  enzyme_target_bases:
@@ -214,7 +214,7 @@ def resolve_aligner_args(
214
214
  return list(default_by_aligner.get(key_align, []))
215
215
 
216
216
 
217
- # HMM default params and hepler functions
217
+ # HMM default params and helper functions
218
218
  def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
219
219
  """
220
220
  Normalize user-provided `hmm_feature_sets` into canonical structure:
@@ -275,6 +275,58 @@ def normalize_hmm_feature_sets(raw: Any) -> Dict[str, dict]:
275
275
  canonical[grp] = {"features": feats, "state": state}
276
276
  return canonical
277
277
 
278
+ def normalize_peak_feature_configs(raw: Any) -> Dict[str, dict]:
279
+ """
280
+ Normalize user-provided `hmm_peak_feature_configs` into:
281
+ {
282
+ layer_name: {
283
+ "min_distance": int,
284
+ "peak_width": int,
285
+ "peak_prominence": float,
286
+ "peak_threshold": float,
287
+ "rolling_window": int,
288
+ },
289
+ ...
290
+ }
291
+
292
+ Accepts dict, JSON/string, None. Returns {} for empty input.
293
+ """
294
+ if raw is None:
295
+ return {}
296
+
297
+ parsed = raw
298
+ if isinstance(raw, str):
299
+ parsed = _try_json_or_literal(raw)
300
+ if not isinstance(parsed, dict):
301
+ return {}
302
+
303
+ defaults = {
304
+ "min_distance": 200,
305
+ "peak_width": 200,
306
+ "peak_prominence": 0.2,
307
+ "peak_threshold": 0.8,
308
+ "rolling_window": 1,
309
+ }
310
+
311
+ out: Dict[str, dict] = {}
312
+ for layer, conf in parsed.items():
313
+ if conf is None:
314
+ conf = {}
315
+ if not isinstance(conf, dict):
316
+ # allow shorthand like 300 -> interpreted as peak_width
317
+ conf = {"peak_width": conf}
318
+
319
+ full = defaults.copy()
320
+ full.update(conf)
321
+ out[str(layer)] = {
322
+ "min_distance": int(full["min_distance"]),
323
+ "peak_width": int(full["peak_width"]),
324
+ "peak_prominence": float(full["peak_prominence"]),
325
+ "peak_threshold": float(full["peak_threshold"]),
326
+ "rolling_window": int(full["rolling_window"]),
327
+ }
328
+ return out
329
+
278
330
 
279
331
  # -------------------------
280
332
  # LoadExperimentConfig
@@ -612,7 +664,7 @@ class ExperimentConfig:
612
664
  fastq_auto_pairing: bool = True
613
665
 
614
666
  # Remove intermediate file options
615
- delete_intermediate_bams: bool = True
667
+ delete_intermediate_bams: bool = False
616
668
  delete_intermediate_tsvs: bool = True
617
669
 
618
670
  # Conversion/Deamination file handling
@@ -647,11 +699,13 @@ class ExperimentConfig:
647
699
  m5C_threshold: float = 0.7
648
700
  hm5C_threshold: float = 0.7
649
701
  thresholds: List[float] = field(default_factory=list)
650
- mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"])
702
+ mod_list: List[str] = field(default_factory=lambda: ["5mC_5hmC", "6mA"]) # Dorado modified basecalling codes
703
+ mod_map: Dict[str, str] = field(default_factory=lambda: {'6mA': '6mA', '5mC_5hmC': '5mC'}) # Map from dorado modified basecalling codes to codes used in modkit_extract_to_adata function
651
704
 
652
705
  # Alignment params
653
706
  mapping_threshold: float = 0.01 # Min threshold for fraction of reads in a sample mapping to a reference in order to include the reference in the anndata
654
- aligner: str = "minimap2"
707
+ align_from_bam: bool = False # Whether minimap2 should align from a bam file as input. If False, aligns from FASTQ
708
+ aligner: str = "dorado"
655
709
  aligner_args: Optional[List[str]] = None
656
710
  make_bigwigs: bool = False
657
711
  make_beds: bool = False
@@ -671,6 +725,10 @@ class ExperimentConfig:
671
725
  read_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [15, None])
672
726
  read_mapping_quality_filter_thresholds: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
673
727
 
728
+ # Preprocessing - Optional reindexing params
729
+ reindexing_offsets: Dict[str, int] = field(default_factory=dict)
730
+ reindexed_var_suffix: Optional[str] = "reindexed"
731
+
674
732
  # Preprocessing - Direct mod detection binarization params
675
733
  fit_position_methylation_thresholds: Optional[bool] = False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
676
734
  binarize_on_fixed_methlyation_threshold: Optional[float] = 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
@@ -684,15 +742,18 @@ class ExperimentConfig:
684
742
  # Preprocessing - Read modification filter params
685
743
  read_mod_filtering_gpc_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
686
744
  read_mod_filtering_cpg_thresholds: List[float] = field(default_factory=lambda: [0.00, 1])
687
- read_mod_filtering_any_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
745
+ read_mod_filtering_c_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
688
746
  read_mod_filtering_a_thresholds: List[float] = field(default_factory=lambda: [0.025, 0.975])
689
747
  read_mod_filtering_use_other_c_as_background: bool = True
690
748
  min_valid_fraction_positions_in_read_vs_ref: float = 0.2
691
749
 
750
+ # Preprocessing - plotting params
751
+ obs_to_plot_pp_qc: List[str] = field(default_factory=lambda: ['read_length', 'mapped_length','read_quality', 'mapping_quality','mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal'])
752
+
692
753
  # Preprocessing - Duplicate detection params
693
754
  duplicate_detection_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'ambiguous_GpC_CpG'])
694
755
  duplicate_detection_distance_threshold: float = 0.07
695
- hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_any_C_site_modified'])
756
+ hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ['Fraction_C_site_modified'])
696
757
  duplicate_detection_keep_best_metric: str ='read_quality'
697
758
  duplicate_detection_window_size_for_hamming_neighbors: int = 50
698
759
  duplicate_detection_min_overlapping_positions: int = 20
@@ -703,20 +764,25 @@ class ExperimentConfig:
703
764
  # Preprocessing - Position QC
704
765
  position_max_nan_threshold: float = 0.1
705
766
 
706
- # Basic Analysis - Clustermap params
767
+ # Spatial Analysis - Clustermap params
707
768
  layer_for_clustermap_plotting: Optional[str] = 'nan0_0minus1'
769
+ clustermap_cmap_c: Optional[str] = 'coolwarm'
770
+ clustermap_cmap_gpc: Optional[str] = 'coolwarm'
771
+ clustermap_cmap_cpg: Optional[str] = 'coolwarm'
772
+ clustermap_cmap_a: Optional[str] = 'coolwarm'
773
+ spatial_clustermap_sortby: Optional[str] = 'gpc'
708
774
 
709
- # Basic Analysis - UMAP/Leiden params
775
+ # Spatial Analysis - UMAP/Leiden params
710
776
  layer_for_umap_plotting: Optional[str] = 'nan_half'
711
777
  umap_layers_to_plot: List[str] = field(default_factory=lambda: ["mapped_length", "Raw_modification_signal"])
712
778
 
713
- # Basic Analysis - Spatial Autocorrelation params
779
+ # Spatial Analysis - Spatial Autocorrelation params
714
780
  rows_per_qc_autocorr_grid: int = 12
715
781
  autocorr_rolling_window_size: int = 25
716
782
  autocorr_max_lag: int = 800
717
- autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'any_C'])
783
+ autocorr_site_types: List[str] = field(default_factory=lambda: ['GpC', 'CpG', 'C'])
718
784
 
719
- # Basic Analysis - Correlation Matrix params
785
+ # Spatial Analysis - Correlation Matrix params
720
786
  correlation_matrix_types: List[str] = field(default_factory=lambda: ["pearson", "binary_covariance"])
721
787
  correlation_matrix_cmaps: List[str] = field(default_factory=lambda: ["seismic", "viridis"])
722
788
  correlation_matrix_site_types: List[str] = field(default_factory=lambda: ["GpC_site"])
@@ -738,6 +804,10 @@ class ExperimentConfig:
738
804
  cpg: Optional[bool] = False
739
805
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
740
806
  hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 80)])
807
+ clustermap_cmap_hmm: Optional[str] = 'coolwarm'
808
+ hmm_clustermap_feature_layers: List[str] = field(default_factory=lambda: ["all_accessible_features"])
809
+ hmm_clustermap_sortby: Optional[str] = 'hmm'
810
+ hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
741
811
 
742
812
  # Pipeline control flow - load adata
743
813
  force_redo_load_adata: bool = False
@@ -763,8 +833,8 @@ class ExperimentConfig:
763
833
  bypass_complexity_analysis: bool = False
764
834
  force_redo_complexity_analysis: bool = False
765
835
 
766
- # Pipeline control flow - Basic Analyses
767
- force_redo_basic_analyses: bool = False
836
+ # Pipeline control flow - Spatial Analyses
837
+ force_redo_spatial_analyses: bool = False
768
838
  bypass_basic_clustermaps: bool = False
769
839
  force_redo_basic_clustermaps: bool = False
770
840
  bypass_basic_umap: bool = False
@@ -930,7 +1000,14 @@ class ExperimentConfig:
930
1000
  input_type = "h5ad"
931
1001
  input_files = found["h5ad_paths"]
932
1002
 
933
- print(f"Found {found['all_files_searched']} files; fastq={len(found["fastq_paths"])}, bam={len(found["bam_paths"])}, pod5={len(found["pod5_paths"])}, fast5={len(found["fast5_paths"])}, , h5ad={len(found["h5ad_paths"])}")
1003
+ print(
1004
+ f"Found {found['all_files_searched']} files; "
1005
+ f"fastq={len(found['fastq_paths'])}, "
1006
+ f"bam={len(found['bam_paths'])}, "
1007
+ f"pod5={len(found['pod5_paths'])}, "
1008
+ f"fast5={len(found['fast5_paths'])}, "
1009
+ f"h5ad={len(found['h5ad_paths'])}"
1010
+ )
934
1011
 
935
1012
  # summary file output path
936
1013
  output_dir = Path(merged['output_directory'])
@@ -981,6 +1058,9 @@ class ExperimentConfig:
981
1058
  if "mod_list" in merged:
982
1059
  merged["mod_list"] = _parse_list(merged.get("mod_list"))
983
1060
 
1061
+ # Preprocessing args
1062
+ obs_to_plot_pp_qc = _parse_list(merged.get("obs_to_plot_pp_qc", None))
1063
+
984
1064
  # HMM feature set handling
985
1065
  if "hmm_feature_sets" in merged:
986
1066
  merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged["hmm_feature_sets"])
@@ -1016,6 +1096,13 @@ class ExperimentConfig:
1016
1096
  hmm_methbases = ['C']
1017
1097
  hmm_methbases = list(hmm_methbases)
1018
1098
  hmm_merge_layer_features = _parse_list(merged.get("hmm_merge_layer_features", None))
1099
+ hmm_clustermap_feature_layers = _parse_list(merged.get("hmm_clustermap_feature_layers", "all_accessible_features"))
1100
+
1101
+ # HMM peak feature configs (for call_hmm_peaks)
1102
+ merged["hmm_peak_feature_configs"] = normalize_peak_feature_configs(
1103
+ merged.get("hmm_peak_feature_configs", {})
1104
+ )
1105
+ hmm_peak_feature_configs = merged.get("hmm_peak_feature_configs", {})
1019
1106
 
1020
1107
  # instantiate dataclass
1021
1108
  instance = cls(
@@ -1047,8 +1134,9 @@ class ExperimentConfig:
1047
1134
  threads = merged.get("threads"),
1048
1135
  sample_sheet_path = merged.get("sample_sheet_path"),
1049
1136
  sample_sheet_mapping_column = merged.get("sample_sheet_mapping_column"),
1050
- delete_intermediate_bams = merged.get("delete_intermediate_bams", True),
1137
+ delete_intermediate_bams = merged.get("delete_intermediate_bams", False),
1051
1138
  delete_intermediate_tsvs = merged.get("delete_intermediate_tsvs", True),
1139
+ align_from_bam = merged.get("align_from_bam", False),
1052
1140
  aligner = merged.get("aligner", "minimap2"),
1053
1141
  aligner_args = merged.get("aligner_args", None),
1054
1142
  device = merged.get("device", "auto"),
@@ -1070,6 +1158,7 @@ class ExperimentConfig:
1070
1158
  reference_column = merged.get("reference_column", 'Reference_strand'),
1071
1159
  sample_column = merged.get("sample_column", 'Barcode'),
1072
1160
  sample_name_col_for_plotting = merged.get("sample_name_col_for_plotting", 'Barcode'),
1161
+ obs_to_plot_pp_qc = obs_to_plot_pp_qc,
1073
1162
  fit_position_methylation_thresholds = merged.get("fit_position_methylation_thresholds", False),
1074
1163
  binarize_on_fixed_methlyation_threshold = merged.get("binarize_on_fixed_methlyation_threshold", 0.7),
1075
1164
  positive_control_sample_methylation_fitting = merged.get("positive_control_sample_methylation_fitting", None),
@@ -1078,14 +1167,21 @@ class ExperimentConfig:
1078
1167
  inference_variable_sample_methylation_fitting = merged.get("inference_variable_sample_methylation_fitting", "Raw_modification_signal"),
1079
1168
  fit_j_threshold = merged.get("fit_j_threshold", 0.5),
1080
1169
  output_binary_layer_name = merged.get("output_binary_layer_name", "binarized_methylation"),
1170
+ reindexing_offsets = merged.get("reindexing_offsets", {None: None}),
1171
+ reindexed_var_suffix = merged.get("reindexed_var_suffix", "reindexed"),
1081
1172
  layer_for_clustermap_plotting = merged.get("layer_for_clustermap_plotting", 'nan0_0minus1'),
1173
+ clustermap_cmap_c = merged.get("clustermap_cmap_c", 'coolwarm'),
1174
+ clustermap_cmap_gpc = merged.get("clustermap_cmap_gpc", 'coolwarm'),
1175
+ clustermap_cmap_cpg = merged.get("clustermap_cmap_cpg", 'coolwarm'),
1176
+ clustermap_cmap_a = merged.get("clustermap_cmap_a", 'coolwarm'),
1177
+ spatial_clustermap_sortby = merged.get("spatial_clustermap_sortby", 'gpc'),
1082
1178
  layer_for_umap_plotting = merged.get("layer_for_umap_plotting", 'nan_half'),
1083
1179
  umap_layers_to_plot = merged.get("umap_layers_to_plot",["mapped_length", 'Raw_modification_signal']),
1084
1180
  rows_per_qc_histogram_grid = merged.get("rows_per_qc_histogram_grid", 12),
1085
1181
  rows_per_qc_autocorr_grid = merged.get("rows_per_qc_autocorr_grid", 12),
1086
1182
  autocorr_rolling_window_size = merged.get("autocorr_rolling_window_size", 25),
1087
1183
  autocorr_max_lag = merged.get("autocorr_max_lag", 800),
1088
- autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'any_C']),
1184
+ autocorr_site_types = merged.get("autocorr_site_types", ['GpC', 'CpG', 'C']),
1089
1185
  hmm_n_states = merged.get("hmm_n_states", 2),
1090
1186
  hmm_init_emission_probs = merged.get("hmm_init_emission_probs",[[0.8, 0.2], [0.2, 0.8]]),
1091
1187
  hmm_init_transition_probs = merged.get("hmm_init_transition_probs",[[0.9, 0.1], [0.1, 0.9]]),
@@ -1099,6 +1195,10 @@ class ExperimentConfig:
1099
1195
  hmm_methbases = hmm_methbases,
1100
1196
  hmm_device = hmm_device,
1101
1197
  hmm_merge_layer_features = hmm_merge_layer_features,
1198
+ clustermap_cmap_hmm = merged.get("clustermap_cmap_hmm", 'coolwarm'),
1199
+ hmm_clustermap_feature_layers = hmm_clustermap_feature_layers,
1200
+ hmm_clustermap_sortby = merged.get("hmm_clustermap_sortby", 'hmm'),
1201
+ hmm_peak_feature_configs = hmm_peak_feature_configs,
1102
1202
  footprints = merged.get("footprints", None),
1103
1203
  accessible_patches = merged.get("accessible_patches", None),
1104
1204
  cpg = merged.get("cpg", None),
@@ -1109,7 +1209,7 @@ class ExperimentConfig:
1109
1209
  read_mapping_quality_filter_thresholds = merged.get("read_mapping_quality_filter_thresholds", [None, None]),
1110
1210
  read_mod_filtering_gpc_thresholds = merged.get("read_mod_filtering_gpc_thresholds", [0.025, 0.975]),
1111
1211
  read_mod_filtering_cpg_thresholds = merged.get("read_mod_filtering_cpg_thresholds", [0.0, 1.0]),
1112
- read_mod_filtering_any_c_thresholds = merged.get("read_mod_filtering_any_c_thresholds", [0.025, 0.975]),
1212
+ read_mod_filtering_c_thresholds = merged.get("read_mod_filtering_c_thresholds", [0.025, 0.975]),
1113
1213
  read_mod_filtering_a_thresholds = merged.get("read_mod_filtering_a_thresholds", [0.025, 0.975]),
1114
1214
  read_mod_filtering_use_other_c_as_background = merged.get("read_mod_filtering_use_other_c_as_background", True),
1115
1215
  min_valid_fraction_positions_in_read_vs_ref = merged.get("min_valid_fraction_positions_in_read_vs_ref", 0.2),
@@ -1125,7 +1225,7 @@ class ExperimentConfig:
1125
1225
  correlation_matrix_types = merged.get("correlation_matrix_types", ["pearson", "binary_covariance"]),
1126
1226
  correlation_matrix_cmaps = merged.get("correlation_matrix_cmaps", ["seismic", "viridis"]),
1127
1227
  correlation_matrix_site_types = merged.get("correlation_matrix_site_types", ["GpC_site"]),
1128
- hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_any_C_site_modified']),
1228
+ hamming_vs_metric_keys = merged.get("hamming_vs_metric_keys", ['Fraction_C_site_modified']),
1129
1229
  force_redo_load_adata = merged.get("force_redo_load_adata", False),
1130
1230
  force_redo_preprocessing = merged.get("force_redo_preprocessing", False),
1131
1231
  force_reload_sample_sheet = merged.get("force_reload_sample_sheet", True),
@@ -1146,7 +1246,7 @@ class ExperimentConfig:
1146
1246
  force_redo_flag_duplicate_reads = merged.get("force_redo_flag_duplicate_reads", False),
1147
1247
  bypass_complexity_analysis = merged.get("bypass_complexity_analysis", False),
1148
1248
  force_redo_complexity_analysis = merged.get("force_redo_complexity_analysis", False),
1149
- force_redo_basic_analyses = merged.get("force_redo_basic_analyses", False),
1249
+ force_redo_spatial_analyses = merged.get("force_redo_spatial_analyses", False),
1150
1250
  bypass_basic_clustermaps = merged.get("bypass_basic_clustermaps", False),
1151
1251
  force_redo_basic_clustermaps = merged.get("force_redo_basic_clustermaps", False),
1152
1252
  bypass_basic_umap = merged.get("bypass_basic_umap", False),
@@ -1198,6 +1298,7 @@ class ExperimentConfig:
1198
1298
  # -------------------------
1199
1299
  # validation & serialization
1200
1300
  # -------------------------
1301
+ @staticmethod
1201
1302
  def _validate_hmm_features_structure(hfs: dict) -> List[str]:
1202
1303
  errs = []
1203
1304
  if not isinstance(hfs, dict):