smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -18,8 +18,9 @@ conversions:
18
18
  fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
19
19
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
20
20
  input_already_demuxed: False # If the input files are already demultiplexed.
21
+
21
22
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
22
- delete_intermediate_bams: True # Whether to delete intermediate BAM files.
23
+ delete_intermediate_bams: False # Whether to delete intermediate BAM files.
23
24
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
24
25
 
25
26
  # Sequencing modality and general experiment params
@@ -77,6 +78,7 @@ aligner_args:
77
78
  # Sorted BAM and BED specific handling
78
79
  make_bigwigs: False # Whether to make coverage bigwigs
79
80
  make_beds: False # Whether to make beds from the aligned bams
81
+ annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
80
82
  samtools_backend: auto # auto|python|cli for samtools-compatible operations
81
83
  bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
82
84
  bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
@@ -90,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
90
92
  reference_column: 'Reference_strand'
91
93
  sample_column: 'Experiment_name_and_barcode'
92
94
 
95
+ # Plotting params
96
+ clustermap_demux_types_to_plot:
97
+ - "single"
98
+ - "double"
99
+ - "already"
100
+
93
101
  ######## smftools preprocess params #########
94
102
  # Read length, quality, and mapping filtering params
95
103
  read_coord_filter:
@@ -102,7 +110,7 @@ read_len_to_ref_ratio_filter_thresholds:
102
110
  - null
103
111
  - null
104
112
  read_quality_filter_thresholds:
105
- - 15
113
+ - 10
106
114
  - null
107
115
  read_mapping_quality_filter_thresholds:
108
116
  - null
@@ -122,7 +130,7 @@ read_mod_filtering_a_thresholds:
122
130
  - 0.025
123
131
  - 0.975
124
132
  read_mod_filtering_use_other_c_as_background: False
125
- min_valid_fraction_positions_in_read_vs_ref: 0.5
133
+ min_valid_fraction_positions_in_read_vs_ref: 0.2
126
134
 
127
135
  # Plotting params for read length histograms
128
136
  obs_to_plot_pp_qc:
@@ -140,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
140
148
  - "CpG"
141
149
  - "ambiguous_GpC_CpG"
142
150
  duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
151
+ duplicate_detection_demux_types_to_use:
152
+ - "single"
153
+ - "double"
154
+ - "already"
143
155
  hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
144
156
  - Fraction_C_site_modified
145
157
  duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
@@ -150,7 +162,13 @@ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical cl
150
162
  duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
151
163
 
152
164
  # Position QC params
153
- position_max_nan_threshold: 0.1 # The maximum amount of nans to tolerate in a column
165
+ position_max_nan_threshold: 0.8 # The maximum amount of nans to tolerate in a column
166
+ mismatch_frequency_range:
167
+ - 0.01
168
+ - 0.99
169
+ mismatch_frequency_layer: "mismatch_integer_encoding"
170
+ mismatch_frequency_read_span_layer: "read_span_mask"
171
+ mismatch_base_frequency_exclude_mod_sites: True
154
172
 
155
173
  ######## smftools spatial params #########
156
174
  invert_adata: False # Whether to invert the AnnData along the positions axis.
@@ -170,9 +188,55 @@ clustermap_cmap_cpg: "coolwarm"
170
188
  clustermap_cmap_a: "coolwarm"
171
189
  spatial_clustermap_sortby: "gpc"
172
190
 
173
- # Spatial Analysis - UMAP/Leiden params
191
+ # Clustermap variant params
192
+ overlay_variant_calls: false
193
+ variant_overlay_seq1_color: "black"
194
+ variant_overlay_seq2_color: "white"
195
+ variant_overlay_marker_size: 4.0
196
+
197
+ # Spatial analysis - Rolling NN Hamming
198
+ rolling_nn_layer: "nan0_0minus1"
199
+ rolling_nn_plot_layer: "nan0_0minus1"
200
+ rolling_nn_plot_layers:
201
+ - "nan0_0minus1"
202
+ - "zero_hamming_distance_spans"
203
+ rolling_nn_window: 10
204
+ rolling_nn_step: 1
205
+ rolling_nn_min_overlap: 8
206
+ rolling_nn_return_fraction: true
207
+ rolling_nn_obsm_key: "rolling_nn_dist"
208
+ rolling_nn_site_types:
209
+ - "GpC"
210
+ - "CpG"
211
+ rolling_nn_write_zero_pairs_csvs: true
212
+ rolling_nn_zero_pairs_uns_key: null
213
+ rolling_nn_zero_pairs_segments_key: null
214
+ rolling_nn_zero_pairs_layer_key: null
215
+ rolling_nn_zero_pairs_refine: true
216
+ rolling_nn_zero_pairs_max_nan_run: 2
217
+ rolling_nn_zero_pairs_merge_gap: 1
218
+ rolling_nn_zero_pairs_max_segments_per_read: 2
219
+ rolling_nn_zero_pairs_max_overlap: 5
220
+ rolling_nn_zero_pairs_layer_overlap_mode: "sum"
221
+ rolling_nn_zero_pairs_layer_overlap_value: null
222
+ rolling_nn_zero_pairs_keep_uns: true
223
+ rolling_nn_zero_pairs_segments_keep_uns: true
224
+ rolling_nn_zero_pairs_top_segments_per_read: 3
225
+ rolling_nn_zero_pairs_top_segments_max_overlap: 5
226
+ rolling_nn_zero_pairs_top_segments_min_span: 300
227
+ rolling_nn_zero_pairs_top_segments_write_csvs: true
228
+ rolling_nn_zero_pairs_segment_histogram_bins: 30
229
+
230
+ # Cross-sample rolling NN analysis
231
+ cross_sample_analysis: true
232
+ cross_sample_grouping_col: null
233
+ cross_sample_random_seed: 42
234
+ delta_hamming_chimeric_span_threshold: 200
235
+
236
+ # Latent Analysis - UMAP/Leiden params
174
237
  layer_for_umap_plotting: 'nan_half'
175
238
  umap_layers_to_plot:
239
+ - "leiden"
176
240
  - "mapped_length"
177
241
  - "Raw_modification_signal"
178
242
 
@@ -243,22 +307,31 @@ hmm_feature_sets:
243
307
  mid_accessible_patch: [20, 40]
244
308
  large_accessible_patch: [40, 110]
245
309
  nucleosome_depleted_region: [110, inf]
310
+ hmm_feature_colormaps:
311
+ small_accessible_patch: "#A5D6A7"
312
+ mid_accessible_patch: "#2E7D32"
313
+ large_accessible_patch: "#006400"
314
+ nucleosome_depleted_region: "#00441B"
315
+ all_accessible_features: "#2E7D32"
316
+ small_bound_stretch: "#1E88E5"
317
+ medium_bound_stretch: "#6A1B9A"
318
+ large_bound_stretch: "#FB8C00"
319
+ putative_nucleosome: "#6D4C41"
320
+ all_footprint_features: "#6A1B9A"
321
+ cpg_patch: "#6D4C41"
246
322
  hmm_merge_layer_features:
247
323
  - ["all_accessible_features", 60]
248
324
  clustermap_cmap_hmm: "coolwarm"
249
325
  hmm_clustermap_feature_layers:
250
- - all_accessible_features
251
326
  - all_accessible_features_merged
252
- - small_accessible_patch
253
- - mid_accessible_patch
254
- - large_accessible_patch
255
- - large_accessible_patch_merged
256
- - nucleosome_depleted_region
257
327
  - nucleosome_depleted_region_merged
258
328
  - small_bound_stretch
259
329
  - medium_bound_stretch
260
330
  - putative_nucleosome
261
- - large_bound_stretch
331
+ - all_footprint_features
332
+ hmm_clustermap_length_layers:
333
+ - all_accessible_features_merged
334
+ - all_footprint_features
262
335
  hmm_clustermap_sortby: "hmm"
263
336
  hmm_peak_feature_configs:
264
337
  all_accessible_features:
@@ -12,6 +12,7 @@ from smftools.constants import (
12
12
  BAM_SUFFIX,
13
13
  BARCODE_BOTH_ENDS,
14
14
  CONVERSIONS,
15
+ LOAD_DIR,
15
16
  MOD_LIST,
16
17
  MOD_MAP,
17
18
  REF_COL,
@@ -664,6 +665,8 @@ class ExperimentConfig:
664
665
  # General I/O
665
666
  input_data_path: Optional[str] = None
666
667
  output_directory: Optional[str] = None
668
+ emit_log_file: Optional[bool] = True
669
+ log_level: Optional[str] = "INFO"
667
670
  fasta: Optional[str] = None
668
671
  bam_suffix: str = BAM_SUFFIX
669
672
  recursive_input_search: bool = True
@@ -736,6 +739,7 @@ class ExperimentConfig:
736
739
  aligner_args: Optional[List[str]] = None
737
740
  make_bigwigs: bool = False
738
741
  make_beds: bool = False
742
+ annotate_secondary_supplementary: bool = True
739
743
  samtools_backend: str = "auto"
740
744
  bedtools_backend: str = "auto"
741
745
  bigwig_backend: str = "auto"
@@ -747,6 +751,9 @@ class ExperimentConfig:
747
751
  # General Plotting
748
752
  sample_name_col_for_plotting: Optional[str] = "Barcode"
749
753
  rows_per_qc_histogram_grid: int = 12
754
+ clustermap_demux_types_to_plot: List[str] = field(
755
+ default_factory=lambda: ["single", "double", "already"]
756
+ )
750
757
 
751
758
  # Preprocessing - Read length and quality filter params
752
759
  read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
@@ -816,6 +823,9 @@ class ExperimentConfig:
816
823
  duplicate_detection_site_types: List[str] = field(
817
824
  default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
818
825
  )
826
+ duplicate_detection_demux_types_to_use: List[str] = field(
827
+ default_factory=lambda: ["single", "double", "already"]
828
+ )
819
829
  duplicate_detection_distance_threshold: float = 0.07
820
830
  hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
821
831
  duplicate_detection_keep_best_metric: str = "read_quality"
@@ -827,6 +837,13 @@ class ExperimentConfig:
827
837
 
828
838
  # Preprocessing - Position QC
829
839
  position_max_nan_threshold: float = 0.1
840
+ mismatch_frequency_range: Sequence[float] = field(default_factory=lambda: [0.05, 0.95])
841
+ mismatch_frequency_layer: str = "mismatch_integer_encoding"
842
+ mismatch_frequency_read_span_layer: str = "read_span_mask"
843
+ mismatch_base_frequency_exclude_mod_sites: bool = False
844
+ references_to_align_for_variant_annotation: List[Optional[str]] = field(
845
+ default_factory=lambda: [None, None]
846
+ )
830
847
 
831
848
  # Spatial Analysis - Clustermap params
832
849
  layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
@@ -835,6 +852,45 @@ class ExperimentConfig:
835
852
  clustermap_cmap_cpg: Optional[str] = "coolwarm"
836
853
  clustermap_cmap_a: Optional[str] = "coolwarm"
837
854
  spatial_clustermap_sortby: Optional[str] = "gpc"
855
+ overlay_variant_calls: bool = False
856
+ variant_overlay_seq1_color: str = "white"
857
+ variant_overlay_seq2_color: str = "black"
858
+ variant_overlay_marker_size: float = 4.0
859
+ rolling_nn_layer: Optional[str] = "nan0_0minus1"
860
+ rolling_nn_plot_layer: Optional[str] = "nan0_0minus1"
861
+ rolling_nn_plot_layers: List[str] = field(
862
+ default_factory=lambda: ["nan0_0minus1", "nan0_0minus1"]
863
+ )
864
+ rolling_nn_window: int = 10
865
+ rolling_nn_step: int = 1
866
+ rolling_nn_min_overlap: int = 8
867
+ rolling_nn_return_fraction: bool = True
868
+ rolling_nn_obsm_key: str = "rolling_nn_dist"
869
+ rolling_nn_site_types: Optional[List[str]] = None
870
+ rolling_nn_write_zero_pairs_csvs: bool = True
871
+ rolling_nn_zero_pairs_uns_key: Optional[str] = None
872
+ rolling_nn_zero_pairs_segments_key: Optional[str] = None
873
+ rolling_nn_zero_pairs_layer_key: Optional[str] = None
874
+ rolling_nn_zero_pairs_refine: bool = True
875
+ rolling_nn_zero_pairs_max_nan_run: Optional[int] = None
876
+ rolling_nn_zero_pairs_merge_gap: int = 0
877
+ rolling_nn_zero_pairs_max_segments_per_read: Optional[int] = None
878
+ rolling_nn_zero_pairs_max_overlap: Optional[int] = None
879
+ rolling_nn_zero_pairs_layer_overlap_mode: str = "binary"
880
+ rolling_nn_zero_pairs_layer_overlap_value: Optional[int] = None
881
+ rolling_nn_zero_pairs_keep_uns: bool = True
882
+ rolling_nn_zero_pairs_segments_keep_uns: bool = True
883
+ rolling_nn_zero_pairs_top_segments_per_read: Optional[int] = None
884
+ rolling_nn_zero_pairs_top_segments_max_overlap: Optional[int] = None
885
+ rolling_nn_zero_pairs_top_segments_min_span: Optional[float] = None
886
+ rolling_nn_zero_pairs_top_segments_write_csvs: bool = True
887
+ rolling_nn_zero_pairs_segment_histogram_bins: int = 30
888
+
889
+ # Cross-sample rolling NN analysis
890
+ cross_sample_analysis: bool = False
891
+ cross_sample_grouping_col: Optional[str] = None
892
+ cross_sample_random_seed: int = 42
893
+ delta_hamming_chimeric_span_threshold: int = 200
838
894
 
839
895
  # Spatial Analysis - UMAP/Leiden params
840
896
  layer_for_umap_plotting: Optional[str] = "nan_half"
@@ -883,11 +939,15 @@ class ExperimentConfig:
883
939
  accessible_patches: Optional[bool] = True
884
940
  cpg: Optional[bool] = False
885
941
  hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
942
+ hmm_feature_colormaps: Dict[str, Any] = field(default_factory=dict)
886
943
  hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
887
944
  clustermap_cmap_hmm: Optional[str] = "coolwarm"
888
945
  hmm_clustermap_feature_layers: List[str] = field(
889
946
  default_factory=lambda: ["all_accessible_features"]
890
947
  )
948
+ hmm_clustermap_length_layers: List[str] = field(
949
+ default_factory=lambda: ["all_accessible_features"]
950
+ )
891
951
  hmm_clustermap_sortby: Optional[str] = "hmm"
892
952
  hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
893
953
 
@@ -906,6 +966,8 @@ class ExperimentConfig:
906
966
  invert_adata: bool = False
907
967
  bypass_append_binary_layer_by_base_context: bool = False
908
968
  force_redo_append_binary_layer_by_base_context: bool = False
969
+ bypass_append_mismatch_frequency_sites: bool = False
970
+ force_redo_append_mismatch_frequency_sites: bool = False
909
971
  bypass_calculate_read_modification_stats: bool = False
910
972
  force_redo_calculate_read_modification_stats: bool = False
911
973
  bypass_filter_reads_on_modification_thresholds: bool = False
@@ -1110,7 +1172,7 @@ class ExperimentConfig:
1110
1172
 
1111
1173
  # Demultiplexing output path
1112
1174
  split_dir = merged.get("split_dir", SPLIT_DIR)
1113
- split_path = output_dir / split_dir
1175
+ split_path = output_dir / LOAD_DIR / split_dir
1114
1176
 
1115
1177
  # final normalization
1116
1178
  if "strands" in merged:
@@ -1121,6 +1183,10 @@ class ExperimentConfig:
1121
1183
  merged["mod_target_bases"] = _parse_list(merged["mod_target_bases"])
1122
1184
  if "conversion_types" in merged:
1123
1185
  merged["conversion_types"] = _parse_list(merged["conversion_types"])
1186
+ if "references_to_align_for_variant_annotation" in merged:
1187
+ merged["references_to_align_for_variant_annotation"] = _parse_list(
1188
+ merged["references_to_align_for_variant_annotation"]
1189
+ )
1124
1190
 
1125
1191
  merged["filter_threshold"] = float(_parse_numeric(merged.get("filter_threshold", 0.8), 0.8))
1126
1192
  merged["m6A_threshold"] = float(_parse_numeric(merged.get("m6A_threshold", 0.7), 0.7))
@@ -1197,6 +1263,9 @@ class ExperimentConfig:
1197
1263
  # Final normalization of hmm_feature_sets and canonical local variables
1198
1264
  merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged.get("hmm_feature_sets", {}))
1199
1265
  hmm_feature_sets = merged.get("hmm_feature_sets", {})
1266
+ hmm_feature_colormaps = merged.get("hmm_feature_colormaps", {})
1267
+ if not isinstance(hmm_feature_colormaps, dict):
1268
+ hmm_feature_colormaps = {}
1200
1269
  hmm_annotation_threshold = merged.get("hmm_annotation_threshold", 0.5)
1201
1270
  hmm_batch_size = int(merged.get("hmm_batch_size", 1024))
1202
1271
  hmm_use_viterbi = bool(merged.get("hmm_use_viterbi", False))
@@ -1211,6 +1280,9 @@ class ExperimentConfig:
1211
1280
  hmm_clustermap_feature_layers = _parse_list(
1212
1281
  merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
1213
1282
  )
1283
+ hmm_clustermap_length_layers = _parse_list(
1284
+ merged.get("hmm_clustermap_length_layers", hmm_clustermap_feature_layers)
1285
+ )
1214
1286
 
1215
1287
  hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
1216
1288
  hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
@@ -1231,6 +1303,7 @@ class ExperimentConfig:
1231
1303
 
1232
1304
  # instantiate dataclass
1233
1305
  instance = cls(
1306
+ annotate_secondary_supplementary=merged.get("annotate_secondary_supplementary", True),
1234
1307
  smf_modality=merged.get("smf_modality"),
1235
1308
  input_data_path=input_data_path,
1236
1309
  recursive_input_search=merged.get("recursive_input_search"),
@@ -1257,6 +1330,8 @@ class ExperimentConfig:
1257
1330
  trim=merged.get("trim", TRIM),
1258
1331
  input_already_demuxed=merged.get("input_already_demuxed", False),
1259
1332
  threads=merged.get("threads"),
1333
+ emit_log_file=merged.get("emit_log_file", True),
1334
+ log_level=merged.get("log_level", "INFO"),
1260
1335
  sample_sheet_path=merged.get("sample_sheet_path"),
1261
1336
  sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
1262
1337
  delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
@@ -1313,6 +1388,9 @@ class ExperimentConfig:
1313
1388
  ),
1314
1389
  reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
1315
1390
  reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
1391
+ clustermap_demux_types_to_plot=merged.get(
1392
+ "clustermap_demux_types_to_plot", ["single", "double", "already"]
1393
+ ),
1316
1394
  layer_for_clustermap_plotting=merged.get(
1317
1395
  "layer_for_clustermap_plotting", "nan0_0minus1"
1318
1396
  ),
@@ -1321,6 +1399,65 @@ class ExperimentConfig:
1321
1399
  clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
1322
1400
  clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
1323
1401
  spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
1402
+ overlay_variant_calls=_parse_bool(merged.get("overlay_variant_calls", False)),
1403
+ variant_overlay_seq1_color=merged.get("variant_overlay_seq1_color", "white"),
1404
+ variant_overlay_seq2_color=merged.get("variant_overlay_seq2_color", "black"),
1405
+ variant_overlay_marker_size=float(merged.get("variant_overlay_marker_size", 4.0)),
1406
+ rolling_nn_layer=merged.get("rolling_nn_layer", "nan0_0minus1"),
1407
+ rolling_nn_plot_layer=merged.get("rolling_nn_plot_layer", "nan0_0minus1"),
1408
+ rolling_nn_plot_layers=merged.get(
1409
+ "rolling_nn_plot_layers", ["nan0_0minus1", "nan0_0minus1"]
1410
+ ),
1411
+ rolling_nn_window=merged.get("rolling_nn_window", 15),
1412
+ rolling_nn_step=merged.get("rolling_nn_step", 2),
1413
+ rolling_nn_min_overlap=merged.get("rolling_nn_min_overlap", 10),
1414
+ rolling_nn_return_fraction=merged.get("rolling_nn_return_fraction", True),
1415
+ rolling_nn_obsm_key=merged.get("rolling_nn_obsm_key", "rolling_nn_dist"),
1416
+ rolling_nn_site_types=merged.get("rolling_nn_site_types", None),
1417
+ rolling_nn_write_zero_pairs_csvs=merged.get("rolling_nn_write_zero_pairs_csvs", True),
1418
+ rolling_nn_zero_pairs_uns_key=merged.get("rolling_nn_zero_pairs_uns_key", None),
1419
+ rolling_nn_zero_pairs_segments_key=merged.get(
1420
+ "rolling_nn_zero_pairs_segments_key", None
1421
+ ),
1422
+ rolling_nn_zero_pairs_layer_key=merged.get("rolling_nn_zero_pairs_layer_key", None),
1423
+ rolling_nn_zero_pairs_refine=merged.get("rolling_nn_zero_pairs_refine", True),
1424
+ rolling_nn_zero_pairs_max_nan_run=merged.get("rolling_nn_zero_pairs_max_nan_run", None),
1425
+ rolling_nn_zero_pairs_merge_gap=merged.get("rolling_nn_zero_pairs_merge_gap", 0),
1426
+ rolling_nn_zero_pairs_max_segments_per_read=merged.get(
1427
+ "rolling_nn_zero_pairs_max_segments_per_read", None
1428
+ ),
1429
+ rolling_nn_zero_pairs_max_overlap=merged.get("rolling_nn_zero_pairs_max_overlap", None),
1430
+ rolling_nn_zero_pairs_layer_overlap_mode=merged.get(
1431
+ "rolling_nn_zero_pairs_layer_overlap_mode", "binary"
1432
+ ),
1433
+ rolling_nn_zero_pairs_layer_overlap_value=merged.get(
1434
+ "rolling_nn_zero_pairs_layer_overlap_value", None
1435
+ ),
1436
+ rolling_nn_zero_pairs_keep_uns=merged.get("rolling_nn_zero_pairs_keep_uns", True),
1437
+ rolling_nn_zero_pairs_segments_keep_uns=merged.get(
1438
+ "rolling_nn_zero_pairs_segments_keep_uns", True
1439
+ ),
1440
+ rolling_nn_zero_pairs_top_segments_per_read=merged.get(
1441
+ "rolling_nn_zero_pairs_top_segments_per_read", None
1442
+ ),
1443
+ rolling_nn_zero_pairs_top_segments_max_overlap=merged.get(
1444
+ "rolling_nn_zero_pairs_top_segments_max_overlap", None
1445
+ ),
1446
+ rolling_nn_zero_pairs_top_segments_min_span=merged.get(
1447
+ "rolling_nn_zero_pairs_top_segments_min_span", None
1448
+ ),
1449
+ rolling_nn_zero_pairs_top_segments_write_csvs=merged.get(
1450
+ "rolling_nn_zero_pairs_top_segments_write_csvs", True
1451
+ ),
1452
+ rolling_nn_zero_pairs_segment_histogram_bins=merged.get(
1453
+ "rolling_nn_zero_pairs_segment_histogram_bins", 30
1454
+ ),
1455
+ cross_sample_analysis=merged.get("cross_sample_analysis", False),
1456
+ cross_sample_grouping_col=merged.get("cross_sample_grouping_col", None),
1457
+ cross_sample_random_seed=merged.get("cross_sample_random_seed", 42),
1458
+ delta_hamming_chimeric_span_threshold=merged.get(
1459
+ "delta_hamming_chimeric_span_threshold", 200
1460
+ ),
1324
1461
  layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
1325
1462
  umap_layers_to_plot=merged.get(
1326
1463
  "umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
@@ -1347,6 +1484,7 @@ class ExperimentConfig:
1347
1484
  hmm_emission_adapt_tol=hmm_emission_adapt_tol,
1348
1485
  hmm_dtype=merged.get("hmm_dtype", "float64"),
1349
1486
  hmm_feature_sets=hmm_feature_sets,
1487
+ hmm_feature_colormaps=hmm_feature_colormaps,
1350
1488
  hmm_annotation_threshold=hmm_annotation_threshold,
1351
1489
  hmm_batch_size=hmm_batch_size,
1352
1490
  hmm_use_viterbi=hmm_use_viterbi,
@@ -1355,6 +1493,7 @@ class ExperimentConfig:
1355
1493
  hmm_merge_layer_features=hmm_merge_layer_features,
1356
1494
  clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
1357
1495
  hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
1496
+ hmm_clustermap_length_layers=hmm_clustermap_length_layers,
1358
1497
  hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
1359
1498
  hmm_peak_feature_configs=hmm_peak_feature_configs,
1360
1499
  footprints=merged.get("footprints", None),
@@ -1390,6 +1529,9 @@ class ExperimentConfig:
1390
1529
  duplicate_detection_site_types=merged.get(
1391
1530
  "duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
1392
1531
  ),
1532
+ duplicate_detection_demux_types_to_use=merged.get(
1533
+ "duplicate_detection_demux_types_to_use", ["single", "double", "already"]
1534
+ ),
1393
1535
  duplicate_detection_distance_threshold=merged.get(
1394
1536
  "duplicate_detection_distance_threshold", 0.07
1395
1537
  ),
@@ -1479,6 +1621,9 @@ class ExperimentConfig:
1479
1621
  force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
1480
1622
  bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
1481
1623
  force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
1624
+ references_to_align_for_variant_annotation=merged.get(
1625
+ "references_to_align_for_variant_annotation", [None, None]
1626
+ ),
1482
1627
  config_source=config_source or "<var_dict>",
1483
1628
  )
1484
1629
 
smftools/constants.py CHANGED
@@ -21,7 +21,34 @@ BAM_SUFFIX: Final[str] = ".bam"
21
21
  BARCODE_BOTH_ENDS: Final[bool] = False
22
22
  REF_COL: Final[str] = "Reference_strand"
23
23
  SAMPLE_COL: Final[str] = "Experiment_name_and_barcode"
24
+ SAMPLE: Final[str] = "Sample"
24
25
  SPLIT_DIR: Final[str] = "demultiplexed_BAMs"
26
+ H5_DIR: Final[str] = "h5ads"
27
+ DEMUX_TYPE: Final[str] = "demux_type"
28
+ BARCODE: Final[str] = "Barcode"
29
+ REFERENCE: Final[str] = "Reference"
30
+ REFERENCE_STRAND: Final[str] = "Reference_strand"
31
+ REFERENCE_DATASET_STRAND: Final[str] = "Reference_dataset_strand"
32
+ STRAND: Final[str] = "Strand"
33
+ DATASET: Final[str] = "Dataset"
34
+ READ_MISMATCH_TREND: Final[str] = "Read_mismatch_trend"
35
+ READ_MAPPING_DIRECTION: Final[str] = "Read_mapping_direction"
36
+ SEQUENCE_INTEGER_ENCODING: Final[str] = "sequence_integer_encoding"
37
+ SEQUENCE_INTEGER_DECODING: Final[str] = "sequence_integer_decoding"
38
+ MISMATCH_INTEGER_ENCODING: Final[str] = "mismatch_integer_encoding"
39
+ BASE_QUALITY_SCORES: Final[str] = "base_quality_scores"
40
+ READ_SPAN_MASK: Final[str] = "read_span_mask"
41
+
42
+ LOAD_DIR: Final[str] = "load_adata_outputs"
43
+ PREPROCESS_DIR: Final[str] = "preprocess_adata_outputs"
44
+ SPATIAL_DIR: Final[str] = "spatial_adata_outputs"
45
+ HMM_DIR: Final[str] = "hmm_adata_outputs"
46
+ LATENT_DIR: Final[str] = "latent_adata_outputs"
47
+ VARIANT_DIR: Final[str] = "variant_adata_outputs"
48
+ CHIMERIC_DIR: Final[str] = "chimeric_adata_outputs"
49
+
50
+ LOGGING_DIR: Final[str] = "logs"
51
+
25
52
  TRIM: Final[bool] = False
26
53
 
27
54
  _private_conversions = ["unconverted"]
@@ -35,3 +62,45 @@ MOD_MAP: Final[Mapping[str, str]] = _deep_freeze(_private_mod_map)
35
62
 
36
63
  _private_strands = ("bottom", "top")
37
64
  STRANDS: Final[tuple[str, ...]] = _deep_freeze(_private_strands)
65
+
66
+ MODKIT_EXTRACT_TSV_COLUMN_CHROM: Final[str] = "chrom"
67
+ MODKIT_EXTRACT_TSV_COLUMN_REF_POSITION: Final[str] = "ref_position"
68
+ MODKIT_EXTRACT_TSV_COLUMN_MODIFIED_PRIMARY_BASE: Final[str] = "modified_primary_base"
69
+ MODKIT_EXTRACT_TSV_COLUMN_REF_STRAND: Final[str] = "ref_strand"
70
+ MODKIT_EXTRACT_TSV_COLUMN_READ_ID: Final[str] = "read_id"
71
+ MODKIT_EXTRACT_TSV_COLUMN_CALL_CODE: Final[str] = "call_code"
72
+ MODKIT_EXTRACT_TSV_COLUMN_CALL_PROB: Final[str] = "call_prob"
73
+
74
+ MODKIT_EXTRACT_MODIFIED_BASE_A: Final[str] = "A"
75
+ MODKIT_EXTRACT_MODIFIED_BASE_C: Final[str] = "C"
76
+ MODKIT_EXTRACT_REF_STRAND_PLUS: Final[str] = "+"
77
+ MODKIT_EXTRACT_REF_STRAND_MINUS: Final[str] = "-"
78
+
79
+ _private_modkit_extract_call_code_modified = ("a", "h", "m")
80
+ MODKIT_EXTRACT_CALL_CODE_MODIFIED: Final[tuple[str, ...]] = _deep_freeze(
81
+ _private_modkit_extract_call_code_modified
82
+ )
83
+ _private_modkit_extract_call_code_canonical = ("-",)
84
+ MODKIT_EXTRACT_CALL_CODE_CANONICAL: Final[tuple[str, ...]] = _deep_freeze(
85
+ _private_modkit_extract_call_code_canonical
86
+ )
87
+
88
+ MODKIT_EXTRACT_SEQUENCE_BASES: Final[tuple[str, ...]] = _deep_freeze(("A", "C", "G", "T", "N"))
89
+ MODKIT_EXTRACT_SEQUENCE_PADDING_BASE: Final[str] = "PAD"
90
+ _private_modkit_extract_base_to_int: Dict[str, int] = {
91
+ "A": 0,
92
+ "C": 1,
93
+ "G": 2,
94
+ "T": 3,
95
+ "N": 4,
96
+ "PAD": 5,
97
+ }
98
+ MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT: Final[Mapping[str, int]] = _deep_freeze(
99
+ _private_modkit_extract_base_to_int
100
+ )
101
+ _private_modkit_extract_int_to_base: Dict[int, str] = {
102
+ value: key for key, value in _private_modkit_extract_base_to_int.items()
103
+ }
104
+ MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE: Final[Mapping[int, str]] = _deep_freeze(
105
+ _private_modkit_extract_int_to_base
106
+ )
smftools/hmm/HMM.py CHANGED
@@ -144,6 +144,83 @@ def _safe_int_coords(var_names) -> Tuple[np.ndarray, bool]:
144
144
  return np.arange(len(var_names), dtype=int), False
145
145
 
146
146
 
147
def mask_layers_outside_read_span(
    adata,
    layers: Sequence[str],
    *,
    start_key: str = "reference_start",
    end_key: str = "reference_end",
    use_original_var_names: bool = True,
) -> List[str]:
    """Mask layer values outside read reference spans with NaN.

    Coordinates are taken from ``adata.var["Original_var_names"]`` when that
    column is present, has one entry per variable, parses as numeric and holds
    at least one finite value (and ``use_original_var_names`` is true);
    otherwise ``adata.var_names`` is used. For every read, positions whose
    coordinate is strictly less than ``int(start)`` or strictly greater than
    ``int(end)`` are set to NaN. Reads with a non-finite start or end are left
    untouched.

    All requested layers are validated before any of them is modified, so a
    missing layer never leaves ``adata`` partially masked.

    Args:
        adata: AnnData object to modify in-place.
        layers: Layer names to mask. A single layer name given as a plain
            string is treated as a one-element list.
        start_key: obs column holding reference start positions.
        end_key: obs column holding reference end positions.
        use_original_var_names: Use ``adata.var["Original_var_names"]`` when available.

    Returns:
        List of layer names that were masked (empty when ``layers`` is empty).

    Raises:
        KeyError: If ``start_key``/``end_key`` is missing from ``adata.obs`` or a
            requested layer is missing from ``adata.layers``.
        ValueError: If the coordinate source length differs from ``adata.n_vars``
            or the start/end columns are not numeric.
    """
    if layers is None:
        return []
    # A bare string would otherwise be iterated character by character.
    layer_names = [layers] if isinstance(layers, str) else list(layers)
    if not layer_names:
        return []

    if start_key not in adata.obs or end_key not in adata.obs:
        raise KeyError(f"Missing {start_key!r} or {end_key!r} in adata.obs.")

    # Validate every layer up front so a failure cannot leave some layers
    # already masked and others untouched.
    for name in layer_names:
        if name not in adata.layers:
            raise KeyError(f"Layer {name!r} not found in adata.layers.")

    coord_source = adata.var_names
    if use_original_var_names and "Original_var_names" in adata.var:
        orig = np.asarray(adata.var["Original_var_names"])
        if orig.size == adata.n_vars:
            try:
                orig_numeric = np.asarray(orig, dtype=float)
            except (TypeError, ValueError):
                orig_numeric = None
            if orig_numeric is not None and np.isfinite(orig_numeric).any():
                coord_source = orig

    coords, _ = _safe_int_coords(coord_source)
    if coords.shape[0] != adata.n_vars:
        raise ValueError("Coordinate source length does not match adata.n_vars.")

    try:
        starts = np.asarray(adata.obs[start_key], dtype=float)
        ends = np.asarray(adata.obs[end_key], dtype=float)
    except (TypeError, ValueError) as exc:
        raise ValueError("Start/end positions must be numeric.") from exc

    # The outside-span mask depends only on coordinates and read spans, not on
    # the layer, so build it once (n_obs x n_vars booleans) and reuse it.
    span_known = np.isfinite(starts) & np.isfinite(ends)
    # np.trunc mirrors int(): truncation toward zero. Non-finite entries are
    # replaced by 0 only to keep the comparison well-defined; those rows are
    # excluded through ``span_known`` below.
    first = np.where(span_known, np.trunc(starts), 0.0)[:, np.newaxis]
    last = np.where(span_known, np.trunc(ends), 0.0)[:, np.newaxis]
    position = coords[np.newaxis, :]
    outside = ((position < first) | (position > last)) & span_known[:, np.newaxis]

    masked: List[str] = []
    for name in layer_names:
        arr = np.asarray(adata.layers[name])
        if not np.issubdtype(arr.dtype, np.floating):
            # NaN cannot be stored in integer/bool arrays; work on a float copy.
            arr = arr.astype(float, copy=True)
        arr[outside] = np.nan
        adata.layers[name] = arr
        masked.append(name)

    return masked
222
+
223
+
147
224
  def _logsumexp(x: torch.Tensor, dim: int) -> torch.Tensor:
148
225
  """Compute log-sum-exp in a numerically stable way.
149
226
 
@@ -1064,6 +1141,8 @@ class BaseHMM(nn.Module):
1064
1141
  uns_key: str = "hmm_appended_layers",
1065
1142
  uns_flag: str = "hmm_annotated",
1066
1143
  force_redo: bool = False,
1144
+ mask_to_read_span: bool = True,
1145
+ mask_use_original_var_names: bool = True,
1067
1146
  device: Optional[Union[str, torch.device]] = None,
1068
1147
  **kwargs,
1069
1148
  ):
@@ -1085,6 +1164,8 @@ class BaseHMM(nn.Module):
1085
1164
  uns_key: .uns key to track appended layers.
1086
1165
  uns_flag: .uns flag to mark annotations.
1087
1166
  force_redo: Whether to overwrite existing layers.
1167
+ mask_to_read_span: Whether to mask appended layers outside read spans.
1168
+ mask_use_original_var_names: Use ``adata.var["Original_var_names"]`` when available.
1088
1169
  device: Device specifier.
1089
1170
  **kwargs: Additional parameters for specialized workflows.
1090
1171
 
@@ -1245,6 +1326,13 @@ class BaseHMM(nn.Module):
1245
1326
  np.asarray(adata.layers[nm])
1246
1327
  )
1247
1328
 
1329
+ if mask_to_read_span and appended:
1330
+ mask_layers_outside_read_span(
1331
+ adata,
1332
+ appended,
1333
+ use_original_var_names=mask_use_original_var_names,
1334
+ )
1335
+
1248
1336
  adata.uns[uns_key] = appended
1249
1337
  adata.uns[uns_flag] = True
1250
1338
  return None
@@ -51,7 +51,7 @@ def call_hmm_peaks(
51
51
  raise KeyError(f"obs column '{ref_column}' not found")
52
52
 
53
53
  # Ensure categorical for predictable ref iteration
54
- if not pd.api.types.is_categorical_dtype(adata.obs[ref_column]):
54
+ if not isinstance(adata.obs[ref_column].dtype, pd.CategoricalDtype):
55
55
  adata.obs[ref_column] = adata.obs[ref_column].astype("category")
56
56
 
57
57
  # Optional: drop duplicate obs columns once to avoid Pandas/AnnData view quirks