smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/config/default.yaml
CHANGED
|
@@ -18,8 +18,9 @@ conversions:
|
|
|
18
18
|
fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
|
|
19
19
|
fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
|
|
20
20
|
input_already_demuxed: False # If the input files are already demultiplexed.
|
|
21
|
+
|
|
21
22
|
delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
|
|
22
|
-
delete_intermediate_bams:
|
|
23
|
+
delete_intermediate_bams: False # Whether to delete intermediate BAM files.
|
|
23
24
|
delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
|
|
24
25
|
|
|
25
26
|
# Sequencing modality and general experiment params
|
|
@@ -77,6 +78,7 @@ aligner_args:
|
|
|
77
78
|
# Sorted BAM and BED specific handling
|
|
78
79
|
make_bigwigs: False # Whether to make coverage bigwigs
|
|
79
80
|
make_beds: False # Whether to make beds from the aligned bams
|
|
81
|
+
annotate_secondary_supplementary: True # Whether to annotate reads with secondary/supplementary alignments from the aligned BAM
|
|
80
82
|
samtools_backend: auto # auto|python|cli for samtools-compatible operations
|
|
81
83
|
bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
|
|
82
84
|
bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
|
|
@@ -90,6 +92,12 @@ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall w
|
|
|
90
92
|
reference_column: 'Reference_strand'
|
|
91
93
|
sample_column: 'Experiment_name_and_barcode'
|
|
92
94
|
|
|
95
|
+
# Plotting params
|
|
96
|
+
clustermap_demux_types_to_plot:
|
|
97
|
+
- "single"
|
|
98
|
+
- "double"
|
|
99
|
+
- "already"
|
|
100
|
+
|
|
93
101
|
######## smftools preprocess params #########
|
|
94
102
|
# Read length, quality, and mapping filtering params
|
|
95
103
|
read_coord_filter:
|
|
@@ -102,7 +110,7 @@ read_len_to_ref_ratio_filter_thresholds:
|
|
|
102
110
|
- null
|
|
103
111
|
- null
|
|
104
112
|
read_quality_filter_thresholds:
|
|
105
|
-
-
|
|
113
|
+
- 10
|
|
106
114
|
- null
|
|
107
115
|
read_mapping_quality_filter_thresholds:
|
|
108
116
|
- null
|
|
@@ -122,7 +130,7 @@ read_mod_filtering_a_thresholds:
|
|
|
122
130
|
- 0.025
|
|
123
131
|
- 0.975
|
|
124
132
|
read_mod_filtering_use_other_c_as_background: False
|
|
125
|
-
min_valid_fraction_positions_in_read_vs_ref: 0.
|
|
133
|
+
min_valid_fraction_positions_in_read_vs_ref: 0.2
|
|
126
134
|
|
|
127
135
|
# Plotting params for read length histograms
|
|
128
136
|
obs_to_plot_pp_qc:
|
|
@@ -140,6 +148,10 @@ duplicate_detection_site_types: # Site types to consider for duplicate detection
|
|
|
140
148
|
- "CpG"
|
|
141
149
|
- "ambiguous_GpC_CpG"
|
|
142
150
|
duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
|
|
151
|
+
duplicate_detection_demux_types_to_use:
|
|
152
|
+
- "single"
|
|
153
|
+
- "double"
|
|
154
|
+
- "already"
|
|
143
155
|
hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
|
|
144
156
|
- Fraction_C_site_modified
|
|
145
157
|
duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
|
|
@@ -150,7 +162,13 @@ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical cl
|
|
|
150
162
|
duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
|
|
151
163
|
|
|
152
164
|
# Position QC params
|
|
153
|
-
position_max_nan_threshold: 0.
|
|
165
|
+
position_max_nan_threshold: 0.8 # The maximum amount of nans to tolerate in a column
|
|
166
|
+
mismatch_frequency_range:
|
|
167
|
+
- 0.01
|
|
168
|
+
- 0.99
|
|
169
|
+
mismatch_frequency_layer: "mismatch_integer_encoding"
|
|
170
|
+
mismatch_frequency_read_span_layer: "read_span_mask"
|
|
171
|
+
mismatch_base_frequency_exclude_mod_sites: True
|
|
154
172
|
|
|
155
173
|
######## smftools spatial params #########
|
|
156
174
|
invert_adata: False # Whether to invert the AnnData along the positions axis.
|
|
@@ -170,9 +188,55 @@ clustermap_cmap_cpg: "coolwarm"
|
|
|
170
188
|
clustermap_cmap_a: "coolwarm"
|
|
171
189
|
spatial_clustermap_sortby: "gpc"
|
|
172
190
|
|
|
173
|
-
#
|
|
191
|
+
# Clustermap variant params
|
|
192
|
+
overlay_variant_calls: false
|
|
193
|
+
variant_overlay_seq1_color: "black"
|
|
194
|
+
variant_overlay_seq2_color: "white"
|
|
195
|
+
variant_overlay_marker_size: 4.0
|
|
196
|
+
|
|
197
|
+
# Spatial analysis - Rolling NN Hamming
|
|
198
|
+
rolling_nn_layer: "nan0_0minus1"
|
|
199
|
+
rolling_nn_plot_layer: "nan0_0minus1"
|
|
200
|
+
rolling_nn_plot_layers:
|
|
201
|
+
- "nan0_0minus1"
|
|
202
|
+
- "zero_hamming_distance_spans"
|
|
203
|
+
rolling_nn_window: 10
|
|
204
|
+
rolling_nn_step: 1
|
|
205
|
+
rolling_nn_min_overlap: 8
|
|
206
|
+
rolling_nn_return_fraction: true
|
|
207
|
+
rolling_nn_obsm_key: "rolling_nn_dist"
|
|
208
|
+
rolling_nn_site_types:
|
|
209
|
+
- "GpC"
|
|
210
|
+
- "CpG"
|
|
211
|
+
rolling_nn_write_zero_pairs_csvs: true
|
|
212
|
+
rolling_nn_zero_pairs_uns_key: null
|
|
213
|
+
rolling_nn_zero_pairs_segments_key: null
|
|
214
|
+
rolling_nn_zero_pairs_layer_key: null
|
|
215
|
+
rolling_nn_zero_pairs_refine: true
|
|
216
|
+
rolling_nn_zero_pairs_max_nan_run: 2
|
|
217
|
+
rolling_nn_zero_pairs_merge_gap: 1
|
|
218
|
+
rolling_nn_zero_pairs_max_segments_per_read: 2
|
|
219
|
+
rolling_nn_zero_pairs_max_overlap: 5
|
|
220
|
+
rolling_nn_zero_pairs_layer_overlap_mode: "sum"
|
|
221
|
+
rolling_nn_zero_pairs_layer_overlap_value: null
|
|
222
|
+
rolling_nn_zero_pairs_keep_uns: true
|
|
223
|
+
rolling_nn_zero_pairs_segments_keep_uns: true
|
|
224
|
+
rolling_nn_zero_pairs_top_segments_per_read: 3
|
|
225
|
+
rolling_nn_zero_pairs_top_segments_max_overlap: 5
|
|
226
|
+
rolling_nn_zero_pairs_top_segments_min_span: 300
|
|
227
|
+
rolling_nn_zero_pairs_top_segments_write_csvs: true
|
|
228
|
+
rolling_nn_zero_pairs_segment_histogram_bins: 30
|
|
229
|
+
|
|
230
|
+
# Cross-sample rolling NN analysis
|
|
231
|
+
cross_sample_analysis: true
|
|
232
|
+
cross_sample_grouping_col: null
|
|
233
|
+
cross_sample_random_seed: 42
|
|
234
|
+
delta_hamming_chimeric_span_threshold: 200
|
|
235
|
+
|
|
236
|
+
# Latent Analysis - UMAP/Leiden params
|
|
174
237
|
layer_for_umap_plotting: 'nan_half'
|
|
175
238
|
umap_layers_to_plot:
|
|
239
|
+
- "leiden"
|
|
176
240
|
- "mapped_length"
|
|
177
241
|
- "Raw_modification_signal"
|
|
178
242
|
|
|
@@ -243,22 +307,31 @@ hmm_feature_sets:
|
|
|
243
307
|
mid_accessible_patch: [20, 40]
|
|
244
308
|
large_accessible_patch: [40, 110]
|
|
245
309
|
nucleosome_depleted_region: [110, inf]
|
|
310
|
+
hmm_feature_colormaps:
|
|
311
|
+
small_accessible_patch: "#A5D6A7"
|
|
312
|
+
mid_accessible_patch: "#2E7D32"
|
|
313
|
+
large_accessible_patch: "#006400"
|
|
314
|
+
nucleosome_depleted_region: "#00441B"
|
|
315
|
+
all_accessible_features: "#2E7D32"
|
|
316
|
+
small_bound_stretch: "#1E88E5"
|
|
317
|
+
medium_bound_stretch: "#6A1B9A"
|
|
318
|
+
large_bound_stretch: "#FB8C00"
|
|
319
|
+
putative_nucleosome: "#6D4C41"
|
|
320
|
+
all_footprint_features: "#6A1B9A"
|
|
321
|
+
cpg_patch: "#6D4C41"
|
|
246
322
|
hmm_merge_layer_features:
|
|
247
323
|
- ["all_accessible_features", 60]
|
|
248
324
|
clustermap_cmap_hmm: "coolwarm"
|
|
249
325
|
hmm_clustermap_feature_layers:
|
|
250
|
-
- all_accessible_features
|
|
251
326
|
- all_accessible_features_merged
|
|
252
|
-
- small_accessible_patch
|
|
253
|
-
- mid_accessible_patch
|
|
254
|
-
- large_accessible_patch
|
|
255
|
-
- large_accessible_patch_merged
|
|
256
|
-
- nucleosome_depleted_region
|
|
257
327
|
- nucleosome_depleted_region_merged
|
|
258
328
|
- small_bound_stretch
|
|
259
329
|
- medium_bound_stretch
|
|
260
330
|
- putative_nucleosome
|
|
261
|
-
-
|
|
331
|
+
- all_footprint_features
|
|
332
|
+
hmm_clustermap_length_layers:
|
|
333
|
+
- all_accessible_features_merged
|
|
334
|
+
- all_footprint_features
|
|
262
335
|
hmm_clustermap_sortby: "hmm"
|
|
263
336
|
hmm_peak_feature_configs:
|
|
264
337
|
all_accessible_features:
|
|
@@ -12,6 +12,7 @@ from smftools.constants import (
|
|
|
12
12
|
BAM_SUFFIX,
|
|
13
13
|
BARCODE_BOTH_ENDS,
|
|
14
14
|
CONVERSIONS,
|
|
15
|
+
LOAD_DIR,
|
|
15
16
|
MOD_LIST,
|
|
16
17
|
MOD_MAP,
|
|
17
18
|
REF_COL,
|
|
@@ -664,6 +665,8 @@ class ExperimentConfig:
|
|
|
664
665
|
# General I/O
|
|
665
666
|
input_data_path: Optional[str] = None
|
|
666
667
|
output_directory: Optional[str] = None
|
|
668
|
+
emit_log_file: Optional[bool] = True
|
|
669
|
+
log_level: Optional[str] = "INFO"
|
|
667
670
|
fasta: Optional[str] = None
|
|
668
671
|
bam_suffix: str = BAM_SUFFIX
|
|
669
672
|
recursive_input_search: bool = True
|
|
@@ -736,6 +739,7 @@ class ExperimentConfig:
|
|
|
736
739
|
aligner_args: Optional[List[str]] = None
|
|
737
740
|
make_bigwigs: bool = False
|
|
738
741
|
make_beds: bool = False
|
|
742
|
+
annotate_secondary_supplementary: bool = True
|
|
739
743
|
samtools_backend: str = "auto"
|
|
740
744
|
bedtools_backend: str = "auto"
|
|
741
745
|
bigwig_backend: str = "auto"
|
|
@@ -747,6 +751,9 @@ class ExperimentConfig:
|
|
|
747
751
|
# General Plotting
|
|
748
752
|
sample_name_col_for_plotting: Optional[str] = "Barcode"
|
|
749
753
|
rows_per_qc_histogram_grid: int = 12
|
|
754
|
+
clustermap_demux_types_to_plot: List[str] = field(
|
|
755
|
+
default_factory=lambda: ["single", "double", "already"]
|
|
756
|
+
)
|
|
750
757
|
|
|
751
758
|
# Preprocessing - Read length and quality filter params
|
|
752
759
|
read_coord_filter: Optional[Sequence[float]] = field(default_factory=lambda: [None, None])
|
|
@@ -816,6 +823,9 @@ class ExperimentConfig:
|
|
|
816
823
|
duplicate_detection_site_types: List[str] = field(
|
|
817
824
|
default_factory=lambda: ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
818
825
|
)
|
|
826
|
+
duplicate_detection_demux_types_to_use: List[str] = field(
|
|
827
|
+
default_factory=lambda: ["single", "double", "already"]
|
|
828
|
+
)
|
|
819
829
|
duplicate_detection_distance_threshold: float = 0.07
|
|
820
830
|
hamming_vs_metric_keys: List[str] = field(default_factory=lambda: ["Fraction_C_site_modified"])
|
|
821
831
|
duplicate_detection_keep_best_metric: str = "read_quality"
|
|
@@ -827,6 +837,13 @@ class ExperimentConfig:
|
|
|
827
837
|
|
|
828
838
|
# Preprocessing - Position QC
|
|
829
839
|
position_max_nan_threshold: float = 0.1
|
|
840
|
+
mismatch_frequency_range: Sequence[float] = field(default_factory=lambda: [0.05, 0.95])
|
|
841
|
+
mismatch_frequency_layer: str = "mismatch_integer_encoding"
|
|
842
|
+
mismatch_frequency_read_span_layer: str = "read_span_mask"
|
|
843
|
+
mismatch_base_frequency_exclude_mod_sites: bool = False
|
|
844
|
+
references_to_align_for_variant_annotation: List[Optional[str]] = field(
|
|
845
|
+
default_factory=lambda: [None, None]
|
|
846
|
+
)
|
|
830
847
|
|
|
831
848
|
# Spatial Analysis - Clustermap params
|
|
832
849
|
layer_for_clustermap_plotting: Optional[str] = "nan0_0minus1"
|
|
@@ -835,6 +852,45 @@ class ExperimentConfig:
|
|
|
835
852
|
clustermap_cmap_cpg: Optional[str] = "coolwarm"
|
|
836
853
|
clustermap_cmap_a: Optional[str] = "coolwarm"
|
|
837
854
|
spatial_clustermap_sortby: Optional[str] = "gpc"
|
|
855
|
+
overlay_variant_calls: bool = False
|
|
856
|
+
variant_overlay_seq1_color: str = "white"
|
|
857
|
+
variant_overlay_seq2_color: str = "black"
|
|
858
|
+
variant_overlay_marker_size: float = 4.0
|
|
859
|
+
rolling_nn_layer: Optional[str] = "nan0_0minus1"
|
|
860
|
+
rolling_nn_plot_layer: Optional[str] = "nan0_0minus1"
|
|
861
|
+
rolling_nn_plot_layers: List[str] = field(
|
|
862
|
+
default_factory=lambda: ["nan0_0minus1", "nan0_0minus1"]
|
|
863
|
+
)
|
|
864
|
+
rolling_nn_window: int = 10
|
|
865
|
+
rolling_nn_step: int = 1
|
|
866
|
+
rolling_nn_min_overlap: int = 8
|
|
867
|
+
rolling_nn_return_fraction: bool = True
|
|
868
|
+
rolling_nn_obsm_key: str = "rolling_nn_dist"
|
|
869
|
+
rolling_nn_site_types: Optional[List[str]] = None
|
|
870
|
+
rolling_nn_write_zero_pairs_csvs: bool = True
|
|
871
|
+
rolling_nn_zero_pairs_uns_key: Optional[str] = None
|
|
872
|
+
rolling_nn_zero_pairs_segments_key: Optional[str] = None
|
|
873
|
+
rolling_nn_zero_pairs_layer_key: Optional[str] = None
|
|
874
|
+
rolling_nn_zero_pairs_refine: bool = True
|
|
875
|
+
rolling_nn_zero_pairs_max_nan_run: Optional[int] = None
|
|
876
|
+
rolling_nn_zero_pairs_merge_gap: int = 0
|
|
877
|
+
rolling_nn_zero_pairs_max_segments_per_read: Optional[int] = None
|
|
878
|
+
rolling_nn_zero_pairs_max_overlap: Optional[int] = None
|
|
879
|
+
rolling_nn_zero_pairs_layer_overlap_mode: str = "binary"
|
|
880
|
+
rolling_nn_zero_pairs_layer_overlap_value: Optional[int] = None
|
|
881
|
+
rolling_nn_zero_pairs_keep_uns: bool = True
|
|
882
|
+
rolling_nn_zero_pairs_segments_keep_uns: bool = True
|
|
883
|
+
rolling_nn_zero_pairs_top_segments_per_read: Optional[int] = None
|
|
884
|
+
rolling_nn_zero_pairs_top_segments_max_overlap: Optional[int] = None
|
|
885
|
+
rolling_nn_zero_pairs_top_segments_min_span: Optional[float] = None
|
|
886
|
+
rolling_nn_zero_pairs_top_segments_write_csvs: bool = True
|
|
887
|
+
rolling_nn_zero_pairs_segment_histogram_bins: int = 30
|
|
888
|
+
|
|
889
|
+
# Cross-sample rolling NN analysis
|
|
890
|
+
cross_sample_analysis: bool = False
|
|
891
|
+
cross_sample_grouping_col: Optional[str] = None
|
|
892
|
+
cross_sample_random_seed: int = 42
|
|
893
|
+
delta_hamming_chimeric_span_threshold: int = 200
|
|
838
894
|
|
|
839
895
|
# Spatial Analysis - UMAP/Leiden params
|
|
840
896
|
layer_for_umap_plotting: Optional[str] = "nan_half"
|
|
@@ -883,11 +939,15 @@ class ExperimentConfig:
|
|
|
883
939
|
accessible_patches: Optional[bool] = True
|
|
884
940
|
cpg: Optional[bool] = False
|
|
885
941
|
hmm_feature_sets: Dict[str, Any] = field(default_factory=dict)
|
|
942
|
+
hmm_feature_colormaps: Dict[str, Any] = field(default_factory=dict)
|
|
886
943
|
hmm_merge_layer_features: Optional[List[Tuple]] = field(default_factory=lambda: [(None, 60)])
|
|
887
944
|
clustermap_cmap_hmm: Optional[str] = "coolwarm"
|
|
888
945
|
hmm_clustermap_feature_layers: List[str] = field(
|
|
889
946
|
default_factory=lambda: ["all_accessible_features"]
|
|
890
947
|
)
|
|
948
|
+
hmm_clustermap_length_layers: List[str] = field(
|
|
949
|
+
default_factory=lambda: ["all_accessible_features"]
|
|
950
|
+
)
|
|
891
951
|
hmm_clustermap_sortby: Optional[str] = "hmm"
|
|
892
952
|
hmm_peak_feature_configs: Dict[str, Any] = field(default_factory=dict)
|
|
893
953
|
|
|
@@ -906,6 +966,8 @@ class ExperimentConfig:
|
|
|
906
966
|
invert_adata: bool = False
|
|
907
967
|
bypass_append_binary_layer_by_base_context: bool = False
|
|
908
968
|
force_redo_append_binary_layer_by_base_context: bool = False
|
|
969
|
+
bypass_append_mismatch_frequency_sites: bool = False
|
|
970
|
+
force_redo_append_mismatch_frequency_sites: bool = False
|
|
909
971
|
bypass_calculate_read_modification_stats: bool = False
|
|
910
972
|
force_redo_calculate_read_modification_stats: bool = False
|
|
911
973
|
bypass_filter_reads_on_modification_thresholds: bool = False
|
|
@@ -1110,7 +1172,7 @@ class ExperimentConfig:
|
|
|
1110
1172
|
|
|
1111
1173
|
# Demultiplexing output path
|
|
1112
1174
|
split_dir = merged.get("split_dir", SPLIT_DIR)
|
|
1113
|
-
split_path = output_dir / split_dir
|
|
1175
|
+
split_path = output_dir / LOAD_DIR / split_dir
|
|
1114
1176
|
|
|
1115
1177
|
# final normalization
|
|
1116
1178
|
if "strands" in merged:
|
|
@@ -1121,6 +1183,10 @@ class ExperimentConfig:
|
|
|
1121
1183
|
merged["mod_target_bases"] = _parse_list(merged["mod_target_bases"])
|
|
1122
1184
|
if "conversion_types" in merged:
|
|
1123
1185
|
merged["conversion_types"] = _parse_list(merged["conversion_types"])
|
|
1186
|
+
if "references_to_align_for_variant_annotation" in merged:
|
|
1187
|
+
merged["references_to_align_for_variant_annotation"] = _parse_list(
|
|
1188
|
+
merged["references_to_align_for_variant_annotation"]
|
|
1189
|
+
)
|
|
1124
1190
|
|
|
1125
1191
|
merged["filter_threshold"] = float(_parse_numeric(merged.get("filter_threshold", 0.8), 0.8))
|
|
1126
1192
|
merged["m6A_threshold"] = float(_parse_numeric(merged.get("m6A_threshold", 0.7), 0.7))
|
|
@@ -1197,6 +1263,9 @@ class ExperimentConfig:
|
|
|
1197
1263
|
# Final normalization of hmm_feature_sets and canonical local variables
|
|
1198
1264
|
merged["hmm_feature_sets"] = normalize_hmm_feature_sets(merged.get("hmm_feature_sets", {}))
|
|
1199
1265
|
hmm_feature_sets = merged.get("hmm_feature_sets", {})
|
|
1266
|
+
hmm_feature_colormaps = merged.get("hmm_feature_colormaps", {})
|
|
1267
|
+
if not isinstance(hmm_feature_colormaps, dict):
|
|
1268
|
+
hmm_feature_colormaps = {}
|
|
1200
1269
|
hmm_annotation_threshold = merged.get("hmm_annotation_threshold", 0.5)
|
|
1201
1270
|
hmm_batch_size = int(merged.get("hmm_batch_size", 1024))
|
|
1202
1271
|
hmm_use_viterbi = bool(merged.get("hmm_use_viterbi", False))
|
|
@@ -1211,6 +1280,9 @@ class ExperimentConfig:
|
|
|
1211
1280
|
hmm_clustermap_feature_layers = _parse_list(
|
|
1212
1281
|
merged.get("hmm_clustermap_feature_layers", "all_accessible_features")
|
|
1213
1282
|
)
|
|
1283
|
+
hmm_clustermap_length_layers = _parse_list(
|
|
1284
|
+
merged.get("hmm_clustermap_length_layers", hmm_clustermap_feature_layers)
|
|
1285
|
+
)
|
|
1214
1286
|
|
|
1215
1287
|
hmm_fit_strategy = str(merged.get("hmm_fit_strategy", "per_group")).strip()
|
|
1216
1288
|
hmm_shared_scope = _parse_list(merged.get("hmm_shared_scope", ["reference", "methbase"]))
|
|
@@ -1231,6 +1303,7 @@ class ExperimentConfig:
|
|
|
1231
1303
|
|
|
1232
1304
|
# instantiate dataclass
|
|
1233
1305
|
instance = cls(
|
|
1306
|
+
annotate_secondary_supplementary=merged.get("annotate_secondary_supplementary", True),
|
|
1234
1307
|
smf_modality=merged.get("smf_modality"),
|
|
1235
1308
|
input_data_path=input_data_path,
|
|
1236
1309
|
recursive_input_search=merged.get("recursive_input_search"),
|
|
@@ -1257,6 +1330,8 @@ class ExperimentConfig:
|
|
|
1257
1330
|
trim=merged.get("trim", TRIM),
|
|
1258
1331
|
input_already_demuxed=merged.get("input_already_demuxed", False),
|
|
1259
1332
|
threads=merged.get("threads"),
|
|
1333
|
+
emit_log_file=merged.get("emit_log_file", True),
|
|
1334
|
+
log_level=merged.get("log_level", "INFO"),
|
|
1260
1335
|
sample_sheet_path=merged.get("sample_sheet_path"),
|
|
1261
1336
|
sample_sheet_mapping_column=merged.get("sample_sheet_mapping_column"),
|
|
1262
1337
|
delete_intermediate_bams=merged.get("delete_intermediate_bams", False),
|
|
@@ -1313,6 +1388,9 @@ class ExperimentConfig:
|
|
|
1313
1388
|
),
|
|
1314
1389
|
reindexing_offsets=merged.get("reindexing_offsets", {None: None}),
|
|
1315
1390
|
reindexed_var_suffix=merged.get("reindexed_var_suffix", "reindexed"),
|
|
1391
|
+
clustermap_demux_types_to_plot=merged.get(
|
|
1392
|
+
"clustermap_demux_types_to_plot", ["single", "double", "already"]
|
|
1393
|
+
),
|
|
1316
1394
|
layer_for_clustermap_plotting=merged.get(
|
|
1317
1395
|
"layer_for_clustermap_plotting", "nan0_0minus1"
|
|
1318
1396
|
),
|
|
@@ -1321,6 +1399,65 @@ class ExperimentConfig:
|
|
|
1321
1399
|
clustermap_cmap_cpg=merged.get("clustermap_cmap_cpg", "coolwarm"),
|
|
1322
1400
|
clustermap_cmap_a=merged.get("clustermap_cmap_a", "coolwarm"),
|
|
1323
1401
|
spatial_clustermap_sortby=merged.get("spatial_clustermap_sortby", "gpc"),
|
|
1402
|
+
overlay_variant_calls=_parse_bool(merged.get("overlay_variant_calls", False)),
|
|
1403
|
+
variant_overlay_seq1_color=merged.get("variant_overlay_seq1_color", "white"),
|
|
1404
|
+
variant_overlay_seq2_color=merged.get("variant_overlay_seq2_color", "black"),
|
|
1405
|
+
variant_overlay_marker_size=float(merged.get("variant_overlay_marker_size", 4.0)),
|
|
1406
|
+
rolling_nn_layer=merged.get("rolling_nn_layer", "nan0_0minus1"),
|
|
1407
|
+
rolling_nn_plot_layer=merged.get("rolling_nn_plot_layer", "nan0_0minus1"),
|
|
1408
|
+
rolling_nn_plot_layers=merged.get(
|
|
1409
|
+
"rolling_nn_plot_layers", ["nan0_0minus1", "nan0_0minus1"]
|
|
1410
|
+
),
|
|
1411
|
+
rolling_nn_window=merged.get("rolling_nn_window", 15),
|
|
1412
|
+
rolling_nn_step=merged.get("rolling_nn_step", 2),
|
|
1413
|
+
rolling_nn_min_overlap=merged.get("rolling_nn_min_overlap", 10),
|
|
1414
|
+
rolling_nn_return_fraction=merged.get("rolling_nn_return_fraction", True),
|
|
1415
|
+
rolling_nn_obsm_key=merged.get("rolling_nn_obsm_key", "rolling_nn_dist"),
|
|
1416
|
+
rolling_nn_site_types=merged.get("rolling_nn_site_types", None),
|
|
1417
|
+
rolling_nn_write_zero_pairs_csvs=merged.get("rolling_nn_write_zero_pairs_csvs", True),
|
|
1418
|
+
rolling_nn_zero_pairs_uns_key=merged.get("rolling_nn_zero_pairs_uns_key", None),
|
|
1419
|
+
rolling_nn_zero_pairs_segments_key=merged.get(
|
|
1420
|
+
"rolling_nn_zero_pairs_segments_key", None
|
|
1421
|
+
),
|
|
1422
|
+
rolling_nn_zero_pairs_layer_key=merged.get("rolling_nn_zero_pairs_layer_key", None),
|
|
1423
|
+
rolling_nn_zero_pairs_refine=merged.get("rolling_nn_zero_pairs_refine", True),
|
|
1424
|
+
rolling_nn_zero_pairs_max_nan_run=merged.get("rolling_nn_zero_pairs_max_nan_run", None),
|
|
1425
|
+
rolling_nn_zero_pairs_merge_gap=merged.get("rolling_nn_zero_pairs_merge_gap", 0),
|
|
1426
|
+
rolling_nn_zero_pairs_max_segments_per_read=merged.get(
|
|
1427
|
+
"rolling_nn_zero_pairs_max_segments_per_read", None
|
|
1428
|
+
),
|
|
1429
|
+
rolling_nn_zero_pairs_max_overlap=merged.get("rolling_nn_zero_pairs_max_overlap", None),
|
|
1430
|
+
rolling_nn_zero_pairs_layer_overlap_mode=merged.get(
|
|
1431
|
+
"rolling_nn_zero_pairs_layer_overlap_mode", "binary"
|
|
1432
|
+
),
|
|
1433
|
+
rolling_nn_zero_pairs_layer_overlap_value=merged.get(
|
|
1434
|
+
"rolling_nn_zero_pairs_layer_overlap_value", None
|
|
1435
|
+
),
|
|
1436
|
+
rolling_nn_zero_pairs_keep_uns=merged.get("rolling_nn_zero_pairs_keep_uns", True),
|
|
1437
|
+
rolling_nn_zero_pairs_segments_keep_uns=merged.get(
|
|
1438
|
+
"rolling_nn_zero_pairs_segments_keep_uns", True
|
|
1439
|
+
),
|
|
1440
|
+
rolling_nn_zero_pairs_top_segments_per_read=merged.get(
|
|
1441
|
+
"rolling_nn_zero_pairs_top_segments_per_read", None
|
|
1442
|
+
),
|
|
1443
|
+
rolling_nn_zero_pairs_top_segments_max_overlap=merged.get(
|
|
1444
|
+
"rolling_nn_zero_pairs_top_segments_max_overlap", None
|
|
1445
|
+
),
|
|
1446
|
+
rolling_nn_zero_pairs_top_segments_min_span=merged.get(
|
|
1447
|
+
"rolling_nn_zero_pairs_top_segments_min_span", None
|
|
1448
|
+
),
|
|
1449
|
+
rolling_nn_zero_pairs_top_segments_write_csvs=merged.get(
|
|
1450
|
+
"rolling_nn_zero_pairs_top_segments_write_csvs", True
|
|
1451
|
+
),
|
|
1452
|
+
rolling_nn_zero_pairs_segment_histogram_bins=merged.get(
|
|
1453
|
+
"rolling_nn_zero_pairs_segment_histogram_bins", 30
|
|
1454
|
+
),
|
|
1455
|
+
cross_sample_analysis=merged.get("cross_sample_analysis", False),
|
|
1456
|
+
cross_sample_grouping_col=merged.get("cross_sample_grouping_col", None),
|
|
1457
|
+
cross_sample_random_seed=merged.get("cross_sample_random_seed", 42),
|
|
1458
|
+
delta_hamming_chimeric_span_threshold=merged.get(
|
|
1459
|
+
"delta_hamming_chimeric_span_threshold", 200
|
|
1460
|
+
),
|
|
1324
1461
|
layer_for_umap_plotting=merged.get("layer_for_umap_plotting", "nan_half"),
|
|
1325
1462
|
umap_layers_to_plot=merged.get(
|
|
1326
1463
|
"umap_layers_to_plot", ["mapped_length", "Raw_modification_signal"]
|
|
@@ -1347,6 +1484,7 @@ class ExperimentConfig:
|
|
|
1347
1484
|
hmm_emission_adapt_tol=hmm_emission_adapt_tol,
|
|
1348
1485
|
hmm_dtype=merged.get("hmm_dtype", "float64"),
|
|
1349
1486
|
hmm_feature_sets=hmm_feature_sets,
|
|
1487
|
+
hmm_feature_colormaps=hmm_feature_colormaps,
|
|
1350
1488
|
hmm_annotation_threshold=hmm_annotation_threshold,
|
|
1351
1489
|
hmm_batch_size=hmm_batch_size,
|
|
1352
1490
|
hmm_use_viterbi=hmm_use_viterbi,
|
|
@@ -1355,6 +1493,7 @@ class ExperimentConfig:
|
|
|
1355
1493
|
hmm_merge_layer_features=hmm_merge_layer_features,
|
|
1356
1494
|
clustermap_cmap_hmm=merged.get("clustermap_cmap_hmm", "coolwarm"),
|
|
1357
1495
|
hmm_clustermap_feature_layers=hmm_clustermap_feature_layers,
|
|
1496
|
+
hmm_clustermap_length_layers=hmm_clustermap_length_layers,
|
|
1358
1497
|
hmm_clustermap_sortby=merged.get("hmm_clustermap_sortby", "hmm"),
|
|
1359
1498
|
hmm_peak_feature_configs=hmm_peak_feature_configs,
|
|
1360
1499
|
footprints=merged.get("footprints", None),
|
|
@@ -1390,6 +1529,9 @@ class ExperimentConfig:
|
|
|
1390
1529
|
duplicate_detection_site_types=merged.get(
|
|
1391
1530
|
"duplicate_detection_site_types", ["GpC", "CpG", "ambiguous_GpC_CpG"]
|
|
1392
1531
|
),
|
|
1532
|
+
duplicate_detection_demux_types_to_use=merged.get(
|
|
1533
|
+
"duplicate_detection_demux_types_to_use", ["single", "double", "already"]
|
|
1534
|
+
),
|
|
1393
1535
|
duplicate_detection_distance_threshold=merged.get(
|
|
1394
1536
|
"duplicate_detection_distance_threshold", 0.07
|
|
1395
1537
|
),
|
|
@@ -1479,6 +1621,9 @@ class ExperimentConfig:
|
|
|
1479
1621
|
force_redo_hmm_fit=merged.get("force_redo_hmm_fit", False),
|
|
1480
1622
|
bypass_hmm_apply=merged.get("bypass_hmm_apply", False),
|
|
1481
1623
|
force_redo_hmm_apply=merged.get("force_redo_hmm_apply", False),
|
|
1624
|
+
references_to_align_for_variant_annotation=merged.get(
|
|
1625
|
+
"references_to_align_for_variant_annotation", [None, None]
|
|
1626
|
+
),
|
|
1482
1627
|
config_source=config_source or "<var_dict>",
|
|
1483
1628
|
)
|
|
1484
1629
|
|
smftools/constants.py
CHANGED
|
@@ -21,7 +21,34 @@ BAM_SUFFIX: Final[str] = ".bam"
|
|
|
21
21
|
BARCODE_BOTH_ENDS: Final[bool] = False
|
|
22
22
|
REF_COL: Final[str] = "Reference_strand"
|
|
23
23
|
SAMPLE_COL: Final[str] = "Experiment_name_and_barcode"
|
|
24
|
+
SAMPLE: Final[str] = "Sample"
|
|
24
25
|
SPLIT_DIR: Final[str] = "demultiplexed_BAMs"
|
|
26
|
+
H5_DIR: Final[str] = "h5ads"
|
|
27
|
+
DEMUX_TYPE: Final[str] = "demux_type"
|
|
28
|
+
BARCODE: Final[str] = "Barcode"
|
|
29
|
+
REFERENCE: Final[str] = "Reference"
|
|
30
|
+
REFERENCE_STRAND: Final[str] = "Reference_strand"
|
|
31
|
+
REFERENCE_DATASET_STRAND: Final[str] = "Reference_dataset_strand"
|
|
32
|
+
STRAND: Final[str] = "Strand"
|
|
33
|
+
DATASET: Final[str] = "Dataset"
|
|
34
|
+
READ_MISMATCH_TREND: Final[str] = "Read_mismatch_trend"
|
|
35
|
+
READ_MAPPING_DIRECTION: Final[str] = "Read_mapping_direction"
|
|
36
|
+
SEQUENCE_INTEGER_ENCODING: Final[str] = "sequence_integer_encoding"
|
|
37
|
+
SEQUENCE_INTEGER_DECODING: Final[str] = "sequence_integer_decoding"
|
|
38
|
+
MISMATCH_INTEGER_ENCODING: Final[str] = "mismatch_integer_encoding"
|
|
39
|
+
BASE_QUALITY_SCORES: Final[str] = "base_quality_scores"
|
|
40
|
+
READ_SPAN_MASK: Final[str] = "read_span_mask"
|
|
41
|
+
|
|
42
|
+
LOAD_DIR: Final[str] = "load_adata_outputs"
|
|
43
|
+
PREPROCESS_DIR: Final[str] = "preprocess_adata_outputs"
|
|
44
|
+
SPATIAL_DIR: Final[str] = "spatial_adata_outputs"
|
|
45
|
+
HMM_DIR: Final[str] = "hmm_adata_outputs"
|
|
46
|
+
LATENT_DIR: Final[str] = "latent_adata_outputs"
|
|
47
|
+
VARIANT_DIR: Final[str] = "variant_adata_outputs"
|
|
48
|
+
CHIMERIC_DIR: Final[str] = "chimeric_adata_outputs"
|
|
49
|
+
|
|
50
|
+
LOGGING_DIR: Final[str] = "logs"
|
|
51
|
+
|
|
25
52
|
TRIM: Final[bool] = False
|
|
26
53
|
|
|
27
54
|
_private_conversions = ["unconverted"]
|
|
@@ -35,3 +62,45 @@ MOD_MAP: Final[Mapping[str, str]] = _deep_freeze(_private_mod_map)
|
|
|
35
62
|
|
|
36
63
|
_private_strands = ("bottom", "top")
STRANDS: Final[tuple[str, ...]] = _deep_freeze(_private_strands)

# Column names read from the modkit extract TSV.
MODKIT_EXTRACT_TSV_COLUMN_CHROM: Final[str] = "chrom"
MODKIT_EXTRACT_TSV_COLUMN_REF_POSITION: Final[str] = "ref_position"
MODKIT_EXTRACT_TSV_COLUMN_MODIFIED_PRIMARY_BASE: Final[str] = "modified_primary_base"
MODKIT_EXTRACT_TSV_COLUMN_REF_STRAND: Final[str] = "ref_strand"
MODKIT_EXTRACT_TSV_COLUMN_READ_ID: Final[str] = "read_id"
MODKIT_EXTRACT_TSV_COLUMN_CALL_CODE: Final[str] = "call_code"
MODKIT_EXTRACT_TSV_COLUMN_CALL_PROB: Final[str] = "call_prob"

# Values compared against the modified-primary-base and ref-strand columns.
MODKIT_EXTRACT_MODIFIED_BASE_A: Final[str] = "A"
MODKIT_EXTRACT_MODIFIED_BASE_C: Final[str] = "C"
MODKIT_EXTRACT_REF_STRAND_PLUS: Final[str] = "+"
MODKIT_EXTRACT_REF_STRAND_MINUS: Final[str] = "-"

# Call codes split into "modified" and "canonical" groups.
_private_modkit_extract_call_code_modified = ("a", "h", "m")
MODKIT_EXTRACT_CALL_CODE_MODIFIED: Final[tuple[str, ...]] = _deep_freeze(
    _private_modkit_extract_call_code_modified
)
_private_modkit_extract_call_code_canonical = ("-",)
MODKIT_EXTRACT_CALL_CODE_CANONICAL: Final[tuple[str, ...]] = _deep_freeze(
    _private_modkit_extract_call_code_canonical
)

# Sequence alphabet. The integer encoding below is derived from this tuple so
# the alphabet and its codes have a single source of truth.
_private_modkit_extract_sequence_bases = ("A", "C", "G", "T", "N")
MODKIT_EXTRACT_SEQUENCE_BASES: Final[tuple[str, ...]] = _deep_freeze(
    _private_modkit_extract_sequence_bases
)
MODKIT_EXTRACT_SEQUENCE_PADDING_BASE: Final[str] = "PAD"
# Codes follow declaration order with the padding token last:
# A=0, C=1, G=2, T=3, N=4, PAD=5.
_private_modkit_extract_base_to_int: Dict[str, int] = {
    base: code
    for code, base in enumerate(
        (*_private_modkit_extract_sequence_bases, MODKIT_EXTRACT_SEQUENCE_PADDING_BASE)
    )
}
MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT: Final[Mapping[str, int]] = _deep_freeze(
    _private_modkit_extract_base_to_int
)
# Inverse lookup (code -> base), built from the forward table.
_private_modkit_extract_int_to_base: Dict[int, str] = {
    value: key for key, value in _private_modkit_extract_base_to_int.items()
}
MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE: Final[Mapping[int, str]] = _deep_freeze(
    _private_modkit_extract_int_to_base
)
|
smftools/hmm/HMM.py
CHANGED
|
@@ -144,6 +144,83 @@ def _safe_int_coords(var_names) -> Tuple[np.ndarray, bool]:
|
|
|
144
144
|
return np.arange(len(var_names), dtype=int), False
|
|
145
145
|
|
|
146
146
|
|
|
147
|
+
def mask_layers_outside_read_span(
    adata,
    layers: Sequence[str],
    *,
    start_key: str = "reference_start",
    end_key: str = "reference_end",
    use_original_var_names: bool = True,
) -> List[str]:
    """Mask layer values outside read reference spans with NaN.

    This uses integer coordinate comparisons against either ``adata.var["Original_var_names"]``
    (when present and numeric) or ``adata.var_names``. For each read, values at coordinates
    strictly less than ``start_key`` or strictly greater than ``end_key`` are set to NaN.
    Reads whose start or end is not finite are left untouched.

    All requested layers are validated before any of them is modified, so a missing
    layer never leaves ``adata`` partially masked.

    NOTE(review): the coordinate equal to ``end_key`` is kept. If that column stores an
    exclusive end (as pysam's ``reference_end`` does), one position past the read
    survives the mask -- confirm against how the column is populated.

    Args:
        adata: AnnData object to modify in-place.
        layers: Layer names to mask. A single string is treated as one layer name.
        start_key: obs column holding reference start positions.
        end_key: obs column holding reference end positions.
        use_original_var_names: Use ``adata.var["Original_var_names"]`` when available.

    Returns:
        List of layer names that were masked.

    Raises:
        KeyError: If ``start_key``/``end_key`` is missing from ``adata.obs`` or a
            requested layer is missing from ``adata.layers``.
        ValueError: If the coordinate source length differs from ``adata.n_vars`` or
            the start/end columns are not numeric.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(layers, str):
        layers = [layers]
    if not layers:
        return []
    layer_names = list(layers)

    if start_key not in adata.obs or end_key not in adata.obs:
        raise KeyError(f"Missing {start_key!r} or {end_key!r} in adata.obs.")

    # Prefer the original coordinates only when they are a full-length column with
    # at least one finite numeric entry; otherwise fall back to var_names.
    coord_source = adata.var_names
    if use_original_var_names and "Original_var_names" in adata.var:
        orig = np.asarray(adata.var["Original_var_names"])
        if orig.size == adata.n_vars:
            try:
                orig_numeric = np.asarray(orig, dtype=float)
            except (TypeError, ValueError):
                orig_numeric = None
            if orig_numeric is not None and np.isfinite(orig_numeric).any():
                coord_source = orig

    coords, _ = _safe_int_coords(coord_source)
    if coords.shape[0] != adata.n_vars:
        raise ValueError("Coordinate source length does not match adata.n_vars.")

    try:
        starts = np.asarray(adata.obs[start_key], dtype=float)
        ends = np.asarray(adata.obs[end_key], dtype=float)
    except (TypeError, ValueError) as exc:
        raise ValueError("Start/end positions must be numeric.") from exc

    # Validate every layer up front so a bad name cannot leave earlier layers
    # already masked when the error is raised.
    for layer in layer_names:
        if layer not in adata.layers:
            raise KeyError(f"Layer {layer!r} not found in adata.layers.")

    # Build the (n_obs, n_vars) mask once; it is identical for every layer.
    span_known = np.isfinite(starts) & np.isfinite(ends)
    # astype truncates toward zero like int(); the placeholder 0 for rows without a
    # finite span is discarded by the ``span_known`` filter below.
    start_pos = np.where(span_known, starts, 0.0).astype(np.int64)
    end_pos = np.where(span_known, ends, 0.0).astype(np.int64)
    outside = (coords[None, :] < start_pos[:, None]) | (coords[None, :] > end_pos[:, None])
    outside &= span_known[:, None]
    any_outside = bool(outside.any())

    masked: List[str] = []
    for layer in layer_names:
        arr = np.asarray(adata.layers[layer])
        # NaN needs a floating dtype; non-float layers are converted on a copy.
        if not np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(float, copy=True)

        if any_outside:
            arr[outside] = np.nan

        adata.layers[layer] = arr
        masked.append(layer)

    return masked
|
|
222
|
+
|
|
223
|
+
|
|
147
224
|
def _logsumexp(x: torch.Tensor, dim: int) -> torch.Tensor:
|
|
148
225
|
"""Compute log-sum-exp in a numerically stable way.
|
|
149
226
|
|
|
@@ -1064,6 +1141,8 @@ class BaseHMM(nn.Module):
|
|
|
1064
1141
|
uns_key: str = "hmm_appended_layers",
|
|
1065
1142
|
uns_flag: str = "hmm_annotated",
|
|
1066
1143
|
force_redo: bool = False,
|
|
1144
|
+
mask_to_read_span: bool = True,
|
|
1145
|
+
mask_use_original_var_names: bool = True,
|
|
1067
1146
|
device: Optional[Union[str, torch.device]] = None,
|
|
1068
1147
|
**kwargs,
|
|
1069
1148
|
):
|
|
@@ -1085,6 +1164,8 @@ class BaseHMM(nn.Module):
|
|
|
1085
1164
|
uns_key: .uns key to track appended layers.
|
|
1086
1165
|
uns_flag: .uns flag to mark annotations.
|
|
1087
1166
|
force_redo: Whether to overwrite existing layers.
|
|
1167
|
+
mask_to_read_span: Whether to mask appended layers outside read spans.
|
|
1168
|
+
mask_use_original_var_names: Use ``adata.var["Original_var_names"]`` when available.
|
|
1088
1169
|
device: Device specifier.
|
|
1089
1170
|
**kwargs: Additional parameters for specialized workflows.
|
|
1090
1171
|
|
|
@@ -1245,6 +1326,13 @@ class BaseHMM(nn.Module):
|
|
|
1245
1326
|
np.asarray(adata.layers[nm])
|
|
1246
1327
|
)
|
|
1247
1328
|
|
|
1329
|
+
if mask_to_read_span and appended:
|
|
1330
|
+
mask_layers_outside_read_span(
|
|
1331
|
+
adata,
|
|
1332
|
+
appended,
|
|
1333
|
+
use_original_var_names=mask_use_original_var_names,
|
|
1334
|
+
)
|
|
1335
|
+
|
|
1248
1336
|
adata.uns[uns_key] = appended
|
|
1249
1337
|
adata.uns[uns_flag] = True
|
|
1250
1338
|
return None
|
smftools/hmm/call_hmm_peaks.py
CHANGED
|
@@ -51,7 +51,7 @@ def call_hmm_peaks(
|
|
|
51
51
|
raise KeyError(f"obs column '{ref_column}' not found")
|
|
52
52
|
|
|
53
53
|
# Ensure categorical for predictable ref iteration
|
|
54
|
-
if not
|
|
54
|
+
if not isinstance(adata.obs[ref_column].dtype, pd.CategoricalDtype):
|
|
55
55
|
adata.obs[ref_column] = adata.obs[ref_column].astype("category")
|
|
56
56
|
|
|
57
57
|
# Optional: drop duplicate obs columns once to avoid Pandas/AnnData view quirks
|