smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
|
|
8
|
+
from smftools.logging_utils import get_logger
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import anndata as ad
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def append_variant_call_layer(
    adata: "ad.AnnData",
    seq1_column: str,
    seq2_column: str,
    seq1_converted_column: str | None = None,
    seq2_converted_column: str | None = None,
    sequence_layer: str = "sequence_integer_encoding",
    read_span_layer: str = "read_span_mask",
    reference_col: str = "Reference_strand",
    output_prefix: str | None = None,
    uns_flag: str = "append_variant_call_layer_performed",
    force_redo: bool = False,
    bypass: bool = False,
) -> None:
    """Append a layer recording per-read, per-position variant calls at reference mismatch sites.

    Uses the substitution map from ``append_sequence_mismatch_annotations`` to
    correctly handle coordinate shifts caused by indels between references.
    For each substitution, reads aligned to ref1 are checked at ref1's var index,
    and reads aligned to ref2 are checked at ref2's var index.

    For conversion SMF, reads are mapped to *converted* references while the
    alignment that identifies mismatch positions uses *unconverted* sequences.
    When ``seq1_converted_column`` / ``seq2_converted_column`` are provided, each
    reference gets a **set** of acceptable bases at each mismatch position
    (unconverted + converted), since not every base converts in every read.
    A position is informative only if the two acceptable-base sets are disjoint.
    A read base matching either the unconverted or converted form of a reference
    counts as a match for that reference.

    Values in the output layer:
        1 = matches seq1 base(s)
        2 = matches seq2 base(s)
        0 = unknown (N, PAD, no coverage, or matches neither)
        -1 = not a mismatch position (or not informative after conversion)

    Args:
        adata: AnnData object. Modified in place (``layers``, ``var``, ``uns``).
        seq1_column: Column in ``adata.var`` with the first reference base per position (unconverted).
        seq2_column: Column in ``adata.var`` with the second reference base per position (unconverted).
        seq1_converted_column: Optional column in ``adata.var`` with the converted seq1 bases.
            When provided, both unconverted and converted bases are accepted as ref1 matches.
        seq2_converted_column: Optional column in ``adata.var`` with the converted seq2 bases.
        sequence_layer: Layer containing integer-encoded actual read bases.
        read_span_layer: Layer containing read span masks.
        reference_col: Obs column defining which reference each read is aligned to.
            NOTE(review): accessed via ``.cat.categories`` below, so this column must
            already be categorical — confirm upstream preprocessing guarantees this.
        output_prefix: Prefix for the output layer name. Defaults to ``{seq1_column}__{seq2_column}``.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        force_redo: Whether to rerun even if ``uns_flag`` is set.
        bypass: Whether to skip processing.
    """
    if bypass:
        return

    # Idempotency guard: skip silently when a prior run set the flag.
    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return

    if sequence_layer not in adata.layers:
        logger.debug("Sequence layer '%s' not found; skipping variant call layer.", sequence_layer)
        return

    output_prefix = output_prefix or f"{seq1_column}__{seq2_column}"
    layer_name = f"{output_prefix}_variant_call"

    # Get the substitution map from alignment annotations
    # (produced by append_sequence_mismatch_annotations; may be a DataFrame or a dict).
    sub_map_key = f"{output_prefix}_substitution_map"
    sub_map = adata.uns.get(sub_map_key)
    if sub_map is None or (hasattr(sub_map, "__len__") and len(sub_map) == 0):
        logger.warning(
            "Substitution map '%s' not found or empty; skipping variant call layer.",
            sub_map_key,
        )
        return

    import pandas as pd

    # Normalize the substitution map into four parallel arrays:
    # var indices on each reference's coordinates plus the unconverted base letters.
    if isinstance(sub_map, pd.DataFrame):
        vi1_arr = sub_map["seq1_var_idx"].values
        vi2_arr = sub_map["seq2_var_idx"].values
        b1_arr = sub_map["seq1_base"].values
        b2_arr = sub_map["seq2_base"].values
    else:
        vi1_arr = np.asarray(sub_map.get("seq1_var_idx", []))
        vi2_arr = np.asarray(sub_map.get("seq2_var_idx", []))
        b1_arr = np.asarray(sub_map.get("seq1_base", []))
        b2_arr = np.asarray(sub_map.get("seq2_base", []))
    n_subs = len(vi1_arr)
    if n_subs == 0:
        logger.warning("Substitution map is empty; skipping variant call layer.")
        return

    mismatch_map = adata.uns.get("mismatch_integer_encoding_map", {})
    if not mismatch_map:
        logger.debug("Mismatch encoding map not found; skipping variant call layer.")
        return

    # N/PAD codes are never evidence for either reference; fall back to the
    # package-level constants when the stored map lacks them.
    n_value = int(mismatch_map.get("N", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]))
    pad_value = int(mismatch_map.get("PAD", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]))
    uninformative = {n_value, pad_value}

    # Build base -> int lookup (excluding the uninformative N/PAD entries).
    base_to_int: dict[str, int] = {}
    for base, value in mismatch_map.items():
        if base not in {"N", "PAD"} and isinstance(value, (int, np.integer)):
            base_to_int[base.upper()] = int(value)

    # Reverse lookup: int -> base letter (for storing readable annotations in var)
    int_to_base: dict[int, str] = {v: k for k, v in base_to_int.items()}

    n_obs, n_vars = adata.shape
    # Default -1 everywhere: "not a mismatch position"; only substitution
    # columns for informative sites are overwritten below.
    result = np.full((n_obs, n_vars), -1, dtype=np.int8)

    # Per-position var annotations
    ref1_acceptable_bases = [""] * n_vars
    ref2_acceptable_bases = [""] * n_vars
    is_informative = np.zeros(n_vars, dtype=bool)

    seq_matrix = np.asarray(adata.layers[sequence_layer])
    has_span = read_span_layer in adata.layers
    if has_span:
        span_matrix = np.asarray(adata.layers[read_span_layer])

    # Determine which reference each read belongs to
    ref_labels = adata.obs[reference_col].values
    ref_categories = adata.obs[reference_col].cat.categories

    # Map each reference category to seq1 or seq2.
    # Column names like "6B6_top_strand_FASTA_base" have stem "6B6_top" matching ref categories.
    suffix = "_strand_FASTA_base"
    seq1_stem = seq1_column[: -len(suffix)] if seq1_column.endswith(suffix) else seq1_column
    seq2_stem = seq2_column[: -len(suffix)] if seq2_column.endswith(suffix) else seq2_column
    ref_to_seq: dict[str, int] = {}  # ref_category -> 1 or 2
    for ref in ref_categories:
        if ref == seq1_stem:
            ref_to_seq[ref] = 1
        elif ref == seq2_stem:
            ref_to_seq[ref] = 2
        else:
            # Reads aligned to unmatched references keep -1 at every position.
            logger.debug(
                "Reference '%s' does not match seq1 stem '%s' or seq2 stem '%s'.",
                ref,
                seq1_stem,
                seq2_stem,
            )
    logger.info("Reference-to-sequence mapping: %s", ref_to_seq)

    # Build per-reference acceptable base sets.
    # For conversion SMF, a read base can match either the unconverted or converted
    # form of a reference. A substitution is informative only when the two sets are disjoint.
    use_converted = bool(seq1_converted_column and seq2_converted_column)
    if use_converted:
        if seq1_converted_column not in adata.var:
            logger.warning(
                "Converted column '%s' not in adata.var; falling back to unconverted.",
                seq1_converted_column,
            )
            use_converted = False
        elif seq2_converted_column not in adata.var:
            logger.warning(
                "Converted column '%s' not in adata.var; falling back to unconverted.",
                seq2_converted_column,
            )
            use_converted = False
        else:
            conv1_bases = adata.var[seq1_converted_column].values
            conv2_bases = adata.var[seq2_converted_column].values
            logger.info(
                "Using converted columns for variant calling: '%s', '%s'.",
                seq1_converted_column,
                seq2_converted_column,
            )

    logger.info("Processing %d substitutions for variant calling.", n_subs)

    n_informative = 0
    n_collapsed = 0
    for i in range(n_subs):
        vi1 = int(vi1_arr[i])
        vi2 = int(vi2_arr[i])

        # Unconverted bases (always available from substitution map)
        ub1 = base_to_int.get(str(b1_arr[i]).upper())
        ub2 = base_to_int.get(str(b2_arr[i]).upper())
        if ub1 is None or ub2 is None:
            continue

        # Build sets of acceptable integer-encoded bases for each reference
        ref1_ints: set[int] = {ub1}
        ref2_ints: set[int] = {ub2}
        if use_converted:
            # Converted base at each reference's own coordinate for this substitution.
            cb1 = base_to_int.get(str(conv1_bases[vi1]).upper())
            cb2 = base_to_int.get(str(conv2_bases[vi2]).upper())
            if cb1 is not None:
                ref1_ints.add(cb1)
            if cb2 is not None:
                ref2_ints.add(cb2)

        # Store acceptable bases at the primary var index for this substitution.
        # NOTE: written even for substitutions later rejected as non-informative,
        # so the var annotation records *all* candidate mismatch positions.
        ref1_bases_str = ",".join(sorted(int_to_base.get(v, "?") for v in ref1_ints))
        ref2_bases_str = ",".join(sorted(int_to_base.get(v, "?") for v in ref2_ints))
        ref1_acceptable_bases[vi1] = ref1_bases_str
        ref2_acceptable_bases[vi2] = ref2_bases_str

        # Position is informative only if the acceptable base sets are disjoint
        if ref1_ints & ref2_ints:
            n_collapsed += 1
            continue
        n_informative += 1
        is_informative[vi1] = True
        if vi2 != vi1:
            is_informative[vi2] = True

        # Pre-compute numpy arrays for fast membership testing
        ref1_arr = np.array(list(ref1_ints), dtype=seq_matrix.dtype)
        ref2_arr = np.array(list(ref2_ints), dtype=seq_matrix.dtype)

        # For each reference, use that reference's var index from the substitution map.
        # Reads aligned to seq1's reference use vi1; reads aligned to seq2's reference use vi2.
        for ref in ref_categories:
            seq_id = ref_to_seq.get(ref)
            if seq_id is None:
                continue
            var_idx = vi1 if seq_id == 1 else vi2

            ref_mask = ref_labels == ref

            read_bases = seq_matrix[ref_mask, var_idx]
            if has_span:
                covered = span_matrix[ref_mask, var_idx] > 0
            else:
                # Without a span layer, treat every read as covering the position.
                covered = np.ones(ref_mask.sum(), dtype=bool)

            # Order matters: the final assignment overrides 1/2 with 0 for
            # uncovered positions and N/PAD read bases.
            calls = np.zeros(ref_mask.sum(), dtype=np.int8)
            calls[np.isin(read_bases, ref1_arr) & covered] = 1
            calls[np.isin(read_bases, ref2_arr) & covered] = 2
            calls[~covered | np.isin(read_bases, list(uninformative))] = 0

            result[ref_mask, var_idx] = calls

    logger.info(
        "Variant calling complete: %d informative, %d collapsed (overlapping base sets).",
        n_informative,
        n_collapsed,
    )

    adata.var[f"{output_prefix}_seq1_acceptable_bases"] = pd.Categorical(ref1_acceptable_bases)
    adata.var[f"{output_prefix}_seq2_acceptable_bases"] = pd.Categorical(ref2_acceptable_bases)
    adata.var[f"{output_prefix}_informative_site"] = is_informative

    adata.layers[layer_name] = result

    adata.uns[uns_flag] = True
    logger.info("Added variant call layer '%s'.", layer_name)
|
|
272
|
+
def append_variant_segment_layer(
    adata: "ad.AnnData",
    seq1_column: str,
    seq2_column: str,
    variant_call_layer: str | None = None,
    read_span_layer: str = "read_span_mask",
    reference_col: str = "Reference_strand",
    output_prefix: str | None = None,
    uns_flag: str = "append_variant_segment_layer_performed",
    force_redo: bool = False,
    bypass: bool = False,
) -> None:
    """Segment each read span into contiguous seq1/seq2 regions based on variant calls.

    Uses the per-position variant calls (1=seq1, 2=seq2) at informative mismatch
    sites to segment each read into contiguous regions. At boundaries where the
    class switches, a putative breakpoint is placed at the midpoint between the
    two flanking mismatch positions.

    Values in the output layer:
        0 = outside read span (no coverage)
        1 = seq1 segment
        2 = seq2 segment
        3 = transition zone between different-class segments

    Args:
        adata: AnnData object. Modified in place (``layers``, ``obs``, ``uns``).
        seq1_column: Column in ``adata.var`` with the first reference base.
        seq2_column: Column in ``adata.var`` with the second reference base.
        variant_call_layer: Layer with per-position variant calls. Auto-derived if None.
        read_span_layer: Layer containing read span masks.
        reference_col: Obs column defining which reference each read is aligned to.
            NOTE(review): accessed via ``.cat.categories`` below, so it must already
            be categorical — confirm upstream preprocessing guarantees this.
        output_prefix: Prefix for output layer/obs names. Defaults to ``{seq1_column}__{seq2_column}``.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        force_redo: Whether to rerun even if ``uns_flag`` is set.
        bypass: Whether to skip processing.
    """
    if bypass:
        return

    # Idempotency guard: skip silently when a prior run set the flag.
    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return

    import pandas as pd

    output_prefix = output_prefix or f"{seq1_column}__{seq2_column}"
    if variant_call_layer is None:
        # Default to the layer name produced by append_variant_call_layer.
        variant_call_layer = f"{output_prefix}_variant_call"

    if variant_call_layer not in adata.layers:
        logger.warning(
            "Variant call layer '%s' not found; skipping segment layer.", variant_call_layer
        )
        return

    has_span = read_span_layer in adata.layers
    if not has_span:
        logger.warning("Read span layer '%s' not found; skipping segment layer.", read_span_layer)
        return

    call_matrix = np.asarray(adata.layers[variant_call_layer])
    span_matrix = np.asarray(adata.layers[read_span_layer])
    n_obs, n_vars = adata.shape

    # 0 everywhere by default = "outside read span".
    segment_layer = np.zeros((n_obs, n_vars), dtype=np.int8)
    breakpoint_counts = np.zeros(n_obs, dtype=np.int32)

    for i in range(n_obs):
        span_row = span_matrix[i]
        call_row = call_matrix[i]

        # Find read span boundaries (first/last covered column).
        covered = np.where(span_row > 0)[0]
        if len(covered) == 0:
            continue
        span_start = int(covered[0])
        span_end = int(covered[-1])

        # Collect informative positions (call == 1 or 2) within span
        informative_mask = (call_row == 1) | (call_row == 2)
        informative_positions = np.where(informative_mask)[0]
        # Restrict to within span
        informative_positions = informative_positions[
            (informative_positions >= span_start) & (informative_positions <= span_end)
        ]

        if len(informative_positions) == 0:
            # No informative sites — leave as 0 (no segment info)
            continue

        # Sort by position (should already be sorted)
        informative_positions = np.sort(informative_positions)
        classes = call_row[informative_positions]  # 1 or 2

        n_bp = 0
        # Walk through consecutive informative positions and fill segments
        prev_pos = informative_positions[0]
        prev_cls = int(classes[0])

        # Extend first class leftward to span start (half-open: prev_pos itself
        # is written later, either in the loop or by the final extension).
        segment_layer[i, span_start:prev_pos] = prev_cls

        for k in range(1, len(informative_positions)):
            cur_pos = informative_positions[k]
            cur_cls = int(classes[k])

            if cur_cls == prev_cls:
                # Same class — fill from prev_pos to cur_pos
                segment_layer[i, prev_pos:cur_pos] = prev_cls
            else:
                # Class transition — fill gap between informative sites with transition value
                segment_layer[i, prev_pos] = prev_cls
                segment_layer[i, prev_pos + 1 : cur_pos] = 3
                n_bp += 1

            prev_pos = cur_pos
            prev_cls = cur_cls

        # Fill the last informative position itself
        segment_layer[i, prev_pos] = prev_cls
        # Extend last class rightward to span end (inclusive)
        segment_layer[i, prev_pos : span_end + 1] = prev_cls
        # But re-mark breakpoints that may have been overwritten — they weren't,
        # since we only extend from prev_pos forward and breakpoints are before prev_pos.

        breakpoint_counts[i] = n_bp

    layer_name = f"{output_prefix}_variant_segments"
    adata.layers[layer_name] = segment_layer

    adata.obs[f"{output_prefix}_breakpoint_count"] = breakpoint_counts
    adata.obs[f"{output_prefix}_is_chimeric"] = breakpoint_counts > 0

    # Per-read chimeric flags from mismatch segments relative to each read's own reference.
    # A mismatch segment is a contiguous run where a seq1-aligned read is labeled as seq2,
    # or vice versa, within the read span.
    # NOTE: this is a distinct criterion from the breakpoint-based
    # "{output_prefix}_is_chimeric" flag set above.
    ref_labels = adata.obs[reference_col].values
    ref_categories = adata.obs[reference_col].cat.categories
    # Same stem-derivation convention as append_variant_call_layer.
    suffix = "_strand_FASTA_base"
    seq1_stem = seq1_column[: -len(suffix)] if seq1_column.endswith(suffix) else seq1_column
    seq2_stem = seq2_column[: -len(suffix)] if seq2_column.endswith(suffix) else seq2_column

    ref_to_seq: dict[str, int] = {}
    for ref in ref_categories:
        if ref == seq1_stem:
            ref_to_seq[ref] = 1
        elif ref == seq2_stem:
            ref_to_seq[ref] = 2

    chimeric_flags = np.zeros(n_obs, dtype=bool)
    chimeric_types: list[str] = ["no_segment_mismatch"] * n_obs

    for i in range(n_obs):
        covered = np.where(span_matrix[i] > 0)[0]
        if len(covered) == 0:
            continue

        span_start = int(covered[0])
        span_end = int(covered[-1])
        in_span = segment_layer[i, span_start : span_end + 1]

        seq_id = ref_to_seq.get(ref_labels[i])
        if seq_id is None:
            continue

        # The "wrong" class for this read's own reference.
        mismatch_value = 2 if seq_id == 1 else 1
        mismatch_mask = in_span == mismatch_value
        if not np.any(mismatch_mask):
            continue

        # Run-length boundaries: a start is a True not preceded by True,
        # an end is a True not followed by True.
        starts = np.where(mismatch_mask & ~np.r_[False, mismatch_mask[:-1]])[0]
        ends = np.where(mismatch_mask & ~np.r_[mismatch_mask[1:], False])[0]
        n_segments = len(starts)
        chimeric_flags[i] = True

        # Classify by where the single mismatch run sits within the span.
        if n_segments >= 2:
            chimeric_types[i] = "multi_segment_mismatch"
        else:
            start = int(starts[0])
            end = int(ends[0])
            if start == 0:
                chimeric_types[i] = "left_segment_mismatch"
            elif end == (len(in_span) - 1):
                chimeric_types[i] = "right_segment_mismatch"
            else:
                chimeric_types[i] = "middle_segment_mismatch"

    adata.obs["chimeric_variant_sites"] = chimeric_flags
    adata.obs["chimeric_variant_sites_type"] = pd.Categorical(
        chimeric_types,
        categories=[
            "no_segment_mismatch",
            "left_segment_mismatch",
            "right_segment_mismatch",
            "middle_segment_mismatch",
            "multi_segment_mismatch",
        ],
    )

    n_chimeric = int(np.sum(breakpoint_counts > 0))
    logger.info(
        "Variant segmentation complete: %d reads with breakpoints out of %d total.",
        n_chimeric,
        n_obs,
    )

    adata.uns[uns_flag] = True
    logger.info("Added variant segment layer '%s'.", layer_name)
|
@@ -53,4 +53,4 @@ def calculate_consensus(
|
|
|
53
53
|
else:
|
|
54
54
|
adata.var[f"{reference}_consensus_across_samples"] = consensus_sequence_list
|
|
55
55
|
|
|
56
|
-
adata.uns[f"{reference}_consensus_sequence"] = consensus_sequence_list
|
|
56
|
+
adata.uns[f"{reference}_consensus_sequence"] = str(consensus_sequence_list)
|
|
@@ -20,6 +20,7 @@ def calculate_read_modification_stats(
|
|
|
20
20
|
force_redo: bool = False,
|
|
21
21
|
valid_sites_only: bool = False,
|
|
22
22
|
valid_site_suffix: str = "_valid_coverage",
|
|
23
|
+
smf_modality: str = "conversion",
|
|
23
24
|
) -> None:
|
|
24
25
|
"""Add methylation/deamination statistics for each read.
|
|
25
26
|
|
|
@@ -80,8 +81,12 @@ def calculate_read_modification_stats(
|
|
|
80
81
|
for ref in references:
|
|
81
82
|
ref_subset = adata[adata.obs[reference_column] == ref]
|
|
82
83
|
for site_type in site_types:
|
|
84
|
+
site_subset = ref_subset[:, ref_subset.var[f"{ref}_{site_type}{valid_site_suffix}"]]
|
|
83
85
|
logger.info("Iterating over %s_%s", ref, site_type)
|
|
84
|
-
|
|
86
|
+
if smf_modality == "native":
|
|
87
|
+
observation_matrix = site_subset.layers["binarized_methylation"]
|
|
88
|
+
else:
|
|
89
|
+
observation_matrix = site_subset.X
|
|
85
90
|
total_positions_in_read = np.nansum(~np.isnan(observation_matrix), axis=1)
|
|
86
91
|
total_positions_in_reference = observation_matrix.shape[1]
|
|
87
92
|
fraction_valid_positions_in_read_vs_ref = (
|
|
@@ -844,11 +844,11 @@ def plot_histogram_pages(
|
|
|
844
844
|
if adata is not None and sample_key in adata.obs.columns and ref_key in adata.obs.columns:
|
|
845
845
|
obs = adata.obs
|
|
846
846
|
sseries = obs[sample_key]
|
|
847
|
-
if not pd.
|
|
847
|
+
if not isinstance(sseries.dtype, pd.CategoricalDtype):
|
|
848
848
|
sseries = sseries.astype("category")
|
|
849
849
|
samples = list(sseries.cat.categories)
|
|
850
850
|
rseries = obs[ref_key]
|
|
851
|
-
if not pd.
|
|
851
|
+
if not isinstance(rseries.dtype, pd.CategoricalDtype):
|
|
852
852
|
rseries = rseries.astype("category")
|
|
853
853
|
references = list(rseries.cat.categories)
|
|
854
854
|
use_adata = True
|
|
@@ -1189,7 +1189,7 @@ def plot_hamming_vs_metric_pages(
|
|
|
1189
1189
|
# canonicalize samples and refs
|
|
1190
1190
|
if samples is None:
|
|
1191
1191
|
sseries = obs[sample_col]
|
|
1192
|
-
if not pd.
|
|
1192
|
+
if not isinstance(sseries.dtype, pd.CategoricalDtype):
|
|
1193
1193
|
sseries = sseries.astype("category")
|
|
1194
1194
|
samples_all = list(sseries.cat.categories)
|
|
1195
1195
|
else:
|
|
@@ -1197,7 +1197,7 @@ def plot_hamming_vs_metric_pages(
|
|
|
1197
1197
|
|
|
1198
1198
|
if references is None:
|
|
1199
1199
|
rseries = obs[ref_col]
|
|
1200
|
-
if not pd.
|
|
1200
|
+
if not isinstance(rseries.dtype, pd.CategoricalDtype):
|
|
1201
1201
|
rseries = rseries.astype("category")
|
|
1202
1202
|
refs_all = list(rseries.cat.categories)
|
|
1203
1203
|
else:
|
|
@@ -32,6 +32,7 @@ def invert_adata(
|
|
|
32
32
|
already = bool(adata.uns.get(uns_flag, False))
|
|
33
33
|
if already and not force_redo:
|
|
34
34
|
# QC already performed; nothing to do
|
|
35
|
+
logger.info("Inversion already performed")
|
|
35
36
|
return adata
|
|
36
37
|
|
|
37
38
|
logger.info("Inverting AnnData along the column axis...")
|