smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/__init__.py

@@ -5,6 +5,10 @@ from importlib import import_module
 _LAZY_ATTRS = {
     "append_base_context": "smftools.preprocessing.append_base_context",
     "append_binary_layer_by_base_context": "smftools.preprocessing.append_binary_layer_by_base_context",
+    "append_mismatch_frequency_sites": "smftools.preprocessing.append_mismatch_frequency_sites",
+    "append_variant_call_layer": "smftools.preprocessing.append_variant_call_layer",
+    "append_variant_segment_layer": "smftools.preprocessing.append_variant_call_layer",
+    "append_sequence_mismatch_annotations": "smftools.preprocessing.append_sequence_mismatch_annotations",
     "binarize_adata": "smftools.preprocessing.binarize",
     "binarize_on_Youden": "smftools.preprocessing.binarize_on_Youden",
     "calculate_complexity_II": "smftools.preprocessing.calculate_complexity_II",
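The four new entries register the 0.3.2 preprocessing functions in the package's lazy-import table (note that `append_variant_segment_layer` resolves to the same backing module as `append_variant_call_layer`). Below is a minimal sketch of the PEP 562 module-level `__getattr__` that a mapping like this typically backs; the actual implementation in `smftools/preprocessing/__init__.py` is not shown in this diff and may differ:

from importlib import import_module

# Hypothetical sketch only: one entry copied from the diff above.
_LAZY_ATTRS = {
    "append_mismatch_frequency_sites": "smftools.preprocessing.append_mismatch_frequency_sites",
}

def __getattr__(name: str):
    # Resolve the attribute on first access by importing its backing module.
    module_path = _LAZY_ATTRS.get(name)
    if module_path is None:
        raise AttributeError(f"module 'smftools.preprocessing' has no attribute {name!r}")
    module = import_module(module_path)
    # The backing module defines a function of the same name.
    return getattr(module, name)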
smftools/preprocessing/append_base_context.py

@@ -51,7 +51,7 @@ def append_base_context(
         site_types += ["A_site"]
 
     for ref in references:
-        # Assess if the strand is the top or bottom strand
+        # Assess if the modified strand is the top or bottom strand.
         if "top" in ref:
             strand = "top"
         elif "bottom" in ref:
smftools/preprocessing/append_base_context.py

@@ -133,23 +133,23 @@ def append_base_context(
             adata.var[f"{ref}_{site_type}_valid_coverage"] = (
                 (adata.var[f"{ref}_{site_type}"]) & (adata.var[f"position_in_{ref}"])
             )
-
-
-
-
-
-
-
-
-            else:
-
-
-            if native:
-
-
-
-            else:
-
+            # if native:
+            #     adata.obsm[f"{ref}_{site_type}_valid_coverage"] = adata[
+            #         :, adata.var[f"{ref}_{site_type}_valid_coverage"]
+            #     ].layers["binarized_methylation"]
+            # else:
+            #     adata.obsm[f"{ref}_{site_type}_valid_coverage"] = adata[
+            #         :, adata.var[f"{ref}_{site_type}_valid_coverage"]
+            #     ].X
+            # else:
+            #     pass
+
+            # if native:
+            #     adata.obsm[f"{ref}_{site_type}"] = adata[:, adata.var[f"{ref}_{site_type}"]].layers[
+            #         "binarized_methylation"
+            #     ]
+            # else:
+            #     adata.obsm[f"{ref}_{site_type}"] = adata[:, adata.var[f"{ref}_{site_type}"]].X
 
     # mark as done
     adata.uns[uns_flag] = True
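This hunk comments out the block that previously copied per-site matrices into `adata.obsm`; from 0.3.2 only the boolean site flags written to `adata.var` remain. A short sketch of how a caller could rebuild the old obsm-style matrices from those flags, mirroring the commented-out code (the helper name is hypothetical; `native` selects the `binarized_methylation` layer as in the commented block):

# Hypothetical helper mirroring the commented-out obsm assignments.
def site_matrix(adata, ref, site_type, native=False):
    mask = adata.var[f"{ref}_{site_type}"].astype(bool)
    subset = adata[:, mask.values]
    # Native (direct-modification) runs read the binarized methylation layer; converted runs use X.
    return subset.layers["binarized_methylation"] if native else subset.X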
smftools/preprocessing/append_mismatch_frequency_sites.py

@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Iterable, Sequence
+
+import numpy as np
+import pandas as pd
+
+from smftools.constants import BASE_QUALITY_SCORES, MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def append_mismatch_frequency_sites(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    mismatch_layer: str = "mismatch_integer_encoding",
+    read_span_layer: str = "read_span_mask",
+    quality_layer: str | None = None,
+    mismatch_frequency_range: Sequence[float] | None = (0.05, 0.95),
+    uns_flag: str = "append_mismatch_frequency_sites_performed",
+    force_redo: bool = False,
+    bypass: bool = False,
+) -> None:
+    """Append mismatch frequency metadata and variable-site flags per reference.
+
+    Args:
+        adata: AnnData object.
+        ref_column: Obs column defining reference categories.
+        mismatch_layer: Layer containing mismatch integer encodings.
+        read_span_layer: Layer containing read span masks (1=covered, 0=not covered).
+        quality_layer: Layer containing base quality scores for Q-value based error rates.
+        mismatch_frequency_range: Lower/upper bounds (inclusive) for variable site flagging.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
+        bypass: Whether to skip running this step.
+    """
+    if bypass:
+        return
+
+    already = bool(adata.uns.get(uns_flag, False))
+    if already and not force_redo:
+        return
+
+    if mismatch_layer not in adata.layers:
+        logger.debug(
+            "Mismatch layer '%s' not found; skipping mismatch frequency step.", mismatch_layer
+        )
+        return
+
+    mismatch_map = adata.uns.get("mismatch_integer_encoding_map", {})
+    if not mismatch_map:
+        logger.debug("Mismatch encoding map not found; skipping mismatch frequency step.")
+        return
+
+    n_value = mismatch_map.get("N", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"])
+    pad_value = mismatch_map.get("PAD", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"])
+
+    base_int_to_label = {
+        int(value): str(base)
+        for base, value in mismatch_map.items()
+        if base not in {"N", "PAD"} and isinstance(value, (int, np.integer))
+    }
+    if not base_int_to_label:
+        logger.debug("Mismatch encoding map missing base labels; skipping mismatch frequency step.")
+        return
+
+    has_span_mask = read_span_layer in adata.layers
+    if not has_span_mask:
+        logger.debug(
+            "Read span mask '%s' not found; mismatch frequencies will be computed over all reads.",
+            read_span_layer,
+        )
+
+    if quality_layer is None:
+        if BASE_QUALITY_SCORES in adata.layers:
+            quality_layer = BASE_QUALITY_SCORES
+        elif "base_qualities" in adata.layers:
+            quality_layer = "base_qualities"
+
+    if quality_layer is not None and quality_layer not in adata.layers:
+        logger.debug("Quality layer '%s' not found; falling back to range flagging.", quality_layer)
+        quality_layer = None
+
+    references = adata.obs[ref_column].cat.categories
+    n_vars = adata.shape[1]
+
+    if mismatch_frequency_range is None:
+        mismatch_frequency_range = (0.0, 1.0)
+
+    lower_bound, upper_bound = mismatch_frequency_range
+
+    for ref in references:
+        ref_mask = adata.obs[ref_column] == ref
+        ref_position_mask = adata.var.get(f"position_in_{ref}")
+        if ref_position_mask is None:
+            ref_position_mask = pd.Series(np.ones(n_vars, dtype=bool), index=adata.var.index)
+        else:
+            ref_position_mask = ref_position_mask.astype(bool)
+
+        frequency_values = np.full(n_vars, np.nan, dtype=float)
+        variable_flags = np.zeros(n_vars, dtype=bool)
+        mismatch_base_frequencies: list[list[tuple[str, float]]] = [[] for _ in range(n_vars)]
+
+        if ref_mask.sum() == 0:
+            adata.var[f"{ref}_mismatch_frequency"] = pd.Series(
+                frequency_values, index=adata.var.index
+            )
+            adata.var[f"{ref}_variable_sequence_site"] = pd.Series(
+                variable_flags, index=adata.var.index
+            )
+            adata.var[f"{ref}_mismatch_base_frequencies"] = pd.Series(
+                mismatch_base_frequencies, index=adata.var.index
+            )
+            continue
+
+        mismatch_matrix = np.asarray(adata.layers[mismatch_layer][ref_mask])
+        if has_span_mask:
+            span_matrix = np.asarray(adata.layers[read_span_layer][ref_mask])
+            coverage_mask = span_matrix > 0
+            coverage_counts = coverage_mask.sum(axis=0).astype(float)
+        else:
+            coverage_mask = np.ones_like(mismatch_matrix, dtype=bool)
+            coverage_counts = np.full(n_vars, ref_mask.sum(), dtype=float)
+
+        mismatch_mask = (~np.isin(mismatch_matrix, [n_value, pad_value])) & coverage_mask
+        mismatch_counts = mismatch_mask.sum(axis=0)
+
+        frequency_values = np.divide(
+            mismatch_counts,
+            coverage_counts,
+            out=np.full(n_vars, np.nan, dtype=float),
+            where=coverage_counts > 0,
+        )
+        frequency_values = np.where(ref_position_mask.values, frequency_values, np.nan)
+
+        mean_error_rate: np.ndarray | None = None
+        if quality_layer is not None:
+            quality_matrix = np.asarray(adata.layers[quality_layer][ref_mask]).astype(float)
+            quality_matrix[quality_matrix < 0] = np.nan
+            if has_span_mask:
+                quality_matrix = np.where(coverage_mask, quality_matrix, np.nan)
+            error_matrix = np.power(10.0, -quality_matrix / 10.0)
+            mean_error_rate = np.nanmean(error_matrix, axis=0)
+            mean_error_rate = np.where(ref_position_mask.values, mean_error_rate, np.nan)
+
+        if mean_error_rate is None:
+            variable_flags = (
+                (frequency_values >= lower_bound)
+                & (frequency_values <= upper_bound)
+                & ref_position_mask.values
+            )
+        else:
+            variable_flags = (
+                (frequency_values > mean_error_rate)
+                & ref_position_mask.values
+                & np.isfinite(mean_error_rate)
+            )
+
+        base_counts_by_int: dict[int, np.ndarray] = {}
+        for base_int in base_int_to_label:
+            base_counts_by_int[base_int] = ((mismatch_matrix == base_int) & coverage_mask).sum(
+                axis=0
+            )
+
+        for idx in range(n_vars):
+            if not ref_position_mask.iloc[idx] or coverage_counts[idx] == 0:
+                continue
+            base_freqs: list[tuple[str, float]] = []
+            for base_int, base_label in base_int_to_label.items():
+                count = base_counts_by_int[base_int][idx]
+                if count > 0:
+                    base_freqs.append((base_label, float(count / coverage_counts[idx])))
+            mismatch_base_frequencies[idx] = base_freqs
+
+        adata.var[f"{ref}_mismatch_frequency"] = pd.Series(frequency_values, index=adata.var.index)
+        adata.var[f"{ref}_variable_sequence_site"] = pd.Series(
+            variable_flags, index=adata.var.index
+        )
+        adata.var[f"{ref}_mismatch_base_frequencies"] = pd.Series(
+            mismatch_base_frequencies, index=adata.var.index
+        )
+
+    adata.uns[uns_flag] = True
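Per reference category in `ref_column`, this new function adds `{ref}_mismatch_frequency`, `{ref}_variable_sequence_site`, and `{ref}_mismatch_base_frequencies` to `adata.var`. When a base-quality layer is present, per-position Phred scores are converted to expected error rates (error = 10**(-Q/10)) and a site is flagged as variable when its observed mismatch frequency exceeds the mean expected error; otherwise the inclusive `mismatch_frequency_range` bounds are used. A hedged usage sketch (the h5ad path is a placeholder, and the layer/obs names are the defaults above, assuming the AnnData was produced by the smftools pipeline):

import anndata as ad
from smftools.preprocessing import append_mismatch_frequency_sites

adata = ad.read_h5ad("experiment.h5ad")  # placeholder path

# Flag sites whose mismatch frequency falls within 5-95% of covered reads
# (or exceeds the Phred-derived error rate when base qualities are available).
append_mismatch_frequency_sites(
    adata,
    ref_column="Reference_strand",
    mismatch_layer="mismatch_integer_encoding",
    read_span_layer="read_span_mask",
    mismatch_frequency_range=(0.05, 0.95),
)

ref = adata.obs["Reference_strand"].cat.categories[0]
variable_sites = adata.var.index[adata.var[f"{ref}_variable_sequence_site"].values]
print(f"{ref}: {len(variable_sites)} candidate variable sites")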
smftools/preprocessing/append_sequence_mismatch_annotations.py

@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from smftools.logging_utils import get_logger
+from smftools.tools.sequence_alignment import align_sequences_with_mismatches
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def _format_mismatch_identity(event: str, seq1_base: str | None, seq2_base: str | None) -> str:
+    if event == "substitution":
+        return f"{seq1_base}->{seq2_base}"
+    if event == "insertion":
+        return f"ins:{seq2_base}"
+    return f"del:{seq1_base}"
+
+
+def append_sequence_mismatch_annotations(
+    adata: "ad.AnnData",
+    seq1_column: str,
+    seq2_column: str,
+    output_prefix: str | None = None,
+    match_score: int = 1,
+    mismatch_score: int = -1,
+    gap_score: int = -2,
+    ignore_n: bool = True,
+    bypass: bool = False,
+    force_redo: bool = False,
+    uns_flag: str = "append_sequence_mismatch_annotations_performed",
+) -> None:
+    """Append mismatch annotations by aligning full reference sequences.
+
+    Extracts the full reference sequences from per-position base columns in
+    ``adata.var``, performs a single global alignment, and maps mismatches
+    (substitutions, insertions, deletions) back to ``adata.var`` indices.
+
+    Results stored in ``adata.var``:
+    - ``{prefix}_mismatch_type``: Per-position str — ``"substitution"``,
+      ``"insertion"``, ``"deletion"``, or ``""`` (no mismatch).
+    - ``{prefix}_mismatch_identity``: Per-position str — e.g. ``"A->G"``,
+      ``"ins:T"``, ``"del:C"``, or ``""``).
+    - ``{prefix}_is_mismatch``: Per-position bool flag.
+
+    Args:
+        adata: AnnData object.
+        seq1_column: Column in ``adata.var`` with per-position bases for reference 1.
+        seq2_column: Column in ``adata.var`` with per-position bases for reference 2.
+        output_prefix: Prefix for output columns. Defaults to ``{seq1_column}__{seq2_column}``.
+        match_score: Alignment match score.
+        mismatch_score: Alignment mismatch score.
+        gap_score: Alignment gap score.
+        ignore_n: Whether to ignore mismatches involving ``N`` bases.
+        bypass: Whether to skip processing.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+    """
+    already = bool(adata.uns.get(uns_flag, False))
+    if (already and not force_redo) or bypass:
+        return
+
+    if seq1_column not in adata.var:
+        raise KeyError(f"Sequence column '{seq1_column}' not found in adata.var")
+    if seq2_column not in adata.var:
+        raise KeyError(f"Sequence column '{seq2_column}' not found in adata.var")
+
+    output_prefix = output_prefix or f"{seq1_column}__{seq2_column}"
+
+    seq1_series = adata.var[seq1_column]
+    seq2_series = adata.var[seq2_column]
+    n_vars = adata.shape[1]
+
+    # ---- Build full sequences from positions where each ref has a valid (non-N) base ----
+    valid1_mask = seq1_series.notna() & (seq1_series != "N")
+    valid2_mask = seq2_series.notna() & (seq2_series != "N")
+
+    # var indices (integers) for each valid base
+    var_indices_1 = np.where(valid1_mask.values)[0]
+    var_indices_2 = np.where(valid2_mask.values)[0]
+
+    full_seq1 = "".join(str(seq1_series.iloc[i]) for i in var_indices_1)
+    full_seq2 = "".join(str(seq2_series.iloc[i]) for i in var_indices_2)
+
+    logger.info(
+        "Aligning full sequences: '%s' (%d bases) vs '%s' (%d bases).",
+        seq1_column,
+        len(full_seq1),
+        seq2_column,
+        len(full_seq2),
+    )
+
+    # ---- Global alignment ----
+    aligned_seq1, aligned_seq2, mismatches = align_sequences_with_mismatches(
+        full_seq1,
+        full_seq2,
+        match_score=match_score,
+        mismatch_score=mismatch_score,
+        gap_score=gap_score,
+        ignore_n=ignore_n,
+    )
+
+    logger.info(
+        "Alignment complete. Aligned length: %d, mismatches: %d.",
+        len(aligned_seq1),
+        len(mismatches),
+    )
+
+    # ---- Map alignment mismatches back to var indices ----
+    mismatch_type_arr = [""] * n_vars
+    mismatch_identity_arr = [""] * n_vars
+    is_mismatch_arr = np.zeros(n_vars, dtype=bool)
+
+    # For substitutions, store the paired var indices from both references.
+    # This is needed because indels shift the coordinate systems so that the
+    # same alignment column maps to different var indices in each reference.
+    substitution_map: list[dict] = []
+
+    for mm in mismatches:
+        # Determine which var index this mismatch maps to.
+        # For substitutions and deletions, seq1_pos is defined.
+        # For insertions, only seq2_pos is defined (gap in seq1).
+        if mm.seq1_pos is not None:
+            var_idx = int(var_indices_1[mm.seq1_pos])
+        elif mm.seq2_pos is not None:
+            var_idx = int(var_indices_2[mm.seq2_pos])
+        else:
+            continue
+
+        mismatch_type_arr[var_idx] = mm.event
+        mismatch_identity_arr[var_idx] = _format_mismatch_identity(
+            mm.event, mm.seq1_base, mm.seq2_base
+        )
+        is_mismatch_arr[var_idx] = True
+
+        if mm.event == "substitution" and mm.seq1_pos is not None and mm.seq2_pos is not None:
+            substitution_map.append(
+                {
+                    "seq1_var_idx": int(var_indices_1[mm.seq1_pos]),
+                    "seq2_var_idx": int(var_indices_2[mm.seq2_pos]),
+                    "seq1_base": mm.seq1_base,
+                    "seq2_base": mm.seq2_base,
+                }
+            )
+
+    adata.var[f"{output_prefix}_mismatch_type"] = pd.Series(
+        mismatch_type_arr, index=adata.var.index
+    )
+    adata.var[f"{output_prefix}_mismatch_identity"] = pd.Series(
+        mismatch_identity_arr, index=adata.var.index
+    )
+    adata.var[f"{output_prefix}_is_mismatch"] = pd.Series(is_mismatch_arr, index=adata.var.index)
+    # Store substitution map as a DataFrame in adata.uns (h5ad-serializable)
+    if substitution_map:
+        adata.uns[f"{output_prefix}_substitution_map"] = pd.DataFrame(substitution_map)
+    adata.uns[uns_flag] = True
+
+    n_sub = sum(1 for t in mismatch_type_arr if t == "substitution")
+    n_ins = sum(1 for t in mismatch_type_arr if t == "insertion")
+    n_del = sum(1 for t in mismatch_type_arr if t == "deletion")
+    logger.info(
+        "Mismatch annotations: %d substitutions, %d insertions, %d deletions.",
+        n_sub,
+        n_ins,
+        n_del,
+    )
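The annotations are keyed by `output_prefix` (default `{seq1_column}__{seq2_column}`), and pairs of substituted positions are additionally stored as a DataFrame in `adata.uns["{prefix}_substitution_map"]`. A hedged usage sketch; the two `adata.var` base columns below are placeholder names for whatever per-position reference-base columns the experiment provides:

from smftools.preprocessing import append_sequence_mismatch_annotations

# Placeholder column names; use the per-position base columns present in adata.var.
append_sequence_mismatch_annotations(
    adata,
    seq1_column="ref1_top_strand_base",
    seq2_column="ref2_top_strand_base",
    ignore_n=True,
)

prefix = "ref1_top_strand_base__ref2_top_strand_base"  # default output_prefix
substitutions = adata.var[adata.var[f"{prefix}_mismatch_type"] == "substitution"]
print(substitutions[f"{prefix}_mismatch_identity"].head())  # e.g. "A->G"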