smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +39 -7
- smftools/_settings.py +2 -0
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +34 -6
- smftools/cli/hmm_adata.py +239 -33
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +167 -131
- smftools/cli/preprocess_adata.py +180 -53
- smftools/cli/spatial_adata.py +152 -100
- smftools/cli_entry.py +38 -1
- smftools/config/__init__.py +2 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +42 -2
- smftools/config/experiment_config.py +59 -1
- smftools/constants.py +65 -0
- smftools/datasets/__init__.py +2 -0
- smftools/hmm/HMM.py +97 -3
- smftools/hmm/__init__.py +24 -13
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +2 -0
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +5 -2
- smftools/hmm/display_hmm.py +4 -1
- smftools/hmm/hmm_readwrite.py +7 -2
- smftools/hmm/nucleosome_hmm_refinement.py +2 -0
- smftools/informatics/__init__.py +59 -34
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +2 -0
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1093 -176
- smftools/informatics/basecalling.py +2 -0
- smftools/informatics/bed_functions.py +271 -61
- smftools/informatics/binarize_converted_base_identities.py +3 -0
- smftools/informatics/complement_base_list.py +2 -0
- smftools/informatics/converted_BAM_to_adata.py +641 -176
- smftools/informatics/fasta_functions.py +94 -10
- smftools/informatics/h5ad_functions.py +123 -4
- smftools/informatics/modkit_extract_to_adata.py +1019 -431
- smftools/informatics/modkit_functions.py +2 -0
- smftools/informatics/ohe.py +2 -0
- smftools/informatics/pod5_functions.py +3 -2
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/machine_learning/__init__.py +22 -6
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +18 -4
- smftools/machine_learning/data/preprocessing.py +2 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +2 -0
- smftools/machine_learning/evaluation/evaluators.py +14 -9
- smftools/machine_learning/inference/__init__.py +2 -0
- smftools/machine_learning/inference/inference_utils.py +2 -0
- smftools/machine_learning/inference/lightning_inference.py +6 -1
- smftools/machine_learning/inference/sklearn_inference.py +2 -0
- smftools/machine_learning/inference/sliding_window_inference.py +2 -0
- smftools/machine_learning/models/__init__.py +2 -0
- smftools/machine_learning/models/base.py +7 -2
- smftools/machine_learning/models/cnn.py +7 -2
- smftools/machine_learning/models/lightning_base.py +16 -11
- smftools/machine_learning/models/mlp.py +5 -1
- smftools/machine_learning/models/positional.py +7 -2
- smftools/machine_learning/models/rnn.py +5 -1
- smftools/machine_learning/models/sklearn_models.py +14 -9
- smftools/machine_learning/models/transformer.py +7 -2
- smftools/machine_learning/models/wrappers.py +6 -2
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +13 -3
- smftools/machine_learning/training/train_sklearn_model.py +2 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +5 -1
- smftools/machine_learning/utils/grl.py +5 -1
- smftools/metadata.py +1 -1
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +41 -31
- smftools/plotting/autocorrelation_plotting.py +9 -5
- smftools/plotting/classifiers.py +16 -4
- smftools/plotting/general_plotting.py +2415 -629
- smftools/plotting/hmm_plotting.py +97 -9
- smftools/plotting/position_stats.py +15 -7
- smftools/plotting/qc_plotting.py +6 -1
- smftools/preprocessing/__init__.py +36 -37
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/archived/calculate_complexity.py +2 -0
- smftools/preprocessing/archived/mark_duplicates.py +2 -0
- smftools/preprocessing/archived/preprocessing.py +2 -0
- smftools/preprocessing/archived/remove_duplicates.py +2 -0
- smftools/preprocessing/binary_layers_to_ohe.py +2 -1
- smftools/preprocessing/calculate_complexity_II.py +4 -1
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_pairwise_differences.py +2 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
- smftools/preprocessing/calculate_position_Youden.py +9 -2
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
- smftools/preprocessing/flag_duplicate_reads.py +42 -54
- smftools/preprocessing/make_dirs.py +2 -1
- smftools/preprocessing/min_non_diagonal.py +2 -0
- smftools/preprocessing/recipes.py +2 -0
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +30 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +2 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +2 -0
- smftools/tools/archived/subset_adata_v2.py +2 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +93 -8
- smftools/tools/cluster_adata_on_methylation.py +7 -1
- smftools/tools/position_stats.py +17 -27
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
- smftools-0.3.1.dist-info/RECORD +189 -0
- smftools-0.2.5.dist-info/RECORD +0 -181
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,23 +1,93 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import gzip
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
4
6
|
from concurrent.futures import ProcessPoolExecutor
|
|
7
|
+
from importlib.util import find_spec
|
|
5
8
|
from pathlib import Path
|
|
6
|
-
from typing import Dict, Iterable, Tuple
|
|
9
|
+
from typing import TYPE_CHECKING, Dict, Iterable, Tuple
|
|
7
10
|
|
|
8
11
|
import numpy as np
|
|
9
|
-
import pysam
|
|
10
12
|
from Bio import SeqIO
|
|
11
13
|
from Bio.Seq import Seq
|
|
12
14
|
from Bio.SeqRecord import SeqRecord
|
|
13
|
-
from pyfaidx import Fasta
|
|
14
15
|
|
|
15
16
|
from smftools.logging_utils import get_logger
|
|
17
|
+
from smftools.optional_imports import require
|
|
16
18
|
|
|
17
19
|
from ..readwrite import time_string
|
|
18
20
|
|
|
19
21
|
logger = get_logger(__name__)
|
|
20
22
|
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
import pysam as pysam_module
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _require_pysam() -> "pysam_module":
    """Return the pysam module, resolving it on demand.

    The module-level ``pysam_types`` value is reused when it is already set;
    otherwise the lookup is delegated to ``require`` with the same arguments
    used at import time.
    """
    cached = pysam_types
    if cached is None:
        return require("pysam", extra="pysam", purpose="FASTA access")
    return cached
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# pysam is optional: resolve it only when an import spec can be found,
# otherwise leave the placeholder as None so callers can fall back.
pysam_types = (
    require("pysam", extra="pysam", purpose="FASTA access")
    if find_spec("pysam") is not None
    else None
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _resolve_fasta_backend() -> str:
    """Choose the backend used for FASTA access.

    Returns:
        ``"python"`` when the module-level ``pysam_types`` is set, otherwise
        ``"cli"`` when a ``samtools`` executable is found on PATH.

    Raises:
        RuntimeError: If neither pysam nor samtools is available.
    """
    if pysam_types is None:
        samtools_found = shutil is not None and shutil.which("samtools")
        if not samtools_found:
            raise RuntimeError("FASTA access requires pysam or samtools in PATH.")
        return "cli"
    return "python"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _ensure_fasta_index(fasta: Path) -> None:
    """Make sure a ``.fai`` index sits next to ``fasta``.

    Nothing is done when ``<fasta>.fai`` already exists. Otherwise the index
    is built with the ``samtools faidx`` command when samtools is on PATH,
    and with ``pysam.faidx`` when it is not.

    Args:
        fasta: Path to the FASTA file to index.

    Raises:
        RuntimeError: If ``samtools faidx`` exits with a non-zero status.
    """
    index_path = fasta.with_suffix(fasta.suffix + ".fai")
    if index_path.exists():
        return

    cli_usable = (
        subprocess is not None
        and shutil is not None
        and bool(shutil.which("samtools"))
    )
    if not cli_usable:
        # No samtools executable: fall back to the pysam implementation.
        _require_pysam().faidx(str(fasta))
        return

    result = subprocess.run(
        ["samtools", "faidx", str(fasta)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    if result.returncode == 0:
        return
    raise RuntimeError(f"samtools faidx failed (exit {result.returncode}):\n{result.stderr}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _bed_to_faidx_region(chrom: str, start: int, end: int) -> str:
    """Build a ``samtools faidx`` region string from BED coordinates.

    BED intervals are 0-based and half-open, so the start is shifted by one
    and the end is used as-is. If the shifted start exceeds the end, the two
    bounds are emitted in ascending order.

    Args:
        chrom: Reference/contig name.
        start: 0-based BED start.
        end: BED end (exclusive).

    Returns:
        Region formatted as ``"<chrom>:<low>-<high>"``.
    """
    low, high = sorted((start + 1, end))
    return f"{chrom}:{low}-{high}"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _fetch_sequence_with_samtools(fasta: Path, chrom: str, start: int, end: int) -> str:
    """Fetch a BED-style interval from a FASTA via the ``samtools faidx`` CLI.

    Args:
        fasta: Path to the FASTA file.
        chrom: Reference/contig name.
        start: 0-based BED start.
        end: BED end (exclusive).

    Returns:
        The sequence lines of the samtools output joined into one string,
        with the ``>`` header line removed. A zero-length interval
        (``start == end``) yields an empty string.

    Raises:
        RuntimeError: If the samtools backend is unavailable, samtools is not
            on PATH, or ``samtools faidx`` exits with a non-zero status.
    """
    # A zero-length BED interval contains no bases. The region string built
    # below cannot express that (the region helper would widen it to two
    # bases), so answer directly without invoking samtools.
    if start == end:
        return ""
    if subprocess is None or shutil is None:
        raise RuntimeError("samtools backend is unavailable.")
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")
    region = _bed_to_faidx_region(chrom, start, end)
    cp = subprocess.run(
        ["samtools", "faidx", str(fasta), region],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if cp.returncode != 0:
        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
    # Drop the FASTA header and blank lines; keep only the sequence text.
    lines = [line.strip() for line in cp.stdout.splitlines() if line and not line.startswith(">")]
    return "".join(lines)
|
|
90
|
+
|
|
21
91
|
|
|
22
92
|
def _convert_FASTA_record(
|
|
23
93
|
record: SeqRecord,
|
|
@@ -160,7 +230,7 @@ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
|
|
|
160
230
|
Path: Path to the index file or chromosome sizes file.
|
|
161
231
|
"""
|
|
162
232
|
fasta = Path(fasta)
|
|
163
|
-
|
|
233
|
+
_require_pysam().faidx(str(fasta)) # creates <fasta>.fai
|
|
164
234
|
|
|
165
235
|
fai = fasta.with_suffix(fasta.suffix + ".fai")
|
|
166
236
|
if write_chrom_sizes:
|
|
@@ -307,8 +377,13 @@ def subsample_fasta_from_bed(
|
|
|
307
377
|
# Ensure output directory exists
|
|
308
378
|
output_directory.mkdir(parents=True, exist_ok=True)
|
|
309
379
|
|
|
310
|
-
|
|
311
|
-
|
|
380
|
+
backend = _resolve_fasta_backend()
|
|
381
|
+
_ensure_fasta_index(input_FASTA)
|
|
382
|
+
|
|
383
|
+
fasta_handle = None
|
|
384
|
+
if backend == "python":
|
|
385
|
+
pysam_mod = _require_pysam()
|
|
386
|
+
fasta_handle = pysam_mod.FastaFile(str(input_FASTA))
|
|
312
387
|
|
|
313
388
|
# Open BED + output FASTA
|
|
314
389
|
with input_bed.open("r") as bed, output_FASTA.open("w") as out_fasta:
|
|
@@ -319,15 +394,24 @@ def subsample_fasta_from_bed(
|
|
|
319
394
|
end = int(fields[2]) # BED is 0-based and end is exclusive
|
|
320
395
|
desc = " ".join(fields[3:]) if len(fields) > 3 else ""
|
|
321
396
|
|
|
322
|
-
if
|
|
397
|
+
if backend == "python":
|
|
398
|
+
assert fasta_handle is not None
|
|
399
|
+
if chrom not in fasta_handle.references:
|
|
400
|
+
logger.warning(f"{chrom} not found in FASTA")
|
|
401
|
+
continue
|
|
402
|
+
sequence = fasta_handle.fetch(chrom, start, end)
|
|
403
|
+
else:
|
|
404
|
+
sequence = _fetch_sequence_with_samtools(input_FASTA, chrom, start, end)
|
|
405
|
+
|
|
406
|
+
if not sequence:
|
|
323
407
|
logger.warning(f"{chrom} not found in FASTA")
|
|
324
408
|
continue
|
|
325
409
|
|
|
326
|
-
# pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
|
|
327
|
-
sequence = fasta[chrom][start:end].seq
|
|
328
|
-
|
|
329
410
|
header = f">{chrom}:{start}-{end}"
|
|
330
411
|
if desc:
|
|
331
412
|
header += f" {desc}"
|
|
332
413
|
|
|
333
414
|
out_fasta.write(f"{header}\n{sequence}\n")
|
|
415
|
+
|
|
416
|
+
if fasta_handle is not None:
|
|
417
|
+
fasta_handle.close()
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import glob
|
|
2
4
|
import os
|
|
3
5
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
@@ -7,9 +9,9 @@ from typing import Dict, List, Optional, Union
|
|
|
7
9
|
import numpy as np
|
|
8
10
|
import pandas as pd
|
|
9
11
|
import scipy.sparse as sp
|
|
10
|
-
from pod5 import Reader
|
|
11
12
|
|
|
12
13
|
from smftools.logging_utils import get_logger
|
|
14
|
+
from smftools.optional_imports import require
|
|
13
15
|
|
|
14
16
|
logger = get_logger(__name__)
|
|
15
17
|
|
|
@@ -82,6 +84,112 @@ def add_demux_type_annotation(
|
|
|
82
84
|
return adata
|
|
83
85
|
|
|
84
86
|
|
|
87
|
+
def add_read_tag_annotations(
    adata,
    bam_files: Optional[List[str]] = None,
    read_tags: Optional[Dict[str, Dict[str, object]]] = None,
    tag_names: Optional[List[str]] = None,
    include_flags: bool = True,
    include_cigar: bool = True,
    extract_read_tags_from_bam_callable=None,
    samtools_backend: str | None = "auto",
):
    """Write per-read tag metadata into ``adata.obs``.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_files: Optional list of BAM files to extract tags from; only
            consulted when ``read_tags`` is not supplied.
        read_tags: Optional mapping of read name to tag dict.
        tag_names: Optional list of BAM tag names to extract (e.g. ["NM", "MD", "MM", "ML"]).
        include_flags: Whether to add a FLAGS list column.
        include_cigar: Whether to add the CIGAR string column.
        extract_read_tags_from_bam_callable: Optional callable to extract tags from a BAM.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        None (mutates adata in-place).

    Raises:
        ValueError: If BAM extraction is needed but no extractor is available,
            or the extractor returns something other than a dict.
    """
    if read_tags is None:
        read_tags = {}
        if bam_files:
            # Prefer the injected callable, then a module-level function of
            # the conventional name if one exists.
            extractor = extract_read_tags_from_bam_callable
            if not extractor:
                extractor = globals().get("extract_read_tags_from_bam")
            if extractor is None:
                raise ValueError(
                    "No `read_tags` provided and `extract_read_tags_from_bam` not found."
                )
            for bam in bam_files:
                per_bam_tags = extractor(
                    bam,
                    tag_names=tag_names,
                    include_flags=include_flags,
                    include_cigar=include_cigar,
                    samtools_backend=samtools_backend,
                )
                if not isinstance(per_bam_tags, dict):
                    raise ValueError(f"extract_read_tags_from_bam returned non-dict for {bam}")
                read_tags.update(per_bam_tags)

    if not read_tags:
        return

    # One row per read, one column per tag; align rows to the obs order so
    # reads without tags come out as missing values.
    tag_table = pd.DataFrame.from_dict(read_tags, orient="index")
    aligned = tag_table.reindex(adata.obs_names)
    for column, series in aligned.items():
        adata.obs[column] = series.values
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def add_secondary_supplementary_alignment_flags(
    adata,
    bam_path: str | Path,
    *,
    uns_flag: str = "add_secondary_supplementary_flags_performed",
    bypass: bool = False,
    force_redo: bool = False,
    samtools_backend: str | None = "auto",
) -> None:
    """Record secondary/supplementary alignment information per read.

    Adds four ``adata.obs`` columns (``has_secondary_alignment``,
    ``has_supplementary_alignment``, ``secondary_alignment_spans``,
    ``supplementary_alignment_spans``) and sets ``adata.uns[uns_flag]`` to
    True once finished.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_path: Path to the aligned/sorted BAM to scan.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        bypass: Whether to skip annotation.
        force_redo: Whether to recompute even if ``uns_flag`` is set.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
    """
    previously_done = bool(adata.uns.get(uns_flag, False))
    if bypass or (previously_done and not force_redo):
        return

    # Imported here rather than at module import time.
    from .bam_functions import (
        extract_secondary_supplementary_alignment_spans,
        find_secondary_supplementary_read_names,
    )

    names_result = find_secondary_supplementary_read_names(
        bam_path,
        adata.obs_names,
        samtools_backend=samtools_backend,
    )
    secondary_reads, supplementary_reads = names_result

    spans_result = extract_secondary_supplementary_alignment_spans(
        bam_path,
        adata.obs_names,
        samtools_backend=samtools_backend,
    )
    secondary_spans, supplementary_spans = spans_result

    adata.obs["has_secondary_alignment"] = adata.obs_names.isin(secondary_reads)
    adata.obs["has_supplementary_alignment"] = adata.obs_names.isin(supplementary_reads)
    # Reads with no entry in the span mappings get None.
    adata.obs["secondary_alignment_spans"] = [
        secondary_spans.get(name) for name in adata.obs_names
    ]
    adata.obs["supplementary_alignment_spans"] = [
        supplementary_spans.get(name) for name in adata.obs_names
    ]
    adata.uns[uns_flag] = True
|
|
191
|
+
|
|
192
|
+
|
|
85
193
|
def add_read_length_and_mapping_qc(
|
|
86
194
|
adata,
|
|
87
195
|
bam_files: Optional[List[str]] = None,
|
|
@@ -90,6 +198,7 @@ def add_read_length_and_mapping_qc(
|
|
|
90
198
|
extract_read_features_from_bam_callable=None,
|
|
91
199
|
bypass: bool = False,
|
|
92
200
|
force_redo: bool = True,
|
|
201
|
+
samtools_backend: str | None = "auto",
|
|
93
202
|
):
|
|
94
203
|
"""
|
|
95
204
|
Populate adata.obs with read/mapping QC columns.
|
|
@@ -101,7 +210,8 @@ def add_read_length_and_mapping_qc(
|
|
|
101
210
|
bam_files
|
|
102
211
|
Optional list of BAM files to extract metrics from. Ignored if read_metrics supplied.
|
|
103
212
|
read_metrics
|
|
104
|
-
Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
|
|
213
|
+
Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
|
|
214
|
+
mapping_quality, reference_start, reference_end]
|
|
105
215
|
If provided, this will be used directly and bam_files will be ignored.
|
|
106
216
|
uns_flag
|
|
107
217
|
key in final_adata.uns used to record that QC was performed (kept the name with original misspelling).
|
|
@@ -133,7 +243,7 @@ def add_read_length_and_mapping_qc(
|
|
|
133
243
|
"No `read_metrics` provided and `extract_read_features_from_bam` not found."
|
|
134
244
|
)
|
|
135
245
|
for bam in bam_files:
|
|
136
|
-
bam_read_metrics = extractor(bam)
|
|
246
|
+
bam_read_metrics = extractor(bam, samtools_backend)
|
|
137
247
|
if not isinstance(bam_read_metrics, dict):
|
|
138
248
|
raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
|
|
139
249
|
read_metrics.update(bam_read_metrics)
|
|
@@ -151,10 +261,12 @@ def add_read_length_and_mapping_qc(
|
|
|
151
261
|
adata.obs["reference_length"] = np.full(n, np.nan)
|
|
152
262
|
adata.obs["read_quality"] = np.full(n, np.nan)
|
|
153
263
|
adata.obs["mapping_quality"] = np.full(n, np.nan)
|
|
264
|
+
adata.obs["reference_start"] = np.full(n, np.nan)
|
|
265
|
+
adata.obs["reference_end"] = np.full(n, np.nan)
|
|
154
266
|
else:
|
|
155
267
|
# Build DF robustly
|
|
156
268
|
# Convert values to lists where possible, else to [val, val, val...]
|
|
157
|
-
max_cols =
|
|
269
|
+
max_cols = 7
|
|
158
270
|
rows = {}
|
|
159
271
|
for k, v in read_metrics.items():
|
|
160
272
|
if isinstance(v, (list, tuple, np.ndarray)):
|
|
@@ -176,6 +288,8 @@ def add_read_length_and_mapping_qc(
|
|
|
176
288
|
"reference_length",
|
|
177
289
|
"mapped_length",
|
|
178
290
|
"mapping_quality",
|
|
291
|
+
"reference_start",
|
|
292
|
+
"reference_end",
|
|
179
293
|
],
|
|
180
294
|
)
|
|
181
295
|
|
|
@@ -188,6 +302,8 @@ def add_read_length_and_mapping_qc(
|
|
|
188
302
|
adata.obs["reference_length"] = df_reindexed["reference_length"].values
|
|
189
303
|
adata.obs["read_quality"] = df_reindexed["read_quality"].values
|
|
190
304
|
adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
|
|
305
|
+
adata.obs["reference_start"] = df_reindexed["reference_start"].values
|
|
306
|
+
adata.obs["reference_end"] = df_reindexed["reference_end"].values
|
|
191
307
|
|
|
192
308
|
# Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
|
|
193
309
|
# read_length_to_reference_length_ratio
|
|
@@ -228,6 +344,9 @@ def _collect_read_origins_from_pod5(pod5_path: str, target_ids: set[str]) -> dic
|
|
|
228
344
|
Worker function: scan one POD5 file and return a mapping
|
|
229
345
|
{read_id: pod5_basename} only for read_ids in `target_ids`.
|
|
230
346
|
"""
|
|
347
|
+
p5 = require("pod5", extra="ont", purpose="POD5 metadata")
|
|
348
|
+
Reader = p5.Reader
|
|
349
|
+
|
|
231
350
|
basename = os.path.basename(pod5_path)
|
|
232
351
|
mapping: dict[str, str] = {}
|
|
233
352
|
|