smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,23 +1,93 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import gzip
4
+ import shutil
5
+ import subprocess
4
6
  from concurrent.futures import ProcessPoolExecutor
7
+ from importlib.util import find_spec
5
8
  from pathlib import Path
6
- from typing import Dict, Iterable, Tuple
9
+ from typing import TYPE_CHECKING, Dict, Iterable, Tuple
7
10
 
8
11
  import numpy as np
9
- import pysam
10
12
  from Bio import SeqIO
11
13
  from Bio.Seq import Seq
12
14
  from Bio.SeqRecord import SeqRecord
13
- from pyfaidx import Fasta
14
15
 
15
16
  from smftools.logging_utils import get_logger
17
+ from smftools.optional_imports import require
16
18
 
17
19
  from ..readwrite import time_string
18
20
 
19
21
  logger = get_logger(__name__)
20
22
 
23
+ if TYPE_CHECKING:
24
+ import pysam as pysam_module
25
+
26
+
27
def _require_pysam():
    """Return the pysam module, importing it lazily when needed.

    Uses the module-level ``pysam_types`` cache when pysam was importable at
    module load time; otherwise defers to :func:`require`, which is expected
    to raise an informative error if pysam is not installed.

    Returns:
        The imported ``pysam`` module.
    """
    # NOTE(review): the previous return annotation referenced the
    # TYPE_CHECKING-only alias ``pysam_module`` — a module object, not a
    # type — so it was dropped rather than kept as a misleading hint.
    if pysam_types is not None:
        return pysam_types
    return require("pysam", extra="pysam", purpose="FASTA access")
31
+
32
+
33
# Module-level cache of the pysam module (or None when unavailable).
# ``find_spec`` probes importability without importing; ``require`` then
# performs the actual import. This assignment deliberately executes at
# module load time so backend checks elsewhere are a cheap None-test.
pysam_types = None
if find_spec("pysam") is not None:
    pysam_types = require("pysam", extra="pysam", purpose="FASTA access")
36
+
37
+
38
def _resolve_fasta_backend() -> str:
    """Resolve the backend to use for FASTA access.

    Returns:
        str: ``"python"`` when pysam is importable, ``"cli"`` when a
        ``samtools`` executable is available on PATH.

    Raises:
        RuntimeError: If neither pysam nor samtools is available.
    """
    if pysam_types is not None:
        return "python"
    # shutil is an unconditional stdlib import at module top; the previous
    # ``shutil is not None`` guard was dead code.
    if shutil.which("samtools"):
        return "cli"
    raise RuntimeError("FASTA access requires pysam or samtools in PATH.")
45
+
46
+
47
def _ensure_fasta_index(fasta: Path) -> None:
    """Create a ``.fai`` index for ``fasta`` if one does not already exist.

    Prefers the ``samtools faidx`` CLI when the executable is on PATH and
    falls back to ``pysam.faidx`` otherwise.

    Args:
        fasta: Path to the FASTA file to index.

    Raises:
        RuntimeError: If ``samtools faidx`` exits non-zero.
    """
    fai = fasta.with_suffix(fasta.suffix + ".fai")
    if fai.exists():
        return  # index already present; nothing to do
    # subprocess/shutil are unconditional stdlib imports, so the previous
    # ``subprocess is None or shutil is None`` guard was dead code — only
    # the samtools executable itself can be missing.
    if not shutil.which("samtools"):
        pysam_mod = _require_pysam()
        pysam_mod.faidx(str(fasta))
        return
    cp = subprocess.run(
        ["samtools", "faidx", str(fasta)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    if cp.returncode != 0:
        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
63
+
64
+
65
def _bed_to_faidx_region(chrom: str, start: int, end: int) -> str:
    """Convert 0-based half-open BED coordinates into a samtools ``faidx``
    region string (1-based, end-inclusive).

    Args:
        chrom: Reference/contig name.
        start: 0-based inclusive start (BED convention).
        end: 0-based exclusive end (BED convention).

    Returns:
        str: Region in ``chrom:start-end`` form with bounds ordered
        low-to-high, so degenerate intervals never produce an inverted range.
    """
    bounds = sorted((start + 1, end))
    return f"{chrom}:{bounds[0]}-{bounds[1]}"
72
+
73
+
74
def _fetch_sequence_with_samtools(fasta: Path, chrom: str, start: int, end: int) -> str:
    """Fetch a subsequence from an indexed FASTA via the ``samtools`` CLI.

    Args:
        fasta: Path to an indexed FASTA file.
        chrom: Reference/contig name.
        start: 0-based inclusive start (BED convention).
        end: 0-based exclusive end (BED convention).

    Returns:
        str: The extracted sequence; empty when samtools emits no sequence
        lines for the region.

    Raises:
        RuntimeError: If samtools is not on PATH or ``faidx`` exits non-zero.
    """
    # subprocess/shutil are unconditional stdlib imports; the previous
    # ``subprocess is None or shutil is None`` guard was dead code.
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")
    region = _bed_to_faidx_region(chrom, start, end)
    cp = subprocess.run(
        ["samtools", "faidx", str(fasta), region],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if cp.returncode != 0:
        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
    # Drop the ">" header line and join wrapped sequence lines.
    lines = [line.strip() for line in cp.stdout.splitlines() if line and not line.startswith(">")]
    return "".join(lines)
90
+
21
91
 
22
92
  def _convert_FASTA_record(
23
93
  record: SeqRecord,
@@ -160,7 +230,7 @@ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
160
230
  Path: Path to the index file or chromosome sizes file.
161
231
  """
162
232
  fasta = Path(fasta)
163
- pysam.faidx(str(fasta)) # creates <fasta>.fai
233
+ _require_pysam().faidx(str(fasta)) # creates <fasta>.fai
164
234
 
165
235
  fai = fasta.with_suffix(fasta.suffix + ".fai")
166
236
  if write_chrom_sizes:
@@ -307,8 +377,13 @@ def subsample_fasta_from_bed(
307
377
  # Ensure output directory exists
308
378
  output_directory.mkdir(parents=True, exist_ok=True)
309
379
 
310
- # Load the FASTA file using pyfaidx
311
- fasta = Fasta(str(input_FASTA)) # pyfaidx requires string paths
380
+ backend = _resolve_fasta_backend()
381
+ _ensure_fasta_index(input_FASTA)
382
+
383
+ fasta_handle = None
384
+ if backend == "python":
385
+ pysam_mod = _require_pysam()
386
+ fasta_handle = pysam_mod.FastaFile(str(input_FASTA))
312
387
 
313
388
  # Open BED + output FASTA
314
389
  with input_bed.open("r") as bed, output_FASTA.open("w") as out_fasta:
@@ -319,15 +394,24 @@ def subsample_fasta_from_bed(
319
394
  end = int(fields[2]) # BED is 0-based and end is exclusive
320
395
  desc = " ".join(fields[3:]) if len(fields) > 3 else ""
321
396
 
322
- if chrom not in fasta:
397
+ if backend == "python":
398
+ assert fasta_handle is not None
399
+ if chrom not in fasta_handle.references:
400
+ logger.warning(f"{chrom} not found in FASTA")
401
+ continue
402
+ sequence = fasta_handle.fetch(chrom, start, end)
403
+ else:
404
+ sequence = _fetch_sequence_with_samtools(input_FASTA, chrom, start, end)
405
+
406
+ if not sequence:
323
407
  logger.warning(f"{chrom} not found in FASTA")
324
408
  continue
325
409
 
326
- # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
327
- sequence = fasta[chrom][start:end].seq
328
-
329
410
  header = f">{chrom}:{start}-{end}"
330
411
  if desc:
331
412
  header += f" {desc}"
332
413
 
333
414
  out_fasta.write(f"{header}\n{sequence}\n")
415
+
416
+ if fasta_handle is not None:
417
+ fasta_handle.close()
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import glob
2
4
  import os
3
5
  from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -7,9 +9,9 @@ from typing import Dict, List, Optional, Union
7
9
  import numpy as np
8
10
  import pandas as pd
9
11
  import scipy.sparse as sp
10
- from pod5 import Reader
11
12
 
12
13
  from smftools.logging_utils import get_logger
14
+ from smftools.optional_imports import require
13
15
 
14
16
  logger = get_logger(__name__)
15
17
 
@@ -82,6 +84,112 @@ def add_demux_type_annotation(
82
84
  return adata
83
85
 
84
86
 
87
def add_read_tag_annotations(
    adata,
    bam_files: Optional[List[str]] = None,
    read_tags: Optional[Dict[str, Dict[str, object]]] = None,
    tag_names: Optional[List[str]] = None,
    include_flags: bool = True,
    include_cigar: bool = True,
    extract_read_tags_from_bam_callable=None,
    samtools_backend: str | None = "auto",
):
    """Populate adata.obs with read tag metadata.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_files: Optional list of BAM files to extract tags from.
        read_tags: Optional mapping of read name to tag dict. The mapping is
            read-only here; BAM-derived tags are merged into a local copy.
        tag_names: Optional list of BAM tag names to extract (e.g. ["NM", "MD", "MM", "ML"]).
        include_flags: Whether to add a FLAGS list column.
        include_cigar: Whether to add the CIGAR string column.
        extract_read_tags_from_bam_callable: Optional callable to extract tags from a BAM.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).

    Returns:
        None (mutates adata in-place).

    Raises:
        ValueError: If no tag extractor can be resolved, or an extractor
            returns something other than a dict.
    """
    # Merge into a local copy so a caller-supplied ``read_tags`` dict is
    # never mutated when BAM-derived tags are added on top of it.
    merged_tags: Dict[str, Dict[str, object]] = dict(read_tags) if read_tags else {}
    if bam_files:
        extractor = extract_read_tags_from_bam_callable or globals().get(
            "extract_read_tags_from_bam"
        )
        if extractor is None:
            raise ValueError(
                "No `read_tags` provided and `extract_read_tags_from_bam` not found."
            )
        for bam in bam_files:
            bam_read_tags = extractor(
                bam,
                tag_names=tag_names,
                include_flags=include_flags,
                include_cigar=include_cigar,
                samtools_backend=samtools_backend,
            )
            if not isinstance(bam_read_tags, dict):
                raise ValueError(f"extract_read_tags_from_bam returned non-dict for {bam}")
            merged_tags.update(bam_read_tags)

    if not merged_tags:
        return

    # Align tag rows to adata's obs index; reads without tags become NaN rows.
    df = pd.DataFrame.from_dict(merged_tags, orient="index")
    df_reindexed = df.reindex(adata.obs_names)
    for column in df_reindexed.columns:
        adata.obs[column] = df_reindexed[column].values
141
+
142
+
143
def add_secondary_supplementary_alignment_flags(
    adata,
    bam_path: str | Path,
    *,
    uns_flag: str = "add_secondary_supplementary_flags_performed",
    bypass: bool = False,
    force_redo: bool = False,
    samtools_backend: str | None = "auto",
) -> None:
    """Annotate whether reads have secondary/supplementary alignments.

    Adds boolean ``has_secondary_alignment`` / ``has_supplementary_alignment``
    columns plus per-read span columns to ``adata.obs``, then records
    completion in ``adata.uns[uns_flag]``.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_path: Path to the aligned/sorted BAM to scan.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        bypass: Whether to skip annotation.
        force_redo: Whether to recompute even if ``uns_flag`` is set.
        samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
    """
    done_before = bool(adata.uns.get(uns_flag, False))
    if bypass or (done_before and not force_redo):
        return

    from .bam_functions import (
        extract_secondary_supplementary_alignment_spans,
        find_secondary_supplementary_read_names,
    )

    read_names = adata.obs_names
    sec_names, supp_names = find_secondary_supplementary_read_names(
        bam_path,
        read_names,
        samtools_backend=samtools_backend,
    )
    sec_spans, supp_spans = extract_secondary_supplementary_alignment_spans(
        bam_path,
        read_names,
        samtools_backend=samtools_backend,
    )

    adata.obs["has_secondary_alignment"] = read_names.isin(sec_names)
    adata.obs["has_supplementary_alignment"] = read_names.isin(supp_names)
    adata.obs["secondary_alignment_spans"] = [sec_spans.get(name) for name in read_names]
    adata.obs["supplementary_alignment_spans"] = [supp_spans.get(name) for name in read_names]
    adata.uns[uns_flag] = True
191
+
192
+
85
193
  def add_read_length_and_mapping_qc(
86
194
  adata,
87
195
  bam_files: Optional[List[str]] = None,
@@ -90,6 +198,7 @@ def add_read_length_and_mapping_qc(
90
198
  extract_read_features_from_bam_callable=None,
91
199
  bypass: bool = False,
92
200
  force_redo: bool = True,
201
+ samtools_backend: str | None = "auto",
93
202
  ):
94
203
  """
95
204
  Populate adata.obs with read/mapping QC columns.
@@ -101,7 +210,8 @@ def add_read_length_and_mapping_qc(
101
210
  bam_files
102
211
  Optional list of BAM files to extract metrics from. Ignored if read_metrics supplied.
103
212
  read_metrics
104
- Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality]
213
+ Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
214
+ mapping_quality, reference_start, reference_end]
105
215
  If provided, this will be used directly and bam_files will be ignored.
106
216
  uns_flag
107
217
  key in final_adata.uns used to record that QC was performed (kept the name with original misspelling).
@@ -133,7 +243,7 @@ def add_read_length_and_mapping_qc(
133
243
  "No `read_metrics` provided and `extract_read_features_from_bam` not found."
134
244
  )
135
245
  for bam in bam_files:
136
- bam_read_metrics = extractor(bam)
246
+ bam_read_metrics = extractor(bam, samtools_backend)
137
247
  if not isinstance(bam_read_metrics, dict):
138
248
  raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
139
249
  read_metrics.update(bam_read_metrics)
@@ -151,10 +261,12 @@ def add_read_length_and_mapping_qc(
151
261
  adata.obs["reference_length"] = np.full(n, np.nan)
152
262
  adata.obs["read_quality"] = np.full(n, np.nan)
153
263
  adata.obs["mapping_quality"] = np.full(n, np.nan)
264
+ adata.obs["reference_start"] = np.full(n, np.nan)
265
+ adata.obs["reference_end"] = np.full(n, np.nan)
154
266
  else:
155
267
  # Build DF robustly
156
268
  # Convert values to lists where possible, else to [val, val, val...]
157
- max_cols = 5
269
+ max_cols = 7
158
270
  rows = {}
159
271
  for k, v in read_metrics.items():
160
272
  if isinstance(v, (list, tuple, np.ndarray)):
@@ -176,6 +288,8 @@ def add_read_length_and_mapping_qc(
176
288
  "reference_length",
177
289
  "mapped_length",
178
290
  "mapping_quality",
291
+ "reference_start",
292
+ "reference_end",
179
293
  ],
180
294
  )
181
295
 
@@ -188,6 +302,8 @@ def add_read_length_and_mapping_qc(
188
302
  adata.obs["reference_length"] = df_reindexed["reference_length"].values
189
303
  adata.obs["read_quality"] = df_reindexed["read_quality"].values
190
304
  adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
305
+ adata.obs["reference_start"] = df_reindexed["reference_start"].values
306
+ adata.obs["reference_end"] = df_reindexed["reference_end"].values
191
307
 
192
308
  # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
193
309
  # read_length_to_reference_length_ratio
@@ -228,6 +344,9 @@ def _collect_read_origins_from_pod5(pod5_path: str, target_ids: set[str]) -> dic
228
344
  Worker function: scan one POD5 file and return a mapping
229
345
  {read_id: pod5_basename} only for read_ids in `target_ids`.
230
346
  """
347
+ p5 = require("pod5", extra="ont", purpose="POD5 metadata")
348
+ Reader = p5.Reader
349
+
231
350
  basename = os.path.basename(pod5_path)
232
351
  mapping: dict[str, str] = {}
233
352