samsampleX 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- samsamplex/__init__.py +1 -0
- samsamplex/bed.py +146 -0
- samsamplex/cli.py +342 -0
- samsamplex/depth.py +126 -0
- samsamplex/mapback.py +365 -0
- samsamplex/metrics.py +64 -0
- samsamplex/modes.py +4 -0
- samsamplex/plot.py +217 -0
- samsamplex/sample.py +257 -0
- samsamplex-0.1.0.dist-info/METADATA +316 -0
- samsamplex-0.1.0.dist-info/RECORD +14 -0
- samsamplex-0.1.0.dist-info/WHEEL +5 -0
- samsamplex-0.1.0.dist-info/entry_points.txt +2 -0
- samsamplex-0.1.0.dist-info/top_level.txt +1 -0
samsamplex/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
samsamplex/bed.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""BED file I/O and depth-array combination utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TextIO
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from .depth import DepthArray
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def contig_names_match(name1: str, name2: str) -> bool:
    """Return True when both names denote the same contig.

    The names are first compared verbatim. If they differ, a leading
    ``"chr"`` is dropped from whichever name carries one and the remainders
    are compared instead, so ``"chr6"`` matches ``"6"``.
    """

    def _without_chr(label: str) -> str:
        # Only the literal, lower-case three-character prefix is removed.
        return label[3:] if label.startswith("chr") else label

    return name1 == name2 or _without_chr(name1) == _without_chr(name2)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ── Writing ──────────────────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def write_bed_entry(fp: TextIO, chrom: str, start: int, end: int, depth: int) -> None:
    """Write one tab-separated BED4 record (chrom, start, end, depth) to *fp*.

    The record is terminated with a newline; nothing is returned.
    """
    record = "{}\t{}\t{}\t{}\n".format(chrom, start, end, depth)
    fp.write(record)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def write_bed_output(fp: TextIO, arr: DepthArray, collapse: int = 0) -> None:
    """Serialise *arr* as BED4 lines on *fp*, optionally merging similar runs.

    With ``collapse == 0`` one line is written per position. Otherwise an
    interval keeps growing while each following depth stays within
    *collapse* of the depth at the interval's first position, and the
    interval is reported with that first depth. An array whose ``length``
    is 0 produces no output.
    """
    if arr.length == 0:
        return

    contig = arr.contig
    origin = arr.start
    values = arr.depths

    if collapse == 0:
        # One BED line per base.
        for offset in range(arr.length):
            begin = origin + offset
            write_bed_entry(fp, contig, begin, begin + 1, int(values[offset]))
        return

    run_begin = origin
    run_depth = int(values[0])

    for offset in range(1, arr.length):
        here = int(values[offset])
        if abs(here - run_depth) > collapse:
            # Depth moved too far from the run's first value: close the run
            # just before this position and open a new one here.
            write_bed_entry(fp, contig, run_begin, origin + offset, run_depth)
            run_begin = origin + offset
            run_depth = here

    # The final run always extends to the end of the array.
    write_bed_entry(fp, contig, run_begin, arr.end, run_depth)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ── Reading ──────────────────────────────────────────────────────────────────
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def bed_read_depths(
    bed_path: str,
    contig: str,
    region_start: int,
    region_end: int,
) -> DepthArray:
    """Read depth values from a BED4 file into a DepthArray for a region.

    Each record is ``chrom  start  end  depth``. Lines are split on tabs,
    falling back to arbitrary whitespace when fewer than four tab-separated
    fields are found. The following lines are ignored: blank lines, ``#``
    comments, UCSC ``track`` / ``browser`` header lines, records with fewer
    than four fields, records on a different contig (compared with
    ``contig_names_match``) and records that do not overlap the region.

    Positions not covered by any record keep depth 0. When records overlap,
    the one appearing later in the file overwrites the earlier value.

    Args:
        bed_path: Path of the BED file to read.
        contig: Contig to extract.
        region_start: First position of the region (used as array offset 0).
        region_end: End of the region, exclusive.

    Returns:
        A DepthArray of ``region_end - region_start`` int32 values.

    Raises:
        ValueError: If start, end or depth of a considered record is not an
            integer.
    """
    depths = np.zeros(region_end - region_start, dtype=np.int32)

    with open(bed_path) as fp:
        for raw in fp:
            line = raw.rstrip("\n")
            if not line or line.startswith("#"):
                continue

            # UCSC-style files may start with "track ..." / "browser ..."
            # header lines; they are not records and would otherwise reach
            # int() below and raise. Whitespace-only lines are skipped here
            # too (they never had four fields anyway).
            tokens = line.split(None, 1)
            if not tokens or tokens[0] in ("track", "browser"):
                continue

            parts = line.split("\t")
            if len(parts) < 4:
                parts = line.split()
            if len(parts) < 4:
                continue

            chrom = parts[0]
            bed_start = int(parts[1])
            bed_end = int(parts[2])
            depth = int(parts[3])

            if not contig_names_match(chrom, contig):
                continue

            # Clip the record to the requested region.
            ov_start = max(bed_start, region_start)
            ov_end = min(bed_end, region_end)
            if ov_start >= ov_end:
                continue

            depths[ov_start - region_start : ov_end - region_start] = depth

    return DepthArray(contig=contig, start=region_start, end=region_end, depths=depths)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ── Combining multiple templates ─────────────────────────────────────────────
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def bed_combine_depths(
    arrays: list[DepthArray],
    mode: str = "min",
    seed: int = 42,
) -> DepthArray:
    """Merge several DepthArrays into one, position by position.

    A single input is returned as a new DepthArray holding a copy of its
    depths, whatever *mode* is. Otherwise the depth vectors are stacked and
    reduced along the array axis:

    * ``"min"`` / ``"max"`` - element-wise minimum / maximum.
    * ``"mean"`` - sum floor-divided by the number of arrays.
    * ``"median"`` - median rounded to the nearest integer with ``np.rint``.
    * ``"random"`` - an integer drawn from ``[min, max]`` at each position
      using ``random.Random(seed)``; positions where min equals max keep
      that value without consuming a random draw.

    The contig and bounds of the result are taken from the first array.

    Raises:
        ValueError: If *mode* is not one of the values above.
    """
    if len(arrays) == 1:
        (only,) = arrays
        return DepthArray(
            contig=only.contig,
            start=only.start,
            end=only.end,
            depths=only.depths.copy(),
        )

    # Rows are input arrays, columns are positions.
    matrix = np.stack([item.depths for item in arrays], axis=0)

    if mode == "min":
        merged = matrix.min(axis=0)
    elif mode == "max":
        merged = matrix.max(axis=0)
    elif mode == "mean":
        totals = matrix.sum(axis=0)
        merged = (totals // len(arrays)).astype(np.int32)
    elif mode == "median":
        merged = np.rint(np.median(matrix, axis=0)).astype(np.int32)
    elif mode == "random":
        rng = random.Random(seed)
        lows = matrix.min(axis=0)
        highs = matrix.max(axis=0)
        picks = []
        for lo, hi in zip(lows, highs):
            # Draw only when there is an actual range to draw from.
            picks.append(lo if lo == hi else rng.randint(lo, hi))
        merged = np.array(picks, dtype=np.int32)
    else:
        raise ValueError(f"Unknown combine mode: {mode}")

    template = arrays[0]
    return DepthArray(
        contig=template.contig,
        start=template.start,
        end=template.end,
        depths=merged,
    )
|
samsamplex/cli.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Command-line interface for samsamplex."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from . import __version__
|
|
9
|
+
from .modes import DEPTH_MODES
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _add_map_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``map`` subcommand and its options on *subparsers*."""
    map_cmd = subparsers.add_parser(
        "map", help="Extract depth of coverage from BAM to BED template"
    )

    # (flag, add_argument settings), listed in registration order.
    option_specs = (
        (
            "--template-bam",
            {"nargs": "+", "required": True, "help": "Input BAM file(s)"},
        ),
        ("--region", {"required": True, "help": "Target region (samtools-style)"}),
        ("--out-bed", {"required": True, "help": "Output BED file"}),
        (
            "--collapse",
            {
                "type": int,
                "default": 0,
                "help": "Merge consecutive positions with depth diff <= INT [default: 0]",
            },
        ),
        (
            "--mode",
            {
                "default": "mean",
                "choices": DEPTH_MODES,
                "help": "How to combine depths when multiple BAMs given [default: mean]",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for --mode random [default: 42]",
            },
        ),
    )
    for flag, settings in option_specs:
        map_cmd.add_argument(flag, **settings)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _add_sample_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``sample`` subcommand and its options on *subparsers*.

    ``--template-bed`` is optional at the parser level (zero or more paths);
    its help text states it is required unless ``--uniform`` is given.
    """
    sample_cmd = subparsers.add_parser(
        "sample",
        help="Sample reads from BAM to match template depth distribution",
    )

    stat_help = (
        "Statistic for summarising ratio over read span; "
        "'random' picks one ratio from covered bases deterministically from span+seed "
        "[default: mean]"
    )

    # (flag, add_argument settings), listed in registration order.
    option_specs = (
        ("--source-bam", {"required": True, "help": "Input BAM file to sample from"}),
        (
            "--template-bed",
            {
                "nargs": "*",
                "default": [],
                "help": "Template BED file(s) with depth values (required unless --uniform)",
            },
        ),
        (
            "--uniform",
            {
                "type": float,
                "default": None,
                "metavar": "FRACTION",
                "help": "Uniform sampling: retain fraction of reads by hash (0-1). Bypasses template/depth logic.",
            },
        ),
        ("--region", {"required": True, "help": "Target region (samtools-style)"}),
        ("--out-bam", {"required": True, "help": "Output BAM file"}),
        (
            "--mode",
            {
                "default": "random",
                "choices": DEPTH_MODES,
                "help": "How to combine multiple template BEDs [default: random]",
            },
        ),
        ("--stat", {"default": "mean", "choices": DEPTH_MODES, "help": stat_help}),
        ("--seed", {"type": int, "default": 42, "help": "Random seed [default: 42]"}),
        # A "--no-sort" flag is intentionally not registered here (disabled).
        (
            "--threads",
            {
                "type": int,
                "default": 2,
                "help": "Number of writer threads to use [default: 2]",
            },
        ),
    )
    for flag, settings in option_specs:
        sample_cmd.add_argument(flag, **settings)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _add_plot_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``plot`` subcommand and its options on *subparsers*.

    Exactly one template (``--template-bam`` or ``--template-bed``) and
    exactly one output (``--out-png`` or ``--out-tsv``) must be supplied.
    """
    plot_cmd = subparsers.add_parser(
        "plot",
        help="Compare depth of coverage and output as PNG plot or TSV data",
    )

    # Plain required options, in registration order.
    for flag, description in (
        ("--source-bam", "Source BAM file"),
        ("--out-bam", "Output BAM file (from sampling)"),
        ("--region", "Target region (samtools-style)"),
    ):
        plot_cmd.add_argument(flag, required=True, help=description)

    template_choice = plot_cmd.add_mutually_exclusive_group(required=True)
    template_choice.add_argument("--template-bam", help="Template BAM file")
    template_choice.add_argument("--template-bed", help="Template BED file")

    output_choice = plot_cmd.add_mutually_exclusive_group(required=True)
    output_choice.add_argument("--out-png", help="Output PNG plot file")
    output_choice.add_argument("--out-tsv", help="Output TSV data file")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _add_mapback_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``mapback`` subcommand and its options on *subparsers*."""
    mapback_cmd = subparsers.add_parser(
        "mapback",
        help="Remap HLA*LA PRG-mapped reads back to chr6 coordinates",
    )

    # (flag, add_argument settings), listed in registration order.
    option_specs = (
        ("--source-bam", {"required": True, "help": "HLA*LA-remapped BAM file"}),
        (
            "--region",
            {"required": True, "help": "Target region on chr6 (samtools-style)"},
        ),
        ("--out-bam", {"required": True, "help": "Output BAM file"}),
        (
            "--genome-build",
            {
                "required": True,
                "choices": ("GRCh38", "GRCh37"),
                "help": "Reference genome build",
            },
        ),
        (
            "--prg-seq",
            {
                "default": "HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
                "help": "Path to HLA*LA sequences.txt [default: HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt]",
            },
        ),
        (
            "--no-sort",
            {"action": "store_true", "help": "Skip sorting/indexing output BAM"},
        ),
    )
    for flag, settings in option_specs:
        mapback_cmd.add_argument(flag, **settings)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _add_stats_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``stats`` subcommand and its three required options."""
    stats_cmd = subparsers.add_parser(
        "stats",
        help="Compare depth distributions between two BAM or BED files",
    )

    # Flag -> help text; every option of this subcommand is required.
    required_options = {
        "--a": "First input file — BAM or BED (reference)",
        "--b": "Second input file — BAM or BED (comparison)",
        "--region": "Region to compare (samtools-style)",
    }
    for flag, description in required_options.items():
        stats_cmd.add_argument(flag, required=True, help=description)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# ── Subcommand handlers ─────────────────────────────────────────────────────
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _run_map(args: argparse.Namespace) -> int:
    """Handle ``map``: compute depth over a region from BAM(s), write a BED.

    The region's contig is resolved against the header of the first template
    BAM; an unspecified start becomes 0 and an unspecified end becomes that
    contig's length. With several BAMs, the per-file depth arrays are merged
    using ``args.mode`` and ``args.seed``. Progress is reported on stderr.

    Returns:
        0 on success, 1 when the contig cannot be found in the first BAM.
    """
    from .bed import bed_combine_depths, write_bed_output
    from .depth import depth_from_bam, region_parse, resolve_contig_name

    import pysam

    def log(msg):
        print(msg, file=sys.stderr)

    region = region_parse(args.region)
    template_bams = args.template_bam  # one or more BAM paths

    log(f"[map] Template BAM(s): {template_bams}")
    log(f"[map] Region: {args.region}")
    log(f"[map] Collapse: {args.collapse}")
    log(f"[map] Output BED: {args.out_bed}")
    if len(template_bams) > 1:
        # The combine mode only matters when there is something to combine.
        log(f"[map] Mode: {args.mode}")

    # Only the first BAM's header is consulted for contig name and length.
    with pysam.AlignmentFile(template_bams[0], "rb") as bam:
        resolved = resolve_contig_name(bam.header, region.contig)
        if resolved is None:
            log(f"Error: Contig '{region.contig}' not found in BAM")
            return 1
        region.contig = resolved
        contig_len = bam.get_reference_length(resolved)

    # Negative bounds are the parser's "not specified" markers.
    if region.start < 0:
        region.start = 0
    if region.end < 0:
        region.end = contig_len

    log(f"[map] Parsed region: {region.contig}:{region.start}-{region.end}")
    log("[map] Computing depth array(s) (this may take a while)...")

    depth_arrays = [
        depth_from_bam(bam_path, region.contig, region.start, region.end)
        for bam_path in template_bams
    ]

    if len(depth_arrays) == 1:
        depth = depth_arrays[0]
    else:
        depth = bed_combine_depths(depth_arrays, mode=args.mode, seed=args.seed)

    log(f"[map] Computed depth for {depth.length} positions")

    suffix = " (collapsed)" if args.collapse > 0 else ""
    log(f"[map] Writing BED file{suffix}...")

    with open(args.out_bed, "w") as fp:
        write_bed_output(fp, depth, collapse=args.collapse)

    log(f"[map] Done. Output written to: {args.out_bed}")
    return 0
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _run_sample(args: argparse.Namespace) -> int:
    """Handle ``sample``: validate the options, then delegate to ``sample_run``.

    Either at least one ``--template-bed`` or a ``--uniform`` fraction must
    be given; a supplied fraction has to lie in ``(0, 1]``.

    Returns:
        1 after printing an error to stderr when validation fails, otherwise
        whatever ``sample_run`` returns.
    """
    from .sample import sample_run

    def log(msg):
        print(msg, file=sys.stderr)

    if args.uniform is None:
        if not args.template_bed:
            log("Error: Either --template-bed or --uniform is required")
            return 1
    elif args.uniform <= 0 or args.uniform > 1:
        log(f"Error: --uniform must be in (0, 1], got {args.uniform}")
        return 1

    call_options = {
        "source_bam": args.source_bam,
        "template_beds": args.template_bed if args.template_bed else [],
        "region_str": args.region,
        "out_bam": args.out_bam,
        "mode": args.mode,
        "stat": args.stat,
        "seed": args.seed,
        # "no_sort" is not forwarded: the CLI flag is disabled for this command.
        "uniform_fraction": args.uniform,
        "threads": args.threads,
    }
    return sample_run(**call_options)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _run_plot(args: argparse.Namespace) -> int:
    """Handle ``plot`` by forwarding the parsed options to ``plot_run``.

    Returns whatever ``plot_run`` returns.
    """
    from .plot import plot_run

    call_options = {
        "source_bam": args.source_bam,
        "out_bam": args.out_bam,
        "region_str": args.region,
        "template_bam": args.template_bam,
        "template_bed": args.template_bed,
        "out_png": args.out_png,
        "out_tsv": args.out_tsv,
    }
    return plot_run(**call_options)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _run_mapback(args: argparse.Namespace) -> int:
    """Handle ``mapback`` by forwarding the parsed options to ``mapback_run``.

    Returns whatever ``mapback_run`` returns.
    """
    from .mapback import mapback_run

    call_options = {
        "source_bam": args.source_bam,
        "region_str": args.region,
        "out_bam": args.out_bam,
        "genome_build": args.genome_build,
        "prg_seq": args.prg_seq,
        "no_sort": args.no_sort,
    }
    return mapback_run(**call_options)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _run_stats(args: argparse.Namespace) -> int:
    """Handle ``stats``: compare the depth of inputs A and B over a region.

    An input is treated as BED when its path ends with ``.bed`` and as BAM
    otherwise. The first BAM input (A before B) is used to resolve the
    contig name and to fill in unspecified region bounds; when both inputs
    are BED files the region must carry explicit start and end. The report
    is printed to stderr.

    Returns:
        0 on success, 1 when the contig is not found in the BAM or when the
        region is underspecified for two BED inputs.
    """
    from .depth import region_parse
    from .metrics import depth_from_path, metrics_calculate, metrics_print

    def log(msg):
        print(msg, file=sys.stderr)

    def _fmt_type(path: str) -> str:
        return "BED" if path.endswith(".bed") else "BAM"

    region = region_parse(args.region)

    # BED files carry no contig lengths, so look for a BAM to resolve bounds.
    bam_input = next(
        (path for path in (args.a, args.b) if not path.endswith(".bed")),
        None,
    )

    if bam_input is None:
        if region.start < 0 or region.end < 0:
            log("Error: --region must specify explicit start and end when both inputs are BED files")
            return 1
    else:
        import pysam
        from .depth import resolve_contig_name

        with pysam.AlignmentFile(bam_input, "rb") as bam:
            resolved = resolve_contig_name(bam.header, region.contig)
            if resolved is None:
                log(f"Error: Contig '{region.contig}' not found in {bam_input}")
                return 1
            region.contig = resolved
            if region.start < 0:
                region.start = 0
            if region.end < 0:
                region.end = bam.get_reference_length(resolved)

    log(f"Input A ({_fmt_type(args.a)}): {args.a}")
    log(f"Input B ({_fmt_type(args.b)}): {args.b}")
    # Shown 1-based: the internal start is 0-based.
    log(f"Region: {region.contig}:{region.start + 1}-{region.end}")

    depth_a = depth_from_path(args.a, region.contig, region.start, region.end)
    depth_b = depth_from_path(args.b, region.contig, region.start, region.end)

    result = metrics_calculate(depth_a, depth_b)

    log("")
    log("========== Comparison Metrics ==========")
    log(f"A (reference): {args.a}")
    log(f"B (comparison): {args.b}")
    log(f"Region: {region.contig}:{region.start + 1}-{region.end}")
    log(f"Positions compared: {depth_a.length}")
    log("-" * 41)
    metrics_print(result, label_a="A", label_b="B")

    return 0
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# ── Entry point ──────────────────────────────────────────────────────────────
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse *argv* and run the chosen subcommand.

    Always terminates through ``sys.exit``: with status 1 after printing the
    help text when no subcommand is given, otherwise with the integer
    returned by the subcommand handler.
    """
    parser = argparse.ArgumentParser(
        prog="samsamplex",
        description=f"samsamplex v{__version__} — Depth-aware BAM file sampling",
    )
    parser.add_argument(
        "-v", "--version", action="version", version=f"samsamplex v{__version__}"
    )

    subparsers = parser.add_subparsers(dest="command")
    # Registration order determines the order shown in the help output.
    for register in (
        _add_map_parser,
        _add_sample_parser,
        _add_plot_parser,
        _add_mapback_parser,
        _add_stats_parser,
    ):
        register(subparsers)

    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(1)

    handlers = {
        "map": _run_map,
        "sample": _run_sample,
        "plot": _run_plot,
        "mapback": _run_mapback,
        "stats": _run_stats,
    }
    handler = handlers[args.command]
    sys.exit(handler(args))
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
if __name__ == "__main__":
|
|
342
|
+
main()
|
samsamplex/depth.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Depth array computation from BAM files and region parsing utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pysam
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class Region:
    """A contig with optional bounds, as parsed from a samtools-style string.

    Coordinates are 0-based and half-open. A value of ``-1`` for ``start``
    or ``end`` is the marker for "not specified", i.e. the whole contig on
    that side.
    """

    contig: str
    # 0-based, inclusive; -1 stands for "whole contig".
    start: int = -1
    # 0-based, exclusive; -1 stands for "whole contig".
    end: int = -1
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DepthArray:
    """Depth values for the contiguous interval ``[start, end)`` of a contig.

    ``depths`` holds the per-position values and is left out of ``repr`` so
    that printing an instance stays short.
    """

    contig: str
    start: int
    end: int
    depths: np.ndarray = field(repr=False)

    @property
    def length(self) -> int:
        """Number of positions spanned, computed as ``end - start``."""
        return self.end - self.start
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def region_parse(region_str: str) -> Region:
    """Parse a samtools-style region string (e.g. chr1, chr1:1000-2000).

    Accepted forms are ``CONTIG``, ``CONTIG:START`` and ``CONTIG:START-END``.
    Input coordinates are 1-based; the returned Region uses 0-based half-open
    coordinates. As with samtools, the numbers may contain thousands
    separators (``chr1:1,000-2,000``). A start or end that is not given is
    returned as ``-1``.

    Raises:
        ValueError: If *region_str* does not match any of the accepted forms.
    """
    # Each coordinate must begin with a digit; commas are only allowed after it.
    m = re.match(r"^([^:]+)(?::(\d[\d,]*)(?:-(\d[\d,]*))?)?$", region_str)
    if not m:
        raise ValueError(f"Invalid region format: {region_str}")

    contig, start_text, end_text = m.groups()

    # 1-based inclusive start -> 0-based inclusive start.
    start = int(start_text.replace(",", "")) - 1 if start_text else -1
    # 1-based inclusive end equals the 0-based exclusive end, so no shift.
    end = int(end_text.replace(",", "")) if end_text else -1

    return Region(contig=contig, start=start, end=end)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def resolve_contig_name(header: pysam.AlignmentHeader, contig: str) -> str | None:
    """Find the header's spelling of *contig*, tolerating a ``chr`` prefix.

    The name is looked up in ``header.references`` as given. If it is not
    present, the variant with the ``"chr"`` prefix removed (when *contig*
    has it) or added (when it does not) is tried, and a note is printed to
    stderr when that variant is the one that matches.

    Returns:
        The matching reference name, or None when neither spelling exists.
    """
    references = set(header.references)
    if contig in references:
        return contig

    # Toggle the prefix: strip it if present, prepend it otherwise.
    alt = contig[3:] if contig.startswith("chr") else f"chr{contig}"
    if alt not in references:
        return None

    print(f"Note: Using contig '{alt}' (matched from '{contig}')", file=sys.stderr)
    return alt
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_contig_length(header: pysam.AlignmentHeader, contig: str) -> int | None:
    """Look up the length of *contig* in a BAM header.

    The name is first resolved with ``resolve_contig_name`` so that a
    ``chr``-prefix mismatch is tolerated.

    Returns:
        The reference length reported by the header, or None when the contig
        cannot be resolved.
    """
    name_in_header = resolve_contig_name(header, contig)
    return None if name_in_header is None else header.get_reference_length(name_in_header)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def depth_from_bam(
    bam_path: str,
    contig: str,
    start: int,
    end: int,
) -> DepthArray:
    """Compute per-position depth for a region from an indexed BAM file.

    Uses a simplified, CIGAR-unaware approach: every read returned by
    ``fetch`` for the region adds 1 to each position between its
    ``reference_start`` and ``reference_end``, clipped to the region. Reads
    whose ``reference_end`` is None are skipped.

    Args:
        bam_path: Path of the BAM file.
        contig: Contig name; resolved against the header with
            ``resolve_contig_name`` (``chr`` prefix tolerated).
        start: 0-based start of the region; negative values mean 0.
        end: 0-based exclusive end; negative values, or values past the
            contig length, mean the contig length.

    Returns:
        A DepthArray for the resolved contig over the clamped region, with
        int32 depths.

    Raises:
        ValueError: If the contig is not in the BAM header, or if the start
            lies beyond the end once the bounds have been clamped.
    """
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        resolved = resolve_contig_name(bam.header, contig)
        if resolved is None:
            raise ValueError(f"Contig '{contig}' not found in BAM header")

        contig_len = bam.get_reference_length(resolved)

        if start < 0:
            start = 0
        if end < 0 or end > contig_len:
            end = contig_len

        # Fail with a readable message instead of numpy's "negative
        # dimensions" error when the region is inverted or past the contig.
        if start > end:
            raise ValueError(
                f"Invalid region on '{resolved}': start {start} is beyond end {end}"
            )

        depths = np.zeros(end - start, dtype=np.int32)

        for read in bam.fetch(resolved, start, end):
            r_end = read.reference_end
            if r_end is None:
                continue

            # Clip the read's reference span to the requested window.
            ov_start = max(read.reference_start, start)
            ov_end = min(r_end, end)
            if ov_start >= ov_end:
                continue

            depths[ov_start - start : ov_end - start] += 1

        return DepthArray(contig=resolved, start=start, end=end, depths=depths)
|