samsampleX 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
samsamplex/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
samsamplex/bed.py ADDED
@@ -0,0 +1,146 @@
1
+ """BED file I/O and depth-array combination utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from pathlib import Path
7
+ from typing import TextIO
8
+
9
+ import numpy as np
10
+
11
+ from .depth import DepthArray
12
+
13
+
14
def contig_names_match(name1: str, name2: str) -> bool:
    """Return True when two contig names refer to the same sequence.

    The names are compared after dropping a leading ``"chr"`` from each, so
    ``"chr6"`` and ``"6"`` are treated as equal. The prefix test is
    case-sensitive.
    """

    def _without_chr(name: str) -> str:
        # Drop exactly the first three characters, and only when they are "chr".
        if name.startswith("chr"):
            return name[3:]
        return name

    return name1 == name2 or _without_chr(name1) == _without_chr(name2)
20
+
21
+
22
+ # ── Writing ──────────────────────────────────────────────────────────────────
23
+
24
+
25
def write_bed_entry(fp: TextIO, chrom: str, start: int, end: int, depth: int) -> None:
    """Emit one tab-separated BED4 record (chrom, start, end, depth) to *fp*.

    The record is terminated with a single newline and written with one
    ``fp.write`` call.
    """
    columns = (chrom, start, end, depth)
    fp.write("\t".join(str(column) for column in columns) + "\n")
28
+
29
+
30
def write_bed_output(fp: TextIO, arr: DepthArray, collapse: int = 0) -> None:
    """Serialise *arr* as BED4 records on *fp*.

    With ``collapse == 0`` one record is written per position. For any other
    value, consecutive positions are merged into one interval until a depth
    differs from the depth at the interval's first position by more than
    *collapse*; the interval is reported with that first depth.

    Nothing is written when ``arr.length`` is 0.
    """
    n_positions = arr.length
    if n_positions == 0:
        return

    if collapse == 0:
        # One single-base record per position.
        for offset in range(n_positions):
            begin = arr.start + offset
            write_bed_entry(fp, arr.contig, begin, begin + 1, int(arr.depths[offset]))
        return

    # Track the currently open run: where it began and the depth it reports.
    run_begin = arr.start
    run_depth = int(arr.depths[0])

    for offset in range(1, n_positions):
        depth_here = int(arr.depths[offset])
        if abs(depth_here - run_depth) > collapse:
            # Depth left the tolerance band: close the run and open a new one.
            boundary = arr.start + offset
            write_bed_entry(fp, arr.contig, run_begin, boundary, run_depth)
            run_begin, run_depth = boundary, depth_here

    # Flush the last run, which always extends to the end of the array.
    write_bed_entry(fp, arr.contig, run_begin, arr.end, run_depth)
57
+
58
+
59
+ # ── Reading ──────────────────────────────────────────────────────────────────
60
+
61
+
62
def bed_read_depths(
    bed_path: str,
    contig: str,
    region_start: int,
    region_end: int,
) -> DepthArray:
    """Read depth values from a BED4 file into a DepthArray for a region.

    The result covers ``[region_start, region_end)`` and starts out as all
    zeros. Every BED record on a matching contig (compared with
    ``contig_names_match``, so a ``chr`` prefix is ignored) writes its depth
    into the part of the region it overlaps. When records overlap each other,
    the one appearing later in the file wins.

    Skipped lines: empty lines, lines starting with ``#``, UCSC ``track`` /
    ``browser`` header lines, and lines with fewer than four fields (tab
    separated first, then any whitespace as a fallback).

    Raises:
        ValueError: if the start, end or depth column of a data line is not an
            integer (raised by ``int``).
    """
    depths = np.zeros(region_end - region_start, dtype=np.int32)
    # First tokens that mark UCSC header lines rather than BED records.
    header_keywords = ("track", "browser")

    with open(bed_path) as fp:
        for line in fp:
            line = line.rstrip("\n")
            if not line or line.startswith("#"):
                continue

            # A header line can contain four or more whitespace-separated
            # tokens and would otherwise be parsed (and crash) as a record.
            leading = line.split(None, 1)
            if leading and leading[0] in header_keywords:
                continue

            parts = line.split("\t")
            if len(parts) < 4:
                # Fall back to generic whitespace separation.
                parts = line.split()
                if len(parts) < 4:
                    continue

            chrom = parts[0]
            bed_start = int(parts[1])
            bed_end = int(parts[2])
            depth = int(parts[3])

            if not contig_names_match(chrom, contig):
                continue

            # Clip the record to the requested region.
            ov_start = max(bed_start, region_start)
            ov_end = min(bed_end, region_end)
            if ov_start >= ov_end:
                continue

            depths[ov_start - region_start : ov_end - region_start] = depth

    return DepthArray(contig=contig, start=region_start, end=region_end, depths=depths)
99
+
100
+
101
+ # ── Combining multiple templates ─────────────────────────────────────────────
102
+
103
+
104
def bed_combine_depths(
    arrays: list[DepthArray],
    mode: str = "min",
    seed: int = 42,
) -> DepthArray:
    """Combine multiple DepthArrays position-by-position.

    Supported *mode* values: ``"min"``, ``"mean"``, ``"median"``, ``"max"``, ``"random"``.

    * ``"mean"`` uses floor division of the per-position sum by the number of
      arrays.
    * ``"median"`` rounds the median with ``np.rint``.
    * ``"random"`` draws, per position, an integer between the minimum and the
      maximum depth (inclusive) from a ``random.Random(seed)`` generator.
      Positions where minimum and maximum agree do not consume a random draw.

    The contig and coordinates of the result are taken from ``arrays[0]``.
    When only one array is given, a copy of it is returned and *mode* is not
    inspected.

    Raises:
        ValueError: if *arrays* is empty, or if *mode* is unknown (checked only
            when more than one array is given).
    """
    if not arrays:
        # np.stack would also fail here, but with a message unrelated to the caller.
        raise ValueError("bed_combine_depths: no depth arrays to combine")

    ref = arrays[0]
    if len(arrays) == 1:
        return DepthArray(
            contig=ref.contig,
            start=ref.start,
            end=ref.end,
            depths=ref.depths.copy(),
        )

    stacked = np.stack([a.depths for a in arrays], axis=0)  # (n_arrays, length)

    if mode == "min":
        combined = stacked.min(axis=0)
    elif mode == "max":
        combined = stacked.max(axis=0)
    elif mode == "mean":
        combined = (stacked.sum(axis=0) // len(arrays)).astype(np.int32)
    elif mode == "median":
        combined = np.rint(np.median(stacked, axis=0)).astype(np.int32)
    elif mode == "random":
        rng = random.Random(seed)
        # Plain Python ints keep randint independent of numpy scalar types.
        lows = stacked.min(axis=0).tolist()
        highs = stacked.max(axis=0).tolist()
        combined = np.array(
            [lo if lo == hi else rng.randint(lo, hi) for lo, hi in zip(lows, highs)],
            dtype=np.int32,
        )
    else:
        raise ValueError(f"Unknown combine mode: {mode}")

    return DepthArray(contig=ref.contig, start=ref.start, end=ref.end, depths=combined)
samsamplex/cli.py ADDED
@@ -0,0 +1,342 @@
1
+ """Command-line interface for samsamplex."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from . import __version__
9
+ from .modes import DEPTH_MODES
10
+
11
+
12
def _add_map_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``map`` subcommand and its options on *subparsers*."""
    map_cmd = subparsers.add_parser(
        "map", help="Extract depth of coverage from BAM to BED template"
    )
    add = map_cmd.add_argument

    # Inputs and outputs.
    add("--template-bam", nargs="+", required=True, help="Input BAM file(s)")
    add("--region", required=True, help="Target region (samtools-style)")
    add("--out-bed", required=True, help="Output BED file")

    # Output shaping and multi-BAM combination.
    add(
        "--collapse",
        type=int,
        default=0,
        help="Merge consecutive positions with depth diff <= INT [default: 0]",
    )
    add(
        "--mode",
        default="mean",
        choices=DEPTH_MODES,
        help="How to combine depths when multiple BAMs given [default: mean]",
    )
    add(
        "--seed",
        type=int,
        default=42,
        help="Random seed for --mode random [default: 42]",
    )
38
+
39
+
40
def _add_sample_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``sample`` subcommand and its options on *subparsers*.

    ``--template-bed`` is optional at the parser level; the requirement that
    either it or ``--uniform`` is supplied is not enforced here.
    """
    sample_cmd = subparsers.add_parser(
        "sample", help="Sample reads from BAM to match template depth distribution"
    )
    add = sample_cmd.add_argument

    add("--source-bam", required=True, help="Input BAM file to sample from")
    add(
        "--template-bed",
        nargs="*",
        default=[],
        help="Template BED file(s) with depth values (required unless --uniform)",
    )
    add(
        "--uniform",
        type=float,
        default=None,
        metavar="FRACTION",
        help="Uniform sampling: retain fraction of reads by hash (0-1). Bypasses template/depth logic.",
    )
    add("--region", required=True, help="Target region (samtools-style)")
    add("--out-bam", required=True, help="Output BAM file")
    add(
        "--mode",
        default="random",
        choices=DEPTH_MODES,
        help="How to combine multiple template BEDs [default: random]",
    )

    stat_help = (
        "Statistic for summarising ratio over read span; "
        "'random' picks one ratio from covered bases deterministically from span+seed "
        "[default: mean]"
    )
    add("--stat", default="mean", choices=DEPTH_MODES, help=stat_help)

    add("--seed", type=int, default=42, help="Random seed [default: 42]")
    # NOTE: a --no-sort flag is currently disabled for this subcommand.
    add(
        "--threads",
        type=int,
        default=2,
        help="Number of writer threads to use [default: 2]",
    )
80
+
81
+
82
def _add_plot_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``plot`` subcommand and its options on *subparsers*.

    Exactly one template input (BAM or BED) and exactly one output (PNG or
    TSV) must be given; both pairs are required mutually exclusive groups.
    """
    plot_cmd = subparsers.add_parser(
        "plot",
        help="Compare depth of coverage and output as PNG plot or TSV data",
    )

    required_options = (
        ("--source-bam", "Source BAM file"),
        ("--out-bam", "Output BAM file (from sampling)"),
        ("--region", "Target region (samtools-style)"),
    )
    for flag, description in required_options:
        plot_cmd.add_argument(flag, required=True, help=description)

    template_group = plot_cmd.add_mutually_exclusive_group(required=True)
    template_group.add_argument("--template-bam", help="Template BAM file")
    template_group.add_argument("--template-bed", help="Template BED file")

    output_group = plot_cmd.add_mutually_exclusive_group(required=True)
    output_group.add_argument("--out-png", help="Output PNG plot file")
    output_group.add_argument("--out-tsv", help="Output TSV data file")
98
+
99
+
100
def _add_mapback_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``mapback`` subcommand and its options on *subparsers*."""
    # Default location of the PRG sequences file, also quoted in the help text.
    default_prg_seq = "HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt"

    mapback_cmd = subparsers.add_parser(
        "mapback",
        help="Remap HLA*LA PRG-mapped reads back to chr6 coordinates",
    )
    add = mapback_cmd.add_argument

    add("--source-bam", required=True, help="HLA*LA-remapped BAM file")
    add("--region", required=True, help="Target region on chr6 (samtools-style)")
    add("--out-bam", required=True, help="Output BAM file")
    add(
        "--genome-build",
        required=True,
        choices=("GRCh38", "GRCh37"),
        help="Reference genome build",
    )
    add(
        "--prg-seq",
        default=default_prg_seq,
        help=f"Path to HLA*LA sequences.txt [default: {default_prg_seq}]",
    )
    add("--no-sort", action="store_true", help="Skip sorting/indexing output BAM")
118
+
119
+
120
def _add_stats_parser(subparsers: argparse._SubParsersAction) -> None:
    """Register the ``stats`` subcommand and its three required options."""
    stats_cmd = subparsers.add_parser(
        "stats", help="Compare depth distributions between two BAM or BED files"
    )
    option_help = (
        ("--a", "First input file — BAM or BED (reference)"),
        ("--b", "Second input file — BAM or BED (comparison)"),
        ("--region", "Region to compare (samtools-style)"),
    )
    for flag, description in option_help:
        stats_cmd.add_argument(flag, required=True, help=description)
128
+
129
+
130
+ # ── Subcommand handlers ─────────────────────────────────────────────────────
131
+
132
+
133
def _run_map(args: argparse.Namespace) -> int:
    """Handle ``samsamplex map``: compute depth from BAM(s) and write a BED file.

    The contig name and any open region bound are resolved against the header
    of the first template BAM. Depth is computed for every template BAM; when
    more than one is given the arrays are merged with ``args.mode``.

    Returns 0 on success, or 1 when the contig is not present in the first BAM.
    Progress messages go to stderr.
    """
    from .bed import bed_combine_depths, write_bed_output
    from .depth import depth_from_bam, region_parse, resolve_contig_name

    import pysam

    def log(msg: str) -> None:
        print(msg, file=sys.stderr)

    region = region_parse(args.region)
    bam_paths = args.template_bam  # list of one or more paths
    several_bams = len(bam_paths) > 1

    log(f"[map] Template BAM(s): {bam_paths}")
    log(f"[map] Region: {args.region}")
    log(f"[map] Collapse: {args.collapse}")
    log(f"[map] Output BED: {args.out_bed}")
    if several_bams:
        log(f"[map] Mode: {args.mode}")

    # Only the first BAM's header is consulted for contig name and length.
    with pysam.AlignmentFile(bam_paths[0], "rb") as bam:
        resolved = resolve_contig_name(bam.header, region.contig)
        if resolved is None:
            log(f"Error: Contig '{region.contig}' not found in BAM")
            return 1
        region.contig = resolved
        contig_len = bam.get_reference_length(resolved)

    # Negative bounds are the "not specified" sentinel from region_parse.
    if region.start < 0:
        region.start = 0
    if region.end < 0:
        region.end = contig_len

    log(f"[map] Parsed region: {region.contig}:{region.start}-{region.end}")
    log("[map] Computing depth array(s) (this may take a while)...")

    depth_arrays = [
        depth_from_bam(path, region.contig, region.start, region.end)
        for path in bam_paths
    ]

    if len(depth_arrays) == 1:
        depth = depth_arrays[0]
    else:
        depth = bed_combine_depths(depth_arrays, mode=args.mode, seed=args.seed)

    log(f"[map] Computed depth for {depth.length} positions")

    suffix = " (collapsed)" if args.collapse > 0 else ""
    log(f"[map] Writing BED file{suffix}...")

    with open(args.out_bed, "w") as out_fp:
        write_bed_output(out_fp, depth, collapse=args.collapse)

    log(f"[map] Done. Output written to: {args.out_bed}")
    return 0
190
+
191
+
192
def _run_sample(args: argparse.Namespace) -> int:
    """Handle ``samsamplex sample``: validate options and delegate to ``sample_run``.

    Validation performed here:

    * either ``--template-bed`` or ``--uniform`` must be supplied;
    * ``--uniform`` must lie in the interval (0, 1]. NaN is rejected as well.

    Returns 1 (after logging to stderr) when validation fails, otherwise the
    value returned by ``sample_run``.
    """

    def log(msg: str) -> None:
        print(msg, file=sys.stderr)

    if args.uniform is None and not args.template_bed:
        log("Error: Either --template-bed or --uniform is required")
        return 1

    # A positive range test is used because NaN compares False against
    # everything, so "x <= 0 or x > 1" would let "--uniform nan" through.
    if args.uniform is not None and not (0 < args.uniform <= 1):
        log(f"Error: --uniform must be in (0, 1], got {args.uniform}")
        return 1

    # Imported only once the arguments are known to be valid.
    from .sample import sample_run

    return sample_run(
        source_bam=args.source_bam,
        template_beds=args.template_bed if args.template_bed else [],
        region_str=args.region,
        out_bam=args.out_bam,
        mode=args.mode,
        stat=args.stat,
        seed=args.seed,
        uniform_fraction=args.uniform,
        threads=args.threads,
    )
218
+
219
+
220
def _run_plot(args: argparse.Namespace) -> int:
    """Handle ``samsamplex plot`` by forwarding the parsed options to ``plot_run``.

    Returns whatever ``plot_run`` returns.
    """
    from .plot import plot_run

    options = {
        "source_bam": args.source_bam,
        "out_bam": args.out_bam,
        "region_str": args.region,
        "template_bam": args.template_bam,
        "template_bed": args.template_bed,
        "out_png": args.out_png,
        "out_tsv": args.out_tsv,
    }
    return plot_run(**options)
232
+
233
+
234
def _run_mapback(args: argparse.Namespace) -> int:
    """Handle ``samsamplex mapback`` by forwarding the parsed options to ``mapback_run``.

    Returns whatever ``mapback_run`` returns.
    """
    from .mapback import mapback_run

    options = {
        "source_bam": args.source_bam,
        "region_str": args.region,
        "out_bam": args.out_bam,
        "genome_build": args.genome_build,
        "prg_seq": args.prg_seq,
        "no_sort": args.no_sort,
    }
    return mapback_run(**options)
245
+
246
+
247
def _run_stats(args: argparse.Namespace) -> int:
    """Handle ``samsamplex stats``: compare depth between inputs A and B.

    An input whose path ends in ``.bed`` is treated as BED, anything else as
    BAM. If at least one input is a BAM, the first such file is used to resolve
    the contig name and to fill in unspecified region bounds. With two BED
    inputs the region must carry explicit start and end.

    Returns 0 on success and 1 on a validation error. All output, including
    the metrics header, is written to stderr by this function.
    """
    from .depth import region_parse
    from .metrics import depth_from_path, metrics_calculate, metrics_print

    def log(msg: str) -> None:
        print(msg, file=sys.stderr)

    def _fmt_type(path: str) -> str:
        return "BED" if path.endswith(".bed") else "BAM"

    region = region_parse(args.region)

    # Resolve region bounds from a BAM input (BED files don't carry contig lengths).
    bam_input = next(
        (path for path in (args.a, args.b) if not path.endswith(".bed")),
        None,
    )

    if bam_input is None:
        if region.start < 0 or region.end < 0:
            log("Error: --region must specify explicit start and end when both inputs are BED files")
            return 1
    else:
        import pysam
        from .depth import resolve_contig_name

        with pysam.AlignmentFile(bam_input, "rb") as bam:
            resolved = resolve_contig_name(bam.header, region.contig)
            if resolved is None:
                log(f"Error: Contig '{region.contig}' not found in {bam_input}")
                return 1
            region.contig = resolved
            if region.start < 0:
                region.start = 0
            if region.end < 0:
                region.end = bam.get_reference_length(resolved)

    # Displayed 1-based; the region itself is 0-based half-open.
    region_label = f"{region.contig}:{region.start + 1}-{region.end}"

    log(f"Input A ({_fmt_type(args.a)}): {args.a}")
    log(f"Input B ({_fmt_type(args.b)}): {args.b}")
    log(f"Region: {region_label}")

    depth_a = depth_from_path(args.a, region.contig, region.start, region.end)
    depth_b = depth_from_path(args.b, region.contig, region.start, region.end)

    result = metrics_calculate(depth_a, depth_b)

    log("")
    log("========== Comparison Metrics ==========")
    log(f"A (reference): {args.a}")
    log(f"B (comparison): {args.b}")
    log(f"Region: {region_label}")
    log(f"Positions compared: {depth_a.length}")
    log("-" * 41)
    metrics_print(result, label_a="A", label_b="B")

    return 0
303
+
304
+
305
+ # ── Entry point ──────────────────────────────────────────────────────────────
306
+
307
+
308
def main(argv: list[str] | None = None) -> None:
    """Parse the command line and run the selected subcommand.

    *argv* is handed to ``ArgumentParser.parse_args``; ``None`` makes argparse
    read ``sys.argv``. This function always ends by calling ``sys.exit``: with
    status 1 (after printing help) when no subcommand was given, otherwise
    with the handler's return value.
    """
    banner = f"samsamplex v{__version__}"

    parser = argparse.ArgumentParser(
        prog="samsamplex",
        description=f"samsamplex v{__version__} — Depth-aware BAM file sampling",
    )
    parser.add_argument("-v", "--version", action="version", version=banner)

    subparsers = parser.add_subparsers(dest="command")
    registrars = (
        _add_map_parser,
        _add_sample_parser,
        _add_plot_parser,
        _add_mapback_parser,
        _add_stats_parser,
    )
    for register in registrars:
        register(subparsers)

    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(1)

    # Subcommand name -> handler returning the process exit status.
    handlers = {
        "map": _run_map,
        "sample": _run_sample,
        "plot": _run_plot,
        "mapback": _run_mapback,
        "stats": _run_stats,
    }
    handler = handlers[args.command]
    sys.exit(handler(args))


if __name__ == "__main__":
    main()
samsamplex/depth.py ADDED
@@ -0,0 +1,126 @@
1
+ """Depth array computation from BAM files and region parsing utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import sys
7
+ from dataclasses import dataclass, field
8
+
9
+ import numpy as np
10
+ import pysam
11
+
12
+
13
@dataclass
class Region:
    """A contig name with an optional 0-based, half-open coordinate window.

    A value of ``-1`` in *start* or *end* is a sentinel meaning the bound was
    not given, i.e. the whole contig is meant.
    """

    contig: str
    # First position of the window (0-based, inclusive); -1 when unspecified.
    start: int = -1
    # One past the last position (0-based, exclusive); -1 when unspecified.
    end: int = -1
20
+
21
+
22
@dataclass
class DepthArray:
    """Depth values for the contiguous interval ``[start, end)`` of one contig.

    ``depths`` is left out of the generated ``repr``.
    """

    contig: str
    start: int
    end: int
    # Presumably one entry per position in [start, end) — not enforced here.
    depths: np.ndarray = field(repr=False)

    @property
    def length(self) -> int:
        """Number of positions spanned, derived from the coordinates only."""
        return self.end - self.start
34
+
35
+
36
def region_parse(region_str: str) -> Region:
    """Parse a samtools-style region string (e.g. chr1, chr1:1000-2000).

    Input coordinates are 1-based; the returned Region uses 0-based half-open
    coordinates. A bound that is absent from the string is returned as ``-1``.
    The contig part may not contain ``:`` and the coordinates must be plain
    digits.

    Raises:
        ValueError: if the string does not match ``contig[:start[-end]]``, or
            if both bounds are given and the end lies before the start.
    """
    m = re.match(r"^([^:]+)(?::(\d+)(?:-(\d+))?)?$", region_str)
    if not m:
        raise ValueError(f"Invalid region format: {region_str}")

    contig, start_text, end_text = m.groups()
    start = int(start_text) - 1 if start_text else -1  # 1-based -> 0-based
    end = int(end_text) if end_text else -1  # exclusive, stays as-is

    # An inverted range would otherwise only fail later, as a negative array
    # length. The check is skipped when either bound is the -1 sentinel.
    if start >= 0 and end >= 0 and end < start:
        raise ValueError(f"Invalid region (end before start): {region_str}")

    return Region(contig=contig, start=start, end=end)
51
+
52
+
53
def resolve_contig_name(header: pysam.AlignmentHeader, contig: str) -> str | None:
    """Find the name under which *contig* appears in ``header.references``.

    An exact match is returned as-is. Otherwise the opposite ``chr`` spelling
    is tried (prefix removed if present, added if absent); when that variant
    is found, a note is printed to stderr and the variant is returned.

    Returns None if neither spelling is in the header.
    """
    known = set(header.references)

    if contig in known:
        return contig

    # Flip the "chr" prefix to get the single alternative spelling.
    candidate = contig[3:] if contig.startswith("chr") else f"chr{contig}"
    if candidate not in known:
        return None

    print(f"Note: Using contig '{candidate}' (matched from '{contig}')", file=sys.stderr)
    return candidate
75
+
76
+
77
def get_contig_length(header: pysam.AlignmentHeader, contig: str) -> int | None:
    """Look up the length of *contig* in a BAM header.

    The name is first resolved with ``resolve_contig_name``, so a ``chr``
    prefix mismatch is tolerated. Returns None when the contig cannot be
    resolved.
    """
    name_in_header = resolve_contig_name(header, contig)
    return None if name_in_header is None else header.get_reference_length(name_in_header)
83
+
84
+
85
def depth_from_bam(
    bam_path: str,
    contig: str,
    start: int,
    end: int,
) -> DepthArray:
    """Compute per-position depth for a region from an indexed BAM file.

    Uses a simplified read-iteration approach: for each read returned by
    ``fetch`` for the region, depth is incremented at every position between
    the read's ``reference_start`` and ``reference_end`` (CIGAR-unaware).
    Reads whose ``reference_end`` is None are skipped.

    Bounds are normalised before counting: a negative *start* becomes 0, and
    an *end* that is negative or beyond the contig length becomes the contig
    length. The returned DepthArray carries the contig name as it is spelled
    in the BAM header together with the normalised bounds.

    Raises:
        ValueError: if *contig* (with or without a ``chr`` prefix) is not in
            the BAM header.
    """
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        resolved = resolve_contig_name(bam.header, contig)
        if resolved is None:
            raise ValueError(f"Contig '{contig}' not found in BAM header")

        contig_len = bam.get_reference_length(resolved)

        if start < 0:
            start = 0
        if end < 0 or end > contig_len:
            end = contig_len

        depths = np.zeros(end - start, dtype=np.int32)

        for read in bam.fetch(resolved, start, end):
            r_start = read.reference_start
            r_end = read.reference_end
            if r_end is None:
                continue

            # Clip the read's reference span to the requested window.
            ov_start = max(r_start, start)
            ov_end = min(r_end, end)
            if ov_start >= ov_end:
                continue

            depths[ov_start - start : ov_end - start] += 1

    return DepthArray(contig=resolved, start=start, end=end, depths=depths)