py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +1 -13
- gbcms/cli.py +134 -716
- gbcms/core/kernel.py +126 -0
- gbcms/io/input.py +222 -0
- gbcms/io/output.py +361 -0
- gbcms/models/core.py +133 -0
- gbcms/pipeline.py +212 -0
- gbcms/py.typed +0 -0
- py_gbcms-2.1.1.dist-info/METADATA +216 -0
- py_gbcms-2.1.1.dist-info/RECORD +13 -0
- gbcms/config.py +0 -98
- gbcms/counter.py +0 -1074
- gbcms/models.py +0 -295
- gbcms/numba_counter.py +0 -394
- gbcms/output.py +0 -573
- gbcms/parallel.py +0 -129
- gbcms/processor.py +0 -293
- gbcms/reference.py +0 -86
- gbcms/variant.py +0 -390
- py_gbcms-2.0.0.dist-info/METADATA +0 -506
- py_gbcms-2.0.0.dist-info/RECORD +0 -16
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/WHEEL +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/entry_points.txt +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/licenses/LICENSE +0 -0
gbcms/cli.py
CHANGED
|
@@ -1,744 +1,162 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI Entry Point: Exposes the gbcms functionality via command line.
|
|
3
|
+
"""
|
|
4
|
+
|
|
1
5
|
from pathlib import Path
|
|
2
|
-
from typing import Annotated
|
|
3
6
|
|
|
4
7
|
import typer
|
|
5
|
-
from rich.console import Console
|
|
6
|
-
from rich.panel import Panel
|
|
7
|
-
from rich.table import Table
|
|
8
|
-
|
|
9
|
-
from . import __version__
|
|
10
|
-
from .config import Config
|
|
11
|
-
from .processor import VariantProcessor
|
|
12
|
-
|
|
13
|
-
# Initialize Typer app with rich help
|
|
14
|
-
app = typer.Typer(
|
|
15
|
-
name="gbcms",
|
|
16
|
-
help="Python implementation of gbcms for calculating base counts in BAM files",
|
|
17
|
-
add_completion=False,
|
|
18
|
-
rich_markup_mode="rich",
|
|
19
|
-
no_args_is_help=True,
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
# Initialize Rich console
|
|
23
|
-
console = Console()
|
|
24
8
|
|
|
9
|
+
from .models.core import GbcmsConfig, OutputFormat
|
|
10
|
+
from .pipeline import Pipeline
|
|
25
11
|
|
|
26
|
-
|
|
27
|
-
count_app = typer.Typer(help="Count bases at variant positions")
|
|
28
|
-
validate_app = typer.Typer(help="Validate input files")
|
|
29
|
-
app.add_typer(count_app, name="count")
|
|
30
|
-
app.add_typer(validate_app, name="validate")
|
|
12
|
+
app = typer.Typer(help="gbcms: Get Base Counts Multi-Sample")
|
|
31
13
|
|
|
32
14
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
bam_files: dict[str, str],
|
|
36
|
-
variant_files: list[str],
|
|
37
|
-
input_is_maf: bool,
|
|
38
|
-
input_is_vcf: bool,
|
|
39
|
-
rich_output: bool = False,
|
|
40
|
-
) -> tuple[bool, Table | None]:
|
|
15
|
+
@app.callback()
|
|
16
|
+
def main():
|
|
41
17
|
"""
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
Args:
|
|
45
|
-
fasta: Path to reference FASTA file
|
|
46
|
-
bam_files: Dictionary of sample names to BAM file paths
|
|
47
|
-
variant_files: List of variant file paths
|
|
48
|
-
input_is_maf: Whether input files are in MAF format
|
|
49
|
-
input_is_vcf: Whether input files are in VCF format
|
|
50
|
-
rich_output: Whether to return detailed rich table for visual output
|
|
51
|
-
|
|
52
|
-
Returns:
|
|
53
|
-
Tuple of (is_valid, results_table_or_None)
|
|
18
|
+
gbcms: Get Base Counts Multi-Sample
|
|
54
19
|
"""
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
console.print("Please index your FASTA file with: samtools faidx reference.fa")
|
|
93
|
-
all_valid = False
|
|
94
|
-
else:
|
|
95
|
-
if rich_output:
|
|
96
|
-
results.add_row("FASTA", str(fasta), "✅ PASS", "File and index found")
|
|
97
|
-
|
|
98
|
-
# Validate BAM files and indices
|
|
99
|
-
for sample_name, bam_path in bam_files.items():
|
|
100
|
-
bam_file = Path(bam_path)
|
|
101
|
-
if not bam_file.exists():
|
|
102
|
-
if rich_output:
|
|
103
|
-
results.add_row("BAM", f"{sample_name}:{bam_path}", "❌ FAIL", "File not found")
|
|
104
|
-
else:
|
|
105
|
-
console.print(f"[red]Error:[/red] BAM file not found: {bam_file}")
|
|
106
|
-
all_valid = False
|
|
107
|
-
else:
|
|
108
|
-
# Check for BAM index files
|
|
109
|
-
bai_file1 = Path(str(bam_file).replace(".bam", ".bai"))
|
|
110
|
-
bai_file2 = Path(str(bam_file) + ".bai")
|
|
111
|
-
|
|
112
|
-
if not bai_file1.exists() and not bai_file2.exists():
|
|
113
|
-
if rich_output:
|
|
114
|
-
results.add_row(
|
|
115
|
-
"BAM", f"{sample_name}:{bam_path}", "⚠️ WARN", "Index (.bai) not found"
|
|
116
|
-
)
|
|
117
|
-
else:
|
|
118
|
-
console.print(f"[red]Error:[/red] BAM index not found for: {bam_file}")
|
|
119
|
-
console.print(f"Expected: {bai_file1} or {bai_file2}")
|
|
120
|
-
console.print("Please index your BAM file with: samtools index sample.bam")
|
|
121
|
-
all_valid = False
|
|
122
|
-
else:
|
|
123
|
-
if rich_output:
|
|
124
|
-
results.add_row(
|
|
125
|
-
"BAM", f"{sample_name}:{bam_path}", "✅ PASS", "File and index found"
|
|
126
|
-
)
|
|
127
|
-
|
|
128
|
-
# Validate variant files
|
|
129
|
-
for variant_file in variant_files:
|
|
130
|
-
vcf = Path(variant_file)
|
|
131
|
-
if not vcf.exists():
|
|
132
|
-
if rich_output:
|
|
133
|
-
results.add_row(
|
|
134
|
-
"VCF" if input_is_vcf else "MAF", str(vcf), "❌ FAIL", "File not found"
|
|
135
|
-
)
|
|
136
|
-
else:
|
|
137
|
-
console.print(f"[red]Error:[/red] Variant file not found: {vcf}")
|
|
138
|
-
all_valid = False
|
|
139
|
-
else:
|
|
140
|
-
if rich_output:
|
|
141
|
-
results.add_row("VCF" if input_is_vcf else "MAF", str(vcf), "✅ PASS", "File found")
|
|
142
|
-
|
|
143
|
-
if rich_output:
|
|
144
|
-
return all_valid, results
|
|
145
|
-
else:
|
|
146
|
-
return all_valid, None
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
@app.command(name="version", help="Show version information")
|
|
150
|
-
def show_version() -> None:
|
|
151
|
-
"""Print version and exit."""
|
|
152
|
-
console.print(
|
|
153
|
-
Panel.fit(
|
|
154
|
-
f"[bold cyan]py-gbcms[/bold cyan]\n"
|
|
155
|
-
f"Version: [green]{__version__}[/green]\n"
|
|
156
|
-
f"Python implementation of GetBaseCountsMultiSample (gbcms)",
|
|
157
|
-
border_style="cyan",
|
|
158
|
-
title="Version Info",
|
|
159
|
-
)
|
|
160
|
-
)
|
|
161
|
-
raise typer.Exit()
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def parse_bam_file(bam_string: str) -> tuple[str, str]:
|
|
165
|
-
"""
|
|
166
|
-
Parse BAM file string in format SAMPLE:BAM_PATH.
|
|
167
|
-
|
|
168
|
-
Args:
|
|
169
|
-
bam_string: String in format "sample_name:bam_path"
|
|
170
|
-
|
|
171
|
-
Returns:
|
|
172
|
-
Tuple of (sample_name, bam_path)
|
|
173
|
-
"""
|
|
174
|
-
parts = bam_string.split(":", 1)
|
|
175
|
-
if len(parts) != 2:
|
|
176
|
-
console.print(
|
|
177
|
-
f"[red]Error:[/red] Incorrect format for --bam parameter: {bam_string}",
|
|
178
|
-
)
|
|
179
|
-
console.print("Expected format: SAMPLE_NAME:BAM_FILE")
|
|
180
|
-
raise typer.Exit(1)
|
|
181
|
-
return parts[0], parts[1]
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def load_bam_fof(bam_fof_path: str) -> dict[str, str]:
|
|
185
|
-
"""
|
|
186
|
-
Load BAM files from file-of-files.
|
|
187
|
-
|
|
188
|
-
Args:
|
|
189
|
-
bam_fof_path: Path to file containing sample names and BAM paths
|
|
190
|
-
|
|
191
|
-
Returns:
|
|
192
|
-
Dictionary mapping sample names to BAM paths
|
|
193
|
-
"""
|
|
194
|
-
bam_files = {}
|
|
195
|
-
with open(bam_fof_path) as f:
|
|
196
|
-
for line_num, line in enumerate(f, 1):
|
|
197
|
-
line = line.strip()
|
|
198
|
-
if not line or line.startswith("#"):
|
|
199
|
-
continue
|
|
200
|
-
|
|
201
|
-
parts = line.split("\t")
|
|
202
|
-
if len(parts) != 2:
|
|
203
|
-
console.print(
|
|
204
|
-
f"[red]Error:[/red] Incorrect format at line {line_num} in {bam_fof_path}",
|
|
205
|
-
)
|
|
206
|
-
console.print("Expected format: SAMPLE_NAME<TAB>BAM_FILE")
|
|
207
|
-
raise typer.Exit(1)
|
|
208
|
-
|
|
209
|
-
sample_name, bam_path = parts
|
|
210
|
-
if sample_name in bam_files:
|
|
211
|
-
console.print(
|
|
212
|
-
f"[red]Error:[/red] Duplicate sample name: {sample_name}",
|
|
213
|
-
)
|
|
214
|
-
raise typer.Exit(1)
|
|
215
|
-
|
|
216
|
-
bam_files[sample_name] = bam_path
|
|
217
|
-
|
|
218
|
-
return bam_files
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
@count_app.command(name="run", help="Run base counting on variants")
|
|
222
|
-
def count_run(
|
|
223
|
-
# Required arguments
|
|
224
|
-
fasta: Annotated[
|
|
225
|
-
Path,
|
|
226
|
-
typer.Option(
|
|
227
|
-
"--fasta",
|
|
228
|
-
"-f",
|
|
229
|
-
help="[bold cyan]Reference genome FASTA file[/bold cyan] (must be indexed with .fai)",
|
|
230
|
-
exists=True,
|
|
231
|
-
file_okay=True,
|
|
232
|
-
dir_okay=False,
|
|
233
|
-
readable=True,
|
|
234
|
-
rich_help_panel="📁 Required Input Files",
|
|
235
|
-
),
|
|
236
|
-
],
|
|
237
|
-
output: Annotated[
|
|
238
|
-
Path,
|
|
239
|
-
typer.Option(
|
|
240
|
-
"--output",
|
|
241
|
-
"-o",
|
|
242
|
-
help="[bold cyan]Output file path[/bold cyan]",
|
|
243
|
-
rich_help_panel="📁 Required Input Files",
|
|
244
|
-
),
|
|
245
|
-
],
|
|
246
|
-
# BAM input options
|
|
247
|
-
bam: Annotated[
|
|
248
|
-
list[str] | None,
|
|
249
|
-
typer.Option(
|
|
250
|
-
"--bam",
|
|
251
|
-
"-b",
|
|
252
|
-
help="BAM file in format [yellow]SAMPLE_NAME:BAM_FILE[/yellow] (can be specified multiple times)",
|
|
253
|
-
rich_help_panel="🧬 BAM Input",
|
|
254
|
-
),
|
|
255
|
-
] = None,
|
|
256
|
-
bam_fof: Annotated[
|
|
257
|
-
Path | None,
|
|
258
|
-
typer.Option(
|
|
259
|
-
"--bam-fof",
|
|
260
|
-
help="File containing sample names and BAM paths (tab-separated)",
|
|
261
|
-
exists=True,
|
|
262
|
-
file_okay=True,
|
|
263
|
-
dir_okay=False,
|
|
264
|
-
readable=True,
|
|
265
|
-
rich_help_panel="🧬 BAM Input",
|
|
266
|
-
),
|
|
267
|
-
] = None,
|
|
268
|
-
# Variant input options (mutually exclusive)
|
|
269
|
-
maf: Annotated[
|
|
270
|
-
list[Path] | None,
|
|
271
|
-
typer.Option(
|
|
272
|
-
"--maf",
|
|
273
|
-
help="Input variant file in [green]MAF format[/green] (can be specified multiple times)",
|
|
274
|
-
exists=True,
|
|
275
|
-
file_okay=True,
|
|
276
|
-
dir_okay=False,
|
|
277
|
-
readable=True,
|
|
278
|
-
rich_help_panel="🔬 Variant Input",
|
|
279
|
-
),
|
|
280
|
-
] = None,
|
|
281
|
-
vcf: Annotated[
|
|
282
|
-
list[Path] | None,
|
|
283
|
-
typer.Option(
|
|
284
|
-
"--vcf",
|
|
285
|
-
help="Input variant file in [green]VCF format[/green] (can be specified multiple times)",
|
|
286
|
-
exists=True,
|
|
287
|
-
file_okay=True,
|
|
288
|
-
dir_okay=False,
|
|
289
|
-
readable=True,
|
|
290
|
-
rich_help_panel="🔬 Variant Input",
|
|
291
|
-
),
|
|
292
|
-
] = None,
|
|
293
|
-
# Output format
|
|
294
|
-
omaf: Annotated[
|
|
295
|
-
bool,
|
|
296
|
-
typer.Option(
|
|
297
|
-
"--omaf",
|
|
298
|
-
help="Output in MAF format (only with MAF input)",
|
|
299
|
-
rich_help_panel="📤 Output Options",
|
|
300
|
-
),
|
|
301
|
-
] = False,
|
|
302
|
-
positive_count: Annotated[
|
|
303
|
-
bool,
|
|
304
|
-
typer.Option(
|
|
305
|
-
"--positive-count/--no-positive-count",
|
|
306
|
-
help="Output positive strand counts (DPP/RDP/ADP)",
|
|
307
|
-
rich_help_panel="📤 Output Options",
|
|
308
|
-
),
|
|
309
|
-
] = True,
|
|
310
|
-
negative_count: Annotated[
|
|
311
|
-
bool,
|
|
312
|
-
typer.Option(
|
|
313
|
-
"--negative-count/--no-negative-count",
|
|
314
|
-
help="Output negative strand counts (DPN/RDN/ADN)",
|
|
315
|
-
rich_help_panel="📤 Output Options",
|
|
316
|
-
),
|
|
317
|
-
] = False,
|
|
318
|
-
fragment_count: Annotated[
|
|
319
|
-
bool,
|
|
320
|
-
typer.Option(
|
|
321
|
-
"--fragment-count/--no-fragment-count",
|
|
322
|
-
help="Output fragment counts (DPF/RDF/ADF)",
|
|
323
|
-
rich_help_panel="📤 Output Options",
|
|
324
|
-
),
|
|
325
|
-
] = False,
|
|
326
|
-
fragment_fractional_weight: Annotated[
|
|
327
|
-
bool,
|
|
328
|
-
typer.Option(
|
|
329
|
-
"--fragment-fractional-weight",
|
|
330
|
-
help="Use fractional weight (0.5) for fragments with disagreement",
|
|
331
|
-
rich_help_panel="📤 Output Options",
|
|
332
|
-
),
|
|
333
|
-
] = False,
|
|
334
|
-
# Quality filters
|
|
335
|
-
maq: Annotated[
|
|
336
|
-
int,
|
|
337
|
-
typer.Option(
|
|
338
|
-
"--maq",
|
|
339
|
-
help="Mapping quality threshold",
|
|
340
|
-
min=0,
|
|
341
|
-
rich_help_panel="🔍 Quality Filters",
|
|
342
|
-
),
|
|
343
|
-
] = 20,
|
|
344
|
-
baq: Annotated[
|
|
345
|
-
int,
|
|
346
|
-
typer.Option(
|
|
347
|
-
"--baq",
|
|
348
|
-
help="Base quality threshold",
|
|
349
|
-
min=0,
|
|
350
|
-
rich_help_panel="🔍 Quality Filters",
|
|
351
|
-
),
|
|
352
|
-
] = 0,
|
|
353
|
-
filter_duplicate: Annotated[
|
|
354
|
-
bool,
|
|
355
|
-
typer.Option(
|
|
356
|
-
"--filter-duplicate/--no-filter-duplicate",
|
|
357
|
-
help="Filter reads marked as duplicate",
|
|
358
|
-
rich_help_panel="🔍 Quality Filters",
|
|
359
|
-
),
|
|
360
|
-
] = True,
|
|
361
|
-
filter_improper_pair: Annotated[
|
|
362
|
-
bool,
|
|
363
|
-
typer.Option(
|
|
364
|
-
"--filter-improper-pair/--no-filter-improper-pair",
|
|
365
|
-
help="Filter reads marked as improperly paired",
|
|
366
|
-
rich_help_panel="🔍 Quality Filters",
|
|
367
|
-
),
|
|
368
|
-
] = False,
|
|
369
|
-
filter_qc_failed: Annotated[
|
|
370
|
-
bool,
|
|
371
|
-
typer.Option(
|
|
372
|
-
"--filter-qc-failed/--no-filter-qc-failed",
|
|
373
|
-
help="Filter reads marked as QC failed",
|
|
374
|
-
rich_help_panel="🔍 Quality Filters",
|
|
375
|
-
),
|
|
376
|
-
] = False,
|
|
377
|
-
filter_indel: Annotated[
|
|
378
|
-
bool,
|
|
379
|
-
typer.Option(
|
|
380
|
-
"--filter-indel/--no-filter-indel",
|
|
381
|
-
help="Filter reads containing indels",
|
|
382
|
-
rich_help_panel="🔍 Quality Filters",
|
|
383
|
-
),
|
|
384
|
-
] = False,
|
|
385
|
-
filter_non_primary: Annotated[
|
|
386
|
-
bool,
|
|
387
|
-
typer.Option(
|
|
388
|
-
"--filter-non-primary/--no-filter-non-primary",
|
|
389
|
-
help="Filter non-primary alignments",
|
|
390
|
-
rich_help_panel="🔍 Quality Filters",
|
|
391
|
-
),
|
|
392
|
-
] = False,
|
|
393
|
-
# Performance options
|
|
394
|
-
thread: Annotated[
|
|
395
|
-
int,
|
|
396
|
-
typer.Option(
|
|
397
|
-
"--thread",
|
|
398
|
-
"-t",
|
|
399
|
-
help="Number of threads for parallel processing",
|
|
400
|
-
min=1,
|
|
401
|
-
rich_help_panel="⚡ Performance",
|
|
402
|
-
),
|
|
403
|
-
] = 1,
|
|
404
|
-
backend: Annotated[
|
|
405
|
-
str,
|
|
406
|
-
typer.Option(
|
|
407
|
-
"--backend",
|
|
408
|
-
help="Parallelization backend: 'joblib' (default), 'loky', 'threading', or 'multiprocessing'",
|
|
409
|
-
rich_help_panel="⚡ Performance",
|
|
410
|
-
),
|
|
411
|
-
] = "joblib",
|
|
412
|
-
max_block_size: Annotated[
|
|
413
|
-
int,
|
|
414
|
-
typer.Option(
|
|
415
|
-
"--max-block-size",
|
|
416
|
-
help="Maximum number of variants per block",
|
|
417
|
-
min=1,
|
|
418
|
-
rich_help_panel="⚡ Performance",
|
|
419
|
-
),
|
|
420
|
-
] = 10000,
|
|
421
|
-
max_block_dist: Annotated[
|
|
422
|
-
int,
|
|
423
|
-
typer.Option(
|
|
424
|
-
"--max-block-dist",
|
|
425
|
-
help="Maximum block distance in base pairs",
|
|
426
|
-
min=1,
|
|
427
|
-
rich_help_panel="⚡ Performance",
|
|
428
|
-
),
|
|
429
|
-
] = 100000,
|
|
430
|
-
# Advanced options
|
|
431
|
-
generic_counting: Annotated[
|
|
432
|
-
bool,
|
|
433
|
-
typer.Option(
|
|
434
|
-
"--generic-counting",
|
|
435
|
-
help="Use generic counting algorithm for complex variants",
|
|
436
|
-
rich_help_panel="🔧 Advanced",
|
|
437
|
-
),
|
|
438
|
-
] = False,
|
|
439
|
-
suppress_warning: Annotated[
|
|
440
|
-
int,
|
|
441
|
-
typer.Option(
|
|
442
|
-
"--suppress-warning",
|
|
443
|
-
help="Maximum number of warnings per type",
|
|
444
|
-
min=0,
|
|
445
|
-
rich_help_panel="🔧 Advanced",
|
|
446
|
-
),
|
|
447
|
-
] = 3,
|
|
448
|
-
# Other options
|
|
449
|
-
verbose: Annotated[
|
|
450
|
-
bool,
|
|
451
|
-
typer.Option(
|
|
452
|
-
"--verbose",
|
|
453
|
-
"-v",
|
|
454
|
-
help="Enable verbose logging",
|
|
455
|
-
rich_help_panel="🔧 Advanced",
|
|
456
|
-
),
|
|
457
|
-
] = False,
|
|
458
|
-
) -> None:
|
|
20
|
+
pass
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.command()
|
|
24
|
+
def run(
|
|
25
|
+
variant_file: Path = typer.Option(
|
|
26
|
+
..., "--variants", "-v", help="Path to VCF or MAF file containing variants"
|
|
27
|
+
),
|
|
28
|
+
bam_files: list[Path] | None = typer.Option(
|
|
29
|
+
None, "--bam", "-b", help="Path to BAM file(s). Can be specified multiple times."
|
|
30
|
+
),
|
|
31
|
+
bam_list: Path | None = typer.Option(
|
|
32
|
+
None, "--bam-list", "-L", help="File containing list of BAM paths (one per line)"
|
|
33
|
+
),
|
|
34
|
+
reference: Path = typer.Option(..., "--fasta", "-f", help="Path to reference FASTA file"),
|
|
35
|
+
output_dir: Path = typer.Option(
|
|
36
|
+
..., "--output-dir", "-o", help="Directory to write output files"
|
|
37
|
+
),
|
|
38
|
+
output_format: OutputFormat = typer.Option(
|
|
39
|
+
OutputFormat.VCF, "--format", help="Output format (vcf or maf)"
|
|
40
|
+
),
|
|
41
|
+
output_suffix: str = typer.Option(
|
|
42
|
+
"", "--suffix", "-S", help="Suffix to append to output filename (e.g. '.genotyped')"
|
|
43
|
+
),
|
|
44
|
+
min_mapq: int = typer.Option(20, "--min-mapq", help="Minimum mapping quality"),
|
|
45
|
+
min_baseq: int = typer.Option(0, "--min-baseq", help="Minimum base quality"),
|
|
46
|
+
filter_duplicates: bool = typer.Option(True, help="Filter duplicate reads"),
|
|
47
|
+
filter_secondary: bool = typer.Option(False, help="Filter secondary alignments"),
|
|
48
|
+
filter_supplementary: bool = typer.Option(False, help="Filter supplementary alignments"),
|
|
49
|
+
filter_qc_failed: bool = typer.Option(False, help="Filter reads failing QC"),
|
|
50
|
+
filter_improper_pair: bool = typer.Option(False, help="Filter improperly paired reads"),
|
|
51
|
+
filter_indel: bool = typer.Option(False, help="Filter reads containing indels"),
|
|
52
|
+
threads: int = typer.Option(
|
|
53
|
+
1, "--threads", "-t", help="Number of threads (not yet implemented in v2 python layer)"
|
|
54
|
+
),
|
|
55
|
+
verbose: bool = typer.Option(False, "--verbose", "-V", help="Enable verbose debug logging"),
|
|
56
|
+
):
|
|
459
57
|
"""
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
This tool counts the number of reference and alternate alleles at each variant
|
|
463
|
-
position across multiple BAM files, with support for various quality filters
|
|
464
|
-
and output formats.
|
|
58
|
+
Run gbcms on one or more BAM files.
|
|
465
59
|
"""
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
)
|
|
60
|
+
import logging
|
|
61
|
+
|
|
62
|
+
from rich.console import Console
|
|
63
|
+
from rich.logging import RichHandler
|
|
64
|
+
|
|
65
|
+
# Configure logging
|
|
66
|
+
log_level = logging.DEBUG if verbose else logging.INFO
|
|
67
|
+
logging.basicConfig(
|
|
68
|
+
level=log_level,
|
|
69
|
+
format="%(message)s",
|
|
70
|
+
datefmt="[%X]",
|
|
71
|
+
handlers=[RichHandler(rich_tracebacks=True, markup=True)],
|
|
476
72
|
)
|
|
477
73
|
|
|
478
|
-
|
|
479
|
-
if not bam and not bam_fof:
|
|
480
|
-
console.print(
|
|
481
|
-
"[red]Error:[/red] Please specify at least one BAM file with --bam or --bam-fof",
|
|
482
|
-
)
|
|
483
|
-
raise typer.Exit(1)
|
|
74
|
+
console = Console()
|
|
484
75
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
"[red]Error:[/red] Please specify at least one variant file with --maf or --vcf",
|
|
488
|
-
)
|
|
489
|
-
raise typer.Exit(1)
|
|
76
|
+
# Map BAMs to sample names (filename stem for now)
|
|
77
|
+
bams_dict = {}
|
|
490
78
|
|
|
491
|
-
|
|
79
|
+
# 1. Process direct BAM arguments
|
|
80
|
+
if bam_files:
|
|
81
|
+
for bam_arg in bam_files:
|
|
82
|
+
# Check for sample_id:path format
|
|
83
|
+
bam_str = str(bam_arg)
|
|
84
|
+
if ":" in bam_str:
|
|
85
|
+
parts = bam_str.split(":", 1)
|
|
86
|
+
sample_name = parts[0]
|
|
87
|
+
bam_path = Path(parts[1])
|
|
88
|
+
else:
|
|
89
|
+
bam_path = bam_arg
|
|
90
|
+
sample_name = bam_path.stem
|
|
91
|
+
|
|
92
|
+
if not bam_path.exists():
|
|
93
|
+
console.print(f"[bold red]Error: BAM file not found: {bam_path}[/bold red]")
|
|
94
|
+
raise typer.Exit(code=1)
|
|
95
|
+
|
|
96
|
+
bams_dict[sample_name] = bam_path
|
|
97
|
+
|
|
98
|
+
# 2. Process BAM list file
|
|
99
|
+
if bam_list:
|
|
100
|
+
if not bam_list.exists():
|
|
101
|
+
console.print(f"[bold red]Error: BAM list file not found: {bam_list}[/bold red]")
|
|
102
|
+
raise typer.Exit(code=1)
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
with open(bam_list) as f:
|
|
106
|
+
for line in f:
|
|
107
|
+
line = line.strip()
|
|
108
|
+
if not line or line.startswith("#"):
|
|
109
|
+
continue
|
|
110
|
+
# Check for 2 columns (sample_id path)
|
|
111
|
+
parts = line.split()
|
|
112
|
+
if len(parts) >= 2:
|
|
113
|
+
sample_name = parts[0]
|
|
114
|
+
bam_path = Path(parts[1])
|
|
115
|
+
else:
|
|
116
|
+
bam_path = Path(parts[0])
|
|
117
|
+
sample_name = bam_path.stem
|
|
118
|
+
|
|
119
|
+
if not bam_path.exists():
|
|
120
|
+
console.print(
|
|
121
|
+
f"[yellow]Warning: BAM file from list not found: {bam_path}[/yellow]"
|
|
122
|
+
)
|
|
123
|
+
continue
|
|
124
|
+
bams_dict[sample_name] = bam_path
|
|
125
|
+
except Exception as e:
|
|
126
|
+
console.print(f"[bold red]Error reading BAM list file {bam_list}: {e}[/bold red]")
|
|
127
|
+
raise typer.Exit(code=1) from e
|
|
128
|
+
|
|
129
|
+
if not bams_dict:
|
|
492
130
|
console.print(
|
|
493
|
-
"[red]Error:
|
|
131
|
+
"[bold red]Error: No valid BAM files provided via --bam or --bam-list[/bold red]"
|
|
494
132
|
)
|
|
495
|
-
raise typer.Exit(1)
|
|
496
|
-
|
|
497
|
-
# Parse BAM files
|
|
498
|
-
bam_files = {}
|
|
499
|
-
|
|
500
|
-
if bam:
|
|
501
|
-
for bam_string in bam:
|
|
502
|
-
sample_name, bam_path = parse_bam_file(bam_string)
|
|
503
|
-
if sample_name in bam_files:
|
|
504
|
-
console.print(
|
|
505
|
-
f"[red]Error:[/red] Duplicate sample name: {sample_name}",
|
|
506
|
-
)
|
|
507
|
-
raise typer.Exit(1)
|
|
508
|
-
bam_files[sample_name] = bam_path
|
|
509
|
-
|
|
510
|
-
if bam_fof:
|
|
511
|
-
fof_bams = load_bam_fof(str(bam_fof))
|
|
512
|
-
for sample_name, bam_path in fof_bams.items():
|
|
513
|
-
if sample_name in bam_files:
|
|
514
|
-
console.print(
|
|
515
|
-
f"[red]Error:[/red] Duplicate sample name: {sample_name}",
|
|
516
|
-
)
|
|
517
|
-
raise typer.Exit(1)
|
|
518
|
-
bam_files[sample_name] = bam_path
|
|
519
|
-
|
|
520
|
-
# Parse variant files
|
|
521
|
-
variant_files = []
|
|
522
|
-
input_is_maf = False
|
|
523
|
-
input_is_vcf = False
|
|
524
|
-
|
|
525
|
-
if maf:
|
|
526
|
-
variant_files = [str(f) for f in maf]
|
|
527
|
-
input_is_maf = True
|
|
528
|
-
|
|
529
|
-
if vcf:
|
|
530
|
-
variant_files = [str(f) for f in vcf]
|
|
531
|
-
input_is_vcf = True
|
|
532
|
-
|
|
533
|
-
# Validate input files before processing
|
|
534
|
-
if not validate_input_files(fasta, bam_files, variant_files, input_is_maf, input_is_vcf)[0]:
|
|
535
|
-
raise typer.Exit(1)
|
|
536
|
-
|
|
537
|
-
# Display configuration
|
|
538
|
-
config_table = Table(title="Configuration", show_header=False, border_style="cyan")
|
|
539
|
-
config_table.add_column("Parameter", style="cyan")
|
|
540
|
-
config_table.add_column("Value", style="green")
|
|
541
|
-
|
|
542
|
-
config_table.add_row("Reference FASTA", str(fasta))
|
|
543
|
-
config_table.add_row("Number of BAM files", str(len(bam_files)))
|
|
544
|
-
config_table.add_row("Number of variant files", str(len(variant_files)))
|
|
545
|
-
config_table.add_row("Input format", "MAF" if input_is_maf else "VCF")
|
|
546
|
-
config_table.add_row("Output file", str(output))
|
|
547
|
-
config_table.add_row("Threads", str(thread))
|
|
548
|
-
config_table.add_row("Backend", backend)
|
|
549
|
-
config_table.add_row("Mapping quality threshold", str(maq))
|
|
550
|
-
config_table.add_row("Base quality threshold", str(baq))
|
|
551
|
-
|
|
552
|
-
console.print(config_table)
|
|
553
|
-
console.print()
|
|
133
|
+
raise typer.Exit(code=1)
|
|
554
134
|
|
|
555
135
|
try:
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
136
|
+
config = GbcmsConfig(
|
|
137
|
+
variant_file=variant_file,
|
|
138
|
+
bam_files=bams_dict,
|
|
139
|
+
reference_fasta=reference,
|
|
140
|
+
output_dir=output_dir,
|
|
141
|
+
output_format=output_format,
|
|
142
|
+
output_suffix=output_suffix,
|
|
143
|
+
min_mapping_quality=min_mapq,
|
|
144
|
+
min_base_quality=min_baseq,
|
|
145
|
+
filter_duplicates=filter_duplicates,
|
|
146
|
+
filter_secondary=filter_secondary,
|
|
147
|
+
filter_supplementary=filter_supplementary,
|
|
566
148
|
filter_qc_failed=filter_qc_failed,
|
|
149
|
+
filter_improper_pair=filter_improper_pair,
|
|
567
150
|
filter_indel=filter_indel,
|
|
568
|
-
|
|
569
|
-
output_positive_count=positive_count,
|
|
570
|
-
output_negative_count=negative_count,
|
|
571
|
-
output_fragment_count=fragment_count,
|
|
572
|
-
fragment_fractional_weight=fragment_fractional_weight,
|
|
573
|
-
max_block_size=max_block_size,
|
|
574
|
-
max_block_dist=max_block_dist,
|
|
575
|
-
num_threads=thread,
|
|
576
|
-
backend=backend,
|
|
577
|
-
input_is_maf=input_is_maf,
|
|
578
|
-
input_is_vcf=input_is_vcf,
|
|
579
|
-
output_maf=omaf,
|
|
580
|
-
generic_counting=generic_counting,
|
|
581
|
-
max_warning_per_type=suppress_warning,
|
|
151
|
+
threads=threads,
|
|
582
152
|
)
|
|
583
153
|
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
processor.process()
|
|
587
|
-
|
|
588
|
-
# Success message
|
|
589
|
-
console.print()
|
|
590
|
-
console.print(
|
|
591
|
-
Panel.fit(
|
|
592
|
-
"[bold green]✓[/bold green] Processing completed successfully!",
|
|
593
|
-
border_style="green",
|
|
594
|
-
)
|
|
595
|
-
)
|
|
154
|
+
pipeline = Pipeline(config)
|
|
155
|
+
pipeline.run()
|
|
596
156
|
|
|
597
157
|
except Exception as e:
|
|
598
|
-
console.print()
|
|
599
|
-
|
|
600
|
-
Panel.fit(
|
|
601
|
-
f"[bold red]✗[/bold red] Error: {str(e)}",
|
|
602
|
-
border_style="red",
|
|
603
|
-
)
|
|
604
|
-
)
|
|
605
|
-
if verbose:
|
|
606
|
-
console.print_exception()
|
|
607
|
-
raise typer.Exit(1) from e
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
@validate_app.command(name="files", help="Validate input files")
|
|
611
|
-
def validate_files(
|
|
612
|
-
fasta: Annotated[
|
|
613
|
-
Path | None,
|
|
614
|
-
typer.Option(
|
|
615
|
-
"--fasta",
|
|
616
|
-
"-f",
|
|
617
|
-
help="Reference FASTA file to validate",
|
|
618
|
-
rich_help_panel="Files to Validate",
|
|
619
|
-
),
|
|
620
|
-
] = None,
|
|
621
|
-
bam: Annotated[
|
|
622
|
-
list[str] | None,
|
|
623
|
-
typer.Option(
|
|
624
|
-
"--bam",
|
|
625
|
-
"-b",
|
|
626
|
-
help="BAM files to validate (SAMPLE:PATH format)",
|
|
627
|
-
rich_help_panel="Files to Validate",
|
|
628
|
-
),
|
|
629
|
-
] = None,
|
|
630
|
-
vcf: Annotated[
|
|
631
|
-
list[Path] | None,
|
|
632
|
-
typer.Option(
|
|
633
|
-
"--vcf",
|
|
634
|
-
help="VCF files to validate",
|
|
635
|
-
rich_help_panel="Files to Validate",
|
|
636
|
-
),
|
|
637
|
-
] = None,
|
|
638
|
-
maf: Annotated[
|
|
639
|
-
list[Path] | None,
|
|
640
|
-
typer.Option(
|
|
641
|
-
"--maf",
|
|
642
|
-
help="MAF files to validate",
|
|
643
|
-
rich_help_panel="Files to Validate",
|
|
644
|
-
),
|
|
645
|
-
] = None,
|
|
646
|
-
) -> None:
|
|
647
|
-
"""
|
|
648
|
-
Validate input files for gbcms.
|
|
649
|
-
|
|
650
|
-
Checks:
|
|
651
|
-
- File existence
|
|
652
|
-
- Required indices (.fai for FASTA, .bai for BAM)
|
|
653
|
-
- File format validity
|
|
654
|
-
- Chromosome name consistency
|
|
655
|
-
"""
|
|
656
|
-
|
|
657
|
-
console.print(
|
|
658
|
-
Panel.fit(
|
|
659
|
-
"[bold cyan]File Validation[/bold cyan]\n" "Checking input files for gbcms",
|
|
660
|
-
border_style="cyan",
|
|
661
|
-
)
|
|
662
|
-
)
|
|
663
|
-
|
|
664
|
-
results = Table(title="Validation Results", show_header=True, header_style="bold cyan")
|
|
665
|
-
results.add_column("File Type", style="cyan")
|
|
666
|
-
results.add_column("File Path", style="white")
|
|
667
|
-
results.add_column("Status", style="white")
|
|
668
|
-
results.add_column("Details", style="yellow")
|
|
669
|
-
|
|
670
|
-
# Parse BAM files if provided
|
|
671
|
-
bam_files = {}
|
|
672
|
-
if bam:
|
|
673
|
-
for bam_string in bam:
|
|
674
|
-
sample_name, bam_path = parse_bam_file(bam_string)
|
|
675
|
-
bam_files[sample_name] = bam_path
|
|
676
|
-
|
|
677
|
-
# Parse variant files
|
|
678
|
-
variant_files = []
|
|
679
|
-
input_is_maf = False
|
|
680
|
-
input_is_vcf = False
|
|
681
|
-
|
|
682
|
-
if maf:
|
|
683
|
-
variant_files = [str(f) for f in maf]
|
|
684
|
-
input_is_maf = True
|
|
685
|
-
|
|
686
|
-
if vcf:
|
|
687
|
-
variant_files = [str(f) for f in vcf]
|
|
688
|
-
input_is_vcf = True
|
|
689
|
-
|
|
690
|
-
# Use the unified validation function with rich output
|
|
691
|
-
# Note: fasta can be None, but if it is, the validation will handle it appropriately
|
|
692
|
-
is_valid, results_table = validate_input_files(
|
|
693
|
-
fasta or Path(""), bam_files, variant_files, input_is_maf, input_is_vcf, rich_output=True
|
|
694
|
-
)
|
|
695
|
-
|
|
696
|
-
# Handle results based on validation outcome
|
|
697
|
-
if is_valid:
|
|
698
|
-
console.print(
|
|
699
|
-
Panel.fit(
|
|
700
|
-
"[bold green]✓[/bold green] All files validated successfully!",
|
|
701
|
-
border_style="green",
|
|
702
|
-
)
|
|
703
|
-
)
|
|
704
|
-
raise typer.Exit(0)
|
|
705
|
-
else:
|
|
706
|
-
console.print(
|
|
707
|
-
Panel.fit(
|
|
708
|
-
"[bold red]✗[/bold red] Some files failed validation",
|
|
709
|
-
border_style="red",
|
|
710
|
-
)
|
|
711
|
-
)
|
|
712
|
-
raise typer.Exit(1)
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
@app.command(name="info", help="Show information about gbcms")
|
|
716
|
-
def show_info() -> None:
|
|
717
|
-
"""Display information about gbcms capabilities."""
|
|
718
|
-
info_table = Table(title="gbcms Information", show_header=False, border_style="cyan")
|
|
719
|
-
info_table.add_column("Category", style="bold cyan")
|
|
720
|
-
info_table.add_column("Details", style="white")
|
|
721
|
-
|
|
722
|
-
info_table.add_row("Version", __version__)
|
|
723
|
-
info_table.add_row("Supported Input", "VCF, MAF")
|
|
724
|
-
info_table.add_row("Supported Output", "VCF-like, MAF, Fillout")
|
|
725
|
-
info_table.add_row("Variant Types", "SNP, DNP, Insertion, Deletion")
|
|
726
|
-
info_table.add_row(
|
|
727
|
-
"Quality Filters", "Mapping quality, Base quality, Duplicates, QC failed, etc."
|
|
728
|
-
)
|
|
729
|
-
info_table.add_row("Counting Methods", "DMP (default), Generic")
|
|
730
|
-
info_table.add_row("Parallelization", "Multi-threaded with configurable threads")
|
|
731
|
-
info_table.add_row("Dependencies", "pysam, numpy, typer, rich")
|
|
732
|
-
|
|
733
|
-
console.print(info_table)
|
|
734
|
-
console.print()
|
|
735
|
-
|
|
736
|
-
console.print("[bold cyan]Example Usage:[/bold cyan]")
|
|
737
|
-
console.print(
|
|
738
|
-
" gbcms count run --fasta ref.fa --bam s1:s1.bam --vcf vars.vcf --output out.txt"
|
|
739
|
-
)
|
|
740
|
-
console.print(" gbcms validate files --fasta ref.fa --bam s1:s1.bam")
|
|
741
|
-
console.print(" gbcms version")
|
|
158
|
+
console.print(f"[bold red]Error: {e}[/bold red]")
|
|
159
|
+
raise typer.Exit(code=1) from e
|
|
742
160
|
|
|
743
161
|
|
|
744
162
|
if __name__ == "__main__":
|