py-gbcms 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +13 -0
- gbcms/cli.py +745 -0
- gbcms/config.py +98 -0
- gbcms/counter.py +1074 -0
- gbcms/models.py +295 -0
- gbcms/numba_counter.py +394 -0
- gbcms/output.py +573 -0
- gbcms/parallel.py +129 -0
- gbcms/processor.py +293 -0
- gbcms/reference.py +86 -0
- gbcms/variant.py +390 -0
- py_gbcms-2.0.0.dist-info/METADATA +506 -0
- py_gbcms-2.0.0.dist-info/RECORD +16 -0
- py_gbcms-2.0.0.dist-info/WHEEL +4 -0
- py_gbcms-2.0.0.dist-info/entry_points.txt +2 -0
- py_gbcms-2.0.0.dist-info/licenses/LICENSE +664 -0
gbcms/cli.py
ADDED
|
@@ -0,0 +1,745 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .config import Config
|
|
11
|
+
from .processor import VariantProcessor
|
|
12
|
+
|
|
13
|
+
# Root command group of the gbcms CLI; help strings are rendered with rich markup
app = typer.Typer(
    name="gbcms",
    help="Python implementation of gbcms for calculating base counts in BAM files",
    add_completion=False,
    rich_markup_mode="rich",
    no_args_is_help=True,
)

# Shared Rich console that every command writes its output to
console = Console()


# Sub-command groups, mounted on the root app as `gbcms count` and `gbcms validate`
count_app = typer.Typer(help="Count bases at variant positions")
app.add_typer(count_app, name="count")

validate_app = typer.Typer(help="Validate input files")
app.add_typer(validate_app, name="validate")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_input_files(
|
|
34
|
+
fasta: Path,
|
|
35
|
+
bam_files: dict[str, str],
|
|
36
|
+
variant_files: list[str],
|
|
37
|
+
input_is_maf: bool,
|
|
38
|
+
input_is_vcf: bool,
|
|
39
|
+
rich_output: bool = False,
|
|
40
|
+
) -> tuple[bool, Table | None]:
|
|
41
|
+
"""
|
|
42
|
+
Validate input files for gbcms processing.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
fasta: Path to reference FASTA file
|
|
46
|
+
bam_files: Dictionary of sample names to BAM file paths
|
|
47
|
+
variant_files: List of variant file paths
|
|
48
|
+
input_is_maf: Whether input files are in MAF format
|
|
49
|
+
input_is_vcf: Whether input files are in VCF format
|
|
50
|
+
rich_output: Whether to return detailed rich table for visual output
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
Tuple of (is_valid, results_table_or_None)
|
|
54
|
+
"""
|
|
55
|
+
from pathlib import Path
|
|
56
|
+
|
|
57
|
+
if rich_output:
|
|
58
|
+
from rich.table import Table
|
|
59
|
+
|
|
60
|
+
results = Table(title="Validation Results", show_header=True, header_style="bold cyan")
|
|
61
|
+
results.add_column("File Type", style="cyan")
|
|
62
|
+
results.add_column("File Path", style="white")
|
|
63
|
+
results.add_column("Status", style="white")
|
|
64
|
+
results.add_column("Details", style="yellow")
|
|
65
|
+
|
|
66
|
+
console.print(
|
|
67
|
+
Panel.fit(
|
|
68
|
+
"[bold cyan]File Validation[/bold cyan]\n" "Checking input files for gbcms",
|
|
69
|
+
border_style="cyan",
|
|
70
|
+
)
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
all_valid = True
|
|
74
|
+
|
|
75
|
+
# Validate FASTA file and index
|
|
76
|
+
if not fasta or str(fasta) == "":
|
|
77
|
+
# Skip FASTA validation if not provided
|
|
78
|
+
pass
|
|
79
|
+
elif not fasta.exists():
|
|
80
|
+
if rich_output:
|
|
81
|
+
results.add_row("FASTA", str(fasta), "❌ FAIL", "File not found")
|
|
82
|
+
else:
|
|
83
|
+
console.print(f"[red]Error:[/red] FASTA file not found: {fasta}")
|
|
84
|
+
all_valid = False
|
|
85
|
+
else:
|
|
86
|
+
fai_file = Path(str(fasta) + ".fai")
|
|
87
|
+
if not fai_file.exists():
|
|
88
|
+
if rich_output:
|
|
89
|
+
results.add_row("FASTA", str(fasta), "⚠️ WARN", "Index (.fai) not found")
|
|
90
|
+
else:
|
|
91
|
+
console.print(f"[red]Error:[/red] FASTA index (.fai) not found: {fai_file}")
|
|
92
|
+
console.print("Please index your FASTA file with: samtools faidx reference.fa")
|
|
93
|
+
all_valid = False
|
|
94
|
+
else:
|
|
95
|
+
if rich_output:
|
|
96
|
+
results.add_row("FASTA", str(fasta), "✅ PASS", "File and index found")
|
|
97
|
+
|
|
98
|
+
# Validate BAM files and indices
|
|
99
|
+
for sample_name, bam_path in bam_files.items():
|
|
100
|
+
bam_file = Path(bam_path)
|
|
101
|
+
if not bam_file.exists():
|
|
102
|
+
if rich_output:
|
|
103
|
+
results.add_row("BAM", f"{sample_name}:{bam_path}", "❌ FAIL", "File not found")
|
|
104
|
+
else:
|
|
105
|
+
console.print(f"[red]Error:[/red] BAM file not found: {bam_file}")
|
|
106
|
+
all_valid = False
|
|
107
|
+
else:
|
|
108
|
+
# Check for BAM index files
|
|
109
|
+
bai_file1 = Path(str(bam_file).replace(".bam", ".bai"))
|
|
110
|
+
bai_file2 = Path(str(bam_file) + ".bai")
|
|
111
|
+
|
|
112
|
+
if not bai_file1.exists() and not bai_file2.exists():
|
|
113
|
+
if rich_output:
|
|
114
|
+
results.add_row(
|
|
115
|
+
"BAM", f"{sample_name}:{bam_path}", "⚠️ WARN", "Index (.bai) not found"
|
|
116
|
+
)
|
|
117
|
+
else:
|
|
118
|
+
console.print(f"[red]Error:[/red] BAM index not found for: {bam_file}")
|
|
119
|
+
console.print(f"Expected: {bai_file1} or {bai_file2}")
|
|
120
|
+
console.print("Please index your BAM file with: samtools index sample.bam")
|
|
121
|
+
all_valid = False
|
|
122
|
+
else:
|
|
123
|
+
if rich_output:
|
|
124
|
+
results.add_row(
|
|
125
|
+
"BAM", f"{sample_name}:{bam_path}", "✅ PASS", "File and index found"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Validate variant files
|
|
129
|
+
for variant_file in variant_files:
|
|
130
|
+
vcf = Path(variant_file)
|
|
131
|
+
if not vcf.exists():
|
|
132
|
+
if rich_output:
|
|
133
|
+
results.add_row(
|
|
134
|
+
"VCF" if input_is_vcf else "MAF", str(vcf), "❌ FAIL", "File not found"
|
|
135
|
+
)
|
|
136
|
+
else:
|
|
137
|
+
console.print(f"[red]Error:[/red] Variant file not found: {vcf}")
|
|
138
|
+
all_valid = False
|
|
139
|
+
else:
|
|
140
|
+
if rich_output:
|
|
141
|
+
results.add_row("VCF" if input_is_vcf else "MAF", str(vcf), "✅ PASS", "File found")
|
|
142
|
+
|
|
143
|
+
if rich_output:
|
|
144
|
+
return all_valid, results
|
|
145
|
+
else:
|
|
146
|
+
return all_valid, None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@app.command(name="version", help="Show version information")
def show_version() -> None:
    """Render the package version inside a titled panel, then exit."""
    banner_text = "\n".join(
        (
            "[bold cyan]py-gbcms[/bold cyan]",
            f"Version: [green]{__version__}[/green]",
            "Python implementation of GetBaseCountsMultiSample (gbcms)",
        )
    )
    version_panel = Panel.fit(banner_text, border_style="cyan", title="Version Info")
    console.print(version_panel)
    # Explicit exit with the default status code, as in the other commands
    raise typer.Exit()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def parse_bam_file(bam_string: str) -> tuple[str, str]:
    """
    Parse BAM file string in format SAMPLE:BAM_PATH.

    The string is split on the first colon only, so the path part may itself
    contain colons (for example a Windows drive letter).

    Args:
        bam_string: String in format "sample_name:bam_path"

    Returns:
        Tuple of (sample_name, bam_path)

    Raises:
        typer.Exit: With exit code 1 when the colon is missing or either
            side of it is empty or whitespace only.
    """
    sample_name, separator, bam_path = bam_string.partition(":")

    # An empty name or path is as unusable as a missing separator: an empty
    # path would later be interpreted as the current directory.
    if not separator or not sample_name.strip() or not bam_path.strip():
        console.print(
            f"[red]Error:[/red] Incorrect format for --bam parameter: {bam_string}",
        )
        console.print("Expected format: SAMPLE_NAME:BAM_FILE")
        raise typer.Exit(1)

    return sample_name, bam_path
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def load_bam_fof(bam_fof_path: str) -> dict[str, str]:
    """
    Load BAM files from file-of-files.

    Each non-blank line that does not start with ``#`` must hold exactly two
    tab-separated, non-empty fields: the sample name and the BAM path.
    Whitespace around either field is ignored.

    Args:
        bam_fof_path: Path to file containing sample names and BAM paths

    Returns:
        Dictionary mapping sample names to BAM paths

    Raises:
        typer.Exit: With exit code 1 on a malformed line or a sample name
            that appears more than once.
    """
    bam_files: dict[str, str] = {}

    # Explicit encoding so the file is read the same way on every platform
    with open(bam_fof_path, encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, 1):
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            fields = [field.strip() for field in line.split("\t")]
            if len(fields) != 2 or not all(fields):
                console.print(
                    f"[red]Error:[/red] Incorrect format at line {line_num} in {bam_fof_path}",
                )
                console.print("Expected format: SAMPLE_NAME<TAB>BAM_FILE")
                raise typer.Exit(1)

            sample_name, bam_path = fields
            if sample_name in bam_files:
                console.print(
                    f"[red]Error:[/red] Duplicate sample name: {sample_name}",
                )
                raise typer.Exit(1)

            bam_files[sample_name] = bam_path

    return bam_files
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
@count_app.command(name="run", help="Run base counting on variants")
def count_run(
    # Required arguments
    fasta: Annotated[
        Path,
        typer.Option(
            "--fasta",
            "-f",
            help="[bold cyan]Reference genome FASTA file[/bold cyan] (must be indexed with .fai)",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            rich_help_panel="📁 Required Input Files",
        ),
    ],
    output: Annotated[
        Path,
        typer.Option(
            "--output",
            "-o",
            help="[bold cyan]Output file path[/bold cyan]",
            rich_help_panel="📁 Required Input Files",
        ),
    ],
    # BAM input options
    bam: Annotated[
        list[str] | None,
        typer.Option(
            "--bam",
            "-b",
            help="BAM file in format [yellow]SAMPLE_NAME:BAM_FILE[/yellow] (can be specified multiple times)",
            rich_help_panel="🧬 BAM Input",
        ),
    ] = None,
    bam_fof: Annotated[
        Path | None,
        typer.Option(
            "--bam-fof",
            help="File containing sample names and BAM paths (tab-separated)",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            rich_help_panel="🧬 BAM Input",
        ),
    ] = None,
    # Variant input options (mutually exclusive)
    maf: Annotated[
        list[Path] | None,
        typer.Option(
            "--maf",
            help="Input variant file in [green]MAF format[/green] (can be specified multiple times)",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            rich_help_panel="🔬 Variant Input",
        ),
    ] = None,
    vcf: Annotated[
        list[Path] | None,
        typer.Option(
            "--vcf",
            help="Input variant file in [green]VCF format[/green] (can be specified multiple times)",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            rich_help_panel="🔬 Variant Input",
        ),
    ] = None,
    # Output format
    omaf: Annotated[
        bool,
        typer.Option(
            "--omaf",
            help="Output in MAF format (only with MAF input)",
            rich_help_panel="📤 Output Options",
        ),
    ] = False,
    positive_count: Annotated[
        bool,
        typer.Option(
            "--positive-count/--no-positive-count",
            help="Output positive strand counts (DPP/RDP/ADP)",
            rich_help_panel="📤 Output Options",
        ),
    ] = True,
    negative_count: Annotated[
        bool,
        typer.Option(
            "--negative-count/--no-negative-count",
            help="Output negative strand counts (DPN/RDN/ADN)",
            rich_help_panel="📤 Output Options",
        ),
    ] = False,
    fragment_count: Annotated[
        bool,
        typer.Option(
            "--fragment-count/--no-fragment-count",
            help="Output fragment counts (DPF/RDF/ADF)",
            rich_help_panel="📤 Output Options",
        ),
    ] = False,
    fragment_fractional_weight: Annotated[
        bool,
        typer.Option(
            "--fragment-fractional-weight",
            help="Use fractional weight (0.5) for fragments with disagreement",
            rich_help_panel="📤 Output Options",
        ),
    ] = False,
    # Quality filters
    maq: Annotated[
        int,
        typer.Option(
            "--maq",
            help="Mapping quality threshold",
            min=0,
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = 20,
    baq: Annotated[
        int,
        typer.Option(
            "--baq",
            help="Base quality threshold",
            min=0,
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = 0,
    filter_duplicate: Annotated[
        bool,
        typer.Option(
            "--filter-duplicate/--no-filter-duplicate",
            help="Filter reads marked as duplicate",
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = True,
    filter_improper_pair: Annotated[
        bool,
        typer.Option(
            "--filter-improper-pair/--no-filter-improper-pair",
            help="Filter reads marked as improperly paired",
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = False,
    filter_qc_failed: Annotated[
        bool,
        typer.Option(
            "--filter-qc-failed/--no-filter-qc-failed",
            help="Filter reads marked as QC failed",
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = False,
    filter_indel: Annotated[
        bool,
        typer.Option(
            "--filter-indel/--no-filter-indel",
            help="Filter reads containing indels",
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = False,
    filter_non_primary: Annotated[
        bool,
        typer.Option(
            "--filter-non-primary/--no-filter-non-primary",
            help="Filter non-primary alignments",
            rich_help_panel="🔍 Quality Filters",
        ),
    ] = False,
    # Performance options
    thread: Annotated[
        int,
        typer.Option(
            "--thread",
            "-t",
            help="Number of threads for parallel processing",
            min=1,
            rich_help_panel="⚡ Performance",
        ),
    ] = 1,
    backend: Annotated[
        str,
        typer.Option(
            "--backend",
            help="Parallelization backend: 'joblib' (default), 'loky', 'threading', or 'multiprocessing'",
            rich_help_panel="⚡ Performance",
        ),
    ] = "joblib",
    max_block_size: Annotated[
        int,
        typer.Option(
            "--max-block-size",
            help="Maximum number of variants per block",
            min=1,
            rich_help_panel="⚡ Performance",
        ),
    ] = 10000,
    max_block_dist: Annotated[
        int,
        typer.Option(
            "--max-block-dist",
            help="Maximum block distance in base pairs",
            min=1,
            rich_help_panel="⚡ Performance",
        ),
    ] = 100000,
    # Advanced options
    generic_counting: Annotated[
        bool,
        typer.Option(
            "--generic-counting",
            help="Use generic counting algorithm for complex variants",
            rich_help_panel="🔧 Advanced",
        ),
    ] = False,
    suppress_warning: Annotated[
        int,
        typer.Option(
            "--suppress-warning",
            help="Maximum number of warnings per type",
            min=0,
            rich_help_panel="🔧 Advanced",
        ),
    ] = 3,
    # Other options
    verbose: Annotated[
        bool,
        typer.Option(
            "--verbose",
            "-v",
            help="Enable verbose logging",
            rich_help_panel="🔧 Advanced",
        ),
    ] = False,
) -> None:
    """
    Calculate base counts in multiple BAM files for variants in VCF/MAF files.

    This tool counts the number of reference and alternate alleles at each variant
    position across multiple BAM files, with support for various quality filters
    and output formats.

    The command checks the option combination, gathers BAM inputs from --bam
    and --bam-fof, verifies that all files and indices exist, prints the
    effective configuration, and then hands a Config to VariantProcessor.

    Raises:
        typer.Exit: With exit code 1 on an invalid option combination, a
            duplicate sample name, a failed file validation, or any exception
            raised while building the configuration or processing variants.
    """
    # Local import: user-controlled text (sample names, exception messages)
    # must be escaped before it is embedded in rich markup.
    from rich.markup import escape

    def fail(message: str) -> None:
        """Print an error line and abort the command with exit code 1."""
        console.print(f"[red]Error:[/red] {message}")
        raise typer.Exit(1)

    # Print banner
    console.print(
        Panel.fit(
            f"[bold cyan]py-gbcms[/bold cyan] v{__version__}\n"
            "Python implementation of GetBaseCountsMultiSample",
            border_style="cyan",
        )
    )

    # Validate option combinations
    if not bam and not bam_fof:
        fail("Please specify at least one BAM file with --bam or --bam-fof")
    if not maf and not vcf:
        fail("Please specify at least one variant file with --maf or --vcf")
    if maf and vcf:
        fail("--maf and --vcf are mutually exclusive")
    if omaf and not maf:
        # The option's own help text restricts --omaf to MAF input
        fail("--omaf can only be used with MAF input (--maf)")

    # Collect BAM files from both sources; sample names must be unique overall
    bam_files: dict[str, str] = {}

    def add_sample(sample_name: str, bam_path: str) -> None:
        if sample_name in bam_files:
            fail(f"Duplicate sample name: {escape(sample_name)}")
        bam_files[sample_name] = bam_path

    for bam_string in bam or []:
        add_sample(*parse_bam_file(bam_string))

    if bam_fof:
        for sample_name, bam_path in load_bam_fof(str(bam_fof)).items():
            add_sample(sample_name, bam_path)

    # Exactly one of maf / vcf is non-empty at this point
    input_is_maf = bool(maf)
    input_is_vcf = bool(vcf)
    variant_files = [str(path) for path in (maf or vcf)]

    # Validate input files before processing
    files_ok, _ = validate_input_files(
        fasta, bam_files, variant_files, input_is_maf, input_is_vcf
    )
    if not files_ok:
        raise typer.Exit(1)

    # Display configuration
    config_table = Table(title="Configuration", show_header=False, border_style="cyan")
    config_table.add_column("Parameter", style="cyan")
    config_table.add_column("Value", style="green")

    config_table.add_row("Reference FASTA", str(fasta))
    config_table.add_row("Number of BAM files", str(len(bam_files)))
    config_table.add_row("Number of variant files", str(len(variant_files)))
    config_table.add_row("Input format", "MAF" if input_is_maf else "VCF")
    config_table.add_row("Output file", str(output))
    config_table.add_row("Threads", str(thread))
    config_table.add_row("Backend", backend)
    config_table.add_row("Mapping quality threshold", str(maq))
    config_table.add_row("Base quality threshold", str(baq))

    console.print(config_table)
    console.print()

    try:
        # Create configuration using legacy Config format (processor expects this)
        config = Config(
            fasta_file=str(fasta),
            bam_files=bam_files,
            variant_files=variant_files,
            output_file=str(output),
            mapping_quality_threshold=maq,
            base_quality_threshold=baq,
            filter_duplicate=filter_duplicate,
            filter_improper_pair=filter_improper_pair,
            filter_qc_failed=filter_qc_failed,
            filter_indel=filter_indel,
            filter_non_primary=filter_non_primary,
            output_positive_count=positive_count,
            output_negative_count=negative_count,
            output_fragment_count=fragment_count,
            fragment_fractional_weight=fragment_fractional_weight,
            max_block_size=max_block_size,
            max_block_dist=max_block_dist,
            num_threads=thread,
            backend=backend,
            input_is_maf=input_is_maf,
            input_is_vcf=input_is_vcf,
            output_maf=omaf,
            generic_counting=generic_counting,
            max_warning_per_type=suppress_warning,
        )

        # Process variants
        processor = VariantProcessor(config)
        processor.process()

        # Success message
        console.print()
        console.print(
            Panel.fit(
                "[bold green]✓[/bold green] Processing completed successfully!",
                border_style="green",
            )
        )

    except Exception as e:
        # Top-level boundary: report the failure and convert it to exit code 1
        console.print()
        console.print(
            Panel.fit(
                # Escaped so brackets in the message are shown literally
                # instead of being parsed as rich markup tags
                f"[bold red]✗[/bold red] Error: {escape(str(e))}",
                border_style="red",
            )
        )
        if verbose:
            # In this function --verbose only adds the traceback
            console.print_exception()
        raise typer.Exit(1) from e
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
@validate_app.command(name="files", help="Validate input files")
def validate_files(
    fasta: Annotated[
        Path | None,
        typer.Option(
            "--fasta",
            "-f",
            help="Reference FASTA file to validate",
            rich_help_panel="Files to Validate",
        ),
    ] = None,
    bam: Annotated[
        list[str] | None,
        typer.Option(
            "--bam",
            "-b",
            help="BAM files to validate (SAMPLE:PATH format)",
            rich_help_panel="Files to Validate",
        ),
    ] = None,
    vcf: Annotated[
        list[Path] | None,
        typer.Option(
            "--vcf",
            help="VCF files to validate",
            rich_help_panel="Files to Validate",
        ),
    ] = None,
    maf: Annotated[
        list[Path] | None,
        typer.Option(
            "--maf",
            help="MAF files to validate",
            rich_help_panel="Files to Validate",
        ),
    ] = None,
) -> None:
    """
    Validate input files for gbcms.

    Checks:
    - File existence
    - Required indices (.fai for FASTA, .bai for BAM)

    The per-file results are printed as a table, followed by a summary panel.

    Raises:
        typer.Exit: Exit code 0 when every supplied file passes, 1 when any
            check fails, when --maf and --vcf are combined, or when a sample
            name is given twice.
    """
    # Local import: sample names are user input and must be escaped before
    # being embedded in rich markup.
    from rich.markup import escape

    # Same rule as `count run`; previously --maf was silently dropped here
    # whenever --vcf was also given.
    if maf and vcf:
        console.print(
            "[red]Error:[/red] --maf and --vcf are mutually exclusive",
        )
        raise typer.Exit(1)

    # Parse BAM files if provided
    bam_files: dict[str, str] = {}
    for bam_string in bam or []:
        sample_name, bam_path = parse_bam_file(bam_string)
        if sample_name in bam_files:
            # A repeated name would otherwise overwrite the earlier entry and
            # leave one of the BAM files unchecked.
            console.print(
                f"[red]Error:[/red] Duplicate sample name: {escape(sample_name)}",
            )
            raise typer.Exit(1)
        bam_files[sample_name] = bam_path

    # Parse variant files (at most one of maf / vcf is set here)
    input_is_maf = bool(maf)
    input_is_vcf = bool(vcf)
    variant_files = [str(path) for path in (maf or vcf or [])]

    # Use the unified validation function with rich output. It prints the
    # "File Validation" banner itself, and it skips the FASTA checks when
    # fasta is None.
    is_valid, results_table = validate_input_files(
        fasta, bam_files, variant_files, input_is_maf, input_is_vcf, rich_output=True
    )

    # Show the per-file details that were collected
    if results_table is not None:
        console.print(results_table)

    # Summary panel and exit code based on validation outcome
    if is_valid:
        console.print(
            Panel.fit(
                "[bold green]✓[/bold green] All files validated successfully!",
                border_style="green",
            )
        )
        raise typer.Exit(0)

    console.print(
        Panel.fit(
            "[bold red]✗[/bold red] Some files failed validation",
            border_style="red",
        )
    )
    raise typer.Exit(1)
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
@app.command(name="info", help="Show information about gbcms")
def show_info() -> None:
    """Print a two-column summary of gbcms features followed by usage examples."""
    # (category, details) pairs, listed in display order
    feature_rows = (
        ("Version", __version__),
        ("Supported Input", "VCF, MAF"),
        ("Supported Output", "VCF-like, MAF, Fillout"),
        ("Variant Types", "SNP, DNP, Insertion, Deletion"),
        ("Quality Filters", "Mapping quality, Base quality, Duplicates, QC failed, etc."),
        ("Counting Methods", "DMP (default), Generic"),
        ("Parallelization", "Multi-threaded with configurable threads"),
        ("Dependencies", "pysam, numpy, typer, rich"),
    )
    example_commands = (
        " gbcms count run --fasta ref.fa --bam s1:s1.bam --vcf vars.vcf --output out.txt",
        " gbcms validate files --fasta ref.fa --bam s1:s1.bam",
        " gbcms version",
    )

    info_table = Table(title="gbcms Information", show_header=False, border_style="cyan")
    info_table.add_column("Category", style="bold cyan")
    info_table.add_column("Details", style="white")
    for category, details in feature_rows:
        info_table.add_row(category, details)

    console.print(info_table)
    console.print()

    console.print("[bold cyan]Example Usage:[/bold cyan]")
    for command_line in example_commands:
        console.print(command_line)
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
# Run the Typer application when this module is executed as a script
if __name__ == "__main__":
    app()
|