py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/cli.py CHANGED
@@ -1,744 +1,162 @@
1
+ """
2
+ CLI Entry Point: Exposes the gbcms functionality via command line.
3
+ """
4
+
1
5
  from pathlib import Path
2
- from typing import Annotated
3
6
 
4
7
  import typer
5
- from rich.console import Console
6
- from rich.panel import Panel
7
- from rich.table import Table
8
-
9
- from . import __version__
10
- from .config import Config
11
- from .processor import VariantProcessor
12
-
13
- # Initialize Typer app with rich help
14
- app = typer.Typer(
15
- name="gbcms",
16
- help="Python implementation of gbcms for calculating base counts in BAM files",
17
- add_completion=False,
18
- rich_markup_mode="rich",
19
- no_args_is_help=True,
20
- )
21
-
22
- # Initialize Rich console
23
- console = Console()
24
8
 
9
+ from .models.core import GbcmsConfig, OutputFormat
10
+ from .pipeline import Pipeline
25
11
 
26
- # Subcommands
27
- count_app = typer.Typer(help="Count bases at variant positions")
28
- validate_app = typer.Typer(help="Validate input files")
29
- app.add_typer(count_app, name="count")
30
- app.add_typer(validate_app, name="validate")
12
+ app = typer.Typer(help="gbcms: Get Base Counts Multi-Sample")
31
13
 
32
14
 
33
- def validate_input_files(
34
- fasta: Path,
35
- bam_files: dict[str, str],
36
- variant_files: list[str],
37
- input_is_maf: bool,
38
- input_is_vcf: bool,
39
- rich_output: bool = False,
40
- ) -> tuple[bool, Table | None]:
15
+ @app.callback()
16
+ def main():
41
17
  """
42
- Validate input files for gbcms processing.
43
-
44
- Args:
45
- fasta: Path to reference FASTA file
46
- bam_files: Dictionary of sample names to BAM file paths
47
- variant_files: List of variant file paths
48
- input_is_maf: Whether input files are in MAF format
49
- input_is_vcf: Whether input files are in VCF format
50
- rich_output: Whether to return detailed rich table for visual output
51
-
52
- Returns:
53
- Tuple of (is_valid, results_table_or_None)
18
+ gbcms: Get Base Counts Multi-Sample
54
19
  """
55
- from pathlib import Path
56
-
57
- if rich_output:
58
- from rich.table import Table
59
-
60
- results = Table(title="Validation Results", show_header=True, header_style="bold cyan")
61
- results.add_column("File Type", style="cyan")
62
- results.add_column("File Path", style="white")
63
- results.add_column("Status", style="white")
64
- results.add_column("Details", style="yellow")
65
-
66
- console.print(
67
- Panel.fit(
68
- "[bold cyan]File Validation[/bold cyan]\n" "Checking input files for gbcms",
69
- border_style="cyan",
70
- )
71
- )
72
-
73
- all_valid = True
74
-
75
- # Validate FASTA file and index
76
- if not fasta or str(fasta) == "":
77
- # Skip FASTA validation if not provided
78
- pass
79
- elif not fasta.exists():
80
- if rich_output:
81
- results.add_row("FASTA", str(fasta), "❌ FAIL", "File not found")
82
- else:
83
- console.print(f"[red]Error:[/red] FASTA file not found: {fasta}")
84
- all_valid = False
85
- else:
86
- fai_file = Path(str(fasta) + ".fai")
87
- if not fai_file.exists():
88
- if rich_output:
89
- results.add_row("FASTA", str(fasta), "⚠️ WARN", "Index (.fai) not found")
90
- else:
91
- console.print(f"[red]Error:[/red] FASTA index (.fai) not found: {fai_file}")
92
- console.print("Please index your FASTA file with: samtools faidx reference.fa")
93
- all_valid = False
94
- else:
95
- if rich_output:
96
- results.add_row("FASTA", str(fasta), "✅ PASS", "File and index found")
97
-
98
- # Validate BAM files and indices
99
- for sample_name, bam_path in bam_files.items():
100
- bam_file = Path(bam_path)
101
- if not bam_file.exists():
102
- if rich_output:
103
- results.add_row("BAM", f"{sample_name}:{bam_path}", "❌ FAIL", "File not found")
104
- else:
105
- console.print(f"[red]Error:[/red] BAM file not found: {bam_file}")
106
- all_valid = False
107
- else:
108
- # Check for BAM index files
109
- bai_file1 = Path(str(bam_file).replace(".bam", ".bai"))
110
- bai_file2 = Path(str(bam_file) + ".bai")
111
-
112
- if not bai_file1.exists() and not bai_file2.exists():
113
- if rich_output:
114
- results.add_row(
115
- "BAM", f"{sample_name}:{bam_path}", "⚠️ WARN", "Index (.bai) not found"
116
- )
117
- else:
118
- console.print(f"[red]Error:[/red] BAM index not found for: {bam_file}")
119
- console.print(f"Expected: {bai_file1} or {bai_file2}")
120
- console.print("Please index your BAM file with: samtools index sample.bam")
121
- all_valid = False
122
- else:
123
- if rich_output:
124
- results.add_row(
125
- "BAM", f"{sample_name}:{bam_path}", "✅ PASS", "File and index found"
126
- )
127
-
128
- # Validate variant files
129
- for variant_file in variant_files:
130
- vcf = Path(variant_file)
131
- if not vcf.exists():
132
- if rich_output:
133
- results.add_row(
134
- "VCF" if input_is_vcf else "MAF", str(vcf), "❌ FAIL", "File not found"
135
- )
136
- else:
137
- console.print(f"[red]Error:[/red] Variant file not found: {vcf}")
138
- all_valid = False
139
- else:
140
- if rich_output:
141
- results.add_row("VCF" if input_is_vcf else "MAF", str(vcf), "✅ PASS", "File found")
142
-
143
- if rich_output:
144
- return all_valid, results
145
- else:
146
- return all_valid, None
147
-
148
-
149
- @app.command(name="version", help="Show version information")
150
- def show_version() -> None:
151
- """Print version and exit."""
152
- console.print(
153
- Panel.fit(
154
- f"[bold cyan]py-gbcms[/bold cyan]\n"
155
- f"Version: [green]{__version__}[/green]\n"
156
- f"Python implementation of GetBaseCountsMultiSample (gbcms)",
157
- border_style="cyan",
158
- title="Version Info",
159
- )
160
- )
161
- raise typer.Exit()
162
-
163
-
164
- def parse_bam_file(bam_string: str) -> tuple[str, str]:
165
- """
166
- Parse BAM file string in format SAMPLE:BAM_PATH.
167
-
168
- Args:
169
- bam_string: String in format "sample_name:bam_path"
170
-
171
- Returns:
172
- Tuple of (sample_name, bam_path)
173
- """
174
- parts = bam_string.split(":", 1)
175
- if len(parts) != 2:
176
- console.print(
177
- f"[red]Error:[/red] Incorrect format for --bam parameter: {bam_string}",
178
- )
179
- console.print("Expected format: SAMPLE_NAME:BAM_FILE")
180
- raise typer.Exit(1)
181
- return parts[0], parts[1]
182
-
183
-
184
- def load_bam_fof(bam_fof_path: str) -> dict[str, str]:
185
- """
186
- Load BAM files from file-of-files.
187
-
188
- Args:
189
- bam_fof_path: Path to file containing sample names and BAM paths
190
-
191
- Returns:
192
- Dictionary mapping sample names to BAM paths
193
- """
194
- bam_files = {}
195
- with open(bam_fof_path) as f:
196
- for line_num, line in enumerate(f, 1):
197
- line = line.strip()
198
- if not line or line.startswith("#"):
199
- continue
200
-
201
- parts = line.split("\t")
202
- if len(parts) != 2:
203
- console.print(
204
- f"[red]Error:[/red] Incorrect format at line {line_num} in {bam_fof_path}",
205
- )
206
- console.print("Expected format: SAMPLE_NAME<TAB>BAM_FILE")
207
- raise typer.Exit(1)
208
-
209
- sample_name, bam_path = parts
210
- if sample_name in bam_files:
211
- console.print(
212
- f"[red]Error:[/red] Duplicate sample name: {sample_name}",
213
- )
214
- raise typer.Exit(1)
215
-
216
- bam_files[sample_name] = bam_path
217
-
218
- return bam_files
219
-
220
-
221
- @count_app.command(name="run", help="Run base counting on variants")
222
- def count_run(
223
- # Required arguments
224
- fasta: Annotated[
225
- Path,
226
- typer.Option(
227
- "--fasta",
228
- "-f",
229
- help="[bold cyan]Reference genome FASTA file[/bold cyan] (must be indexed with .fai)",
230
- exists=True,
231
- file_okay=True,
232
- dir_okay=False,
233
- readable=True,
234
- rich_help_panel="📁 Required Input Files",
235
- ),
236
- ],
237
- output: Annotated[
238
- Path,
239
- typer.Option(
240
- "--output",
241
- "-o",
242
- help="[bold cyan]Output file path[/bold cyan]",
243
- rich_help_panel="📁 Required Input Files",
244
- ),
245
- ],
246
- # BAM input options
247
- bam: Annotated[
248
- list[str] | None,
249
- typer.Option(
250
- "--bam",
251
- "-b",
252
- help="BAM file in format [yellow]SAMPLE_NAME:BAM_FILE[/yellow] (can be specified multiple times)",
253
- rich_help_panel="🧬 BAM Input",
254
- ),
255
- ] = None,
256
- bam_fof: Annotated[
257
- Path | None,
258
- typer.Option(
259
- "--bam-fof",
260
- help="File containing sample names and BAM paths (tab-separated)",
261
- exists=True,
262
- file_okay=True,
263
- dir_okay=False,
264
- readable=True,
265
- rich_help_panel="🧬 BAM Input",
266
- ),
267
- ] = None,
268
- # Variant input options (mutually exclusive)
269
- maf: Annotated[
270
- list[Path] | None,
271
- typer.Option(
272
- "--maf",
273
- help="Input variant file in [green]MAF format[/green] (can be specified multiple times)",
274
- exists=True,
275
- file_okay=True,
276
- dir_okay=False,
277
- readable=True,
278
- rich_help_panel="🔬 Variant Input",
279
- ),
280
- ] = None,
281
- vcf: Annotated[
282
- list[Path] | None,
283
- typer.Option(
284
- "--vcf",
285
- help="Input variant file in [green]VCF format[/green] (can be specified multiple times)",
286
- exists=True,
287
- file_okay=True,
288
- dir_okay=False,
289
- readable=True,
290
- rich_help_panel="🔬 Variant Input",
291
- ),
292
- ] = None,
293
- # Output format
294
- omaf: Annotated[
295
- bool,
296
- typer.Option(
297
- "--omaf",
298
- help="Output in MAF format (only with MAF input)",
299
- rich_help_panel="📤 Output Options",
300
- ),
301
- ] = False,
302
- positive_count: Annotated[
303
- bool,
304
- typer.Option(
305
- "--positive-count/--no-positive-count",
306
- help="Output positive strand counts (DPP/RDP/ADP)",
307
- rich_help_panel="📤 Output Options",
308
- ),
309
- ] = True,
310
- negative_count: Annotated[
311
- bool,
312
- typer.Option(
313
- "--negative-count/--no-negative-count",
314
- help="Output negative strand counts (DPN/RDN/ADN)",
315
- rich_help_panel="📤 Output Options",
316
- ),
317
- ] = False,
318
- fragment_count: Annotated[
319
- bool,
320
- typer.Option(
321
- "--fragment-count/--no-fragment-count",
322
- help="Output fragment counts (DPF/RDF/ADF)",
323
- rich_help_panel="📤 Output Options",
324
- ),
325
- ] = False,
326
- fragment_fractional_weight: Annotated[
327
- bool,
328
- typer.Option(
329
- "--fragment-fractional-weight",
330
- help="Use fractional weight (0.5) for fragments with disagreement",
331
- rich_help_panel="📤 Output Options",
332
- ),
333
- ] = False,
334
- # Quality filters
335
- maq: Annotated[
336
- int,
337
- typer.Option(
338
- "--maq",
339
- help="Mapping quality threshold",
340
- min=0,
341
- rich_help_panel="🔍 Quality Filters",
342
- ),
343
- ] = 20,
344
- baq: Annotated[
345
- int,
346
- typer.Option(
347
- "--baq",
348
- help="Base quality threshold",
349
- min=0,
350
- rich_help_panel="🔍 Quality Filters",
351
- ),
352
- ] = 0,
353
- filter_duplicate: Annotated[
354
- bool,
355
- typer.Option(
356
- "--filter-duplicate/--no-filter-duplicate",
357
- help="Filter reads marked as duplicate",
358
- rich_help_panel="🔍 Quality Filters",
359
- ),
360
- ] = True,
361
- filter_improper_pair: Annotated[
362
- bool,
363
- typer.Option(
364
- "--filter-improper-pair/--no-filter-improper-pair",
365
- help="Filter reads marked as improperly paired",
366
- rich_help_panel="🔍 Quality Filters",
367
- ),
368
- ] = False,
369
- filter_qc_failed: Annotated[
370
- bool,
371
- typer.Option(
372
- "--filter-qc-failed/--no-filter-qc-failed",
373
- help="Filter reads marked as QC failed",
374
- rich_help_panel="🔍 Quality Filters",
375
- ),
376
- ] = False,
377
- filter_indel: Annotated[
378
- bool,
379
- typer.Option(
380
- "--filter-indel/--no-filter-indel",
381
- help="Filter reads containing indels",
382
- rich_help_panel="🔍 Quality Filters",
383
- ),
384
- ] = False,
385
- filter_non_primary: Annotated[
386
- bool,
387
- typer.Option(
388
- "--filter-non-primary/--no-filter-non-primary",
389
- help="Filter non-primary alignments",
390
- rich_help_panel="🔍 Quality Filters",
391
- ),
392
- ] = False,
393
- # Performance options
394
- thread: Annotated[
395
- int,
396
- typer.Option(
397
- "--thread",
398
- "-t",
399
- help="Number of threads for parallel processing",
400
- min=1,
401
- rich_help_panel="⚡ Performance",
402
- ),
403
- ] = 1,
404
- backend: Annotated[
405
- str,
406
- typer.Option(
407
- "--backend",
408
- help="Parallelization backend: 'joblib' (default), 'loky', 'threading', or 'multiprocessing'",
409
- rich_help_panel="⚡ Performance",
410
- ),
411
- ] = "joblib",
412
- max_block_size: Annotated[
413
- int,
414
- typer.Option(
415
- "--max-block-size",
416
- help="Maximum number of variants per block",
417
- min=1,
418
- rich_help_panel="⚡ Performance",
419
- ),
420
- ] = 10000,
421
- max_block_dist: Annotated[
422
- int,
423
- typer.Option(
424
- "--max-block-dist",
425
- help="Maximum block distance in base pairs",
426
- min=1,
427
- rich_help_panel="⚡ Performance",
428
- ),
429
- ] = 100000,
430
- # Advanced options
431
- generic_counting: Annotated[
432
- bool,
433
- typer.Option(
434
- "--generic-counting",
435
- help="Use generic counting algorithm for complex variants",
436
- rich_help_panel="🔧 Advanced",
437
- ),
438
- ] = False,
439
- suppress_warning: Annotated[
440
- int,
441
- typer.Option(
442
- "--suppress-warning",
443
- help="Maximum number of warnings per type",
444
- min=0,
445
- rich_help_panel="🔧 Advanced",
446
- ),
447
- ] = 3,
448
- # Other options
449
- verbose: Annotated[
450
- bool,
451
- typer.Option(
452
- "--verbose",
453
- "-v",
454
- help="Enable verbose logging",
455
- rich_help_panel="🔧 Advanced",
456
- ),
457
- ] = False,
458
- ) -> None:
20
+ pass
21
+
22
+
23
+ @app.command()
24
+ def run(
25
+ variant_file: Path = typer.Option(
26
+ ..., "--variants", "-v", help="Path to VCF or MAF file containing variants"
27
+ ),
28
+ bam_files: list[Path] | None = typer.Option(
29
+ None, "--bam", "-b", help="Path to BAM file(s). Can be specified multiple times."
30
+ ),
31
+ bam_list: Path | None = typer.Option(
32
+ None, "--bam-list", "-L", help="File containing list of BAM paths (one per line)"
33
+ ),
34
+ reference: Path = typer.Option(..., "--fasta", "-f", help="Path to reference FASTA file"),
35
+ output_dir: Path = typer.Option(
36
+ ..., "--output-dir", "-o", help="Directory to write output files"
37
+ ),
38
+ output_format: OutputFormat = typer.Option(
39
+ OutputFormat.VCF, "--format", help="Output format (vcf or maf)"
40
+ ),
41
+ output_suffix: str = typer.Option(
42
+ "", "--suffix", "-S", help="Suffix to append to output filename (e.g. '.genotyped')"
43
+ ),
44
+ min_mapq: int = typer.Option(20, "--min-mapq", help="Minimum mapping quality"),
45
+ min_baseq: int = typer.Option(0, "--min-baseq", help="Minimum base quality"),
46
+ filter_duplicates: bool = typer.Option(True, help="Filter duplicate reads"),
47
+ filter_secondary: bool = typer.Option(False, help="Filter secondary alignments"),
48
+ filter_supplementary: bool = typer.Option(False, help="Filter supplementary alignments"),
49
+ filter_qc_failed: bool = typer.Option(False, help="Filter reads failing QC"),
50
+ filter_improper_pair: bool = typer.Option(False, help="Filter improperly paired reads"),
51
+ filter_indel: bool = typer.Option(False, help="Filter reads containing indels"),
52
+ threads: int = typer.Option(
53
+ 1, "--threads", "-t", help="Number of threads (not yet implemented in v2 python layer)"
54
+ ),
55
+ verbose: bool = typer.Option(False, "--verbose", "-V", help="Enable verbose debug logging"),
56
+ ):
459
57
  """
460
- Calculate base counts in multiple BAM files for variants in VCF/MAF files.
461
-
462
- This tool counts the number of reference and alternate alleles at each variant
463
- position across multiple BAM files, with support for various quality filters
464
- and output formats.
58
+ Run gbcms on one or more BAM files.
465
59
  """
466
- # Setup logging
467
- # setup_logging(verbose)
468
-
469
- # Print banner
470
- console.print(
471
- Panel.fit(
472
- f"[bold cyan]py-gbcms[/bold cyan] v{__version__}\n"
473
- "Python implementation of GetBaseCountsMultiSample",
474
- border_style="cyan",
475
- )
60
+ import logging
61
+
62
+ from rich.console import Console
63
+ from rich.logging import RichHandler
64
+
65
+ # Configure logging
66
+ log_level = logging.DEBUG if verbose else logging.INFO
67
+ logging.basicConfig(
68
+ level=log_level,
69
+ format="%(message)s",
70
+ datefmt="[%X]",
71
+ handlers=[RichHandler(rich_tracebacks=True, markup=True)],
476
72
  )
477
73
 
478
- # Validate inputs
479
- if not bam and not bam_fof:
480
- console.print(
481
- "[red]Error:[/red] Please specify at least one BAM file with --bam or --bam-fof",
482
- )
483
- raise typer.Exit(1)
74
+ console = Console()
484
75
 
485
- if not maf and not vcf:
486
- console.print(
487
- "[red]Error:[/red] Please specify at least one variant file with --maf or --vcf",
488
- )
489
- raise typer.Exit(1)
76
+ # Map BAMs to sample names (filename stem for now)
77
+ bams_dict = {}
490
78
 
491
- if maf and vcf:
79
+ # 1. Process direct BAM arguments
80
+ if bam_files:
81
+ for bam_arg in bam_files:
82
+ # Check for sample_id:path format
83
+ bam_str = str(bam_arg)
84
+ if ":" in bam_str:
85
+ parts = bam_str.split(":", 1)
86
+ sample_name = parts[0]
87
+ bam_path = Path(parts[1])
88
+ else:
89
+ bam_path = bam_arg
90
+ sample_name = bam_path.stem
91
+
92
+ if not bam_path.exists():
93
+ console.print(f"[bold red]Error: BAM file not found: {bam_path}[/bold red]")
94
+ raise typer.Exit(code=1)
95
+
96
+ bams_dict[sample_name] = bam_path
97
+
98
+ # 2. Process BAM list file
99
+ if bam_list:
100
+ if not bam_list.exists():
101
+ console.print(f"[bold red]Error: BAM list file not found: {bam_list}[/bold red]")
102
+ raise typer.Exit(code=1)
103
+
104
+ try:
105
+ with open(bam_list) as f:
106
+ for line in f:
107
+ line = line.strip()
108
+ if not line or line.startswith("#"):
109
+ continue
110
+ # Check for 2 columns (sample_id path)
111
+ parts = line.split()
112
+ if len(parts) >= 2:
113
+ sample_name = parts[0]
114
+ bam_path = Path(parts[1])
115
+ else:
116
+ bam_path = Path(parts[0])
117
+ sample_name = bam_path.stem
118
+
119
+ if not bam_path.exists():
120
+ console.print(
121
+ f"[yellow]Warning: BAM file from list not found: {bam_path}[/yellow]"
122
+ )
123
+ continue
124
+ bams_dict[sample_name] = bam_path
125
+ except Exception as e:
126
+ console.print(f"[bold red]Error reading BAM list file {bam_list}: {e}[/bold red]")
127
+ raise typer.Exit(code=1) from e
128
+
129
+ if not bams_dict:
492
130
  console.print(
493
- "[red]Error:[/red] --maf and --vcf are mutually exclusive",
131
+ "[bold red]Error: No valid BAM files provided via --bam or --bam-list[/bold red]"
494
132
  )
495
- raise typer.Exit(1)
496
-
497
- # Parse BAM files
498
- bam_files = {}
499
-
500
- if bam:
501
- for bam_string in bam:
502
- sample_name, bam_path = parse_bam_file(bam_string)
503
- if sample_name in bam_files:
504
- console.print(
505
- f"[red]Error:[/red] Duplicate sample name: {sample_name}",
506
- )
507
- raise typer.Exit(1)
508
- bam_files[sample_name] = bam_path
509
-
510
- if bam_fof:
511
- fof_bams = load_bam_fof(str(bam_fof))
512
- for sample_name, bam_path in fof_bams.items():
513
- if sample_name in bam_files:
514
- console.print(
515
- f"[red]Error:[/red] Duplicate sample name: {sample_name}",
516
- )
517
- raise typer.Exit(1)
518
- bam_files[sample_name] = bam_path
519
-
520
- # Parse variant files
521
- variant_files = []
522
- input_is_maf = False
523
- input_is_vcf = False
524
-
525
- if maf:
526
- variant_files = [str(f) for f in maf]
527
- input_is_maf = True
528
-
529
- if vcf:
530
- variant_files = [str(f) for f in vcf]
531
- input_is_vcf = True
532
-
533
- # Validate input files before processing
534
- if not validate_input_files(fasta, bam_files, variant_files, input_is_maf, input_is_vcf)[0]:
535
- raise typer.Exit(1)
536
-
537
- # Display configuration
538
- config_table = Table(title="Configuration", show_header=False, border_style="cyan")
539
- config_table.add_column("Parameter", style="cyan")
540
- config_table.add_column("Value", style="green")
541
-
542
- config_table.add_row("Reference FASTA", str(fasta))
543
- config_table.add_row("Number of BAM files", str(len(bam_files)))
544
- config_table.add_row("Number of variant files", str(len(variant_files)))
545
- config_table.add_row("Input format", "MAF" if input_is_maf else "VCF")
546
- config_table.add_row("Output file", str(output))
547
- config_table.add_row("Threads", str(thread))
548
- config_table.add_row("Backend", backend)
549
- config_table.add_row("Mapping quality threshold", str(maq))
550
- config_table.add_row("Base quality threshold", str(baq))
551
-
552
- console.print(config_table)
553
- console.print()
133
+ raise typer.Exit(code=1)
554
134
 
555
135
  try:
556
- # Create configuration using legacy Config format (processor expects this)
557
- config = Config(
558
- fasta_file=str(fasta),
559
- bam_files=bam_files,
560
- variant_files=variant_files,
561
- output_file=str(output),
562
- mapping_quality_threshold=maq,
563
- base_quality_threshold=baq,
564
- filter_duplicate=filter_duplicate,
565
- filter_improper_pair=filter_improper_pair,
136
+ config = GbcmsConfig(
137
+ variant_file=variant_file,
138
+ bam_files=bams_dict,
139
+ reference_fasta=reference,
140
+ output_dir=output_dir,
141
+ output_format=output_format,
142
+ output_suffix=output_suffix,
143
+ min_mapping_quality=min_mapq,
144
+ min_base_quality=min_baseq,
145
+ filter_duplicates=filter_duplicates,
146
+ filter_secondary=filter_secondary,
147
+ filter_supplementary=filter_supplementary,
566
148
  filter_qc_failed=filter_qc_failed,
149
+ filter_improper_pair=filter_improper_pair,
567
150
  filter_indel=filter_indel,
568
- filter_non_primary=filter_non_primary,
569
- output_positive_count=positive_count,
570
- output_negative_count=negative_count,
571
- output_fragment_count=fragment_count,
572
- fragment_fractional_weight=fragment_fractional_weight,
573
- max_block_size=max_block_size,
574
- max_block_dist=max_block_dist,
575
- num_threads=thread,
576
- backend=backend,
577
- input_is_maf=input_is_maf,
578
- input_is_vcf=input_is_vcf,
579
- output_maf=omaf,
580
- generic_counting=generic_counting,
581
- max_warning_per_type=suppress_warning,
151
+ threads=threads,
582
152
  )
583
153
 
584
- # Process variants
585
- processor = VariantProcessor(config)
586
- processor.process()
587
-
588
- # Success message
589
- console.print()
590
- console.print(
591
- Panel.fit(
592
- "[bold green]✓[/bold green] Processing completed successfully!",
593
- border_style="green",
594
- )
595
- )
154
+ pipeline = Pipeline(config)
155
+ pipeline.run()
596
156
 
597
157
  except Exception as e:
598
- console.print()
599
- console.print(
600
- Panel.fit(
601
- f"[bold red]✗[/bold red] Error: {str(e)}",
602
- border_style="red",
603
- )
604
- )
605
- if verbose:
606
- console.print_exception()
607
- raise typer.Exit(1) from e
608
-
609
-
610
- @validate_app.command(name="files", help="Validate input files")
611
- def validate_files(
612
- fasta: Annotated[
613
- Path | None,
614
- typer.Option(
615
- "--fasta",
616
- "-f",
617
- help="Reference FASTA file to validate",
618
- rich_help_panel="Files to Validate",
619
- ),
620
- ] = None,
621
- bam: Annotated[
622
- list[str] | None,
623
- typer.Option(
624
- "--bam",
625
- "-b",
626
- help="BAM files to validate (SAMPLE:PATH format)",
627
- rich_help_panel="Files to Validate",
628
- ),
629
- ] = None,
630
- vcf: Annotated[
631
- list[Path] | None,
632
- typer.Option(
633
- "--vcf",
634
- help="VCF files to validate",
635
- rich_help_panel="Files to Validate",
636
- ),
637
- ] = None,
638
- maf: Annotated[
639
- list[Path] | None,
640
- typer.Option(
641
- "--maf",
642
- help="MAF files to validate",
643
- rich_help_panel="Files to Validate",
644
- ),
645
- ] = None,
646
- ) -> None:
647
- """
648
- Validate input files for gbcms.
649
-
650
- Checks:
651
- - File existence
652
- - Required indices (.fai for FASTA, .bai for BAM)
653
- - File format validity
654
- - Chromosome name consistency
655
- """
656
-
657
- console.print(
658
- Panel.fit(
659
- "[bold cyan]File Validation[/bold cyan]\n" "Checking input files for gbcms",
660
- border_style="cyan",
661
- )
662
- )
663
-
664
- results = Table(title="Validation Results", show_header=True, header_style="bold cyan")
665
- results.add_column("File Type", style="cyan")
666
- results.add_column("File Path", style="white")
667
- results.add_column("Status", style="white")
668
- results.add_column("Details", style="yellow")
669
-
670
- # Parse BAM files if provided
671
- bam_files = {}
672
- if bam:
673
- for bam_string in bam:
674
- sample_name, bam_path = parse_bam_file(bam_string)
675
- bam_files[sample_name] = bam_path
676
-
677
- # Parse variant files
678
- variant_files = []
679
- input_is_maf = False
680
- input_is_vcf = False
681
-
682
- if maf:
683
- variant_files = [str(f) for f in maf]
684
- input_is_maf = True
685
-
686
- if vcf:
687
- variant_files = [str(f) for f in vcf]
688
- input_is_vcf = True
689
-
690
- # Use the unified validation function with rich output
691
- # Note: fasta can be None, but if it is, the validation will handle it appropriately
692
- is_valid, results_table = validate_input_files(
693
- fasta or Path(""), bam_files, variant_files, input_is_maf, input_is_vcf, rich_output=True
694
- )
695
-
696
- # Handle results based on validation outcome
697
- if is_valid:
698
- console.print(
699
- Panel.fit(
700
- "[bold green]✓[/bold green] All files validated successfully!",
701
- border_style="green",
702
- )
703
- )
704
- raise typer.Exit(0)
705
- else:
706
- console.print(
707
- Panel.fit(
708
- "[bold red]✗[/bold red] Some files failed validation",
709
- border_style="red",
710
- )
711
- )
712
- raise typer.Exit(1)
713
-
714
-
715
- @app.command(name="info", help="Show information about gbcms")
716
- def show_info() -> None:
717
- """Display information about gbcms capabilities."""
718
- info_table = Table(title="gbcms Information", show_header=False, border_style="cyan")
719
- info_table.add_column("Category", style="bold cyan")
720
- info_table.add_column("Details", style="white")
721
-
722
- info_table.add_row("Version", __version__)
723
- info_table.add_row("Supported Input", "VCF, MAF")
724
- info_table.add_row("Supported Output", "VCF-like, MAF, Fillout")
725
- info_table.add_row("Variant Types", "SNP, DNP, Insertion, Deletion")
726
- info_table.add_row(
727
- "Quality Filters", "Mapping quality, Base quality, Duplicates, QC failed, etc."
728
- )
729
- info_table.add_row("Counting Methods", "DMP (default), Generic")
730
- info_table.add_row("Parallelization", "Multi-threaded with configurable threads")
731
- info_table.add_row("Dependencies", "pysam, numpy, typer, rich")
732
-
733
- console.print(info_table)
734
- console.print()
735
-
736
- console.print("[bold cyan]Example Usage:[/bold cyan]")
737
- console.print(
738
- " gbcms count run --fasta ref.fa --bam s1:s1.bam --vcf vars.vcf --output out.txt"
739
- )
740
- console.print(" gbcms validate files --fasta ref.fa --bam s1:s1.bam")
741
- console.print(" gbcms version")
158
+ console.print(f"[bold red]Error: {e}[/bold red]")
159
+ raise typer.Exit(code=1) from e
742
160
 
743
161
 
744
162
  if __name__ == "__main__":