py-gbcms 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +13 -0
- gbcms/cli.py +745 -0
- gbcms/config.py +98 -0
- gbcms/counter.py +1074 -0
- gbcms/models.py +295 -0
- gbcms/numba_counter.py +394 -0
- gbcms/output.py +573 -0
- gbcms/parallel.py +129 -0
- gbcms/processor.py +293 -0
- gbcms/reference.py +86 -0
- gbcms/variant.py +390 -0
- py_gbcms-2.0.0.dist-info/METADATA +506 -0
- py_gbcms-2.0.0.dist-info/RECORD +16 -0
- py_gbcms-2.0.0.dist-info/WHEEL +4 -0
- py_gbcms-2.0.0.dist-info/entry_points.txt +2 -0
- py_gbcms-2.0.0.dist-info/licenses/LICENSE +664 -0
gbcms/config.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Configuration classes and enums for GetBaseCounts."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from enum import IntEnum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CountType(IntEnum):
    """Integer identifiers for the kinds of counts that are tracked.

    The members fall into three groups of (total, reference, alternate)
    depth: overall, positive strand, and fragment level. Values are
    consecutive integers starting at 0.
    """

    # Overall depth
    DP = 0  # total
    RD = 1  # reference
    AD = 2  # alternate

    # Positive strand depth
    DPP = 3  # total
    RDP = 4  # reference
    ADP = 5  # alternate

    # Fragment depth
    DPF = 6  # total
    RDF = 7  # reference
    ADF = 8  # alternate
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Config:
    """Configuration for base counting.

    Creating an instance validates it immediately (see ``__post_init__``):
    every input path is checked on the filesystem and the option
    combinations are checked for consistency, so construction can raise.
    """

    # Required inputs / outputs
    fasta_file: str  # reference FASTA; a "<fasta_file>.fai" index must exist
    bam_files: dict[str, str]  # sample_name -> bam_path
    variant_files: list[str]
    output_file: str

    # Optional parameters
    mapping_quality_threshold: int = 20
    base_quality_threshold: int = 0
    filter_duplicate: bool = True
    filter_improper_pair: bool = False
    filter_qc_failed: bool = False
    filter_indel: bool = False
    filter_non_primary: bool = False
    output_positive_count: bool = True
    output_negative_count: bool = False
    output_fragment_count: bool = False
    fragment_fractional_weight: bool = False
    max_block_size: int = 10000
    max_block_dist: int = 100000
    num_threads: int = 1
    backend: str = "joblib"  # Parallelization backend
    input_is_maf: bool = False  # exactly one of input_is_maf / input_is_vcf must be set
    input_is_vcf: bool = False
    output_maf: bool = False  # only valid together with MAF input
    generic_counting: bool = False
    max_warning_per_type: int = 3

    @staticmethod
    def _bam_index_candidates(bam_path: str) -> tuple[str, str]:
        """Return the two index locations accepted for *bam_path*.

        The first swaps the final extension for ``.bai`` (``x.bam`` ->
        ``x.bai``); the second appends ``.bai`` (``x.bam`` -> ``x.bam.bai``).
        Only the extension of the last path component is touched, so a
        directory name containing ``.bam`` is left alone, and a path without
        a ``.bam`` extension never yields the alignment file itself as its
        own "index".
        """
        stem, _ext = os.path.splitext(bam_path)
        return f"{stem}.bai", f"{bam_path}.bai"

    def __post_init__(self) -> None:
        """Validate configuration.

        Checks run in this order: reference FASTA and its index, each BAM
        file and its index, each variant file, then the option values.

        Raises:
            FileNotFoundError: If the FASTA, its ``.fai`` index, a BAM file,
                a BAM index, or a variant file does not exist.
            ValueError: If the MAF/VCF input flags or the MAF output flag are
                inconsistent, or if ``num_threads``, ``max_block_size`` or
                ``max_block_dist`` is smaller than 1.
        """
        if not os.path.exists(self.fasta_file):
            raise FileNotFoundError(f"Reference FASTA file not found: {self.fasta_file}")

        fai_file = f"{self.fasta_file}.fai"
        if not os.path.exists(fai_file):
            raise FileNotFoundError(
                f"Reference FASTA index not found: {fai_file}. "
                f"Please index with: samtools faidx {self.fasta_file}"
            )

        for sample, bam_path in self.bam_files.items():
            if not os.path.exists(bam_path):
                raise FileNotFoundError(f"BAM file not found for sample {sample}: {bam_path}")

            # Check for BAM index. Either naming convention is accepted.
            index_candidates = self._bam_index_candidates(bam_path)
            if not any(os.path.exists(candidate) for candidate in index_candidates):
                raise FileNotFoundError(
                    f"BAM index not found for {bam_path}. "
                    f"Please index with: samtools index {bam_path}"
                )

        for variant_file in self.variant_files:
            if not os.path.exists(variant_file):
                raise FileNotFoundError(f"Variant file not found: {variant_file}")

        if self.input_is_maf and self.input_is_vcf:
            raise ValueError("--maf and --vcf are mutually exclusive")

        if not self.input_is_maf and not self.input_is_vcf:
            raise ValueError("Either --maf or --vcf must be specified")

        if self.input_is_vcf and self.output_maf:
            raise ValueError("--omaf can only be used with --maf input")

        if self.num_threads < 1:
            raise ValueError("Number of threads must be at least 1")

        if self.max_block_size < 1:
            raise ValueError("max_block_size must be at least 1")

        if self.max_block_dist < 1:
            raise ValueError("max_block_dist must be at least 1")
|