py-gbcms 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/config.py ADDED
@@ -0,0 +1,98 @@
1
+ """Configuration classes and enums for GetBaseCounts."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from enum import IntEnum
6
+
7
+
8
class CountType(IntEnum):
    """Integer identifiers for each kind of count.

    Members are numbered contiguously from 0 to 8 and come in three groups
    of three (depth, reference depth, alternate depth): overall,
    positive-strand, and fragment-level.
    """

    # Overall counts
    DP = 0  # total depth
    RD = 1  # reference depth
    AD = 2  # alternate depth

    # Positive-strand counts
    DPP = 3  # depth on the positive strand
    RDP = 4  # reference depth on the positive strand
    ADP = 5  # alternate depth on the positive strand

    # Fragment-level counts
    DPF = 6  # fragment depth
    RDF = 7  # fragment reference depth
    ADF = 8  # fragment alternate depth
20
+
21
+
22
@dataclass
class Config:
    """Configuration for base counting.

    The four leading fields are required; everything else has a default.
    Construction validates the configuration in ``__post_init__``: every
    referenced input file must exist on disk, the reference FASTA and each
    BAM must have an index next to them, exactly one of ``input_is_maf`` /
    ``input_is_vcf`` must be set, and the numeric limits must be positive.
    """

    fasta_file: str
    bam_files: dict[str, str]  # sample_name -> bam_path
    variant_files: list[str]
    output_file: str

    # Optional parameters
    mapping_quality_threshold: int = 20
    base_quality_threshold: int = 0
    filter_duplicate: bool = True
    filter_improper_pair: bool = False
    filter_qc_failed: bool = False
    filter_indel: bool = False
    filter_non_primary: bool = False
    output_positive_count: bool = True
    output_negative_count: bool = False
    output_fragment_count: bool = False
    fragment_fractional_weight: bool = False
    max_block_size: int = 10000
    max_block_dist: int = 100000
    num_threads: int = 1
    backend: str = "joblib"  # Parallelization backend
    input_is_maf: bool = False
    input_is_vcf: bool = False
    output_maf: bool = False
    generic_counting: bool = False
    max_warning_per_type: int = 3

    @staticmethod
    def _bam_index_candidates(bam_path: str) -> list[str]:
        """Return the index paths accepted for *bam_path*.

        ``<bam_path>.bai`` is always a candidate. When the file name ends in
        ``.bam`` the sibling ``<stem>.bai`` is accepted as well. Only the
        extension is swapped: a blanket ``str.replace`` would also rewrite
        ".bam" inside directory names, and would hand back the alignment
        file itself when the path contains no ".bam" at all.
        """
        candidates = [f"{bam_path}.bai"]
        stem, extension = os.path.splitext(bam_path)
        if extension == ".bam":
            candidates.append(f"{stem}.bai")
        return candidates

    def __post_init__(self) -> None:
        """Validate configuration.

        Checks run in a fixed order and stop at the first failure:
        reference FASTA and its ``.fai`` index, each BAM and its index, each
        variant file, the input/output format flags, then the numeric limits.

        Raises:
            FileNotFoundError: If the reference FASTA, its ``.fai`` index, a
                BAM file, a BAM index, or a variant file does not exist.
            ValueError: If both or neither of ``input_is_maf`` and
                ``input_is_vcf`` are set, if ``output_maf`` is combined with
                VCF input, or if ``num_threads``, ``max_block_size`` or
                ``max_block_dist`` is below 1.
        """
        if not os.path.exists(self.fasta_file):
            raise FileNotFoundError(f"Reference FASTA file not found: {self.fasta_file}")

        fai_file = f"{self.fasta_file}.fai"
        if not os.path.exists(fai_file):
            raise FileNotFoundError(
                f"Reference FASTA index not found: {fai_file}. "
                f"Please index with: samtools faidx {self.fasta_file}"
            )

        for sample, bam_path in self.bam_files.items():
            if not os.path.exists(bam_path):
                raise FileNotFoundError(f"BAM file not found for sample {sample}: {bam_path}")

            # Check for BAM index: either naming convention is accepted.
            index_candidates = self._bam_index_candidates(bam_path)
            if not any(os.path.exists(candidate) for candidate in index_candidates):
                raise FileNotFoundError(
                    f"BAM index not found for {bam_path}. "
                    f"Please index with: samtools index {bam_path}"
                )

        for variant_file in self.variant_files:
            if not os.path.exists(variant_file):
                raise FileNotFoundError(f"Variant file not found: {variant_file}")

        if self.input_is_maf and self.input_is_vcf:
            raise ValueError("--maf and --vcf are mutually exclusive")

        if not self.input_is_maf and not self.input_is_vcf:
            raise ValueError("Either --maf or --vcf must be specified")

        if self.input_is_vcf and self.output_maf:
            raise ValueError("--omaf can only be used with --maf input")

        if self.num_threads < 1:
            raise ValueError("Number of threads must be at least 1")

        if self.max_block_size < 1:
            raise ValueError("max_block_size must be at least 1")

        if self.max_block_dist < 1:
            raise ValueError("max_block_dist must be at least 1")