py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/parallel.py DELETED
@@ -1,129 +0,0 @@
- """Parallel processing with joblib backend only."""
-
- import logging
- import os
- from collections.abc import Callable
- from typing import Any
-
- from joblib import Parallel, delayed
- from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
-
- logger = logging.getLogger(__name__)
-
-
- class ParallelProcessor:
-     """Unified interface for parallel processing with joblib."""
-
-     def __init__(
-         self,
-         n_jobs: int = -1,
-         backend: str = "joblib",
-         verbose: int = 0,
-     ):
-         """
-         Initialize parallel processor.
-
-         Args:
-             n_jobs: Number of parallel jobs (-1 for all CPUs)
-             backend: Backend to use ('joblib', 'threading', 'multiprocessing')
-             verbose: Verbosity level
-         """
-         self.n_jobs = n_jobs if n_jobs > 0 else os.cpu_count()
-         self.backend = backend
-         self.verbose = verbose
-
-         # Map user-friendly backend names to joblib backends
-         backend_map = {
-             "joblib": "loky",  # Robust joblib backend
-             "threading": "threading",  # Pure threading
-             "multiprocessing": "multiprocessing",  # Process-based
-             "loky": "loky",  # Explicit loky
-         }
-         self.joblib_backend = backend_map.get(backend, "loky")
-
-         logger.debug(
-             f"Initialized parallel processor with {self.n_jobs} jobs using {backend} -> {self.joblib_backend} backend"
-         )
-
-     def map(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """
-         Apply function to each item in parallel.
-
-         Args:
-             func: Function to apply
-             items: List of items to process
-             description: Progress description
-             show_progress: Whether to show progress bar
-
-         Returns:
-             List of results
-         """
-         return self._map_joblib(func, items, description, show_progress)
-
-     def _map_joblib(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """Map using joblib."""
-         if show_progress and len(items) > 10:
-             # Use progress bar for larger workloads
-             progress_columns = [
-                 TextColumn("[bold blue]{task.description}"),
-                 BarColumn(),
-                 TaskProgressColumn(),
-                 TextColumn("({task.completed}/{task.total})"),
-             ]
-
-             with Progress(*progress_columns, refresh_per_second=10) as progress:
-                 task = progress.add_task(description, total=len(items))
-
-                 def progress_wrapper(item):
-                     result = func(item)
-                     progress.update(task, advance=1)
-                     return result
-
-                 with Parallel(n_jobs=self.n_jobs, backend=self.joblib_backend) as parallel:
-                     return list(parallel(delayed(progress_wrapper)(item) for item in items))
-         else:
-             # Simple parallel execution
-             with Parallel(n_jobs=self.n_jobs, backend=self.joblib_backend) as parallel:
-                 return list(parallel(delayed(func)(item) for item in items))
-
-     def starmap(
-         self,
-         func: Callable,
-         items: list,
-         description: str = "Processing",
-         show_progress: bool = True,
-     ) -> list[Any]:
-         """
-         Apply function with arguments to each item in parallel.
-
-         Args:
-             func: Function to apply
-             items: List of argument tuples
-             description: Progress description
-             show_progress: Whether to show progress bar
-
-         Returns:
-             List of results
-         """
-
-         def wrapper(args):
-             return func(*args)
-
-         return self._map_joblib(wrapper, items, description, show_progress)
-
-     def shutdown(self):
-         """Shutdown parallel processing resources."""
-         # joblib handles cleanup automatically
-         pass
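For context, here is a minimal sketch of how the removed `ParallelProcessor` could be exercised under the 2.0.0 wheel; the `square` function and argument values are illustrative, and the class no longer exists in 2.1.1.

```python
# Illustrative only: drives the ParallelProcessor API deleted in 2.1.1.
# Assumes the 2.0.0 wheel (which still ships gbcms.parallel) is installed.
from gbcms.parallel import ParallelProcessor

def square(x: int) -> int:
    return x * x

proc = ParallelProcessor(n_jobs=4, backend="threading")
squares = proc.map(square, list(range(100)), description="Squaring")  # [0, 1, 4, ...]
powers = proc.starmap(pow, [(2, 3), (3, 2)])  # wrapper unpacks tuples -> [8, 9]
proc.shutdown()  # no-op; joblib cleans up automatically
```

The example pins `backend="threading"` deliberately: `_map_joblib`'s `progress_wrapper` closure updates a live `rich` progress bar, and with a process-based backend such as the default loky those updates would happen in worker processes and never reach the parent's display.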
gbcms/processor.py DELETED
@@ -1,293 +0,0 @@
- """Main processing logic for GetBaseCounts."""
-
- import logging
- from concurrent.futures import ThreadPoolExecutor, as_completed
-
- import pysam
- from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
-
- from .config import Config
- from .counter import BaseCounter
- from .output import OutputFormatter
- from .reference import ReferenceSequence
- from .variant import VariantEntry, VariantLoader
-
- logger = logging.getLogger(__name__)
-
-
- class VariantProcessor:
-     """Main processor for counting bases in variants."""
-
-     def __init__(self, config: Config):
-         """
-         Initialize variant processor.
-
-         Args:
-             config: Configuration object
-         """
-         self.config = config
-         self.reference = ReferenceSequence(config.fasta_file)
-         self.counter = BaseCounter(config)
-         self.sample_order = list(config.bam_files.keys())
-
-     def process(self) -> None:
-         """Main processing pipeline."""
-         # Load variants
-         loader = VariantLoader(reference_getter=self.reference.get_base)
-         variants = self._load_all_variants(loader)
-
-         if not variants:
-             logger.warning("No variants to process")
-             return
-
-         # Sort and index variants
-         variants = self._sort_and_index_variants(variants)
-
-         # Initialize counts for all samples
-         for variant in variants:
-             variant.initialize_counts(self.sample_order)
-
-         # Create variant blocks for parallel processing
-         variant_blocks = self._create_variant_blocks(variants)
-
-         logger.info(f"Created {len(variant_blocks)} variant blocks for processing")
-
-         # Process each BAM file
-         for sample_name, bam_path in self.config.bam_files.items():
-             self._process_bam_file(sample_name, bam_path, variants, variant_blocks)
-
-         # Write output
-         self._write_output(variants)
-
-         # Cleanup
-         self.reference.close()
-         logger.info("Finished processing")
-
-     def _load_all_variants(self, loader: VariantLoader) -> list[VariantEntry]:
-         """Load all variants from input files."""
-         all_variants = []
-
-         for variant_file in self.config.variant_files:
-             if self.config.input_is_maf:
-                 variants = loader.load_maf(variant_file)
-             else:
-                 variants = loader.load_vcf(variant_file)
-             all_variants.extend(variants)
-
-         logger.info(f"Total variants loaded: {len(all_variants)}")
-         return all_variants
-
-     def _sort_and_index_variants(self, variants: list[VariantEntry]) -> list[VariantEntry]:
-         """Sort variants and identify duplicates."""
-         logger.info("Sorting variants")
-         variants.sort()
-
-         logger.info("Indexing variants")
-         duplicate_map: dict[tuple, VariantEntry] = {}
-
-         for variant in variants:
-             key = variant.get_variant_key()
-             if key not in duplicate_map:
-                 duplicate_map[key] = variant
-             else:
-                 # Mark as duplicate
-                 variant.duplicate_variant_ptr = duplicate_map[key]
-
-         return variants
-
-     def _create_variant_blocks(self, variants: list[VariantEntry]) -> list[tuple[int, int]]:
-         """
-         Create blocks of variants for parallel processing.
-
-         Returns:
-             List of (start_index, end_index) tuples
-         """
-         if not variants:
-             return []
-
-         blocks = []
-         start_idx = 0
-         current_count = 0
-
-         for i in range(len(variants)):
-             current_count += 1
-
-             # Check if we should create a new block
-             should_break = False
-
-             if current_count >= self.config.max_block_size:
-                 should_break = True
-             elif i > start_idx:
-                 # Check chromosome change or distance
-                 if variants[i].chrom != variants[start_idx].chrom:
-                     should_break = True
-                 elif variants[i].pos - variants[start_idx].pos > self.config.max_block_dist:
-                     should_break = True
-
-             if should_break:
-                 blocks.append((start_idx, i - 1))
-                 start_idx = i
-                 current_count = 1
-
-         # Add final block
-         blocks.append((start_idx, len(variants) - 1))
-
-         return blocks
-
-     def _process_bam_file(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """
-         Process a single BAM file.
-
-         Args:
-             sample_name: Sample name
-             bam_path: Path to BAM file
-             variants: List of all variants
-             variant_blocks: List of variant block ranges
-         """
-         logger.info(f"Processing BAM file: {bam_path}")
-
-         if self.config.num_threads > 1:
-             self._process_bam_parallel(sample_name, bam_path, variants, variant_blocks)
-         else:
-             self._process_bam_sequential(sample_name, bam_path, variants, variant_blocks)
-
-     def _process_bam_sequential(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """Process BAM file sequentially."""
-         with pysam.AlignmentFile(bam_path, "rb") as bam:
-             with Progress(
-                 SpinnerColumn(),
-                 TextColumn("[progress.description]{task.description}"),
-                 BarColumn(),
-                 TaskProgressColumn(),
-             ) as progress:
-                 task = progress.add_task(
-                     f"[cyan]Processing {sample_name}...", total=len(variant_blocks)
-                 )
-
-                 for start_idx, end_idx in variant_blocks:
-                     self._process_variant_block(bam, sample_name, variants, start_idx, end_idx)
-                     progress.update(task, advance=1)
-
-     def _process_bam_parallel(
-         self,
-         sample_name: str,
-         bam_path: str,
-         variants: list[VariantEntry],
-         variant_blocks: list[tuple[int, int]],
-     ) -> None:
-         """Process BAM file in parallel."""
-         with Progress(
-             SpinnerColumn(),
-             TextColumn("[progress.description]{task.description}"),
-             BarColumn(),
-             TaskProgressColumn(),
-         ) as progress:
-             task = progress.add_task(
-                 f"[cyan]Processing {sample_name}...", total=len(variant_blocks)
-             )
-
-             with ThreadPoolExecutor(max_workers=self.config.num_threads) as executor:
-                 futures = []
-
-                 for start_idx, end_idx in variant_blocks:
-                     future = executor.submit(
-                         self._process_variant_block_thread_safe,
-                         bam_path,
-                         sample_name,
-                         variants,
-                         start_idx,
-                         end_idx,
-                     )
-                     futures.append(future)
-
-                 for future in as_completed(futures):
-                     future.result()  # Raise any exceptions
-                     progress.update(task, advance=1)
-
-     def _process_variant_block_thread_safe(
-         self,
-         bam_path: str,
-         sample_name: str,
-         variants: list[VariantEntry],
-         start_idx: int,
-         end_idx: int,
-     ) -> None:
-         """Process a variant block in a thread-safe manner."""
-         # Each thread opens its own BAM file handle
-         with pysam.AlignmentFile(bam_path, "rb") as bam:
-             self._process_variant_block(bam, sample_name, variants, start_idx, end_idx)
-
-     def _process_variant_block(
-         self,
-         bam: pysam.AlignmentFile,
-         sample_name: str,
-         variants: list[VariantEntry],
-         start_idx: int,
-         end_idx: int,
-     ) -> None:
-         """
-         Process a block of variants.
-
-         Args:
-             bam: Open BAM file handle
-             sample_name: Sample name
-             variants: List of all variants
-             start_idx: Start index in variants list
-             end_idx: End index in variants list
-         """
-         start_variant = variants[start_idx]
-         end_variant = variants[end_idx]
-
-         # Fetch alignments for the region
-         try:
-             alignments = list(
-                 bam.fetch(
-                     start_variant.chrom,
-                     start_variant.pos,
-                     end_variant.pos + 2,  # Buffer for indels
-                 )
-             )
-         except Exception as e:
-             logger.error(
-                 f"Error fetching alignments for region "
-                 f"{start_variant.chrom}:{start_variant.pos}-{end_variant.pos}: {e}"
-             )
-             return
-
-         # Filter alignments
-         filtered_alignments = [aln for aln in alignments if not self.counter.filter_alignment(aln)]
-
-         # Process each variant in the block
-         for i in range(start_idx, end_idx + 1):
-             variant = variants[i]
-
-             # Skip if this is a duplicate variant
-             if variant.duplicate_variant_ptr is not None:
-                 continue
-
-             # Count bases for this variant
-             self.counter.count_variant(variant, filtered_alignments, sample_name)
-
-     def _write_output(self, variants: list[VariantEntry]) -> None:
-         """Write output file."""
-         formatter = OutputFormatter(self.config, self.sample_order)
-
-         if self.config.input_is_maf:
-             if self.config.output_maf:
-                 formatter.write_maf_output(variants)
-             else:
-                 formatter.write_fillout_output(variants)
-         else:
-             formatter.write_vcf_output(variants)
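The most algorithmic piece of the removed module is `_create_variant_blocks`: sorted variants are grouped into fetch windows, and a block closes once it reaches `max_block_size` variants, crosses a chromosome boundary, or spans more than `max_block_dist` bases. The following is a standalone paraphrase of that rule; plain `(chrom, pos)` tuples stand in for `VariantEntry`, and the default limits are illustrative, not the package's actual config defaults.

```python
# Standalone sketch of the block-splitting heuristic from the removed
# VariantProcessor._create_variant_blocks; (chrom, pos) tuples stand in
# for VariantEntry objects, and the default limits are assumptions.
def create_blocks(
    variants: list[tuple[str, int]],
    max_block_size: int = 1000,
    max_block_dist: int = 10_000,
) -> list[tuple[int, int]]:
    if not variants:
        return []
    blocks: list[tuple[int, int]] = []
    start_idx, count = 0, 0
    for i, (chrom, pos) in enumerate(variants):
        count += 1
        # Close the current block on size overflow, a chromosome change,
        # or when the window grows wider than max_block_dist bases.
        if count >= max_block_size or (
            i > start_idx
            and (
                chrom != variants[start_idx][0]
                or pos - variants[start_idx][1] > max_block_dist
            )
        ):
            blocks.append((start_idx, i - 1))
            start_idx, count = i, 1
    blocks.append((start_idx, len(variants) - 1))  # final open block
    return blocks

# e.g. create_blocks([("chr1", 100), ("chr1", 200), ("chr2", 50)], max_block_size=2)
# -> [(0, 0), (1, 1), (2, 2)]
```

Note also the threading design in `_process_variant_block_thread_safe`: pysam file handles are not safe to share across threads, so each worker reopens the BAM rather than reusing a single `AlignmentFile`.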
gbcms/reference.py DELETED
@@ -1,86 +0,0 @@
- """Reference sequence handling."""
-
- import logging
-
- import pysam
-
- logger = logging.getLogger(__name__)
-
-
- class ReferenceSequence:
-     """Handles reference sequence loading and access."""
-
-     def __init__(self, fasta_file: str):
-         """
-         Initialize reference sequence handler.
-
-         Args:
-             fasta_file: Path to reference FASTA file (must be indexed)
-         """
-         self.fasta_file = fasta_file
-         self.fasta: pysam.FastaFile | None = None
-         self._load_reference()
-
-     def _load_reference(self) -> None:
-         """Load reference sequence using pysam."""
-         logger.info(f"Loading reference sequence: {self.fasta_file}")
-         try:
-             self.fasta = pysam.FastaFile(self.fasta_file)
-         except Exception as e:
-             logger.error(f"Failed to open reference FASTA file: {e}")
-             raise
-
-     def get_base(self, chrom: str, pos: int) -> str:
-         """
-         Get base at specific position (0-indexed).
-
-         Args:
-             chrom: Chromosome name
-             pos: 0-indexed position
-
-         Returns:
-             Base at position (uppercase)
-         """
-         if self.fasta is None:
-             raise RuntimeError("Reference FASTA not loaded")
-
-         try:
-             return self.fasta.fetch(chrom, pos, pos + 1).upper()
-         except Exception as e:
-             logger.error(f"Failed to fetch base at {chrom}:{pos}: {e}")
-             raise
-
-     def get_sequence(self, chrom: str, start: int, end: int) -> str:
-         """
-         Get sequence in range (0-indexed, end exclusive).
-
-         Args:
-             chrom: Chromosome name
-             start: Start position (0-indexed, inclusive)
-             end: End position (0-indexed, exclusive)
-
-         Returns:
-             Sequence in range (uppercase)
-         """
-         if self.fasta is None:
-             raise RuntimeError("Reference FASTA not loaded")
-
-         try:
-             return self.fasta.fetch(chrom, start, end).upper()
-         except Exception as e:
-             logger.error(f"Failed to fetch sequence at {chrom}:{start}-{end}: {e}")
-             raise
-
-     def close(self) -> None:
-         """Close the FASTA file."""
-         if self.fasta:
-             self.fasta.close()
-             self.fasta = None
-
-     def __enter__(self) -> "ReferenceSequence":
-         """Context manager entry."""
-         return self
-
-     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-         """Context manager exit."""
-         self.close()
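The removed `ReferenceSequence` is a thin wrapper over `pysam.FastaFile` that also acts as a context manager. A hedged usage sketch follows; "ref.fa" is a placeholder path for any faidx-indexed FASTA.

```python
# Illustrative only: gbcms.reference was removed in 2.1.1.
# "ref.fa" is a placeholder; pysam expects a matching "ref.fa.fai" index.
from gbcms.reference import ReferenceSequence

with ReferenceSequence("ref.fa") as ref:
    first_base = ref.get_base("chr1", 0)       # single uppercase base, 0-indexed
    seq = ref.get_sequence("chr1", 0, 10)      # half-open interval [0, 10)
# close() runs automatically when the with-block exits
```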