strvcf-annotator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ """Top-level package for strvcf_annotator.
2
+
3
+ STR (Short Tandem Repeat) annotation tool for VCF files.
4
+ Provides both library and CLI interfaces for annotating variants
5
+ that overlap with STR regions.
6
+ """
7
+
8
# Package metadata.
__author__ = "Olesia Kondrateva"
__email__ = "xkdnoa@gmail.com"
__version__ = "0.1.0"

# Re-export the public API at the package root. The ``.api`` import stays
# first, as in the original ordering.
from .api import STRAnnotator, annotate_vcf
from .core.repeat_utils import (
    apply_variant_to_repeat,
    count_repeat_units,
    extract_repeat_sequence,
    is_perfect_repeat,
)
from .core.str_reference import load_str_reference
from .parsers.base import BaseVCFParser
from .parsers.generic import GenericParser
from .utils.validation import ValidationError

# Names exported by ``from strvcf_annotator import *``.
__all__ = [
    # High-level annotation entry points
    "STRAnnotator",
    "annotate_vcf",
    # Genotype parsers
    "BaseVCFParser",
    "GenericParser",
    # Core helpers
    "load_str_reference",
    "extract_repeat_sequence",
    "count_repeat_units",
    "apply_variant_to_repeat",
    "is_perfect_repeat",
    # Exception types
    "ValidationError",
]
@@ -0,0 +1,281 @@
1
+ """Library API for programmatic access to STR annotation functionality."""
2
+
3
+ import pysam
4
+ from typing import Optional, Iterator
5
+ import logging
6
+
7
+ from .parsers.base import BaseVCFParser
8
+ from .parsers.generic import GenericParser
9
+ from .core.str_reference import load_str_reference
10
+ from .core.vcf_processor import (
11
+ generate_annotated_records,
12
+ annotate_vcf_to_file,
13
+ process_directory
14
+ )
15
+ from .utils.validation import (
16
+ validate_directory_path,
17
+ validate_vcf_file,
18
+ validate_str_bed_file
19
+ )
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class STRAnnotator:
    """High-level interface for annotating VCF files with STR information.

    Bundles an STR (Short Tandem Repeat) reference loaded from a BED file
    with a genotype parser, and offers single-file, streaming and
    whole-directory annotation on top of the core routines.

    Parameters
    ----------
    str_bed_path : str
        Path to the BED file describing STR regions.
    parser : BaseVCFParser, optional
        Parser used for genotype extraction. A ``GenericParser`` is created
        when None is given.
    somatic_mode : bool, optional
        Somatic filtering switch forwarded to the core routines. When True,
        variants where both samples (tumor/normal) have identical genotypes
        are skipped. Default is False.

    Attributes
    ----------
    str_bed_path : str
        Path the STR reference was loaded from.
    str_df : pd.DataFrame
        STR reference returned by ``load_str_reference``.
    parser : BaseVCFParser
        Parser used for genotype extraction.
    somatic_mode : bool
        Whether somatic filtering is enabled.

    Examples
    --------
    >>> annotator = STRAnnotator('str_regions.bed')
    >>> annotator.annotate_vcf_file('input.vcf', 'output.vcf')

    >>> # Whole directory at once
    >>> annotator.process_directory('input_dir/', 'output_dir/')

    >>> # Record-by-record streaming
    >>> vcf_in = pysam.VariantFile('input.vcf')
    >>> for record in annotator.annotate_vcf_stream(vcf_in):
    ...     print(record)
    """

    def __init__(
        self,
        str_bed_path: str,
        parser: Optional[BaseVCFParser] = None,
        somatic_mode: bool = False
    ):
        """Validate the BED file, load the STR reference and pick a parser.

        Parameters
        ----------
        str_bed_path : str
            Path to BED file with STR regions.
        parser : BaseVCFParser, optional
            Custom parser for genotype extraction; defaults to a new
            ``GenericParser``.
        somatic_mode : bool, optional
            Enable somatic filtering (skip variants where tumor==normal
            genotypes). Default is False.

        Raises
        ------
        ValidationError
            If the STR BED file does not pass ``validate_str_bed_file``.
        """
        # Validation runs first so a bad path fails before any loading.
        validate_str_bed_file(str_bed_path)
        reference = load_str_reference(str_bed_path)

        if parser is None:
            parser = GenericParser()

        self.str_bed_path = str_bed_path
        self.str_df = reference
        self.parser = parser
        self.somatic_mode = somatic_mode

        logger.info(f"Loaded {len(self.str_df)} STR regions from {str_bed_path}")

    def annotate_vcf_file(self, input_path: str, output_path: str) -> None:
        """Annotate one VCF file and write the result to ``output_path``.

        The input is validated, then handed to the core
        ``annotate_vcf_to_file`` routine together with the loaded STR
        reference, the parser and the somatic-mode flag.

        Parameters
        ----------
        input_path : str
            Path to input VCF file.
        output_path : str
            Path to output VCF file.

        Raises
        ------
        ValidationError
            If the input VCF file does not pass ``validate_vcf_file``.

        Examples
        --------
        >>> annotator = STRAnnotator('str_regions.bed')
        >>> annotator.annotate_vcf_file('input.vcf', 'output.vcf')
        """
        validate_vcf_file(input_path)

        logger.info(f"Annotating {input_path}...")
        annotate_vcf_to_file(
            input_path, self.str_df, output_path, self.parser,
            somatic_mode=self.somatic_mode,
        )
        logger.info(f"Wrote annotated VCF to {output_path}")

    def annotate_vcf_stream(self, vcf_in: pysam.VariantFile) -> Iterator[pysam.VariantRecord]:
        """Lazily yield annotated records from an already open VCF.

        Intended for streaming or custom workflows where the caller wants
        to consume records one at a time.

        Parameters
        ----------
        vcf_in : pysam.VariantFile
            Open VCF file object.

        Yields
        ------
        pysam.VariantRecord
            Records produced by the core ``generate_annotated_records``.

        Examples
        --------
        >>> annotator = STRAnnotator('str_regions.bed')
        >>> vcf_in = pysam.VariantFile('input.vcf')
        >>> for record in annotator.annotate_vcf_stream(vcf_in):
        ...     # Work with each record
        ...     print(record.info['RU'])
        """
        annotated = generate_annotated_records(
            vcf_in, self.str_df, self.parser, somatic_mode=self.somatic_mode
        )
        yield from annotated

    def process_directory(self, input_dir: str, output_dir: str) -> None:
        """Annotate the VCF files of a directory in one call.

        Both directories are validated first (the output directory is
        validated with ``create=True``); the per-file work is delegated to
        the core ``process_directory`` routine.

        Parameters
        ----------
        input_dir : str
            Directory containing input VCF files.
        output_dir : str
            Directory for output VCF files.

        Raises
        ------
        ValidationError
            If a directory does not pass ``validate_directory_path``.

        Examples
        --------
        >>> annotator = STRAnnotator('str_regions.bed')
        >>> annotator.process_directory('vcf_files/', 'annotated_vcfs/')
        """
        validate_directory_path(input_dir, must_exist=True)
        validate_directory_path(output_dir, must_exist=False, create=True)

        logger.info(f"Processing VCF files in {input_dir}...")
        # The core routine receives the BED path, not the loaded reference.
        process_directory(
            input_dir, self.str_bed_path, output_dir, self.parser,
            somatic_mode=self.somatic_mode,
        )
        logger.info(f"Batch processing complete. Output in {output_dir}")

    def get_str_at_position(self, chrom: str, pos: int) -> Optional[dict]:
        """Look up the STR region at a genomic position.

        Parameters
        ----------
        chrom : str
            Chromosome name.
        pos : int
            Genomic position (1-based).

        Returns
        -------
        Optional[dict]
            Whatever the core ``get_str_at_position`` lookup returns for
            the loaded reference: STR region data, or None when the
            position is not inside an STR.

        Examples
        --------
        >>> annotator = STRAnnotator('str_regions.bed')
        >>> str_region = annotator.get_str_at_position('chr1', 1000000)
        >>> if str_region:
        ...     print(f"Repeat unit: {str_region['RU']}")
        """
        # Imported here, as in the original, instead of at module level.
        from .core.str_reference import get_str_at_position as lookup_region
        return lookup_region(self.str_df, chrom, pos)

    def get_statistics(self) -> dict:
        """Summarise the loaded STR reference.

        Returns
        -------
        dict
            Keys ``total_regions``, ``chromosomes``, ``unique_repeat_units``,
            ``period_distribution``, ``mean_repeat_count`` and
            ``median_repeat_count``, computed from the ``CHROM``, ``RU``,
            ``PERIOD`` and ``COUNT`` columns of ``str_df``.

        Examples
        --------
        >>> annotator = STRAnnotator('str_regions.bed')
        >>> stats = annotator.get_statistics()
        >>> print(f"Total STR regions: {stats['total_regions']}")
        """
        frame = self.str_df
        region_total = len(frame)
        chromosome_total = frame['CHROM'].nunique()
        unit_total = frame['RU'].nunique()
        period_histogram = frame['PERIOD'].value_counts().to_dict()
        repeat_counts = frame['COUNT']

        return {
            'total_regions': region_total,
            'chromosomes': chromosome_total,
            'unique_repeat_units': unit_total,
            'period_distribution': period_histogram,
            'mean_repeat_count': repeat_counts.mean(),
            'median_repeat_count': repeat_counts.median(),
        }
256
+
257
+
258
def annotate_vcf(input_vcf: str, str_bed: str, output_vcf: str,
                 parser: Optional[BaseVCFParser] = None,
                 somatic_mode: bool = False) -> None:
    """Convenience function for single VCF annotation.

    Builds a one-off ``STRAnnotator`` from ``str_bed`` and uses it to
    annotate ``input_vcf`` into ``output_vcf``.

    Parameters
    ----------
    input_vcf : str
        Path to input VCF file
    str_bed : str
        Path to STR BED file
    output_vcf : str
        Path to output VCF file
    parser : BaseVCFParser, optional
        Custom parser for genotype extraction
    somatic_mode : bool, optional
        Enable somatic filtering (skip variants where both samples have
        identical genotypes). Forwarded to ``STRAnnotator``; default is
        False, which matches the previous behaviour of this function.

    Raises
    ------
    ValidationError
        Propagated from ``STRAnnotator`` when the BED file or the input
        VCF fails validation.

    Examples
    --------
    >>> from strvcf_annotator import annotate_vcf
    >>> annotate_vcf('input.vcf', 'str_regions.bed', 'output.vcf')

    >>> # Tumor/normal VCF with somatic filtering
    >>> annotate_vcf('somatic.vcf', 'str_regions.bed', 'output.vcf',
    ...              somatic_mode=True)
    """
    annotator = STRAnnotator(str_bed, parser, somatic_mode=somatic_mode)
    annotator.annotate_vcf_file(input_vcf, output_vcf)
@@ -0,0 +1,191 @@
1
+ """Console script for strvcf_annotator."""
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+
7
+ from . import __version__
8
+ from .api import STRAnnotator
9
+ from .utils.validation import ValidationError
10
+
11
+
12
def setup_logging(verbose: bool = False):
    """Set up root logging for command-line use.

    Parameters
    ----------
    verbose : bool
        Select DEBUG as the logging level when True, INFO otherwise.
    """
    if verbose:
        threshold = logging.DEBUG
    else:
        threshold = logging.INFO

    settings = {
        'level': threshold,
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'datefmt': '%Y-%m-%d %H:%M:%S',
    }
    logging.basicConfig(**settings)
26
+
27
+
28
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the annotator.

    Returns
    -------
    argparse.ArgumentParser
        Parser with a required, mutually exclusive input choice
        (``--input`` / ``--input-dir``), a required, mutually exclusive
        output choice (``--output`` / ``--output-dir``), the required
        ``--str-bed`` option, and the ``--verbose``/``-v``,
        ``--somatic-mode`` and ``--version`` flags.
    """
    # NOTE(review): the leading whitespace of the lines inside this literal
    # could not be recovered from the diff view - confirm against the
    # released file before merging.
    usage_examples = """
Examples:
  # Annotate single VCF file
  strvcf-annotator --input input.vcf --str-bed repeats.bed --output output.vcf

  # Batch process directory
  strvcf-annotator --input-dir vcf_files/ --str-bed repeats.bed --output-dir annotated/

  # Enable verbose logging
  strvcf-annotator --input input.vcf --str-bed repeats.bed --output output.vcf --verbose

  # Somatic mode (filter variants where tumor==normal genotypes)
  strvcf-annotator --input somatic.vcf --str-bed repeats.bed --output output.vcf --somatic-mode
        """

    cli = argparse.ArgumentParser(
        description="Annotate STR regions in VCF files using a BED file",
        # Raw formatter keeps the line breaks of the examples above.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Input: a single file or a directory, never both.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument('--input', type=str, help='Path to input VCF file')
    source.add_argument('--input-dir', type=str, help='Directory containing input VCF files')

    # STR reference is always required.
    cli.add_argument(
        '--str-bed',
        type=str,
        required=True,
        help='Path to BED file with STR regions (CHROM, START, END, PERIOD, RU)',
    )

    # Output: a single file or a directory, never both.
    destination = cli.add_mutually_exclusive_group(required=True)
    destination.add_argument(
        '--output', type=str, help='Path to output VCF file (for single file mode)'
    )
    destination.add_argument(
        '--output-dir', type=str, help='Directory for output VCF files (for batch mode)'
    )

    # Optional switches.
    cli.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging')
    cli.add_argument(
        '--somatic-mode',
        action='store_true',
        help='Enable somatic filtering: skip variants where both samples have identical genotypes',
    )
    cli.add_argument('--version', action='version', version=f'%(prog)s {__version__}')

    return cli
110
+
111
+
112
def validate_args(args: argparse.Namespace) -> None:
    """Check that input and output options were given in matching pairs.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.

    Raises
    ------
    ValidationError
        If a file option is used without its file counterpart, or a
        directory option without its directory counterpart.
    """
    # (option that was given, option it needs, error message); checked in
    # this order, and attributes are only read when their rule is reached.
    pairing_rules = (
        ('input', 'output', "--input requires --output"),
        ('input_dir', 'output_dir', "--input-dir requires --output-dir"),
        ('output', 'input', "--output requires --input"),
        ('output_dir', 'input_dir', "--output-dir requires --input-dir"),
    )

    for given, needed, message in pairing_rules:
        if getattr(args, given) and not getattr(args, needed):
            raise ValidationError(message)
137
+
138
+
139
def main(argv=None):
    """CLI entry point with argument parsing and validation.

    Parameters
    ----------
    argv : sequence of str, optional
        Command-line arguments to parse, without the program name. When
        None (the default) argparse reads them from ``sys.argv``, so the
        installed console script keeps working unchanged; passing a list
        allows the CLI to be driven programmatically.

    Returns
    -------
    int
        0 when annotation finished, 1 when an error was caught and logged.

    Notes
    -----
    argparse itself terminates the process (``SystemExit``) for invalid
    options, ``--help`` and ``--version``; those cases never reach the
    error handling below.
    """
    parser = create_parser()
    args = parser.parse_args(argv)

    # Setup logging
    setup_logging(args.verbose)
    logger = logging.getLogger(__name__)

    try:
        # Validate arguments
        validate_args(args)

        # Create annotator; --somatic-mode is always defined by the parser.
        logger.info("Initializing STR annotator...")
        annotator = STRAnnotator(args.str_bed, somatic_mode=args.somatic_mode)

        # Display statistics
        stats = annotator.get_statistics()
        logger.info(f"Loaded {stats['total_regions']} STR regions from {stats['chromosomes']} chromosomes")

        # Process based on mode
        if args.input:
            # Single file mode
            logger.info(f"Processing single file: {args.input}")
            annotator.annotate_vcf_file(args.input, args.output)
            logger.info(f"Successfully wrote annotated VCF to {args.output}")

        elif args.input_dir:
            # Batch directory mode
            logger.info(f"Processing directory: {args.input_dir}")
            annotator.process_directory(args.input_dir, args.output_dir)
            logger.info(f"Successfully processed all VCF files to {args.output_dir}")

        logger.info("Annotation complete!")
        return 0

    except ValidationError as e:
        logger.error(f"Validation error: {e}")
        return 1

    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        return 1

    except Exception as e:
        # Top-level boundary: log instead of crashing; the traceback is
        # only included when --verbose was requested.
        logger.error(f"Unexpected error: {e}", exc_info=args.verbose)
        return 1


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,25 @@
1
+ """Core modules for STR annotation functionality."""
2
+
3
# Re-export the core helpers so they can be imported from ``.core`` directly.
# ``get_str_at_position``, ``is_perfect_repeat`` and ``process_directory``
# are imported from these same submodules elsewhere in the package, so they
# are exposed here as well to keep the ``core`` namespace complete.
from .str_reference import load_str_reference, get_str_at_position
from .repeat_utils import (
    extract_repeat_sequence,
    count_repeat_units,
    apply_variant_to_repeat,
    is_perfect_repeat,
)
from .annotation import make_modified_header, build_new_record, should_skip_genotype
from .vcf_processor import (
    check_vcf_sorted,
    reset_and_sort_vcf,
    generate_annotated_records,
    annotate_vcf_to_file,
    process_directory,
)

__all__ = [
    # STR reference
    'load_str_reference',
    'get_str_at_position',
    # Repeat utilities
    'extract_repeat_sequence',
    'count_repeat_units',
    'apply_variant_to_repeat',
    'is_perfect_repeat',
    # Record annotation
    'make_modified_header',
    'build_new_record',
    'should_skip_genotype',
    # VCF processing
    'check_vcf_sorted',
    'reset_and_sort_vcf',
    'generate_annotated_records',
    'annotate_vcf_to_file',
    'process_directory',
]
+ ]