supremo-lite 0.5.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supremo_lite/__init__.py +6 -2
- supremo_lite/mutagenesis.py +352 -2
- supremo_lite/variant_utils.py +84 -10
- {supremo_lite-0.5.5.dist-info → supremo_lite-1.0.0.dist-info}/METADATA +1 -1
- {supremo_lite-0.5.5.dist-info → supremo_lite-1.0.0.dist-info}/RECORD +7 -7
- {supremo_lite-0.5.5.dist-info → supremo_lite-1.0.0.dist-info}/WHEEL +1 -1
- {supremo_lite-0.5.5.dist-info → supremo_lite-1.0.0.dist-info}/licenses/LICENSE +0 -0
supremo_lite/__init__.py
CHANGED
|
@@ -42,7 +42,11 @@ from .personalize import (
|
|
|
42
42
|
)
|
|
43
43
|
|
|
44
44
|
# Import mutagenesis functions
|
|
45
|
-
from .mutagenesis import
|
|
45
|
+
from .mutagenesis import (
|
|
46
|
+
get_sm_sequences,
|
|
47
|
+
get_sm_subsequences,
|
|
48
|
+
get_scrambled_subsequences,
|
|
49
|
+
)
|
|
46
50
|
|
|
47
51
|
# Import prediction alignment functions
|
|
48
52
|
from .prediction_alignment import align_predictions_by_coordinate
|
|
@@ -52,7 +56,7 @@ from .prediction_alignment import align_predictions_by_coordinate
|
|
|
52
56
|
# This allows users who don't have PyTorch to still use the main package
|
|
53
57
|
|
|
54
58
|
# Version
|
|
55
|
-
__version__ = "0.5.5"
|
|
59
|
+
__version__ = "1.0.0"
|
|
56
60
|
# Package metadata
|
|
57
61
|
__description__ = (
|
|
58
62
|
"A module for generating personalized genome sequences and in-silico mutagenesis"
|
supremo_lite/mutagenesis.py
CHANGED
|
@@ -19,6 +19,94 @@ except ImportError:
|
|
|
19
19
|
pass # Already handled in core
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
def _kmer_shuffle(sequence: str, k: int = 1, random_state=None) -> str:
    """
    Shuffle a sequence by permuting its non-overlapping k-mer chunks.

    The sequence is upper-cased, cut into consecutive chunks of length k
    (bases 0..k-1, k..2k-1, ...) and the chunks are randomly permuted. Each
    chunk is kept intact, so the result always has the same length and the
    same base (and therefore GC) composition as the input:
    - k=1: every base is its own chunk, i.e. a plain base-level shuffle
    - k>1: the multiset of non-overlapping chunks is preserved

    Counts of *overlapping* k-mers are not guaranteed to be preserved, because
    new k-mers are created wherever two chunks are joined.

    Note: If sequence length is not divisible by k, the remainder bases form
    one shorter chunk that is shuffled along with the complete k-mers.

    Args:
        sequence: Input DNA sequence string (ACGT only)
        k: Size of k-mers to shuffle (default: 1)
        random_state: None (fresh generator), an integer seed, or an existing
            numpy random generator object providing ``shuffle``

    Returns:
        Upper-cased shuffled sequence. A sequence shorter than k is returned
        upper-cased but otherwise unchanged.

    Raises:
        ValueError: If k < 1
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    seq = sequence.upper()

    # Fewer than k bases: there is nothing to permute. Return the upper-cased
    # form so the output casing does not depend on the input length.
    if len(seq) < k:
        return seq

    # Handle random state
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.default_rng(random_state)
    else:
        rng = random_state

    # Stepping by k yields every complete k-mer plus, when the length is not a
    # multiple of k, one trailing shorter chunk holding the leftover bases.
    chunks = [seq[i : i + k] for i in range(0, len(seq), k)]

    # In-place permutation of the chunk list (leftover chunk included)
    rng.shuffle(chunks)

    return "".join(chunks)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _scramble_region(
    sequence: str, start: int, end: int, k: int = 1, random_state=None
) -> str:
    """
    Return a copy of ``sequence`` in which only [start, end) has been shuffled.

    The bases before ``start`` and from ``end`` onwards are copied through
    untouched; the slice in between is handed to ``_kmer_shuffle``.

    Args:
        sequence: Full sequence string
        start: 0-based index of the first base to scramble
        end: 0-based index one past the last base to scramble (exclusive)
        k: Chunk size forwarded to the k-mer shuffle (default: 1, i.e. a
            base-level shuffle)
        random_state: Seed or generator forwarded to the k-mer shuffle

    Returns:
        The sequence with the requested region scrambled

    Raises:
        ValueError: If the region is empty, reversed, or extends outside the
            sequence
    """
    seq_length = len(sequence)

    # A usable region is non-empty and lies fully inside the sequence.
    region_is_valid = 0 <= start < end <= seq_length
    if not region_is_valid:
        raise ValueError(
            f"Invalid region [{start}, {end}) for sequence of length {len(sequence)}"
        )

    shuffled_middle = _kmer_shuffle(
        sequence[start:end], k=k, random_state=random_state
    )

    # Re-assemble: untouched head + shuffled middle + untouched tail
    return "".join((sequence[:start], shuffled_middle, sequence[end:]))
|
|
108
|
+
|
|
109
|
+
|
|
22
110
|
def _read_bed_file(bed_regions: Union[str, pd.DataFrame]) -> pd.DataFrame:
|
|
23
111
|
"""
|
|
24
112
|
Read BED file or validate BED DataFrame format.
|
|
@@ -147,7 +235,14 @@ def get_sm_sequences(chrom, start, end, reference_fasta, encoder=None):
|
|
|
147
235
|
# Create a DataFrame for the metadata
|
|
148
236
|
metadata_df = pd.DataFrame(
|
|
149
237
|
metadata,
|
|
150
|
-
columns=[
|
|
238
|
+
columns=[
|
|
239
|
+
"chrom",
|
|
240
|
+
"window_start",
|
|
241
|
+
"window_end",
|
|
242
|
+
"variant_offset0",
|
|
243
|
+
"ref",
|
|
244
|
+
"alt",
|
|
245
|
+
],
|
|
151
246
|
)
|
|
152
247
|
|
|
153
248
|
return ref_1h, alt_seqs_stacked, metadata_df
|
|
@@ -416,7 +511,262 @@ def get_sm_subsequences(
|
|
|
416
511
|
# Create a DataFrame for the metadata
|
|
417
512
|
metadata_df = pd.DataFrame(
|
|
418
513
|
metadata,
|
|
419
|
-
columns=[
|
|
514
|
+
columns=[
|
|
515
|
+
"chrom",
|
|
516
|
+
"window_start",
|
|
517
|
+
"window_end",
|
|
518
|
+
"variant_offset0",
|
|
519
|
+
"ref",
|
|
520
|
+
"alt",
|
|
521
|
+
],
|
|
420
522
|
)
|
|
421
523
|
|
|
422
524
|
return ref_1h, alt_seqs_stacked, metadata_df
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def get_scrambled_subsequences(
    chrom: str,
    seq_len: int,
    reference_fasta,
    bed_regions: Union[str, pd.DataFrame],
    n_scrambles: int = 1,
    kmer_size: int = 1,
    encoder=None,
    auto_map_chromosomes: bool = False,
    random_state=None,
):
    """
    Generate sequences with BED-defined regions scrambled using k-mer shuffle.

    For every BED region on ``chrom`` a window of ``seq_len`` bases centered on
    the region is cut from the reference (shifted where necessary to stay
    inside the chromosome). ``n_scrambles`` copies of that window are produced
    in which only the part of the BED region that lies inside the window is
    shuffled; all other bases are left untouched. Useful for generating
    negative controls that keep the base composition of the scrambled region.

    Args:
        chrom: Chromosome name
        seq_len: Total sequence length for each window
        reference_fasta: Reference genome object (pyfaidx.Fasta or dict-like)
        bed_regions: BED file path or DataFrame defining regions to scramble.
            BED format: chrom, start, end (0-based, half-open intervals).
            Each BED region is scrambled within its centered seq_len window.
        n_scrambles: Number of scrambled versions to generate per region
            (default: 1, must be >= 1)
        kmer_size: Size of the chunks that are permuted (default: 1).
            - kmer_size=1: shuffle individual bases
            - kmer_size=k>1: shuffle consecutive, non-overlapping k-base
              chunks; each chunk stays intact, so more local sequence context
              survives as k grows
            In every case the base composition of the region is preserved.
            Overlapping k-mer frequencies are not guaranteed to be preserved.
        encoder: Optional custom encoding function
        auto_map_chromosomes: Automatically map chromosome names between reference
            and BED file (e.g., 'chr1' <-> '1'). Default: False.
        random_state: Random seed or numpy random generator for reproducibility.

    Returns:
        Tuple of (ref_seqs, scrambled_seqs, metadata):
            - ref_seqs: One-hot encoded reference sequences, shape (N, 4, seq_len)
            - scrambled_seqs: Scrambled sequences, shape (N * n_scrambles, 4, seq_len)
            - metadata: DataFrame with columns:
                - chrom: Chromosome name
                - window_start: Start of sequence window (0-based)
                - window_end: End of sequence window (0-based, exclusive)
                - scramble_start: Start of scrambled region within window (0-based)
                - scramble_end: End of scrambled region within window (0-based, exclusive)
                - scramble_idx: Index of this scramble (0 to n_scrambles-1)
                - ref: Original/reference sequence in scrambled region
                - alt: Scrambled/alternate sequence in that region

        Regions whose window cannot reach ``seq_len`` bases, or that do not
        overlap their window, are skipped with a warning. If nothing is left,
        empty (0, 4, seq_len) arrays and an empty metadata frame (with the
        columns above) are returned.

        If the BED input has no region on ``chrom`` at all, a warning is issued
        and a single window centered on the chromosome midpoint is returned
        together with ``n_scrambles`` unshuffled copies; the metadata rows then
        carry scramble_start == scramble_end == 0 and ref == alt == the whole
        window sequence.

    Raises:
        ValueError: If bed_regions is not provided, has invalid format,
            kmer_size < 1, or n_scrambles < 1
    """
    if bed_regions is None:
        raise ValueError("bed_regions is required for get_scrambled_subsequences()")

    if kmer_size < 1:
        raise ValueError(f"kmer_size must be >= 1, got {kmer_size}")

    # Without this check a non-positive count only fails later, inside the
    # array stacking call, with an error that does not mention n_scrambles.
    if n_scrambles < 1:
        raise ValueError(f"n_scrambles must be >= 1, got {n_scrambles}")

    # Handle random state: one generator is shared by all scrambles so that a
    # single seed reproduces the whole batch.
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.default_rng(random_state)
    else:
        rng = random_state

    metadata_columns = [
        "chrom",
        "window_start",
        "window_end",
        "scramble_start",
        "scramble_end",
        "scramble_idx",
        "ref",
        "alt",
    ]

    # Parse BED file
    bed_df = _read_bed_file(bed_regions)

    # Apply chromosome name matching
    ref_chroms = {chrom}
    bed_chroms = set(bed_df["chrom"].unique())

    mapping, _ = match_chromosomes_with_report(
        ref_chroms,
        bed_chroms,
        verbose=False,
        auto_map_chromosomes=auto_map_chromosomes,
    )

    if mapping:
        bed_df = apply_chromosome_mapping(bed_df, mapping)

    # Filter to target chromosome
    chrom_bed_regions = bed_df[bed_df["chrom"] == chrom].copy()

    if len(chrom_bed_regions) == 0:
        warnings.warn(
            f"No BED regions found for chromosome {chrom}. "
            f"Returning original unshuffled sequence."
        )
        chrom_len = _scramble_chrom_length(reference_fasta[chrom])

        # Center window on chromosome
        window_start = max(0, chrom_len // 2 - seq_len // 2)
        window_end = min(chrom_len, window_start + seq_len)

        # Adjust if we hit the end
        if window_end - window_start < seq_len:
            window_start = max(0, window_end - seq_len)

        ref_seq = _scramble_window_sequence(
            reference_fasta, chrom, window_start, window_end
        )
        ref_1h = encode_seq(ref_seq, encoder)

        ref_stacked = _scramble_stack([ref_1h])
        # Same sequence for all "scrambled" outputs (but unshuffled)
        scrambled_stacked = _scramble_stack([ref_1h] * n_scrambles)

        # Metadata indicating no scrambling occurred
        meta_rows = [
            {
                "chrom": chrom,
                "window_start": window_start,
                "window_end": window_end,
                "scramble_start": 0,
                "scramble_end": 0,  # Empty region indicates no scrambling
                "scramble_idx": i,
                "ref": ref_seq,
                "alt": ref_seq,  # Same as ref when no scrambling
            }
            for i in range(n_scrambles)
        ]

        return (
            ref_stacked,
            scrambled_stacked,
            pd.DataFrame(meta_rows, columns=metadata_columns),
        )

    ref_sequences = []
    scrambled_sequences = []
    metadata = []

    # The chromosome length does not change between regions: look it up once.
    chrom_len = _scramble_chrom_length(reference_fasta[chrom])

    # Process each BED region
    for _, bed_region in chrom_bed_regions.iterrows():
        region_start = int(bed_region["start"])
        region_end = int(bed_region["end"])
        region_center = (region_start + region_end) // 2

        # Calculate sequence window centered on BED region
        window_start = region_center - seq_len // 2
        window_end = window_start + seq_len

        # Adjust window to stay within chromosome bounds
        if window_start < 0:
            window_start = 0
            window_end = min(seq_len, chrom_len)
        elif window_end > chrom_len:
            window_end = chrom_len
            window_start = max(0, chrom_len - seq_len)

        ref_seq = _scramble_window_sequence(
            reference_fasta, chrom, window_start, window_end
        )

        if len(ref_seq) != seq_len:
            warnings.warn(
                f"Region {chrom}:{region_start}-{region_end} produces sequence of length "
                f"{len(ref_seq)} instead of {seq_len}. Skipping."
            )
            continue

        # Scramble region relative to window, clipped to the window itself
        scramble_start_rel = max(0, region_start - window_start)
        scramble_end_rel = min(seq_len, region_end - window_start)

        if scramble_start_rel >= scramble_end_rel:
            warnings.warn(
                f"BED region {chrom}:{region_start}-{region_end} is outside window bounds. Skipping."
            )
            continue

        # Store reference sequence (one entry per region, not per scramble)
        ref_sequences.append(encode_seq(ref_seq, encoder))

        # Original region sequence for metadata
        original_region = ref_seq[scramble_start_rel:scramble_end_rel]

        # Generate n_scrambles scrambled versions
        for scramble_idx in range(n_scrambles):
            scrambled_seq = _scramble_region(
                ref_seq,
                scramble_start_rel,
                scramble_end_rel,
                k=kmer_size,
                random_state=rng,
            )
            scrambled_sequences.append(encode_seq(scrambled_seq, encoder))

            metadata.append(
                {
                    "chrom": chrom,
                    "window_start": window_start,
                    "window_end": window_end,
                    "scramble_start": scramble_start_rel,
                    "scramble_end": scramble_end_rel,
                    "scramble_idx": scramble_idx,
                    "ref": original_region,
                    "alt": scrambled_seq[scramble_start_rel:scramble_end_rel],
                }
            )

    # Stack sequences
    if ref_sequences:
        ref_stacked = _scramble_stack(ref_sequences)
        scrambled_stacked = _scramble_stack(scrambled_sequences)
    elif TORCH_AVAILABLE:
        ref_stacked = torch.empty((0, 4, seq_len), dtype=torch.float32)
        scrambled_stacked = torch.empty((0, 4, seq_len), dtype=torch.float32)
    else:
        ref_stacked = np.empty((0, 4, seq_len), dtype=np.float32)
        scrambled_stacked = np.empty((0, 4, seq_len), dtype=np.float32)

    # Passing the column list keeps the schema stable even when every region
    # was skipped and ``metadata`` is empty.
    metadata_df = pd.DataFrame(metadata, columns=metadata_columns)

    return ref_stacked, scrambled_stacked, metadata_df


def _scramble_chrom_length(chrom_obj) -> int:
    """Return the length of a chromosome record, via its string form if it has no len()."""
    if hasattr(chrom_obj, "__len__"):
        return len(chrom_obj)
    return len(str(chrom_obj))


def _scramble_window_sequence(
    reference_fasta, chrom: str, window_start: int, window_end: int
) -> str:
    """Return reference_fasta[chrom][window_start:window_end] as a plain string."""
    record = reference_fasta[chrom][window_start:window_end]
    # Some reference objects return a record exposing the bases on ``.seq``
    # (e.g. pyfaidx slices); plain strings are used directly.
    if hasattr(record, "seq"):
        return str(record.seq)
    return str(record)


def _scramble_stack(encoded):
    """Stack a non-empty list of encoded sequences with torch or numpy, matching their type."""
    if TORCH_AVAILABLE and isinstance(encoded[0], torch.Tensor):
        return torch.stack(encoded)
    return np.stack(encoded)
|
supremo_lite/variant_utils.py
CHANGED
|
@@ -5,6 +5,7 @@ This module provides functions for reading variants from VCF files
|
|
|
5
5
|
and other related operations.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import gzip
|
|
8
9
|
import io
|
|
9
10
|
import pandas as pd
|
|
10
11
|
import numpy as np
|
|
@@ -14,6 +15,22 @@ from typing import Dict, Optional, List, Tuple, Union
|
|
|
14
15
|
from dataclasses import dataclass
|
|
15
16
|
|
|
16
17
|
|
|
18
|
+
def _open_vcf(path: str, mode: str = "rt"):
    """
    Open a VCF file, transparently handling gzip compression.

    Compression is inferred from the file name only: a path ending in ".gz"
    is opened with ``gzip.open``, anything else with the built-in ``open``.
    The file contents are not inspected.

    Args:
        path: Path to VCF file (may be .vcf or .vcf.gz). A ``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``.
        mode: File mode. Use 'rt' for text reading (default).

    Returns:
        File handle (context manager compatible)
    """
    import os

    # Normalise pathlib.Path / os.PathLike to a plain string so the suffix
    # check below works instead of raising AttributeError.
    path = os.fspath(path)

    if path.endswith(".gz"):
        # gzip.open is binary unless "t" is given, so pass the mode unchanged.
        return gzip.open(path, mode)

    # The built-in open() is text mode by default; drop the redundant "t".
    return open(path, mode.replace("t", ""))
|
|
32
|
+
|
|
33
|
+
|
|
17
34
|
@dataclass
|
|
18
35
|
class BreakendVariant:
|
|
19
36
|
"""
|
|
@@ -625,13 +642,15 @@ def _count_vcf_header_lines(path: str) -> int:
|
|
|
625
642
|
- Lines starting with ## (metadata)
|
|
626
643
|
- Line starting with #CHROM (column header)
|
|
627
644
|
|
|
645
|
+
Supports both uncompressed (.vcf) and gzip-compressed (.vcf.gz) files.
|
|
646
|
+
|
|
628
647
|
Args:
|
|
629
648
|
path: Path to VCF file
|
|
630
649
|
|
|
631
650
|
Returns:
|
|
632
651
|
Number of lines to skip (all ## lines + the #CHROM line)
|
|
633
652
|
"""
|
|
634
|
-
with
|
|
653
|
+
with _open_vcf(path, "rt") as f:
|
|
635
654
|
header_count = 0
|
|
636
655
|
for line in f:
|
|
637
656
|
if line.startswith("##"):
|
|
@@ -648,6 +667,8 @@ def read_vcf(path, include_info=True, classify_variants=True):
|
|
|
648
667
|
"""
|
|
649
668
|
Read VCF file into pandas DataFrame with enhanced variant classification.
|
|
650
669
|
|
|
670
|
+
Supports both uncompressed (.vcf) and gzip-compressed (.vcf.gz) files.
|
|
671
|
+
|
|
651
672
|
Args:
|
|
652
673
|
path: Path to VCF file
|
|
653
674
|
include_info: Whether to include INFO field (default: True)
|
|
@@ -656,11 +677,21 @@ def read_vcf(path, include_info=True, classify_variants=True):
|
|
|
656
677
|
Returns:
|
|
657
678
|
DataFrame with columns: chrom, pos1, id, ref, alt, [info], [variant_type]
|
|
658
679
|
|
|
680
|
+
Raises:
|
|
681
|
+
FileNotFoundError: If VCF file does not exist
|
|
682
|
+
ValueError: If VCF file has invalid format or no valid header
|
|
683
|
+
|
|
659
684
|
Notes:
|
|
660
685
|
- INFO field parsing enables structural variant classification
|
|
661
686
|
- variant_type column uses VCF 4.2 compliant classification
|
|
662
687
|
- Compatible with existing code expecting basic 5-column format
|
|
663
688
|
"""
|
|
689
|
+
import os
|
|
690
|
+
|
|
691
|
+
# Validate file exists
|
|
692
|
+
if not os.path.exists(path):
|
|
693
|
+
raise FileNotFoundError(f"VCF file not found: {path}")
|
|
694
|
+
|
|
664
695
|
# Determine columns to read based on parameters
|
|
665
696
|
if include_info:
|
|
666
697
|
usecols = [0, 1, 2, 3, 4, 7] # Include INFO field
|
|
@@ -670,12 +701,38 @@ def read_vcf(path, include_info=True, classify_variants=True):
|
|
|
670
701
|
base_columns = ["chrom", "pos1", "id", "ref", "alt"]
|
|
671
702
|
|
|
672
703
|
# Count header lines for VCF line tracking (needed for vcf_line column)
|
|
673
|
-
|
|
704
|
+
try:
|
|
705
|
+
header_count = _count_vcf_header_lines(path)
|
|
706
|
+
except Exception as e:
|
|
707
|
+
raise ValueError(f"Failed to parse VCF header in {path}: {e}")
|
|
708
|
+
|
|
709
|
+
if header_count == 0:
|
|
710
|
+
raise ValueError(
|
|
711
|
+
f"VCF file {path} appears to have no header lines. "
|
|
712
|
+
"Valid VCF files must start with ##fileformat or #CHROM header."
|
|
713
|
+
)
|
|
674
714
|
|
|
675
715
|
# Read VCF using pandas with comment='#' to skip all header lines automatically
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
716
|
+
try:
|
|
717
|
+
df = pd.read_table(
|
|
718
|
+
path,
|
|
719
|
+
comment="#",
|
|
720
|
+
header=None,
|
|
721
|
+
names=base_columns,
|
|
722
|
+
usecols=usecols,
|
|
723
|
+
on_bad_lines="warn",
|
|
724
|
+
)
|
|
725
|
+
except pd.errors.EmptyDataError:
|
|
726
|
+
warnings.warn(f"VCF file {path} contains no data rows after header.")
|
|
727
|
+
empty_cols = base_columns + (["variant_type"] if classify_variants else [])
|
|
728
|
+
return pd.DataFrame(columns=empty_cols)
|
|
729
|
+
|
|
730
|
+
# Handle empty DataFrame
|
|
731
|
+
if len(df) == 0:
|
|
732
|
+
warnings.warn(f"VCF file {path} contains no variant records.")
|
|
733
|
+
if classify_variants:
|
|
734
|
+
df["variant_type"] = pd.Series(dtype=str)
|
|
735
|
+
return df
|
|
679
736
|
|
|
680
737
|
# Add VCF line numbers for debugging (1-indexed, accounting for header lines)
|
|
681
738
|
# Line number = header_lines + 1 (for 1-indexing) + row_index
|
|
@@ -683,9 +740,22 @@ def read_vcf(path, include_info=True, classify_variants=True):
|
|
|
683
740
|
|
|
684
741
|
# Validate that pos1 column is numeric
|
|
685
742
|
if not pd.api.types.is_numeric_dtype(df["pos1"]):
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
743
|
+
# Try to convert, providing helpful error message
|
|
744
|
+
try:
|
|
745
|
+
df["pos1"] = pd.to_numeric(df["pos1"], errors="coerce")
|
|
746
|
+
invalid_rows = df[df["pos1"].isna()]
|
|
747
|
+
if len(invalid_rows) > 0:
|
|
748
|
+
warnings.warn(
|
|
749
|
+
f"Found {len(invalid_rows)} rows with non-numeric positions in {path}. "
|
|
750
|
+
f"First invalid at VCF line {invalid_rows.iloc[0]['vcf_line']}. "
|
|
751
|
+
"These rows will be removed."
|
|
752
|
+
)
|
|
753
|
+
df = df.dropna(subset=["pos1"])
|
|
754
|
+
df["pos1"] = df["pos1"].astype(int)
|
|
755
|
+
except Exception as e:
|
|
756
|
+
raise ValueError(
|
|
757
|
+
f"Position column must be numeric in {path}, conversion failed: {e}"
|
|
758
|
+
)
|
|
689
759
|
|
|
690
760
|
# Filter out multiallelic variants (ALT alleles containing commas)
|
|
691
761
|
df = _filter_multiallelic_variants(df)
|
|
@@ -775,6 +845,8 @@ def get_vcf_chromosomes(path):
|
|
|
775
845
|
"""
|
|
776
846
|
Get list of chromosomes in VCF file without loading all variants.
|
|
777
847
|
|
|
848
|
+
Supports both uncompressed (.vcf) and gzip-compressed (.vcf.gz) files.
|
|
849
|
+
|
|
778
850
|
Args:
|
|
779
851
|
path: Path to VCF file
|
|
780
852
|
|
|
@@ -782,7 +854,7 @@ def get_vcf_chromosomes(path):
|
|
|
782
854
|
Set of chromosome names found in the VCF file
|
|
783
855
|
"""
|
|
784
856
|
chromosomes = set()
|
|
785
|
-
with
|
|
857
|
+
with _open_vcf(path, "rt") as f:
|
|
786
858
|
for line in f:
|
|
787
859
|
if line.startswith("##"):
|
|
788
860
|
continue
|
|
@@ -800,6 +872,8 @@ def read_vcf_chromosome(
|
|
|
800
872
|
"""
|
|
801
873
|
Read VCF file for a specific chromosome only with enhanced variant classification.
|
|
802
874
|
|
|
875
|
+
Supports both uncompressed (.vcf) and gzip-compressed (.vcf.gz) files.
|
|
876
|
+
|
|
803
877
|
Args:
|
|
804
878
|
path: Path to VCF file
|
|
805
879
|
target_chromosome: Chromosome name to filter for
|
|
@@ -813,7 +887,7 @@ def read_vcf_chromosome(
|
|
|
813
887
|
chromosome_lines = []
|
|
814
888
|
header_line = None
|
|
815
889
|
|
|
816
|
-
with
|
|
890
|
+
with _open_vcf(path, "rt") as f:
|
|
817
891
|
for line in f:
|
|
818
892
|
if line.startswith("##"):
|
|
819
893
|
continue
|
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
supremo_lite/__init__.py,sha256=
|
|
1
|
+
supremo_lite/__init__.py,sha256=hRuX97N2VKbfbMm0J_F1eryacJphc920pUQ9N60vFpY,1705
|
|
2
2
|
supremo_lite/chromosome_utils.py,sha256=rOjS3IQXmjBYZG949C5eG1zWprTOdhPVg4oW8GLbCek,10938
|
|
3
3
|
supremo_lite/core.py,sha256=-OmQEAS5J2rhocXC4aHYWaZZE2N8MX942SePMLoQ6Xc,1190
|
|
4
4
|
supremo_lite/mock_models/__init__.py,sha256=YQcL3oOoe0WJW5y_LtpuhmYIlx-_xS8raB1Mty5wtF4,3672
|
|
5
5
|
supremo_lite/mock_models/testmodel_1d.py,sha256=0CqLuAwxthz_sn_2v0C5XHu8q42dZO7EIzo9aX1hJ2U,6056
|
|
6
6
|
supremo_lite/mock_models/testmodel_2d.py,sha256=swSEEkORf7sjlZQ7XakY3qGGeRomVpiLZZbice6zziw,7011
|
|
7
|
-
supremo_lite/mutagenesis.py,sha256=
|
|
7
|
+
supremo_lite/mutagenesis.py,sha256=Cm4ZXLa3TcrEaNkuw_m9leXclEAFRtRUt9NEhVScWyA,28873
|
|
8
8
|
supremo_lite/personalize.py,sha256=w3Bv0xwikHbpZlXkgXWB4lo6XzzbzrrKSJqrrC-rTRs,126389
|
|
9
9
|
supremo_lite/prediction_alignment.py,sha256=rmpZDE-PK9-CqsXjlVw0J0KJqZXuPPl55IceF76gj-s,43020
|
|
10
10
|
supremo_lite/sequence_utils.py,sha256=yl-ghw9mGEGjiIYCBZ-4-S-CpXDjsP6suGuVVdww1mY,4147
|
|
11
|
-
supremo_lite/variant_utils.py,sha256=
|
|
12
|
-
supremo_lite-0.
|
|
13
|
-
supremo_lite-0.
|
|
14
|
-
supremo_lite-0.
|
|
15
|
-
supremo_lite-0.
|
|
11
|
+
supremo_lite/variant_utils.py,sha256=pEp2i83q7lLcyboAOIY88mFDw0Zfr0fFv4bL38ROZP0,63257
|
|
12
|
+
supremo_lite-1.0.0.dist-info/METADATA,sha256=eNFfDv6BkIJxHuMY9HsB-j4mGBUUNr9NediUiMa7Qo8,9025
|
|
13
|
+
supremo_lite-1.0.0.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
|
|
14
|
+
supremo_lite-1.0.0.dist-info/licenses/LICENSE,sha256=QoRjddrQkzdNXNq7EQbRtWGvOKv1h031CG8wreXDa00,1079
|
|
15
|
+
supremo_lite-1.0.0.dist-info/RECORD,,
|
|
File without changes
|