vmwhere 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vmwhere/__init__.py +6 -0
- vmwhere/_version.py +24 -0
- vmwhere/chr_mapping_simple.txt +25 -0
- vmwhere/cli.py +95 -0
- vmwhere/find.py +160 -0
- vmwhere/genotyper.py +1006 -0
- vmwhere/visualize_region.R +267 -0
- vmwhere-0.2.0.dist-info/METADATA +166 -0
- vmwhere-0.2.0.dist-info/RECORD +13 -0
- vmwhere-0.2.0.dist-info/WHEEL +5 -0
- vmwhere-0.2.0.dist-info/entry_points.txt +2 -0
- vmwhere-0.2.0.dist-info/licenses/LICENSE +201 -0
- vmwhere-0.2.0.dist-info/top_level.txt +1 -0
vmwhere/__init__.py
ADDED
vmwhere/_version.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# file generated by vcs-versioning
|
|
2
|
+
# don't change, don't track in version control
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# NOTE: this module is generated at build time (see the header above);
# regenerate it rather than editing the values by hand.

# Names exported by ``from ._version import *``.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Declared types of the exported names.
version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Release string, exposed under both the plain and the dunder spelling.
version = '0.2.0'
__version__ = version

# The same release as a tuple of components.
version_tuple = (0, 2, 0)
__version_tuple__ = version_tuple

# No commit id was recorded for this build.
commit_id = None
__commit_id__ = commit_id
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
NC_060925.1 chr1
|
|
2
|
+
NC_060926.1 chr2
|
|
3
|
+
NC_060927.1 chr3
|
|
4
|
+
NC_060928.1 chr4
|
|
5
|
+
NC_060929.1 chr5
|
|
6
|
+
NC_060930.1 chr6
|
|
7
|
+
NC_060931.1 chr7
|
|
8
|
+
NC_060932.1 chr8
|
|
9
|
+
NC_060933.1 chr9
|
|
10
|
+
NC_060934.1 chr10
|
|
11
|
+
NC_060935.1 chr11
|
|
12
|
+
NC_060936.1 chr12
|
|
13
|
+
NC_060937.1 chr13
|
|
14
|
+
NC_060938.1 chr14
|
|
15
|
+
NC_060939.1 chr15
|
|
16
|
+
NC_060940.1 chr16
|
|
17
|
+
NC_060941.1 chr17
|
|
18
|
+
NC_060942.1 chr18
|
|
19
|
+
NC_060943.1 chr19
|
|
20
|
+
NC_060944.1 chr20
|
|
21
|
+
NC_060945.1 chr21
|
|
22
|
+
NC_060946.1 chr22
|
|
23
|
+
NC_060947.1 chrX
|
|
24
|
+
NC_060948.1 chrY
|
|
25
|
+
chrM chrM
|
vmwhere/cli.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import subprocess
|
|
3
|
+
from importlib.resources import files
|
|
4
|
+
from .find import run_find
|
|
5
|
+
from .genotyper import run_genotyper
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_microsatellites(args):
    """Run the ``find`` subcommand.

    Maps the parsed command-line attributes on ``args`` onto the keyword
    arguments of ``run_find`` and invokes it. Nothing is returned.
    """
    # CLI attribute names differ from run_find's parameter names, so the
    # translation is spelled out explicitly.
    find_options = {
        "motif": args.motif,
        "fasta_file": args.fasta,
        "repeats": args.perfect_repeats,
        "gap": args.max_gap,
        "output_dir": args.output_dir,
        "buffer": args.buffer_size,
    }
    run_find(**find_options)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def genotype_microsatellites(args):
    """Run the ``genotype`` subcommand.

    Every option of the ``genotype`` subparser shares its name with the
    corresponding ``run_genotyper`` keyword, so the values are collected
    by name and forwarded. Nothing is returned.
    """
    option_names = (
        "sample_id",
        "bam_file",
        "fasta",
        "cluster_distance",
        "minor_threshold",
        "major_threshold",
        "bed_file",
        "output_dir",
        "num_processes",
    )
    genotyper_options = {name: getattr(args, name) for name in option_names}
    run_genotyper(**genotyper_options)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def visualize_microsatellite(args):
    """Run the ``visualize`` subcommand by calling the bundled R script.

    Executes ``Rscript visualize_region.R <genotype_tsv> <microsatellite_id>
    <min_allele_count> <output_pdf>`` using the script shipped inside the
    ``vmwhere`` package.

    Args:
        args: Parsed arguments providing ``genotype_tsv``,
            ``microsatellite_id``, ``min_allele_count`` and ``output_pdf``.

    Raises:
        subprocess.CalledProcessError: If ``Rscript`` exits with a non-zero
            status (``check=True``).
        FileNotFoundError: If the ``Rscript`` executable cannot be found.
    """
    # Local import keeps this block self-contained; ``as_file`` ships with
    # the same Python versions as ``files``.
    from importlib.resources import as_file

    script_resource = files("vmwhere").joinpath("visualize_region.R")

    # ``as_file`` guarantees a real filesystem path for the duration of the
    # call, even when the package is not unpacked on disk.
    with as_file(script_resource) as r_script_path:
        # subprocess only accepts str/bytes/path-like argv items, so every
        # value is stringified (``min_allele_count`` may arrive as an int).
        command = [
            "Rscript",
            str(r_script_path),
            str(args.genotype_tsv),
            str(args.microsatellite_id),
            str(args.min_allele_count),
            str(args.output_pdf),
        ]
        subprocess.run(command, check=True)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main(argv=None):
    """Command-line entry point: parse arguments and dispatch a subcommand.

    Three subcommands are registered -- ``find``, ``genotype`` and
    ``visualize`` -- each bound to its handler through
    ``set_defaults(func=...)``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is what the
            installed console script relies on.
    """
    parser = argparse.ArgumentParser(
        description="vmwhere: microsatellite reference identification, sample genotyping, sequence decomposition, and visualization from long-read data"
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # --- Subcommand: find ---
    find_parser = subparsers.add_parser("find", help="Identify genomic coordinates of repeat microsatellite sequences based on a reference")

    find_parser.add_argument("-m", "--motif", required=True)
    find_parser.add_argument("-r", "--perfect_repeats", type=int, default=2)
    find_parser.add_argument("-g", "--max_gap", type=int, default=50)
    find_parser.add_argument("-b", "--buffer_size", type=int, default=50)
    find_parser.add_argument("-o", "--output_dir", required=True)
    find_parser.add_argument("-f", "--fasta", required=True)
    find_parser.set_defaults(func=find_microsatellites)

    # --- Subcommand: genotype ---
    profile_parser = subparsers.add_parser("genotype", help="Genotype microsatellites given a sample BAM file")

    profile_parser.add_argument("--sample_id", required=True, help="Output file will be sample_id_vmwhere_results.tsv")
    profile_parser.add_argument("--bed_file", required=True, help="Header free bed file with columns chr start end region_id motif")
    profile_parser.add_argument("--bam_file", required=True, help="Sorted, indexed, sample bam file")
    profile_parser.add_argument("--fasta", required=True, help="Path to reference fasta file")
    profile_parser.add_argument("--cluster_distance", type=int, default=0, help="Edit distance to use when clustering reads prior to allele calling")
    profile_parser.add_argument("--minor_threshold", type=float, default=0.20, help="Minimium locus read support (fraction) to be called an allele")
    profile_parser.add_argument("--major_threshold", type=float, default=0.80, help="Read support (fraction) for calling homozygous microsatellites")
    profile_parser.add_argument("--output_dir", required=True, help="Parent directory for genotyping results")
    profile_parser.add_argument("--num_processes", type=int, default=24)
    profile_parser.set_defaults(func=genotype_microsatellites)

    # --- Subcommand: visualize ---
    vis_parser = subparsers.add_parser("visualize", help="Visualize sequence resolved alleles for a specific region")

    vis_parser.add_argument("-g", "--genotype_tsv", required=True, help="genotype results in vmwhere tsv format")
    vis_parser.add_argument("-m", "--microsatellite_id", required=True, help="The unique region_id in the genotype output file")
    vis_parser.add_argument("-o", "--output_pdf", required=True, help = "Output file path and name")
    # Optional: a required option can never fall back to its default. The
    # default is a string because the value is forwarded verbatim as a
    # command-line argument to Rscript.
    vis_parser.add_argument("-c", "--min_allele_count", default="0", help="Filter out low frequency alleles")
    vis_parser.set_defaults(func=visualize_microsatellite)

    # Parse args and dispatch to the handler chosen by the subcommand.
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
|
|
95
|
+
|
vmwhere/find.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from Bio.Seq import Seq
|
|
3
|
+
from Bio import SeqIO
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import os
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
# Configure logging once at the start of the script
|
|
9
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_motif_genomic_coordinates(sequence, motif, buffer_size):
    """Find every perfect tandem run of ``motif`` in ``sequence``.

    The sequence is scanned left to right. Each hit is extended while the
    next ``len(motif)`` characters are another exact copy of the motif, and
    scanning resumes at the end of that run, so reported runs never overlap.
    Matching is case-sensitive.

    Args:
        sequence: Sequence to search. Anything supporting ``find``, slicing,
            ``len`` and comparison with ``str`` works (a plain ``str``
            qualifies).
        motif: Repeat unit to look for; must be non-empty.
        buffer_size: Number of flanking bases added on each side of a run
            to form the ``Buffer_Start``/``Buffer_End`` window.

    Returns:
        A list of dicts, one per run, with keys ``Motif``, ``Motif_Start``
        (0-based), ``Motif_End`` (exclusive), ``Total_Repeats``,
        ``Perfect_Repeats``, ``Buffer_Start`` and ``Buffer_End``. The buffer
        window is clamped to ``[0, len(sequence)]``.

    Raises:
        ValueError: If ``motif`` is empty (the scan could never advance).
    """
    motif_str = str(motif)
    motif_len = len(motif_str)
    if motif_len == 0:
        raise ValueError("motif must be a non-empty sequence")

    seq_len = len(sequence)
    res = []
    search_from = 0

    while True:
        # Find successive occurrences of the motif along the contig.
        motif_start = sequence.find(motif_str, search_from)
        if motif_start == -1:
            break

        # Extend the run for as long as exact copies follow back to back.
        repeat_count = 1  # the initial match is perfect
        motif_end = motif_start + motif_len
        while sequence[motif_end:motif_end + motif_len] == motif_str:
            repeat_count += 1
            motif_end += motif_len

        res.append({
            'Motif': motif_str,
            'Motif_Start': motif_start,
            'Motif_End': motif_end,
            # Only exact copies are counted, so both tallies are equal.
            'Total_Repeats': repeat_count,
            'Perfect_Repeats': repeat_count,
            # Keep the flanking window inside the sequence on both sides.
            'Buffer_Start': max(0, motif_start - buffer_size),
            'Buffer_End': min(seq_len, motif_end + buffer_size),
        })

        # Resume after this run so runs are reported without overlap.
        search_from = motif_end

    return res
|
|
60
|
+
|
|
61
|
+
def merge_adjacent_motifs(df, max_gap):
    """Merge motif runs on the same chromosome that lie within ``max_gap``.

    Rows are sorted by ``chrom`` and ``Motif_Start``. A row joins the
    current region unless it is on a different chromosome or starts more
    than ``max_gap`` bases past the furthest ``Motif_End`` seen so far on
    that chromosome.

    Args:
        df: DataFrame with columns ``chrom``, ``Buffer_Start``,
            ``Buffer_End``, ``motif``, ``Motif_Start``, ``Motif_End``,
            ``Total_Repeats`` and ``Perfect_Repeats``. It is not modified.
        max_gap: Largest distance between a region's furthest end and the
            next run's start for the two to be merged.

    Returns:
        One row per merged region with the columns listed above (``chrom``
        first): coordinates are the min/max over the region, repeat counts
        are summed, and ``motif`` is taken from the first row. An empty
        input is returned unchanged.
    """
    if df.empty:
        return df

    ordered = df.sort_values(by=['chrom', 'Motif_Start']).reset_index(drop=True)

    prev_chrom = ordered['chrom'].shift()
    # Compare against the furthest end reached so far on the chromosome,
    # not just the previous row's end: a short run nested inside a longer
    # one must not shrink the region's reach.
    running_end = ordered.groupby('chrom', sort=False)['Motif_End'].cummax()
    prev_end = running_end.shift()

    # A new group starts when: different chromosome OR gap exceeds max_gap.
    # (The first row compares against NaN and is caught by the chrom test.)
    starts_new_group = (ordered['chrom'] != prev_chrom) | (
        ordered['Motif_Start'] > (prev_end + max_gap)
    )
    ordered['group'] = starts_new_group.cumsum()

    merged_df = ordered.groupby(['chrom', 'group'], as_index=False).agg({
        'Buffer_Start': 'min',
        'Buffer_End': 'max',
        'motif': 'first',
        'Motif_Start': 'min',
        'Motif_End': 'max',
        'Total_Repeats': 'sum',
        'Perfect_Repeats': 'sum'
    }).drop(columns=['group'])

    return merged_df
|
|
87
|
+
|
|
88
|
+
def name_each_region(merged_df):
    """Return a copy of ``merged_df`` with a unique ``region_name`` column.

    Each name is ``<chrom>_region_<n>``, where ``n`` is the 1-based row
    position counted across the whole frame (not restarted per
    chromosome). The input frame is left untouched.
    """
    named = merged_df.copy()
    # 1-based running counter, carried on the frame's own index so the
    # concatenation below lines up row by row.
    ordinals = pd.Series(range(1, len(named) + 1), index=named.index).astype(str)
    named['region_name'] = named['chrom'] + '_region_' + ordinals
    return named
|
|
93
|
+
|
|
94
|
+
def run_find(
    motif,
    repeats,
    gap,
    fasta_file,
    buffer,
    output_dir
):
    """Locate microsatellite regions for ``motif`` in a FASTA and write a BED file.

    Every record of ``fasta_file`` is upper-cased and searched for tandem
    runs of the motif and of its reverse complement. Runs with fewer than
    ``repeats`` copies are dropped, runs within ``gap`` bases of each other
    on the same contig are merged, and each merged region gets a unique
    name. The result is written, tab-separated and without a header, to
    ``<output_dir>/microsatellite_coordinates.bed``.

    Args:
        motif: Repeat unit to search for (case-insensitive, non-empty).
        repeats: Minimum number of perfect copies a run needs to be kept.
        gap: Maximum distance between runs for them to be merged.
        fasta_file: Path to the reference FASTA file.
        buffer: Number of flanking bases added around each run.
        output_dir: Directory for the BED file; created if missing.

    Returns:
        The DataFrame written to disk (columns ``chrom``, ``Buffer_Start``,
        ``Buffer_End``, ``region_name``, ``motif``), or ``None`` when no
        region qualifies (a warning is logged and no file is written).

    Raises:
        ValueError: If ``motif`` is empty.
    """
    motif = motif.upper()
    if not motif:
        raise ValueError("motif must be a non-empty sequence")

    os.makedirs(output_dir, exist_ok=True)

    # Search the reverse complement too, but only when it differs from the
    # motif: a palindromic motif would otherwise be counted twice.
    rev_motif = str(Seq(motif).reverse_complement())
    search_motifs = [motif] if rev_motif == motif else [motif, rev_motif]

    results = []
    # Records are processed one at a time; sequences are not retained.
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        chrom = seq_record.id
        sequence = seq_record.seq.upper()

        for query in search_motifs:
            hits = find_motif_genomic_coordinates(sequence, query, buffer)
            for hit in hits:
                hit['chrom'] = chrom
                hit['motif'] = motif  # always store the forward motif
            results.extend(hits)

    # Collect all results into a dataframe.
    df_all = pd.DataFrame(results)

    if df_all.empty:
        logging.warning("No motif instances found in sample.")
        return

    # Order columns to match bed format.
    df_all = df_all[['chrom', 'Buffer_Start', 'Buffer_End', 'motif', 'Motif_Start', 'Motif_End', 'Total_Repeats', 'Perfect_Repeats']]
    # Filter out runs with fewer than the minimum number of perfect repeats.
    df_all_filtered = df_all[df_all['Perfect_Repeats'] >= repeats].copy()

    if df_all_filtered.empty:
        # Nothing to merge; pd.concat would raise on an empty list.
        logging.warning("No motif runs with at least %s perfect repeats were found.", repeats)
        return

    # Merge nearby runs contig by contig. Iterating over unique() keeps the
    # contigs in FASTA order, which fixes the order of the region names.
    merged_all = [
        merge_adjacent_motifs(df_all_filtered[df_all_filtered['chrom'] == contig].copy(), gap)
        for contig in df_all_filtered['chrom'].unique()
    ]

    df_merged = pd.concat(merged_all, ignore_index=True)
    df_merged = name_each_region(df_merged)

    df_final = df_merged[['chrom', 'Buffer_Start', 'Buffer_End', 'region_name', 'motif']]
    output_path = os.path.join(output_dir, 'microsatellite_coordinates.bed')
    df_final.to_csv(output_path, sep='\t', index=False, header=False)

    logging.info(f"Found {len(df_final)} microsatellite regions. Output written to {output_path}")
    return df_final
|