vmwhere 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vmwhere/__init__.py ADDED
@@ -0,0 +1,6 @@
1
# Names re-exported by ``from vmwhere import *``.
__all__ = ["__version__"]

# Start from the placeholder version; it is kept whenever the generated
# ``vmwhere._version`` module (or its ``version`` attribute) cannot be imported.
__version__ = "0.0.0.dev0"
try:
    from vmwhere._version import version as __version__
except ImportError:
    # Deliberate: the placeholder assigned above remains in effect.
    pass
vmwhere/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
# Names exported by this generated version module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Type declarations for the exported names; the values are assigned below.
version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Each value is bound to both its dunder and its plain alias.
__version__ = version = '0.2.0'
__version_tuple__ = version_tuple = (0, 2, 0)

# No commit id is recorded in this build.
__commit_id__ = commit_id = None
@@ -0,0 +1,25 @@
1
+ NC_060925.1 chr1
2
+ NC_060926.1 chr2
3
+ NC_060927.1 chr3
4
+ NC_060928.1 chr4
5
+ NC_060929.1 chr5
6
+ NC_060930.1 chr6
7
+ NC_060931.1 chr7
8
+ NC_060932.1 chr8
9
+ NC_060933.1 chr9
10
+ NC_060934.1 chr10
11
+ NC_060935.1 chr11
12
+ NC_060936.1 chr12
13
+ NC_060937.1 chr13
14
+ NC_060938.1 chr14
15
+ NC_060939.1 chr15
16
+ NC_060940.1 chr16
17
+ NC_060941.1 chr17
18
+ NC_060942.1 chr18
19
+ NC_060943.1 chr19
20
+ NC_060944.1 chr20
21
+ NC_060945.1 chr21
22
+ NC_060946.1 chr22
23
+ NC_060947.1 chrX
24
+ NC_060948.1 chrY
25
+ chrM chrM
vmwhere/cli.py ADDED
@@ -0,0 +1,95 @@
1
+ import argparse
2
+ import subprocess
3
+ from importlib.resources import files
4
+ from .find import run_find
5
+ from .genotyper import run_genotyper
6
+
7
+
8
def find_microsatellites(args):
    """Run the ``find`` subcommand using parsed command-line arguments.

    Args:
        args: Namespace from the ``find`` sub-parser. It must provide
            ``motif``, ``fasta``, ``perfect_repeats``, ``max_gap``,
            ``output_dir`` and ``buffer_size``.
    """
    # Map the CLI option names onto the keyword names run_find expects.
    find_options = {
        "motif": args.motif,
        "fasta_file": args.fasta,
        "repeats": args.perfect_repeats,
        "gap": args.max_gap,
        "output_dir": args.output_dir,
        "buffer": args.buffer_size,
    }
    run_find(**find_options)
18
+
19
+
20
def genotype_microsatellites(args):
    """Run the ``genotype`` subcommand using parsed command-line arguments.

    Args:
        args: Namespace from the ``genotype`` sub-parser. It must provide
            ``sample_id``, ``bam_file``, ``fasta``, ``cluster_distance``,
            ``minor_threshold``, ``major_threshold``, ``bed_file``,
            ``output_dir`` and ``num_processes``.
    """
    # The CLI option names match run_genotyper's keyword names one-to-one.
    genotyper_options = {
        "sample_id": args.sample_id,
        "bam_file": args.bam_file,
        "fasta": args.fasta,
        "cluster_distance": args.cluster_distance,
        "minor_threshold": args.minor_threshold,
        "major_threshold": args.major_threshold,
        "bed_file": args.bed_file,
        "output_dir": args.output_dir,
        "num_processes": args.num_processes,
    }
    run_genotyper(**genotyper_options)
33
+
34
+
35
def visualize_microsatellite(args):
    """Plot the alleles of one region by running the packaged R script.

    The script ``visualize_region.R`` shipped inside the ``vmwhere`` package
    is invoked through ``Rscript`` with four positional arguments: the
    genotype TSV, the microsatellite id, the minimum allele count and the
    output PDF path.

    Args:
        args: Namespace from the ``visualize`` sub-parser. It must provide
            ``genotype_tsv``, ``microsatellite_id``, ``min_allele_count``
            and ``output_pdf``.

    Raises:
        subprocess.CalledProcessError: If ``Rscript`` exits with a non-zero
            status (``check=True``).
    """
    # Imported locally because the module only imports ``files``.
    from importlib.resources import as_file

    script_resource = files("vmwhere").joinpath("visualize_region.R")

    # A packaged resource is not guaranteed to be a real file on disk;
    # as_file() yields a concrete filesystem path for the duration of the
    # block so Rscript can read it.
    with as_file(script_resource) as r_script_path:
        command = [
            "Rscript",
            str(r_script_path),
            # subprocess requires string (or path-like) arguments, so convert
            # explicitly in case a value arrives as a number.
            str(args.genotype_tsv),
            str(args.microsatellite_id),
            str(args.min_allele_count),
            str(args.output_pdf),
        ]
        subprocess.run(command, check=True)
45
+
46
+
47
+
48
def main(argv=None):
    """Parse the command line and dispatch to the selected subcommand.

    Three subcommands are registered: ``find``, ``genotype`` and
    ``visualize``. Each one stores its handler under ``func`` via
    ``set_defaults``; the handler is then called with the parsed namespace.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the behaviour of the
            installed entry point.
    """
    parser = argparse.ArgumentParser(
        description="vmwhere: microsatellite reference identification, sample genotyping, sequence decomposition, and visualization from long-read data"
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # --- Subcommand: find ---
    find_parser = subparsers.add_parser(
        "find",
        help="Identify genomic coordinates of repeat microsatellite sequences based on a reference",
    )
    find_parser.add_argument("-m", "--motif", required=True)
    find_parser.add_argument("-r", "--perfect_repeats", type=int, default=2)
    find_parser.add_argument("-g", "--max_gap", type=int, default=50)
    find_parser.add_argument("-b", "--buffer_size", type=int, default=50)
    find_parser.add_argument("-o", "--output_dir", required=True)
    find_parser.add_argument("-f", "--fasta", required=True)
    find_parser.set_defaults(func=find_microsatellites)

    # --- Subcommand: genotype ---
    genotype_parser = subparsers.add_parser(
        "genotype",
        help="Genotype microsatellites given a sample BAM file",
    )
    genotype_parser.add_argument("--sample_id", required=True, help="Output file will be sample_id_vmwhere_results.tsv")
    genotype_parser.add_argument("--bed_file", required=True, help="Header free bed file with columns chr start end region_id motif")
    genotype_parser.add_argument("--bam_file", required=True, help="Sorted, indexed, sample bam file")
    genotype_parser.add_argument("--fasta", required=True, help="Path to reference fasta file")
    genotype_parser.add_argument("--cluster_distance", type=int, default=0, help="Edit distance to use when clustering reads prior to allele calling")
    genotype_parser.add_argument("--minor_threshold", type=float, default=0.20, help="Minimium locus read support (fraction) to be called an allele")
    genotype_parser.add_argument("--major_threshold", type=float, default=0.80, help="Read support (fraction) for calling homozygous microsatellites")
    genotype_parser.add_argument("--output_dir", required=True, help="Parent directory for genotyping results")
    genotype_parser.add_argument("--num_processes", type=int, default=24)
    genotype_parser.set_defaults(func=genotype_microsatellites)

    # --- Subcommand: visualize ---
    vis_parser = subparsers.add_parser(
        "visualize",
        help="Visualize sequence resolved alleles for a specific region",
    )
    vis_parser.add_argument("-g", "--genotype_tsv", required=True, help="genotype results in vmwhere tsv format")
    vis_parser.add_argument("-m", "--microsatellite_id", required=True, help="The unique region_id in the genotype output file")
    vis_parser.add_argument("-o", "--output_pdf", required=True, help="Output file path and name")
    # This option used to combine required=True with default=0, which made
    # the default unreachable. It is now optional. The default is the string
    # "0" because the value is forwarded verbatim as a command-line argument
    # to Rscript, which only accepts strings.
    vis_parser.add_argument("-c", "--min_allele_count", default="0", help="Filter out low frequency alleles")
    vis_parser.set_defaults(func=visualize_microsatellite)

    # Parse args and dispatch to the handler chosen by the subcommand.
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
95
+
vmwhere/find.py ADDED
@@ -0,0 +1,160 @@
1
+ import argparse
2
+ from Bio.Seq import Seq
3
+ from Bio import SeqIO
4
+ import pandas as pd
5
+ import os
6
+ import logging
7
+
8
# Configure logging once, when this module is imported: INFO level and a
# message-only format (no timestamp or level prefix).
logging.basicConfig(level=logging.INFO, format="%(message)s")
10
+
11
+
12
def find_motif_genomic_coordinates(sequence, motif, buffer_size):
    """Locate every run of back-to-back perfect copies of a motif.

    The sequence is scanned left to right. Each time the motif is found, the
    run is extended for as long as the next ``len(motif)`` characters are
    another exact copy, and one record is emitted for the whole run. The
    search then resumes at the end of that run, so runs never overlap.

    Args:
        sequence: Sequence to scan. Must support ``find(sub, start)``,
            slicing, ``len()`` and equality comparison with ``str``
            (a plain ``str`` works).
        motif: Repeat unit to look for; must be non-empty.
        buffer_size: Number of flanking positions added on each side of the
            run to form the buffered interval.

    Returns:
        A list of dicts, one per run, with keys ``Motif``, ``Motif_Start``,
        ``Motif_End`` (exclusive), ``Total_Repeats``, ``Perfect_Repeats``,
        ``Buffer_Start`` and ``Buffer_End``. The buffered interval is clamped
        to ``[0, len(sequence)]``.

    Raises:
        ValueError: If ``motif`` is empty (the scan could never advance).
    """
    motif_str = str(motif)
    motif_len = len(motif_str)
    if motif_len == 0:
        raise ValueError("motif must be a non-empty string")

    seq_len = len(sequence)
    res = []
    search_from = 0

    while True:
        # Find successive occurrences of the motif while moving along the contig.
        run_start = sequence.find(motif_str, search_from)
        if run_start == -1:
            break

        # The initial match counts as the first perfect repeat; keep
        # extending while the next motif-sized chunk is another exact copy.
        run_end = run_start + motif_len
        repeat_count = 1
        while sequence[run_end:run_end + motif_len] == motif_str:
            run_end += motif_len
            repeat_count += 1

        res.append({
            'Motif': motif_str,
            'Motif_Start': run_start,
            'Motif_End': run_end,
            # Only perfect copies are counted, so both totals are equal.
            'Total_Repeats': repeat_count,
            'Perfect_Repeats': repeat_count,
            # Keep the flanks inside the sequence on both sides.
            'Buffer_Start': max(0, run_start - buffer_size),
            'Buffer_End': min(seq_len, run_end + buffer_size),
        })

        search_from = run_end

    return res
60
+
61
def merge_adjacent_motifs(df, max_gap):
    """Merge motif runs on the same chromosome that lie within ``max_gap``.

    Rows are sorted by chromosome and start. A row opens a new group when it
    is on a different chromosome from the previous row, or when its start is
    more than ``max_gap`` past the furthest ``Motif_End`` seen so far on that
    chromosome. Using the furthest end rather than just the previous row's
    end keeps a short run nested inside a longer one from splitting a region.

    Args:
        df: DataFrame with columns ``chrom``, ``Buffer_Start``,
            ``Buffer_End``, ``motif``, ``Motif_Start``, ``Motif_End``,
            ``Total_Repeats`` and ``Perfect_Repeats``.
        max_gap: Largest allowed distance between the end of a group and the
            start of the next run for the two to be merged.

    Returns:
        A DataFrame with one row per merged group: minimum starts, maximum
        ends, first motif, and summed repeat counts. An empty input is
        returned unchanged.
    """
    if df.empty:
        return df

    ordered = df.sort_values(by=['chrom', 'Motif_Start']).reset_index(drop=True)

    # Furthest end reached so far within each chromosome, shifted by one row
    # so that each row is compared with what came before it.
    furthest_end = ordered.groupby('chrom', sort=False)['Motif_End'].cummax()
    prev_end = furthest_end.shift()
    prev_chrom = ordered['chrom'].shift()

    # A new group starts when: different chromosome OR gap exceeds max_gap.
    # The first row compares against NaN and therefore always opens a group.
    starts_new_group = (ordered['chrom'] != prev_chrom) | (
        ordered['Motif_Start'] > (prev_end + max_gap)
    )
    ordered['group'] = starts_new_group.cumsum()

    # Collapse each group into a single row.
    merged_df = ordered.groupby(['chrom', 'group'], as_index=False).agg({
        'Buffer_Start': 'min',
        'Buffer_End': 'max',
        'motif': 'first',
        'Motif_Start': 'min',
        'Motif_End': 'max',
        'Total_Repeats': 'sum',
        'Perfect_Repeats': 'sum',
    }).drop(columns=['group'])

    return merged_df
87
+
88
def name_each_region(merged_df):
    """Return a copy of ``merged_df`` with a unique ``region_name`` column.

    Each name is ``<chrom>_region_<n>``, where ``n`` is the 1-based position
    of the row in the frame. The counter runs over the whole frame and does
    not restart per chromosome. The input frame is not modified.
    """
    labelled = merged_df.copy()
    # 1-based row ordinals as strings, combined positionally with ``chrom``.
    ordinals = pd.RangeIndex(1, len(labelled) + 1).astype(str)
    labelled['region_name'] = labelled['chrom'] + '_region_' + ordinals
    return labelled
93
+
94
def run_find(
    motif,
    repeats,
    gap,
    fasta_file,
    buffer,
    output_dir
):
    """Find microsatellite regions for a motif in a FASTA file and write a BED file.

    Every record in the FASTA file is scanned for runs of the upper-cased
    motif and of its reverse complement. Runs with fewer than ``repeats``
    perfect copies are dropped, nearby runs on the same contig are merged,
    and each merged region gets a unique name. The result is written,
    tab-separated and without a header, to
    ``<output_dir>/microsatellite_coordinates.bed``.

    Args:
        motif: Repeat unit to search for; it is upper-cased before use.
        repeats: Minimum number of perfect repeats a run needs to be kept.
        gap: Maximum distance between runs for them to be merged.
        fasta_file: Path (or handle) of the FASTA file to scan.
        buffer: Flank size added on each side of a run.
        output_dir: Directory for the output file; created if missing.

    Returns:
        The DataFrame that was written (columns ``chrom``, ``Buffer_Start``,
        ``Buffer_End``, ``region_name``, ``motif``), or ``None`` when no run
        was found or none met the repeat threshold. No file is written in
        the ``None`` case.

    Raises:
        ValueError: If ``motif`` is empty.
    """
    motif = motif.upper()
    if not motif:
        raise ValueError("motif must be a non-empty string")

    os.makedirs(output_dir, exist_ok=True)

    # The reverse complement does not depend on the contig, so compute it
    # once. A motif equal to its own reverse complement is scanned only once;
    # scanning it twice would record every run twice.
    rev_motif = str(Seq(motif).reverse_complement())
    queries = [motif] if rev_motif == motif else [motif, rev_motif]

    results = []
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        chrom = seq_record.id
        sequence = seq_record.seq.upper()

        for query in queries:
            for hit in find_motif_genomic_coordinates(sequence, query, buffer):
                hit['chrom'] = chrom
                # Always store the forward motif, even for reverse-strand hits.
                hit['motif'] = motif
                results.append(hit)

    # Collect all results into a dataframe.
    df_all = pd.DataFrame(results)

    if df_all.empty:
        logging.warning("No motif instances found in sample.")
        return None

    # Order columns to match bed format.
    df_all = df_all[['chrom', 'Buffer_Start', 'Buffer_End', 'motif', 'Motif_Start', 'Motif_End', 'Total_Repeats', 'Perfect_Repeats']]
    # Filter out runs with fewer than the minimum number of perfect repeats.
    df_all_filtered = df_all[df_all['Perfect_Repeats'] >= repeats].copy()

    if df_all_filtered.empty:
        # Without this guard pd.concat below would raise on an empty list.
        logging.warning(f"No motif runs with at least {repeats} perfect repeats found.")
        return None

    # Merge runs per contig, in order of first appearance, so that runs in
    # close proximity become one region instead of two.
    merged_all = []
    for contig in df_all_filtered['chrom'].unique():
        df_sub = df_all_filtered[df_all_filtered['chrom'] == contig].copy()
        merged_all.append(merge_adjacent_motifs(df_sub, gap))

    df_merged = pd.concat(merged_all, ignore_index=True)
    df_merged = name_each_region(df_merged)

    df_final = df_merged[['chrom', 'Buffer_Start', 'Buffer_End', 'region_name', 'motif']]
    output_path = os.path.join(output_dir, 'microsatellite_coordinates.bed')
    df_final.to_csv(output_path, sep='\t', index=False, header=False)

    logging.info(f"Found {len(df_final)} microsatellite regions. Output written to {output_path}")
    return df_final