bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
|
2
|
+
require_relative "SNPSequence"
|
3
|
+
require 'bio-samtools-wrapper'
|
4
|
+
module Bio::PolyploidTools
|
5
|
+
# Error type built by the parsers in this namespace when an input record
# does not have the expected number of fields.
class SNPSequenceException < RuntimeError; end
|
7
|
+
|
8
|
+
# A SNP described by its position on a contig of a mutant library.
#
# Records are comma separated; the last two fields are optional:
#   seqid,library,position,wt_base,mut_base[,flanking_size[,region_size]]
# Example:
#   IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T
class SNPMutant < SNPSequence

  attr_accessor :library, :contig, :chr, :parsed_start, :parsed_flanking, :region_size

  # Builds a SNPMutant from one comma separated record.
  #
  # reg_str - the record. A trailing line terminator is ignored; the string
  #           passed in is NOT modified (it may be frozen).
  #
  # The gene name is assembled from the third and fifth '_'-separated tokens
  # of the sequence id, the library and the position, e.g.
  # "1AL_1455974_Kronos2281_127". The first two characters of the third token
  # are stored as both +chromosome+ and +chr+. When the sequence id cannot be
  # split that way a warning is written to $stderr and gene is set to "Error".
  #
  # Returns the populated SNPMutant.
  # Raises SNPSequenceException when the record has fewer than five fields.
  def self.parse(reg_str)
    record = reg_str.chomp
    snp = SNPMutant.new

    fields = record.split(",")
    # `raise`, not `throw`: throw is only meaningful together with `catch`.
    if fields.size < 5
      raise SNPSequenceException, "Need five fields to parse, and got #{fields.size} in #{record}"
    end

    snp.contig, snp.library, snp.position, snp.original, snp.snp, parsed_flanking, region_size = fields
    snp.position = snp.position.to_i
    snp.gene = "EMPTY"
    begin
      toks = snp.contig.split('_')
      # e.g. IWGSC_CSS_1AL_scaff_1455974 -> 1AL_1455974_Kronos2281_127
      snp.gene = toks[2] + "_" + toks[4] + "_" + snp.library + "_" + snp.position.to_s
      snp.chromosome = toks[2][0, 2]
      snp.chr = snp.chromosome
    rescue StandardError => e
      # Best effort: an unexpected sequence id must not abort the parse.
      $stderr.puts "WARN: snp.chr couldnt be set, the sequence id to parse was #{snp.contig}. We expect something like: IWGSC_CSS_1AL_scaff_1455974"
      snp.gene = "Error"
      $stderr.puts e
    end

    # Default flanking size, overridden by the optional sixth field.
    snp.flanking_size = 100
    snp.region_size = region_size.to_i if region_size
    snp.flanking_size = parsed_flanking.to_i if parsed_flanking
    snp
  end

  # Stores +seq+ as the template sequence, rebuilds +sequence_original+ from
  # to_polymarker_sequence(flanking_size) and parses it again.
  def full_sequence=(seq)
    self.template_sequence = seq
    self.sequence_original = to_polymarker_sequence(flanking_size)
    parse_sequence_snp
  end

  # Returns the template sequence.
  def full_sequence
    template_sequence
  end

  # Returns the first element of +chr+.
  def chromosome_group
    chr[0]
  end

  # Returns chr[3], or nil when +chr+ has no element at index 3.
  #
  # NOTE(review): this method used to be defined twice in this class. Ruby
  # keeps the last definition, which is the one preserved here; the discarded
  # earlier definition returned chr[1]. Since parse stores a two character
  # +chr+, confirm which of the two was intended.
  def chromosome_genome
    chr[3] || nil
  end

  # Parses +sequence_original+, expected to look like PREFIX[A/B]SUFFIX.
  #
  # On a match sets @position (1-based index of the bracketed base),
  # @original, @snp and @template_sequence (prefix + ambiguity code returned
  # by Bio::NucleicAcid.to_IUAPC + suffix). Does nothing when the sequence
  # does not match.
  def parse_sequence_snp
    match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence_original.strip)
    return unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end

end
|
86
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
|
2
|
+
require_relative "SNP"
|
3
|
+
require 'bio-samtools-wrapper'
|
4
|
+
module Bio::PolyploidTools
|
5
|
+
# Error type built by the parsers in this namespace when an input record
# does not have the expected number of fields.
class SNPSequenceException < RuntimeError; end
|
7
|
+
|
8
|
+
# A SNP described by a marker sequence with the variant written in brackets.
#
# Records are comma separated, with the chromosome being optional:
#   snp name,chromosome from contig,microarray sequence
# Example:
#   BS00068396_51,2AS,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
class SNPSequence < SNP

  attr_accessor :sequence_original

  # Builds a SNPSequence from one comma separated record.
  #
  # reg_str - the record. A trailing line terminator is ignored; the string
  #           passed in is NOT modified (it may be frozen).
  #
  # With three fields they are gene, chromosome and sequence; with two
  # fields they are gene and sequence and the chromosome is set to "".
  #
  # Returns the populated SNPSequence.
  # Raises SNPSequenceException for any other number of fields.
  def self.parse(reg_str)
    record = reg_str.chomp
    snp = SNPSequence.new

    fields = record.split(",")
    case fields.size
    when 3
      snp.gene, snp.chromosome, snp.sequence_original = fields
    when 2
      snp.gene, snp.sequence_original = fields
      snp.chromosome = ""
    else
      # `raise`, not `throw`: throw is only meaningful together with `catch`.
      raise SNPSequenceException, "Need two or three fields to parse, and got #{fields.size} in #{record}"
    end

    snp.chromosome.strip!
    snp.parse_sequence_snp
    snp
  end

  # Parses +sequence_original+, expected to look like PREFIX[A/B]SUFFIX.
  #
  # On a match sets @position (1-based index of the bracketed base),
  # @original, @snp and @template_sequence (prefix + ambiguity code returned
  # by Bio::NucleicAcid.to_IUAPC + suffix). Does nothing when the sequence
  # does not match.
  def parse_sequence_snp
    match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence_original.strip)
    return unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end

end
|
55
|
+
end
|
data/lib/bio/db/blast.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
module Bio::DB::Blast
|
2
|
+
|
3
|
+
# Converts the first nine tab separated columns of +line+ (qseqid, qstart,
# qend, qframe, sseqid, sstart, send, sframe, score in the format requested
# by align) into a space separated string.
#
# For the query and the subject alike: a frame of "-1" becomes "-" and the
# end coordinate is decremented; any other frame becomes "+" and the start
# coordinate is decremented.
def self.to_sugar(line)
  cols = line.split("\t").first(9)

  # [frame column, start column, end column] for query, then subject.
  [[3, 1, 2], [7, 5, 6]].each do |frame_idx, start_idx, end_idx|
    if cols[frame_idx] == "-1"
      cols[frame_idx] = "-"
      cols[end_idx] = cols[end_idx].to_i - 1
    else
      cols[frame_idx] = "+"
      cols[start_idx] = cols[start_idx].to_i - 1
    end
  end

  cols.join(" ")
end
|
22
|
+
|
23
|
+
# Builds a space separated list of "label query_length target_length"
# triples from the aligned query and subject sequences found in tab
# separated columns 13 and 14 of +line+ (qseq and sseq in the format
# requested by align).
#
# Each run of aligned columns becomes one triple:
#   neither sequence has "-"  -> M n n
#   "-" in the subject        -> G n 0
#   "-" in the query          -> G 0 n
#
# Only the columns of the query sequence are visited, so a line terminator
# left at the end of the subject column is ignored.
def self.to_vulgar(line)
  qseq, sseq = line.split("\t")[12..13]

  states = qseq.each_char.with_index.map do |base, i|
    if base == "-"
      "D"
    elsif sseq[i] == "-"
      "I"
    else
      "M"
    end
  end

  # chunk groups consecutive equal states, giving one run per triple.
  triples = states.chunk { |state| state }.map do |state, run|
    case state
    when "M" then ["M", run.size, run.size]
    when "I" then ["G", run.size, 0]
    else          ["G", 0, run.size]
    end
  end

  triples.flatten.join(" ")
end
|
68
|
+
|
69
|
+
# Rewrites one tab separated BLAST line as a tab separated "RESULT:" line:
# the marker, the output of to_sugar, columns 10 to 12 of the input, a "."
# placeholder and the output of to_vulgar.
def self.to_exo(line)
  # The nested column array is flattened by Array#join.
  [
    "RESULT:",
    to_sugar(line),
    line.split("\t")[9..11],
    ".",
    to_vulgar(line)
  ].join("\t")
end
|
78
|
+
|
79
|
+
# Runs blastn and converts every output line into an alignment object via
# to_exo and Bio::DB::Exonerate::Alignment.parse_custom.
#
# opts - Hash of options:
#        :target   - value passed to blastn as -db
#        :query    - value passed to blastn as -query
#        :max_hits - optional; -max_target_seqs is set to twice this value
#                    (6 when absent)
#
# With a block, yields each alignment and returns nil. Without a block,
# returns an Array with all the alignments.
#
# Raises BlasteException when blastn exits with a non-zero status.
def self.align(opts={})
  target = opts[:target]
  query = opts[:query]
  max_target_seqs = 6 #TODO: Actually add this as an argument to PolyMarker.
  max_target_seqs = opts[:max_hits] * 2 if opts[:max_hits]
  cmdline = "blastn -max_target_seqs #{max_target_seqs} -query #{query} -db #{target} -outfmt '6 qseqid qstart qend qframe sseqid sstart send sframe score pident qlen slen qseq sseq'"
  status, stdout, stderr = systemu cmdline

  unless status.exitstatus == 0
    raise BlasteException.new(), "Error running blastn. Command line was '#{cmdline}'\n Blast STDERR was:\n#{stderr}"
  end

  alns = block_given? ? nil : []
  stdout.each_line do |blast_line|
    aln = Bio::DB::Exonerate::Alignment.parse_custom(to_exo(blast_line))
    next unless aln

    if block_given?
      yield aln
    else
      alns << aln
    end
  end
  alns
end
|
108
|
+
|
109
|
+
# Raised by align when the blastn command exits with a non-zero status.
class BlasteException < RuntimeError; end
|
111
|
+
|
112
|
+
end
|
113
|
+
|
114
|
+
|
@@ -0,0 +1,333 @@
|
|
1
|
+
# RYO %S\t%pi\t%ql\t%tl\t%g\t%V\n
|
2
|
+
|
3
|
+
|
4
|
+
module Bio::DB::Exonerate
|
5
|
+
|
6
|
+
|
7
|
+
#TODO: Make a proper object with generic parser
|
8
|
+
# Runs exonerate and parses every "RESULT:" line of its output with
# Alignment.parse_custom.
#
# opts - Hash of options, merged over these defaults:
#        :model      - 'affine:local'
#        :ryo        - the custom output format expected by parse_custom
#        :bestn      - 20
#        :percentage - 50 (not used when building the command line)
#        :target and :query are appended to the command line as given.
#
# With a block, yields each alignment and returns nil. Without a block,
# returns an Array with all the alignments.
#
# Raises ExonerateException when exonerate exits with a non-zero status.
def self.align(opts={})
  defaults = {
    :model => 'affine:local',
    :ryo => "RESULT:\\t%S\\t%pi\\t%ql\\t%tl\\t%g\\t%V\\n",
    :bestn => 20,
    :percentage => 50
  }
  opts = defaults.merge(opts)

  target = opts[:target]
  query = opts[:query]

  cmdline = "exonerate --verbose 0 --showalignment no --bestn #{opts[:bestn]} --showvulgar no --model #{opts[:model]} --ryo '#{opts[:ryo]}' #{query} #{target}"
  status, stdout, stderr = systemu cmdline

  unless status.exitstatus == 0
    raise ExonerateException.new(), "Error running exonerate. Command line was '#{cmdline}'\nExonerate STDERR was:\n#{stderr}"
  end

  collected = block_given? ? nil : []
  stdout.each_line do |line|
    aln = Alignment.parse_custom(line)
    next unless aln

    if collected
      collected << aln
    else
      yield aln
    end
  end
  collected
end
|
40
|
+
|
41
|
+
|
42
|
+
# Raised when exonerate fails or when its output is inconsistent.
class ExonerateException < RuntimeError; end
|
44
|
+
|
45
|
+
# One alignment parsed from a tab separated "RESULT:" line holding, in
# order: the sugar block, percent identity, query length, target length,
# a free field stored in +g+, and the vulgar block.
class Alignment
  attr_accessor :query_id, :query_start, :query_end, :query_strand
  attr_accessor :target_id, :target_start, :target_end, :target_strand, :score
  attr_accessor :vulgar_block, :pi, :ql, :tl, :g
  attr_accessor :line

  # Parses one output line. Returns an Alignment, or nil when the first tab
  # separated field is not "RESULT:".
  def self.parse_custom(line)
    fields = line.split(/\t/)
    return nil unless fields[0] == "RESULT:"

    al = Bio::DB::Exonerate::Alignment.new()
    al.parse_sugar(fields[1])
    al.pi = fields[2].to_f
    al.ql = fields[3].to_i
    al.tl = fields[4].to_i
    al.g = fields[5]
    al.parse_vulgar(fields[6])
    al.line = line
    al
  end

  # Region covered on the query, built on first use and then cached.
  # The smaller coordinate is shifted by one (presumably converting a
  # 0-based start into a 1-based one - confirm).
  def query
    return @query if @query

    @query = Bio::DB::Fasta::Region.new()
    @query.entry = query_id
    @query.start = query_start + 1
    @query.end = query_end
    @query.orientation = query_strand
    if @query.orientation == :reverse
      # On the reverse strand the stored start is the larger coordinate.
      @query.end = query_start
      @query.start = query_end + 1
    end
    @query
  end

  # Region covered on the target, built on first use and then cached.
  # Coordinates are handled exactly as in #query.
  def target
    return @target if @target

    @target = Bio::DB::Fasta::Region.new()
    @target.entry = target_id
    @target.start = target_start + 1
    @target.end = target_end
    @target.orientation = target_strand
    if @target.orientation == :reverse
      @target.end = target_start
      @target.start = target_end + 1
    end
    @target
  end

  # Percent identity as read from the result line.
  def identity
    @pi
  end

  # Query length as read from the result line.
  def query_length
    @ql
  end

  # Percentage of the query length covered by blocks labelled :M.
  def query_coverage
    matched = vulgar_block.inject(0) do |sum, v|
      v.label == :M ? sum + v.query_length : sum
    end
    100.00 * matched.to_f / query_length.to_f
  end

  # Parses the whitespace separated sugar block:
  #   query_id query_start query_end query_strand target_id target_start target_end target_strand score
  # Strands "+" and "-" are stored as :forward and :reverse.
  #
  # Returns self.
  # Raises ExonerateException on an unknown strand, or when start and end
  # are ordered against the strand.
  def parse_sugar(sugar_str)
    @query_id, @query_start, @query_end, @query_strand, @target_id, @target_start, @target_end, @target_strand, @score = sugar_str.split(/\s+/)

    @query_start = @query_start.to_i
    @query_end = @query_end.to_i
    @target_start = @target_start.to_i
    @target_end = @target_end.to_i
    @score = @score.to_f

    @target_strand =
      case @target_strand
      when "+" then :forward
      when "-" then :reverse
      else raise ExonerateException.new(), "Ivalid target orientation #{@target_strand} for line:\n#{sugar_str}"
      end

    @query_strand =
      case @query_strand
      when "+" then :forward
      when "-" then :reverse
      else raise ExonerateException.new(), "Ivalid query orientation #{@query_strand} for line:\n#{sugar_str}"
      end

    if @query_strand == :forward && @query_start > @query_end
      raise ExonerateException.new(), "Inconsistent orientation (forward, query)"
    end
    if @query_strand == :reverse && @query_start < @query_end
      raise ExonerateException.new(), "Inconsistent orientation (reverse, query)"
    end
    if @target_strand == :forward && @target_start > @target_end
      raise ExonerateException.new(), "Inconsistent orientation (forward, target)"
    end
    if @target_strand == :reverse && @target_start < @target_end
      raise ExonerateException.new(), "Inconsistent orientation (reverse, target)"
    end

    self
  end

  # Parses the vulgar block (whitespace separated triples of label, query
  # length and target length) into Vulgar objects stored in +vulgar_block+.
  # Each block starts where the previous one ended, walking backwards on a
  # reverse strand.
  #
  # Must be called AFTER parse_sugar: the starts and strands come from it.
  # Returns self.
  def parse_vulgar(vulgar_str)
    target_cursor = @target_start
    query_cursor = @query_start
    target_step = @target_strand == :reverse ? -1 : 1
    query_step = @query_strand == :reverse ? -1 : 1

    @vulgar_block = Array.new
    vulgar_str.split(/\s/).each_slice(3) do |triple|
      vulgar = Vulgar.new(triple[0].to_sym, triple[1].to_i, triple[2].to_i, target_cursor, target_step, query_cursor, query_step, self)
      query_cursor = vulgar.query_end
      target_cursor = vulgar.target_end
      vulgar_block << vulgar
    end
    self
  end

  # Returns the first block whose query_start..query_end range (inclusive)
  # contains +position+, or nil when there is none.
  # This assumes that the gene is the query and the chromosome is the target.
  #
  # NOTE(review): blocks on a reverse query strand have query_start greater
  # than query_end, so between? never matches them - confirm whether
  # reverse-strand queries are expected here.
  def exon_on_gene_position(position)
    @vulgar_block.find do |vulgar|
      position.between?(vulgar.query_start, vulgar.query_end)
    end
  end

  # Translates a query +position+ into a target position, using the regions
  # of the block returned by exon_on_gene_position. +base+ is accepted but
  # not used.
  def query_position_on_target(position, base:0)
    vulgar = exon_on_gene_position(position)
    qr = vulgar.query_region
    tr = vulgar.target_region

    offset =
      if qr.orientation == :forward
        position - qr.start + 1
      else
        qr.end - position
      end

    if tr.orientation == :forward
      offset + tr.start - 1
    else
      tr.end - offset + 1
    end
  end

  # Looks up the block for +position+ but does not use it yet: this
  # method always returns nil.
  def tarpostion_from_query_position(position)
    exon_on_gene_position(position)
    nil
  end

  # Returns a String with one line per block, each produced by Vulgar#to_s.
  def print_features
    @vulgar_block.each_with_object(String.new) do |vulgar, out|
      out << vulgar.to_s << "\n"
    end
  end
end
|
228
|
+
|
229
|
+
# One block of a vulgar string: a label plus the lengths and the start/end
# coordinates it covers on the query and on the target. +record+ is the
# object the block belongs to; it must answer query_id, target_id,
# query_strand and target_strand.
class Vulgar
  attr_reader :label, :query_length, :target_length, :query_start, :query_end, :target_start, :target_end, :record, :snp_in_gap

  # label           - block label (e.g. :M or :G)
  # ql, tl          - length of the block on the query / on the target
  # target_start    - coordinate where the block starts on the target
  # target_multiply - 1 or -1; direction in which the target is walked
  # query_start     - coordinate where the block starts on the query
  # query_multiply  - 1 or -1; direction in which the query is walked
  # record          - owner of the block
  def initialize(label, ql, tl, target_start, target_multiply, query_start, query_multiply, record)
    @label = label
    @query_length = ql
    @target_length = tl
    @query_start = query_start
    @query_end = query_start + (query_multiply * ql)
    @target_start = target_start
    @target_end = target_start + (target_multiply * tl)
    @record = record
    @snp_in_gap = false
  end

  # Tab separated: label, query length, target length, query start,
  # query end, target start, target end.
  def to_s
    [@label, @query_length, @target_length, @query_start, @query_end, @target_start, @target_end].join("\t")
  end

  # Query identifier, taken from the record.
  def query_id
    record.query_id
  end

  # Target identifier, taken from the record.
  def target_id
    record.target_id
  end

  # Returns a region on the target spanning +flanking_size+ positions on
  # each side of the target position matching the query +position+, or nil
  # when that position falls in a gap on the target.
  #
  # Raises ExonerateException when +position+ is outside this block.
  def target_flanking_region_from_position(position, flanking_size)
    reg = Bio::DB::Fasta::Region.new()
    reg.entry = target_id
    target_snp_pos = target_position_from_query(position)
    return nil if snp_in_gap

    reg.orientation = record.target_strand
    reg.start = target_snp_pos - flanking_size
    reg.end = target_snp_pos + flanking_size
    raise ExonerateException.new "Target Query out of bounds!" unless position.between?(query_start, query_end)

    reg
  end

  # Translates a query +position+ inside this block into a target position.
  #
  # When the block is a gap with no length on the target (label :G and
  # target_length 0) +snp_in_gap+ is set to true and target_start is
  # returned.
  #
  # Raises ExonerateException when +position+ is outside the block, when a
  # strand of the record is neither :forward nor :reverse, or when the
  # result falls outside the target range of the block.
  def target_position_from_query(position)
    unless position.between?(query_start, query_end) || position.between?(query_end, query_start)
      raise ExonerateException.new(), "Position: #{position} not in range (#{query_start}-#{query_end}) #{self.to_s} "
    end

    offset =
      case record.query_strand
      when :forward then position - query_start
      when :reverse then query_start - position
      else raise ExonerateException.new(), "The strand is not forward or reverse (#{record.query_strand}) ! #{self.inspect}"
      end

    ret =
      case record.target_strand
      when :forward then target_start + offset
      when :reverse then target_start - offset + 1
      else raise ExonerateException.new(), "The strand is not forward or reverse! #{self.inspect}"
      end

    # This is in case the position is on a gap.
    if @target_length == 0 && label == :G
      @snp_in_gap = true
      ret = target_start
    end

    unless ret.between?(target_start, target_end) || ret.between?(target_end, target_start)
      raise ExonerateException.new(), "Return position #{ret} outside block (#{target_start}-#{target_end}, #{self.inspect})"
    end
    ret
  end

  # Region covered by this block on the query. The smaller coordinate is
  # shifted by one, so on the reverse strand start and end swap roles.
  #
  # Raises ExonerateException when the query strand of the record is
  # neither :forward nor :reverse.
  def query_region
    reg = Bio::DB::Fasta::Region.new()
    reg.entry = query_id
    reg.orientation = record.query_strand
    case record.query_strand
    when :forward
      reg.start = @query_start + 1
      reg.end = @query_end
    when :reverse
      reg.start = @query_end + 1
      reg.end = @query_start
    else
      # The strand lives on the record; this class has no @query_strand.
      raise ExonerateException.new(), "Ivalid query orientation #{record.query_strand}"
    end
    reg
  end

  # Region covered by this block on the target; coordinates are handled
  # exactly as in #query_region.
  #
  # Raises ExonerateException when the target strand of the record is
  # neither :forward nor :reverse.
  def target_region
    reg = Bio::DB::Fasta::Region.new()
    reg.entry = target_id
    reg.orientation = record.target_strand
    case record.target_strand
    when :forward
      reg.start = @target_start + 1
      reg.end = @target_end
    when :reverse
      reg.start = @target_end + 1
      reg.end = @target_start
    else
      # The strand lives on the record; this class has no @target_strand.
      raise ExonerateException.new(), "Ivalid target orientation #{record.target_strand}"
    end
    reg
  end

end
|
332
|
+
|
333
|
+
end
|