bio-polymarker 1.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,192 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
# Default settings for the run; every entry can be overridden by the
# matching command-line switch declared below.
options = {
  identity: 50,
  min_bases: 200,
  split_token: "-",
  tmp_folder: Dir.mktmpdir, # scratch directory created eagerly, once per run
  program: "blastn",
  random_sample: 0,
  cut_promoter_length: 0,
  reverse: true
}

# Command-line interface. Values are stored in +options+ as they are parsed.
cli = OptionParser.new do |opts|
  opts.banner = "Usage: filter_blat.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") { |value| options[:identity] = value.to_f }

  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") { |value| options[:min_bases] = value.to_i }

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") { |value| options[:triads] = value }

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") { |value| options[:fasta] = value }

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") { |value| options[:split_token] = value }

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") { |value| options[:program] = value }

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") { |value| options[:random_sample] = value.to_i }

  opts.on("-l", "--cut_promoter_length INT", "Bases to consider") { |value| options[:cut_promoter_length] = value.to_i }

  # Only the literal letters T and F are accepted; anything else aborts.
  opts.on("-v", "--reverse T|F", "Reverse the input bases") do |flag|
    case flag
    when 'T'
      options[:reverse] = true
    when 'F'
      options[:reverse] = false
    else
      $stderr.puts "Invalid option for reverse (should be T or F)"
      exit(-1)
    end
  end
end
cli.parse!
|
66
|
+
|
67
|
+
|
68
|
+
# Aligns two FASTA files against each other with BLAST and summarises the
# longest HSP found in the XML report.
#
# @param path_a [String] FASTA file passed as -query
# @param path_b [String] FASTA file passed as -subject
# @param out_path [String] file the BLAST XML report (-outfmt 5) is written to
# @param program [String] BLAST executable to run; also passed as -task
# @return [Array] two elements: the alignment length of the longest HSP and
#   its percent identity (100 * identities / alignment length). Returns
#   [0, 0.0] when the report has no HSP or when BLAST could not be run.
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  # Argument-vector form of system: no shell is involved, so paths that
  # contain spaces or shell metacharacters reach BLAST unchanged.
  executed = system(program,
                    "-query", path_a,
                    "-subject", path_b,
                    "-task", program,
                    "-out", out_path,
                    "-outfmt", "5")

  # Callers reuse the same out_path for every pair, so parsing it after a
  # failed run would silently report the previous pair's alignment.
  unless executed
    $stderr.puts "#{program} failed for #{path_a} vs #{path_b}"
    return [0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      hit.each do |hsp|
        next unless hsp.align_len > max_length

        max_length = hsp.align_len
        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
      end
    end
  end
  [max_length, max_pident]
end
|
90
|
+
|
91
|
+
# Lookup tables kept from the original script; nothing in the visible code
# fills or reads them.
valid_pairs_A_B = {}
valid_pairs_A_D = {}
valid_pairs_B_D = {}

split_token = options[:split_token]

# Load the FASTA file, keeping one entry per gene: the gene name is the part
# of the entry id before the split token, and the longest entry wins.
sequences = {}
sequence_count = 0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token).first
    seq = entry.naseq
    seq.reverse_complement! if options[:reverse]
    # Trim to the first N bases (after the optional reverse complement).
    seq = seq[0, options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
    entry.data = seq

    current = sequences[gene_name]
    sequences[gene_name] = entry if current.nil? || entry.length > current.length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

# Scratch files, overwritten for every triad.
a_tmp = "#{options[:tmp_folder]}/A.fa"
b_tmp = "#{options[:tmp_folder]}/B.fa"
d_tmp = "#{options[:tmp_folder]}/D.fa"
out_tmp = "#{options[:tmp_folder]}/out.blast"

# Header of the tab-separated table written to stdout.
puts %w[
  group_id query subject
  chr_query chr_subject aln_type
  length pident Ns_query Ns_subject Ns_total
].join("\t")

# Every line of the triads file is counted, the CSV header line included.
count_lines = File.foreach(options[:triads]).count

# Chance of processing each triad; 1 when no random sample was requested.
probability = if options[:random_sample] == 0
                1
              else
                options[:random_sample] / count_lines.to_f
              end
prng = Random.new
prom_len = options[:cut_promoter_length]
|
134
|
+
# For every triad (one CSV row with the columns 'group_id', 'A', 'B', 'D')
# write the available homoeolog sequences to the scratch FASTA files, copy
# them to blast_alignments_<prom_len>/<group_id / 100>/<group_id>/, BLAST
# each available pair (A->B, A->D, B->D) and print one tab-separated line per
# pair in the column order of the header printed above.
CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id'].to_i
  triad_folder = triad / 100

  # probability is 1 when no sampling was requested. A requested sample at
  # least as large as the file yields a probability >= 1, which must also run
  # every triad (the previous `== 1` test skipped all of them in that case).
  next unless probability >= 1 || probability > prng.rand

  # One record per sub-genome: gene name, loaded FASTA entry (nil when the
  # gene is not in the sequence file) and the scratch path used for it.
  homoeologs = { "A" => a_tmp, "B" => b_tmp, "D" => d_tmp }.each_with_object({}) do |(genome, tmp_path), acc|
    name = row[genome]
    acc[genome] = { name: name, entry: sequences[name], path: tmp_path }
  end

  save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
  FileUtils.mkdir_p save_folder

  homoeologs.each_value do |homoeolog|
    next unless homoeolog[:entry]

    File.open(homoeolog[:path], 'w') { |f| f.write(homoeolog[:entry]) }
    FileUtils.cp(homoeolog[:path], save_folder)
    homoeolog[:ns] = homoeolog[:entry].seq.count('Nn')
  end

  [%w[A B], %w[A D], %w[B D]].each do |query_genome, subject_genome|
    query = homoeologs[query_genome]
    subject = homoeologs[subject_genome]
    next unless query[:entry] && subject[:entry]

    length, pident = blast_pair_fast(query[:path], subject[:path], out_tmp, program: options[:program])
    FileUtils.cp(out_tmp, "#{save_folder}/#{query_genome}_#{subject_genome}.xml")

    # The subject column carries the gene that was actually used as subject
    # (the A->D and B->D rows used to repeat the B gene name here).
    puts [
      triad, query[:name], subject[:name],
      query_genome, subject_genome, "#{query_genome}->#{subject_genome}",
      length, pident,
      query[:ns], subject[:ns], query[:ns] + subject[:ns]
    ].join("\t")
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'pathname'
|
6
|
+
require 'bio-samtools-wrapper'
|
7
|
+
|
8
|
+
require 'set'
|
9
|
+
|
10
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
11
|
+
$: << File.expand_path('.')
|
12
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
13
|
+
require path
|
14
|
+
|
15
|
+
# Usage: count_variations.rb <reference.fa> <alignments.bam>
# For each entry of the FASTA index, builds the consensus with ambiguity
# codes over the full entry and prints: id, region size, number of
# ambiguities, ambiguities per 1000 bases, followed by the consensus itself.
reference_path = ARGV[0]
bam_path = ARGV[1]
puts reference_path

fasta_db = Bio::DB::Fasta::FastaFile.new({ :fasta => reference_path })
fasta_db.load_fai_entries
bam1 = Bio::DB::Sam.new({ :fasta => reference_path, :bam => bam_path })

fasta_db.index.entries.each do |index_entry|
  region = index_entry.get_full_region
  consensus = bam1.consensus_with_ambiguities({ :region => region, :case => true })

  ambiguities = consensus.count_ambiguities
  per_kilobase = (1000 * ambiguities.to_f) / region.size

  puts "#{index_entry.id}\t#{region.size}\t#{ambiguities}\t#{per_kilobase}\n#{consensus}"
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'optparse'
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
7
|
+
require path
|
8
|
+
module Bio
  class Blat
    # Report subclass that hands out hits one at a time while reading the
    # input, instead of collecting them first.
    class StreamedReport < Report

      # Yields a Hit for every data line of +text+.
      #
      # Lines are treated as header until either a line made only of dashes
      # is seen (that separator itself is not yielded) or a line starting
      # with a digit appears (headerless data; that line is yielded as the
      # first hit). Every later line is passed to Hit.new unmodified.
      #
      # text:: any object responding to each_line (String or IO)
      def self.each_hit(text = '')
        in_hits = false
        header = [] # header lines are gathered but not used any further

        text.each_line do |raw_line|
          unless in_hits
            if raw_line =~ /^\d/
              # for headerless data: this line is already a hit
              in_hits = true
            else
              header_line = raw_line.chomp
              if header_line =~ /\A\-+\s*\z/
                in_hits = true
              else
                header << header_line
              end
              next
            end
          end

          yield Hit.new(raw_line)
        end
      end
    end
  end
end
|
37
|
+
|
38
|
+
|
39
|
+
#blat_file=ARGV[0]
|
40
|
+
|
41
|
+
# Default thresholds (percentages), overridable with -i and -c.
options = {
  identity: 95,
  covered: 60
}

OptionParser.new do |opts|

  opts.banner = "Usage: filter_blat_by_target_coverage.rb [options]"

  opts.on("-p", "--psl FILE", "PSL file") do |o|
    # The path is kept exactly as typed. Upcasing it (as done previously)
    # makes the file impossible to find on case-sensitive file systems.
    options[:blat_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!

blat_file = options[:blat_file]
abort "A PSL file is required (-p FILE)" unless blat_file

# Stream the PSL file and echo every hit that reaches both thresholds.
psl = Bio::FlatFile.open(blat_file)
begin
  Bio::Blat::StreamedReport.each_hit(psl.to_io) do |hit|
    next unless hit.percentage_covered >= options[:covered] && hit.percent_identity >= options[:identity]

    puts hit.data.join("\t")
  end
ensure
  # Release the handle even when a malformed line raises.
  psl.close
end
|
68
|
+
|
69
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'optparse'
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
7
|
+
require path
|
8
|
+
|
9
|
+
# Default thresholds (percentages), overridable with -i and -c.
options = {
  identity: 95,
  covered: 90
}

OptionParser.new do |opts|

  opts.banner = "Usage: filter_exonerate_by_identity.rb [options]"

  opts.on("-e", "--exo FILE", "Exonerate alignment produced by polymarker or with the following ryo: 'RESULT:\\t%S\\t%pi\\t%ql\\t%tl\\t%g\\t%V\\n'") do |o|
    # The path is kept exactly as typed. Upcasing it (as done previously)
    # makes the file impossible to find on case-sensitive file systems.
    options[:exo_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!

exo_file = options[:exo_file]
abort "An exonerate alignment file is required (-e FILE)" unless exo_file

min_identity = options[:identity]
min_coverage = options[:covered]

# Echo every alignment line whose identity and query coverage are strictly
# above the thresholds.
File.foreach(exo_file) do |line|
  aln = Bio::DB::Exonerate::Alignment.parse_custom(line)
  puts aln.line if aln.identity > min_identity && aln.query_coverage > min_coverage
end
|
38
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
# Reads a BLAT report and records, for each query, the hit with the highest
# score.
#
# blat_filename:: path of the BLAT output to read
# best_aln:: Hash updated in place; maps query id => best hit seen so far.
#            Hits already present take part in the comparison, and an
#            existing hit is replaced only by a strictly higher score.
def load_blat_alignments(blat_filename, best_aln)
  flat_file = Bio::FlatFile.open(blat_filename)
  begin
    Bio::Blat::Report.new(flat_file.to_io).each_hit do |hit|
      query_name = hit.query_id
      best = best_aln[query_name]
      best_aln[query_name] = hit if best.nil? || hit.score > best.score
    end
  ensure
    # The handle used to be left open; close it whatever happens above.
    flat_file.close
  end
end
|
24
|
+
|
25
|
+
# Usage: find_best_blat_hit.rb <blat_output>
# Prints a header line followed by the raw fields of the best-scoring hit of
# every query, tab separated.
blat_file = ARGV.first
best_aln = {}

load_blat_alignments(blat_file, best_aln)

puts "QUERY\tTARGET"
best_aln.each_value { |hit| puts hit.data.join("\t") }
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
# Fragment: runs exonerate for one chunk of the target and, for every
# alignment above min_identity, writes the alignment line to exo_f and the
# full target contig to contigs_f.
#
# NOTE(review): this file has no requires and relies on names it never
# defines (temp_fasta_query, target, model, chunk, min_identity, exo_f,
# contigs_f, fasta_file). The value of :total_chunks was missing in the
# original, which made the file unparsable; a `total_chunks` local supplied
# next to `chunk` is assumed here -- TODO confirm.
found_cointigs = Set.new
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>total_chunks}) do |aln|
  next unless aln.identity > min_identity

  exo_f.puts aln.line

  # We only add each contig once. Should reduce the size of the output file.
  next unless found_cointigs.add?(aln.target_id)

  entry = fasta_file.index.region_for_entry(aln.target_id)
  # The message names the target FASTA (the previous text interpolated an
  # undefined `target_id` local and an unqualified exception class).
  raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly." if entry.nil?

  region = entry.get_full_region
  seq = fasta_file.fetch_sequence(region)
  contigs_f.puts(">#{aln.target_id}\n#{seq}")
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
#$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
6
|
+
#$: << File.expand_path('.')
|
7
|
+
#path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
8
|
+
#require path
|
9
|
+
|
10
|
+
# Defaults. :identity and :min_bases are parsed but not used by the code
# below; :blastx == "-" means "read the BLAST report from ARGF".
options = {
  identity: 50,
  min_bases: 200,
  blastx: "-"
}

OptionParser.new do |opts|

  opts.banner = "Usage: filter_blat.rb [options]"

  opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
    options[:blastx] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

end.parse!

# Expected partner of each gene inside its triad.
valid_pairs_A_B = {}
valid_pairs_A_D = {}
valid_pairs_B_D = {}

CSV.foreach(options[:triads], headers: true) do |row|
  valid_pairs_A_B[row['A']] = row['B']
  valid_pairs_A_D[row['A']] = row['D']
  valid_pairs_B_D[row['B']] = row['D']
end

# IO.open expects a numeric file descriptor, so a path must go through
# File.open instead.
from_argf = options[:blastx] == "-"
stream = from_argf ? ARGF : File.open(options[:blastx])
puts "Loaded #{valid_pairs_B_D.length} triads"
$stdout.flush

blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

blast_report.each_entry do |report|
  puts "Hits for " + report.query_def + " against " + report.db
  $stdout.flush
  report.each do |hit|
    # Gene names are the part of the ids before the first "-".
    query = hit.query_id.split("-")[0]
    target = hit.target_id.split("-")[0]
    next unless valid_pairs_A_B[query] == target || valid_pairs_A_D[query] == target || valid_pairs_B_D[query] == target

    # One tab-separated line per significant hit (passing the pieces as
    # separate arguments to puts printed each of them on its own line).
    puts "#{hit.target_id}\t#{hit.evalue}" if hit.evalue < 0.001
    puts hit.inspect
  end
end

# Only close a file this script opened. The previous guard tested the
# never-set key :blat_file and therefore always closed the stream.
stream.close unless from_argf
|
@@ -0,0 +1,168 @@
|
|
1
|
+
#!
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
10
|
+
$: << File.expand_path('.')
|
11
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
12
|
+
require path
|
13
|
+
|
14
|
+
|
15
|
+
#TODO: Use temporary files somewhere in the file system and add traps to delete them/forward them as a result.
|
16
|
+
#TODO: Make all this parameters
|
17
|
+
# TODO: Make all of these parameters.
path_to_contigs = "/Users/ramirezr/Documents/PHD/201305_Databases/iwgcs"
snp_in = "A"
original_name = "B"

# ARGV[0]: file with the SNPs to design primers for.
# ARGV[1]: optional FASTA reference (nil when not given).
test_file = ARGV[0]
fasta_reference = ARGV[1]

# Every run writes into its own time-stamped folder next to the input file.
output_folder = "#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
Dir.mkdir(output_folder)

# TODO: Make these tmp files.
temp_fasta_query = "#{output_folder}to_align.fa"
temp_contigs     = "#{output_folder}contigs_tmp.fa"
exonerate_file   = "#{output_folder}exonerate_tmp.tab"
primer_3_input   = "#{output_folder}primer_3_input_temp"
primer_3_output  = "#{output_folder}primer_3_output_temp"
exons_filename   = "#{output_folder}exons_genes_and_contigs.fa"
output_primers   = "#{output_folder}primers.csv"

primer_3_config = File.expand_path('../conf/primer3_config', File.dirname(__FILE__))
model = "est2genome"

min_identity = 92
snps = []

# 0. Load the fasta index (only when a reference was given).
fasta_reference_db = nil
if fasta_reference
  fasta_reference_db = Bio::DB::Fasta::FastaFile.new({ :fasta => fasta_reference })
  fasta_reference_db.load_fai_entries
  p "Fasta reference: #{fasta_reference}"
end
|
50
|
+
|
51
|
+
|
52
|
+
#1. Read all the SNP files
|
53
|
+
#All the SNPs should be on the same chromosome as the first SNP.
|
54
|
+
# Parse one SNP per input line. With a single command-line argument the line
# is parsed by SNPSequence; with two arguments it is parsed by SNP and the
# template sequence is fetched from the FASTA reference. The chromosome of
# the first SNP is remembered and every later SNP must match it.
chromosome = nil
File.foreach(test_file) do |line|
  snp = case ARGV.size
        when 1 # List with Sequence
          Bio::PolyploidTools::SNPSequence.parse(line)
        when 2 # List and fasta file
          parsed = Bio::PolyploidTools::SNP.parse(line)
          region = fasta_reference_db.index.region_for_entry(parsed.gene).get_full_region
          parsed.template_sequence = fasta_reference_db.fetch_sequence(region)
          parsed
        else
          raise Bio::DB::Exonerate::ExonerateException, "Wrong number of arguments. "
        end
  raise Bio::DB::Exonerate::ExonerateException, "No SNP for line '#{line}'" if snp.nil?

  snp.snp_in = snp_in
  snp.original_name = original_name
  snps << snp

  chromosome ||= snp.chromosome
  raise Bio::DB::Exonerate::ExonerateException, "All the snps should come from the same chromosome" if chromosome != snp.chromosome
end
|
76
|
+
|
77
|
+
#1.1 Close fasta file
|
78
|
+
#fasta_reference_db.close() if fasta_reference_db
|
79
|
+
#2. Generate all the fasta files
|
80
|
+
|
81
|
+
# Write the query FASTA: one record per gene, taken from the first SNP that
# mentions the gene.
written_seqs = Set.new
file = File.open(temp_fasta_query, "w")
snps.each do |snp|
  # Set#add? returns nil when the gene was already recorded.
  file.puts snp.to_fasta if written_seqs.add?(snp.gene)
end
file.close
|
90
|
+
|
91
|
+
#3. Run exonerate on each of the possible chromosomes for the SNP
|
92
|
+
# Align the query genes against every contig file of the chromosome group.
# Contig files are the entries of path_to_contigs matching
# "<first character of the chromosome>*.fa". Alignments above min_identity
# are written to exonerate_file and the contigs they hit to temp_contigs.
puts chromosome
raise Bio::DB::Exonerate::ExonerateException, "No SNPs were read from '#{test_file}'" if chromosome.nil?

chr_group = chromosome[0]

# A contig hit by several alignments is written once only, so the contigs
# FASTA never holds duplicated records.
written_contigs = Set.new

# Block form: both output files are closed even if exonerate or the FASTA
# lookup raises half-way through.
File.open(exonerate_file, "w") do |exo_f|
  File.open(temp_contigs, "w") do |contigs_f|
    Dir.foreach(path_to_contigs) do |filename|
      next unless File.fnmatch("#{chr_group}*.fa", filename)

      puts filename
      target = "#{path_to_contigs}/#{filename}"

      fasta_file = Bio::DB::Fasta::FastaFile.new({ :fasta => target })
      fasta_file.load_fai_entries
      Bio::DB::Exonerate.align({ :query => temp_fasta_query, :target => target, :model => model }) do |aln|
        next unless aln.identity > min_identity

        exo_f.puts aln.line
        next unless written_contigs.add?(aln.target_id)

        entry = fasta_file.index.region_for_entry(aln.target_id)
        # A missing index entry used to surface as a NoMethodError on nil.
        raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly." if entry.nil?

        seq = fasta_file.fetch_sequence(entry.get_full_region)
        contigs_f.puts(">#{aln.target_id}\n#{seq}")
      end
    end
  end
end
|
118
|
+
|
119
|
+
#4. Load all the results from exonerate and get the input filename for primer3
|
120
|
+
#Custom arm selection function that only uses the first two characters. Maybe
|
121
|
+
#we want to make it a bit more cleaver
|
122
|
+
# Custom arm selection: the arm is the first two characters of the contig
# name. Maybe we want to make it a bit more clever.
arm_selection = ->(contig_name) { contig_name[0, 2] }

# Gather gene models, contigs, both parental lines, the SNPs and the
# exonerate alignments in one container.
container = Bio::PolyploidTools::ExonContainer.new
container.flanking_size = 100
container.gene_models(temp_fasta_query)
container.chromosomes(temp_contigs)
[snp_in, original_name].each do |parental_name|
  container.add_parental({ :name => parental_name })
end
snps.each do |snp|
  snp.container = container
  snp.flanking_size = container.flanking_size
  container.add_snp(snp)
end
container.add_alignments({ :exonerate_file => exonerate_file, :arm_selection => arm_selection })

# Dump the exons around each SNP as FASTA.
file = File.open(exons_filename, "w")
container.print_fasta_snp_exones(file)
file.close
|
143
|
+
|
144
|
+
# Write the primer3 input: global settings first, then the records produced
# by the container for this chromosome and parental line.
file = File.open(primer_3_input, "w")
[
  "PRIMER_PRODUCT_SIZE_RANGE=50-150",
  "PRIMER_MAX_SIZE=25",
  "PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1",
  "PRIMER_LIBERAL_BASE=1",
  "PRIMER_NUM_RETURN=5",
  "PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/"
].each { |setting| file.puts(setting) }
container.print_primer_3_exons(file, chromosome, snp_in)
file.close

Bio::DB::Primer3.run({ :in => primer_3_input, :out => primer_3_output })

# 5. Pick the best primer and make the primer3 output
kasp_container = Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1 = snp_in
kasp_container.line_2 = original_name

snps.each { |snp| kasp_container.add_snp(snp) }

kasp_container.add_primers_file(primer_3_output)

# CSV header; the parental line names are used as column names.
header = [
  "Marker", "SNP", "RegionSize", "SNP_type",
  snp_in, original_name, "common",
  "primer_type", "orientation",
  "#{snp_in}_TM", "#{original_name}_TM", "common_TM",
  "selected_from", "product_size"
].join(",")
File.open(output_primers, 'w') { |out| out.write("#{header}\n#{kasp_container.print_primers}") }
|
168
|
+
|