bio-polymarker 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +24 -0
- data/Gemfile +23 -0
- data/README.md +205 -0
- data/Rakefile +61 -0
- data/SECURITY.md +16 -0
- data/VERSION +1 -0
- data/bin/bfr.rb +128 -0
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/count_variations.rb +36 -0
- data/bin/filter_blat_by_target_coverage.rb +69 -0
- data/bin/filter_exonerate_by_identity.rb +38 -0
- data/bin/find_best_blat_hit.rb +33 -0
- data/bin/find_best_exonerate.rb +17 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +168 -0
- data/bin/homokaryot_primers.rb +183 -0
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/map_markers_to_contigs.rb +66 -0
- data/bin/marker_to_vcf.rb +241 -0
- data/bin/markers_in_region.rb +42 -0
- data/bin/mask_triads.rb +169 -0
- data/bin/polymarker.rb +410 -0
- data/bin/polymarker_capillary.rb +443 -0
- data/bin/polymarker_deletions.rb +350 -0
- data/bin/snp_position_to_polymarker.rb +101 -0
- data/bin/snps_between_bams.rb +107 -0
- data/bin/tag_stats.rb +75 -0
- data/bin/vcfLineToTable.rb +56 -0
- data/bin/vcfToPolyMarker.rb +82 -0
- data/bio-polymarker.gemspec +227 -0
- data/conf/defaults.rb +1 -0
- data/conf/primer3_config/dangle.dh +128 -0
- data/conf/primer3_config/dangle.ds +128 -0
- data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
- data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
- data/conf/primer3_config/interpretations/loops_i.dh +34 -0
- data/conf/primer3_config/interpretations/loops_i.ds +31 -0
- data/conf/primer3_config/interpretations/stack_i.dh +257 -0
- data/conf/primer3_config/interpretations/stack_i.ds +256 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
- data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
- data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
- data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
- data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
- data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
- data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
- data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
- data/conf/primer3_config/loops.dh +30 -0
- data/conf/primer3_config/loops.ds +30 -0
- data/conf/primer3_config/stack.dh +256 -0
- data/conf/primer3_config/stack.ds +256 -0
- data/conf/primer3_config/stackmm.dh +256 -0
- data/conf/primer3_config/stackmm.ds +256 -0
- data/conf/primer3_config/tetraloop.dh +77 -0
- data/conf/primer3_config/tetraloop.ds +77 -0
- data/conf/primer3_config/triloop.dh +16 -0
- data/conf/primer3_config/triloop.ds +16 -0
- data/conf/primer3_config/tstack.dh +256 -0
- data/conf/primer3_config/tstack2.dh +256 -0
- data/conf/primer3_config/tstack2.ds +256 -0
- data/conf/primer3_config/tstack_tm_inf.ds +256 -0
- data/lib/bio/BFRTools.rb +465 -0
- data/lib/bio/BIOExtensions.rb +153 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
- data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
- data/lib/bio/PolyploidTools/Marker.rb +175 -0
- data/lib/bio/PolyploidTools/Mask.rb +116 -0
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
- data/lib/bio/PolyploidTools/SNP.rb +804 -0
- data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
- data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
- data/lib/bio/db/blast.rb +114 -0
- data/lib/bio/db/exonerate.rb +333 -0
- data/lib/bio/db/primer3.rb +820 -0
- data/lib/bio-polymarker.rb +28 -0
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_marker_test.txt +1 -0
- data/test/data/BS00068396_51.fa +2 -0
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.aln +1412 -0
- data/test/data/BS00068396_51_contigs.dnd +7 -0
- data/test/data/BS00068396_51_contigs.fa +8 -0
- data/test/data/BS00068396_51_contigs.fa.fai +4 -0
- data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
- data/test/data/BS00068396_51_contigs.fa.nin +0 -0
- data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_exonerate.tab +6 -0
- data/test/data/BS00068396_51_for_polymarker.txt +1 -0
- data/test/data/BS00068396_51_genes.txt +14 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/LIB1716.bam +0 -0
- data/test/data/LIB1716.bam.bai +0 -0
- data/test/data/LIB1719.bam +0 -0
- data/test/data/LIB1719.bam.bai +0 -0
- data/test/data/LIB1721.bam +0 -0
- data/test/data/LIB1721.bam.bai +0 -0
- data/test/data/LIB1722.bam +0 -0
- data/test/data/LIB1722.bam.bai +0 -0
- data/test/data/PST130_7067.csv +1 -0
- data/test/data/PST130_7067.fa +2 -0
- data/test/data/PST130_7067.fa.fai +1 -0
- data/test/data/PST130_7067.fa.ndb +0 -0
- data/test/data/PST130_7067.fa.nhr +0 -0
- data/test/data/PST130_7067.fa.nin +0 -0
- data/test/data/PST130_7067.fa.not +0 -0
- data/test/data/PST130_7067.fa.nsq +0 -0
- data/test/data/PST130_7067.fa.ntf +0 -0
- data/test/data/PST130_7067.fa.nto +0 -0
- data/test/data/PST130_reverse_primer.csv +1 -0
- data/test/data/S22380157.fa +16 -0
- data/test/data/S22380157.fa.fai +1 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/Test3Aspecific.csv +2 -0
- data/test/data/Test3Aspecific_contigs.fa +6 -0
- data/test/data/bfr_out_test.csv +5 -0
- data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
- data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
- data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
- data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
- data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/data/patological_cases5D.csv +1 -0
- data/test/data/primer_3_input_header_test +5 -0
- data/test/data/short_primer_design_test.csv +10 -0
- data/test/data/some_tests/some_tests.csv +201 -0
- data/test/data/test_from_mutant.csv +3 -0
- data/test/data/test_iselect.csv +196 -0
- data/test/data/test_iselect_reference.fa +1868 -0
- data/test/data/test_iselect_reference.fa.fai +934 -0
- data/test/data/test_primer3_error.csv +4 -0
- data/test/data/test_primer3_error_contigs.fa +10 -0
- data/test/test_bfr.rb +135 -0
- data/test/test_blast.rb +47 -0
- data/test/test_exon_container.rb +17 -0
- data/test/test_exonearate.rb +48 -0
- data/test/test_integration.rb +76 -0
- data/test/test_snp_parsing.rb +121 -0
- data/test/test_wrong_selection.sh +5 -0
- metadata +356 -0
@@ -0,0 +1,192 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
options = {}
|
11
|
+
options[:identity] = 50
|
12
|
+
options[:min_bases] = 200
|
13
|
+
options[:split_token] = "-"
|
14
|
+
options[:tmp_folder] = Dir.mktmpdir
|
15
|
+
options[:program] = "blastn"
|
16
|
+
options[:random_sample] = 0
|
17
|
+
options[:cut_promoter_length] = 0
|
18
|
+
options[:reverse] = true
|
19
|
+
|
20
|
+
OptionParser.new do |opts|
|
21
|
+
|
22
|
+
opts.banner = "Usage: filter_blat.rb [options]"
|
23
|
+
|
24
|
+
opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
|
25
|
+
options[:identity] = o.to_f
|
26
|
+
end
|
27
|
+
opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
|
28
|
+
options[:min_bases] = o.to_i
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
|
32
|
+
options[:triads] = o
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
|
36
|
+
options[:fasta] = o
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
|
40
|
+
options[:split_token] = o
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
|
44
|
+
options[:program] = o
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
|
48
|
+
options[:random_sample] = o.to_i
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
|
52
|
+
options[:cut_promoter_length] = o.to_i
|
53
|
+
end
|
54
|
+
|
55
|
+
opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
|
56
|
+
if o == 'T'
|
57
|
+
options[:reverse] = true
|
58
|
+
elsif o == 'F'
|
59
|
+
options[:reverse] = false
|
60
|
+
else
|
61
|
+
$stderr.puts "Invalid option for reverse (should be T or F)"
|
62
|
+
exit -1
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end.parse!
|
66
|
+
|
67
|
+
|
68
|
+
# Aligns one FASTA file against another with a BLAST+ program and reports the
# longest HSP found in the XML output.
#
# The command is executed without a shell (argument-vector form of `system`),
# so paths containing spaces or shell metacharacters are passed through intact.
#
# @param path_a   [String] FASTA file used as `-query`
# @param path_b   [String] FASTA file used as `-subject`
# @param out_path [String] file where the BLAST XML (`-outfmt 5`) is written
# @param program  [String] BLAST executable; also passed as `-task`
# @return [Array(Integer, Float)] alignment length of the longest HSP and its
#   percentage identity; `[0, 0.0]` when there is no HSP or the command failed
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  cmd = [program.to_s,
         "-query", path_a.to_s,
         "-subject", path_b.to_s,
         "-task", program.to_s,
         "-out", out_path.to_s,
         "-outfmt", "5"]

  # `system` returns false on a non-zero exit and nil when the program cannot
  # be started. In both cases out_path may still hold the XML of a previous
  # pair, so it must not be parsed.
  unless system(*cmd)
    $stderr.puts "BLAST command failed: #{cmd.join(' ')}"
    return [0, 0.0]
  end

  max_length = 0
  max_pident = 0.0
  Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
    iter.each do |hit|
      hit.each do |hsp|
        # Strictly greater: on ties the first HSP seen is kept.
        next unless hsp.align_len > max_length
        max_length = hsp.align_len
        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
      end
    end
  end
  [max_length, max_pident]
end
|
90
|
+
|
91
|
+
valid_pairs_A_B = Hash.new
|
92
|
+
valid_pairs_A_D = Hash.new
|
93
|
+
valid_pairs_B_D = Hash.new
|
94
|
+
|
95
|
+
split_token = options[:split_token]
|
96
|
+
|
97
|
+
sequences = Hash.new
|
98
|
+
sequence_count=0
|
99
|
+
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
|
100
|
+
fasta_file.each do |entry|
|
101
|
+
gene_name = entry.entry_id.split(split_token)[0]
|
102
|
+
seq = entry.naseq
|
103
|
+
seq.reverse_complement! if options[:reverse]
|
104
|
+
seq = seq[0,options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
|
105
|
+
entry.data = seq
|
106
|
+
sequences[gene_name] = entry unless sequences[gene_name]
|
107
|
+
sequences[gene_name] = entry if entry.length > sequences[gene_name].length
|
108
|
+
sequence_count += 1
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
|
113
|
+
#FileUtils.mkdir_p(options[:tmp_folder])
|
114
|
+
$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
115
|
+
|
116
|
+
a_tmp = options[:tmp_folder] + "/A.fa"
|
117
|
+
b_tmp = options[:tmp_folder] + "/B.fa"
|
118
|
+
d_tmp = options[:tmp_folder] + "/D.fa"
|
119
|
+
out_tmp = options[:tmp_folder] + "/out.blast"
|
120
|
+
|
121
|
+
|
122
|
+
puts [
|
123
|
+
"group_id" , "query" , "subject" ,
|
124
|
+
"chr_query", "chr_subject", "aln_type",
|
125
|
+
"length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")
|
126
|
+
|
127
|
+
count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
|
128
|
+
|
129
|
+
probability = options[:random_sample] / count_lines.to_f
|
130
|
+
probability = 1 if options[:random_sample] == 0
|
131
|
+
prng = Random.new
|
132
|
+
#puts probability
|
133
|
+
prom_len = options[:cut_promoter_length]
|
134
|
+
# For every triad row: write the available A/B/D sequences to the temporary
# FASTA files, copy them into the triad's output folder, BLAST each available
# pair (A->B, A->D, B->D) and print one tab-separated line per pair matching
# the header printed above.
CSV.foreach(options[:triads], headers: true) do |row|
  triad = row['group_id'].to_i
  # Integer division: triads are grouped in folders of 100 consecutive ids.
  triad_folder = triad / 100

  # Random sub-sampling. A probability of 1 or more (no sampling requested, or
  # more samples requested than rows available) runs every triad.
  next unless probability >= 1 || prng.rand < probability

  save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
  FileUtils.mkdir_p save_folder

  # Collect, per genome, the gene name, the temporary FASTA path and the
  # number of N bases. Genomes without a loaded sequence are left out.
  homoeologs = {}
  [['A', a_tmp], ['B', b_tmp], ['D', d_tmp]].each do |genome, tmp_path|
    gene_name = row[genome]
    entry = sequences[gene_name]
    next unless entry

    File.open(tmp_path, 'w') { |f| f.write(entry) }
    FileUtils.cp(tmp_path, save_folder)
    homoeologs[genome] = { name: gene_name, path: tmp_path, ns: entry.seq.count('Nn') }
  end

  [%w[A B], %w[A D], %w[B D]].each do |query_genome, subject_genome|
    query = homoeologs[query_genome]
    subject = homoeologs[subject_genome]
    next unless query && subject

    # The query/subject columns carry the names of the genes actually aligned
    # in this pair (previously every pair printed the A and B names).
    to_print = [triad, query[:name], subject[:name],
                query_genome, subject_genome, "#{query_genome}->#{subject_genome}"]
    to_print.concat(blast_pair_fast(query[:path], subject[:path], out_tmp, program: options[:program]))
    to_print << query[:ns]
    to_print << subject[:ns]
    to_print << query[:ns] + subject[:ns]
    FileUtils.cp(out_tmp, "#{save_folder}/#{query_genome}_#{subject_genome}.xml")
    puts to_print.join("\t")
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bio'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'pathname'
|
6
|
+
require 'bio-samtools-wrapper'
|
7
|
+
|
8
|
+
require 'set'
|
9
|
+
|
10
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
11
|
+
$: << File.expand_path('.')
|
12
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
13
|
+
require path
|
14
|
+
|
15
|
+
puts ARGV[0]
|
16
|
+
|
17
|
+
fasta_db = Bio::DB::Fasta::FastaFile.new( {:fasta=>ARGV[0]})
|
18
|
+
fasta_db.load_fai_entries
|
19
|
+
bam1 = Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
|
20
|
+
|
21
|
+
fasta_db.index.entries.each do | r |
|
22
|
+
#Np r.get_full_region
|
23
|
+
#container.process_region( { :region => r.get_full_region.to_s, :output_file => output_file } )
|
24
|
+
region=r.get_full_region
|
25
|
+
|
26
|
+
|
27
|
+
|
28
|
+
cons_1 = bam1.consensus_with_ambiguities({:region=>region, :case=>true})
|
29
|
+
|
30
|
+
snps = cons_1.count_ambiguities
|
31
|
+
|
32
|
+
snps_per_1k = (1000 * snps.to_f ) / region.size
|
33
|
+
|
34
|
+
puts "#{r.id}\t#{region.size}\t#{snps}\t#{snps_per_1k}\n#{cons_1}"
|
35
|
+
|
36
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'optparse'
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
7
|
+
require path
|
8
|
+
module Bio
  class Blat
    # Report variant that yields hits one line at a time instead of loading
    # the whole PSL text first.
    class StreamedReport < Report

      # Yields a Hit for every data line of +text+.
      #
      # Lines are treated as header until either a line starting with a digit
      # is seen (headerless data; that line is itself yielded as a hit) or a
      # separator line made only of dashes is seen (hits start on the next
      # line). +text+ only needs to respond to +each_line+.
      def self.each_hit(text = '')
        reading_hits = false

        text.each_line do |line|
          # for headerless data
          reading_hits = true if !reading_hits && /^\d/ =~ line

          if reading_hits
            yield Hit.new(line)
          elsif /\A\-+\s*\z/ =~ line.chomp
            reading_hits = true
          end
        end
      end
    end
  end
end
|
37
|
+
|
38
|
+
|
39
|
+
#blat_file=ARGV[0]
|
40
|
+
|
41
|
+
# Default thresholds; both are percentages and can be overridden on the
# command line.
options = {}
options[:identity] = 95
options[:covered] = 60
OptionParser.new do |opts|

  opts.banner = "Usage: filter_blat_by_target_coverage.rb [options]"

  opts.on("-p", "--psl FILE", "PSL file") do |o|
    # Keep the path exactly as given: changing its case breaks the lookup on
    # case-sensitive file systems.
    options[:blat_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!


blat_file = options[:blat_file]

# Print, unchanged, every hit that reaches both thresholds.
Bio::Blat::StreamedReport.each_hit(Bio::FlatFile.open(blat_file).to_io) do |hit|
  if hit.percentage_covered >= options[:covered] && hit.percent_identity >= options[:identity]
    puts hit.data.join("\t")
  end
end
|
68
|
+
|
69
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'optparse'
|
4
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
5
|
+
$: << File.expand_path('.')
|
6
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
7
|
+
require path
|
8
|
+
|
9
|
+
# Default thresholds; both are percentages and can be overridden on the
# command line.
options = {}
options[:identity] = 95
options[:covered] = 90
OptionParser.new do |opts|

  opts.banner = "Usage: filter_exonerate_by_identity.rb [options]"

  opts.on("-e", "--exo FILE", "Exonerate alignment produced by polymarker or with the following ryo: 'RESULT:\\t%S\\t%pi\\t%ql\\t%tl\\t%g\\t%V\\n'") do |o|
    # Keep the path exactly as given: changing its case breaks the lookup on
    # case-sensitive file systems.
    options[:exo_file] = o
  end
  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--covered FLOAT", "Minimum percentage coverage") do |o|
    options[:covered] = o.to_f
  end

end.parse!


exo_file = options[:exo_file]
min_identity = options[:identity]
min_coverage = options[:covered]

# Echo every alignment line whose identity and query coverage are both
# strictly above the thresholds.
File.foreach(exo_file) do |line|
  aln = Bio::DB::Exonerate::Alignment.parse_custom(line)
  if aln.identity > min_identity && aln.query_coverage > min_coverage
    puts aln.line
  end
end
|
38
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
|
4
|
+
# Reads a BLAT report and keeps, for every query, the hit with the highest
# score.
#
# @param blat_filename [String] path of the BLAT output file
# @param best_aln [Hash] updated in place; maps query id => best hit so far.
#   Entries already present compete with the hits read from this file.
def load_blat_alignments(blat_filename, best_aln)
  blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(blat_filename).to_io)
  blat_aln.each_hit do |hit|
    current_name = hit.query_id
    best = best_aln[current_name]
    # Strictly greater: on equal scores the hit seen first is kept.
    best_aln[current_name] = hit if best.nil? || hit.score > best.score
  end
end
|
24
|
+
|
25
|
+
blat_file=ARGV[0]
|
26
|
+
best_aln = Hash.new
|
27
|
+
|
28
|
+
load_blat_alignments( blat_file,best_aln)
|
29
|
+
puts "QUERY\tTARGET"
|
30
|
+
best_aln.each do |k, hit|
|
31
|
+
#puts "#{k}\t#{hit.target_id}"
|
32
|
+
puts hit.data.join("\t")
|
33
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
|
4
|
+
found_cointigs = Set.new
|
5
|
+
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model, :chunk=>chunk, :total_chunks=>}) do |aln|
|
6
|
+
if aln.identity > min_identity
|
7
|
+
exo_f.puts aln.line
|
8
|
+
unless found_cointigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
9
|
+
found_cointigs.add(aln.target_id)
|
10
|
+
entry = fasta_file.index.region_for_entry(aln.target_id)
|
11
|
+
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
|
12
|
+
region = entry.get_full_region
|
13
|
+
seq = fasta_file.fetch_sequence(region)
|
14
|
+
contigs_f.puts(">#{aln.target_id}\n#{seq}")
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
#$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
6
|
+
#$: << File.expand_path('.')
|
7
|
+
#path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
8
|
+
#require path
|
9
|
+
|
10
|
+
options = {}
|
11
|
+
options[:identity] = 50
|
12
|
+
options[:min_bases] = 200
|
13
|
+
options[:blastx] = "-"
|
14
|
+
|
15
|
+
OptionParser.new do |opts|
|
16
|
+
|
17
|
+
opts.banner = "Usage: filter_blat.rb [options]"
|
18
|
+
|
19
|
+
opts.on("-p", "--blastx FILE", "BLAST XML file") do |o|
|
20
|
+
options[:blastx] = o
|
21
|
+
end
|
22
|
+
opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
|
23
|
+
options[:identity] = o.to_f
|
24
|
+
end
|
25
|
+
opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
|
26
|
+
options[:min_bases] = o.to_i
|
27
|
+
end
|
28
|
+
|
29
|
+
opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
|
30
|
+
options[:triads] = o
|
31
|
+
end
|
32
|
+
|
33
|
+
end.parse!
|
34
|
+
|
35
|
+
valid_pairs_A_B = Hash.new
|
36
|
+
valid_pairs_A_D = Hash.new
|
37
|
+
valid_pairs_B_D = Hash.new
|
38
|
+
|
39
|
+
CSV.foreach(options[:triads], headers:true ) do |row|
|
40
|
+
valid_pairs_A_B[row['A']] = row['B']
|
41
|
+
valid_pairs_A_D[row['A']] = row['D']
|
42
|
+
valid_pairs_B_D[row['B']] = row['D']
|
43
|
+
end
|
44
|
+
|
45
|
+
# Read the BLAST report from the file given with -p, or from ARGF when the
# option is left at its "-" default. File.open is required here: IO.open
# expects a numeric file descriptor, not a path.
read_from_file = options[:blastx] != "-"
stream = read_from_file ? File.open(options[:blastx]) : ARGF
puts "Loaded #{valid_pairs_B_D.length} triads"
$stdout.flush

blast_report = Bio::FlatFile.new(Bio::Blast::Report, stream)

blast_report.each_entry do |report|
  puts "Hits for " + report.query_def + " against " + report.db
  $stdout.flush
  report.each do |hit|
    # Gene names are the part of the sequence id before the first "-".
    query = hit.query_id.split("-")[0]
    target = hit.target_id.split("-")[0]
    # Only report hits between members of the same triad.
    if valid_pairs_A_B[query] == target || valid_pairs_A_D[query] == target || valid_pairs_B_D[query] == target
      # One tab-separated line per significant hit (passing the fields as
      # separate arguments to puts would print each on its own line).
      puts "#{hit.target_id}\t#{hit.evalue}" if hit.evalue < 0.001
      puts hit.inspect
    end

  end
end

# Only close what this script opened.
stream.close if read_from_file
|
@@ -0,0 +1,168 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'pathname'
|
5
|
+
require 'bio-samtools-wrapper'
|
6
|
+
|
7
|
+
require 'set'
|
8
|
+
|
9
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
10
|
+
$: << File.expand_path('.')
|
11
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
|
12
|
+
require path
|
13
|
+
|
14
|
+
|
15
|
+
#TODO: Use temporary files somewhere in the file system and add traps to delete them/forward them as a result.
|
16
|
+
#TODO: Make all this parameters
|
17
|
+
path_to_contigs="/Users/ramirezr/Documents/PHD/201305_Databases/iwgcs"
|
18
|
+
#path_to_contigs=path_to_chromosomes
|
19
|
+
snp_in="A"
|
20
|
+
original_name="B"
|
21
|
+
fasta_reference = nil
|
22
|
+
#test_file="/Users/ramirezr/Dropbox/JIC/PrimersToTest/test_primers_nick_and_james_1.csv"
|
23
|
+
test_file=ARGV[0]
|
24
|
+
fasta_reference = ARGV[1] if ARGV[1]
|
25
|
+
output_folder="#{test_file}_primer_design_#{Time.now.strftime('%Y%m%d-%H%M%S')}/"
|
26
|
+
Dir.mkdir(output_folder)
|
27
|
+
#TODO Make this tmp files
|
28
|
+
temp_fasta_query="#{output_folder}to_align.fa"
|
29
|
+
temp_contigs="#{output_folder}contigs_tmp.fa"
|
30
|
+
exonerate_file="#{output_folder}exonerate_tmp.tab"
|
31
|
+
primer_3_input="#{output_folder}primer_3_input_temp"
|
32
|
+
primer_3_output="#{output_folder}primer_3_output_temp"
|
33
|
+
exons_filename="#{output_folder}exons_genes_and_contigs.fa"
|
34
|
+
output_primers="#{output_folder}primers.csv"
|
35
|
+
|
36
|
+
primer_3_config=File.expand_path(File.dirname(__FILE__) + '/../conf/primer3_config')
|
37
|
+
model="est2genome"
|
38
|
+
|
39
|
+
|
40
|
+
min_identity= 92
|
41
|
+
snps = Array.new
|
42
|
+
|
43
|
+
#0. Load the fasta index
|
44
|
+
fasta_reference_db = nil
|
45
|
+
if fasta_reference
|
46
|
+
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
47
|
+
fasta_reference_db.load_fai_entries
|
48
|
+
p "Fasta reference: #{fasta_reference}"
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
#1. Read all the SNP files
|
53
|
+
#All the SNPs should be on the same chromosome as the first SNP.
|
54
|
+
chromosome = nil
|
55
|
+
File.open(test_file) do | f |
|
56
|
+
f.each_line do | line |
|
57
|
+
# p line.chomp!
|
58
|
+
snp = nil
|
59
|
+
if ARGV.size == 1 #List with Sequence
|
60
|
+
snp = Bio::PolyploidTools::SNPSequence.parse(line)
|
61
|
+
elsif ARGV.size == 2 #List and fasta file
|
62
|
+
snp = Bio::PolyploidTools::SNP.parse(line)
|
63
|
+
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
64
|
+
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
65
|
+
else
|
66
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
67
|
+
end
|
68
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
69
|
+
snp.snp_in = snp_in
|
70
|
+
snp.original_name = original_name
|
71
|
+
snps << snp
|
72
|
+
chromosome = snp.chromosome unless chromosome
|
73
|
+
raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
#1.1 Close fasta file
|
78
|
+
#fasta_reference_db.close() if fasta_reference_db
|
79
|
+
#2. Generate all the fasta files
|
80
|
+
|
81
|
+
written_seqs = Set.new
|
82
|
+
file = File.open(temp_fasta_query, "w")
|
83
|
+
snps.each do |snp|
|
84
|
+
unless written_seqs.include?(snp.gene)
|
85
|
+
written_seqs << snp.gene
|
86
|
+
file.puts snp.to_fasta
|
87
|
+
end
|
88
|
+
end
|
89
|
+
file.close
|
90
|
+
|
91
|
+
#3. Run exonerate on each of the possible chromosomes for the SNP
|
92
|
+
puts chromosome
|
93
|
+
chr_group = chromosome[0]
|
94
|
+
# Align the query sequences against every "<chr_group>*.fa" file found in
# path_to_contigs. Alignments above min_identity are written to exonerate_file
# and the full sequence of each matched contig is written to temp_contigs.
# The block form of File.open closes both files even if an alignment raises.
found_contigs = Set.new
File.open(exonerate_file, "w") do |exo_f|
  File.open(temp_contigs, "w") do |contigs_f|
    Dir.foreach(path_to_contigs) do |filename|
      #puts filename
      next unless File.fnmatch("#{chr_group}*.fa", filename)

      puts filename
      target = "#{path_to_contigs}/#{filename}"

      fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
      fasta_file.load_fai_entries
      Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
        next unless aln.identity > min_identity

        exo_f.puts aln.line
        # Each contig is written once, even when several alignments hit it, so
        # the contigs FASTA does not contain repeated records.
        next unless found_contigs.add?(aln.target_id)

        entry = fasta_file.index.region_for_entry(aln.target_id)
        if entry.nil?
          raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{target}.fai was generated properly."
        end
        seq = fasta_file.fetch_sequence(entry.get_full_region)
        contigs_f.puts(">#{aln.target_id}\n#{seq}")
      end
    end
  end
end
|
118
|
+
|
119
|
+
#4. Load all the results from exonerate and get the input filename for primer3
|
120
|
+
#Custom arm selection function that only uses the first two characters. Maybe
|
121
|
+
#we want to make it a bit more clever
|
122
|
+
# Returns the first two characters of a contig name; passed below as the
# :arm_selection callback when loading the exonerate alignments.
arm_selection = ->(contig_name) { contig_name[0, 2] }
|
126
|
+
|
127
|
+
container= Bio::PolyploidTools::ExonContainer.new
|
128
|
+
container.flanking_size=100
|
129
|
+
container.gene_models(temp_fasta_query)
|
130
|
+
container.chromosomes(temp_contigs)
|
131
|
+
container.add_parental({:name=>snp_in})
|
132
|
+
container.add_parental({:name=>original_name})
|
133
|
+
snps.each do |snp|
|
134
|
+
snp.container = container
|
135
|
+
snp.flanking_size = container.flanking_size
|
136
|
+
container.add_snp(snp)
|
137
|
+
end
|
138
|
+
container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection})
|
139
|
+
|
140
|
+
file = File.open(exons_filename, "w")
|
141
|
+
container.print_fasta_snp_exones(file)
|
142
|
+
file.close
|
143
|
+
|
144
|
+
file = File.open(primer_3_input, "w")
|
145
|
+
file.puts("PRIMER_PRODUCT_SIZE_RANGE=50-150")
|
146
|
+
file.puts("PRIMER_MAX_SIZE=25")
|
147
|
+
file.puts("PRIMER_LIB_AMBIGUITY_CODES_CONSENSUS=1")
|
148
|
+
file.puts("PRIMER_LIBERAL_BASE=1")
|
149
|
+
file.puts("PRIMER_NUM_RETURN=5")
|
150
|
+
file.puts("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=#{primer_3_config}/")
|
151
|
+
container.print_primer_3_exons(file, chromosome,snp_in)
|
152
|
+
file.close
|
153
|
+
|
154
|
+
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output})
|
155
|
+
|
156
|
+
#5. Pick the best primer and make the primer3 output
|
157
|
+
kasp_container=Bio::DB::Primer3::KASPContainer.new
|
158
|
+
kasp_container.line_1=snp_in
|
159
|
+
kasp_container.line_2=original_name
|
160
|
+
|
161
|
+
snps.each do |snp|
|
162
|
+
kasp_container.add_snp(snp)
|
163
|
+
end
|
164
|
+
|
165
|
+
kasp_container.add_primers_file(primer_3_output)
|
166
|
+
header = "Marker,SNP,RegionSize,SNP_type,#{snp_in},#{original_name},common,primer_type,orientation,#{snp_in}_TM,#{original_name}_TM,common_TM,selected_from,product_size"
|
167
|
+
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
168
|
+
|