bio-polyploid-tools 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
data/bin/polymarker.rb
CHANGED
@@ -14,6 +14,7 @@ arm_selection_functions = Hash.new;
|
|
14
14
|
|
15
15
|
|
16
16
|
# Arm selection that labels a contig with the first two characters of its
# name once every "chr" occurrence has been removed (e.g. "chr1A_x" -> "1A").
#
# contig_name - String with the contig/scaffold identifier.
#
# Returns the two-character String (shorter if the stripped name is shorter).
arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
  # Non-destructive gsub: the previous gsub! rewrote the caller's string in
  # place (and would raise on a frozen string) just to compute a label.
  contig_name.gsub(/chr/, "")[0, 2]
end
|
@@ -43,7 +44,6 @@ arm_selection_functions[:scaffold] = lambda do | contig_name |
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def validate_files(o)
|
46
|
-
|
47
47
|
[
|
48
48
|
o[:path_to_contigs],
|
49
49
|
o[:marker_list],
|
@@ -51,7 +51,7 @@ def validate_files(o)
|
|
51
51
|
o[:mutant_list],
|
52
52
|
o[:reference]
|
53
53
|
].flatten.compact.each do |f|
|
54
|
-
raise IOError "Unable to read #{f}" unless File.exists? f
|
54
|
+
raise IOError.new "Unable to read #{f}" unless File.exists? f
|
55
55
|
end
|
56
56
|
end
|
57
57
|
|
@@ -67,6 +67,10 @@ options[:variation_free_region] = 0
|
|
67
67
|
options[:extract_found_contigs] = false
|
68
68
|
options[:genomes_count] = 3
|
69
69
|
options[:min_identity] = 90
|
70
|
+
options[:scoring] = :genome_specific
|
71
|
+
options[:database] = false
|
72
|
+
options[:aligner] = :exonerate
|
73
|
+
|
70
74
|
|
71
75
|
options[:primer_3_preferences] = {
|
72
76
|
:primer_product_size_range => "50-150" ,
|
@@ -119,7 +123,19 @@ OptionParser.new do |opts|
|
|
119
123
|
end
|
120
124
|
|
121
125
|
opts.on("-a", "--arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two|scaffold", "Function to decide the chromome arm") do |o|
|
122
|
-
|
126
|
+
tmp_str = o
|
127
|
+
arr = o.split(",")
|
128
|
+
if arr.size == 2
|
129
|
+
options[:arm_selection] = lambda do |contig_name|
|
130
|
+
separator, field = arr
|
131
|
+
field = field.to_i
|
132
|
+
ret = contig_name.split(separator)[field]
|
133
|
+
return ret
|
134
|
+
end
|
135
|
+
else
|
136
|
+
options[:arm_selection] = arm_selection_functions[o.to_sym];
|
137
|
+
end
|
138
|
+
|
123
139
|
end
|
124
140
|
|
125
141
|
opts.on("-p", "--primer_3_preferences FILE", "file with preferences to be sent to primer3") do |o|
|
@@ -139,12 +155,26 @@ OptionParser.new do |opts|
|
|
139
155
|
options[:primers_to_order] = true
|
140
156
|
end
|
141
157
|
|
142
|
-
|
158
|
+
opts.on("-H", "--het_dels", "If present, change the scoring to give priority to: semi-specific, specific, non-specific") do
|
159
|
+
options[:scoring] = :het_dels
|
160
|
+
end
|
161
|
+
|
162
|
+
opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: exonerate") do |o|
|
163
|
+
raise "Invalid aligner" unless o == "exonerate" or o == "blast"
|
164
|
+
options[:aligner] = o.to_sym
|
165
|
+
end
|
166
|
+
|
167
|
+
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
168
|
+
options[:database] = o
|
169
|
+
end
|
143
170
|
end.parse!
|
144
171
|
|
145
172
|
|
146
173
|
validate_files(options)
|
147
174
|
|
175
|
+
options[:database] = options[:path_to_contigs] unless options[:database]
|
176
|
+
|
177
|
+
|
148
178
|
if options[:primer_3_preferences][:primer_product_size_range]
|
149
179
|
range = options[:primer_3_preferences][:primer_product_size_range]
|
150
180
|
range_arr = range.split("-")
|
@@ -208,7 +238,7 @@ fasta_reference_db = nil
|
|
208
238
|
if fasta_reference
|
209
239
|
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
210
240
|
fasta_reference_db.load_fai_entries
|
211
|
-
|
241
|
+
write_status "Fasta reference: #{fasta_reference}"
|
212
242
|
end
|
213
243
|
|
214
244
|
#1. Read all the SNP files
|
@@ -239,9 +269,9 @@ File.open(test_file) do | f |
|
|
239
269
|
write_status "WARN: Unable to find entry for #{snp.gene}"
|
240
270
|
end
|
241
271
|
else
|
242
|
-
|
272
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
243
273
|
end
|
244
|
-
|
274
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
245
275
|
|
246
276
|
snp.genomes_count = options[:genomes_count]
|
247
277
|
snp.snp_in = snp_in
|
@@ -251,9 +281,6 @@ File.open(test_file) do | f |
|
|
251
281
|
else
|
252
282
|
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
|
253
283
|
end
|
254
|
-
|
255
|
-
# chromosome = snp.chromosome unless chromosome
|
256
|
-
# raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
|
257
284
|
end
|
258
285
|
end
|
259
286
|
|
@@ -278,26 +305,43 @@ write_status "Searching markers in genome"
|
|
278
305
|
exo_f = File.open(exonerate_file, "w")
|
279
306
|
contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
|
280
307
|
filename=path_to_contigs
|
281
|
-
puts filename
|
308
|
+
#puts filename
|
282
309
|
target=filename
|
283
310
|
|
284
311
|
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
285
312
|
fasta_file.load_fai_entries
|
286
313
|
|
287
314
|
found_contigs = Set.new
|
288
|
-
|
315
|
+
|
316
|
+
|
317
|
+
# Records one alignment hit and, optionally, the sequence of the contig it hit.
#
# aln           - alignment record; must respond to identity, line and target_id.
# exo_f         - open IO where every accepted alignment line is written.
# found_contigs - Set of target ids already seen; updated in place so each
#                 contig is looked up (and extracted) only once.
# min_identity  - hits with identity <= this value are ignored.
# fasta_file    - indexed fasta of the contigs (uses index.region_for_entry
#                 and fetch_sequence).
# options       - Hash of script options; reads :extract_found_contigs and
#                 :path_to_contigs.
# contigs_f     - open IO that receives the extracted contigs. Required only
#                 when options[:extract_found_contigs] is set.
#
# Raises Bio::DB::Exonerate::ExonerateException if the target is missing from
# the fasta index, and ArgumentError if extraction was requested without an
# output file.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options, contigs_f = nil)
  return unless aln.identity > min_identity

  exo_f.puts aln.line
  # We only add once each contig. Should reduce the size of the output file.
  return if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  entry = fasta_file.index.region_for_entry(aln.target_id)
  if entry.nil?
    # The message used to interpolate an undefined local (target_id), which
    # turned this error into a NameError; use the contigs path instead.
    raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{options[:path_to_contigs]}.fai was generated properly."
  end
  return unless options[:extract_found_contigs]

  # Top-level locals are not visible inside a method body, so the output
  # file has to be handed in explicitly.
  raise ArgumentError, "contigs_f is required when :extract_found_contigs is set" if contigs_f.nil?

  region = entry.get_full_region
  seq = fasta_file.fetch_sequence(region)
  contigs_f.puts(">#{aln.target_id}\n#{seq}")
end

# Only one of the two aligners runs, depending on options[:aligner].
Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
  do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options, contigs_f)
end if options[:aligner] == :blast

Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
  do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options, contigs_f)
end if options[:aligner] == :exonerate
|
341
|
+
|
342
|
+
exo_f.close()
|
343
|
+
|
344
|
+
|
301
345
|
|
302
346
|
exo_f.close()
|
303
347
|
contigs_f.close() if options[:extract_found_contigs]
|
@@ -314,6 +358,7 @@ container.gene_models(temp_fasta_query)
|
|
314
358
|
container.chromosomes(target)
|
315
359
|
container.add_parental({:name=>snp_in})
|
316
360
|
container.add_parental({:name=>original_name})
|
361
|
+
|
317
362
|
snps.each do |snp|
|
318
363
|
snp.container = container
|
319
364
|
snp.flanking_size = container.flanking_size
|
@@ -337,15 +382,26 @@ file.close
|
|
337
382
|
|
338
383
|
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
339
384
|
|
340
|
-
|
341
385
|
#5. Pick the best primer and make the primer3 output
|
342
386
|
write_status "Selecting best primers"
|
343
387
|
kasp_container=Bio::DB::Primer3::KASPContainer.new
|
388
|
+
|
389
|
+
|
390
|
+
|
344
391
|
kasp_container.line_1= original_name
|
345
392
|
kasp_container.line_2= snp_in
|
346
393
|
|
394
|
+
if options[:scoring] == :het_dels
|
395
|
+
kasp_container.scores = Hash.new
|
396
|
+
kasp_container.scores[:chromosome_specific] = 0
|
397
|
+
kasp_container.scores[:chromosome_semispecific] = 1000
|
398
|
+
kasp_container.scores[:chromosome_nonspecific] = 100
|
399
|
+
end
|
400
|
+
|
347
401
|
snps.each do |snp|
|
348
|
-
kasp_container.add_snp(snp)
|
402
|
+
snpk = kasp_container.add_snp(snp)
|
403
|
+
|
404
|
+
|
349
405
|
end
|
350
406
|
|
351
407
|
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
|
@@ -0,0 +1,416 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'bio-samtools'
|
4
|
+
require 'pathname'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
8
|
+
$: << File.expand_path('.')
|
9
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
|
10
|
+
require path
|
11
|
+
|
12
|
+
def log(msg)
|
13
|
+
time=Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
|
14
|
+
puts "#{time}: #{msg}"
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
#reference='wheat_6x_ty_mm_mutations_10mutants_for_validations/scaffolds_with_mm.fa'
|
20
|
+
#markers='wheat_6x_ty_mm_mutations_10mutants_for_validations/CadMulitMap.fa'
|
21
|
+
#output_folder='wheat_6x_ty_mm_mutations_10mutants_for_validations/PolyMarker'
|
22
|
+
|
23
|
+
options = Hash.new
|
24
|
+
|
25
|
+
options[:primer_3_preferences] = {
|
26
|
+
:primer_product_size_range => "100-900" ,
|
27
|
+
:primer_max_size => 25 ,
|
28
|
+
:primer_lib_ambiguity_codes_consensus => 1,
|
29
|
+
:primer_liberal_base => 1,
|
30
|
+
:primer_min_left_three_prime_distance => 5,
|
31
|
+
:primer_min_right_three_prime_distance => 5,
|
32
|
+
:primer_num_return =>1,
|
33
|
+
:primer_explain_flag => 1,
|
34
|
+
:primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
|
35
|
+
}
|
36
|
+
options[:genomes_count] = 3
|
37
|
+
options[:allow_non_specific] = false
|
38
|
+
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = "Usage: polymarker_capillary.rb [options]"
|
41
|
+
|
42
|
+
opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
|
43
|
+
options[:reference] = o
|
44
|
+
end
|
45
|
+
|
46
|
+
opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
|
47
|
+
options[:markers] = o
|
48
|
+
end
|
49
|
+
|
50
|
+
opts.on("-o", "--output_folder FOLDER", "Path to a folder where the outputs are going to be stored") do |o|
|
51
|
+
options[:output_folder] = o
|
52
|
+
end
|
53
|
+
opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
|
54
|
+
options[:genomes_count] = o.to_i
|
55
|
+
end
|
56
|
+
opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
|
57
|
+
options[:allow_non_specific] = true
|
58
|
+
end
|
59
|
+
|
60
|
+
end.parse!
|
61
|
+
|
62
|
+
|
63
|
+
#puts options.inspect
|
64
|
+
reference = options[:reference]
|
65
|
+
markers = options[:markers]
|
66
|
+
output_folder = options[:output_folder]
|
67
|
+
allow_non_specific = options[:allow_non_specific]
|
68
|
+
log "Output folder: #{output_folder}"
|
69
|
+
exonerate_file="#{output_folder}/exonerate_tmp.tab"
|
70
|
+
Dir.mkdir(output_folder)
|
71
|
+
|
72
|
+
module Bio::PolyploidTools
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
class SequenceToAmplify < SNP
|
77
|
+
|
78
|
+
# Derives a chromosome label from a contig name split on "_".
#
# * three or more fields: first two characters of the third field
#   (e.g. "IWGSC_CSS_1AL_scaff_110" -> "1A")
# * exactly two fields with the first one equal to "v443": "3B"
# * a single field: its first two characters (e.g. "1A" -> "1A")
# * anything else: "U"
#
# contig_name - String with the contig/scaffold identifier.
#
# Returns the label as a String.
def self.select_chromosome(contig_name)
  arr = contig_name.split('_')
  return arr[2][0, 2] if arr.size >= 3
  return "3B" if arr.size == 2 && arr[0] == "v443"
  return arr[0][0, 2] if arr.size == 1

  "U"
end
|
87
|
+
|
88
|
+
attr_accessor :sequence_original
|
89
|
+
attr_accessor :rstart
|
90
|
+
attr_accessor :rend
|
91
|
+
attr_accessor :includeNoSpecific
|
92
|
+
# Builds a SequenceToAmplify from a fasta entry.
#
# Format:
# A fasta entry with the id: contig:start-end
# The sequence can be produced with samtools faidx
#
# fasta_entry - object responding to definition (the fasta id line) and seq.
#
# The marker is always placed at index 100 of the sequence, the allele in
# "B" (snp_in) is the complement of the base found there, and the chromosome
# is derived from the contig part of the id with select_chromosome.
# NOTE(review): position is fixed at 100 while primer_3_all_strings only
# accepts a primer region whose snp_pos is 500 - confirm the two agree.
#
# Returns the new SequenceToAmplify.
# Raises ArgumentError when the id does not look like contig:start-end.
def self.parse(fasta_entry)
  definition = fasta_entry.definition
  match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(definition)
  # Fail with an explicit message instead of a NoMethodError on nil further
  # down; callers rescuing StandardError still skip the entry.
  raise ArgumentError, "Invalid sequence id '#{definition}', expected contig:start-end" unless match_data

  snp = SequenceToAmplify.new
  snp.gene = definition
  snp.chromosome = select_chromosome(match_data[:rname])
  snp.sequence_original = fasta_entry.seq
  snp.template_sequence = fasta_entry.seq.upcase
  snp.snp_in = "B"
  snp.rstart = match_data[:rstart].to_i
  snp.rend = match_data[:rend].to_i

  snp.position = 100
  snp.original = snp.sequence_original[snp.position]
  snp.snp = Bio::Sequence::NA.new(snp.original).complement
  snp.exon_list = Hash.new
  snp
end
|
123
|
+
|
124
|
+
# Builds every primer3 input record for this sequence.
#
# target_chromosome, parental - forwarded unchanged to primer_region.
#
# Chromosome-specific positions (exonic and intronic) more than 50 bases to
# the left and to the right of the marker are paired through
# prepareLRPrimers. When includeNoSpecific is set and either side has no
# specific position, the same is done with the "almost" chromosome-specific
# positions and one extra non-specific record (SEQUENCE_TARGET=350,400) is
# appended.
#
# Returns an Array of primer3 records; it is empty when the region is
# shorter than primer_3_min_seq_length or the marker is not at index 500.
# NOTE(review): 500 presumably mirrors the container flanking size - confirm.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental )
  primer_3_propertes = Array.new

  seq_original = String.new(pr.sequence)
  return primer_3_propertes if seq_original.size < primer_3_min_seq_length
  return primer_3_propertes unless pr.snp_pos == 500

  seq_original[pr.snp_pos] = original
  snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"

  left_pos = Array.new
  right_pos = Array.new
  l_pos = pr.snp_pos
  # shuffle keeps the original behaviour of randomising the pairing order.
  pr.chromosome_specific.shuffle.each {|pos| left_pos << pos if pos < l_pos - 50 }
  pr.chromosome_specific.shuffle.each {|pos| right_pos << pos if pos > l_pos + 50 }

  pr.crhomosome_specific_intron.shuffle.each {|pos| left_pos << pos if pos < l_pos - 50 }
  pr.crhomosome_specific_intron.shuffle.each {|pos| right_pos << pos if pos > l_pos + 50 }

  prepareLRPrimers(left_pos, right_pos, "chromosome_specific" , snp_type, seq_original, primer_3_propertes)

  # No specific pair can be formed if EITHER side is empty. The condition
  # used to test right_pos twice, so an empty left side never triggered the
  # fallback.
  if includeNoSpecific && (left_pos.empty? || right_pos.empty?)
    left_pos = Array.new
    right_pos = Array.new
    pr.almost_chromosome_specific.each {|pos| left_pos << pos if pos < l_pos - 50 }
    pr.almost_chromosome_specific.each {|pos| right_pos << pos if pos > l_pos + 50 }

    pr.almost_crhomosome_specific_intron.each {|pos| left_pos << pos if pos < l_pos - 50 }
    pr.almost_crhomosome_specific_intron.each {|pos| right_pos << pos if pos > l_pos + 50 }

    prepareLRPrimers(left_pos, right_pos, "chromosome_semispecific" , snp_type, seq_original, primer_3_propertes)
    args = {
      :name =>"#{gene}:#{original}#{position}#{snp} #{original_name} chromosome_nonspecific exon #{snp_type} #{chromosome}",
      :left_pos => 350,
      :extra_f=>"SEQUENCE_TARGET=350,400\n",
      :extra_r=>"SEQUENCE_TARGET=350,400\n",
      :sequence=>seq_original}
    primer_3_propertes << return_primer_3_string(args)
  end
  primer_3_propertes
end
|
185
|
+
|
186
|
+
# Appends one primer3 record for every left/right position pair.
#
# left_pos, right_pos - Arrays of candidate positions on each side.
# type                - specificity label written into the record name
#                       (e.g. "chromosome_specific").
# snp_type            - "homoeologous" or "non-homoeologous", written into
#                       the record name.
# seq_original        - template sequence passed on as :sequence.
# primer_3_propertes  - Array that receives the records (modified in place).
#
# Nothing is appended when either position list is empty.
def prepareLRPrimers(left_pos, right_pos, type , snp_type, seq_original, primer_3_propertes)
  left_pos.each do |l|
    right_pos.each do |r|
      args = {:name =>"#{gene}:#{original}#{position}#{snp} #{original_name} #{type} exon #{snp_type} #{chromosome}",
        :left_pos => l,
        :right_pos => r,
        :sequence=>seq_original}

      primer_3_propertes << return_primer_3_string(args)
    end
  end
end
|
201
|
+
|
202
|
+
def parental_sequences
|
203
|
+
return @parental_sequences if @parental_sequences
|
204
|
+
gene_region = self.covered_region
|
205
|
+
local_pos_in_gene = self.position
|
206
|
+
|
207
|
+
@parental_sequences = Bio::Alignment::SequenceHash.new
|
208
|
+
container.parents.each do |name, bam|
|
209
|
+
seq = self.sequence_original.clone.downcase
|
210
|
+
|
211
|
+
if name == self.snp_in
|
212
|
+
#puts self.snp
|
213
|
+
seq[local_pos_in_gene] = self.snp
|
214
|
+
else
|
215
|
+
#puts self.original
|
216
|
+
seq[local_pos_in_gene] = self.original
|
217
|
+
end
|
218
|
+
seq[local_pos_in_gene] = seq[local_pos_in_gene].upcase
|
219
|
+
@parental_sequences [name] = seq
|
220
|
+
#puts name
|
221
|
+
#puts self.snp_in
|
222
|
+
#puts seq
|
223
|
+
end
|
224
|
+
@parental_sequences
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
|
229
|
+
|
230
|
+
snps = Array.new
|
231
|
+
file = Bio::FastaFormat.open(markers)
|
232
|
+
file.each do |entry|
|
233
|
+
|
234
|
+
begin
|
235
|
+
tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
|
236
|
+
snps << tmp if tmp
|
237
|
+
rescue
|
238
|
+
$stderr.puts "Unable to generate the marker for: #{entry.definition}"
|
239
|
+
end
|
240
|
+
|
241
|
+
end
|
242
|
+
file.close
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
exo_f = File.open(exonerate_file, "w")
|
247
|
+
target=reference
|
248
|
+
|
249
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
250
|
+
fasta_file.load_fai_entries
|
251
|
+
min_identity = 95
|
252
|
+
found_contigs = Set.new
|
253
|
+
|
254
|
+
# Align the markers against the reference and keep every hit whose identity
# is above min_identity. Each target contig is checked against the fasta
# index the first time it is seen.
Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
  next unless aln.identity > min_identity

  exo_f.puts aln.line
  # We only add once each contig. Should reduce the size of the output file.
  next if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  if fasta_file.index.region_for_entry(aln.target_id).nil?
    # Raise the library's own error class rather than the root Exception,
    # which a plain `rescue` would not catch.
    raise Bio::DB::Exonerate::ExonerateException, "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly."
  end
end
|
265
|
+
exo_f.close
|
266
|
+
|
267
|
+
arm_selection_functions = Hash.new
|
268
|
+
|
269
|
+
arm_selection_functions[:full_scaffold] = lambda do | contig_name |
|
270
|
+
return contig_name
|
271
|
+
end
|
272
|
+
|
273
|
+
#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
|
274
|
+
#Or the first two characters in the contig name, to deal with
|
275
|
+
#pseudomolecules that start with headers like: "1A"
|
276
|
+
#And with the cases when 3B is named with the prefix: v443
|
277
|
+
arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
|
278
|
+
|
279
|
+
arr = contig_name.split('_')
|
280
|
+
ret = "U"
|
281
|
+
ret = arr[2][0,2] if arr.size >= 3
|
282
|
+
ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
283
|
+
ret = arr[0][0,2] if arr.size == 1
|
284
|
+
return ret
|
285
|
+
end
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
container= Bio::PolyploidTools::ExonContainer.new
|
290
|
+
container.flanking_size=500
|
291
|
+
container.gene_models(markers)
|
292
|
+
container.chromosomes(target)
|
293
|
+
container.add_parental({:name=>"A"})
|
294
|
+
container.add_parental({:name=>"B"})
|
295
|
+
snps.each do |snp|
|
296
|
+
snp.snp_in = "B"
|
297
|
+
snp.container = container
|
298
|
+
snp.flanking_size = container.flanking_size
|
299
|
+
snp.genomes_count = options[:genomes_count]
|
300
|
+
snp.includeNoSpecific = allow_non_specific
|
301
|
+
container.add_snp(snp)
|
302
|
+
end
|
303
|
+
container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
|
304
|
+
|
305
|
+
|
306
|
+
|
307
|
+
exons_filename="#{output_folder}/localAlignment.fa"
|
308
|
+
file = File.open(exons_filename, "w")
|
309
|
+
container.print_fasta_snp_exones(file)
|
310
|
+
file.close
|
311
|
+
|
312
|
+
|
313
|
+
|
314
|
+
primer_3_input ="#{output_folder}/primer3_input.txt"
|
315
|
+
primer_3_output ="#{output_folder}/primer3_output.txt"
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
file = File.open(primer_3_input, "w")
|
320
|
+
snp_in="B"
|
321
|
+
Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
|
322
|
+
added_exons = container.print_primer_3_exons(file, nil, snp_in)
|
323
|
+
file.close
|
324
|
+
|
325
|
+
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
326
|
+
|
327
|
+
masks_output = "#{output_folder}/masks_designed.fa"
|
328
|
+
output_file = "#{output_folder}/primers.csv"
|
329
|
+
file = File.open(masks_output, "w")
|
330
|
+
out = File.open(output_file, "w")
|
331
|
+
|
332
|
+
class Bio::DB::Primer3::Primer3Record
|
333
|
+
attr_accessor :primerPairs
|
334
|
+
end
|
335
|
+
|
336
|
+
printed_counts = Hash.new(0)
|
337
|
+
Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
|
338
|
+
#puts primer3record.inspect
|
339
|
+
next if primer3record.primer_left_num_returned.to_i == 0
|
340
|
+
|
341
|
+
seq_id = primer3record.sequence_id
|
342
|
+
printed_counts[seq_id] += 1
|
343
|
+
next if printed_counts[seq_id] > 10
|
344
|
+
excluded = "-"
|
345
|
+
exArr = excluded.split(",")
|
346
|
+
st = exArr[0].to_i
|
347
|
+
ed = exArr[1].to_i
|
348
|
+
tot = ed + st
|
349
|
+
|
350
|
+
excluded="#{st}-#{tot}"
|
351
|
+
seq_len = primer3record.sequence_template.length
|
352
|
+
printed = 0
|
353
|
+
|
354
|
+
sequence_template = primer3record.sequence_template
|
355
|
+
sequence_mask = "-" * st
|
356
|
+
sequence_mask << "*" * ed
|
357
|
+
sequence_mask << "-" * (seq_len - sequence_mask.length)
|
358
|
+
|
359
|
+
file.puts ">#{seq_id}\n#{sequence_template}"
|
360
|
+
file.puts ">#{seq_id}:mask\n#{sequence_mask}"
|
361
|
+
#puts "FDFDS"
|
362
|
+
|
363
|
+
#puts primer3record.primerPairs
|
364
|
+
|
365
|
+
primer3record.primerPairs.each do |p|
|
366
|
+
#puts p.inspect
|
367
|
+
printed += 1
|
368
|
+
lArr = p.left.coordinates
|
369
|
+
lArr[1] = lArr[0] + lArr[1]
|
370
|
+
rArr = p.right.coordinates
|
371
|
+
rArr[1] = rArr[0] - rArr[1]
|
372
|
+
toPrint = Array.new
|
373
|
+
toPrint << seq_id.split(" ")
|
374
|
+
#toPrint << seq_len
|
375
|
+
toPrint << p.product_size
|
376
|
+
toPrint << lArr.join("-")
|
377
|
+
toPrint << p.left.tm
|
378
|
+
toPrint << p.left.sequence
|
379
|
+
toPrint << rArr.join("-")
|
380
|
+
toPrint << p.right.tm
|
381
|
+
toPrint << p.right.sequence
|
382
|
+
|
383
|
+
middle = 501
|
384
|
+
toPrint << lArr[0]
|
385
|
+
toPrint << rArr[0]
|
386
|
+
toPrint << middle - lArr[0]
|
387
|
+
toPrint << rArr[0] - middle
|
388
|
+
#Start End LeftDistance RightDistance
|
389
|
+
|
390
|
+
out.puts toPrint.join(",")
|
391
|
+
|
392
|
+
sequence_primers = sequence_mask.clone
|
393
|
+
a = lArr[0]
|
394
|
+
b = lArr[1] - 1
|
395
|
+
#puts sequence_template[a..b]
|
396
|
+
sequence_primers[a..b] = sequence_template[a..b]
|
397
|
+
b = rArr[0]
|
398
|
+
a = rArr[1] + 1
|
399
|
+
|
400
|
+
sequence_primers[a..b] = sequence_template[a..b]
|
401
|
+
|
402
|
+
file.puts ">#{seq_id}:primerPair:#{printed}\n#{sequence_primers}"
|
403
|
+
end
|
404
|
+
|
405
|
+
if printed == 0
|
406
|
+
toPrint = Array.new
|
407
|
+
toPrint << seq_id.split(" ")
|
408
|
+
toPrint << excluded
|
409
|
+
toPrint << seq_len
|
410
|
+
out.puts toPrint.join(",")
|
411
|
+
end
|
412
|
+
|
413
|
+
end
|
414
|
+
out.close
|
415
|
+
file.close
|
416
|
+
|