bio-polyploid-tools 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
data/bin/polymarker.rb
CHANGED
@@ -14,6 +14,7 @@ arm_selection_functions = Hash.new;
|
|
14
14
|
|
15
15
|
|
16
16
|
# Arm selector that uses the first two characters of the contig name, after
# removing every "chr" occurrence (so "chr1A_part1" and "1A_part1" both map to "1A").
# The name is copied rather than edited in place: the previous gsub! changed the
# caller's string and raised on frozen strings.
arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
  contig_name.gsub(/chr/, "")[0, 2]
end
|
@@ -43,7 +44,6 @@ arm_selection_functions[:scaffold] = lambda do | contig_name |
|
|
43
44
|
end
|
44
45
|
|
45
46
|
def validate_files(o)
|
46
|
-
|
47
47
|
[
|
48
48
|
o[:path_to_contigs],
|
49
49
|
o[:marker_list],
|
@@ -51,7 +51,7 @@ def validate_files(o)
|
|
51
51
|
o[:mutant_list],
|
52
52
|
o[:reference]
|
53
53
|
].flatten.compact.each do |f|
|
54
|
-
raise IOError "Unable to read #{f}" unless File.exists? f
|
54
|
+
raise IOError.new "Unable to read #{f}" unless File.exists? f
|
55
55
|
end
|
56
56
|
end
|
57
57
|
|
@@ -67,6 +67,10 @@ options[:variation_free_region] = 0
|
|
67
67
|
options[:extract_found_contigs] = false
|
68
68
|
options[:genomes_count] = 3
|
69
69
|
options[:min_identity] = 90
|
70
|
+
options[:scoring] = :genome_specific
|
71
|
+
options[:database] = false
|
72
|
+
options[:aligner] = :exonerate
|
73
|
+
|
70
74
|
|
71
75
|
options[:primer_3_preferences] = {
|
72
76
|
:primer_product_size_range => "50-150" ,
|
@@ -119,7 +123,19 @@ OptionParser.new do |opts|
|
|
119
123
|
end
|
120
124
|
|
121
125
|
opts.on("-a", "--arm_selection arm_selection_embl|arm_selection_morex|arm_selection_first_two|scaffold", "Function to decide the chromome arm") do |o|
|
122
|
-
|
126
|
+
tmp_str = o
|
127
|
+
arr = o.split(",")
|
128
|
+
if arr.size == 2
|
129
|
+
options[:arm_selection] = lambda do |contig_name|
|
130
|
+
separator, field = arr
|
131
|
+
field = field.to_i
|
132
|
+
ret = contig_name.split(separator)[field]
|
133
|
+
return ret
|
134
|
+
end
|
135
|
+
else
|
136
|
+
options[:arm_selection] = arm_selection_functions[o.to_sym];
|
137
|
+
end
|
138
|
+
|
123
139
|
end
|
124
140
|
|
125
141
|
opts.on("-p", "--primer_3_preferences FILE", "file with preferences to be sent to primer3") do |o|
|
@@ -139,12 +155,26 @@ OptionParser.new do |opts|
|
|
139
155
|
options[:primers_to_order] = true
|
140
156
|
end
|
141
157
|
|
142
|
-
|
158
|
+
opts.on("-H", "--het_dels", "If present, change the scoring to give priority to: semi-specific, specific, non-specific") do
|
159
|
+
options[:scoring] = :het_dels
|
160
|
+
end
|
161
|
+
|
162
|
+
opts.on("-A", "--aligner exonerate|blast", "Select the aligner to use. Default: exonerate") do |o|
|
163
|
+
raise "Invalid aligner" unless o == "exonerate" or o == "blast"
|
164
|
+
options[:aligner] = o.to_sym
|
165
|
+
end
|
166
|
+
|
167
|
+
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
168
|
+
options[:database] = o
|
169
|
+
end
|
143
170
|
end.parse!
|
144
171
|
|
145
172
|
|
146
173
|
validate_files(options)
|
147
174
|
|
175
|
+
options[:database] = options[:path_to_contigs] unless options[:database]
|
176
|
+
|
177
|
+
|
148
178
|
if options[:primer_3_preferences][:primer_product_size_range]
|
149
179
|
range = options[:primer_3_preferences][:primer_product_size_range]
|
150
180
|
range_arr = range.split("-")
|
@@ -208,7 +238,7 @@ fasta_reference_db = nil
|
|
208
238
|
if fasta_reference
|
209
239
|
fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
|
210
240
|
fasta_reference_db.load_fai_entries
|
211
|
-
|
241
|
+
write_status "Fasta reference: #{fasta_reference}"
|
212
242
|
end
|
213
243
|
|
214
244
|
#1. Read all the SNP files
|
@@ -239,9 +269,9 @@ File.open(test_file) do | f |
|
|
239
269
|
write_status "WARN: Unable to find entry for #{snp.gene}"
|
240
270
|
end
|
241
271
|
else
|
242
|
-
|
272
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
243
273
|
end
|
244
|
-
|
274
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
245
275
|
|
246
276
|
snp.genomes_count = options[:genomes_count]
|
247
277
|
snp.snp_in = snp_in
|
@@ -251,9 +281,6 @@ File.open(test_file) do | f |
|
|
251
281
|
else
|
252
282
|
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
|
253
283
|
end
|
254
|
-
|
255
|
-
# chromosome = snp.chromosome unless chromosome
|
256
|
-
# raise Bio::DB::Exonerate::ExonerateException.new "All the snps should come from the same chromosome" if chromosome != snp.chromosome
|
257
284
|
end
|
258
285
|
end
|
259
286
|
|
@@ -278,26 +305,43 @@ write_status "Searching markers in genome"
|
|
278
305
|
exo_f = File.open(exonerate_file, "w")
|
279
306
|
contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
|
280
307
|
filename=path_to_contigs
|
281
|
-
puts filename
|
308
|
+
#puts filename
|
282
309
|
target=filename
|
283
310
|
|
284
311
|
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
285
312
|
fasta_file.load_fai_entries
|
286
313
|
|
287
314
|
found_contigs = Set.new
|
288
|
-
|
315
|
+
|
316
|
+
|
317
|
+
# Records a single alignment hit if it is above the identity threshold.
#
# aln::           alignment; must respond to +identity+, +line+ and +target_id+.
# exo_f::         IO that receives the raw alignment line of every accepted hit.
# found_contigs:: Set of target ids already seen; updated in place so each
#                 contig is validated (and optionally extracted) only once.
# min_identity::  hits with identity lower than or equal to this are ignored.
# fasta_file::    indexed fasta; must respond to +index.region_for_entry+ and
#                 +fetch_sequence+.
# options::       Hash; only +:extract_found_contigs+ is read here.
# contigs_f::     IO that receives the extracted contig sequences. Optional, but
#                 required when +options[:extract_found_contigs]+ is set, because
#                 a top-level local of the script is not visible inside a method.
#
# Returns nil.
# Raises Bio::DB::Exonerate::ExonerateException when the target id is missing
# from the fasta index, and ArgumentError when extraction is requested without
# an output handle.
def do_align(aln, exo_f, found_contigs, min_identity, fasta_file, options, contigs_f = nil)
  return unless aln.identity > min_identity

  exo_f.puts aln.line

  # We only add each contig once. Should reduce the size of the output file.
  return if found_contigs.include?(aln.target_id)

  found_contigs.add(aln.target_id)
  entry = fasta_file.index.region_for_entry(aln.target_id)
  if entry.nil?
    raise Bio::DB::Exonerate::ExonerateException,
          "Entry not found! #{aln.target_id}. Make sure that the .fai index of the contigs file was generated properly."
  end

  return unless options[:extract_found_contigs]

  if contigs_f.nil?
    raise ArgumentError, "contigs_f is required when options[:extract_found_contigs] is set"
  end

  region = entry.get_full_region
  seq = fasta_file.fetch_sequence(region)
  contigs_f.puts(">#{aln.target_id}\n#{seq}")
end
|
333
|
+
|
334
|
+
Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
|
335
|
+
do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
|
336
|
+
end if options[:aligner] == :blast
|
337
|
+
|
338
|
+
Bio::DB::Exonerate.align({:query=>temp_fasta_query, :target=>target, :model=>model}) do |aln|
|
339
|
+
do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
340
|
+
end if options[:aligner] == :exonerate
|
341
|
+
|
342
|
+
exo_f.close()
|
343
|
+
|
344
|
+
|
301
345
|
|
302
346
|
exo_f.close()
|
303
347
|
contigs_f.close() if options[:extract_found_contigs]
|
@@ -314,6 +358,7 @@ container.gene_models(temp_fasta_query)
|
|
314
358
|
container.chromosomes(target)
|
315
359
|
container.add_parental({:name=>snp_in})
|
316
360
|
container.add_parental({:name=>original_name})
|
361
|
+
|
317
362
|
snps.each do |snp|
|
318
363
|
snp.container = container
|
319
364
|
snp.flanking_size = container.flanking_size
|
@@ -337,15 +382,26 @@ file.close
|
|
337
382
|
|
338
383
|
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
339
384
|
|
340
|
-
|
341
385
|
#5. Pick the best primer and make the primer3 output
|
342
386
|
write_status "Selecting best primers"
|
343
387
|
kasp_container=Bio::DB::Primer3::KASPContainer.new
|
388
|
+
|
389
|
+
|
390
|
+
|
344
391
|
kasp_container.line_1= original_name
|
345
392
|
kasp_container.line_2= snp_in
|
346
393
|
|
394
|
+
if options[:scoring] == :het_dels
|
395
|
+
kasp_container.scores = Hash.new
|
396
|
+
kasp_container.scores[:chromosome_specific] = 0
|
397
|
+
kasp_container.scores[:chromosome_semispecific] = 1000
|
398
|
+
kasp_container.scores[:chromosome_nonspecific] = 100
|
399
|
+
end
|
400
|
+
|
347
401
|
snps.each do |snp|
|
348
|
-
kasp_container.add_snp(snp)
|
402
|
+
snpk = kasp_container.add_snp(snp)
|
403
|
+
|
404
|
+
|
349
405
|
end
|
350
406
|
|
351
407
|
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
|
@@ -0,0 +1,416 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'bio'
|
3
|
+
require 'bio-samtools'
|
4
|
+
require 'pathname'
|
5
|
+
require 'optparse'
|
6
|
+
|
7
|
+
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
|
8
|
+
$: << File.expand_path('.')
|
9
|
+
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
|
10
|
+
require path
|
11
|
+
|
12
|
+
# Prints +msg+ on standard output, prefixed with the current time
# (millisecond precision) and a colon.
def log(msg)
  stamp = Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  $stdout.puts "#{stamp}: #{msg}"
end
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
#reference='wheat_6x_ty_mm_mutations_10mutants_for_validations/scaffolds_with_mm.fa'
|
20
|
+
#markers='wheat_6x_ty_mm_mutations_10mutants_for_validations/CadMulitMap.fa'
|
21
|
+
#output_folder='wheat_6x_ty_mm_mutations_10mutants_for_validations/PolyMarker'
|
22
|
+
|
23
|
+
options = Hash.new
|
24
|
+
|
25
|
+
options[:primer_3_preferences] = {
|
26
|
+
:primer_product_size_range => "100-900" ,
|
27
|
+
:primer_max_size => 25 ,
|
28
|
+
:primer_lib_ambiguity_codes_consensus => 1,
|
29
|
+
:primer_liberal_base => 1,
|
30
|
+
:primer_min_left_three_prime_distance => 5,
|
31
|
+
:primer_min_right_three_prime_distance => 5,
|
32
|
+
:primer_num_return =>1,
|
33
|
+
:primer_explain_flag => 1,
|
34
|
+
:primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
|
35
|
+
}
|
36
|
+
options[:genomes_count] = 3
|
37
|
+
options[:allow_non_specific] = false
|
38
|
+
|
39
|
+
OptionParser.new do |opts|
|
40
|
+
opts.banner = "Usage: polymarker_capillary.rb [options]"
|
41
|
+
|
42
|
+
opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
|
43
|
+
options[:reference] = o
|
44
|
+
end
|
45
|
+
|
46
|
+
opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
|
47
|
+
options[:markers] = o
|
48
|
+
end
|
49
|
+
|
50
|
+
opts.on("-o", "--output_folder FOLDER", "Path to a folder where the outputs are going to be stored") do |o|
|
51
|
+
options[:output_folder] = o
|
52
|
+
end
|
53
|
+
opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
|
54
|
+
options[:genomes_count] = o.to_i
|
55
|
+
end
|
56
|
+
opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
|
57
|
+
options[:allow_non_specific] = true
|
58
|
+
end
|
59
|
+
|
60
|
+
end.parse!
|
61
|
+
|
62
|
+
|
63
|
+
#puts options.inspect
|
64
|
+
reference = options[:reference]
|
65
|
+
markers = options[:markers]
|
66
|
+
output_folder = options[:output_folder]
|
67
|
+
allow_non_specific = options[:allow_non_specific]
|
68
|
+
log "Output folder: #{output_folder}"
|
69
|
+
exonerate_file="#{output_folder}/exonerate_tmp.tab"
|
70
|
+
Dir.mkdir(output_folder)
|
71
|
+
|
72
|
+
module Bio::PolyploidTools
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
class SequenceToAmplify < SNP
|
77
|
+
|
78
|
+
# Derives the chromosome label from a contig name split on "_":
# * three or more fields: first two characters of the third field
#   (e.g. "IWGSC_CSS_1AL_scaff_110" gives "1A");
# * exactly two fields with the prefix "v443": "3B";
# * a single field: its first two characters;
# * anything else: "U".
def self.select_chromosome(contig_name)
  fields = contig_name.split('_')
  case fields.size
  when 0 then "U"
  when 1 then fields[0][0, 2]
  when 2 then fields[0] == "v443" ? "3B" : "U"
  else fields[2][0, 2]
  end
end
|
87
|
+
|
88
|
+
attr_accessor :sequence_original
|
89
|
+
attr_accessor :rstart
|
90
|
+
attr_accessor :rend
|
91
|
+
attr_accessor :includeNoSpecific
|
92
|
+
#Format:
|
93
|
+
#A fasta entry with the id: contig:start-end
|
94
|
+
#The sequence can be produced with samtools faidx
|
95
|
+
# Builds a SequenceToAmplify from a fasta entry whose definition has the form
# "contig:start-end" (the header style produced by samtools faidx).
#
# fasta_entry:: object responding to +definition+ and +seq+.
#
# Returns the populated SequenceToAmplify.
# Raises ArgumentError when the definition does not contain "name:start-end";
# previously this case surfaced as a NoMethodError on nil.
def self.parse(fasta_entry)
  match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
  unless match_data
    raise ArgumentError, "Unable to parse '#{fasta_entry.definition}': expected an id like contig:start-end"
  end

  snp = SequenceToAmplify.new
  snp.gene = fasta_entry.definition
  snp.chromosome = select_chromosome(match_data[:rname])
  snp.sequence_original = fasta_entry.seq
  snp.template_sequence = fasta_entry.seq.upcase
  snp.snp_in = "B"
  snp.rstart = match_data[:rstart].to_i
  snp.rend = match_data[:rend].to_i

  # The marker is pinned to a fixed offset inside the sequence.
  # NOTE(review): sequences with 100 bases or fewer yield a nil base here - confirm
  # that callers always supply longer sequences.
  snp.position = 100
  snp.original = snp.sequence_original[snp.position]

  # The alternative allele is the complement of the base found at the marker.
  snp.snp = Bio::Sequence::NA.new(snp.original).complement
  snp.exon_list = Hash.new
  snp
end
|
123
|
+
|
124
|
+
# Builds the primer3 input records for the region around this sequence's marker.
#
# Chromosome-specific positions lying more than 50 bases to the left and to the
# right of the marker are paired first. When +includeNoSpecific+ is set and
# either side has no chromosome-specific candidate, the semi-specific positions
# are paired as well and one non-specific record with the fixed target
# "350,400" is appended.
#
# target_chromosome, parental:: forwarded unchanged to +primer_region+.
#
# Returns an Array of primer3 record strings. It is empty when the region is
# shorter than +primer_3_min_seq_length+ or the marker is not at offset 500.
def primer_3_all_strings(target_chromosome, parental)
  pr = primer_region(target_chromosome, parental)
  primer_3_propertes = []

  seq_original = String.new(pr.sequence)
  return primer_3_propertes if seq_original.size < primer_3_min_seq_length
  return primer_3_propertes unless pr.snp_pos == 500

  seq_original[pr.snp_pos] = original
  snp_type = pr.homoeologous ? "homoeologous" : "non-homoeologous"
  l_pos = pr.snp_pos

  # Candidates are shuffled so the pairs are emitted in random order.
  left_pos = []
  right_pos = []
  [pr.chromosome_specific, pr.crhomosome_specific_intron].each do |positions|
    left_pos.concat(positions.shuffle.select { |pos| pos < l_pos - 50 })
    right_pos.concat(positions.shuffle.select { |pos| pos > l_pos + 50 })
  end
  prepareLRPrimers(left_pos, right_pos, "chromosome_specific", snp_type, seq_original, primer_3_propertes)

  # Fall back when EITHER side lacks a specific candidate: no specific pair can
  # be formed in that case. (The previous check tested the right side twice.)
  if includeNoSpecific && (left_pos.empty? || right_pos.empty?)
    left_pos = []
    right_pos = []
    [pr.almost_chromosome_specific, pr.almost_crhomosome_specific_intron].each do |positions|
      left_pos.concat(positions.select { |pos| pos < l_pos - 50 })
      right_pos.concat(positions.select { |pos| pos > l_pos + 50 })
    end
    prepareLRPrimers(left_pos, right_pos, "chromosome_semispecific", snp_type, seq_original, primer_3_propertes)

    args = {
      :name => "#{gene}:#{original}#{position}#{snp} #{original_name} chromosome_nonspecific exon #{snp_type} #{chromosome}",
      :left_pos => 350,
      :extra_f => "SEQUENCE_TARGET=350,400\n",
      :extra_r => "SEQUENCE_TARGET=350,400\n",
      :sequence => seq_original
    }
    primer_3_propertes << return_primer_3_string(args)
  end

  primer_3_propertes
end
|
185
|
+
|
186
|
+
# Appends one primer3 record to +primer_3_propertes+ for every combination of a
# left and a right candidate position; left positions form the outer loop.
#
# left_pos, right_pos:: candidate positions on each side of the marker.
# type::                specificity label embedded in the record name.
# snp_type::            homoeology label embedded in the record name.
# seq_original::        template sequence passed on to +return_primer_3_string+.
# primer_3_propertes::  Array that collects the generated records.
#
# Returns +left_pos+.
def prepareLRPrimers(left_pos, right_pos, type, snp_type, seq_original, primer_3_propertes)
  left_pos.each do |left|
    right_pos.each do |right|
      request = {
        :name => "#{gene}:#{original}#{position}#{snp} #{original_name} #{type} exon #{snp_type} #{chromosome}",
        :left_pos => left,
        :right_pos => right,
        :sequence => seq_original
      }
      primer_3_propertes << return_primer_3_string(request)
    end
  end
end
|
201
|
+
|
202
|
+
# Returns (and memoizes) one copy of the original sequence per parent listed in
# +container.parents+. Each copy is lower-cased except for the marker position,
# which holds the upper-cased +snp+ base for the parent named +snp_in+ and the
# upper-cased +original+ base for every other parent.
def parental_sequences
  return @parental_sequences if @parental_sequences

  covered_region # return value is unused; the call is retained so behavior is unchanged
  marker_index = position

  @parental_sequences = Bio::Alignment::SequenceHash.new
  container.parents.each do |name, _bam|
    seq = sequence_original.clone.downcase
    seq[marker_index] = (name == snp_in ? snp : original)
    seq[marker_index] = seq[marker_index].upcase
    @parental_sequences[name] = seq
  end
  @parental_sequences
end
|
226
|
+
end
|
227
|
+
end
|
228
|
+
|
229
|
+
|
230
|
+
snps = Array.new
|
231
|
+
file = Bio::FastaFormat.open(markers)
|
232
|
+
file.each do |entry|
|
233
|
+
|
234
|
+
begin
|
235
|
+
tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
|
236
|
+
snps << tmp if tmp
|
237
|
+
rescue
|
238
|
+
$stderr.puts "Unable to generate the marker for: #{entry.definition}"
|
239
|
+
end
|
240
|
+
|
241
|
+
end
|
242
|
+
file.close
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
exo_f = File.open(exonerate_file, "w")
|
247
|
+
target=reference
|
248
|
+
|
249
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
|
250
|
+
fasta_file.load_fai_entries
|
251
|
+
min_identity = 95
|
252
|
+
found_contigs = Set.new
|
253
|
+
|
254
|
+
Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
|
255
|
+
if aln.identity > min_identity
|
256
|
+
exo_f.puts aln.line
|
257
|
+
#puts aln.line
|
258
|
+
unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
259
|
+
found_contigs.add(aln.target_id)
|
260
|
+
entry = fasta_file.index.region_for_entry(aln.target_id)
|
261
|
+
raise Exception.new, "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly." if entry == nil
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
265
|
+
exo_f.close
|
266
|
+
|
267
|
+
arm_selection_functions = Hash.new
|
268
|
+
|
269
|
+
arm_selection_functions[:full_scaffold] = lambda do | contig_name |
|
270
|
+
return contig_name
|
271
|
+
end
|
272
|
+
|
273
|
+
#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
|
274
|
+
#Or the first two characters in the contig name, to deal with
|
275
|
+
#pseudomolecules that start with headers like: "1A"
|
276
|
+
#And with the cases when 3B is named with the prefix: v443
|
277
|
+
arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
|
278
|
+
|
279
|
+
arr = contig_name.split('_')
|
280
|
+
ret = "U"
|
281
|
+
ret = arr[2][0,2] if arr.size >= 3
|
282
|
+
ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
283
|
+
ret = arr[0][0,2] if arr.size == 1
|
284
|
+
return ret
|
285
|
+
end
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
container= Bio::PolyploidTools::ExonContainer.new
|
290
|
+
container.flanking_size=500
|
291
|
+
container.gene_models(markers)
|
292
|
+
container.chromosomes(target)
|
293
|
+
container.add_parental({:name=>"A"})
|
294
|
+
container.add_parental({:name=>"B"})
|
295
|
+
snps.each do |snp|
|
296
|
+
snp.snp_in = "B"
|
297
|
+
snp.container = container
|
298
|
+
snp.flanking_size = container.flanking_size
|
299
|
+
snp.genomes_count = options[:genomes_count]
|
300
|
+
snp.includeNoSpecific = allow_non_specific
|
301
|
+
container.add_snp(snp)
|
302
|
+
end
|
303
|
+
container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
|
304
|
+
|
305
|
+
|
306
|
+
|
307
|
+
exons_filename="#{output_folder}/localAlignment.fa"
|
308
|
+
file = File.open(exons_filename, "w")
|
309
|
+
container.print_fasta_snp_exones(file)
|
310
|
+
file.close
|
311
|
+
|
312
|
+
|
313
|
+
|
314
|
+
primer_3_input ="#{output_folder}/primer3_input.txt"
|
315
|
+
primer_3_output ="#{output_folder}/primer3_output.txt"
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
file = File.open(primer_3_input, "w")
|
320
|
+
snp_in="B"
|
321
|
+
Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
|
322
|
+
added_exons = container.print_primer_3_exons(file, nil, snp_in)
|
323
|
+
file.close
|
324
|
+
|
325
|
+
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
326
|
+
|
327
|
+
masks_output = "#{output_folder}/masks_designed.fa"
|
328
|
+
output_file = "#{output_folder}/primers.csv"
|
329
|
+
file = File.open(masks_output, "w")
|
330
|
+
out = File.open(output_file, "w")
|
331
|
+
|
332
|
+
# Reopens Primer3Record so that this script can read +primerPairs+ through a
# public accessor (used below as +primer3record.primerPairs+).
class Bio::DB::Primer3::Primer3Record
|
333
|
+
attr_accessor :primerPairs
|
334
|
+
end
|
335
|
+
|
336
|
+
printed_counts = Hash.new(0)
|
337
|
+
Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
|
338
|
+
#puts primer3record.inspect
|
339
|
+
next if primer3record.primer_left_num_returned.to_i == 0
|
340
|
+
|
341
|
+
seq_id = primer3record.sequence_id
|
342
|
+
printed_counts[seq_id] += 1
|
343
|
+
next if printed_counts[seq_id] > 10
|
344
|
+
excluded = "-"
|
345
|
+
exArr = excluded.split(",")
|
346
|
+
st = exArr[0].to_i
|
347
|
+
ed = exArr[1].to_i
|
348
|
+
tot = ed + st
|
349
|
+
|
350
|
+
excluded="#{st}-#{tot}"
|
351
|
+
seq_len = primer3record.sequence_template.length
|
352
|
+
printed = 0
|
353
|
+
|
354
|
+
sequence_template = primer3record.sequence_template
|
355
|
+
sequence_mask = "-" * st
|
356
|
+
sequence_mask << "*" * ed
|
357
|
+
sequence_mask << "-" * (seq_len - sequence_mask.length)
|
358
|
+
|
359
|
+
file.puts ">#{seq_id}\n#{sequence_template}"
|
360
|
+
file.puts ">#{seq_id}:mask\n#{sequence_mask}"
|
361
|
+
#puts "FDFDS"
|
362
|
+
|
363
|
+
#puts primer3record.primerPairs
|
364
|
+
|
365
|
+
primer3record.primerPairs.each do |p|
|
366
|
+
#puts p.inspect
|
367
|
+
printed += 1
|
368
|
+
lArr = p.left.coordinates
|
369
|
+
lArr[1] = lArr[0] + lArr[1]
|
370
|
+
rArr = p.right.coordinates
|
371
|
+
rArr[1] = rArr[0] - rArr[1]
|
372
|
+
toPrint = Array.new
|
373
|
+
toPrint << seq_id.split(" ")
|
374
|
+
#toPrint << seq_len
|
375
|
+
toPrint << p.product_size
|
376
|
+
toPrint << lArr.join("-")
|
377
|
+
toPrint << p.left.tm
|
378
|
+
toPrint << p.left.sequence
|
379
|
+
toPrint << rArr.join("-")
|
380
|
+
toPrint << p.right.tm
|
381
|
+
toPrint << p.right.sequence
|
382
|
+
|
383
|
+
middle = 501
|
384
|
+
toPrint << lArr[0]
|
385
|
+
toPrint << rArr[0]
|
386
|
+
toPrint << middle - lArr[0]
|
387
|
+
toPrint << rArr[0] - middle
|
388
|
+
#Start End LeftDistance RightDistance
|
389
|
+
|
390
|
+
out.puts toPrint.join(",")
|
391
|
+
|
392
|
+
sequence_primers = sequence_mask.clone
|
393
|
+
a = lArr[0]
|
394
|
+
b = lArr[1] - 1
|
395
|
+
#puts sequence_template[a..b]
|
396
|
+
sequence_primers[a..b] = sequence_template[a..b]
|
397
|
+
b = rArr[0]
|
398
|
+
a = rArr[1] + 1
|
399
|
+
|
400
|
+
sequence_primers[a..b] = sequence_template[a..b]
|
401
|
+
|
402
|
+
file.puts ">#{seq_id}:primerPair:#{printed}\n#{sequence_primers}"
|
403
|
+
end
|
404
|
+
|
405
|
+
if printed == 0
|
406
|
+
toPrint = Array.new
|
407
|
+
toPrint << seq_id.split(" ")
|
408
|
+
toPrint << excluded
|
409
|
+
toPrint << seq_len
|
410
|
+
out.puts toPrint.join(",")
|
411
|
+
end
|
412
|
+
|
413
|
+
end
|
414
|
+
out.close
|
415
|
+
file.close
|
416
|
+
|