bio-polyploid-tools 0.10.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/SECURITY.md +16 -0
- data/VERSION +1 -1
- data/bin/polymarker.rb +30 -21
- data/bin/polymarker_capillary.rb +83 -56
- data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} +55 -90
- data/bio-polyploid-tools.gemspec +27 -25
- data/lib/bio/BIOExtensions.rb +1 -1
- data/lib/bio/PolyploidTools/ExonContainer.rb +9 -9
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +39 -33
- data/lib/bio/PolyploidTools/SNP.rb +26 -21
- data/lib/bio/db/blast.rb +1 -1
- data/lib/bio/db/primer3.rb +14 -18
- data/test/data/7B_amplicon_test.fa +12 -0
- data/test/data/7B_amplicon_test.fa.fai +1 -0
- data/test/data/7B_amplicon_test_reference.fa +110 -0
- data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
- data/test/data/7B_amplicon_test_reference.fa.ndb +0 -0
- data/test/data/7B_amplicon_test_reference.fa.nhr +0 -0
- data/test/data/7B_amplicon_test_reference.fa.nin +0 -0
- data/test/data/7B_amplicon_test_reference.fa.not +0 -0
- data/test/data/7B_amplicon_test_reference.fa.nsq +0 -0
- data/test/data/7B_amplicon_test_reference.fa.ntf +0 -0
- data/test/data/7B_amplicon_test_reference.fa.nto +0 -0
- metadata +17 -8
- data/README +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9191156e91a48ec245e181a1541d4b636b01c848b03f2b7db5f7729ddfc05421
|
4
|
+
data.tar.gz: '0449ab8d09b268538d3604f20b555d94be53cac35ff8d591a29c792f98df3def'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c23625ac5c1cdfc3b4d34c3a8f416f680bc42a274b983ee64938bc3ba3bd7b685ad3e9cd9c04521a8f1baf8f91b0efae27a4c5d3034a4a18b141ec10209a7ee
|
7
|
+
data.tar.gz: cebf5a46d0a3cce9b63ccd71451f2f2a0d4903ae3e0954d34ba48955cc148b3d232bc5612ed8a528ade86cbfbb6e216c9788126c53b7f8cfa2157785ee00533b
|
data/SECURITY.md
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Security Policy
|
2
|
+
|
3
|
+
## Supported Versions
|
4
|
+
|
5
|
+
The following table shows the currently supported version.
|
6
|
+
|
7
|
+
| Version | Supported |
|
8
|
+
| ------- | ------------------ |
|
9
|
+
| 1.1.x | :white_check_mark: |
|
10
|
+
| 1.0.x | :x: |
|
11
|
+
| 0.x.x | :x: |
|
12
|
+
|
13
|
+
|
14
|
+
## Reporting a Vulnerability
|
15
|
+
|
16
|
+
If you find a vulnerability, please submit a comment in the security tab
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
1.2.0
|
data/bin/polymarker.rb
CHANGED
@@ -40,8 +40,8 @@ options[:scoring] = :genome_specific
|
|
40
40
|
options[:database] = false
|
41
41
|
options[:filter_best] = false
|
42
42
|
options[:aligner] = :blast
|
43
|
-
|
44
|
-
|
43
|
+
options[:max_hits] = 8
|
44
|
+
options[:max_specific_primers] = 15
|
45
45
|
options[:primer_3_preferences] = {
|
46
46
|
:primer_product_size_range => "50-150" ,
|
47
47
|
:primer_max_size => 25 ,
|
@@ -132,6 +132,15 @@ OptionParser.new do |opts|
|
|
132
132
|
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
133
133
|
options[:database] = o
|
134
134
|
end
|
135
|
+
|
136
|
+
opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
|
137
|
+
options[:max_hits] = o.to_i
|
138
|
+
end
|
139
|
+
|
140
|
+
opts.on("-S", "--max_specific_primers INT", "Maximum number of candidate primers to attempt to design. Default: #{options[:max_specific_primers]} ") do |o|
|
141
|
+
options[:max_specific_primers] = o.to_i
|
142
|
+
end
|
143
|
+
|
135
144
|
end.parse!
|
136
145
|
|
137
146
|
|
@@ -233,8 +242,8 @@ File.open(test_file) do | f |
|
|
233
242
|
region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
|
234
243
|
snp.template_sequence = fasta_reference_db.fetch_sequence(region)
|
235
244
|
else
|
236
|
-
|
237
|
-
|
245
|
+
write_status "WARN: Unable to find entry for #{snp.gene}"
|
246
|
+
end
|
238
247
|
elsif options[:mutant_list] and options[:reference] #List and fasta file
|
239
248
|
snp = Bio::PolyploidTools::SNPMutant.parse(line)
|
240
249
|
entry = fasta_reference_db.index.region_for_entry(snp.contig)
|
@@ -242,21 +251,21 @@ File.open(test_file) do | f |
|
|
242
251
|
region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
|
243
252
|
snp.full_sequence = fasta_reference_db.fetch_sequence(region)
|
244
253
|
else
|
245
|
-
|
246
|
-
end
|
247
|
-
else
|
248
|
-
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
249
|
-
end
|
250
|
-
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
251
|
-
|
252
|
-
snp.genomes_count = options[:genomes_count]
|
253
|
-
snp.snp_in = snp_in
|
254
|
-
snp.original_name = original_name
|
255
|
-
if snp.position
|
256
|
-
snps << snp
|
257
|
-
else
|
258
|
-
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
|
254
|
+
write_status "WARN: Unable to find entry for #{snp.gene}"
|
259
255
|
end
|
256
|
+
else
|
257
|
+
raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
|
258
|
+
end
|
259
|
+
raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
|
260
|
+
snp.max_hits = options[:max_hits]
|
261
|
+
snp.genomes_count = options[:genomes_count]
|
262
|
+
snp.snp_in = snp_in
|
263
|
+
snp.original_name = original_name
|
264
|
+
if snp.position
|
265
|
+
snps << snp
|
266
|
+
else
|
267
|
+
$stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
|
268
|
+
end
|
260
269
|
end
|
261
270
|
end
|
262
271
|
|
@@ -307,7 +316,7 @@ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
|
307
316
|
|
308
317
|
end
|
309
318
|
|
310
|
-
Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
|
319
|
+
Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
|
311
320
|
do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
|
312
321
|
end if options[:aligner] == :blast
|
313
322
|
|
@@ -334,7 +343,7 @@ container.gene_models(temp_fasta_query)
|
|
334
343
|
container.chromosomes(target)
|
335
344
|
container.add_parental({:name=>snp_in})
|
336
345
|
container.add_parental({:name=>original_name})
|
337
|
-
|
346
|
+
container.max_hits = options[:max_hits]
|
338
347
|
snps.each do |snp|
|
339
348
|
snp.container = container
|
340
349
|
snp.flanking_size = container.flanking_size
|
@@ -358,7 +367,7 @@ write_status "Running primer3"
|
|
358
367
|
file = File.open(primer_3_input, "w")
|
359
368
|
|
360
369
|
Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
|
361
|
-
added_exons = container.print_primer_3_exons(file, nil, snp_in)
|
370
|
+
added_exons = container.print_primer_3_exons(file, nil, snp_in, max_specific_primers: options[:max_specific_primers] )
|
362
371
|
file.close
|
363
372
|
|
364
373
|
Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
|
data/bin/polymarker_capillary.rb
CHANGED
@@ -35,15 +35,21 @@ options[:primer_3_preferences] = {
|
|
35
35
|
}
|
36
36
|
options[:genomes_count] = 3
|
37
37
|
options[:allow_non_specific] = false
|
38
|
+
options[:aligner] = :blast
|
39
|
+
options[:arm_selection]
|
40
|
+
model="ungapped"
|
41
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
|
42
|
+
options[:database] = false
|
38
43
|
|
39
44
|
OptionParser.new do |opts|
|
40
|
-
opts.banner = "Usage:
|
45
|
+
opts.banner = "Usage: polymarker_deletions.rb [options]"
|
41
46
|
|
42
47
|
opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
|
43
48
|
options[:reference] = o
|
44
49
|
end
|
45
50
|
|
46
|
-
opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
|
51
|
+
opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
|
52
|
+
should match the names to the entries in the fasta files as it is used as main target") do |o|
|
47
53
|
options[:markers] = o
|
48
54
|
end
|
49
55
|
|
@@ -53,10 +59,19 @@ OptionParser.new do |opts|
|
|
53
59
|
opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
|
54
60
|
options[:genomes_count] = o.to_i
|
55
61
|
end
|
56
|
-
opts.on("-
|
62
|
+
opts.on("-A", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
|
57
63
|
options[:allow_non_specific] = true
|
58
64
|
end
|
59
65
|
|
66
|
+
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
67
|
+
options[:database] = o
|
68
|
+
end
|
69
|
+
|
70
|
+
|
71
|
+
opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
|
72
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
|
73
|
+
end
|
74
|
+
|
60
75
|
end.parse!
|
61
76
|
|
62
77
|
|
@@ -65,23 +80,33 @@ reference = options[:reference]
|
|
65
80
|
markers = options[:markers]
|
66
81
|
output_folder = options[:output_folder]
|
67
82
|
allow_non_specific = options[:allow_non_specific]
|
83
|
+
|
84
|
+
options[:database] = options[:reference] unless options[:database]
|
85
|
+
temp_fasta_query="#{output_folder}/to_align.fa"
|
68
86
|
log "Output folder: #{output_folder}"
|
69
87
|
exonerate_file="#{output_folder}/exonerate_tmp.tab"
|
70
88
|
Dir.mkdir(output_folder)
|
89
|
+
arm_selection = options[:arm_selection]
|
71
90
|
|
72
91
|
module Bio::PolyploidTools
|
73
|
-
|
74
|
-
|
75
92
|
|
76
93
|
class SequenceToAmplify < SNP
|
77
94
|
|
78
|
-
def self.select_chromosome(
|
79
|
-
|
80
|
-
|
81
|
-
ret =
|
82
|
-
|
83
|
-
|
84
|
-
|
95
|
+
def self.select_chromosome(gene_name, arm_selection)
|
96
|
+
#m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(gene_name)
|
97
|
+
#m=/TraesCS(\d{1})(\w{1})(\d{2})G(\d+)/.match(gene_name)
|
98
|
+
#ret = {:group : m[1],
|
99
|
+
# :genome : m[2],:version=>m[3],:chr_id=>m[4]}
|
100
|
+
|
101
|
+
|
102
|
+
#arr = contig_name.split('_')
|
103
|
+
#ret = "U"
|
104
|
+
#ret = arr[2][0,2] if arr.size >= 3
|
105
|
+
#ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
106
|
+
#ret = arr[0][0,2] if arr.size == 1
|
107
|
+
#ret = "#{m[1]}#{m[2]}"
|
108
|
+
#puts ret
|
109
|
+
ret = arm_selection.call(gene_name)
|
85
110
|
return ret
|
86
111
|
end
|
87
112
|
|
@@ -92,18 +117,18 @@ module Bio::PolyploidTools
|
|
92
117
|
#Format:
|
93
118
|
#A fasta entry with the id: contig:start-end
|
94
119
|
#The sequence can be produced with samtools faidx
|
95
|
-
def self.parse(fasta_entry)
|
96
|
-
|
120
|
+
def self.parse(fasta_entry, arm_selection)
|
121
|
+
#puts fasta_entry.definition
|
97
122
|
snp = SequenceToAmplify.new
|
98
123
|
match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
|
99
|
-
|
124
|
+
#puts match_data.inspect
|
100
125
|
rName = Regexp.last_match(:rname)
|
101
126
|
rStart = Regexp.last_match(:rstart).to_i
|
102
127
|
rEnd = Regexp.last_match(:rend).to_i
|
103
128
|
snp.gene = fasta_entry.definition
|
104
129
|
#snp.chromosome=rName
|
105
|
-
|
106
|
-
snp.chromosome=select_chromosome(
|
130
|
+
#puts "Gene: #{snp.gene}"
|
131
|
+
snp.chromosome=select_chromosome(fasta_entry.definition, arm_selection)
|
107
132
|
#puts "#{rName}: #{snp.chromosome}"
|
108
133
|
snp.sequence_original = fasta_entry.seq
|
109
134
|
snp.template_sequence = fasta_entry.seq.upcase
|
@@ -111,7 +136,7 @@ module Bio::PolyploidTools
|
|
111
136
|
snp.rstart = rStart
|
112
137
|
snp.rend = rEnd
|
113
138
|
|
114
|
-
snp.position =
|
139
|
+
snp.position = snp.sequence_original.size / 2
|
115
140
|
snp.original = snp.sequence_original[snp.position]
|
116
141
|
|
117
142
|
tmp = Bio::Sequence::NA.new(snp.original)
|
@@ -121,7 +146,7 @@ module Bio::PolyploidTools
|
|
121
146
|
snp
|
122
147
|
end
|
123
148
|
|
124
|
-
def primer_3_all_strings(target_chromosome, parental)
|
149
|
+
def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20, flanking_size:500)
|
125
150
|
#puts target_chromosome
|
126
151
|
#puts parental
|
127
152
|
#puts aligned_sequences.to_fasta
|
@@ -130,8 +155,11 @@ module Bio::PolyploidTools
|
|
130
155
|
|
131
156
|
seq_original = String.new(pr.sequence)
|
132
157
|
#puts seq_original.size.to_s << "-" << primer_3_min_seq_length.to_s
|
158
|
+
#puts "___"
|
159
|
+
#puts pr.inspect
|
133
160
|
return primer_3_propertes if seq_original.size < primer_3_min_seq_length
|
134
|
-
|
161
|
+
#puts "((("
|
162
|
+
return primer_3_propertes unless pr.snp_pos == flanking_size
|
135
163
|
#puts "Sequence origina: #{ self.original}"
|
136
164
|
#puts pr.to_fasta
|
137
165
|
#puts "Postion: #{pr.snp_pos}"
|
@@ -232,10 +260,13 @@ file = Bio::FastaFormat.open(markers)
|
|
232
260
|
file.each do |entry|
|
233
261
|
|
234
262
|
begin
|
235
|
-
|
263
|
+
#puts entry.inspect
|
264
|
+
tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry, arm_selection)
|
236
265
|
snps << tmp if tmp
|
237
|
-
rescue
|
266
|
+
rescue Exception => e
|
267
|
+
log "ERROR\t#{e.message}"
|
238
268
|
$stderr.puts "Unable to generate the marker for: #{entry.definition}"
|
269
|
+
$stderr.puts e.backtrace
|
239
270
|
end
|
240
271
|
|
241
272
|
end
|
@@ -246,45 +277,38 @@ file.close
|
|
246
277
|
exo_f = File.open(exonerate_file, "w")
|
247
278
|
target=reference
|
248
279
|
|
249
|
-
fasta_file = Bio::DB::Fasta::FastaFile.new(
|
280
|
+
fasta_file = Bio::DB::Fasta::FastaFile.new(fasta: target)
|
250
281
|
fasta_file.load_fai_entries
|
251
|
-
min_identity =
|
282
|
+
min_identity = 90
|
252
283
|
found_contigs = Set.new
|
253
284
|
|
254
|
-
|
285
|
+
|
286
|
+
def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
255
287
|
if aln.identity > min_identity
|
256
288
|
exo_f.puts aln.line
|
257
|
-
#puts aln.line
|
258
289
|
unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
259
290
|
found_contigs.add(aln.target_id)
|
260
291
|
entry = fasta_file.index.region_for_entry(aln.target_id)
|
261
|
-
raise
|
292
|
+
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
|
293
|
+
if options[:extract_found_contigs]
|
294
|
+
region = entry.get_full_region
|
295
|
+
seq = fasta_file.fetch_sequence(region)
|
296
|
+
contigs_f.puts(">#{aln.target_id}\n#{seq}")
|
297
|
+
end
|
262
298
|
end
|
263
299
|
end
|
264
|
-
end
|
265
|
-
exo_f.close
|
266
|
-
|
267
|
-
arm_selection_functions = Hash.new
|
268
300
|
|
269
|
-
arm_selection_functions[:full_scaffold] = lambda do | contig_name |
|
270
|
-
return contig_name
|
271
301
|
end
|
272
302
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
#And with the cases when 3B is named with the prefix: v443
|
277
|
-
arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
|
278
|
-
|
279
|
-
arr = contig_name.split('_')
|
280
|
-
ret = "U"
|
281
|
-
ret = arr[2][0,2] if arr.size >= 3
|
282
|
-
ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
283
|
-
ret = arr[0][0,2] if arr.size == 1
|
284
|
-
return ret
|
285
|
-
end
|
303
|
+
Bio::DB::Blast.align({:query=>markers, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
|
304
|
+
do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
|
305
|
+
end if options[:aligner] == :blast
|
286
306
|
|
307
|
+
Bio::DB::Exonerate.align({:query=>markers, :target=>target, :model=>model}) do |aln|
|
308
|
+
do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
309
|
+
end if options[:aligner] == :exonerate
|
287
310
|
|
311
|
+
exo_f.close
|
288
312
|
|
289
313
|
container= Bio::PolyploidTools::ExonContainer.new
|
290
314
|
container.flanking_size=500
|
@@ -292,6 +316,7 @@ container.gene_models(markers)
|
|
292
316
|
container.chromosomes(target)
|
293
317
|
container.add_parental({:name=>"A"})
|
294
318
|
container.add_parental({:name=>"B"})
|
319
|
+
#puts "SNPs size: #{snps.size}"
|
295
320
|
snps.each do |snp|
|
296
321
|
snp.snp_in = "B"
|
297
322
|
snp.container = container
|
@@ -300,8 +325,10 @@ snps.each do |snp|
|
|
300
325
|
snp.includeNoSpecific = allow_non_specific
|
301
326
|
container.add_snp(snp)
|
302
327
|
end
|
303
|
-
container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
|
304
328
|
|
329
|
+
container.add_alignments({:exonerate_file=>exonerate_file,
|
330
|
+
:arm_selection=> arm_selection,
|
331
|
+
:min_identity=>min_identity})
|
305
332
|
|
306
333
|
|
307
334
|
exons_filename="#{output_folder}/localAlignment.fa"
|
@@ -329,12 +356,15 @@ output_file = "#{output_folder}/primers.csv"
|
|
329
356
|
file = File.open(masks_output, "w")
|
330
357
|
out = File.open(output_file, "w")
|
331
358
|
|
359
|
+
out.puts ["Id","specificity","inside","type","target","orientation","product_size",
|
360
|
+
"left_position","left_tm","left_sequence",
|
361
|
+
"right_position","right_tm","right_sequence"].join ","
|
332
362
|
class Bio::DB::Primer3::Primer3Record
|
333
363
|
attr_accessor :primerPairs
|
334
364
|
end
|
335
365
|
|
336
366
|
printed_counts = Hash.new(0)
|
337
|
-
Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
|
367
|
+
Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output ) do | primer3record |
|
338
368
|
#puts primer3record.inspect
|
339
369
|
next if primer3record.primer_left_num_returned.to_i == 0
|
340
370
|
|
@@ -358,10 +388,7 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
|
|
358
388
|
|
359
389
|
file.puts ">#{seq_id}\n#{sequence_template}"
|
360
390
|
file.puts ">#{seq_id}:mask\n#{sequence_mask}"
|
361
|
-
|
362
|
-
|
363
|
-
#puts primer3record.primerPairs
|
364
|
-
|
391
|
+
|
365
392
|
primer3record.primerPairs.each do |p|
|
366
393
|
#puts p.inspect
|
367
394
|
printed += 1
|
@@ -381,10 +408,10 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
|
|
381
408
|
toPrint << p.right.sequence
|
382
409
|
|
383
410
|
middle = 501
|
384
|
-
toPrint << lArr[0]
|
385
|
-
toPrint << rArr[0]
|
386
|
-
toPrint << middle - lArr[0]
|
387
|
-
toPrint << rArr[0] - middle
|
411
|
+
#toPrint << lArr[0]
|
412
|
+
#toPrint << rArr[0]
|
413
|
+
#toPrint << middle - lArr[0]
|
414
|
+
#toPrint << rArr[0] - middle
|
388
415
|
#Start End LeftDistance RightDistance
|
389
416
|
|
390
417
|
out.puts toPrint.join(",")
|
@@ -53,14 +53,12 @@ class Bio::PolyploidTools::ExonContainer
|
|
53
53
|
end
|
54
54
|
|
55
55
|
class Bio::DB::Primer3::SNP
|
56
|
-
|
57
56
|
def to_s
|
58
57
|
"#{gene}:#{snp_from.chromosome}"
|
59
58
|
end
|
60
|
-
|
61
59
|
end
|
62
|
-
class Bio::DB::Primer3::Primer3Record
|
63
60
|
|
61
|
+
class Bio::DB::Primer3::Primer3Record
|
64
62
|
|
65
63
|
def best_pair
|
66
64
|
return @best_pair if @best_pair
|
@@ -82,7 +80,7 @@ class Bio::DB::Primer3::Primer3Record
|
|
82
80
|
@total_caps = capital_count
|
83
81
|
end
|
84
82
|
end
|
85
|
-
|
83
|
+
|
86
84
|
@best_pair
|
87
85
|
end
|
88
86
|
|
@@ -107,12 +105,13 @@ class Bio::DB::Primer3::Primer3Record
|
|
107
105
|
|
108
106
|
def score
|
109
107
|
best_pair
|
108
|
+
total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
|
110
109
|
# puts "score"
|
111
110
|
# puts self.inspect
|
112
111
|
ret = 0
|
113
112
|
ret += @scores[type]
|
114
113
|
ret += @scores[:exon] if exon?
|
115
|
-
ret -=
|
114
|
+
ret -= total_caps * 10
|
116
115
|
ret -= product_length
|
117
116
|
ret
|
118
117
|
end
|
@@ -123,71 +122,21 @@ class Bio::DB::Primer3::Primer3Record
|
|
123
122
|
|
124
123
|
def left_primer_snp(snp)
|
125
124
|
tmp_primer = String.new(left_primer)
|
126
|
-
#if self.orientation == :forward
|
127
|
-
# base_original = snp.original
|
128
|
-
# base_snp = snp.snp
|
129
|
-
#elsif self.orientation == :reverse
|
130
|
-
# base_original = reverse_complement_string(snp.original )
|
131
|
-
# base_snp = reverse_complement_string(snp.snp)
|
132
|
-
#else
|
133
|
-
# raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
|
134
|
-
#end
|
135
|
-
|
136
|
-
# puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
|
137
|
-
#if tmp_primer[-1] == base_original
|
138
|
-
# tmp_primer[-1] = base_snp
|
139
|
-
#elsif tmp_primer[-1] == base_snp
|
140
|
-
# tmp_primer[-1] = base_original
|
141
|
-
#else
|
142
|
-
# raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
|
143
|
-
#end
|
144
|
-
#puts "tmp_primer: #{tmp_primer}"
|
145
125
|
return tmp_primer
|
146
126
|
end
|
147
127
|
|
148
128
|
end
|
149
129
|
|
150
|
-
arm_selection_functions = Hash.new;
|
151
|
-
|
152
|
-
|
153
|
-
arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
|
154
|
-
ret = contig_name[0,2]
|
155
|
-
return ret
|
156
|
-
end
|
157
|
-
|
158
|
-
#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
|
159
|
-
#Or the first two characters in the contig name, to deal with
|
160
|
-
#pseudomolecules that start with headers like: "1A"
|
161
|
-
#And with the cases when 3B is named with the prefix: v443
|
162
|
-
arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
|
163
|
-
|
164
|
-
arr = contig_name.split('_')
|
165
|
-
ret = "U"
|
166
|
-
ret = arr[2][0,2] if arr.size >= 3
|
167
|
-
ret = "3B" if arr.size == 2 and arr[0] == "v443"
|
168
|
-
ret = arr[0][0,2] if arr.size == 1
|
169
|
-
return ret
|
170
|
-
end
|
171
|
-
|
172
|
-
arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
|
173
|
-
ret = contig_name.split(':')[0].split("_")[1];
|
174
|
-
return ret
|
175
|
-
end
|
176
|
-
|
177
|
-
arm_selection_functions[:scaffold] = lambda do | contig_name |
|
178
|
-
ret = contig_name;
|
179
|
-
return ret
|
180
|
-
end
|
181
|
-
|
182
130
|
markers = nil
|
183
131
|
|
184
132
|
options = {}
|
133
|
+
options[:aligner] = :blast
|
185
134
|
options[:model] = "est2genome"
|
186
135
|
options[:min_identity] = 90
|
187
|
-
options[:extract_found_contigs] =
|
188
|
-
options[:arm_selection] =
|
136
|
+
options[:extract_found_contigs] = true
|
137
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
|
189
138
|
options[:genomes_count] = 3
|
190
|
-
|
139
|
+
options[:variation_free_region] =0
|
191
140
|
|
192
141
|
options[:primer_3_preferences] = {
|
193
142
|
:primer_product_size_range => "50-150" ,
|
@@ -200,11 +149,14 @@ options[:primer_3_preferences] = {
|
|
200
149
|
}
|
201
150
|
|
202
151
|
|
152
|
+
options[:database] = false
|
153
|
+
|
154
|
+
|
203
155
|
OptionParser.new do |opts|
|
204
156
|
|
205
|
-
opts.banner = "Usage:
|
157
|
+
opts.banner = "Usage: polymarker_deletions.rb [options]"
|
206
158
|
|
207
|
-
opts.on("-
|
159
|
+
opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
|
208
160
|
options[:sequences] = o
|
209
161
|
end
|
210
162
|
opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
|
@@ -221,6 +173,14 @@ OptionParser.new do |opts|
|
|
221
173
|
opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
|
222
174
|
options[:extract_found_contigs] = true
|
223
175
|
end
|
176
|
+
|
177
|
+
opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
|
178
|
+
options[:database] = o
|
179
|
+
end
|
180
|
+
|
181
|
+
opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
|
182
|
+
options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
|
183
|
+
end
|
224
184
|
|
225
185
|
end.parse!
|
226
186
|
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
|
@@ -231,11 +191,14 @@ throw raise Exception.new(), "Fasta file with sequences has to be provided" unle
|
|
231
191
|
output_folder = options[:output] if options[:output]
|
232
192
|
throw raise Exception.new(), "An output directory has to be provided" unless output_folder
|
233
193
|
model=options[:model]
|
194
|
+
|
195
|
+
options[:database] = options[:reference] unless options[:database]
|
196
|
+
|
234
197
|
Dir.mkdir(output_folder)
|
235
198
|
min_identity= options[:min_identity]
|
236
199
|
|
237
200
|
exonerate_file="#{output_folder}/exonerate_tmp.tab"
|
238
|
-
|
201
|
+
|
239
202
|
primer_3_input="#{output_folder}/primer_3_input_temp"
|
240
203
|
primer_3_output="#{output_folder}/primer_3_output_temp"
|
241
204
|
exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
|
@@ -248,14 +211,8 @@ fasta_file.load_fai_entries
|
|
248
211
|
original_name="A"
|
249
212
|
snp_in="B"
|
250
213
|
|
251
|
-
|
214
|
+
arm_selection = options[:arm_selection]
|
252
215
|
|
253
|
-
unless arm_selection
|
254
|
-
arm_selection = lambda do | contig_name |
|
255
|
-
ret = contig_name[0,3]
|
256
|
-
return ret
|
257
|
-
end
|
258
|
-
end
|
259
216
|
begin
|
260
217
|
log "Reading exons"
|
261
218
|
exons = Array.new
|
@@ -279,22 +236,28 @@ end
|
|
279
236
|
log "Searching markers in genome"
|
280
237
|
found_contigs = Set.new
|
281
238
|
exo_f = File.open(exonerate_file, "w")
|
282
|
-
|
283
|
-
|
284
|
-
|
239
|
+
|
240
|
+
def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
241
|
+
if aln.identity > min_identity
|
285
242
|
exo_f.puts aln.line
|
286
243
|
unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
|
287
244
|
found_contigs.add(aln.target_id)
|
288
245
|
entry = fasta_file.index.region_for_entry(aln.target_id)
|
289
246
|
raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
|
290
|
-
|
291
|
-
seq = fasta_file.fetch_sequence(region)
|
292
|
-
contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
|
247
|
+
|
293
248
|
end
|
294
249
|
end
|
295
250
|
end
|
251
|
+
|
252
|
+
Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
|
253
|
+
do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
|
254
|
+
end if options[:aligner] == :blast
|
255
|
+
|
256
|
+
Bio::DB::Exonerate.align({:query=>sequences, :target=>target, :model=>model}) do |aln|
|
257
|
+
do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
|
258
|
+
end if options[:aligner] == :exonerate
|
259
|
+
|
296
260
|
exo_f.close()
|
297
|
-
contigs_f.close() if options[:extract_found_contigs]
|
298
261
|
|
299
262
|
|
300
263
|
|
@@ -303,18 +266,24 @@ log "Reading best alignment on each chromosome"
|
|
303
266
|
container= Bio::PolyploidTools::ExonContainer.new
|
304
267
|
container.flanking_size=options[:flanking_size]
|
305
268
|
container.gene_models(sequences)
|
306
|
-
container.chromosomes(
|
269
|
+
container.chromosomes(reference)
|
307
270
|
container.add_parental({:name=>"A"})
|
308
271
|
container.add_parental({:name=>"B"})
|
309
272
|
exons.each do |exon|
|
310
273
|
exon.container = container
|
311
|
-
exon.flanking_size =
|
274
|
+
exon.flanking_size = 200
|
312
275
|
exon.variation_free_region = options[:variation_free_region]
|
313
|
-
#
|
276
|
+
#puts exon.inspect
|
314
277
|
container.add_snp(exon)
|
315
278
|
|
316
279
|
end
|
317
|
-
container.add_alignments(
|
280
|
+
container.add_alignments(
|
281
|
+
{:exonerate_file=>exonerate_file,
|
282
|
+
:arm_selection=>options[:arm_selection] ,
|
283
|
+
:min_identity=>min_identity})
|
284
|
+
|
285
|
+
|
286
|
+
|
318
287
|
|
319
288
|
#4.1 generating primer3 file
|
320
289
|
log "Running primer3"
|
@@ -348,18 +317,14 @@ exons.each do |snp|
|
|
348
317
|
end
|
349
318
|
|
350
319
|
kasp_container.add_primers_file(primer_3_output) if added_exons > 0
|
351
|
-
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
|
320
|
+
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
|
352
321
|
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
|
353
322
|
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
out_fasta_products = "#{output_folder}/#{name}.fa"
|
360
|
-
File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
|
361
|
-
|
362
|
-
|
323
|
+
out_fasta_products = "#{output_folder}/products.fa"
|
324
|
+
File.open(out_fasta_products, 'w') do |f|
|
325
|
+
kasp_container.snp_hash.each_pair do |name, kaspSNP|
|
326
|
+
f.write(kaspSNP.realigned_primers_fasta)
|
327
|
+
end
|
363
328
|
end
|
364
329
|
|
365
330
|
File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }
|