bio-polyploid-tools 0.10.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1a74407d5aee3baf6b231007be242d2097f07f74a0a012e151c3aef43175ef73
4
- data.tar.gz: fff2475fcf69dec083a67bff9fd573738ac810ca764e7d6e0c7338231e4a81bd
3
+ metadata.gz: 9191156e91a48ec245e181a1541d4b636b01c848b03f2b7db5f7729ddfc05421
4
+ data.tar.gz: '0449ab8d09b268538d3604f20b555d94be53cac35ff8d591a29c792f98df3def'
5
5
  SHA512:
6
- metadata.gz: dc594e3c51d0a1c7fe2facf12002fb7d75b4324dcbaf15bb862e0890662364be709a6e1f1dbd9545a8b9da01c663eb6fe89a30c074ce9f6f3672af33879195fc
7
- data.tar.gz: 3ffa7f6be31f7f2f1a4fddf669d4d95a565e7189db274c579d2c8ba298adae040e43cc5042c7e5405cbcb4d6b0355ef92f71e60c2c36cc516c119cbc075b98de
6
+ metadata.gz: 1c23625ac5c1cdfc3b4d34c3a8f416f680bc42a274b983ee64938bc3ba3bd7b685ad3e9cd9c04521a8f1baf8f91b0efae27a4c5d3034a4a18b141ec10209a7ee
7
+ data.tar.gz: cebf5a46d0a3cce9b63ccd71451f2f2a0d4903ae3e0954d34ba48955cc148b3d232bc5612ed8a528ade86cbfbb6e216c9788126c53b7f8cfa2157785ee00533b
@@ -0,0 +1,16 @@
1
+ # Security Policy
2
+
3
+ ## Supported Versions
4
+
5
+ The following table shows the currently supported version.
6
+
7
+ | Version | Supported |
8
+ | ------- | ------------------ |
9
+ | 1.1.x | :white_check_mark: |
10
+ | 1.0.x | :x: |
11
+ | 0.x.x | :x: |
12
+
13
+
14
+ ## Reporting a Vulnerability
15
+
16
+ If you find a vulnerability, please submit a comment in the security tab.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.10.1
1
+ 1.2.0
@@ -40,8 +40,8 @@ options[:scoring] = :genome_specific
40
40
  options[:database] = false
41
41
  options[:filter_best] = false
42
42
  options[:aligner] = :blast
43
-
44
-
43
+ options[:max_hits] = 8
44
+ options[:max_specific_primers] = 15
45
45
  options[:primer_3_preferences] = {
46
46
  :primer_product_size_range => "50-150" ,
47
47
  :primer_max_size => 25 ,
@@ -132,6 +132,15 @@ OptionParser.new do |opts|
132
132
  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
133
133
  options[:database] = o
134
134
  end
135
+
136
+ opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
137
+ options[:max_hits] = o.to_i
138
+ end
139
+
140
+ opts.on("-S", "--max_specific_primers INT", "Maximum number of candidate primers to attempt to design. Default: #{options[:max_specific_primers]} ") do |o|
141
+ options[:max_specific_primers] = o.to_i
142
+ end
143
+
135
144
  end.parse!
136
145
 
137
146
 
@@ -233,8 +242,8 @@ File.open(test_file) do | f |
233
242
  region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
234
243
  snp.template_sequence = fasta_reference_db.fetch_sequence(region)
235
244
  else
236
- write_status "WARN: Unable to find entry for #{snp.gene}"
237
- end
245
+ write_status "WARN: Unable to find entry for #{snp.gene}"
246
+ end
238
247
  elsif options[:mutant_list] and options[:reference] #List and fasta file
239
248
  snp = Bio::PolyploidTools::SNPMutant.parse(line)
240
249
  entry = fasta_reference_db.index.region_for_entry(snp.contig)
@@ -242,21 +251,21 @@ File.open(test_file) do | f |
242
251
  region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
243
252
  snp.full_sequence = fasta_reference_db.fetch_sequence(region)
244
253
  else
245
- write_status "WARN: Unable to find entry for #{snp.gene}"
246
- end
247
- else
248
- raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
249
- end
250
- raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
251
-
252
- snp.genomes_count = options[:genomes_count]
253
- snp.snp_in = snp_in
254
- snp.original_name = original_name
255
- if snp.position
256
- snps << snp
257
- else
258
- $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
254
+ write_status "WARN: Unable to find entry for #{snp.gene}"
259
255
  end
256
+ else
257
+ raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
258
+ end
259
+ raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
260
+ snp.max_hits = options[:max_hits]
261
+ snp.genomes_count = options[:genomes_count]
262
+ snp.snp_in = snp_in
263
+ snp.original_name = original_name
264
+ if snp.position
265
+ snps << snp
266
+ else
267
+ $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
268
+ end
260
269
  end
261
270
  end
262
271
 
@@ -307,7 +316,7 @@ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
307
316
 
308
317
  end
309
318
 
310
- Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
319
+ Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
311
320
  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
312
321
  end if options[:aligner] == :blast
313
322
 
@@ -334,7 +343,7 @@ container.gene_models(temp_fasta_query)
334
343
  container.chromosomes(target)
335
344
  container.add_parental({:name=>snp_in})
336
345
  container.add_parental({:name=>original_name})
337
-
346
+ container.max_hits = options[:max_hits]
338
347
  snps.each do |snp|
339
348
  snp.container = container
340
349
  snp.flanking_size = container.flanking_size
@@ -358,7 +367,7 @@ write_status "Running primer3"
358
367
  file = File.open(primer_3_input, "w")
359
368
 
360
369
  Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
361
- added_exons = container.print_primer_3_exons(file, nil, snp_in)
370
+ added_exons = container.print_primer_3_exons(file, nil, snp_in, max_specific_primers: options[:max_specific_primers] )
362
371
  file.close
363
372
 
364
373
  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
@@ -35,15 +35,21 @@ options[:primer_3_preferences] = {
35
35
  }
36
36
  options[:genomes_count] = 3
37
37
  options[:allow_non_specific] = false
38
+ options[:aligner] = :blast
39
+ options[:arm_selection]
40
+ model="ungapped"
41
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
42
+ options[:database] = false
38
43
 
39
44
  OptionParser.new do |opts|
40
- opts.banner = "Usage: polymarker_capillary.rb [options]"
45
+ opts.banner = "Usage: polymarker_deletions.rb [options]"
41
46
 
42
47
  opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
43
48
  options[:reference] = o
44
49
  end
45
50
 
46
- opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
51
+ opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
52
+ should match the names to the entries in the fasta files as it is used as main target") do |o|
47
53
  options[:markers] = o
48
54
  end
49
55
 
@@ -53,10 +59,19 @@ OptionParser.new do |opts|
53
59
  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
54
60
  options[:genomes_count] = o.to_i
55
61
  end
56
- opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
62
+ opts.on("-A", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
57
63
  options[:allow_non_specific] = true
58
64
  end
59
65
 
66
+ opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
67
+ options[:database] = o
68
+ end
69
+
70
+
71
+ opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
72
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
73
+ end
74
+
60
75
  end.parse!
61
76
 
62
77
 
@@ -65,23 +80,33 @@ reference = options[:reference]
65
80
  markers = options[:markers]
66
81
  output_folder = options[:output_folder]
67
82
  allow_non_specific = options[:allow_non_specific]
83
+
84
+ options[:database] = options[:reference] unless options[:database]
85
+ temp_fasta_query="#{output_folder}/to_align.fa"
68
86
  log "Output folder: #{output_folder}"
69
87
  exonerate_file="#{output_folder}/exonerate_tmp.tab"
70
88
  Dir.mkdir(output_folder)
89
+ arm_selection = options[:arm_selection]
71
90
 
72
91
  module Bio::PolyploidTools
73
-
74
-
75
92
 
76
93
  class SequenceToAmplify < SNP
77
94
 
78
- def self.select_chromosome(contig_name)
79
-
80
- arr = contig_name.split('_')
81
- ret = "U"
82
- ret = arr[2][0,2] if arr.size >= 3
83
- ret = "3B" if arr.size == 2 and arr[0] == "v443"
84
- ret = arr[0][0,2] if arr.size == 1
95
+ def self.select_chromosome(gene_name, arm_selection)
96
+ #m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(gene_name)
97
+ #m=/TraesCS(\d{1})(\w{1})(\d{2})G(\d+)/.match(gene_name)
98
+ #ret = {:group : m[1],
99
+ # :genome : m[2],:version=>m[3],:chr_id=>m[4]}
100
+
101
+
102
+ #arr = contig_name.split('_')
103
+ #ret = "U"
104
+ #ret = arr[2][0,2] if arr.size >= 3
105
+ #ret = "3B" if arr.size == 2 and arr[0] == "v443"
106
+ #ret = arr[0][0,2] if arr.size == 1
107
+ #ret = "#{m[1]}#{m[2]}"
108
+ #puts ret
109
+ ret = arm_selection.call(gene_name)
85
110
  return ret
86
111
  end
87
112
 
@@ -92,18 +117,18 @@ module Bio::PolyploidTools
92
117
  #Format:
93
118
  #A fasta entry with the id: contig:start-end
94
119
  #The sequence can be produced with samtools faidx
95
- def self.parse(fasta_entry)
96
-
120
+ def self.parse(fasta_entry, arm_selection)
121
+ #puts fasta_entry.definition
97
122
  snp = SequenceToAmplify.new
98
123
  match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
99
-
124
+ #puts match_data.inspect
100
125
  rName = Regexp.last_match(:rname)
101
126
  rStart = Regexp.last_match(:rstart).to_i
102
127
  rEnd = Regexp.last_match(:rend).to_i
103
128
  snp.gene = fasta_entry.definition
104
129
  #snp.chromosome=rName
105
-
106
- snp.chromosome=select_chromosome(rName)
130
+ #puts "Gene: #{snp.gene}"
131
+ snp.chromosome=select_chromosome(fasta_entry.definition, arm_selection)
107
132
  #puts "#{rName}: #{snp.chromosome}"
108
133
  snp.sequence_original = fasta_entry.seq
109
134
  snp.template_sequence = fasta_entry.seq.upcase
@@ -111,7 +136,7 @@ module Bio::PolyploidTools
111
136
  snp.rstart = rStart
112
137
  snp.rend = rEnd
113
138
 
114
- snp.position = 100
139
+ snp.position = snp.sequence_original.size / 2
115
140
  snp.original = snp.sequence_original[snp.position]
116
141
 
117
142
  tmp = Bio::Sequence::NA.new(snp.original)
@@ -121,7 +146,7 @@ module Bio::PolyploidTools
121
146
  snp
122
147
  end
123
148
 
124
- def primer_3_all_strings(target_chromosome, parental)
149
+ def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20, flanking_size:500)
125
150
  #puts target_chromosome
126
151
  #puts parental
127
152
  #puts aligned_sequences.to_fasta
@@ -130,8 +155,11 @@ module Bio::PolyploidTools
130
155
 
131
156
  seq_original = String.new(pr.sequence)
132
157
  #puts seq_original.size.to_s << "-" << primer_3_min_seq_length.to_s
158
+ #puts "___"
159
+ #puts pr.inspect
133
160
  return primer_3_propertes if seq_original.size < primer_3_min_seq_length
134
- return primer_3_propertes unless pr.snp_pos == 500
161
+ #puts "((("
162
+ return primer_3_propertes unless pr.snp_pos == flanking_size
135
163
  #puts "Sequence origina: #{ self.original}"
136
164
  #puts pr.to_fasta
137
165
  #puts "Postion: #{pr.snp_pos}"
@@ -232,10 +260,13 @@ file = Bio::FastaFormat.open(markers)
232
260
  file.each do |entry|
233
261
 
234
262
  begin
235
- tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
263
+ #puts entry.inspect
264
+ tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry, arm_selection)
236
265
  snps << tmp if tmp
237
- rescue
266
+ rescue Exception => e
267
+ log "ERROR\t#{e.message}"
238
268
  $stderr.puts "Unable to generate the marker for: #{entry.definition}"
269
+ $stderr.puts e.backtrace
239
270
  end
240
271
 
241
272
  end
@@ -246,45 +277,38 @@ file.close
246
277
  exo_f = File.open(exonerate_file, "w")
247
278
  target=reference
248
279
 
249
- fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
280
+ fasta_file = Bio::DB::Fasta::FastaFile.new(fasta: target)
250
281
  fasta_file.load_fai_entries
251
- min_identity = 95
282
+ min_identity = 90
252
283
  found_contigs = Set.new
253
284
 
254
- Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
285
+
286
+ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
255
287
  if aln.identity > min_identity
256
288
  exo_f.puts aln.line
257
- #puts aln.line
258
289
  unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
259
290
  found_contigs.add(aln.target_id)
260
291
  entry = fasta_file.index.region_for_entry(aln.target_id)
261
- raise Exception.new, "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly." if entry == nil
292
+ raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
293
+ if options[:extract_found_contigs]
294
+ region = entry.get_full_region
295
+ seq = fasta_file.fetch_sequence(region)
296
+ contigs_f.puts(">#{aln.target_id}\n#{seq}")
297
+ end
262
298
  end
263
299
  end
264
- end
265
- exo_f.close
266
-
267
- arm_selection_functions = Hash.new
268
300
 
269
- arm_selection_functions[:full_scaffold] = lambda do | contig_name |
270
- return contig_name
271
301
  end
272
302
 
273
- #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
274
- #Or the first two characters in the contig name, to deal with
275
- #pseudomolecules that start with headers like: "1A"
276
- #And with the cases when 3B is named with the prefix: v443
277
- arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
278
-
279
- arr = contig_name.split('_')
280
- ret = "U"
281
- ret = arr[2][0,2] if arr.size >= 3
282
- ret = "3B" if arr.size == 2 and arr[0] == "v443"
283
- ret = arr[0][0,2] if arr.size == 1
284
- return ret
285
- end
303
+ Bio::DB::Blast.align({:query=>markers, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
304
+ do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
305
+ end if options[:aligner] == :blast
286
306
 
307
+ Bio::DB::Exonerate.align({:query=>markers, :target=>target, :model=>model}) do |aln|
308
+ do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
309
+ end if options[:aligner] == :exonerate
287
310
 
311
+ exo_f.close
288
312
 
289
313
  container= Bio::PolyploidTools::ExonContainer.new
290
314
  container.flanking_size=500
@@ -292,6 +316,7 @@ container.gene_models(markers)
292
316
  container.chromosomes(target)
293
317
  container.add_parental({:name=>"A"})
294
318
  container.add_parental({:name=>"B"})
319
+ #puts "SNPs size: #{snps.size}"
295
320
  snps.each do |snp|
296
321
  snp.snp_in = "B"
297
322
  snp.container = container
@@ -300,8 +325,10 @@ snps.each do |snp|
300
325
  snp.includeNoSpecific = allow_non_specific
301
326
  container.add_snp(snp)
302
327
  end
303
- container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
304
328
 
329
+ container.add_alignments({:exonerate_file=>exonerate_file,
330
+ :arm_selection=> arm_selection,
331
+ :min_identity=>min_identity})
305
332
 
306
333
 
307
334
  exons_filename="#{output_folder}/localAlignment.fa"
@@ -329,12 +356,15 @@ output_file = "#{output_folder}/primers.csv"
329
356
  file = File.open(masks_output, "w")
330
357
  out = File.open(output_file, "w")
331
358
 
359
+ out.puts ["Id","specificity","inside","type","target","orientation","product_size",
360
+ "left_position","left_tm","left_sequence",
361
+ "right_position","right_tm","right_sequence"].join ","
332
362
  class Bio::DB::Primer3::Primer3Record
333
363
  attr_accessor :primerPairs
334
364
  end
335
365
 
336
366
  printed_counts = Hash.new(0)
337
- Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
367
+ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output ) do | primer3record |
338
368
  #puts primer3record.inspect
339
369
  next if primer3record.primer_left_num_returned.to_i == 0
340
370
 
@@ -358,10 +388,7 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
358
388
 
359
389
  file.puts ">#{seq_id}\n#{sequence_template}"
360
390
  file.puts ">#{seq_id}:mask\n#{sequence_mask}"
361
- #puts "FDFDS"
362
-
363
- #puts primer3record.primerPairs
364
-
391
+
365
392
  primer3record.primerPairs.each do |p|
366
393
  #puts p.inspect
367
394
  printed += 1
@@ -381,10 +408,10 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
381
408
  toPrint << p.right.sequence
382
409
 
383
410
  middle = 501
384
- toPrint << lArr[0]
385
- toPrint << rArr[0]
386
- toPrint << middle - lArr[0]
387
- toPrint << rArr[0] - middle
411
+ #toPrint << lArr[0]
412
+ #toPrint << rArr[0]
413
+ #toPrint << middle - lArr[0]
414
+ #toPrint << rArr[0] - middle
388
415
  #Start End LeftDistance RightDistance
389
416
 
390
417
  out.puts toPrint.join(",")
@@ -53,14 +53,12 @@ class Bio::PolyploidTools::ExonContainer
53
53
  end
54
54
 
55
55
  class Bio::DB::Primer3::SNP
56
-
57
56
  def to_s
58
57
  "#{gene}:#{snp_from.chromosome}"
59
58
  end
60
-
61
59
  end
62
- class Bio::DB::Primer3::Primer3Record
63
60
 
61
+ class Bio::DB::Primer3::Primer3Record
64
62
 
65
63
  def best_pair
66
64
  return @best_pair if @best_pair
@@ -82,7 +80,7 @@ class Bio::DB::Primer3::Primer3Record
82
80
  @total_caps = capital_count
83
81
  end
84
82
  end
85
- #@best_pair = @primerPairs.min
83
+
86
84
  @best_pair
87
85
  end
88
86
 
@@ -107,12 +105,13 @@ class Bio::DB::Primer3::Primer3Record
107
105
 
108
106
  def score
109
107
  best_pair
108
+ total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
110
109
  # puts "score"
111
110
  # puts self.inspect
112
111
  ret = 0
113
112
  ret += @scores[type]
114
113
  ret += @scores[:exon] if exon?
115
- ret -= @total_caps * 10
114
+ ret -= total_caps * 10
116
115
  ret -= product_length
117
116
  ret
118
117
  end
@@ -123,71 +122,21 @@ class Bio::DB::Primer3::Primer3Record
123
122
 
124
123
  def left_primer_snp(snp)
125
124
  tmp_primer = String.new(left_primer)
126
- #if self.orientation == :forward
127
- # base_original = snp.original
128
- # base_snp = snp.snp
129
- #elsif self.orientation == :reverse
130
- # base_original = reverse_complement_string(snp.original )
131
- # base_snp = reverse_complement_string(snp.snp)
132
- #else
133
- # raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
134
- #end
135
-
136
- # puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
137
- #if tmp_primer[-1] == base_original
138
- # tmp_primer[-1] = base_snp
139
- #elsif tmp_primer[-1] == base_snp
140
- # tmp_primer[-1] = base_original
141
- #else
142
- # raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
143
- #end
144
- #puts "tmp_primer: #{tmp_primer}"
145
125
  return tmp_primer
146
126
  end
147
127
 
148
128
  end
149
129
 
150
- arm_selection_functions = Hash.new;
151
-
152
-
153
- arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
154
- ret = contig_name[0,2]
155
- return ret
156
- end
157
-
158
- #Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
159
- #Or the first two characters in the contig name, to deal with
160
- #pseudomolecules that start with headers like: "1A"
161
- #And with the cases when 3B is named with the prefix: v443
162
- arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
163
-
164
- arr = contig_name.split('_')
165
- ret = "U"
166
- ret = arr[2][0,2] if arr.size >= 3
167
- ret = "3B" if arr.size == 2 and arr[0] == "v443"
168
- ret = arr[0][0,2] if arr.size == 1
169
- return ret
170
- end
171
-
172
- arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
173
- ret = contig_name.split(':')[0].split("_")[1];
174
- return ret
175
- end
176
-
177
- arm_selection_functions[:scaffold] = lambda do | contig_name |
178
- ret = contig_name;
179
- return ret
180
- end
181
-
182
130
  markers = nil
183
131
 
184
132
  options = {}
133
+ options[:aligner] = :blast
185
134
  options[:model] = "est2genome"
186
135
  options[:min_identity] = 90
187
- options[:extract_found_contigs] = false
188
- options[:arm_selection] = arm_selection_functions[:arm_selection_embl] ;
136
+ options[:extract_found_contigs] = true
137
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
189
138
  options[:genomes_count] = 3
190
-
139
+ options[:variation_free_region] =0
191
140
 
192
141
  options[:primer_3_preferences] = {
193
142
  :primer_product_size_range => "50-150" ,
@@ -200,11 +149,14 @@ options[:primer_3_preferences] = {
200
149
  }
201
150
 
202
151
 
152
+ options[:database] = false
153
+
154
+
203
155
  OptionParser.new do |opts|
204
156
 
205
- opts.banner = "Usage: find_homoeologue_variations.rb [options]"
157
+ opts.banner = "Usage: polymarker_deletions.rb [options]"
206
158
 
207
- opts.on("-c", "--sequences FASTA", "Sequence of the region to searc") do |o|
159
+ opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
208
160
  options[:sequences] = o
209
161
  end
210
162
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
@@ -221,6 +173,14 @@ OptionParser.new do |opts|
221
173
  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
222
174
  options[:extract_found_contigs] = true
223
175
  end
176
+
177
+ opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
178
+ options[:database] = o
179
+ end
180
+
181
+ opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
182
+ options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
183
+ end
224
184
 
225
185
  end.parse!
226
186
  #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
@@ -231,11 +191,14 @@ throw raise Exception.new(), "Fasta file with sequences has to be provided" unle
231
191
  output_folder = options[:output] if options[:output]
232
192
  throw raise Exception.new(), "An output directory has to be provided" unless output_folder
233
193
  model=options[:model]
194
+
195
+ options[:database] = options[:reference] unless options[:database]
196
+
234
197
  Dir.mkdir(output_folder)
235
198
  min_identity= options[:min_identity]
236
199
 
237
200
  exonerate_file="#{output_folder}/exonerate_tmp.tab"
238
- temp_contigs="#{output_folder}/contigs_tmp.fa"
201
+
239
202
  primer_3_input="#{output_folder}/primer_3_input_temp"
240
203
  primer_3_output="#{output_folder}/primer_3_output_temp"
241
204
  exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
@@ -248,14 +211,8 @@ fasta_file.load_fai_entries
248
211
  original_name="A"
249
212
  snp_in="B"
250
213
 
251
- arm_selection = options[:arm_selection]
214
+ arm_selection = options[:arm_selection]
252
215
 
253
- unless arm_selection
254
- arm_selection = lambda do | contig_name |
255
- ret = contig_name[0,3]
256
- return ret
257
- end
258
- end
259
216
  begin
260
217
  log "Reading exons"
261
218
  exons = Array.new
@@ -279,22 +236,28 @@ end
279
236
  log "Searching markers in genome"
280
237
  found_contigs = Set.new
281
238
  exo_f = File.open(exonerate_file, "w")
282
- contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
283
- Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
284
- if aln.identity > min_identity
239
+
240
+ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
241
+ if aln.identity > min_identity
285
242
  exo_f.puts aln.line
286
243
  unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
287
244
  found_contigs.add(aln.target_id)
288
245
  entry = fasta_file.index.region_for_entry(aln.target_id)
289
246
  raise ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
290
- region = entry.get_full_region
291
- seq = fasta_file.fetch_sequence(region)
292
- contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
247
+
293
248
  end
294
249
  end
295
250
  end
251
+
252
+ Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
253
+ do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
254
+ end if options[:aligner] == :blast
255
+
256
+ Bio::DB::Exonerate.align({:query=>sequences, :target=>target, :model=>model}) do |aln|
257
+ do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
258
+ end if options[:aligner] == :exonerate
259
+
296
260
  exo_f.close()
297
- contigs_f.close() if options[:extract_found_contigs]
298
261
 
299
262
 
300
263
 
@@ -303,18 +266,24 @@ log "Reading best alignment on each chromosome"
303
266
  container= Bio::PolyploidTools::ExonContainer.new
304
267
  container.flanking_size=options[:flanking_size]
305
268
  container.gene_models(sequences)
306
- container.chromosomes(temp_contigs)
269
+ container.chromosomes(reference)
307
270
  container.add_parental({:name=>"A"})
308
271
  container.add_parental({:name=>"B"})
309
272
  exons.each do |exon|
310
273
  exon.container = container
311
- exon.flanking_size = 50
274
+ exon.flanking_size = 200
312
275
  exon.variation_free_region = options[:variation_free_region]
313
- # puts exon.inspect
276
+ #puts exon.inspect
314
277
  container.add_snp(exon)
315
278
 
316
279
  end
317
- container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})
280
+ container.add_alignments(
281
+ {:exonerate_file=>exonerate_file,
282
+ :arm_selection=>options[:arm_selection] ,
283
+ :min_identity=>min_identity})
284
+
285
+
286
+
318
287
 
319
288
  #4.1 generating primer3 file
320
289
  log "Running primer3"
@@ -348,18 +317,14 @@ exons.each do |snp|
348
317
  end
349
318
 
350
319
  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
351
- header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
320
+ header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
352
321
  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
353
322
 
354
- kasp_container.snp_hash.each_pair do |name, kaspSNP|
355
- #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
356
- #puts kaspSNP.first_product
357
- #puts kaspSNP.realigned_primers
358
-
359
- out_fasta_products = "#{output_folder}/#{name}.fa"
360
- File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
361
-
362
-
323
+ out_fasta_products = "#{output_folder}/products.fa"
324
+ File.open(out_fasta_products, 'w') do |f|
325
+ kasp_container.snp_hash.each_pair do |name, kaspSNP|
326
+ f.write(kaspSNP.realigned_primers_fasta)
327
+ end
363
328
  end
364
329
 
365
330
  File.open(output_to_order, "w") { |io| io.write(kasp_container.print_primers_with_tails()) }