RubyGems - bio-polyploid-tools - Versions diffs - 0.10.1 → 1.2.0 - Mend

bio-polyploid-tools 0.10.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/SECURITY.md +16 -0
data/VERSION +1 -1
data/bin/polymarker.rb +30 -21
data/bin/polymarker_capillary.rb +83 -56
data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} +55 -90
data/bio-polyploid-tools.gemspec +27 -25
data/lib/bio/BIOExtensions.rb +1 -1
data/lib/bio/PolyploidTools/ExonContainer.rb +9 -9
data/lib/bio/PolyploidTools/NoSNPSequence.rb +39 -33
data/lib/bio/PolyploidTools/SNP.rb +26 -21
data/lib/bio/db/blast.rb +1 -1
data/lib/bio/db/primer3.rb +14 -18
data/test/data/7B_amplicon_test.fa +12 -0
data/test/data/7B_amplicon_test.fa.fai +1 -0
data/test/data/7B_amplicon_test_reference.fa +110 -0
data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
data/test/data/7B_amplicon_test_reference.fa.ndb +0 -0
data/test/data/7B_amplicon_test_reference.fa.nhr +0 -0
data/test/data/7B_amplicon_test_reference.fa.nin +0 -0
data/test/data/7B_amplicon_test_reference.fa.not +0 -0
data/test/data/7B_amplicon_test_reference.fa.nsq +0 -0
data/test/data/7B_amplicon_test_reference.fa.ntf +0 -0
data/test/data/7B_amplicon_test_reference.fa.nto +0 -0
metadata +17 -8
data/README +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1a74407d5aee3baf6b231007be242d2097f07f74a0a012e151c3aef43175ef73
-  data.tar.gz: fff2475fcf69dec083a67bff9fd573738ac810ca764e7d6e0c7338231e4a81bd
+  metadata.gz: 9191156e91a48ec245e181a1541d4b636b01c848b03f2b7db5f7729ddfc05421
+  data.tar.gz: '0449ab8d09b268538d3604f20b555d94be53cac35ff8d591a29c792f98df3def'
 SHA512:
-  metadata.gz: dc594e3c51d0a1c7fe2facf12002fb7d75b4324dcbaf15bb862e0890662364be709a6e1f1dbd9545a8b9da01c663eb6fe89a30c074ce9f6f3672af33879195fc
-  data.tar.gz: 3ffa7f6be31f7f2f1a4fddf669d4d95a565e7189db274c579d2c8ba298adae040e43cc5042c7e5405cbcb4d6b0355ef92f71e60c2c36cc516c119cbc075b98de
+  metadata.gz: 1c23625ac5c1cdfc3b4d34c3a8f416f680bc42a274b983ee64938bc3ba3bd7b685ad3e9cd9c04521a8f1baf8f91b0efae27a4c5d3034a4a18b141ec10209a7ee
+  data.tar.gz: cebf5a46d0a3cce9b63ccd71451f2f2a0d4903ae3e0954d34ba48955cc148b3d232bc5612ed8a528ade86cbfbb6e216c9788126c53b7f8cfa2157785ee00533b

data/SECURITY.md ADDED

@@ -0,0 +1,16 @@
+# Security Policy
+## Supported Versions
+The following table shows the currently supported version.
+| Version | Supported          |
+| ------- | ------------------ |
+| 1.1.x   | :white_check_mark: |
+| 1.0.x   | :x:                |
+| 0.x.x   | :x:                |
+## Reporting a Vulnerability
+If you find a vulneravility, please submit a comment in the security tab

data/VERSION CHANGED

@@ -1 +1 @@
-0.10.1
+1.2.0

data/bin/polymarker.rb CHANGED

@@ -40,8 +40,8 @@ options[:scoring] = :genome_specific
 options[:database]  = false
 options[:filter_best]  = false
 options[:aligner] = :blast
+options[:max_hits] = 8
+options[:max_specific_primers]  = 15
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
       :primer_max_size => 25 ,
@@ -132,6 +132,15 @@ OptionParser.new do |opts|
   opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
     options[:database] = o
   end
+  opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
+    options[:max_hits] = o.to_i
+  end
+  opts.on("-S", "--max_specific_primers INT", "Maximum number of candidate primers to attempt to design. Default: #{options[:max_specific_primers]} ") do |o|
+    options[:max_specific_primers]  = o.to_i
+  end
 end.parse!
@@ -233,8 +242,8 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
        snp.template_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
+      write_status "WARN: Unable to find entry for #{snp.gene}"
+    end
     elsif options[:mutant_list] and options[:reference] #List and fasta file
       snp = Bio::PolyploidTools::SNPMutant.parse(line)
       entry = fasta_reference_db.index.region_for_entry(snp.contig)
@@ -242,21 +251,21 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
        snp.full_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
-    else
-      raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
-    end
-    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
-    snp.genomes_count = options[:genomes_count]
-    snp.snp_in = snp_in
-    snp.original_name = original_name
-    if snp.position
-      snps << snp
-    else
-      $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+      write_status "WARN: Unable to find entry for #{snp.gene}"
     end
+  else
+    raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
+  end
+  raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
+  snp.max_hits = options[:max_hits]
+  snp.genomes_count = options[:genomes_count]
+  snp.snp_in = snp_in
+  snp.original_name = original_name
+  if snp.position
+    snps << snp
+  else
+    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+  end
   end
 end
@@ -307,7 +316,7 @@ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
 end
-Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
+Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
   do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
 end if options[:aligner] == :blast
@@ -334,7 +343,7 @@ container.gene_models(temp_fasta_query)
 container.chromosomes(target)
 container.add_parental({:name=>snp_in})
 container.add_parental({:name=>original_name})
+container.max_hits = options[:max_hits]
 snps.each do |snp|
   snp.container = container
   snp.flanking_size = container.flanking_size
@@ -358,7 +367,7 @@ write_status "Running primer3"
 file = File.open(primer_3_input, "w")
 Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
-added_exons = container.print_primer_3_exons(file, nil, snp_in)
+added_exons = container.print_primer_3_exons(file, nil, snp_in,  max_specific_primers: options[:max_specific_primers] )
 file.close
 Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

data/bin/polymarker_capillary.rb CHANGED

@@ -35,15 +35,21 @@ options[:primer_3_preferences] = {
 }
 options[:genomes_count] = 3
 options[:allow_non_specific] = false
+options[:aligner] = :blast
+options[:arm_selection]
+model="ungapped"
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: polymarker_capillary.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
   opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
     options[:reference] = o
   end
-  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
+  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
+    should match the names to the entries in the fasta files as it is used as main target") do |o|
     options[:markers] = o
   end
@@ -53,10 +59,19 @@ OptionParser.new do |opts|
   opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
     options[:genomes_count] = o.to_i
   end
-  opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
+  opts.on("-A", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
     options[:allow_non_specific] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
@@ -65,23 +80,33 @@ reference     = options[:reference]
 markers       = options[:markers]
 output_folder = options[:output_folder]
 allow_non_specific = options[:allow_non_specific]
+options[:database] = options[:reference] unless  options[:database]
+temp_fasta_query="#{output_folder}/to_align.fa"
 log "Output folder: #{output_folder}"
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
 Dir.mkdir(output_folder)
+arm_selection = options[:arm_selection]
 module Bio::PolyploidTools
   class SequenceToAmplify < SNP
-    def self.select_chromosome(contig_name)
-      arr = contig_name.split('_')
-      ret = "U"
-      ret = arr[2][0,2] if arr.size >= 3
-      ret = "3B" if arr.size == 2 and arr[0] == "v443"
-      ret = arr[0][0,2] if arr.size == 1
+    def self.select_chromosome(gene_name, arm_selection)
+      #m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(gene_name)
+      #m=/TraesCS(\d{1})(\w{1})(\d{2})G(\d+)/.match(gene_name)
+      #ret = {:group : m[1],
+      #       :genome : m[2],:version=>m[3],:chr_id=>m[4]}
+      #arr = contig_name.split('_')
+      #ret = "U"
+      #ret = arr[2][0,2] if arr.size >= 3
+      #ret = "3B" if arr.size == 2 and arr[0] == "v443"
+      #ret = arr[0][0,2] if arr.size == 1
+      #ret = "#{m[1]}#{m[2]}"
+      #puts ret
+      ret = arm_selection.call(gene_name)
       return ret
     end
@@ -92,18 +117,18 @@ module Bio::PolyploidTools
     #Format:
     #A fasta entry with the id: contig:start-end
     #The sequence can be prodcued with samtools faidx
-    def self.parse(fasta_entry)
+    def self.parse(fasta_entry, arm_selection)
+      #puts fasta_entry.definition
       snp = SequenceToAmplify.new
       match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
+      #puts match_data.inspect
       rName = Regexp.last_match(:rname)
       rStart =  Regexp.last_match(:rstart).to_i
       rEnd =  Regexp.last_match(:rend).to_i
       snp.gene = fasta_entry.definition
       #snp.chromosome=rName
-      snp.chromosome=select_chromosome(rName)
+      #puts "Gene: #{snp.gene}"
+      snp.chromosome=select_chromosome(fasta_entry.definition, arm_selection)
       #puts "#{rName}: #{snp.chromosome}"
       snp.sequence_original = fasta_entry.seq
       snp.template_sequence = fasta_entry.seq.upcase
@@ -111,7 +136,7 @@ module Bio::PolyploidTools
       snp.rstart = rStart
       snp.rend = rEnd
-      snp.position   = 100
+      snp.position   = snp.sequence_original.size / 2
       snp.original   = snp.sequence_original[snp.position]
       tmp =  Bio::Sequence::NA.new(snp.original)
@@ -121,7 +146,7 @@ module Bio::PolyploidTools
       snp
     end
-    def primer_3_all_strings(target_chromosome, parental)
+    def primer_3_all_strings(target_chromosome, parental, max_specific_primers: 20, flanking_size:500)
       #puts target_chromosome
       #puts parental
       #puts aligned_sequences.to_fasta
@@ -130,8 +155,11 @@ module Bio::PolyploidTools
       seq_original = String.new(pr.sequence)
       #puts seq_original.size.to_s << "-" << primer_3_min_seq_length.to_s
+      #puts "___"
+      #puts pr.inspect
       return primer_3_propertes if seq_original.size < primer_3_min_seq_length
-      return primer_3_propertes unless pr.snp_pos == 500
+      #puts "((("
+      return primer_3_propertes unless pr.snp_pos == flanking_size
       #puts "Sequence origina: #{ self.original}"
       #puts pr.to_fasta
       #puts "Postion: #{pr.snp_pos}"
@@ -232,10 +260,13 @@ file = Bio::FastaFormat.open(markers)
 file.each do |entry|
   begin
-    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
+    #puts entry.inspect
+    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry, arm_selection)
     snps << tmp if tmp
-  rescue
+  rescue Exception => e
+    log "ERROR\t#{e.message}"
     $stderr.puts "Unable to generate the marker for: #{entry.definition}"
+    $stderr.puts e.backtrace
   end
 end
@@ -246,45 +277,38 @@ file.close
 exo_f = File.open(exonerate_file, "w")
 target=reference
-fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>target})
+fasta_file = Bio::DB::Fasta::FastaFile.new(fasta: target)
 fasta_file.load_fai_entries
-min_identity = 95
+min_identity = 90
 found_contigs = Set.new
-Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
   if aln.identity > min_identity
     exo_f.puts aln.line
-    #puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
-      raise Exception.new,  "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly." if entry == nil
+      raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
+      if options[:extract_found_contigs]
+        region = entry.get_full_region
+        seq = fasta_file.fetch_sequence(region)
+        contigs_f.puts(">#{aln.target_id}\n#{seq}")
+      end
     end
   end
-end
-exo_f.close
-arm_selection_functions = Hash.new
-arm_selection_functions[:full_scaffold] = lambda do | contig_name |
-  return contig_name
 end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
+Bio::DB::Blast.align({:query=>markers, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>markers, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
+exo_f.close
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=500
@@ -292,6 +316,7 @@ container.gene_models(markers)
 container.chromosomes(target)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
+#puts "SNPs size: #{snps.size}"
 snps.each do |snp|
   snp.snp_in = "B"
   snp.container = container
@@ -300,8 +325,10 @@ snps.each do |snp|
   snp.includeNoSpecific = allow_non_specific
   container.add_snp(snp)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
+container.add_alignments({:exonerate_file=>exonerate_file,
+  :arm_selection=> arm_selection,
+  :min_identity=>min_identity})
 exons_filename="#{output_folder}/localAlignment.fa"
@@ -329,12 +356,15 @@ output_file  = "#{output_folder}/primers.csv"
 file = File.open(masks_output, "w")
 out  = File.open(output_file,  "w")
+out.puts ["Id","specificity","inside","type","target","orientation","product_size",
+  "left_position","left_tm","left_sequence",
+"right_position","right_tm","right_sequence"].join ","
 class Bio::DB::Primer3::Primer3Record
   attr_accessor :primerPairs
 end
 printed_counts = Hash.new(0)
-Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
+Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output ) do | primer3record |
   #puts primer3record.inspect
   next if primer3record.primer_left_num_returned.to_i == 0
@@ -358,10 +388,7 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
   file.puts ">#{seq_id}\n#{sequence_template}"
   file.puts ">#{seq_id}:mask\n#{sequence_mask}"
-   #puts "FDFDS"
-   #puts primer3record.primerPairs
    primer3record.primerPairs.each do |p|
     #puts p.inspect
     printed += 1
@@ -381,10 +408,10 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
     toPrint <<  p.right.sequence
     middle = 501
-    toPrint << lArr[0]
-    toPrint << rArr[0]
-    toPrint << middle - lArr[0]
-    toPrint << rArr[0] - middle
+    #toPrint << lArr[0]
+    #toPrint << rArr[0]
+    #toPrint << middle - lArr[0]
+    #toPrint << rArr[0] - middle
 #Start End LeftDistance  RightDistance
     out.puts toPrint.join(",")

data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} RENAMED

@@ -53,14 +53,12 @@ class Bio::PolyploidTools::ExonContainer
 end
 class Bio::DB::Primer3::SNP
   def to_s
      "#{gene}:#{snp_from.chromosome}"
   end
 end
-class Bio::DB::Primer3::Primer3Record
+class Bio::DB::Primer3::Primer3Record
   def best_pair
     return @best_pair if @best_pair
@@ -82,7 +80,7 @@ class Bio::DB::Primer3::Primer3Record
         @total_caps = capital_count
       end
     end
-    #@best_pair = @primerPairs.min
     @best_pair
   end
@@ -107,12 +105,13 @@ class Bio::DB::Primer3::Primer3Record
   def score
     best_pair
+    total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
 #    puts "score"
  #   puts self.inspect
     ret = 0
     ret += @scores[type]
     ret += @scores[:exon] if exon?
-    ret -= @total_caps * 10
+    ret -= total_caps * 10
     ret -= product_length
     ret
   end
@@ -123,71 +122,21 @@ class Bio::DB::Primer3::Primer3Record
    def left_primer_snp(snp)
       tmp_primer = String.new(left_primer)
-      #if self.orientation == :forward
-      #  base_original = snp.original
-      #  base_snp = snp.snp
-      #elsif self.orientation == :reverse
-      #  base_original = reverse_complement_string(snp.original )
-      #  base_snp = reverse_complement_string(snp.snp)
-      #else
-      #  raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
-      #end
-      # puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
-      #if tmp_primer[-1] == base_original
-      #  tmp_primer[-1] = base_snp
-      #elsif tmp_primer[-1] == base_snp
-      #  tmp_primer[-1] = base_original
-      #else
-      #  raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
-      #end
-      #puts "tmp_primer: #{tmp_primer}"
       return tmp_primer
     end
 end
-arm_selection_functions = Hash.new;
-arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
-  ret = contig_name[0,2]
-  return ret
-end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
-arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
-  ret = contig_name.split(':')[0].split("_")[1];
-  return ret
-end
-arm_selection_functions[:scaffold] = lambda do | contig_name |
-  ret = contig_name;
-  return ret
-end
 markers = nil
 options = {}
+options[:aligner] = :blast
 options[:model] = "est2genome"
 options[:min_identity] = 90
-options[:extract_found_contigs] = false
-options[:arm_selection] = arm_selection_functions[:arm_selection_embl] ;
+options[:extract_found_contigs] = true
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
 options[:genomes_count] = 3
+options[:variation_free_region] =0
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
@@ -200,11 +149,14 @@ options[:primer_3_preferences] = {
   }
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: find_homoeologue_variations.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
-  opts.on("-c", "--sequences FASTA", "Sequence of the region to searc") do |o|
+  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
     options[:sequences] = o
   end
   opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
@@ -221,6 +173,14 @@ OptionParser.new do |opts|
   opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
     options[:extract_found_contigs] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+    opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
 #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
@@ -231,11 +191,14 @@ throw raise Exception.new(), "Fasta file with sequences has to be provided" unle
 output_folder = options[:output] if options[:output]
 throw raise Exception.new(), "An output directory has to be provided" unless output_folder
 model=options[:model]
+options[:database] = options[:reference] unless  options[:database]
 Dir.mkdir(output_folder)
 min_identity= options[:min_identity]
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
-temp_contigs="#{output_folder}/contigs_tmp.fa"
 primer_3_input="#{output_folder}/primer_3_input_temp"
 primer_3_output="#{output_folder}/primer_3_output_temp"
 exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
@@ -248,14 +211,8 @@ fasta_file.load_fai_entries
 original_name="A"
 snp_in="B"
- arm_selection = options[:arm_selection]
+arm_selection = options[:arm_selection]
-unless arm_selection
-   arm_selection = lambda do | contig_name |
-      ret = contig_name[0,3]
-      return ret
-    end
-end
 begin
 log "Reading exons"
 exons = Array.new
@@ -279,22 +236,28 @@ end
 log "Searching markers in genome"
 found_contigs = Set.new
 exo_f = File.open(exonerate_file, "w")
-contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
-Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
-	if aln.identity > min_identity
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+  if aln.identity > min_identity
     exo_f.puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
       raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
-      region = entry.get_full_region
-      seq = fasta_file.fetch_sequence(region)
-      contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
     end
   end
 end
+Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>sequences, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
 exo_f.close()
-contigs_f.close() if options[:extract_found_contigs]
@@ -303,18 +266,24 @@ log "Reading best alignment on each chromosome"
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=options[:flanking_size]
 container.gene_models(sequences)
-container.chromosomes(temp_contigs)
+container.chromosomes(reference)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
 exons.each do |exon|
   exon.container = container
-  exon.flanking_size = 50
+  exon.flanking_size = 200
   exon.variation_free_region = options[:variation_free_region]
-#  puts exon.inspect
+  #puts exon.inspect
   container.add_snp(exon)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})
+container.add_alignments(
+  {:exonerate_file=>exonerate_file,
+  :arm_selection=>options[:arm_selection] ,
+  :min_identity=>min_identity})
 #4.1 generating primer3 file
 log "Running primer3"
@@ -348,18 +317,14 @@ exons.each do |snp|
 end
 kasp_container.add_primers_file(primer_3_output) if added_exons > 0
-header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
+header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
 File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
-kasp_container.snp_hash.each_pair do |name, kaspSNP|
-  #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
-  #puts kaspSNP.first_product
-  #puts kaspSNP.realigned_primers
-  out_fasta_products = "#{output_folder}/#{name}.fa"
-  File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
+out_fasta_products = "#{output_folder}/products.fa"
+File.open(out_fasta_products, 'w') do  |f|
+  kasp_container.snp_hash.each_pair do |name, kaspSNP|
+    f.write(kaspSNP.realigned_primers_fasta)
+  end
 end
 File.open(output_to_order, "w") { |io|  io.write(kasp_container.print_primers_with_tails()) }