RubyGems - bio-polyploid-tools - Versions diffs - 0.10.1 → 1.0.0 - Mend

bio-polyploid-tools 0.10.1 → 1.0.0

Files changed (13)

checksums.yaml +4 -4
data/VERSION +1 -1
data/bin/polymarker.rb +23 -19
data/bin/polymarker_capillary.rb +75 -51
data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} +55 -90
data/bio-polyploid-tools.gemspec +5 -7
data/lib/bio/PolyploidTools/ExonContainer.rb +3 -3
data/lib/bio/PolyploidTools/NoSNPSequence.rb +38 -32
data/lib/bio/PolyploidTools/SNP.rb +6 -5
data/lib/bio/db/blast.rb +1 -1
data/lib/bio/db/primer3.rb +14 -17
metadata +4 -6
data/README +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1a74407d5aee3baf6b231007be242d2097f07f74a0a012e151c3aef43175ef73
-  data.tar.gz: fff2475fcf69dec083a67bff9fd573738ac810ca764e7d6e0c7338231e4a81bd
+  metadata.gz: a8d10f674380ca0d78e0efbbf5bd81e44327fd66dfcbc5f9443891ebad6f2ee5
+  data.tar.gz: b787eef663d8c1b2932b38a877bb870521e71c72f6584d9b08d3ebf0c937b36e
 SHA512:
-  metadata.gz: dc594e3c51d0a1c7fe2facf12002fb7d75b4324dcbaf15bb862e0890662364be709a6e1f1dbd9545a8b9da01c663eb6fe89a30c074ce9f6f3672af33879195fc
-  data.tar.gz: 3ffa7f6be31f7f2f1a4fddf669d4d95a565e7189db274c579d2c8ba298adae040e43cc5042c7e5405cbcb4d6b0355ef92f71e60c2c36cc516c119cbc075b98de
+  metadata.gz: 4fdad615441a69e1af27e9ca23949e57b36c100773ed17ced255bec11c6d1d04778622199e832901861c0494fea018155bbf2d9b737f1672e342b88197123782
+  data.tar.gz: 074c38a5d9b59a116509a45e43d406bcc113cecfa83029239d748128715e74815fbbbb8880035abfb6272d96048dd5fb029fd363f75f699abadf46135ad67bc0

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.10.1
1	+ 1.0.0

data/bin/polymarker.rb CHANGED

@@ -40,7 +40,7 @@ options[:scoring] = :genome_specific
 options[:database]  = false
 options[:filter_best]  = false
 options[:aligner] = :blast
+options[:max_hits] = 8
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
@@ -132,6 +132,10 @@ OptionParser.new do |opts|
   opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
     options[:database] = o
   end
+  opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
+    options[:max_hits] = o.to_i
+  end
 end.parse!
@@ -233,8 +237,8 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
        snp.template_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
+      write_status "WARN: Unable to find entry for #{snp.gene}"
+    end
     elsif options[:mutant_list] and options[:reference] #List and fasta file
       snp = Bio::PolyploidTools::SNPMutant.parse(line)
       entry = fasta_reference_db.index.region_for_entry(snp.contig)
@@ -242,21 +246,21 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
        snp.full_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
-    else
-      raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
-    end
-    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
-    snp.genomes_count = options[:genomes_count]
-    snp.snp_in = snp_in
-    snp.original_name = original_name
-    if snp.position
-      snps << snp
-    else
-      $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+      write_status "WARN: Unable to find entry for #{snp.gene}"
     end
+  else
+    raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
+  end
+  raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
+  snp.max_hits = options[:max_hits]
+  snp.genomes_count = options[:genomes_count]
+  snp.snp_in = snp_in
+  snp.original_name = original_name
+  if snp.position
+    snps << snp
+  else
+    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+  end
   end
 end
@@ -307,7 +311,7 @@ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
 end
-Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
+Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
   do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
 end if options[:aligner] == :blast
@@ -334,7 +338,7 @@ container.gene_models(temp_fasta_query)
 container.chromosomes(target)
 container.add_parental({:name=>snp_in})
 container.add_parental({:name=>original_name})
+container.max_hits = options[:max_hits]
 snps.each do |snp|
   snp.container = container
   snp.flanking_size = container.flanking_size

data/bin/polymarker_capillary.rb CHANGED

@@ -35,15 +35,21 @@ options[:primer_3_preferences] = {
 }
 options[:genomes_count] = 3
 options[:allow_non_specific] = false
+options[:aligner] = :blast
+options[:arm_selection]
+model="ungapped"
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: polymarker_capillary.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
   opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
     options[:reference] = o
   end
-  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
+  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
+    should match the names to the entries in the fasta files as it is used as main target") do |o|
     options[:markers] = o
   end
@@ -53,10 +59,19 @@ OptionParser.new do |opts|
   opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
     options[:genomes_count] = o.to_i
   end
-  opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
+  opts.on("-A", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
     options[:allow_non_specific] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
@@ -65,23 +80,33 @@ reference     = options[:reference]
 markers       = options[:markers]
 output_folder = options[:output_folder]
 allow_non_specific = options[:allow_non_specific]
+options[:database] = options[:reference] unless  options[:database]
+temp_fasta_query="#{output_folder}/to_align.fa"
 log "Output folder: #{output_folder}"
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
 Dir.mkdir(output_folder)
+arm_selection = options[:arm_selection]
 module Bio::PolyploidTools
   class SequenceToAmplify < SNP
-    def self.select_chromosome(contig_name)
-      arr = contig_name.split('_')
-      ret = "U"
-      ret = arr[2][0,2] if arr.size >= 3
-      ret = "3B" if arr.size == 2 and arr[0] == "v443"
-      ret = arr[0][0,2] if arr.size == 1
+    def self.select_chromosome(gene_name, arm_selection)
+      #m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(gene_name)
+      #m=/TraesCS(\d{1})(\w{1})(\d{2})G(\d+)/.match(gene_name)
+      #ret = {:group : m[1],
+      #       :genome : m[2],:version=>m[3],:chr_id=>m[4]}
+      #arr = contig_name.split('_')
+      #ret = "U"
+      #ret = arr[2][0,2] if arr.size >= 3
+      #ret = "3B" if arr.size == 2 and arr[0] == "v443"
+      #ret = arr[0][0,2] if arr.size == 1
+      #ret = "#{m[1]}#{m[2]}"
+      #puts ret
+      ret = arm_selection.call(gene_name)
       return ret
     end
@@ -92,18 +117,18 @@ module Bio::PolyploidTools
     #Format:
     #A fasta entry with the id: contig:start-end
     #The sequence can be prodcued with samtools faidx
-    def self.parse(fasta_entry)
+    def self.parse(fasta_entry, arm_selection)
+      #puts fasta_entry.definition
       snp = SequenceToAmplify.new
       match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
+      #puts match_data.inspect
       rName = Regexp.last_match(:rname)
       rStart =  Regexp.last_match(:rstart).to_i
       rEnd =  Regexp.last_match(:rend).to_i
       snp.gene = fasta_entry.definition
       #snp.chromosome=rName
-      snp.chromosome=select_chromosome(rName)
+      #puts "Gene: #{snp.gene}"
+      snp.chromosome=select_chromosome(fasta_entry.definition, arm_selection)
       #puts "#{rName}: #{snp.chromosome}"
       snp.sequence_original = fasta_entry.seq
       snp.template_sequence = fasta_entry.seq.upcase
@@ -111,7 +136,7 @@ module Bio::PolyploidTools
       snp.rstart = rStart
       snp.rend = rEnd
-      snp.position   = 100
+      snp.position   = snp.sequence_original.size / 2
       snp.original   = snp.sequence_original[snp.position]
       tmp =  Bio::Sequence::NA.new(snp.original)
@@ -232,10 +257,13 @@ file = Bio::FastaFormat.open(markers)
 file.each do |entry|
   begin
-    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
+    #puts entry.inspect
+    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry, arm_selection)
     snps << tmp if tmp
-  rescue
+  rescue Exception => e
+    log "ERROR\t#{e.message}"
     $stderr.puts "Unable to generate the marker for: #{entry.definition}"
+    $stderr.puts e.backtrace
   end
 end
@@ -251,40 +279,33 @@ fasta_file.load_fai_entries
 min_identity = 95
 found_contigs = Set.new
-Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
   if aln.identity > min_identity
     exo_f.puts aln.line
-    #puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
-      raise Exception.new,  "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly." if entry == nil
+      raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
+      if options[:extract_found_contigs]
+        region = entry.get_full_region
+        seq = fasta_file.fetch_sequence(region)
+        contigs_f.puts(">#{aln.target_id}\n#{seq}")
+      end
     end
   end
-end
-exo_f.close
-arm_selection_functions = Hash.new
-arm_selection_functions[:full_scaffold] = lambda do | contig_name |
-  return contig_name
 end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
+Bio::DB::Blast.align({:query=>markers, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>markers, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
+exo_f.close
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=500
@@ -292,6 +313,7 @@ container.gene_models(markers)
 container.chromosomes(target)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
+#puts "SNPs size: #{snps.size}"
 snps.each do |snp|
   snp.snp_in = "B"
   snp.container = container
@@ -300,8 +322,10 @@ snps.each do |snp|
   snp.includeNoSpecific = allow_non_specific
   container.add_snp(snp)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
+container.add_alignments({:exonerate_file=>exonerate_file,
+  :arm_selection=> arm_selection,
+  :min_identity=>min_identity})
 exons_filename="#{output_folder}/localAlignment.fa"
@@ -329,6 +353,9 @@ output_file  = "#{output_folder}/primers.csv"
 file = File.open(masks_output, "w")
 out  = File.open(output_file,  "w")
+out.puts ["Id","specificity","inside","type","target","orientation","product_size",
+  "left_position","left_tm","left_sequence",
+"right_position","right_tm","right_sequence"].join ","
 class Bio::DB::Primer3::Primer3Record
   attr_accessor :primerPairs
 end
@@ -358,10 +385,7 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
   file.puts ">#{seq_id}\n#{sequence_template}"
   file.puts ">#{seq_id}:mask\n#{sequence_mask}"
-   #puts "FDFDS"
-   #puts primer3record.primerPairs
    primer3record.primerPairs.each do |p|
     #puts p.inspect
     printed += 1
@@ -381,10 +405,10 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
     toPrint <<  p.right.sequence
     middle = 501
-    toPrint << lArr[0]
-    toPrint << rArr[0]
-    toPrint << middle - lArr[0]
-    toPrint << rArr[0] - middle
+    #toPrint << lArr[0]
+    #toPrint << rArr[0]
+    #toPrint << middle - lArr[0]
+    #toPrint << rArr[0] - middle
 #Start End LeftDistance  RightDistance
     out.puts toPrint.join(",")

data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} RENAMED

@@ -53,14 +53,12 @@ class Bio::PolyploidTools::ExonContainer
 end
 class Bio::DB::Primer3::SNP
   def to_s
      "#{gene}:#{snp_from.chromosome}"
   end
 end
-class Bio::DB::Primer3::Primer3Record
+class Bio::DB::Primer3::Primer3Record
   def best_pair
     return @best_pair if @best_pair
@@ -82,7 +80,7 @@ class Bio::DB::Primer3::Primer3Record
         @total_caps = capital_count
       end
     end
-    #@best_pair = @primerPairs.min
     @best_pair
   end
@@ -107,12 +105,13 @@ class Bio::DB::Primer3::Primer3Record
   def score
     best_pair
+    total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
 #    puts "score"
  #   puts self.inspect
     ret = 0
     ret += @scores[type]
     ret += @scores[:exon] if exon?
-    ret -= @total_caps * 10
+    ret -= total_caps * 10
     ret -= product_length
     ret
   end
@@ -123,71 +122,21 @@ class Bio::DB::Primer3::Primer3Record
    def left_primer_snp(snp)
       tmp_primer = String.new(left_primer)
-      #if self.orientation == :forward
-      #  base_original = snp.original
-      #  base_snp = snp.snp
-      #elsif self.orientation == :reverse
-      #  base_original = reverse_complement_string(snp.original )
-      #  base_snp = reverse_complement_string(snp.snp)
-      #else
-      #  raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
-      #end
-      # puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
-      #if tmp_primer[-1] == base_original
-      #  tmp_primer[-1] = base_snp
-      #elsif tmp_primer[-1] == base_snp
-      #  tmp_primer[-1] = base_original
-      #else
-      #  raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
-      #end
-      #puts "tmp_primer: #{tmp_primer}"
       return tmp_primer
     end
 end
-arm_selection_functions = Hash.new;
-arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
-  ret = contig_name[0,2]
-  return ret
-end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
-arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
-  ret = contig_name.split(':')[0].split("_")[1];
-  return ret
-end
-arm_selection_functions[:scaffold] = lambda do | contig_name |
-  ret = contig_name;
-  return ret
-end
 markers = nil
 options = {}
+options[:aligner] = :blast
 options[:model] = "est2genome"
 options[:min_identity] = 90
-options[:extract_found_contigs] = false
-options[:arm_selection] = arm_selection_functions[:arm_selection_embl] ;
+options[:extract_found_contigs] = true
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
 options[:genomes_count] = 3
+options[:variation_free_region] =0
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
@@ -200,11 +149,14 @@ options[:primer_3_preferences] = {
   }
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: find_homoeologue_variations.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
-  opts.on("-c", "--sequences FASTA", "Sequence of the region to searc") do |o|
+  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
     options[:sequences] = o
   end
   opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
@@ -221,6 +173,14 @@ OptionParser.new do |opts|
   opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
     options[:extract_found_contigs] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+    opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
 #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
@@ -231,11 +191,14 @@ throw raise Exception.new(), "Fasta file with sequences has to be provided" unle
 output_folder = options[:output] if options[:output]
 throw raise Exception.new(), "An output directory has to be provided" unless output_folder
 model=options[:model]
+options[:database] = options[:reference] unless  options[:database]
 Dir.mkdir(output_folder)
 min_identity= options[:min_identity]
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
-temp_contigs="#{output_folder}/contigs_tmp.fa"
 primer_3_input="#{output_folder}/primer_3_input_temp"
 primer_3_output="#{output_folder}/primer_3_output_temp"
 exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
@@ -248,14 +211,8 @@ fasta_file.load_fai_entries
 original_name="A"
 snp_in="B"
- arm_selection = options[:arm_selection]
+arm_selection = options[:arm_selection]
-unless arm_selection
-   arm_selection = lambda do | contig_name |
-      ret = contig_name[0,3]
-      return ret
-    end
-end
 begin
 log "Reading exons"
 exons = Array.new
@@ -279,22 +236,28 @@ end
 log "Searching markers in genome"
 found_contigs = Set.new
 exo_f = File.open(exonerate_file, "w")
-contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
-Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
-	if aln.identity > min_identity
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+  if aln.identity > min_identity
     exo_f.puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
       raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
-      region = entry.get_full_region
-      seq = fasta_file.fetch_sequence(region)
-      contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
     end
   end
 end
+Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>sequences, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
 exo_f.close()
-contigs_f.close() if options[:extract_found_contigs]
@@ -303,18 +266,24 @@ log "Reading best alignment on each chromosome"
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=options[:flanking_size]
 container.gene_models(sequences)
-container.chromosomes(temp_contigs)
+container.chromosomes(reference)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
 exons.each do |exon|
   exon.container = container
-  exon.flanking_size = 50
+  exon.flanking_size = 200
   exon.variation_free_region = options[:variation_free_region]
-#  puts exon.inspect
+  #puts exon.inspect
   container.add_snp(exon)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})
+container.add_alignments(
+  {:exonerate_file=>exonerate_file,
+  :arm_selection=>options[:arm_selection] ,
+  :min_identity=>min_identity})
 #4.1 generating primer3 file
 log "Running primer3"
@@ -348,18 +317,14 @@ exons.each do |snp|
 end
 kasp_container.add_primers_file(primer_3_output) if added_exons > 0
-header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
+header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
 File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
-kasp_container.snp_hash.each_pair do |name, kaspSNP|
-  #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
-  #puts kaspSNP.first_product
-  #puts kaspSNP.realigned_primers
-  out_fasta_products = "#{output_folder}/#{name}.fa"
-  File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
+out_fasta_products = "#{output_folder}/products.fa"
+File.open(out_fasta_products, 'w') do  |f|
+  kasp_container.snp_hash.each_pair do |name, kaspSNP|
+    f.write(kaspSNP.realigned_primers_fasta)
+  end
 end
 File.open(output_to_order, "w") { |io|  io.write(kasp_container.print_primers_with_tails()) }

data/bio-polyploid-tools.gemspec CHANGED

@@ -2,27 +2,25 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
-# stub: bio-polyploid-tools 0.10.1 ruby lib
+# stub: bio-polyploid-tools 1.0.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "bio-polyploid-tools".freeze
-  s.version = "0.10.1"
+  s.version = "1.0.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
   s.require_paths = ["lib".freeze]
   s.authors = ["Ricardo H.  Ramirez-Gonzalez".freeze]
-  s.date = "2019-03-28"
+  s.date = "2019-07-05"
   s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
   s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
-  s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "marker_to_vcf.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze, "vcfToPolyMarker.rb".freeze]
+  s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "marker_to_vcf.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "polymarker_deletions.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze, "vcfToPolyMarker.rb".freeze]
   s.extra_rdoc_files = [
-    "README",
     "README.md"
   ]
   s.files = [
     ".travis.yml",
     "Gemfile",
-    "README",
     "README.md",
     "Rakefile",
     "VERSION",
@@ -34,7 +32,6 @@ Gem::Specification.new do |s|
     "bin/filter_exonerate_by_identity.rb",
     "bin/find_best_blat_hit.rb",
     "bin/find_best_exonerate.rb",
-    "bin/find_homoeologue_variations.rb",
     "bin/get_longest_hsp_blastx_triads.rb",
     "bin/hexaploid_primers.rb",
     "bin/homokaryot_primers.rb",
@@ -46,6 +43,7 @@ Gem::Specification.new do |s|
     "bin/mask_triads.rb",
     "bin/polymarker.rb",
     "bin/polymarker_capillary.rb",
+    "bin/polymarker_deletions.rb",
     "bin/snp_position_to_polymarker.rb",
     "bin/snps_between_bams.rb",
     "bin/tag_stats.rb",

data/lib/bio/PolyploidTools/ExonContainer.rb CHANGED

@@ -76,7 +76,6 @@ module Bio::PolyploidTools
     end
     def add_snp(snp)
-      #TODO: add to the snp the maximum number of hits?
       snp.max_hits = self.max_hits
       @snp_map[snp.gene] = Array.new unless   @snp_map[snp.gene]
       @snp_map[snp.gene] << snp
@@ -141,6 +140,7 @@ module Bio::PolyploidTools
           begin
             file.puts snp.aligned_sequences_fasta
           rescue Exception=>e
+            #puts snp.inspect
             @missing_exons << snp.to_s
             $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
             $stderr.puts "Local position: #{snp.local_position}"
@@ -160,8 +160,8 @@ module Bio::PolyploidTools
           begin
             primer_3_min_seq_length
             string = snp.primer_3_string( snp.chromosome, parental )
-            #TODO: add tan error to the SNP this snp has more than max_hits. Or maybe inside the SNP file.
-            #puts "print_primer_3_exons: #{string.size}"
+            #TODO: add tan error to the SNP this snp has more than max_hits.
+            #Or maybe inside the SNP file.
             if string.size > 0
               file.puts string
               added += 1

data/lib/bio/PolyploidTools/NoSNPSequence.rb CHANGED

@@ -55,11 +55,15 @@ module Bio::PolyploidTools
      def mask_aligned_chromosomal_snp(chromosome)
       return nil if  aligned_sequences.values.size == 0
-      names = exon_sequences.keys
+      names = aligned_sequences.keys
+      parentals =  parental_sequences.keys
+      names = names - parentals
+      best_target = get_target_sequence(names, chromosome)
+      masked_snps = aligned_sequences[best_target].downcase if aligned_sequences[best_target]
+      masked_snps = "-" * aligned_sequences.values[0].size  unless aligned_sequences[best_target]
-      masked_snps = aligned_sequences[chromosome].downcase if aligned_sequences[chromosome]
-      masked_snps = "-" * aligned_sequences.values[0].size  unless aligned_sequences[chromosome]
       #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
       i = 0
       while i < masked_snps.size
@@ -105,26 +109,23 @@ module Bio::PolyploidTools
         aligned_sequences.each_pair do |name, val|
           has_del = true if val[i] == '-'
-          print "#{val[i]}\t"
+          #print "#{val[i]}\t"
         end
         count += 1 if has_del
-        print "#{count}\n"
+        #print "#{count}\n"
       end
       return count
     end
     def primer_region(target_chromosome, parental_chr )
       chromosome_seq = aligned_sequences[target_chromosome]
-      #chromosome_seq = "-" * parental.size unless chromosome_seq
-      if aligned_sequences.size == 0
-        #puts aligned_sequences.inspect
-        #puts surrounding_exon_sequences.inspect
-        #puts self.inspect
-        chromosome_seq = surrounding_exon_sequences[target_chromosome]
-      end
+      names = aligned_sequences.keys
+      target_chromosome = get_target_sequence(names, target_chromosome)
+      chromosome_seq = aligned_sequences[target_chromosome]
+      chromosome_seq = surrounding_exon_sequences[target_chromosome ]if aligned_sequences.size == 0
+      chromosome_seq = "-" * sequence_original.size unless chromosome_seq
       chromosome_seq = chromosome_seq.downcase
+      #puts chromosome_seq
       mask = mask_aligned_chromosomal_snp(target_chromosome)
       pr = PrimerRegion.new
@@ -146,7 +147,7 @@ module Bio::PolyploidTools
               pr.crhomosome_specific_intron << position_in_region
             elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
               parental[i] = mask[i]
-              pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
+              pr.chromosome_specific << position_in_region #if count_deletions_around(1,target_chromosome) < 3
               pr.chromosome_specific_in_mask << i
             end
@@ -165,16 +166,15 @@ module Bio::PolyploidTools
           position_in_region += 1
         end #Closes region with bases
       end
       pr.sequence=parental.gsub('-','')
       pr
     end
-    def return_primer_3_string_test(opts={})
-      left = opts[:right_pos]
+    def return_primer_3_string(opts={})
+      #puts "return_primer_3_string #{opts.inspect}"
+      left = opts[:left_pos]
       right = opts[:right_pos]
-      sequence =  opts[:sequence]
+      sequence =  opts[:sequence].clone
       orientation = "forward"
       if opts[:right_pos]
         orientation = "forward"
@@ -201,7 +201,7 @@ module Bio::PolyploidTools
       #In case that we don't have a right primer, we do both orientations
       unless opts[:right_pos]
-        sequence =  opts[:sequence]
+        sequence =  opts[:sequence].clone
         left = sequence.size - left - 1
         orientation = "reverse"
         sequence = reverse_complement_string(sequence)
@@ -223,7 +223,9 @@ module Bio::PolyploidTools
     end
     def primer_3_all_strings(target_chromosome, parental)
+      #puts "primer_3_all_strings: #{target_chromosome} #{parental}"
       pr = primer_region(target_chromosome, parental )
+      #puts pr.inspect
       primer_3_propertes = Array.new
       seq_original = String.new(pr.sequence)
@@ -236,24 +238,28 @@ module Bio::PolyploidTools
         snp_type = "non-homoeologous"
       end
-      pr.chromosome_specific.each do |pos|
-        seq_snp =  String.new(pr.sequence)
-        orgiginal_base = seq_snp[pos]
-        other_chromosome_base = get_base_in_different_chromosome(pos, target_chromosome)
+      pr.chromosome_specific.each_with_index do |pos , i|
+        seq_snp =  seq_original.clone
+        #original_base = seq_snp[pos]
+        #puts "___"
+        #puts aligned_sequences.keys.inspect
+        #puts target_chromosome
+        t_chr =  get_target_sequence(aligned_sequences.keys, target_chromosome)
+        other_chromosome_base = get_base_in_different_chromosome(pr.chromosome_specific_in_mask[i], t_chr)
         args = {
           :name =>"#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
           :left_pos => pos,
-          :sequence=>seq_original
+          :sequence=>seq_snp
         }
+        seq_snp =  seq_original.clone
         primer_3_propertes << return_primer_3_string(args)
         args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
-        args[:sequence] = seq_snp
-        #TODO: Find base from another chromosome
         seq_snp[pos] =  other_chromosome_base.upcase
+        args[:sequence] = seq_snp
         primer_3_propertes << return_primer_3_string(args)
       end
@@ -265,7 +271,7 @@ module Bio::PolyploidTools
     def aligned_sequences
       return @aligned_sequences if @aligned_sequences
-      if sequences_to_align.size == 1
+      if sequences_to_align.size <= 1
         @aligned_sequences = sequences_to_align
         return @aligned_sequences
       end

data/lib/bio/PolyploidTools/SNP.rb CHANGED

@@ -162,6 +162,7 @@ module Bio::PolyploidTools
     end
     def add_exon(exon, arm, filter_best: true)
+      exon_list[arm] = Array.new unless exon_list[arm]
       if filter_best and exon_list[arm].size > 0
         current = exon_list[arm].first
         exon_list[arm] = [exon] if exon.record.score > current.record.score
@@ -558,7 +559,7 @@ module Bio::PolyploidTools
     def aligned_sequences
       return @aligned_sequences if @aligned_sequences
+      return Hash.new if sequences_to_align.size == 0
       options = ['--maxiterate', '1000', '--localpair', '--quiet']
       mafft = Bio::MAFFT.new( "mafft" , options)
@@ -756,13 +757,13 @@ module Bio::PolyploidTools
       self.exon_list.each do |chromosome, exon_arr|
         exon_arr.each do |exon|
           exon_start_offset = exon.query_region.start - gene_region.start
-          flanquing_region  = exon.target_flanking_region_from_position(position,flanking_size)
+          flanking_region  = exon.target_flanking_region_from_position(position,flanking_size)
           #TODO: Padd when the exon goes over the regions...
-          #puts flanquing_region.inspect
+          #puts flanking_region.inspect
           #Ignoring when the exon is in a gap
           unless exon.snp_in_gap
-            exon_seq = container.chromosome_sequence(flanquing_region)
-            @surrounding_exon_sequences["#{chromosome}_#{flanquing_region.start}_#{exon.record.score}"] = exon_seq
+            exon_seq = container.chromosome_sequence(flanking_region)
+            @surrounding_exon_sequences["#{chromosome}_#{flanking_region.start}_#{exon.record.score}"] = exon_seq
           end
         end
       end

data/lib/bio/db/blast.rb CHANGED

@@ -82,7 +82,7 @@ module Bio::DB::Blast
 		max_target_seqs = 6 #TODO: Actually add this as an argument to PolyMarker.
 		max_target_seqs = opts[:max_hits] * 2 if opts[:max_hits]
 		cmdline = "blastn -max_target_seqs #{max_target_seqs} -query #{query} -db #{target} -outfmt '6 qseqid qstart qend qframe sseqid sstart send sframe score pident qlen slen qseq sseq'"
+		#puts cmdline
 		status, stdout, stderr = systemu cmdline
 		if status.exitstatus == 0
 			alns = Array.new unless block_given?

data/lib/bio/db/primer3.rb CHANGED

@@ -129,12 +129,12 @@ module Bio::DB::Primer3
       @values << snp_type
       if primer3_line_1 and primer3_line_2
         #Block that searches both if both pairs have a TM
-        primer_2 = primer3_line_2.left_primer_with_coordinates(primer3_line_1.left_coordinates, primer3_line_1.orientation)
-        primer_2_tm = find_left_primer_temp(primer_2)
-        primer_1 = primer3_line_1.left_primer_with_coordinates(primer3_line_2.left_coordinates, primer3_line_2.orientation)
+        primer_1    = primer3_line_1.left_primer_with_coordinates(primer3_line_2.left_coordinates, primer3_line_2.orientation)
         primer_1_tm = find_left_primer_temp(primer_1)
-        #  $stderr.puts primer_1
-        #  $stderr.puts primer_2
+        primer_2    = primer3_line_2.left_primer_with_coordinates(primer3_line_1.left_coordinates, primer3_line_1.orientation)
+        primer_2_tm = find_left_primer_temp(primer_2)
         if primer3_line_1 < primer3_line_2 and primer_2_tm != "NA"
           @values << primer3_line_1.left_primer
           @values << primer_2
@@ -159,7 +159,7 @@ module Bio::DB::Primer3
           @values << primer3_line_2.best_pair.product_size
         else
-          first_candidate = find_primer_pair_first
+          first_candidate  = find_primer_pair_first
           second_candidate = find_primer_pair_second
           if first_candidate
@@ -183,7 +183,7 @@ module Bio::DB::Primer3
             @values << first_candidate.best_pair.left.tm
             @values << primer_2_tm
             @values << first_candidate.best_pair.right.tm
-            @values << "first"
+            @values << "first-"
             @values << first_candidate.best_pair.product_size
           elsif  second_candidate
             #puts "B"
@@ -195,7 +195,7 @@ module Bio::DB::Primer3
             @values << primer_1_tm
             @values << second_candidate.best_pair.left.tm
             @values << second_candidate.best_pair.right.tm
-            @values << "second"
+            @values << "second-"
             @values << second_candidate.best_pair.product_size
           elsif  first_candidate
             #puts "C"
@@ -207,7 +207,7 @@ module Bio::DB::Primer3
             @values << primer_2_tm
             @values << first_candidate.best_pair.left.tm
             @values << first_candidate.best_pair.right.tm
-            @values << "first"
+            @values << "first/"
             @values << first_candidate.best_pair.product_size
           end
         end
@@ -277,7 +277,6 @@ module Bio::DB::Primer3
     end
     def orientation
-      puts "insideOrientation: #{self.values[11]}"
       return self.values[11] if self.values[11]&& self.values[11] != nil
       return 'unknown'
     end
@@ -385,7 +384,7 @@ module Bio::DB::Primer3
           @primer3_line_1 = primer3record if not @primer3_line_1  or @primer3_line_1 > primer3record
         when primer3record.line == @line_2
           primers_line_2 << primer3record
-          @primer3_line_2 = primer3record if not @primer3_line_2 or @primer3_line_2 > primer3record
+          @primer3_line_2 = primer3record if not @primer3_line_2  or @primer3_line_2 > primer3record
         else
           raise Primer3Exception.new "#{primer3record.line} is not recognized (#{line_1}, #{line_2})"
         end
@@ -508,9 +507,7 @@ module Bio::DB::Primer3
     def left_primer_with_coordinates(coordinates, other_orientation)
       seq = self.sequence_template
-      #puts "Left coordinates: #{seq}"
-      seq = Primer3Record.reverse_complement_string(seq) if self.orientation != other_orientation
+      seq = Primer3Record.reverse_complement_string(seq) if self.orientation != other_orientation
       seq[coordinates[0],coordinates[1]]
     end
@@ -807,9 +804,9 @@ module Bio::DB::Primer3
       str = ""
       snp_hash.each do |k, snp|
         if snp.found_primers?
-          str << snp.gene << snp.original << "\t" << tail_a << snp.first_primer << "\n"
-          str << snp.gene << snp.snp      << "\t" << tail_b << snp.second_primer << "\n"
-          str << snp.gene                 << "\t"           << snp.common_primer << "\n"
+          str << snp.gene << snp.original << "_1st\t" << tail_a << snp.first_primer  << "\n"
+          str << snp.gene << snp.snp      << "_2nd\t" << tail_b << snp.second_primer << "\n"
+          str << snp.gene                 << "_common\t"        << snp.common_primer << "\n"
         end
       end
       return str

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bio-polyploid-tools
 version: !ruby/object:Gem::Version
-  version: 0.10.1
+  version: 1.0.0
 platform: ruby
 authors:
 - Ricardo H.  Ramirez-Gonzalez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-03-28 00:00:00.000000000 Z
+date: 2019-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio
@@ -120,7 +120,6 @@ executables:
 - filter_exonerate_by_identity.rb
 - find_best_blat_hit.rb
 - find_best_exonerate.rb
-- find_homoeologue_variations.rb
 - get_longest_hsp_blastx_triads.rb
 - hexaploid_primers.rb
 - homokaryot_primers.rb
@@ -132,6 +131,7 @@ executables:
 - mask_triads.rb
 - polymarker.rb
 - polymarker_capillary.rb
+- polymarker_deletions.rb
 - snp_position_to_polymarker.rb
 - snps_between_bams.rb
 - tag_stats.rb
@@ -139,12 +139,10 @@ executables:
 - vcfToPolyMarker.rb
 extensions: []
 extra_rdoc_files:
-- README
 - README.md
 files:
 - ".travis.yml"
 - Gemfile
-- README
 - README.md
 - Rakefile
 - VERSION
@@ -156,7 +154,6 @@ files:
 - bin/filter_exonerate_by_identity.rb
 - bin/find_best_blat_hit.rb
 - bin/find_best_exonerate.rb
-- bin/find_homoeologue_variations.rb
 - bin/get_longest_hsp_blastx_triads.rb
 - bin/hexaploid_primers.rb
 - bin/homokaryot_primers.rb
@@ -168,6 +165,7 @@ files:
 - bin/mask_triads.rb
 - bin/polymarker.rb
 - bin/polymarker_capillary.rb
+- bin/polymarker_deletions.rb
 - bin/snp_position_to_polymarker.rb
 - bin/snps_between_bams.rb
 - bin/tag_stats.rb

data/README DELETED

@@ -1,21 +0,0 @@
-= bio-polyploid-tools
-== Introduction
-This tools are designed to deal with polyploid wheat. The first tool is to design KASPer primers, making them as specific as possible.
-== Installation
-'gem install bio-polyploid-tools'
-== Notes
-* If the SNP is in a gap in the alignmetn to the chromosomes, it is ignored.
-BUG: Sometimes the primers are reversed (the first comes second)
-BUG: Blocks with NNNs are picked and treated as semi-specific.
-BUG: If the name of the reference have space, the ID is not chopped. ">gene_1 (G12A)" shouls be treated as ">gene_1".
-TODO: If reading from a reference file, only get one reference to align when the region is queried several times
-TODO: Add a parameter file file to tweak the alignments.