RubyGems - bio-polyploid-tools - Versions diffs - 0.10.1 → 1.0.0 - Mend

bio-polyploid-tools 0.10.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +4 -4
data/VERSION +1 -1
data/bin/polymarker.rb +23 -19
data/bin/polymarker_capillary.rb +75 -51
data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} +55 -90
data/bio-polyploid-tools.gemspec +5 -7
data/lib/bio/PolyploidTools/ExonContainer.rb +3 -3
data/lib/bio/PolyploidTools/NoSNPSequence.rb +38 -32
data/lib/bio/PolyploidTools/SNP.rb +6 -5
data/lib/bio/db/blast.rb +1 -1
data/lib/bio/db/primer3.rb +14 -17
metadata +4 -6
data/README +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1a74407d5aee3baf6b231007be242d2097f07f74a0a012e151c3aef43175ef73
-  data.tar.gz: fff2475fcf69dec083a67bff9fd573738ac810ca764e7d6e0c7338231e4a81bd
+  metadata.gz: a8d10f674380ca0d78e0efbbf5bd81e44327fd66dfcbc5f9443891ebad6f2ee5
+  data.tar.gz: b787eef663d8c1b2932b38a877bb870521e71c72f6584d9b08d3ebf0c937b36e
 SHA512:
-  metadata.gz: dc594e3c51d0a1c7fe2facf12002fb7d75b4324dcbaf15bb862e0890662364be709a6e1f1dbd9545a8b9da01c663eb6fe89a30c074ce9f6f3672af33879195fc
-  data.tar.gz: 3ffa7f6be31f7f2f1a4fddf669d4d95a565e7189db274c579d2c8ba298adae040e43cc5042c7e5405cbcb4d6b0355ef92f71e60c2c36cc516c119cbc075b98de
+  metadata.gz: 4fdad615441a69e1af27e9ca23949e57b36c100773ed17ced255bec11c6d1d04778622199e832901861c0494fea018155bbf2d9b737f1672e342b88197123782
+  data.tar.gz: 074c38a5d9b59a116509a45e43d406bcc113cecfa83029239d748128715e74815fbbbb8880035abfb6272d96048dd5fb029fd363f75f699abadf46135ad67bc0

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.10.1
1	+ 1.0.0

data/bin/polymarker.rb CHANGED

@@ -40,7 +40,7 @@ options[:scoring] = :genome_specific
 options[:database]  = false
 options[:filter_best]  = false
 options[:aligner] = :blast
+options[:max_hits] = 8
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
@@ -132,6 +132,10 @@ OptionParser.new do |opts|
   opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
     options[:database] = o
   end
+  opts.on("-H", "--max_hits INT", "Maximum number of hits to the reference. If there are more hits than this value, the marker is ignored") do |o|
+    options[:max_hits] = o.to_i
+  end
 end.parse!
@@ -233,8 +237,8 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.gene).get_full_region
        snp.template_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
+      write_status "WARN: Unable to find entry for #{snp.gene}"
+    end
     elsif options[:mutant_list] and options[:reference] #List and fasta file
       snp = Bio::PolyploidTools::SNPMutant.parse(line)
       entry = fasta_reference_db.index.region_for_entry(snp.contig)
@@ -242,21 +246,21 @@ File.open(test_file) do | f |
        region = fasta_reference_db.index.region_for_entry(snp.contig).get_full_region
        snp.full_sequence = fasta_reference_db.fetch_sequence(region)
      else
-        write_status "WARN: Unable to find entry for #{snp.gene}"
-      end
-    else
-      raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
-    end
-    raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
-    snp.genomes_count = options[:genomes_count]
-    snp.snp_in = snp_in
-    snp.original_name = original_name
-    if snp.position
-      snps << snp
-    else
-      $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+      write_status "WARN: Unable to find entry for #{snp.gene}"
     end
+  else
+    raise Bio::DB::Exonerate::ExonerateException.new "Wrong number of arguments. "
+  end
+  raise Bio::DB::Exonerate::ExonerateException.new "No SNP for line '#{line}'" if snp == nil
+  snp.max_hits = options[:max_hits]
+  snp.genomes_count = options[:genomes_count]
+  snp.snp_in = snp_in
+  snp.original_name = original_name
+  if snp.position
+    snps << snp
+  else
+    $stderr.puts "ERROR: #{snp.gene} doesn't contain a SNP"
+  end
   end
 end
@@ -307,7 +311,7 @@ def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
 end
-Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model}) do |aln|
+Bio::DB::Blast.align({:query=>temp_fasta_query, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
   do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
 end if options[:aligner] == :blast
@@ -334,7 +338,7 @@ container.gene_models(temp_fasta_query)
 container.chromosomes(target)
 container.add_parental({:name=>snp_in})
 container.add_parental({:name=>original_name})
+container.max_hits = options[:max_hits]
 snps.each do |snp|
   snp.container = container
   snp.flanking_size = container.flanking_size

data/bin/polymarker_capillary.rb CHANGED

@@ -35,15 +35,21 @@ options[:primer_3_preferences] = {
 }
 options[:genomes_count] = 3
 options[:allow_non_specific] = false
+options[:aligner] = :blast
+options[:arm_selection]
+model="ungapped"
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: polymarker_capillary.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
   opts.on("-r", "--reference FILE", "Fasta file with the assembly") do |o|
     options[:reference] = o
   end
-  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome should match the names to the entries in the fasta files as it is used as main target") do |o|
+  opts.on("-m", "--sequences FILE", "Fasta file with the sequences to amplify. the format must be Chromosome:start-end. Chromosome
+    should match the names to the entries in the fasta files as it is used as main target") do |o|
     options[:markers] = o
   end
@@ -53,10 +59,19 @@ OptionParser.new do |opts|
   opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
     options[:genomes_count] = o.to_i
   end
-  opts.on("-a", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
+  opts.on("-A", "--allow_non_specific", "If used, semi-specific and non-specific primers will be produced") do |o|
     options[:allow_non_specific] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+  opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
@@ -65,23 +80,33 @@ reference     = options[:reference]
 markers       = options[:markers]
 output_folder = options[:output_folder]
 allow_non_specific = options[:allow_non_specific]
+options[:database] = options[:reference] unless  options[:database]
+temp_fasta_query="#{output_folder}/to_align.fa"
 log "Output folder: #{output_folder}"
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
 Dir.mkdir(output_folder)
+arm_selection = options[:arm_selection]
 module Bio::PolyploidTools
   class SequenceToAmplify < SNP
-    def self.select_chromosome(contig_name)
-      arr = contig_name.split('_')
-      ret = "U"
-      ret = arr[2][0,2] if arr.size >= 3
-      ret = "3B" if arr.size == 2 and arr[0] == "v443"
-      ret = arr[0][0,2] if arr.size == 1
+    def self.select_chromosome(gene_name, arm_selection)
+      #m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(gene_name)
+      #m=/TraesCS(\d{1})(\w{1})(\d{2})G(\d+)/.match(gene_name)
+      #ret = {:group : m[1],
+      #       :genome : m[2],:version=>m[3],:chr_id=>m[4]}
+      #arr = contig_name.split('_')
+      #ret = "U"
+      #ret = arr[2][0,2] if arr.size >= 3
+      #ret = "3B" if arr.size == 2 and arr[0] == "v443"
+      #ret = arr[0][0,2] if arr.size == 1
+      #ret = "#{m[1]}#{m[2]}"
+      #puts ret
+      ret = arm_selection.call(gene_name)
       return ret
     end
@@ -92,18 +117,18 @@ module Bio::PolyploidTools
     #Format:
     #A fasta entry with the id: contig:start-end
     #The sequence can be prodcued with samtools faidx
-    def self.parse(fasta_entry)
+    def self.parse(fasta_entry, arm_selection)
+      #puts fasta_entry.definition
       snp = SequenceToAmplify.new
       match_data = /(?<rname>\w*):(?<rstart>\w*)-(?<rend>\w*)/.match(fasta_entry.definition)
+      #puts match_data.inspect
       rName = Regexp.last_match(:rname)
       rStart =  Regexp.last_match(:rstart).to_i
       rEnd =  Regexp.last_match(:rend).to_i
       snp.gene = fasta_entry.definition
       #snp.chromosome=rName
-      snp.chromosome=select_chromosome(rName)
+      #puts "Gene: #{snp.gene}"
+      snp.chromosome=select_chromosome(fasta_entry.definition, arm_selection)
       #puts "#{rName}: #{snp.chromosome}"
       snp.sequence_original = fasta_entry.seq
       snp.template_sequence = fasta_entry.seq.upcase
@@ -111,7 +136,7 @@ module Bio::PolyploidTools
       snp.rstart = rStart
       snp.rend = rEnd
-      snp.position   = 100
+      snp.position   = snp.sequence_original.size / 2
       snp.original   = snp.sequence_original[snp.position]
       tmp =  Bio::Sequence::NA.new(snp.original)
@@ -232,10 +257,13 @@ file = Bio::FastaFormat.open(markers)
 file.each do |entry|
   begin
-    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry)
+    #puts entry.inspect
+    tmp = Bio::PolyploidTools::SequenceToAmplify.parse(entry, arm_selection)
     snps << tmp if tmp
-  rescue
+  rescue Exception => e
+    log "ERROR\t#{e.message}"
     $stderr.puts "Unable to generate the marker for: #{entry.definition}"
+    $stderr.puts e.backtrace
   end
 end
@@ -251,40 +279,33 @@ fasta_file.load_fai_entries
 min_identity = 95
 found_contigs = Set.new
-Bio::DB::Exonerate.align({:query=>markers, :target=>reference, :model=>'ungapped'}) do |aln|
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
   if aln.identity > min_identity
     exo_f.puts aln.line
-    #puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
-      raise Exception.new,  "Entry not found! #{aln.target_id}. Make sure that the #{reference}.fai was generated properly." if entry == nil
+      raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
+      if options[:extract_found_contigs]
+        region = entry.get_full_region
+        seq = fasta_file.fetch_sequence(region)
+        contigs_f.puts(">#{aln.target_id}\n#{seq}")
+      end
     end
   end
-end
-exo_f.close
-arm_selection_functions = Hash.new
-arm_selection_functions[:full_scaffold] = lambda do | contig_name |
-  return contig_name
 end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
+Bio::DB::Blast.align({:query=>markers, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>markers, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
+exo_f.close
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=500
@@ -292,6 +313,7 @@ container.gene_models(markers)
 container.chromosomes(target)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
+#puts "SNPs size: #{snps.size}"
 snps.each do |snp|
   snp.snp_in = "B"
   snp.container = container
@@ -300,8 +322,10 @@ snps.each do |snp|
   snp.includeNoSpecific = allow_non_specific
   container.add_snp(snp)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>arm_selection_functions[:arm_selection_embl] , :min_identity=>min_identity})
+container.add_alignments({:exonerate_file=>exonerate_file,
+  :arm_selection=> arm_selection,
+  :min_identity=>min_identity})
 exons_filename="#{output_folder}/localAlignment.fa"
@@ -329,6 +353,9 @@ output_file  = "#{output_folder}/primers.csv"
 file = File.open(masks_output, "w")
 out  = File.open(output_file,  "w")
+out.puts ["Id","specificity","inside","type","target","orientation","product_size",
+  "left_position","left_tm","left_sequence",
+"right_position","right_tm","right_sequence"].join ","
 class Bio::DB::Primer3::Primer3Record
   attr_accessor :primerPairs
 end
@@ -358,10 +385,7 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
   file.puts ">#{seq_id}\n#{sequence_template}"
   file.puts ">#{seq_id}:mask\n#{sequence_mask}"
-   #puts "FDFDS"
-   #puts primer3record.primerPairs
    primer3record.primerPairs.each do |p|
     #puts p.inspect
     printed += 1
@@ -381,10 +405,10 @@ Bio::DB::Primer3::Primer3Record.parse_file(primer_3_output) do | primer3record |
     toPrint <<  p.right.sequence
     middle = 501
-    toPrint << lArr[0]
-    toPrint << rArr[0]
-    toPrint << middle - lArr[0]
-    toPrint << rArr[0] - middle
+    #toPrint << lArr[0]
+    #toPrint << rArr[0]
+    #toPrint << middle - lArr[0]
+    #toPrint << rArr[0] - middle
 #Start End LeftDistance  RightDistance
     out.puts toPrint.join(",")

data/bin/{find_homoeologue_variations.rb → polymarker_deletions.rb} RENAMED

@@ -53,14 +53,12 @@ class Bio::PolyploidTools::ExonContainer
 end
 class Bio::DB::Primer3::SNP
   def to_s
      "#{gene}:#{snp_from.chromosome}"
   end
 end
-class Bio::DB::Primer3::Primer3Record
+class Bio::DB::Primer3::Primer3Record
   def best_pair
     return @best_pair if @best_pair
@@ -82,7 +80,7 @@ class Bio::DB::Primer3::Primer3Record
         @total_caps = capital_count
       end
     end
-    #@best_pair = @primerPairs.min
     @best_pair
   end
@@ -107,12 +105,13 @@ class Bio::DB::Primer3::Primer3Record
   def score
     best_pair
+    total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
 #    puts "score"
  #   puts self.inspect
     ret = 0
     ret += @scores[type]
     ret += @scores[:exon] if exon?
-    ret -= @total_caps * 10
+    ret -= total_caps * 10
     ret -= product_length
     ret
   end
@@ -123,71 +122,21 @@ class Bio::DB::Primer3::Primer3Record
    def left_primer_snp(snp)
       tmp_primer = String.new(left_primer)
-      #if self.orientation == :forward
-      #  base_original = snp.original
-      #  base_snp = snp.snp
-      #elsif self.orientation == :reverse
-      #  base_original = reverse_complement_string(snp.original )
-      #  base_snp = reverse_complement_string(snp.snp)
-      #else
-      #  raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
-      #end
-      # puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
-      #if tmp_primer[-1] == base_original
-      #  tmp_primer[-1] = base_snp
-      #elsif tmp_primer[-1] == base_snp
-      #  tmp_primer[-1] = base_original
-      #else
-      #  raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
-      #end
-      #puts "tmp_primer: #{tmp_primer}"
       return tmp_primer
     end
 end
-arm_selection_functions = Hash.new;
-arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
-  ret = contig_name[0,2]
-  return ret
-end
-#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
-#Or the first two characters in the contig name, to deal with
-#pseudomolecules that start with headers like: "1A"
-#And with the cases when 3B is named with the prefix: v443
-arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
-  arr = contig_name.split('_')
-  ret = "U"
-  ret = arr[2][0,2] if arr.size >= 3
-  ret = "3B" if arr.size == 2 and arr[0] == "v443"
-  ret = arr[0][0,2] if arr.size == 1
-  return ret
-end
-arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
-  ret = contig_name.split(':')[0].split("_")[1];
-  return ret
-end
-arm_selection_functions[:scaffold] = lambda do | contig_name |
-  ret = contig_name;
-  return ret
-end
 markers = nil
 options = {}
+options[:aligner] = :blast
 options[:model] = "est2genome"
 options[:min_identity] = 90
-options[:extract_found_contigs] = false
-options[:arm_selection] = arm_selection_functions[:arm_selection_embl] ;
+options[:extract_found_contigs] = true
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
 options[:genomes_count] = 3
+options[:variation_free_region] =0
 options[:primer_3_preferences] = {
       :primer_product_size_range => "50-150" ,
@@ -200,11 +149,14 @@ options[:primer_3_preferences] = {
   }
+options[:database]  = false
 OptionParser.new do |opts|
-  opts.banner = "Usage: find_homoeologue_variations.rb [options]"
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
-  opts.on("-c", "--sequences FASTA", "Sequence of the region to searc") do |o|
+  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
     options[:sequences] = o
   end
   opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
@@ -221,6 +173,14 @@ OptionParser.new do |opts|
   opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
     options[:extract_found_contigs] = true
   end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+    opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
 end.parse!
 #reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
@@ -231,11 +191,14 @@ throw raise Exception.new(), "Fasta file with sequences has to be provided" unle
 output_folder = options[:output] if options[:output]
 throw raise Exception.new(), "An output directory has to be provided" unless output_folder
 model=options[:model]
+options[:database] = options[:reference] unless  options[:database]
 Dir.mkdir(output_folder)
 min_identity= options[:min_identity]
 exonerate_file="#{output_folder}/exonerate_tmp.tab"
-temp_contigs="#{output_folder}/contigs_tmp.fa"
 primer_3_input="#{output_folder}/primer_3_input_temp"
 primer_3_output="#{output_folder}/primer_3_output_temp"
 exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
@@ -248,14 +211,8 @@ fasta_file.load_fai_entries
 original_name="A"
 snp_in="B"
- arm_selection = options[:arm_selection]
+arm_selection = options[:arm_selection]
-unless arm_selection
-   arm_selection = lambda do | contig_name |
-      ret = contig_name[0,3]
-      return ret
-    end
-end
 begin
 log "Reading exons"
 exons = Array.new
@@ -279,22 +236,28 @@ end
 log "Searching markers in genome"
 found_contigs = Set.new
 exo_f = File.open(exonerate_file, "w")
-contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
-Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
-	if aln.identity > min_identity
+def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+  if aln.identity > min_identity
     exo_f.puts aln.line
     unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file.
       found_contigs.add(aln.target_id)
       entry = fasta_file.index.region_for_entry(aln.target_id)
       raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
-      region = entry.get_full_region
-      seq = fasta_file.fetch_sequence(region)
-      contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
     end
   end
 end
+Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+  do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+end if options[:aligner] == :blast
+Bio::DB::Exonerate.align({:query=>sequences, :target=>target, :model=>model}) do |aln|
+  do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+end if options[:aligner] == :exonerate
 exo_f.close()
-contigs_f.close() if options[:extract_found_contigs]
@@ -303,18 +266,24 @@ log "Reading best alignment on each chromosome"
 container= Bio::PolyploidTools::ExonContainer.new
 container.flanking_size=options[:flanking_size]
 container.gene_models(sequences)
-container.chromosomes(temp_contigs)
+container.chromosomes(reference)
 container.add_parental({:name=>"A"})
 container.add_parental({:name=>"B"})
 exons.each do |exon|
   exon.container = container
-  exon.flanking_size = 50
+  exon.flanking_size = 200
   exon.variation_free_region = options[:variation_free_region]
-#  puts exon.inspect
+  #puts exon.inspect
   container.add_snp(exon)
 end
-container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})
+container.add_alignments(
+  {:exonerate_file=>exonerate_file,
+  :arm_selection=>options[:arm_selection] ,
+  :min_identity=>min_identity})
 #4.1 generating primer3 file
 log "Running primer3"
@@ -348,18 +317,14 @@ exons.each do |snp|
 end
 kasp_container.add_primers_file(primer_3_output) if added_exons > 0
-header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
+header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
 File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
-kasp_container.snp_hash.each_pair do |name, kaspSNP|
-  #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
-  #puts kaspSNP.first_product
-  #puts kaspSNP.realigned_primers
-  out_fasta_products = "#{output_folder}/#{name}.fa"
-  File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }
+out_fasta_products = "#{output_folder}/products.fa"
+File.open(out_fasta_products, 'w') do  |f|
+  kasp_container.snp_hash.each_pair do |name, kaspSNP|
+    f.write(kaspSNP.realigned_primers_fasta)
+  end
 end
 File.open(output_to_order, "w") { |io|  io.write(kasp_container.print_primers_with_tails()) }

data/bio-polyploid-tools.gemspec CHANGED

@@ -2,27 +2,25 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Juwelier::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
-# stub: bio-polyploid-tools 0.10.1 ruby lib
+# stub: bio-polyploid-tools 1.0.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "bio-polyploid-tools".freeze
-  s.version = "0.10.1"
+  s.version = "1.0.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
   s.require_paths = ["lib".freeze]
   s.authors = ["Ricardo H.  Ramirez-Gonzalez".freeze]
-  s.date = "2019-03-28"
+  s.date = "2019-07-05"
   s.description = "Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat".freeze
   s.email = "ricardo.ramirez-gonzalez@jic.ac.uk".freeze
-  s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "find_homoeologue_variations.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "marker_to_vcf.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze, "vcfToPolyMarker.rb".freeze]
+  s.executables = ["bfr.rb".freeze, "blast_triads.rb".freeze, "blast_triads_promoters.rb".freeze, "count_variations.rb".freeze, "filter_blat_by_target_coverage.rb".freeze, "filter_exonerate_by_identity.rb".freeze, "find_best_blat_hit.rb".freeze, "find_best_exonerate.rb".freeze, "get_longest_hsp_blastx_triads.rb".freeze, "hexaploid_primers.rb".freeze, "homokaryot_primers.rb".freeze, "mafft_triads.rb".freeze, "mafft_triads_promoters.rb".freeze, "map_markers_to_contigs.rb".freeze, "marker_to_vcf.rb".freeze, "markers_in_region.rb".freeze, "mask_triads.rb".freeze, "polymarker.rb".freeze, "polymarker_capillary.rb".freeze, "polymarker_deletions.rb".freeze, "snp_position_to_polymarker.rb".freeze, "snps_between_bams.rb".freeze, "tag_stats.rb".freeze, "vcfLineToTable.rb".freeze, "vcfToPolyMarker.rb".freeze]
   s.extra_rdoc_files = [
-    "README",
     "README.md"
   ]
   s.files = [
     ".travis.yml",
     "Gemfile",
-    "README",
     "README.md",
     "Rakefile",
     "VERSION",
@@ -34,7 +32,6 @@ Gem::Specification.new do |s|
     "bin/filter_exonerate_by_identity.rb",
     "bin/find_best_blat_hit.rb",
     "bin/find_best_exonerate.rb",
-    "bin/find_homoeologue_variations.rb",
     "bin/get_longest_hsp_blastx_triads.rb",
     "bin/hexaploid_primers.rb",
     "bin/homokaryot_primers.rb",
@@ -46,6 +43,7 @@ Gem::Specification.new do |s|
     "bin/mask_triads.rb",
     "bin/polymarker.rb",
     "bin/polymarker_capillary.rb",
+    "bin/polymarker_deletions.rb",
     "bin/snp_position_to_polymarker.rb",
     "bin/snps_between_bams.rb",
     "bin/tag_stats.rb",

data/lib/bio/PolyploidTools/ExonContainer.rb CHANGED

@@ -76,7 +76,6 @@ module Bio::PolyploidTools
     end
     def add_snp(snp)
-      #TODO: add to the snp the maximum number of hits?
       snp.max_hits = self.max_hits
       @snp_map[snp.gene] = Array.new unless   @snp_map[snp.gene]
       @snp_map[snp.gene] << snp
@@ -141,6 +140,7 @@ module Bio::PolyploidTools
           begin
             file.puts snp.aligned_sequences_fasta
           rescue Exception=>e
+            #puts snp.inspect
             @missing_exons << snp.to_s
             $stderr.puts "print_fasta_snp_exones:" + snp.to_s + ":" + e.to_s
             $stderr.puts "Local position: #{snp.local_position}"
@@ -160,8 +160,8 @@ module Bio::PolyploidTools
           begin
             primer_3_min_seq_length
             string = snp.primer_3_string( snp.chromosome, parental )
-            #TODO: add tan error to the SNP this snp has more than max_hits. Or maybe inside the SNP file.
-            #puts "print_primer_3_exons: #{string.size}"
+            #TODO: add tan error to the SNP this snp has more than max_hits.
+            #Or maybe inside the SNP file.
             if string.size > 0
               file.puts string
               added += 1

data/lib/bio/PolyploidTools/NoSNPSequence.rb CHANGED

@@ -55,11 +55,15 @@ module Bio::PolyploidTools
      def mask_aligned_chromosomal_snp(chromosome)
       return nil if  aligned_sequences.values.size == 0
-      names = exon_sequences.keys
+      names = aligned_sequences.keys
+      parentals =  parental_sequences.keys
+      names = names - parentals
+      best_target = get_target_sequence(names, chromosome)
+      masked_snps = aligned_sequences[best_target].downcase if aligned_sequences[best_target]
+      masked_snps = "-" * aligned_sequences.values[0].size  unless aligned_sequences[best_target]
-      masked_snps = aligned_sequences[chromosome].downcase if aligned_sequences[chromosome]
-      masked_snps = "-" * aligned_sequences.values[0].size  unless aligned_sequences[chromosome]
       #TODO: Make this chromosome specific, even when we have more than one alignment going to the region we want.
       i = 0
       while i < masked_snps.size
@@ -105,26 +109,23 @@ module Bio::PolyploidTools
         aligned_sequences.each_pair do |name, val|
           has_del = true if val[i] == '-'
-          print "#{val[i]}\t"
+          #print "#{val[i]}\t"
         end
         count += 1 if has_del
-        print "#{count}\n"
+        #print "#{count}\n"
       end
       return count
     end
     def primer_region(target_chromosome, parental_chr )
       chromosome_seq = aligned_sequences[target_chromosome]
-      #chromosome_seq = "-" * parental.size unless chromosome_seq
-      if aligned_sequences.size == 0
-        #puts aligned_sequences.inspect
-        #puts surrounding_exon_sequences.inspect
-        #puts self.inspect
-        chromosome_seq = surrounding_exon_sequences[target_chromosome]
-      end
+      names = aligned_sequences.keys
+      target_chromosome = get_target_sequence(names, target_chromosome)
+      chromosome_seq = aligned_sequences[target_chromosome]
+      chromosome_seq = surrounding_exon_sequences[target_chromosome ]if aligned_sequences.size == 0
+      chromosome_seq = "-" * sequence_original.size unless chromosome_seq
       chromosome_seq = chromosome_seq.downcase
+      #puts chromosome_seq
       mask = mask_aligned_chromosomal_snp(target_chromosome)
       pr = PrimerRegion.new
@@ -146,7 +147,7 @@ module Bio::PolyploidTools
               pr.crhomosome_specific_intron << position_in_region
             elsif Bio::NucleicAcid.is_valid(parental[i], mask[i])
               parental[i] = mask[i]
-              pr.chromosome_specific << position_in_region if count_deletions_around(1,target_chromosome) < 3
+              pr.chromosome_specific << position_in_region #if count_deletions_around(1,target_chromosome) < 3
               pr.chromosome_specific_in_mask << i
             end
@@ -165,16 +166,15 @@ module Bio::PolyploidTools
           position_in_region += 1
         end #Closes region with bases
       end
       pr.sequence=parental.gsub('-','')
       pr
     end
-    def return_primer_3_string_test(opts={})
-      left = opts[:right_pos]
+    def return_primer_3_string(opts={})
+      #puts "return_primer_3_string #{opts.inspect}"
+      left = opts[:left_pos]
       right = opts[:right_pos]
-      sequence =  opts[:sequence]
+      sequence =  opts[:sequence].clone
       orientation = "forward"
       if opts[:right_pos]
         orientation = "forward"
@@ -201,7 +201,7 @@ module Bio::PolyploidTools
       #In case that we don't have a right primer, we do both orientations
       unless opts[:right_pos]
-        sequence =  opts[:sequence]
+        sequence =  opts[:sequence].clone
         left = sequence.size - left - 1
         orientation = "reverse"
         sequence = reverse_complement_string(sequence)
@@ -223,7 +223,9 @@ module Bio::PolyploidTools
     end
     def primer_3_all_strings(target_chromosome, parental)
+      #puts "primer_3_all_strings: #{target_chromosome} #{parental}"
       pr = primer_region(target_chromosome, parental )
+      #puts pr.inspect
       primer_3_propertes = Array.new
       seq_original = String.new(pr.sequence)
@@ -236,24 +238,28 @@ module Bio::PolyploidTools
         snp_type = "non-homoeologous"
       end
-      pr.chromosome_specific.each do |pos|
-        seq_snp =  String.new(pr.sequence)
-        orgiginal_base = seq_snp[pos]
-        other_chromosome_base = get_base_in_different_chromosome(pos, target_chromosome)
+      pr.chromosome_specific.each_with_index do |pos , i|
+        seq_snp =  seq_original.clone
+        #original_base = seq_snp[pos]
+        #puts "___"
+        #puts aligned_sequences.keys.inspect
+        #puts target_chromosome
+        t_chr =  get_target_sequence(aligned_sequences.keys, target_chromosome)
+        other_chromosome_base = get_base_in_different_chromosome(pr.chromosome_specific_in_mask[i], t_chr)
         args = {
           :name =>"#{gene} A chromosome_specific exon #{snp_type} #{chromosome}",
           :left_pos => pos,
-          :sequence=>seq_original
+          :sequence=>seq_snp
         }
+        seq_snp =  seq_original.clone
         primer_3_propertes << return_primer_3_string(args)
         args[:name] = "#{gene} B chromosome_specific exon #{snp_type} #{chromosome}"
-        args[:sequence] = seq_snp
-        #TODO: Find base from another chromosome
         seq_snp[pos] =  other_chromosome_base.upcase
+        args[:sequence] = seq_snp
         primer_3_propertes << return_primer_3_string(args)
       end
@@ -265,7 +271,7 @@ module Bio::PolyploidTools
     def aligned_sequences
       return @aligned_sequences if @aligned_sequences
-      if sequences_to_align.size == 1
+      if sequences_to_align.size <= 1
         @aligned_sequences = sequences_to_align
         return @aligned_sequences
       end

data/lib/bio/PolyploidTools/SNP.rb CHANGED

@@ -162,6 +162,7 @@ module Bio::PolyploidTools
     end
     def add_exon(exon, arm, filter_best: true)
+      exon_list[arm] = Array.new unless exon_list[arm]
       if filter_best and exon_list[arm].size > 0
         current = exon_list[arm].first
         exon_list[arm] = [exon] if exon.record.score > current.record.score
@@ -558,7 +559,7 @@ module Bio::PolyploidTools
     def aligned_sequences
       return @aligned_sequences if @aligned_sequences
+      return Hash.new if sequences_to_align.size == 0
       options = ['--maxiterate', '1000', '--localpair', '--quiet']
       mafft = Bio::MAFFT.new( "mafft" , options)
@@ -756,13 +757,13 @@ module Bio::PolyploidTools
       self.exon_list.each do |chromosome, exon_arr|
         exon_arr.each do |exon|
           exon_start_offset = exon.query_region.start - gene_region.start
-          flanquing_region  = exon.target_flanking_region_from_position(position,flanking_size)
+          flanking_region  = exon.target_flanking_region_from_position(position,flanking_size)
           #TODO: Padd when the exon goes over the regions...
-          #puts flanquing_region.inspect
+          #puts flanking_region.inspect
           #Ignoring when the exon is in a gap
           unless exon.snp_in_gap
-            exon_seq = container.chromosome_sequence(flanquing_region)
-            @surrounding_exon_sequences["#{chromosome}_#{flanquing_region.start}_#{exon.record.score}"] = exon_seq
+            exon_seq = container.chromosome_sequence(flanking_region)
+            @surrounding_exon_sequences["#{chromosome}_#{flanking_region.start}_#{exon.record.score}"] = exon_seq
           end
         end
       end

data/lib/bio/db/blast.rb CHANGED

@@ -82,7 +82,7 @@ module Bio::DB::Blast
 		max_target_seqs = 6 #TODO: Actually add this as an argument to PolyMarker.
 		max_target_seqs = opts[:max_hits] * 2 if opts[:max_hits]
 		cmdline = "blastn -max_target_seqs #{max_target_seqs} -query #{query} -db #{target} -outfmt '6 qseqid qstart qend qframe sseqid sstart send sframe score pident qlen slen qseq sseq'"
+		#puts cmdline
 		status, stdout, stderr = systemu cmdline
 		if status.exitstatus == 0
 			alns = Array.new unless block_given?

data/lib/bio/db/primer3.rb CHANGED

@@ -129,12 +129,12 @@ module Bio::DB::Primer3
       @values << snp_type
       if primer3_line_1 and primer3_line_2
         #Block that searches both if both pairs have a TM
-        primer_2 = primer3_line_2.left_primer_with_coordinates(primer3_line_1.left_coordinates, primer3_line_1.orientation)
-        primer_2_tm = find_left_primer_temp(primer_2)
-        primer_1 = primer3_line_1.left_primer_with_coordinates(primer3_line_2.left_coordinates, primer3_line_2.orientation)
+        primer_1    = primer3_line_1.left_primer_with_coordinates(primer3_line_2.left_coordinates, primer3_line_2.orientation)
         primer_1_tm = find_left_primer_temp(primer_1)
-        #  $stderr.puts primer_1
-        #  $stderr.puts primer_2
+        primer_2    = primer3_line_2.left_primer_with_coordinates(primer3_line_1.left_coordinates, primer3_line_1.orientation)
+        primer_2_tm = find_left_primer_temp(primer_2)
         if primer3_line_1 < primer3_line_2 and primer_2_tm != "NA"
           @values << primer3_line_1.left_primer
           @values << primer_2
@@ -159,7 +159,7 @@ module Bio::DB::Primer3
           @values << primer3_line_2.best_pair.product_size
         else
-          first_candidate = find_primer_pair_first
+          first_candidate  = find_primer_pair_first
           second_candidate = find_primer_pair_second
           if first_candidate
@@ -183,7 +183,7 @@ module Bio::DB::Primer3
             @values << first_candidate.best_pair.left.tm
             @values << primer_2_tm
             @values << first_candidate.best_pair.right.tm
-            @values << "first"
+            @values << "first-"
             @values << first_candidate.best_pair.product_size
           elsif  second_candidate
             #puts "B"
@@ -195,7 +195,7 @@ module Bio::DB::Primer3
             @values << primer_1_tm
             @values << second_candidate.best_pair.left.tm
             @values << second_candidate.best_pair.right.tm
-            @values << "second"
+            @values << "second-"
             @values << second_candidate.best_pair.product_size
           elsif  first_candidate
             #puts "C"
@@ -207,7 +207,7 @@ module Bio::DB::Primer3
             @values << primer_2_tm
             @values << first_candidate.best_pair.left.tm
             @values << first_candidate.best_pair.right.tm
-            @values << "first"
+            @values << "first/"
             @values << first_candidate.best_pair.product_size
           end
         end
@@ -277,7 +277,6 @@ module Bio::DB::Primer3
     end
     def orientation
-      puts "insideOrientation: #{self.values[11]}"
       return self.values[11] if self.values[11]&& self.values[11] != nil
       return 'unknown'
     end
@@ -385,7 +384,7 @@ module Bio::DB::Primer3
           @primer3_line_1 = primer3record if not @primer3_line_1  or @primer3_line_1 > primer3record
         when primer3record.line == @line_2
           primers_line_2 << primer3record
-          @primer3_line_2 = primer3record if not @primer3_line_2 or @primer3_line_2 > primer3record
+          @primer3_line_2 = primer3record if not @primer3_line_2  or @primer3_line_2 > primer3record
         else
           raise Primer3Exception.new "#{primer3record.line} is not recognized (#{line_1}, #{line_2})"
         end
@@ -508,9 +507,7 @@ module Bio::DB::Primer3
     def left_primer_with_coordinates(coordinates, other_orientation)
       seq = self.sequence_template
-      #puts "Left coordinates: #{seq}"
-      seq = Primer3Record.reverse_complement_string(seq) if self.orientation != other_orientation
+      seq = Primer3Record.reverse_complement_string(seq) if self.orientation != other_orientation
       seq[coordinates[0],coordinates[1]]
     end
@@ -807,9 +804,9 @@ module Bio::DB::Primer3
       str = ""
       snp_hash.each do |k, snp|
         if snp.found_primers?
-          str << snp.gene << snp.original << "\t" << tail_a << snp.first_primer << "\n"
-          str << snp.gene << snp.snp      << "\t" << tail_b << snp.second_primer << "\n"
-          str << snp.gene                 << "\t"           << snp.common_primer << "\n"
+          str << snp.gene << snp.original << "_1st\t" << tail_a << snp.first_primer  << "\n"
+          str << snp.gene << snp.snp      << "_2nd\t" << tail_b << snp.second_primer << "\n"
+          str << snp.gene                 << "_common\t"        << snp.common_primer << "\n"
         end
       end
       return str

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bio-polyploid-tools
 version: !ruby/object:Gem::Version
-  version: 0.10.1
+  version: 1.0.0
 platform: ruby
 authors:
 - Ricardo H.  Ramirez-Gonzalez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-03-28 00:00:00.000000000 Z
+date: 2019-07-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio
@@ -120,7 +120,6 @@ executables:
 - filter_exonerate_by_identity.rb
 - find_best_blat_hit.rb
 - find_best_exonerate.rb
-- find_homoeologue_variations.rb
 - get_longest_hsp_blastx_triads.rb
 - hexaploid_primers.rb
 - homokaryot_primers.rb
@@ -132,6 +131,7 @@ executables:
 - mask_triads.rb
 - polymarker.rb
 - polymarker_capillary.rb
+- polymarker_deletions.rb
 - snp_position_to_polymarker.rb
 - snps_between_bams.rb
 - tag_stats.rb
@@ -139,12 +139,10 @@ executables:
 - vcfToPolyMarker.rb
 extensions: []
 extra_rdoc_files:
-- README
 - README.md
 files:
 - ".travis.yml"
 - Gemfile
-- README
 - README.md
 - Rakefile
 - VERSION
@@ -156,7 +154,6 @@ files:
 - bin/filter_exonerate_by_identity.rb
 - bin/find_best_blat_hit.rb
 - bin/find_best_exonerate.rb
-- bin/find_homoeologue_variations.rb
 - bin/get_longest_hsp_blastx_triads.rb
 - bin/hexaploid_primers.rb
 - bin/homokaryot_primers.rb
@@ -168,6 +165,7 @@ files:
 - bin/mask_triads.rb
 - bin/polymarker.rb
 - bin/polymarker_capillary.rb
+- bin/polymarker_deletions.rb
 - bin/snp_position_to_polymarker.rb
 - bin/snps_between_bams.rb
 - bin/tag_stats.rb

data/README DELETED

@@ -1,21 +0,0 @@
-= bio-polyploid-tools
-== Introduction
-These tools are designed to deal with polyploid wheat. The first tool is to design KASPer primers, making them as specific as possible.
-== Installation
-'gem install bio-polyploid-tools'
-== Notes
-* If the SNP is in a gap in the alignment to the chromosomes, it is ignored.
-BUG: Sometimes the primers are reversed (the first comes second)
-BUG: Blocks with NNNs are picked and treated as semi-specific.
-BUG: If the name of the reference has a space, the ID is not chopped. ">gene_1 (G12A)" should be treated as ">gene_1".
-TODO: If reading from a reference file, only get one reference to align when the region is queried several times
-TODO: Add a parameter file to tweak the alignments.