RubyGems - bio-polymarker - Versions diffs - 1.3.2 - Mend

bio-polymarker 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177) hide show

checksums.yaml +7 -0
data/.travis.yml +24 -0
data/Gemfile +23 -0
data/README.md +205 -0
data/Rakefile +61 -0
data/SECURITY.md +16 -0
data/VERSION +1 -0
data/bin/bfr.rb +128 -0
data/bin/blast_triads.rb +166 -0
data/bin/blast_triads_promoters.rb +192 -0
data/bin/count_variations.rb +36 -0
data/bin/filter_blat_by_target_coverage.rb +69 -0
data/bin/filter_exonerate_by_identity.rb +38 -0
data/bin/find_best_blat_hit.rb +33 -0
data/bin/find_best_exonerate.rb +17 -0
data/bin/get_longest_hsp_blastx_triads.rb +66 -0
data/bin/hexaploid_primers.rb +168 -0
data/bin/homokaryot_primers.rb +183 -0
data/bin/mafft_triads.rb +120 -0
data/bin/mafft_triads_promoters.rb +403 -0
data/bin/map_markers_to_contigs.rb +66 -0
data/bin/marker_to_vcf.rb +241 -0
data/bin/markers_in_region.rb +42 -0
data/bin/mask_triads.rb +169 -0
data/bin/polymarker.rb +410 -0
data/bin/polymarker_capillary.rb +443 -0
data/bin/polymarker_deletions.rb +350 -0
data/bin/snp_position_to_polymarker.rb +101 -0
data/bin/snps_between_bams.rb +107 -0
data/bin/tag_stats.rb +75 -0
data/bin/vcfLineToTable.rb +56 -0
data/bin/vcfToPolyMarker.rb +82 -0
data/bio-polymarker.gemspec +227 -0
data/conf/defaults.rb +1 -0
data/conf/primer3_config/dangle.dh +128 -0
data/conf/primer3_config/dangle.ds +128 -0
data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
data/conf/primer3_config/interpretations/loops_i.dh +34 -0
data/conf/primer3_config/interpretations/loops_i.ds +31 -0
data/conf/primer3_config/interpretations/stack_i.dh +257 -0
data/conf/primer3_config/interpretations/stack_i.ds +256 -0
data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
data/conf/primer3_config/loops.dh +30 -0
data/conf/primer3_config/loops.ds +30 -0
data/conf/primer3_config/stack.dh +256 -0
data/conf/primer3_config/stack.ds +256 -0
data/conf/primer3_config/stackmm.dh +256 -0
data/conf/primer3_config/stackmm.ds +256 -0
data/conf/primer3_config/tetraloop.dh +77 -0
data/conf/primer3_config/tetraloop.ds +77 -0
data/conf/primer3_config/triloop.dh +16 -0
data/conf/primer3_config/triloop.ds +16 -0
data/conf/primer3_config/tstack.dh +256 -0
data/conf/primer3_config/tstack2.dh +256 -0
data/conf/primer3_config/tstack2.ds +256 -0
data/conf/primer3_config/tstack_tm_inf.ds +256 -0
data/lib/bio/BFRTools.rb +465 -0
data/lib/bio/BIOExtensions.rb +153 -0
data/lib/bio/PolyploidTools/ChromosomeArm.rb +63 -0
data/lib/bio/PolyploidTools/ExonContainer.rb +245 -0
data/lib/bio/PolyploidTools/Marker.rb +175 -0
data/lib/bio/PolyploidTools/Mask.rb +116 -0
data/lib/bio/PolyploidTools/NoSNPSequence.rb +292 -0
data/lib/bio/PolyploidTools/PrimerRegion.rb +30 -0
data/lib/bio/PolyploidTools/SNP.rb +804 -0
data/lib/bio/PolyploidTools/SNPMutant.rb +86 -0
data/lib/bio/PolyploidTools/SNPSequence.rb +55 -0
data/lib/bio/db/blast.rb +114 -0
data/lib/bio/db/exonerate.rb +333 -0
data/lib/bio/db/primer3.rb +820 -0
data/lib/bio-polymarker.rb +28 -0
data/test/data/7B_amplicon_test.fa +12 -0
data/test/data/7B_amplicon_test.fa.fai +1 -0
data/test/data/7B_amplicon_test_reference.fa +110 -0
data/test/data/7B_amplicon_test_reference.fa.fai +3 -0
data/test/data/7B_marker_test.txt +1 -0
data/test/data/BS00068396_51.fa +2 -0
data/test/data/BS00068396_51_blast.tab +4 -0
data/test/data/BS00068396_51_contigs.aln +1412 -0
data/test/data/BS00068396_51_contigs.dnd +7 -0
data/test/data/BS00068396_51_contigs.fa +8 -0
data/test/data/BS00068396_51_contigs.fa.fai +4 -0
data/test/data/BS00068396_51_contigs.fa.nhr +0 -0
data/test/data/BS00068396_51_contigs.fa.nin +0 -0
data/test/data/BS00068396_51_contigs.fa.nsq +0 -0
data/test/data/BS00068396_51_contigs.nhr +0 -0
data/test/data/BS00068396_51_contigs.nin +0 -0
data/test/data/BS00068396_51_contigs.nsq +0 -0
data/test/data/BS00068396_51_exonerate.tab +6 -0
data/test/data/BS00068396_51_for_polymarker.txt +1 -0
data/test/data/BS00068396_51_genes.txt +14 -0
data/test/data/IWGSC_CSS_1AL_scaff_1455974.fa +112 -0
data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa +2304 -0
data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
data/test/data/LIB1716.bam +0 -0
data/test/data/LIB1716.bam.bai +0 -0
data/test/data/LIB1719.bam +0 -0
data/test/data/LIB1719.bam.bai +0 -0
data/test/data/LIB1721.bam +0 -0
data/test/data/LIB1721.bam.bai +0 -0
data/test/data/LIB1722.bam +0 -0
data/test/data/LIB1722.bam.bai +0 -0
data/test/data/PST130_7067.csv +1 -0
data/test/data/PST130_7067.fa +2 -0
data/test/data/PST130_7067.fa.fai +1 -0
data/test/data/PST130_7067.fa.ndb +0 -0
data/test/data/PST130_7067.fa.nhr +0 -0
data/test/data/PST130_7067.fa.nin +0 -0
data/test/data/PST130_7067.fa.not +0 -0
data/test/data/PST130_7067.fa.nsq +0 -0
data/test/data/PST130_7067.fa.ntf +0 -0
data/test/data/PST130_7067.fa.nto +0 -0
data/test/data/PST130_reverse_primer.csv +1 -0
data/test/data/S22380157.fa +16 -0
data/test/data/S22380157.fa.fai +1 -0
data/test/data/S22380157.vcf +67 -0
data/test/data/S58861868/LIB1716.bam +0 -0
data/test/data/S58861868/LIB1716.sam +651 -0
data/test/data/S58861868/LIB1719.bam +0 -0
data/test/data/S58861868/LIB1719.sam +805 -0
data/test/data/S58861868/LIB1721.bam +0 -0
data/test/data/S58861868/LIB1721.sam +1790 -0
data/test/data/S58861868/LIB1722.bam +0 -0
data/test/data/S58861868/LIB1722.sam +1271 -0
data/test/data/S58861868/S58861868.fa +16 -0
data/test/data/S58861868/S58861868.fa.fai +1 -0
data/test/data/S58861868/S58861868.vcf +76 -0
data/test/data/S58861868/header.txt +9 -0
data/test/data/S58861868/merged.bam +0 -0
data/test/data/S58861868/merged_reheader.bam +0 -0
data/test/data/S58861868/merged_reheader.bam.bai +0 -0
data/test/data/Test3Aspecific.csv +2 -0
data/test/data/Test3Aspecific_contigs.fa +6 -0
data/test/data/bfr_out_test.csv +5 -0
data/test/data/chr1A_C1145499T/chr1A_C1145499T.csv +1 -0
data/test/data/chr1A_G540414846C/chr1A_G540414846C.csv +1 -0
data/test/data/chr1A_G540414846C/chr1A_G540414846C.fa +2 -0
data/test/data/chr1A_T517634750C/chr1A_T517634750C.csv +1 -0
data/test/data/chr2D_C112180134A/chr2D_C112180134A.csv +1 -0
data/test/data/chr4D_C14473543T/chr4D_C14473543T.csv +1 -0
data/test/data/chr4D_C14473543T/chr4D_C14473543T.fa +2 -0
data/test/data/headerMergeed.txt +9 -0
data/test/data/headerS2238015 +1 -0
data/test/data/mergedLibs.bam +0 -0
data/test/data/mergedLibsReheader.bam +0 -0
data/test/data/mergedLibsSorted.bam +0 -0
data/test/data/mergedLibsSorted.bam.bai +0 -0
data/test/data/patological_cases5D.csv +1 -0
data/test/data/primer_3_input_header_test +5 -0
data/test/data/short_primer_design_test.csv +10 -0
data/test/data/some_tests/some_tests.csv +201 -0
data/test/data/test_from_mutant.csv +3 -0
data/test/data/test_iselect.csv +196 -0
data/test/data/test_iselect_reference.fa +1868 -0
data/test/data/test_iselect_reference.fa.fai +934 -0
data/test/data/test_primer3_error.csv +4 -0
data/test/data/test_primer3_error_contigs.fa +10 -0
data/test/test_bfr.rb +135 -0
data/test/test_blast.rb +47 -0
data/test/test_exon_container.rb +17 -0
data/test/test_exonearate.rb +48 -0
data/test/test_integration.rb +76 -0
data/test/test_snp_parsing.rb +121 -0
data/test/test_wrong_selection.sh +5 -0
metadata +356 -0

data/bin/polymarker_deletions.rb ADDED Viewed

@@ -0,0 +1,350 @@
+#!/usr/bin/env ruby
+require 'bio'
+require 'rubygems'
+require 'pathname'
+require 'bio-samtools-wrapper'
+require 'optparse'
+require 'set'
+$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
+$: << File.expand_path('.')
+path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
+require path
+def log(msg)
+  time=Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
+  puts "#{time}: #{msg}"
+end
+class Bio::PolyploidTools::ExonContainer
+   def add_alignments(opts=Hash.new)
+      opts = { :min_identity=>90 }.merge!(opts)
+      exonerate_filename = opts[:exonerate_file]
+      arm_selection = opts[:arm_selection]
+      unless arm_selection
+        arm_selection = lambda do | contig_name |
+          ret = contig_name[0,3]
+          return ret
+        end
+      end
+      File.open(exonerate_filename) do |f|
+        f.each_line do | line |
+          record = Bio::DB::Exonerate::Alignment.parse_custom(line)
+          if  record and record.identity >= opts[:min_identity]
+            snp_array = @snp_map[record.query_id]
+            if snp_array != nil
+              snp_array.each do |snp|
+                if snp != nil and snp.position.between?( (record.query_start + 1) , record.query_end)
+                  begin
+                    exon = record.exon_on_gene_position(snp.position)
+                    snp.add_exon(exon, arm_selection.call(record.target_id))
+                  rescue Bio::DB::Exonerate::ExonerateException
+                    $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+end
+class Bio::DB::Primer3::SNP
+  def to_s
+     "#{gene}:#{snp_from.chromosome}"
+  end
+end
+class Bio::DB::Primer3::Primer3Record
+  def best_pair
+    return @best_pair if @best_pair
+    @best_pair = nil
+    @total_caps = 100
+    @primerPairs.each do | primer |
+      capital_count = "#{primer.left.sequence}#{primer.right.sequence}".scan(/[A-Z]/).length
+      if @best_pair.nil?
+        @best_pair = primer
+        @total_caps = capital_count
+        next
+      end
+      if capital_count < @total_caps
+        @best_pair = primer
+        @total_caps = capital_count
+      end
+      if primer.size < @best_pair.size
+        @best_pair = primer
+        @total_caps = capital_count
+      end
+    end
+    @best_pair
+  end
+#CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward
+  def parse_header
+    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation   = self.sequence_id.split(" ")
+    @type = @type.to_sym
+    if @in
+      @in = @in.to_sym == :exon
+    else
+      @exon = false
+    end
+    if @polymorphism.to_sym == :homoeologous
+      @homoeologous = true
+    else
+      @homoeologous = false
+    end
+    @parsed = true
+    @orientation = @orientation.to_sym
+  end
+  def score
+    best_pair
+    total_caps = "#{best_pair.left.sequence}#{best_pair.right.sequence}".scan(/[A-Z]/).length
+#    puts "score"
+ #   puts self.inspect
+    ret = 0
+    ret += @scores[type]
+    ret += @scores[:exon] if exon?
+    ret -= total_caps * 10
+    ret -= product_length
+    ret
+  end
+  def to_s
+      "#{gene}:#{snp_from.chromosome}"
+  end
+   def left_primer_snp(snp)
+      tmp_primer = String.new(left_primer)
+      return tmp_primer
+    end
+end
+markers = nil
+options = {}
+options[:aligner] = :blast
+options[:model] = "est2genome"
+options[:min_identity] = 90
+options[:extract_found_contigs] = true
+options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection("nrgene")
+options[:genomes_count] = 3
+options[:variation_free_region] =0
+options[:primer_3_preferences] = {
+      :primer_product_size_range => "50-150" ,
+      :primer_max_size => 25 ,
+      :primer_lib_ambiguity_codes_consensus => 1,
+      :primer_liberal_base => 1,
+      :primer_num_return=>5,
+      :primer_explain_flag => 1,
+      :primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
+  }
+options[:database]  = false
+OptionParser.new do |opts|
+  opts.banner = "Usage: polymarker_deletions.rb [options]"
+  opts.on("-m", "--sequences FASTA", "Sequence of the region to search") do |o|
+    options[:sequences] = o
+  end
+  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
+    options[:reference] = o
+  end
+  opts.on("-o", "--output DIR", "Directory to write the output") do |o|
+    options[:output] = o
+  end
+  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
+    options[:genomes_count] = o.to_i
+  end
+  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
+    options[:extract_found_contigs] = true
+  end
+  opts.on("-d", "--database PREFIX", "Path to the blast database. Only used if the aligner is blast. The default is the name of the contigs file without extension.") do |o|
+    options[:database] = o
+  end
+    opts.on("-a", "--arm_selection #{Bio::PolyploidTools::ChromosomeArm.getValidFunctions.join('|')}", "Function to decide the chromome arm") do |o|
+    options[:arm_selection] = Bio::PolyploidTools::ChromosomeArm.getArmSelection(o)
+  end
+end.parse!
+#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
+reference = options[:reference] if options[:reference]
+throw raise Exception.new(), "Reference has to be provided" unless reference
+sequences = options[:sequences] if options[:sequences]
+throw raise Exception.new(), "Fasta file with sequences has to be provided" unless sequences
+output_folder = options[:output] if options[:output]
+throw raise Exception.new(), "An output directory has to be provided" unless output_folder
+model=options[:model]
+options[:database] = options[:reference] unless  options[:database]
+Dir.mkdir(output_folder)
+min_identity= options[:min_identity]
+exonerate_file="#{output_folder}/exonerate_tmp.tab"
+primer_3_input="#{output_folder}/primer_3_input_temp"
+primer_3_output="#{output_folder}/primer_3_output_temp"
+exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
+output_primers="#{output_folder}/primers.csv"
+output_to_order="#{output_folder}/primers_to_order.csv"
+fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
+fasta_file.load_fai_entries
+original_name="A"
+snp_in="B"
+arm_selection = options[:arm_selection]
+# Main pipeline: read the query sequences, align them against the reference,
+# collect the alignments per chromosome, run primer3 and write the primer tables.
+begin
+  log "Reading exons"
+  exons = Array.new
+  Bio::FlatFile.auto(sequences) do |ff|
+    ff.each do |entry|
+      # Each entry becomes one "name,chromosome,sequence" line for NoSNPSequence.parse.
+      fields = [entry.definition, arm_selection.call(entry.definition), entry.seq]
+      snp = Bio::PolyploidTools::NoSNPSequence.parse(fields.join(","))
+      snp.genomes_count = options[:genomes_count]
+      exons << snp
+    end
+  end
+  log "Searching markers in genome"
+  found_contigs = Set.new
+  # Writes the alignment line to exo_f when its identity exceeds min_identity,
+  # and the first time a target contig is seen, checks that it exists in the
+  # reference index. Raises ExonerateException when the contig is missing.
+  def do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+    return unless aln.identity > min_identity
+    exo_f.puts aln.line
+    # Each contig is only checked once.
+    return if found_contigs.include?(aln.target_id)
+    found_contigs.add(aln.target_id)
+    entry = fasta_file.index.region_for_entry(aln.target_id)
+    if entry == nil
+      raise Bio::DB::Exonerate::ExonerateException.new, "Entry not found! #{aln.target_id}. Make sure that the #{options[:reference]}.fai was generated properly."
+    end
+  end
+  # The block form closes the alignment table before it is read back below.
+  File.open(exonerate_file, "w") do |exo_f|
+    if options[:aligner] == :blast
+      Bio::DB::Blast.align({:query=>sequences, :target=>options[:database], :model=>model, :max_hits=>options[:max_hits]}) do |aln|
+        do_align(aln, exo_f, found_contigs,min_identity, fasta_file,options)
+      end
+    end
+    if options[:aligner] == :exonerate
+      # NOTE(review): the target is assumed to be the reference FASTA - confirm.
+      Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
+        do_align(aln, exo_f, found_contigs, min_identity,fasta_file,options)
+      end
+    end
+  end
+  log "Reading best alignment on each chromosome"
+  container= Bio::PolyploidTools::ExonContainer.new
+  container.flanking_size=options[:flanking_size]
+  container.gene_models(sequences)
+  container.chromosomes(reference)
+  container.add_parental({:name=>"A"})
+  container.add_parental({:name=>"B"})
+  exons.each do |exon|
+    exon.container = container
+    exon.flanking_size = 200
+    exon.variation_free_region = options[:variation_free_region]
+    container.add_snp(exon)
+  end
+  container.add_alignments(
+    {:exonerate_file=>exonerate_file,
+    :arm_selection=>options[:arm_selection] ,
+    :min_identity=>min_identity})
+  #4.1 generating primer3 file
+  log "Running primer3"
+  File.open(exons_filename, "w") { |file| container.print_fasta_snp_exones(file) }
+  added_exons = 0
+  File.open(primer_3_input, "w") do |file|
+    Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
+    added_exons = container.print_primer_3_exons(file, nil, snp_in)
+  end
+  # primer3 is only run when at least one exon was written to its input.
+  Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0
+  #5. Pick the best primer and make the primer3 output
+  log "Selecting best primers"
+  kasp_container=Bio::DB::Primer3::KASPContainer.new
+  kasp_container.line_1= original_name
+  kasp_container.line_2= snp_in
+  if options[:scoring] == :het_dels
+    kasp_container.scores = Hash.new
+    kasp_container.scores[:chromosome_specific] = 0
+    kasp_container.scores[:chromosome_semispecific] = 1000
+    kasp_container.scores[:chromosome_nonspecific] = 100
+  end
+  exons.each { |snp| kasp_container.add_snp(snp) }
+  kasp_container.add_primers_file(primer_3_output) if added_exons > 0
+  header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors,repetitive,blast_hits"
+  File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }
+  out_fasta_products = "#{output_folder}/products.fa"
+  File.open(out_fasta_products, 'w') do  |f|
+    kasp_container.snp_hash.each_pair do |name, kaspSNP|
+      f.write(kaspSNP.realigned_primers_fasta)
+    end
+  end
+  File.open(output_to_order, "w") { |io|  io.write(kasp_container.print_primers_with_tails()) }
+  log "DONE"
+rescue Exception => e
+  # Every failure is logged with its backtrace and then re-raised unchanged.
+  log "ERROR\t#{e.message}"
+  $stderr.puts e.backtrace
+  raise e
+end
+#puts container.inspect
+#container.snp_map.each do | gene, snp_array|
+#  snp_array.each do |e|
+ #   puts e.inspect
+#    puts e.aligned_sequences_fasta
+#  end
+#end

data/bin/snp_position_to_polymarker.rb ADDED Viewed

@@ -0,0 +1,101 @@
+#!/usr/bin/env ruby
+#This script converts a file with SNPs and positions with the header:
+#GENE,BASE,POS,SNP,Chromosome
+#  snp.gene, snp.original, snp.position, snp.snp, snp.chromosome
+#To the input expected by polymarker
+#ID, Chromosome, sequence
+#With sequence containing the SNP in the notation "[A/T]"
+require 'bio'
+require 'optparse'
+$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
+$: << File.expand_path('.')
+path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
+require path
+def log(msg)
+  time=Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
+  puts "#{time}: #{msg}"
+end
+markers = nil
+options = {}
+options[:flanking_size] = 100
+test_file=''
+OptionParser.new do |opts|
+  opts.banner = "Usage: snp_postion_to_polymarker.rb [options]"
+  opts.on("-s", "--snp_file CSV", "CSV file with the following columnns:\nID,Allele_1,position,Allele_1,target_chromosome") do |o|
+    options[:snp_file] = o
+    test_file = o
+  end
+  opts.on("-r", "--reference FASTA", "reference with the genes/contings/marker seuqnece") do |o|
+    options[:reference] = o
+  end
+  opts.on("-o", "--out CSV", "Output file ") do |o|
+    options[:output] = o
+  end
+  opts.on("-f", "--flanking_size INT", "Flanking size around the SNP") do |o|
+    options[:flanking_size] = o.to_i
+  end
+  opts.on("-t", "--mutant_list FILE", "File with the list of positions with mutation and the mutation line. Example: IWGSC_CSS_1AL_scaff_1455974,Kronos2281,127,C,T\n\
+    requires --reference to get the sequence using a position") do |o|
+    options[:mutant_list] = o
+     test_file = o
+  end
+end.parse!
+#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
+fasta_reference = options[:reference] if options[:reference]
+fasta_reference_db = Bio::DB::Fasta::FastaFile.new({:fasta=>fasta_reference})
+fasta_reference_db.load_fai_entries
+out = $stdout
+lastRegion = nil
+lastTemplate = nil
+out = File.open(options[:output], "w") if options[:output]
+File.open(test_file) do | f |
+  f.each_line do | line |
+    	snp = nil
+      entry = nil
+      if options[:snp_file]
+    	   snp = Bio::PolyploidTools::SNP.parse(line)
+         entry = fasta_reference_db.index.region_for_entry(snp.gene)
+      elsif options[:mutant_list]
+         snp = Bio::PolyploidTools::SNPMutant.parse(line)
+         entry = fasta_reference_db.index.region_for_entry(snp.contig)
+      end
+    	#puts line
+    	if entry
+       		region = entry.get_full_region
+          snp_name = snp.snp_id_in_seq
+       		#if region != lastRegion
+          #  lastTemplate = fasta_reference_db.fetch_sequence(region)
+          #end
+          start, total, new_position = snp.to_polymarker_coordinates(options[:flanking_size])
+          region.start = start
+          region.end = start + total
+          #puts region
+          local_template = fasta_reference_db.fetch_sequence(region)
+          snp.position = new_position
+          snp.template_sequence = local_template
+          lastRegion = region
+       		out.puts "#{snp.gene}_#{snp_name},#{snp.chromosome},#{snp.to_polymarker_sequence(options[:flanking_size])}"
+    	else
+    	   $stderr.puts "ERROR: Unable to find entry for #{snp.gene}"
+    	end
+	end
+end
+out.close if options[:output]

data/bin/snps_between_bams.rb ADDED Viewed

@@ -0,0 +1,107 @@
+#!/usr/bin/env ruby
+require 'bio'
+require 'rubygems'
+require 'pathname'
+require 'bio-samtools-wrapper'
+require 'set'
+$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
+$: << File.expand_path('.')
+path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
+$stderr.puts "Loading: #{path}"
+require path
+# Positional arguments:
+#   0: reference FASTA   1: first BAM   2: second BAM   3: output prefix
+#   4: minimum coverage (default 10)    5: chunk index  6: entries per chunk
+fasta_db = Bio::DB::Fasta::FastaFile.new({:fasta=>ARGV[0]})
+fasta_db.load_fai_entries
+bam1 =  Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[1]})
+bam2 =  Bio::DB::Sam.new({:fasta=>ARGV[0], :bam=>ARGV[2]})
+output_prefix = ARGV[3]
+block_size=1000
+# Test the argument itself: nil.to_i is 0, which is truthy in Ruby, so testing
+# the converted value would never fall back to the default.
+min_cov = ARGV[4] ? ARGV[4].to_i : 10
+chunk = ARGV[5].to_i
+chunk_size = ARGV[6].to_i
+main_table="#{output_prefix}_#{block_size}_#{min_cov}_table.#{chunk}.csv"
+table_file = File.open(main_table, "w")
+table_file.puts "gene\tlength\tsnps_1\tcalled_1\tsnps_per_#{block_size}_1\tsnps_2\tcalled_2\tsnps_per_#{block_size}_2\tsnps_tot\tsnps_per_1k_tot"
+# Histograms of SNP density (SNPs per block_size bases, truncated to an integer).
+hist_1= Hash.new(0)
+hist_2= Hash.new(0)
+fasta_file = File.open("#{output_prefix}_#{min_cov}.#{chunk}.fa", "w")
+# Only the entries with index in [min, max) are processed by this run.
+i = -1
+min = chunk * chunk_size
+max = min + chunk_size
+fasta_db.index.entries.each do | r |
+  i = i  + 1
+  next if i < min or i >= max
+  #Np r.get_full_region
+  #container.process_region( { :region => r.get_full_region.to_s, :output_file => output_file } )
+  region=r.get_full_region
+  begin
+    reg_a = bam1.fetch_region({:region=>region,  :min_cov=>min_cov, :A=>1})
+    reg_b = bam2.fetch_region({:region=>region,  :min_cov=>min_cov, :A=>1})
+    cons_1 = reg_a.consensus
+    cons_2 = reg_b.consensus
+    snps_1 = cons_1.count_ambiguities
+    snps_2 = cons_2.count_ambiguities
+    called_1 = reg_a.called
+    called_2 = reg_b.called
+    snps_tot = Bio::Sequence.snps_between(cons_1, cons_2)
+    snps_per_1k_1   = (block_size * snps_1.to_f   ) / region.size
+    snps_per_1k_2   = (block_size * snps_2.to_f   ) / region.size
+    snps_per_1k_tot = (block_size * snps_tot.to_f ) / region.size
+    hist_1[snps_per_1k_1.to_i] += 1
+    hist_2[snps_per_1k_2.to_i] += 1
+    table_file.print "#{r.id}\t#{region.size}\t"
+    table_file.print "#{snps_1}\t#{called_1}\t#{snps_per_1k_1}\t"
+    table_file.print "#{snps_2}\t#{called_2}\t#{snps_per_1k_2}\t"
+    table_file.print "#{snps_tot}\t#{snps_per_1k_tot}\n"
+    fasta_file.puts ">#{r.id}_1"
+    fasta_file.puts "#{cons_1}"
+    fasta_file.puts ">#{r.id}_2"
+    fasta_file.puts "#{cons_2}"
+  rescue Exception => e
+    $stderr.puts "Unable to process #{region}: #{e.to_s}"
+  end
+end
+fasta_file.close
+table_file.close
+hist_table="#{output_prefix}_#{block_size}_#{min_cov}_hist.#{chunk}.csv"
+hist_file = File.open(hist_table, "w")
+all_keys = SortedSet.new(hist_1.keys)
+all_keys.merge(hist_2.keys)
+hist_file.puts "SNPs/#{block_size}\thist_1\thist_2\n"
+all_keys.each do |k|
+  hist_file.puts "#{k}\t#{hist_1[k]}\t#{hist_2[k]}"
+end
+hist_file.close

data/bin/tag_stats.rb ADDED Viewed

@@ -0,0 +1,75 @@
+#!/usr/bin/env ruby
+require 'optparse'
+require 'csv'
+require 'fileutils'
+require 'tmpdir'
+require 'bio-samtools-wrapper'
+require 'bio'
+require 'descriptive_statistics'
+class Bio::DB::Tag
+  def set(str)
+    @tag   = str[0..1]
+    @type  = str[3]
+    @value = str[5..-1]
+    @value = @value.to_i if @type == "i"
+  end
+end
+$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
+$: << File.expand_path('.')
+path= File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
+require path
+opts = {}
+opts[:tag] = "NH"
+opts[:bam] = nil
+opts[:out] = nil
+opts[:ref] = nil
+out = $stdout
+OptionParser.new do |o|
+  o.banner = "Usage: tag_stats.rb [options]"
+  o.on("-t", "--tag str", "The tag to extract (default NH)") do |o|
+    opts[:tag] = o
+  end
+  o.on("-b", "--bam FILE" , "BAM file with the alignments ") do |o|
+    opts[:bam] = o
+  end
+  o.on("-o", "--out_file CHAR", "File to save the stats") do |o|
+    opts[:out] = o
+  end
+  o.on("-r", "--reference FILE", "Fasta file with the reference") do |o|
+    opts[:ref] = o
+  end
+end.parse!
+bam =  Bio::DB::Sam.new(fasta: opts[:ref], bam: opts[:bam])
+tag = opts[:tag]
+sample = File.basename(opts[:bam], '.sorted.bam')
+last_ref = ""
+values = []
+to_print = [:sum, :min, :max, :mean, :mode, :median, :q1, :q2, :q3]
+percentiles = [90, 95, 97.5, 99]
+#Add the 90, 95, 97.5 and 99 percentiles.
+out = File.open(opts[:out], "w")  if opts[:out]
+bam.view do |aln |
+  if(last_ref != aln.rname)
+    desc_stats = values.descriptive_statistics
+    to_print.each    { |e| out.puts [sample, last_ref, e      , desc_stats[e]       ].join("\t")  } if(last_ref !=  "")
+    percentiles.each { |e| out.puts [sample, last_ref, "P#{e}", values.percentile(e)].join("\t")  } if(last_ref !=  "")
+    out.puts [sample, last_ref, "N", values.length].join("\t") if(last_ref !=  "")
+    values.clear
+    last_ref = aln.rname
+  end
+  values << aln.tags[tag].value
+end
+out.close  if opts[:out]

data/bin/vcfLineToTable.rb ADDED Viewed

@@ -0,0 +1,56 @@
+require 'bio-samtools-wrapper'
+require 'optparse'
+$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
+$: << File.expand_path('.')
+path=File.expand_path(File.dirname(__FILE__) + '/../lib/bio-polymarker.rb')
+def parseVCFheader(head_line="")
+	##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
+	m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
+	{:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
+end
+header_info = Hash.new
+ARGF.each_line do |line|
+	h = nil
+	h = parseVCFheader(line) if line.start_with? "##INFO"
+	header_info[h[:id]] = h[:desc] if h
+	#puts header_info.inspect
+	next if line.start_with? "##"
+	if line.start_with? "#CHROM"
+		arr = line.split
+		arr = arr.drop(9)
+		arr2 = arr.map { |s| [s.clone().prepend('Cov'), s.clone().prepend('Hap') ]}
+		#header += arr2.join("\t")
+		#puts header
+		next
+	end
+	line.chomp!
+	vcf = Bio::DB::Vcf.new(line, arr)
+#	puts arr.join("\t") if vcf.info["TYPE"] == "snp"
+#	puts vcf.inspect
+	#pus vcf.pos.inspect
+	#next if vcf.info["AO"].to_i != 1
+	vcf.info.each_pair { |name, val| puts "#{name}\t#{val}\t#{header_info[name]}" }
+    arr2 = Array.new
+    puts "____"
+    i = 0
+	vcf.samples.each do |sample|
+		#puts sample.inspect
+		puts sample[1].keys.join("\t") if i == 0
+        puts sample[1].values.join("\t")
+        i+=1
+    end
+end