RubyGems - bio-gadget - Versions diffs - 0.4.8 → 0.5.0 - Mend

bio-gadget 0.4.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

checksums.yaml +4 -4
data/.gitignore +10 -20
data/.travis.yml +5 -0
data/LICENSE +1 -1
data/README.org +0 -21
data/Rakefile +5 -1
data/bin/console +14 -0
data/bin/setup +8 -0
data/bio-gadget.gemspec +20 -14
data/exe/bio-gadget +14 -0
data/exe/fq1l +5 -0
data/exe/rbg +1 -0
data/exe/strt +5 -0
data/ext/bio_gadget/bio_gadget.c +313 -0
data/ext/bio_gadget/bio_gadget.h +8 -0
data/ext/bio_gadget/extconf.rb +3 -0
data/lib/bio/gadget.rb +171 -0
data/lib/bio/gadget/fq1l.rb +457 -0
data/lib/bio/gadget/strt.rb +605 -0
data/lib/bio/gadget/strt/count.rb +53 -0
data/lib/bio/gadget/strt/depth.rb +124 -0
data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
data/lib/bio/gadgets.rb +135 -0
data/test/bio/gadget_test.rb +11 -0
data/test/test_helper.rb +4 -0
metadata +109 -40
data/Gthorfile +0 -2
data/bin/bio-gadget +0 -5
data/lib/bio-gadget.rb +0 -44
data/lib/bio-gadget/dedup.rb +0 -33
data/lib/bio-gadget/demlt.rb +0 -149
data/lib/bio-gadget/femrg.rb +0 -61
data/lib/bio-gadget/fqxz.rb +0 -30
data/lib/bio-gadget/peak.rb +0 -94
data/lib/bio-gadget/qvstat.rb +0 -34
data/lib/bio-gadget/rgt2mtx.rb +0 -60
data/lib/bio-gadget/version.rb +0 -9
data/lib/bio-gadget/wig5p.rb +0 -51
data/lib/bio-gadget/wigchr.rb +0 -28

data/lib/bio/gadget/strt/count.rb ADDED

@@ -0,0 +1,53 @@
+require 'parallel'
+module Bio
+  class Gadget
+    class Strt < Bio::Gadget
+      desc 'count SMP BASE BED [BED ...]',
+           "Count 5'-ends at BASE in each region defined by BEDs"
+      method_option *OPT_COREUTILS_PREFIX
+      method_option *OPT_PARALLEL
+      def count(smp, base, bed0, *beds)
+        cPrefix = options.coreutils_prefix
+        smps = Hash.new
+        fp = open(smp)
+        header = fp.gets.rstrip.split(',')
+        idxName = header.index('NAME')
+        idxBeds = header.index('5pBEDs')
+        fp.each do |line|
+          cols = line.rstrip.split(',')
+          smps[cols[idxName]] = cols[idxBeds].split(';')
+        end
+        fp.close
+        tmpfile = get_temporary_path('strt.count', 'bed')
+        system "#{cPrefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bed0} #{beds.join(' ')} > #{tmpfile}"
+        counts = Hash.new
+        mutex = Mutex.new
+        Parallel.map(smps.keys, in_threads: options.parallel) do |name|
+          bed5ps = smps[name].map { |bed| "#{base}#{bed}.5p.bed.gz" }
+          open("| bedtools intersect -nonamecheck -wa -wb -s -sorted -a #{tmpfile} -b #{bed5ps.join(' ')} | #{cPrefix}cut -f 4,10 | #{cPrefix}sort -u | #{cPrefix}cut -f 1 | #{cPrefix}uniq -c").each do |line|
+            cnt, id = line.strip.split(' ')
+            mutex.synchronize do
+              counts[id] = Hash.new unless counts.key?(id)
+              counts[id][name] = cnt.to_i
+            end
+          end
+        end
+        names = smps.keys.sort
+        puts (['ID'] + names.map { |name| "R|#{name}" }).join(',')
+        counts.each do |id, name2count|
+          puts ([id] + names.map { |name| name2count.key?(name) ? name2count[name] : 0 }).join(',')
+        end
+      end
+    end
+  end
+end

data/lib/bio/gadget/strt/depth.rb ADDED

@@ -0,0 +1,124 @@
+require 'parallel'
+module Bio
+  class Gadget
+    class Strt < Bio::Gadget
+      OPT_LENGTH_BARCODE = [ :length_barcode, { :banner => 'NT',
+                                                :default => 6,
+                                                :desc => 'Length of barcode',
+                                                :type => :numeric } ]
+      OPT_LENGTH_GAP = [ :length_gap, { :banner => 'NT',
+                                        :default => 3,
+                                        :desc => 'Length of gap (polyG)',
+                                        :type => :numeric } ]
+      OPT_LENGTH_MINIMUM = [ :length_minimum,
+                             { :banner => 'NT',
+                               :default => 25,
+                               :desc => 'Minimum length after preprocess',
+                               :type => :numeric } ]
+      OPT_LOW_QUALITIES = [ :low_qualities, { :banner => 'CHARACTERS',
+                                              :default => '!"#',
+                                              :desc => 'Low quality characters',
+                                              :type => :string } ]
+      desc 'depth FQGZ [FQGZ ...]',
+           'Count nonredundant reads according to the sequencing depths'
+      method_option *OPT_BUFFER_SIZE
+      method_option *OPT_PARALLEL
+      method_option *OPT_COREUTILS_PREFIX
+      method_option *OPT_GREP_PREFIX
+      method_option *OPT_LENGTH_BARCODE
+      method_option *OPT_LENGTH_GAP
+      method_option *OPT_LENGTH_MINIMUM
+      method_option *OPT_UMI_LENGTH
+      method_option *OPT_LOW_QUALITIES
+      method_option :tss,
+                    default: false,
+                    desc: 'Check number of TSSs, instead of STRT reads',
+                    type: :boolean
+      def depth(fqgz, *fqgzs0)
+        bSize,
+        cPfx,
+        gPfx,
+        par,
+        bLen,
+        gLen,
+        mLen,
+        pLen,
+        uLen,
+        match,
+        cPfx0 = configure_depth(options)
+        fqgzs = [fqgz] + fqgzs0
+        tmpfiles = Array.new(fqgzs.length) do |i|
+          get_temporary_path('strt.depth', 'fq1l')
+        end
+        tsscmd =
+          options.tss ? "fq1l mt5 --minimum-length=#{mLen} #{match}+ | #{cPfx0}cut -f 2 | #{sortCommand(options)} -u |" : ''
+        indexes = Array.new(fqgzs.length) { |i| i }
+        Parallel.each(indexes, in_threads: options.parallel) do |i|
+          system "gunzip -c #{fqgzs[i]} | fq1l convert #{cPfx} > #{tmpfiles[i]}"
+        end
+        1.upto(12).each do |draw|
+          fifo = get_fifo('strt.depth', 'fq1l')
+          fp0 = open("| #{cPfx0}wc -l #{fifo}")
+          fp1 = open(<<CMD
+| LC_ALL=C cat #{tmpfiles.join(' ')} \
+| fq1l to #{draw} #{12-draw} \
+| #{tee_command(options)} #{fifo} \
+| fq1l nr #{bSize} #{cPfx} #{par} \
+| fq1l m5 #{gPfx} #{match} \
+| fq1l m5 #{gPfx} --invert-match '[^\\t]*N' \
+| fq1l qt3 --low-qualities='#{options.low_qualities}' --minimum-length=#{pLen} \
+| fq1l pt3 --primer=AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG --minimum-length=#{pLen} #{cPfx} #{gPfx} \
+| fq1l nr #{bSize} --degenerated-mode #{cPfx} #{par} \
+| #{tsscmd} #{cPfx0}wc -l
+CMD
+                    )
+          raw = fp0.gets.strip.split(/\s+/)[0]
+          fp0.close
+          nr = fp1.gets.strip
+          fp1.close
+          puts [raw, nr].join(',')
+        end
+      end
+      no_commands do
+        def configure_depth(options)
+          uLength = options.umi_length
+          bLength = options.length_barcode
+          gLength = options.length_gap
+          mLength = options.length_minimum
+          return [ options.key?('buffer_size') ?
+                     '--buffer-size='+options.buffer_size : '',
+                   options.coreutils_prefix == '' ?
+                     '' : "--coreutils-prefix=#{options.coreutils_prefix}",
+                   options.grep_prefix == '' ?
+                     '' : "--grep-prefix=#{options.grep_prefix}",
+                   "--parallel=#{options.parallel}",
+                   bLength,
+                   gLength,
+                   mLength,
+                   mLength + uLength + bLength + gLength,
+                   uLength,
+                   "#{'.' * uLength}#{'.' * bLength}#{'G' * (gLength-1)}",
+                   options.coreutils_prefix ]
+        end
+      end
+    end
+  end
+end

data/lib/bio/gadget/strt/prepare_transcriptome.rb ADDED

@@ -0,0 +1,230 @@
+require 'open3'
+module Bio
+  class Gadget
+    class StrtPrepareTranscriptome < Bio::Gadget
+      package_name :prepare_transcriptome
+      #
+      desc 'hg38 DIR', 'GRCh38/hg38 - human'
+      long_desc <<-DESC
+Prepare transcriptome data files based on GENCODE gene annotation RELEASE for GRCh38/h38 at DIR, where it has 'ref.fa.fai' genome index file.
+DESC
+      method_option *OPT_COREUTILS_PREFIX
+      method_option *OPT_DOWNLOAD
+      method_option *OPT_GREP_PREFIX
+      method_option :gencode,
+                    banner: 'RELEASE',
+                    default: 25,
+                    desc: 'Release number of GENCODE',
+                    type: :numeric
+      def hg38(dir0)
+        dir = File.expand_path(dir0)
+        gtf = "#{dir}/hg38.gencode.v#{options.gencode}.annotation.gtf.gz"
+        if options.download != 'no'
+          download_file("ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v#{options.gencode}.annotation.gtf.gz",
+                        gtf)
+        end
+        if options.download != 'only'
+          pipeline("unpigz -c #{gtf}",
+                   "hisat2_extract_splice_sites.py - > #{dir}/transcriptome.splice_sites")
+          pipeline("unpigz -c #{gtf}",
+                   "hisat2_extract_exons.py - > #{dir}/transcriptome.exons")
+          fp_ribosome = open_bed_w("#{dir}/ribosome.bed")
+          fp_whole = open_bed_w("#{dir}/spikein_whole.bed")
+          fp_5end = open_bed_w("#{dir}/spikein_5end.bed")
+          open("#{dir}/ref.fa.fai").each do |line|
+            acc, len, *tmp = line.rstrip.split
+            if acc =~ /^RIBO_/
+              fp_ribosome.puts [acc, 0, len, acc, 0, '+'].join("\t")
+              fp_ribosome.puts [acc, 0, len, acc, 0, '-'].join("\t")
+            end
+            if acc =~ /^RNA_SPIKE_/
+              fp_whole.puts [acc, 0, len, acc, 0, '+'].join("\t")
+              fp_5end.puts [acc, 0, 50, acc, 0, '+'].join("\t")
+            end
+          end
+          fp_ribosome.close
+          fp_whole.close
+          fp_5end.close
+          atgs = Hash.new
+          regex_transcript_id = /transcript_id "([^"]+)"/
+          regex_gene_id = /gene_id "([^"]+)"/
+          regex_gene_name = /gene_name "([^"]+)"/
+          regex_exon_number = /exon_number (\d+)/
+          Open3.pipeline_r(
+            "unpigz -c #{gtf}",
+            "#{grep_command} '\tstart_codon\t'") do |fp, threads|
+            fp.each do |line|
+              cols = line.rstrip.split /\t/
+              atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
+                cols[cols[6] == '+' ? 3 : 4].to_i
+            end
+            fp.close
+          end
+          bed_coding_exon = "#{dir}/transcriptome.coding_exon.bed"
+          bed_coding_5utr = "#{dir}/transcriptome.coding_5utr.bed"
+          bed_coding_promoter = "#{dir}/transcriptome.coding_promoter.bed"
+          fp_coding_gene = open_bed_w("#{dir}/transcriptome.coding_gene.bed")
+          fp_coding_exon = open_bed_w(bed_coding_exon)
+          fp_coding_5utr = open_bed_w(bed_coding_5utr)
+          fp_coding_promoter = open_bed_w(bed_coding_promoter)
+          fp_other_gene = open_bed_w("#{dir}/transcriptome.other_gene.bed")
+          fp_other_exon = open_bed_w("#{dir}/transcriptome.other_exon.bed")
+          fp_other_1st_exon = open_bed_w("#{dir}/transcriptome.other_1st_exon.bed")
+          fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")
+          Open3.pipeline_r(
+            "unpigz -c #{gtf}",
+            "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, threads|
+            fp.each do |line|
+              cols = line.rstrip.split /\t/
+              ann = cols[8]
+              transcript_id = regex_transcript_id.match(ann).to_a[1]
+              gene_id = regex_gene_id.match(ann).to_a[1]
+              gene_name = regex_gene_name.match(ann).to_a[1]
+              exon_number = regex_exon_number.match(ann).to_a[1].to_i
+              chr = cols[0]
+              left = cols[3].to_i
+              right = cols[4].to_i
+              str = cols[6]
+              acc = "#{gene_name}|#{gene_id}|#{transcript_id}"
+              exon = [chr, left-1, right, acc, 0, str].join("\t")
+              if cols[2] == 'transcript'
+                if atgs.key?(transcript_id)
+                  fp_coding_gene.puts exon
+                else
+                  fp_other_gene.puts exon
+                end
+                next
+              end
+              if atgs.key?(transcript_id)
+                fp_coding_exon.puts exon
+                atg = atgs[transcript_id]
+                if (str == '+' && right < atg) || (str == '-' && atg < left)
+                  fp_coding_5utr.puts exon
+                elsif (str == '+' && left < atg && atg <= right)
+                  fp_coding_5utr.puts [chr, left-1, atg-1, acc, 0, '+'].join("\t")
+                elsif (str == '-' && left <= atg && atg < right)
+                  fp_coding_5utr.puts [chr, atg, right, acc, 0, '-'].join("\t")
+                end
+                if exon_number == 1
+                  if str == '+'
+                    fp_coding_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
+                  else
+                    fp_coding_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
+                  end
+                end
+              else
+                fp_other_exon.puts exon
+                if exon_number == 1
+                  fp_other_1st_exon.puts exon
+                  if str == '+'
+                    fp_other_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
+                  else
+                    fp_other_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
+                  end
+                end
+              end
+            end
+            fp.close
+          end
+          fp_coding_gene.close
+          fp_coding_exon.close
+          fp_coding_5utr.close
+          fp_coding_promoter.close
+          fp_other_gene.close
+          fp_other_exon.close
+          fp_other_1st_exon.close
+          fp_other_promoter.close
+          merge_bed_by_gene("Coding5end",
+                            "5UTR and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
+                            "#{dir}/transcriptome.coding_5end",
+                            bed_coding_promoter, bed_coding_5utr)
+          merge_bed_by_gene("CodingWhole",
+                            "Exon and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
+                            "#{dir}/transcriptome.coding_whole",
+                            bed_coding_promoter, bed_coding_exon)
+          sh "pigz #{dir}/*.bed"
+          genePred = "#{dir}/hg38_refGene.txt"
+          pipeline(
+            "unpigz -c #{gtf}",
+            "#{grep_command} 'tag \"basic\"'",
+            "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
+          sh "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}"
+        end
+      end
+      no_commands do
+        def open_bed_w(bed)
+          open("| bedtools sort -i stdin > #{bed}", 'w')
+        end
+        def merge_bed_by_gene(tname, tdesc, base, *inbeds)
+          tmpbed = get_temporary_path('strt.prepare_transcriptome', 'bed')
+          annfp = open("#{base}.csv", 'w')
+          annfp.puts "ID,ACCESSIONS"
+          Open3.pipeline_r(
+            "#{cat_command} #{inbeds.join(' ')}",
+            "ruby -anle 'as=$F[3].split /\\|/; puts ($F[0..2]+[as[0]]+$F[4..-1]+[as[1..-1].join(\"|\")]).join(\"\t\")'",
+            "#{sort_command} -k 4,4 -k 1,1 -k 2,2n") do |infp, inths|
+            presym = ''
+            gacc2taccs = Hash.new
+            outfp = nil
+            outths = nil
+            infp.each do |line|
+              chr, left, right, sym, tmp, str, accs = line.rstrip.split /\t/
+              gacc, tacc = accs.split /\|/
+              if presym != sym
+                unless outfp.nil?
+                  outfp.close
+                  outths[-1].join
+                  accstr = (gacc2taccs.map {|k,v| "#{k}:#{v.join(';')}"}).join('|')
+                  annfp.puts "#{presym},#{accstr}"
+                  gacc2taccs = Hash.new
+                end
+                outfp, outths = Open3.pipeline_w(
+                         "bedtools sort -i stdin",
+                         "bedtools merge -s -c 4 -o distinct >> #{tmpbed}")
+                presym = sym
+              end
+              outfp.puts ([chr, left, right, sym, tmp, str]).join("\t")
+              gacc2taccs[gacc] = Array.new unless gacc2taccs.key?(gacc)
+              gacc2taccs[gacc] << tacc unless gacc2taccs[gacc].include?(tacc)
+            end
+            unless outfp.nil?
+              outfp.close
+              outths[-1].join
+              accstr = (gacc2taccs.map {|k,v| "#{k}:#{v.join(';')}"}).join('|')
+              annfp.puts "#{presym},#{accstr}"
+            end
+          end
+          annfp.close
+          sh "echo 'track name=#{tname} description=\"#{tdesc}\" visibility=3 colorByStrand=\"38,139,210 203,75,22\"' > #{base}.bed"
+          pipeline("ruby -anle 'puts ($F.values_at(0, 1, 2) + [$F[4], 0, $F[3]]).join(\"\t\")' < #{tmpbed}",
+                   "bedtools sort -i stdin >> #{base}.bed")
+        end
+      end
+    end
+  end
+end

data/lib/bio/gadgets.rb ADDED

@@ -0,0 +1,135 @@
+require 'bio'
+require 'parallel'
+module Bio
+  class Gadgets < Bio::Gadget
+    desc 'find PATTERN [NAME]',
+         'Find fragments matching with regexp PATTERN from FASTA-format STDIN'
+    method_option *OPT_BUFFER_SIZE
+    method_option *OPT_PARALLEL
+    method_option *OPT_COREUTILS_PREFIX
+    method_option :ignore_case,
+                  default: true,
+                  desc: 'Fold lower case to upper case characters',
+                  type: :boolean
+    def find(pattern, name0 = '')
+      bSize = options.key?('buffer_size') ? '--buffer-size='+options.buffer_size : ''
+      cPrefix = options.coreutils_prefix
+      re = Regexp.new("(#{pattern})", options.ignore_case)
+      name = name0 == '' ? pattern : name0
+      pids = Array.new
+      tmpfiles = Array.new
+      ff = Bio::FlatFile.open(Bio::FastaFormat, STDIN)
+      ff.each do |entry|
+        tmpfiles << tmpfile = get_temporary_path('find', 'bed', false)
+        acc = entry.entry_id
+        seq = entry.seq
+        pids << Process.fork do
+          fp = open("| #{cPrefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bSize} > #{tmpfile}", 'w')
+          #
+          pos = 0
+          match = re.match(seq, pos)
+          while !match.nil?
+            fp.puts [acc, match.begin(1), match.end(1), name, '0', '+'].join("\t")
+            pos = match.begin(1)+1
+            match = re.match(seq, pos)
+          end
+          #
+          pos = 0
+          seq = seq.reverse.tr('acgtACGT', 'tgcaTGCA')
+          len = seq.length
+          match = re.match(seq, pos)
+          while !match.nil?
+            fp.puts [acc, len-match.end(1), len-match.begin(1), name, '0', '-'].join("\t")
+            pos = match.begin(1)+1
+            match = re.match(seq, pos)
+          end
+          #
+          fp.close
+        end
+        while pids.length == options.parallel
+          pids.delete(Process.wait)
+        end
+      end
+      ff.close
+      while pids.length > 0
+        pids.delete(Process.wait)
+      end
+      system "#{cPrefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 --merge #{bSize} #{tmpfiles.join(' ')}"
+      unlink_files(tmpfiles)
+    end
+    #
+    desc 'gap BEDgz1 BEDgz2',
+         "Calculate gap distances from 5'-end of fragments 1 to 3'-end of fragments 2"
+    method_option *OPT_PARALLEL
+    method_option *OPT_COREUTILS_PREFIX
+    method_option :minimum_gap,
+                  default: -10000,
+                  desc: 'Minimum gap distans to be reported',
+                  type: :numeric
+    method_option :maximum_gap,
+                  default: 2500,
+                  desc: 'Maximum gap distans to be reported',
+                  type: :numeric
+    def gap(bedgz1, bedgz2)
+      cPrefix = options.coreutils_prefix
+      chrs = Hash.new
+      open("| unpigz -c #{bedgz1} #{bedgz2} | #{cPrefix}cut -f 1 | #{cPrefix}uniq | #{cPrefix}sort -u").each do |line|
+        chrs[line.rstrip] = ''
+      end
+      max = options.maximum_gap
+      min = options.minimum_gap
+      reSep = /\t/
+      tmpfiles = Hash.new
+      chrs.keys.each do |chr|
+        tmpfiles[chr] = get_temporary_path('gap', 'csv', false)
+      end
+      Parallel.each(chrs.keys, in_processes: options.parallel) do |chr|
+        bed2 = Array.new
+        open("| gunzip -c #{bedgz2} | grep '^#{chr}\t'").each do |line|
+          chr0, *cols = line.rstrip.split(reSep)
+          cols[0] = cols[0].to_i
+          cols[1] = cols[1].to_i
+          bed2 << cols
+        end
+        fp = open(tmpfiles[chr], 'w')
+        open("| gunzip -c #{bedgz1} | grep '^#{chr}\t'").each do |line|
+          chr0, start, stop, name, score, str = line.rstrip.split(reSep)
+          if str == '+'
+            bed2.each do |bed|
+              dist = bed[1] - start.to_i + 1
+              fp.puts [name, bed[2], dist].join(',') if min <= dist && dist < max
+            end
+          else
+            bed2.each do |bed|
+              dist = stop.to_i - bed[0] + 1
+              fp.puts [name, bed[2], dist].join(',') if min <= dist && dist < max
+            end
+          end
+        end
+        fp.close
+      end
+      system "cat #{tmpfiles.values.join(' ')}"
+      unlink_files(tmpfiles)
+    end
+  end
+end