bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
| @@ -0,0 +1,53 @@ | |
| 1 | 
            +
            require 'parallel'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Bio
         | 
| 4 | 
            +
              class Gadget
         | 
| 5 | 
            +
                class Strt < Bio::Gadget
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                  desc 'count SMP BASE BED [BED ...]',
         | 
| 8 | 
            +
                       "Count 5'-ends at BASE in each region defined by BEDs"
         | 
| 9 | 
            +
                  
         | 
| 10 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 11 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  # Count 5'-ends per region and per sample, and print a CSV table.
                  #
                  # smp  - path of a CSV sample sheet; its header must contain the
                  #        columns 'NAME' and '5pBEDs' (the latter ';'-separated).
                  # base - prefix prepended to every 5pBEDs entry; each entry is
                  #        expanded to "#{base}#{entry}.5p.bed.gz".
                  # bed0, beds - one or more BED files defining the regions.
                  #
                  # Prints a header row "ID,R|<name>,..." followed by one row per
                  # region ID; samples without a count for a region get 0.
                  def count(smp, base, bed0, *beds)
                    cPrefix = options.coreutils_prefix

                    # Sample name => list of 5pBEDs entries, read from the sample sheet.
                    # File.open with a block closes the sheet even if parsing fails.
                    smps = {}
                    File.open(smp) do |fp|
                      header = fp.gets.rstrip.split(',')
                      idxName = header.index('NAME')
                      idxBeds = header.index('5pBEDs')
                      fp.each do |line|
                        next if line.strip.empty? # tolerate blank/trailing lines
                        cols = line.rstrip.split(',')
                        smps[cols[idxName]] = cols[idxBeds].split(';')
                      end
                    end

                    # All region BEDs are concatenated and sorted once, because
                    # 'bedtools intersect -sorted' below expects sorted input.
                    tmpfile = get_temporary_path('strt.count', 'bed')
                    sortcmd = "#{cPrefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{bed0} #{beds.join(' ')} > #{tmpfile}"
                    raise "Failed to sort region BED files: #{sortcmd}" unless system(sortcmd)

                    # Region ID => { sample name => count }; shared between threads,
                    # hence guarded by the mutex.
                    counts = {}
                    mutex = Mutex.new
                    Parallel.each(smps.keys, in_threads: options.parallel) do |name|
                      bed5ps = smps[name].map { |bed| "#{base}#{bed}.5p.bed.gz" }
                      cmd = "bedtools intersect -nonamecheck -wa -wb -s -sorted -a #{tmpfile} -b #{bed5ps.join(' ')} | #{cPrefix}cut -f 4,10 | #{cPrefix}sort -u | #{cPrefix}cut -f 1 | #{cPrefix}uniq -c"
                      # The block form of IO.popen closes the pipe and reaps the child.
                      IO.popen(cmd) do |io|
                        io.each_line do |line|
                          cnt, id = line.strip.split(' ')
                          mutex.synchronize do
                            (counts[id] ||= {})[name] = cnt.to_i
                          end
                        end
                      end
                    end

                    names = smps.keys.sort
                    puts((['ID'] + names.map { |name| "R|#{name}" }).join(','))
                    counts.each do |id, name2count|
                      puts(([id] + names.map { |name| name2count.fetch(name, 0) }).join(','))
                    end
                  end
         | 
| 50 | 
            +
                  
         | 
| 51 | 
            +
                end
         | 
| 52 | 
            +
              end
         | 
| 53 | 
            +
            end
         | 
| @@ -0,0 +1,124 @@ | |
| 1 | 
            +
            require 'parallel'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Bio
         | 
| 4 | 
            +
              class Gadget
         | 
| 5 | 
            +
                class Strt < Bio::Gadget
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                  # Option specs shared by STRT commands; each is an
                  # [option name, option settings] pair that is splatted into
                  # method_option.

                  # Barcode length, default 6.
                  OPT_LENGTH_BARCODE = [
                    :length_barcode,
                    { banner: 'NT', default: 6, desc: 'Length of barcode', type: :numeric }
                  ]

                  # Gap (polyG) length, default 3.
                  OPT_LENGTH_GAP = [
                    :length_gap,
                    { banner: 'NT', default: 3, desc: 'Length of gap (polyG)', type: :numeric }
                  ]

                  # Minimum length after preprocess, default 25.
                  OPT_LENGTH_MINIMUM = [
                    :length_minimum,
                    { banner: 'NT', default: 25, desc: 'Minimum length after preprocess', type: :numeric }
                  ]

                  # Characters regarded as low quality, default '!"#'.
                  OPT_LOW_QUALITIES = [
                    :low_qualities,
                    { banner: 'CHARACTERS', default: '!"#', desc: 'Low quality characters', type: :string }
                  ]
         | 
| 27 | 
            +
                  
         | 
| 28 | 
            +
                  desc 'depth FQGZ [FQGZ ...]',
         | 
| 29 | 
            +
                       'Count nonredundant reads according to the sequencing depths'
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                  method_option *OPT_BUFFER_SIZE
         | 
| 32 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 33 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 34 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                  method_option *OPT_LENGTH_BARCODE
         | 
| 37 | 
            +
                  method_option *OPT_LENGTH_GAP
         | 
| 38 | 
            +
                  method_option *OPT_LENGTH_MINIMUM
         | 
| 39 | 
            +
                  method_option *OPT_UMI_LENGTH
         | 
| 40 | 
            +
                  method_option *OPT_LOW_QUALITIES
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  method_option :tss,
         | 
| 43 | 
            +
                                default: false,
         | 
| 44 | 
            +
                                desc: 'Check number of TSSs, instead of STRT reads',
         | 
| 45 | 
            +
                                type: :boolean
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                  # Print, for 12 successive draws, a "raw,nonredundant" CSV line.
                  #
                  # fqgz, fqgzs0 - one or more gzipped FASTQ files.
                  #
                  # Each input is first converted once with 'fq1l convert' into a
                  # temporary file. For every draw (1..12) the converted reads are
                  # passed through 'fq1l to DRAW (12-DRAW)', then:
                  #   * a copy is teed into a FIFO and counted with 'wc -l' (raw), and
                  #   * the main stream goes through the preprocessing stages below
                  #     and is counted with 'wc -l' (nonredundant; TSSs with --tss).
                  def depth(fqgz, *fqgzs0)
                    # Underscore-prefixed values are returned by configure_depth but
                    # are not needed in this method.
                    bSize, cPfx, gPfx, par, _bLen, _gLen, mLen, pLen, _uLen, match, cPfx0 =
                      configure_depth(options)

                    fqgzs = [fqgz] + fqgzs0
                    tmpfiles = Array.new(fqgzs.length) do
                      get_temporary_path('strt.depth', 'fq1l')
                    end
                    # Optional extra stages appended before the final 'wc -l'; the
                    # string ends with '|' so it can be prefixed to the last stage.
                    tsscmd =
                      options.tss ? "fq1l mt5 --minimum-length=#{mLen} #{match}+ | #{cPfx0}cut -f 2 | #{sortCommand(options)} -u |" : ''
                    indexes = Array.new(fqgzs.length) { |i| i }
                    Parallel.each(indexes, in_threads: options.parallel) do |i|
                      system "gunzip -c #{fqgzs[i]} | fq1l convert #{cPfx} > #{tmpfiles[i]}"
                    end

                    1.upto(12) do |draw|
                      fifo = get_fifo('strt.depth', 'fq1l')
                      stages = [
                        "LC_ALL=C cat #{tmpfiles.join(' ')}",
                        "fq1l to #{draw} #{12-draw}",
                        "#{tee_command(options)} #{fifo}",
                        "fq1l nr #{bSize} #{cPfx} #{par}",
                        "fq1l m5 #{gPfx} #{match}",
                        "fq1l m5 #{gPfx} --invert-match '[^\\t]*N'",
                        "fq1l qt3 --low-qualities='#{options.low_qualities}' --minimum-length=#{pLen}",
                        "fq1l pt3 --primer=AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG --minimum-length=#{pLen} #{cPfx} #{gPfx}",
                        "fq1l nr #{bSize} --degenerated-mode #{cPfx} #{par}",
                        "#{tsscmd} #{cPfx0}wc -l"
                      ]
                      # The FIFO reader is started before the pipeline whose tee
                      # stage writes to it, as in the original ordering.
                      fp0 = IO.popen("#{cPfx0}wc -l #{fifo}")
                      fp1 = IO.popen(stages.join(' | '))
                      begin
                        raw = fp0.gets.strip.split(/\s+/)[0]
                        nr = fp1.gets.strip
                      ensure
                        # Close both pipes even when a read fails.
                        fp0.close
                        fp1.close
                      end
                      puts [raw, nr].join(',')
                    end
                  end
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                  no_commands do 
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                    # Derive the settings used by the depth command from OPTIONS.
                    #
                    # options - object answering key?('buffer_size') and the readers
                    #           buffer_size, coreutils_prefix, grep_prefix, parallel,
                    #           umi_length, length_barcode, length_gap, length_minimum.
                    #
                    # Returns an 11-element Array, in this order:
                    #   0  '--buffer-size=...' argument, or '' when not given
                    #   1  '--coreutils-prefix=...' argument, or '' when the prefix is ''
                    #   2  '--grep-prefix=...' argument, or '' when the prefix is ''
                    #   3  '--parallel=...' argument
                    #   4  barcode length
                    #   5  gap length
                    #   6  minimum length
                    #   7  minimum + UMI + barcode + gap lengths
                    #   8  UMI length
                    #   9  match pattern: one '.' per UMI and barcode base followed by
                    #      (gap length - 1) 'G' characters
                    #   10 the raw coreutils prefix
                    def configure_depth(options)
                      uLength = options.umi_length
                      bLength = options.length_barcode
                      gLength = options.length_gap
                      mLength = options.length_minimum
                      # Clamp so that a gap length of 0 yields no 'G' instead of
                      # raising on a negative repeat count.
                      gRepeat = [gLength - 1, 0].max
                      [
                        # Interpolation (like the sibling entries) also accepts a
                        # non-String buffer size.
                        options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : '',
                        options.coreutils_prefix == '' ? '' : "--coreutils-prefix=#{options.coreutils_prefix}",
                        options.grep_prefix == '' ? '' : "--grep-prefix=#{options.grep_prefix}",
                        "--parallel=#{options.parallel}",
                        bLength,
                        gLength,
                        mLength,
                        mLength + uLength + bLength + gLength,
                        uLength,
                        "#{'.' * uLength}#{'.' * bLength}#{'G' * gRepeat}",
                        options.coreutils_prefix
                      ]
                    end
         | 
| 119 | 
            +
                    
         | 
| 120 | 
            +
                  end
         | 
| 121 | 
            +
             | 
| 122 | 
            +
                end
         | 
| 123 | 
            +
              end
         | 
| 124 | 
            +
            end
         | 
| @@ -0,0 +1,230 @@ | |
| 1 | 
            +
            require 'open3'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Bio
         | 
| 4 | 
            +
              class Gadget
         | 
| 5 | 
            +
                class StrtPrepareTranscriptome < Bio::Gadget
         | 
| 6 | 
            +
             | 
| 7 | 
            +
                  package_name :prepare_transcriptome
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  #
         | 
| 10 | 
            +
                  
         | 
| 11 | 
            +
                  desc 'hg38 DIR', 'GRCh38/hg38 - human'
         | 
| 12 | 
            +
                  long_desc <<-DESC
         | 
| 13 | 
            +
            Prepare transcriptome data files based on GENCODE gene annotation RELEASE for GRCh38/hg38 at DIR, which must contain the 'ref.fa.fai' genome index file.
         | 
| 14 | 
            +
            DESC
         | 
| 15 | 
            +
                  
         | 
| 16 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 17 | 
            +
                  method_option *OPT_DOWNLOAD
         | 
| 18 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 19 | 
            +
             | 
| 20 | 
            +
                  method_option :gencode,
         | 
| 21 | 
            +
                                banner: 'RELEASE',
         | 
| 22 | 
            +
                                default: 25,
         | 
| 23 | 
            +
                                desc: 'Release number of GENCODE',
         | 
| 24 | 
            +
                                type: :numeric
         | 
| 25 | 
            +
                  
         | 
| 26 | 
            +
                  # Prepare transcriptome data files for GRCh38/hg38 in DIR0.
                  #
                  # dir0 - directory that holds 'ref.fa.fai' (read below) and
                  #        'ref.fa' (passed to retrieve_seq_from_fasta.pl); all
                  #        outputs are written there.
                  #
                  # Steps, controlled by options.download ('no' skips the download,
                  # 'only' stops after it):
                  #   1. download the GENCODE annotation GTF of release options.gencode;
                  #   2. extract splice sites and exons with the hisat2 helper scripts;
                  #   3. write ribosome / spike-in BEDs from the genome index;
                  #   4. split GTF exon/transcript records into coding and other BEDs,
                  #      where "coding" means the transcript has a start_codon record;
                  #   5. merge the BEDs per gene, compress them, and build the
                  #      refGene-style annotation and mRNA FASTA.
                  def hg38(dir0)
                    dir = File.expand_path(dir0)
                    release = options.gencode
                    gtf = "#{dir}/hg38.gencode.v#{release}.annotation.gtf.gz"

                    if options.download != 'no'
                      # The release directory must follow the requested release;
                      # it was previously fixed to release_25.
                      download_file("ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_#{release}/gencode.v#{release}.annotation.gtf.gz",
                                    gtf)
                    end

                    return if options.download == 'only'

                    pipeline("unpigz -c #{gtf}",
                             "hisat2_extract_splice_sites.py - > #{dir}/transcriptome.splice_sites")

                    pipeline("unpigz -c #{gtf}",
                             "hisat2_extract_exons.py - > #{dir}/transcriptome.exons")

                    # Sequences named RIBO_* / RNA_SPIKE_* in the genome index
                    # become whole-length BED records; spike-ins additionally get
                    # a record covering positions 0-50.
                    fp_ribosome = open_bed_w("#{dir}/ribosome.bed")
                    fp_whole = open_bed_w("#{dir}/spikein_whole.bed")
                    fp_5end = open_bed_w("#{dir}/spikein_5end.bed")
                    # File.foreach closes the index file when done.
                    File.foreach("#{dir}/ref.fa.fai") do |line|
                      acc, len, = line.rstrip.split
                      if acc =~ /^RIBO_/
                        fp_ribosome.puts [acc, 0, len, acc, 0, '+'].join("\t")
                        fp_ribosome.puts [acc, 0, len, acc, 0, '-'].join("\t")
                      end
                      if acc =~ /^RNA_SPIKE_/
                        fp_whole.puts [acc, 0, len, acc, 0, '+'].join("\t")
                        fp_5end.puts [acc, 0, 50, acc, 0, '+'].join("\t")
                      end
                    end
                    fp_ribosome.close
                    fp_whole.close
                    fp_5end.close

                    # transcript_id => start codon position: GTF column 4 (start)
                    # on the '+' strand, column 5 (end) otherwise.
                    atgs = {}
                    regex_transcript_id = /transcript_id "([^"]+)"/
                    regex_gene_id = /gene_id "([^"]+)"/
                    regex_gene_name = /gene_name "([^"]+)"/
                    regex_exon_number = /exon_number (\d+)/
                    Open3.pipeline_r(
                      "unpigz -c #{gtf}",
                      "#{grep_command} '\tstart_codon\t'") do |fp, _threads|
                      fp.each do |line|
                        cols = line.rstrip.split("\t")
                        atgs[regex_transcript_id.match(cols[8]).to_a[1]] =
                          cols[cols[6] == '+' ? 3 : 4].to_i
                      end
                      fp.close
                    end

                    bed_coding_exon = "#{dir}/transcriptome.coding_exon.bed"
                    bed_coding_5utr = "#{dir}/transcriptome.coding_5utr.bed"
                    bed_coding_promoter = "#{dir}/transcriptome.coding_promoter.bed"

                    fp_coding_gene = open_bed_w("#{dir}/transcriptome.coding_gene.bed")
                    fp_coding_exon = open_bed_w(bed_coding_exon)
                    fp_coding_5utr = open_bed_w(bed_coding_5utr)
                    fp_coding_promoter = open_bed_w(bed_coding_promoter)
                    fp_other_gene = open_bed_w("#{dir}/transcriptome.other_gene.bed")
                    fp_other_exon = open_bed_w("#{dir}/transcriptome.other_exon.bed")
                    fp_other_1st_exon = open_bed_w("#{dir}/transcriptome.other_1st_exon.bed")
                    fp_other_promoter = open_bed_w("#{dir}/transcriptome.other_promoter.bed")
                    Open3.pipeline_r(
                      "unpigz -c #{gtf}",
                      "#{grep_command} -E '\t(exon|transcript)\t'") do |fp, _threads|
                      fp.each do |line|
                        cols = line.rstrip.split("\t")
                        ann = cols[8]
                        transcript_id = regex_transcript_id.match(ann).to_a[1]
                        gene_id = regex_gene_id.match(ann).to_a[1]
                        gene_name = regex_gene_name.match(ann).to_a[1]
                        exon_number = regex_exon_number.match(ann).to_a[1].to_i
                        chr = cols[0]
                        left = cols[3].to_i
                        right = cols[4].to_i
                        str = cols[6]
                        acc = "#{gene_name}|#{gene_id}|#{transcript_id}"
                        # BED record of the feature: start is shifted by -1.
                        exon = [chr, left-1, right, acc, 0, str].join("\t")
                        coding = atgs.key?(transcript_id)

                        # Transcript records only feed the *_gene BEDs.
                        if cols[2] == 'transcript'
                          (coding ? fp_coding_gene : fp_other_gene).puts exon
                          next
                        end

                        if coding
                          fp_coding_exon.puts exon
                          atg = atgs[transcript_id]
                          if (str == '+' && right < atg) || (str == '-' && atg < left)
                            # Exon lies entirely upstream of the start codon.
                            fp_coding_5utr.puts exon
                          elsif str == '+' && left < atg && atg <= right
                            # Exon contains the start codon: keep the part before it.
                            fp_coding_5utr.puts [chr, left-1, atg-1, acc, 0, '+'].join("\t")
                          elsif str == '-' && left <= atg && atg < right
                            fp_coding_5utr.puts [chr, atg, right, acc, 0, '-'].join("\t")
                          end
                          if exon_number == 1
                            # Window immediately upstream of the first exon.
                            if str == '+'
                              fp_coding_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
                            else
                              fp_coding_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
                            end
                          end
                        else
                          fp_other_exon.puts exon
                          if exon_number == 1
                            fp_other_1st_exon.puts exon
                            if str == '+'
                              fp_other_promoter.puts [chr, left-500, left-1, acc, 0, '+'].join("\t")
                            else
                              fp_other_promoter.puts [chr, right, right+499, acc, 0, '-'].join("\t")
                            end
                          end
                        end
                      end
                      fp.close
                    end
                    fp_coding_gene.close
                    fp_coding_exon.close
                    fp_coding_5utr.close
                    fp_coding_promoter.close
                    fp_other_gene.close
                    fp_other_exon.close
                    fp_other_1st_exon.close
                    fp_other_promoter.close

                    merge_bed_by_gene("Coding5end",
                                      "5UTR and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                                      "#{dir}/transcriptome.coding_5end",
                                      bed_coding_promoter, bed_coding_5utr)
                    merge_bed_by_gene("CodingWhole",
                                      "Exon and the proximal upstream of protein coding genes based on GENCODE version #{options.gencode} for GRCh38/hg38",
                                      "#{dir}/transcriptome.coding_whole",
                                      bed_coding_promoter, bed_coding_exon)

                    sh "pigz #{dir}/*.bed"

                    genePred = "#{dir}/hg38_refGene.txt"
                    pipeline(
                      "unpigz -c #{gtf}",
                      "#{grep_command} 'tag \"basic\"'",
                      "gtfToGenePred -geneNameAsName2 -genePredExt stdin #{genePred}")
                    sh "retrieve_seq_from_fasta.pl --format refGene --seqfile #{dir}/ref.fa --outfile #{dir}/hg38_refGeneMrna.fa #{genePred}"
                  end
         | 
| 175 | 
            +
             | 
| 176 | 
            +
                  no_commands do
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                    # Open a write pipe to 'bedtools sort' whose output is
                    # redirected to BED.
                    #
                    # bed - path of the BED file to create.
                    #
                    # Returns the writable IO; the caller must close it, which
                    # also waits for the sort to finish.
                    def open_bed_w(bed)
                      # IO.popen is the explicit equivalent of Kernel#open("| cmd"),
                      # whose pipe form is deprecated in recent Ruby versions.
                      IO.popen("bedtools sort -i stdin > #{bed}", 'w')
                    end
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                    # Merge the BED records of +inbeds+ gene by gene and write two files:
                    #
                    # * "#{base}.csv" - one row per gene symbol, listing the
                    #   accessions seen for it as "gacc:tacc;tacc|gacc:..."
                    # * "#{base}.bed" - a track header followed by the merged,
                    #   sorted intervals
                    #
                    # The name column (4th) of the input is expected to look like
                    # "symbol|gene_accession|transcript_accession"; the inline ruby
                    # filter keeps the first piece as the name and appends the
                    # remainder as an extra trailing column. Records are then sorted
                    # by symbol so that all records of one gene arrive consecutively.
                    #
                    # tname::  track name written into the BED header
                    # tdesc::  track description written into the BED header
                    # base::   output path prefix (".csv" and ".bed" are appended)
                    # inbeds:: input BED files, passed to cat_command
                    #
                    # NOTE(review): all paths and track labels are interpolated into
                    # shell command lines without quoting.
                    def merge_bed_by_gene(tname, tdesc, base, *inbeds)
                      tmpbed = get_temporary_path('strt.prepare_transcriptome', 'bed')
                      # Block form guarantees the CSV is closed even when a pipeline fails.
                      File.open("#{base}.csv", 'w') do |annfp|
                        annfp.puts "ID,ACCESSIONS"
                        Open3.pipeline_r(
                          "#{cat_command} #{inbeds.join(' ')}",
                          "ruby -anle 'as=$F[3].split /\\|/; puts ($F[0..2]+[as[0]]+$F[4..-1]+[as[1..-1].join(\"|\")]).join(\"\t\")'",
                          "#{sort_command} -k 4,4 -k 1,1 -k 2,2n") do |infp, _inths|
                          presym = ''
                          gacc2taccs = {}
                          outfp = nil
                          outths = nil
                          # Finish the gene currently being written: close the writer,
                          # wait for the last pipeline stage (it appends to tmpbed),
                          # then emit the annotation row for that gene.
                          finish_gene = lambda do
                            outfp.close
                            outths[-1].join
                            accstr = gacc2taccs.map { |k, v| "#{k}:#{v.join(';')}" }.join('|')
                            annfp.puts "#{presym},#{accstr}"
                          end
                          infp.each do |line|
                            chr, left, right, sym, tmp, str, accs = line.rstrip.split(/\t/)
                            gacc, tacc = accs.split(/\|/)
                            # `outfp.nil?` covers the very first record even when its
                            # symbol equals the initial sentinel (empty string).
                            if outfp.nil? || presym != sym
                              unless outfp.nil?
                                finish_gene.call
                                gacc2taccs = {}
                              end
                              # One sort+merge pipeline per gene, so intervals of
                              # different genes are never merged with each other.
                              outfp, outths = Open3.pipeline_w(
                                "bedtools sort -i stdin",
                                "bedtools merge -s -c 4 -o distinct >> #{tmpbed}")
                              presym = sym
                            end
                            outfp.puts [chr, left, right, sym, tmp, str].join("\t")
                            gacc2taccs[gacc] = [] unless gacc2taccs.key?(gacc)
                            gacc2taccs[gacc] << tacc unless gacc2taccs[gacc].include?(tacc)
                          end
                          finish_gene.call unless outfp.nil?
                        end
                      end
                      # The track header creates/truncates the BED; the data lines are appended.
                      sh "echo 'track name=#{tname} description=\"#{tdesc}\" visibility=3 colorByStrand=\"38,139,210 203,75,22\"' > #{base}.bed"
                      # Reorder the merged columns to (0, 1, 2, 4, literal 0, 3) before the final sort.
                      pipeline("ruby -anle 'puts ($F.values_at(0, 1, 2) + [$F[4], 0, $F[3]]).join(\"\t\")' < #{tmpbed}",
                               "bedtools sort -i stdin >> #{base}.bed")
                    end
         | 
| 226 | 
            +
                  end
         | 
| 227 | 
            +
                  
         | 
| 228 | 
            +
                end
         | 
| 229 | 
            +
              end
         | 
| 230 | 
            +
            end
         | 
    
        data/lib/bio/gadgets.rb
    ADDED
    
    | @@ -0,0 +1,135 @@ | |
| 1 | 
            +
            require 'bio'
         | 
| 2 | 
            +
            require 'parallel'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Bio
         | 
| 5 | 
            +
              class Gadgets < Bio::Gadget
         | 
| 6 | 
            +
                
         | 
| 7 | 
            +
                desc 'find PATTERN [NAME]',
                     'Find fragments matching with regexp PATTERN from FASTA-format STDIN'

                method_option(*OPT_BUFFER_SIZE)
                method_option(*OPT_PARALLEL)
                method_option(*OPT_COREUTILS_PREFIX)

                method_option :ignore_case,
                              default: true,
                              desc: 'Fold lower case to upper case characters',
                              type: :boolean

                # Scan every FASTA entry read from STDIN for PATTERN on both strands
                # and print the hits to STDOUT as tab-separated lines:
                # entry id, begin, end, name, '0', strand.
                #
                # Hits may overlap: after each hit the search restarts one base
                # after the hit's first position. The minus strand is searched on
                # the reversed, complemented sequence and its coordinates are
                # converted back to the forward strand.
                #
                # Each entry is processed in a forked child that writes a sorted
                # temporary file; the parent finally merges those files with
                # `sort --merge`.
                #
                # pattern:: regular expression source; wrapped in a capture group
                # name0::   value for the name column; defaults to +pattern+
                def find(pattern, name0 = '')

                  # Interpolation (rather than String#+) also accepts a numeric option value.
                  sort_buffer = options.key?('buffer_size') ? "--buffer-size=#{options.buffer_size}" : ''
                  prefix = options.coreutils_prefix
                  re = Regexp.new("(#{pattern})", options.ignore_case)
                  name = name0 == '' ? pattern : name0
                  # Never allow fewer than one worker; comparing with `>=` below
                  # keeps the number of live children bounded for any option value.
                  max_children = [options.parallel.to_i, 1].max

                  pids = []
                  tmpfiles = []
                  ff = Bio::FlatFile.open(Bio::FastaFormat, STDIN)
                  ff.each do |entry|
                    tmpfile = get_temporary_path('find', 'bed', false)
                    tmpfiles << tmpfile
                    acc = entry.entry_id
                    seq = entry.seq
                    pids << Process.fork do
                      # IO.popen replaces the deprecated `open("| ...")` form; the
                      # block closes the pipe and waits for sort to finish.
                      IO.popen("#{prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 #{sort_buffer} > #{tmpfile}", 'w') do |fp|
                        # plus strand
                        pos = 0
                        while (hit = re.match(seq, pos))
                          fp.puts [acc, hit.begin(1), hit.end(1), name, '0', '+'].join("\t")
                          pos = hit.begin(1) + 1
                        end
                        # minus strand
                        revcomp = seq.reverse.tr('acgtACGT', 'tgcaTGCA')
                        len = revcomp.length
                        pos = 0
                        while (hit = re.match(revcomp, pos))
                          fp.puts [acc, len - hit.end(1), len - hit.begin(1), name, '0', '-'].join("\t")
                          pos = hit.begin(1) + 1
                        end
                      end
                    end
                    # Throttle: block until a child exits once the limit is reached.
                    pids.delete(Process.wait) while pids.length >= max_children
                  end
                  ff.close
                  pids.delete(Process.wait) until pids.empty?

                  system "#{prefix}sort -k 1,1 -k 2,2n -k 3,3n -k 4,4 --merge #{sort_buffer} #{tmpfiles.join(' ')}"
                  unlink_files(tmpfiles)

                end
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                #
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                desc 'gap BEDgz1 BEDgz2',
                     "Calculate gap distances from 5'-end of fragments 1 to 3'-end of fragments 2"

                method_option(*OPT_PARALLEL)
                method_option(*OPT_COREUTILS_PREFIX)

                method_option :minimum_gap,
                              default: -10000,
                              desc: 'Minimum gap distance to be reported',
                              type: :numeric

                method_option :maximum_gap,
                              default: 2500,
                              desc: 'Maximum gap distance to be reported',
                              type: :numeric

                # For every pair of a fragment from BEDgz1 and a fragment from BEDgz2
                # on the same chromosome, compute a gap distance and print
                # "name1,name2,distance" to STDOUT when
                # minimum_gap <= distance < maximum_gap.
                #
                # For a '+' fragment 1 the distance is (end of fragment 2) -
                # (start of fragment 1) + 1; for any other strand value it is
                # (end of fragment 1) - (start of fragment 2) + 1.
                #
                # Chromosomes are processed in parallel, each worker writing to its
                # own temporary file; the files are concatenated at the end.
                #
                # bedgz1, bedgz2:: gzip-compressed BED files. The paths are
                #                  interpolated into shell commands without quoting.
                def gap(bedgz1, bedgz2)

                  prefix = options.coreutils_prefix

                  # Distinct chromosome names found in either input. Block-form
                  # IO.popen closes the pipe and waits for the child process.
                  chrs = IO.popen("unpigz -c #{bedgz1} #{bedgz2} | #{prefix}cut -f 1 | #{prefix}uniq | #{prefix}sort -u") do |io|
                    io.each_line.map(&:rstrip).uniq
                  end

                  max = options.maximum_gap
                  min = options.minimum_gap
                  sep = /\t/
                  # Temporary paths are allocated in the parent, before any worker starts.
                  tmpfiles = {}
                  chrs.each do |chr|
                    tmpfiles[chr] = get_temporary_path('gap', 'csv', false)
                  end

                  Parallel.each(chrs, in_processes: options.parallel) do |chr|
                    # Load all fragments 2 of this chromosome: [start, end, name, ...]
                    bed2 = IO.popen("gunzip -c #{bedgz2} | grep '^#{chr}\t'") do |io|
                      io.each_line.map do |line|
                        _chr, *cols = line.rstrip.split(sep)
                        cols[0] = cols[0].to_i
                        cols[1] = cols[1].to_i
                        cols
                      end
                    end
                    File.open(tmpfiles[chr], 'w') do |fp|
                      IO.popen("gunzip -c #{bedgz1} | grep '^#{chr}\t'") do |io|
                        io.each_line do |line|
                          _chr, start, stop, name, _score, str = line.rstrip.split(sep)
                          plus = str == '+'
                          # Convert the relevant coordinate once per fragment 1.
                          anchor = plus ? start.to_i : stop.to_i
                          bed2.each do |bed|
                            dist = plus ? bed[1] - anchor + 1 : anchor - bed[0] + 1
                            fp.puts([name, bed[2], dist].join(',')) if min <= dist && dist < max
                          end
                        end
                      end
                    end
                  end

                  # Without any file argument `cat` would read STDIN, so skip it
                  # when no chromosome was found.
                  system "cat #{tmpfiles.values.join(' ')}" unless tmpfiles.empty?
                  # Pass the paths only, as `find` does with its Array of paths.
                  # NOTE(review): unlink_files is defined outside this file - confirm
                  # that it expects a list of paths.
                  unlink_files(tmpfiles.values)

                end
         | 
| 133 | 
            +
                
         | 
| 134 | 
            +
              end
         | 
| 135 | 
            +
            end
         |