bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
    
        data/lib/bio/gadget.rb
    ADDED
    
    | @@ -0,0 +1,171 @@ | |
| 1 | 
            +
            require 'mkfifo'
         | 
| 2 | 
            +
            require 'open3'
         | 
| 3 | 
            +
            require 'tempfile'
         | 
| 4 | 
            +
            require 'thor'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Bio
              # Thor-based base class shared by the bio-gadget command-line tools.
              #
              # It provides reusable option specifications (the OPT_* constants, each
              # a `[name, settings]` pair meant to be splatted into Thor's
              # `method_option`) and helper methods that build command strings from the
              # options of the running command.
              class Gadget < Thor

                # --buffer-size / -S: forwarded to sort by #buffer_size_option.
                OPT_BUFFER_SIZE = [
                  :buffer_size, {
                    :aliases => '-S',
                    :banner => 'SIZE',
                    :desc => 'Use SIZE for main memory buffer',
                    :type => :string
                  }
                ]

                # --download: one of 'yes' (default), 'no' or 'only'.
                OPT_DOWNLOAD = [
                  :download, {
                    :banner => 'BEHAVIOR',
                    :default => 'yes',
                    :desc => 'Download and process, no download or only',
                    :enum => ['yes', 'no', 'only']
                  }
                ]

                # --parallel: the default is probed once, when this file is loaded,
                # from `gnproc`, then `nproc`, falling back to 2.
                OPT_PARALLEL = [
                  :parallel, {
                    :banner => 'N',
                    :default => (
                      if system('which gnproc >/dev/null 2>&1')
                        `gnproc`.to_i
                      elsif system('which nproc >/dev/null 2>&1')
                        `nproc`.to_i
                      else
                        2
                      end
                    ),
                    :desc => 'Change the number of sorts run concurrently',
                    :type => :numeric
                  }
                ]

                # --coreutils-prefix: defaults to 'g' when `gnproc` is on the PATH,
                # otherwise to the empty string.
                OPT_COREUTILS_PREFIX = [
                  :coreutils_prefix, {
                    :banner => 'PREFIX',
                    :default => system('which gnproc >/dev/null 2>&1') ? 'g' : '',
                    :desc => 'A prefix character for GNU coreutils',
                    :type => :string
                  }
                ]

                # --grep-prefix: defaults to 'g' when `ggrep` is on the PATH,
                # otherwise to the empty string.
                OPT_GREP_PREFIX = [
                  :grep_prefix, {
                    :banner => 'PREFIX',
                    :default => system('which ggrep >/dev/null 2>&1') ? 'g' : '',
                    :desc => 'A prefix character for GNU grep',
                    :type => :string
                  }
                ]

                #

                no_commands do

                  # Usage banner: "<basename> [<package name> ]<command usage>".
                  def self.banner(command, namespace = true, subcommand = false)
                    package = @package_name.nil? ? '' : "#{@package_name} "
                    "#{basename} #{package}#{command.usage}"
                  end

                  # " --buffer-size=SIZE" when the option is present, otherwise ''.
                  def buffer_size_option
                    options.key?(:buffer_size) ? " --buffer-size=#{options.buffer_size}" : ''
                  end

                  # Name of the `cat` executable, with the coreutils prefix applied.
                  def cat_command
                    "#{options.coreutils_prefix}cat"
                  end

                  # " --coreutils-prefix=PREFIX" when the option is present, otherwise ''.
                  def coreutils_prefix_option
                    options.key?(:coreutils_prefix) ? " --coreutils-prefix=#{options.coreutils_prefix}" : ''
                  end

                  # Name of the `cut` executable, with the coreutils prefix applied.
                  def cut_command
                    "#{options.coreutils_prefix}cut"
                  end

                  # Downloads +url+ to +path+ with curl and terminates the program
                  # with curl's exit status when the download fails.
                  #
                  # The arguments are passed to curl directly rather than through a
                  # shell command line, so paths or URLs containing spaces, quotes or
                  # other shell metacharacters are handled literally.
                  def download_file(url, path)
                    return true if system('curl', '-R', '-f', '-s', '-S', '-o', path.to_s, url.to_s)

                    # exitstatus is nil when curl was terminated by a signal
                    exit($?.exitstatus || 1)
                  end

                  # Name of the `fold` executable. Reads the prefix from the +options+
                  # argument, which shadows Thor's `options` accessor.
                  def fold_command(options)
                    "#{options.coreutils_prefix}fold"
                  end

                  # Command string for `fq1l convert`. The +options+ argument is not
                  # used; the prefix comes from the running command's own options.
                  def fq1l_convert_command(options)
                    "fq1l convert#{coreutils_prefix_option}"
                  end

                  # Command string for `fq1l count`, forwarding the coreutils prefix
                  # and the --parallel value taken from +options+.
                  def fq1l_count_command(options)
                    "fq1l count#{coreutils_prefix_option}#{parallel_option(options)}"
                  end

                  # Command string for `fq1l sort`, forwarding the coreutils prefix
                  # and the --parallel value taken from +options+.
                  def fq1l_sort_command(options)
                    "fq1l sort#{coreutils_prefix_option}#{parallel_option(options)}"
                  end

                  # Returns a unique temporary path named "rbg.<prefix>.*.<suffix>".
                  # The block given to Dir::Tmpname.create is empty, so nothing is
                  # created at that path here. With +cleanup+ the path is unlinked at
                  # program exit if it exists by then.
                  def get_temporary_path(prefix, suffix, cleanup=true)
                    tmpname = Dir::Tmpname.create(["rbg.#{prefix}.", ".#{suffix}"]) {  }
                    at_exit { File.unlink(tmpname) if FileTest.exist?(tmpname) } if cleanup
                    tmpname
                  end

                  # Creates a named pipe at a fresh temporary path and returns the path.
                  def get_fifo(prefix, suffix, cleanup=true)
                    fifo = get_temporary_path("#{prefix}.fifo", suffix, cleanup)
                    File.mkfifo(fifo)
                    fifo
                  end

                  # Name of the `grep` executable, with the grep prefix applied.
                  def grep_command
                    "#{options.grep_prefix}grep"
                  end

                  # " --grep-prefix=PREFIX" when +options+ has the key, otherwise ''.
                  def grep_prefix_option(options)
                    options.key?(:grep_prefix) ? " --grep-prefix=#{options.grep_prefix}" : ''
                  end

                  # Name of the `head` executable, prefix read from +options+.
                  def head_command(options)
                    "#{options.coreutils_prefix}head"
                  end

                  # " --parallel=N" when +options+ has the key, otherwise ''.
                  def parallel_option(options)
                    options.key?(:parallel) ? " --parallel=#{options.parallel}" : ''
                  end

                  # Runs +cmds+ through Open3.pipeline and raises a RuntimeError naming
                  # the first process whose exit status is not successful.
                  def pipeline(*cmds)
                    stats = Open3.pipeline(*cmds)
                    stats.each_with_index do |stat, i|
                      raise "Fail at process #{i}; #{stat}; #{cmds[i]}" unless stat.success?
                    end
                  end

                  # `sort` command string with the buffer-size and parallel settings of
                  # the running command; pigz is used to compress sort's temporaries.
                  def sort_command
                    "#{options.coreutils_prefix}sort#{buffer_size_option}#{parallel_option(options)} --compress-program=pigz"
                  end

                  # Runs +cmd+ via Kernel#system and raises a RuntimeError unless it
                  # finished successfully.
                  def sh(cmd)
                    system cmd
                    raise "Fail at process #{$?.pid}; #{$?}; #{cmd}" unless $?.success?
                  end

                  # Name of the `tee` executable, prefix read from +options+.
                  def tee_command(options)
                    "#{options.coreutils_prefix}tee"
                  end

                  # Name of the `uniq` executable, prefix read from +options+.
                  def uniq_command(options)
                    "#{options.coreutils_prefix}uniq"
                  end

                  # Removes every path in +files+ that currently exists.
                  def unlink_files(files)
                    files.each do |file|
                      File.unlink(file) if File.exist?(file)
                    end
                  end

                  # Name of the `wc` executable, prefix read from +options+.
                  def wc_command(options)
                    "#{options.coreutils_prefix}wc"
                  end

                end

              end
            end
         | 
| 167 | 
            +
             | 
| 168 | 
            +
            require 'bio/gadgets'
         | 
| 169 | 
            +
            require 'bio/gadget/fq1l'
         | 
| 170 | 
            +
            require 'bio/gadget/strt'
         | 
| 171 | 
            +
            require 'bio/gadget/bio_gadget'
         | 
| @@ -0,0 +1,457 @@ | |
| 1 | 
            +
            require 'damerau-levenshtein'
         | 
| 2 | 
            +
            require 'io/wait'
         | 
| 3 | 
            +
            require 'open3'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Bio
         | 
| 6 | 
            +
              class Gadget
         | 
| 7 | 
            +
                class Fq1l < Bio::Gadget
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                  OPT_INVERT_MATCH = [
         | 
| 10 | 
            +
                    :invert_match, {
         | 
| 11 | 
            +
                      :desc => 'The sense of matching',
         | 
| 12 | 
            +
                      :type => :boolean
         | 
| 13 | 
            +
                    }
         | 
| 14 | 
            +
                  ]
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                  OPT_MINIMUM_LENGTH = [
         | 
| 17 | 
            +
                    :minimum_length, {
         | 
| 18 | 
            +
                      :banner => 'NT',
         | 
| 19 | 
            +
                      :default => 40,
         | 
| 20 | 
            +
                      :desc => 'Minimum length after trimming',
         | 
| 21 | 
            +
                      :type => :numeric
         | 
| 22 | 
            +
                    }
         | 
| 23 | 
            +
                  ]
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  # fq1l:annotate_index
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  desc 'annotate_index', 'Annotate sequence identifier by index sequence at the specified region'
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                  method_option :first_cycle,
         | 
| 30 | 
            +
                                default: 7,
         | 
| 31 | 
            +
                                desc: 'The first cycle of index',
         | 
| 32 | 
            +
                                type: :numeric
         | 
| 33 | 
            +
                  
         | 
| 34 | 
            +
                  method_option :last_cycle,
         | 
| 35 | 
            +
                                default: 12,
         | 
| 36 | 
            +
                                desc: 'The last cycle of index',
         | 
| 37 | 
            +
                                type: :numeric
         | 
| 38 | 
            +
                  
         | 
| 39 | 
            +
                  # Command body for `annotate_index`.
                  #
                  # Exits right away when STDIN.wait returns a falsy value; otherwise the
                  # --first-cycle and --last-cycle option values are handed to
                  # BioGadget.i2i, which is defined outside this file (presumably in the
                  # C extension -- TODO confirm).
                  def annotate_index
                    if STDIN.wait
                      first_cycle = options.first_cycle
                      last_cycle = options.last_cycle
                      BioGadget.i2i(first_cycle, last_cycle)
                    else
                      exit
                    end
                  end
         | 
| 43 | 
            +
                  
         | 
| 44 | 
            +
                  # fq1l:annotate_umi
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                  desc 'annotate_umi', 'Annotate sequence identifier by UMI sequence at the specified region'
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                  method_option :first_cycle,
         | 
| 49 | 
            +
                                default: 1,
         | 
| 50 | 
            +
                                desc: 'The first cycle of UMI',
         | 
| 51 | 
            +
                                type: :numeric
         | 
| 52 | 
            +
                  
         | 
| 53 | 
            +
                  method_option :last_cycle,
         | 
| 54 | 
            +
                                default: 6,
         | 
| 55 | 
            +
                                desc: 'The last cycle of UMI',
         | 
| 56 | 
            +
                                type: :numeric
         | 
| 57 | 
            +
                  
         | 
| 58 | 
            +
                  # Command body for `annotate_umi`.
                  #
                  # Exits right away when STDIN.wait returns a falsy value; otherwise the
                  # --first-cycle and --last-cycle option values are handed to
                  # BioGadget.u2i, which is defined outside this file (presumably in the
                  # C extension -- TODO confirm).
                  def annotate_umi
                    if STDIN.wait
                      first_cycle = options.first_cycle
                      last_cycle = options.last_cycle
                      BioGadget.u2i(first_cycle, last_cycle)
                    else
                      exit
                    end
                  end
         | 
| 62 | 
            +
                  
         | 
| 63 | 
            +
                  # fq1l:convert
         | 
| 64 | 
            +
                  
         | 
| 65 | 
            +
                  desc 'convert', 'Convert fastq from 4 lines/read to 1 line/read for this utility'
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 68 | 
            +
                  
         | 
| 69 | 
            +
                  # Command body for `convert`: replaces this process with
                  # `<prefix>paste - - - -`, which joins every four input lines into one
                  # tab-separated line. Exits first when STDIN.wait returns a falsy value.
                  def convert
                    exit unless STDIN.wait
                    paste = "#{options.coreutils_prefix}paste"
                    exec "#{paste} - - - -"
                  end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                  # fq1l:count
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                  desc 'count [CSV]', 'Count sequences by the length'
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 79 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 80 | 
            +
                  
         | 
| 81 | 
            +
                  # Command body for `count [CSV]`.
                  #
                  # Without +csv+: prints a "length,reads" header and then one
                  # "<length>,<reads>" line per distinct length of the second
                  # tab-separated field read from standard input.
                  #
                  # With +csv+: copies standard input to standard output unchanged (tee)
                  # while a second `fq1l count` process, fed through a named pipe,
                  # writes the table to +csv+.
                  #
                  # Raises RuntimeError when a child process fails.
                  def count(csv = nil)
                    exit unless STDIN.wait
                    if csv.nil?
                      puts "length,reads"
                      # the children write to the same stdout; flush so the header
                      # cannot end up after their output
                      $stdout.flush
                      pipeline("#{cut_command} -f 2",
                               "ruby -nle 'puts $_.length'",
                               "#{sort_command} -n",
                               "#{uniq_command(options)} -c",
                               "ruby -anle 'puts $F.reverse.join(\",\")'")
                    else
                      fifo = get_fifo('fq1l.count', 'fq1l')
                      # shared helper, so --parallel reaches the child as well
                      counter = "#{fq1l_count_command(options)} < #{fifo} > #{csv}"
                      pid = Kernel.spawn(counter)
                      tee = "#{tee_command(options)} #{fifo}"
                      system tee
                      tee_status = $?
                      Process.waitpid(pid)
                      counter_status = $?
                      unless tee_status.success?
                        raise "Fail at process #{tee_status.pid}; #{tee_status}; #{tee}"
                      end
                      unless counter_status.success?
                        raise "Fail at process #{counter_status.pid}; #{counter_status}; #{counter}"
                      end
                    end
                  end
         | 
| 97 | 
            +
                  
         | 
| 98 | 
            +
                  # fq1l:demultiplex
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                  desc 'demultiplex BASE MAP', 'Demultiplex based on a barcode MAP, and restore sequence files with BASE names'
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                  method_option :maximum_distance,
         | 
| 103 | 
            +
                                default: 1,
         | 
| 104 | 
            +
                                desc: 'Maximum distance between barcode and sequence',
         | 
| 105 | 
            +
                                type: :numeric
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                  # Command body for `demultiplex BASE MAP`.
                  #
                  # MAP is read as "barcode,well" lines. Each record on standard input
                  # (tab-separated: identifier, sequence, separator, quality) is appended
                  # as four lines to "BASE.<well>.fq" for the barcode closest to the tail
                  # of its identifier, or to "BASE.NA.fq" when no barcode is within
                  # --maximum-distance. Existing output files are removed first, and
                  # every output written is compressed with pigz at the end.
                  def demultiplex(base, map)

                    dl = DamerauLevenshtein

                    exit unless STDIN.wait

                    # barcode => output path; File.foreach closes the map when done and
                    # never treats the path as a command
                    bc2fq = {}
                    File.foreach(map) do |line|
                      next if line.strip.empty? # tolerate blank lines in the map
                      bc, well = line.rstrip.split(',')
                      bc2fq[bc] = fq = "#{base}.#{well}.fq"
                      File.unlink(fq) if File.exist?(fq)
                    end
                    na = "#{base}.NA.fq"
                    File.unlink(na) if File.exist?(na)

                    # length of the shortest barcode; that many trailing characters of
                    # the identifier are compared against the barcodes
                    bcl = bc2fq.keys.map(&:length).min

                    fp = nil
                    pbc = nil
                    begin
                      STDIN.set_encoding('BINARY').each do |line|
                        acc, seq, sep, qual = line.rstrip.split(/\t/)
                        bc = acc[-bcl, bcl]
                        # the destination is re-evaluated only when the barcode differs
                        # from the previous record's
                        if bc != pbc
                          mindist = options.maximum_distance+1
                          minbc = nil
                          bc2fq.each_key do |key|
                            dist = dl.distance(key, bc, 0, options.maximum_distance)
                            if dist < mindist
                              mindist = dist
                              minbc = key
                            end
                            break if dist == 0
                          end
                          fp.close unless fp.nil?
                          fp = File.open(mindist <= options.maximum_distance ? bc2fq[minbc] : na, 'a')
                          pbc = bc
                        end
                        fp.puts "#{acc}\n#{seq}\n#{sep}\n#{qual}"
                      end
                    ensure
                      # also release the handle when a record raises
                      fp.close unless fp.nil?
                    end

                    # argument-list form: paths are handed to pigz without shell parsing
                    bc2fq.each_value { |fq| system('pigz', fq) if File.exist?(fq) }
                    system('pigz', na) if File.exist?(na)

                  end
         | 
| 152 | 
            +
                  
         | 
| 153 | 
            +
                  # fq1l:exclude_degenerate
         | 
| 154 | 
            +
             | 
| 155 | 
            +
                  desc 'exclude_degenerate', 'Exclude degenerated reads in the order'
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                  # Command body for `exclude_degenerate`: delegates to BioGadget.nr_deg,
                  # which is defined outside this file (presumably in the C extension --
                  # TODO confirm). Exits first when STDIN.wait returns a falsy value.
                  def exclude_degenerate
                    if STDIN.wait
                      BioGadget.nr_deg
                    else
                      exit
                    end
                  end
         | 
| 161 | 
            +
                  
         | 
| 162 | 
            +
                  # fq1l:exclude_duplicate
         | 
| 163 | 
            +
             | 
| 164 | 
            +
                  desc 'exclude_duplicate', 'Exclude duplicated reads in the order'
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                  # Command body for `exclude_duplicate`: delegates to BioGadget.nr_std,
                  # which is defined outside this file (presumably in the C extension --
                  # TODO confirm). Exits first when STDIN.wait returns a falsy value.
                  def exclude_duplicate
                    if STDIN.wait
                      BioGadget.nr_std
                    else
                      exit
                    end
                  end
         | 
| 170 | 
            +
             | 
| 171 | 
            +
                  # fq1l:match_3end
         | 
| 172 | 
            +
             | 
| 173 | 
            +
                  desc 'match_3end PATTERN', 'Select sequences that match the 3\'-end with a given PATTERN'
         | 
| 174 | 
            +
             | 
| 175 | 
            +
                  method_option *OPT_INVERT_MATCH
         | 
| 176 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                  # Command body for `match_3end PATTERN`: filters standard input with
                  # grep, keeping lines whose second tab-separated field ends with
                  # PATTERN (or, with --invert-match, the lines that do not).
                  #
                  # Exit status: 0 when grep exits with 0 (matches) or 1 (no match);
                  # any other grep exit status is passed through unchanged.
                  def match_3end(pattern)
                    exit unless STDIN.wait
                    # PCRE was faster than BRE and ERE in GNU grep 2.25
                    system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t[^\\t]*#{pattern}\\t'"
                    # exitstatus is the real exit code ($?.to_i is the raw wait status);
                    # it is nil when grep was terminated by a signal
                    code = $?.exitstatus || 1
                    exit(code <= 1 ? 0 : code)
                  end
         | 
| 184 | 
            +
                  
         | 
| 185 | 
            +
                  # fq1l:match_5end
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                  desc 'match_5end PATTERN', 'Select sequences that match the 5\'-end with a given PATTERN'
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                  method_option *OPT_INVERT_MATCH
         | 
| 190 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                  # Command body for `match_5end PATTERN`: filters standard input with
                  # grep, keeping lines whose second tab-separated field starts with
                  # PATTERN (or, with --invert-match, the lines that do not).
                  #
                  # Exit status: 0 when grep exits with 0 (matches) or 1 (no match);
                  # any other grep exit status is passed through unchanged.
                  def match_5end(pattern)
                    exit unless STDIN.wait
                    # PCRE was faster than BRE and ERE in GNU grep 2.25
                    system "#{grep_command}#{options.invert_match ? ' -v' : ''} -P -e '^[^\\t]+\\t#{pattern}'"
                    # exitstatus is the real exit code ($?.to_i is the raw wait status);
                    # it is nil when grep was terminated by a signal
                    code = $?.exitstatus || 1
                    exit(code <= 1 ? 0 : code)
                  end
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                  # fq1l:restore
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                  desc 'restore', 'Convert fastq from 1 line/read to 4 lines/read'
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 204 | 
            +
                  
         | 
| 205 | 
            +
                  # Command body for `restore`: replaces this process with
                  # `<prefix>tr "\t" "\n"`, turning every tab on standard input into a
                  # newline. Exits first when STDIN.wait returns a falsy value.
                  def restore
                    exit unless STDIN.wait
                    tr = "#{options.coreutils_prefix}tr"
                    exec %Q(#{tr} "\\t" "\\n")
                  end
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                  # fq1l:slice
         | 
| 211 | 
            +
             | 
| 212 | 
            +
                  desc 'slice Nth SLICE', 'Slice the sequences'
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                  # Command body for `slice Nth SLICE`: converts both arguments to
                  # integers and hands them to BioGadget.slice, which is defined outside
                  # this file (presumably in the C extension -- TODO confirm). Exits
                  # first when STDIN.wait returns a falsy value.
                  def slice(nth, slice)
                    exit unless STDIN.wait
                    numbers = [nth, slice].map(&:to_i)
                    BioGadget.slice(*numbers)
                  end
         | 
| 218 | 
            +
                  
         | 
| 219 | 
            +
                  # fq1l:sort
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                  desc 'sort [FQ1Ls]', 'Sort by sequence and the quality in descending order'
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 224 | 
            +
                  method_option *OPT_BUFFER_SIZE
         | 
| 225 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 226 | 
            +
             | 
| 227 | 
            +
                  # Command body for `sort [FQ1Ls]`: replaces this process with a
                  # tab-delimited reverse sort on fields 2-4. With file arguments the
                  # files are merged (`-m`); without any, standard input is sorted, after
                  # exiting early when STDIN.wait returns a falsy value.
                  def sort(*fq1ls)
                    # exec does not return, so the code below only runs without files
                    unless fq1ls.size == 0
                      exec "#{sort_command} -t '\t' -r -k2,4 -m #{fq1ls.join(' ')}"
                    end
                    exit unless STDIN.wait
                    exec "#{sort_command} -t '\t' -r -k2,4"
                  end
         | 
| 235 | 
            +
             | 
| 236 | 
            +
                  # fq1l:sort_index
         | 
| 237 | 
            +
             | 
| 238 | 
            +
                  desc 'sort_index', 'Sort by index'
         | 
| 239 | 
            +
                  
         | 
| 240 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 241 | 
            +
                  method_option *OPT_BUFFER_SIZE
         | 
| 242 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 243 | 
            +
                  
         | 
| 244 | 
            +
                  # Command body for `sort_index`: replaces this process with the sort
                  # command using key `-k2`. Exits first when STDIN.wait returns a falsy
                  # value.
                  def sort_index
                    exit unless STDIN.wait
                    command = "#{sort_command} -k2"
                    exec command
                  end
         | 
| 248 | 
            +
             | 
| 249 | 
            +
                  # fq1l:sum_counts
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                  desc 'sum_counts CSV ...', 'Sum counts of sequences by the length'
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                  # Sum per-length counts over any number of CSV files and print the
                  # merged table to STDOUT: a "length,count" header followed by one
                  # row per length in ascending order.
                  #
                  # Every input line is split on commas; the first field is taken as
                  # the length and the second as the count (both via to_i). Rows whose
                  # first field is "length" (headers) and blank lines are skipped.
                  #
                  # csvs - paths of the CSV files to merge
                  def sum_counts(*csvs)
                    length2count = Hash.new(0)
                    csvs.each do |csv|
                      # File.foreach closes the file when iteration ends and, unlike
                      # Kernel#open, never runs a path starting with "|" as a command.
                      File.foreach(csv) do |line|
                        l, c = line.rstrip.split(/,/)
                        # l is nil for a blank line; counting it would add a "0,0" row.
                        next if l.nil? || l == 'length'
                        length2count[l.to_i] += c.to_i
                      end
                    end
                    puts "length,count"
                    length2count.keys.sort.each do |length|
                      puts "#{length},#{length2count[length]}"
                    end
                  end
         | 
| 269 | 
            +
                  
         | 
| 270 | 
            +
                  # fq1l:thin_out
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                  desc 'thin_out DRAW SKIP', 'Thin out the sequences'
         | 
| 273 | 
            +
             | 
| 274 | 
            +
                  # Wait until STDIN is ready, then hand the two arguments, converted
                  # to integers, to BioGadget.to.
                  # NOTE(review): the preceding desc advertises this command as
                  # 'thin_out' while the method is named 'to' - confirm which name
                  # is intended.
                  def to(draw, skip)
                    STDIN.wait || exit
                    n_draw, n_skip = [draw, skip].map(&:to_i)
                    BioGadget.to(n_draw, n_skip)
                  end
         | 
| 278 | 
            +
                  
         | 
| 279 | 
            +
                  # fq1l:trim_3end
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                  desc 'trim_3end SEQUENCE', 'Trim 3\'-end that match with a given SEQUENCE'
         | 
| 282 | 
            +
             | 
| 283 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 284 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 285 | 
            +
                  method_option *OPT_MINIMUM_LENGTH
         | 
| 286 | 
            +
                  
         | 
| 287 | 
            +
                  # Destination file for the trimmed reads; when the option is absent,
                  # trim_3end cats its temporary file to STDOUT instead.
                  method_option :trimmed,
                                banner: 'FILE',
                                desc: 'FILE for trimmed reads; STDOUT if not specified',
                                type: :string
         | 
| 291 | 
            +
             | 
| 292 | 
            +
                  # Trim reads whose 3'-end matches SEQUENCE.
                  #
                  # A forked child runs BioGadget.t3 on the output of
                  # "fq1l match_3end SEQUENCE" reading from a FIFO, writing its result
                  # to --trimmed FILE or, without that option, to a temporary file.
                  # Meanwhile the parent pipes the tee command (targeting the FIFO) into
                  # "fq1l match_3end --invert-match". Without --trimmed the temporary
                  # file is finally cat'ed to STDOUT and removed.
                  #
                  # sequence - the 3'-end sequence to match
                  def trim_3end(sequence)
                    # exit unless STDIN.wait
                    grep_opt = options.key?(:grep_prefix) ? " --grep-prefix=#{options.grep_prefix}" : ''
                    keep_trimmed = options.key?(:trimmed)
                    fifo = get_fifo('fq1l.trim_3end', 'fq1l', false)
                    begin
                      tmpfile = keep_trimmed ? File.expand_path(options.trimmed) : get_temporary_path('fq1l.trim_3end', 'fq1l', false)
                      begin
                        pid = Process.fork do
                          BioGadget.t3("fq1l match_3end#{grep_opt} #{sequence} < #{fifo}", sequence.length, options.minimum_length, tmpfile)
                        end
                        pipeline("#{tee_command(options)} #{fifo}",
                                 "fq1l match_3end#{grep_opt} #{sequence} --invert-match")
                        # Wait for the child so tmpfile is complete before anyone reads
                        # it (the cat below, or the caller that passed --trimmed).
                        # Only done on the success path: this assumes pipeline blocks
                        # until its commands exit and raises on failure, so the FIFO
                        # writer has closed and the child reaches EOF - TODO confirm.
                        Process.waitpid(pid)
                      ensure
                        system "#{cat_command} #{tmpfile}" unless keep_trimmed
                      end
                    ensure
                      File.unlink(fifo) if File.exist?(fifo)
                      # tmpfile is nil if obtaining the path itself raised.
                      File.unlink(tmpfile) if tmpfile && !keep_trimmed && File.exist?(tmpfile)
                    end
                  end
         | 
| 312 | 
            +
             | 
| 313 | 
            +
                  # fq1l:trim_3end_length
         | 
| 314 | 
            +
             | 
| 315 | 
            +
                  desc 'trim_3end_length', 'Trim 3\'-end by a specific length'
         | 
| 316 | 
            +
             | 
| 317 | 
            +
                  method_option *OPT_MINIMUM_LENGTH
         | 
| 318 | 
            +
             | 
| 319 | 
            +
                  method_option :trimming_length,
         | 
| 320 | 
            +
                                default: 1,
         | 
| 321 | 
            +
                                desc: 'Length of the trimming',
         | 
| 322 | 
            +
                                type: :numeric
         | 
| 323 | 
            +
             | 
| 324 | 
            +
                  # Wait until STDIN is ready, then call BioGadget.t3 with no command
                  # and no output path, passing the --trimming-length and
                  # --minimum-length option values.
                  def trim_3end_length
                    STDIN.wait || exit
                    trim_nt = options.trimming_length
                    keep_nt = options.minimum_length
                    BioGadget.t3(nil, trim_nt, keep_nt, nil)
                  end
         | 
| 328 | 
            +
                  
         | 
| 329 | 
            +
                  # fq1l:trim_3end_primer
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                  desc 'trim_3end_primer', 'Trim 3\'-end that match with a given primer'
         | 
| 332 | 
            +
             | 
| 333 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 334 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 335 | 
            +
                  method_option *OPT_MINIMUM_LENGTH
         | 
| 336 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 337 | 
            +
             | 
| 338 | 
            +
                  # Primer sequences; trim_3end_primer splits the value on commas.
                  method_option :primers,
                                default: 'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG,CTCGTATGCCGTCTTCTGCTTG',
                                desc: 'Comma-separated primer sequences to be used for trimming',
                                type: :string
         | 
| 342 | 
            +
             | 
| 343 | 
            +
                  # Trim 3'-ends that match any leading fragment of the given primers.
                  #
                  # Every distinct prefix of every primer is collected and grouped by
                  # length. Going from the longest length down, one "fq1l trim_3end"
                  # stage per fragment is chained into a single pipeline, each stage
                  # writing to its own temporary file via --trimmed. Once a length is
                  # reached for which all 4**length possible fragments are present, a
                  # single "fq1l trim_3end_length" stage is appended instead and the
                  # chain ends. After the pipeline succeeds the temporary files are
                  # cat'ed to STDOUT; they are removed in every case.
                  #
                  # Raises RuntimeError when any pipeline stage exits unsuccessfully.
                  def trim_3end_primer
                    opt_minimum_length = "--minimum-length=#{options.minimum_length}"

                    # length => distinct primer prefixes of that length, in first-seen order
                    fragments = Hash.new { |by_length, len| by_length[len] = [] }
                    options.primers.split(',').each do |primer|
                      (1..primer.length).each do |len|
                        fragment = primer[0, len]
                        fragments[len] << fragment unless fragments[len].include?(fragment)
                      end
                    end

                    exit unless STDIN.wait

                    # Options forwarded unchanged to every trim_3end stage.
                    prefix_opts = ''
                    prefix_opts += " --coreutils-prefix=#{options.coreutils_prefix}" if options.key?(:coreutils_prefix)
                    prefix_opts += " --grep-prefix=#{options.grep_prefix}" if options.key?(:grep_prefix)

                    tmpfiles = []
                    commands = []
                    begin
                      fragments.keys.sort.reverse_each do |length|
                        if 4**length == fragments[length].size
                          # Every possible fragment of this length is present, so the
                          # rest is handled by one fixed-length trim.
                          commands << "fq1l trim_3end_length --trimming-length=#{length} #{opt_minimum_length}"
                          break
                        end
                        fragments[length].sort.reverse_each do |fragment|
                          tmpfile = get_temporary_path("fq1l.trim_3end_primer.#{fragment}", 'fq1l', false)
                          tmpfiles << tmpfile
                          commands << "fq1l trim_3end#{prefix_opts} #{opt_minimum_length} --trimmed=#{tmpfile} #{fragment}"
                        end
                      end

                      stats = Open3.pipeline(*commands)
                      stats.each_with_index do |stat, i|
                        raise "Fail at process #{i}; #{stat}; #{commands[i]}" unless stat.success?
                      end

                      # cat without file arguments would read STDIN instead.
                      system "#{cat_command} #{tmpfiles.join(' ')}" unless tmpfiles.empty?
                    ensure
                      # Also runs when Open3.pipeline itself raises.
                      unlink_files(tmpfiles)
                    end
                  end
         | 
| 390 | 
            +
             | 
| 391 | 
            +
                  # fq1l:trim_3end_quality
         | 
| 392 | 
            +
             | 
| 393 | 
            +
                  desc 'trim_3end_quality', 'Trim 3\'-end from a low quality base'
         | 
| 394 | 
            +
             | 
| 395 | 
            +
                  method_option *OPT_MINIMUM_LENGTH
         | 
| 396 | 
            +
                  
         | 
| 397 | 
            +
                  method_option :low_qualities,
         | 
| 398 | 
            +
                                banner: 'CHARACTERS',
         | 
| 399 | 
            +
                                default: '!"#',
         | 
| 400 | 
            +
                                desc: 'Low quality characters',
         | 
| 401 | 
            +
                                type: :string
         | 
| 402 | 
            +
                  
         | 
| 403 | 
            +
                  # Call BioGadget.t3q with the --low-qualities characters and the
                  # --minimum-length option value.
                  def trim_3end_quality
                    low_chars = options.low_qualities
                    keep_nt = options.minimum_length
                    BioGadget.t3q(low_chars, keep_nt)
                  end
         | 
| 406 | 
            +
                  
         | 
| 407 | 
            +
                  # fq1l:trim_5end
         | 
| 408 | 
            +
             | 
| 409 | 
            +
                  desc 'trim_5end PATTERN', 'Trim 5\'-end that match with a given PATTERN'
         | 
| 410 | 
            +
             | 
| 411 | 
            +
                  method_option :minimum_length,
         | 
| 412 | 
            +
                                banner: 'NT',
         | 
| 413 | 
            +
                                default: 24,
         | 
| 414 | 
            +
                                desc: 'Minimum length after trimming',
         | 
| 415 | 
            +
                                type: :numeric
         | 
| 416 | 
            +
                                  
         | 
| 417 | 
            +
                  
         | 
| 418 | 
            +
                  # Wait until STDIN is ready, then call BioGadget.t5 with PATTERN and
                  # the --minimum-length option value.
                  def trim_5end(pattern)
                    STDIN.wait || exit
                    keep_nt = options.minimum_length
                    BioGadget.t5(pattern, keep_nt)
                  end
         | 
| 422 | 
            +
             | 
| 423 | 
            +
                  # #
         | 
| 424 | 
            +
             | 
| 425 | 
            +
                  # no_commands do
         | 
| 426 | 
            +
                    
         | 
| 427 | 
            +
                  #   def pipeline(parallel, *commands)
         | 
| 428 | 
            +
                  #     stats = Array.new
         | 
| 429 | 
            +
                  #     tmpin = nil
         | 
| 430 | 
            +
                  #     tmpout = nil
         | 
| 431 | 
            +
                  #     begin
         | 
| 432 | 
            +
                  #       while commands.size > 0
         | 
| 433 | 
            +
                  #         cmds = commands.shift(parallel)
         | 
| 434 | 
            +
                  #         cmds[0] = cmds[0] + " < #{tmpin}" unless tmpin.nil?
         | 
| 435 | 
            +
                  #         if commands.size > 0
         | 
| 436 | 
            +
                  #           tmpout = get_temporary_path('pipeline', 'tmp', false)
         | 
| 437 | 
            +
                  #           cmds[-1] = cmds[-1] + " > #{tmpout}"
         | 
| 438 | 
            +
                  #         end
         | 
| 439 | 
            +
                  #         tmpstats = Open3.pipeline(*cmds)
         | 
| 440 | 
            +
                  #         stats.concat(tmpstats)
         | 
| 441 | 
            +
                  #         tmpstats.each {|tmpstat| commands = nil unless tmpstat.success? }
         | 
| 442 | 
            +
                  #         break if commands.nil?
         | 
| 443 | 
            +
                  #         File.unlink(tmpin) unless tmpin.nil?
         | 
| 444 | 
            +
                  #         tmpin = tmpout
         | 
| 445 | 
            +
                  #       end
         | 
| 446 | 
            +
                  #     ensure
         | 
| 447 | 
            +
                  #       File.unlink(tmpin) if !tmpin.nil? && File.exist?(tmpin)
         | 
| 448 | 
            +
                  #       File.unlink(tmpout) if !tmpout.nil? && File.exist?(tmpout)
         | 
| 449 | 
            +
                  #     end
         | 
| 450 | 
            +
                  #     stats
         | 
| 451 | 
            +
                  #   end
         | 
| 452 | 
            +
                    
         | 
| 453 | 
            +
                  # end
         | 
| 454 | 
            +
                  
         | 
| 455 | 
            +
                end
         | 
| 456 | 
            +
              end
         | 
| 457 | 
            +
            end
         |