bio-gadget 0.4.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +10 -20
- data/.travis.yml +5 -0
- data/LICENSE +1 -1
- data/README.org +0 -21
- data/Rakefile +5 -1
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/bio-gadget.gemspec +20 -14
- data/exe/bio-gadget +14 -0
- data/exe/fq1l +5 -0
- data/exe/rbg +1 -0
- data/exe/strt +5 -0
- data/ext/bio_gadget/bio_gadget.c +313 -0
- data/ext/bio_gadget/bio_gadget.h +8 -0
- data/ext/bio_gadget/extconf.rb +3 -0
- data/lib/bio/gadget.rb +171 -0
- data/lib/bio/gadget/fq1l.rb +457 -0
- data/lib/bio/gadget/strt.rb +605 -0
- data/lib/bio/gadget/strt/count.rb +53 -0
- data/lib/bio/gadget/strt/depth.rb +124 -0
- data/lib/bio/gadget/strt/prepare_transcriptome.rb +230 -0
- data/lib/bio/gadgets.rb +135 -0
- data/test/bio/gadget_test.rb +11 -0
- data/test/test_helper.rb +4 -0
- metadata +109 -40
- data/Gthorfile +0 -2
- data/bin/bio-gadget +0 -5
- data/lib/bio-gadget.rb +0 -44
- data/lib/bio-gadget/dedup.rb +0 -33
- data/lib/bio-gadget/demlt.rb +0 -149
- data/lib/bio-gadget/femrg.rb +0 -61
- data/lib/bio-gadget/fqxz.rb +0 -30
- data/lib/bio-gadget/peak.rb +0 -94
- data/lib/bio-gadget/qvstat.rb +0 -34
- data/lib/bio-gadget/rgt2mtx.rb +0 -60
- data/lib/bio-gadget/version.rb +0 -9
- data/lib/bio-gadget/wig5p.rb +0 -51
- data/lib/bio-gadget/wigchr.rb +0 -28
| @@ -0,0 +1,605 @@ | |
| 1 | 
            +
            require 'csv'
         | 
| 2 | 
            +
            require 'fileutils'
         | 
| 3 | 
            +
            require 'open3'
         | 
| 4 | 
            +
            require 'parallel'
         | 
| 5 | 
            +
            require 'thread'
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            require 'bio/gadget/strt/prepare_transcriptome.rb'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            module Bio
         | 
| 10 | 
            +
              class Gadget
         | 
| 11 | 
            +
                class Strt < Bio::Gadget
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  # Option spec for the genome assembly (default 'hg38'); commands attach it
                  # with `method_option *OPT_GENOME`.
                  OPT_GENOME = [
                    :genome,
                    { default: 'hg38', desc: 'Genome assembly' }
                  ]
         | 
| 15 | 
            +
                  
         | 
| 16 | 
            +
                  # Option spec for the UMI length in nucleotides (numeric, default 6);
                  # commands attach it with `method_option *OPT_UMI_LENGTH`.
                  OPT_UMI_LENGTH = [
                    :umi_length,
                    { banner: 'NT', default: 6, desc: 'Length of UMI', type: :numeric }
                  ]
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  # strt:alignment
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                  desc 'alignment REFDIR SEQDIR MAPDIR', 'Align reads to reference'
         | 
| 24 | 
            +
                  long_desc <<-DESC
         | 
| 25 | 
            +
            Align STRT reads (*.fq.gz files at SEQDIR) to a reference (REFDIR/ref.*.ht2). The alignments will be at MAPDIR/*.bam, and per-base 5'-end counts will be at MAPDIR/*.bed.gz.
         | 
| 26 | 
            +
            DESC
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                  method_option *OPT_BUFFER_SIZE
         | 
| 29 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 30 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 31 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                  # Align every SEQDIR/*.fq.gz against REFDIR/ref, then count 5'-ends.
                  #
                  # Step 1 (sequential): for each read file, run hisat2, drop SAM lines whose
                  # NH tag is 2 or more, sort into MAPDIR/<sample>.bam and index it.
                  # Step 2 (threaded): for every MAPDIR/*.bam, run `strt count_per_base` and
                  # compress its output to MAPDIR/<sample>.bed.gz.
                  def alignment(refdir, seqdir, mapdir)
                    reads_glob = "#{File.expand_path(seqdir)}/*.fq.gz"
                    Dir.glob(reads_glob).each do |reads|
                      sample = File.basename(reads, '.fq.gz')
                      STDERR.puts "#{`date`.strip}: Align #{sample}..."
                      sorted_bam = "#{mapdir}/#{sample}.bam"

                      aligner = "hisat2 --no-unal --rna-strandness F --dta-cufflinks -p #{options.parallel} -x #{refdir}/ref -U #{reads}"
                      drop_multihits = "#{grep_command} -v -E 'NH:i:([2-9][0-9]*|1[0-9]+)'"
                      sorter = "samtools sort -@ #{options.parallel} -o #{sorted_bam}"

                      pipeline(aligner, drop_multihits, sorter)
                      sh "samtools index #{sorted_bam}"
                    end

                    STDERR.puts "#{`date`.strip}: Count from all alignments."
                    aligned = Dir.glob("#{File.expand_path(mapdir)}/*.bam")
                    Parallel.map(aligned, in_threads: options.parallel) do |path|
                      counter = "strt count_per_base#{buffer_size_option}#{coreutils_prefix_option}#{parallel_option(options)} #{path}"
                      compressor = "pigz -c > #{mapdir}/#{File.basename(path, '.bam')}.bed.gz"
                      pipeline(counter, compressor)
                    end
                  end
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                  # strt:build_index
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                  desc 'build_index DIR', 'Build index for alignment'
         | 
| 59 | 
            +
                  long_desc <<-DESC
         | 
| 60 | 
            +
            Build index for alignment of STRT reads, from the specified GENOME, TRANSCRIPTOME and VARIATION, at DIR.
         | 
| 61 | 
            +
            DESC
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 64 | 
            +
                  method_option *OPT_GENOME
         | 
| 65 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 66 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 67 | 
            +
                  
         | 
| 68 | 
            +
                  # Build the hisat2 index under DIR.
                  #
                  # Runs the `strt prepare_*` sub-commands (in threads) to fetch and format
                  # genome, variation, transcriptome, spike-in and ribosome data, concatenates
                  # the FASTA files into DIR/ref.fa, indexes it with samtools, finishes the
                  # transcriptome/variation preparation, and finally calls hisat2-build.
                  #
                  # Every external command is checked: on failure the process exits with the
                  # child's exit status, so a broken step can no longer fall through to
                  # hisat2-build with missing or partial inputs.
                  def build_index(dir0)
                    dir = File.expand_path(dir0)
                    FileUtils.mkdir_p(dir)

                    # Run a shell command; abort with its exit status if it fails.
                    run = lambda { |cmd| system(cmd) || exit($?.exitstatus) }

                    STDERR.puts "#{`date`.strip}: Preparing data files..."

                    Parallel.map(
                      ["strt prepare_genome#{coreutils_prefix_option}#{genome_option(options)} #{dir}",
                       "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=only #{dir}",
                       "strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=only #{dir}",
                       "strt prepare_spikein#{coreutils_prefix_option} #{dir}",
                       "strt prepare_ribosome#{coreutils_prefix_option}#{genome_option(options)} #{dir}"], in_threads: options.parallel) do |cmd|
                      run.call(cmd)
                    end

                    # The combined reference must exist before the remaining preparation steps run.
                    run.call("unpigz -c #{dir}/genome.fa.gz #{dir}/spikein.fa.gz #{dir}/ribosome.fa.gz > #{dir}/ref.fa")
                    run.call("samtools faidx #{dir}/ref.fa")

                    Parallel.map(["strt prepare_transcriptome #{options.genome}#{coreutils_prefix_option}#{grep_prefix_option(options)} --download=no #{dir}",
                                  "strt prepare_variation#{coreutils_prefix_option}#{genome_option(options)} --download=no #{dir}"], in_threads: options.parallel) do |cmd|
                      STDERR.puts cmd
                      run.call(cmd)
                    end

                    STDERR.puts "#{`date`.strip}: Building index..."

                    run.call("hisat2-build -f -p #{options.parallel} --snp #{dir}/variation.snp --haplotype #{dir}/variation.haplotype --ss #{dir}/transcriptome.splice_sites --exon #{dir}/transcriptome.exons #{dir}/ref.fa #{dir}/ref")
                  end
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                  # strt:call_allele
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                  desc 'call_allele CSV REFDIR MAPDIR', 'Call allele frequency'
         | 
| 102 | 
            +
                  long_desc <<-DESC
         | 
| 103 | 
            +
            Call allele frequencies of multiple samples specified in a design CSV, based on alignment files at MAPDIR, and reference sequence 'ref.fa' and the index at REFDIR.
         | 
| 104 | 
            +
            DESC
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                  method_option *OPT_GENOME
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                  # Call variants across all samples listed in the design CSV and annotate them.
                  #
                  # csv    - design table; its `base` column names the samples (MAPDIR/<base>.bam).
                  # refdir - directory holding ref.fa; also passed to table_annovar.pl.
                  # mapdir - directory holding the per-sample BAM files.
                  #
                  # Writes strt-call_allele.bcf next to the CSV, then pipes it through
                  # table_annovar.pl with the `strt-call_allele` output prefix in the same directory.
                  def call_allele(csv, refdir, mapdir)
                    design = CSV.table(csv)
                    bams = get_temporary_path('strt.call_allele', 'bams')
                    # Block form closes (and flushes) the list even if a write raises, so
                    # samtools never sees a half-written file through a leaked handle.
                    File.open(bams, 'w') do |fp|
                      design[:base].each { |bam| fp.puts "#{mapdir}/#{bam}.bam" }
                    end
                    csvdir = File.dirname(csv)
                    bcf = "#{csvdir}/strt-call_allele.bcf"

                    pipeline("samtools mpileup -u -t AD,ADF,ADR,DP -f #{refdir}/ref.fa -b #{bams}",
                             "bcftools call --multiallelic-caller --variants-only --output-type u",
                             "bcftools filter -s LowQual -e '%QUAL<20 || MIN(FORMAT/DP)<20' --output-type b > #{bcf}")
                    pipeline("bcftools view #{bcf}",
                             "table_annovar.pl - #{refdir} --buildver #{options.genome} --outfile #{csvdir}/strt-call_allele --remove --protocol refGene --operation g --nastring . --vcfinput")
                  end
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                  # strt:count_per_base
         | 
| 127 | 
            +
             | 
| 128 | 
            +
                  desc 'count_per_base BAM',  'Count reads per base'
         | 
| 129 | 
            +
                  long_desc <<-DESC
         | 
| 130 | 
            +
            Count reads per base, based on an alignment BAM.
         | 
| 131 | 
            +
            DESC
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                  method_option *OPT_BUFFER_SIZE
         | 
| 134 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 135 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 136 | 
            +
             | 
| 137 | 
            +
                  # Emit per-base 5'-end read counts for one BAM as BED-like lines.
                  #
                  # The pipeline converts the BAM to BED, reduces each read to the single
                  # base at its 5' end (start for '+', end-1 for the other strand), sorts,
                  # counts identical positions, and prints
                  # chrom, start, end, <BAM basename>, count, strand.
                  def count_per_base(bam)
                    sample = File.basename(bam, '.bam')
                    stages = [
                      "bedtools bamtobed -i #{bam}",
                      "ruby -F'\t' -anle 'puts [$F[0], $F[5]==\"+\" ? $F[1] : $F[2].to_i-1, $F[5]==\"+\" ? $F[1].to_i+1 : $F[2], $F[5]].join(\"\t\")'",
                      "#{sort_command} -t '\t' -k 1,1 -k 2,2n",
                      "#{uniq_command(options)} -c",
                      "ruby -anle 'puts ($F[1..3]+[\"#{sample}\", $F[0], $F[4]]).join(\"\t\")'"
                    ]
                    pipeline(*stages)
                  end
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                  # strt:count_per_region
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                  desc 'count_per_region COUNT REG [REG ...]',
         | 
| 151 | 
            +
                       'Count reads per region'
         | 
| 152 | 
            +
                  long_desc <<-DESC
         | 
| 153 | 
            +
            Count reads per region for a sample. Read counts (a BED-format COUNT) within regions (BED-format REGions) are summed by the region names.
         | 
| 154 | 
            +
            DESC
         | 
| 155 | 
            +
                  
         | 
| 156 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                  # Sum per-base counts over named regions and print an "ID,COUNT" CSV.
                  #
                  # count            - BED-format count file (the -a input to bedtools intersect).
                  # region0/regions0 - one or more BED-format region files (the -b inputs).
                  #
                  # Columns 5 and 11 of the strand-aware intersection are kept and fed to an
                  # inline Ruby script that adds the first field up per value of the second.
                  def count_per_region(count, region0, *regions0)
                    region_files = [region0, *regions0].join(' ')
                    intersect = "bedtools intersect -nonamecheck -s -wa -wb -a #{count} -b #{region_files}"
                    pick_columns = "#{cut_command} -f 5,11"
                    summarize = "ruby -F'\t' -e 'n2c={}; while gets; c,n=$_.strip.split /\\t/; n2c[n]=(n2c.key?(n) ? n2c[n] : 0)+c.to_i; end; puts \"ID,COUNT\"; n2c.each {|n,c| puts \"\#{n},\#{c}\"}'"
                    pipeline(intersect, pick_columns, summarize)
                  end
         | 
| 166 | 
            +
                  
         | 
| 167 | 
            +
                  # strt:prepare_genome
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                  desc 'prepare_genome DIR', 'Prepare genome data'
         | 
| 170 | 
            +
                  long_desc <<-DESC
         | 
| 171 | 
            +
            Prepare data files of the specified GENOME at DIR.
         | 
| 172 | 
            +
            DESC
         | 
| 173 | 
            +
             | 
| 174 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 175 | 
            +
                  method_option *OPT_DOWNLOAD
         | 
| 176 | 
            +
                  method_option *OPT_GENOME
         | 
| 177 | 
            +
             | 
| 178 | 
            +
                  # Fetch the UCSC chromosome FASTA tarball for the selected genome and turn
                  # it into DIR/genome.fa.gz.
                  #
                  # --download=no   skips the rsync and only reformats an existing tarball.
                  # --download=only fetches the tarball and stops.
                  # The remote file is "<genome>.chromFa.tar.gz" for hg38 and "chromFa.tar.gz"
                  # for any other assembly; the local copy is always DIR/<genome>.chromFa.tar.gz.
                  def prepare_genome(dir0)
                    dir = File.expand_path(dir0)
                    tgz = "#{dir}/#{options.genome}.chromFa.tar.gz"
                    ucsc = "rsync://hgdownload.cse.ucsc.edu/goldenPath/#{options.genome}/bigZips"

                    unless options.download == 'no'
                      remote = options.genome == 'hg38' ? "#{options.genome}.chromFa.tar.gz" : "chromFa.tar.gz"
                      rsync_file("#{ucsc}/#{remote}", tgz)
                    end
                    return if options.download == 'only'

                    # Archive members with "_" in their name are excluded; each sequence is
                    # joined onto one line and then re-wrapped at 50 columns.
                    stages = [
                      "unpigz -c #{tgz}",
                      "#{options.coreutils_prefix}tar -xOf - --exclude \"*_*\"",
                      "gawk 'BEGIN{p=\"\"} /^>/{ print p $1 } !/^>/{ printf $1; p=\"\\n\" } END{ print }'",
                      "#{fold_command(options)} -w 50",
                      "pigz -c > #{dir}/genome.fa.gz"
                    ]
                    pipeline(*stages)
                  end
         | 
| 199 | 
            +
                  
         | 
| 200 | 
            +
                  # strt:prepare_reads
         | 
| 201 | 
            +
             | 
| 202 | 
            +
                  desc 'prepare_reads BASE MAP FQGZ ...', 'Prepare STRT reads'
         | 
| 203 | 
            +
                  long_desc <<-DESC
         | 
| 204 | 
            +
            Prepare STRT reads from raw sequence files before alignment. After demultiplexing, it performs
         | 
| 205 | 
            +
            (i) exclusion of redundant reads,
         | 
| 206 | 
            +
            (ii) exclusion of noncanonical reads, which do not begin with template switching primer,
         | 
| 207 | 
            +
            (iii) trimming from low-quality base,
         | 
| 208 | 
            +
            (iv) trimming from sequence similar to HiSeq universal primer,
         | 
| 209 | 
            +
            and (v) trimming of the template switching primer.
         | 
| 210 | 
            +
             | 
| 211 | 
            +
            Mandatory parameters are (1) BASE; basename for demultiplexed and gzipped fastq files, (2) MAP; filename of comma-separated table between barcode and well, and (3) FQGZs; one or more filenames of raw sequences; each file is gzipped fastq. When MAP contains 'CAAAGT,A2' and BASE is '~/test', reads having CAAAGT-like barcode are in '~/test.A2.fq.gz' file after the preprocesses.
         | 
| 212 | 
            +
            DESC
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 215 | 
            +
                  method_option *OPT_GREP_PREFIX
         | 
| 216 | 
            +
                  method_option *OPT_PARALLEL
         | 
| 217 | 
            +
                  method_option *OPT_UMI_LENGTH
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                  method_option :maximum_memory,
         | 
| 220 | 
            +
                                default: 50,
         | 
| 221 | 
            +
                                desc: 'Maximum memory usage in percent rate',
         | 
| 222 | 
            +
                                type: :numeric
         | 
| 223 | 
            +
                  
         | 
| 224 | 
            +
                  method_option :minimum_length,
         | 
| 225 | 
            +
                                banner: 'NT',
         | 
| 226 | 
            +
                                default: 24,
         | 
| 227 | 
            +
                                desc: 'Minimum length after the preprocess',
         | 
| 228 | 
            +
                                type: :numeric
         | 
| 229 | 
            +
                  
         | 
| 230 | 
            +
                  method_option :reads,
         | 
| 231 | 
            +
                                desc: 'Number of raw reads for the preprocess',
         | 
| 232 | 
            +
                                type: :numeric
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                  # method_option :maximum_distance,
         | 
| 235 | 
            +
                  #               default: 1,
         | 
| 236 | 
            +
                  #               desc: 'Maximum distance between barcode and sequence',
         | 
| 237 | 
            +
                  #               type: :numeric
         | 
| 238 | 
            +
             | 
| 239 | 
            +
                  # Demultiplex raw gzipped FASTQ files by barcode, then clean each well's reads.
                  #
                  # base          - output prefix; writes BASE.count.step1.csv, BASE.count.step2.csv,
                  #                 BASE.WELL.count.step3..8.csv and BASE.WELL.fq.gz.
                  # map           - text file of "barcode,well" lines (blank lines are ignored).
                  # fqgz0/fqgzs0  - one or more raw *.fq.gz files.
                  #
                  # Raises ArgumentError when MAP defines no barcode, and RuntimeError when a
                  # stage of a demultiplexing pipeline fails. Exits with the child's status if
                  # `fq1l sum_counts` fails.
                  def prepare_reads(base, map, fqgz0, *fqgzs0)
                    fqgzs = [fqgz0] + fqgzs0

                    # barcode => well. File.foreach closes MAP once it has been read.
                    bcs = {}
                    File.foreach(map) do |line|
                      bc, well = line.rstrip.split(',')
                      next if bc.nil? || bc.empty? # a blank line used to crash on nil.length
                      bcs[bc] = well
                    end
                    raise ArgumentError, "No barcode found in #{map}" if bcs.empty?

                    # Length of the shortest barcode.
                    bcl = bcs.keys.map(&:length).min

                    # UMI bases, then barcode bases, then "GG", each base matched by '.'.
                    tso_pattern = '.'*options.umi_length + '.'*bcl + 'GG'

                    #

                    STDERR.puts "#{`date`.strip}: Demultiplexing each raw sequence files..."

                    # Per-input temporary paths: two count CSVs and a demultiplex basename.
                    fqgz2csv0 = {}
                    fqgz2csv1 = {}
                    fqgz2base = {}
                    fqgzs.each do |fqgz|
                      fqgz2csv0[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
                      fqgz2csv1[fqgz] = get_temporary_path('strt.preprocess', 'csv', false)
                      fqgz2base[fqgz] = get_temporary_path('strt.preprocess', 'base', false)
                    end

                    Parallel.map(fqgz2csv0.keys, in_processes: options.parallel) do |fqgz|
                      cmds = [
                        "unpigz -c #{fqgz}",
                        "#{fq1l_convert_command(options)}",
                        "#{fq1l_count_command(options)} #{fqgz2csv0[fqgz]}",
                        "fq1l match_5end#{grep_prefix_option(options)} #{tso_pattern}",
                        "#{fq1l_count_command(options)} #{fqgz2csv1[fqgz]}",
                        "fq1l annotate_index --first-cycle=#{options.umi_length+1} --last-cycle=#{options.umi_length+bcl}",
                        "fq1l annotate_umi --first-cycle=1 --last-cycle=#{options.umi_length}",
                        "fq1l sort_index#{coreutils_prefix_option}#{parallel_option(options)} --buffer-size=#{(options.maximum_memory/(fqgz2csv0.keys.size+1)).to_i}%",
                        "fq1l demultiplex #{fqgz2base[fqgz]} #{map}"
                      ]
                      # With --reads, cap the input right after the fq1l conversion stage.
                      cmds.insert(2, "#{head_command(options)} -n #{options.reads}") unless options.reads.nil?
                      stats = Open3.pipeline(*cmds)
                      stats.each_index do |i|
                        # Termination by signal 13 is tolerated (presumably SIGPIPE from the
                        # truncating stage closing the pipe early -- TODO confirm).
                        raise "Fail at process #{i}; #{stats[i]}; #{cmds[i]}" unless stats[i].success? || (stats[i].signaled? && stats[i].termsig == 13)
                      end
                    end

                    # Merge the per-input counts; stop here if the summary cannot be written.
                    system("fq1l sum_counts #{fqgz2csv0.values.join(' ')} > #{base}.count.step1.csv") || exit($?.exitstatus)
                    unlink_files(fqgz2csv0.values)

                    system("fq1l sum_counts #{fqgz2csv1.values.join(' ')} > #{base}.count.step2.csv") || exit($?.exitstatus)
                    unlink_files(fqgz2csv1.values)

                    #

                    # 'NA' is processed as one extra well after those named in MAP.
                    (bcs.values + ['NA']).each do |well|

                      STDERR.puts "#{`date`.strip}: Finishing well #{well}..."

                      # Block variable is named tmpbase so it does not shadow the BASE argument.
                      tmpfqgzs = fqgz2base.values.map { |tmpbase| "#{tmpbase}.#{well}.fq.gz" }
                      csvs = Array.new(6) { |i| "#{base}.#{well}.count.step#{i+3}.csv" }

                      pipeline("unpigz -c #{tmpfqgzs.join(' ')}",
                               "#{fq1l_convert_command(options)}",
                               "#{fq1l_count_command(options)} #{csvs[0]}",
                               "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
                               "fq1l exclude_duplicate",
                               "#{fq1l_count_command(options)} #{csvs[1]}",
                               "fq1l trim_3end_quality",
                               "#{fq1l_count_command(options)} #{csvs[2]}",
                               "fq1l trim_3end_primer#{coreutils_prefix_option}#{grep_prefix_option(options)}#{parallel_option(options)}",
                               "#{fq1l_count_command(options)} #{csvs[3]}",
                               "#{fq1l_sort_command} --buffer-size=#{(options.maximum_memory/2).to_i}%",
                               "fq1l exclude_degenerate",
                               "#{fq1l_count_command(options)} #{csvs[4]}",
                               "fq1l trim_5end --minimum-length=#{options.minimum_length} #{tso_pattern}+",
                               "#{fq1l_count_command(options)} #{csvs[5]}",
                               "fq1l restore#{coreutils_prefix_option}",
                               "pigz -c > #{base}.#{well}.fq.gz")

                      unlink_files(tmpfqgzs)

                    end

                  end
         | 
| 323 | 
            +
             | 
| 324 | 
            +
                  # strt:prepare_ribosome
         | 
| 325 | 
            +
             | 
| 326 | 
            +
                  desc 'prepare_ribosome DIR', 'Prepare ribosome data'
         | 
| 327 | 
            +
                  long_desc <<-DESC
         | 
| 328 | 
            +
            Prepare ribosome data files for the specified GENOME at DIR.
         | 
| 329 | 
            +
            DESC
         | 
| 330 | 
            +
             | 
| 331 | 
            +
                  method_option *OPT_COREUTILS_PREFIX
         | 
| 332 | 
            +
                  method_option *OPT_DOWNLOAD
         | 
| 333 | 
            +
                  method_option *OPT_GENOME
         | 
| 334 | 
            +
             | 
| 335 | 
            +
                  # Create DIR/ribosome.fa.gz for the selected genome.
                  #
                  # For assemblies whose name starts with "hg", the U13369 record is fetched
                  # from NCBI (unless --download=no), renamed to RIBO_U13369.1 and re-wrapped
                  # at 50 columns (unless --download=only). For every other assembly the file
                  # is produced from the output of a bare `echo`.
                  def prepare_ribosome(dir0)
                    dir = File.expand_path(dir0)

                    unless options.genome.start_with?('hg')
                      return pipeline("echo", "pigz -c > #{dir}/ribosome.fa.gz")
                    end

                    unless options.download == 'no'
                      download_file("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=555853&strand=1&rettype=fasta&retmode=text", "#{dir}/U13369.fa")
                      # Stop with pigz's exit status if compression fails.
                      system("pigz #{dir}/U13369.fa") || exit($?.exitstatus)
                    end
                    return if options.download == 'only'

                    stages = [
                      "unpigz -c #{dir}/U13369.fa.gz",
                      "gawk '/^>/{ print \">RIBO_U13369.1\" } !/^>/{ printf $1 } END{ print }'",
                      "#{fold_command(options)} -w 50",
                      "pigz -c > #{dir}/ribosome.fa.gz"
                    ]
                    pipeline(*stages)
                  end
         | 
| 354 | 
            +
             | 
| 355 | 
            +
                  # strt:prepare_spikein
         | 
| 356 | 
            +
             | 
| 357 | 
            +
                  # Thor declaration of the prepare_spikein command: usage, help text and
                  # the options it accepts.
                  desc('prepare_spikein DIR', 'Prepare spikein data')
                  long_desc(<<-DESC)
            Prepare spikein data files at DIR.
            DESC

                  method_option(*OPT_COREUTILS_PREFIX)
                  method_option(*OPT_DOWNLOAD)

                  # Writes DIR/spikein.fa.gz from the ERCC92 archive.
                  #
                  # Unless --download=no, ERCC92.zip is downloaded into DIR first. Unless
                  # --download=only, ERCC92.fa is then read from the archive and each
                  # record is renamed to ">RNA_SPIKE_<original name>", its sequence lines
                  # are joined and prefixed with "AATTC" (followed by "GAGCTC" for
                  # ERCC-00130), and the result is passed through the fold command with
                  # "-w 50" and compressed.
                  #
                  # dir0 - output directory; expanded to an absolute path.
                  def prepare_spikein(dir0)
                    dir = File.expand_path(dir0)
                    zip = "#{dir}/ERCC92.zip"

                    unless options.download == 'no'
                      download_file("https://tools.thermofisher.com/content/sfs/manuals/ERCC92.zip", zip)
                    end
                    return if options.download == 'only'

                    extract = "unzip -cq #{zip} ERCC92.fa"
                    relabel = "gawk 'BEGIN{p=\"\"} /^>/{ print p \">RNA_SPIKE_\" substr($1, 2); printf(\"AATTC\" ($1 == \">ERCC-00130\" ? \"GAGCTC\" : \"\") ) } /^[ACGT]/{ printf $1; p=\"\\n\" } END{ print }'"
                    rewrap = "#{fold_command(options)} -w 50"
                    compress = "pigz -c > #{dir}/spikein.fa.gz"
                    pipeline(extract, relabel, rewrap, compress)
                  end
         | 
| 377 | 
            +
                  
         | 
| 378 | 
            +
                  # strt:prepare_transcriptome
         | 
| 379 | 
            +
             | 
| 380 | 
            +
                  # Attach Bio::Gadget::StrtPrepareTranscriptome to this CLI under the
                  # name 'prepare_transcriptome', with its usage line and description.
                  register(
                    Bio::Gadget::StrtPrepareTranscriptome,
                    'prepare_transcriptome',
                    'prepare_transcriptome GENOME',
                    'Prepare transcriptome data'
                  )
         | 
| 384 | 
            +
                  
         | 
| 385 | 
            +
                  # strt:prepare_variation
         | 
| 386 | 
            +
             | 
| 387 | 
            +
                  # Thor declaration of the prepare_variation command: usage, help text
                  # and the options it accepts (including the dbSNP build number).
                  desc('prepare_variation DIR', 'Prepare variation data')
                  long_desc(<<-DESC)
            Prepare genome variation data files for the specified GENOME dir based on common variations in dbSNP BUILD, at DIR.
            DESC

                  method_option(*OPT_COREUTILS_PREFIX)
                  method_option(*OPT_DOWNLOAD)
                  method_option(*OPT_GENOME)

                  method_option(:dbsnp,
                                banner: 'BUILD',
                                default: 146,
                                desc: 'Build number of dbSNP',
                                type: :numeric)

                  # Fetches the UCSC snp<BUILD>Common table for the selected genome and
                  # feeds it, together with DIR/genome.fa.gz, to
                  # hisat2_extract_snps_haplotypes_UCSC.py, which is given DIR/variation
                  # as its output base name.
                  #
                  # The rsync download is skipped with --download=no; the extraction
                  # step is skipped with --download=only.
                  #
                  # dir0 - working directory; expanded to an absolute path.
                  def prepare_variation(dir0)
                    dir = File.expand_path(dir0)
                    table = "snp#{options.dbsnp}Common.txt.gz"
                    snp = "#{dir}/#{options.genome}.#{table}"

                    unless options.download == 'no'
                      rsync_file("rsync://hgdownload.soe.ucsc.edu/goldenPath/#{options.genome}/database/#{table}", snp)
                    end
                    unless options.download == 'only'
                      pipeline("unpigz -c #{dir}/genome.fa.gz",
                               "hisat2_extract_snps_haplotypes_UCSC.py - #{snp} #{dir}/variation")
                    end
                  end
         | 
| 413 | 
            +
             | 
| 414 | 
            +
                  # strt:qualify
         | 
| 415 | 
            +
             | 
| 416 | 
            +
                  # Thor declaration of the qualify command: usage, help text and the
                  # options it accepts.
                  desc 'qualify CSV REFDIR SEQDIR MAPDIR', 'Qualify samples'
                  long_desc <<-DESC
            Qualify samples in a design CSV.
            DESC

                  method_option(*OPT_BUFFER_SIZE)
                  method_option(*OPT_COREUTILS_PREFIX)
                  method_option(*OPT_PARALLEL)

                  # Prints the design CSV extended with per-sample quality columns.
                  #
                  # csv    - design CSV with a header row; its BASE column names the samples.
                  # refdir - directory holding ribosome.bed.gz, spikein_whole.bed.gz,
                  #          spikein_5end.bed.gz, transcriptome.coding_whole.bed.gz and
                  #          transcriptome.coding_5end.bed.gz.
                  # seqdir - directory holding <BASE>.count.step8.csv, each with a
                  #          "reads" column.
                  # mapdir - directory holding <BASE>.bed.gz.
                  #
                  # Columns are appended in this order: TOTAL_READS, MAPPED_READS,
                  # MAPPED_RATE, RIBOSOME_READS, SPIKEIN_READS, SPIKEIN_5END_READS,
                  # SPIKEIN_5END_RATE, RELATIVE_POLYA_RNAS, CODING_READS,
                  # CODING_5END_READS, CODING_5END_RATE, RELATIVE_MRNAS.
                  # Per-sample work runs in options.parallel threads.
                  def qualify(csv, refdir, seqdir, mapdir)
                    # Pipeline tail that keeps field 5 of each line and prints the sum.
                    count_commands = ["#{cut_command} -f 5",
                                      "ruby -e 'n=0; while gets; n+=$_.to_i; end; puts n'"]

                    # Options are passed as keywords: CSV.read no longer accepts a
                    # positional options hash on Ruby 3.
                    samples = CSV.read(csv, headers: true, converters: :numeric)
                    bases = samples["BASE"]

                    # Per sample: summed field 5 of the mapped records that overlap
                    # (same strand, reported once) REFDIR/<reference>.
                    overlap_counts = lambda do |reference|
                      Parallel.map(bases, in_threads: options.parallel) do |base|
                        pipeline_readline("bedtools intersect -nonamecheck -u -s -a #{mapdir}/#{base}.bed.gz -b #{refdir}/#{reference}",
                                          *count_commands).to_i
                      end
                    end

                    # Per sample: float ratio of two columns already present in the table.
                    ratios = lambda do |numerator, denominator|
                      samples.map { |row| row[numerator].to_f / row[denominator] }
                    end

                    samples["TOTAL_READS"] =
                      Parallel.map(bases, in_threads: options.parallel) do |base|
                        CSV.table("#{seqdir}/#{base}.count.step8.csv")[:reads].inject(0, :+)
                      end

                    samples["MAPPED_READS"] =
                      Parallel.map(bases, in_threads: options.parallel) do |base|
                        pipeline_readline("unpigz -c #{mapdir}/#{base}.bed.gz",
                                          *count_commands).to_i
                      end
                    samples["MAPPED_RATE"] = ratios.call("MAPPED_READS", "TOTAL_READS")

                    samples["RIBOSOME_READS"] = overlap_counts.call("ribosome.bed.gz")
                    samples["SPIKEIN_READS"] = overlap_counts.call("spikein_whole.bed.gz")
                    samples["SPIKEIN_5END_READS"] = overlap_counts.call("spikein_5end.bed.gz")
                    samples["SPIKEIN_5END_RATE"] = ratios.call("SPIKEIN_5END_READS", "SPIKEIN_READS")

                    # Mapped reads that are neither ribosomal nor spike-in, relative to
                    # the spike-in 5'-end reads.
                    samples["RELATIVE_POLYA_RNAS"] = samples.map { |row|
                      (row["MAPPED_READS"] - row["RIBOSOME_READS"] - row["SPIKEIN_READS"]) / row["SPIKEIN_5END_READS"].to_f
                    }

                    samples["CODING_READS"] = overlap_counts.call("transcriptome.coding_whole.bed.gz")
                    samples["CODING_5END_READS"] = overlap_counts.call("transcriptome.coding_5end.bed.gz")
                    samples["CODING_5END_RATE"] = ratios.call("CODING_5END_READS", "CODING_READS")
                    samples["RELATIVE_MRNAS"] = ratios.call("CODING_5END_READS", "SPIKEIN_5END_READS")

                    puts samples
                  end
         | 
| 513 | 
            +
                  
         | 
| 514 | 
            +
                  # strt:quantify
         | 
| 515 | 
            +
             | 
| 516 | 
            +
                  # Thor declaration of the quantify command: usage, help text and the
                  # options it accepts.
                  desc 'quantify CSV REFDIR MAPDIR REGBASE',
                       'Quantify samples'
                  long_desc <<-DESC
            Count reads per region for multiple samples in CSV. Read counts (a BED-format COUNT) within regions (REFDIR/ribosome.bed.gz, REFDIR/spikein_5end.bed.gz & REGBASE.bed.gz) are summed by the region names. Additional columns in REGBASE.csv are attached as annotations.
            DESC

                  method_option(*OPT_COREUTILS_PREFIX)
                  method_option(*OPT_PARALLEL)

                  # Prints a CSV matrix of per-region counts for every sample.
                  #
                  # csv     - design CSV with a header row; its BASE column names the samples.
                  # refdir  - directory holding ribosome.bed.gz and spikein_5end.bed.gz.
                  # mapdir  - directory holding <BASE>.bed.gz.
                  # regbase - path prefix of the region files REGBASE.bed.gz and
                  #           REGBASE.csv (the latter keyed by its ID column).
                  #
                  # Output columns: ID, the annotation columns of REGBASE.csv ('NA'
                  # for names absent from it), then one "N|<BASE>" column per sample
                  # (count divided by that sample's summed RNA_SPIKE_* counts, times
                  # 1000) and one "R|<BASE>" column per sample (the count as reported
                  # by `strt count_per_region`). A region missing from a sample is 0.
                  def quantify(csv, refdir, mapdir, regbase)
                    # Options are passed as keywords: CSV.read no longer accepts a
                    # positional options hash on Ruby 3.
                    samples = CSV.read(csv, headers: true, converters: :numeric)
                    bases = samples["BASE"]

                    regions = CSV.read("#{regbase}.csv", headers: true, converters: :numeric)
                    anns = regions.headers - ['ID']
                    name2anns = {}
                    regions.each { |row| name2anns[row['ID']] = row.values_at(*anns) }

                    header = ['ID'] + anns +
                             bases.map { |base| "N|#{base}" } +
                             bases.map { |base| "R|#{base}" }
                    puts header.join(',')

                    name2base2cnt = {}
                    # Initialised up front so worker threads only touch it under the lock.
                    base2spike = {}
                    bases.each { |base| base2spike[base] = 0.0 }
                    locker = Mutex.new

                    Parallel.each(bases, in_threads: options.parallel) do |base|
                      command = "strt count_per_region#{coreutils_prefix_option} #{mapdir}/#{base}.bed.gz #{refdir}/ribosome.bed.gz #{refdir}/spikein_5end.bed.gz #{regbase}.bed.gz"
                      # Block form closes the pipe even if parsing raises.
                      IO.popen(command) do |fp|
                        fp.gets # discard the first line (presumably a header row; confirm against count_per_region)
                        fp.each do |line|
                          name, cnt = line.strip.split(',')
                          locker.synchronize do
                            (name2base2cnt[name] ||= {})[base] = cnt
                            base2spike[base] += cnt.to_f if name =~ /^RNA_SPIKE_/
                          end
                        end
                      end
                    end

                    name2base2cnt.each do |name, base2cnt|
                      annotations = name2anns.fetch(name) { Array.new(anns.length, 'NA') }
                      normalized = bases.map do |base|
                        base2cnt.key?(base) ? base2cnt[base].to_f / base2spike[base] * 1000 : 0
                      end
                      raw = bases.map { |base| base2cnt.fetch(base, 0) }
                      puts(([name] + annotations + normalized + raw).join(','))
                    end
                  end
         | 
| 573 | 
            +
                  
         | 
| 574 | 
            +
                  #
         | 
| 575 | 
            +
             | 
| 576 | 
            +
                  no_commands do
         | 
| 577 | 
            +
             | 
| 578 | 
            +
                    # Returns " --download=<value>" built from options.download. The
                    # result starts with a space, so it can be appended to a command
                    # string as-is.
                    def download_option(options)
                      format(' --download=%s', options.download)
                    end
         | 
| 581 | 
            +
             | 
| 582 | 
            +
                    # Returns " --genome=<value>" built from options.genome. The result
                    # starts with a space, so it can be appended to a command string
                    # as-is.
                    def genome_option(options)
                      format(' --genome=%s', options.genome)
                    end
         | 
| 585 | 
            +
             | 
| 586 | 
            +
                    # Runs cmds as a shell pipeline and returns the first line of its
                    # output with surrounding whitespace stripped.
                    #
                    # cmds - one command per pipeline stage, as accepted by
                    #        Open3.pipeline_r.
                    #
                    # Returns "" when the pipeline writes nothing, so callers applying
                    # #to_i get 0 instead of a NoMethodError on nil. The block form of
                    # pipeline_r closes the pipe and waits for every stage even if the
                    # read raises.
                    def pipeline_readline(*cmds)
                      first_line = nil
                      Open3.pipeline_r(*cmds) do |fp, _threads|
                        first_line = fp.gets
                      end
                      first_line.to_s.strip
                    end
         | 
| 593 | 
            +
             | 
| 594 | 
            +
                    # Copies remote to local with `rsync -a`; on failure the process
                    # exits with rsync's exit status.
                    #
                    # remote - rsync source (e.g. an rsync:// URL).
                    # local  - destination path.
                    #
                    # The arguments are handed to rsync as separate argv entries rather
                    # than through a shell string, so paths containing spaces or shell
                    # metacharacters are passed through unchanged.
                    def rsync_file(remote, local)
                      system('rsync', '-a', remote, local) or exit $?.exitstatus
                    end
         | 
| 597 | 
            +
                    
         | 
| 598 | 
            +
                  end
         | 
| 599 | 
            +
                  
         | 
| 600 | 
            +
                end
         | 
| 601 | 
            +
              end
         | 
| 602 | 
            +
            end
         | 
| 603 | 
            +
             | 
| 604 | 
            +
            require 'bio/gadget/strt/count.rb'
         | 
| 605 | 
            +
            require 'bio/gadget/strt/depth.rb'
         |