RubyGems - bio-ngs - Versions diffs - 0.3.2.alpha.01 → 0.4.2.alpha.01 - Mend

bio-ngs 0.3.2.alpha.01 → 0.4.2.alpha.01

Files changed (18) hide show

data/Gemfile +5 -3
data/Gemfile.lock +36 -30
data/README.rdoc +33 -0
data/VERSION +1 -1
data/bio-ngs.gemspec +30 -22
data/ext/mkrf_conf.rb +0 -2
data/lib/bio/appl/ngs/cufflinks/iterators.rb +35 -0
data/lib/bio/appl/ngs/cufflinks.rb +180 -85
data/lib/bio/appl/ngs/fastx.rb +24 -1
data/lib/bio/appl/ngs/tophat.rb +24 -8
data/lib/bio/ngs/ext/versions.yaml +12 -12
data/lib/bio/ngs/utils.rb +11 -1
data/lib/bio-ngs.rb +1 -0
data/lib/tasks/convert.thor +16 -0
data/lib/tasks/pre.thor +130 -0
data/lib/tasks/quality.thor +3 -4
data/lib/tasks/rna.thor +2 -1
metadata +90 -66

data/lib/bio/appl/ngs/cufflinks.rb CHANGED Viewed

@@ -20,41 +20,55 @@ module Bio
       end
-      # cufflinks v1.0.2 (2335)
+      # cufflinks v1.3.0
       # linked against Boost version 104000
       # -----------------------------
       # Usage:   cufflinks [options] <hits.sam>
-      # Options:
-      #
+      # General Options:
+      #   -o/--output-dir              write all output files to this directory              [ default:     ./ ]
       #   -p/--num-threads             number of threads used during analysis                [ default:      1 ]
-      #   -L/--label                   assembled transcripts have this ID prefix             [ default:   CUFF ]
+      #   --seed                       value of random number generator seed                 [ default:      0 ]
       #   -G/--GTF                     quantitate against reference transcript annotations
-      #   -F/--min-isoform-fraction    suppress transcripts below this abundance level       [ default:   0.15 ]
-      #   -f/--min-intron-fraction     filter spliced alignments below this level            [ default:   0.05 ]
-      #   -j/--pre-mrna-fraction       suppress intra-intronic transcripts below this level  [ default:   0.15 ]
-      #   -I/--max-intron-length       ignore alignments with gaps longer than this          [ default: 300000 ]
-      #   -Q/--min-map-qual            ignore alignments with lower than this mapping qual   [ default:      0 ]
+      #   -g/--GTF-guide               use reference transcript annotation to guide assembly
       #   -M/--mask-file               ignore all alignment within transcripts in this file
-      #   -v/--verbose                 log-friendly verbose processing (no progress bar)     [ default:  FALSE ]
-      #   -q/--quiet                   log-friendly quiet processing (no progress bar)       [ default:  FALSE ]
-      #   -o/--output-dir              write all output files to this directory              [ default:     ./ ]
-      #   -r/--reference-seq           reference fasta file for sequence bias correction     [ default:   NULL ]
+      #   -b/--frag-bias-correct       use bias correction - reference fasta required        [ default:   NULL ]
+      #   -u/--multi-read-correct      use 'rescue method' for multi-reads (more accurate)   [ default:  FALSE ]
+      #   --library-type               library prep used for input reads                     [ default:  below ]
       #
-      # Advanced Options:
+      # Advanced Abundance Estimation Options:
+      #   -m/--frag-len-mean           average fragment length (unpaired reads only)         [ default:    200 ]
+      #   -s/--frag-len-std-dev        fragment length std deviation (unpaired reads only)   [ default:     80 ]
+      #   --upper-quartile-norm        use upper-quartile normalization                      [ default:  FALSE ]
+      #   --max-mle-iterations         maximum iterations allowed for MLE calculation        [ default:   5000 ]
+      #   --num-importance-samples     number of importance samples for MAP restimation      [ default:   1000 ]
+      #   --compatible-hits-norm       count hits compatible with reference RNAs only        [ default:  FALSE ]
+      #   --total-hits-norm            count all hits for normalization                      [ default:  TRUE  ]
       #
-      #   -N/--quartile-normalization  use quartile normalization instead of total counts    [ default:  FALSE ]
-      #   -a/--junc-alpha              alpha for junction binomial test filter               [ default:   0.01 ]
-      #   -A/--small-anchor-fraction   percent read overhang taken as 'suspiciously small'   [ default:   0.12 ]
-      #   -m/--frag-len-mean           the average fragment length                           [ default:    200 ]
-      #   -s/--frag-len-std-dev        the fragment length standard deviation                [ default:     80 ]
+      # Advanced Assembly Options:
+      #   -L/--label                   assembled transcripts have this ID prefix             [ default:   CUFF ]
+      #   -F/--min-isoform-fraction    suppress transcripts below this abundance level       [ default:   0.10 ]
+      #   -j/--pre-mrna-fraction       suppress intra-intronic transcripts below this level  [ default:   0.15 ]
+      #   -I/--max-intron-length       ignore alignments with gaps longer than this          [ default: 300000 ]
+      #   -a/--junc-alpha              alpha for junction binomial test filter               [ default:  0.001 ]
+      #   -A/--small-anchor-fraction   percent read overhang taken as 'suspiciously small'   [ default:   0.09 ]
       #   --min-frags-per-transfrag    minimum number of fragments needed for new transfrags [ default:     10 ]
       #   --overhang-tolerance         number of terminal exon bp to tolerate in introns     [ default:      8 ]
-      #   --num-importance-samples     number of importance samples for MAP restimation      [ default:   1000 ]
-      #   --max-mle-iterations         maximum iterations allowed for MLE calculation        [ default:   5000 ]
-      #   --library-type               Library prep used for input reads                     [ default:  below ]
       #   --max-bundle-length          maximum genomic length allowed for a given bundle     [ default:3500000 ]
-      #   --max-bundle-frags           maximum fragments allowed in a bundle before skipping [ default: 500000 ]
+      #   --max-bundle-frags           maximum fragments allowed in a bundle before skipping [ default: 500000 ]
       #   --min-intron-length          minimum intron size allowed in genome                 [ default:     50 ]
+      #   --trim-3-avgcov-thresh       minimum avg coverage required to attempt 3' trimming  [ default:     10 ]
+      #   --trim-3-dropoff-frac        fraction of avg coverage below which to trim 3' end   [ default:    0.1 ]
+      #
+      # Advanced Reference Annotation Guided Assembly Options:
+      #   --no-faux-reads              disable tiling by faux reads                          [ default:  FALSE ]
+      #   --3-overhang-tolerance       overhang allowed on 3' end when merging with reference[ default:    600 ]
+      #   --intron-overhang-tolerance  overhang allowed inside reference intron when merging [ default:     30 ]
+      #
+      # Advanced Program Behavior Options:
+      #   -v/--verbose                 log-friendly verbose processing (no progress bar)     [ default:  FALSE ]
+      #   -q/--quiet                   log-friendly quiet processing (no progress bar)       [ default:  FALSE ]
+      #   --no-update-check            do not contact server to check for update availability[ default:  FALSE ]
+      #
       # Supported library types:
       #   ff-firststrand
       #   ff-secondstrand
@@ -62,48 +76,69 @@ module Bio
       #   fr-firststrand
       #   fr-secondstrand
       #   fr-unstranded (default)
-      #   transfrags
+      #   transfrags
       class Quantification
         include Bio::Command::Wrapper
+        include Bio::Ngs::Cufflinks::Utils
         set_program Bio::Ngs::Utils.binary("cufflinks")
+        add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
         add_option "num-threads", :type => :numeric, :aliases => '-p', :default => 1
-        add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
+        add_option "seed", :type => :numeric
         add_option "GTF", :type => :string, :aliases => '-G'
+        add_option "GTF-guide", :type => :boolean, :aliases => '-g'
+        add_option "mask-file", :type => :string, :aliases => '-M'
+        add_option "frag-bias-correct", :type => :string, :aliases => '-b'
+        add_option "multi-read-correct", :type => :boolean, :aliases => '-u'
+        add_option "library-type", :type => :string
+        add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
+        add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
+        add_option "upper-quartile-norm", :type => :boolean
+        add_option "max-mle-iterations", :type => :numeric#, :default => 5000
+        add_option "num-importance-samples", :type => :numeric#, :default => 1000
+        add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
+        add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
+        add_option "label", :type => :string, :aliases => '-L', :default => "CUFF"
         add_option "min-isoform-fraction", :type => :numeric, :aliases => '-F', :default => 0.15
-        add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
         add_option "pre-mrna-fraction", :type => :numeric, :aliases => '-j', :default => 0.15
+#deprecated        add_option "min-intron-fraction", :type => :numeric, :aliases => '-f', :default => 0.05
         add_option "max-intron-length", :type => :numeric, :aliases => '-I', :default => 300000
-        add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
-        add_option "mask-file", :type => :string, :aliases => '-M'
-        add_option "verbose", :type => :boolean, :aliases => '-v'
-        add_option "quiet", :type => :boolean, :aliases => '-q'
-        add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
-        add_option "reference-seq", :type => :string, :aliases => '-r'
-        add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
         add_option "junc-alpha", :type => :numeric, :aliases => '-a', :default => 0.01
         add_option "small-anchor-fraction", :type => :numeric, :aliases => '-A', :default => 0.12
-        #TODO Check why with these defaults is not working properly
-        add_option "farg-len-mean", :type => :numeric, :aliases => '-m'#, :default => 200
-        add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'#, :default => 80
         add_option "min-frags-per-transfrag", :type => :numeric#, :default => 10
         add_option "overhang-tolerance", :type => :numeric#, :default => 8
-        add_option "num-importance-samples", :type => :numeric#, :default => 1000
-        add_option "max-mle-iterations", :type => :numeric#, :default => 5000
-        add_option "library-type", :type => :string
         add_option "max-bundle-length", :type => :numeric #, :default => 3500000
         add_option "max-bundle-frags", :type => :numeric #, :default => 500000
         add_option "min-intron-length", :type => :numeric#, :default => 50
+        add_option "trim-3-avgcov-thresh", :type => :numeric
+        add_option "trim-3-dropoff-frac", :type => :numeric
+        add_option "no-faux-reads", :type => :boolean
+        add_option "3-overhang-tolerance", :type => :numeric
+        add_option "intron-overhang-tolerance", :type => :numeric
+        add_option "verbose", :type => :boolean, :aliases => '-v'
+        add_option "quiet", :type => :boolean, :aliases => '-q'
+#deprecated        add_option "min-map-qual", :type => :numeric, :aliases => '-Q', :default => 0
+#deprecated        add_option "reference-seq", :type => :string, :aliases => '-r'
+#deprecated        add_option "quartile-normalization", :type => :boolean, :aliases => '-N'
+        #TODO Check why with these defaults is not working properly
+        add_iterator_for :genes
+        add_iterator_for :isoforms
       end #Quantification
-      # cuffdiff v1.0.2 (2336)
+      # cuffdiff v1.3.0 (3022)
       # -----------------------------
       # Usage:   cuffdiff [options] <transcripts.gtf> <sample1_hits.sam> <sample2_hits.sam> [... sampleN_hits.sam]
       #    Supply replicate SAMs as comma separated lists for each condition: sample1_rep1.sam,sample1_rep2.sam,...sample1_repM.sam
       # General Options:
       #   -o/--output-dir              write all output files to this directory              [ default:     ./ ]
+      #   --seed                       value of random number generator seed                 [ default:      0 ]
       #   -T/--time-series             treat samples as a time-series                        [ default:  FALSE ]
       #   -c/--min-alignment-count     minimum number of alignments in a locus for testing   [ default:   10 ]
       #   --FDR                        False discovery rate used in testing                  [ default:   0.05 ]
@@ -119,14 +154,23 @@ module Bio
       #   -m/--frag-len-mean           average fragment length (unpaired reads only)         [ default:    200 ]
       #   -s/--frag-len-std-dev        fragment length std deviation (unpaired reads only)   [ default:     80 ]
       #   --num-importance-samples     number of importance samples for MAP restimation      [ default:   1000 ]
+      #   --num-bootstrap-samples      Number of bootstrap replications                      [ default:     20 ]
+      #   --bootstrap-fraction         Fraction of fragments in each bootstrap sample        [ default:    1.0 ]
       #   --max-mle-iterations         maximum iterations allowed for MLE calculation        [ default:   5000 ]
-      #   --compatible-hits-norm       count hits compatible with reference RNAs only        [ default:  TRUE  ]
+      #   --compatible-hits-norm       count hits compatible with reference RNAs only        [ default:   TRUE ]
       #   --total-hits-norm            count all hits for normalization                      [ default:  FALSE ]
       #   --poisson-dispersion         Don't fit fragment counts for overdispersion          [ default:  FALSE ]
       #   -v/--verbose                 log-friendly verbose processing (no progress bar)     [ default:  FALSE ]
       #   -q/--quiet                   log-friendly quiet processing (no progress bar)       [ default:  FALSE ]
       #   --no-update-check            do not contact server to check for update availability[ default:  FALSE ]
       #   --emit-count-tables          print count tables used to fit overdispersion         [ default:  FALSE ]
+      #   --max-bundle-frags           maximum fragments allowed in a bundle before skipping [ default: 500000 ]
+      #
+      # Debugging use only:
+      #   --read-skip-fraction         Skip a random subset of reads this size               [ default:    0.0 ]
+      #   --no-read-pairs              Break all read pairs                                  [ default:  FALSE ]
+      #   --trim-read-length           Trim reads to be this long (keep 5' end)              [ default:   none ]
+      #   --cov-delta                  Maximum gap between bootstrap and IS                  [ default:   2.0  ]
       #
       # Supported library types:
       #   ff-firststrand
@@ -138,10 +182,12 @@ module Bio
       #   transfrags
       class Diff
         include Bio::Command::Wrapper
+        include Bio::Ngs::Cufflinks::Utils
         set_program Bio::Ngs::Utils.binary("cuffdiff")
         add_option "output-dir", :type => :string, :aliases => '-o', :default => "./"
+        add_option "seed", :type => :numeric
         add_option "time-series", :type => :boolean, :aliases => '-T'
         add_option "min-alignment-count", :type => :numeric, :aliases => '-c'
         add_option "FDR", :type => :numeric, :aliases => '-F'
@@ -155,6 +201,8 @@ module Bio
         add_option "frag-len-mean", :type => :numeric, :aliases => '-m'
         add_option "frag-len-std-dev", :type => :numeric, :aliases => '-s'
         add_option "num-importance-samples", :type => :numeric, :aliases => '-i'
+        add_option "num-bootstrap-samples", :type => :numeric
+        add_option "bootstrap-fraction", :type => :numeric
         add_option "max-mle-iterations", :type => :numeric, :aliases => '-e'
         add_option "compatible-hits-norm", :type => :boolean, :aliases => '-h'
         add_option "total-hits-norm", :type => :boolean, :aliases => '-t'
@@ -163,13 +211,24 @@ module Bio
         add_option "quiet", :type => :boolean, :aliases => '-q'
         add_option "no-update-check", :type => :boolean, :aliases => '-j'
         add_option "emit-count-tables", :type => :boolean, :aliases => '-b'
+        add_option "max-bundle-frags", :type => :numeric
+        add_option "read-skip-fraction", :type => :numeric
+        add_option "no-read-pairs", :type => :numeric
+        add_option "trim-read-length", :type => :numeric
+        add_option "cov-delta", :type => :numeric
+        #define iterators
+        add_iterator_for :genes
+        add_iterator_for :isoforms
+        add_iterator_for :cds
+        add_iterator_for :tss_groups
         #Examples
         #Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/isoform_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,3,0.6,false,true)
         #Bio::Ngs::Cufflinks::Diff.genes("/Users/bonnalraoul/Desktop/RRep16giugno/DE_lane1-2-3-4-6-8/DE_lane1-2-3-4-6-8/gene_exp.diff", "/Users/bonnalraoul/Desktop/RRep16giugno/COMPARE_lane1-2-3-4-6-8/COMPARE_lane1-2-3-4-6-8.combined.gtf",1.0,5,0.5,false,true)
         class << self
           #Return the version of CuffDiff used to produce the output
           def version(diff)
             #cufflink_version_offset = Bio::Ngs::Cufflinks.version
@@ -193,7 +252,7 @@ module Bio
             1
           end
         end
         #write a file with the information
         #See process_de for options available
         # Example: Bio::Ngs::Cufflinks::Diff.isoforms("/Users/bonnalraoul/Desktop/RRep16giugno/DEPopNormNOTh2s1NOTh17s1_lane1-2-3-4-6-8/isoform_exp.diff",
@@ -212,7 +271,11 @@ module Bio
         # fold:0.5,min_samples:5,min_fpkm:0.5,z_scores:true, :regulated=>:up)
         def genes(diff, gtf, options={})
           process_de(diff, gtf, options) do |dict_info, diff_reference, gtf_kb, fpkm_values|
-            "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
+#            puts diff_reference
+#            puts fpkm_values
+            # "#{dict_info[:winner].first}\t#{gtf_kb[diff_reference][:gene_name]}\t#{fpkm_values.join("\t")}"
+            #do not use the gtf kb
+            "#{dict_info[:winner].first}\t#{dict_info[:gene_name]}\t#{fpkm_values.join("\t")}"
           end
         end #genes
@@ -220,7 +283,9 @@ module Bio
         #Options hash
         # :fold(float), :min_samples(integer), :min_fpkm(float), :only_significative(boolean, false) , :z_score(boolean, false)
         # :regulated(symbol :up or :down default :up)
+        # :fpkm_log_two (:true :false, default :true)
         def process_de(diff, gtf, options={})
+          #init default options
           fold = options[:fold] || 0.0
           min_samples = options[:min_samples] || 0
           min_fpkm = options[:min_fpkm] || 0.0
@@ -228,12 +293,15 @@ module Bio
           z_scores = options[:z_scores] || false
           #TODO improve check on parameters
           regulated =options[:regulated] || :up
+          fpkm_log_two = options[:fpkm_log_two] || true
+          force_not_significative = options[:force_not_significative] || false
-          gtf_kb = Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf)  ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
+          #set up the kb if not available = pass an option with the path of the kb ?
+          gtf_kb = nil###### Bio::Ngs::Cufflinks::Compare.exists_kb?(gtf)  ? Bio::Ngs::Cufflinks::Compare.load_compare_kb(gtf) : Bio::Ngs::Cufflinks::Compare.build_compare_kb(gtf)
           #convert log2 fold value into natural log value (internally computed by cuffdiff)
           fold_log2 = fold
-          fold = fold==0 ? 0.0 : (fold*Math.log(2))
+          (fold = fold==0 ? 0.0 : (fold*Math.log(2))) unless fpkm_log_two
           dict=Hash.new {|h, k| h[k]=Hash.new{|hh,kk| hh[kk]=[]}; }
           dict_samples = Hash.new{|h,k| h[k]=""}
@@ -243,78 +311,105 @@ module Bio
           File.open(diff,'r') do |f|
             header=f.readline #skip header
+            test_id_idx = 0
+            gene_name_idx = 2
+            q_first_idx = 3 + cufflink_version_offset
+            q_second_idx = 4 + cufflink_version_offset
+            fpkm_first_idx = 6 + cufflink_version_offset
+            fpkm_second_idx = 7 + cufflink_version_offset
+            fold_idx = 8 + cufflink_version_offset
+            significant_idx = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
+            #Notes:
+            # for each row of the diff, store the expression value of each test,
+            # i.e. the fpkm and whether it is significant or not
-            q_first = 3 + cufflink_version_offset
-            q_second = 4 + cufflink_version_offset
-            fpkm_first = 6 + cufflink_version_offset
-            fpkm_second = 7 + cufflink_version_offset
-            fold_position = 8 + cufflink_version_offset
-            significant_position = 11 + cufflink_version_offset + (cufflink_version_offset==1 ? 1 : 0)
             f.each_line do |line|
               data=line.split
-              if data[fold_position].to_f<=0
-                data[fold_position]=data[fold_position].sub(/-/,"")
+              #fix comparison t-test, remove negative symbol and invert comparison: if fold change q1 vs q2 <0 abs(foldchange) & swap q1,q2
+#              puts data[fold_idx].to_f
+#delete puts "#{data[fold_idx].to_f} #{data[fold_idx].to_f<0}"
+              if data[fold_idx].to_f<0
+                data[fold_idx]=data[fold_idx][1..-1] #.sub(/-/,"")  remove the minus symbol from the number, the values q1, q2 and their fpkm will be reorganized into the data structure
               else
-                a=data[fpkm_second]
-                data[fpkm_second]=data[fpkm_first]
-                data[fpkm_first]=a
-                a=data[q_second]
-                data[q_second]=data[q_first]
-                data[q_first]=a
+#                puts "ciao"
+                data[fpkm_first_idx],data[fpkm_second_idx]=data[fpkm_second_idx],data[fpkm_first_idx]
+                data[q_first_idx],data[q_second_idx]=data[q_second_idx],data[q_first_idx]
+#delete                puts "#{q_first_idx},#{q_second_idx}"
               end
+#delete                              puts "#{q_first_idx},#{q_second_idx}"
+#delete              puts "#{data[q_first_idx].to_sym} #{data[q_second_idx].to_sym}"
+#delete              puts "#{data[fpkm_first_idx].to_sym} #{data[fpkm_second_idx].to_sym}"
               #0 TCONS
               #4 name sample is the max diff for the item
               #5 name sample is the less diff for the item
               #9 is the fold
-              dict_samples[data[q_first]]
-              dict_samples[data[q_second]]
+              dict_samples[data[q_first_idx]]
+              dict_samples[data[q_second_idx]]
               #7 is the fpkm value of max pop/sample
               #8 is the fpkm value of min pop/sample
-              if ((only_significative==true && data[significant_position]=="yes") ||  (data[significant_position]=="yes" && data[fold_position].to_f>=fold)) && data[fpkm_first].to_f>=min_fpkm && data[fpkm_second].to_f>=min_fpkm
-                k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
+              k_reference = data[test_id_idx].to_sym #This can be TCONS if isoforms or XLOC if genes
+              unless dict[k_reference].key?(:values)
+                dict[k_reference][:values]={}
+                dict[k_reference][:gene_name]=data[gene_name_idx]
+              end
+              dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
+              dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
+              if ((only_significative==true && data[significant_idx]=="yes") ||  ((data[significant_idx]=="yes"||force_not_significative) && data[fold_idx].to_f>=fold)) && data[fpkm_first_idx].to_f>=min_fpkm && data[fpkm_second_idx].to_f>=min_fpkm
                ###### puts data.join(" ") if k_reference == :XLOC_017497
                 #TODO refactor: this can be done using lambda
-                k_sample = case regulated
-                when :up
-                  k_sample = data[q_first].to_sym
-                  dict[k_reference][k_sample]<<data[q_second].to_sym
+                k_sample = ""
+                if regulated==:up
+                  k_sample = data[q_first_idx].to_sym
+#delete                  puts "#{k_sample} #{data[q_second_idx].to_sym}"
+                  dict[k_reference][k_sample]<<data[q_second_idx].to_sym
+#delete                   puts "#{k_reference} #{q_first_idx}, #{q_second_idx}"
                   k_sample
-                when :down
-                  k_sample = data[q_second].to_sym
-                  dict[k_reference][k_sample]<<data[q_first].to_sym
+                elsif regulated==:down
+                  k_sample = data[q_second_idx].to_sym
+                  dict[k_reference][k_sample]<<data[q_first_idx].to_sym
                   k_sample
                 end
-             #   puts dict[k_reference].inspect if k_reference == :XLOC_017497
-                unless dict[k_reference].key?(:values)
-                  dict[k_reference][:values]={}
-                end
+             #delete   puts dict[k_reference].inspect if k_reference == :XLOC_017497
+                #delete puts dict.inspect
                 #store fpkm values as well for each pop/sample it should be
-                dict[k_reference][:values][k_sample]=data[fpkm_first].to_f unless dict[k_reference][:values].key?(k_sample)
-                dict[k_reference][:values][data[q_second].to_sym]=data[fpkm_second].to_f unless dict[k_reference][:values].key?(data[q_second].to_sym)
                 if dict[k_reference][k_sample].size >= min_samples
-                  dict[k_reference][:winner] << k_sample
+                  (dict[k_reference][:winner] << k_sample).uniq!
                 end
-          #      puts dict[k_reference].inspect if k_reference == :XLOC_017497
+          #delete      puts dict[k_reference].inspect if k_reference == :XLOC_017497
               else
-                #TODO add threshold value below min fpkm
-                #dict[k_reference][:values][k_sample]=data[6].to_f
-                #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
+                # k_reference = data[0].to_sym #This can be TCONS if isoforms or XLOC if genes
+                #
+                # unless dict[k_reference].key?(:values)
+                #   dict[k_reference][:values]={}
+                # end
+                # #TODO add threshold value below min fpkm
+                # dict[k_reference][:values][data[q_first_idx].to_sym]=data[fpkm_first_idx].to_f unless dict[k_reference][:values].key?(data[q_first_idx].to_sym)
+                # dict[k_reference][:values][data[q_second_idx].to_sym]=data[fpkm_second_idx].to_f unless dict[k_reference][:values].key?(data[q_second_idx].to_sym)
+                # #dict[k_reference][:values][data[4].to_sym]=data[7].to_f
               end
+#delete              puts dict[k_reference].inspect
             end #each line
             #example structure
             #{:TCONS_00086164=>{:q5=>[:q1, :q2, :q3, :q6]}, :TCONS_00086166=>{:q5=>[:q1, :q2, :q3, :q4, :q6]}
           end #file.open
           file_lines =[]
           dict.each do |diff_reference, dict_info|
             if dict_info.key?(:winner)
+              #puts dict_info.inspect
               #BAD PERFORMANCES use lambda
               valz = case z_scores

data/lib/bio/appl/ngs/fastx.rb CHANGED Viewed

@@ -55,7 +55,7 @@ module Bio
       #                If [-o] is specified,  report will be printed to STDOUT.
       #                If [-o] is not specified (and output goes to STDOUT),
       #                report will be printed to STDERR.
-      class Trim
+      class QualityTrim
         include Bio::Command::Wrapper
         set_program Bio::Ngs::Utils.binary("fastq_quality_trimmer")
         use_aliases
@@ -72,6 +72,26 @@ module Bio
         report will be printed to STDERR."
         add_option :quality_type,  :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
       end #QualityTrim
+ 	    # [-f N]       = First base to keep. Default is 1 (=first base).
+ 	    # [-l N]       = Last base to keep. Default is entire read.
+ 	    # [-t N]       = Trim N nucleotides from the end of the read.
+      #               '-t'  can not be used with '-l' and '-f'.
+ 	    # [-z]         = Compress output with GZIP.
+ 	    # [-i INFILE]  = FASTA/Q input file. default is STDIN.
+ 	    # [-o OUTFILE] = FASTA/Q output file. default is STDOUT.
+      class Trim
+        include Bio::Command::Wrapper
+        set_program Bio::Ngs::Utils.binary("fastx_trimmer")
+        use_aliases
+        add_option :first_base, :type => :numeric, :aliases => "-f", :desc => "First base to keep"
+        add_option :last_base, :type => :numeric, :aliases => "-l", :desc => "Last base to keep"
+        add_option :compress, :type => :boolean, :aliases => "-z", :desc => "Compress output with GZIP"
+        add_option :input, :type => :string, :aliases => "-i", :desc => "Input FASTA/Q file", :collapse => true
+        add_option :output, :type => :string, :aliases => "-o", :desc => "Output FASTA/Q file", :collapse => true
+        add_option :trim, :type => :numeric, :aliases => "-t", :desc => "Trim N nucleotides from the end of the read"
+        add_option :quality_type,  :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
+      end
       # Solexa-Quality BoxPlot plotter
       # Generates a solexa quality score box-plot graph
@@ -90,6 +110,7 @@ module Bio
         add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
         add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
         add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
+        add_option :quality_type,  :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
       end #ReadsBoxPlot
       # Solexa-Reads coverage plotter
@@ -109,6 +130,7 @@ module Bio
         add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file."
         add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file."
         add_option :title, :type => :string, :aliases => "-t", :desc => "Title (usually the solexa file name) - will be plotted on the graph."
+        add_option :quality_type,  :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
       end #ReadsCoverage
@@ -163,6 +185,7 @@ module Bio
         add_option :output, :type=>:string, :aliases => "-o", :desc => "FASTQ output file.", :collapse=>true
         add_option :input, :type=>:string, :aliases => "-i", :desc => "FASTQ input file.", :collapse=>true
         add_option :new_format, :type => :boolean, :aliases => "-N", :desc => "New output format (with more information per nucleotide/cycle)."
+        add_option :quality_type,  :type=>:numeric, :default => 33, :aliases => "-Q", :desc=>"Quality of fastq file"
       end #ReadsCoverage
     end #Fastx

data/lib/bio/appl/ngs/tophat.rb CHANGED Viewed

@@ -27,7 +27,11 @@
 #     -i/--min-intron-length         <int>       [ default: 50               ]
 #     -I/--max-intron-length         <int>       [ default: 500000           ]
 #     -g/--max-multihits             <int>       [ default: 20               ]
-#     -F/--min-isoform-fraction      <float>     [ default: 0.15             ]
+#     -x/--transcriptome-max-hits    <int>       [ default: 60               ]
+#     -n/--transcriptome-mismatches  <int>       [ default: 1                ]
+#     -M/--prefilter-multihits                   ( for -G/--GTF option, enable
+#                                                  an initial bowtie search
+#                                                  against the genome )
 #     --max-insertion-length         <int>       [ default: 3                ]
 #     --max-deletion-length          <int>       [ default: 3                ]
 #     --solexa-quals
@@ -40,7 +44,9 @@
 #     --library-type                 <string>    (fr-unstranded, fr-firststrand,
 #                                                 fr-secondstrand)
 #     -p/--num-threads               <int>       [ default: 1                ]
-#     -G/--GTF                       <filename>
+#     -G/--GTF                       <filename>  (GTF/GFF with known transcripts)
+#     --transcriptome-index          <bwtidx>    (transcriptome bowtie index)
+#     -T/--transcriptome-only                    (map only to the transcriptome)
 #     -j/--raw-juncs                 <filename>
 #     --insertions                   <filename>
 #     --deletions                    <filename>
@@ -59,10 +65,11 @@
 #     --keep-tmp
 #     --tmp-dir                      <dirname>   [ default: <output_dir>/tmp ]
 #     -z/--zpacker                   <program>   [ default: gzip             ]
-#     -X/--unmapped-fifo                         [ use mkfifo to compress more temporary files]
+#     -X/--unmapped-fifo                         ( use mkfifo to compress
+#                                                  more temporary files      )
 #
 # Advanced Options:
-#     --initial-read-mismatches      <int>       [ default: 2                ]
+#     -N/--initial-read-mismatches   <int>       [ default: 2                ]
 #     --segment-mismatches           <int>       [ default: 2                ]
 #     --segment-length               <int>       [ default: 25               ]
 #     --bowtie-n                                 [ default: bowtie -v        ]
@@ -73,10 +80,10 @@
 #     --max-coverage-intron          <int>       [ default: 20000            ]
 #     --min-segment-intron           <int>       [ default: 50               ]
 #     --max-segment-intron           <int>       [ default: 500000           ]
-#     --no-sort-bam                              [Output BAM is not coordinate-sorted]
-#     --no-convert-bam                           [Do not convert to bam format.
+#     --no-sort-bam                              (Output BAM is not coordinate-sorted)
+#     --no-convert-bam                           (Do not convert to bam format.
 #                                                 Output is <output_dir>/accepted_hits.sam.
-#                                                 Implies --no-sort-bam.]
+#                                                 Implies --no-sort-bam)
 #
 # SAM Header Options (for embedding sequencing run metadata in output):
 #     --rg-id                        <string>    (read group ID)
@@ -105,7 +112,9 @@ module Bio
       add_option "min-intron-length", :type => :numeric , :aliases => '-i'
       add_option "max-intron-length", :type => :numeric, :aliases => '-I'
       add_option "max-multihits", :type => :numeric, :aliases => '-g'
-      add_option "min-isoform_fraction", :type => :numeric, :aliases => '-F'
+      add_option "transcriptome-max-hits", :type => :numeric, :aliases =>'-x'
+      add_option "transcriptome-mismatches", :type => :numeric, :aliases =>'-n'
+      add_option "prefilter-multihits", :type => :boolean, :aliases =>'-M'
       add_option "max-insertion-length", :type => :numeric
       add_option "max-deletion-length", :type => :numeric
       add_option "solexa-quals", :type => :boolean
@@ -116,6 +125,8 @@ module Bio
       add_option "library-type", :type => :string
       add_option "num-threads", :type => :numeric, :aliases => '-p'
       add_option "GTF", :type => :string, :aliases => '-G'
+      add_option "transcriptome-index", :type => :string
+      add_option "transcriptome-only", :type => :boolean
       add_option "raw-juncs", :type => :string, :aliases => '-j'
       add_option :insertions, :type => :string
       add_option :deletions, :type => :string
@@ -135,6 +146,9 @@ module Bio
       add_option "no-butterfly-search", :type => :boolean
       add_option "keep-tmp", :type => :boolean
       add_option "tmp-dir", :type => :string
+      add_option "zpacker", :type => :string, :aliases => '-z'
+      add_option "unmapped-fifo", :type => :boolean, :aliases => '-X'
+      add_option "initial-read-mismatches", :type => :int, :aliases => '-N'
       add_option "segment-mismatches", :type => :numeric
       add_option "segment-length", :type => :numeric
       add_option "min-closure-exon", :type => :numeric
@@ -144,6 +158,8 @@ module Bio
       add_option "max-coverage-intron", :type => :numeric
       add_option "min-segment-intron", :type => :numeric
       add_option "max-segment-intron", :type => :numeric
+      add_option "no-sort-bam", :type => :boolean
+      add_option "no-convert-bam", :type => :boolean
       add_option "rg-id", :type => :string
       add_option "rg-sample", :type => :string
       add_option "rg-library", :type => :string

data/lib/bio/ngs/ext/versions.yaml CHANGED Viewed

@@ -15,16 +15,16 @@ common:
         type:     source
 linux:
     cufflinks:
-        version:  1.1.0
-        url:      http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.Linux_x86_64.tar.gz
-        basename: cufflinks-1.1.0.Linux_x86_64
+        version:  1.3.0
+        url:      http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.Linux_x86_64.tar.gz
+        basename: cufflinks-1.3.0.Linux_x86_64
         suffix:   tar.gz
         desc:     ""
         type:     binary
     tophat:
-        version:  1.3.2
-        url:      http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.Linux_x86_64.tar.gz
-        basename: tophat-1.3.2.Linux_x86_64
+        version:  1.4.1
+        url:      http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.Linux_x86_64.tar.gz
+        basename: tophat-1.4.1.Linux_x86_64
         suffix:   tar.gz
         desc:     ""
         type:     binary
@@ -44,16 +44,16 @@ linux:
     #     type:     binary
 osx:
     cufflinks:
-        version:  1.1.0
-        url:      http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.1.0.OSX_x86_64.tar.gz
-        basename: cufflinks-1.1.0.OSX_x86_64
+        version:  1.3.0
+        url:      http://cufflinks.cbcb.umd.edu/downloads/cufflinks-1.3.0.OSX_x86_64.tar.gz
+        basename: cufflinks-1.3.0.OSX_x86_64
         suffix:   tar.gz
         desc:     ""
         type:     binary
     tophat:
-        version:  1.3.2
-        url:      http://tophat.cbcb.umd.edu/downloads/tophat-1.3.2.OSX_x86_64.tar.gz
-        basename: tophat-1.3.2.OSX_x86_64
+        version:  1.4.1
+        url:      http://tophat.cbcb.umd.edu/downloads/tophat-1.4.1.OSX_x86_64.tar.gz
+        basename: tophat-1.4.1.OSX_x86_64
         suffix:   tar.gz
         desc:     ""
         type:     binary