rbbt-GE 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2010-2011 Miguel Vázquez García
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/lib/rbbt/GE/GEO.rb ADDED
@@ -0,0 +1,283 @@
+ require 'rbbt-util'
+ require 'rbbt/GE'
+ require 'rbbt/sources/organism'
+ require 'rbbt/resource'
+ require 'yaml'
+
+ module GEO
+ extend Resource
+ self.pkgdir = "geo"
+ self.subdir = "arrays"
+
+ GEO.claim GEO.root.find(:user), :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
+
+ def self.comparison_name(field, condition, control)
+ condition = condition * " AND " if Array === condition
+ control = control * " AND " if Array === control
+ [[field, condition] * ": ", [field, control] * ": "] * " => "
+ end
+
+ def self.parse_comparison_name(name)
+ field1, condition1, field2, condition2 = name.match(/(.*): (.*?) => (.*?): (.*)/).values_at(1, 2, 3, 4)
+ condition1 = condition1.split(/ AND /) if condition1 =~ / AND /
+ condition2 = condition2.split(/ AND /) if condition2 =~ / AND /
+
+ [field1, condition1, field2, condition2]
+ end
+
+ def self.platform_info(platform)
+ YAML.load(self[platform]['info.yaml'].produce.read)
+ end
+
+ def self.dataset_info(dataset)
+ YAML.load(self[dataset]['info.yaml'].produce.read)
+ end
+
+ def self.is_control?(value, info)
+ value.to_s.downcase =~ /\bcontrol\b/ or
+ value.to_s.downcase =~ /\bwild/ or
+ value.to_s.downcase =~ /\bnone\b/
+ end
+
+ def self.control_samples(dataset)
+ info = dataset_info(dataset)
+ subsets = info[:subsets]
+
+ control_samples = []
+ subsets.each do |type, values|
+ control_samples.concat values.select{|value,samples| is_control? value, info}.collect{|value,samples| samples.split(",")}.flatten
+ end
+
+ control_samples
+ end
+
+ module SOFT
+
+ GDS_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/#DATASET#.soft.gz"
+ GPL_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_platform/#PLATFORM#/#PLATFORM#_family.soft.gz"
+ GSE_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/#SERIES#/#SERIES#_family.soft.gz"
+
+ GSE_INFO = {
+ :title => "!Series_title",
+ :channel_count => "!Sample_channel_count",
+ :value_type => "!Series_value_type",
+ :platform => "!Series_platform_id",
+ :description => "!Series_summary*", # Join with \n
+ }
+
+ GDS_INFO = {
+ :DELIMITER => "\\^SUBSET",
+ :value_type => "!dataset_value_type",
+ :channel_count => "!dataset_channel_count",
+ :platform => "!dataset_platform",
+ :reference_series => "!dataset_reference_series",
+ :description => "!dataset_description",
+ }
+
+ GDS_SUBSET_INFO = {
+ :DELIMITER => "!subset_.*|!dataset_value_type",
+ :description => "!subset_description",
+ :samples => "!subset_sample_id*",
+ :type => "!subset_type",
+ }
+
+ GPL_INFO = {
+ :DELIMITER => "!platform_table_begin",
+ :organism => "!Platform_organism",
+ :count => "!Platform_data_row_count"
+ }
+
+ # When there are multiple matches, select the most common, unless join is chosen
+ def self.find_field(header, field, join = false)
+ md = header.match(/#{ Regexp.quote field }\s*=\s*(.*)/i)
+ return nil if md.nil? or md.captures.empty?
+
+ case join
+ when false, nil
+ counts = Hash.new(0)
+ md.captures.sort_by{|v| counts[v] += 1}.first
+ when true
+ md.captures * "\n"
+ else
+ md.captures * join
+ end
+ end
+
+ def self.get_info(header, info)
+ result = {}
+
+ info.each do |key, field|
+ next if key == :DELIMITER
+ if field =~ /(.*)\*(.*)(\*)?$/
+ value = find_field(header, $1, $2.empty? ? true : $2)
+ value = value.to_i.to_s == value ? value.to_i : value
+ if $3
+ result[key] = value.split(',')
+ else
+ result[key] = value
+ end
+ else
+ value = find_field(header, field, false)
+ value = value.to_i.to_s == value ? value.to_i : value
+ result[key] = value
+ end
+ end
+
+ if result.empty?
+ nil
+ else
+ result
+ end
+ end
+
+ def self.parse_header(stream, info)
+ header = ""
+ while line = stream.readline
+ header << line
+ break if line =~ /^#{info[:DELIMITER]}/i
+ raise "Delimiter not found" if stream.eof?
+ end
+
+ get_info(header, info)
+ end
+
+ def self.guess_id(organism, codes)
+ num_codes = codes.size
+ best = nil
+ best_count = 0
+ new_fields = []
+ field_counts = {}
+ TmpFile.with_file(codes.to_s) do |codefile|
+
+ codes.all_fields.each_with_index do |field,i|
+ values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
+
+ new_field, count = Organism.guess_id(organism, values)
+ field_counts[new_field] = count
+ Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}"
+ new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
+ if count > best_count
+ best = new_field
+ best_count = count
+ end
+
+ end
+
+ end
+
+ field_counts.delete(new_fields.first)
+ [best, new_fields, field_counts.sort_by{|field, counts| counts}.collect{|field, counts| field}.compact]
+ end
+
+ #{{{ GPL
+
+ def self.GPL(platform, directory)
+ FileUtils.mkdir_p directory unless File.exists? directory
+
+ code_file = File.join(directory, 'codes')
+ info_file = File.join(directory, 'info.yaml')
+
+ stream = Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true, :pipe => true)
+
+ info = parse_header(stream, GPL_INFO)
+ info[:code_file] = code_file
+ info[:data_directory] = directory
+
+ Log.medium "Producing code file for #{ platform }"
+ codes = TSV.open stream, :fix => proc{|l| l =~ /^!platform_table_end/i ? nil : l}, :header_hash => ""
+ Log.low "Original fields: #{codes.key_field} - #{codes.fields * ", "}"
+ stream.force_close
+
+ best_field, all_new_fields, order = guess_id(Organism.organism(info[:organism]), codes)
+
+ new_key_field, *new_fields = all_new_fields
+
+ new_key_field = codes.key_field if new_key_field =~ /^UNKNOWN/
+
+ codes.key_field = new_key_field.dup
+ codes.fields = new_fields.collect{|f| f.dup}
+
+ Log.low "New fields: #{codes.key_field} - #{codes.fields * ", "}"
+
+ Open.write(code_file, codes.reorder(:key, order).to_s(:sort, true))
+ Open.write(info_file, info.to_yaml)
+
+ info
+ end
+
+ def self.dataset_subsets(stream)
+ text = ""
+ while not (line = stream.gets) =~ /!dataset_table_begin/
+ text << line
+ end
+
+ subsets = text.split(/\^SUBSET/).collect do |chunk|
+ get_info(chunk, GDS_SUBSET_INFO)
+ end
+
+ info = {}
+ subsets.each do |subset|
+ type = subset[:type]
+ description = subset[:description]
+ samples = subset[:samples]
+ info[type] ||= {}
+ info[type][description] = samples
+ end
+
+ info
+ end
+
+ def self.GDS(dataset, directory)
+ FileUtils.mkdir_p directory unless File.exists? directory
+
+ value_file = File.join(directory, 'values')
+ info_file = File.join(directory, 'info.yaml')
+
+ stream = Open.open(GDS_URL.gsub('#DATASET#', dataset), :nocache => true)
+
+ info = parse_header(stream, GDS_INFO)
+ info[:value_file] = value_file
+ info[:data_directory] = directory
+
+ info[:subsets] = dataset_subsets(stream)
+
+ Log.medium "Producing values file for #{ dataset }"
+ values = TSV.open stream, :fix => proc{|l| l =~ /^!dataset_table_end/i ? nil : l.gsub(/null/,'NA')}, :header_hash => ""
+ key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
+ values.key_field = key_field
+
+ samples = values.fields.select{|f| f =~ /GSM/}
+
+ Open.write(value_file, values.slice(samples).to_s(:sort, true))
+ Open.write(info_file, info.to_yaml)
+
+ info
+ end
+ end
+
+ def self.compare(dataset, field, condition, control, path)
+ dataset_info = GEO[dataset]["info.yaml"].yaml
+
+ platform = dataset_info[:platform]
+ platform_info = GEO[platform]["info.yaml"].yaml
+
+ log2 = ["count"].include? dataset_info[:value_type]
+ samples = dataset_info[:subsets]
+ value_file = GEO[dataset].values.find.produce
+ format = TSV.parse_header(GEO[platform].codes.open).key_field
+
+ if Array === condition
+ condition_samples = condition.collect{|cond| samples[field][cond].split ","}.flatten
+ else
+ condition_samples = samples[field][condition].split ","
+ end
+
+ if Array === control
+ control_samples = control.collect{|cond| samples[field][cond].split ","}.flatten
+ else
+ control_samples = samples[field][control].split ","
+ end
+
+ GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
+ end
+ end
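
For orientation, a minimal usage sketch of the comparison helpers defined in GEO.rb above. It assumes a working rbbt installation with this gem loaded; the GDS1479 example values are taken from the test suite further down, and the commented output path is hypothetical.

    require 'rbbt/GE/GEO'

    # Build and parse a comparison label (pure string manipulation, no downloads)
    name = GEO.comparison_name "specimen", "carcinoma in situ lesion", "normal mucosa"
    # => "specimen: carcinoma in situ lesion => specimen: normal mucosa"
    field, condition, control_field, control = GEO.parse_comparison_name name

    # Running the comparison itself needs network access to NCBI GEO plus R with limma;
    # GEO.compare writes a TSV of ratios, t statistics and signed p-values to the given path.
    # GEO.compare "GDS1479", field, condition, control, "/tmp/GDS1479.comparison"
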
data/lib/rbbt/GE.rb ADDED
@@ -0,0 +1,36 @@
+ require 'rbbt/util/R'
+
+ module GE
+ LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)),'../../share/lib/R')
+ MA = File.join(LIB_DIR, 'MA.R')
+
+ def self.run_R(command)
+ cmd = "source('#{MA}');" << command
+ R.run(cmd)
+ end
+
+ def self.r_format(list)
+ case
+ when list.nil?
+ "NULL"
+ when Array === list
+ "c(#{list.collect{|e| r_format e} * ", "})"
+ when (String === list and list === list.to_i.to_s)
+ list.to_i
+ when (String === list and list === list.to_f.to_s)
+ list.to_f
+ when TrueClass === list
+ "TRUE"
+ when FalseClass === list
+ "FALSE"
+ else
+ "'#{list.to_s}'"
+ end
+ end
+
+ def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
+ FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
+ GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
+ end
+ end
+
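A rough illustration of how GE builds its R calls: GE.r_format converts Ruby values to R literals, and GE.analyze interpolates them into a single rbbt.GE.process call (defined in share/lib/R/MA.R below). This is a sketch assuming rbbt-util's R bridge is available; the return values follow directly from the case statement above.

    require 'rbbt/GE'

    GE.r_format nil                        # => "NULL"
    GE.r_format true                       # => "TRUE"
    GE.r_format "12"                       # => 12 (integer-looking strings become numbers)
    GE.r_format ["GSM16978", "GSM16979"]   # => "c('GSM16978', 'GSM16979')"
    GE.r_format "Ensembl Gene ID"          # => "'Ensembl Gene ID'"

    # GE.analyze(datafile, main, contrast, log2, outfile, key_field) then runs, roughly:
    #   source('.../share/lib/R/MA.R'); rbbt.GE.process('<datafile>', main = c(...),
    #     contrast = c(...), log2=FALSE, outfile = '<outfile>', key.field = '<key_field>')
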
data/share/install/GEO/Rakefile ADDED
@@ -0,0 +1,24 @@
+
+ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
+ t.name =~ /^(GPL\d+)\/?(codes|info\.yaml)?/
+ platform = $1
+ file = $2
+ GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
+ end
+
+ rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
+ t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
+ dataset = $1
+ file = $2
+ GEO::SOFT.GDS(dataset, file.nil? ? t.name : File.dirname(t.name))
+ end
+
+ rule /^(GDS\d+)\/comparison\/(.*)$/ do |t|
+ t.name =~ /^(GDS\d+)\/comparison\/(.*)/
+ dataset = $1
+ name = $2
+
+ condition_field, condition_name, control_field, control_name = GEO.parse_comparison_name name
+
+ GEO.compare(dataset, condition_field, condition_name, control_name, t.name)
+ end
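
This Rakefile is registered through GEO.claim in GEO.rb, so files requested under the GEO resource are produced on demand by the rules above. A hedged sketch of the mapping, reusing identifiers that appear in the tests:

    require 'rbbt/GE/GEO'

    GEO["GPL999/codes"].produce     # first rule: GEO::SOFT.GPL writes 'codes' and 'info.yaml'
    GEO["GDS750/values"].produce    # second rule: GEO::SOFT.GDS writes 'values' and 'info.yaml'

    # third rule: a comparison path built with GEO.comparison_name triggers GEO.compare
    name = GEO.comparison_name "specimen", "carcinoma in situ lesion", "normal mucosa"
    GEO["GDS1479"].comparison[name].produce
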
data/share/lib/R/MA.R ADDED
@@ -0,0 +1,515 @@
+ library(limma)
+
+ #########################################################################
+ # Model processing
+
+ # Ratio
+ rbbt.GE.process.ratio.oneside <- function(expr){
+ ratio = apply(expr, 1 ,function(x){mean(x, na.rm = TRUE)})
+ names(ratio) <- rownames(expr);
+ return(ratio);
+ }
+
+ rbbt.GE.process.ratio.twoside <- function(expr, contrast){
+ ratio = rbbt.GE.process.ratio.oneside(expr) - rbbt.GE.process.ratio.oneside(contrast)
+ names(ratio) <- rownames(expr);
+ return(ratio);
+ }
+
+ # Limma
+ rbbt.GE.process.limma.oneside <- function(expr, subset = NULL){
+
+ if (is.null(subset)){
+ fit <- lmFit(expr);
+ }else{
+ design = rep(0, dim(expr)[2]);
+ design[names(expr) %in% subset] = 1;
+ }
+
+ fit <- lmFit(expr, design);
+
+ fit <- eBayes(fit);
+
+ sign = fit$t < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign] = - fit$p.value[sign];
+
+ return(list(t= fit$t, p.values= fit$p.value));
+ }
+
+ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
+
+ design = cbind(rep(1,dim(expr)[2]), rep(0,dim(expr)[2]));
+ colnames(design) <-c('intercept', 'expr');
+ design[names(expr) %in% subset.main,] = 1;
+ design[names(expr) %in% subset.contrast,'intercept'] = 1;
+
+ fit <- lmFit(expr, design);
+
+ fit <- eBayes(fit);
+ sign = fit$t[,2] < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign,2] = - fit$p.value[sign,2];
+
+ return(list(t= fit$t[,2], p.values= fit$p.value[,2]));
+ }
+
+
+
+ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
+ data = rbbt.tsv(file);
+ ids = rownames(data);
+
+ if (log2){
+ data = log2(data);
+ }
+
+ if (is.null(contrast)){
+ ratio = rbbt.GE.process.ratio.oneside(subset(data, select=main));
+ }else{
+ ratio = rbbt.GE.process.ratio.twoside(subset(data, select=main), subset(data, select=contrast) );
+ }
+
+ if (is.null(contrast)){
+ limma = NULL;
+ tryCatch({
+ limma = rbbt.GE.process.limma.oneside(data, main);
+ }, error=function(x){
+ cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
+ print(x, file=stderr());
+ tryCatch({
+ limma = rbbt.GE.process.limma.oneside(subset(data, select=main));
+ }, error=function(x){
+ cat("Limma failed for subset dataset.\n", file=stderr());
+ print(x, file=stderr());
+ });
+ })
+ }else{
+ limma = NULL;
+ tryCatch({
+ limma = rbbt.GE.process.limma.twoside(data, main, contrast);
+ }, error=function(x){
+ cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
+ print(x, file=stderr());
+ tryCatch({
+ limma = rbbt.GE.process.limma.twoside(subset(data, select=c(main, contrast)), main, contrast);
+ }, error=function(x){
+ cat("Limma failed for subset dataset.\n", file=stderr());
+ print(x, file=stderr());
+ });
+ })
+
+ }
+
+ if (! is.null(limma)){
+ result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
+ }else{
+ result = data.frame(ratio = ratio)
+ }
+
+ if (is.null(outfile)){
+ return(result);
+ }else{
+ rbbt.tsv.write(outfile, result, key.field, ":type=:list#:cast=:to_f");
+ return(NULL);
+ }
+ }
+
+
+
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ # OLD STUFF
+
+
+ MA.get_order <- function(values){
+ orders = values;
+ orders[,] = NA;
+
+ for (i in 1:dim(values)[2]){
+ positions = names(sort(values[,i],decreasing=T,na.last=NA));
+ orders[,i] = NA;
+ orders[positions,i] = 1:length(positions)
+ }
+ orders
+ }
+
+ MA.guess.do.log2 <- function(m, two.channel){
+ if (two.channel){
+ return (sum(m < 0, na.rm = TRUE) == 0);
+ }else{
+ return (max(m, na.rm = TRUE) > 100);
+ }
+ }
+
+ MA.translate <- function(m, trans){
+ trans[trans==""] = NA;
+ trans[trans=="NO MATCH"] = NA;
+
+ missing = length(trans) - dim(m)[1];
+
+ # If extra genes
+ if (missing < 0){
+ trans = c(trans,rep(NA, - missing));
+ missing = 0;
+ }
+ n = apply(m,2,function(x){
+ # Complete data with missing genes
+ x.complete = c(x,rep(NA, missing));
+ tapply(x.complete, factor(trans), median)
+ });
+ n[sort(rownames(n),index.return=T)$ix,]
+ }
+
+ # Conditions
+
+ MA.conditions.has_control <- function(x){
+ keywords = c('none', 'control', 'normal', 'wild', 'baseline', 'untreat', 'uninfected', 'universal', 'reference', 'vehicle', 'w.t.','wt');
+ for(keyword in keywords){
+ control = grep(keyword, x, ignore.case = TRUE);
+ if (any(control)){
+ return(x[control[1]]);
+ }
+ }
+ return(NULL)
+ }
+
+ MA.condition.values <- function(values){
+ control = MA.conditions.has_control(values);
+
+ values.factor = factor(values);
+ values.levels = levels(values.factor);
+
+ # If there is a control state, remove it from sorting
+ if (!is.null(control))
+ values.levels = values.levels[values.levels != control];
+
+
+ # Use numeric sort if they all have numbers
+ if (length(grep('^ *[0-9]+',values.levels,perl=TRUE)) == length(values.levels)){
+ ix = sort(as.numeric(sub('^ *([0-9]+).*',"\\1",values.levels)), decreasing = T, index.return = TRUE)$ix
+ }else{
+ ix = sort(values.levels, decreasing = T, index.return = TRUE)$ix
+ }
+
+ return(list(values = values.levels[ix], control = control));
+ }
+
+
+ #########################################################################
+ # Model processing
+
+ # Ratio
+ MA.ratio.two_channel <- function(m, conditions, main){
+ main = m[,conditions==main];
+ if (!is.null(dim(main))){
+ main = apply(main, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+ return(main);
+ }
+
+ MA.ratio.contrast <- function(m, conditions, main, contrast){
+ main = m[,conditions==main];
+ if (!is.null(dim(main))){
+ main = apply(main, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+
+ contrast = m[,conditions==contrast];
+ if (!is.null(dim(contrast))){
+ contrast = apply(contrast, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+
+ return (main - contrast);
+ }
+
+
+ # Limma
+
+ MA.limma.two_channel <- function(m, conditions, main){
+ if (sum(conditions == main) < 3){
+ return(NULL);
+ }
+
+ design = rep(0,dim(m)[2]);
+ design[conditions == main] = 1;
+
+ # We need to subset the columns because of a problem with NA values. This
+ # might affect eBayes variance estimations, that's my guess anyway...
+
+ fit <- lmFit(m[,design == 1],rep(1, sum(design)));
+
+ tryCatch({
+ fit <- eBayes(fit);
+ sign = fit$t < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign] = - fit$p.value[sign];
+ return(list(t= fit$t, p.values= fit$p.value));
+ }, error=function(x){
+ print("Exception caught in eBayes");
+ print(x);
+ })
+
+ return(NULL);
+ }
+
+ MA.limma.contrast <- function(m, conditions, main, contrast){
+ if (sum(conditions == main) + sum(conditions == contrast) < 3){
+ return(NULL);
+ }
+ m = cbind(m[,conditions == main],m[,conditions == contrast]);
+
+ design = cbind(rep(1,dim(m)[2]), rep(0,dim(m)[2]));
+ colnames(design) <-c('intercept', 'main');
+ design[1:sum(conditions==main),2] = 1;
+
+
+ fit <- lmFit(m,design);
+ tryCatch({
+ fit <- eBayes(fit);
+ sign = fit$t[,2] < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign,2] = - fit$p.value[sign,2]
+ return(list(t= fit$t[,2], p.values= fit$p.value[,2] ));
+ }, error=function(x){
+ print("Exception caught in eBayes");
+ print(x);
+ })
+
+ return(NULL);
+ }
+
+
+ #########################################################################
+ # Process conditions
+
+ MA.strip_blanks <- function(text){
+ text = sub(' *$', '' ,text);
+ text = sub('^ *', '' ,text);
+
+ return(text);
+ }
+
+ MA.orders <- function(ratios, t){
+ best = vector();
+ names = vector();
+ for (name in colnames(ratios)){
+ if (sum(colnames(t) == name) > 0){
+ best = cbind(best, t[,name]);
+ names = c(names, name);
+ }else{
+ best = cbind(best, ratios[,name]);
+ names = c(names, paste(name,'[ratio]', sep=" "));
+ }
+ }
+ rownames(best) <- rownames(ratios);
+ orders <- as.data.frame(MA.get_order(best));
+ colnames(orders) <- names;
+
+ return(orders);
+ }
+
+ MA.process_conditions.contrasts <- function(m, conditions, two.channel){
+ max_levels = 10;
+ max_levels_control = 1;
+
+
+ values = MA.condition.values(conditions);
+
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ ratio_names = vector();
+ t_names = vector();
+
+ if (!is.null(values$control)){
+ contrast = values$control;
+ for (main in values$values){
+ name = paste(main, contrast, sep = " <=> ")
+
+ ratio = MA.ratio.contrast(m, conditions, main, contrast);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.contrast(m, conditions, main, contrast);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+ }
+
+
+ if (length(values$values) <= max_levels_control || (is.null(values$control) && !two.channel && length(values$values) <= max_levels )){
+
+ remaining = values$values;
+ for (main in values$values){
+ remaining = remaining[remaining != main];
+ for (contrast in remaining){
+ name = paste(main, contrast, sep = " <=> ");
+
+ ratio = MA.ratio.contrast(m, conditions, main, contrast);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.contrast(m, conditions, main, contrast);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+ }
+ }
+
+
+ if (length(ratio_names) != 0){
+ ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
+ colnames(ratios) <- ratio_names
+ }
+
+ if (length(t_names) != 0){
+ t_names = as.vector(sapply(t_names, MA.strip_blanks));
+ colnames(t) <- t_names;
+ colnames(p.values) <- t_names;
+ }
+
+
+ return(list(ratios = ratios, t=t, p.values = p.values));
+ }
+
+ MA.process_conditions.two_channel <- function(m, conditions){
+ values = MA.condition.values(conditions);
+
+ all_values = values$values;
+ if (!is.null(values$control)){
+ all_values = c(all_values, values$control);
+ }
+
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ ratio_names = vector();
+ t_names = vector();
+
+
+ for (main in all_values){
+ name = main;
+
+ ratio = MA.ratio.two_channel(m, conditions, main);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.two_channel(m, conditions, main);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+
+ if (length(ratio_names) != 0){
+ ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
+ colnames(ratios) <- ratio_names
+ }
+
+ if (length(t_names) != 0){
+ t_names = as.vector(sapply(t_names, MA.strip_blanks));
+ colnames(t) <- t_names;
+ colnames(p.values) <- t_names;
+ }
+
+ return(list(ratios = ratios, t=t, p.values = p.values));
+ }
+
+
+
+ # Process microarray matrix
+
+ MA.process <- function(m, conditions_list, two.channel = FALSE){
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ for(type in colnames(conditions_list)){
+ conditions = conditions_list[,type]
+
+ if (two.channel){
+ res = MA.process_conditions.two_channel(m, conditions);
+ if (length(res$ratios) != 0){ colnames(res$ratios) <- sapply(colnames(res$ratios),function(x){paste(type,x,sep=": ")}); ratios = cbind(ratios,res$ratios);}
+ if (length(res$t) != 0){ colnames(res$t) <- sapply(colnames(res$t),function(x){paste(type,x,sep=": ")}); t = cbind(t,res$t);}
+ if (length(res$p.values) != 0){ colnames(res$p.values) <- sapply(colnames(res$p.values),function(x){paste(type,x,sep=": ")}); p.values = cbind(p.values,res$p.values);}
+ }
+
+ res = MA.process_conditions.contrasts(m, conditions, two.channel);
+ if (length(res$ratios) != 0){ colnames(res$ratios) <- sapply(colnames(res$ratios),function(x){paste(type,x,sep=": ")}); ratios = cbind(ratios,res$ratios);}
+ if (length(res$t) != 0){ colnames(res$t) <- sapply(colnames(res$t),function(x){paste(type,x,sep=": ")}); t = cbind(t,res$t);}
+ if (length(res$p.values) != 0){ colnames(res$p.values) <- sapply(colnames(res$p.values),function(x){paste(type,x,sep=": ")}); p.values = cbind(p.values,res$p.values);}
+ }
+
+ orders <- MA.orders(ratios,t);
+ return(list(ratios = ratios, t=t, p.values = p.values, orders=orders));
+ }
+
+
+ MA.save <- function(prefix, orders, ratios, t , p.values, experiments, description = NULL) {
+ if (is.null(orders)){
+ cat("No suitable samples for analysis\n")
+ write(file=paste(prefix,'skip',sep="."), "No suitable samples for analysis" );
+ } else {
+ write.table(file=paste(prefix,'orders',sep="."), orders, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'codes',sep="."), rownames(orders), sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'logratios',sep="."), ratios, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'t',sep="."), t, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'pvalues',sep="."), p.values, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'experiments',sep="."), experiments, sep="\t", row.names=F, col.names=F, quote=F);
+
+ write(file=paste(prefix,'description',sep="."), description)
+ }
+ }
+
+ MA.load <- function(prefix, orders = TRUE, logratios = TRUE, t = TRUE, p.values = TRUE){
+ data = list();
+ genes <- scan(file=paste(prefix,'codes',sep="."),sep="\n",quiet=T,what=character());
+ experiments <- scan(file=paste(prefix,'experiments',sep="."),sep="\n",quiet=T,what=character());
+
+ experiments.no.ratio = experiments[- grep('ratio', experiments)];
+
+ if (orders){
+ orders <- read.table(file=paste(prefix,'orders',sep="."),sep="\t");
+ rownames(orders) <- genes;
+ colnames(orders) <- experiments;
+ data$orders=orders;
+ }
+ if (logratios){
+ logratios <- read.table(file=paste(prefix,'logratios',sep="."),sep="\t");
+ rownames(logratios) <- genes;
+ colnames(logratios) <- experiments;
+ data$logratios=logratios;
+ }
+ if (t){
+ t <- read.table(file=paste(prefix,'t',sep="."),sep="\t");
+ rownames(t) <- genes;
+ colnames(t) <- experiments.no.ratio;
+ data$t=t;
+ }
+ if (p.values){
+ p.values <- read.table(file=paste(prefix,'pvalues',sep="."),sep="\t");
+ rownames(p.values) <- genes;
+ colnames(p.values) <- experiments.no.ratio;
+ data$p.values=p.values;
+ }
+
+
+ return(data);
+
+
+ }
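
rbbt.GE.process writes its result as a TSV keyed by key.field, with a ratio column and, when limma succeeds, t.values and p.values columns; the sign of p.values encodes the direction of change (fit$p.value[sign] = - fit$p.value[sign] above). A sketch of reading such a file back from Ruby, assuming rbbt-util is available; the path and gene identifier are hypothetical placeholders:

    require 'rbbt-util'

    results = TSV.open "/tmp/GDS1479.comparison", :type => :list, :cast => :to_f
    results.fields            # => ["ratio", "t.values", "p.values"] when limma succeeded
    results["some_gene_id"]   # => [log-ratio, t statistic, signed p-value]
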
data/test/rbbt/GE/test_GEO.rb ADDED
@@ -0,0 +1,104 @@
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+ require 'rbbt/GE/GEO'
+
+ class TestClass < Test::Unit::TestCase
+
+ def test_control_sample
+ assert GEO.control_samples('GDS750').include? "GSM16978"
+ end
+
+ def test_GDS
+ assert_equal 'GPL999', GEO.dataset_info('GDS750')[:platform]
+ end
+
+ def test_GPL
+ assert_equal 'Saccharomyces cerevisiae', GEO["GPL999/info.yaml"].yaml[:organism]
+ assert_equal 'Homo sapiens', GEO["GPL570/info.yaml"].yaml[:organism]
+ assert GEO.GPL999.codes.fields.include? "Ensembl Gene ID"
+ end
+
+ def test_normalize
+ dataset = 'GDS750'
+ gene = "YPR191W"
+ id = "6079"
+
+ platform = GEO.GDS(dataset)[:platform]
+ translated = GEO.normalize(platform, ["YPR191W"]).first.first
+
+ assert_equal id, translated
+ end
+
+ def test_analyze_single
+ dataset = 'GDS750'
+ info = GEO.GDS(dataset)
+
+ assert GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"] ).read =~ /1234/;
+ end
+
+ def test_analyze_contrast
+ dataset = 'GDS750'
+ info = GEO.GDS(dataset)
+ outfile = File.join(File.dirname(info[:data_file]), 'results')
+ key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first
+
+ TmpFile.with_file do |f|
+ GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"], info[:subsets]["agent"]["DTT"], false, f, key_field);
+ assert File.exists? f
+ FileUtils.rm f
+ end
+ end
+
+ def test_process_subset
+ dataset = 'GDS750'
+ subset = 'agent'
+ id = "6079"
+ info = GEO.GDS(dataset)
+ outfile = File.join(File.dirname(info[:data_file]), 'results')
+ key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first
+
+ TmpFile.with_file do |f|
+ GEO.process_subset(dataset, subset, nil, f)
+ assert File.exists? f
+ FileUtils.rm f
+ end
+
+ t = GEO.process_subset(dataset, subset, 'tunicamycin')
+ assert File.exists? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.tunicamycin')
+ d = GEO.process_subset(dataset, subset, 'DTT')
+ assert File.exists? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.DTT')
+
+ assert_in_delta t[id]["p.values"], - d[id]["p.values"], 0.0001
+ end
+
+ def test_GSE
+ gse="GSE966"
+ info = GEO.GSE(gse)
+ assert_equal "GPL764", info[:platform]
+ end
+
+
+ #{{{ NEW TEST
+
+ def test_GSE
+ gse="GSE966"
+ info = GEO.GSE(gse)
+ assert_equal "GPL764", info[:platform]
+ end
+
+ def test_compare
+ dataset = "GDS1479"
+ field = "specimen"
+ condition = "carcinoma in situ lesion"
+ control = "normal mucosa"
+
+ TmpFile.with_file do |path|
+ GEO.compare(dataset, field, condition, control, path)
+ assert File.exists? path
+ end
+
+ assert GEO[dataset].comparison[GEO.comparison_name field, condition, control].produce.exists?
+ end
+
+
+ end
+
data/test/test_helper.rb ADDED
@@ -0,0 +1,4 @@
+ require 'test/unit'
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
+ $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
+
metadata ADDED
@@ -0,0 +1,87 @@
+ --- !ruby/object:Gem::Specification
+ name: rbbt-GE
+ version: !ruby/object:Gem::Version
+ hash: 27
+ prerelease:
+ segments:
+ - 0
+ - 1
+ - 0
+ version: 0.1.0
+ platform: ruby
+ authors:
+ - Miguel Vazquez
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2012-01-19 00:00:00 +01:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rbbt-util
+ prerelease: false
+ requirement: &id001 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ type: :runtime
+ version_requirements: *id001
+ description: Gene Expression in RBBT
+ email: miguel.vazquez@cnio.es
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ files:
+ - LICENSE
+ - lib/rbbt/GE.rb
+ - lib/rbbt/GE/GEO.rb
+ - share/install/GEO/Rakefile
+ - share/lib/R/MA.R
+ - test/test_helper.rb
+ - test/rbbt/GE/test_GEO.rb
+ has_rdoc: true
+ homepage: http://github.com/mikisvaz/rbbt-GE
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.6.2
+ signing_key:
+ specification_version: 3
+ summary: Gene Expression in RBBT
+ test_files:
+ - test/test_helper.rb
+ - test/rbbt/GE/test_GEO.rb