RubyGems - rbbt-study - Versions diffs - 0.2.0 - Mend

rbbt-study 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

checksums.yaml +15 -0
data/LICENSE.txt +20 -0
data/README.rdoc +19 -0
data/lib/rbbt/entity/study/cnv/genes.rb +28 -0
data/lib/rbbt/entity/study/cnv/samples.rb +55 -0
data/lib/rbbt/entity/study/cnv.rb +170 -0
data/lib/rbbt/entity/study/enrichment.rb +418 -0
data/lib/rbbt/entity/study/expression.rb +19 -0
data/lib/rbbt/entity/study/features.rb +17 -0
data/lib/rbbt/entity/study/genes.rb +104 -0
data/lib/rbbt/entity/study/genotypes/enrichment.rb +56 -0
data/lib/rbbt/entity/study/genotypes/genes.rb +103 -0
data/lib/rbbt/entity/study/genotypes/knowledge_base.rb +39 -0
data/lib/rbbt/entity/study/genotypes/mutations.rb +34 -0
data/lib/rbbt/entity/study/genotypes/samples.rb +28 -0
data/lib/rbbt/entity/study/genotypes.rb +110 -0
data/lib/rbbt/entity/study/knowledge_base.rb +36 -0
data/lib/rbbt/entity/study/methylation/samples.rb +31 -0
data/lib/rbbt/entity/study/methylation.rb +90 -0
data/lib/rbbt/entity/study/mutations.rb +259 -0
data/lib/rbbt/entity/study/plots.rb +142 -0
data/lib/rbbt/entity/study/samples.rb +61 -0
data/lib/rbbt/entity/study/snp.rb +87 -0
data/lib/rbbt/entity/study.rb +151 -0
metadata +69 -0

data/lib/rbbt/entity/study/mutations.rb ADDED Viewed

@@ -0,0 +1,259 @@
+task :mutations_by_change => :tsv do
+  changes = {}
+  study.cohort.each do |genotype|
+    genotype.watson ||= watson
+    genotype.each do |mutation|
+      reference = watson ? mutation.reference : mutation.gene_strand_reference
+      base = mutation.base
+      base = ((Misc::IUPAC2BASE[base] || []) - [reference]) * ","
+      change = [reference, base]
+      changes[change * ">"] ||= []
+      changes[change * ">"] << mutation.clean_annotations
+    end
+  end
+  TSV.setup(changes, :key_field => "Genomic Change", :fields => ["Genomic Mutation"], :namespace => organism, :type => :flat)
+  changes.entity_options = {:watson => watson}
+  changes
+end
+dep :mutations_by_change
+task :mutation_change_counts => :yaml do
+  change_counts    = {}
+  step(:mutations_by_change).load.each do |change, mutations|
+    change_counts[change] = mutations.length
+  end
+  change_counts
+end
+returns "Genomic Mutation"
+task :transversions => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      mutation.type == "transversion"
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: transversions", organism, watson)
+end
+returns "Genomic Mutation"
+task :transitions => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      mutation.type == "transition"
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: transitions", organism, watson)
+end
+returns "Genomic Mutation"
+task :indels => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      mutation.type == "indel"
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: indels", organism, watson)
+end
+returns "Genomic Mutation"
+task :unknown_mutations => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      mutation.type == "unknown"
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: unknown_mutations", organism, watson)
+end
+returns "Genomic Mutation"
+task :not_mutations => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      mutation.type == "none"
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: not mutations", organism, watson)
+end
+returns "Genomic Mutation"
+task :non_synonymous_mutations => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation|
+      (mutation.mutated_isoforms || [] ).select{|mi| mi.non_synonymous }.any?
+    }
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: non_synonymous mutations", organism, watson)
+end
+dep :non_synonymous_mutations
+returns "Genomic Mutation"
+task :synonymous_mutations => :annotations do
+  non_synonymous_mutations = step(:non_synonymous_mutations).load
+  mutations = study.cohort.collect{|genotype|
+    genotype.remove( non_synonymous_mutations )
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: synonymous mutations", organism, watson)
+end
+#dep :synonymous_mutations
+#dep :exon_junction_mutations
+#input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
+#returns "Genomic Mutation"
+#task :damaging_mutations => :annotations do |methods|
+#  synonymous_mutations = step(:synonymous_mutations).load
+#  exon_junction_mutations = step(:exon_junction_mutations).load
+#
+#  mutations_to_remove = synonymous_mutations - exon_junction_mutations
+#
+#  mutations = study.cohort.collect{|genotype|
+#
+#    genotype.remove( mutations_to_remove ).select{|mutation| mutation.damaging?(methods) }
+#
+#  }.flatten
+#
+#  GenomicMutation.setup(mutations, "#{ study }: damaging mutations", organism, watson)
+#end
+dep :relevant_mutations
+input :methods, :array, "Damage prediction methods", [:sift, :mutation_assessor]
+returns "Genomic Mutation"
+task :damaging_mutations => :annotations do |methods|
+  relevant_mutations = step(:relevant_mutations ).load
+  mutations = relevant_mutations.select{|mutation| mutation.damaging?(methods) }
+  GenomicMutation.setup(mutations, "#{ study }: damaging mutations", organism, watson)
+end
+dep :damaging_mutations
+dep :relevant_mutations
+input :methods, :array, "Damage prediction methods", [:sift]
+returns "Genomic Mutation"
+task :mutations_missing_predictions => :annotations do |methods|
+  damaging_mutations = step(:damaging_mutations).load
+  relevant_mutations = step(:relevant_mutations).load
+  missing_mutations = relevant_mutations.remove(damaging_mutations)
+  missing_mutations_mutated_isoforms = missing_mutations.mutated_isoforms.compact.flatten
+  mutated_isoforms_missing_damage_scores = missing_mutations_mutated_isoforms.select{|mis| mis.damage_scores.nil?}
+  mutations_missing_predictions = missing_mutations.select{|mutation| mutation.mutated_isoforms and mutation.mutated_isoforms.any?}.select{|mutation| mutation.mutated_isoforms.remove(mutated_isoforms_missing_damage_scores).empty?}
+  GenomicMutation.setup(mutations_missing_predictions, "#{ study }: mutations missing predictions", organism, watson)
+end
+returns "Genomic Mutation"
+task :exon_junction_mutations => :annotations do
+  mutations = study.cohort.collect{|genotype|
+    genotype.select{|mutation| mutation.transcripts_with_affected_splicing.any? and not mutation.type == "none"}
+  }.flatten
+  GenomicMutation.setup(mutations, "#{ study }: exon junction mutations", organism, watson)
+end
+dep :non_synonymous_mutations
+dep :exon_junction_mutations
+returns "Genomic Mutation"
+task :relevant_mutations => :annotations do
+  non_synonymous_mutations = step(:non_synonymous_mutations).load
+  exon_junction_mutations = step(:exon_junction_mutations).load
+  all_relevant_mutations = ( exon_junction_mutations + non_synonymous_mutations.remove(exon_junction_mutations) ).flatten
+  GenomicMutation.setup(all_relevant_mutations, "#{ study }: relevant mutations", organism, watson)
+end
+dep :relevant_mutations
+returns "Genomic Mutation"
+task :recurrent_mutations => :annotations do
+  relevant_mutations = step(:relevant_mutations).load
+  mutations = Misc.counts(relevant_mutations.remove_score).select{|mutation, count|
+    count > 1
+  }.collect{|mutation, count| mutation}
+  GenomicMutation.setup(mutations, "#{study}: recurrent mutations", organism, watson)
+end
+dep :non_synonymous_mutations
+task :mutations_by_consequence => :yaml do
+  non_synonymous_mutations = step(:non_synonymous_mutations).load
+  mutations_by_consequence = {}
+  study.cohort.each do |genotype|
+    genotype.subset(non_synonymous_mutations).each do |mutation|
+      mis = mutation.mutated_isoforms
+      next if mis.nil?
+      consequences = mis.consequence.compact.uniq
+      consequences.each{|consequence| mutations_by_consequence[consequence] ||= []; mutations_by_consequence[consequence] << mutation }
+    end
+  end
+  mutations_by_consequence
+end
+%w(missense_mutations nonsense_mutations frameshift_mutations nostop_mutations indel_mutations utr_mutations ).zip(
+  %w(MISS-SENSE NONSENSE FRAMESHIFT NOSTOP INDEL UTR)).each do |task_name, consequence|
+  dep :mutations_by_consequence
+  returns "Genomic Mutation"
+  task task_name => :annotations do
+    mutations_by_consequence = step(:mutations_by_consequence).load
+    GenomicMutation.setup(mutations_by_consequence[consequence] || [], "#{study}: mutations with #{consequence.downcase} isoform mutations", organism, watson)
+  end
+end

data/lib/rbbt/entity/study/plots.rb ADDED Viewed

@@ -0,0 +1,142 @@
+input :cutoff, :integer, "Pixels of image", 2
+input :size, :integer, "Pixels of image", 14
+task :gene_mutation_plot => :binary do |cutoff, size|
+  png_file = file(study + ".png")
+  FileUtils.mkdir_p File.dirname png_file unless File.exists? File.dirname png_file
+  study.R "
+library(ggplot2)
+library(plyr)
+library(reshape)
+layer.mutations = rbbt.SE.plot.mutations('#{study}', cutoff=#{cutoff});
+p <- ggplot() + layer.mutations
+p <- p + opts(axis.text.x=theme_text(angle=90), panel.background = theme_rect(fill='white', colour='steelblue'))
+ggsave(p, filename='#{png_file}', height=#{size}, width=#{size});
+"
+  Open.read(png_file, :mode => 'rb')
+end
+input :database, :string, "Database code", :kegg
+input :size, :integer, "Pixels of image", 14
+task :pathway_mutation_plot => :binary do |database,size|
+  png_file = file(study + ".png")
+  FileUtils.mkdir_p File.dirname png_file unless File.exists? File.dirname png_file
+  study.R "
+library(ggplot2)
+library(plyr)
+library(reshape)
+study = '#{study}'
+# Sample mutations
+sample.mutated.genes = rbbt.SE.sample.mutated.genes(study)
+sample.mutated.genes$Sample = rownames(sample.mutated.genes)
+# Pathway enrichment
+pathway.enrichment = rbbt.ruby.substitutions(
+    \"
+    require 'rbbt/workflow'
+    require 'rbbt/entity'
+    require 'rbbt/entity/gene'
+    require 'rbbt/sources/pfam'
+    require 'rbbt/sources/kegg'
+    require 'rbbt/sources/go'
+    YAML::ENGINE.yamler = 'syck' if defined? YAML::ENGINE and YAML::ENGINE.respond_to? :yamler
+    Workflow.require_workflow 'StudyExplorer'
+    study = Study.setup('STUDY')
+    Log.severity = 0
+    pathways = study.job(:mutation_pathway_enrichment, study, :baseline => :pathway_base_counts, :database => '#{database}', :fdr => false).run.select('p-value'){|pvalue| pvalue = pvalue.first.to_f if Array === pvalue; pvalue < 0.2}
+    pathways.add_field 'Name' do |pathway, values|
+        [pathway.name]
+    end
+    pathways.add_field 'Gene' do |pathway, values|
+        values['Ensembl Gene ID'].name
+    end
+    pathways = pathways.select('Name'){|name| name.first.to_s !~ /cancer|olfactory|glioma|melanoma|malaria|leukemia|carcinoma|sarcoma/i}
+    \", substitutions=list(STUDY=study));
+# Sample pathway mutations
+find.mutated.pathways.for.sample <- function(x, pathway.info){
+    all.genes = names(x);
+    genes = all.genes[x==TRUE];
+    ddply(pathway.info, 'Name', function(x){pathway.genes = unlist(strsplit(x$Gene, '\\\\|')); if (length(intersect(genes, pathway.genes)) > 0){TRUE}else{FALSE}})
+}
+sample.pathway.mutations = ddply(sample.mutated.genes, 'Sample', find.mutated.pathways.for.sample, pathway.info = pathway.enrichment)
+names(sample.pathway.mutations) = c('Sample', 'Pathway', 'Mutated')
+p <- ggplot(sample.pathway.mutations) + geom_tile(aes(x=Sample, y=Pathway, alpha=Mutated))
+p <- rbbt.SE.plot.sort.by.pathway.mutations(p)
+# Mark repeated genes
+d = p$data
+d$Exclusive = FALSE
+pathway.genes = list();
+for(pathway in levels(d$Pathway)){
+   pathway.genes[pathway] = strsplit(pathway.enrichment[pathway.enrichment[,'Name'] == pathway, 'Gene'], '\\\\|')
+}
+find.exclusive.pathway.genes <- function(data, pathways){
+  found.genes = c();
+  exclusive.pathway.genes = list();
+  sample = as.character(unique(data$Sample));
+  for(pathway in pathways){
+     current.pathway.genes = pathway.genes[[pathway]];
+     sample.genes = names(sample.mutated.genes)[sample.mutated.genes[sample,] == TRUE]
+     sample.pathway.genes = intersect(current.pathway.genes, sample.genes);
+     exclusive.genes = setdiff(sample.pathway.genes, found.genes);
+     found.genes = c(found.genes, exclusive.genes)
+     exclusive.pathway.genes[[pathway]] = exclusive.genes
+  }
+  return(exclusive.pathway.genes);
+}
+exclusive.pathway.genes = dlply(d, 'Sample', find.exclusive.pathway.genes, pathways = levels(d$Pathway))
+for( sample in names(exclusive.pathway.genes)){
+     pathway.exclusive.genes = exclusive.pathway.genes[[sample]];
+     for( pathway in names(pathway.exclusive.genes)){
+        if (length(pathway.exclusive.genes[[pathway]]) > 0){
+           print(sample)
+           print(pathway)
+           d[(d$Sample == sample & d$Pathway == pathway), 'Exclusive'] = TRUE
+        }
+     }
+}
+p$data = d
+p <- p + aes(fill=Exclusive)
+p <- p + opts(axis.text.x=theme_text(angle=90), panel.background = theme_rect(fill='white', colour='steelblue'))
+p
+ggsave(p, filename='#{png_file}', height=#{size}, width=#{size});
+"
+  Open.read(png_file, :mode => 'rb')
+end

data/lib/rbbt/entity/study/samples.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# Sample entity; every sample is annotated with the study it belongs to.
+module Sample
+  extend Entity
+  annotation :study
+  self.format = ["Sample ID"]
+  # Directory of the sample's study. Returns nil when there is no study or
+  # when setting the study up fails (a warning is logged in that case).
+  def dir
+    return nil if study.nil?
+    return study.dir if study.respond_to? :dir
+    # The study does not answer to dir yet: set it up as a Study first
+    begin
+      Study.setup(study).dir
+    rescue => e
+      Log.warn "Error accessing sample dir from study: #{e.message}"
+      nil
+    end
+  end
+  # Organism of the sample's study, nil when there is no study.
+  def organism
+    study.nil? ? nil : study.organism
+  end
+end
+module Study
+  def sample_info
+    return nil unless dir.samples.exists?
+    @sample_info ||= dir.samples.tsv.tap{|tsv| tsv.entity_options = {:study => self }}
+  end
+  def samples
+    if @samples.nil?
+      if sample_info.nil?
+        @samples = self.cohort.collect{|g| g.jobname }
+      else
+        @samples = sample_info.keys
+      end
+      Sample.setup(@samples, self)
+      @samples.study = self
+    end
+    @samples
+  end
+  def has_cnv?
+    study.has_cnv? and study.cnv_cohort.include? self
+  end
+  def has_mutations?
+    study.cohort and study.cohort.include? self
+  end
+  def match_samples(list)
+    if donor_id_field = (sample_info = self.sample_info).fields.select{|f| f =~ /donor\s+id/i}.first
+      list_donors = sample_info.select(list).slice(donor_id_field).values.compact.flatten
+      list_donor_samples = sample_info.select(list_donors).keys
+      list = list_donor_samples.annotate((list + list_donor_samples).uniq)
+    end
+    list
+  end
+end

data/lib/rbbt/entity/study/snp.rb ADDED Viewed

@@ -0,0 +1,87 @@
+require 'rbbt/entity/snp'
+#require 'rbbt/entity/study/snp/samples'
+module StudyWorkflow
+end
+module Study
+  def has_snp?
+    dir.snp.exists?
+  end
+  def snp_files
+    @snp_files ||= dir.snp.find.glob("*")
+  end
+  def snp_cohort
+    if @snp_cohort.nil?
+      @snp_cohort = {}
+      snp_files.each do |f|
+        sample = File.basename(f)
+        Sample.setup(sample, self)
+        snps = Open.read(f).split("\n").sort
+        SNP.setup(snps)
+        @snp_cohort[sample] =  snps
+      end
+    end
+    @snp_cohort
+  end
+end
+module Study
+  def snp_index
+    local_persist_tsv("SNP2Samples", "SNP2Samples", {}, :persist => true, :serializer => :clean) do |data|
+      require 'progress-monitor'
+      Progress.monitor "SNP files", :stack_depth => 0
+      snp_files.each do |file|
+        file = file.to_s
+        sample = File.basename file
+        File.open(file.to_s) do |f|
+          while line = f.gets
+            snp = line.strip
+            snp, allele = snp.split ":"
+            snp_str = data[snp]
+            if snp_str.nil?
+              snp_str = ""
+            else
+              snp_str += "\t"
+            end
+            if allele
+              snp_str << sample << ":" << allele
+            else
+              snp_str << sample
+            end
+            data[snp] = snp_str
+          end
+        end
+      end
+      TSV.setup data
+      data.key_field = "RS ID"
+      data.fields = ["Sample"]
+      data.type = :flat
+      data.serializer = :list
+      data
+    end
+  end
+  property :samples_with_snp => :single2array do |snp|
+    Sample.setup((snp_index[snp] || []).collect{|s| s.split(":").first}, self)
+  end
+  property :samples_with_homozygous_snp => :single2array do |snp|
+    Sample.setup((snp_index[snp] || []).collect{|s| s.split(":")}.select{|s,g| g == "2"}.collect{|s,g| s}, self)
+  end
+  property :samples_with_heterozygous_snp => :single2array do |snp|
+    Sample.setup((snp_index[snp] || []).collect{|s| s.split(":")}.select{|s,g| g == "1"}.collect{|s,g| s}, self)
+  end
+end

data/lib/rbbt/entity/study.rb ADDED Viewed

@@ -0,0 +1,151 @@
+require 'rbbt'
+require 'rbbt/util/misc'
+require 'rbbt/entity'
+require 'rbbt/resource'
+require 'rbbt/workflow'
+Workflow.require_workflow "Genomics"
+require 'rbbt/entity/study'
+require 'rbbt/entity/study/knowledge_base'
+require 'rbbt/entity/study/samples'
+require 'rbbt/expression/matrix'
+module StudyWorkflow
+  extend Workflow
+  class << self
+    attr_accessor :study
+  end
+  def self.workdir
+    @workdir ||= Rbbt.var.jobs["Study"].find
+  end
+  helper :study do
+    @study
+  end
+  helper :dir do
+    study.dir
+  end
+  helper :organism do
+    study.metadata[:organism]
+  end
+  def self.job(*args)
+    super(*args).tap{|s| s.instance_variable_set("@study", @study) }
+  end
+end
+module Study
+  extend Entity
+  extend Resource
+  include LocalPersist
+  class << self
+    attr_accessor :study_dir
+    def study_dir
+      @study_dir ||= begin
+                       case
+                       when (not defined?(Rbbt))
+                         File.join(ENV["HOME"], '.studies')
+                       when Rbbt.etc.study_dir.exists?
+                         Rbbt.etc.study_dir.read.chomp
+                       else
+                         Rbbt.studies.find
+                       end
+                     end
+    end
+  end
+  attr_accessor :workflow, :dir
+  def job(task, *args)
+    name, inputs = args
+    if inputs.nil? and Hash === name
+      inputs = name
+      name = nil
+    end
+    name = self if name.nil? or name == :self or name == "self"
+    step = workflow.job(task, name, {:organism => metadata[:organism], :watson => metadata[:watson]}.merge(inputs || {}))
+    step.instance_variable_set(:@study, self)
+    step
+  end
+  def workflow(&block)
+    if block_given?
+      @workflow.instance_eval &block
+    else
+      @workflow
+    end
+  end
+  def self.annotation_repo
+    @annotation_repo ||= Rbbt.var.cache.annotation_repo.find
+  end
+  def self.extended(base)
+    setup_file = File.join(base.dir, 'setup.rb')
+    base.workflow = StudyWorkflow.clone
+    base.workflow.study = base
+    if File.exists? setup_file
+      base.instance_eval Open.read(setup_file), setup_file
+    end
+    base.local_persist_dir = base.dir.var.cache.persistence.find
+  end
+  def self.studies
+    Dir.glob(File.join(Path === study_dir ? study_dir.find : study_dir, '*')).
+      select{|f| File.directory? f}.sort.collect{|s| Study.setup(File.basename(s))}
+  end
+  def dir
+    if @dir.nil?
+      @dir = Path.setup(File.join(Study.study_dir, self))
+      @dir.resource = Study
+    end
+    @dir
+  end
+  def metadata
+    @metadata ||= (dir["metadata.yaml"].yaml.extend IndiferentHash)
+  end
+  def users
+    @users ||= metadata[:users] || []
+  end
+  #{{{ Attributes
+  attr_accessor :organism
+  def organism
+    @organism ||= metadata["organism"]
+  end
+  def matrix_file(name)
+    dir.matrices[name.to_s].find
+  end
+  def matrices
+    dir.matrices.glob('*').collect{|f| f.basename}
+  end
+  def matrix(type, format = "Ensembl Gene ID", organism = nil)
+    organism = self.metadata[:organism] if organism.nil?
+    raise "No matrices defined for study #{ self }" unless defined? matrices.empty?
+    raise "No type specified" if type.nil?
+    type = type.to_s
+    raise "No matrix #{ type } defined for study #{ self }" unless matrices.include? type
+    data = dir.matrices[type].data.find if dir.matrices[type].data.exists?
+    if dir.matrices[type].identifiers.exists?
+      identifiers = dir.matrices[type].identifiers.find
+    else
+      identifiers = Organism.identifiers(organism).find
+    end
+    samples = dir.matrices[type].samples.find if dir.matrices[type].samples.exists?
+    samples = dir.samples.find if samples.nil? and dir.samples.exist?
+    Matrix.new(data, identifiers, samples, format, organism)
+  end
+end