RubyGems - protk - Versions diffs - 1.3.0 → 1.3.1.pre2 - Mend

protk 1.3.0 → 1.3.1.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +4 -4
data/bin/make_decoy.rb +1 -2
data/bin/mascot_search.rb +2 -0
data/bin/msgfplus_search.rb +1 -1
data/bin/protxml_to_gff.rb +94 -115
data/bin/protxml_to_psql.rb +3 -2
data/bin/sixframe.rb +15 -8
data/bin/swissprot_to_table.rb +120 -0
data/lib/protk.rb +0 -1
data/lib/protk/bio_gff3_extensions.rb +22 -0
data/lib/protk/bio_sptr_extensions.rb +19 -4
data/lib/protk/constants.rb +19 -11
data/lib/protk/gffdb.rb +60 -0
data/lib/protk/peptide.rb +158 -0
data/lib/protk/protein.rb +72 -0
data/lib/protk/protein_to_genome_mapper.rb +8 -0
data/lib/protk/protxml_to_gff_tool.rb +3 -1
data/lib/protk/search_tool.rb +3 -24
data/lib/protk/swissprot_database.rb +8 -20
data/lib/protk/tool.rb +36 -1
metadata +68 -41
data/lib/protk/protxml.rb +0 -141

data/lib/protk.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 require 'protk/tool.rb'
 require 'protk/swissprot_database.rb'
 require 'protk/search_tool.rb'
-require 'protk/protxml.rb'
 require 'protk/prophet_tool.rb'
 require 'protk/omssa_util.rb'
 require 'protk/mascot_util.rb'

data/lib/protk/bio_gff3_extensions.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'bio'
+# Extension to GFF3 records to support genomic coordinate mapping tasks
+class Bio::GFF::GFF3::Record
+# Comparator to allow sorting by start
+# Overlap operator to return a new gff by overlapping this one with another
+# Function to return our coordinates relative to some other coordinate system (eg a protein)
+	def <=>(otherRecord)
+		self.start <=> otherRecord.start
+	end
+	def length
+		return self.end-self.start+1
+	end
+end

data/lib/protk/bio_sptr_extensions.rb CHANGED Viewed

@@ -90,7 +90,7 @@ class Bio::SPTR < Bio::EMBLDB
   # SwissProt Accessions
   #
   def accessions
-    return ""
+    return self.ac
   end
   # Subcellular Location
@@ -132,7 +132,8 @@ class Bio::SPTR < Bio::EMBLDB
   def domain
     return self.cc["DOMAIN"].to_s
   end
   #
   # Getting dr entry
   #
@@ -152,6 +153,20 @@ class Bio::SPTR < Bio::EMBLDB
   def ipi
     return self.safely_get_drentry_for_key("IPI")
   end
+  def go_terms
+    terms = self.dr["GO"]
+    if terms
+      return terms.collect { |e| e[0] }
+    else
+      return nil
+    end
+  end
+  def go_entries
+    return self.dr["GO"]
+  end
   # Intact accession number
   #
@@ -234,8 +249,8 @@ class Bio::SPTR < Bio::EMBLDB
     return self.seq.to_s
   end
-  def tax_dump
-    return self.ox.to_s
+  def ncbi_taxon_id
+    return self.ox["NCBI_TaxID"]
   end
   def species_dump

data/lib/protk/constants.rb CHANGED Viewed

@@ -29,16 +29,19 @@ class Constants
   attr :info_level
   attr :protk_dir
   attr :data_lib_dir
+  attr_accessor :info_level
   # Provides direct access to constants through methods of the same name
   # This will be used for all constants other than paths
   #
   def method_missing(method)
     from_env = @env[method.to_s]
     throw "#{method} is undefined" unless from_env!=nil
     from_env
   end
   # Some constants are paths. They need to be translated into real paths before being returned
   #
@@ -121,7 +124,7 @@ class Constants
   # Read the global constants file and initialize our class @env variable
   # Initialize loggers
   #
-  def initialize
+  def initialize()
     @data_lib_dir="#{File.dirname(__FILE__)}/data"
     @protk_dir="#{Dir.home}/.protk"
@@ -170,8 +173,10 @@ class Constants
     # puts "Path #{ENV['PATH']}"
     throw "No data found in config file" unless @env!=nil
-    @info_level=default_config_yml['message_level']
+    @info_level="fatal"
+    @info_level=default_config_yml['message_level'] unless default_config_yml['message_level'].nil?
   end
@@ -196,15 +201,17 @@ class Constants
     throw "Unable to create file logger at path #{self.log_file}" unless @file_logger!=nil
     throw "Unable to create stdout logger " unless @stdout_logger!=nil
     case @info_level
-    when "info"
+    when /info/i
       @stdout_logger.level=Logger::INFO
-    when "debug"
+    when /debug/i
       @stdout_logger.level=Logger::DEBUG
-    when "warn"
-      @stdout_logger.level=Logger::WARN
+    when /warn/i
+      @stdout_logger.level=Logger::WARN
+    when /fatal/i
+      @stdout_logger.level=Logger::FATAL
+    else
+      throw "Unknown log level #{@info_level}"
     end
   end
@@ -215,8 +222,9 @@ class Constants
     if ( @stdout_logger == nil || @file_logger == nil)
       initialize_loggers
     end
-    @stdout_logger.send(level,message)
-    @file_logger.send(level,message)
+   @stdout_logger.send(level,message)
+   @file_logger.send(level,message)
   end
   def path_for_builtin_database(dbname)

data/lib/protk/gffdb.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'protk/constants'
+require 'bio'
+class GFFDB
+  attr_accessor :id_to_records_map
+  def initialize(gff_file_path)
+    env = Constants.new
+    @database = gff_file_path
+    @id_to_records_map={}
+    @id_to_cds_map={}
+  end
+  def self.create(gff_file_path)
+    db = GFFDB.new(gff_file_path)
+    db.make_index(gff_file_path)
+    db
+  end
+  def get_by_id(entry_id)
+    @id_to_records_map[entry_id]
+  end
+  def get_cds_by_parent_id(entry_id)
+    @id_to_cds_map[entry_id]
+  end
+  def make_index(input_gff)
+    io = File.open(input_gff, "r")
+    gffdb = Bio::GFF::GFF3.new(io)  #parses the entire db
+    # Now create the mapping from ids to records
+    gffdb.records.each do |record|
+      @id_to_records_map[record.id] = [] if @id_to_records_map[record.id].nil?
+      @id_to_records_map[record.id] << record
+      begin
+        # puts record.feature_type.match(/CDS/)
+        if record.feature_type.to_s =~ /CDS/i
+          # puts record.feature_type
+          parent_id=record.attributes_to_hash['Parent']
+          # puts parent_id
+          if parent_id
+            @id_to_cds_map[parent_id] = [] if @id_to_cds_map[parent_id].nil?
+            @id_to_cds_map[parent_id] << record
+          end
+        end
+      rescue
+        puts "Problem initializing cds map for #{record}"
+      end
+    end
+  end
+end

data/lib/protk/peptide.rb ADDED Viewed

@@ -0,0 +1,158 @@
+require 'libxml'
+require 'bio'
+require 'protk/bio_gff3_extensions'
+include LibXML
+# Raised by Peptide#coords_in_protein when the peptide sequence does not
+# occur in the protein sequence it is being located in.
+class PeptideNotInProteinError < StandardError; end
+class Peptide
+	attr_accessor :sequence
+	attr_accessor :protein_name
+	attr_accessor :charge
+	attr_accessor :nsp_adjusted_probability
+	class << self
+		def from_protxml(xmlnode)
+			pep=new()
+			pep.sequence=xmlnode['peptide_sequence']
+			pep.nsp_adjusted_probability=xmlnode['nsp_adjusted_probability'].to_f
+			pep.charge=xmlnode['charge'].to_i
+			pep
+		end
+		def from_sequence(seq,charge=nil)
+			pep=new()
+			pep.sequence=seq
+			pep.charge=charge
+			pep
+		end
+		private :new
+	end
+	def initialize()
+	end
+	# Expects prot_seq not to contain explicit stop codon (ie * at end)
+	# AA coords are 0-based unlike genomic coords which are 1 based
+	#
+	def coords_in_protein(prot_seq,reverse=false)
+		if reverse
+			pep_index = prot_seq.reverse.index(self.sequence.reverse)
+			raise PeptideNotInProteinError if pep_index.nil?
+			pep_start_i = pep_index
+		else
+			pep_start_i = prot_seq.index(self.sequence)
+			raise PeptideNotInProteinError if pep_start_i.nil?
+		end
+		pep_end_i = pep_start_i+self.sequence.length
+		{:start => pep_start_i,:end => pep_end_i}
+	end
+	# Returns a list of fragments (hashes with start and end) in GFF style (1 based) genomic coordinates
+	#
+	# Assumes that cds_coords is inclusive of the entire protein sequence including start-met
+	#
+	# We assume that gff records conform to the spec
+	#
+	# http://www.sequenceontology.org/gff3.shtml
+	#
+	# This part of the spec is crucial
+	#
+	# - The START and STOP codons are included in the CDS.
+	# - That is, if the locations of the start and stop codons are known,
+	# - the first three base pairs of the CDS should correspond to the start codon
+	# - and the last three correspond the stop codon.
+	#
+	# We also assume that all the cds records provided, actually form part of the protein (ie skipped exons should not be included)
+	#
+	def to_gff3_records(prot_seq,parent_record,cds_records)
+		throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
+		throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
+		on_reverse_strand = (parent_record.strand=="-") ? true : false
+		aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates
+		ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort
+		# Initial position is the number of NA's from the start of translation
+		#
+		pep_nalen = self.sequence.length*3
+		i = 0; #Current protein position (in nucleic acids)
+		pep_start_i = aa_coords[:start]*3
+		pep_end_i = pep_start_i+self.sequence.length*3
+		fragments=[]
+		ordered_cds_records.each do |cds_record|
+			# puts cds_record
+			fragment = nil
+			fragment_len = 0
+			if on_reverse_strand
+				in_peptide = (i<pep_end_i) && (i>=pep_start_i)
+				before_len = [pep_start_i-i,0].max
+				# puts before_len
+				# puts in_peptide
+				# puts "i #{i} pi #{pep_end_i} psi #{pep_start_i}"
+				if in_peptide
+					fragment_end = cds_record.end
+					fragment_len = [cds_record.length,pep_end_i-i].min
+					fragment_start = fragment_end-fragment_len+1
+					# fragment = {:start=>fragment_start,:end=>fragment_end}
+					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+				elsif before_len>0
+					fragment_end = cds_record.end - before_len
+					fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
+					# puts "Frag len #{fragment_len}"
+					fragment_start = fragment_end - fragment_len + 1
+					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+					# fragment = {:start=>fragment_start,:end=>fragment_end}
+				else
+					fragment=nil
+				end
+			else
+				in_peptide = (i<pep_end_i) && (i>=pep_start_i)
+				before_len = [pep_start_i-i,0].max
+				if in_peptide
+					fragment_start = cds_record.start
+					fragment_len = [cds_record.length,pep_end_i-i].min
+					fragment_end = fragment_start+fragment_len-1
+					# fragment = {:start=>fragment_start,:end=>fragment_end}
+					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+				elsif before_len>0
+					fragment_start = cds_record.start + before_len
+					fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
+					fragment_end = fragment_start + fragment_len-1
+					# fragment = {:start=>fragment_start,:end=>fragment_end}
+					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+				else
+					fragment=nil
+				end
+			end
+			i+=cds_record.length
+			fragments << fragment unless fragment.nil?
+		end
+		fragments
+	end
+	def gff_record_for_peptide_fragment(start_i,end_i,parent_record)
+		cds_id = parent_record.id
+		this_id = "#{cds_id}.#{self.sequence}"
+		this_id << ".#{self.charge}" unless self.charge.nil?
+		score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
+		gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
+		Bio::GFF::GFF3::Record.new(gff_string)
+	end
+end

data/lib/protk/protein.rb ADDED Viewed

@@ -0,0 +1,72 @@
+require 'protk/peptide'
+include LibXML
+class Protein
+	attr_accessor :group_number
+	attr_accessor :group_probability
+	attr_accessor :probability
+	attr_accessor :sequence
+	attr_accessor :protein_name
+	attr_accessor :n_indistinguishable_proteins
+	attr_accessor :percent_coverage
+	attr_accessor :peptides
+	class << self
+		# <protein_group group_number="1" probability="1.0000">
+		#       <protein protein_name="ACADV_MOUSE" n_indistinguishable_proteins="1" probability="1.0000" percent_coverage="9.9" unique_stripped_peptides="ELGAFGLQVPSELGGLGLSNTQYAR+GIVNEQFLLQR+SGELAVQALDQFATVVEAK+VAVNILNNGR" group_sibling_id="a" total_number_peptides="4" pct_spectrum_ids="0.41" confidence="1.00">
+		#          <parameter name="prot_length" value="656"/>
+		#          <annotation protein_description="Very long-chain specific acyl-CoA dehydrogenase, mitochondrial OS=Mus musculus GN=Acadvl PE=1 SV=3"/>
+		#          <peptide peptide_sequence="SGELAVQALDQFATVVEAK" charge="1" initial_probability="0.9919" nsp_adjusted_probability="0.9981" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1975.0340">
+		#          </peptide>
+		#          <peptide peptide_sequence="GIVNEQFLLQR" charge="1" initial_probability="0.9909" nsp_adjusted_probability="0.9979" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.34" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.99" is_contributing_evidence="Y" calc_neutral_pep_mass="1315.7250">
+		#          </peptide>
+		#          <peptide peptide_sequence="ELGAFGLQVPSELGGLGLSNTQYAR" charge="1" initial_probability="0.7792" nsp_adjusted_probability="0.9391" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.55" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.78" is_contributing_evidence="Y" calc_neutral_pep_mass="2576.3234">
+		#          </peptide>
+		#          <peptide peptide_sequence="VAVNILNNGR" charge="1" initial_probability="0.5674" nsp_adjusted_probability="0.8515" weight="1.00" is_nondegenerate_evidence="Y" n_enzymatic_termini="2" n_sibling_peptides="2.76" n_sibling_peptides_bin="5" n_instances="1" exp_tot_instances="0.57" is_contributing_evidence="Y" calc_neutral_pep_mass="1068.6030">
+		#          </peptide>
+		#       </protein>
+		# </protein_group>
+		def from_protxml(xmlnode)
+			prot=new()
+			groupnode = xmlnode.parent
+			prot.group_probability = groupnode['probability'].to_f
+			prot.group_number = groupnode['group_number'].to_i
+			prot.probability = xmlnode['probability'].to_f
+			prot.protein_name = xmlnode['protein_name']
+			prot.n_indistinguishable_proteins = xmlnode['n_indistinguishable_proteins'].to_i
+			prot.percent_coverage = xmlnode['percent_coverage'].to_f
+			peptide_nodes = xmlnode.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
+			prot.peptides = peptide_nodes.collect { |e| Peptide.from_protxml(e) }
+			prot
+		end
+		private :new
+	end
+	def initialize()
+	end
+	# Return just one peptide for each unique sequence choosing the peptide with highest probability
+	#
+	def representative_peptides()
+		best_peptides={}
+		self.peptides.each do |peptide|
+			seq = peptide.sequence
+			if best_peptides[seq].nil?
+				best_peptides[seq]=peptide
+			else
+				best_peptides[seq]=peptide if peptide.nsp_adjusted_probability > best_peptides[seq].nsp_adjusted_probability
+			end
+		end
+		best_peptides.values
+	end
+end