RubyGems - protk - Versions diffs - 1.3.0 → 1.3.1.pre2 - Mend

protk 1.3.0 → 1.3.1.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +4 -4
data/bin/make_decoy.rb +1 -2
data/bin/mascot_search.rb +2 -0
data/bin/msgfplus_search.rb +1 -1
data/bin/protxml_to_gff.rb +94 -115
data/bin/protxml_to_psql.rb +3 -2
data/bin/sixframe.rb +15 -8
data/bin/swissprot_to_table.rb +120 -0
data/lib/protk.rb +0 -1
data/lib/protk/bio_gff3_extensions.rb +22 -0
data/lib/protk/bio_sptr_extensions.rb +19 -4
data/lib/protk/constants.rb +19 -11
data/lib/protk/gffdb.rb +60 -0
data/lib/protk/peptide.rb +158 -0
data/lib/protk/protein.rb +72 -0
data/lib/protk/protein_to_genome_mapper.rb +8 -0
data/lib/protk/protxml_to_gff_tool.rb +3 -1
data/lib/protk/search_tool.rb +3 -24
data/lib/protk/swissprot_database.rb +8 -20
data/lib/protk/tool.rb +36 -1
metadata +68 -41
data/lib/protk/protxml.rb +0 -141

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5e8f8a571cb43ed61984a34b6e1fb51caf979593
-  data.tar.gz: b53857f75c1ff6ca850859c3985aee36533e437f
+  metadata.gz: 888f8ebff75c2c33497c9bf4f7aeec182311b7e3
+  data.tar.gz: 4102a91afbee688babe093df8a53b84b097ba0c3
 SHA512:
-  metadata.gz: 9450fccc4a5ce59f064927d62fbc6a4342a1710c3b82707e0908dea52af7d0b50f215e64073bb067a506d204701acea11b6d28f302447494b8a30b1e7af2df2d
-  data.tar.gz: 1b8bc78fc09b4c81eee72fad169a6aee7145a16312c01bc95a5dd590f08cb98194b26115a166a759b9c52c7c67204a767747642e5e9331de4562d52f31eb1e11
+  metadata.gz: 3e67189a07c6ac237a4def19ad90043ab8919d5492fd43b67cfa5fc3285819b2fd62671375283c5d9dd05618c746603ba829c70225b140b3a52ccba9fafb24f8
+  data.tar.gz: 354a9eb2499d3f8b194ccdef82f06692672435a0e47b5b49197b9f1fba2c27181275d38c98fe6f644750b554b4b1601b873f6a51a0318853b86b17d7783f2e57

data/bin/make_decoy.rb CHANGED Viewed

@@ -49,10 +49,9 @@ if (tool.reverse_only)
 	Bio::FastaFormat.open(input_file).each do |seq|
 		id=nil
 		begin
-			# require 'debugger';debugger
 			id=seq.definition.chomp.scan(/#{tool.id_regex}/)[0][0]
 			revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
-			decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
+			decoys_out.write ">#{revdef}\n#{seq.aaseq.reverse}\n"
 		rescue
 			puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
 		end

data/bin/mascot_search.rb CHANGED Viewed

@@ -103,6 +103,7 @@ def search_params_dictionary(search_tool,input_file)
     postdict[:FILE]=File.new(input_file)
     postdict[:FORMVER]='1.01'
     postdict[:INTERMEDIATE]=''
+    postdict[:QUANTITATION]=search_tool.quantitation
     postdict
 end
@@ -134,6 +135,7 @@ search_tool.options.output_suffix="_mascot"
 search_tool.add_value_option(:mascot_server,"#{$genv.default_mascot_server}/mascot/cgi",['-S', '--server url', 'The url to the cgi directory of the mascot server'])
 search_tool.add_value_option(:allowed_charges,"1+,2+,3+",['--allowed-charges ac', 'Allowed precursor ion charges.'])
+search_tool.add_value_option(:quantitation,"",['--quantitation method','Mascot quant method'])
 search_tool.add_value_option(:email,"",['--email em', 'User email.'])
 search_tool.add_value_option(:username,"",['--username un', 'Username.'])
 search_tool.add_value_option(:httpproxy,nil,['--proxy url', 'The url to a proxy server'])

data/bin/msgfplus_search.rb CHANGED Viewed

@@ -170,7 +170,7 @@ ARGV.each do |filename|
     # Num Threads
     #
-    cmd << " -thread #{search_tool.threads}" if search_tool.threads > 0
+    cmd << " -thread #{search_tool.threads}" if search_tool.threads.to_i > 0
     mods_file_content = ""

data/bin/protxml_to_gff.rb CHANGED Viewed

@@ -1,44 +1,51 @@
 #!/usr/bin/env ruby
 #
 # This file is part of protk
-# Original python version created by Max Grant
-# Translated to ruby by Ira Cooke 29/1/2013
+# Created by Ira Cooke 3/8/2014
 #
 #
 require 'protk/constants'
-require 'protk/protxml_to_gff_tool'
 require 'protk/fastadb'
+require 'protk/gffdb'
+require 'protk/protein'
+require 'protk/peptide'
+require 'protk/tool'
 require 'libxml'
 require 'bio'
 include LibXML
-tool=ProtXMLToGFFTool.new()
-@output_extension=".gff"
-@output_suffix=""
-exit unless tool.check_options(true,[:database])
-input_proxml=ARGV[0]
-if ( tool.explicit_output!=nil)
-    gff_out_file=tool.explicit_output
-  else
-    gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
+class NoGFFEntryFoundError < StandardError
 end
-gff_db = Bio::GFF.new()
-f = open(gff_out_file,'w+')
+class ProteinNotInDBError < StandardError
+end
+class MultipleGFFEntriesForProteinError < StandardError
+end
 def parse_proteins(protxml_file)
-  puts "Parsing proteins from protxml"
   protxml_parser=XML::Parser.file(protxml_file)
   protxml_doc=protxml_parser.parse
   proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
-  proteins
+  proteins.collect { |node| Protein.from_protxml(node)   }
+end
+def protein_id_to_gffid(protein_id,gff_idregex)
+	return protein_id if gff_idregex.nil?
+	return protein_id.match(/#{gff_idregex}/)[1]
+end
+def protein_id_to_genomeid(protein_id,genome_idregex)
+	return protein_id if genome_idregex.nil?
+	return protein_id.match(/#{genome_idregex}/)[1]
+end
+def protein_id_to_protdbid(protein_id)
+	# return protein_id.sub(/^lcl\|/,"")
+	return protein_id
 end
 def prepare_fasta(database_path,type)
@@ -50,134 +57,106 @@ def prepare_fasta(database_path,type)
     db_filename=Constants.new.current_database_for_name(database_path)
   end
-  db_indexfilename = "#{db_filename}.pin"
+  db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
   if File.exist?(db_indexfilename)
-    puts "Using existing indexed database"
     orf_lookup = FastaDB.new(db_filename)
   else
-    puts "Indexing database"
     orf_lookup = FastaDB.create(db_filename,db_filename,type)
   end
   orf_lookup
 end
-proteins = parse_proteins(input_proxml)
-fastadb = prepare_fasta(tool.database,'prot')
-genomedb = nil
-if tool.genome
-  genomedb = prepare_fasta(tool.genome,'nucl')
-end
-puts "Aligning peptides and writing GFF data..."
-low_prob = 0
-skipped = 0
-peptide_count = 0
-protein_count = 0
-total_peptides = 0
-peptides_covered_genome={}
-for prot in proteins
-  prot_prob = prot['probability']
-  if ( prot_prob.to_f < tool.protein_probability_threshold )
-    next
-  end
-  # Gets identifiers of all proteins (includeing indistinguishable ones)
-  prot_names=tool.protein_names(prot)
-  if tool.protein_find!=nil
-    prot_names=prot_names.keep_if { |pname| pname.include? tool.protein_find }
-  end
+tool=Tool.new([:explicit_output,:debug])
+tool.option_parser.banner = "Map proteins and peptides to genomic coordinates.\n\nUsage: protxml_to_gff.rb [options] proteins.<protXML>"
+tool.add_value_option(:database,nil,['-d filename','--database filename','Database used for ms/ms searches (Fasta Format)'])
+# tool.add_value_option(:genome,nil,['-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)'])
+tool.add_value_option(:coords_file,nil,['-c filename','--coords-file filename.gff3', 'A file containing genomic coordinates for predicted proteins and/or 6-frame translations'])
+tool.add_boolean_option(:stack_charge_states,false,['--stack-charge-states','Different peptide charge states get separate gff entries'])
+tool.add_value_option(:peptide_probability_threshold,0.95,['--threshold prob','Peptide Probability Threshold (Default 0.95)'])
+tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold prob','Protein Probability Threshold (Default 0.99)'])
+tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
+tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
-  peptides=tool.peptide_nodes(prot)
-  entries_covered=[]
-  for protein_name in prot_names
-    protein_count += 1
-    prot_id = "pr#{protein_count.to_s}"
-    begin
+exit unless tool.check_options(true,[:database,:coords_file])
-      protein_fasta_entry = tool.get_fasta_record(protein_name,fastadb)
-      protein_info = tool.cds_info_from_fasta(protein_fasta_entry)
+$protk = Constants.new
+log_level = tool.debug ? "info" : "warn"
+$protk.info_level= log_level
-      unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
-        protein_gff = tool.generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
+input_file=ARGV[0]
-        gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]
+if tool.explicit_output
+  output_fh=File.new("#{tool.explicit_output}",'w')
+else
+  output_fh=$stdout
+end
-        prot_seq = protein_fasta_entry.aaseq.to_s
-        throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
+should_ = tool.debug || (output_fh!=$stdout)
-        peptides_covered_protein=[]
-        peptide_count=1
-        for peptide in peptides
+input_protxml=ARGV[0]
-          pprob = peptide['nsp_adjusted_probability'].to_f
-          # puts peptide
-          # puts pprob
-          pep_seq = peptide['peptide_sequence']
+gffdb = GFFDB.create(tool.coords_file) if tool.coords_file
-          if ( pprob >= tool.peptide_probability_threshold && (!peptides_covered_protein.include?(pep_seq) || tool.stack_charge_states))
+# genome_db = prepare_fasta(tool.genome,'nucl')
+prot_db = prepare_fasta(tool.database,'prot')
-            dna_sequence=nil
-            if !protein_info.is_sixframe
-              throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
-              dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
-            end
+proteins = parse_proteins(input_protxml)
+num_missing_gff_entries = 0
-            peptide_gff = tool.generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
+proteins.each do |protein|
-            unless (peptide_gff.length==0 || tool.peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
+	begin
+		# Get the full protein sequence
+		#
+		parsed_name_for_protdb = protein_id_to_protdbid(protein.protein_name)
+		protein_entry = prot_db.get_by_id parsed_name_for_protdb
+		raise ProteinNotInDBError if ( protein_entry == nil)
-              tool.add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
+		protein.sequence = protein_entry.aaseq
-              gff_db.records += peptide_gff
+		# Get the CDS and parent entries from the gff file
+		#
+		parsed_name_for_gffid = protein_id_to_gffid(protein.protein_name,tool.gff_idregex)
+		gff_parent_entries = gffdb.get_by_id(parsed_name_for_gffid)
+		raise NoGFFEntryFoundError if gff_parent_entries.nil? || gff_parent_entries.length==0
+		raise MultipleGFFEntriesForProteinError if gff_parent_entries.length > 1
-              peptides_covered_protein << pep_seq unless tool.stack_charge_states
-              peptides_covered_genome[pep_seq] = peptide_gff[0].start
+		gff_parent_entry = gff_parent_entries.first
+		gff_cds_entries = gffdb.get_cds_by_parent_id(parsed_name_for_gffid)
-              total_peptides += 1
-              peptide_count+=1
-            else
-              puts "Duplicate peptide #{peptide_gff[0]}"
-            end
-#            puts gff_db.records.last
-          end
-        end
-      else
-        puts "Skipping redundant entry #{protein_name}"
-        protein_count-=1 # To counter +1 prior to begin rescue end block
-      end
+		# Account for sixframe case. Parent is CDS and there are no children
+		#
+		gff_cds_entries=[gff_parent_entry] if gff_cds_entries.nil? && gff_parent_entry.feature=="CDS"
-      entries_covered<<protein_info
+		peptides = tool.stack_charge_states ? protein.peptides : protein.representative_peptides
-#      puts protein_gff
-#      puts gff_db.records
-    rescue KeyError,EncodingError
-      skipped+=0
-    end
+		peptides.each do |peptide|
+			peptide_entries = peptide.to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
+			peptide_entries.each do |peptide_entry|
+				output_fh.write peptide_entry.to_s
+			end
+		end
-    # exit
-  end
+	rescue NoGFFEntryFoundError
+		$protk.log "No gff entry for #{parsed_name_for_gffid}", :info
+		num_missing_gff_entries+=1
+	rescue ProteinNotInDBError
+		$protk.log "No entry for #{parsed_name_for_protdb}", :info
+	rescue MultipleGFFEntriesForProteinError
+		$protk.log "Multiple entries in gff file for #{parsed_name_for_gffid}", :info
+	rescue PeptideNotInProteinError
+		$protk.log "A peptide was not found in its parent protein #{protein.protein_name}" , :warn
+	end
+end
+if num_missing_gff_entries>0
+	$protk.log "Failed to lookup gff entries. Try setting --gff-idregex" if tool.gff_idregex.nil?
 end
-f = open(gff_out_file,'w+')
-gff_db.records.each { |rec|
-  f.write(rec.to_s)
-}
-f.close
-p "Finished."
-p "Proteins: #{protein_count}"
-p "Skipped Decoys: #{skipped}"
-p "Total Peptides: #{total_peptides}"
-p "Peptides Written: #{total_peptides - low_prob}"
-p "Peptides Culled: #{low_prob}"
-exit(0)

data/bin/protxml_to_psql.rb CHANGED Viewed

@@ -242,7 +242,7 @@ def insert_psms_from_file(filepath)
 	spectrum_queries.each do |query|
-		spectrum_name = query.attributes['spectrum'].chomp.gsub("0","").sub(/\.\d+$/,"")
+		spectrum_name = query.attributes['spectrum'].chomp.gsub(/\.0+/,"\.").sub(/\.\d+$/,"")
 		start_scan=query.attributes['start_scan'].to_i
 		end_scan=query.attributes['end_scan'].to_i
@@ -318,7 +318,8 @@ def lookup_spectra_from_files(file_list,matched_spectra)
 				SQL
 			else
+				# require 'debugger';debugger
+				# puts "Unmatched spectrum #{spec[:title]}"
 			end
 			spec = mzml_parser.next_spectrum
 		end

data/bin/sixframe.rb CHANGED Viewed

@@ -29,6 +29,7 @@ tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage
 tool.add_boolean_option(:print_coords,false,['--coords', 'Write genomic coordinates in the fasta header'])
 tool.add_boolean_option(:keep_header,true,['--strip-header', 'Dont write sequence definition'])
 tool.add_value_option(:min_len,20,['--min-len','Minimum ORF length to keep'])
+tool.add_boolean_option(:write_gff,false,['--gff3','Output gff3 instead of fasta'])
 exit unless tool.check_options(true)
@@ -38,6 +39,9 @@ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
 output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
+if tool.write_gff
+  output_fh.write "##gff-version 3\n"
+end
 file = Bio::FastaFormat.open(input_file)
@@ -66,13 +70,11 @@ file.each do |entry|
           position_end=forward_position_end
         end
         # Create accession compliant with NCBI naming standard
         # See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
         ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
         ncbi_accession = "lcl|#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
+        gff_id = "#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
         defline=">#{ncbi_accession}"
@@ -84,11 +86,16 @@ file.each do |entry|
           defline << " #{entry.definition}"
         end
-        # Output in fasta format
-        # start and end positions are always relative to the forward strand
-        output_fh.write("#{defline}\n#{orf}\n")
+        if tool.write_gff
+          strand = frame>3 ? "-" : "+"
+          # score = self.nsp_adjusted_probability.nil? ? "." : self.nsp_adjusted_probability.to_s
+          # gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
+          output_fh.write("#{ncbi_scaffold_id}\tsixframe\tCDS\t#{position_start}\t#{position_end}\t.\t#{strand}\t0\tID=#{gff_id}\n")
+        else
+          # Output in fasta format
+          # start and end positions are always relative to the forward strand
+          output_fh.write("#{defline}\n#{orf}\n")
+        end
       end
       position += orf.length*3+3
     end

data/bin/swissprot_to_table.rb ADDED Viewed

@@ -0,0 +1,120 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 18/1/2011
+#
+# Query a Swissprot flat file and output the matching entries as a tab delimited table
+#
+#
+require 'protk/tool'
+require 'protk/swissprot_database'
+require 'protk/bio_sptr_extensions'
+require 'protk/fastadb'
+columns={'recname'=>"Primary Name",'cd'=>"CD Antigen Name",'altnames'=>"Alternate Names",
+      'location' => "Subcellular Location",
+      'function' => "Known Function",
+      'similarity' => "Similarity",
+      'tissues' => "Tissue Specificity",
+      'disease' => "Disease Association",
+      'domain' => "Domain",
+      'subunit' => "Sub Unit",
+      'nextbio' => "NextBio",
+      'ipi' => "IPI",
+      'intact' => "Interactions",
+      'pride' => 'Pride',
+      'ensembl'=> 'Ensembl',
+      'num_transmem'=>"Transmembrane Regions",
+      'signalp'=>'Signal Peptide',
+      'go_terms'=>"GO Terms",
+      'go_entries'=>"GO Entries",
+      'accessions'=>"Uniprot Accessions",
+      'ncbi_taxon_id'=>"NCBI Taxon ID"
+    }
+# Set up command-line options specific to this tool. Other options are inherited from Tool
+#
+tool=Tool.new([:explicit_output,:debug])
+tool.option_parser.banner = "Query a swissprot flat file and output to tab delimited table.\n\nUsage: swissprot_to_table.rb [options] -d flatfile.dat queries.txt"
+tool.add_value_option(:database,nil,['-d','--database file','Uniprot flatfile database containing full records for proteins'])
+tool.add_value_option(:output_keys,nil,['-K','--keys keys','Filter output to only the specified keys (comma separated)'])
+tool.add_boolean_option(:show_keys,false,['--show-keys','Print a list of possible values for the keys field and exit'])
+tool.add_value_option(:separator,"\t",['-S','--separator sep','Separator character for output, default (tab)'])
+tool.add_value_option(:array_separator,",",['-A','--array-separator sep','Array Separator character, default ,'])
+tool.add_value_option(:query_separator,"\t",['--query-separator sep','Separator character for queries.txt, default is tab'])
+tool.add_value_option(:id_column,1,['--id-column num','Column in queries.txt in which Uniprot Accessions are found'])
+if ARGV.include? "--show-keys"
+  columns.each_pair { |name, val| $stdout.write "#{name} (#{val})\n" }
+  exit
+end
+exit unless tool.check_options(true,[:database])
+$protk = Constants.new
+log_level = tool.debug ? :debug : :fatal
+$protk.info_level= log_level
+if tool.explicit_output
+  output_fh=File.new("#{tool.explicit_output}",'w')
+else
+  output_fh=$stdout
+end
+if tool.output_keys
+  output_keys=tool.output_keys.split(",").collect { |k| k.strip }
+  columns.delete_if { |key, value| !output_keys.include? key }
+end
+db_info=tool.database_info
+database_path=db_info.path
+database_index_path = "#{Pathname.new(database_path).dirname}/config.dat"
+skip_index = File.exists?(database_index_path) ? true : false
+swissprotdb=SwissprotDatabase.new(database_path,skip_index)
+def write_entry(item_name,item,columns,tool,output_fh)
+  row=[item_name]
+  row << columns.keys.collect do |name|
+    colvalue = item.send(name)
+    colvalue = "" unless colvalue
+    colvalue = colvalue.join(tool.array_separator) if colvalue.class==Array
+    colvalue
+  end
+  output_fh.write "#{row.join(tool.separator)}\n"
+end
+File.open(ARGV[0]).each_line do |line|
+  begin
+    query_id = line.chomp.split(tool.query_separator)[tool.id_column.to_i-1]
+  rescue
+    query_id = line.chomp
+  end
+  begin
+    item = swissprotdb.get_entry_for_name(query_id)
+    write_entry(query_id,item,columns,tool,output_fh)
+  rescue
+    $protk.log "Unable to retrieve entry for #{query_id}" , :debug
+  end
+end