RubyGems - protk - Versions diffs - 1.4.4.beta2 → 1.4.4 - Mend

protk 1.4.4.beta2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/README.md +7 -1
data/bin/maker_to_proteindb.rb +148 -0
data/bin/msgfplus_search.rb +6 -4
data/bin/protein_prophet.rb +13 -0
data/bin/protxml_to_gff.rb +11 -2
data/bin/sixframe.rb +6 -1
data/bin/tandem_search.rb +5 -3
data/lib/protk/constants.rb +12 -0
data/lib/protk/data/tandem_isb_kscore_defaults.xml +0 -0
data/lib/protk/data/tandem_isb_native_defaults.xml +0 -0
data/lib/protk/gff_to_proteindb_tool.rb +56 -0
data/lib/protk/peptide.rb +141 -19
data/lib/protk/peptide_mod.rb +42 -0
data/lib/protk/setup_rakefile.rake +13 -2
metadata +43 -40

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f278e7fe8e3a0955907a13952ced4fa0d772c204
-  data.tar.gz: 513fb214683486b9d3596a9b9f0956107d611170
+  metadata.gz: 1b59cd3751adc7a13c6dce81a90c2a4d739e1efe
+  data.tar.gz: 3ac1aba71a95b729101a6c64699cebaf1929fe34
 SHA512:
-  metadata.gz: e59fcb0724cbf42b8f63e65ca6d88fd91c7d45d3981964358c82e040f9235e2ce6768251d088966044827cadf5c9ce46f189345d3c27293d0b10b44daf018c10
-  data.tar.gz: 36329dca4cf416fc2b9bd6f0d395895ee2fd32fdba2f6ebd75c5d26ed21b497dc890cfc6bcf36b1f08bc7f3954bae76adea64450a431d6ac365a2930191997cc
+  metadata.gz: 6ab966131e53b6e379ba1df7717cc71e0a7ead11b603d16d474f4fdce487623546ca472c8624421d62af1e238f7e2917f8ed1e66c52a82d0fc4798cd7b1fdf6b
+  data.tar.gz: 05b98a731eb99063f17942fed42dce0e450488e61b11452b918269bd27788a300d7d2ebf4c2a67930c14bc5291dfab7bf8b13b992300356e96b0d274e68c6d96

data/README.md CHANGED

@@ -16,10 +16,16 @@ Protk is a suite of tools for proteomics. It aims to present a simple and consis
 Protk is a ruby gem and requires ruby 2.0 or higher with support for libxml2. To avoid installation problems we recommend using [rvm](https://rvm.io) to install ruby.
-``` shell
+```shell
     gem install protk
 ```
+On macOS you may need to do
+```bash
+	brew install coreutils
+```
 ## Ruby Compatibility
 In general Protk requires ruby with a version >=2.0.

data/bin/maker_to_proteindb.rb ADDED

@@ -0,0 +1,148 @@
+#!/usr/bin/env ruby
+#
+# This file is part of protk
+# Created by Ira Cooke 4/9/2013
+#
+#
+require 'protk/constants'
+require 'protk/tool'
+require 'protk/gff_to_proteindb_tool'
+require 'bio'
+# Command-line definition for this script. Besides the options requested from the
+# tool class, --prot-fasta names the fasta file that protein sequences are read from.
+tool=GffToProteinDBTool.new([:explicit_output,:debug,:add_transcript_info])
+tool.option_parser.banner = "Create a protein database from Maker gene prediction \
+output that is suitable for later processing by proteogenomics tools.\
+\n\nUsage: maker_to_proteindb.rb [options] maker.gff3"
+# The description is written on one line: inside a single-quoted string a trailing
+# backslash is not a line continuation and would show up literally in the help text.
+tool.add_value_option(:proteins_file,nil,['-p', '--prot-fasta proteins',
+  'A fasta file containing protein sequences for each transcript'])
+exit unless tool.check_options(true)
+# First remaining argument is the maker gff3 file to convert
+inname=ARGV.shift
+$protk = Constants.instance
+log_level = tool.debug ? :debug : :fatal
+$protk.info_level= log_level
+# Write to the explicitly named file when one was given, otherwise to standard
+# output. Progress printing is switched off when the database goes to stdout.
+if tool.explicit_output.nil?
+  outfile=$stdout
+  tool.print_progress=false
+else
+  outfile=File.open(tool.explicit_output,'w')
+  tool.print_progress=true
+end
+# Lines of the gene currently being accumulated
+gene_lines=[]
+# Fetch the sequence for one transcript from a fasta file using samtools.
+#
+# Runs `samtools faidx <proteins_file> <transcript_id>` and returns everything after
+# the first (header) line of its output as a String, newlines included. An empty
+# String is returned when samtools prints nothing.
+#
+# The arguments are handed to samtools as an argument vector instead of being
+# interpolated into a shell command line, so ids and paths containing spaces or
+# shell metacharacters are passed through literally. If samtools cannot be started
+# an exception is raised (parse_gene rescues and logs it).
+def get_protein_sequence(transcript_id,proteins_file)
+  IO.popen(["samtools","faidx",proteins_file.to_s,transcript_id.to_s]) do |io|
+    # Equivalent of `tail -n +2`: drop the fasta header line, keep the rest
+    io.readlines.drop(1).join
+  end
+end
+# Build the "start|end" header token for a CDS gff line of a given transcript.
+#
+# coding_sequence - a single gff line (String)
+# transcript_id   - id the line's trailing Parent= attribute must equal
+#
+# Returns "<start>|<end>" when the line is a CDS whose line ends with
+# Parent=<transcript_id>, and "" otherwise.
+def cds_to_header_text(coding_sequence,transcript_id)
+  # Regexp.escape: ids may contain '.', '|' or '+', which must be matched literally
+  cds_pattern=/CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{Regexp.escape(transcript_id)})$/
+  imatch=coding_sequence.match(cds_pattern)
+  return "" if imatch.nil?
+  "#{imatch[1]}|#{imatch[2]}"
+end
+# Build the fasta header line for one transcript (mRNA) gff line.
+#
+# The header has the form
+#   >lcl|<scaffold>_<fwd|rev>_<transcript_id> <start>|<end> [<cds_start>|<cds_end> ...]
+# where the scaffold is the first tab-separated column of the line and the
+# orientation is "rev" for strand "-" and "fwd" otherwise.
+#
+# CDS coordinates are appended only when tool.add_transcript_info is set, and only
+# for CDS lines that cds_to_header_text attributes to this transcript.
+def sequence_fasta_header(tool,transcript_line,coding_sequences)
+  tstart,tend,tstrand = transcript_line.match(/mRNA\t(\d+)\t(\d+).*?([-\+]{1})/).captures
+  tid = transcript_line.match(/ID=([^;]+)/).captures[0]
+  tstrandfr = tstrand=="-" ? "rev" : "fwd"
+  scaffold=transcript_line.split("\t")[0]
+  header=">lcl|#{scaffold}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
+  if tool.add_transcript_info
+    coding_sequences.each do |coding_sequence|
+      cds_text=cds_to_header_text(coding_sequence,tid)
+      # CDS lines of other transcripts give ""; appending those would only add stray spaces
+      header << " #{cds_text}" unless cds_text.empty?
+    end
+  end
+  header
+end
+# Concatenate the sequence fragments found at the end of each line.
+#
+# From every line the trailing run of word characters (optionally followed by a
+# closing "]") is taken; the fragments are joined in order. Lines without such a
+# run, for example blank lines, are skipped instead of raising.
+#
+# Returns the joined sequence as a String ("" for an empty list).
+def protein_sequence(protein_lines)
+  protein_lines.each_with_object("") do |line,seq|
+    residues=line[/(\w+)\]?$/,1]
+    seq << residues unless residues.nil?
+  end
+end
+# Convert the gff lines collected for one gene into fasta text.
+#
+# One fasta entry is produced per mRNA line: the header comes from
+# sequence_fasta_header and the sequence from get_protein_sequence, looked up by the
+# transcript's ID in tool.proteins_file.
+#
+# A transcript is skipped, and the reason logged at debug level, when building its
+# entry raises or when no sequence is returned. Header and sequence are appended
+# together so the output never contains a header without a sequence.
+#
+# Returns the concatenated fasta text (possibly empty).
+def parse_gene(tool,gene_lines)
+  transcripts=tool.get_lines_matching(/mRNA/,gene_lines)
+  coding_sequences=tool.get_lines_matching(/CDS/,gene_lines)
+  fasta_string=""
+  transcripts.each do |ts|
+    prot_id=ts.match(/ID=([^;]+)/).captures[0]
+    begin
+      fh=sequence_fasta_header(tool,ts,coding_sequences)
+      ps=get_protein_sequence(prot_id,tool.proteins_file)
+      if ps.to_s.strip.empty?
+        $protk.log "Unable to retrieve protein for #{prot_id} (no sequence returned)" , :debug
+      else
+        fasta_string << "#{fh}\n#{ps}"
+      end
+    rescue => e
+      $protk.log "Unable to retrieve protein for #{prot_id} #{e}" , :debug
+    end
+  end
+  fasta_string
+end
+# Stream the gff file, collecting the "maker" lines of one gene at a time and
+# writing each gene's fasta entries as soon as the next gene starts.
+# File.foreach closes the input once the last line has been read.
+File.foreach(inname) do |line|
+  line.chomp!
+  # start_new_gene is called for every line because it tracks the current gene id
+  if tool.start_new_gene(line) && !gene_lines.empty?
+    outfile.write parse_gene(tool,gene_lines)
+    gene_lines=[]
+  end
+  gene_lines << line if line =~ /maker/
+end
+# The last gene is not followed by another gene line, so it has to be flushed here
+outfile.write parse_gene(tool,gene_lines) unless gene_lines.empty?
+# Close an explicitly opened output file; standard output is left open
+outfile.close unless outfile.equal?($stdout)

data/bin/msgfplus_search.rb CHANGED

@@ -93,11 +93,13 @@ if for_galaxy || Pathname.new(database_path).extname.to_s.downcase != ".fasta"
 #   database_path="#{database_path}.fasta"
 end
+db_noext = "#{Pathname.new(database_path).sub_ext('')}"
 # Database must be indexed
-unless FileTest.exists?("#{database_path}.canno")
-  # dbdir = Pathname.new(database_path).dirname.to_s
+unless FileTest.exists?("#{db_noext}.canno")
   tdavalue=search_tool.decoy_search ? 1 : 0;
-  make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
+  genv.log "Database index not found at #{db_noext}. Building new index" , :info
+  make_msgfdb_cmd << "java -Xmx#{search_tool.java_mem} -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
 end
@@ -262,4 +264,4 @@ ARGV.each do |filename|
   #
   make_msgfdb_cmd=""
-end
+end

data/bin/protein_prophet.rb CHANGED

@@ -54,6 +54,19 @@ else
   output_file=Tool.default_output_path(inputs,".prot.xml",prophet_tool.output_prefix,@output_suffix)
 end
+genv.log("Checking input files ...",:info)
+inputs.each {|file_name|
+  throw "Missing input file #{file_name}" unless File.exist?(file_name)
+  file_pepxml = PepXML.new(file_name)
+  db_path=file_pepxml.extract_db()
+  throw "Unable to find database #{db_path} used for searching. Fix paths in input files first" unless File.exist?(db_path)
+}
 if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
   cmd="ProteinProphet "

data/bin/protxml_to_gff.rb CHANGED

@@ -38,7 +38,7 @@ end
 def protein_id_to_gffid(protein_id,gff_idregex)
 	return protein_id if gff_idregex.nil?
+	# require 'byebug'; byebug
 	m = protein_id.match(/#{gff_idregex}/)
 	if m
 		return m.captures[0]
@@ -77,7 +77,9 @@ def prepare_fasta(database_path,type)
   end
-  db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
+  db_indexfilename = type=='prot' ? "#{db_filename}.00.pin" : "#{db_filename}.nhr"
+#  require 'byebug';byebug
   if File.exist?(db_indexfilename)
     orf_lookup = FastaDB.new(db_filename)
@@ -101,6 +103,7 @@ tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold pro
 tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
 tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
 tool.add_value_option(:ignore_regex,nil,['--ignore-regex pre','Regex to match protein ids that we should ignore completely'])
+tool.add_value_option(:include_mods,false,['--include-mods','Output gff entries for peptide modification sites'])
 exit unless tool.check_options(true,[:database,:coords_file])
@@ -170,6 +173,12 @@ proteins.each do |protein|
 					peptide_entries.each do |peptide_entry|
 						output_fh.write peptide_entry.to_s
 					end
+					if tool.include_mods
+						mod_entries = peptide.mods_to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
+						mod_entries.each do |mod_entry|
+							output_fh.write mod_entry.to_s
+						end
+					end
 				end
 			end

data/bin/sixframe.rb CHANGED

@@ -64,7 +64,12 @@ file.each do |entry|
   length = entry.naseq.length
   (1...7).each do |frame|
-    translated_seq= entry.naseq.translate(frame)
+    begin
+      translated_seq= entry.naseq.translate(frame)
+    rescue => exception
+      puts "#{entry}"
+      exit 1
+    end
     orfs=translated_seq.split("*")
     orf_index = 0
     position = ((frame - 1) % 3) + 1

data/bin/tandem_search.rb CHANGED

@@ -22,8 +22,10 @@ exit unless search_tool.check_options(true)
 # Our environment should be setup so that tandem or tandem.exe is on the path
 #
-tandem_bin=%x[which tandem].chomp
-tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
+# tandem_bin=%x[which tandem].chomp
+# tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
+# tandem_bin
 @output_suffix="_tandem"
@@ -70,7 +72,7 @@ ARGV.each do |filename|
     # The basic command
     #
-    cmd= "#{tandem_bin} #{params_path}"
+    cmd= "#{genv.tandem_bin} #{params_path}"
     # Add a cleanup command unless the user wants to keep params files
     #

data/lib/protk/constants.rb CHANGED

@@ -63,6 +63,18 @@ class Constants
     "#{@protk_dir}/tools/msgfplus"
   end
+  # Locate an executable that may be installed under one of several names.
+  #
+  # exec_name_list - candidate names, tried in order
+  #
+  # Returns the path printed by `which` for the first candidate that yields a
+  # non-empty result. Throws when none of the candidates can be located.
+  def get_path_for_executable(exec_name_list)
+    exec_name_list.each do |candidate|
+      located=`which #{candidate}`.chomp
+      return located if located && !located.empty?
+    end
+    throw "Unable to locate #{exec_name_list}"
+  end
+  # Path of the tandem executable, looked up on the PATH as "tandem" and then
+  # as "tandem.exe"
+  def tandem_bin
+    get_path_for_executable(%w[tandem tandem.exe])
+  end
   def msgfplusjar
     msgfplus_path=%x[which MSGFPlus.jar]
     msgfplus_path.chomp

data/lib/protk/data/tandem_isb_kscore_defaults.xml CHANGED

File without changes

data/lib/protk/data/tandem_isb_native_defaults.xml CHANGED

File without changes

data/lib/protk/gff_to_proteindb_tool.rb ADDED

@@ -0,0 +1,56 @@
+#
+# This file is part of protk
+# Created by Ira Cooke 9/3/2017
+#
+# Provides common functionality used by tools that convert gff to a protein database
+#
+# These tools read a gff and then write out protein entries in the following format
+#
+# >lcl|<scaffold_id>_<orientation>_<transcript_id> gene_start|gene_end cds1_start|cds1_end cds2_start|cds2_end ...
+#
+require 'optparse'
+require 'pathname'
+require 'protk/tool'
+# Shared behaviour for tools that read a gff file and write a protein database
+class GffToProteinDBTool < Tool
+  attr_accessor :print_progress
+  # Sets up the command-line options common to gff-to-proteindb tools.
+  #
+  # option_support - Array of option symbols, passed on to the Tool initializer.
+  #                  When it includes :add_transcript_info the boolean --info
+  #                  option is added as well.
+  def initialize(option_support=[])
+    super(option_support)
+    if option_support.include?(:add_transcript_info)
+      add_boolean_option(:add_transcript_info,false,['--info','Include CDS Coordinates'])
+    end
+    @option_parser.summary_width=40
+    @capturing_gene=false
+    @current_gene=nil
+  end
+  # Reports whether a gff line starts a gene other than the one seen last.
+  #
+  # Only lines matching /maker\sgene/ are considered. The id from the line's ID=
+  # attribute is remembered, so repeated lines for the same gene return false.
+  #
+  # Returns true when the line starts a new gene, false otherwise.
+  def start_new_gene(line)
+    return false unless line =~ /maker\sgene/
+    new_gene = line.match(/ID=([^;]+)/).captures[0]
+    return false if new_gene==@current_gene
+    @current_gene=new_gene
+    true
+  end
+  # Returns a new Array with the lines of gene_lines that match pattern, in order
+  def get_lines_matching(pattern,gene_lines)
+    gene_lines.select { |line| line =~ pattern }
+  end
+end

data/lib/protk/peptide.rb CHANGED

@@ -3,12 +3,15 @@ require 'bio'
 require 'protk/bio_gff3_extensions'
 require 'protk/mzidentml_doc'
 require 'protk/error'
+require 'protk/peptide_mod'
+# require 'protk/indistinguishable_peptide'
 include LibXML
 class PeptideNotInProteinError < ProtkError
 end
 class Peptide
 	# Stripped sequence (no modifications)
@@ -17,6 +20,9 @@ class Peptide
 	attr_accessor :charge
 	attr_accessor :probability
 	attr_accessor :theoretical_neutral_mass
+	attr_accessor :modifications
+	attr_accessor :modified_sequence
+	attr_accessor :indistinguishable_peptides
 	def as_protxml
 		node = XML::Node.new('peptide')
@@ -33,6 +39,27 @@ class Peptide
 			pep.sequence=xmlnode['peptide_sequence']
 			pep.probability=xmlnode['nsp_adjusted_probability'].to_f
 			pep.charge=xmlnode['charge'].to_i
+			# This deals with the case where mods are on the primary peptide
+			#
+			mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')
+			# The pepXML spec says there can be multiple modification_info's but in practice there never is.
+			# We assume either 1 or 0
+			if ( mod_info_node.length > 0 )
+				throw "Encountered multiple modification_info nodes for a peptide" if mod_info_node.length > 1
+				pep.modified_sequence = mod_info_node[0]['modified_peptide']
+				mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
+				# require 'byebug';byebug
+				pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
+			end
+			# This deals with indistinguishable peptides
+			#
+			ips = xmlnode.find('protxml:indistinguishable_peptide','protxml:http://regis-web.systemsbiology.net/protXML')
+			# require 'byebug';byebug
+			pep.indistinguishable_peptides = ips.collect { |e| IndistinguishablePeptide.from_protxml(e) }
 			pep
 		end
@@ -55,17 +82,23 @@ class Peptide
 			pep.charge = best_psm.attributes['chargeState'].to_i
 			pep.protein_name = mzid_doc.get_dbsequence(xmlnode.parent,xmlnode.parent.attributes['dBSequence_ref']).attributes['accession']
-			# pep.charge = MzIdentMLDoc.get_charge_for_psm(best_psm)
 			pep
 		end
 		def from_sequence(seq,charge=nil)
 			pep=new()
-			pep.sequence=seq
+			pep.modifications = pep.modifications_from_sequence(seq)
+			pep.modified_sequence = seq
+			seq = seq.sub(/^n\[[0-9]+?\]/,"")
+			pep.sequence = seq.gsub(/[0-9\.\[\]]/,"")
 			pep.charge=charge
 			pep
 		end
 		private :new
 	end
@@ -73,6 +106,26 @@ class Peptide
 	end
+	# Extract residue modifications from a modified peptide string such as
+	# "N[115]VMN[115]LTPAETQ[129]QLHAALESQLSPGELAK".
+	#
+	# A leading n-terminal annotation ("n[43]" or "n[43.01]") is removed first and is
+	# not reported as a modification. Each remaining "<residue>[<mass>]" yields one
+	# PeptideMod whose position is the 1-based index of the residue in the stripped
+	# (unmodified) sequence.
+	#
+	# Returns an Array of PeptideMod (empty when the string has no bracketed masses).
+	def modifications_from_sequence(seq)
+		# Accept decimal masses here too, as the residue pattern below already does;
+		# otherwise the annotation stays in place and shifts every position.
+		residue_seq = seq.sub(/\An\[[0-9\.]+\]/,"")
+		# Number of bracket/mass characters seen so far; these are not residues
+		bracket_chars = 0
+		mods = []
+		residue_seq.scan(/([A-Z])\[([0-9\.]+)\]/) do |residue,mass_text|
+			position = Regexp.last_match.begin(0)+1-bracket_chars
+			mods << PeptideMod.from_data(position,residue,mass_text.to_f)
+			bracket_chars += mass_text.length+2
+		end
+		mods
+	end
 	# Expects prot_seq not to contain explicit stop codon (ie * at end)
 	# AA coords are 0-based unlike genomic coords which are 1 based
 	#
@@ -112,23 +165,61 @@ class Peptide
 		throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
 		throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
-		on_reverse_strand = (parent_record.strand=="-") ? true : false
 		aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates
+		gff_records_for_coords_in_protein(aa_coords,self.sequence.length,parent_record,cds_records)
+	end
+	# Build gff records for every modification site of this peptide and of its
+	# indistinguishable peptides.
+	#
+	# prot_seq      - sequence of the parent protein, used to locate the peptide
+	# parent_record - Bio::GFF::GFF3::Record of the parent feature
+	# cds_records   - Array of CDS records the protein is translated from
+	#
+	# Every modification is mapped as a feature one amino acid long. Modifications
+	# of indistinguishable peptides are placed relative to this peptide's start.
+	#
+	# Returns a flat Array of records, this peptide's modifications first. Throws
+	# when parent_record or cds_records has an unexpected class.
+	def mods_to_gff3_records(prot_seq,parent_record,cds_records)
+		throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
+		throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
+		pep_aa_coords = coords_in_protein(prot_seq,false)
+		# Pairs of [modifications, modified sequence used in the record id]
+		mod_sources = [[self.modifications,self.modified_sequence]]
+		unless self.indistinguishable_peptides.nil?
+			self.indistinguishable_peptides.each { |ip| mod_sources << [ip.modifications,ip.modified_sequence] }
+		end
+		mod_records = []
+		mod_sources.each do |mods,mod_sequence|
+			next if mods.nil?
+			mods.each do |mod|
+				# mod.position counts from 1 within the peptide while protein coordinates
+				# count from 0, hence the -1 (applied to the primary peptide as well)
+				prot_position = mod.position+pep_aa_coords[:start]-1
+				mod_aa_coords = {:start => prot_position, :end => prot_position+1}
+				mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => mod_sequence})
+			end
+		end
+		mod_records.flatten
+	end
+	def gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"})
+		on_reverse_strand = (parent_record.strand=="-") ? true : false
 		ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort
 		# Initial position is the number of NA's from the start of translation
 		#
-		pep_nalen = self.sequence.length*3
+		pep_nalen = seqlen*3
 		i = 0; #Current protein position (in nucleic acids)
 		pep_start_i = aa_coords[:start]*3
-		pep_end_i = pep_start_i+self.sequence.length*3
-		fragments=[]
+		pep_end_i = pep_start_i+seqlen*3
+		gff_records=[]
 		ordered_cds_records.each do |cds_record|
-			fragment = nil
+			gff_record = nil
 			fragment_len = 0
 			if on_reverse_strand
@@ -139,16 +230,16 @@ class Peptide
 					fragment_end = cds_record.end
 					fragment_len = [cds_record.length,pep_end_i-i].min
 					fragment_start = fragment_end-fragment_len+1
-					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+					gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
 				elsif before_len>0
 					fragment_end = cds_record.end - before_len
 					fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
 					fragment_start = fragment_end - fragment_len + 1
 					if fragment_len>0
-						fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+						gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
 					end
 				else
-					fragment=nil
+					gff_record=nil
 				end
 			else
 				in_peptide = (i<pep_end_i) && (i>=pep_start_i)
@@ -157,33 +248,64 @@ class Peptide
 					fragment_start = cds_record.start
 					fragment_len = [cds_record.length,pep_end_i-i].min
 					fragment_end = fragment_start+fragment_len-1
-					fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+					gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
 				elsif before_len>0
 					fragment_start = cds_record.start + before_len
 					fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
 					fragment_end = fragment_start + fragment_len-1
 					if fragment_len>0
-						fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
+						gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
 					end
 				else
-					fragment=nil
+					gff_record = nil
 				end
 			end
 			i+=cds_record.length
-			fragments << fragment unless fragment.nil?
+			gff_records << gff_record unless gff_record.nil?
 		end
-		fragments
+		gff_records
 	end
-	def gff_record_for_peptide_fragment(start_i,end_i,parent_record)
+	def gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info)
 		cds_id = parent_record.id
-		this_id = "#{cds_id}.#{self.sequence}"
+		mod_sequence = record_info[:modified_sequence]
+		this_id = mod_sequence ? "#{cds_id}.#{mod_sequence}" : "#{cds_id}.#{self.sequence}"
 		this_id << ".#{self.charge}" unless self.charge.nil?
+		mod = record_info[:mod]
+		this_id << ".#{mod.position}.#{mod.mass}" unless mod.nil?
 		score = self.probability.nil? ? "." : self.probability.to_s
-		gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
+		record_type = mod.nil? ? record_info[:type] : "#{record_info[:type]}_#{mod.amino_acid}"
+		gff_string = "#{parent_record.seqid}\tMSMS\t#{record_type}\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
 		Bio::GFF::GFF3::Record.new(gff_string)
 	end
+end
-end
+#             <indistinguishable_peptide peptide_sequence="MEYENTLTAAMK" charge="2" calc_neutral_pep_mass="1416.63">
+#             <modification_info modified_peptide="M[147]EYENTLTAAMK"/>
+#             </indistinguishable_peptide>
+class IndistinguishablePeptide < Peptide
+	class << self
+		# Build an IndistinguishablePeptide from an indistinguishable_peptide protXML node.
+		#
+		# Sequence and charge come from the node's attributes. When the node has a
+		# modification_info child, the modified sequence is taken from it and the
+		# modifications from its mod_aminoacid_mass children or, if there are none,
+		# parsed out of the modified sequence itself.
+		#
+		# Throws when more than one modification_info child is present.
+		def from_protxml(xmlnode)
+			protxml_ns = 'protxml:http://regis-web.systemsbiology.net/protXML'
+			pep=new()
+			pep.sequence=xmlnode['peptide_sequence']
+			pep.charge=xmlnode['charge'].to_i
+			mod_info_nodes = xmlnode.find('protxml:modification_info',protxml_ns)
+			return pep unless mod_info_nodes.length > 0
+			throw "Encountered multiple modification_info nodes for an indistinguishable peptide" if mod_info_nodes.length > 1
+			mod_info = mod_info_nodes[0]
+			pep.modified_sequence = mod_info['modified_peptide']
+			mass_nodes = mod_info.find('protxml:mod_aminoacid_mass',protxml_ns)
+			pep.modifications = if mass_nodes.length > 0
+				mass_nodes.collect { |e| PeptideMod.from_protxml(e) }
+			else
+				pep.modifications_from_sequence(pep.modified_sequence)
+			end
+			pep
+		end
+	end
+end

data/lib/protk/peptide_mod.rb ADDED

@@ -0,0 +1,42 @@
+require 'libxml'
+require 'bio'
+include LibXML
+# A single modified amino acid within a peptide
+class PeptideMod
+	# Position of the modified residue within the peptide. In the protXML example
+	# below positions count from 1 (position="9" is the first C[160]).
+	attr_accessor :position
+	# One-letter code of the modified residue. Set by from_data only; from_protxml
+	# leaves it nil.
+	attr_accessor :amino_acid
+	# Mass value given for the modification (Float)
+	attr_accessor :mass
+	class << self
+		# Build a PeptideMod from a mod_aminoacid_mass node, for example one of
+		#
+		# <modification_info modified_peptide="GFGFVTYSC[160]VEEVDAAMC[160]ARPHK">
+		# <mod_aminoacid_mass position="9" mass="160.030600"/>
+		# <mod_aminoacid_mass position="18" mass="160.030600"/>
+		# </modification_info>
+		def from_protxml(xmlnode)
+			new.tap do |pepmod|
+				pepmod.position = xmlnode['position'].to_i
+				pepmod.mass = xmlnode['mass'].to_f
+			end
+		end
+		# Build a PeptideMod directly from its three values
+		def from_data(position,amino_acid,mass)
+			new.tap do |pepmod|
+				pepmod.position = position
+				pepmod.amino_acid = amino_acid
+				pepmod.mass = mass
+			end
+		end
+		# Instances are created through the factory methods above
+		private :new
+	end
+	def initialize()
+	end
+end

data/lib/protk/setup_rakefile.rake CHANGED

@@ -153,6 +153,12 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
 	sh %{cd #{@build_dir};tar -xvzf TPP-#{tpp_version}.tgz}
+	sh %{cp ~/Desktop/singleton.hpp #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/../extern/ProteoWizard/pwiz/libraries/boost_aux/boost/utility/singleton.hpp}
+	sh %{cp ~/Desktop/MascotScoreParser.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/DiscriminateFunction/Mascot/MascotScoreParser.h}
+	sh %{cp ~/Desktop/PTMProphetParser.cxx #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/PTMProphetParser/PTMProphetParser.cxx}
+	sh %{cp ~/Desktop/RespectFilter.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/Respect/RespectFilter.h}
 	File.open("#{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Makefile.config.incl","wb") do |f|
 		f.write "TPP_ROOT=#{env.tpp_root}/\nTPP_WEB=/tpp/\nXSLT_PROC=/usr/bin/xsltproc\nCGI_USERS_DIR=${TPP_ROOT}cgi-bin/"
 	end
@@ -173,13 +179,18 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
 		makefile_text = File.read("#{makefile_path}")
 		File.open("#{makefile_path}","w+") do |f|
-			subs_text = makefile_text.gsub("cp -rfu","cp -rf")
+			subs_text = makefile_text.gsub("cp -rfu","cp -rf").gsub("-Werror","")
+#			subs_text = subs_text.gsub("-Werror","")
 			f.write subs_text
 		end
 	end
 	sh %{cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src;echo '' > ../perl/tpp_models.pl;echo '' > ../perl/exporTPP.pl;echo '' > ../CGI/show_nspbin.pl;echo '' > ../CGI/tpp_gui/tpp_gui.pl}
-	build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
+#	build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
+	build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make"
 	install_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make install"
 	env.log build_cmd, :info
 	sh %{#{build_cmd}}

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: protk
 version: !ruby/object:Gem::Version
-  version: 1.4.4.beta2
+  version: 1.4.4
 platform: ruby
 authors:
 - Ira Cooke
@@ -14,170 +14,170 @@ dependencies:
   name: open4
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.3'
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.3.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '1.3'
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.3.0
 - !ruby/object:Gem::Dependency
   name: bio
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.4.3
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.4.3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.4.3
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.4.3
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.6.7
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.7
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 1.6.7
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.7
 - !ruby/object:Gem::Dependency
   name: net-ftp-list
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 3.2.5
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 3.2.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: 3.2.5
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 3.2.5
 - !ruby/object:Gem::Dependency
   name: libxml-ruby
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.7'
-    - - '>='
+        version: '2.9'
+    - - ">="
       - !ruby/object:Gem::Version
-        version: 2.7.0
+        version: 2.9.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.7'
-    - - '>='
+        version: '2.9'
+    - - ">="
       - !ruby/object:Gem::Version
-        version: 2.7.0
+        version: 2.9.0
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
 - !ruby/object:Gem::Dependency
   name: rspec-mocks
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.0'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.5'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '3.5'
 - !ruby/object:Gem::Dependency
   name: sqlite3
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 description: Commandline tools for proteomics
@@ -202,6 +202,7 @@ executables:
 - uniprot_mapper.rb
 - sixframe.rb
 - augustus_to_proteindb.rb
+- maker_to_proteindb.rb
 - protxml_to_gff.rb
 - protxml_to_table.rb
 - swissprot_to_table.rb
@@ -222,6 +223,7 @@ files:
 - bin/filter_psms.rb
 - bin/interprophet.rb
 - bin/make_decoy.rb
+- bin/maker_to_proteindb.rb
 - bin/manage_db.rb
 - bin/mascot_search.rb
 - bin/mascot_to_pepxml.rb
@@ -283,6 +285,7 @@ files:
 - lib/protk/fastadb.rb
 - lib/protk/galaxy_stager.rb
 - lib/protk/galaxy_util.rb
+- lib/protk/gff_to_proteindb_tool.rb
 - lib/protk/gffdb.rb
 - lib/protk/manage_db_rakefile.rake
 - lib/protk/manage_db_tool.rb
@@ -292,6 +295,7 @@ files:
 - lib/protk/omssa_util.rb
 - lib/protk/openms_defaults.rb
 - lib/protk/peptide.rb
+- lib/protk/peptide_mod.rb
 - lib/protk/pepxml.rb
 - lib/protk/pepxml_writer.rb
 - lib/protk/physical_constants.rb
@@ -324,19 +328,18 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>'
+  - - ">="
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.1
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: Proteomics Toolkit
 test_files: []
-has_rdoc: