RubyGems - protk - Versions diffs - 1.2.6.pre1 → 1.2.6.pre2 - Mend

protk 1.2.6.pre1 → 1.2.6.pre2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +7 -0
data/README.md +22 -27
data/bin/blastxml_to_table.rb +50 -3
data/bin/make_decoy.rb +30 -2
data/bin/mascot_search.rb +46 -27
data/bin/msgfplus_search.rb +7 -4
data/bin/peptide_prophet.rb +9 -0
data/bin/protxml_to_gff.rb +122 -66
data/bin/protxml_to_table.rb +26 -3
data/bin/tandem_search.rb +1 -1
data/lib/protk/constants.rb +19 -19
data/lib/protk/data/default_config.yml +0 -7
data/lib/protk/search_tool.rb +7 -0
metadata +118 -90
data/bin/mascot2xml.rb +0 -87
data/ext/protk/simplealign/simplealign.c +0 -17
data/lib/protk/data/pepxml_mascot_template.xml +0 -29
data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: c2cb8ece1038575e27d4d2bb8d5eff3ff2367a33
+  data.tar.gz: db9cd97c0b87186e7a84c53ed43e132b25d1cd15
+SHA512:
+  metadata.gz: df19012cdb42d14136a4f4682a3a45b2423b4b216bcdd8f959e0eaab5b7bcaa0546b5550d3a69accc1da4e158966150bb9f0e21d52c0ed2bf2c0397ca333195c
+  data.tar.gz: a08a7bd13e3027b21bdd24cf13491e6041b05a19f3f066468a96357b543332e40b4827356de2d4b81e1d7a67daa44a73b283b1948f7824f33c6cf738df46383d

data/README.md CHANGED Viewed

@@ -8,43 +8,38 @@ Protk is a wrapper for various proteomics tools. It aims to present a consistent
 ***
+## Table of Contents
+* [Protk](#what-is-it?)
+    * [Installation](#installation)
+    * [Configuration](#configuration)
-## Basic Installation
+## Installation
-Protk depends on ruby 1.9.  The recommended way to install ruby and manage ruby gems is with rvm. Install rvm using this command.
-```sh
-curl -L https://get.rvm.io | bash -s stable
+The easiest installation method is to use rubygems.  You might need to install the libxml2 package on your system first (e.g. libxml-dev on Ubuntu).
+``` shell
+    gem install protk
 ```
-Next install ruby and protk's dependencies
+## Configuration
-On OSX
+By default protk will install tools and databases into `.protk` in your home directory.  If this is not desirable you can change the protk root default by setting the environment variable `PROTK_INSTALL_DIR`. You can also avoid using a `.protk` directory altogether (see below)
-```sh
-rvm install 1.9.3 --with-gcc=clang
-rvm use 1.9.3
-gem install protk
-protk_setup.rb package_manager
-protk_setup.rb system_packages
-protk_setup.rb all
-```
-On Linux
-```sh
-rvm install 1.9.3
-rvm use 1.9.3
-gem install protk
-sudo ~/.rvm/bin/rvm 1.9.3 do protk_setup.rb system_packages
-protk_setup all
+Protk includes a setup tool to install various third-party proteomics tools such as the TPP, OMSSA, MS-GF+ and Proteowizard.  If this tool is used, it installs everything under `.protk/tools`.  To perform such an installation, use:
+```shell
+    protk_setup.rb tpp omssa blast msgfplus pwiz
 ```
-Instead off using protk_setup.rb all it might be preferable to only install some of the protk tool dependencies.  'all' is just an alias for the following full target list, any of which can be omitted with the consequence that tools depending on that component will not be available.
+Alternatively, these tools may already be present on your system, or you may prefer to install them yourself.  In that case simply ensure that all executables are included in your `$PATH`. Those executables will be used as a fallback if nothing is available under the `.protk` installation directory.
+Instead of using `protk_setup.rb all`, it might be preferable to install only some of the protk tool dependencies.  'all' is just an alias for the following full target list, any of which can be omitted with the consequence that tools depending on that component will not be available.
-```sh
-protk_setup.rb tpp omssa blast msgfplus pwiz openms galaxyenv
-```
 ## Sequence databases

data/bin/blastxml_to_table.rb CHANGED Viewed

@@ -10,7 +10,6 @@ require 'bio'
 require 'protk/fastadb'
 require 'bio-blastxmlparser'
 tool=Tool.new([:explicit_output])
 tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
@@ -19,6 +18,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for B
   tool.options.database=file
 end
+tool.options.gene2go=nil
+tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
+	tool.options.gene2go=gene2go
+end
+tool.options.gitogeneid=nil
+tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
+	tool.options.gitogeneid=gitogeneid
+end
 exit unless tool.check_options
 #require 'debugger';debugger
@@ -36,11 +45,43 @@ if tool.database
 	$fastadb=FastaDB.new(tool.database)
 end
+$gitogeneid = nil
+if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
+	require 'gdbm'
+	$gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
+end
+$gene2go = nil
+if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
+	require 'gdbm'
+	$gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
+end
+def gi_from_hit_id(hit_id)
+	gi_scan=hit_id.scan(/gi\|(\d+)/)
+	gi_scan.join("")
+end
 def generate_line(hsp,hit,query,hit_seq=nil)
-	line="#{query.query_id}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
+	line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
 	if hit_seq
 		line << "\t#{hit_seq}"
 	end
+	geneid=""
+	goterm=""
+	if $gitogeneid
+		geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
+		goterm=$gene2go[geneid] if geneid!=nil	&& $gene2go
+	end
+	# throw "No geneid" if geneid==nil
+	line << "\t#{geneid}\t#{goterm}"
+#	require 'debugger';debugger
+#	puts gi_from_hit_id(hit.hit_id)
+#	puts $gene2go[gi_from_hit_id(hit.hit_id)]
 	line<<"\n"
 	line
 end
@@ -61,12 +102,18 @@ blast.each do |query|
 #	if hit
 		hit_seq=fetch_hit_seq(hit)
 		hit.hsps.each do |hsp|
-			out_file.write generate_line(hsp,hit,query,hit_seq)
+			out_line=generate_line(hsp,hit,query,hit_seq)
+			out_file.write out_line
 		end
 #	end
 	end
 end
+$gitogeneid.close if $gitogeneid!=nil
+$gene2go.close if $gene2go!=nil
 #require 'debugger';debugger
 #puts "Hi"

data/bin/make_decoy.rb CHANGED Viewed

@@ -32,6 +32,16 @@ tool.option_parser.on('-P str','--prefix-string str','String to prepend to seque
   tool.options.prefix_string=str
 end
+tool.options.reverse_only=false
+tool.option_parser.on('--reverse-only','Just reverse sequences. Dont try to randomize') do
+  tool.options.reverse_only=true
+end
+tool.options.id_regex=".*?\\|(.*?)[ \\|]"
+tool.option_parser.on('--id-regex regex','Regex for finding IDs. If reverse-only is used then this will be used to find ids and prepend with the decoy string. Default .*?\\|(.*?)[ \\|]') do regex
+  tool.options.id_regex=regex
+end
 tool.options.append=false
 tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
   tool.options.append=true
@@ -64,7 +74,23 @@ genv=Constants.new()
 decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
-Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
+if (tool.reverse_only)
+	decoys_out = File.open(decoys_tmp_file,'w+')
+	Bio::FastaFormat.open(input_file).each do |seq|
+		id=nil
+		begin
+			id=seq.definition.scan(/#{id_regex}/)[0][0]
+			revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
+			decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
+		rescue
+			puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
+		end
+	end
+	decoys_out.close
+else
+	Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
+end
 cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
 # Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
@@ -76,4 +102,6 @@ p cmd
 job_params= {:jobid => tool.jobid_from_filename(input_file) }
 job_params[:queue]="lowmem"
 job_params[:vmem]="900mb"
-tool.run(cmd,genv,job_params)
+tool.run(cmd,genv,job_params)

data/bin/mascot_search.rb CHANGED Viewed

@@ -22,7 +22,7 @@ def login(mascot_cgi,username,password)
     authdict[:savecookie]="1"
     p "Logging in to #{mascot_cgi}/login.pl"
-    p authdict
     response = RestClient.post "#{mascot_cgi}/login.pl", authdict
     cookie = response.cookies
@@ -40,6 +40,8 @@ def download_datfile(mascot_cgi,results_date,results_file,explicit_output,openur
         output_path="#{results_file}"
     end
+    puts "Writing output to #{output_path}"
     require 'open-uri'
     open("#{output_path}", 'wb') do |file|
         file << open("#{get_url}","Cookie"=>openurlcookie).read
@@ -144,6 +146,12 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
     search_tool.options.export_format=format
 end
+search_tool.options.download_only=nil
+search_tool.option_parser.on( '--download-only path', 'Specify a path to an existing results file for download eg(20131113/F227185.dat)' ) do |path|
+    search_tool.options.download_only=path
+end
 search_tool.options.timeout=200
 search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
     search_tool.options.timeout=seconds.to_i
@@ -151,8 +159,9 @@ end
 exit unless search_tool.check_options
-if ( ARGV[0].nil? )
+if ( ARGV[0].nil? && search_tool.download_only.nil?)
     puts "You must supply an input file"
+    puts search_tool.download_only
     puts search_tool.option_parser
     exit
 end
@@ -167,7 +176,6 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
 end
 RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
-$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
 cookie=""
 openurlcookie=""
@@ -178,36 +186,47 @@ if ( search_tool.use_security)
     openurlcookie = "MASCOT_SESSION=#{cookie['MASCOT_SESSION']}; MASCOT_USERID=#{cookie['MASCOT_USERID']}; MASCOT_USERNAME=#{cookie['MASCOT_USERNAME']}"
 end
-postdict = search_params_dictionary search_tool, ARGV[0]
-$genv.log("Sending #{postdict}",:info)
+if ( !search_tool.download_only.nil?)
+    parts=search_tool.download_only.split("/")
+    throw "Must provide a path of the format date/filename" unless parts.length==2
+    results_date=parts[0]
+    results_file=parts[1]
+    download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
+else
+    #$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
-#site = RestClient::Resource.new(mascot_cgi, timeout=300)
-#search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
+    postdict = search_params_dictionary search_tool, ARGV[0]
+    $genv.log("Sending #{postdict}",:info)
-search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
+    #site = RestClient::Resource.new(mascot_cgi, timeout=300)
+    #search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
+    search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
-#search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
-$genv.log("Mascot search response was #{search_response}",:info)
+    #search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
-# Look for an error if there is one
-error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
-if ( error_result != nil )
-    puts error_result[0]
-    $genv.log("Mascot search failed with response #{search_response}",:warn)
-    throw "Mascot search failed with response #{search_response}"
-elsif (search_tool.export_format=="mascotdat")
-    # Search for the location of the mascot data file in the response
-    results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
-    results_date=results[1]
-    results_file=results[2]
+    $genv.log("Mascot search response was #{search_response}",:info)
-    download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
-else
-    results=/master_results_?2?\.pl\?file=(\.*\/data\/.*\/.+\.dat)/.match(search_response)
-    results_file = results[1]
-    export_results mascot_cgi,cookie,results_file,search_tool.export_format, openurlcookie
-#    export_results mascot_cgi,cookie,results_file,search_tool.export_format
+    # Look for an error if there is one
+    error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
+    if ( error_result != nil )
+        puts error_result[0]
+        $genv.log("Mascot search failed with response #{search_response}",:warn)
+        throw "Mascot search failed with response #{search_response}"
+    elsif (search_tool.export_format=="mascotdat")
+        # Search for the location of the mascot data file in the response
+        results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
+        results_date=results[1]
+        results_file=results[2]
+        download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
+    else
+        results=/master_results_?2?\.pl\?file=(\.*\/data\/.*\/.+\.dat)/.match(search_response)
+        results_file = results[1]
+        export_results mascot_cgi,cookie,results_file,search_tool.export_format, openurlcookie
+    #    export_results mascot_cgi,cookie,results_file,search_tool.export_format
+    end
 end

data/bin/msgfplus_search.rb CHANGED Viewed

@@ -5,6 +5,7 @@
 #
 # Runs an MS/MS search using the MSGFPlus search engine
 #
 $VERBOSE=nil
 require 'protk/constants'
 require 'protk/command_runner'
@@ -18,7 +19,7 @@ input_stager = nil
 # Setup specific command-line options for this tool. Other options are inherited from SearchTool
 #
 search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
-  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
+  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:cleavage_semi])
 search_tool.jobid_prefix="p"
 search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
@@ -164,10 +165,10 @@ ARGV.each do |filename|
     # The basic command
     #
     cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{mzid_output_path} "
-    #Missed cleavages
+    #Semi tryptic peptides
     #
-    throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages.to_i > 2)
-    cmd << " -ntt #{search_tool.missed_cleavages}"
+    cmd << " -ntt 1" if ( search_tool.cleavage_semi )
     # Precursor tolerance
     #
@@ -235,6 +236,8 @@ ARGV.each do |filename|
       cmd << "; cp #{mzid_output_path} #{output_path}"
     else
       #if search_tool.explicit_output
+      cmd << ";ruby -pi.bak -e \"gsub('post=\\\"?','post=\\\"X')\" #{mzid_output_path}"
+      cmd << ";ruby -pi.bak -e \"gsub('pre=\\\"?','pre=\\\"X')\" #{mzid_output_path}"
       cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
       #Then copy the pepxml to the final output path
       cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"

data/bin/peptide_prophet.rb CHANGED Viewed

@@ -92,6 +92,11 @@ prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin
   prophet_tool.options.no_decoys = true
 end
+prophet_tool.options.experiment_label=nil
+prophet_tool.option_parser.on('--experiment-label label','used to commonly label all spectra belonging to one experiment (required by iProphet)') do |label|
+  prophet_tool.options.experiment_label = label
+end
 prophet_tool.options.override_database=nil
 prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
   prophet_tool.options.override_database = database
@@ -212,6 +217,10 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
     cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
   end
+  if prophet_tool.experiment_label!=nil
+    cmd << " -E#{prophet_tool.experiment_label} "
+  end
   unless prophet_tool.no_decoys
     if engine=="omssa" || engine=="phenyx"

data/bin/protxml_to_gff.rb CHANGED Viewed

@@ -29,6 +29,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for m
   tool.options.database=file
 end
+tool.options.protein_find=nil
+tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
+  tool.options.protein_find=term
+end
+tool.options.nterm_minlen=7
+tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
+  tool.options.nterm_minlen=len
+end
 tool.options.genome=nil
 tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
   tool.options.genome=file
@@ -39,11 +49,26 @@ tool.option_parser.on('--skip-index','Don\'t index database (Index should alread
   tool.options.skip_fasta_indexing=true
 end
+tool.options.stack_charge_states=false
+tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
+  tool.options.stack_charge_states=true
+end
+tool.options.collapse_redundant_proteins=false
+tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
+  tool.options.collapse_redundant_proteins=true
+end
 tool.options.peptide_probability_threshold=0.95
 tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
   tool.options.peptide_probability_threshold=thresh.to_f
 end
+tool.options.protein_probability_threshold=0.99
+tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
+  tool.options.protein_probability_threshold=thresh.to_f
+end
 exit unless tool.check_options [:protxml,:database]
 gff_out_file="peptides.gff"
@@ -94,7 +119,7 @@ def protein_names(protein_node)
 end
 def peptide_nodes(protein_node)
-  protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
+  return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
 end
@@ -210,41 +235,11 @@ def peptide_is_in_sixframe(pep_seq,gene_seq)
   return false
 end
-# gene_seq should already have been reverse_complemented if on reverse strand
-def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
-  if ( peptide_is_in_sixframe(pep_seq,gene_seq))
-    return nil
-  else
-    puts "Warning. Actually found a gap #{protein_info.fasta_id}"
-    aln=GappedAligner.new().align(pep_seq,gene_seq)
-    unless aln.gaps.length==1
-      puts "More than one intron.#{aln}"
-      require 'debugger';debugger
-    end
-    pep_coords = []
-    frags = aln.fragments
-    frags.reverse!  if protein_info.strand=='-'
-    frags.each { |frag|
-      if protein_info.strand=='+'
-        frag_genomic_start = protein_info.start + frag[0]
-        frag_genomic_end = protein_info.start + frag[1]
-      else
-        frag_genomic_start = protein_info.end - frag[1]
-        frag_genomic_end = protein_info.end - frag[0]
-      end
-      pep_coords << frag_genomic_start
-      pep_coords << frag_genomic_end
-    }
-    return [pep_coords]
-  end
-end
 def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
   sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
   # Assume positive strand
   pi_start=pepstart*3+gene_start-1
   pi_end=pepend*3+gene_start-1
@@ -271,6 +266,13 @@ def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,codi
       end
     else
       if finding_start
+        if ( pi_end <= cds_end) #Whole peptide contained in a single exon
+          fragments << [p_i+1,pi_end]
+          break;
+        end
         fragments << [p_i+1,(cds_end)]
         next_coords = sorted_cds[i+1]
         intron_offset = ((next_coords[0]-cds_end)-1)
@@ -290,9 +292,10 @@ end
 # gene_seq should already have been reverse_complemented if on reverse strand
 def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
-  if ( peptide_is_in_sixframe(pep_seq,gene_seq))
-    return nil
-  else
+  # if ( peptide_is_in_sixframe(pep_seq,gene_seq))
+    # Peptide is in 6-frame but on a predicted transcript
+    # return nil
+  # else
     # puts "Found a gap #{protein_info.fasta_id}"
     if protein_info.strand=='-'
@@ -315,7 +318,7 @@ def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,g
     pep_end_i = pep_start_i+pep_seq.length
     return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
-  end
+  # end
 end
 def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
@@ -421,7 +424,7 @@ def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end
     end
     return nil unless is_tryptic
-    start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-1
+    start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
     # require 'debugger';debugger
     return [start_codon_coord,start_codon_coord+2]
   else
@@ -442,32 +445,38 @@ def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,pepti
 end
-def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
+def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
   pi=protein_seq.index(peptide_seq)
   if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
+    # Since trypsin sometimes cleaves before P (ie breaking the rule)
+    # we don't check for it and assume those cases are real tryptic termini
     reverse_leader_seq=protein_seq[0..pi].reverse
     mi=reverse_leader_seq.index('M')
     if ( mi==nil )
-      puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
+      puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
       return nil
     end
     mi=pi-mi
-    return protein_seq[mi..(pi-1)]
+    ntermseq=protein_seq[mi..(pi-1)]
+    # if ( ntermseq.length < minlen )
+    #   return nil
+    # end
+#    $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
+#    require 'debugger';debugger
+    full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
+    return full_seq_with_annotations
   else
     return nil
   end
 end
-def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
-  dna_sequence=nil
-  if !protein_info.is_sixframe
-    throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
-    dna_sequence = get_dna_sequence(protein_info,genomedb)
-  end
+def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
   prot_seq = protein_seq
   pep_seq = peptide_seq
@@ -517,24 +526,37 @@ def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_i
       gff_records+=[start_codon_gff]
     end
-    signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
-    if signal_peptide
-      # require 'debugger';debugger
+  end
+#  puts gff_records
-      signal_peptide_coords=get_peptide_coordinates(prot_seq,signal_peptide,protein_info,dna_sequence)
-      if signal_peptide_coords
-        signal_peptide_coords.each do |spcoords|
-          signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
+  gff_records
+end
+def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
+  pep_id = "#{prot_id}.p#{peptide_count.to_s}"
+  signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
+  if signal_peptide
+    $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
+    raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
+    # Get raw signal_peptide sequence
+    signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
+    if signal_peptide_coords
+     signal_peptide_coords.each do |spcoords|
+      signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
           gff_records += signal_peptide_gff
-        end
       end
     end
   end
-  puts gff_records
+end
-  gff_records
+def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
+  nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
+  pep_seq = peptide_gff.attributes[nameindex][1]
+  existing = peptides_covered_genome[pep_seq]
+  return true if existing==peptide_gff.start
+  return false
 end
 proteins = parse_proteins(tool.protxml)
@@ -552,15 +574,23 @@ peptide_count = 0
 protein_count = 0
 total_peptides = 0
+peptides_covered_genome={}
 for prot in proteins
   prot_prob = prot['probability']
-  if ( prot_prob.to_f < tool.peptide_probability_threshold )
+  if ( prot_prob.to_f < tool.protein_probability_threshold )
     next
   end
  # Gets identifiers of all proteins (including indistinguishable ones)
   prot_names=protein_names(prot)
+  if tool.protein_find!=nil
+    prot_names=prot_names.keep_if { |pname| pname.include? tool.protein_find }
+  end
   peptides=peptide_nodes(prot)
   entries_covered=[]
   for protein_name in prot_names
@@ -571,7 +601,7 @@ for prot in proteins
       protein_fasta_entry = get_fasta_record(protein_name,fastadb)
       protein_info = cds_info_from_fasta(protein_fasta_entry)
-      if is_new_genome_location(protein_info,entries_covered)
+      unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
         protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
@@ -580,15 +610,41 @@ for prot in proteins
         prot_seq = protein_fasta_entry.aaseq.to_s
         throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
+        peptides_covered_protein=[]
         peptide_count=1
         for peptide in peptides
           pprob = peptide['nsp_adjusted_probability'].to_f
-          if ( pprob >= tool.peptide_probability_threshold )
-            total_peptides += 1
-            pep_seq = peptide['peptide_sequence']
+          # puts peptide
+          # puts pprob
+          pep_seq = peptide['peptide_sequence']
+          if ( pprob >= tool.peptide_probability_threshold && (!peptides_covered_protein.include?(pep_seq) || tool.stack_charge_states))
+            dna_sequence=nil
+            if !protein_info.is_sixframe
+              throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
+              dna_sequence = get_dna_sequence(protein_info,genomedb)
+            end
+            peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
+            unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
+              add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
+              gff_db.records += peptide_gff
+              peptides_covered_protein << pep_seq unless tool.stack_charge_states
+              peptides_covered_genome[pep_seq] = peptide_gff[0].start
-            gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
-            peptide_count+=1
+              total_peptides += 1
+              peptide_count+=1
+            else
+              puts "Duplicate peptide #{peptide_gff[0]}"
+            end
+#            puts gff_db.records.last
           end
         end
       else