protk 1.4.4.beta2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f278e7fe8e3a0955907a13952ced4fa0d772c204
4
- data.tar.gz: 513fb214683486b9d3596a9b9f0956107d611170
3
+ metadata.gz: 1b59cd3751adc7a13c6dce81a90c2a4d739e1efe
4
+ data.tar.gz: 3ac1aba71a95b729101a6c64699cebaf1929fe34
5
5
  SHA512:
6
- metadata.gz: e59fcb0724cbf42b8f63e65ca6d88fd91c7d45d3981964358c82e040f9235e2ce6768251d088966044827cadf5c9ce46f189345d3c27293d0b10b44daf018c10
7
- data.tar.gz: 36329dca4cf416fc2b9bd6f0d395895ee2fd32fdba2f6ebd75c5d26ed21b497dc890cfc6bcf36b1f08bc7f3954bae76adea64450a431d6ac365a2930191997cc
6
+ metadata.gz: 6ab966131e53b6e379ba1df7717cc71e0a7ead11b603d16d474f4fdce487623546ca472c8624421d62af1e238f7e2917f8ed1e66c52a82d0fc4798cd7b1fdf6b
7
+ data.tar.gz: 05b98a731eb99063f17942fed42dce0e450488e61b11452b918269bd27788a300d7d2ebf4c2a67930c14bc5291dfab7bf8b13b992300356e96b0d274e68c6d96
data/README.md CHANGED
@@ -16,10 +16,16 @@ Protk is a suite of tools for proteomics. It aims to present a simple and consis
16
16
 
17
17
  Protk is a ruby gem and requires ruby 2.0 or higher with support for libxml2. To avoid installation problems we recommend using [rvm](https://rvm.io) to install ruby.
18
18
 
19
- ``` shell
19
+ ```shell
20
20
  gem install protk
21
21
  ```
22
22
 
23
+ On macOS you may also need to install coreutils:
24
+
25
+ ```bash
26
+ brew install coreutils
27
+ ```
28
+
23
29
  ## Ruby Compatibility
24
30
 
25
31
  In general Protk requires ruby with a version >=2.0.
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 4/9/2013
5
+ #
6
+ #
7
+
8
+ require 'protk/constants'
9
+ require 'protk/tool'
10
+ require 'protk/gff_to_proteindb_tool'
11
+ require 'bio'
12
+
13
# Command line setup for maker_to_proteindb.rb.
#
# Builds the option parser, validates the options, configures logging and
# chooses the output stream. The names defined here (tool, inname, outfile,
# gene_lines) are used by the main loop at the bottom of the script.
tool = GffToProteinDBTool.new([:explicit_output, :debug, :add_transcript_info])
tool.option_parser.banner = "Create a protein database from Maker gene prediction " \
  "output that is suitable for later processing by proteogenomics tools." \
  "\n\nUsage: maker_to_proteindb.rb [options] maker.gff3"

# The description is written on one line: a backslash followed by a newline is
# NOT a continuation inside a single-quoted string and would end up in the help text.
tool.add_value_option(:proteins_file, nil, ['-p', '--prot-fasta proteins',
  'A fasta file containing protein sequences for each transcript'])

exit unless tool.check_options(true)

# First positional argument is the Maker gff3 file
inname = ARGV.shift

$protk = Constants.instance
log_level = tool.debug ? :debug : :fatal
$protk.info_level = log_level

# Progress is only enabled when writing to a named output file
if tool.explicit_output.nil?
  outfile = $stdout
  tool.print_progress = false
else
  outfile = File.open(tool.explicit_output, 'w')
  tool.print_progress = true
end

# Accumulates the gff lines belonging to the gene currently being read
gene_lines = []
43
+
44
# Fetches the protein sequence for one transcript from an indexed fasta file.
#
# Runs `samtools faidx <proteins_file> <transcript_id>` and returns its output
# with the first (header) line removed, i.e. the sequence lines only.
#
# The command is passed to IO.popen as an argument array so that no shell is
# involved: file names with spaces and ids containing shell metacharacters
# are handed to samtools verbatim.
#
# transcript_id - id of the fasta record to retrieve
# proteins_file - path to the fasta file
#
# Returns a String (empty when samtools produced no sequence lines).
# Raises Errno::ENOENT if the samtools executable cannot be started.
def get_protein_sequence(transcript_id,proteins_file)
  fasta_record = IO.popen(['samtools', 'faidx', proteins_file.to_s, transcript_id.to_s], &:read)
  fasta_record.lines.drop(1).join
end
47
+
48
# Formats the coordinates of a CDS gff line as "start|end".
#
# coding_sequence - one tab separated gff line
# transcript_id   - id of the transcript the CDS must belong to. It is matched
#                   literally against the Parent attribute at the end of the
#                   line, so characters such as "." or "+" in the id are not
#                   treated as regular expression operators.
#
# Returns "start|end", or "" when the line is not a CDS of this transcript.
def cds_to_header_text(coding_sequence,transcript_id)
  cds_pattern = /CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{Regexp.escape(transcript_id)})$/
  cds_match = coding_sequence.match(cds_pattern)
  return "" if cds_match.nil?

  "#{cds_match[1]}|#{cds_match[2]}"
end
58
+
59
# Builds the fasta header line for one transcript.
#
# The header has the form
#   >lcl|<scaffold>_<fwd|rev>_<transcript_id> <start>|<end>
# followed, when tool.add_transcript_info is set, by one " start|end" item per
# entry in coding_sequences (as formatted by cds_to_header_text).
#
# tool             - object responding to add_transcript_info
# transcript_line  - tab separated gff line of type mRNA
# coding_sequences - gff CDS lines of the gene
#
# Returns the header String (without trailing newline).
# Raises ArgumentError when the line has no mRNA coordinates or no ID attribute.
def sequence_fasta_header(tool,transcript_line,coding_sequences)
  coords = transcript_line.match(/mRNA\t(\d+)\t(\d+).*?([-\+]{1})/)
  id_match = transcript_line.match(/ID=([^;]+)/)
  if coords.nil? || id_match.nil?
    raise ArgumentError, "Not a valid mRNA line: #{transcript_line}"
  end

  tstart, tend, tstrand = coords.captures
  tid = id_match.captures[0]

  tstrandfr = tstrand=="-" ? "rev" : "fwd"
  scaffold = transcript_line.split("\t")[0]

  header = ">lcl|#{scaffold}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
  if tool.add_transcript_info
    coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
  end
  header
end
90
+
91
# Joins the sequence fragments found at the end of each line.
#
# From every line the trailing run of word characters (optionally followed by
# a closing "]") is taken. Lines that do not end in word characters, such as
# blank lines, contribute nothing instead of raising.
#
# protein_lines - Array of Strings
#
# Returns the concatenated fragments as a String.
def protein_sequence(protein_lines)
  protein_lines.map { |line| line[/(\w+)\]?$/, 1] }.compact.join
end
99
+
100
+
101
# Converts the gff lines of one gene into fasta text.
#
# For every mRNA line a header is built with sequence_fasta_header and the
# protein sequence is fetched with get_protein_sequence. An entry is appended
# only when both steps succeed, so a failure never leaves a header without a
# sequence lookup behind it. Failures are logged at debug level and skipped.
#
# Lines are selected on the tab-delimited feature type column, so exon or CDS
# lines whose ids merely contain "mRNA" are not treated as transcripts.
#
# tool       - object responding to get_lines_matching and proteins_file
# gene_lines - Array of gff lines belonging to one gene
#
# Returns a String with zero or more fasta entries.
def parse_gene(tool,gene_lines)
  transcripts = tool.get_lines_matching(/\tmRNA\t/,gene_lines)
  coding_sequences = tool.get_lines_matching(/\tCDS\t/,gene_lines)

  fasta_string = ""

  transcripts.each do |ts|
    prot_id = ts[/ID=([^;]+)/, 1]

    begin
      fh = sequence_fasta_header(tool,ts,coding_sequences)
      ps = get_protein_sequence(prot_id,tool.proteins_file)
      fasta_string << "#{fh}\n#{ps}"
    rescue => e
      $protk.log "Unable to retrieve protein for #{prot_id} #{e}" , :debug
    end
  end

  fasta_string
end
130
+
131
+
132
+
133
# Main loop: reads the gff file line by line, collecting the maker lines of
# each gene. Whenever a new gene starts, the lines gathered for the previous
# gene are converted to fasta and written out.
File.foreach(inname) do |line|
  line.chomp!

  if tool.start_new_gene(line) && !gene_lines.empty?
    outfile.write parse_gene(tool,gene_lines)
    gene_lines = []
  end

  gene_lines << line if line =~ /maker/
end

# The final gene has no following gene line to trigger its output, so it is
# written here; without this the last gene of every file would be lost.
outfile.write parse_gene(tool,gene_lines) unless gene_lines.empty?

# Close the output when it is a file we opened ourselves
outfile.close unless outfile.equal?($stdout)
@@ -93,11 +93,13 @@ if for_galaxy || Pathname.new(database_path).extname.to_s.downcase != ".fasta"
93
93
  # database_path="#{database_path}.fasta"
94
94
  end
95
95
 
96
+ db_noext = "#{Pathname.new(database_path).sub_ext('')}"
97
+
96
98
  # Database must be indexed
97
- unless FileTest.exists?("#{database_path}.canno")
98
- # dbdir = Pathname.new(database_path).dirname.to_s
99
+ unless FileTest.exists?("#{db_noext}.canno")
99
100
  tdavalue=search_tool.decoy_search ? 1 : 0;
100
- make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
101
+ genv.log "Database index not found at #{db_noext}. Building new index" , :info
102
+ make_msgfdb_cmd << "java -Xmx#{search_tool.java_mem} -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
101
103
  end
102
104
 
103
105
 
@@ -262,4 +264,4 @@ ARGV.each do |filename|
262
264
  #
263
265
  make_msgfdb_cmd=""
264
266
 
265
- end
267
+ end
@@ -54,6 +54,19 @@ else
54
54
  output_file=Tool.default_output_path(inputs,".prot.xml",prophet_tool.output_prefix,@output_suffix)
55
55
  end
56
56
 
57
+ genv.log("Checking input files ...",:info)
58
+ inputs.each {|file_name|
59
+
60
+ throw "Missing input file #{file_name}" unless File.exist?(file_name)
61
+
62
+ file_pepxml = PepXML.new(file_name)
63
+
64
+ db_path=file_pepxml.extract_db()
65
+ throw "Unable to find database #{db_path} used for searching. Fix paths in input files first" unless File.exist?(db_path)
66
+ }
67
+
68
+
69
+
57
70
  if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
58
71
 
59
72
  cmd="ProteinProphet "
@@ -38,7 +38,7 @@ end
38
38
 
39
39
  def protein_id_to_gffid(protein_id,gff_idregex)
40
40
  return protein_id if gff_idregex.nil?
41
-
41
+ # require 'byebug'; byebug
42
42
  m = protein_id.match(/#{gff_idregex}/)
43
43
  if m
44
44
  return m.captures[0]
@@ -77,7 +77,9 @@ def prepare_fasta(database_path,type)
77
77
  end
78
78
 
79
79
 
80
- db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
80
+ db_indexfilename = type=='prot' ? "#{db_filename}.00.pin" : "#{db_filename}.nhr"
81
+
82
+ # require 'byebug';byebug
81
83
 
82
84
  if File.exist?(db_indexfilename)
83
85
  orf_lookup = FastaDB.new(db_filename)
@@ -101,6 +103,7 @@ tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold pro
101
103
  tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
102
104
  tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
103
105
  tool.add_value_option(:ignore_regex,nil,['--ignore-regex pre','Regex to match protein ids that we should ignore completely'])
106
+ tool.add_value_option(:include_mods,false,['--include-mods','Output gff entries for peptide modification sites'])
104
107
 
105
108
  exit unless tool.check_options(true,[:database,:coords_file])
106
109
 
@@ -170,6 +173,12 @@ proteins.each do |protein|
170
173
  peptide_entries.each do |peptide_entry|
171
174
  output_fh.write peptide_entry.to_s
172
175
  end
176
+ if tool.include_mods
177
+ mod_entries = peptide.mods_to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
178
+ mod_entries.each do |mod_entry|
179
+ output_fh.write mod_entry.to_s
180
+ end
181
+ end
173
182
  end
174
183
  end
175
184
 
@@ -64,7 +64,12 @@ file.each do |entry|
64
64
  length = entry.naseq.length
65
65
 
66
66
  (1...7).each do |frame|
67
- translated_seq= entry.naseq.translate(frame)
67
+ begin
68
+ translated_seq= entry.naseq.translate(frame)
69
+ rescue => exception
70
+ puts "#{entry}"
71
+ exit 1
72
+ end
68
73
  orfs=translated_seq.split("*")
69
74
  orf_index = 0
70
75
  position = ((frame - 1) % 3) + 1
@@ -22,8 +22,10 @@ exit unless search_tool.check_options(true)
22
22
 
23
23
  # Our environment should be setup so that tandem or tandem.exe is on the path
24
24
  #
25
- tandem_bin=%x[which tandem].chomp
26
- tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
25
+ # tandem_bin=%x[which tandem].chomp
26
+ # tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
27
+
28
+ # tandem_bin
27
29
 
28
30
  @output_suffix="_tandem"
29
31
 
@@ -70,7 +72,7 @@ ARGV.each do |filename|
70
72
 
71
73
  # The basic command
72
74
  #
73
- cmd= "#{tandem_bin} #{params_path}"
75
+ cmd= "#{genv.tandem_bin} #{params_path}"
74
76
 
75
77
  # Add a cleanup command unless the user wants to keep params files
76
78
  #
@@ -63,6 +63,18 @@ class Constants
63
63
  "#{@protk_dir}/tools/msgfplus"
64
64
  end
65
65
 
66
# Locates the first executable from a list of candidate names.
#
# Each name is looked up with the `which` command, in order, and the first
# non-empty result is returned.
#
# exec_name_list - Array of executable names to try
#
# Returns the path reported by `which` for the first name that is found.
# Throws "Unable to locate ..." when none of the names is found.
def get_path_for_executable(exec_name_list)
  exec_name_list.each do |candidate|
    located = %x[which #{candidate}].chomp
    return located if located && located.length > 0
  end
  throw "Unable to locate #{exec_name_list}"
end
73
+
74
# Path to the X!Tandem executable, looked up as "tandem" first and
# "tandem.exe" second via get_path_for_executable.
def tandem_bin
  get_path_for_executable(["tandem", "tandem.exe"])
end
77
+
66
78
  def msgfplusjar
67
79
  msgfplus_path=%x[which MSGFPlus.jar]
68
80
  msgfplus_path.chomp
@@ -0,0 +1,56 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 9/3/2017
4
+ #
5
+ # Provides common functionality used by tools that convert gff to a protein database
6
+ #
7
+ # These tools read a gff and then write out protein entries in the following format
8
+ #
9
+ # >lcl|<scaffold_id>_<orientation>_<transcript_id> gene_start|gene_end cds1_start|cds1_end cds2_start|cds2_end ...
10
+ #
11
+
12
+ require 'optparse'
13
+ require 'pathname'
14
+ require 'protk/tool'
15
+
16
# Base class for the command line tools that convert a gff file into a
# protein database. Adds the optional --info switch and helpers for
# splitting gff lines into genes.
class GffToProteinDBTool < Tool

  attr_accessor :print_progress

  # Sets up the commandline options shared by these tools.
  #
  # option_support - Array of option symbols. When it contains
  #                  :add_transcript_info the --info switch is registered.
  def initialize(option_support=[])
    super(option_support)

    add_boolean_option(:add_transcript_info,false,['--info','Include CDS Coordinates']) if option_support.include?(:add_transcript_info)

    @option_parser.summary_width=40

    @capturing_gene=false
    @current_gene=nil
  end

  # Decides whether a gff line opens a gene different from the current one.
  #
  # Only lines matching /maker\sgene/ are considered. The gene id is read from
  # the ID attribute and remembered as the current gene.
  #
  # Returns true when a new gene starts on this line, nil otherwise.
  def start_new_gene(line)
    return unless line =~ /maker\sgene/

    gene_id = line.match(/ID=([^;]+)/).captures[0]
    return if gene_id == @current_gene

    @current_gene = gene_id
    true
  end

  # Returns the entries of gene_lines that match pattern, in their original order.
  def get_lines_matching(pattern,gene_lines)
    selected = []
    gene_lines.each { |candidate| selected << candidate if candidate =~ pattern }
    selected
  end

end
@@ -3,12 +3,15 @@ require 'bio'
3
3
  require 'protk/bio_gff3_extensions'
4
4
  require 'protk/mzidentml_doc'
5
5
  require 'protk/error'
6
+ require 'protk/peptide_mod'
7
+ # require 'protk/indistinguishable_peptide'
6
8
 
7
9
  include LibXML
8
10
 
9
11
  class PeptideNotInProteinError < ProtkError
10
12
  end
11
13
 
14
+
12
15
  class Peptide
13
16
 
14
17
  # Stripped sequence (no modifications)
@@ -17,6 +20,9 @@ class Peptide
17
20
  attr_accessor :charge
18
21
  attr_accessor :probability
19
22
  attr_accessor :theoretical_neutral_mass
23
+ attr_accessor :modifications
24
+ attr_accessor :modified_sequence
25
+ attr_accessor :indistinguishable_peptides
20
26
 
21
27
  def as_protxml
22
28
  node = XML::Node.new('peptide')
@@ -33,6 +39,27 @@ class Peptide
33
39
  pep.sequence=xmlnode['peptide_sequence']
34
40
  pep.probability=xmlnode['nsp_adjusted_probability'].to_f
35
41
  pep.charge=xmlnode['charge'].to_i
42
+
43
+ # This deal with the case where mods are on the primary peptide
44
+ #
45
+ mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')
46
+
47
+ # The pepXML spec says there can be multiple modification_info's but in practice there never is.
48
+ # We assume either 1 or 0
49
+ if ( mod_info_node.length > 0 )
50
+ throw "Encountered multiple modification_info nodes for a peptide" if mod_info_node.length > 1
51
+ pep.modified_sequence = mod_info_node[0]['modified_peptide']
52
+ mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
53
+ # require 'byebug';byebug
54
+ pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
55
+ end
56
+
57
+ # This deals with indistinguishable peptides
58
+ #
59
+ ips = xmlnode.find('protxml:indistinguishable_peptide','protxml:http://regis-web.systemsbiology.net/protXML')
60
+ # require 'byebug';byebug
61
+ pep.indistinguishable_peptides = ips.collect { |e| IndistinguishablePeptide.from_protxml(e) }
62
+
36
63
  pep
37
64
  end
38
65
 
@@ -55,17 +82,23 @@ class Peptide
55
82
  pep.charge = best_psm.attributes['chargeState'].to_i
56
83
  pep.protein_name = mzid_doc.get_dbsequence(xmlnode.parent,xmlnode.parent.attributes['dBSequence_ref']).attributes['accession']
57
84
 
58
- # pep.charge = MzIdentMLDoc.get_charge_for_psm(best_psm)
59
85
 
60
86
  pep
61
87
  end
62
88
 
63
89
  def from_sequence(seq,charge=nil)
64
90
  pep=new()
65
- pep.sequence=seq
91
+
92
+ pep.modifications = pep.modifications_from_sequence(seq)
93
+ pep.modified_sequence = seq
94
+
95
+ seq = seq.sub(/^n\[[0-9]+?\]/,"")
96
+ pep.sequence = seq.gsub(/[0-9\.\[\]]/,"")
66
97
  pep.charge=charge
67
98
  pep
68
99
  end
100
+
101
+
69
102
  private :new
70
103
  end
71
104
 
@@ -73,6 +106,26 @@ class Peptide
73
106
 
74
107
  end
75
108
 
109
# Extracts residue modifications from a modified peptide string such as
# "N[115]VMN[115]LTPAETQ[129]QLHAALESQLSPGELAK".
#
# A leading N-terminal tag ("n[43]" or "n[43.02]") is removed first. It is not
# reported as a modification, and removing it keeps residue positions correct
# for tags with decimal masses as well as integer ones.
#
# seq - modified peptide sequence with masses in square brackets
#
# Returns an Array of PeptideMod, one per bracketed residue. Each position is
# the 1-based index of the residue in the unmodified (bracket free) sequence.
def modifications_from_sequence(seq)
  residues = seq.sub(/^n\[[0-9.]+\]/,"")

  # Number of characters taken up by "[mass]" annotations seen so far; these
  # are subtracted so positions refer to the stripped sequence.
  annotation_chars = 0
  mods = []

  residues.scan(/([A-Z])\[([0-9\.]+)\]/) do |amino_acid, mass_text|
    position = Regexp.last_match.begin(0) + 1 - annotation_chars
    mods << PeptideMod.from_data(position, amino_acid, mass_text.to_f)
    annotation_chars += mass_text.length + 2
  end

  mods
end
127
+
128
+
76
129
  # Expects prot_seq not to contain explicit stop codon (ie * at end)
77
130
  # AA coords are 0-based unlike genomic coords which are 1 based
78
131
  #
@@ -112,23 +165,61 @@ class Peptide
112
165
  throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
113
166
  throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
114
167
 
115
- on_reverse_strand = (parent_record.strand=="-") ? true : false
116
168
  aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates
117
169
 
170
+ gff_records_for_coords_in_protein(aa_coords,self.sequence.length,parent_record,cds_records)
171
+ end
172
+
173
# Builds gff3 records for the modification sites of this peptide and of its
# indistinguishable peptides.
#
# Modification positions count residues of the peptide from 1 (see the protXML
# example in PeptideMod and modifications_from_sequence), whereas
# coords_in_protein yields 0-based protein coordinates. The same "- 1"
# conversion is therefore applied to every modification; previously it was
# only applied to indistinguishable peptides, which placed modifications of
# the primary peptide one residue too far along the protein.
#
# prot_seq      - protein sequence the peptide is located in
# parent_record - Bio::GFF::GFF3::Record of the parent feature
# cds_records   - Array of CDS records of the parent
#
# Returns a flat Array of gff3 records of type "modified_amino_acid_feature".
# Throws when parent_record or cds_records have an unexpected class.
def mods_to_gff3_records(prot_seq,parent_record,cds_records)

  throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
  throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array

  pep_aa_coords = coords_in_protein(prot_seq,false)

  # Pairs of [modifications, modified sequence]: this peptide first, then
  # each indistinguishable peptide.
  mod_sources = [[self.modifications, self.modified_sequence]]
  (self.indistinguishable_peptides || []).each do |ip|
    mod_sources << [ip.modifications, ip.modified_sequence]
  end

  mod_records = []
  mod_sources.each do |mods, mod_sequence|
    next if mods.nil?

    mods.each do |mod|
      prot_position = mod.position+pep_aa_coords[:start]-1
      mod_aa_coords = {:start => prot_position, :end => prot_position+1}
      record_info = {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => mod_sequence}
      mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records,record_info)
    end
  end

  mod_records.flatten
end
205
+
206
+
207
+ def gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"})
208
+ on_reverse_strand = (parent_record.strand=="-") ? true : false
118
209
  ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort
119
210
 
120
211
  # Initial position is the number of NA's from the start of translation
121
212
  #
122
- pep_nalen = self.sequence.length*3
213
+ pep_nalen = seqlen*3
123
214
 
124
215
  i = 0; #Current protein position (in nucleic acids)
125
216
 
126
217
  pep_start_i = aa_coords[:start]*3
127
- pep_end_i = pep_start_i+self.sequence.length*3
128
- fragments=[]
218
+ pep_end_i = pep_start_i+seqlen*3
219
+ gff_records=[]
129
220
  ordered_cds_records.each do |cds_record|
130
221
 
131
- fragment = nil
222
+ gff_record = nil
132
223
  fragment_len = 0
133
224
  if on_reverse_strand
134
225
 
@@ -139,16 +230,16 @@ class Peptide
139
230
  fragment_end = cds_record.end
140
231
  fragment_len = [cds_record.length,pep_end_i-i].min
141
232
  fragment_start = fragment_end-fragment_len+1
142
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
233
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
143
234
  elsif before_len>0
144
235
  fragment_end = cds_record.end - before_len
145
236
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
146
237
  fragment_start = fragment_end - fragment_len + 1
147
238
  if fragment_len>0
148
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
239
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
149
240
  end
150
241
  else
151
- fragment=nil
242
+ gff_record=nil
152
243
  end
153
244
  else
154
245
  in_peptide = (i<pep_end_i) && (i>=pep_start_i)
@@ -157,33 +248,64 @@ class Peptide
157
248
  fragment_start = cds_record.start
158
249
  fragment_len = [cds_record.length,pep_end_i-i].min
159
250
  fragment_end = fragment_start+fragment_len-1
160
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
251
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
161
252
  elsif before_len>0
162
253
  fragment_start = cds_record.start + before_len
163
254
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
164
255
  fragment_end = fragment_start + fragment_len-1
165
256
  if fragment_len>0
166
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
257
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
167
258
  end
168
259
  else
169
- fragment=nil
260
+ gff_record = nil
170
261
  end
171
262
 
172
263
  end
173
264
  i+=cds_record.length
174
- fragments << fragment unless fragment.nil?
265
+ gff_records << gff_record unless gff_record.nil?
175
266
  end
176
- fragments
267
+ gff_records
177
268
  end
178
269
 
179
- def gff_record_for_peptide_fragment(start_i,end_i,parent_record)
270
+ def gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info)
180
271
  cds_id = parent_record.id
181
- this_id = "#{cds_id}.#{self.sequence}"
272
+ mod_sequence = record_info[:modified_sequence]
273
+ this_id = mod_sequence ? "#{cds_id}.#{mod_sequence}" : "#{cds_id}.#{self.sequence}"
182
274
  this_id << ".#{self.charge}" unless self.charge.nil?
275
+ mod = record_info[:mod]
276
+ this_id << ".#{mod.position}.#{mod.mass}" unless mod.nil?
183
277
  score = self.probability.nil? ? "." : self.probability.to_s
184
- gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
278
+ record_type = mod.nil? ? record_info[:type] : "#{record_info[:type]}_#{mod.amino_acid}"
279
+ gff_string = "#{parent_record.seqid}\tMSMS\t#{record_type}\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
185
280
  Bio::GFF::GFF3::Record.new(gff_string)
186
281
  end
187
282
 
283
+ end
284
+
188
285
 
189
- end
286
+ # <indistinguishable_peptide peptide_sequence="MEYENTLTAAMK" charge="2" calc_neutral_pep_mass="1416.63">
287
+ # <modification_info modified_peptide="M[147]EYENTLTAAMK"/>
288
+ # </indistinguishable_peptide>
289
# A peptide listed in protXML as indistinguishable from a primary peptide.
class IndistinguishablePeptide < Peptide
  class << self

    # Builds an IndistinguishablePeptide from an <indistinguishable_peptide>
    # protXML node.
    #
    # Sequence and charge are read from the node attributes. When the node has
    # a modification_info child, the modified sequence is taken from it and
    # the modifications come from its mod_aminoacid_mass children or, when
    # there are none, are parsed from the modified sequence text.
    #
    # Throws when more than one modification_info node is present.
    def from_protxml(xmlnode)
      namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'

      pep = new()
      pep.sequence = xmlnode['peptide_sequence']
      pep.charge = xmlnode['charge'].to_i

      info_nodes = xmlnode.find('protxml:modification_info',namespace)
      return pep unless info_nodes.length > 0

      throw "Encountered multiple modification_info nodes for an indistinguishable peptide" if info_nodes.length > 1

      info = info_nodes[0]
      pep.modified_sequence = info['modified_peptide']

      mass_nodes = info.find('protxml:mod_aminoacid_mass',namespace)
      pep.modifications =
        if mass_nodes.length > 0
          mass_nodes.collect { |e| PeptideMod.from_protxml(e) }
        else
          pep.modifications_from_sequence(pep.modified_sequence)
        end

      pep
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'libxml'
2
+ require 'bio'
3
+
4
+ include LibXML
5
+
6
# A single amino acid modification on a peptide.
#
# Instances are created through the factory methods below; PeptideMod.new is
# private.
class PeptideMod

  # Position of the modified residue within the peptide. In the protXML
  # example below the 9th residue (C) has position="9".
  attr_accessor :position
  # One letter code of the modified residue (left unset by from_protxml)
  attr_accessor :amino_acid
  # Mass given for the modified residue
  attr_accessor :mass

  # Builds a PeptideMod from a <mod_aminoacid_mass> protXML node, e.g.
  #
  # <modification_info modified_peptide="GFGFVTYSC[160]VEEVDAAMC[160]ARPHK">
  #   <mod_aminoacid_mass position="9" mass="160.030600"/>
  #   <mod_aminoacid_mass position="18" mass="160.030600"/>
  # </modification_info>
  #
  # Only position (as Integer) and mass (as Float) are read from the node.
  def self.from_protxml(xmlnode)
    new.tap do |pepmod|
      pepmod.position = xmlnode['position'].to_i
      pepmod.mass = xmlnode['mass'].to_f
    end
  end

  # Builds a PeptideMod directly from its three values.
  def self.from_data(position,amino_acid,mass)
    new.tap do |pepmod|
      pepmod.position = position
      pepmod.amino_acid = amino_acid
      pepmod.mass = mass
    end
  end

  private_class_method :new

end
@@ -153,6 +153,12 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
153
153
 
154
154
  sh %{cd #{@build_dir};tar -xvzf TPP-#{tpp_version}.tgz}
155
155
 
156
+ sh %{cp ~/Desktop/singleton.hpp #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/../extern/ProteoWizard/pwiz/libraries/boost_aux/boost/utility/singleton.hpp}
157
+
158
+ sh %{cp ~/Desktop/MascotScoreParser.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/DiscriminateFunction/Mascot/MascotScoreParser.h}
159
+ sh %{cp ~/Desktop/PTMProphetParser.cxx #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/PTMProphetParser/PTMProphetParser.cxx}
160
+ sh %{cp ~/Desktop/RespectFilter.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/Respect/RespectFilter.h}
161
+
156
162
  File.open("#{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Makefile.config.incl","wb") do |f|
157
163
  f.write "TPP_ROOT=#{env.tpp_root}/\nTPP_WEB=/tpp/\nXSLT_PROC=/usr/bin/xsltproc\nCGI_USERS_DIR=${TPP_ROOT}cgi-bin/"
158
164
  end
@@ -173,13 +179,18 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
173
179
  makefile_text = File.read("#{makefile_path}")
174
180
 
175
181
  File.open("#{makefile_path}","w+") do |f|
176
- subs_text = makefile_text.gsub("cp -rfu","cp -rf")
182
+ subs_text = makefile_text.gsub("cp -rfu","cp -rf").gsub("-Werror","")
183
+ # subs_text = subs_text.gsub("-Werror","")
177
184
  f.write subs_text
178
185
  end
186
+
187
+
188
+
179
189
  end
180
190
  sh %{cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src;echo '' > ../perl/tpp_models.pl;echo '' > ../perl/exporTPP.pl;echo '' > ../CGI/show_nspbin.pl;echo '' > ../CGI/tpp_gui/tpp_gui.pl}
181
191
 
182
- build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
192
+ # build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
193
+ build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make"
183
194
  install_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make install"
184
195
  env.log build_cmd, :info
185
196
  sh %{#{build_cmd}}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.4.beta2
4
+ version: 1.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -14,170 +14,170 @@ dependencies:
14
14
  name: open4
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
- - - '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.3.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.3'
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.3.0
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: bio
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - ~>
37
+ - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: 1.4.3
40
- - - '>='
40
+ - - ">="
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.4.3
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - ~>
47
+ - - "~>"
48
48
  - !ruby/object:Gem::Version
49
49
  version: 1.4.3
50
- - - '>='
50
+ - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 1.4.3
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: rest-client
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - ~>
57
+ - - "~>"
58
58
  - !ruby/object:Gem::Version
59
59
  version: 1.6.7
60
- - - '>='
60
+ - - ">="
61
61
  - !ruby/object:Gem::Version
62
62
  version: 1.6.7
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - ~>
67
+ - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: 1.6.7
70
- - - '>='
70
+ - - ">="
71
71
  - !ruby/object:Gem::Version
72
72
  version: 1.6.7
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: net-ftp-list
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - ~>
77
+ - - "~>"
78
78
  - !ruby/object:Gem::Version
79
79
  version: 3.2.5
80
- - - '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 3.2.5
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: 3.2.5
90
- - - '>='
90
+ - - ">="
91
91
  - !ruby/object:Gem::Version
92
92
  version: 3.2.5
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: libxml-ruby
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - ~>
97
+ - - "~>"
98
98
  - !ruby/object:Gem::Version
99
- version: '2.7'
100
- - - '>='
99
+ version: '2.9'
100
+ - - ">="
101
101
  - !ruby/object:Gem::Version
102
- version: 2.7.0
102
+ version: 2.9.0
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - ~>
107
+ - - "~>"
108
108
  - !ruby/object:Gem::Version
109
- version: '2.7'
110
- - - '>='
109
+ version: '2.9'
110
+ - - ">="
111
111
  - !ruby/object:Gem::Version
112
- version: 2.7.0
112
+ version: 2.9.0
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rspec
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ~>
117
+ - - "~>"
118
118
  - !ruby/object:Gem::Version
119
119
  version: '3.0'
120
120
  type: :development
121
121
  prerelease: false
122
122
  version_requirements: !ruby/object:Gem::Requirement
123
123
  requirements:
124
- - - ~>
124
+ - - "~>"
125
125
  - !ruby/object:Gem::Version
126
126
  version: '3.0'
127
127
  - !ruby/object:Gem::Dependency
128
128
  name: rspec-mocks
129
129
  requirement: !ruby/object:Gem::Requirement
130
130
  requirements:
131
- - - ~>
131
+ - - "~>"
132
132
  - !ruby/object:Gem::Version
133
133
  version: '3.0'
134
134
  type: :development
135
135
  prerelease: false
136
136
  version_requirements: !ruby/object:Gem::Requirement
137
137
  requirements:
138
- - - ~>
138
+ - - "~>"
139
139
  - !ruby/object:Gem::Version
140
140
  version: '3.0'
141
141
  - !ruby/object:Gem::Dependency
142
142
  name: rake-compiler
143
143
  requirement: !ruby/object:Gem::Requirement
144
144
  requirements:
145
- - - ~>
145
+ - - "~>"
146
146
  - !ruby/object:Gem::Version
147
147
  version: '0'
148
148
  type: :development
149
149
  prerelease: false
150
150
  version_requirements: !ruby/object:Gem::Requirement
151
151
  requirements:
152
- - - ~>
152
+ - - "~>"
153
153
  - !ruby/object:Gem::Version
154
154
  version: '0'
155
155
  - !ruby/object:Gem::Dependency
156
156
  name: byebug
157
157
  requirement: !ruby/object:Gem::Requirement
158
158
  requirements:
159
- - - ~>
159
+ - - "~>"
160
160
  - !ruby/object:Gem::Version
161
161
  version: '3.5'
162
162
  type: :development
163
163
  prerelease: false
164
164
  version_requirements: !ruby/object:Gem::Requirement
165
165
  requirements:
166
- - - ~>
166
+ - - "~>"
167
167
  - !ruby/object:Gem::Version
168
168
  version: '3.5'
169
169
  - !ruby/object:Gem::Dependency
170
170
  name: sqlite3
171
171
  requirement: !ruby/object:Gem::Requirement
172
172
  requirements:
173
- - - ~>
173
+ - - "~>"
174
174
  - !ruby/object:Gem::Version
175
175
  version: '0'
176
176
  type: :runtime
177
177
  prerelease: false
178
178
  version_requirements: !ruby/object:Gem::Requirement
179
179
  requirements:
180
- - - ~>
180
+ - - "~>"
181
181
  - !ruby/object:Gem::Version
182
182
  version: '0'
183
183
  description: Commandline tools for proteomics
@@ -202,6 +202,7 @@ executables:
202
202
  - uniprot_mapper.rb
203
203
  - sixframe.rb
204
204
  - augustus_to_proteindb.rb
205
+ - maker_to_proteindb.rb
205
206
  - protxml_to_gff.rb
206
207
  - protxml_to_table.rb
207
208
  - swissprot_to_table.rb
@@ -222,6 +223,7 @@ files:
222
223
  - bin/filter_psms.rb
223
224
  - bin/interprophet.rb
224
225
  - bin/make_decoy.rb
226
+ - bin/maker_to_proteindb.rb
225
227
  - bin/manage_db.rb
226
228
  - bin/mascot_search.rb
227
229
  - bin/mascot_to_pepxml.rb
@@ -283,6 +285,7 @@ files:
283
285
  - lib/protk/fastadb.rb
284
286
  - lib/protk/galaxy_stager.rb
285
287
  - lib/protk/galaxy_util.rb
288
+ - lib/protk/gff_to_proteindb_tool.rb
286
289
  - lib/protk/gffdb.rb
287
290
  - lib/protk/manage_db_rakefile.rake
288
291
  - lib/protk/manage_db_tool.rb
@@ -292,6 +295,7 @@ files:
292
295
  - lib/protk/omssa_util.rb
293
296
  - lib/protk/openms_defaults.rb
294
297
  - lib/protk/peptide.rb
298
+ - lib/protk/peptide_mod.rb
295
299
  - lib/protk/pepxml.rb
296
300
  - lib/protk/pepxml_writer.rb
297
301
  - lib/protk/physical_constants.rb
@@ -324,19 +328,18 @@ require_paths:
324
328
  - lib
325
329
  required_ruby_version: !ruby/object:Gem::Requirement
326
330
  requirements:
327
- - - '>='
331
+ - - ">="
328
332
  - !ruby/object:Gem::Version
329
333
  version: '0'
330
334
  required_rubygems_version: !ruby/object:Gem::Requirement
331
335
  requirements:
332
- - - '>'
336
+ - - ">="
333
337
  - !ruby/object:Gem::Version
334
- version: 1.3.1
338
+ version: '0'
335
339
  requirements: []
336
340
  rubyforge_project:
337
- rubygems_version: 2.2.1
341
+ rubygems_version: 2.5.1
338
342
  signing_key:
339
343
  specification_version: 4
340
344
  summary: Proteomics Toolkit
341
345
  test_files: []
342
- has_rdoc: